From patchwork Fri Mar 25 20:44:44 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Noah Goldstein <goldstein.w.n@gmail.com>
X-Patchwork-Id: 52371
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id A0EC53888C4C
	for <patchwork@sourceware.org>; Fri, 25 Mar 2022 20:46:10 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A0EC53888C4C
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1648241170;
	bh=pnxp2+ghk6Yi+/w8gd7HkxlEEAJ6V84Qcv5zmqgpAIQ=;
	h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe:
	 List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:
	 From;
	b=r5VP2d8a+PDxhvJaLvY9i0K92756DyMbjXcYA+q5wyXLlZTedMlVdZmCNN+afOet9
	 2n8gQw8Akm1X4MqR1zC85PDCq9UHrMUld1AP5JBpGjr46ldCmAj+QTe/JehPyZooAB
	 Qrmeicra8WwyzxzLq9XUqZ+CEicO7Opq1DGQUzyc=
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mail-il1-x129.google.com (mail-il1-x129.google.com
 [IPv6:2607:f8b0:4864:20::129])
 by sourceware.org (Postfix) with ESMTPS id 67FA83857404
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 20:45:00 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 67FA83857404
Received: by mail-il1-x129.google.com with SMTP id 14so1301393ily.11
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 13:45:00 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20210112;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references:mime-version:content-transfer-encoding;
 bh=pnxp2+ghk6Yi+/w8gd7HkxlEEAJ6V84Qcv5zmqgpAIQ=;
 b=Q63ps9yS7/eyBfo75naTmWKvshNJSdyVAj0Ax/UoSnOjLnpaVXjZeEizfNWValcEOL
 QWdutCNUeorjmKiCLXqtHxJbsxbxLQ+/N7Wh4cpdkufkY9fwJlVFSXsHUgcqfyDR2mWv
 ROcI7H4JHBn7f0QuP7IxSL2qAVxZLzUjZvtuWXXqbgPQ9qU/Z6+nhnkB/OZcUzbMrqWa
 3QZYacmzpQrMrjCvtTwGiYoqaXjlGi08O5VtFdMprNlU5ltiabJTD/2wHk+x0pKfVao7
 en8ObQUme5fkbU6/mr3WuQgECHyTkBdkFTM37YP5Js1DuAwLW4lb6K8q5SzyA/Yg1IqG
 ySiA==
X-Gm-Message-State: AOAM5328ZtFmd+Dg6mVLATnSE6KEVw33uhLUwZ9CQBpx5oAhRABlqE1Q
 aiMcbSvXkUvXh5JG/ispN9heAZOP4OY=
X-Google-Smtp-Source: 
 ABdhPJzSUiBaICP9Q7gdKFB3O4w2HiE+b+GxkJ29RYaBEDBBdlnjKYOvsnPh9tV2bTXtBFC8KcXS8w==
X-Received: by 2002:a05:6e02:216f:b0:2c9:279d:d2fa with SMTP id
 s15-20020a056e02216f00b002c9279dd2famr278529ilv.207.1648241098775;
 Fri, 25 Mar 2022 13:44:58 -0700 (PDT)
Received: from localhost.localdomain (node-17-161.flex.volo.net.
 [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id
 a3-20020a5ec303000000b006496b4dd21csm3382118iok.5.2022.03.25.13.44.58
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Fri, 25 Mar 2022 13:44:58 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3
Date: Fri, 25 Mar 2022 15:44:44 -0500
Message-Id: <20220325204449.1284533-1-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20220325183625.1170867-2-goldstein.w.n@gmail.com>
References: <20220325183625.1170867-2-goldstein.w.n@gmail.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-10.2 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_10_SHORT_WORD_LINES,
 SCC_20_SHORT_WORD_LINES, SCC_35_SHORT_WORD_LINES, SCC_5_SHORT_WORD_LINES,
 SPF_HELO_NONE, SPF_PASS, TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
X-Patchwork-Original-From: Noah Goldstein via Libc-alpha
 <libc-alpha@sourceware.org>
From: Noah Goldstein <goldstein.w.n@gmail.com>
Reply-To: Noah Goldstein <goldstein.w.n@gmail.com>
Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org
Sender: "Libc-alpha"
 <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>

With SSE2, SSE4.1, AVX2, and EVEX versions available, very few
targets prefer the SSSE3 implementations of memcmp and wmemcmp.
As a result, the SSSE3 versions are no longer worth keeping given
their code size cost.
---
 sysdeps/x86_64/multiarch/Makefile          |    2 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |    4 -
 sysdeps/x86_64/multiarch/ifunc-memcmp.h    |    4 -
 sysdeps/x86_64/multiarch/memcmp-ssse3.S    | 1992 --------------------
 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S   |    4 -
 5 files changed, 2006 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 6507d1b7fa..51222dfab1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -12,7 +12,6 @@ sysdep_routines += \
   memcmp-evex-movbe \
   memcmp-sse2 \
   memcmp-sse4 \
-  memcmp-ssse3 \
   memcmpeq-avx2 \
   memcmpeq-avx2-rtm \
   memcmpeq-evex \
@@ -179,7 +178,6 @@ sysdep_routines += \
   wmemcmp-c \
   wmemcmp-evex-movbe \
   wmemcmp-sse4 \
-  wmemcmp-ssse3 \
 # sysdep_routines
 endif
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 40cc6cc49e..f389928a4e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
 			      __memcmp_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
-			      __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
 
 #ifdef SHARED
@@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __wmemcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
 			      __wmemcmp_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
-			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wmemset.c.  */
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index cd12613699..44759a3ad5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -20,7 +20,6 @@
 # include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
     return OPTIMIZE (sse4_1);
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
-    return OPTIMIZE (ssse3);
-
   return OPTIMIZE (sse2);
 }
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
deleted file mode 100644
index df1b1fc494..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ /dev/null
@@ -1,1992 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
-   Copyright (C) 2011-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-#  define MEMCMP	__memcmp_ssse3
-# endif
-
-/* Warning!
-	   wmemcmp has to use SIGNED comparison for elements.
-	   memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
-	atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-	test	%RDX_LP, %RDX_LP
-	jz	L(equal)
-# elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-# endif
-	mov	%rdx, %rcx
-	mov	%rdi, %rdx
-	cmp	$48, %rcx;
-	jae	L(48bytesormore)	/* LEN => 48  */
-
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-/* ECX >= 32.  */
-L(48bytesormore):
-	movdqu	(%rdi), %xmm3
-	movdqu	(%rsi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	16(%rdi), %rdi
-	lea	16(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(less16bytes)
-	mov	%edi, %edx
-	and	$0xf, %edx
-	xor	%rdx, %rdi
-	sub	%rdx, %rsi
-	add	%rdx, %rcx
-	mov	%esi, %edx
-	and	$0xf, %edx
-	jz	L(shr_0)
-	xor	%rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
-	cmp	$8, %edx
-	jae	L(next_unaligned_table)
-	cmp	$0, %edx
-	je	L(shr_0)
-	cmp	$1, %edx
-	je	L(shr_1)
-	cmp	$2, %edx
-	je	L(shr_2)
-	cmp	$3, %edx
-	je	L(shr_3)
-	cmp	$4, %edx
-	je	L(shr_4)
-	cmp	$5, %edx
-	je	L(shr_5)
-	cmp	$6, %edx
-	je	L(shr_6)
-	jmp	L(shr_7)
-
-	.p2align 2
-L(next_unaligned_table):
-	cmp	$8, %edx
-	je	L(shr_8)
-	cmp	$9, %edx
-	je	L(shr_9)
-	cmp	$10, %edx
-	je	L(shr_10)
-	cmp	$11, %edx
-	je	L(shr_11)
-	cmp	$12, %edx
-	je	L(shr_12)
-	cmp	$13, %edx
-	je	L(shr_13)
-	cmp	$14, %edx
-	je	L(shr_14)
-	jmp	L(shr_15)
-# else
-	cmp	$0, %edx
-	je	L(shr_0)
-	cmp	$4, %edx
-	je	L(shr_4)
-	cmp	$8, %edx
-	je	L(shr_8)
-	jmp	L(shr_12)
-# endif
-
-	.p2align 4
-L(shr_0):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	jae	L(shr_0_gobble)
-	xor	%eax, %eax
-	movdqa	(%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-	movdqa	16(%rsi), %xmm2
-	pcmpeqb	16(%rdi), %xmm2
-	pand	%xmm1, %xmm2
-	pmovmskb %xmm2, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_0_gobble):
-	movdqa	(%rsi), %xmm0
-	xor	%eax, %eax
-	pcmpeqb	(%rdi), %xmm0
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm2
-	pcmpeqb	16(%rdi), %xmm2
-L(shr_0_gobble_loop):
-	pand	%xmm0, %xmm2
-	sub	$32, %rcx
-	pmovmskb %xmm2, %edx
-	movdqa	%xmm0, %xmm1
-	movdqa	32(%rsi), %xmm0
-	movdqa	48(%rsi), %xmm2
-	sbb	$0xffff, %edx
-	pcmpeqb	32(%rdi), %xmm0
-	pcmpeqb	48(%rdi), %xmm2
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	jz	L(shr_0_gobble_loop)
-
-	pand	%xmm0, %xmm2
-	cmp	$0, %rcx
-	jge	L(next)
-	inc	%edx
-	add	$32, %rcx
-L(next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm2, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
-	.p2align 4
-L(shr_1):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_1_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$1, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$1, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$1, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_1_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$1, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$1, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$1, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$1, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_1_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_1_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_1_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	1(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-
-	.p2align 4
-L(shr_2):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_2_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$2, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$2, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$2, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_2_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$2, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$2, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$2, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$2, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_2_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_2_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_2_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	2(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_3):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_3_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$3, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$3, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$3, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_3_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$3, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$3, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$3, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$3, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_3_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_3_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_3_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	3(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# endif
-
-	.p2align 4
-L(shr_4):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_4_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$4, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$4, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$4, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_4_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$4, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$4, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$4, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$4, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_4_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_4_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_4_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	4(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
-	.p2align 4
-L(shr_5):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_5_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$5, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$5, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$5, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_5_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$5, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$5, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$5, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$5, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_5_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_5_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_5_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	5(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_6):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_6_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$6, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$6, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$6, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_6_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$6, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$6, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$6, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$6, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_6_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_6_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_6_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	6(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_7):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_7_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$7, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$7, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$7, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_7_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$7, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$7, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$7, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$7, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_7_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_7_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_7_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	7(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# endif
-
-	.p2align 4
-L(shr_8):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_8_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$8, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$8, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$8, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_8_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$8, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$8, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$8, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$8, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_8_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_8_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_8_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	8(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
-	.p2align 4
-L(shr_9):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_9_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$9, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$9, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$9, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_9_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$9, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$9, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$9, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$9, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_9_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_9_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_9_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	9(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_10):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_10_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$10, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$10, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$10, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_10_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$10, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$10, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$10, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$10, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_10_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_10_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_10_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	10(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_11):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_11_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$11, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$11, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$11, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_11_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$11, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$11, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$11, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$11, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_11_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_11_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_11_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	11(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# endif
-
-	.p2align 4
-L(shr_12):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_12_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$12, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$12, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$12, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_12_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$12, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$12, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$12, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$12, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_12_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_12_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_12_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	12(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
-	.p2align 4
-L(shr_13):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_13_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$13, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$13, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$13, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_13_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$13, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$13, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$13, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$13, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_13_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_13_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_13_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	13(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_14):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_14_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$14, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$14, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$14, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_14_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$14, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$14, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$14, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$14, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_14_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_14_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_14_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	14(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_15):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_15_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$15, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$15, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$15, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_15_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$15, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$15, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$15, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$15, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_15_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_15_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_15_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	15(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-# endif
-	.p2align 4
-L(exit):
-	pmovmskb %xmm1, %r8d
-	sub	$0xffff, %r8d
-	jz	L(first16bytes)
-	lea	-16(%rsi), %rsi
-	lea	-16(%rdi), %rdi
-	mov	%r8d, %edx
-L(first16bytes):
-	add	%rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
-	test	%dl, %dl
-	jz	L(next_24_bytes)
-
-	test	$0x01, %dl
-	jnz	L(Byte16)
-
-	test	$0x02, %dl
-	jnz	L(Byte17)
-
-	test	$0x04, %dl
-	jnz	L(Byte18)
-
-	test	$0x08, %dl
-	jnz	L(Byte19)
-
-	test	$0x10, %dl
-	jnz	L(Byte20)
-
-	test	$0x20, %dl
-	jnz	L(Byte21)
-
-	test	$0x40, %dl
-	jnz	L(Byte22)
-
-	movzbl	-9(%rdi), %eax
-	movzbl	-9(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte16):
-	movzbl	-16(%rdi), %eax
-	movzbl	-16(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte17):
-	movzbl	-15(%rdi), %eax
-	movzbl	-15(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte18):
-	movzbl	-14(%rdi), %eax
-	movzbl	-14(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte19):
-	movzbl	-13(%rdi), %eax
-	movzbl	-13(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte20):
-	movzbl	-12(%rdi), %eax
-	movzbl	-12(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte21):
-	movzbl	-11(%rdi), %eax
-	movzbl	-11(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte22):
-	movzbl	-10(%rdi), %eax
-	movzbl	-10(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(next_24_bytes):
-	lea	8(%rdi), %rdi
-	lea	8(%rsi), %rsi
-	test	$0x01, %dh
-	jnz	L(Byte16)
-
-	test	$0x02, %dh
-	jnz	L(Byte17)
-
-	test	$0x04, %dh
-	jnz	L(Byte18)
-
-	test	$0x08, %dh
-	jnz	L(Byte19)
-
-	test	$0x10, %dh
-	jnz	L(Byte20)
-
-	test	$0x20, %dh
-	jnz	L(Byte21)
-
-	test	$0x40, %dh
-	jnz	L(Byte22)
-
-	movzbl	-9(%rdi), %eax
-	movzbl	-9(%rsi), %edx
-	sub	%edx, %eax
-	ret
-# else
-/* special for wmemcmp */
-	xor	%eax, %eax
-	test	%dl, %dl
-	jz	L(next_two_double_words)
-	and	$15, %dl
-	jz	L(second_double_word)
-	mov	-16(%rdi), %eax
-	cmp	-16(%rsi), %eax
-	jne	L(find_diff)
-	ret
-
-	.p2align 4
-L(second_double_word):
-	mov	-12(%rdi), %eax
-	cmp	-12(%rsi), %eax
-	jne	L(find_diff)
-	ret
-
-	.p2align 4
-L(next_two_double_words):
-	and	$15, %dh
-	jz	L(fourth_double_word)
-	mov	-8(%rdi), %eax
-	cmp	-8(%rsi), %eax
-	jne	L(find_diff)
-	ret
-
-	.p2align 4
-L(fourth_double_word):
-	mov	-4(%rdi), %eax
-	cmp	-4(%rsi), %eax
-	jne	L(find_diff)
-	ret
-# endif
-
-	.p2align 4
-L(less48bytes):
-	cmp	$8, %ecx
-	jae	L(more8bytes)
-	cmp	$0, %ecx
-	je	L(0bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$1, %ecx
-	je	L(1bytes)
-	cmp	$2, %ecx
-	je	L(2bytes)
-	cmp	$3, %ecx
-	je	L(3bytes)
-	cmp	$4, %ecx
-	je	L(4bytes)
-	cmp	$5, %ecx
-	je	L(5bytes)
-	cmp	$6, %ecx
-	je	L(6bytes)
-	jmp	L(7bytes)
-# else
-	jmp	L(4bytes)
-# endif
-
-	.p2align 4
-L(more8bytes):
-	cmp	$16, %ecx
-	jae	L(more16bytes)
-	cmp	$8, %ecx
-	je	L(8bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$9, %ecx
-	je	L(9bytes)
-	cmp	$10, %ecx
-	je	L(10bytes)
-	cmp	$11, %ecx
-	je	L(11bytes)
-	cmp	$12, %ecx
-	je	L(12bytes)
-	cmp	$13, %ecx
-	je	L(13bytes)
-	cmp	$14, %ecx
-	je	L(14bytes)
-	jmp	L(15bytes)
-# else
-	jmp	L(12bytes)
-# endif
-
-	.p2align 4
-L(more16bytes):
-	cmp	$24, %ecx
-	jae	L(more24bytes)
-	cmp	$16, %ecx
-	je	L(16bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$17, %ecx
-	je	L(17bytes)
-	cmp	$18, %ecx
-	je	L(18bytes)
-	cmp	$19, %ecx
-	je	L(19bytes)
-	cmp	$20, %ecx
-	je	L(20bytes)
-	cmp	$21, %ecx
-	je	L(21bytes)
-	cmp	$22, %ecx
-	je	L(22bytes)
-	jmp	L(23bytes)
-# else
-	jmp	L(20bytes)
-# endif
-
-	.p2align 4
-L(more24bytes):
-	cmp	$32, %ecx
-	jae	L(more32bytes)
-	cmp	$24, %ecx
-	je	L(24bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$25, %ecx
-	je	L(25bytes)
-	cmp	$26, %ecx
-	je	L(26bytes)
-	cmp	$27, %ecx
-	je	L(27bytes)
-	cmp	$28, %ecx
-	je	L(28bytes)
-	cmp	$29, %ecx
-	je	L(29bytes)
-	cmp	$30, %ecx
-	je	L(30bytes)
-	jmp	L(31bytes)
-# else
-	jmp	L(28bytes)
-# endif
-
-	.p2align 4
-L(more32bytes):
-	cmp	$40, %ecx
-	jae	L(more40bytes)
-	cmp	$32, %ecx
-	je	L(32bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$33, %ecx
-	je	L(33bytes)
-	cmp	$34, %ecx
-	je	L(34bytes)
-	cmp	$35, %ecx
-	je	L(35bytes)
-	cmp	$36, %ecx
-	je	L(36bytes)
-	cmp	$37, %ecx
-	je	L(37bytes)
-	cmp	$38, %ecx
-	je	L(38bytes)
-	jmp	L(39bytes)
-# else
-	jmp	L(36bytes)
-# endif
-
-	.p2align 4
-L(more40bytes):
-	cmp	$40, %ecx
-	je	L(40bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$41, %ecx
-	je	L(41bytes)
-	cmp	$42, %ecx
-	je	L(42bytes)
-	cmp	$43, %ecx
-	je	L(43bytes)
-	cmp	$44, %ecx
-	je	L(44bytes)
-	cmp	$45, %ecx
-	je	L(45bytes)
-	cmp	$46, %ecx
-	je	L(46bytes)
-	jmp	L(47bytes)
-
-	.p2align 4
-L(44bytes):
-	movl	-44(%rdi), %eax
-	movl	-44(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(40bytes):
-	movl	-40(%rdi), %eax
-	movl	-40(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(36bytes):
-	movl	-36(%rdi), %eax
-	movl	-36(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(32bytes):
-	movl	-32(%rdi), %eax
-	movl	-32(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(28bytes):
-	movl	-28(%rdi), %eax
-	movl	-28(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(24bytes):
-	movl	-24(%rdi), %eax
-	movl	-24(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(20bytes):
-	movl	-20(%rdi), %eax
-	movl	-20(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(16bytes):
-	movl	-16(%rdi), %eax
-	movl	-16(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(12bytes):
-	movl	-12(%rdi), %eax
-	movl	-12(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(8bytes):
-	movl	-8(%rdi), %eax
-	movl	-8(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(4bytes):
-	movl	-4(%rdi), %eax
-	movl	-4(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(0bytes):
-	xor	%eax, %eax
-	ret
-# else
-	.p2align 4
-L(44bytes):
-	movl	-44(%rdi), %eax
-	cmp	-44(%rsi), %eax
-	jne	L(find_diff)
-L(40bytes):
-	movl	-40(%rdi), %eax
-	cmp	-40(%rsi), %eax
-	jne	L(find_diff)
-L(36bytes):
-	movl	-36(%rdi), %eax
-	cmp	-36(%rsi), %eax
-	jne	L(find_diff)
-L(32bytes):
-	movl	-32(%rdi), %eax
-	cmp	-32(%rsi), %eax
-	jne	L(find_diff)
-L(28bytes):
-	movl	-28(%rdi), %eax
-	cmp	-28(%rsi), %eax
-	jne	L(find_diff)
-L(24bytes):
-	movl	-24(%rdi), %eax
-	cmp	-24(%rsi), %eax
-	jne	L(find_diff)
-L(20bytes):
-	movl	-20(%rdi), %eax
-	cmp	-20(%rsi), %eax
-	jne	L(find_diff)
-L(16bytes):
-	movl	-16(%rdi), %eax
-	cmp	-16(%rsi), %eax
-	jne	L(find_diff)
-L(12bytes):
-	movl	-12(%rdi), %eax
-	cmp	-12(%rsi), %eax
-	jne	L(find_diff)
-L(8bytes):
-	movl	-8(%rdi), %eax
-	cmp	-8(%rsi), %eax
-	jne	L(find_diff)
-L(4bytes):
-	movl	-4(%rdi), %eax
-	cmp	-4(%rsi), %eax
-	jne	L(find_diff)
-L(0bytes):
-	xor	%eax, %eax
-	ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
-	.p2align 4
-L(45bytes):
-	movl	-45(%rdi), %eax
-	movl	-45(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(41bytes):
-	movl	-41(%rdi), %eax
-	movl	-41(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(37bytes):
-	movl	-37(%rdi), %eax
-	movl	-37(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(33bytes):
-	movl	-33(%rdi), %eax
-	movl	-33(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(29bytes):
-	movl	-29(%rdi), %eax
-	movl	-29(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(25bytes):
-	movl	-25(%rdi), %eax
-	movl	-25(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(21bytes):
-	movl	-21(%rdi), %eax
-	movl	-21(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(17bytes):
-	movl	-17(%rdi), %eax
-	movl	-17(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(13bytes):
-	movl	-13(%rdi), %eax
-	movl	-13(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(9bytes):
-	movl	-9(%rdi), %eax
-	movl	-9(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(5bytes):
-	movl	-5(%rdi), %eax
-	movl	-5(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(1bytes):
-	movzbl	-1(%rdi), %eax
-	cmpb	-1(%rsi), %al
-	jne	L(set)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(46bytes):
-	movl	-46(%rdi), %eax
-	movl	-46(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(42bytes):
-	movl	-42(%rdi), %eax
-	movl	-42(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(38bytes):
-	movl	-38(%rdi), %eax
-	movl	-38(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(34bytes):
-	movl	-34(%rdi), %eax
-	movl	-34(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(30bytes):
-	movl	-30(%rdi), %eax
-	movl	-30(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(26bytes):
-	movl	-26(%rdi), %eax
-	movl	-26(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(22bytes):
-	movl	-22(%rdi), %eax
-	movl	-22(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(18bytes):
-	movl	-18(%rdi), %eax
-	movl	-18(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(14bytes):
-	movl	-14(%rdi), %eax
-	movl	-14(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(10bytes):
-	movl	-10(%rdi), %eax
-	movl	-10(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(6bytes):
-	movl	-6(%rdi), %eax
-	movl	-6(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(2bytes):
-	movzwl	-2(%rdi), %eax
-	movzwl	-2(%rsi), %ecx
-	cmpb	%cl, %al
-	jne	L(set)
-	cmp	%ecx, %eax
-	jne	L(set)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(47bytes):
-	movl	-47(%rdi), %eax
-	movl	-47(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(43bytes):
-	movl	-43(%rdi), %eax
-	movl	-43(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(39bytes):
-	movl	-39(%rdi), %eax
-	movl	-39(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(35bytes):
-	movl	-35(%rdi), %eax
-	movl	-35(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(31bytes):
-	movl	-31(%rdi), %eax
-	movl	-31(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(27bytes):
-	movl	-27(%rdi), %eax
-	movl	-27(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(23bytes):
-	movl	-23(%rdi), %eax
-	movl	-23(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(19bytes):
-	movl	-19(%rdi), %eax
-	movl	-19(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(15bytes):
-	movl	-15(%rdi), %eax
-	movl	-15(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(11bytes):
-	movl	-11(%rdi), %eax
-	movl	-11(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(7bytes):
-	movl	-7(%rdi), %eax
-	movl	-7(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(3bytes):
-	movzwl	-3(%rdi), %eax
-	movzwl	-3(%rsi), %ecx
-	cmpb	%cl, %al
-	jne	L(set)
-	cmp	%ecx, %eax
-	jne	L(set)
-	movzbl	-1(%rdi), %eax
-	cmpb	-1(%rsi), %al
-	jne	L(set)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(find_diff):
-	cmpb	%cl, %al
-	jne	L(set)
-	cmpw	%cx, %ax
-	jne	L(set)
-	shr	$16, %eax
-	shr	$16, %ecx
-	cmpb	%cl, %al
-	jne	L(set)
-
-/* We get there only if we already know there is a
-difference.  */
-
-	cmp	%ecx, %eax
-L(set):
-	sbb	%eax, %eax
-	sbb	$-1, %eax
-	ret
-# else
-
-/* for wmemcmp */
-	.p2align 4
-L(find_diff):
-	mov	$1, %eax
-	jg	L(find_diff_bigger)
-	neg	%eax
-	ret
-
-	.p2align 4
-L(find_diff_bigger):
-	ret
-# endif
-
-	.p2align 4
-L(equal):
-	xor	%eax, %eax
-	ret
-
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
deleted file mode 100644
index a41ef95fc1..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"

From patchwork Fri Mar 25 20:44:45 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Noah Goldstein <goldstein.w.n@gmail.com>
X-Patchwork-Id: 52370
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id BF893385E451
	for <patchwork@sourceware.org>; Fri, 25 Mar 2022 20:45:23 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org BF893385E451
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1648241123;
	bh=taR4ackzeNtqsLwmyL00S9EUcc3sJFCjBhurh+RypRw=;
	h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe:
	 List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:
	 From;
	b=sET/QN+22SxZQaRSSFsMWON3QnlfY//kxDGOytEEEjXCPPsoszdc5xwpEU9mcuxF9
	 3zDazhsTRgTSHtcWjCN79WXlSAsG62VMQ0EJ11JZO+859+Up1W1Wj/HmSSrAun8gg4
	 kDW3PBI9JxWi8agTp/LuqvWTTHn5gMTOy9biMzbo=
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mail-io1-xd31.google.com (mail-io1-xd31.google.com
 [IPv6:2607:f8b0:4864:20::d31])
 by sourceware.org (Postfix) with ESMTPS id 7132A3857034
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 20:45:01 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 7132A3857034
Received: by mail-io1-xd31.google.com with SMTP id 9so7111371iou.5
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 13:45:01 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20210112;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references:mime-version:content-transfer-encoding;
 bh=taR4ackzeNtqsLwmyL00S9EUcc3sJFCjBhurh+RypRw=;
 b=HzLP/EbaaQMiVzKikLHwgoqu30EsK/IbeX2aoiWWijOkXzSdpHcnCyKD+tXngocUdk
 b4KbuqL46dN65wVpv3UOXe79NnttFvShtllhELJXKqgHwXbWlxB3PVk6bwoJSwdBe34H
 eG1v6h8sdrFJk5NCTMmaFa0jaijne+7dxBf3aJGQuhnGka+Gky03JFS4EfYs/FJW87qK
 mqkNmXtM+VEvrU9SsgI4V0Fea6cNKBecnYI+HAk6+1K+zqlm5Vg+q4v87Mmm+wl1DWU3
 5C+zdEXudXfI4Ts0/HAr7O0qFti2FA0aUPnAfmLOwiOIMtAtjrNfjbHmmL44IWm32o6b
 sgaA==
X-Gm-Message-State: AOAM530kzOLwWvqKrhnRRzsGNfPprJXFey/JtPz3+ksOBxZHMsZJB+hl
 MUb9FtslTW24+rvqLogvys6mJG/vZ/4=
X-Google-Smtp-Source: 
 ABdhPJxucb3n5ZLFJM0ydrXHedVH4lBizG1oOWhd+VqI6jlYXU/hxFFCY+1WBY4dhv8+UdG87p9slg==
X-Received: by 2002:a5d:9c0f:0:b0:645:bc04:fd5 with SMTP id
 15-20020a5d9c0f000000b00645bc040fd5mr403670ioe.28.1648241100291;
 Fri, 25 Mar 2022 13:45:00 -0700 (PDT)
Received: from localhost.localdomain (node-17-161.flex.volo.net.
 [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id
 a3-20020a5ec303000000b006496b4dd21csm3382118iok.5.2022.03.25.13.44.59
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Fri, 25 Mar 2022 13:44:59 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3
Date: Fri, 25 Mar 2022 15:44:45 -0500
Message-Id: <20220325204449.1284533-2-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20220325204449.1284533-1-goldstein.w.n@gmail.com>
References: <20220325183625.1170867-2-goldstein.w.n@gmail.com>
 <20220325204449.1284533-1-goldstein.w.n@gmail.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-12.2 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS,
 TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
X-Patchwork-Original-From: Noah Goldstein via Libc-alpha
 <libc-alpha@sourceware.org>
From: Noah Goldstein <goldstein.w.n@gmail.com>
Reply-To: Noah Goldstein <goldstein.w.n@gmail.com>
Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org
Sender: "Libc-alpha"
 <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>

With SSE2, SSE4.2, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given their code size cost.
---
 sysdeps/x86_64/multiarch/Makefile             |   4 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  16 --
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h   |   4 -
 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S |   6 -
 sysdeps/x86_64/multiarch/strcmp-ssse3.S       |   5 -
 sysdeps/x86_64/multiarch/strcmp.c             |   4 -
 sysdeps/x86_64/multiarch/strncase_l-ssse3.S   |   6 -
 sysdeps/x86_64/multiarch/strncmp-ssse3.S      |  28 ----
 sysdeps/x86_64/multiarch/strncmp.c            |   4 -
 sysdeps/x86_64/strcmp.S                       | 155 ++++--------------
 10 files changed, 30 insertions(+), 202 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 51222dfab1..ed2def288d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -58,7 +58,6 @@ sysdep_routines += \
   strcasecmp_l-evex \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
-  strcasecmp_l-ssse3 \
   strcat-avx2 \
   strcat-avx2-rtm \
   strcat-evex \
@@ -80,7 +79,6 @@ sysdep_routines += \
   strcmp-sse2 \
   strcmp-sse2-unaligned \
   strcmp-sse4_2 \
-  strcmp-ssse3 \
   strcpy-avx2 \
   strcpy-avx2-rtm \
   strcpy-evex \
@@ -98,7 +96,6 @@ sysdep_routines += \
   strncase_l-evex \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
-  strncase_l-ssse3 \
   strncat-avx2 \
   strncat-avx2-rtm \
   strncat-c \
@@ -110,7 +107,6 @@ sysdep_routines += \
   strncmp-evex \
   strncmp-sse2 \
   strncmp-sse4_2 \
-  strncmp-ssse3 \
   strncpy-avx2 \
   strncpy-avx2-rtm \
   strncpy-c \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f389928a4e..7e2be3554b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strcasecmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __strcasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
@@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strcasecmp_l_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __strcasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
 			      __strcasecmp_l_sse2))
 
@@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strcmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
 			      __strcmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
-			      __strcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
 
@@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strncasecmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __strncasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
 			      __strncasecmp_sse2))
 
@@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 			      CPU_FEATURE_USABLE (SSE4_2),
 			      __strncasecmp_l_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __strncasecmp_l_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
 			      __strncasecmp_l_sse2))
 
@@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strncmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
 			      __strncmp_sse42)
-	      IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
-			      __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
 
 #ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 766539c241..296d32071b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -20,7 +20,6 @@
 #include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -49,8 +48,5 @@ IFUNC_SELECTOR (void)
       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
     return OPTIMIZE (sse42);
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
-    return OPTIMIZE (ssse3);
-
   return OPTIMIZE (sse2);
 }
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
deleted file mode 100644
index fb2f9ae14a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strcasecmp_l_ssse3
-#define __strcasecmp __strcasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
deleted file mode 100644
index 1b7fa33c91..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define USE_SSSE3 1
-# define STRCMP __strcmp_ssse3
-# include "../strcmp.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 68cb73baad..a248c2a6e6 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -28,7 +28,6 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -56,9 +55,6 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
-    return OPTIMIZE (ssse3);
-
   return OPTIMIZE (sse2);
 }
 
diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
deleted file mode 100644
index 6728678688..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRNCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strncasecmp_l_ssse3
-#define __strncasecmp __strncasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
deleted file mode 100644
index ec37308347..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/* strcmp optimized with SSSE3.
-   Copyright (C) 2017-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#define STRCMP __strncmp_ssse3
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(strcmp)
-
-#define USE_SSSE3 1
-#define USE_AS_STRNCMP
-#include <sysdeps/x86_64/strcmp.S>
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index fca74199d8..70ae6547c9 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -27,7 +27,6 @@
 # include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -57,9 +56,6 @@ IFUNC_SELECTOR (void)
       && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
     return OPTIMIZE (sse42);
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
-    return OPTIMIZE (ssse3);
-
   return OPTIMIZE (sse2);
 }
 
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 99d8b36f1d..c38dc627f9 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -59,12 +59,7 @@
 # endif
 #endif
 
-#ifndef USE_SSSE3
 	.text
-#else
-	.section .text.ssse3,"ax",@progbits
-#endif
-
 #ifdef USE_AS_STRCASECMP_L
 # ifndef ENTRY2
 #  define ENTRY2(name) ENTRY (name)
@@ -343,13 +338,10 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
-#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -373,13 +365,10 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
-#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -473,13 +462,10 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -504,13 +490,10 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -598,13 +581,10 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -629,13 +609,10 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -723,13 +700,10 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -754,13 +728,10 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -848,13 +819,10 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -879,13 +847,10 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -973,13 +938,10 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-#else
-	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-#endif
+
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0

From patchwork Fri Mar 25 20:44:46 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Noah Goldstein <goldstein.w.n@gmail.com>
X-Patchwork-Id: 52373
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id 2347F3840C05
	for <patchwork@sourceware.org>; Fri, 25 Mar 2022 20:47:40 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 2347F3840C05
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1648241260;
	bh=twaMs+vmjjG4jUR2v2sobiFdwYiV2jVcPwHkJKk09Z4=;
	h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe:
	 List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:
	 From;
	b=FZRGaRDFG/cvtsAMYvHXIQVuLt/Lq7L3xPXOn1UdOfG4YKgUoOZMjGK2hY7pPRpXv
	 I1+oioI5d9FKdLQMZA1zoaWfx75gqL3fm7kUxidV13WMFare2O+kRnDVsIAwI3AFNa
	 9ShfTO5o6Q/vDpfIWkvLPnAnAX2NJR951rXX4xPo=
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mail-il1-x12f.google.com (mail-il1-x12f.google.com
 [IPv6:2607:f8b0:4864:20::12f])
 by sourceware.org (Postfix) with ESMTPS id 60B35385E44F
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 20:45:04 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 60B35385E44F
Received: by mail-il1-x12f.google.com with SMTP id 14so1301495ily.11
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 13:45:04 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20210112;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references:mime-version:content-transfer-encoding;
 bh=twaMs+vmjjG4jUR2v2sobiFdwYiV2jVcPwHkJKk09Z4=;
 b=XxB49zr8IIMFN7SdA/NY4j3fOMPvmMQrWOtsvDIdORxGDm3wDTNk6bl1dO4D4sBLyH
 l+NCAu+KlaELAfTZmYEw5J0gFYmldNm3+1PizkRSk5LaXwqNOLzXMY++jhcShOYOqghU
 58mhhKqZKa7LLICcOdnljUtIiyoQKd9nv0HNLXuHY3P7MhyVbPOxJkuUzgrCW0Un1tyA
 cTw8aDF1cVrxhaDcmVd+L2yauCa+GqzkIxIj1Zkx0vSE1aA7wzB/wzh+aTzLR8F+Ohw9
 L4QoDFbwE/9o173u0WAEVnyHxS84U8sxARSg4xVMtfBjWlw5OTAuKLDtLRx/6c5pG9ps
 HuOw==
X-Gm-Message-State: AOAM532P5HZ86MAwlKJP9PLI17TWYizsBY/nyltkSL/y5IDpbww2SlwX
 dj0BqD+ctpEclgEBF7EVTqWPzucRKlg=
X-Google-Smtp-Source: 
 ABdhPJzfaZNQHc7NvFz9/it/uft5yO77gd3pnxzfLAveqxshfXZpsN5V46IRA/cOpCBtDuj5HeRHqg==
X-Received: by 2002:a05:6e02:1a4f:b0:2c6:6499:9d1b with SMTP id
 u15-20020a056e021a4f00b002c664999d1bmr290421ilv.119.1648241101601;
 Fri, 25 Mar 2022 13:45:01 -0700 (PDT)
Received: from localhost.localdomain (node-17-161.flex.volo.net.
 [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id
 a3-20020a5ec303000000b006496b4dd21csm3382118iok.5.2022.03.25.13.45.00
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Fri, 25 Mar 2022 13:45:01 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3
Date: Fri, 25 Mar 2022 15:44:46 -0500
Message-Id: <20220325204449.1284533-3-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20220325204449.1284533-1-goldstein.w.n@gmail.com>
References: <20220325183625.1170867-2-goldstein.w.n@gmail.com>
 <20220325204449.1284533-1-goldstein.w.n@gmail.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-10.2 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_10_SHORT_WORD_LINES,
 SCC_20_SHORT_WORD_LINES, SCC_35_SHORT_WORD_LINES, SCC_5_SHORT_WORD_LINES,
 SPF_HELO_NONE, SPF_PASS, TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
X-Patchwork-Original-From: Noah Goldstein via Libc-alpha
 <libc-alpha@sourceware.org>
From: Noah Goldstein <goldstein.w.n@gmail.com>
Reply-To: Noah Goldstein <goldstein.w.n@gmail.com>
Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org
Sender: "Libc-alpha"
 <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>

With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given their code size cost.
---
 sysdeps/x86_64/multiarch/Makefile          |    2 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   15 -
 sysdeps/x86_64/multiarch/ifunc-memmove.h   |   18 +-
 sysdeps/x86_64/multiarch/memcpy-ssse3.S    | 3151 --------------------
 sysdeps/x86_64/multiarch/memmove-ssse3.S   |    4 -
 5 files changed, 7 insertions(+), 3183 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ed2def288d..48f81711ae 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
   memcmpeq-avx2-rtm \
   memcmpeq-evex \
   memcmpeq-sse2 \
-  memcpy-ssse3 \
   memcpy-ssse3-back \
   memmove-avx-unaligned-erms \
   memmove-avx-unaligned-erms-rtm \
@@ -24,7 +23,6 @@ sysdep_routines += \
   memmove-avx512-unaligned-erms \
   memmove-evex-unaligned-erms \
   memmove-sse2-unaligned-erms \
-  memmove-ssse3 \
   memmove-ssse3-back \
   memrchr-avx2 \
   memrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7e2be3554b..70b0e9c62e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -135,9 +135,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __memmove_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __memmove_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
@@ -179,8 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
 			      __memmove_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
-			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1,
 			      __memmove_sse2_unaligned)
@@ -887,9 +882,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __memcpy_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
@@ -922,8 +914,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __memcpy_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
-			      __memcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memcpy_avx512_no_vzeroupper)
@@ -973,9 +963,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
@@ -1017,8 +1004,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __mempcpy_ssse3_back)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
-			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
 			      __mempcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index f8f958064c..1ecdd4b0d3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -24,8 +24,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
   attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
   attribute_hidden;
@@ -94,17 +93,19 @@ IFUNC_SELECTOR (void)
 	}
     }
 
-  if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
-      || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+  /* The plain SSSE3 variant has been removed.  Use the SSSE3 backward
+     copy variant only when SSSE3 is usable, Fast_Unaligned_Copy is not
+     set and Fast_Copy_Backward is set; otherwise fall through to the
+     SSE2 unaligned variants below.  */
+  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+      && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
     {
-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-	return OPTIMIZE (sse2_unaligned_erms);
-
-      return OPTIMIZE (sse2_unaligned);
+      if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
+	return OPTIMIZE (ssse3_back);
     }
 
-  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
-    return OPTIMIZE (ssse3_back);
+  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+    return OPTIMIZE (sse2_unaligned_erms);
 
-  return OPTIMIZE (ssse3);
+  return OPTIMIZE (sse2_unaligned);
 }
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
-   Copyright (C) 2010-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3
-# define MEMCPY_CHK	__memcpy_chk_ssse3
-# define MEMPCPY	__mempcpy_ssse3
-# define MEMPCPY_CHK	__mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B)	I - B
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   relative offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX.  */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-  lea		TABLE(%rip), %r11;				\
-  movslq	(%r11, INDEX, SCALE), INDEX;			\
-  lea		(%r11, INDEX), INDEX;				\
-  _CET_NOTRACK jmp *INDEX;					\
-  ud2
-
-	.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
-	mov	%RDI_LP, %RAX_LP
-	add	%RDX_LP, %RAX_LP
-	jmp	L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
-	mov	%RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
-	add	%RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
-	cmp	%rsi, %rdi
-	jb	L(copy_forward)
-	je	L(write_0bytes)
-	cmp	$79, %rdx
-	jbe	L(copy_forward)
-	jmp	L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
-	cmp	$79, %rdx
-	lea     L(table_less_80bytes)(%rip), %r11
-	ja	L(80bytesormore)
-	movslq	(%r11, %rdx, 4), %r9
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	add	%r11, %r9
-	_CET_NOTRACK jmp *%r9
-	ud2
-
-	.p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
-	cmp	%dil, %sil
-	jle	L(copy_backward)
-#endif
-
-	movdqu	(%rsi), %xmm0
-	mov	%rdi, %rcx
-	and	$-16, %rdi
-	add	$16, %rdi
-	mov	%rcx, %r8
-	sub	%rdi, %rcx
-	add	%rcx, %rdx
-	sub	%rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-	cmp	%rcx, %rdx
-	mov	%rsi, %r9
-	ja	L(large_page_fwd)
-	and	$0xf, %r9
-	jz	L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_data_cache_size_half(%rip), %RCX_LP
-#endif
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
-	.p2align 4
-L(copy_backward):
-	movdqu	-16(%rsi, %rdx), %xmm0
-	add	%rdx, %rsi
-	lea	-16(%rdi, %rdx), %r8
-	add	%rdx, %rdi
-
-	mov	%rdi, %rcx
-	and	$0xf, %rcx
-	xor	%rcx, %rdi
-	sub	%rcx, %rdx
-	sub	%rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
-	cmp	%rcx, %rdx
-	mov	%rsi, %r9
-	ja	L(large_page_bwd)
-	and	$0xf, %r9
-	jz	L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_data_cache_size_half(%rip), %RCX_LP
-#endif
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
-	.p2align 4
-L(shl_0):
-	sub	$16, %rdx
-	movdqa	(%rsi), %xmm1
-	add	$16, %rsi
-	movdqa	%xmm1, (%rdi)
-	add	$16, %rdi
-	cmp	$128, %rdx
-	movdqu	%xmm0, (%r8)
-	ja	L(shl_0_gobble)
-	cmp	$64, %rdx
-	jb	L(shl_0_less_64bytes)
-	movaps	(%rsi), %xmm4
-	movaps	16(%rsi), %xmm1
-	movaps	32(%rsi), %xmm2
-	movaps	48(%rsi), %xmm3
-	movaps	%xmm4, (%rdi)
-	movaps	%xmm1, 16(%rdi)
-	movaps	%xmm2, 32(%rdi)
-	movaps	%xmm3, 48(%rdi)
-	sub	$64, %rdx
-	add	$64, %rsi
-	add	$64, %rdi
-L(shl_0_less_64bytes):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
-	cmp	__x86_data_cache_size_half(%rip), %RDX_LP
-#endif
-	lea	-128(%rdx), %rdx
-	jae	L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
-	movdqa	(%rsi), %xmm4
-	movaps	0x10(%rsi), %xmm1
-	movaps	0x20(%rsi), %xmm2
-	movaps	0x30(%rsi), %xmm3
-
-	movdqa	%xmm4, (%rdi)
-	movaps	%xmm1, 0x10(%rdi)
-	movaps	%xmm2, 0x20(%rdi)
-	movaps	%xmm3, 0x30(%rdi)
-
-	sub	$128, %rdx
-	movaps	0x40(%rsi), %xmm4
-	movaps	0x50(%rsi), %xmm5
-	movaps	0x60(%rsi), %xmm6
-	movaps	0x70(%rsi), %xmm7
-	lea	0x80(%rsi), %rsi
-	movaps	%xmm4, 0x40(%rdi)
-	movaps	%xmm5, 0x50(%rdi)
-	movaps	%xmm6, 0x60(%rdi)
-	movaps	%xmm7, 0x70(%rdi)
-	lea	0x80(%rdi), %rdi
-
-	jae	L(shl_0_gobble_cache_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(shl_0_cache_less_64bytes)
-
-	movdqa	(%rsi), %xmm4
-	sub	$0x40, %rdx
-	movdqa	0x10(%rsi), %xmm1
-
-	movdqa	%xmm4, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-
-	movdqa	0x20(%rsi), %xmm4
-	movdqa	0x30(%rsi), %xmm1
-	add	$0x40, %rsi
-
-	movdqa	%xmm4, 0x20(%rdi)
-	movdqa	%xmm1, 0x30(%rdi)
-	add	$0x40, %rdi
-L(shl_0_cache_less_64bytes):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_gobble_mem_loop):
-	prefetcht0 0x1c0(%rsi)
-	prefetcht0 0x280(%rsi)
-
-	movdqa	(%rsi), %xmm0
-	movdqa	0x10(%rsi), %xmm1
-	movdqa	0x20(%rsi), %xmm2
-	movdqa	0x30(%rsi), %xmm3
-	movdqa	0x40(%rsi), %xmm4
-	movdqa	0x50(%rsi), %xmm5
-	movdqa	0x60(%rsi), %xmm6
-	movdqa	0x70(%rsi), %xmm7
-	lea	0x80(%rsi), %rsi
-	sub	$0x80, %rdx
-	movdqa	%xmm0, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-	movdqa	%xmm2, 0x20(%rdi)
-	movdqa	%xmm3, 0x30(%rdi)
-	movdqa	%xmm4, 0x40(%rdi)
-	movdqa	%xmm5, 0x50(%rdi)
-	movdqa	%xmm6, 0x60(%rdi)
-	movdqa	%xmm7, 0x70(%rdi)
-	lea	0x80(%rdi), %rdi
-
-	jae	L(shl_0_gobble_mem_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(shl_0_mem_less_64bytes)
-
-	movdqa	(%rsi), %xmm0
-	sub	$0x40, %rdx
-	movdqa	0x10(%rsi), %xmm1
-
-	movdqa	%xmm0, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-
-	movdqa	0x20(%rsi), %xmm0
-	movdqa	0x30(%rsi), %xmm1
-	add	$0x40, %rsi
-
-	movdqa	%xmm0, 0x20(%rdi)
-	movdqa	%xmm1, 0x30(%rdi)
-	add	$0x40, %rdi
-L(shl_0_mem_less_64bytes):
-	cmp	$0x20, %rdx
-	jb	L(shl_0_mem_less_32bytes)
-	movdqa	(%rsi), %xmm0
-	sub	$0x20, %rdx
-	movdqa	0x10(%rsi), %xmm1
-	add	$0x20, %rsi
-	movdqa	%xmm0, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-	add	$0x20, %rdi
-L(shl_0_mem_less_32bytes):
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_bwd):
-	sub	$16, %rdx
-	movdqa	-0x10(%rsi), %xmm1
-	sub	$16, %rsi
-	movdqa	%xmm1, -0x10(%rdi)
-	sub	$16, %rdi
-	cmp	$0x80, %rdx
-	movdqu	%xmm0, (%r8)
-	ja	L(shl_0_gobble_bwd)
-	cmp	$64, %rdx
-	jb	L(shl_0_less_64bytes_bwd)
-	movaps	-0x10(%rsi), %xmm0
-	movaps	-0x20(%rsi), %xmm1
-	movaps	-0x30(%rsi), %xmm2
-	movaps	-0x40(%rsi), %xmm3
-	movaps	%xmm0, -0x10(%rdi)
-	movaps	%xmm1, -0x20(%rdi)
-	movaps	%xmm2, -0x30(%rdi)
-	movaps	%xmm3, -0x40(%rdi)
-	sub	$64, %rdx
-	sub	$0x40, %rsi
-	sub	$0x40, %rdi
-L(shl_0_less_64bytes_bwd):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
-	cmp	__x86_data_cache_size_half(%rip), %RDX_LP
-#endif
-	lea	-128(%rdx), %rdx
-	jae	L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
-	movdqa	-0x10(%rsi), %xmm0
-	movaps	-0x20(%rsi), %xmm1
-	movaps	-0x30(%rsi), %xmm2
-	movaps	-0x40(%rsi), %xmm3
-
-	movdqa	%xmm0, -0x10(%rdi)
-	movaps	%xmm1, -0x20(%rdi)
-	movaps	%xmm2, -0x30(%rdi)
-	movaps	%xmm3, -0x40(%rdi)
-
-	sub	$0x80, %rdx
-	movaps	-0x50(%rsi), %xmm4
-	movaps	-0x60(%rsi), %xmm5
-	movaps	-0x70(%rsi), %xmm6
-	movaps	-0x80(%rsi), %xmm7
-	lea	-0x80(%rsi), %rsi
-	movaps	%xmm4, -0x50(%rdi)
-	movaps	%xmm5, -0x60(%rdi)
-	movaps	%xmm6, -0x70(%rdi)
-	movaps	%xmm7, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-
-	jae	L(shl_0_gobble_bwd_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(shl_0_gobble_bwd_less_64bytes)
-
-	movdqa	-0x10(%rsi), %xmm0
-	sub	$0x40, %rdx
-	movdqa	-0x20(%rsi), %xmm1
-
-	movdqa	%xmm0, -0x10(%rdi)
-	movdqa	%xmm1, -0x20(%rdi)
-
-	movdqa	-0x30(%rsi), %xmm0
-	movdqa	-0x40(%rsi), %xmm1
-	sub	$0x40, %rsi
-
-	movdqa	%xmm0, -0x30(%rdi)
-	movdqa	%xmm1, -0x40(%rdi)
-	sub	$0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_gobble_mem_bwd_loop):
-	prefetcht0 -0x1c0(%rsi)
-	prefetcht0 -0x280(%rsi)
-	movdqa	-0x10(%rsi), %xmm0
-	movdqa	-0x20(%rsi), %xmm1
-	movdqa	-0x30(%rsi), %xmm2
-	movdqa	-0x40(%rsi), %xmm3
-	movdqa	-0x50(%rsi), %xmm4
-	movdqa	-0x60(%rsi), %xmm5
-	movdqa	-0x70(%rsi), %xmm6
-	movdqa	-0x80(%rsi), %xmm7
-	lea	-0x80(%rsi), %rsi
-	sub	$0x80, %rdx
-	movdqa	%xmm0, -0x10(%rdi)
-	movdqa	%xmm1, -0x20(%rdi)
-	movdqa	%xmm2, -0x30(%rdi)
-	movdqa	%xmm3, -0x40(%rdi)
-	movdqa	%xmm4, -0x50(%rdi)
-	movdqa	%xmm5, -0x60(%rdi)
-	movdqa	%xmm6, -0x70(%rdi)
-	movdqa	%xmm7, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-
-	jae	L(shl_0_gobble_mem_bwd_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(shl_0_mem_bwd_less_64bytes)
-
-	movdqa	-0x10(%rsi), %xmm0
-	sub	$0x40, %rdx
-	movdqa	-0x20(%rsi), %xmm1
-
-	movdqa	%xmm0, -0x10(%rdi)
-	movdqa	%xmm1, -0x20(%rdi)
-
-	movdqa	-0x30(%rsi), %xmm0
-	movdqa	-0x40(%rsi), %xmm1
-	sub	$0x40, %rsi
-
-	movdqa	%xmm0, -0x30(%rdi)
-	movdqa	%xmm1, -0x40(%rdi)
-	sub	$0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
-	cmp	$0x20, %rdx
-	jb	L(shl_0_mem_bwd_less_32bytes)
-	movdqa	-0x10(%rsi), %xmm0
-	sub	$0x20, %rdx
-	movdqa	-0x20(%rsi), %xmm1
-	sub	$0x20, %rsi
-	movdqa	%xmm0, -0x10(%rdi)
-	movdqa	%xmm1, -0x20(%rdi)
-	sub	$0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_1):
-	lea	(L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x01(%rsi), %xmm1
-	jb	L(L1_fwd)
-	lea	(L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_1_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0f(%rsi), %xmm2
-	movaps	0x1f(%rsi), %xmm3
-	movaps	0x2f(%rsi), %xmm4
-	movaps	0x3f(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$1, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$1, %xmm3, %xmm4
-	palignr	$1, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$1, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_1_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_1_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_1_bwd):
-	lea	(L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x01(%rsi), %xmm1
-	jb	L(L1_bwd)
-	lea	(L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_1_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
-	movaps	-0x11(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x21(%rsi), %xmm3
-	movaps	-0x31(%rsi), %xmm4
-	movaps	-0x41(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$1, %xmm2, %xmm1
-	palignr	$1, %xmm3, %xmm2
-	palignr	$1, %xmm4, %xmm3
-	palignr	$1, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_1_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_1_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_2):
-	lea	(L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x02(%rsi), %xmm1
-	jb	L(L2_fwd)
-	lea	(L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_2_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0e(%rsi), %xmm2
-	movaps	0x1e(%rsi), %xmm3
-	movaps	0x2e(%rsi), %xmm4
-	movaps	0x3e(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$2, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$2, %xmm3, %xmm4
-	palignr	$2, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$2, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_2_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_2_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_2_bwd):
-	lea	(L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x02(%rsi), %xmm1
-	jb	L(L2_bwd)
-	lea	(L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_2_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
-	movaps	-0x12(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x22(%rsi), %xmm3
-	movaps	-0x32(%rsi), %xmm4
-	movaps	-0x42(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$2, %xmm2, %xmm1
-	palignr	$2, %xmm3, %xmm2
-	palignr	$2, %xmm4, %xmm3
-	palignr	$2, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_2_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_2_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_3):
-	lea	(L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x03(%rsi), %xmm1
-	jb	L(L3_fwd)
-	lea	(L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_3_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0d(%rsi), %xmm2
-	movaps	0x1d(%rsi), %xmm3
-	movaps	0x2d(%rsi), %xmm4
-	movaps	0x3d(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$3, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$3, %xmm3, %xmm4
-	palignr	$3, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$3, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_3_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_3_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_3_bwd):
-	lea	(L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x03(%rsi), %xmm1
-	jb	L(L3_bwd)
-	lea	(L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_3_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
-	movaps	-0x13(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x23(%rsi), %xmm3
-	movaps	-0x33(%rsi), %xmm4
-	movaps	-0x43(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$3, %xmm2, %xmm1
-	palignr	$3, %xmm3, %xmm2
-	palignr	$3, %xmm4, %xmm3
-	palignr	$3, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_3_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_3_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_4):
-	lea	(L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x04(%rsi), %xmm1
-	jb	L(L4_fwd)
-	lea	(L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_4_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0c(%rsi), %xmm2
-	movaps	0x1c(%rsi), %xmm3
-	movaps	0x2c(%rsi), %xmm4
-	movaps	0x3c(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$4, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$4, %xmm3, %xmm4
-	palignr	$4, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$4, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_4_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_4_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_4_bwd):
-	lea	(L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x04(%rsi), %xmm1
-	jb	L(L4_bwd)
-	lea	(L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_4_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
-	movaps	-0x14(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x24(%rsi), %xmm3
-	movaps	-0x34(%rsi), %xmm4
-	movaps	-0x44(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$4, %xmm2, %xmm1
-	palignr	$4, %xmm3, %xmm2
-	palignr	$4, %xmm4, %xmm3
-	palignr	$4, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_4_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_4_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_5):
-	lea	(L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x05(%rsi), %xmm1
-	jb	L(L5_fwd)
-	lea	(L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_5_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0b(%rsi), %xmm2
-	movaps	0x1b(%rsi), %xmm3
-	movaps	0x2b(%rsi), %xmm4
-	movaps	0x3b(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$5, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$5, %xmm3, %xmm4
-	palignr	$5, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$5, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_5_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_5_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_5_bwd):
-	lea	(L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x05(%rsi), %xmm1
-	jb	L(L5_bwd)
-	lea	(L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_5_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
-	movaps	-0x15(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x25(%rsi), %xmm3
-	movaps	-0x35(%rsi), %xmm4
-	movaps	-0x45(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$5, %xmm2, %xmm1
-	palignr	$5, %xmm3, %xmm2
-	palignr	$5, %xmm4, %xmm3
-	palignr	$5, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_5_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_5_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_6):
-	lea	(L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x06(%rsi), %xmm1
-	jb	L(L6_fwd)
-	lea	(L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_6_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0a(%rsi), %xmm2
-	movaps	0x1a(%rsi), %xmm3
-	movaps	0x2a(%rsi), %xmm4
-	movaps	0x3a(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$6, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$6, %xmm3, %xmm4
-	palignr	$6, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$6, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_6_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_6_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_6_bwd):
-	lea	(L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x06(%rsi), %xmm1
-	jb	L(L6_bwd)
-	lea	(L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_6_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
-	movaps	-0x16(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x26(%rsi), %xmm3
-	movaps	-0x36(%rsi), %xmm4
-	movaps	-0x46(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$6, %xmm2, %xmm1
-	palignr	$6, %xmm3, %xmm2
-	palignr	$6, %xmm4, %xmm3
-	palignr	$6, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_6_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_6_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_7):
-	lea	(L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x07(%rsi), %xmm1
-	jb	L(L7_fwd)
-	lea	(L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_7_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
-	sub	$64, %rdx
-	movaps	0x09(%rsi), %xmm2
-	movaps	0x19(%rsi), %xmm3
-	movaps	0x29(%rsi), %xmm4
-	movaps	0x39(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$7, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$7, %xmm3, %xmm4
-	palignr	$7, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$7, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_7_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_7_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_7_bwd):
-	lea	(L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x07(%rsi), %xmm1
-	jb	L(L7_bwd)
-	lea	(L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_7_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
-	movaps	-0x17(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x27(%rsi), %xmm3
-	movaps	-0x37(%rsi), %xmm4
-	movaps	-0x47(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$7, %xmm2, %xmm1
-	palignr	$7, %xmm3, %xmm2
-	palignr	$7, %xmm4, %xmm3
-	palignr	$7, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_7_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_7_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_8):
-	lea	(L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x08(%rsi), %xmm1
-	jb	L(L8_fwd)
-	lea	(L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-L(shl_8_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
-	sub	$64, %rdx
-	movaps	0x08(%rsi), %xmm2
-	movaps	0x18(%rsi), %xmm3
-	movaps	0x28(%rsi), %xmm4
-	movaps	0x38(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$8, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$8, %xmm3, %xmm4
-	palignr	$8, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$8, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_8_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-	.p2align 4
-L(shl_8_end):
-	lea	64(%rdx), %rdx
-	movaps	%xmm4, -0x20(%rdi)
-	add	%rdx, %rsi
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_8_bwd):
-	lea	(L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x08(%rsi), %xmm1
-	jb	L(L8_bwd)
-	lea	(L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_8_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
-	movaps	-0x18(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x28(%rsi), %xmm3
-	movaps	-0x38(%rsi), %xmm4
-	movaps	-0x48(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$8, %xmm2, %xmm1
-	palignr	$8, %xmm3, %xmm2
-	palignr	$8, %xmm4, %xmm3
-	palignr	$8, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_8_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_8_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_9):
-	lea	(L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x09(%rsi), %xmm1
-	jb	L(L9_fwd)
-	lea	(L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_9_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
-	sub	$64, %rdx
-	movaps	0x07(%rsi), %xmm2
-	movaps	0x17(%rsi), %xmm3
-	movaps	0x27(%rsi), %xmm4
-	movaps	0x37(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$9, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$9, %xmm3, %xmm4
-	palignr	$9, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$9, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_9_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_9_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_9_bwd):
-	lea	(L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x09(%rsi), %xmm1
-	jb	L(L9_bwd)
-	lea	(L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_9_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
-	movaps	-0x19(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x29(%rsi), %xmm3
-	movaps	-0x39(%rsi), %xmm4
-	movaps	-0x49(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$9, %xmm2, %xmm1
-	palignr	$9, %xmm3, %xmm2
-	palignr	$9, %xmm4, %xmm3
-	palignr	$9, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_9_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_9_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_10):
-	lea	(L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0a(%rsi), %xmm1
-	jb	L(L10_fwd)
-	lea	(L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_10_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
-	sub	$64, %rdx
-	movaps	0x06(%rsi), %xmm2
-	movaps	0x16(%rsi), %xmm3
-	movaps	0x26(%rsi), %xmm4
-	movaps	0x36(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$10, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$10, %xmm3, %xmm4
-	palignr	$10, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$10, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_10_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_10_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_10_bwd):
-	lea	(L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0a(%rsi), %xmm1
-	jb	L(L10_bwd)
-	lea	(L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_10_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
-	movaps	-0x1a(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2a(%rsi), %xmm3
-	movaps	-0x3a(%rsi), %xmm4
-	movaps	-0x4a(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$10, %xmm2, %xmm1
-	palignr	$10, %xmm3, %xmm2
-	palignr	$10, %xmm4, %xmm3
-	palignr	$10, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_10_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_10_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_11):
-	lea	(L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0b(%rsi), %xmm1
-	jb	L(L11_fwd)
-	lea	(L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_11_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
-	sub	$64, %rdx
-	movaps	0x05(%rsi), %xmm2
-	movaps	0x15(%rsi), %xmm3
-	movaps	0x25(%rsi), %xmm4
-	movaps	0x35(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$11, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$11, %xmm3, %xmm4
-	palignr	$11, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$11, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_11_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_11_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_11_bwd):
-	lea	(L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0b(%rsi), %xmm1
-	jb	L(L11_bwd)
-	lea	(L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_11_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
-	movaps	-0x1b(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2b(%rsi), %xmm3
-	movaps	-0x3b(%rsi), %xmm4
-	movaps	-0x4b(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$11, %xmm2, %xmm1
-	palignr	$11, %xmm3, %xmm2
-	palignr	$11, %xmm4, %xmm3
-	palignr	$11, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_11_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_11_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_12):
-	lea	(L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0c(%rsi), %xmm1
-	jb	L(L12_fwd)
-	lea	(L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_12_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
-	sub	$64, %rdx
-	movaps	0x04(%rsi), %xmm2
-	movaps	0x14(%rsi), %xmm3
-	movaps	0x24(%rsi), %xmm4
-	movaps	0x34(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$12, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$12, %xmm3, %xmm4
-	palignr	$12, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$12, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_12_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_12_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_12_bwd):
-	lea	(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0c(%rsi), %xmm1
-	jb	L(L12_bwd)
-	lea	(L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_12_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
-	movaps	-0x1c(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2c(%rsi), %xmm3
-	movaps	-0x3c(%rsi), %xmm4
-	movaps	-0x4c(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$12, %xmm2, %xmm1
-	palignr	$12, %xmm3, %xmm2
-	palignr	$12, %xmm4, %xmm3
-	palignr	$12, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_12_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_12_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_13):
-	lea	(L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0d(%rsi), %xmm1
-	jb	L(L13_fwd)
-	lea	(L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_13_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
-	sub	$64, %rdx
-	movaps	0x03(%rsi), %xmm2
-	movaps	0x13(%rsi), %xmm3
-	movaps	0x23(%rsi), %xmm4
-	movaps	0x33(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$13, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$13, %xmm3, %xmm4
-	palignr	$13, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$13, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_13_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_13_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_13_bwd):
-	lea	(L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0d(%rsi), %xmm1
-	jb	L(L13_bwd)
-	lea	(L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_13_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
-	movaps	-0x1d(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2d(%rsi), %xmm3
-	movaps	-0x3d(%rsi), %xmm4
-	movaps	-0x4d(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$13, %xmm2, %xmm1
-	palignr	$13, %xmm3, %xmm2
-	palignr	$13, %xmm4, %xmm3
-	palignr	$13, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_13_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_13_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_14):
-	lea	(L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0e(%rsi), %xmm1
-	jb	L(L14_fwd)
-	lea	(L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_14_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
-	sub	$64, %rdx
-	movaps	0x02(%rsi), %xmm2
-	movaps	0x12(%rsi), %xmm3
-	movaps	0x22(%rsi), %xmm4
-	movaps	0x32(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$14, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$14, %xmm3, %xmm4
-	palignr	$14, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$14, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_14_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_14_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_14_bwd):
-	lea	(L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0e(%rsi), %xmm1
-	jb	L(L14_bwd)
-	lea	(L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_14_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
-	movaps	-0x1e(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2e(%rsi), %xmm3
-	movaps	-0x3e(%rsi), %xmm4
-	movaps	-0x4e(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$14, %xmm2, %xmm1
-	palignr	$14, %xmm3, %xmm2
-	palignr	$14, %xmm4, %xmm3
-	palignr	$14, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_14_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_14_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_15):
-	lea	(L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0f(%rsi), %xmm1
-	jb	L(L15_fwd)
-	lea	(L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_15_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
-	sub	$64, %rdx
-	movaps	0x01(%rsi), %xmm2
-	movaps	0x11(%rsi), %xmm3
-	movaps	0x21(%rsi), %xmm4
-	movaps	0x31(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$15, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$15, %xmm3, %xmm4
-	palignr	$15, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$15, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_15_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_15_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_15_bwd):
-	lea	(L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0f(%rsi), %xmm1
-	jb	L(L15_bwd)
-	lea	(L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_15_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
-	movaps	-0x1f(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2f(%rsi), %xmm3
-	movaps	-0x3f(%rsi), %xmm4
-	movaps	-0x4f(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$15, %xmm2, %xmm1
-	palignr	$15, %xmm3, %xmm2
-	palignr	$15, %xmm4, %xmm3
-	palignr	$15, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_15_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_15_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(write_72bytes):
-	movdqu	-72(%rsi), %xmm0
-	movdqu	-56(%rsi), %xmm1
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rcx
-	movdqu	 %xmm0, -72(%rdi)
-	movdqu	 %xmm1, -56(%rdi)
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rcx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_64bytes):
-	movdqu	-64(%rsi), %xmm0
-	mov	-48(%rsi), %rcx
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -64(%rdi)
-	mov	 %rcx, -48(%rdi)
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_56bytes):
-	movdqu	-56(%rsi), %xmm0
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rcx
-	movdqu	 %xmm0, -56(%rdi)
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rcx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_48bytes):
-	mov	-48(%rsi), %rcx
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %rcx, -48(%rdi)
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_40bytes):
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_32bytes):
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_24bytes):
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_16bytes):
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_8bytes):
-	mov	-8(%rsi), %rdx
-	mov	 %rdx, -8(%rdi)
-L(write_0bytes):
-	ret
-
-	.p2align 4
-L(write_73bytes):
-	movdqu	-73(%rsi), %xmm0
-	movdqu	-57(%rsi), %xmm1
-	mov	-41(%rsi), %rcx
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %r8
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -73(%rdi)
-	movdqu	 %xmm1, -57(%rdi)
-	mov	 %rcx, -41(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %r8, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_65bytes):
-	movdqu	-65(%rsi), %xmm0
-	movdqu	-49(%rsi), %xmm1
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -65(%rdi)
-	movdqu	 %xmm1, -49(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_57bytes):
-	movdqu	-57(%rsi), %xmm0
-	mov	-41(%rsi), %r8
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -57(%rdi)
-	mov	 %r8, -41(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_49bytes):
-	movdqu	-49(%rsi), %xmm0
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -49(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_41bytes):
-	mov	-41(%rsi), %r8
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-1(%rsi), %dl
-	mov	 %r8, -41(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(write_33bytes):
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-1(%rsi), %dl
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(write_25bytes):
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-1(%rsi), %dl
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(write_17bytes):
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_9bytes):
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_1bytes):
-	mov	-1(%rsi), %dl
-	mov	 %dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(write_74bytes):
-	movdqu	-74(%rsi), %xmm0
-	movdqu	-58(%rsi), %xmm1
-	mov	-42(%rsi), %r8
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -74(%rdi)
-	movdqu	 %xmm1, -58(%rdi)
-	mov	 %r8, -42(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_66bytes):
-	movdqu	-66(%rsi), %xmm0
-	movdqu	-50(%rsi), %xmm1
-	mov	-42(%rsi), %r8
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -66(%rdi)
-	movdqu	 %xmm1, -50(%rdi)
-	mov	 %r8, -42(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_58bytes):
-	movdqu	-58(%rsi), %xmm1
-	mov	-42(%rsi), %r8
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm1, -58(%rdi)
-	mov	 %r8, -42(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_50bytes):
-	movdqu	-50(%rsi), %xmm0
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -50(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_42bytes):
-	mov	-42(%rsi), %r8
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r8, -42(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_34bytes):
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_26bytes):
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_18bytes):
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_10bytes):
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_2bytes):
-	mov	-2(%rsi), %dx
-	mov	 %dx, -2(%rdi)
-	ret
-
-	.p2align 4
-L(write_75bytes):
-	movdqu	-75(%rsi), %xmm0
-	movdqu	-59(%rsi), %xmm1
-	mov	-43(%rsi), %r8
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -75(%rdi)
-	movdqu	 %xmm1, -59(%rdi)
-	mov	 %r8, -43(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_67bytes):
-	movdqu	-67(%rsi), %xmm0
-	movdqu	-59(%rsi), %xmm1
-	mov	-43(%rsi), %r8
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -67(%rdi)
-	movdqu	 %xmm1, -59(%rdi)
-	mov	 %r8, -43(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_59bytes):
-	movdqu	-59(%rsi), %xmm0
-	mov	-43(%rsi), %r8
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -59(%rdi)
-	mov	 %r8, -43(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_51bytes):
-	movdqu	-51(%rsi), %xmm0
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -51(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_43bytes):
-	mov	-43(%rsi), %r8
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r8, -43(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_35bytes):
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_27bytes):
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_19bytes):
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_11bytes):
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_3bytes):
-	mov	-3(%rsi), %dx
-	mov	-2(%rsi), %cx
-	mov	 %dx, -3(%rdi)
-	mov	 %cx, -2(%rdi)
-	ret
-
-	.p2align 4
-L(write_76bytes):
-	movdqu	-76(%rsi), %xmm0
-	movdqu	-60(%rsi), %xmm1
-	mov	-44(%rsi), %r8
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -76(%rdi)
-	movdqu	 %xmm1, -60(%rdi)
-	mov	 %r8, -44(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_68bytes):
-	movdqu	-68(%rsi), %xmm0
-	movdqu	-52(%rsi), %xmm1
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -68(%rdi)
-	movdqu	 %xmm1, -52(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_60bytes):
-	movdqu	-60(%rsi), %xmm0
-	mov	-44(%rsi), %r8
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -60(%rdi)
-	mov	 %r8, -44(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_52bytes):
-	movdqu	-52(%rsi), %xmm0
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -52(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_44bytes):
-	mov	-44(%rsi), %r8
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r8, -44(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_36bytes):
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_28bytes):
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_20bytes):
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_12bytes):
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_4bytes):
-	mov	-4(%rsi), %edx
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_77bytes):
-	movdqu	-77(%rsi), %xmm0
-	movdqu	-61(%rsi), %xmm1
-	mov	-45(%rsi), %r8
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -77(%rdi)
-	movdqu	 %xmm1, -61(%rdi)
-	mov	 %r8, -45(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_69bytes):
-	movdqu	-69(%rsi), %xmm0
-	movdqu	-53(%rsi), %xmm1
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -69(%rdi)
-	movdqu	 %xmm1, -53(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_61bytes):
-	movdqu	-61(%rsi), %xmm0
-	mov	-45(%rsi), %r8
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -61(%rdi)
-	mov	 %r8, -45(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_53bytes):
-	movdqu	-53(%rsi), %xmm0
-	mov	-45(%rsi), %r8
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -53(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_45bytes):
-	mov	-45(%rsi), %r8
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r8, -45(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_37bytes):
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_29bytes):
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_21bytes):
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_13bytes):
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_5bytes):
-	mov	-5(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	 %edx, -5(%rdi)
-	mov	 %ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_78bytes):
-	movdqu	-78(%rsi), %xmm0
-	movdqu	-62(%rsi), %xmm1
-	mov	-46(%rsi), %r8
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -78(%rdi)
-	movdqu	 %xmm1, -62(%rdi)
-	mov	 %r8, -46(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_70bytes):
-	movdqu	-70(%rsi), %xmm0
-	movdqu	-54(%rsi), %xmm1
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -70(%rdi)
-	movdqu	 %xmm1, -54(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_62bytes):
-	movdqu	-62(%rsi), %xmm0
-	mov	-46(%rsi), %r8
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -62(%rdi)
-	mov	 %r8, -46(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_54bytes):
-	movdqu	-54(%rsi), %xmm0
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -54(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_46bytes):
-	mov	-46(%rsi), %r8
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r8, -46(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_38bytes):
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_30bytes):
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_22bytes):
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_14bytes):
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_6bytes):
-	mov	-6(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	 %edx, -6(%rdi)
-	mov	 %ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_79bytes):
-	movdqu	-79(%rsi), %xmm0
-	movdqu	-63(%rsi), %xmm1
-	mov	-47(%rsi), %r8
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -79(%rdi)
-	movdqu	 %xmm1, -63(%rdi)
-	mov	 %r8, -47(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_71bytes):
-	movdqu	-71(%rsi), %xmm0
-	movdqu	-55(%rsi), %xmm1
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -71(%rdi)
-	movdqu	 %xmm1, -55(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_63bytes):
-	movdqu	-63(%rsi), %xmm0
-	mov	-47(%rsi), %r8
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -63(%rdi)
-	mov	 %r8, -47(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_55bytes):
-	movdqu	-55(%rsi), %xmm0
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -55(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_47bytes):
-	mov	-47(%rsi), %r8
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r8, -47(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_39bytes):
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_31bytes):
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_23bytes):
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_15bytes):
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_7bytes):
-	mov	-7(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	 %edx, -7(%rdi)
-	mov	 %ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(large_page_fwd):
-	movdqu	(%rsi), %xmm1
-	lea	16(%rsi), %rsi
-	movdqu	%xmm0, (%r8)
-	movntdq	%xmm1, (%rdi)
-	lea	16(%rdi), %rdi
-	lea	-0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
-	mov	%rsi, %r9
-	sub	%rdi, %r9
-	cmp	%rdx, %r9
-	jae	L(memmove_is_memcpy_fwd)
-	shl	$2, %rcx
-	cmp	%rcx, %rdx
-	jb	L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	movdqu	0x40(%rsi), %xmm4
-	movdqu	0x50(%rsi), %xmm5
-	movdqu	0x60(%rsi), %xmm6
-	movdqu	0x70(%rsi), %xmm7
-	lea	0x80(%rsi), %rsi
-
-	sub	$0x80, %rdx
-	movntdq	%xmm0, (%rdi)
-	movntdq	%xmm1, 0x10(%rdi)
-	movntdq	%xmm2, 0x20(%rdi)
-	movntdq	%xmm3, 0x30(%rdi)
-	movntdq	%xmm4, 0x40(%rdi)
-	movntdq	%xmm5, 0x50(%rdi)
-	movntdq	%xmm6, 0x60(%rdi)
-	movntdq	%xmm7, 0x70(%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(large_page_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(large_page_less_64bytes)
-
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	lea	0x40(%rsi), %rsi
-
-	movntdq	%xmm0, (%rdi)
-	movntdq	%xmm1, 0x10(%rdi)
-	movntdq	%xmm2, 0x20(%rdi)
-	movntdq	%xmm3, 0x30(%rdi)
-	lea	0x40(%rdi), %rdi
-	sub	$0x40, %rdx
-L(large_page_less_64bytes):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	sfence
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
-	.p2align 4
-L(ll_cache_copy_fwd_start):
-	prefetcht0 0x1c0(%rsi)
-	prefetcht0 0x200(%rsi)
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	movdqu	0x40(%rsi), %xmm4
-	movdqu	0x50(%rsi), %xmm5
-	movdqu	0x60(%rsi), %xmm6
-	movdqu	0x70(%rsi), %xmm7
-	lea	0x80(%rsi), %rsi
-
-	sub	$0x80, %rdx
-	movaps	%xmm0, (%rdi)
-	movaps	%xmm1, 0x10(%rdi)
-	movaps	%xmm2, 0x20(%rdi)
-	movaps	%xmm3, 0x30(%rdi)
-	movaps	%xmm4, 0x40(%rdi)
-	movaps	%xmm5, 0x50(%rdi)
-	movaps	%xmm6, 0x60(%rdi)
-	movaps	%xmm7, 0x70(%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(ll_cache_copy_fwd_start)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(large_page_ll_less_fwd_64bytes)
-
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	lea	0x40(%rsi), %rsi
-
-	movaps	%xmm0, (%rdi)
-	movaps	%xmm1, 0x10(%rdi)
-	movaps	%xmm2, 0x20(%rdi)
-	movaps	%xmm3, 0x30(%rdi)
-	lea	0x40(%rdi), %rdi
-	sub	$0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
-	.p2align 4
-L(large_page_bwd):
-	movdqu	-0x10(%rsi), %xmm1
-	lea	-16(%rsi), %rsi
-	movdqu	%xmm0, (%r8)
-	movdqa	%xmm1, -0x10(%rdi)
-	lea	-16(%rdi), %rdi
-	lea	-0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
-	mov	%rdi, %r9
-	sub	%rsi, %r9
-	cmp	%rdx, %r9
-	jae	L(memmove_is_memcpy_bwd)
-	cmp	%rcx, %r9
-	jb	L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
-	movdqu	-0x10(%rsi), %xmm0
-	movdqu	-0x20(%rsi), %xmm1
-	movdqu	-0x30(%rsi), %xmm2
-	movdqu	-0x40(%rsi), %xmm3
-	movdqu	-0x50(%rsi), %xmm4
-	movdqu	-0x60(%rsi), %xmm5
-	movdqu	-0x70(%rsi), %xmm6
-	movdqu	-0x80(%rsi), %xmm7
-	lea	-0x80(%rsi), %rsi
-
-	sub	$0x80, %rdx
-	movntdq	%xmm0, -0x10(%rdi)
-	movntdq	%xmm1, -0x20(%rdi)
-	movntdq	%xmm2, -0x30(%rdi)
-	movntdq	%xmm3, -0x40(%rdi)
-	movntdq	%xmm4, -0x50(%rdi)
-	movntdq	%xmm5, -0x60(%rdi)
-	movntdq	%xmm6, -0x70(%rdi)
-	movntdq	%xmm7, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-	jae	L(large_page_bwd_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(large_page_less_bwd_64bytes)
-
-	movdqu	-0x10(%rsi), %xmm0
-	movdqu	-0x20(%rsi), %xmm1
-	movdqu	-0x30(%rsi), %xmm2
-	movdqu	-0x40(%rsi), %xmm3
-	lea	-0x40(%rsi), %rsi
-
-	movntdq	%xmm0, -0x10(%rdi)
-	movntdq	%xmm1, -0x20(%rdi)
-	movntdq	%xmm2, -0x30(%rdi)
-	movntdq	%xmm3, -0x40(%rdi)
-	lea	-0x40(%rdi), %rdi
-	sub	$0x40, %rdx
-L(large_page_less_bwd_64bytes):
-	sfence
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
-	.p2align 4
-L(ll_cache_copy_bwd_start):
-	prefetcht0 -0x1c0(%rsi)
-	prefetcht0 -0x200(%rsi)
-	movdqu	-0x10(%rsi), %xmm0
-	movdqu	-0x20(%rsi), %xmm1
-	movdqu	-0x30(%rsi), %xmm2
-	movdqu	-0x40(%rsi), %xmm3
-	movdqu	-0x50(%rsi), %xmm4
-	movdqu	-0x60(%rsi), %xmm5
-	movdqu	-0x70(%rsi), %xmm6
-	movdqu	-0x80(%rsi), %xmm7
-	lea	-0x80(%rsi), %rsi
-
-	sub	$0x80, %rdx
-	movaps	%xmm0, -0x10(%rdi)
-	movaps	%xmm1, -0x20(%rdi)
-	movaps	%xmm2, -0x30(%rdi)
-	movaps	%xmm3, -0x40(%rdi)
-	movaps	%xmm4, -0x50(%rdi)
-	movaps	%xmm5, -0x60(%rdi)
-	movaps	%xmm6, -0x70(%rdi)
-	movaps	%xmm7, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-	jae	L(ll_cache_copy_bwd_start)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(large_page_ll_less_bwd_64bytes)
-
-	movdqu	-0x10(%rsi), %xmm0
-	movdqu	-0x20(%rsi), %xmm1
-	movdqu	-0x30(%rsi), %xmm2
-	movdqu	-0x40(%rsi), %xmm3
-	lea	-0x40(%rsi), %rsi
-
-	movaps	%xmm0, -0x10(%rdi)
-	movaps	%xmm1, -0x20(%rdi)
-	movaps	%xmm2, -0x30(%rdi)
-	movaps	%xmm3, -0x40(%rdi)
-	lea	-0x40(%rdi), %rdi
-	sub	$0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
-	.section .rodata.ssse3,"a",@progbits
-	.p2align 3
-L(table_less_80bytes):
-	.int	JMPTBL (L(write_0bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_1bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_2bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_3bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_4bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_5bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_6bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_7bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_8bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_9bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_10bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_11bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_12bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_13bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_14bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_15bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_16bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_17bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_18bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_19bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_20bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_21bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_22bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_23bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_24bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_25bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_26bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_27bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_28bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_29bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_30bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_31bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_32bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_33bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_34bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_35bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_36bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_37bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_38bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_39bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_40bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_41bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_42bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_43bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_44bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_45bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_46bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_47bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_48bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_49bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_50bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_51bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_52bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_53bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_54bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_55bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_56bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_57bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_58bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_59bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_60bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_61bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_62bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_63bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_64bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_65bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_66bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_67bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_68bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_69bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_70bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_71bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_72bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_73bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_74bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_75bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_76bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_77bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_78bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
-	.p2align 3
-L(shl_table):
-	.int	JMPTBL (L(shl_0), L(shl_table))
-	.int	JMPTBL (L(shl_1), L(shl_table))
-	.int	JMPTBL (L(shl_2), L(shl_table))
-	.int	JMPTBL (L(shl_3), L(shl_table))
-	.int	JMPTBL (L(shl_4), L(shl_table))
-	.int	JMPTBL (L(shl_5), L(shl_table))
-	.int	JMPTBL (L(shl_6), L(shl_table))
-	.int	JMPTBL (L(shl_7), L(shl_table))
-	.int	JMPTBL (L(shl_8), L(shl_table))
-	.int	JMPTBL (L(shl_9), L(shl_table))
-	.int	JMPTBL (L(shl_10), L(shl_table))
-	.int	JMPTBL (L(shl_11), L(shl_table))
-	.int	JMPTBL (L(shl_12), L(shl_table))
-	.int	JMPTBL (L(shl_13), L(shl_table))
-	.int	JMPTBL (L(shl_14), L(shl_table))
-	.int	JMPTBL (L(shl_15), L(shl_table))
-
-	.p2align 3
-L(shl_table_bwd):
-	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
deleted file mode 100644
index 295430b1ef..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3
-#define MEMCPY_CHK	__memmove_chk_ssse3
-#include "memcpy-ssse3.S"

From patchwork Fri Mar 25 20:44:47 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Noah Goldstein <goldstein.w.n@gmail.com>
X-Patchwork-Id: 52374
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id A86613889800
	for <patchwork@sourceware.org>; Fri, 25 Mar 2022 20:48:27 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org A86613889800
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1648241307;
	bh=PXkjMQ7uEFrIwUYYjbhj7bpPqXrr6CFeMf4Ez20OI3c=;
	h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe:
	 List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:
	 From;
	b=rUVLztpQkhfWNQZOprm4BFvkKL80vnia42jOYVD9rJJ04pOI0aVcWQ2rq1qVpX0zi
	 2Dn09ojdTtNjB2mP9Pt8Ufc7jWG5J0mlS4Yga2Y+5443Kv9qbxv5ajA2QFjvqRCHvn
	 QtXD6VEs3d9PZQad4N9/bYjWf8iEXSPu6c5QxvwE=
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mail-io1-xd2d.google.com (mail-io1-xd2d.google.com
 [IPv6:2607:f8b0:4864:20::d2d])
 by sourceware.org (Postfix) with ESMTPS id 9A94E3888C4C
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 20:45:05 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 9A94E3888C4C
Received: by mail-io1-xd2d.google.com with SMTP id d62so10233235iog.13
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 13:45:05 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20210112;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references:mime-version:content-transfer-encoding;
 bh=PXkjMQ7uEFrIwUYYjbhj7bpPqXrr6CFeMf4Ez20OI3c=;
 b=jkAbpXl+bu5L1HOLVR/15GMhqEuNip6cc57+2us6LX53T79pSgmR1nVwqSNPAkTM14
 2RNqiVhRJbuMzWwvvCow3b0fDAhOwzl44aWFG7Bd9oaX6gfuEu8CO/7g6fT/0PHG5XUl
 NdQHqiCQ/3hmapbbVQaVXcs/Be9k7WmvZO0wkSnizVrip0vJZaIqfR0O8sHmMs4dgt0y
 yK8c38pdbHOmn0MVbAhHl3FtUSJcyvFxZiMxPu8GeJMGcLIcV07AXUc58qeZpcQJQtlC
 004QdHKXstbEKmGiI3ThSYBVh8Gy0rQNhtASFkRkfjdi4uQU+Lz4I0YPyuYlg9RhzkXi
 t1Kw==
X-Gm-Message-State: AOAM53131YRvBkMzPyUtwBFIupv6iowZ2wclycoMo0el9KwMqzHzZNj0
 bEMh7vFF+mfaHLoDcoHo7yZ7jkRGNV4=
X-Google-Smtp-Source: 
 ABdhPJzsX/IC/pr1/lRc+/lxrKcE5wSbCKYtYR7vKFioyMguaYlT3LAx5EyG/Bg2TBr2iPw7lgmcFg==
X-Received: by 2002:a05:6602:1512:b0:648:cced:ad64 with SMTP id
 g18-20020a056602151200b00648ccedad64mr383723iow.152.1648241102772;
 Fri, 25 Mar 2022 13:45:02 -0700 (PDT)
Received: from localhost.localdomain (node-17-161.flex.volo.net.
 [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id
 a3-20020a5ec303000000b006496b4dd21csm3382118iok.5.2022.03.25.13.45.02
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Fri, 25 Mar 2022 13:45:02 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back
Date: Fri, 25 Mar 2022 15:44:47 -0500
Message-Id: <20220325204449.1284533-4-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20220325204449.1284533-1-goldstein.w.n@gmail.com>
References: <20220325183625.1170867-2-goldstein.w.n@gmail.com>
 <20220325204449.1284533-1-goldstein.w.n@gmail.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-11.4 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_10_SHORT_WORD_LINES,
 SCC_5_SHORT_WORD_LINES,
 SPF_HELO_NONE, SPF_PASS, TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
X-Patchwork-Original-From: Noah Goldstein via Libc-alpha
 <libc-alpha@sourceware.org>
From: Noah Goldstein <goldstein.w.n@gmail.com>
Reply-To: Noah Goldstein <goldstein.w.n@gmail.com>
Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org
Sender: "Libc-alpha"
 <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>

With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given their code size cost.
---
 sysdeps/x86_64/multiarch/Makefile             |    2 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |   15 -
 sysdeps/x86_64/multiarch/ifunc-memmove.h      |    7 -
 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S  | 3181 -----------------
 sysdeps/x86_64/multiarch/memmove-ssse3-back.S |    4 -
 5 files changed, 3209 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
 delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 48f81711ae..323be3b969 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,14 +16,12 @@ sysdep_routines += \
   memcmpeq-avx2-rtm \
   memcmpeq-evex \
   memcmpeq-sse2 \
-  memcpy-ssse3-back \
   memmove-avx-unaligned-erms \
   memmove-avx-unaligned-erms-rtm \
   memmove-avx512-no-vzeroupper \
   memmove-avx512-unaligned-erms \
   memmove-evex-unaligned-erms \
   memmove-sse2-unaligned-erms \
-  memmove-ssse3-back \
   memrchr-avx2 \
   memrchr-avx2-rtm \
   memrchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 70b0e9c62e..d6852ab365 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_chk_evex_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __memmove_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
@@ -174,8 +171,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_avx512_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
-			      __memmove_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1,
 			      __memmove_sse2_unaligned)
@@ -879,9 +874,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_chk_evex_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __memcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
@@ -912,8 +904,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_evex_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
-			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memcpy_avx512_no_vzeroupper)
@@ -960,9 +950,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_chk_evex_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
-			      CPU_FEATURE_USABLE (SSSE3),
-			      __mempcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
@@ -1002,8 +989,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_evex_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
-			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
 			      __mempcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 1ecdd4b0d3..5596ddea2c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -92,13 +92,6 @@ IFUNC_SELECTOR (void)
 	}
     }
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
-      && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
-    {
-      if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
-        return OPTIMIZE (ssse3_back);
-    }
-
   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 	return OPTIMIZE (sse2_unaligned_erms);
 
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 92cfbf7933..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3181 +0,0 @@
-/* memcpy with SSSE3 and REP string
-   Copyright (C) 2010-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3_back
-# define MEMCPY_CHK	__memcpy_chk_ssse3_back
-# define MEMPCPY	__mempcpy_ssse3_back
-# define MEMPCPY_CHK	__mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B)	I - B
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   relative offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX.  */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-  lea		TABLE(%rip), %r11;				\
-  movslq	(%r11, INDEX, SCALE), INDEX;			\
-  lea		(%r11, INDEX), INDEX;				\
-  _CET_NOTRACK jmp *INDEX;					\
-  ud2
-
-	.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
-	mov	%RDI_LP, %RAX_LP
-	add	%RDX_LP, %RAX_LP
-	jmp	L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
-	mov	%RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
-	add	%RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
-	cmp	%rsi, %rdi
-	jb	L(copy_forward)
-	je	L(bwd_write_0bytes)
-	cmp	$144, %rdx
-	jae	L(copy_backward)
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
-	cmp	$144, %rdx
-	jae	L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
-	cmp	%dil, %sil
-	jbe	L(bk_write)
-#endif
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
-	.p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
-	cmp	%dil, %sil
-	jle	L(copy_backward)
-#endif
-	movdqu	(%rsi), %xmm0
-	mov	%rdi, %r8
-	and	$-16, %rdi
-	add	$16, %rdi
-	mov	%rdi, %r9
-	sub	%r8, %r9
-	sub	%r9, %rdx
-	add	%r9, %rsi
-	mov	%rsi, %r9
-	and	$0xf, %r9
-	jz	L(shl_0)
-#ifdef DATA_CACHE_SIZE
-	mov	$DATA_CACHE_SIZE, %RCX_LP
-#else
-	mov	__x86_data_cache_size(%rip), %RCX_LP
-#endif
-	cmp	%rcx, %rdx
-	jae	L(gobble_mem_fwd)
-	lea    	L(shl_table_fwd)(%rip), %r11
-	sub	$0x80, %rdx
-	movslq	(%r11, %r9, 4), %r9
-	add	%r11, %r9
-	_CET_NOTRACK jmp *%r9
-	ud2
-
-	.p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
-	mov	$DATA_CACHE_SIZE, %RCX_LP
-#else
-	mov	__x86_data_cache_size(%rip), %RCX_LP
-#endif
-	shl	$1, %rcx
-	cmp	%rcx, %rdx
-	ja	L(gobble_mem_bwd)
-
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	movdqu	-16(%rsi), %xmm0
-	lea	-16(%rdi), %r8
-	mov	%rdi, %r9
-	and	$0xf, %r9
-	xor	%r9, %rdi
-	sub	%r9, %rsi
-	sub	%r9, %rdx
-	mov	%rsi, %r9
-	and	$0xf, %r9
-	jz	L(shl_0_bwd)
-	lea    	L(shl_table_bwd)(%rip), %r11
-	sub	$0x80, %rdx
-	movslq	(%r11, %r9, 4), %r9
-	add	%r11, %r9
-	_CET_NOTRACK jmp *%r9
-	ud2
-
-	.p2align 4
-L(shl_0):
-
-	mov	%rdx, %r9
-	shr	$8, %r9
-	add	%rdx, %r9
-#ifdef DATA_CACHE_SIZE
-	cmp	$DATA_CACHE_SIZE_HALF, %R9_LP
-#else
-	cmp	__x86_data_cache_size_half(%rip), %R9_LP
-#endif
-	jae	L(gobble_mem_fwd)
-	sub	$0x80, %rdx
-	.p2align 4
-L(shl_0_loop):
-	movdqa	(%rsi), %xmm1
-	movdqa	%xmm1, (%rdi)
-	movaps	0x10(%rsi), %xmm2
-	movaps	%xmm2, 0x10(%rdi)
-	movaps	0x20(%rsi), %xmm3
-	movaps	%xmm3, 0x20(%rdi)
-	movaps	0x30(%rsi), %xmm4
-	movaps	%xmm4, 0x30(%rdi)
-	movaps	0x40(%rsi), %xmm1
-	movaps	%xmm1, 0x40(%rdi)
-	movaps	0x50(%rsi), %xmm2
-	movaps	%xmm2, 0x50(%rdi)
-	movaps	0x60(%rsi), %xmm3
-	movaps	%xmm3, 0x60(%rdi)
-	movaps	0x70(%rsi), %xmm4
-	movaps	%xmm4, 0x70(%rdi)
-	sub	$0x80, %rdx
-	lea	0x80(%rsi), %rsi
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_0_loop)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_0_bwd):
-	sub	$0x80, %rdx
-L(copy_backward_loop):
-	movaps	-0x10(%rsi), %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	-0x20(%rsi), %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-	movaps	-0x30(%rsi), %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-	movaps	-0x40(%rsi), %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-	movaps	-0x50(%rsi), %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-	movaps	-0x60(%rsi), %xmm5
-	movaps	%xmm5, -0x60(%rdi)
-	movaps	-0x70(%rsi), %xmm5
-	movaps	%xmm5, -0x70(%rdi)
-	movaps	-0x80(%rsi), %xmm5
-	movaps	%xmm5, -0x80(%rdi)
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(copy_backward_loop)
-
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_1):
-	sub	$0x80, %rdx
-	movaps	-0x01(%rsi), %xmm1
-	movaps	0x0f(%rsi), %xmm2
-	movaps	0x1f(%rsi), %xmm3
-	movaps	0x2f(%rsi), %xmm4
-	movaps	0x3f(%rsi), %xmm5
-	movaps	0x4f(%rsi), %xmm6
-	movaps	0x5f(%rsi), %xmm7
-	movaps	0x6f(%rsi), %xmm8
-	movaps	0x7f(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$1, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$1, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$1, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$1, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$1, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$1, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$1, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_1)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_1_bwd):
-	movaps	-0x01(%rsi), %xmm1
-
-	movaps	-0x11(%rsi), %xmm2
-	palignr	$1, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x21(%rsi), %xmm3
-	palignr	$1, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x31(%rsi), %xmm4
-	palignr	$1, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x41(%rsi), %xmm5
-	palignr	$1, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x51(%rsi), %xmm6
-	palignr	$1, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x61(%rsi), %xmm7
-	palignr	$1, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x71(%rsi), %xmm8
-	palignr	$1, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x81(%rsi), %xmm9
-	palignr	$1, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_1_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_2):
-	sub	$0x80, %rdx
-	movaps	-0x02(%rsi), %xmm1
-	movaps	0x0e(%rsi), %xmm2
-	movaps	0x1e(%rsi), %xmm3
-	movaps	0x2e(%rsi), %xmm4
-	movaps	0x3e(%rsi), %xmm5
-	movaps	0x4e(%rsi), %xmm6
-	movaps	0x5e(%rsi), %xmm7
-	movaps	0x6e(%rsi), %xmm8
-	movaps	0x7e(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$2, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$2, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$2, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$2, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$2, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$2, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$2, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_2)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_2_bwd):
-	movaps	-0x02(%rsi), %xmm1
-
-	movaps	-0x12(%rsi), %xmm2
-	palignr	$2, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x22(%rsi), %xmm3
-	palignr	$2, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x32(%rsi), %xmm4
-	palignr	$2, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x42(%rsi), %xmm5
-	palignr	$2, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x52(%rsi), %xmm6
-	palignr	$2, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x62(%rsi), %xmm7
-	palignr	$2, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x72(%rsi), %xmm8
-	palignr	$2, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x82(%rsi), %xmm9
-	palignr	$2, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_2_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_3):
-	sub	$0x80, %rdx
-	movaps -0x03(%rsi), %xmm1
-	movaps	0x0d(%rsi), %xmm2
-	movaps	0x1d(%rsi), %xmm3
-	movaps	0x2d(%rsi), %xmm4
-	movaps	0x3d(%rsi), %xmm5
-	movaps	0x4d(%rsi), %xmm6
-	movaps	0x5d(%rsi), %xmm7
-	movaps	0x6d(%rsi), %xmm8
-	movaps	0x7d(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$3, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$3, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$3, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$3, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$3, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$3, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$3, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_3)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_3_bwd):
-	movaps	-0x03(%rsi), %xmm1
-
-	movaps	-0x13(%rsi), %xmm2
-	palignr	$3, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x23(%rsi), %xmm3
-	palignr	$3, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x33(%rsi), %xmm4
-	palignr	$3, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x43(%rsi), %xmm5
-	palignr	$3, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x53(%rsi), %xmm6
-	palignr	$3, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x63(%rsi), %xmm7
-	palignr	$3, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x73(%rsi), %xmm8
-	palignr	$3, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x83(%rsi), %xmm9
-	palignr	$3, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_3_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_4):
-	sub	$0x80, %rdx
-	movaps	-0x04(%rsi), %xmm1
-	movaps	0x0c(%rsi), %xmm2
-	movaps	0x1c(%rsi), %xmm3
-	movaps	0x2c(%rsi), %xmm4
-	movaps	0x3c(%rsi), %xmm5
-	movaps	0x4c(%rsi), %xmm6
-	movaps	0x5c(%rsi), %xmm7
-	movaps	0x6c(%rsi), %xmm8
-	movaps	0x7c(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$4, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$4, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$4, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$4, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$4, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$4, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$4, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_4)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_4_bwd):
-	movaps	-0x04(%rsi), %xmm1
-
-	movaps	-0x14(%rsi), %xmm2
-	palignr	$4, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x24(%rsi), %xmm3
-	palignr	$4, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x34(%rsi), %xmm4
-	palignr	$4, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x44(%rsi), %xmm5
-	palignr	$4, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x54(%rsi), %xmm6
-	palignr	$4, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x64(%rsi), %xmm7
-	palignr	$4, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x74(%rsi), %xmm8
-	palignr	$4, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x84(%rsi), %xmm9
-	palignr	$4, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_4_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_5):
-	sub	$0x80, %rdx
-	movaps	-0x05(%rsi), %xmm1
-	movaps	0x0b(%rsi), %xmm2
-	movaps	0x1b(%rsi), %xmm3
-	movaps	0x2b(%rsi), %xmm4
-	movaps	0x3b(%rsi), %xmm5
-	movaps	0x4b(%rsi), %xmm6
-	movaps	0x5b(%rsi), %xmm7
-	movaps	0x6b(%rsi), %xmm8
-	movaps	0x7b(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$5, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$5, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$5, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$5, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$5, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$5, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$5, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_5)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_5_bwd):
-	movaps	-0x05(%rsi), %xmm1
-
-	movaps	-0x15(%rsi), %xmm2
-	palignr	$5, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x25(%rsi), %xmm3
-	palignr	$5, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x35(%rsi), %xmm4
-	palignr	$5, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x45(%rsi), %xmm5
-	palignr	$5, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x55(%rsi), %xmm6
-	palignr	$5, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x65(%rsi), %xmm7
-	palignr	$5, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x75(%rsi), %xmm8
-	palignr	$5, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x85(%rsi), %xmm9
-	palignr	$5, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_5_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_6):
-	sub	$0x80, %rdx
-	movaps	-0x06(%rsi), %xmm1
-	movaps	0x0a(%rsi), %xmm2
-	movaps	0x1a(%rsi), %xmm3
-	movaps	0x2a(%rsi), %xmm4
-	movaps	0x3a(%rsi), %xmm5
-	movaps	0x4a(%rsi), %xmm6
-	movaps	0x5a(%rsi), %xmm7
-	movaps	0x6a(%rsi), %xmm8
-	movaps	0x7a(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$6, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$6, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$6, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$6, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$6, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$6, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$6, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_6)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_6_bwd):
-	movaps	-0x06(%rsi), %xmm1
-
-	movaps	-0x16(%rsi), %xmm2
-	palignr	$6, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x26(%rsi), %xmm3
-	palignr	$6, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x36(%rsi), %xmm4
-	palignr	$6, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x46(%rsi), %xmm5
-	palignr	$6, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x56(%rsi), %xmm6
-	palignr	$6, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x66(%rsi), %xmm7
-	palignr	$6, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x76(%rsi), %xmm8
-	palignr	$6, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x86(%rsi), %xmm9
-	palignr	$6, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_6_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_7):
-	sub	$0x80, %rdx
-	movaps	-0x07(%rsi), %xmm1
-	movaps	0x09(%rsi), %xmm2
-	movaps	0x19(%rsi), %xmm3
-	movaps	0x29(%rsi), %xmm4
-	movaps	0x39(%rsi), %xmm5
-	movaps	0x49(%rsi), %xmm6
-	movaps	0x59(%rsi), %xmm7
-	movaps	0x69(%rsi), %xmm8
-	movaps	0x79(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$7, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$7, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$7, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$7, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$7, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$7, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$7, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_7)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_7_bwd):
-	movaps	-0x07(%rsi), %xmm1
-
-	movaps	-0x17(%rsi), %xmm2
-	palignr	$7, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x27(%rsi), %xmm3
-	palignr	$7, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x37(%rsi), %xmm4
-	palignr	$7, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x47(%rsi), %xmm5
-	palignr	$7, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x57(%rsi), %xmm6
-	palignr	$7, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x67(%rsi), %xmm7
-	palignr	$7, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x77(%rsi), %xmm8
-	palignr	$7, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x87(%rsi), %xmm9
-	palignr	$7, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_7_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_8):
-	sub	$0x80, %rdx
-	movaps	-0x08(%rsi), %xmm1
-	movaps	0x08(%rsi), %xmm2
-	movaps	0x18(%rsi), %xmm3
-	movaps	0x28(%rsi), %xmm4
-	movaps	0x38(%rsi), %xmm5
-	movaps	0x48(%rsi), %xmm6
-	movaps	0x58(%rsi), %xmm7
-	movaps	0x68(%rsi), %xmm8
-	movaps	0x78(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$8, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$8, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$8, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$8, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$8, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$8, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$8, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_8)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_8_bwd):
-	movaps	-0x08(%rsi), %xmm1
-
-	movaps	-0x18(%rsi), %xmm2
-	palignr	$8, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x28(%rsi), %xmm3
-	palignr	$8, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x38(%rsi), %xmm4
-	palignr	$8, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x48(%rsi), %xmm5
-	palignr	$8, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x58(%rsi), %xmm6
-	palignr	$8, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x68(%rsi), %xmm7
-	palignr	$8, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x78(%rsi), %xmm8
-	palignr	$8, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x88(%rsi), %xmm9
-	palignr	$8, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_8_bwd)
-L(shl_8_end_bwd):
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_9):
-	sub	$0x80, %rdx
-	movaps	-0x09(%rsi), %xmm1
-	movaps	0x07(%rsi), %xmm2
-	movaps	0x17(%rsi), %xmm3
-	movaps	0x27(%rsi), %xmm4
-	movaps	0x37(%rsi), %xmm5
-	movaps	0x47(%rsi), %xmm6
-	movaps	0x57(%rsi), %xmm7
-	movaps	0x67(%rsi), %xmm8
-	movaps	0x77(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$9, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$9, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$9, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$9, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$9, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$9, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$9, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_9)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_9_bwd):
-	movaps	-0x09(%rsi), %xmm1
-
-	movaps	-0x19(%rsi), %xmm2
-	palignr	$9, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x29(%rsi), %xmm3
-	palignr	$9, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x39(%rsi), %xmm4
-	palignr	$9, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x49(%rsi), %xmm5
-	palignr	$9, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x59(%rsi), %xmm6
-	palignr	$9, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x69(%rsi), %xmm7
-	palignr	$9, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x79(%rsi), %xmm8
-	palignr	$9, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x89(%rsi), %xmm9
-	palignr	$9, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_9_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_10):
-	sub	$0x80, %rdx
-	movaps	-0x0a(%rsi), %xmm1
-	movaps	0x06(%rsi), %xmm2
-	movaps	0x16(%rsi), %xmm3
-	movaps	0x26(%rsi), %xmm4
-	movaps	0x36(%rsi), %xmm5
-	movaps	0x46(%rsi), %xmm6
-	movaps	0x56(%rsi), %xmm7
-	movaps	0x66(%rsi), %xmm8
-	movaps	0x76(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$10, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$10, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$10, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$10, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$10, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$10, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$10, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_10)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_10_bwd):
-	movaps	-0x0a(%rsi), %xmm1
-
-	movaps	-0x1a(%rsi), %xmm2
-	palignr	$10, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x2a(%rsi), %xmm3
-	palignr	$10, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x3a(%rsi), %xmm4
-	palignr	$10, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x4a(%rsi), %xmm5
-	palignr	$10, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x5a(%rsi), %xmm6
-	palignr	$10, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x6a(%rsi), %xmm7
-	palignr	$10, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x7a(%rsi), %xmm8
-	palignr	$10, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x8a(%rsi), %xmm9
-	palignr	$10, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_10_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_11):
-	sub	$0x80, %rdx
-	movaps	-0x0b(%rsi), %xmm1
-	movaps	0x05(%rsi), %xmm2
-	movaps	0x15(%rsi), %xmm3
-	movaps	0x25(%rsi), %xmm4
-	movaps	0x35(%rsi), %xmm5
-	movaps	0x45(%rsi), %xmm6
-	movaps	0x55(%rsi), %xmm7
-	movaps	0x65(%rsi), %xmm8
-	movaps	0x75(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$11, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$11, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$11, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$11, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$11, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$11, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$11, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_11)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_11_bwd):
-	movaps	-0x0b(%rsi), %xmm1
-
-	movaps	-0x1b(%rsi), %xmm2
-	palignr	$11, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x2b(%rsi), %xmm3
-	palignr	$11, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x3b(%rsi), %xmm4
-	palignr	$11, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x4b(%rsi), %xmm5
-	palignr	$11, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x5b(%rsi), %xmm6
-	palignr	$11, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x6b(%rsi), %xmm7
-	palignr	$11, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x7b(%rsi), %xmm8
-	palignr	$11, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x8b(%rsi), %xmm9
-	palignr	$11, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_11_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_12):
-	sub	$0x80, %rdx
-	movdqa	-0x0c(%rsi), %xmm1
-	movaps	0x04(%rsi), %xmm2
-	movaps	0x14(%rsi), %xmm3
-	movaps	0x24(%rsi), %xmm4
-	movaps	0x34(%rsi), %xmm5
-	movaps	0x44(%rsi), %xmm6
-	movaps	0x54(%rsi), %xmm7
-	movaps	0x64(%rsi), %xmm8
-	movaps	0x74(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$12, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$12, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$12, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$12, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$12, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$12, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$12, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_12)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_12_bwd):
-	movaps	-0x0c(%rsi), %xmm1
-
-	movaps	-0x1c(%rsi), %xmm2
-	palignr	$12, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x2c(%rsi), %xmm3
-	palignr	$12, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x3c(%rsi), %xmm4
-	palignr	$12, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x4c(%rsi), %xmm5
-	palignr	$12, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x5c(%rsi), %xmm6
-	palignr	$12, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x6c(%rsi), %xmm7
-	palignr	$12, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x7c(%rsi), %xmm8
-	palignr	$12, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x8c(%rsi), %xmm9
-	palignr	$12, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_12_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_13):
-	sub	$0x80, %rdx
-	movaps	-0x0d(%rsi), %xmm1
-	movaps	0x03(%rsi), %xmm2
-	movaps	0x13(%rsi), %xmm3
-	movaps	0x23(%rsi), %xmm4
-	movaps	0x33(%rsi), %xmm5
-	movaps	0x43(%rsi), %xmm6
-	movaps	0x53(%rsi), %xmm7
-	movaps	0x63(%rsi), %xmm8
-	movaps	0x73(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$13, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$13, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$13, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$13, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$13, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$13, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$13, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_13)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_13_bwd):
-	movaps	-0x0d(%rsi), %xmm1
-
-	movaps	-0x1d(%rsi), %xmm2
-	palignr	$13, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x2d(%rsi), %xmm3
-	palignr	$13, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x3d(%rsi), %xmm4
-	palignr	$13, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x4d(%rsi), %xmm5
-	palignr	$13, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x5d(%rsi), %xmm6
-	palignr	$13, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x6d(%rsi), %xmm7
-	palignr	$13, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x7d(%rsi), %xmm8
-	palignr	$13, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x8d(%rsi), %xmm9
-	palignr	$13, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_13_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_14):
-	sub	$0x80, %rdx
-	movaps	-0x0e(%rsi), %xmm1
-	movaps	0x02(%rsi), %xmm2
-	movaps	0x12(%rsi), %xmm3
-	movaps	0x22(%rsi), %xmm4
-	movaps	0x32(%rsi), %xmm5
-	movaps	0x42(%rsi), %xmm6
-	movaps	0x52(%rsi), %xmm7
-	movaps	0x62(%rsi), %xmm8
-	movaps	0x72(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$14, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$14, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$14, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$14, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$14, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$14, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$14, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_14)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_14_bwd):
-	movaps	-0x0e(%rsi), %xmm1
-
-	movaps	-0x1e(%rsi), %xmm2
-	palignr	$14, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x2e(%rsi), %xmm3
-	palignr	$14, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x3e(%rsi), %xmm4
-	palignr	$14, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x4e(%rsi), %xmm5
-	palignr	$14, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x5e(%rsi), %xmm6
-	palignr	$14, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x6e(%rsi), %xmm7
-	palignr	$14, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x7e(%rsi), %xmm8
-	palignr	$14, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x8e(%rsi), %xmm9
-	palignr	$14, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_14_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(shl_15):
-	sub	$0x80, %rdx
-	movaps	-0x0f(%rsi), %xmm1
-	movaps	0x01(%rsi), %xmm2
-	movaps	0x11(%rsi), %xmm3
-	movaps	0x21(%rsi), %xmm4
-	movaps	0x31(%rsi), %xmm5
-	movaps	0x41(%rsi), %xmm6
-	movaps	0x51(%rsi), %xmm7
-	movaps	0x61(%rsi), %xmm8
-	movaps	0x71(%rsi), %xmm9
-	lea	0x80(%rsi), %rsi
-	palignr	$15, %xmm8, %xmm9
-	movaps	%xmm9, 0x70(%rdi)
-	palignr	$15, %xmm7, %xmm8
-	movaps	%xmm8, 0x60(%rdi)
-	palignr	$15, %xmm6, %xmm7
-	movaps	%xmm7, 0x50(%rdi)
-	palignr	$15, %xmm5, %xmm6
-	movaps	%xmm6, 0x40(%rdi)
-	palignr	$15, %xmm4, %xmm5
-	movaps	%xmm5, 0x30(%rdi)
-	palignr	$15, %xmm3, %xmm4
-	movaps	%xmm4, 0x20(%rdi)
-	palignr	$15, %xmm2, %xmm3
-	movaps	%xmm3, 0x10(%rdi)
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm2, (%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(shl_15)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(shl_15_bwd):
-	movaps	-0x0f(%rsi), %xmm1
-
-	movaps	-0x1f(%rsi), %xmm2
-	palignr	$15, %xmm2, %xmm1
-	movaps	%xmm1, -0x10(%rdi)
-
-	movaps	-0x2f(%rsi), %xmm3
-	palignr	$15, %xmm3, %xmm2
-	movaps	%xmm2, -0x20(%rdi)
-
-	movaps	-0x3f(%rsi), %xmm4
-	palignr	$15, %xmm4, %xmm3
-	movaps	%xmm3, -0x30(%rdi)
-
-	movaps	-0x4f(%rsi), %xmm5
-	palignr	$15, %xmm5, %xmm4
-	movaps	%xmm4, -0x40(%rdi)
-
-	movaps	-0x5f(%rsi), %xmm6
-	palignr	$15, %xmm6, %xmm5
-	movaps	%xmm5, -0x50(%rdi)
-
-	movaps	-0x6f(%rsi), %xmm7
-	palignr	$15, %xmm7, %xmm6
-	movaps	%xmm6, -0x60(%rdi)
-
-	movaps	-0x7f(%rsi), %xmm8
-	palignr	$15, %xmm8, %xmm7
-	movaps	%xmm7, -0x70(%rdi)
-
-	movaps	-0x8f(%rsi), %xmm9
-	palignr	$15, %xmm9, %xmm8
-	movaps	%xmm8, -0x80(%rdi)
-
-	sub	$0x80, %rdx
-	lea	-0x80(%rdi), %rdi
-	lea	-0x80(%rsi), %rsi
-	jae	L(shl_15_bwd)
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rdi
-	sub	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(gobble_mem_fwd):
-	movdqu	(%rsi), %xmm1
-	movdqu	%xmm0, (%r8)
-	movdqa	%xmm1, (%rdi)
-	sub	$16, %rdx
-	add	$16, %rsi
-	add	$16, %rdi
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
-	mov	%rsi, %r9
-	sub	%rdi, %r9
-	cmp	%rdx, %r9
-	jae	L(memmove_is_memcpy_fwd)
-	cmp	%rcx, %r9
-	jbe	L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-	cmp	%rcx, %rdx
-	ja	L(bigger_in_fwd)
-	mov	%rdx, %rcx
-L(bigger_in_fwd):
-	sub	%rcx, %rdx
-	cmp	$0x1000, %rdx
-	jbe	L(ll_cache_copy_fwd)
-
-	mov	%rcx, %r9
-	shl	$3, %r9
-	cmp	%r9, %rdx
-	jbe	L(2steps_copy_fwd)
-	add	%rcx, %rdx
-	xor	%rcx, %rcx
-L(2steps_copy_fwd):
-	sub	$0x80, %rdx
-L(gobble_mem_fwd_loop):
-	sub	$0x80, %rdx
-	prefetcht0 0x200(%rsi)
-	prefetcht0 0x300(%rsi)
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	movdqu	0x40(%rsi), %xmm4
-	movdqu	0x50(%rsi), %xmm5
-	movdqu	0x60(%rsi), %xmm6
-	movdqu	0x70(%rsi), %xmm7
-	lfence
-	movntdq	%xmm0, (%rdi)
-	movntdq	%xmm1, 0x10(%rdi)
-	movntdq	%xmm2, 0x20(%rdi)
-	movntdq	%xmm3, 0x30(%rdi)
-	movntdq	%xmm4, 0x40(%rdi)
-	movntdq	%xmm5, 0x50(%rdi)
-	movntdq	%xmm6, 0x60(%rdi)
-	movntdq	%xmm7, 0x70(%rdi)
-	lea	0x80(%rsi), %rsi
-	lea	0x80(%rdi), %rdi
-	jae	L(gobble_mem_fwd_loop)
-	sfence
-	cmp	$0x80, %rcx
-	jb	L(gobble_mem_fwd_end)
-	add	$0x80, %rdx
-L(ll_cache_copy_fwd):
-	add	%rcx, %rdx
-L(ll_cache_copy_fwd_start):
-	sub	$0x80, %rdx
-L(gobble_ll_loop_fwd):
-	prefetchnta 0x1c0(%rsi)
-	prefetchnta 0x280(%rsi)
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x280(%rdi)
-	sub	$0x80, %rdx
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	movdqu	0x40(%rsi), %xmm4
-	movdqu	0x50(%rsi), %xmm5
-	movdqu	0x60(%rsi), %xmm6
-	movdqu	0x70(%rsi), %xmm7
-	movdqa	%xmm0, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-	movdqa	%xmm2, 0x20(%rdi)
-	movdqa	%xmm3, 0x30(%rdi)
-	movdqa	%xmm4, 0x40(%rdi)
-	movdqa	%xmm5, 0x50(%rdi)
-	movdqa	%xmm6, 0x60(%rdi)
-	movdqa	%xmm7, 0x70(%rdi)
-	lea	0x80(%rsi), %rsi
-	lea	0x80(%rdi), %rdi
-	jae	L(gobble_ll_loop_fwd)
-L(gobble_mem_fwd_end):
-	add	$0x80, %rdx
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
-	.p2align 4
-L(gobble_mem_bwd):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-
-	movdqu	-16(%rsi), %xmm0
-	lea	-16(%rdi), %r8
-	mov	%rdi, %r9
-	and	$-16, %rdi
-	sub	%rdi, %r9
-	sub	%r9, %rsi
-	sub	%r9, %rdx
-
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
-	mov	%rdi, %r9
-	sub	%rsi, %r9
-	cmp	%rdx, %r9
-	jae	L(memmove_is_memcpy_bwd)
-	cmp	%rcx, %r9
-	jbe	L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-	cmp	%rcx, %rdx
-	ja	L(bigger)
-	mov	%rdx, %rcx
-L(bigger):
-	sub	%rcx, %rdx
-	cmp	$0x1000, %rdx
-	jbe	L(ll_cache_copy)
-
-	mov	%rcx, %r9
-	shl	$3, %r9
-	cmp	%r9, %rdx
-	jbe	L(2steps_copy)
-	add	%rcx, %rdx
-	xor	%rcx, %rcx
-L(2steps_copy):
-	sub	$0x80, %rdx
-L(gobble_mem_bwd_loop):
-	sub	$0x80, %rdx
-	prefetcht0 -0x200(%rsi)
-	prefetcht0 -0x300(%rsi)
-	movdqu	-0x10(%rsi), %xmm1
-	movdqu	-0x20(%rsi), %xmm2
-	movdqu	-0x30(%rsi), %xmm3
-	movdqu	-0x40(%rsi), %xmm4
-	movdqu	-0x50(%rsi), %xmm5
-	movdqu	-0x60(%rsi), %xmm6
-	movdqu	-0x70(%rsi), %xmm7
-	movdqu	-0x80(%rsi), %xmm8
-	lfence
-	movntdq	%xmm1, -0x10(%rdi)
-	movntdq	%xmm2, -0x20(%rdi)
-	movntdq	%xmm3, -0x30(%rdi)
-	movntdq	%xmm4, -0x40(%rdi)
-	movntdq	%xmm5, -0x50(%rdi)
-	movntdq	%xmm6, -0x60(%rdi)
-	movntdq	%xmm7, -0x70(%rdi)
-	movntdq	%xmm8, -0x80(%rdi)
-	lea	-0x80(%rsi), %rsi
-	lea	-0x80(%rdi), %rdi
-	jae	L(gobble_mem_bwd_loop)
-	sfence
-	cmp	$0x80, %rcx
-	jb	L(gobble_mem_bwd_end)
-	add	$0x80, %rdx
-L(ll_cache_copy):
-	add	%rcx, %rdx
-L(ll_cache_copy_bwd_start):
-	sub	$0x80, %rdx
-L(gobble_ll_loop):
-	prefetchnta -0x1c0(%rsi)
-	prefetchnta -0x280(%rsi)
-	prefetchnta -0x1c0(%rdi)
-	prefetchnta -0x280(%rdi)
-	sub	$0x80, %rdx
-	movdqu	-0x10(%rsi), %xmm1
-	movdqu	-0x20(%rsi), %xmm2
-	movdqu	-0x30(%rsi), %xmm3
-	movdqu	-0x40(%rsi), %xmm4
-	movdqu	-0x50(%rsi), %xmm5
-	movdqu	-0x60(%rsi), %xmm6
-	movdqu	-0x70(%rsi), %xmm7
-	movdqu	-0x80(%rsi), %xmm8
-	movdqa	%xmm1, -0x10(%rdi)
-	movdqa	%xmm2, -0x20(%rdi)
-	movdqa	%xmm3, -0x30(%rdi)
-	movdqa	%xmm4, -0x40(%rdi)
-	movdqa	%xmm5, -0x50(%rdi)
-	movdqa	%xmm6, -0x60(%rdi)
-	movdqa	%xmm7, -0x70(%rdi)
-	movdqa	%xmm8, -0x80(%rdi)
-	lea	-0x80(%rsi), %rsi
-	lea	-0x80(%rdi), %rdi
-	jae	L(gobble_ll_loop)
-L(gobble_mem_bwd_end):
-	movdqu	%xmm0, (%r8)
-	add	$0x80, %rdx
-	sub	%rdx, %rsi
-	sub	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
-	.p2align 4
-L(fwd_write_128bytes):
-	lddqu	-128(%rsi), %xmm0
-	movdqu	%xmm0, -128(%rdi)
-L(fwd_write_112bytes):
-	lddqu	-112(%rsi), %xmm0
-	movdqu	%xmm0, -112(%rdi)
-L(fwd_write_96bytes):
-	lddqu	-96(%rsi), %xmm0
-	movdqu	%xmm0, -96(%rdi)
-L(fwd_write_80bytes):
-	lddqu	-80(%rsi), %xmm0
-	movdqu	%xmm0, -80(%rdi)
-L(fwd_write_64bytes):
-	lddqu	-64(%rsi), %xmm0
-	movdqu	%xmm0, -64(%rdi)
-L(fwd_write_48bytes):
-	lddqu	-48(%rsi), %xmm0
-	movdqu	%xmm0, -48(%rdi)
-L(fwd_write_32bytes):
-	lddqu	-32(%rsi), %xmm0
-	movdqu	%xmm0, -32(%rdi)
-L(fwd_write_16bytes):
-	lddqu	-16(%rsi), %xmm0
-	movdqu	%xmm0, -16(%rdi)
-L(fwd_write_0bytes):
-	ret
-
-
-	.p2align 4
-L(fwd_write_143bytes):
-	lddqu	-143(%rsi), %xmm0
-	movdqu	%xmm0, -143(%rdi)
-L(fwd_write_127bytes):
-	lddqu	-127(%rsi), %xmm0
-	movdqu	%xmm0, -127(%rdi)
-L(fwd_write_111bytes):
-	lddqu	-111(%rsi), %xmm0
-	movdqu	%xmm0, -111(%rdi)
-L(fwd_write_95bytes):
-	lddqu	-95(%rsi), %xmm0
-	movdqu	%xmm0, -95(%rdi)
-L(fwd_write_79bytes):
-	lddqu	-79(%rsi), %xmm0
-	movdqu	%xmm0, -79(%rdi)
-L(fwd_write_63bytes):
-	lddqu	-63(%rsi), %xmm0
-	movdqu	%xmm0, -63(%rdi)
-L(fwd_write_47bytes):
-	lddqu	-47(%rsi), %xmm0
-	movdqu	%xmm0, -47(%rdi)
-L(fwd_write_31bytes):
-	lddqu	-31(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -31(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_15bytes):
-	mov	-15(%rsi), %rdx
-	mov	-8(%rsi), %rcx
-	mov	%rdx, -15(%rdi)
-	mov	%rcx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_142bytes):
-	lddqu	-142(%rsi), %xmm0
-	movdqu	%xmm0, -142(%rdi)
-L(fwd_write_126bytes):
-	lddqu	-126(%rsi), %xmm0
-	movdqu	%xmm0, -126(%rdi)
-L(fwd_write_110bytes):
-	lddqu	-110(%rsi), %xmm0
-	movdqu	%xmm0, -110(%rdi)
-L(fwd_write_94bytes):
-	lddqu	-94(%rsi), %xmm0
-	movdqu	%xmm0, -94(%rdi)
-L(fwd_write_78bytes):
-	lddqu	-78(%rsi), %xmm0
-	movdqu	%xmm0, -78(%rdi)
-L(fwd_write_62bytes):
-	lddqu	-62(%rsi), %xmm0
-	movdqu	%xmm0, -62(%rdi)
-L(fwd_write_46bytes):
-	lddqu	-46(%rsi), %xmm0
-	movdqu	%xmm0, -46(%rdi)
-L(fwd_write_30bytes):
-	lddqu	-30(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -30(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_14bytes):
-	mov	-14(%rsi), %rdx
-	mov	-8(%rsi), %rcx
-	mov	%rdx, -14(%rdi)
-	mov	%rcx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_141bytes):
-	lddqu	-141(%rsi), %xmm0
-	movdqu	%xmm0, -141(%rdi)
-L(fwd_write_125bytes):
-	lddqu	-125(%rsi), %xmm0
-	movdqu	%xmm0, -125(%rdi)
-L(fwd_write_109bytes):
-	lddqu	-109(%rsi), %xmm0
-	movdqu	%xmm0, -109(%rdi)
-L(fwd_write_93bytes):
-	lddqu	-93(%rsi), %xmm0
-	movdqu	%xmm0, -93(%rdi)
-L(fwd_write_77bytes):
-	lddqu	-77(%rsi), %xmm0
-	movdqu	%xmm0, -77(%rdi)
-L(fwd_write_61bytes):
-	lddqu	-61(%rsi), %xmm0
-	movdqu	%xmm0, -61(%rdi)
-L(fwd_write_45bytes):
-	lddqu	-45(%rsi), %xmm0
-	movdqu	%xmm0, -45(%rdi)
-L(fwd_write_29bytes):
-	lddqu	-29(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -29(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_13bytes):
-	mov	-13(%rsi), %rdx
-	mov	-8(%rsi), %rcx
-	mov	%rdx, -13(%rdi)
-	mov	%rcx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_140bytes):
-	lddqu	-140(%rsi), %xmm0
-	movdqu	%xmm0, -140(%rdi)
-L(fwd_write_124bytes):
-	lddqu	-124(%rsi), %xmm0
-	movdqu	%xmm0, -124(%rdi)
-L(fwd_write_108bytes):
-	lddqu	-108(%rsi), %xmm0
-	movdqu	%xmm0, -108(%rdi)
-L(fwd_write_92bytes):
-	lddqu	-92(%rsi), %xmm0
-	movdqu	%xmm0, -92(%rdi)
-L(fwd_write_76bytes):
-	lddqu	-76(%rsi), %xmm0
-	movdqu	%xmm0, -76(%rdi)
-L(fwd_write_60bytes):
-	lddqu	-60(%rsi), %xmm0
-	movdqu	%xmm0, -60(%rdi)
-L(fwd_write_44bytes):
-	lddqu	-44(%rsi), %xmm0
-	movdqu	%xmm0, -44(%rdi)
-L(fwd_write_28bytes):
-	lddqu	-28(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -28(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_12bytes):
-	mov	-12(%rsi), %rdx
-	mov	-4(%rsi), %ecx
-	mov	%rdx, -12(%rdi)
-	mov	%ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_139bytes):
-	lddqu	-139(%rsi), %xmm0
-	movdqu	%xmm0, -139(%rdi)
-L(fwd_write_123bytes):
-	lddqu	-123(%rsi), %xmm0
-	movdqu	%xmm0, -123(%rdi)
-L(fwd_write_107bytes):
-	lddqu	-107(%rsi), %xmm0
-	movdqu	%xmm0, -107(%rdi)
-L(fwd_write_91bytes):
-	lddqu	-91(%rsi), %xmm0
-	movdqu	%xmm0, -91(%rdi)
-L(fwd_write_75bytes):
-	lddqu	-75(%rsi), %xmm0
-	movdqu	%xmm0, -75(%rdi)
-L(fwd_write_59bytes):
-	lddqu	-59(%rsi), %xmm0
-	movdqu	%xmm0, -59(%rdi)
-L(fwd_write_43bytes):
-	lddqu	-43(%rsi), %xmm0
-	movdqu	%xmm0, -43(%rdi)
-L(fwd_write_27bytes):
-	lddqu	-27(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -27(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_11bytes):
-	mov	-11(%rsi), %rdx
-	mov	-4(%rsi), %ecx
-	mov	%rdx, -11(%rdi)
-	mov	%ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_138bytes):
-	lddqu	-138(%rsi), %xmm0
-	movdqu	%xmm0, -138(%rdi)
-L(fwd_write_122bytes):
-	lddqu	-122(%rsi), %xmm0
-	movdqu	%xmm0, -122(%rdi)
-L(fwd_write_106bytes):
-	lddqu	-106(%rsi), %xmm0
-	movdqu	%xmm0, -106(%rdi)
-L(fwd_write_90bytes):
-	lddqu	-90(%rsi), %xmm0
-	movdqu	%xmm0, -90(%rdi)
-L(fwd_write_74bytes):
-	lddqu	-74(%rsi), %xmm0
-	movdqu	%xmm0, -74(%rdi)
-L(fwd_write_58bytes):
-	lddqu	-58(%rsi), %xmm0
-	movdqu	%xmm0, -58(%rdi)
-L(fwd_write_42bytes):
-	lddqu	-42(%rsi), %xmm0
-	movdqu	%xmm0, -42(%rdi)
-L(fwd_write_26bytes):
-	lddqu	-26(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -26(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_10bytes):
-	mov	-10(%rsi), %rdx
-	mov	-4(%rsi), %ecx
-	mov	%rdx, -10(%rdi)
-	mov	%ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_137bytes):
-	lddqu	-137(%rsi), %xmm0
-	movdqu	%xmm0, -137(%rdi)
-L(fwd_write_121bytes):
-	lddqu	-121(%rsi), %xmm0
-	movdqu	%xmm0, -121(%rdi)
-L(fwd_write_105bytes):
-	lddqu	-105(%rsi), %xmm0
-	movdqu	%xmm0, -105(%rdi)
-L(fwd_write_89bytes):
-	lddqu	-89(%rsi), %xmm0
-	movdqu	%xmm0, -89(%rdi)
-L(fwd_write_73bytes):
-	lddqu	-73(%rsi), %xmm0
-	movdqu	%xmm0, -73(%rdi)
-L(fwd_write_57bytes):
-	lddqu	-57(%rsi), %xmm0
-	movdqu	%xmm0, -57(%rdi)
-L(fwd_write_41bytes):
-	lddqu	-41(%rsi), %xmm0
-	movdqu	%xmm0, -41(%rdi)
-L(fwd_write_25bytes):
-	lddqu	-25(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -25(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_9bytes):
-	mov	-9(%rsi), %rdx
-	mov	-4(%rsi), %ecx
-	mov	%rdx, -9(%rdi)
-	mov	%ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_136bytes):
-	lddqu	-136(%rsi), %xmm0
-	movdqu	%xmm0, -136(%rdi)
-L(fwd_write_120bytes):
-	lddqu	-120(%rsi), %xmm0
-	movdqu	%xmm0, -120(%rdi)
-L(fwd_write_104bytes):
-	lddqu	-104(%rsi), %xmm0
-	movdqu	%xmm0, -104(%rdi)
-L(fwd_write_88bytes):
-	lddqu	-88(%rsi), %xmm0
-	movdqu	%xmm0, -88(%rdi)
-L(fwd_write_72bytes):
-	lddqu	-72(%rsi), %xmm0
-	movdqu	%xmm0, -72(%rdi)
-L(fwd_write_56bytes):
-	lddqu	-56(%rsi), %xmm0
-	movdqu	%xmm0, -56(%rdi)
-L(fwd_write_40bytes):
-	lddqu	-40(%rsi), %xmm0
-	movdqu	%xmm0, -40(%rdi)
-L(fwd_write_24bytes):
-	lddqu	-24(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -24(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_8bytes):
-	mov	-8(%rsi), %rdx
-	mov	%rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_135bytes):
-	lddqu	-135(%rsi), %xmm0
-	movdqu	%xmm0, -135(%rdi)
-L(fwd_write_119bytes):
-	lddqu	-119(%rsi), %xmm0
-	movdqu	%xmm0, -119(%rdi)
-L(fwd_write_103bytes):
-	lddqu	-103(%rsi), %xmm0
-	movdqu	%xmm0, -103(%rdi)
-L(fwd_write_87bytes):
-	lddqu	-87(%rsi), %xmm0
-	movdqu	%xmm0, -87(%rdi)
-L(fwd_write_71bytes):
-	lddqu	-71(%rsi), %xmm0
-	movdqu	%xmm0, -71(%rdi)
-L(fwd_write_55bytes):
-	lddqu	-55(%rsi), %xmm0
-	movdqu	%xmm0, -55(%rdi)
-L(fwd_write_39bytes):
-	lddqu	-39(%rsi), %xmm0
-	movdqu	%xmm0, -39(%rdi)
-L(fwd_write_23bytes):
-	lddqu	-23(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -23(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_7bytes):
-	mov	-7(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	%edx, -7(%rdi)
-	mov	%ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_134bytes):
-	lddqu	-134(%rsi), %xmm0
-	movdqu	%xmm0, -134(%rdi)
-L(fwd_write_118bytes):
-	lddqu	-118(%rsi), %xmm0
-	movdqu	%xmm0, -118(%rdi)
-L(fwd_write_102bytes):
-	lddqu	-102(%rsi), %xmm0
-	movdqu	%xmm0, -102(%rdi)
-L(fwd_write_86bytes):
-	lddqu	-86(%rsi), %xmm0
-	movdqu	%xmm0, -86(%rdi)
-L(fwd_write_70bytes):
-	lddqu	-70(%rsi), %xmm0
-	movdqu	%xmm0, -70(%rdi)
-L(fwd_write_54bytes):
-	lddqu	-54(%rsi), %xmm0
-	movdqu	%xmm0, -54(%rdi)
-L(fwd_write_38bytes):
-	lddqu	-38(%rsi), %xmm0
-	movdqu	%xmm0, -38(%rdi)
-L(fwd_write_22bytes):
-	lddqu	-22(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -22(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_6bytes):
-	mov	-6(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	%edx, -6(%rdi)
-	mov	%ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_133bytes):
-	lddqu	-133(%rsi), %xmm0
-	movdqu	%xmm0, -133(%rdi)
-L(fwd_write_117bytes):
-	lddqu	-117(%rsi), %xmm0
-	movdqu	%xmm0, -117(%rdi)
-L(fwd_write_101bytes):
-	lddqu	-101(%rsi), %xmm0
-	movdqu	%xmm0, -101(%rdi)
-L(fwd_write_85bytes):
-	lddqu	-85(%rsi), %xmm0
-	movdqu	%xmm0, -85(%rdi)
-L(fwd_write_69bytes):
-	lddqu	-69(%rsi), %xmm0
-	movdqu	%xmm0, -69(%rdi)
-L(fwd_write_53bytes):
-	lddqu	-53(%rsi), %xmm0
-	movdqu	%xmm0, -53(%rdi)
-L(fwd_write_37bytes):
-	lddqu	-37(%rsi), %xmm0
-	movdqu	%xmm0, -37(%rdi)
-L(fwd_write_21bytes):
-	lddqu	-21(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -21(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_5bytes):
-	mov	-5(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	%edx, -5(%rdi)
-	mov	%ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_132bytes):
-	lddqu	-132(%rsi), %xmm0
-	movdqu	%xmm0, -132(%rdi)
-L(fwd_write_116bytes):
-	lddqu	-116(%rsi), %xmm0
-	movdqu	%xmm0, -116(%rdi)
-L(fwd_write_100bytes):
-	lddqu	-100(%rsi), %xmm0
-	movdqu	%xmm0, -100(%rdi)
-L(fwd_write_84bytes):
-	lddqu	-84(%rsi), %xmm0
-	movdqu	%xmm0, -84(%rdi)
-L(fwd_write_68bytes):
-	lddqu	-68(%rsi), %xmm0
-	movdqu	%xmm0, -68(%rdi)
-L(fwd_write_52bytes):
-	lddqu	-52(%rsi), %xmm0
-	movdqu	%xmm0, -52(%rdi)
-L(fwd_write_36bytes):
-	lddqu	-36(%rsi), %xmm0
-	movdqu	%xmm0, -36(%rdi)
-L(fwd_write_20bytes):
-	lddqu	-20(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -20(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_4bytes):
-	mov	-4(%rsi), %edx
-	mov	%edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_131bytes):
-	lddqu	-131(%rsi), %xmm0
-	movdqu	%xmm0, -131(%rdi)
-L(fwd_write_115bytes):
-	lddqu	-115(%rsi), %xmm0
-	movdqu	%xmm0, -115(%rdi)
-L(fwd_write_99bytes):
-	lddqu	-99(%rsi), %xmm0
-	movdqu	%xmm0, -99(%rdi)
-L(fwd_write_83bytes):
-	lddqu	-83(%rsi), %xmm0
-	movdqu	%xmm0, -83(%rdi)
-L(fwd_write_67bytes):
-	lddqu	-67(%rsi), %xmm0
-	movdqu	%xmm0, -67(%rdi)
-L(fwd_write_51bytes):
-	lddqu	-51(%rsi), %xmm0
-	movdqu	%xmm0, -51(%rdi)
-L(fwd_write_35bytes):
-	lddqu	-35(%rsi), %xmm0
-	movdqu	%xmm0, -35(%rdi)
-L(fwd_write_19bytes):
-	lddqu	-19(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -19(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_3bytes):
-	mov	-3(%rsi), %dx
-	mov	-2(%rsi), %cx
-	mov	%dx, -3(%rdi)
-	mov	%cx, -2(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_130bytes):
-	lddqu	-130(%rsi), %xmm0
-	movdqu	%xmm0, -130(%rdi)
-L(fwd_write_114bytes):
-	lddqu	-114(%rsi), %xmm0
-	movdqu	%xmm0, -114(%rdi)
-L(fwd_write_98bytes):
-	lddqu	-98(%rsi), %xmm0
-	movdqu	%xmm0, -98(%rdi)
-L(fwd_write_82bytes):
-	lddqu	-82(%rsi), %xmm0
-	movdqu	%xmm0, -82(%rdi)
-L(fwd_write_66bytes):
-	lddqu	-66(%rsi), %xmm0
-	movdqu	%xmm0, -66(%rdi)
-L(fwd_write_50bytes):
-	lddqu	-50(%rsi), %xmm0
-	movdqu	%xmm0, -50(%rdi)
-L(fwd_write_34bytes):
-	lddqu	-34(%rsi), %xmm0
-	movdqu	%xmm0, -34(%rdi)
-L(fwd_write_18bytes):
-	lddqu	-18(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -18(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_2bytes):
-	movzwl	-2(%rsi), %edx
-	mov	%dx, -2(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_129bytes):
-	lddqu	-129(%rsi), %xmm0
-	movdqu	%xmm0, -129(%rdi)
-L(fwd_write_113bytes):
-	lddqu	-113(%rsi), %xmm0
-	movdqu	%xmm0, -113(%rdi)
-L(fwd_write_97bytes):
-	lddqu	-97(%rsi), %xmm0
-	movdqu	%xmm0, -97(%rdi)
-L(fwd_write_81bytes):
-	lddqu	-81(%rsi), %xmm0
-	movdqu	%xmm0, -81(%rdi)
-L(fwd_write_65bytes):
-	lddqu	-65(%rsi), %xmm0
-	movdqu	%xmm0, -65(%rdi)
-L(fwd_write_49bytes):
-	lddqu	-49(%rsi), %xmm0
-	movdqu	%xmm0, -49(%rdi)
-L(fwd_write_33bytes):
-	lddqu	-33(%rsi), %xmm0
-	movdqu	%xmm0, -33(%rdi)
-L(fwd_write_17bytes):
-	lddqu	-17(%rsi), %xmm0
-	lddqu	-16(%rsi), %xmm1
-	movdqu	%xmm0, -17(%rdi)
-	movdqu	%xmm1, -16(%rdi)
-	ret
-
-	.p2align 4
-L(fwd_write_1bytes):
-	movzbl	-1(%rsi), %edx
-	mov	%dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_128bytes):
-	lddqu	112(%rsi), %xmm0
-	movdqu	%xmm0, 112(%rdi)
-L(bwd_write_112bytes):
-	lddqu	96(%rsi), %xmm0
-	movdqu	%xmm0, 96(%rdi)
-L(bwd_write_96bytes):
-	lddqu	80(%rsi), %xmm0
-	movdqu	%xmm0, 80(%rdi)
-L(bwd_write_80bytes):
-	lddqu	64(%rsi), %xmm0
-	movdqu	%xmm0, 64(%rdi)
-L(bwd_write_64bytes):
-	lddqu	48(%rsi), %xmm0
-	movdqu	%xmm0, 48(%rdi)
-L(bwd_write_48bytes):
-	lddqu	32(%rsi), %xmm0
-	movdqu	%xmm0, 32(%rdi)
-L(bwd_write_32bytes):
-	lddqu	16(%rsi), %xmm0
-	movdqu	%xmm0, 16(%rdi)
-L(bwd_write_16bytes):
-	lddqu	(%rsi), %xmm0
-	movdqu	%xmm0, (%rdi)
-L(bwd_write_0bytes):
-	ret
-
-	.p2align 4
-L(bwd_write_143bytes):
-	lddqu	127(%rsi), %xmm0
-	movdqu	%xmm0, 127(%rdi)
-L(bwd_write_127bytes):
-	lddqu	111(%rsi), %xmm0
-	movdqu	%xmm0, 111(%rdi)
-L(bwd_write_111bytes):
-	lddqu	95(%rsi), %xmm0
-	movdqu	%xmm0, 95(%rdi)
-L(bwd_write_95bytes):
-	lddqu	79(%rsi), %xmm0
-	movdqu	%xmm0, 79(%rdi)
-L(bwd_write_79bytes):
-	lddqu	63(%rsi), %xmm0
-	movdqu	%xmm0, 63(%rdi)
-L(bwd_write_63bytes):
-	lddqu	47(%rsi), %xmm0
-	movdqu	%xmm0, 47(%rdi)
-L(bwd_write_47bytes):
-	lddqu	31(%rsi), %xmm0
-	movdqu	%xmm0, 31(%rdi)
-L(bwd_write_31bytes):
-	lddqu	15(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 15(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-
-	.p2align 4
-L(bwd_write_15bytes):
-	mov	7(%rsi), %rdx
-	mov	(%rsi), %rcx
-	mov	%rdx, 7(%rdi)
-	mov	%rcx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_142bytes):
-	lddqu	126(%rsi), %xmm0
-	movdqu	%xmm0, 126(%rdi)
-L(bwd_write_126bytes):
-	lddqu	110(%rsi), %xmm0
-	movdqu	%xmm0, 110(%rdi)
-L(bwd_write_110bytes):
-	lddqu	94(%rsi), %xmm0
-	movdqu	%xmm0, 94(%rdi)
-L(bwd_write_94bytes):
-	lddqu	78(%rsi), %xmm0
-	movdqu	%xmm0, 78(%rdi)
-L(bwd_write_78bytes):
-	lddqu	62(%rsi), %xmm0
-	movdqu	%xmm0, 62(%rdi)
-L(bwd_write_62bytes):
-	lddqu	46(%rsi), %xmm0
-	movdqu	%xmm0, 46(%rdi)
-L(bwd_write_46bytes):
-	lddqu	30(%rsi), %xmm0
-	movdqu	%xmm0, 30(%rdi)
-L(bwd_write_30bytes):
-	lddqu	14(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 14(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_14bytes):
-	mov	6(%rsi), %rdx
-	mov	(%rsi), %rcx
-	mov	%rdx, 6(%rdi)
-	mov	%rcx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_141bytes):
-	lddqu	125(%rsi), %xmm0
-	movdqu	%xmm0, 125(%rdi)
-L(bwd_write_125bytes):
-	lddqu	109(%rsi), %xmm0
-	movdqu	%xmm0, 109(%rdi)
-L(bwd_write_109bytes):
-	lddqu	93(%rsi), %xmm0
-	movdqu	%xmm0, 93(%rdi)
-L(bwd_write_93bytes):
-	lddqu	77(%rsi), %xmm0
-	movdqu	%xmm0, 77(%rdi)
-L(bwd_write_77bytes):
-	lddqu	61(%rsi), %xmm0
-	movdqu	%xmm0, 61(%rdi)
-L(bwd_write_61bytes):
-	lddqu	45(%rsi), %xmm0
-	movdqu	%xmm0, 45(%rdi)
-L(bwd_write_45bytes):
-	lddqu	29(%rsi), %xmm0
-	movdqu	%xmm0, 29(%rdi)
-L(bwd_write_29bytes):
-	lddqu	13(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 13(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_13bytes):
-	mov	5(%rsi), %rdx
-	mov	(%rsi), %rcx
-	mov	%rdx, 5(%rdi)
-	mov	%rcx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_140bytes):
-	lddqu	124(%rsi), %xmm0
-	movdqu	%xmm0, 124(%rdi)
-L(bwd_write_124bytes):
-	lddqu	108(%rsi), %xmm0
-	movdqu	%xmm0, 108(%rdi)
-L(bwd_write_108bytes):
-	lddqu	92(%rsi), %xmm0
-	movdqu	%xmm0, 92(%rdi)
-L(bwd_write_92bytes):
-	lddqu	76(%rsi), %xmm0
-	movdqu	%xmm0, 76(%rdi)
-L(bwd_write_76bytes):
-	lddqu	60(%rsi), %xmm0
-	movdqu	%xmm0, 60(%rdi)
-L(bwd_write_60bytes):
-	lddqu	44(%rsi), %xmm0
-	movdqu	%xmm0, 44(%rdi)
-L(bwd_write_44bytes):
-	lddqu	28(%rsi), %xmm0
-	movdqu	%xmm0, 28(%rdi)
-L(bwd_write_28bytes):
-	lddqu	12(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 12(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_12bytes):
-	mov	4(%rsi), %rdx
-	mov	(%rsi), %rcx
-	mov	%rdx, 4(%rdi)
-	mov	%rcx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_139bytes):
-	lddqu	123(%rsi), %xmm0
-	movdqu	%xmm0, 123(%rdi)
-L(bwd_write_123bytes):
-	lddqu	107(%rsi), %xmm0
-	movdqu	%xmm0, 107(%rdi)
-L(bwd_write_107bytes):
-	lddqu	91(%rsi), %xmm0
-	movdqu	%xmm0, 91(%rdi)
-L(bwd_write_91bytes):
-	lddqu	75(%rsi), %xmm0
-	movdqu	%xmm0, 75(%rdi)
-L(bwd_write_75bytes):
-	lddqu	59(%rsi), %xmm0
-	movdqu	%xmm0, 59(%rdi)
-L(bwd_write_59bytes):
-	lddqu	43(%rsi), %xmm0
-	movdqu	%xmm0, 43(%rdi)
-L(bwd_write_43bytes):
-	lddqu	27(%rsi), %xmm0
-	movdqu	%xmm0, 27(%rdi)
-L(bwd_write_27bytes):
-	lddqu	11(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 11(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_11bytes):
-	mov	3(%rsi), %rdx
-	mov	(%rsi), %rcx
-	mov	%rdx, 3(%rdi)
-	mov	%rcx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_138bytes):
-	lddqu	122(%rsi), %xmm0
-	movdqu	%xmm0, 122(%rdi)
-L(bwd_write_122bytes):
-	lddqu	106(%rsi), %xmm0
-	movdqu	%xmm0, 106(%rdi)
-L(bwd_write_106bytes):
-	lddqu	90(%rsi), %xmm0
-	movdqu	%xmm0, 90(%rdi)
-L(bwd_write_90bytes):
-	lddqu	74(%rsi), %xmm0
-	movdqu	%xmm0, 74(%rdi)
-L(bwd_write_74bytes):
-	lddqu	58(%rsi), %xmm0
-	movdqu	%xmm0, 58(%rdi)
-L(bwd_write_58bytes):
-	lddqu	42(%rsi), %xmm0
-	movdqu	%xmm0, 42(%rdi)
-L(bwd_write_42bytes):
-	lddqu	26(%rsi), %xmm0
-	movdqu	%xmm0, 26(%rdi)
-L(bwd_write_26bytes):
-	lddqu	10(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 10(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_10bytes):
-	mov	2(%rsi), %rdx
-	mov	(%rsi), %rcx
-	mov	%rdx, 2(%rdi)
-	mov	%rcx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_137bytes):
-	lddqu	121(%rsi), %xmm0
-	movdqu	%xmm0, 121(%rdi)
-L(bwd_write_121bytes):
-	lddqu	105(%rsi), %xmm0
-	movdqu	%xmm0, 105(%rdi)
-L(bwd_write_105bytes):
-	lddqu	89(%rsi), %xmm0
-	movdqu	%xmm0, 89(%rdi)
-L(bwd_write_89bytes):
-	lddqu	73(%rsi), %xmm0
-	movdqu	%xmm0, 73(%rdi)
-L(bwd_write_73bytes):
-	lddqu	57(%rsi), %xmm0
-	movdqu	%xmm0, 57(%rdi)
-L(bwd_write_57bytes):
-	lddqu	41(%rsi), %xmm0
-	movdqu	%xmm0, 41(%rdi)
-L(bwd_write_41bytes):
-	lddqu	25(%rsi), %xmm0
-	movdqu	%xmm0, 25(%rdi)
-L(bwd_write_25bytes):
-	lddqu	9(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 9(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_9bytes):
-	mov	1(%rsi), %rdx
-	mov	(%rsi), %rcx
-	mov	%rdx, 1(%rdi)
-	mov	%rcx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_136bytes):
-	lddqu	120(%rsi), %xmm0
-	movdqu	%xmm0, 120(%rdi)
-L(bwd_write_120bytes):
-	lddqu	104(%rsi), %xmm0
-	movdqu	%xmm0, 104(%rdi)
-L(bwd_write_104bytes):
-	lddqu	88(%rsi), %xmm0
-	movdqu	%xmm0, 88(%rdi)
-L(bwd_write_88bytes):
-	lddqu	72(%rsi), %xmm0
-	movdqu	%xmm0, 72(%rdi)
-L(bwd_write_72bytes):
-	lddqu	56(%rsi), %xmm0
-	movdqu	%xmm0, 56(%rdi)
-L(bwd_write_56bytes):
-	lddqu	40(%rsi), %xmm0
-	movdqu	%xmm0, 40(%rdi)
-L(bwd_write_40bytes):
-	lddqu	24(%rsi), %xmm0
-	movdqu	%xmm0, 24(%rdi)
-L(bwd_write_24bytes):
-	lddqu	8(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 8(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_8bytes):
-	mov	(%rsi), %rdx
-	mov	%rdx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_135bytes):
-	lddqu	119(%rsi), %xmm0
-	movdqu	%xmm0, 119(%rdi)
-L(bwd_write_119bytes):
-	lddqu	103(%rsi), %xmm0
-	movdqu	%xmm0, 103(%rdi)
-L(bwd_write_103bytes):
-	lddqu	87(%rsi), %xmm0
-	movdqu	%xmm0, 87(%rdi)
-L(bwd_write_87bytes):
-	lddqu	71(%rsi), %xmm0
-	movdqu	%xmm0, 71(%rdi)
-L(bwd_write_71bytes):
-	lddqu	55(%rsi), %xmm0
-	movdqu	%xmm0, 55(%rdi)
-L(bwd_write_55bytes):
-	lddqu	39(%rsi), %xmm0
-	movdqu	%xmm0, 39(%rdi)
-L(bwd_write_39bytes):
-	lddqu	23(%rsi), %xmm0
-	movdqu	%xmm0, 23(%rdi)
-L(bwd_write_23bytes):
-	lddqu	7(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 7(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_7bytes):
-	mov	3(%rsi), %edx
-	mov	(%rsi), %ecx
-	mov	%edx, 3(%rdi)
-	mov	%ecx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_134bytes):
-	lddqu	118(%rsi), %xmm0
-	movdqu	%xmm0, 118(%rdi)
-L(bwd_write_118bytes):
-	lddqu	102(%rsi), %xmm0
-	movdqu	%xmm0, 102(%rdi)
-L(bwd_write_102bytes):
-	lddqu	86(%rsi), %xmm0
-	movdqu	%xmm0, 86(%rdi)
-L(bwd_write_86bytes):
-	lddqu	70(%rsi), %xmm0
-	movdqu	%xmm0, 70(%rdi)
-L(bwd_write_70bytes):
-	lddqu	54(%rsi), %xmm0
-	movdqu	%xmm0, 54(%rdi)
-L(bwd_write_54bytes):
-	lddqu	38(%rsi), %xmm0
-	movdqu	%xmm0, 38(%rdi)
-L(bwd_write_38bytes):
-	lddqu	22(%rsi), %xmm0
-	movdqu	%xmm0, 22(%rdi)
-L(bwd_write_22bytes):
-	lddqu	6(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 6(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_6bytes):
-	mov	2(%rsi), %edx
-	mov	(%rsi), %ecx
-	mov	%edx, 2(%rdi)
-	mov	%ecx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_133bytes):
-	lddqu	117(%rsi), %xmm0
-	movdqu	%xmm0, 117(%rdi)
-L(bwd_write_117bytes):
-	lddqu	101(%rsi), %xmm0
-	movdqu	%xmm0, 101(%rdi)
-L(bwd_write_101bytes):
-	lddqu	85(%rsi), %xmm0
-	movdqu	%xmm0, 85(%rdi)
-L(bwd_write_85bytes):
-	lddqu	69(%rsi), %xmm0
-	movdqu	%xmm0, 69(%rdi)
-L(bwd_write_69bytes):
-	lddqu	53(%rsi), %xmm0
-	movdqu	%xmm0, 53(%rdi)
-L(bwd_write_53bytes):
-	lddqu	37(%rsi), %xmm0
-	movdqu	%xmm0, 37(%rdi)
-L(bwd_write_37bytes):
-	lddqu	21(%rsi), %xmm0
-	movdqu	%xmm0, 21(%rdi)
-L(bwd_write_21bytes):
-	lddqu	5(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 5(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_5bytes):
-	mov	1(%rsi), %edx
-	mov	(%rsi), %ecx
-	mov	%edx, 1(%rdi)
-	mov	%ecx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_132bytes):
-	lddqu	116(%rsi), %xmm0
-	movdqu	%xmm0, 116(%rdi)
-L(bwd_write_116bytes):
-	lddqu	100(%rsi), %xmm0
-	movdqu	%xmm0, 100(%rdi)
-L(bwd_write_100bytes):
-	lddqu	84(%rsi), %xmm0
-	movdqu	%xmm0, 84(%rdi)
-L(bwd_write_84bytes):
-	lddqu	68(%rsi), %xmm0
-	movdqu	%xmm0, 68(%rdi)
-L(bwd_write_68bytes):
-	lddqu	52(%rsi), %xmm0
-	movdqu	%xmm0, 52(%rdi)
-L(bwd_write_52bytes):
-	lddqu	36(%rsi), %xmm0
-	movdqu	%xmm0, 36(%rdi)
-L(bwd_write_36bytes):
-	lddqu	20(%rsi), %xmm0
-	movdqu	%xmm0, 20(%rdi)
-L(bwd_write_20bytes):
-	lddqu	4(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 4(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_4bytes):
-	mov	(%rsi), %edx
-	mov	%edx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_131bytes):
-	lddqu	115(%rsi), %xmm0
-	movdqu	%xmm0, 115(%rdi)
-L(bwd_write_115bytes):
-	lddqu	99(%rsi), %xmm0
-	movdqu	%xmm0, 99(%rdi)
-L(bwd_write_99bytes):
-	lddqu	83(%rsi), %xmm0
-	movdqu	%xmm0, 83(%rdi)
-L(bwd_write_83bytes):
-	lddqu	67(%rsi), %xmm0
-	movdqu	%xmm0, 67(%rdi)
-L(bwd_write_67bytes):
-	lddqu	51(%rsi), %xmm0
-	movdqu	%xmm0, 51(%rdi)
-L(bwd_write_51bytes):
-	lddqu	35(%rsi), %xmm0
-	movdqu	%xmm0, 35(%rdi)
-L(bwd_write_35bytes):
-	lddqu	19(%rsi), %xmm0
-	movdqu	%xmm0, 19(%rdi)
-L(bwd_write_19bytes):
-	lddqu	3(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 3(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_3bytes):
-	mov	1(%rsi), %dx
-	mov	(%rsi), %cx
-	mov	%dx, 1(%rdi)
-	mov	%cx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_130bytes):
-	lddqu	114(%rsi), %xmm0
-	movdqu	%xmm0, 114(%rdi)
-L(bwd_write_114bytes):
-	lddqu	98(%rsi), %xmm0
-	movdqu	%xmm0, 98(%rdi)
-L(bwd_write_98bytes):
-	lddqu	82(%rsi), %xmm0
-	movdqu	%xmm0, 82(%rdi)
-L(bwd_write_82bytes):
-	lddqu	66(%rsi), %xmm0
-	movdqu	%xmm0, 66(%rdi)
-L(bwd_write_66bytes):
-	lddqu	50(%rsi), %xmm0
-	movdqu	%xmm0, 50(%rdi)
-L(bwd_write_50bytes):
-	lddqu	34(%rsi), %xmm0
-	movdqu	%xmm0, 34(%rdi)
-L(bwd_write_34bytes):
-	lddqu	18(%rsi), %xmm0
-	movdqu	%xmm0, 18(%rdi)
-L(bwd_write_18bytes):
-	lddqu	2(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 2(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_2bytes):
-	movzwl	(%rsi), %edx
-	mov	%dx, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_129bytes):
-	lddqu	113(%rsi), %xmm0
-	movdqu	%xmm0, 113(%rdi)
-L(bwd_write_113bytes):
-	lddqu	97(%rsi), %xmm0
-	movdqu	%xmm0, 97(%rdi)
-L(bwd_write_97bytes):
-	lddqu	81(%rsi), %xmm0
-	movdqu	%xmm0, 81(%rdi)
-L(bwd_write_81bytes):
-	lddqu	65(%rsi), %xmm0
-	movdqu	%xmm0, 65(%rdi)
-L(bwd_write_65bytes):
-	lddqu	49(%rsi), %xmm0
-	movdqu	%xmm0, 49(%rdi)
-L(bwd_write_49bytes):
-	lddqu	33(%rsi), %xmm0
-	movdqu	%xmm0, 33(%rdi)
-L(bwd_write_33bytes):
-	lddqu	17(%rsi), %xmm0
-	movdqu	%xmm0, 17(%rdi)
-L(bwd_write_17bytes):
-	lddqu	1(%rsi), %xmm0
-	lddqu	(%rsi), %xmm1
-	movdqu	%xmm0, 1(%rdi)
-	movdqu	%xmm1, (%rdi)
-	ret
-
-	.p2align 4
-L(bwd_write_1bytes):
-	movzbl	(%rsi), %edx
-	mov	%dl, (%rdi)
-	ret
-
-END (MEMCPY)
-
-	.section .rodata.ssse3,"a",@progbits
-	.p2align 3
-L(table_144_bytes_bwd):
-	.int	JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
-	.int	JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-
-	.p2align 3
-L(table_144_bytes_fwd):
-	.int	JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
-	.int	JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-
-	.p2align 3
-L(shl_table_fwd):
-	.int	JMPTBL (L(shl_0), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_1), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_2), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_3), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_4), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_5), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_6), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_7), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_8), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_9), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_10), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_11), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_12), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_13), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_14), L(shl_table_fwd))
-	.int	JMPTBL (L(shl_15), L(shl_table_fwd))
-
-	.p2align 3
-L(shl_table_bwd):
-	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
deleted file mode 100644
index f9a4e9aff9..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3_back
-#define MEMCPY_CHK	__memmove_chk_ssse3_back
-#include "memcpy-ssse3-back.S"

From patchwork Fri Mar 25 20:44:48 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Noah Goldstein <goldstein.w.n@gmail.com>
X-Patchwork-Id: 52372
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id 4D47E3889821
	for <patchwork@sourceware.org>; Fri, 25 Mar 2022 20:46:57 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 4D47E3889821
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1648241217;
	bh=llZduC1o68MgwY/putFQXNqxnwwLsgFhdnM/Ts4bT5g=;
	h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe:
	 List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:
	 From;
	b=iasX5mJhuZsMcft9aX1DN++3PTZZ9F9wxRddAWNFudy6pEr2wVv5Cbg1cwKd7aBYB
	 bTcdiPeiW1lhSzpyLN0+b3Gv0DzdglUOaQ+Soezey0d8fm9GJ/spKuNJYz6OaG2hX2
	 jRZ26OmD37nGX7irEI67a4y8Db+lE7qSOLgltPNs=
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mail-il1-x12a.google.com (mail-il1-x12a.google.com
 [IPv6:2607:f8b0:4864:20::12a])
 by sourceware.org (Postfix) with ESMTPS id E07D63857034
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 20:45:04 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org E07D63857034
Received: by mail-il1-x12a.google.com with SMTP id r11so5994803ila.1
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 13:45:04 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20210112;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references:mime-version:content-transfer-encoding;
 bh=llZduC1o68MgwY/putFQXNqxnwwLsgFhdnM/Ts4bT5g=;
 b=j7pIAWqr1mZMu5vPokq9DtXl/ZrOJjsxSKiVDvCtgtQ3HvFqPgo3/CUTMDumiT2xb2
 cbiUYG1un62rixCBDcOlx0oW+/DVybeDFh487e2/NUmsNzG811q6pn0VrU530w6mJBx1
 nk9V5SgRol8bfTq/RQLpg/LVuBVz8CuS/SbSNAoHlx5VMzYxoKOQbw5GzZrGfwAwU2pj
 Q2otEi3EzgxyOJpBHu5M0yEZYlYszwZl60d0XyhijdvbZE9tQpRg5xItiKIvnuhtoQh+
 mvRz+GbhhcTs30c7Gxfhfeb00lwygGexd5HIBM6OWyWxSOvYTeYKqR2Z2+i33ipgEj+O
 s6tw==
X-Gm-Message-State: AOAM531vRSgjPQcki1WGav72XU7moA6YF+OyRJYGc5gsmAiK8cYEgVjd
 imFSGXHTJYCHM8UGBpJf+mjrlZGwjoY=
X-Google-Smtp-Source: 
 ABdhPJxBTwR/ycqnn1wB0+M11zLVHAd7ADqq8OSh2HUBa7yXNPnu3Izk8grMc7jwkNR7n584pxnSSw==
X-Received: by 2002:a05:6e02:1526:b0:2c7:b94e:195a with SMTP id
 i6-20020a056e02152600b002c7b94e195amr272017ilu.225.1648241103763;
 Fri, 25 Mar 2022 13:45:03 -0700 (PDT)
Received: from localhost.localdomain (node-17-161.flex.volo.net.
 [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id
 a3-20020a5ec303000000b006496b4dd21csm3382118iok.5.2022.03.25.13.45.03
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Fri, 25 Mar 2022 13:45:03 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v2 5/6] x86: Remove str{n}cat-ssse3
Date: Fri, 25 Mar 2022 15:44:48 -0500
Message-Id: <20220325204449.1284533-5-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20220325204449.1284533-1-goldstein.w.n@gmail.com>
References: <20220325183625.1170867-2-goldstein.w.n@gmail.com>
 <20220325204449.1284533-1-goldstein.w.n@gmail.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-11.7 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_5_SHORT_WORD_LINES,
 SPF_HELO_NONE, SPF_PASS, TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
X-Patchwork-Original-From: Noah Goldstein via Libc-alpha
 <libc-alpha@sourceware.org>
From: Noah Goldstein <goldstein.w.n@gmail.com>
Reply-To: Noah Goldstein <goldstein.w.n@gmail.com>
Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org
Sender: "Libc-alpha"
 <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>

With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given their code size cost.
---
 sysdeps/x86_64/multiarch/Makefile          |   2 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   4 -
 sysdeps/x86_64/multiarch/ifunc-strcpy.h    |   4 -
 sysdeps/x86_64/multiarch/strcat-ssse3.S    | 866 ---------------------
 sysdeps/x86_64/multiarch/strncat-ssse3.S   |   3 -
 5 files changed, 879 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 323be3b969..a2ebc06c5f 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -59,7 +59,6 @@ sysdep_routines += \
   strcat-evex \
   strcat-sse2 \
   strcat-sse2-unaligned \
-  strcat-ssse3 \
   strchr-avx2 \
   strchr-avx2-rtm \
   strchr-evex \
@@ -97,7 +96,6 @@ sysdep_routines += \
   strncat-c \
   strncat-evex \
   strncat-sse2-unaligned \
-  strncat-ssse3 \
   strncmp-avx2 \
   strncmp-avx2-rtm \
   strncmp-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d6852ab365..4133ed7e43 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -471,8 +471,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __strcat_evex)
-	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
-			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
 
@@ -620,8 +618,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __strncat_evex)
-	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
-			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 5bece38f78..a15afa44e9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -23,7 +23,6 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
-    return OPTIMIZE (ssse3);
-
   return OPTIMIZE (sse2);
 }
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
deleted file mode 100644
index 9f39e4fcd1..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ /dev/null
@@ -1,866 +0,0 @@
-/* strcat with SSSE3
-   Copyright (C) 2011-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-#  define STRCAT  __strcat_ssse3
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
-# ifdef USE_AS_STRNCAT
-	mov	%rdx, %r8
-# endif
-
-
-/* Inline corresponding strlen file, temporary until new strcpy
-   implementation gets merged.  */
-
-	xor	%eax, %eax
-	cmpb	$0, (%rdi)
-	jz	L(exit_tail0)
-	cmpb	$0, 1(%rdi)
-	jz	L(exit_tail1)
-	cmpb	$0, 2(%rdi)
-	jz	L(exit_tail2)
-	cmpb	$0, 3(%rdi)
-	jz	L(exit_tail3)
-
-	cmpb	$0, 4(%rdi)
-	jz	L(exit_tail4)
-	cmpb	$0, 5(%rdi)
-	jz	L(exit_tail5)
-	cmpb	$0, 6(%rdi)
-	jz	L(exit_tail6)
-	cmpb	$0, 7(%rdi)
-	jz	L(exit_tail7)
-
-	cmpb	$0, 8(%rdi)
-	jz	L(exit_tail8)
-	cmpb	$0, 9(%rdi)
-	jz	L(exit_tail9)
-	cmpb	$0, 10(%rdi)
-	jz	L(exit_tail10)
-	cmpb	$0, 11(%rdi)
-	jz	L(exit_tail11)
-
-	cmpb	$0, 12(%rdi)
-	jz	L(exit_tail12)
-	cmpb	$0, 13(%rdi)
-	jz	L(exit_tail13)
-	cmpb	$0, 14(%rdi)
-	jz	L(exit_tail14)
-	cmpb	$0, 15(%rdi)
-	jz	L(exit_tail15)
-	pxor	%xmm0, %xmm0
-	lea	16(%rdi), %rcx
-	lea	16(%rdi), %rax
-	and	$-16, %rax
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	pxor	%xmm1, %xmm1
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	pxor	%xmm2, %xmm2
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	pxor	%xmm3, %xmm3
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	and	$-0x40, %rax
-
-	.p2align 4
-L(aligned_64):
-	pcmpeqb	(%rax), %xmm0
-	pcmpeqb	16(%rax), %xmm1
-	pcmpeqb	32(%rax), %xmm2
-	pcmpeqb	48(%rax), %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm1, %r11d
-	pmovmskb %xmm2, %r10d
-	pmovmskb %xmm3, %r9d
-	or	%edx, %r9d
-	or	%r11d, %r9d
-	or	%r10d, %r9d
-	lea	64(%rax), %rax
-	jz	L(aligned_64)
-
-	test	%edx, %edx
-	jnz	L(aligned_64_exit_16)
-	test	%r11d, %r11d
-	jnz	L(aligned_64_exit_32)
-	test	%r10d, %r10d
-	jnz	L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
-	pmovmskb %xmm3, %edx
-	jmp	L(exit)
-
-L(aligned_64_exit_48):
-	lea	-16(%rax), %rax
-	mov	%r10d, %edx
-	jmp	L(exit)
-
-L(aligned_64_exit_32):
-	lea	-32(%rax), %rax
-	mov	%r11d, %edx
-	jmp	L(exit)
-
-L(aligned_64_exit_16):
-	lea	-48(%rax), %rax
-
-L(exit):
-	sub	%rcx, %rax
-	test	%dl, %dl
-	jz	L(exit_high)
-	test	$0x01, %dl
-	jnz	L(exit_tail0)
-
-	test	$0x02, %dl
-	jnz	L(exit_tail1)
-
-	test	$0x04, %dl
-	jnz	L(exit_tail2)
-
-	test	$0x08, %dl
-	jnz	L(exit_tail3)
-
-	test	$0x10, %dl
-	jnz	L(exit_tail4)
-
-	test	$0x20, %dl
-	jnz	L(exit_tail5)
-
-	test	$0x40, %dl
-	jnz	L(exit_tail6)
-	add	$7, %eax
-L(exit_tail0):
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_high):
-	add	$8, %eax
-	test	$0x01, %dh
-	jnz	L(exit_tail0)
-
-	test	$0x02, %dh
-	jnz	L(exit_tail1)
-
-	test	$0x04, %dh
-	jnz	L(exit_tail2)
-
-	test	$0x08, %dh
-	jnz	L(exit_tail3)
-
-	test	$0x10, %dh
-	jnz	L(exit_tail4)
-
-	test	$0x20, %dh
-	jnz	L(exit_tail5)
-
-	test	$0x40, %dh
-	jnz	L(exit_tail6)
-	add	$7, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail1):
-	add	$1, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail2):
-	add	$2, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail3):
-	add	$3, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail4):
-	add	$4, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail5):
-	add	$5, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail6):
-	add	$6, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail7):
-	add	$7, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail8):
-	add	$8, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail9):
-	add	$9, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail10):
-	add	$10, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail11):
-	add	$11, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail12):
-	add	$12, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail13):
-	add	$13, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail14):
-	add	$14, %eax
-	jmp	L(StartStrcpyPart)
-
-	.p2align 4
-L(exit_tail15):
-	add	$15, %eax
-
-	.p2align 4
-L(StartStrcpyPart):
-	mov	%rsi, %rcx
-	lea	(%rdi, %rax), %rdx
-# ifdef USE_AS_STRNCAT
-	test	%r8, %r8
-	jz	L(StrncatExit0)
-	cmp	$8, %r8
-	jbe	L(StrncatExit8Bytes)
-# endif
-	cmpb	$0, (%rcx)
-	jz	L(Exit1)
-	cmpb	$0, 1(%rcx)
-	jz	L(Exit2)
-	cmpb	$0, 2(%rcx)
-	jz	L(Exit3)
-	cmpb	$0, 3(%rcx)
-	jz	L(Exit4)
-	cmpb	$0, 4(%rcx)
-	jz	L(Exit5)
-	cmpb	$0, 5(%rcx)
-	jz	L(Exit6)
-	cmpb	$0, 6(%rcx)
-	jz	L(Exit7)
-	cmpb	$0, 7(%rcx)
-	jz	L(Exit8)
-	cmpb	$0, 8(%rcx)
-	jz	L(Exit9)
-# ifdef USE_AS_STRNCAT
-	cmp	$16, %r8
-	jb	L(StrncatExit15Bytes)
-# endif
-	cmpb	$0, 9(%rcx)
-	jz	L(Exit10)
-	cmpb	$0, 10(%rcx)
-	jz	L(Exit11)
-	cmpb	$0, 11(%rcx)
-	jz	L(Exit12)
-	cmpb	$0, 12(%rcx)
-	jz	L(Exit13)
-	cmpb	$0, 13(%rcx)
-	jz	L(Exit14)
-	cmpb	$0, 14(%rcx)
-	jz	L(Exit15)
-	cmpb	$0, 15(%rcx)
-	jz	L(Exit16)
-# ifdef USE_AS_STRNCAT
-	cmp	$16, %r8
-	je	L(StrncatExit16)
-#  define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-ssse3.S"
-
-	.p2align 4
-L(CopyFrom1To16Bytes):
-	add	%rsi, %rdx
-	add	%rsi, %rcx
-
-	test	%al, %al
-	jz	L(ExitHigh)
-	test	$0x01, %al
-	jnz	L(Exit1)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	test	$0x10, %al
-	jnz	L(Exit5)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	test	$0x40, %al
-	jnz	L(Exit7)
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(ExitHigh):
-	test	$0x01, %ah
-	jnz	L(Exit9)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	test	$0x08, %ah
-	jnz	L(Exit12)
-	test	$0x10, %ah
-	jnz	L(Exit13)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-	movlpd	(%rcx), %xmm0
-	movlpd	8(%rcx), %xmm1
-	movlpd	%xmm0, (%rdx)
-	movlpd	%xmm1, 8(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit1):
-	xor	%ah, %ah
-	movb	%ah, 1(%rdx)
-L(Exit1):
-	movb	(%rcx), %al
-	movb	%al, (%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit2):
-	xor	%ah, %ah
-	movb	%ah, 2(%rdx)
-L(Exit2):
-	movw	(%rcx), %ax
-	movw	%ax, (%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit3):
-	xor	%ah, %ah
-	movb	%ah, 3(%rdx)
-L(Exit3):
-	movw	(%rcx), %ax
-	movw	%ax, (%rdx)
-	movb	2(%rcx), %al
-	movb	%al, 2(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit4):
-	xor	%ah, %ah
-	movb	%ah, 4(%rdx)
-L(Exit4):
-	mov	(%rcx), %eax
-	mov	%eax, (%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit5):
-	xor	%ah, %ah
-	movb	%ah, 5(%rdx)
-L(Exit5):
-	mov	(%rcx), %eax
-	mov	%eax, (%rdx)
-	movb	4(%rcx), %al
-	movb	%al, 4(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit6):
-	xor	%ah, %ah
-	movb	%ah, 6(%rdx)
-L(Exit6):
-	mov	(%rcx), %eax
-	mov	%eax, (%rdx)
-	movw	4(%rcx), %ax
-	movw	%ax, 4(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit7):
-	xor	%ah, %ah
-	movb	%ah, 7(%rdx)
-L(Exit7):
-	mov	(%rcx), %eax
-	mov	%eax, (%rdx)
-	mov	3(%rcx), %eax
-	mov	%eax, 3(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit8):
-	xor	%ah, %ah
-	movb	%ah, 8(%rdx)
-L(Exit8):
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit9):
-	xor	%ah, %ah
-	movb	%ah, 9(%rdx)
-L(Exit9):
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	movb	8(%rcx), %al
-	movb	%al, 8(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit10):
-	xor	%ah, %ah
-	movb	%ah, 10(%rdx)
-L(Exit10):
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	movw	8(%rcx), %ax
-	movw	%ax, 8(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit11):
-	xor	%ah, %ah
-	movb	%ah, 11(%rdx)
-L(Exit11):
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	mov	7(%rcx), %eax
-	mov	%eax, 7(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit12):
-	xor	%ah, %ah
-	movb	%ah, 12(%rdx)
-L(Exit12):
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	mov	8(%rcx), %eax
-	mov	%eax, 8(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit13):
-	xor	%ah, %ah
-	movb	%ah, 13(%rdx)
-L(Exit13):
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	movlpd	5(%rcx), %xmm1
-	movlpd	%xmm1, 5(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit14):
-	xor	%ah, %ah
-	movb	%ah, 14(%rdx)
-L(Exit14):
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	movlpd	6(%rcx), %xmm1
-	movlpd	%xmm1, 6(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit15):
-	xor	%ah, %ah
-	movb	%ah, 15(%rdx)
-L(Exit15):
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	movlpd	7(%rcx), %xmm1
-	movlpd	%xmm1, 7(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit16):
-	xor	%ah, %ah
-	movb	%ah, 16(%rdx)
-L(Exit16):
-	movlpd	(%rcx), %xmm0
-	movlpd	8(%rcx), %xmm1
-	movlpd	%xmm0, (%rdx)
-	movlpd	%xmm1, 8(%rdx)
-	mov	%rdi, %rax
-	ret
-
-# ifdef USE_AS_STRNCPY
-
-	.p2align 4
-L(CopyFrom1To16BytesCase2):
-	add	$16, %r8
-	add	%rsi, %rcx
-	lea	(%rsi, %rdx), %rsi
-	lea	-9(%r8), %rdx
-	and	$1<<7, %dh
-	or	%al, %dh
-	test	%dh, %dh
-	lea	(%rsi), %rdx
-	jz	L(ExitHighCase2)
-
-	test	$0x01, %al
-	jnz	L(Exit1)
-	cmp	$1, %r8
-	je	L(StrncatExit1)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	cmp	$2, %r8
-	je	L(StrncatExit2)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	cmp	$3, %r8
-	je	L(StrncatExit3)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	cmp	$4, %r8
-	je	L(StrncatExit4)
-	test	$0x10, %al
-	jnz	L(Exit5)
-	cmp	$5, %r8
-	je	L(StrncatExit5)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	cmp	$6, %r8
-	je	L(StrncatExit6)
-	test	$0x40, %al
-	jnz	L(Exit7)
-	cmp	$7, %r8
-	je	L(StrncatExit7)
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	lea	7(%rdx), %rax
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-	xor	%cl, %cl
-	movb	%cl, (%rax)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(ExitHighCase2):
-	test	$0x01, %ah
-	jnz	L(Exit9)
-	cmp	$9, %r8
-	je	L(StrncatExit9)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	cmp	$10, %r8
-	je	L(StrncatExit10)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	cmp	$11, %r8
-	je	L(StrncatExit11)
-	test	$0x8, %ah
-	jnz	L(Exit12)
-	cmp	$12, %r8
-	je	L(StrncatExit12)
-	test	$0x10, %ah
-	jnz	L(Exit13)
-	cmp	$13, %r8
-	je	L(StrncatExit13)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	cmp	$14, %r8
-	je	L(StrncatExit14)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-	cmp	$15, %r8
-	je	L(StrncatExit15)
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	movlpd	8(%rcx), %xmm1
-	movlpd	%xmm1, 8(%rdx)
-	mov	%rdi, %rax
-	ret
-
-L(CopyFrom1To16BytesCase2OrCase3):
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-
-	.p2align 4
-L(CopyFrom1To16BytesCase3):
-	add	$16, %r8
-	add	%rsi, %rdx
-	add	%rsi, %rcx
-
-	cmp	$8, %r8
-	ja	L(ExitHighCase3)
-	cmp	$1, %r8
-	je	L(StrncatExit1)
-	cmp	$2, %r8
-	je	L(StrncatExit2)
-	cmp	$3, %r8
-	je	L(StrncatExit3)
-	cmp	$4, %r8
-	je	L(StrncatExit4)
-	cmp	$5, %r8
-	je	L(StrncatExit5)
-	cmp	$6, %r8
-	je	L(StrncatExit6)
-	cmp	$7, %r8
-	je	L(StrncatExit7)
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	xor	%ah, %ah
-	movb	%ah, 8(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(ExitHighCase3):
-	cmp	$9, %r8
-	je	L(StrncatExit9)
-	cmp	$10, %r8
-	je	L(StrncatExit10)
-	cmp	$11, %r8
-	je	L(StrncatExit11)
-	cmp	$12, %r8
-	je	L(StrncatExit12)
-	cmp	$13, %r8
-	je	L(StrncatExit13)
-	cmp	$14, %r8
-	je	L(StrncatExit14)
-	cmp	$15, %r8
-	je	L(StrncatExit15)
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	movlpd	8(%rcx), %xmm1
-	movlpd	%xmm1, 8(%rdx)
-	xor	%ah, %ah
-	movb	%ah, 16(%rdx)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit0):
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit15Bytes):
-	cmp	$9, %r8
-	je	L(StrncatExit9)
-	cmpb	$0, 9(%rcx)
-	jz	L(Exit10)
-	cmp	$10, %r8
-	je	L(StrncatExit10)
-	cmpb	$0, 10(%rcx)
-	jz	L(Exit11)
-	cmp	$11, %r8
-	je	L(StrncatExit11)
-	cmpb	$0, 11(%rcx)
-	jz	L(Exit12)
-	cmp	$12, %r8
-	je	L(StrncatExit12)
-	cmpb	$0, 12(%rcx)
-	jz	L(Exit13)
-	cmp	$13, %r8
-	je	L(StrncatExit13)
-	cmpb	$0, 13(%rcx)
-	jz	L(Exit14)
-	cmp	$14, %r8
-	je	L(StrncatExit14)
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	movlpd	7(%rcx), %xmm1
-	movlpd	%xmm1, 7(%rdx)
-	lea	14(%rdx), %rax
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-	xor	%cl, %cl
-	movb	%cl, (%rax)
-	mov	%rdi, %rax
-	ret
-
-	.p2align 4
-L(StrncatExit8Bytes):
-	cmpb	$0, (%rcx)
-	jz	L(Exit1)
-	cmp	$1, %r8
-	je	L(StrncatExit1)
-	cmpb	$0, 1(%rcx)
-	jz	L(Exit2)
-	cmp	$2, %r8
-	je	L(StrncatExit2)
-	cmpb	$0, 2(%rcx)
-	jz	L(Exit3)
-	cmp	$3, %r8
-	je	L(StrncatExit3)
-	cmpb	$0, 3(%rcx)
-	jz	L(Exit4)
-	cmp	$4, %r8
-	je	L(StrncatExit4)
-	cmpb	$0, 4(%rcx)
-	jz	L(Exit5)
-	cmp	$5, %r8
-	je	L(StrncatExit5)
-	cmpb	$0, 5(%rcx)
-	jz	L(Exit6)
-	cmp	$6, %r8
-	je	L(StrncatExit6)
-	cmpb	$0, 6(%rcx)
-	jz	L(Exit7)
-	cmp	$7, %r8
-	je	L(StrncatExit7)
-	movlpd	(%rcx), %xmm0
-	movlpd	%xmm0, (%rdx)
-	lea	7(%rdx), %rax
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-	xor	%cl, %cl
-	movb	%cl, (%rax)
-	mov	%rdi, %rax
-	ret
-
-# endif
-END (STRCAT)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
deleted file mode 100644
index 6c45ff3ec7..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_ssse3
-#include "strcat-ssse3.S"

From patchwork Fri Mar 25 20:44:49 2022
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Noah Goldstein <goldstein.w.n@gmail.com>
X-Patchwork-Id: 52375
Return-Path: <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>
X-Original-To: patchwork@sourceware.org
Delivered-To: patchwork@sourceware.org
Received: from server2.sourceware.org (localhost [IPv6:::1])
	by sourceware.org (Postfix) with ESMTP id C81C33888C63
	for <patchwork@sourceware.org>; Fri, 25 Mar 2022 20:49:11 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C81C33888C63
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1648241351;
	bh=x7Jq4uZeczXdypJ+L9hKpAjsBCroWp6P5izbfqWNLy8=;
	h=To:Subject:Date:In-Reply-To:References:List-Id:List-Unsubscribe:
	 List-Archive:List-Post:List-Help:List-Subscribe:From:Reply-To:
	 From;
	b=HsKgvi2ij6UMwitcgSPpibisVkoJ6d5w6Sf+zs7YVFgNttP5PhubH5vLue/r3DPSn
	 0xMqswYOhQPa1DWB0zNYUnYT38QveKUTeyqGFmVfCJ74bMFb+BbYN7/PQUy8Fx6WvZ
	 Sa6Yn6jLMQeGBrkse0BzdEMbacfFWqyiohyouQXU=
X-Original-To: libc-alpha@sourceware.org
Delivered-To: libc-alpha@sourceware.org
Received: from mail-il1-x12d.google.com (mail-il1-x12d.google.com
 [IPv6:2607:f8b0:4864:20::12d])
 by sourceware.org (Postfix) with ESMTPS id 65AEA3889800
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 20:45:07 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 65AEA3889800
Received: by mail-il1-x12d.google.com with SMTP id x9so5988855ilc.3
 for <libc-alpha@sourceware.org>; Fri, 25 Mar 2022 13:45:07 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20210112;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to
 :references:mime-version:content-transfer-encoding;
 bh=x7Jq4uZeczXdypJ+L9hKpAjsBCroWp6P5izbfqWNLy8=;
 b=F5V6/mV8NCcWt5J/jh5Hp8Ci83nvtIz7btG7GJP0EbWiLKV4DEbiNuINisWjddAC8M
 9ANB85pqI0o14FqrY/a2soFCz2eHFGMRGKi3n7i7lt5Zr19Ul2q6J/gJLlyQV1OCo1wW
 zPG2/r2uvv12cI1Qf2dAWPeV58XPp48p1go+ACkTYirRi87vn5AxNdlUXWTOpQ7kk+Ix
 WeLikvVTDM1PQhozpdXBcxt4EFF8Qepax8Z6Y51E/k9na3XfGV8WZ85p7Gp5kg5UgrAm
 y5+RNsmIXdnQdSxnxYDejm8hQplujc8+4JHW81kG4VmRaLq4E4J5nd/SpF/ivjZulbmj
 1A/Q==
X-Gm-Message-State: AOAM532Id/MQ6jpAjAYsQ1mUsbftHsoY/8k02xB5Tf24wREbzoE5UjDu
 YfuhJzep/HHDmJpJ5zSmo2ts/kGmv8s=
X-Google-Smtp-Source: 
 ABdhPJywH7Eu6Ek7DRP4FKMGWTk1W5JUZVZ+AINy1eHWD0H9iz+YjVEV7hCxyJJVfPrnUrkW2moqHg==
X-Received: by 2002:a92:c264:0:b0:2c8:1ef3:b36e with SMTP id
 h4-20020a92c264000000b002c81ef3b36emr287808ild.195.1648241105000;
 Fri, 25 Mar 2022 13:45:05 -0700 (PDT)
Received: from localhost.localdomain (node-17-161.flex.volo.net.
 [76.191.17.161]) by smtp.googlemail.com with ESMTPSA id
 a3-20020a5ec303000000b006496b4dd21csm3382118iok.5.2022.03.25.13.45.04
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Fri, 25 Mar 2022 13:45:04 -0700 (PDT)
To: libc-alpha@sourceware.org
Subject: [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3
Date: Fri, 25 Mar 2022 15:44:49 -0500
Message-Id: <20220325204449.1284533-6-goldstein.w.n@gmail.com>
X-Mailer: git-send-email 2.25.1
In-Reply-To: <20220325204449.1284533-1-goldstein.w.n@gmail.com>
References: <20220325183625.1170867-2-goldstein.w.n@gmail.com>
 <20220325204449.1284533-1-goldstein.w.n@gmail.com>
MIME-Version: 1.0
X-Spam-Status: No, score=-10.6 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_10_SHORT_WORD_LINES,
 SCC_20_SHORT_WORD_LINES, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS,
 TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4
X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on
 server2.sourceware.org
X-BeenThere: libc-alpha@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Libc-alpha mailing list <libc-alpha.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/libc-alpha>,
 <mailto:libc-alpha-request@sourceware.org?subject=subscribe>
X-Patchwork-Original-From: Noah Goldstein via Libc-alpha
 <libc-alpha@sourceware.org>
From: Noah Goldstein <goldstein.w.n@gmail.com>
Reply-To: Noah Goldstein <goldstein.w.n@gmail.com>
Errors-To: libc-alpha-bounces+patchwork=sourceware.org@sourceware.org
Sender: "Libc-alpha"
 <libc-alpha-bounces+patchwork=sourceware.org@sourceware.org>

With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given the code size cost.
---
 sysdeps/x86_64/multiarch/Makefile          |    4 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |    8 -
 sysdeps/x86_64/multiarch/stpcpy-ssse3.S    |    3 -
 sysdeps/x86_64/multiarch/stpncpy-ssse3.S   |    4 -
 sysdeps/x86_64/multiarch/strcpy-ssse3.S    | 3550 --------------------
 sysdeps/x86_64/multiarch/strncpy-ssse3.S   |    3 -
 6 files changed, 3572 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a2ebc06c5f..292353bad7 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -42,13 +42,11 @@ sysdep_routines += \
   stpcpy-evex \
   stpcpy-sse2 \
   stpcpy-sse2-unaligned \
-  stpcpy-ssse3 \
   stpncpy-avx2 \
   stpncpy-avx2-rtm \
   stpncpy-c \
   stpncpy-evex \
   stpncpy-sse2-unaligned \
-  stpncpy-ssse3 \
   strcasecmp_l-avx2 \
   strcasecmp_l-avx2-rtm \
   strcasecmp_l-evex \
@@ -79,7 +77,6 @@ sysdep_routines += \
   strcpy-evex \
   strcpy-sse2 \
   strcpy-sse2-unaligned \
-  strcpy-ssse3 \
   strcspn-c \
   strcspn-sse2 \
   strlen-avx2 \
@@ -106,7 +103,6 @@ sysdep_routines += \
   strncpy-c \
   strncpy-evex \
   strncpy-sse2-unaligned \
-  strncpy-ssse3 \
   strnlen-avx2 \
   strnlen-avx2-rtm \
   strnlen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 4133ed7e43..505b8002e0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
-	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
-			      __stpncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
@@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
   IFUNC_IMPL (i, name, stpcpy,
-	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
-			      __stpcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpcpy,
@@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __strcpy_evex)
-	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
-			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
 
@@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __strncpy_evex)
-	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
-			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
 			      __strncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index f617a535cf..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3550 +0,0 @@
-/* strcpy with SSSE3
-   Copyright (C) 2011-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-#  include <sysdep.h>
-
-#  ifndef STRCPY
-#   define STRCPY  __strcpy_ssse3
-#  endif
-
-	.section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
-	mov	%rsi, %rcx
-#  ifdef USE_AS_STRNCPY
-	mov	%RDX_LP, %R8_LP
-#  endif
-	mov	%rdi, %rdx
-#  ifdef USE_AS_STRNCPY
-	test	%R8_LP, %R8_LP
-	jz	L(Exit0)
-	cmp	$8, %R8_LP
-	jbe	L(StrncpyExit8Bytes)
-# endif
-	cmpb	$0, (%rcx)
-	jz	L(Exit1)
-	cmpb	$0, 1(%rcx)
-	jz	L(Exit2)
-	cmpb	$0, 2(%rcx)
-	jz	L(Exit3)
-	cmpb	$0, 3(%rcx)
-	jz	L(Exit4)
-	cmpb	$0, 4(%rcx)
-	jz	L(Exit5)
-	cmpb	$0, 5(%rcx)
-	jz	L(Exit6)
-	cmpb	$0, 6(%rcx)
-	jz	L(Exit7)
-	cmpb	$0, 7(%rcx)
-	jz	L(Exit8)
-# ifdef USE_AS_STRNCPY
-	cmp	$16, %r8
-	jb	L(StrncpyExit15Bytes)
-# endif
-	cmpb	$0, 8(%rcx)
-	jz	L(Exit9)
-	cmpb	$0, 9(%rcx)
-	jz	L(Exit10)
-	cmpb	$0, 10(%rcx)
-	jz	L(Exit11)
-	cmpb	$0, 11(%rcx)
-	jz	L(Exit12)
-	cmpb	$0, 12(%rcx)
-	jz	L(Exit13)
-	cmpb	$0, 13(%rcx)
-	jz	L(Exit14)
-	cmpb	$0, 14(%rcx)
-	jz	L(Exit15)
-# ifdef USE_AS_STRNCPY
-	cmp	$16, %r8
-	je	L(Exit16)
-# endif
-	cmpb	$0, 15(%rcx)
-	jz	L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
-	mov	%rcx, %rsi
-	sub	$16, %r8
-	and	$0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
-	add	%rsi, %r8
-# endif
-	lea	16(%rcx), %rsi
-	and	$-16, %rsi
-	pxor	%xmm0, %xmm0
-	mov	(%rcx), %r9
-	mov	%r9, (%rdx)
-	pcmpeqb	(%rsi), %xmm0
-	mov	8(%rcx), %r9
-	mov	%r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
-	pmovmskb %xmm0, %rax
-	sub	%rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	mov	%rdx, %rax
-	lea	16(%rdx), %rdx
-	and	$-16, %rdx
-	sub	%rdx, %rax
-
-# ifdef USE_AS_STRNCPY
-	add	%rax, %rsi
-	lea	-1(%rsi), %rsi
-	and	$1<<31, %esi
-	test	%rsi, %rsi
-	jnz	L(ContinueCopy)
-	lea	16(%r8), %r8
-
-L(ContinueCopy):
-# endif
-	sub	%rax, %rcx
-	mov	%rcx, %rax
-	and	$0xf, %rax
-	mov	$0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
-	jz	L(Align16Both)
-
-	cmp	$8, %rax
-	jae	L(ShlHigh8)
-	cmp	$1, %rax
-	je	L(Shl1)
-	cmp	$2, %rax
-	je	L(Shl2)
-	cmp	$3, %rax
-	je	L(Shl3)
-	cmp	$4, %rax
-	je	L(Shl4)
-	cmp	$5, %rax
-	je	L(Shl5)
-	cmp	$6, %rax
-	je	L(Shl6)
-	jmp	L(Shl7)
-
-L(ShlHigh8):
-	je	L(Shl8)
-	cmp	$9, %rax
-	je	L(Shl9)
-	cmp	$10, %rax
-	je	L(Shl10)
-	cmp	$11, %rax
-	je	L(Shl11)
-	cmp	$12, %rax
-	je	L(Shl12)
-	cmp	$13, %rax
-	je	L(Shl13)
-	cmp	$14, %rax
-	je	L(Shl14)
-	jmp	L(Shl15)
-
-L(Align16Both):
-	movaps	(%rcx), %xmm1
-	movaps	16(%rcx), %xmm2
-	movaps	%xmm1, (%rdx)
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm3
-	movaps	%xmm2, (%rdx, %rsi)
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm4
-	movaps	%xmm3, (%rdx, %rsi)
-	pcmpeqb	%xmm4, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm1
-	movaps	%xmm4, (%rdx, %rsi)
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm2
-	movaps	%xmm1, (%rdx, %rsi)
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	16(%rcx, %rsi), %xmm3
-	movaps	%xmm2, (%rdx, %rsi)
-	pcmpeqb	%xmm3, %xmm0
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	%xmm3, (%rdx, %rsi)
-	mov	%rcx, %rax
-	lea	16(%rcx, %rsi), %rcx
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	lea	112(%r8, %rax), %r8
-# endif
-	mov	$-0x40, %rsi
-
-	.p2align 4
-L(Aligned64Loop):
-	movaps	(%rcx), %xmm2
-	movaps	%xmm2, %xmm4
-	movaps	16(%rcx), %xmm5
-	movaps	32(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	48(%rcx), %xmm7
-	pminub	%xmm5, %xmm2
-	pminub	%xmm7, %xmm3
-	pminub	%xmm2, %xmm3
-	pcmpeqb	%xmm0, %xmm3
-	pmovmskb %xmm3, %rax
-	lea	64(%rdx), %rdx
-	lea	64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeaveCase2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Aligned64Leave)
-	movaps	%xmm4, -64(%rdx)
-	movaps	%xmm5, -48(%rdx)
-	movaps	%xmm6, -32(%rdx)
-	movaps	%xmm7, -16(%rdx)
-	jmp	L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
-	lea	48(%r8), %r8
-# endif
-	pcmpeqb	%xmm4, %xmm0
-	pmovmskb %xmm0, %rax
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
-	lea	-16(%r8), %r8
-# endif
-	pmovmskb %xmm0, %rax
-	movaps	%xmm4, -64(%rdx)
-	test	%rax, %rax
-	lea	16(%rsi), %rsi
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
-	lea	-16(%r8), %r8
-# endif
-	pmovmskb %xmm0, %rax
-	movaps	%xmm5, -48(%rdx)
-	test	%rax, %rax
-	lea	16(%rsi), %rsi
-	jnz	L(CopyFrom1To16Bytes)
-
-	movaps	%xmm6, -32(%rdx)
-	pcmpeqb	%xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
-	lea	-16(%r8), %r8
-# endif
-	pmovmskb %xmm0, %rax
-	lea	16(%rsi), %rsi
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl1):
-	movaps	-1(%rcx), %xmm1
-	movaps	15(%rcx), %xmm2
-L(Shl1Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit1Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl1LoopExit)
-
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	31(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit1Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl1LoopExit)
-
-	palignr	$1, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	31(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit1Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl1LoopExit)
-
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	31(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit1Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl1LoopExit)
-
-	palignr	$1, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	31(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-15(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-1(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl1LoopStart):
-	movaps	15(%rcx), %xmm2
-	movaps	31(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	47(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	63(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$1, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$1, %xmm3, %xmm4
-	jnz	L(Shl1Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave1)
-# endif
-	palignr	$1, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl1LoopStart)
-
-L(Shl1LoopExit):
-	movdqu	-1(%rcx), %xmm1
-	mov	$15, %rsi
-	movdqu	%xmm1, -1(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl2):
-	movaps	-2(%rcx), %xmm1
-	movaps	14(%rcx), %xmm2
-L(Shl2Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit2Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl2LoopExit)
-
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	30(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit2Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl2LoopExit)
-
-	palignr	$2, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	30(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit2Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl2LoopExit)
-
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	30(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit2Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl2LoopExit)
-
-	palignr	$2, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	30(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-14(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-2(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl2LoopStart):
-	movaps	14(%rcx), %xmm2
-	movaps	30(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	46(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	62(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$2, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$2, %xmm3, %xmm4
-	jnz	L(Shl2Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave2)
-# endif
-	palignr	$2, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl2LoopStart)
-
-L(Shl2LoopExit):
-	movdqu	-2(%rcx), %xmm1
-	mov	$14, %rsi
-	movdqu	%xmm1, -2(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl3):
-	movaps	-3(%rcx), %xmm1
-	movaps	13(%rcx), %xmm2
-L(Shl3Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit3Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl3LoopExit)
-
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	29(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit3Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl3LoopExit)
-
-	palignr	$3, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	29(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit3Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl3LoopExit)
-
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	29(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit3Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl3LoopExit)
-
-	palignr	$3, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	29(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-13(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-3(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl3LoopStart):
-	movaps	13(%rcx), %xmm2
-	movaps	29(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	45(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	61(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$3, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$3, %xmm3, %xmm4
-	jnz	L(Shl3Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave3)
-# endif
-	palignr	$3, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl3LoopStart)
-
-L(Shl3LoopExit):
-	movdqu	-3(%rcx), %xmm1
-	mov	$13, %rsi
-	movdqu	%xmm1, -3(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl4):
-	movaps	-4(%rcx), %xmm1
-	movaps	12(%rcx), %xmm2
-L(Shl4Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit4Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl4LoopExit)
-
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	28(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit4Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl4LoopExit)
-
-	palignr	$4, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	28(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit4Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl4LoopExit)
-
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	28(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit4Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl4LoopExit)
-
-	palignr	$4, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	28(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-12(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-4(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl4LoopStart):
-	movaps	12(%rcx), %xmm2
-	movaps	28(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	44(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	60(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$4, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$4, %xmm3, %xmm4
-	jnz	L(Shl4Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave4)
-# endif
-	palignr	$4, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl4LoopStart)
-
-L(Shl4LoopExit):
-	movdqu	-4(%rcx), %xmm1
-	mov	$12, %rsi
-	movdqu	%xmm1, -4(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl5):
-	movaps	-5(%rcx), %xmm1
-	movaps	11(%rcx), %xmm2
-L(Shl5Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit5Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl5LoopExit)
-
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	27(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit5Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl5LoopExit)
-
-	palignr	$5, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	27(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit5Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl5LoopExit)
-
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	27(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit5Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl5LoopExit)
-
-	palignr	$5, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	27(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-11(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-5(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl5LoopStart):
-	movaps	11(%rcx), %xmm2
-	movaps	27(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	43(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	59(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$5, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$5, %xmm3, %xmm4
-	jnz	L(Shl5Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave5)
-# endif
-	palignr	$5, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl5LoopStart)
-
-L(Shl5LoopExit):
-	movdqu	-5(%rcx), %xmm1
-	mov	$11, %rsi
-	movdqu	%xmm1, -5(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl6):
-	movaps	-6(%rcx), %xmm1
-	movaps	10(%rcx), %xmm2
-L(Shl6Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit6Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl6LoopExit)
-
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	26(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit6Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl6LoopExit)
-
-	palignr	$6, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	26(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit6Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl6LoopExit)
-
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	26(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit6Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl6LoopExit)
-
-	palignr	$6, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	26(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-10(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-6(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl6LoopStart):
-	movaps	10(%rcx), %xmm2
-	movaps	26(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	42(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	58(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$6, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$6, %xmm3, %xmm4
-	jnz	L(Shl6Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave6)
-# endif
-	palignr	$6, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl6LoopStart)
-
-L(Shl6LoopExit):
-	mov	(%rcx), %r9
-	mov	6(%rcx), %esi
-	mov	%r9, (%rdx)
-	mov	%esi, 6(%rdx)
-	mov	$10, %rsi
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl7):
-	movaps	-7(%rcx), %xmm1
-	movaps	9(%rcx), %xmm2
-L(Shl7Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit7Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl7LoopExit)
-
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	25(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit7Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl7LoopExit)
-
-	palignr	$7, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	25(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit7Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl7LoopExit)
-
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	25(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit7Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl7LoopExit)
-
-	palignr	$7, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	25(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-9(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-7(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl7LoopStart):
-	movaps	9(%rcx), %xmm2
-	movaps	25(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	41(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	57(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$7, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$7, %xmm3, %xmm4
-	jnz	L(Shl7Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave7)
-# endif
-	palignr	$7, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl7LoopStart)
-
-L(Shl7LoopExit):
-	mov	(%rcx), %r9
-	mov	5(%rcx), %esi
-	mov	%r9, (%rdx)
-	mov	%esi, 5(%rdx)
-	mov	$9, %rsi
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl8):
-	movaps	-8(%rcx), %xmm1
-	movaps	8(%rcx), %xmm2
-L(Shl8Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit8Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl8LoopExit)
-
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	24(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit8Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl8LoopExit)
-
-	palignr	$8, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	24(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit8Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl8LoopExit)
-
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	24(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit8Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl8LoopExit)
-
-	palignr	$8, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	24(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-8(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-8(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl8LoopStart):
-	movaps	8(%rcx), %xmm2
-	movaps	24(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	40(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	56(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$8, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$8, %xmm3, %xmm4
-	jnz	L(Shl8Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave8)
-# endif
-	palignr	$8, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl8LoopStart)
-
-L(Shl8LoopExit):
-	mov	(%rcx), %r9
-	mov	$8, %rsi
-	mov	%r9, (%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl9):
-	movaps	-9(%rcx), %xmm1
-	movaps	7(%rcx), %xmm2
-L(Shl9Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit9Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl9LoopExit)
-
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	23(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit9Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl9LoopExit)
-
-	palignr	$9, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	23(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit9Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl9LoopExit)
-
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	23(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit9Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl9LoopExit)
-
-	palignr	$9, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	23(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-7(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-9(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl9LoopStart):
-	movaps	7(%rcx), %xmm2
-	movaps	23(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	39(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	55(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$9, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$9, %xmm3, %xmm4
-	jnz	L(Shl9Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave9)
-# endif
-	palignr	$9, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl9LoopStart)
-
-L(Shl9LoopExit):
-	mov	-1(%rcx), %r9
-	mov	$7, %rsi
-	mov	%r9, -1(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl10):
-	movaps	-10(%rcx), %xmm1
-	movaps	6(%rcx), %xmm2
-L(Shl10Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit10Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl10LoopExit)
-
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	22(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit10Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl10LoopExit)
-
-	palignr	$10, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	22(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit10Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl10LoopExit)
-
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	22(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit10Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl10LoopExit)
-
-	palignr	$10, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	22(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-6(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-10(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl10LoopStart):
-	movaps	6(%rcx), %xmm2
-	movaps	22(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	38(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	54(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$10, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$10, %xmm3, %xmm4
-	jnz	L(Shl10Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave10)
-# endif
-	palignr	$10, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl10LoopStart)
-
-L(Shl10LoopExit):
-	mov	-2(%rcx), %r9
-	mov	$6, %rsi
-	mov	%r9, -2(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl11):
-	movaps	-11(%rcx), %xmm1
-	movaps	5(%rcx), %xmm2
-L(Shl11Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit11Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl11LoopExit)
-
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	21(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit11Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl11LoopExit)
-
-	palignr	$11, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	21(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit11Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl11LoopExit)
-
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	21(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit11Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl11LoopExit)
-
-	palignr	$11, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	21(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-5(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-11(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl11LoopStart):
-	movaps	5(%rcx), %xmm2
-	movaps	21(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	37(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	53(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$11, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$11, %xmm3, %xmm4
-	jnz	L(Shl11Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave11)
-# endif
-	palignr	$11, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl11LoopStart)
-
-L(Shl11LoopExit):
-	mov	-3(%rcx), %r9
-	mov	$5, %rsi
-	mov	%r9, -3(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl12):
-	movaps	-12(%rcx), %xmm1
-	movaps	4(%rcx), %xmm2
-L(Shl12Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit12Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl12LoopExit)
-
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	20(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit12Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl12LoopExit)
-
-	palignr	$12, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	20(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit12Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl12LoopExit)
-
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	20(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit12Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl12LoopExit)
-
-	palignr	$12, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	20(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-4(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-12(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl12LoopStart):
-	movaps	4(%rcx), %xmm2
-	movaps	20(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	36(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	52(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$12, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$12, %xmm3, %xmm4
-	jnz	L(Shl12Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave12)
-# endif
-	palignr	$12, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl12LoopStart)
-
-L(Shl12LoopExit):
-	mov	(%rcx), %r9d
-	mov	$4, %rsi
-	mov	%r9d, (%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl13):
-	movaps	-13(%rcx), %xmm1
-	movaps	3(%rcx), %xmm2
-L(Shl13Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit13Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl13LoopExit)
-
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	19(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit13Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl13LoopExit)
-
-	palignr	$13, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	19(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit13Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl13LoopExit)
-
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	19(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit13Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl13LoopExit)
-
-	palignr	$13, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	19(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-3(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-13(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl13LoopStart):
-	movaps	3(%rcx), %xmm2
-	movaps	19(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	35(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	51(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$13, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$13, %xmm3, %xmm4
-	jnz	L(Shl13Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave13)
-# endif
-	palignr	$13, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl13LoopStart)
-
-L(Shl13LoopExit):
-	mov	-1(%rcx), %r9d
-	mov	$3, %rsi
-	mov	%r9d, -1(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl14):
-	movaps	-14(%rcx), %xmm1
-	movaps	2(%rcx), %xmm2
-L(Shl14Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit14Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl14LoopExit)
-
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	18(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit14Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl14LoopExit)
-
-	palignr	$14, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	18(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit14Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl14LoopExit)
-
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	18(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit14Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl14LoopExit)
-
-	palignr	$14, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	18(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-2(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-14(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl14LoopStart):
-	movaps	2(%rcx), %xmm2
-	movaps	18(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	34(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	50(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$14, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$14, %xmm3, %xmm4
-	jnz	L(Shl14Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave14)
-# endif
-	palignr	$14, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl14LoopStart)
-
-L(Shl14LoopExit):
-	mov	-2(%rcx), %r9d
-	mov	$2, %rsi
-	mov	%r9d, -2(%rdx)
-	jmp	L(CopyFrom1To16Bytes)
-
-	.p2align 4
-L(Shl15):
-	movaps	-15(%rcx), %xmm1
-	movaps	1(%rcx), %xmm2
-L(Shl15Start):
-	pcmpeqb	%xmm2, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit15Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl15LoopExit)
-
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	17(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit15Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl15LoopExit)
-
-	palignr	$15, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	17(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-	movaps	%xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit15Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl15LoopExit)
-
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	17(%rcx), %xmm2
-
-	pcmpeqb	%xmm2, %xmm0
-	lea	16(%rdx), %rdx
-	pmovmskb %xmm0, %rax
-	lea	16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	jbe	L(StrncpyExit15Case2OrCase3)
-# endif
-	test	%rax, %rax
-	jnz	L(Shl15LoopExit)
-
-	palignr	$15, %xmm3, %xmm2
-	movaps	%xmm2, (%rdx)
-	lea	17(%rcx), %rcx
-	lea	16(%rdx), %rdx
-
-	mov	%rcx, %rax
-	and	$-0x40, %rcx
-	sub	%rcx, %rax
-	lea	-1(%rcx), %rcx
-	sub	%rax, %rdx
-# ifdef USE_AS_STRNCPY
-	add	%rax, %r8
-# endif
-	movaps	-15(%rcx), %xmm1
-
-/* 64 bytes loop */
-	.p2align 4
-L(Shl15LoopStart):
-	movaps	1(%rcx), %xmm2
-	movaps	17(%rcx), %xmm3
-	movaps	%xmm3, %xmm6
-	movaps	33(%rcx), %xmm4
-	movaps	%xmm4, %xmm7
-	movaps	49(%rcx), %xmm5
-	pminub	%xmm2, %xmm6
-	pminub	%xmm5, %xmm7
-	pminub	%xmm6, %xmm7
-	pcmpeqb	%xmm0, %xmm7
-	pmovmskb %xmm7, %rax
-	movaps	%xmm5, %xmm7
-	palignr	$15, %xmm4, %xmm5
-	test	%rax, %rax
-	palignr	$15, %xmm3, %xmm4
-	jnz	L(Shl15Start)
-# ifdef USE_AS_STRNCPY
-	sub	$64, %r8
-	jbe	L(StrncpyLeave15)
-# endif
-	palignr	$15, %xmm2, %xmm3
-	lea	64(%rcx), %rcx
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm7, %xmm1
-	movaps	%xmm5, 48(%rdx)
-	movaps	%xmm4, 32(%rdx)
-	movaps	%xmm3, 16(%rdx)
-	movaps	%xmm2, (%rdx)
-	lea	64(%rdx), %rdx
-	jmp	L(Shl15LoopStart)
-
-L(Shl15LoopExit):
-	mov	-3(%rcx), %r9d
-	mov	$1, %rsi
-	mov	%r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
-	jmp	L(CopyFrom1To16Bytes)
-# endif
-
-# ifndef USE_AS_STRCAT
-
-	.p2align 4
-L(CopyFrom1To16Bytes):
-#  ifdef USE_AS_STRNCPY
-	add	$16, %r8
-#  endif
-	add	%rsi, %rdx
-	add	%rsi, %rcx
-
-	test	%al, %al
-	jz	L(ExitHigh)
-	test	$0x01, %al
-	jnz	L(Exit1)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	test	$0x10, %al
-	jnz	L(Exit5)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	test	$0x40, %al
-	jnz	L(Exit7)
-
-	.p2align 4
-L(Exit8):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	7(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$8, %r8
-	lea	8(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(ExitHigh):
-	test	$0x01, %ah
-	jnz	L(Exit9)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	test	$0x08, %ah
-	jnz	L(Exit12)
-	test	$0x10, %ah
-	jnz	L(Exit13)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-
-	.p2align 4
-L(Exit16):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	8(%rcx), %rax
-	mov	%rax, 8(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	15(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$16, %r8
-	lea	16(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-#  ifdef USE_AS_STRNCPY
-
-	.p2align 4
-L(CopyFrom1To16BytesCase2):
-	add	$16, %r8
-	add	%rsi, %rcx
-	lea	(%rsi, %rdx), %rsi
-	lea	-9(%r8), %rdx
-	and	$1<<7, %dh
-	or	%al, %dh
-	test	%dh, %dh
-	lea	(%rsi), %rdx
-	jz	L(ExitHighCase2)
-
-	cmp	$1, %r8
-	je	L(Exit1)
-	test	$0x01, %al
-	jnz	L(Exit1)
-	cmp	$2, %r8
-	je	L(Exit2)
-	test	$0x02, %al
-	jnz	L(Exit2)
-	cmp	$3, %r8
-	je	L(Exit3)
-	test	$0x04, %al
-	jnz	L(Exit3)
-	cmp	$4, %r8
-	je	L(Exit4)
-	test	$0x08, %al
-	jnz	L(Exit4)
-	cmp	$5, %r8
-	je	L(Exit5)
-	test	$0x10, %al
-	jnz	L(Exit5)
-	cmp	$6, %r8
-	je	L(Exit6)
-	test	$0x20, %al
-	jnz	L(Exit6)
-	cmp	$7, %r8
-	je	L(Exit7)
-	test	$0x40, %al
-	jnz	L(Exit7)
-	jmp	L(Exit8)
-
-	.p2align 4
-L(ExitHighCase2):
-	cmp	$9, %r8
-	je	L(Exit9)
-	test	$0x01, %ah
-	jnz	L(Exit9)
-	cmp	$10, %r8
-	je	L(Exit10)
-	test	$0x02, %ah
-	jnz	L(Exit10)
-	cmp	$11, %r8
-	je	L(Exit11)
-	test	$0x04, %ah
-	jnz	L(Exit11)
-	cmp	$12, %r8
-	je	L(Exit12)
-	test	$0x8, %ah
-	jnz	L(Exit12)
-	cmp	$13, %r8
-	je	L(Exit13)
-	test	$0x10, %ah
-	jnz	L(Exit13)
-	cmp	$14, %r8
-	je	L(Exit14)
-	test	$0x20, %ah
-	jnz	L(Exit14)
-	cmp	$15, %r8
-	je	L(Exit15)
-	test	$0x40, %ah
-	jnz	L(Exit15)
-	jmp	L(Exit16)
-
-L(CopyFrom1To16BytesCase2OrCase3):
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-
-	.p2align 4
-L(CopyFrom1To16BytesCase3):
-	add	$16, %r8
-	add	%rsi, %rdx
-	add	%rsi, %rcx
-
-	cmp	$16, %r8
-	je	L(Exit16)
-	cmp	$8, %r8
-	je	L(Exit8)
-	jg	L(More8Case3)
-	cmp	$4, %r8
-	je	L(Exit4)
-	jg	L(More4Case3)
-	cmp	$2, %r8
-	jl	L(Exit1)
-	je	L(Exit2)
-	jg	L(Exit3)
-L(More8Case3): /* but less than 16 */
-	cmp	$12, %r8
-	je	L(Exit12)
-	jl	L(Less12Case3)
-	cmp	$14, %r8
-	jl	L(Exit13)
-	je	L(Exit14)
-	jg	L(Exit15)
-L(More4Case3): /* but less than 8 */
-	cmp	$6, %r8
-	jl	L(Exit5)
-	je	L(Exit6)
-	jg	L(Exit7)
-L(Less12Case3): /* but more than 8 */
-	cmp	$10, %r8
-	jl	L(Exit9)
-	je	L(Exit10)
-	jg	L(Exit11)
-#  endif
-
-	.p2align 4
-L(Exit1):
-	movb	(%rcx), %al
-	movb	%al, (%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$1, %r8
-	lea	1(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit2):
-	movw	(%rcx), %ax
-	movw	%ax, (%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	1(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$2, %r8
-	lea	2(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit3):
-	movw	(%rcx), %ax
-	movw	%ax, (%rdx)
-	movb	2(%rcx), %al
-	movb	%al, 2(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	2(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$3, %r8
-	lea	3(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit4):
-	movl	(%rcx), %eax
-	movl	%eax, (%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	3(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$4, %r8
-	lea	4(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit5):
-	movl	(%rcx), %eax
-	movl	%eax, (%rdx)
-	movb	4(%rcx), %al
-	movb	%al, 4(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	4(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$5, %r8
-	lea	5(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#  endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit6):
-	movl	(%rcx), %eax
-	movl	%eax, (%rdx)
-	movw	4(%rcx), %ax
-	movw	%ax, 4(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	5(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$6, %r8
-	lea	6(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#  endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit7):
-	movl	(%rcx), %eax
-	movl	%eax, (%rdx)
-	movl	3(%rcx), %eax
-	movl	%eax, 3(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	6(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$7, %r8
-	lea	7(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit9):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	5(%rcx), %eax
-	mov	%eax, 5(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	8(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$9, %r8
-	lea	9(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit10):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	6(%rcx), %eax
-	mov	%eax, 6(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	9(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$10, %r8
-	lea	10(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit11):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	7(%rcx), %eax
-	mov	%eax, 7(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	10(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$11, %r8
-	lea	11(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit12):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	8(%rcx), %eax
-	mov	%eax, 8(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	11(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$12, %r8
-	lea	12(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#  endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit13):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	5(%rcx), %rax
-	mov	%rax, 5(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	12(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$13, %r8
-	lea	13(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit14):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	6(%rcx), %rax
-	mov	%rax, 6(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	13(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$14, %r8
-	lea	14(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-	.p2align 4
-L(Exit15):
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	7(%rcx), %rax
-	mov	%rax, 7(%rdx)
-#  ifdef USE_AS_STPCPY
-	lea	14(%rdx), %rax
-#  else
-	mov	%rdi, %rax
-#  endif
-#  ifdef USE_AS_STRNCPY
-	sub	$15, %r8
-	lea	15(%rdx), %rcx
-	jnz	L(StrncpyFillTailWithZero1)
-#   ifdef USE_AS_STPCPY
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   endif
-#  endif
-	ret
-
-#  ifdef USE_AS_STRNCPY
-	.p2align 4
-L(Fill0):
-	ret
-
-	.p2align 4
-L(Fill1):
-	movb	%dl, (%rcx)
-	ret
-
-	.p2align 4
-L(Fill2):
-	movw	%dx, (%rcx)
-	ret
-
-	.p2align 4
-L(Fill3):
-	movw	%dx, (%rcx)
-	movb	%dl, 2(%rcx)
-	ret
-
-	.p2align 4
-L(Fill4):
-	movl	%edx, (%rcx)
-	ret
-
-	.p2align 4
-L(Fill5):
-	movl	%edx, (%rcx)
-	movb	%dl, 4(%rcx)
-	ret
-
-	.p2align 4
-L(Fill6):
-	movl	%edx, (%rcx)
-	movw	%dx, 4(%rcx)
-	ret
-
-	.p2align 4
-L(Fill7):
-	movl	%edx, (%rcx)
-	movl	%edx, 3(%rcx)
-	ret
-
-	.p2align 4
-L(Fill8):
-	mov	%rdx, (%rcx)
-	ret
-
-	.p2align 4
-L(Fill9):
-	mov	%rdx, (%rcx)
-	movb	%dl, 8(%rcx)
-	ret
-
-	.p2align 4
-L(Fill10):
-	mov	%rdx, (%rcx)
-	movw	%dx, 8(%rcx)
-	ret
-
-	.p2align 4
-L(Fill11):
-	mov	%rdx, (%rcx)
-	movl	%edx, 7(%rcx)
-	ret
-
-	.p2align 4
-L(Fill12):
-	mov	%rdx, (%rcx)
-	movl	%edx, 8(%rcx)
-	ret
-
-	.p2align 4
-L(Fill13):
-	mov	%rdx, (%rcx)
-	mov	%rdx, 5(%rcx)
-	ret
-
-	.p2align 4
-L(Fill14):
-	mov	%rdx, (%rcx)
-	mov	%rdx, 6(%rcx)
-	ret
-
-	.p2align 4
-L(Fill15):
-	mov	%rdx, (%rcx)
-	mov	%rdx, 7(%rcx)
-	ret
-
-	.p2align 4
-L(Fill16):
-	mov	%rdx, (%rcx)
-	mov	%rdx, 8(%rcx)
-	ret
-
-	.p2align 4
-L(StrncpyFillExit1):
-	lea	16(%r8), %r8
-L(FillFrom1To16Bytes):
-	test	%r8, %r8
-	jz	L(Fill0)
-	cmp	$16, %r8
-	je	L(Fill16)
-	cmp	$8, %r8
-	je	L(Fill8)
-	jg	L(FillMore8)
-	cmp	$4, %r8
-	je	L(Fill4)
-	jg	L(FillMore4)
-	cmp	$2, %r8
-	jl	L(Fill1)
-	je	L(Fill2)
-	jg	L(Fill3)
-L(FillMore8): /* but less than 16 */
-	cmp	$12, %r8
-	je	L(Fill12)
-	jl	L(FillLess12)
-	cmp	$14, %r8
-	jl	L(Fill13)
-	je	L(Fill14)
-	jg	L(Fill15)
-L(FillMore4): /* but less than 8 */
-	cmp	$6, %r8
-	jl	L(Fill5)
-	je	L(Fill6)
-	jg	L(Fill7)
-L(FillLess12): /* but more than 8 */
-	cmp	$10, %r8
-	jl	L(Fill9)
-	je	L(Fill10)
-	jmp	L(Fill11)
-
-	.p2align 4
-L(StrncpyFillTailWithZero1):
-	xor	%rdx, %rdx
-	sub	$16, %r8
-	jbe	L(StrncpyFillExit1)
-
-	pxor	%xmm0, %xmm0
-	mov	%rdx, (%rcx)
-	mov	%rdx, 8(%rcx)
-
-	lea	16(%rcx), %rcx
-
-	mov	%rcx, %rdx
-	and	$0xf, %rdx
-	sub	%rdx, %rcx
-	add	%rdx, %r8
-	xor	%rdx, %rdx
-	sub	$64, %r8
-	jb	L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
-	movdqa	%xmm0, (%rcx)
-	movdqa	%xmm0, 16(%rcx)
-	movdqa	%xmm0, 32(%rcx)
-	movdqa	%xmm0, 48(%rcx)
-	lea	64(%rcx), %rcx
-	sub	$64, %r8
-	jae	L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
-	add	$32, %r8
-	jl	L(StrncpyFillLess32)
-	movdqa	%xmm0, (%rcx)
-	movdqa	%xmm0, 16(%rcx)
-	lea	32(%rcx), %rcx
-	sub	$16, %r8
-	jl	L(StrncpyFillExit1)
-	movdqa	%xmm0, (%rcx)
-	lea	16(%rcx), %rcx
-	jmp	L(FillFrom1To16Bytes)
-
-L(StrncpyFillLess32):
-	add	$16, %r8
-	jl	L(StrncpyFillExit1)
-	movdqa	%xmm0, (%rcx)
-	lea	16(%rcx), %rcx
-	jmp	L(FillFrom1To16Bytes)
-
-	.p2align 4
-L(Exit0):
-	mov	%rdx, %rax
-	ret
-
-	.p2align 4
-L(StrncpyExit15Bytes):
-	cmp	$9, %r8
-	je	L(Exit9)
-	cmpb	$0, 8(%rcx)
-	jz	L(Exit9)
-	cmp	$10, %r8
-	je	L(Exit10)
-	cmpb	$0, 9(%rcx)
-	jz	L(Exit10)
-	cmp	$11, %r8
-	je	L(Exit11)
-	cmpb	$0, 10(%rcx)
-	jz	L(Exit11)
-	cmp	$12, %r8
-	je	L(Exit12)
-	cmpb	$0, 11(%rcx)
-	jz	L(Exit12)
-	cmp	$13, %r8
-	je	L(Exit13)
-	cmpb	$0, 12(%rcx)
-	jz	L(Exit13)
-	cmp	$14, %r8
-	je	L(Exit14)
-	cmpb	$0, 13(%rcx)
-	jz	L(Exit14)
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-	mov	7(%rcx), %rax
-	mov	%rax, 7(%rdx)
-#   ifdef USE_AS_STPCPY
-	lea	14(%rdx), %rax
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   else
-	mov	%rdi, %rax
-#   endif
-	ret
-
-	.p2align 4
-L(StrncpyExit8Bytes):
-	cmp	$1, %r8
-	je	L(Exit1)
-	cmpb	$0, (%rcx)
-	jz	L(Exit1)
-	cmp	$2, %r8
-	je	L(Exit2)
-	cmpb	$0, 1(%rcx)
-	jz	L(Exit2)
-	cmp	$3, %r8
-	je	L(Exit3)
-	cmpb	$0, 2(%rcx)
-	jz	L(Exit3)
-	cmp	$4, %r8
-	je	L(Exit4)
-	cmpb	$0, 3(%rcx)
-	jz	L(Exit4)
-	cmp	$5, %r8
-	je	L(Exit5)
-	cmpb	$0, 4(%rcx)
-	jz	L(Exit5)
-	cmp	$6, %r8
-	je	L(Exit6)
-	cmpb	$0, 5(%rcx)
-	jz	L(Exit6)
-	cmp	$7, %r8
-	je	L(Exit7)
-	cmpb	$0, 6(%rcx)
-	jz	L(Exit7)
-	mov	(%rcx), %rax
-	mov	%rax, (%rdx)
-#   ifdef USE_AS_STPCPY
-	lea	7(%rdx), %rax
-	cmpb	$1, (%rax)
-	sbb	$-1, %rax
-#   else
-	mov	%rdi, %rax
-#   endif
-	ret
-
-#  endif
-# endif
-
-# ifdef USE_AS_STRNCPY
-	.p2align 4
-L(StrncpyLeaveCase2OrCase3):
-	test	%rax, %rax
-	jnz	L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):
-	lea	64(%r8), %r8
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase3)
-	movaps	%xmm4, -64(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase3)
-	movaps	%xmm5, -48(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase3)
-	movaps	%xmm6, -32(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-	jmp	L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):
-	pcmpeqb	%xmm4, %xmm0
-	pmovmskb %xmm0, %rax
-	add	$48, %r8
-	jle	L(CopyFrom1To16BytesCase2OrCase3)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm5, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm4, -64(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm6, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm5, -48(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(CopyFrom1To16BytesCase2OrCase3)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16Bytes)
-
-	pcmpeqb	%xmm7, %xmm0
-	pmovmskb %xmm0, %rax
-	movaps	%xmm6, -32(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-	jmp	L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
-	.p2align 4
-L(StrncpyExit1Case2OrCase3):
-	movdqu	-1(%rcx), %xmm0
-	movdqu	%xmm0, -1(%rdx)
-	mov	$15, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit2Case2OrCase3):
-	movdqu	-2(%rcx), %xmm0
-	movdqu	%xmm0, -2(%rdx)
-	mov	$14, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit3Case2OrCase3):
-	movdqu	-3(%rcx), %xmm0
-	movdqu	%xmm0, -3(%rdx)
-	mov	$13, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit4Case2OrCase3):
-	movdqu	-4(%rcx), %xmm0
-	movdqu	%xmm0, -4(%rdx)
-	mov	$12, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit5Case2OrCase3):
-	movdqu	-5(%rcx), %xmm0
-	movdqu	%xmm0, -5(%rdx)
-	mov	$11, %rsi
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit6Case2OrCase3):
-	mov	(%rcx), %rsi
-	mov	6(%rcx), %r9d
-	mov	%r9d, 6(%rdx)
-	mov	%rsi, (%rdx)
-	test	%rax, %rax
-	mov	$10, %rsi
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit7Case2OrCase3):
-	mov	(%rcx), %rsi
-	mov	5(%rcx), %r9d
-	mov	%r9d, 5(%rdx)
-	mov	%rsi, (%rdx)
-	test	%rax, %rax
-	mov	$9, %rsi
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit8Case2OrCase3):
-	mov	(%rcx), %r9
-	mov	$8, %rsi
-	mov	%r9, (%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit9Case2OrCase3):
-	mov	-1(%rcx), %r9
-	mov	$7, %rsi
-	mov	%r9, -1(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit10Case2OrCase3):
-	mov	-2(%rcx), %r9
-	mov	$6, %rsi
-	mov	%r9, -2(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit11Case2OrCase3):
-	mov	-3(%rcx), %r9
-	mov	$5, %rsi
-	mov	%r9, -3(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit12Case2OrCase3):
-	mov	(%rcx), %r9d
-	mov	$4, %rsi
-	mov	%r9d, (%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit13Case2OrCase3):
-	mov	-1(%rcx), %r9d
-	mov	$3, %rsi
-	mov	%r9d, -1(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit14Case2OrCase3):
-	mov	-2(%rcx), %r9d
-	mov	$2, %rsi
-	mov	%r9d, -2(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyExit15Case2OrCase3):
-	mov	-3(%rcx), %r9d
-	mov	$1, %rsi
-	mov	%r9d, -3(%rdx)
-	test	%rax, %rax
-	jnz	L(CopyFrom1To16BytesCase2)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave1):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit1)
-	palignr	$1, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	31(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit1)
-	palignr	$1, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit1)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit1)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit1):
-	lea	15(%rdx, %rsi), %rdx
-	lea	15(%rcx, %rsi), %rcx
-	mov	-15(%rcx), %rsi
-	mov	-8(%rcx), %rax
-	mov	%rsi, -15(%rdx)
-	mov	%rax, -8(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave2):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit2)
-	palignr	$2, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	30(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit2)
-	palignr	$2, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit2)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit2)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit2):
-	lea	14(%rdx, %rsi), %rdx
-	lea	14(%rcx, %rsi), %rcx
-	mov	-14(%rcx), %rsi
-	mov	-8(%rcx), %rax
-	mov	%rsi, -14(%rdx)
-	mov	%rax, -8(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave3):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit3)
-	palignr	$3, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	29(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit3)
-	palignr	$3, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit3)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit3)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit3):
-	lea	13(%rdx, %rsi), %rdx
-	lea	13(%rcx, %rsi), %rcx
-	mov	-13(%rcx), %rsi
-	mov	-8(%rcx), %rax
-	mov	%rsi, -13(%rdx)
-	mov	%rax, -8(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave4):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit4)
-	palignr	$4, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	28(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit4)
-	palignr	$4, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit4)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit4)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit4):
-	lea	12(%rdx, %rsi), %rdx
-	lea	12(%rcx, %rsi), %rcx
-	mov	-12(%rcx), %rsi
-	mov	-4(%rcx), %eax
-	mov	%rsi, -12(%rdx)
-	mov	%eax, -4(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave5):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit5)
-	palignr	$5, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	27(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit5)
-	palignr	$5, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit5)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit5)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit5):
-	lea	11(%rdx, %rsi), %rdx
-	lea	11(%rcx, %rsi), %rcx
-	mov	-11(%rcx), %rsi
-	mov	-4(%rcx), %eax
-	mov	%rsi, -11(%rdx)
-	mov	%eax, -4(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave6):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit6)
-	palignr	$6, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	26(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit6)
-	palignr	$6, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit6)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit6)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit6):
-	lea	10(%rdx, %rsi), %rdx
-	lea	10(%rcx, %rsi), %rcx
-	mov	-10(%rcx), %rsi
-	movw	-2(%rcx), %ax
-	mov	%rsi, -10(%rdx)
-	movw	%ax, -2(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave7):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit7)
-	palignr	$7, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	25(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit7)
-	palignr	$7, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit7)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit7)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit7):
-	lea	9(%rdx, %rsi), %rdx
-	lea	9(%rcx, %rsi), %rcx
-	mov	-9(%rcx), %rsi
-	movb	-1(%rcx), %ah
-	mov	%rsi, -9(%rdx)
-	movb	%ah, -1(%rdx)
-	xor	%rsi, %rsi
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave8):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit8)
-	palignr	$8, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	24(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit8)
-	palignr	$8, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit8)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit8)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit8):
-	lea	8(%rdx, %rsi), %rdx
-	lea	8(%rcx, %rsi), %rcx
-	mov	-8(%rcx), %rax
-	xor	%rsi, %rsi
-	mov	%rax, -8(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave9):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit9)
-	palignr	$9, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	23(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit9)
-	palignr	$9, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit9)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit9)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit9):
-	lea	7(%rdx, %rsi), %rdx
-	lea	7(%rcx, %rsi), %rcx
-	mov	-8(%rcx), %rax
-	xor	%rsi, %rsi
-	mov	%rax, -8(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave10):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit10)
-	palignr	$10, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	22(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit10)
-	palignr	$10, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit10)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit10)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit10):
-	lea	6(%rdx, %rsi), %rdx
-	lea	6(%rcx, %rsi), %rcx
-	mov	-8(%rcx), %rax
-	xor	%rsi, %rsi
-	mov	%rax, -8(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave11):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit11)
-	palignr	$11, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	21(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit11)
-	palignr	$11, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit11)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit11)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit11):
-	lea	5(%rdx, %rsi), %rdx
-	lea	5(%rcx, %rsi), %rcx
-	mov	-8(%rcx), %rax
-	xor	%rsi, %rsi
-	mov	%rax, -8(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave12):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit12)
-	palignr	$12, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	20(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit12)
-	palignr	$12, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit12)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit12)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit12):
-	lea	4(%rdx, %rsi), %rdx
-	lea	4(%rcx, %rsi), %rcx
-	mov	-4(%rcx), %eax
-	xor	%rsi, %rsi
-	mov	%eax, -4(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave13):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit13)
-	palignr	$13, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	19(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit13)
-	palignr	$13, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit13)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit13)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit13):
-	lea	3(%rdx, %rsi), %rdx
-	lea	3(%rcx, %rsi), %rcx
-	mov	-4(%rcx), %eax
-	xor	%rsi, %rsi
-	mov	%eax, -4(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave14):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit14)
-	palignr	$14, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	18(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit14)
-	palignr	$14, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit14)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit14)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit14):
-	lea	2(%rdx, %rsi), %rdx
-	lea	2(%rcx, %rsi), %rcx
-	movw	-2(%rcx), %ax
-	xor	%rsi, %rsi
-	movw	%ax, -2(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave15):
-	movaps	%xmm2, %xmm3
-	add	$48, %r8
-	jle	L(StrncpyExit15)
-	palignr	$15, %xmm1, %xmm2
-	movaps	%xmm2, (%rdx)
-	movaps	17(%rcx), %xmm2
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit15)
-	palignr	$15, %xmm3, %xmm2
-	movaps	%xmm2, 16(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit15)
-	movaps	%xmm4, 32(%rdx)
-	lea	16(%rsi), %rsi
-	sub	$16, %r8
-	jbe	L(StrncpyExit15)
-	movaps	%xmm5, 48(%rdx)
-	lea	16(%rsi), %rsi
-	lea	-16(%r8), %r8
-
-L(StrncpyExit15):
-	lea	1(%rdx, %rsi), %rdx
-	lea	1(%rcx, %rsi), %rcx
-	movb	-1(%rcx), %ah
-	xor	%rsi, %rsi
-	movb	%ah, -1(%rdx)
-	jmp	L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"