From patchwork Wed Jun 21 06:51:21 2017
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
X-Patchwork-Submitter: Stefan Liebler <stli@linux.vnet.ibm.com>
X-Patchwork-Id: 21159
Received: (qmail 68732 invoked by alias); 21 Jun 2017 06:51:41 -0000
Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm
Precedence: bulk
List-Id: <libc-alpha.sourceware.org>
List-Unsubscribe: <mailto:libc-alpha-unsubscribe-##L=##H@sourceware.org>
List-Subscribe: <mailto:libc-alpha-subscribe@sourceware.org>
List-Archive: <http://sourceware.org/ml/libc-alpha/>
List-Post: <mailto:libc-alpha@sourceware.org>
List-Help: <mailto:libc-alpha-help@sourceware.org>,
	<http://sourceware.org/ml/#faqs>
Sender: libc-alpha-owner@sourceware.org
Delivered-To: mailing list libc-alpha@sourceware.org
Received: (qmail 68637 invoked by uid 89); 21 Jun 2017 06:51:41 -0000
Authentication-Results: sourceware.org; auth=none
X-Virus-Found: No
X-Spam-SWARE-Status: No, score=-23.7 required=5.0 tests=AWL, BAYES_00,
	GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3,
	KAM_LAZY_DOMAIN_SECURITY, KHOP_DYNAMIC,
	RCVD_IN_DNSWL_LOW autolearn=ham version=3.3.2 spammy=enlarge,
	Enlarge
X-HELO: mx0a-001b2d01.pphosted.com
From: Stefan Liebler <stli@linux.vnet.ibm.com>
To: libc-alpha@sourceware.org
Cc: Stefan Liebler <stli@linux.vnet.ibm.com>
Subject: [PATCH 4/5] S390: Use cu24 instruction for converting from utf16 to
	utf32.
Date: Wed, 21 Jun 2017 08:51:21 +0200
In-Reply-To: <1498027882-24892-1-git-send-email-stli@linux.vnet.ibm.com>
References: <1498027882-24892-1-git-send-email-stli@linux.vnet.ibm.com>
X-TM-AS-GCONF: 00
x-cbid: 17062106-0040-0000-0000-000003CF5824
X-IBM-AV-DETECTION: SAVI=unused REMOTE=unused XFE=unused
x-cbparentid: 17062106-0041-0000-0000-000020690124
Message-Id: <1498027882-24892-4-git-send-email-stli@linux.vnet.ibm.com>
X-Proofpoint-Virus-Version: vendor=fsecure engine=2.50.10432:, ,
	definitions=2017-06-21_01:, , signatures=0
X-Proofpoint-Spam-Details: rule=outbound_notspam policy=outbound score=0
	spamscore=0 suspectscore=1
	malwarescore=0 phishscore=0 adultscore=0 bulkscore=0 classifier=spam
	adjust=0 reason=mlx scancount=1 engine=8.0.1-1703280000
	definitions=main-1706210113

This patch adds an ifunc variant to use the cu instruction on arch12 CPUs.
This new ifunc variant can be built if binutils support z13 vector
instructions.  At runtime, HWCAP_S390_VXE decides if we can use the
cu24 instruction.

ChangeLog:

	* sysdeps/s390/utf16-utf32-z9.c (__from_utf16_loop_vx_cu):
	Use vector and cu24 instruction.
    	* sysdeps/s390/multiarch/utf16-utf32-z9.c:
    	Add __from_utf16_loop_vx_cu in ifunc resolver.
---
 sysdeps/s390/multiarch/utf16-utf32-z9.c |   8 ++-
 sysdeps/s390/utf16-utf32-z9.c           | 113 +++++++++++++++++++++++++++++++-
 2 files changed, 117 insertions(+), 4 deletions(-)

diff --git a/sysdeps/s390/multiarch/utf16-utf32-z9.c b/sysdeps/s390/multiarch/utf16-utf32-z9.c
index ded3cc2..917de5f 100644
--- a/sysdeps/s390/multiarch/utf16-utf32-z9.c
+++ b/sysdeps/s390/multiarch/utf16-utf32-z9.c
@@ -32,9 +32,11 @@ strong_alias (SINGLE_NAME (TO_LOOP_DEFAULT), SINGLE_NAME (TO_LOOP))
 
 /* Generate ifunc'ed loop functions for FROM/TO_LOOP.  */
 s390_libc_ifunc_expr (FROM_LOOP_DEFAULT, FROM_LOOP,
-		      (HAVE_FROM_VX && (hwcap & HWCAP_S390_VX))
-		      ? FROM_LOOP_VX
-		      : FROM_LOOP_DEFAULT);
+		      (HAVE_FROM_VX_CU && (hwcap & HWCAP_S390_VXE))
+		      ? FROM_LOOP_VX_CU
+		      : (HAVE_FROM_VX && (hwcap & HWCAP_S390_VX))
+			? FROM_LOOP_VX
+			: FROM_LOOP_DEFAULT);
 
 s390_libc_ifunc_expr (TO_LOOP_DEFAULT, TO_LOOP,
 		      (HAVE_TO_VX_CU && (hwcap & HWCAP_S390_VXE))
diff --git a/sysdeps/s390/utf16-utf32-z9.c b/sysdeps/s390/utf16-utf32-z9.c
index 45c9f2b..07cff21 100644
--- a/sysdeps/s390/utf16-utf32-z9.c
+++ b/sysdeps/s390/utf16-utf32-z9.c
@@ -39,10 +39,12 @@
 
 #if defined HAVE_S390_VX_ASM_SUPPORT && defined USE_MULTIARCH
 # define HAVE_FROM_VX		1
+# define HAVE_FROM_VX_CU	1
 # define HAVE_TO_VX		1
 # define HAVE_TO_VX_CU		1
 #else
 # define HAVE_FROM_VX		0
+# define HAVE_FROM_VX_CU	0
 # define HAVE_TO_VX		0
 # define HAVE_TO_VX_CU		0
 #endif
@@ -383,7 +385,6 @@ gconv_end (struct __gconv_step *data)
     STANDARD_FROM_LOOP_ERR_HANDLER (2);					\
   }
 
-
 /* Generate loop-function with hardware vector instructions.  */
 # define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
 # define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
@@ -397,6 +398,116 @@ gconv_end (struct __gconv_step *data)
 # define FROM_LOOP_VX		NULL
 #endif /* HAVE_FROM_VX != 1  */
 
+#if HAVE_FROM_VX_CU == 1
+#define BODY_FROM_VX_CU							\
+  {									\
+    register const unsigned char* pInput asm ("8") = inptr;		\
+    register size_t inlen asm ("9") = inend - inptr;			\
+    register unsigned char* pOutput asm ("10") = outptr;		\
+    register size_t outlen asm ("11") = outend - outptr;		\
+    unsigned long tmp, tmp2, tmp3;					\
+    asm volatile (".machine push\n\t"					\
+		  ".machine \"z13\"\n\t"				\
+		  ".machinemode \"zarch_nohighgprs\"\n\t"		\
+		  /* Setup to check for surrogates.  */			\
+		  "    larl %[R_TMP],9f\n\t"				\
+		  "    vlm %%v30,%%v31,0(%[R_TMP])\n\t"			\
+		  CONVERT_32BIT_SIZE_T ([R_INLEN])			\
+		  CONVERT_32BIT_SIZE_T ([R_OUTLEN])			\
+		  /* Loop which handles UTF-16 chars <0xd800, >0xdfff.  */ \
+		  "0:  clgijl %[R_INLEN],16,20f\n\t"			\
+		  "    clgijl %[R_OUTLEN],32,20f\n\t"			\
+		  "1:  vl %%v16,0(%[R_IN])\n\t"				\
+		  /* Check for surrogate chars.  */			\
+		  "    vstrchs %%v19,%%v16,%%v30,%%v31\n\t"		\
+		  "    jno 10f\n\t"					\
+		  /* Enlarge to UTF-32.  */				\
+		  "    vuplhh %%v17,%%v16\n\t"				\
+		  "    la %[R_IN],16(%[R_IN])\n\t"			\
+		  "    vupllh %%v18,%%v16\n\t"				\
+		  "    aghi %[R_INLEN],-16\n\t"				\
+		  /* Store 32 bytes to buf_out.  */			\
+		  "    vstm %%v17,%%v18,0(%[R_OUT])\n\t"		\
+		  "    aghi %[R_OUTLEN],-32\n\t"			\
+		  "    la %[R_OUT],32(%[R_OUT])\n\t"			\
+		  "    clgijl %[R_INLEN],16,20f\n\t"			\
+		  "    clgijl %[R_OUTLEN],32,20f\n\t"			\
+		  "    j 1b\n\t"					\
+		  /* Setup to check for ch >= 0xd800 && ch <= 0xdfff. (v30, v31)  */ \
+		  "9:  .short 0xd800,0xdfff,0x0,0x0,0x0,0x0,0x0,0x0\n\t" \
+		  "    .short 0xa000,0xc000,0x0,0x0,0x0,0x0,0x0,0x0\n\t" \
+		  /* At least one uint16_t is in range of surrogates.	\
+		     Store the preceding chars.  */			\
+		  "10: vlgvb %[R_TMP],%%v19,7\n\t"			\
+		  "    vuplhh %%v17,%%v16\n\t"				\
+		  "    sllg %[R_TMP3],%[R_TMP],1\n\t" /* Number of out bytes.  */ \
+		  "    ahik %[R_TMP2],%[R_TMP3],-1\n\t" /* Highest index to store.  */ \
+		  "    jl 20f\n\t"					\
+		  "    vstl %%v17,%[R_TMP2],0(%[R_OUT])\n\t"		\
+		  "    vupllh %%v18,%%v16\n\t"				\
+		  "    ahi %[R_TMP2],-16\n\t"				\
+		  "    jl 11f\n\t"					\
+		  "    vstl %%v18,%[R_TMP2],16(%[R_OUT])\n\t"		\
+		  "11: \n\t" /* Update pointers.  */			\
+		  "    la %[R_IN],0(%[R_TMP],%[R_IN])\n\t"		\
+		  "    slgr %[R_INLEN],%[R_TMP]\n\t"			\
+		  "    la %[R_OUT],0(%[R_TMP3],%[R_OUT])\n\t"		\
+		  "    slgr %[R_OUTLEN],%[R_TMP3]\n\t"			\
+		  /* Handles UTF16 surrogates with convert instruction.  */ \
+		  "20: cu24 %[R_OUT],%[R_IN],1\n\t"			\
+		  "    jo 0b\n\t" /* Try vector implemenation again.  */ \
+		  "    lochil %[R_RES],%[RES_OUT_FULL]\n\t" /* cc == 1.  */ \
+		  "    lochih %[R_RES],%[RES_IN_ILL]\n\t" /* cc == 2.  */ \
+		  ".machine pop"					\
+		  : /* outputs */ [R_IN] "+a" (pInput)			\
+		    , [R_INLEN] "+d" (inlen), [R_OUT] "+a" (pOutput)	\
+		    , [R_OUTLEN] "+d" (outlen), [R_TMP] "=a" (tmp)	\
+		    , [R_TMP2] "=d" (tmp2), [R_TMP3] "=a" (tmp3)	\
+		    , [R_RES] "+d" (result)				\
+		  : /* inputs */					\
+		    [RES_OUT_FULL] "i" (__GCONV_FULL_OUTPUT)		\
+		    , [RES_IN_ILL] "i" (__GCONV_ILLEGAL_INPUT)		\
+		  : /* clobber list */ "memory", "cc"			\
+		    ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
+		    ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
+		    ASM_CLOBBER_VR ("v30") ASM_CLOBBER_VR ("v31")	\
+		  );							\
+    inptr = pInput;							\
+    outptr = pOutput;							\
+									\
+    if (__glibc_likely (inlen == 0)					\
+	|| result == __GCONV_FULL_OUTPUT)				\
+      break;								\
+    if (inlen == 1)							\
+      {									\
+	/* Input does not contain a complete utf16 character.  */	\
+	result = __GCONV_INCOMPLETE_INPUT;				\
+	break;								\
+      }									\
+    else if (result != __GCONV_ILLEGAL_INPUT)				\
+      {									\
+	/* Input is >= 2 and < 4 bytes (as cu24 would have processed	\
+	   a possible next utf16 character) and not illegal.		\
+	   => we have a single high surrogate at end of input.  */	\
+	result = __GCONV_INCOMPLETE_INPUT;				\
+	break;								\
+      }									\
+									\
+    STANDARD_FROM_LOOP_ERR_HANDLER (2);					\
+  }
+
+/* Generate loop-function with hardware vector and utf-convert instructions.  */
+# define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+# define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+# define FROM_LOOP_VX_CU	__from_utf16_loop_vx_cu
+# define LOOPFCT		FROM_LOOP_VX_CU
+# define LOOP_NEED_FLAGS
+# define BODY			BODY_FROM_VX_CU
+# include <iconv/loop.c>
+#else
+# define FROM_LOOP_VX_CU	NULL
+#endif /* HAVE_FROM_VX_CU != 1  */
 
 /* Conversion from UTF-32 internal/BE to UTF-16.  */