MIPS specific strcmp function

Message ID 1412181359.2740.479.camel@ubuntu-sellcey
State Committed
Headers

Commit Message

Steve Ellcey Oct. 1, 2014, 4:35 p.m. UTC
  I have created a new MIPS strcmp routine.  I tried rewriting this in C a
while back but it was still slower then the standard version in some
unaligned cases.  I have gone back and rewritten it in assembly language
and this version is faster then the standard version in all cases except
for one byte comparisons and one of the two byte comparison tests from
bench-strcmp.

I have attached old and new bench-strcmp.out files to show the
improvement.

There is an ifdef for using the CLZ instruction but on the machine where
I was doing performance testing using CLZ did not help so I did not turn
this ifdef on by default.  Other MIPS platforms might find it helpful
though so I left the code in.

I ran the glibc testsuite with no regressions on mips32 big and little
endian platforms and also did builds and tests using my own tests on a
mips64 simulator.

OK to checkin?

Steve Ellcey
sellcey@mips.com


2014-10-01  Steve Ellcey  <sellcey@mips.com>

	* sysdeps/mips/strcmp.S: New.
strcmp	simple_strcmp	stupid_strcmp
Length    1, alignment  1/01:	114.844	37.1719	124.5
Length    1, alignment  1/01:	28.8281	21.3281	67.125
Length    1, alignment  1/01:	27.7969	21.0781	72.3438
Length    2, alignment  2/02:	31.0312	45.3594	112.109
Length    2, alignment  2/02:	30.7656	42.4531	106.359
Length    2, alignment  2/02:	29.9219	30.9531	107.781
Length    3, alignment  3/03:	33.6406	44.6719	104.984
Length    3, alignment  3/03:	32.4844	46.6094	104.5
Length    3, alignment  3/03:	32.5156	45.6875	104.703
Length    4, alignment  4/04:	36.5938	48.7656	110.969
Length    4, alignment  4/04:	34.0156	43.5781	105.188
Length    4, alignment  4/04:	32.5156	43.8906	105.234
Length    5, alignment  5/05:	38.9531	53.5781	157.516
Length    5, alignment  5/05:	37.2812	48.8281	150.453
Length    5, alignment  5/05:	37.7344	48.2812	151.469
Length    6, alignment  6/06:	41.6094	58.5	154.281
Length    6, alignment  6/06:	40.0625	52.9375	148.234
Length    6, alignment  6/06:	40.4531	52.9375	147.688
Length    7, alignment  7/07:	44.5938	62.9844	146.047
Length    7, alignment  7/07:	43.9375	57.1406	140.922
Length    7, alignment  7/07:	42.3594	57.75	141.766
Length    8, alignment  8/08:	39.6094	67.125	153.547
Length    8, alignment  8/08:	38.1875	62.4688	147.219
Length    8, alignment  8/08:	38.0625	63.2656	147.844
Length    9, alignment  9/09:	60.5312	72.25	198.531
Length    9, alignment  9/09:	58.5781	67.0938	193.562
Length    9, alignment  9/09:	57.5625	67.0312	193.062
Length   10, alignment 10/10:	62.5469	76.9219	194.781
Length   10, alignment 10/10:	60.5625	71.8594	188.562
Length   10, alignment 10/10:	60.5938	72.1562	188.656
Length   11, alignment 11/11:	65.9375	82.2812	185.078
Length   11, alignment 11/11:	62.5469	76.9844	179.125
Length   11, alignment 11/11:	62.8906	76.3125	179.094
Length   12, alignment 12/12:	45.6562	85.7969	173.938
Length   12, alignment 12/12:	42.1719	81.1562	167.578
Length   12, alignment 12/12:	42.6094	80.4062	167.719
Length   13, alignment 13/13:	71.2969	91.1562	237.484
Length   13, alignment 13/13:	68.5625	85.625	230.531
Length   13, alignment 13/13:	67.875	85.7188	231.406
Length   14, alignment 14/14:	72.9062	95.1094	211.297
Length   14, alignment 14/14:	70.8594	90.1562	204.953
Length   14, alignment 14/14:	71.625	90.4062	205.484
Length   15, alignment 15/15:	75.25	100.969	205.5
Length   15, alignment 15/15:	73.5625	94.6562	199.812
Length   15, alignment 15/15:	73.2031	95.5625	198.906
Length   16, alignment 16/16:	50.2188	104.828	204.281
Length   16, alignment 16/16:	47.9062	99.0781	198.281
Length   16, alignment 16/16:	46.9688	99.8438	198.172
Length   17, alignment 17/17:	81.9375	110.234	257.344
Length   17, alignment 17/17:	80.5781	103.609	251.391
Length   17, alignment 17/17:	80.2344	104.328	251.25
Length   18, alignment 18/18:	84.875	114.406	241.781
Length   18, alignment 18/18:	82.9688	108.891	236.094
Length   18, alignment 18/18:	82.5	108.984	235.547
Length   19, alignment 19/19:	87.9375	118.281	235.516
Length   19, alignment 19/19:	85.5625	113.297	229.484
Length   19, alignment 19/19:	85.5938	113.484	230.031
Length   20, alignment 20/20:	54.3125	123.125	234.953
Length   20, alignment 20/20:	52.5312	118.594	229.281
Length   20, alignment 20/20:	52.625	118.875	229.172
Length   21, alignment 21/21:	93.5469	127.781	287.328
Length   21, alignment 21/21:	91.2656	122.656	282.156
Length   21, alignment 21/21:	91.25	122.094	281.938
Length   22, alignment 22/22:	95.5156	132.469	272.531
Length   22, alignment 22/22:	93.5469	127.641	267.25
Length   22, alignment 22/22:	93.5156	127.938	267.172
Length   23, alignment 23/23:	98.9375	137.438	267.219
Length   23, alignment 23/23:	97.3125	132.547	261.094
Length   23, alignment 23/23:	96.9531	131.438	260.812
Length   24, alignment 24/24:	57.875	142.406	266.297
Length   24, alignment 24/24:	57.1875	137.625	259.578
Length   24, alignment 24/24:	57.4375	137.219	259.922
Length   25, alignment 25/25:	105.234	147.266	318.672
Length   25, alignment 25/25:	103.594	142.547	312.828
Length   25, alignment 25/25:	103.266	141.672	312.812
Length   26, alignment 26/26:	108.625	151.5	303.672
Length   26, alignment 26/26:	106.234	146.594	297.062
Length   26, alignment 26/26:	105.531	146.656	297.469
Length   27, alignment 27/27:	110.953	156.312	296.328
Length   27, alignment 27/27:	108.516	151.516	291.234
Length   27, alignment 27/27:	108.938	151.406	291.016
Length   28, alignment 28/28:	63.4844	160.656	296.734
Length   28, alignment 28/28:	61.875	155.906	289.672
Length   28, alignment 28/28:	62	155.375	290.031
Length   29, alignment 29/29:	116.953	166.172	349.125
Length   29, alignment 29/29:	113.859	160.078	343.141
Length   29, alignment 29/29:	113.938	160.359	343.75
Length   30, alignment 30/30:	118.578	170.734	334.312
Length   30, alignment 30/30:	116.906	164.734	329.094
Length   30, alignment 30/30:	116.938	164.859	327.609
Length   31, alignment 31/31:	120.219	174.469	327.328
Length   31, alignment 31/31:	119.875	169.75	322.109
Length   31, alignment 31/31:	119.234	170.438	322.297
Length    4, alignment  0/00:	34.4531	49.0312	111.406
Length    4, alignment  0/00:	34.8906	48.9844	111.281
Length    4, alignment  0/00:	33.7188	50.5625	105.656
Length    4, alignment  0/00:	32.8594	50.5625	104.469
Length    4, alignment  0/00:	33.1875	51.3125	104.703
Length    4, alignment  0/00:	33.1406	51.2031	106.188
Length    4, alignment  0/01:	35.2812	51.3125	136.719
Length    4, alignment  1/02:	34.1719	50.0312	135.609
Length    8, alignment  0/00:	39.3594	68.7344	154
Length    8, alignment  0/00:	38.4688	67.8281	154.422
Length    8, alignment  0/00:	38.5312	62.3125	148
Length    8, alignment  0/00:	38.3281	62.9375	148.844
Length    8, alignment  0/00:	38.0469	62.0938	148.031
Length    8, alignment  0/00:	38.4375	62.4375	147.688
Length    8, alignment  0/02:	45.1406	62.125	171.516
Length    8, alignment  2/03:	45	62.7656	162.656
Length   16, alignment  0/00:	49.7031	105.125	204.141
Length   16, alignment  0/00:	48.1875	105.094	203.438
Length   16, alignment  0/00:	48.25	99.7656	198.359
Length   16, alignment  0/00:	47.6562	98.9531	198.344
Length   16, alignment  0/00:	48.3125	99.4375	198.141
Length   16, alignment  0/00:	47.625	99.1562	198.469
Length   16, alignment  0/03:	83.5781	100.75	212.297
Length   16, alignment  3/04:	83.625	99.9062	212.078
Length   32, alignment  0/00:	74.5	179.438	327.219
Length   32, alignment  0/00:	73.1094	179.453	326.844
Length   32, alignment  0/00:	65.8125	174.703	320.75
Length   32, alignment  0/00:	66.3125	174.781	320.688
Length   32, alignment  0/00:	65.9219	174.531	321.188
Length   32, alignment  0/00:	65.9062	174.438	321.016
Length   32, alignment  0/04:	66.25	174.781	321.141
Length   32, alignment  4/05:	122.406	175.25	351.141
Length   64, alignment  0/00:	112.141	329.406	572.359
Length   64, alignment  0/00:	111.062	329.391	572.312
Length   64, alignment  0/00:	109.375	323.078	566.5
Length   64, alignment  0/00:	110.297	323.438	566.516
Length   64, alignment  0/00:	109.828	323.688	566.562
Length   64, alignment  0/00:	109.5	324.062	566.734
Length   64, alignment  0/05:	212.812	323.297	595.484
Length   64, alignment  5/06:	212.906	323.438	603.75
Length  128, alignment  0/00:	188.125	627.844	1063.44
Length  128, alignment  0/00:	187.094	628.25	1063.12
Length  128, alignment  0/00:	184.625	622.422	1060.34
Length  128, alignment  0/00:	185.516	622.312	1057.12
Length  128, alignment  0/00:	185.641	622.734	1057.08
Length  128, alignment  0/00:	185.844	622.719	1057.84
Length  128, alignment  0/06:	394.672	621.406	1074.33
Length  128, alignment  6/07:	394	621.828	1080.03
Length  256, alignment  0/00:	340.891	1224.42	2044.42
Length  256, alignment  0/00:	339.391	1224.78	2043.81
Length  256, alignment  0/00:	337.016	1219.34	2038.44
Length  256, alignment  0/00:	337.141	1219.33	2038.11
Length  256, alignment  0/00:	336.844	1219.31	2037.48
Length  256, alignment  0/00:	336.812	1219	2038.86
Length  256, alignment  0/07:	756.531	1220.67	2051.34
Length  256, alignment  7/08:	756.5	1220.31	2052.14
Length  512, alignment  0/00:	645.344	2419.97	4007.19
Length  512, alignment  0/00:	644.156	2419.86	4006.53
Length  512, alignment  0/00:	642.047	2414.14	4001.16
Length  512, alignment  0/00:	642.25	2414.75	4000.55
Length  512, alignment  0/00:	640.828	2413.83	4001.61
Length  512, alignment  0/00:	640.922	2414.97	4001.12
Length  512, alignment  0/08:	641.828	2414.34	4001.53
Length  512, alignment  8/09:	1481.55	2414.28	4138.45
Length 1024, alignment  0/00:	1254.47	4808.61	7933.22
Length 1024, alignment  0/00:	1251.78	4809.58	7932.41
Length 1024, alignment  0/00:	1249.59	4803.31	7926.81
Length 1024, alignment  0/00:	1249.84	4803.58	7926.41
Length 1024, alignment  0/00:	1249.66	4803.38	7926.78
Length 1024, alignment  0/00:	1248.42	4803.52	7926.42
Length 1024, alignment  0/09:	2933.45	4803.7	7955.84
Length 1024, alignment  9/10:	2931.8	4803.69	7963.78
Length   16, alignment  1/02:	79.5625	105.344	241.438
Length   16, alignment  2/01:	79.1875	105	245.438
Length   16, alignment  1/02:	76.625	99.2969	235.422
Length   16, alignment  2/01:	76.1875	99.75	240.047
Length   16, alignment  1/02:	76.6562	99.625	236.453
Length   16, alignment  2/01:	77.0156	99.9375	239.641
Length   32, alignment  2/04:	125.219	179.453	349.75
Length   32, alignment  4/02:	124.547	179.875	346.938
Length   32, alignment  2/04:	121.562	174.281	340.516
Length   32, alignment  4/02:	121.969	173.969	340.438
Length   32, alignment  2/04:	122.25	174.25	340.719
Length   32, alignment  4/02:	122.969	173.578	339.688
Length   64, alignment  3/06:	215.531	329.812	598.859
Length   64, alignment  6/03:	216.188	329.266	595.875
Length   64, alignment  3/06:	212.5	323.609	593
Length   64, alignment  6/03:	211.844	323.156	589.766
Length   64, alignment  3/06:	212.125	430.922	593.125
Length   64, alignment  6/03:	213	323.625	589.109
Length  128, alignment  4/08:	188	627.094	1063.06
Length  128, alignment  8/04:	187.391	627.609	1062.83
Length  128, alignment  4/08:	185.281	621.938	1060.67
Length  128, alignment  8/04:	186.312	622.859	1057.53
Length  128, alignment  4/08:	186.328	622.156	1057.2
Length  128, alignment  8/04:	185.391	622.281	1057.5
Length  256, alignment  5/10:	759.828	1225.31	2082.08
Length  256, alignment 10/05:	760.734	1225.02	2085.28
Length  256, alignment  5/10:	756.938	1219.69	2076.19
Length  256, alignment 10/05:	755.469	1219.41	2078.8
Length  256, alignment  5/10:	756.547	1220.11	2075.52
Length  256, alignment 10/05:	756.125	1220.06	2078.53
Length  512, alignment  6/12:	1485.61	2419.7	4030.86
Length  512, alignment 12/06:	1485.84	2419.67	4027.44
Length  512, alignment  6/12:	1481.95	2415.03	4020.5
Length  512, alignment 12/06:	1481.91	2414.28	4020.39
Length  512, alignment  6/12:	1481.52	2415.12	4020.41
Length  512, alignment 12/06:	1481.47	2414.17	4020.75
Length 1024, alignment  7/14:	2936.28	4808.62	7958.83
Length 1024, alignment 14/07:	2935.16	4809.75	7956.25
Length 1024, alignment  7/14:	2931.77	4803.55	8206.61
Length 1024, alignment 14/07:	2933.27	4803.64	7948.53
Length 1024, alignment  7/14:	2932.2	4803.94	7953.44
Length 1024, alignment 14/07:	2932.61	4803.88	7949.81
  

Comments

Joseph Myers Oct. 1, 2014, 8:04 p.m. UTC | #1
On Wed, 1 Oct 2014, Steve Ellcey wrote:

> 2014-10-01  Steve Ellcey  <sellcey@mips.com>
> 
> 	* sysdeps/mips/strcmp.S: New.

OK.
  

Patch

diff --git a/sysdeps/mips/strcmp.S b/sysdeps/mips/strcmp.S
new file mode 100644
index 0000000..e0c8212
--- /dev/null
+++ b/sysdeps/mips/strcmp.S
@@ -0,0 +1,249 @@ 
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef ANDROID_CHANGES
+# include "machine/asm.h"
+# include "machine/regdef.h"
+#elif _LIBC
+# include <sysdep.h>
+# include <regdef.h>
+# include <sys/asm.h>
+#elif _COMPILING_NEWLIB
+# include "machine/asm.h"
+# include "machine/regdef.h"
+#else
+# include <regdef.h>
+# include <sys/asm.h>
+#endif
+
+/* Technically strcmp should not read past the end of the strings being
+   compared.  We will read a full word that may contain excess bits beyond
+   the NULL string terminator but unless ENABLE_READAHEAD is set, we will not
+   read the next word after the end of string.  Setting ENABLE_READAHEAD will
+   improve performance but is technically illegal based on the definition of
+   strcmp.  */
+#ifdef ENABLE_READAHEAD
+# define DELAY_READ
+#else
+# define DELAY_READ nop
+#endif
+
+/* Testing on a little endian machine showed using CLZ was a
+   performance loss, so we are not turning it on by default.  */
+#if defined(ENABLE_CLZ) && (__mips_isa_rev > 1)
+# define USE_CLZ
+#endif
+
+/* Some asm.h files do not have the L macro definition.  */
+#ifndef L
+# if _MIPS_SIM == _ABIO32
+#  define L(label) $L ## label
+# else
+#  define L(label) .L ## label
+# endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
+#ifndef PTR_ADDIU
+# ifdef USE_DOUBLE
+#  define PTR_ADDIU       daddiu
+# else
+#  define PTR_ADDIU       addiu
+# endif
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef STRCMP_NAME
+# define STRCMP_NAME strcmp
+#endif
+
+#ifdef ANDROID_CHANGES
+LEAF(STRCMP_NAME, 0)
+#else
+LEAF(STRCMP_NAME)
+#endif
+	.set	nomips16
+	.set	noreorder
+
+	or	t0, a0, a1
+	andi	t0,0x3
+	bne	t0, zero, L(byteloop)
+
+/* Both strings are 4 byte aligned at this point.  */
+
+	lui	t8, 0x0101
+	ori	t8, t8, 0x0101
+	lui	t9, 0x7f7f
+	ori	t9, 0x7f7f
+
+#define STRCMP32(OFFSET) \
+	lw	v0, OFFSET(a0); \
+	lw	v1, OFFSET(a1); \
+	subu	t0, v0, t8; \
+	bne	v0, v1, L(worddiff); \
+	nor	t1, v0, t9; \
+	and	t0, t0, t1; \
+	bne	t0, zero, L(returnzero)
+
+L(wordloop):
+	STRCMP32(0)
+	DELAY_READ
+	STRCMP32(4)
+	DELAY_READ
+	STRCMP32(8)
+	DELAY_READ
+	STRCMP32(12)
+	DELAY_READ
+	STRCMP32(16)
+	DELAY_READ
+	STRCMP32(20)
+	DELAY_READ
+	STRCMP32(24)
+	DELAY_READ
+	STRCMP32(28)
+	PTR_ADDIU a0, a0, 32
+	b	L(wordloop)
+	PTR_ADDIU a1, a1, 32
+
+L(returnzero):
+	j	ra
+	move	v0, zero
+
+L(worddiff):
+#ifdef USE_CLZ
+	subu	t0, v0, t8
+	nor	t1, v0, t9
+	and	t1, t0, t1
+	xor	t0, v0, v1	
+	or	t0, t0, t1
+# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	wsbh	t0, t0
+	rotr	t0, t0, 16
+# endif
+	clz	t1, t0
+	and	t1, 0xf8
+# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	neg	t1
+	addu	t1, 24
+# endif
+	rotrv	v0, v0, t1
+	rotrv	v1, v1, t1
+	and	v0, v0, 0xff
+	and	v1, v1, 0xff
+	j	ra
+	subu	v0, v0, v1
+#else /* USE_CLZ */
+# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	andi	t0, v0, 0xff
+	beq	t0, zero, L(wexit01)
+	andi	t1, v1, 0xff
+	bne	t0, t1, L(wexit01)
+
+	srl	t8, v0, 8
+	srl	t9, v1, 8
+	andi	t8, t8, 0xff
+	beq	t8, zero, L(wexit89)
+	andi	t9, t9, 0xff
+	bne	t8, t9, L(wexit89)
+
+	srl	t0, v0, 16
+	srl	t1, v1, 16
+	andi	t0, t0, 0xff
+	beq	t0, zero, L(wexit01)
+	andi	t1, t1, 0xff
+	bne	t0, t1, L(wexit01)
+
+	srl	t8, v0, 24
+	srl	t9, v1, 24
+# else /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
+	srl	t0, v0, 24
+	beq	t0, zero, L(wexit01)
+	srl	t1, v1, 24
+	bne	t0, t1, L(wexit01)
+
+	srl	t8, v0, 16
+	srl	t9, v1, 16
+	andi	t8, t8, 0xff
+	beq	t8, zero, L(wexit89)
+	andi	t9, t9, 0xff
+	bne	t8, t9, L(wexit89)
+
+	srl	t0, v0, 8
+	srl	t1, v1, 8
+	andi	t0, t0, 0xff
+	beq	t0, zero, L(wexit01)
+	andi	t1, t1, 0xff
+	bne	t0, t1, L(wexit01)
+
+	andi	t8, v0, 0xff
+	andi	t9, v1, 0xff
+# endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
+
+L(wexit89):
+	j	ra
+	subu	v0, t8, t9
+L(wexit01):
+	j	ra
+	subu	v0, t0, t1
+#endif /* USE_CLZ */
+
+/* It might seem better to do the 'beq' instruction between the two 'lbu'
+   instructions so that the nop is not needed but testing showed that this
+   code is actually faster (based on glibc strcmp test).  */
+#define BYTECMP01(OFFSET) \
+	lbu	v0, OFFSET(a0); \
+	lbu	v1, OFFSET(a1); \
+	beq	v0, zero, L(bexit01); \
+	nop; \
+	bne	v0, v1, L(bexit01)
+
+#define BYTECMP89(OFFSET) \
+	lbu	t8, OFFSET(a0); \
+	lbu	t9, OFFSET(a1); \
+	beq	t8, zero, L(bexit89); \
+	nop;	\
+	bne	t8, t9, L(bexit89)
+
+L(byteloop):
+	BYTECMP01(0)
+	BYTECMP89(1)
+	BYTECMP01(2)
+	BYTECMP89(3)
+	BYTECMP01(4)
+	BYTECMP89(5)
+	BYTECMP01(6)
+	BYTECMP89(7)
+	PTR_ADDIU a0, a0, 8
+	b	L(byteloop)
+	PTR_ADDIU a1, a1, 8
+
+L(bexit01):
+	j	ra
+	subu	v0, v0, v1
+L(bexit89):
+	j	ra
+	subu	v0, t8, t9
+
+	.set	at
+	.set	reorder
+
+END(STRCMP_NAME)
+#ifndef ANDROID_CHANGES
+# ifdef _LIBC
+libc_hidden_builtin_def (STRCMP_NAME)
+# endif
+#endif