MIPS specific strcmp function
Commit Message
I have created a new MIPS strcmp routine. I tried rewriting this in C a
while back but it was still slower than the standard version in some
unaligned cases. I have gone back and rewritten it in assembly language
and this version is faster than the standard version in all cases except
for one-byte comparisons and one of the two-byte comparison tests from
bench-strcmp.
I have attached old and new bench-strcmp.out files to show the
improvement.
There is an ifdef for using the CLZ instruction but on the machine where
I was doing performance testing using CLZ did not help so I did not turn
this ifdef on by default. Other MIPS platforms might find it helpful
though so I left the code in.
I ran the glibc testsuite with no regressions on mips32 big and little
endian platforms and also did builds and tests using my own tests on a
mips64 simulator.
OK to checkin?
Steve Ellcey
sellcey@mips.com
2014-10-01 Steve Ellcey <sellcey@mips.com>
* sysdeps/mips/strcmp.S: New.
strcmp simple_strcmp stupid_strcmp
Length 1, alignment 1/01: 114.844 37.1719 124.5
Length 1, alignment 1/01: 28.8281 21.3281 67.125
Length 1, alignment 1/01: 27.7969 21.0781 72.3438
Length 2, alignment 2/02: 31.0312 45.3594 112.109
Length 2, alignment 2/02: 30.7656 42.4531 106.359
Length 2, alignment 2/02: 29.9219 30.9531 107.781
Length 3, alignment 3/03: 33.6406 44.6719 104.984
Length 3, alignment 3/03: 32.4844 46.6094 104.5
Length 3, alignment 3/03: 32.5156 45.6875 104.703
Length 4, alignment 4/04: 36.5938 48.7656 110.969
Length 4, alignment 4/04: 34.0156 43.5781 105.188
Length 4, alignment 4/04: 32.5156 43.8906 105.234
Length 5, alignment 5/05: 38.9531 53.5781 157.516
Length 5, alignment 5/05: 37.2812 48.8281 150.453
Length 5, alignment 5/05: 37.7344 48.2812 151.469
Length 6, alignment 6/06: 41.6094 58.5 154.281
Length 6, alignment 6/06: 40.0625 52.9375 148.234
Length 6, alignment 6/06: 40.4531 52.9375 147.688
Length 7, alignment 7/07: 44.5938 62.9844 146.047
Length 7, alignment 7/07: 43.9375 57.1406 140.922
Length 7, alignment 7/07: 42.3594 57.75 141.766
Length 8, alignment 8/08: 39.6094 67.125 153.547
Length 8, alignment 8/08: 38.1875 62.4688 147.219
Length 8, alignment 8/08: 38.0625 63.2656 147.844
Length 9, alignment 9/09: 60.5312 72.25 198.531
Length 9, alignment 9/09: 58.5781 67.0938 193.562
Length 9, alignment 9/09: 57.5625 67.0312 193.062
Length 10, alignment 10/10: 62.5469 76.9219 194.781
Length 10, alignment 10/10: 60.5625 71.8594 188.562
Length 10, alignment 10/10: 60.5938 72.1562 188.656
Length 11, alignment 11/11: 65.9375 82.2812 185.078
Length 11, alignment 11/11: 62.5469 76.9844 179.125
Length 11, alignment 11/11: 62.8906 76.3125 179.094
Length 12, alignment 12/12: 45.6562 85.7969 173.938
Length 12, alignment 12/12: 42.1719 81.1562 167.578
Length 12, alignment 12/12: 42.6094 80.4062 167.719
Length 13, alignment 13/13: 71.2969 91.1562 237.484
Length 13, alignment 13/13: 68.5625 85.625 230.531
Length 13, alignment 13/13: 67.875 85.7188 231.406
Length 14, alignment 14/14: 72.9062 95.1094 211.297
Length 14, alignment 14/14: 70.8594 90.1562 204.953
Length 14, alignment 14/14: 71.625 90.4062 205.484
Length 15, alignment 15/15: 75.25 100.969 205.5
Length 15, alignment 15/15: 73.5625 94.6562 199.812
Length 15, alignment 15/15: 73.2031 95.5625 198.906
Length 16, alignment 16/16: 50.2188 104.828 204.281
Length 16, alignment 16/16: 47.9062 99.0781 198.281
Length 16, alignment 16/16: 46.9688 99.8438 198.172
Length 17, alignment 17/17: 81.9375 110.234 257.344
Length 17, alignment 17/17: 80.5781 103.609 251.391
Length 17, alignment 17/17: 80.2344 104.328 251.25
Length 18, alignment 18/18: 84.875 114.406 241.781
Length 18, alignment 18/18: 82.9688 108.891 236.094
Length 18, alignment 18/18: 82.5 108.984 235.547
Length 19, alignment 19/19: 87.9375 118.281 235.516
Length 19, alignment 19/19: 85.5625 113.297 229.484
Length 19, alignment 19/19: 85.5938 113.484 230.031
Length 20, alignment 20/20: 54.3125 123.125 234.953
Length 20, alignment 20/20: 52.5312 118.594 229.281
Length 20, alignment 20/20: 52.625 118.875 229.172
Length 21, alignment 21/21: 93.5469 127.781 287.328
Length 21, alignment 21/21: 91.2656 122.656 282.156
Length 21, alignment 21/21: 91.25 122.094 281.938
Length 22, alignment 22/22: 95.5156 132.469 272.531
Length 22, alignment 22/22: 93.5469 127.641 267.25
Length 22, alignment 22/22: 93.5156 127.938 267.172
Length 23, alignment 23/23: 98.9375 137.438 267.219
Length 23, alignment 23/23: 97.3125 132.547 261.094
Length 23, alignment 23/23: 96.9531 131.438 260.812
Length 24, alignment 24/24: 57.875 142.406 266.297
Length 24, alignment 24/24: 57.1875 137.625 259.578
Length 24, alignment 24/24: 57.4375 137.219 259.922
Length 25, alignment 25/25: 105.234 147.266 318.672
Length 25, alignment 25/25: 103.594 142.547 312.828
Length 25, alignment 25/25: 103.266 141.672 312.812
Length 26, alignment 26/26: 108.625 151.5 303.672
Length 26, alignment 26/26: 106.234 146.594 297.062
Length 26, alignment 26/26: 105.531 146.656 297.469
Length 27, alignment 27/27: 110.953 156.312 296.328
Length 27, alignment 27/27: 108.516 151.516 291.234
Length 27, alignment 27/27: 108.938 151.406 291.016
Length 28, alignment 28/28: 63.4844 160.656 296.734
Length 28, alignment 28/28: 61.875 155.906 289.672
Length 28, alignment 28/28: 62 155.375 290.031
Length 29, alignment 29/29: 116.953 166.172 349.125
Length 29, alignment 29/29: 113.859 160.078 343.141
Length 29, alignment 29/29: 113.938 160.359 343.75
Length 30, alignment 30/30: 118.578 170.734 334.312
Length 30, alignment 30/30: 116.906 164.734 329.094
Length 30, alignment 30/30: 116.938 164.859 327.609
Length 31, alignment 31/31: 120.219 174.469 327.328
Length 31, alignment 31/31: 119.875 169.75 322.109
Length 31, alignment 31/31: 119.234 170.438 322.297
Length 4, alignment 0/00: 34.4531 49.0312 111.406
Length 4, alignment 0/00: 34.8906 48.9844 111.281
Length 4, alignment 0/00: 33.7188 50.5625 105.656
Length 4, alignment 0/00: 32.8594 50.5625 104.469
Length 4, alignment 0/00: 33.1875 51.3125 104.703
Length 4, alignment 0/00: 33.1406 51.2031 106.188
Length 4, alignment 0/01: 35.2812 51.3125 136.719
Length 4, alignment 1/02: 34.1719 50.0312 135.609
Length 8, alignment 0/00: 39.3594 68.7344 154
Length 8, alignment 0/00: 38.4688 67.8281 154.422
Length 8, alignment 0/00: 38.5312 62.3125 148
Length 8, alignment 0/00: 38.3281 62.9375 148.844
Length 8, alignment 0/00: 38.0469 62.0938 148.031
Length 8, alignment 0/00: 38.4375 62.4375 147.688
Length 8, alignment 0/02: 45.1406 62.125 171.516
Length 8, alignment 2/03: 45 62.7656 162.656
Length 16, alignment 0/00: 49.7031 105.125 204.141
Length 16, alignment 0/00: 48.1875 105.094 203.438
Length 16, alignment 0/00: 48.25 99.7656 198.359
Length 16, alignment 0/00: 47.6562 98.9531 198.344
Length 16, alignment 0/00: 48.3125 99.4375 198.141
Length 16, alignment 0/00: 47.625 99.1562 198.469
Length 16, alignment 0/03: 83.5781 100.75 212.297
Length 16, alignment 3/04: 83.625 99.9062 212.078
Length 32, alignment 0/00: 74.5 179.438 327.219
Length 32, alignment 0/00: 73.1094 179.453 326.844
Length 32, alignment 0/00: 65.8125 174.703 320.75
Length 32, alignment 0/00: 66.3125 174.781 320.688
Length 32, alignment 0/00: 65.9219 174.531 321.188
Length 32, alignment 0/00: 65.9062 174.438 321.016
Length 32, alignment 0/04: 66.25 174.781 321.141
Length 32, alignment 4/05: 122.406 175.25 351.141
Length 64, alignment 0/00: 112.141 329.406 572.359
Length 64, alignment 0/00: 111.062 329.391 572.312
Length 64, alignment 0/00: 109.375 323.078 566.5
Length 64, alignment 0/00: 110.297 323.438 566.516
Length 64, alignment 0/00: 109.828 323.688 566.562
Length 64, alignment 0/00: 109.5 324.062 566.734
Length 64, alignment 0/05: 212.812 323.297 595.484
Length 64, alignment 5/06: 212.906 323.438 603.75
Length 128, alignment 0/00: 188.125 627.844 1063.44
Length 128, alignment 0/00: 187.094 628.25 1063.12
Length 128, alignment 0/00: 184.625 622.422 1060.34
Length 128, alignment 0/00: 185.516 622.312 1057.12
Length 128, alignment 0/00: 185.641 622.734 1057.08
Length 128, alignment 0/00: 185.844 622.719 1057.84
Length 128, alignment 0/06: 394.672 621.406 1074.33
Length 128, alignment 6/07: 394 621.828 1080.03
Length 256, alignment 0/00: 340.891 1224.42 2044.42
Length 256, alignment 0/00: 339.391 1224.78 2043.81
Length 256, alignment 0/00: 337.016 1219.34 2038.44
Length 256, alignment 0/00: 337.141 1219.33 2038.11
Length 256, alignment 0/00: 336.844 1219.31 2037.48
Length 256, alignment 0/00: 336.812 1219 2038.86
Length 256, alignment 0/07: 756.531 1220.67 2051.34
Length 256, alignment 7/08: 756.5 1220.31 2052.14
Length 512, alignment 0/00: 645.344 2419.97 4007.19
Length 512, alignment 0/00: 644.156 2419.86 4006.53
Length 512, alignment 0/00: 642.047 2414.14 4001.16
Length 512, alignment 0/00: 642.25 2414.75 4000.55
Length 512, alignment 0/00: 640.828 2413.83 4001.61
Length 512, alignment 0/00: 640.922 2414.97 4001.12
Length 512, alignment 0/08: 641.828 2414.34 4001.53
Length 512, alignment 8/09: 1481.55 2414.28 4138.45
Length 1024, alignment 0/00: 1254.47 4808.61 7933.22
Length 1024, alignment 0/00: 1251.78 4809.58 7932.41
Length 1024, alignment 0/00: 1249.59 4803.31 7926.81
Length 1024, alignment 0/00: 1249.84 4803.58 7926.41
Length 1024, alignment 0/00: 1249.66 4803.38 7926.78
Length 1024, alignment 0/00: 1248.42 4803.52 7926.42
Length 1024, alignment 0/09: 2933.45 4803.7 7955.84
Length 1024, alignment 9/10: 2931.8 4803.69 7963.78
Length 16, alignment 1/02: 79.5625 105.344 241.438
Length 16, alignment 2/01: 79.1875 105 245.438
Length 16, alignment 1/02: 76.625 99.2969 235.422
Length 16, alignment 2/01: 76.1875 99.75 240.047
Length 16, alignment 1/02: 76.6562 99.625 236.453
Length 16, alignment 2/01: 77.0156 99.9375 239.641
Length 32, alignment 2/04: 125.219 179.453 349.75
Length 32, alignment 4/02: 124.547 179.875 346.938
Length 32, alignment 2/04: 121.562 174.281 340.516
Length 32, alignment 4/02: 121.969 173.969 340.438
Length 32, alignment 2/04: 122.25 174.25 340.719
Length 32, alignment 4/02: 122.969 173.578 339.688
Length 64, alignment 3/06: 215.531 329.812 598.859
Length 64, alignment 6/03: 216.188 329.266 595.875
Length 64, alignment 3/06: 212.5 323.609 593
Length 64, alignment 6/03: 211.844 323.156 589.766
Length 64, alignment 3/06: 212.125 430.922 593.125
Length 64, alignment 6/03: 213 323.625 589.109
Length 128, alignment 4/08: 188 627.094 1063.06
Length 128, alignment 8/04: 187.391 627.609 1062.83
Length 128, alignment 4/08: 185.281 621.938 1060.67
Length 128, alignment 8/04: 186.312 622.859 1057.53
Length 128, alignment 4/08: 186.328 622.156 1057.2
Length 128, alignment 8/04: 185.391 622.281 1057.5
Length 256, alignment 5/10: 759.828 1225.31 2082.08
Length 256, alignment 10/05: 760.734 1225.02 2085.28
Length 256, alignment 5/10: 756.938 1219.69 2076.19
Length 256, alignment 10/05: 755.469 1219.41 2078.8
Length 256, alignment 5/10: 756.547 1220.11 2075.52
Length 256, alignment 10/05: 756.125 1220.06 2078.53
Length 512, alignment 6/12: 1485.61 2419.7 4030.86
Length 512, alignment 12/06: 1485.84 2419.67 4027.44
Length 512, alignment 6/12: 1481.95 2415.03 4020.5
Length 512, alignment 12/06: 1481.91 2414.28 4020.39
Length 512, alignment 6/12: 1481.52 2415.12 4020.41
Length 512, alignment 12/06: 1481.47 2414.17 4020.75
Length 1024, alignment 7/14: 2936.28 4808.62 7958.83
Length 1024, alignment 14/07: 2935.16 4809.75 7956.25
Length 1024, alignment 7/14: 2931.77 4803.55 8206.61
Length 1024, alignment 14/07: 2933.27 4803.64 7948.53
Length 1024, alignment 7/14: 2932.2 4803.94 7953.44
Length 1024, alignment 14/07: 2932.61 4803.88 7949.81
Comments
On Wed, 1 Oct 2014, Steve Ellcey wrote:
> 2014-10-01 Steve Ellcey <sellcey@mips.com>
>
> * sysdeps/mips/strcmp.S: New.
OK.
new file mode 100644
@@ -0,0 +1,249 @@
+/* Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef ANDROID_CHANGES
+# include "machine/asm.h"
+# include "machine/regdef.h"
+#elif _LIBC
+# include <sysdep.h>
+# include <regdef.h>
+# include <sys/asm.h>
+#elif _COMPILING_NEWLIB
+# include "machine/asm.h"
+# include "machine/regdef.h"
+#else
+# include <regdef.h>
+# include <sys/asm.h>
+#endif
+
+/* Technically strcmp should not read past the end of the strings being
+   compared.  We will read a full word that may contain excess bits beyond
+   the NUL string terminator but unless ENABLE_READAHEAD is set, we will not
+   read the next word after the end of string.  Setting ENABLE_READAHEAD will
+   improve performance but is technically illegal based on the definition of
+   strcmp.  DELAY_READ expands to a nop that keeps the next iteration's load
+   out of the delay slot of the end-of-string exit branch in the word loop.  */
+#ifdef ENABLE_READAHEAD
+# define DELAY_READ
+#else
+# define DELAY_READ nop
+#endif
+
+/* Testing on a little endian machine showed using CLZ was a
+   performance loss, so we are not turning it on by default.  The
+   USE_CLZ code path also uses the wsbh/rotr/rotrv instructions,
+   hence the __mips_isa_rev > 1 (MIPS32R2 or later) guard.  */
+#if defined(ENABLE_CLZ) && (__mips_isa_rev > 1)
+# define USE_CLZ
+#endif
+
+/* Some asm.h files do not have the L macro definition.
+   o32 local labels are spelled $Lxxx, other ABIs use .Lxxx.  */
+#ifndef L
+# if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+# else
+# define L(label) .L ## label
+# endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition.
+   Pointer arithmetic needs daddiu when pointers are 64 bits wide.  */
+#ifndef PTR_ADDIU
+# ifdef USE_DOUBLE
+# define PTR_ADDIU daddiu
+# else
+# define PTR_ADDIU addiu
+# endif
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef STRCMP_NAME
+# define STRCMP_NAME strcmp
+#endif
+
+/* int STRCMP_NAME (const char *s1, const char *s2)
+
+   a0 = s1, a1 = s2, result returned in v0 (negative/zero/positive).
+   Only caller-clobbered registers are used (v0, v1, t0, t1, t8, t9),
+   so no stack frame is needed.  */
+#ifdef ANDROID_CHANGES
+LEAF(STRCMP_NAME, 0)
+#else
+LEAF(STRCMP_NAME)
+#endif
+	.set nomips16
+	.set noreorder		/* Branch delay slots are filled by hand.  */
+
+	or t0, a0, a1
+	andi t0,0x3		/* t0 = (a0 | a1) & 3; nonzero if either */
+	bne t0, zero, L(byteloop)	/* string is not word aligned.  */
+
+/* Both strings are 4 byte aligned at this point. */
+
+	lui t8, 0x0101		/* Delay slot of the bne above; harmless on
+				   the taken path since L(byteloop) uses t8
+				   only as a scratch register.  */
+	ori t8, t8, 0x0101	/* t8 = 0x01010101.  */
+	lui t9, 0x7f7f
+	ori t9, 0x7f7f		/* t9 = 0x7f7f7f7f.  */
+
+/* Compare one aligned word from each string at OFFSET.  With t8 =
+   0x01010101 and t9 = 0x7f7f7f7f, t0 & t1 below equals
+   (v0 - 0x01010101) & ~v0 & 0x80808080, which is nonzero iff some byte
+   of v0 is zero (word-at-a-time NUL detection).  Branch to L(worddiff)
+   if the words differ; branch to L(returnzero) if the words are equal
+   and contain the terminating NUL.  */
+#define STRCMP32(OFFSET) \
+	lw v0, OFFSET(a0); \
+	lw v1, OFFSET(a1); \
+	subu t0, v0, t8; \
+	bne v0, v1, L(worddiff); \
+	nor t1, v0, t9; \
+	and t0, t0, t1; \
+	bne t0, zero, L(returnzero)
+
+/* Main loop: compare 32 bytes per iteration.  The DELAY_READ nops keep
+   the next STRCMP32's load out of the delay slot of the L(returnzero)
+   exit branch unless ENABLE_READAHEAD is set (see above).  The pointer
+   updates run in the loop-closing branch's slots.  */
+L(wordloop):
+	STRCMP32(0)
+	DELAY_READ
+	STRCMP32(4)
+	DELAY_READ
+	STRCMP32(8)
+	DELAY_READ
+	STRCMP32(12)
+	DELAY_READ
+	STRCMP32(16)
+	DELAY_READ
+	STRCMP32(20)
+	DELAY_READ
+	STRCMP32(24)
+	DELAY_READ
+	STRCMP32(28)
+	PTR_ADDIU a0, a0, 32
+	b L(wordloop)
+	PTR_ADDIU a1, a1, 32	/* Delay slot.  */
+
+/* Equal words containing a NUL: the strings are equal.  */
+L(returnzero):
+	j ra
+	move v0, zero		/* Delay slot.  */
+
+/* v0 and v1 hold unequal words (which may also contain a NUL); find the
+   first byte, in string order, that differs or is NUL and return its
+   difference.  */
+L(worddiff):
+#ifdef USE_CLZ
+	subu t0, v0, t8		/* Recompute the NUL-byte test for v0: */
+	nor t1, v0, t9
+	and t1, t0, t1		/* t1 has 0x80 set in each NUL byte of v0.  */
+	xor t0, v0, v1		/* Difference bits between the words.  */
+	or t0, t0, t1		/* Fold in NUL so it terminates the scan too.  */
+# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	wsbh t0, t0		/* Byte-swap t0 so the first byte in memory */
+	rotr t0, t0, 16		/* order becomes the most significant.  */
+# endif
+	clz t1, t0
+	and t1, 0xf8		/* t1 = 8 * index of first differing/NUL byte.  */
+# if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+	neg t1
+	addu t1, 24		/* BE: rotate count is 24 - 8*index.  */
+# endif
+	rotrv v0, v0, t1	/* Rotate the byte of interest into the */
+	rotrv v1, v1, t1	/* low 8 bits of each (unswapped) word.  */
+	and v0, v0, 0xff
+	and v1, v1, 0xff
+	j ra
+	subu v0, v0, v1		/* Delay slot: unsigned byte difference.  */
+#else /* USE_CLZ */
+# if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+	/* Little endian: byte 0 of the string is the low byte of the word.
+	   Test bytes 0..3 in order, leaving the pair under test in t0/t1
+	   or t8/t9 so the exit stubs can form the return value.  */
+	andi t0, v0, 0xff	/* Byte 0 of s1.  */
+	beq t0, zero, L(wexit01)
+	andi t1, v1, 0xff	/* Delay slot: byte 0 of s2.  */
+	bne t0, t1, L(wexit01)
+
+	srl t8, v0, 8		/* Byte 1.  */
+	srl t9, v1, 8
+	andi t8, t8, 0xff
+	beq t8, zero, L(wexit89)
+	andi t9, t9, 0xff	/* Delay slot.  */
+	bne t8, t9, L(wexit89)
+
+	srl t0, v0, 16		/* Byte 2.  */
+	srl t1, v1, 16
+	andi t0, t0, 0xff
+	beq t0, zero, L(wexit01)
+	andi t1, t1, 0xff	/* Delay slot.  */
+	bne t0, t1, L(wexit01)
+
+	/* Bytes 0-2 matched but the words differ, so byte 3 must differ;
+	   no mask or compare is needed — fall into L(wexit89).  */
+	srl t8, v0, 24
+	srl t9, v1, 24
+# else /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
+	/* Big endian: byte 0 of the string is the high byte of the word.  */
+	srl t0, v0, 24		/* Byte 0 of s1.  */
+	beq t0, zero, L(wexit01)
+	srl t1, v1, 24		/* Delay slot: byte 0 of s2.  */
+	bne t0, t1, L(wexit01)
+
+	srl t8, v0, 16		/* Byte 1.  */
+	srl t9, v1, 16
+	andi t8, t8, 0xff
+	beq t8, zero, L(wexit89)
+	andi t9, t9, 0xff	/* Delay slot.  */
+	bne t8, t9, L(wexit89)
+
+	srl t0, v0, 8		/* Byte 2.  */
+	srl t1, v1, 8
+	andi t0, t0, 0xff
+	beq t0, zero, L(wexit01)
+	andi t1, t1, 0xff	/* Delay slot.  */
+	bne t0, t1, L(wexit01)
+
+	/* Bytes 0-2 matched, so byte 3 must differ; fall into L(wexit89).  */
+	andi t8, v0, 0xff
+	andi t9, v1, 0xff
+# endif /* __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ */
+
+L(wexit89):
+	j ra
+	subu v0, t8, t9		/* Delay slot.  */
+L(wexit01):
+	j ra
+	subu v0, t0, t1		/* Delay slot.  */
+#endif /* USE_CLZ */
+
+/* It might seem better to do the 'beq' instruction between the two 'lbu'
+   instructions so that the nop is not needed but testing showed that this
+   code is actually faster (based on glibc strcmp test). */
+
+/* Compare one byte pair; exit via L(bexit01) with the pair in v0/v1 when
+   the s1 byte is NUL or the bytes differ.  */
+#define BYTECMP01(OFFSET) \
+	lbu v0, OFFSET(a0); \
+	lbu v1, OFFSET(a1); \
+	beq v0, zero, L(bexit01); \
+	nop; \
+	bne v0, v1, L(bexit01)
+
+/* Same as BYTECMP01 but in t8/t9 with exit L(bexit89), so adjacent byte
+   tests can be interleaved without register conflicts.  */
+#define BYTECMP89(OFFSET) \
+	lbu t8, OFFSET(a0); \
+	lbu t9, OFFSET(a1); \
+	beq t8, zero, L(bexit89); \
+	nop; \
+	bne t8, t9, L(bexit89)
+
+/* Byte-at-a-time comparison, used whenever either input is not word
+   aligned.  Processes 8 bytes per iteration and never switches to the
+   word loop.  */
+L(byteloop):
+	BYTECMP01(0)
+	BYTECMP89(1)
+	BYTECMP01(2)
+	BYTECMP89(3)
+	BYTECMP01(4)
+	BYTECMP89(5)
+	BYTECMP01(6)
+	BYTECMP89(7)
+	PTR_ADDIU a0, a0, 8
+	b L(byteloop)
+	PTR_ADDIU a1, a1, 8	/* Delay slot.  */
+
+L(bexit01):
+	j ra
+	subu v0, v0, v1		/* Delay slot.  */
+L(bexit89):
+	j ra
+	subu v0, t8, t9		/* Delay slot.  */
+
+	.set at
+	.set reorder
+
+END(STRCMP_NAME)
+#ifndef ANDROID_CHANGES
+# ifdef _LIBC
+libc_hidden_builtin_def (STRCMP_NAME)
+# endif
+#endif