[AArch64] Add rawmemchr

Message ID AM3PR08MB00888F05E81BE0117C6876AB83420@AM3PR08MB0088.eurprd08.prod.outlook.com
State Committed
Headers

Commit Message

Wilco Dijkstra May 27, 2016, 1:29 p.m. UTC
  Add a simple rawmemchr implementation. Use strlen for rawmemchr(s, '\0') as
it is the fastest way to search for '\0'.  Otherwise use memchr with an infinite size.
This is 3x faster on benchtests for large sizes.

Passes GLIBC tests, OK for commit?

ChangeLog:
2016-05-27  Wilco Dijkstra  <wdijkstr@arm.com>

	* sysdeps/aarch64/rawmemchr.S (__rawmemchr): New file.
	* sysdeps/aarch64/strlen.S (__strlen): Change to __strlen to avoid PLT.

---
 sysdeps/aarch64/rawmemchr.S | 42 ++++++++++++++++++++++++++++++++++++++++++
 sysdeps/aarch64/strlen.S    |  5 +++--
 2 files changed, 45 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/aarch64/rawmemchr.S
  

Comments

Adhemerval Zanella Netto June 2, 2016, 11:57 a.m. UTC | #1
LGTM with a small nit.

On 27/05/2016 10:29, Wilco Dijkstra wrote:
> Add a simple rawmemchr implementation. Use strlen for rawmemchr(s, '\0') as
> it is the fastest way to search for '\0'.  Otherwise use memchr with an infinite size.
> This is 3x faster on benchtests for large sizes.
> 
> Passes GLIBC tests, OK for commit?
> 
> ChangeLog:
> 2016-05-27  Wilco Dijkstra  <wdijkstr@arm.com>
> 
> 	* sysdeps/aarch64/rawmemchr.S (__rawmemchr): New file.
> 	* sysdeps/aarch64/strlen.S (__strlen): Change to __strlen to avoid PLT.
> 
> ---
>  sysdeps/aarch64/rawmemchr.S | 42 ++++++++++++++++++++++++++++++++++++++++++
>  sysdeps/aarch64/strlen.S    |  5 +++--
>  2 files changed, 45 insertions(+), 2 deletions(-)
>  create mode 100644 sysdeps/aarch64/rawmemchr.S
> 
> diff --git a/sysdeps/aarch64/rawmemchr.S b/sysdeps/aarch64/rawmemchr.S
> new file mode 100644
> index 0000000..ec958e8
> --- /dev/null
> +++ b/sysdeps/aarch64/rawmemchr.S
> @@ -0,0 +1,42 @@
> +/* rawmemchr - find a character in a memory zone
> +
> +   Copyright (C) 2015-2016 Free Software Foundation, Inc.

I think it should be just 2016.

> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +/* Special case rawmemchr (s, 0) as strlen, otherwise tailcall memchr.
> +   Call strlen without setting up a full frame - it preserves x14/x15.
> +*/
> +
> +ENTRY (__rawmemchr)
> +	cbz	w1, L(do_strlen)
> +	mov	x2, -1
> +	b	__memchr
> +
> +L(do_strlen):
> +	mov	x15, x30
> +	cfi_return_column (x15)
> +	mov	x14, x0
> +	bl	__strlen
> +	add	x0, x14, x0
> +	ret	x15
> +
> +END (__rawmemchr)
> +weak_alias (__rawmemchr, rawmemchr)
> +libc_hidden_builtin_def (__rawmemchr)
> diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> index feb9e48..e2a4363 100644
> --- a/sysdeps/aarch64/strlen.S
> +++ b/sysdeps/aarch64/strlen.S
> @@ -84,7 +84,7 @@
>  	   whether the first fetch, which may be misaligned, crosses a page
>  	   boundary.  */
>  
> -ENTRY_ALIGN (strlen, 6)
> +ENTRY_ALIGN (__strlen, 6)
>  	and	tmp1, srcin, MIN_PAGE_SIZE - 1
>  	mov	zeroones, REP8_01
>  	cmp	tmp1, MIN_PAGE_SIZE - 16
> @@ -213,5 +213,6 @@ L(page_cross):
>  	csel	data1, data1, tmp4, eq
>  	csel	data2, data2, tmp2, eq
>  	b	L(page_cross_entry)
> -END (strlen)
> +END (__strlen)
> +weak_alias (__strlen, strlen)
>  libc_hidden_builtin_def (strlen)
>
  
Alexander Cherepanov June 2, 2016, 1:08 p.m. UTC | #2
On 2016-05-27 16:29, Wilco Dijkstra wrote:
> Add a simple rawmemchr implementation. Use strlen for rawmemchr(s, '\0') as
> it is the fastest way to search for '\0'.  Otherwise use memchr with an infinite size.
> This is 3x faster on benchtests for large sizes.

Is memchr on your arch guaranteed to work with an infinite size?

In theory, it's guaranteed by C11 and POSIX (e.g. see 
http://open-std.org/jtc1/sc22/wg14/www/docs/n1533.htm ) but IIUC there 
is a sentiment in the glibc community that passing to a library function 
a size greater than the real size of an object is an error. (Would be 
glad to find out I misunderstood something here.)

In practice, memchr with an infinite size is broken at least on x86-64 
and i386 -- https://sourceware.org/bugzilla/show_bug.cgi?id=19387 .
  
Szabolcs Nagy June 2, 2016, 1:32 p.m. UTC | #3
On 02/06/16 14:08, Alexander Cherepanov wrote:
> On 2016-05-27 16:29, Wilco Dijkstra wrote:
>> Add a simple rawmemchr implementation. Use strlen for rawmemchr(s, '\0') as
>> it is the fastest way to search for '\0'.  Otherwise use memchr with an infinite size.
>> This is 3x faster on benchtests for large sizes.
> 
> Is memchr on your arch guaranteed to work with an infinite size?
> 
> In theory, it's guaranteed by C11 and POSIX (e.g. see http://open-std.org/jtc1/sc22/wg14/www/docs/n1533.htm )
> but IIUC there is a sentiment in the glibc community that passing to a library function a size greater than the
> real size of an object is an error. (Would be glad to find out I misunderstood something here.)
> 

why would the glibc community want something that's in
conflict with the standard?

an implementation can turn ub into defined behaviour,
but not the other way.

> In practice, memchr with an infinite size is broken at least on x86-64 and i386 --
> https://sourceware.org/bugzilla/show_bug.cgi?id=19387 .
>
  
Adhemerval Zanella Netto June 2, 2016, 1:44 p.m. UTC | #4
On 02/06/2016 10:08, Alexander Cherepanov wrote:
> On 2016-05-27 16:29, Wilco Dijkstra wrote:
>> Add a simple rawmemchr implementation. Use strlen for rawmemchr(s, '\0') as
>> it is the fastest way to search for '\0'.  Otherwise use memchr with an infinite size.
>> This is 3x faster on benchtests for large sizes.
> 
> Is memchr on your arch guaranteed to work with an infinite size?
> 
> In theory, it's guaranteed by C11 and POSIX (e.g. see http://open-std.org/jtc1/sc22/wg14/www/docs/n1533.htm ) but IIUC there is a sentiment in the glibc community that passing to a library function a size greater than the real size of an object is an error. (Would be glad to find out I misunderstood something here.)
> 
> In practice, memchr with an infinite size is broken at least on x86-64 and i386 -- https://sourceware.org/bugzilla/show_bug.cgi?id=19387 .
> 

Currently aarch64 uses the default implementation (string/memchr.c)
and the optimized one in review seems also fine for infinite size.
  
Wilco Dijkstra June 2, 2016, 1:48 p.m. UTC | #5
Alexander Cherepanov wrote:
>
> Is memchr on your arch guaranteed to work with an infinite size?

Yes it works fine. The AArch64 assembler functions have been carefully
written to avoid overflows on the size as well as end-pointer comparisons.

> In theory, it's guaranteed by C11 and POSIX (e.g. see
> http://open-std.org/jtc1/sc22/wg14/www/docs/n1533.htm ) but IIUC there
> is a sentiment in the glibc community that passing to a library function
> a size greater than the real size of an object is an error. (Would be
> glad to find out I misunderstood something here.)

The rawmemchr API is a bug by itself - it really is memchr with infinite size
(so using it is extremely risky), but making that explicit is by itself not an issue.

Wilco
  
Szabolcs Nagy June 2, 2016, 2:22 p.m. UTC | #6
On 02/06/16 14:48, Wilco Dijkstra wrote:
> Alexander Cherepanov wrote:
>>
>> Is memchr on your arch guaranteed to work with an infinite size?
> 
> Yes it works fine. The AArch64 assembler functions have been carefully
> written to avoid overflows on the size as well as end-pointer comparisons.
> 

note that the libc implementation would not help
if the compiler assumed infinite size was ub..
the call would be dead code eliminated.

(which is why the compiler should not try to be
smart about library apis, the libc implementor
may want to give stricter guarantees.. this is
a common problem with gcc.)
  
Alexander Cherepanov June 2, 2016, 5:13 p.m. UTC | #7
On 2016-06-02 16:32, Szabolcs Nagy wrote:
> On 02/06/16 14:08, Alexander Cherepanov wrote:
>> On 2016-05-27 16:29, Wilco Dijkstra wrote:
>>> Add a simple rawmemchr implementation. Use strlen for rawmemchr(s, '\0') as
>>> it is the fastest way to search for '\0'.  Otherwise use memchr with an infinite size.
>>> This is 3x faster on benchtests for large sizes.
>>
>> Is memchr on your arch guaranteed to work with an infinite size?
>>
>> In theory, it's guaranteed by C11 and POSIX (e.g. see http://open-std.org/jtc1/sc22/wg14/www/docs/n1533.htm )
>> but IIUC there is a sentiment in the glibc community that passing to a library function a size greater than the
>> real size of an object is an error. (Would be glad to find out I misunderstood something here.)
>
> why would the glibc community want something that's in
> conflict with the standard?

Because what is in the standard is considered a mistake, because it 
hinders the possibilities to catch bugs in user code, or something like 
that. Not sure exactly.

The most controversial so far is fread. The kernel will fail the read if 
you pass to it a pointer/size that don't fit into user space (e.g. 
47-bits on x86-64) so you cannot call fread with the size anywhere close 
to SIZE_MAX.
  
Marcus Shawcroft June 11, 2016, 7:55 a.m. UTC | #8
On 27 May 2016 at 14:29, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> Add a simple rawmemchr implementation. Use strlen for rawmemchr(s, '\0') as
> it is the fastest way to search for '\0'.  Otherwise use memchr with an infinite size.
> This is 3x faster on benchtests for large sizes.
>
> Passes GLIBC tests, OK for commit?
>
> ChangeLog:
> 2016-05-27  Wilco Dijkstra  <wdijkstr@arm.com>
>
>         * sysdeps/aarch64/rawmemchr.S (__rawmemchr): New file.
>         * sysdeps/aarch64/strlen.S (__strlen): Change to __strlen to avoid PLT.

OK, Thanks Wilco. /Marcus
  

Patch

diff --git a/sysdeps/aarch64/rawmemchr.S b/sysdeps/aarch64/rawmemchr.S
new file mode 100644
index 0000000..ec958e8
--- /dev/null
+++ b/sysdeps/aarch64/rawmemchr.S
@@ -0,0 +1,42 @@ 
+/* rawmemchr - find a character in a memory zone
+
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Special case rawmemchr (s, 0) as strlen, otherwise tailcall memchr.
+   Call strlen without setting up a full frame - it preserves x14/x15.
+*/
+
+ENTRY (__rawmemchr)
+	cbz	w1, L(do_strlen)
+	mov	x2, -1
+	b	__memchr
+
+L(do_strlen):
+	mov	x15, x30
+	cfi_return_column (x15)
+	mov	x14, x0
+	bl	__strlen
+	add	x0, x14, x0
+	ret	x15
+
+END (__rawmemchr)
+weak_alias (__rawmemchr, rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
index feb9e48..e2a4363 100644
--- a/sysdeps/aarch64/strlen.S
+++ b/sysdeps/aarch64/strlen.S
@@ -84,7 +84,7 @@ 
 	   whether the first fetch, which may be misaligned, crosses a page
 	   boundary.  */
 
-ENTRY_ALIGN (strlen, 6)
+ENTRY_ALIGN (__strlen, 6)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
 	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
@@ -213,5 +213,6 @@  L(page_cross):
 	csel	data1, data1, tmp4, eq
 	csel	data2, data2, tmp2, eq
 	b	L(page_cross_entry)
-END (strlen)
+END (__strlen)
+weak_alias (__strlen, strlen)
 libc_hidden_builtin_def (strlen)