diff mbox

[10/11] hppa: Add haszero.h and whichzero.h

Message ID 20161217065729.28561-11-rth@twiddle.net
State New
Headers show

Commit Message

Richard Henderson Dec. 17, 2016, 6:57 a.m. UTC
Use UXOR,SBZ to test for a zero byte within a word.  While we can
get semi-decent code out of asm-goto, we would do slightly better
with a compiler builtin.

For whichzero, sequential testing of bytes is less expensive than any
tricks that involve a count-leading-zeros insn that we don't have.

	* sysdeps/hppa/haszero.h: New file.
	* sysdeps/hppa/whichzero.h: New file.
---
 sysdeps/hppa/haszero.h   | 82 ++++++++++++++++++++++++++++++++++++++++++++++++
 sysdeps/hppa/whichzero.h | 70 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 152 insertions(+)
 create mode 100644 sysdeps/hppa/haszero.h
 create mode 100644 sysdeps/hppa/whichzero.h

Comments

Adhemerval Zanella Dec. 19, 2016, 2:54 p.m. UTC | #1
On 17/12/2016 04:57, Richard Henderson wrote:

> +static inline unsigned long int
> +haszero(unsigned long int x)
> +{
> +#if __GNUC_PREREQ(4, 5)
> +  /* It's more useful to expose a control transfer to the compiler
> +     than to expose a proper boolean result.  */
> +  if (sizeof(x) == 8)
> +    asm goto ("uxor,*sbz %%r0,%0,%%r0\n\tb,n %l1" : : "r"(x) : : nbz);
> +  else
> +    asm goto ("uxor,sbz %%r0,%0,%%r0\n\tb,n %l1" : : "r"(x) : : nbz);
> +  return 1;
> + nbz:
> +  return 0;
> +#else

Since current GLIBC requires GCC 4.7 as minimum compiler I think we
can get rid of snippets for old compilers.  Same for the other
override functios.

> +  unsigned long int ret;
> +  if (sizeof(x) == 8)
> +    asm ("uxor,*sbz %%r0,%1,%%r0\n\tcopy %%r0,%0"
> +	 : "=r"(ret) : "r"(x), "0"(1));
> +  else
> +    asm ("uxor,sbz %%r0,%1,%%r0\n\tcopy %%r0,%0"
> +        : "=r"(ret) : "r"(x), "0"(1));
> +  return ret;
> +#endif
> +}
> +
> +/* Likewise, but for two words simultaneously.  */
> +
> +static inline unsigned long int
> +haszero2(unsigned long int x1, unsigned long int x2)
> +{
> +#if __GNUC_PREREQ(4, 5)
> +  /* It's more useful to expose a control transfer to the compiler
> +     than to expose a proper boolean result.  */
> +  if (sizeof(x1) == 8)
> +    asm goto ("uxor,*sbz %%r0,%0,%%r0\n\t"
> +	      "uxor,*nbz %%r0,%1,%%r0\n\t"
> +	      "b,n %l2" : : "r"(x1), "r"(x2) : : sbz);
> +  else
> +    asm goto ("uxor,sbz %%r0,%0,%%r0\n\t"
> +	      "uxor,nbz %%r0,%1,%%r0\n\t"
> +	      "b,n %l2" : : "r"(x1), "r"(x2) : : sbz);
> +  return 0;
> + sbz:
> +  return 1;
> +#else
> +  unsigned long int ret;
> +  if (sizeof(x1) == 8)
> +    asm ("uxor,*sbz %%r0,%1,%%r0\n\t"
> +	 "uxor,*nbz %%r0,%2,%%r0\n\t"
> +	 "ldi 1,%0"
> +	 : "=r"(ret) : "r"(x1), "r"(x2), "0"(0));
> +  else
> +    asm ("uxor,sbz %%r0,%1,%%r0\n\t"
> +	 "uxor,nbz %%r0,%2,%%r0\n\t"
> +	 "ldi 1,%0"
> +	 : "=r"(ret) : "r"(x1), "r"(x2), "0"(0));
> +  return ret;
> +#endif
> +}
> +
> +#endif /* haszero.h */
> diff --git a/sysdeps/hppa/whichzero.h b/sysdeps/hppa/whichzero.h
> new file mode 100644
> index 0000000..ef18cc7
> --- /dev/null
> +++ b/sysdeps/hppa/whichzero.h
> @@ -0,0 +1,70 @@
> +/* whichzero.h -- functions for zero byte searching.  HPPA version.
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef HPPA_WHICHZERO_H
> +#define HPPA_WHICHZERO_H 1
> +
> +/* Given a long that is known to contain a zero byte, return the
> +   index of the first such within the long in host memory order.  */
> +
> +static inline unsigned int
> +whichzero(unsigned long int x)
> +{
> +  unsigned int ret;
> +
> +  _Static_assert (sizeof(x) == 4, "64-bit not supported");
> +
> +  /* Since we have no clz insn, direct tests of the bytes is faster
> +     than loading up the constants to do the masking.  */
> +  asm ("extrw,u,<> %1,23,8,%%r0\n\t"
> +       "ldi 2,%0\n\t"
> +       "extrw,u,<> %1,15,8,%%r0\n\t"
> +       "ldi 1,%0\n\t"
> +       "extrw,u,<> %1,7,8,%%r0\n\t"
> +       "ldi 0,%0"
> +       : "=r"(ret) : "r"(x), "0"(3));
> +
> +  return ret;
> +}
> +
> +/* Similarly, but perform the test for two longs simultaneously.  */
> +
> +static inline unsigned int
> +whichzero2(unsigned long int x1, unsigned long int x2)
> +{
> +  unsigned int ret;
> +
> +  _Static_assert (sizeof(x1) == 4, "64-bit not supported");
> +
> +  /* Since we have no clz insn, direct tests of the bytes is faster
> +     than loading up the constants to do the masking.  */
> +  asm ("extrw,u,= %1,23,8,%%r0\n\t"
> +       "extrw,u,<> %2,23,8,%%r0\n\t"
> +       "ldi 2,%0\n\t"
> +       "extrw,u,= %1,15,8,%%r0\n\t"
> +       "extrw,u,<> %2,15,8,%%r0\n\t"
> +       "ldi 1,%0\n\t"
> +       "extrw,u,= %1,7,8,%%r0\n\t"
> +       "extrw,u,<> %2,7,8,%%r0\n\t"
> +       "ldi 0,%0"
> +       : "=r"(ret) : "r"(x1), "r"(x2), "0"(3));
> +
> +  return ret;
> +}
> +
> +#endif /* whichzero.h */

I am far from a hppa expert, but can't we code the same snippet in C? How
bad would it be compared to this optimized asm?
Richard Henderson Dec. 19, 2016, 7:44 p.m. UTC | #2
On 12/19/2016 06:54 AM, Adhemerval Zanella wrote:
>> +static inline unsigned long int
>> +haszero(unsigned long int x)
>> +{
>> +#if __GNUC_PREREQ(4, 5)
>> +  /* It's more useful to expose a control transfer to the compiler
>> +     than to expose a proper boolean result.  */
>> +  if (sizeof(x) == 8)
>> +    asm goto ("uxor,*sbz %%r0,%0,%%r0\n\tb,n %l1" : : "r"(x) : : nbz);
>> +  else
>> +    asm goto ("uxor,sbz %%r0,%0,%%r0\n\tb,n %l1" : : "r"(x) : : nbz);
>> +  return 1;
>> + nbz:
>> +  return 0;
>> +#else
> 
> Since current GLIBC requires GCC 4.7 as minimum compiler I think we
> can get rid of snippets for old compilers.  Same for the other
> override functios.

Ah good.  I'd meant to go back and look for the minimum required gcc.

>> +  /* Since we have no clz insn, direct tests of the bytes is faster
>> +     than loading up the constants to do the masking.  */
>> +  asm ("extrw,u,= %1,23,8,%%r0\n\t"
>> +       "extrw,u,<> %2,23,8,%%r0\n\t"
>> +       "ldi 2,%0\n\t"
>> +       "extrw,u,= %1,15,8,%%r0\n\t"
>> +       "extrw,u,<> %2,15,8,%%r0\n\t"
>> +       "ldi 1,%0\n\t"
>> +       "extrw,u,= %1,7,8,%%r0\n\t"
>> +       "extrw,u,<> %2,7,8,%%r0\n\t"
>> +       "ldi 0,%0"
>> +       : "=r"(ret) : "r"(x1), "r"(x2), "0"(3));
>> +
>> +  return ret;
>> +}
>> +
>> +#endif /* whichzero.h */
> 
> I am far from a hppa expert, but can't we code the same snippet in C? How
> bad would it be compared to this optimized asm?

The compiler is not great at this.  It only attempts nullification on
comparisons (not directly as a result of an operation like extract), and it
never attempts double nullification as above.

So for whichzero gcc will use 10 insns instead of my 7; for whichzero2 gcc will
use 19 insns instead of my 10.


r~
diff mbox

Patch

diff --git a/sysdeps/hppa/haszero.h b/sysdeps/hppa/haszero.h
new file mode 100644
index 0000000..99cc6fc
--- /dev/null
+++ b/sysdeps/hppa/haszero.h
@@ -0,0 +1,82 @@ 
+/* haszero.h -- function for zero byte detection.  HPPA version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef HASZERO_H
+#define HASZERO_H	1
+
+static inline unsigned long int
+haszero(unsigned long int x)
+{
+#if __GNUC_PREREQ(4, 5)
+  /* It's more useful to expose a control transfer to the compiler
+     than to expose a proper boolean result.  */
+  if (sizeof(x) == 8)
+    asm goto ("uxor,*sbz %%r0,%0,%%r0\n\tb,n %l1" : : "r"(x) : : nbz);
+  else
+    asm goto ("uxor,sbz %%r0,%0,%%r0\n\tb,n %l1" : : "r"(x) : : nbz);
+  return 1;
+ nbz:
+  return 0;
+#else
+  unsigned long int ret;
+  if (sizeof(x) == 8)
+    asm ("uxor,*sbz %%r0,%1,%%r0\n\tcopy %%r0,%0"
+	 : "=r"(ret) : "r"(x), "0"(1));
+  else
+    asm ("uxor,sbz %%r0,%1,%%r0\n\tcopy %%r0,%0"
+        : "=r"(ret) : "r"(x), "0"(1));
+  return ret;
+#endif
+}
+
+/* Likewise, but for two words simultaneously.  */
+
+static inline unsigned long int
+haszero2(unsigned long int x1, unsigned long int x2)
+{
+#if __GNUC_PREREQ(4, 5)
+  /* It's more useful to expose a control transfer to the compiler
+     than to expose a proper boolean result.  */
+  if (sizeof(x1) == 8)
+    asm goto ("uxor,*sbz %%r0,%0,%%r0\n\t"
+	      "uxor,*nbz %%r0,%1,%%r0\n\t"
+	      "b,n %l2" : : "r"(x1), "r"(x2) : : sbz);
+  else
+    asm goto ("uxor,sbz %%r0,%0,%%r0\n\t"
+	      "uxor,nbz %%r0,%1,%%r0\n\t"
+	      "b,n %l2" : : "r"(x1), "r"(x2) : : sbz);
+  return 0;
+ sbz:
+  return 1;
+#else
+  unsigned long int ret;
+  if (sizeof(x1) == 8)
+    asm ("uxor,*sbz %%r0,%1,%%r0\n\t"
+	 "uxor,*nbz %%r0,%2,%%r0\n\t"
+	 "ldi 1,%0"
+	 : "=r"(ret) : "r"(x1), "r"(x2), "0"(0));
+  else
+    asm ("uxor,sbz %%r0,%1,%%r0\n\t"
+	 "uxor,nbz %%r0,%2,%%r0\n\t"
+	 "ldi 1,%0"
+	 : "=r"(ret) : "r"(x1), "r"(x2), "0"(0));
+  return ret;
+#endif
+}
+
+#endif /* haszero.h */
diff --git a/sysdeps/hppa/whichzero.h b/sysdeps/hppa/whichzero.h
new file mode 100644
index 0000000..ef18cc7
--- /dev/null
+++ b/sysdeps/hppa/whichzero.h
@@ -0,0 +1,70 @@ 
+/* whichzero.h -- functions for zero byte searching.  HPPA version.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef HPPA_WHICHZERO_H
+#define HPPA_WHICHZERO_H 1
+
+/* Given a long that is known to contain a zero byte, return the
+   index of the first such within the long in host memory order.  */
+
+static inline unsigned int
+whichzero(unsigned long int x)
+{
+  unsigned int ret;
+
+  _Static_assert (sizeof(x) == 4, "64-bit not supported");
+
+  /* Since we have no clz insn, direct tests of the bytes is faster
+     than loading up the constants to do the masking.  */
+  asm ("extrw,u,<> %1,23,8,%%r0\n\t"
+       "ldi 2,%0\n\t"
+       "extrw,u,<> %1,15,8,%%r0\n\t"
+       "ldi 1,%0\n\t"
+       "extrw,u,<> %1,7,8,%%r0\n\t"
+       "ldi 0,%0"
+       : "=r"(ret) : "r"(x), "0"(3));
+
+  return ret;
+}
+
+/* Similarly, but perform the test for two longs simultaneously.  */
+
+static inline unsigned int
+whichzero2(unsigned long int x1, unsigned long int x2)
+{
+  unsigned int ret;
+
+  _Static_assert (sizeof(x1) == 4, "64-bit not supported");
+
+  /* Since we have no clz insn, direct tests of the bytes is faster
+     than loading up the constants to do the masking.  */
+  asm ("extrw,u,= %1,23,8,%%r0\n\t"
+       "extrw,u,<> %2,23,8,%%r0\n\t"
+       "ldi 2,%0\n\t"
+       "extrw,u,= %1,15,8,%%r0\n\t"
+       "extrw,u,<> %2,15,8,%%r0\n\t"
+       "ldi 1,%0\n\t"
+       "extrw,u,= %1,7,8,%%r0\n\t"
+       "extrw,u,<> %2,7,8,%%r0\n\t"
+       "ldi 0,%0"
+       : "=r"(ret) : "r"(x1), "r"(x2), "0"(3));
+
+  return ret;
+}
+
+#endif /* whichzero.h */