IBM Z: Provide rawmemchr{qi,hi,si} expander

Message ID YUhTXlHFSjIeBbMu@localhost.localdomain
State New
Headers
Series IBM Z: Provide rawmemchr{qi,hi,si} expander |

Commit Message

Stefan Schulze Frielinghaus Sept. 20, 2021, 9:24 a.m. UTC
  This patch implements the rawmemchr expander as introduced in
https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579649.html

Bootstrapped and regtested in conjunction with the patch from above on
IBM Z.  Ok for mainline?
From 551362cda54048dc1a51588112f11c070ed52020 Mon Sep 17 00:00:00 2001
From: Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
Date: Mon, 8 Feb 2021 10:35:39 +0100
Subject: [PATCH 2/2] IBM Z: Provide rawmemchr{qi,hi,si} expander

gcc/ChangeLog:

	* config/s390/s390-protos.h (s390_rawmemchrqi): Add prototype.
	(s390_rawmemchrhi): Add prototype.
	(s390_rawmemchrsi): Add prototype.
	* config/s390/s390.c (s390_rawmemchr): New function.
	(s390_rawmemchrqi): New function.
	(s390_rawmemchrhi): New function.
	(s390_rawmemchrsi): New function.
	* config/s390/s390.md (rawmemchr<SINT:mode>): New expander.
	(rawmemchr<SINT:mode><P:mode>): New expander.
	* config/s390/vector.md (vec_vfees<mode>): Basically a copy of
	the pattern vfees<mode> from vx-builtins.md.
	* config/s390/vx-builtins.md (*vfees<mode>): Remove.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/rawmemchr-1.c: New test.
---
 gcc/config/s390/s390-protos.h               |  4 +
 gcc/config/s390/s390.c                      | 89 ++++++++++++++++++
 gcc/config/s390/s390.md                     | 20 +++++
 gcc/config/s390/vector.md                   | 26 ++++++
 gcc/config/s390/vx-builtins.md              | 26 ------
 gcc/testsuite/gcc.target/s390/rawmemchr-1.c | 99 +++++++++++++++++++++
 6 files changed, 238 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/rawmemchr-1.c
  

Comments

Andreas Krebbel Oct. 7, 2021, 9:16 a.m. UTC | #1
On 9/20/21 11:24, Stefan Schulze Frielinghaus wrote:
> This patch implements the rawmemchr expander as introduced in
> https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579649.html
> 
> Bootstrapped and regtested in conjunction with the patch from above on
> IBM Z.  Ok for mainline?
> 

> From 551362cda54048dc1a51588112f11c070ed52020 Mon Sep 17 00:00:00 2001
> From: Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
> Date: Mon, 8 Feb 2021 10:35:39 +0100
> Subject: [PATCH 2/2] IBM Z: Provide rawmemchr{qi,hi,si} expander
>
> gcc/ChangeLog:
>
> 	* config/s390/s390-protos.h (s390_rawmemchrqi): Add prototype.
> 	(s390_rawmemchrhi): Add prototype.
> 	(s390_rawmemchrsi): Add prototype.
> 	* config/s390/s390.c (s390_rawmemchr): New function.
> 	(s390_rawmemchrqi): New function.
> 	(s390_rawmemchrhi): New function.
> 	(s390_rawmemchrsi): New function.
> 	* config/s390/s390.md (rawmemchr<SINT:mode>): New expander.
> 	(rawmemchr<SINT:mode><P:mode>): New expander.
> 	* config/s390/vector.md (vec_vfees<mode>): Basically a copy of
> 	the pattern vfees<mode> from vx-builtins.md.
> 	* config/s390/vx-builtins.md (*vfees<mode>): Remove.

Thanks! Would it make sense to also extend the strlen and movstr expanders
we have to support the additional character modes?

A few style comments below.

>
> gcc/testsuite/ChangeLog:
>
> 	* gcc.target/s390/rawmemchr-1.c: New test.
> ---
>  gcc/config/s390/s390-protos.h               |  4 +
>  gcc/config/s390/s390.c                      | 89 ++++++++++++++++++
>  gcc/config/s390/s390.md                     | 20 +++++
>  gcc/config/s390/vector.md                   | 26 ++++++
>  gcc/config/s390/vx-builtins.md              | 26 ------
>  gcc/testsuite/gcc.target/s390/rawmemchr-1.c | 99 +++++++++++++++++++++
>  6 files changed, 238 insertions(+), 26 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/s390/rawmemchr-1.c
>
> diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
> index 4b03c6e99f5..0d9619e8254 100644
> --- a/gcc/config/s390/s390-protos.h
> +++ b/gcc/config/s390/s390-protos.h
> @@ -66,6 +66,10 @@ s390_asm_declare_function_size (FILE *asm_out_file,
>  				const char *fnname ATTRIBUTE_UNUSED, tree decl);
>  #endif
>
> +extern void s390_rawmemchrqi(rtx dst, rtx src, rtx pat);
> +extern void s390_rawmemchrhi(rtx dst, rtx src, rtx pat);
> +extern void s390_rawmemchrsi(rtx dst, rtx src, rtx pat);
> +
>  #ifdef RTX_CODE
>  extern int s390_extra_constraint_str (rtx, int, const char *);
>  extern int s390_const_ok_for_constraint_p (HOST_WIDE_INT, int, const char *);
> diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
> index 54dd6332c3a..1435ce156e2 100644
> --- a/gcc/config/s390/s390.c
> +++ b/gcc/config/s390/s390.c
> @@ -16559,6 +16559,95 @@ s390_excess_precision (enum excess_precision_type type)
>  }
>  #endif
>
> +template <machine_mode vec_mode,
> +	  machine_mode elt_mode,
> +	  rtx (*gen_vec_vfees) (rtx, rtx, rtx, rtx)>
> +static void
> +s390_rawmemchr(rtx dst, rtx src, rtx pat) {

I think it would be a bit easier to turn the vec_vfees expander into a
'parameterized name' and add the mode as parameter.  I'll attach a patch
to illustrate what this might look like.

> +  rtx lens = gen_reg_rtx (V16QImode);
> +  rtx pattern = gen_reg_rtx (vec_mode);
> +  rtx loop_start = gen_label_rtx ();
> +  rtx loop_end = gen_label_rtx ();
> +  rtx addr = gen_reg_rtx (Pmode);
> +  rtx offset = gen_reg_rtx (Pmode);
> +  rtx tmp = gen_reg_rtx (Pmode);
> +  rtx loadlen = gen_reg_rtx (SImode);
> +  rtx matchlen = gen_reg_rtx (SImode);
> +  rtx mem;
> +
> +  pat = GEN_INT (trunc_int_for_mode (INTVAL (pat), elt_mode));
> +  emit_insn (gen_rtx_SET (pattern, gen_rtx_VEC_DUPLICATE (vec_mode, pat)));
> +
> +  emit_move_insn (addr, XEXP (src, 0));
> +
> +  // alignment
> +  emit_insn (gen_vlbb (lens, gen_rtx_MEM (BLKmode, addr), GEN_INT (6)));
> +  emit_insn (gen_lcbb (loadlen, addr, GEN_INT (6)));
> +  lens = convert_to_mode (vec_mode, lens, 1);
> +  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (0)));
> +  lens = convert_to_mode (V4SImode, lens, 1);
> +  emit_insn (gen_vec_extractv4sisi (matchlen, lens, GEN_INT (1)));
> +  lens = convert_to_mode (vec_mode, lens, 1);

That back and forth NOP conversion stuff is ugly but I couldn't find a
more elegant way to write this without generating worse code.  Of
course we want to benefit here from the fact that the result operand
of vfees is already zero-extended.  Perhaps factor this out into a
utility function or an extra expander because we appear to need this
frequently?! Not a requirement for this patch though.

> +  emit_cmp_and_jump_insns (matchlen, loadlen, LT, NULL_RTX, SImode, 1, loop_end);
> +  force_expand_binop (Pmode, and_optab, addr, GEN_INT (15), tmp, 1, OPTAB_DIRECT);
> +  force_expand_binop (Pmode, sub_optab, GEN_INT (16), tmp, tmp, 1, OPTAB_DIRECT);
> +  force_expand_binop (Pmode, add_optab, addr, tmp, addr, 1, OPTAB_DIRECT);

Couldn't we just do this as '(addr + 16) & ~0xf' here?
Something like this perhaps:
  force_expand_binop (Pmode, add_optab, addr, GEN_INT(16), addr, 1, OPTAB_DIRECT);
  force_expand_binop (Pmode, and_optab, addr, GEN_INT(~HOST_WIDE_INT_UC(0xf)), addr, 1, OPTAB_DIRECT);

> +  // now, addr is 16-byte aligned
> +
> +  mem = gen_rtx_MEM (vec_mode, addr);
> +  set_mem_align (mem, 128);
> +  emit_move_insn (lens, mem);
> +  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
> +  add_int_reg_note (s390_emit_ccraw_jump (4, EQ, loop_end),
> +		    REG_BR_PROB,
> +		    profile_probability::very_unlikely ().to_reg_br_prob_note ());
> +
> +  emit_label (loop_start);
> +  LABEL_NUSES (loop_start) = 1;
> +
> +  force_expand_binop (Pmode, add_optab, addr, GEN_INT (16), addr, 1, OPTAB_DIRECT);
> +  mem = gen_rtx_MEM (vec_mode, addr);
> +  set_mem_align (mem, 128);
> +  emit_move_insn (lens, mem);
> +  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
> +  add_int_reg_note (s390_emit_ccraw_jump (4, NE, loop_start),
> +		    REG_BR_PROB,
> +		    profile_probability::very_likely ().to_reg_br_prob_note ());
> +
> +  emit_label (loop_end);
> +  LABEL_NUSES (loop_end) = 1;
> +
> +  if (TARGET_64BIT)
> +    {
> +      lens = convert_to_mode (V2DImode, lens, 1);
> +      emit_insn (gen_vec_extractv2didi (offset, lens, GEN_INT (0)));
> +    }
> +  else
> +    {
> +      lens = convert_to_mode (V4SImode, lens, 1);
> +      emit_insn (gen_vec_extractv4sisi (offset, lens, GEN_INT (1)));
> +    }
> +  force_expand_binop (Pmode, add_optab, addr, offset, dst, 1, OPTAB_DIRECT);
> +}
> +
> +void
> +s390_rawmemchrqi (rtx dst, rtx src, rtx pat)
> +{
> +  s390_rawmemchr<V16QImode, QImode, gen_vec_vfeesv16qi> (dst, src, pat);
> +}
> +
> +void
> +s390_rawmemchrhi (rtx dst, rtx src, rtx pat)
> +{
> +  s390_rawmemchr<V8HImode, HImode, gen_vec_vfeesv8hi> (dst, src, pat);
> +}
> +
> +void
> +s390_rawmemchrsi (rtx dst, rtx src, rtx pat)
> +{
> +  s390_rawmemchr<V4SImode, SImode, gen_vec_vfeesv4si> (dst, src, pat);
> +}
> +
>  /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
>
>  static unsigned HOST_WIDE_INT
> diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
> index 1b894a926ce..f81bcef86ce 100644
> --- a/gcc/config/s390/s390.md
> +++ b/gcc/config/s390/s390.md
> @@ -12258,3 +12258,23 @@
>  		    UNSPECV_PPA)]
>    "TARGET_ZEC12"
>    "")
> +
> +(define_expand "rawmemchr<SINT:mode>"
> +  [(match_operand      0 "register_operand")
> +   (match_operand      1 "memory_operand")
> +   (match_operand:SINT 2 "const_int_operand")]
> +  "TARGET_VX"
> +{
> +  if (TARGET_64BIT)
> +    emit_insn (gen_rawmemchr<SINT:mode>di (operands[0], operands[1], operands[2]));
> +  else
> +    emit_insn (gen_rawmemchr<SINT:mode>si (operands[0], operands[1], operands[2]));
> +  DONE;
> +})

Couldn't you just invoke s390_rawmemchr from here instead of the
indirection through a separate expander?

> +
> +(define_expand "rawmemchr<SINT:mode><P:mode>"
> +  [(match_operand:P    0 "register_operand")
> +   (match_operand:BLK  1 "memory_operand")
> +   (match_operand:SINT 2 "const_int_operand")]
> +  "TARGET_VX"
> +  "s390_rawmemchr<SINT:mode> (operands[0], operands[1], operands[2]); DONE;")
> diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
> index 70274a6ab70..0870e2341fc 100644
> --- a/gcc/config/s390/vector.md
> +++ b/gcc/config/s390/vector.md
> @@ -1988,6 +1988,32 @@
>    "vll\t%v0,%1,%2"
>    [(set_attr "op_type" "VRS")])
>
> +; vfeebs, vfeehs, vfeefs
> +; vfeezbs, vfeezhs, vfeezfs
> +(define_insn "vec_vfees<mode>"
> +  [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
> +	(unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
> +			   (match_operand:VI_HW_QHS 2 "register_operand" "v")
> +			   (match_operand:QI 3 "const_mask_operand" "C")]
> +			  UNSPEC_VEC_VFEE))
> +   (set (reg:CCRAW CC_REGNUM)
> +	(unspec:CCRAW [(match_dup 1)
> +		       (match_dup 2)
> +		       (match_dup 3)]
> +		      UNSPEC_VEC_VFEECC))]
> +  "TARGET_VX"
> +{
> +  unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
> +
> +  gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
> +  flags &= ~VSTRING_FLAG_CS;
> +
> +  if (flags == VSTRING_FLAG_ZS)
> +    return "vfeez<bhfgq>s\t%v0,%v1,%v2";
> +  return "vfee<bhfgq>s\t%v0,%v1,%v2";
> +}
> +  [(set_attr "op_type" "VRR")])
> +
>  ; vfenebs, vfenehs, vfenefs
>  ; vfenezbs, vfenezhs, vfenezfs
>  (define_insn "vec_vfenes<mode>"
> diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
> index 3e7b8541887..efa77992f31 100644
> --- a/gcc/config/s390/vx-builtins.md
> +++ b/gcc/config/s390/vx-builtins.md
> @@ -1366,32 +1366,6 @@
>
>  ; Vector find element equal
>
> -; vfeebs, vfeehs, vfeefs
> -; vfeezbs, vfeezhs, vfeezfs
> -(define_insn "*vfees<mode>"
> -  [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
> -	(unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
> -			   (match_operand:VI_HW_QHS 2 "register_operand" "v")
> -			   (match_operand:QI 3 "const_mask_operand" "C")]
> -			  UNSPEC_VEC_VFEE))
> -   (set (reg:CCRAW CC_REGNUM)
> -	(unspec:CCRAW [(match_dup 1)
> -		       (match_dup 2)
> -		       (match_dup 3)]
> -		      UNSPEC_VEC_VFEECC))]
> -  "TARGET_VX"
> -{
> -  unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
> -
> -  gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
> -  flags &= ~VSTRING_FLAG_CS;
> -
> -  if (flags == VSTRING_FLAG_ZS)
> -    return "vfeez<bhfgq>s\t%v0,%v1,%v2";
> -  return "vfee<bhfgq>s\t%v0,%v1,%v2,%b3";
> -}
> -  [(set_attr "op_type" "VRR")])
> -
>  ; vfeeb, vfeeh, vfeef
>  (define_insn "vfee<mode>"
>    [(set (match_operand:VI_HW_QHS                    0 "register_operand" "=v")
> diff --git a/gcc/testsuite/gcc.target/s390/rawmemchr-1.c b/gcc/testsuite/gcc.target/s390/rawmemchr-1.c
> new file mode 100644
> index 00000000000..a5125702315
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/s390/rawmemchr-1.c
> @@ -0,0 +1,99 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details -mzarch -march=z13" } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */
> +/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */
> +
> +#include <string.h>
> +#include <assert.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +
> +#define rawmemchrT(T, pattern)     \
> +__attribute__((noinline,noclone))  \
> +T* rawmemchr_##T (T *s)            \
> +{                                  \
> +  while (*s != pattern)            \
> +    ++s;                           \
> +  return s;                        \
> +}
> +
> +rawmemchrT(int8_t, (int8_t)0xde)
> +rawmemchrT(uint8_t, 0xde)
> +rawmemchrT(int16_t, (int16_t)0xdead)
> +rawmemchrT(uint16_t, 0xdead)
> +rawmemchrT(int32_t, (int32_t)0xdeadbeef)
> +rawmemchrT(uint32_t, 0xdeadbeef)
> +
> +#define runT(T, pattern)                           \
> +void run_##T ()                                    \
> +{                                                  \
> +  T *buf = malloc (4096 * 2 * sizeof(T));          \
> +  assert (buf != NULL);                            \
> +  memset (buf, 0xa, 4096 * 2 * sizeof(T));         \
> +  /* ensure q is 4096-byte aligned */              \
> +  T *q = (T*)((unsigned char *)buf                 \
> +              + (4096 - ((uintptr_t)buf & 4095))); \
> +  T *p;                                            \
> +  /* unaligned + block boundary + 1st load */      \
> +  p = (T *) ((uintptr_t)q - 8);                    \
> +  p[2] = pattern;                                  \
> +  assert ((rawmemchr_##T (&p[0]) == &p[2]));       \
> +  p[2] = (T) 0xaaaaaaaa;                           \
> +  /* unaligned + block boundary + 2nd load */      \
> +  p = (T *) ((uintptr_t)q - 8);                    \
> +  p[6] = pattern;                                  \
> +  assert ((rawmemchr_##T (&p[0]) == &p[6]));       \
> +  p[6] = (T) 0xaaaaaaaa;                           \
> +  /* unaligned + 1st load */                       \
> +  q[5] = pattern;                                  \
> +  assert ((rawmemchr_##T (&q[2]) == &q[5]));       \
> +  q[5] = (T) 0xaaaaaaaa;                           \
> +  /* unaligned + 2nd load */                       \
> +  q[14] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[2]) == &q[14]));      \
> +  q[14] = (T) 0xaaaaaaaa;                          \
> +  /* unaligned + 3rd load */                       \
> +  q[19] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[2]) == &q[19]));      \
> +  q[19] = (T) 0xaaaaaaaa;                          \
> +  /* unaligned + 4th load */                       \
> +  q[25] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[2]) == &q[25]));      \
> +  q[25] = (T) 0xaaaaaaaa;                          \
> +  /* aligned + 1st load */                         \
> +  q[5] = pattern;                                  \
> +  assert ((rawmemchr_##T (&q[0]) == &q[5]));       \
> +  q[5] = (T) 0xaaaaaaaa;                           \
> +  /* aligned + 2nd load */                         \
> +  q[14] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[0]) == &q[14]));      \
> +  q[14] = (T) 0xaaaaaaaa;                          \
> +  /* aligned + 3rd load */                         \
> +  q[19] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[0]) == &q[19]));      \
> +  q[19] = (T) 0xaaaaaaaa;                          \
> +  /* aligned + 4th load */                         \
> +  q[25] = pattern;                                 \
> +  assert ((rawmemchr_##T (&q[0]) == &q[25]));      \
> +  q[25] = (T) 0xaaaaaaaa;                          \
> +  free (buf);                                      \
> +}
> +
> +runT(int8_t, (int8_t)0xde)
> +runT(uint8_t, 0xde)
> +runT(int16_t, (int16_t)0xdead)
> +runT(uint16_t, 0xdead)
> +runT(int32_t, (int32_t)0xdeadbeef)
> +runT(uint32_t, 0xdeadbeef)
> +
> +int main (void)
> +{
> +  run_uint8_t ();
> +  run_int8_t ();
> +  run_uint16_t ();
> +  run_int16_t ();
> +  run_uint32_t ();
> +  run_int32_t ();
> +  return 0;
> +}
> --
> 2.31.1
>
  
Stefan Schulze Frielinghaus Oct. 8, 2021, 2:23 p.m. UTC | #2
On Thu, Oct 07, 2021 at 11:16:24AM +0200, Andreas Krebbel wrote:
> On 9/20/21 11:24, Stefan Schulze Frielinghaus wrote:
> > This patch implements the rawmemchr expander as introduced in
> > https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579649.html
> > 
> > Bootstrapped and regtested in conjunction with the patch from above on
> > IBM Z.  Ok for mainline?
> > 
> 
> > From 551362cda54048dc1a51588112f11c070ed52020 Mon Sep 17 00:00:00 2001
> > From: Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
> > Date: Mon, 8 Feb 2021 10:35:39 +0100
> > Subject: [PATCH 2/2] IBM Z: Provide rawmemchr{qi,hi,si} expander
> >
> > gcc/ChangeLog:
> >
> > 	* config/s390/s390-protos.h (s390_rawmemchrqi): Add prototype.
> > 	(s390_rawmemchrhi): Add prototype.
> > 	(s390_rawmemchrsi): Add prototype.
> > 	* config/s390/s390.c (s390_rawmemchr): New function.
> > 	(s390_rawmemchrqi): New function.
> > 	(s390_rawmemchrhi): New function.
> > 	(s390_rawmemchrsi): New function.
> > 	* config/s390/s390.md (rawmemchr<SINT:mode>): New expander.
> > 	(rawmemchr<SINT:mode><P:mode>): New expander.
> > 	* config/s390/vector.md (vec_vfees<mode>): Basically a copy of
> > 	the pattern vfees<mode> from vx-builtins.md.
> > 	* config/s390/vx-builtins.md (*vfees<mode>): Remove.
> 
> Thanks! Would it make sense to also extend the strlen and movstr expanders
> we have to support the additional character modes?

For strlen-like loops over non-character arrays, the current
implementation in the loop distribution pass uses rawmemchr<MODE> and
then takes the pointer difference to compute the length.  Thus we get
strlen<MODE> for free and don't need to reimplement it.

> 
> A few style comments below.
> 
> >
> > gcc/testsuite/ChangeLog:
> >
> > 	* gcc.target/s390/rawmemchr-1.c: New test.
> > ---
> >  gcc/config/s390/s390-protos.h               |  4 +
> >  gcc/config/s390/s390.c                      | 89 ++++++++++++++++++
> >  gcc/config/s390/s390.md                     | 20 +++++
> >  gcc/config/s390/vector.md                   | 26 ++++++
> >  gcc/config/s390/vx-builtins.md              | 26 ------
> >  gcc/testsuite/gcc.target/s390/rawmemchr-1.c | 99 +++++++++++++++++++++
> >  6 files changed, 238 insertions(+), 26 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/s390/rawmemchr-1.c
> >
> > diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
> > index 4b03c6e99f5..0d9619e8254 100644
> > --- a/gcc/config/s390/s390-protos.h
> > +++ b/gcc/config/s390/s390-protos.h
> > @@ -66,6 +66,10 @@ s390_asm_declare_function_size (FILE *asm_out_file,
> >  				const char *fnname ATTRIBUTE_UNUSED, tree decl);
> >  #endif
> >
> > +extern void s390_rawmemchrqi(rtx dst, rtx src, rtx pat);
> > +extern void s390_rawmemchrhi(rtx dst, rtx src, rtx pat);
> > +extern void s390_rawmemchrsi(rtx dst, rtx src, rtx pat);
> > +
> >  #ifdef RTX_CODE
> >  extern int s390_extra_constraint_str (rtx, int, const char *);
> >  extern int s390_const_ok_for_constraint_p (HOST_WIDE_INT, int, const char *);
> > diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
> > index 54dd6332c3a..1435ce156e2 100644
> > --- a/gcc/config/s390/s390.c
> > +++ b/gcc/config/s390/s390.c
> > @@ -16559,6 +16559,95 @@ s390_excess_precision (enum excess_precision_type type)
> >  }
> >  #endif
> >
> > +template <machine_mode vec_mode,
> > +	  machine_mode elt_mode,
> > +	  rtx (*gen_vec_vfees) (rtx, rtx, rtx, rtx)>
> > +static void
> > +s390_rawmemchr(rtx dst, rtx src, rtx pat) {
> 
> I think it would be a bit easier to turn the vec_vfees expander into a
> 'parameterized name' and add the mode as parameter.  I'll attach a patch
> to illustrate what this might look like.

Right, I didn't know about parameterized names, which look cleaner to
me.  Thanks for the hint!

> 
> > +  rtx lens = gen_reg_rtx (V16QImode);
> > +  rtx pattern = gen_reg_rtx (vec_mode);
> > +  rtx loop_start = gen_label_rtx ();
> > +  rtx loop_end = gen_label_rtx ();
> > +  rtx addr = gen_reg_rtx (Pmode);
> > +  rtx offset = gen_reg_rtx (Pmode);
> > +  rtx tmp = gen_reg_rtx (Pmode);
> > +  rtx loadlen = gen_reg_rtx (SImode);
> > +  rtx matchlen = gen_reg_rtx (SImode);
> > +  rtx mem;
> > +
> > +  pat = GEN_INT (trunc_int_for_mode (INTVAL (pat), elt_mode));
> > +  emit_insn (gen_rtx_SET (pattern, gen_rtx_VEC_DUPLICATE (vec_mode, pat)));
> > +
> > +  emit_move_insn (addr, XEXP (src, 0));
> > +
> > +  // alignment
> > +  emit_insn (gen_vlbb (lens, gen_rtx_MEM (BLKmode, addr), GEN_INT (6)));
> > +  emit_insn (gen_lcbb (loadlen, addr, GEN_INT (6)));
> > +  lens = convert_to_mode (vec_mode, lens, 1);
> > +  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (0)));
> > +  lens = convert_to_mode (V4SImode, lens, 1);
> > +  emit_insn (gen_vec_extractv4sisi (matchlen, lens, GEN_INT (1)));
> > +  lens = convert_to_mode (vec_mode, lens, 1);
> 
> That back and forth NOP conversion stuff is ugly but I couldn't find a
> more elegant way to write this without generating worse code.  Of
> course we want to benefit here from the fact that the result operand
> of vfees is already zero-extended.  Perhaps factor this out into a
> utility function or an extra expander because we appear to need this
> frequently?! Not a requirement for this patch though.

I completely agree with this, though I couldn't find an elegant way
either.  I will keep this in mind in the hope of finding a more elegant
solution someday.

> 
> > +  emit_cmp_and_jump_insns (matchlen, loadlen, LT, NULL_RTX, SImode, 1, loop_end);
> > +  force_expand_binop (Pmode, and_optab, addr, GEN_INT (15), tmp, 1, OPTAB_DIRECT);
> > +  force_expand_binop (Pmode, sub_optab, GEN_INT (16), tmp, tmp, 1, OPTAB_DIRECT);
> > +  force_expand_binop (Pmode, add_optab, addr, tmp, addr, 1, OPTAB_DIRECT);
> 
> Couldn't we just do this as '(addr + 16) & ~0xf' here?
> Something like this perhaps:
>   force_expand_binop (Pmode, add_optab, addr, GEN_INT(16), addr, 1, OPTAB_DIRECT);
>   force_expand_binop (Pmode, and_optab, addr, GEN_INT(~HOST_WIDE_INT_UC(0xf)), addr, 1, OPTAB_DIRECT);

Good point.  Changed it.

> 
> > +  // now, addr is 16-byte aligned
> > +
> > +  mem = gen_rtx_MEM (vec_mode, addr);
> > +  set_mem_align (mem, 128);
> > +  emit_move_insn (lens, mem);
> > +  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
> > +  add_int_reg_note (s390_emit_ccraw_jump (4, EQ, loop_end),
> > +		    REG_BR_PROB,
> > +		    profile_probability::very_unlikely ().to_reg_br_prob_note ());
> > +
> > +  emit_label (loop_start);
> > +  LABEL_NUSES (loop_start) = 1;
> > +
> > +  force_expand_binop (Pmode, add_optab, addr, GEN_INT (16), addr, 1, OPTAB_DIRECT);
> > +  mem = gen_rtx_MEM (vec_mode, addr);
> > +  set_mem_align (mem, 128);
> > +  emit_move_insn (lens, mem);
> > +  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
> > +  add_int_reg_note (s390_emit_ccraw_jump (4, NE, loop_start),
> > +		    REG_BR_PROB,
> > +		    profile_probability::very_likely ().to_reg_br_prob_note ());
> > +
> > +  emit_label (loop_end);
> > +  LABEL_NUSES (loop_end) = 1;
> > +
> > +  if (TARGET_64BIT)
> > +    {
> > +      lens = convert_to_mode (V2DImode, lens, 1);
> > +      emit_insn (gen_vec_extractv2didi (offset, lens, GEN_INT (0)));
> > +    }
> > +  else
> > +    {
> > +      lens = convert_to_mode (V4SImode, lens, 1);
> > +      emit_insn (gen_vec_extractv4sisi (offset, lens, GEN_INT (1)));
> > +    }
> > +  force_expand_binop (Pmode, add_optab, addr, offset, dst, 1, OPTAB_DIRECT);
> > +}
> > +
> > +void
> > +s390_rawmemchrqi (rtx dst, rtx src, rtx pat)
> > +{
> > +  s390_rawmemchr<V16QImode, QImode, gen_vec_vfeesv16qi> (dst, src, pat);
> > +}
> > +
> > +void
> > +s390_rawmemchrhi (rtx dst, rtx src, rtx pat)
> > +{
> > +  s390_rawmemchr<V8HImode, HImode, gen_vec_vfeesv8hi> (dst, src, pat);
> > +}
> > +
> > +void
> > +s390_rawmemchrsi (rtx dst, rtx src, rtx pat)
> > +{
> > +  s390_rawmemchr<V4SImode, SImode, gen_vec_vfeesv4si> (dst, src, pat);
> > +}
> > +
> >  /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
> >
> >  static unsigned HOST_WIDE_INT
> > diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
> > index 1b894a926ce..f81bcef86ce 100644
> > --- a/gcc/config/s390/s390.md
> > +++ b/gcc/config/s390/s390.md
> > @@ -12258,3 +12258,23 @@
> >  		    UNSPECV_PPA)]
> >    "TARGET_ZEC12"
> >    "")
> > +
> > +(define_expand "rawmemchr<SINT:mode>"
> > +  [(match_operand      0 "register_operand")
> > +   (match_operand      1 "memory_operand")
> > +   (match_operand:SINT 2 "const_int_operand")]
> > +  "TARGET_VX"
> > +{
> > +  if (TARGET_64BIT)
> > +    emit_insn (gen_rawmemchr<SINT:mode>di (operands[0], operands[1], operands[2]));
> > +  else
> > +    emit_insn (gen_rawmemchr<SINT:mode>si (operands[0], operands[1], operands[2]));
> > +  DONE;
> > +})
> 
> Couldn't you just invoke s390_rawmemchr from here instead of the
> indirection through a separate expander?

Yea, right.  Changed it.

Please find a new version attached.  I did another bootstrap+regtest on
IBM Z.  Ok for mainline?

Thanks for your detailed review!

Cheers,
Stefan
From 04184da07cc3570d03d90feb1017e377aeea53bf Mon Sep 17 00:00:00 2001
From: Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
Date: Mon, 8 Feb 2021 10:35:39 +0100
Subject: [PATCH 2/2] IBM Z: Provide rawmemchr{qi,hi,si} expander

gcc/ChangeLog:

	* config/s390/s390-protos.h (s390_rawmemchr): Add prototype.
	* config/s390/s390.c (s390_rawmemchr): New function.
	* config/s390/s390.md (rawmemchr<SINT:mode>): New expander.
	* config/s390/vector.md (@vec_vfees<mode>): Basically a copy of
	the pattern vfees<mode> from vx-builtins.md.
	* config/s390/vx-builtins.md (*vfees<mode>): Remove.

gcc/testsuite/ChangeLog:

	* gcc.target/s390/rawmemchr-1.c: New test.
---
 gcc/config/s390/s390-protos.h               |  2 +
 gcc/config/s390/s390.c                      | 69 ++++++++++++++
 gcc/config/s390/s390.md                     |  7 ++
 gcc/config/s390/vector.md                   | 26 ++++++
 gcc/config/s390/vx-builtins.md              | 26 ------
 gcc/testsuite/gcc.target/s390/rawmemchr-1.c | 99 +++++++++++++++++++++
 6 files changed, 203 insertions(+), 26 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/s390/rawmemchr-1.c

diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index 4b03c6e99f5..c1616357b8c 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -66,6 +66,8 @@ s390_asm_declare_function_size (FILE *asm_out_file,
 				const char *fnname ATTRIBUTE_UNUSED, tree decl);
 #endif
 
+extern void s390_rawmemchr (machine_mode elt_mode, rtx dst, rtx src, rtx pat);
+
 #ifdef RTX_CODE
 extern int s390_extra_constraint_str (rtx, int, const char *);
 extern int s390_const_ok_for_constraint_p (HOST_WIDE_INT, int, const char *);
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index e04385451cf..dae8bb3cbbd 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -16569,6 +16569,75 @@ s390_excess_precision (enum excess_precision_type type)
 }
 #endif
 
+void
+s390_rawmemchr (machine_mode elt_mode, rtx dst, rtx src, rtx pat)
+{
+  machine_mode vec_mode = mode_for_vector (as_a <scalar_int_mode> (elt_mode),
+					   16 / GET_MODE_SIZE (elt_mode)).require();
+  rtx lens = gen_reg_rtx (V16QImode);
+  rtx pattern = gen_reg_rtx (vec_mode);
+  rtx loop_start = gen_label_rtx ();
+  rtx loop_end = gen_label_rtx ();
+  rtx addr = gen_reg_rtx (Pmode);
+  rtx offset = gen_reg_rtx (Pmode);
+  rtx loadlen = gen_reg_rtx (SImode);
+  rtx matchlen = gen_reg_rtx (SImode);
+  rtx mem;
+
+  pat = GEN_INT (trunc_int_for_mode (INTVAL (pat), elt_mode));
+  emit_insn (gen_rtx_SET (pattern, gen_rtx_VEC_DUPLICATE (vec_mode, pat)));
+
+  emit_move_insn (addr, XEXP (src, 0));
+
+  // alignment
+  emit_insn (gen_vlbb (lens, gen_rtx_MEM (BLKmode, addr), GEN_INT (6)));
+  emit_insn (gen_lcbb (loadlen, addr, GEN_INT (6)));
+  lens = convert_to_mode (vec_mode, lens, 1);
+  emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (0)));
+  lens = convert_to_mode (V4SImode, lens, 1);
+  emit_insn (gen_vec_extractv4sisi (matchlen, lens, GEN_INT (1)));
+  lens = convert_to_mode (vec_mode, lens, 1);
+  emit_cmp_and_jump_insns (matchlen, loadlen, LT, NULL_RTX, SImode, 1, loop_end);
+  force_expand_binop (Pmode, add_optab, addr, GEN_INT(16), addr, 1, OPTAB_DIRECT);
+  force_expand_binop (Pmode, and_optab, addr, GEN_INT(~HOST_WIDE_INT_UC(0xf)), addr, 1, OPTAB_DIRECT);
+  // now, addr is 16-byte aligned
+
+  mem = gen_rtx_MEM (vec_mode, addr);
+  set_mem_align (mem, 128);
+  emit_move_insn (lens, mem);
+  emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
+  add_int_reg_note (s390_emit_ccraw_jump (4, EQ, loop_end),
+		    REG_BR_PROB,
+		    profile_probability::very_unlikely ().to_reg_br_prob_note ());
+
+  emit_label (loop_start);
+  LABEL_NUSES (loop_start) = 1;
+
+  force_expand_binop (Pmode, add_optab, addr, GEN_INT (16), addr, 1, OPTAB_DIRECT);
+  mem = gen_rtx_MEM (vec_mode, addr);
+  set_mem_align (mem, 128);
+  emit_move_insn (lens, mem);
+  emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
+  add_int_reg_note (s390_emit_ccraw_jump (4, NE, loop_start),
+		    REG_BR_PROB,
+		    profile_probability::very_likely ().to_reg_br_prob_note ());
+
+  emit_label (loop_end);
+  LABEL_NUSES (loop_end) = 1;
+
+  if (TARGET_64BIT)
+    {
+      lens = convert_to_mode (V2DImode, lens, 1);
+      emit_insn (gen_vec_extractv2didi (offset, lens, GEN_INT (0)));
+    }
+  else
+    {
+      lens = convert_to_mode (V4SImode, lens, 1);
+      emit_insn (gen_vec_extractv4sisi (offset, lens, GEN_INT (1)));
+    }
+  force_expand_binop (Pmode, add_optab, addr, offset, dst, 1, OPTAB_DIRECT);
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 1b894a926ce..b8bdbaec468 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -12258,3 +12258,10 @@
 		    UNSPECV_PPA)]
   "TARGET_ZEC12"
   "")
+
+(define_expand "rawmemchr<SINT:mode>"
+  [(match_operand      0 "register_operand")
+   (match_operand      1 "memory_operand")
+   (match_operand:SINT 2 "const_int_operand")]
+  "TARGET_VX"
+  "s390_rawmemchr(<SINT:MODE>mode, operands[0], operands[1], operands[2]); DONE;")
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 70274a6ab70..1ed1d0665d4 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -1988,6 +1988,32 @@
   "vll\t%v0,%1,%2"
   [(set_attr "op_type" "VRS")])
 
+; Find element equal, also setting CC: vfeebs, vfeehs, vfeefs
+; Same with VSTRING_FLAG_ZS requested: vfeezbs, vfeezhs, vfeezfs
+(define_insn "@vec_vfees<mode>"
+  [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
+	(unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
+			   (match_operand:VI_HW_QHS 2 "register_operand" "v")
+			   (match_operand:QI 3 "const_mask_operand" "C")]
+			  UNSPEC_VEC_VFEE))
+   (set (reg:CCRAW CC_REGNUM)
+	(unspec:CCRAW [(match_dup 1)
+		       (match_dup 2)
+		       (match_dup 3)]
+		      UNSPEC_VEC_VFEECC))]
+  "TARGET_VX"
+{
+  unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
+
+  gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
+  flags &= ~VSTRING_FLAG_CS;
+
+  if (flags == VSTRING_FLAG_ZS)
+    return "vfeez<bhfgq>s\t%v0,%v1,%v2";
+  return "vfee<bhfgq>s\t%v0,%v1,%v2";
+}
+  [(set_attr "op_type" "VRR")])
+
 ; vfenebs, vfenehs, vfenefs
 ; vfenezbs, vfenezhs, vfenezfs
 (define_insn "vec_vfenes<mode>"
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 3e7b8541887..efa77992f31 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -1366,32 +1366,6 @@
 
 ; Vector find element equal
 
-; vfeebs, vfeehs, vfeefs
-; vfeezbs, vfeezhs, vfeezfs
-(define_insn "*vfees<mode>"
-  [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
-	(unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
-			   (match_operand:VI_HW_QHS 2 "register_operand" "v")
-			   (match_operand:QI 3 "const_mask_operand" "C")]
-			  UNSPEC_VEC_VFEE))
-   (set (reg:CCRAW CC_REGNUM)
-	(unspec:CCRAW [(match_dup 1)
-		       (match_dup 2)
-		       (match_dup 3)]
-		      UNSPEC_VEC_VFEECC))]
-  "TARGET_VX"
-{
-  unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
-
-  gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
-  flags &= ~VSTRING_FLAG_CS;
-
-  if (flags == VSTRING_FLAG_ZS)
-    return "vfeez<bhfgq>s\t%v0,%v1,%v2";
-  return "vfee<bhfgq>s\t%v0,%v1,%v2,%b3";
-}
-  [(set_attr "op_type" "VRR")])
-
 ; vfeeb, vfeeh, vfeef
 (define_insn "vfee<mode>"
   [(set (match_operand:VI_HW_QHS                    0 "register_operand" "=v")
diff --git a/gcc/testsuite/gcc.target/s390/rawmemchr-1.c b/gcc/testsuite/gcc.target/s390/rawmemchr-1.c
new file mode 100644
index 00000000000..a5125702315
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/rawmemchr-1.c
@@ -0,0 +1,99 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details -mzarch -march=z13" } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */
+
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#define rawmemchrT(T, pattern)  /* search loop ldist should replace by rawmemchr<MODE> */ \
+__attribute__((noinline,noclone))  \
+T* rawmemchr_##T (T *s)            \
+{                                  \
+  while (*s != pattern)            \
+    ++s;                           \
+  return s;                        \
+}
+
+rawmemchrT(int8_t, (int8_t)0xde)
+rawmemchrT(uint8_t, 0xde)
+rawmemchrT(int16_t, (int16_t)0xdead)
+rawmemchrT(uint16_t, 0xdead)
+rawmemchrT(int32_t, (int32_t)0xdeadbeef)
+rawmemchrT(uint32_t, 0xdeadbeef)
+
+#define runT(T, pattern)                           \
+void run_##T ()                                    \
+{ /* plant, find and erase one match per case */   \
+  T *buf = malloc (4096 * 2 * sizeof(T));          \
+  assert (buf != NULL);                            \
+  memset (buf, 0xa, 4096 * 2 * sizeof(T));         \
+  /* q = next 4096-byte boundary above buf */      \
+  T *q = (T*)((unsigned char *)buf                 \
+              + (4096 - ((uintptr_t)buf & 4095))); \
+  T *p;  /* "Nth load" below varies with T */      \
+  /* unaligned + block boundary + 1st load */      \
+  p = (T *) ((uintptr_t)q - 8);                    \
+  p[2] = pattern;                                  \
+  assert ((rawmemchr_##T (&p[0]) == &p[2]));       \
+  p[2] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + block boundary + 2nd load */      \
+  p = (T *) ((uintptr_t)q - 8);                    \
+  p[6] = pattern;                                  \
+  assert ((rawmemchr_##T (&p[0]) == &p[6]));       \
+  p[6] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + 1st load */                       \
+  q[5] = pattern;                                  \
+  assert ((rawmemchr_##T (&q[2]) == &q[5]));       \
+  q[5] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + 2nd load */                       \
+  q[14] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[14]));      \
+  q[14] = (T) 0xaaaaaaaa;                          \
+  /* unaligned + 3rd load */                       \
+  q[19] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[19]));      \
+  q[19] = (T) 0xaaaaaaaa;                          \
+  /* unaligned + 4th load */                       \
+  q[25] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[25]));      \
+  q[25] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 1st load */                         \
+  q[5] = pattern;                                  \
+  assert ((rawmemchr_##T (&q[0]) == &q[5]));       \
+  q[5] = (T) 0xaaaaaaaa;                           \
+  /* aligned + 2nd load */                         \
+  q[14] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[14]));      \
+  q[14] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 3rd load */                         \
+  q[19] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[19]));      \
+  q[19] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 4th load */                         \
+  q[25] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[25]));      \
+  q[25] = (T) 0xaaaaaaaa;                          \
+  free (buf);                                      \
+}
+
+runT(int8_t, (int8_t)0xde)
+runT(uint8_t, 0xde)
+runT(int16_t, (int16_t)0xdead)
+runT(uint16_t, 0xdead)
+runT(int32_t, (int32_t)0xdeadbeef)
+runT(uint32_t, 0xdeadbeef)
+
+int main (void)
+{  /* each run_T aborts through assert on a wrong result */
+  run_uint8_t ();
+  run_int8_t ();
+  run_uint16_t ();
+  run_int16_t ();
+  run_uint32_t ();
+  run_int32_t ();
+  return 0;
+}
  
Andreas Krebbel Oct. 8, 2021, 3:05 p.m. UTC | #3
On 10/8/21 16:23, Stefan Schulze Frielinghaus wrote:
> On Thu, Oct 07, 2021 at 11:16:24AM +0200, Andreas Krebbel wrote:
>> On 9/20/21 11:24, Stefan Schulze Frielinghaus wrote:
>>> This patch implements the rawmemchr expander as introduced in
>>> https://gcc.gnu.org/pipermail/gcc-patches/2021-September/579649.html
>>>
>>> Bootstrapped and regtested in conjunction with the patch from above on
>>> IBM Z.  Ok for mainline?
>>>
>>
>>> From 551362cda54048dc1a51588112f11c070ed52020 Mon Sep 17 00:00:00 2001
>>> From: Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
>>> Date: Mon, 8 Feb 2021 10:35:39 +0100
>>> Subject: [PATCH 2/2] IBM Z: Provide rawmemchr{qi,hi,si} expander
>>>
>>> gcc/ChangeLog:
>>>
>>> 	* config/s390/s390-protos.h (s390_rawmemchrqi): Add prototype.
>>> 	(s390_rawmemchrhi): Add prototype.
>>> 	(s390_rawmemchrsi): Add prototype.
>>> 	* config/s390/s390.c (s390_rawmemchr): New function.
>>> 	(s390_rawmemchrqi): New function.
>>> 	(s390_rawmemchrhi): New function.
>>> 	(s390_rawmemchrsi): New function.
>>> 	* config/s390/s390.md (rawmemchr<SINT:mode>): New expander.
>>> 	(rawmemchr<SINT:mode><P:mode>): New expander.
>>> 	* config/s390/vector.md (vec_vfees<mode>): Basically a copy of
>>> 	the pattern vfees<mode> from vx-builtins.md.
>>> 	* config/s390/vx-builtins.md (*vfees<mode>): Remove.
>>
>> Thanks! Would it make sense to also extend our existing strlen and movstr
>> expanders to support the additional character modes?
> 
> For strlen-like loops over non-character arrays, the current
> implementation in the loop distribution pass uses rawmemchr<MODE> and
> derives the length from the resulting pointer difference.  Thus we get
> strlen<MODE> for free and don't need to reimplement it.

Good to know. Thanks!

...
> Please find a new version attached.  I did another bootstrap+regtest on
> IBM Z.  Ok for mainline?
> 
> Thanks for your detailed review!

Ok for mainline. Thanks!

Andreas
  

Patch

diff --git a/gcc/config/s390/s390-protos.h b/gcc/config/s390/s390-protos.h
index 4b03c6e99f5..0d9619e8254 100644
--- a/gcc/config/s390/s390-protos.h
+++ b/gcc/config/s390/s390-protos.h
@@ -66,6 +66,10 @@  s390_asm_declare_function_size (FILE *asm_out_file,
 				const char *fnname ATTRIBUTE_UNUSED, tree decl);
 #endif
 
+extern void s390_rawmemchrqi(rtx dst, rtx src, rtx pat);  /* QImode elements */
+extern void s390_rawmemchrhi(rtx dst, rtx src, rtx pat);  /* HImode elements */
+extern void s390_rawmemchrsi(rtx dst, rtx src, rtx pat);  /* SImode elements */
+
 #ifdef RTX_CODE
 extern int s390_extra_constraint_str (rtx, int, const char *);
 extern int s390_const_ok_for_constraint_p (HOST_WIDE_INT, int, const char *);
diff --git a/gcc/config/s390/s390.c b/gcc/config/s390/s390.c
index 54dd6332c3a..1435ce156e2 100644
--- a/gcc/config/s390/s390.c
+++ b/gcc/config/s390/s390.c
@@ -16559,6 +16559,95 @@  s390_excess_precision (enum excess_precision_type type)
 }
 #endif
 
+template <machine_mode vec_mode,
+	  machine_mode elt_mode,
+	  rtx (*gen_vec_vfees) (rtx, rtx, rtx, rtx)>
+static void  /* Emit an inline rawmemchr: DST = address of the first PAT from SRC on.  */
+s390_rawmemchr(rtx dst, rtx src, rtx pat) {
+  rtx lens = gen_reg_rtx (V16QImode);
+  rtx pattern = gen_reg_rtx (vec_mode);
+  rtx loop_start = gen_label_rtx ();
+  rtx loop_end = gen_label_rtx ();
+  rtx addr = gen_reg_rtx (Pmode);
+  rtx offset = gen_reg_rtx (Pmode);
+  rtx tmp = gen_reg_rtx (Pmode);
+  rtx loadlen = gen_reg_rtx (SImode);
+  rtx matchlen = gen_reg_rtx (SImode);
+  rtx mem;
+  /* Broadcast PAT, truncated to ELT_MODE, into every element of PATTERN.  */
+  pat = GEN_INT (trunc_int_for_mode (INTVAL (pat), elt_mode));
+  emit_insn (gen_rtx_SET (pattern, gen_rtx_VEC_DUPLICATE (vec_mode, pat)));
+
+  emit_move_insn (addr, XEXP (src, 0));
+  /* Head: search the first vlbb load, taken from the possibly unaligned ADDR.
+     MATCHLEN below LOADLEN (from lcbb) means the match lies in this load.  */
+  emit_insn (gen_vlbb (lens, gen_rtx_MEM (BLKmode, addr), GEN_INT (6)));
+  emit_insn (gen_lcbb (loadlen, addr, GEN_INT (6)));
+  lens = convert_to_mode (vec_mode, lens, 1);
+  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (0)));
+  lens = convert_to_mode (V4SImode, lens, 1);
+  emit_insn (gen_vec_extractv4sisi (matchlen, lens, GEN_INT (1)));
+  lens = convert_to_mode (vec_mode, lens, 1);
+  emit_cmp_and_jump_insns (matchlen, loadlen, LT, NULL_RTX, SImode, 1, loop_end);
+  force_expand_binop (Pmode, and_optab, addr, GEN_INT (15), tmp, 1, OPTAB_DIRECT);
+  force_expand_binop (Pmode, sub_optab, GEN_INT (16), tmp, tmp, 1, OPTAB_DIRECT);
+  force_expand_binop (Pmode, add_optab, addr, tmp, addr, 1, OPTAB_DIRECT);
+  /* ADDR += 16 - (ADDR & 15), so ADDR is 16-byte aligned from here on.
+     The first aligned load is peeled off the loop below.  */
+  mem = gen_rtx_MEM (vec_mode, addr);
+  set_mem_align (mem, 128);
+  emit_move_insn (lens, mem);
+  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
+  add_int_reg_note (s390_emit_ccraw_jump (4, EQ, loop_end),
+		    REG_BR_PROB,
+		    profile_probability::very_unlikely ().to_reg_br_prob_note ());
+  /* Main loop: advance ADDR by 16 and search again until the CC test exits.  */
+  emit_label (loop_start);
+  LABEL_NUSES (loop_start) = 1;
+
+  force_expand_binop (Pmode, add_optab, addr, GEN_INT (16), addr, 1, OPTAB_DIRECT);
+  mem = gen_rtx_MEM (vec_mode, addr);
+  set_mem_align (mem, 128);
+  emit_move_insn (lens, mem);
+  emit_insn (gen_vec_vfees (lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
+  add_int_reg_note (s390_emit_ccraw_jump (4, NE, loop_start),
+		    REG_BR_PROB,
+		    profile_probability::very_likely ().to_reg_br_prob_note ());
+  /* Exit: take OFFSET out of LENS (DI element 0 or SI element 1); DST = ADDR + OFFSET.  */
+  emit_label (loop_end);
+  LABEL_NUSES (loop_end) = 1;
+
+  if (TARGET_64BIT)
+    {
+      lens = convert_to_mode (V2DImode, lens, 1);
+      emit_insn (gen_vec_extractv2didi (offset, lens, GEN_INT (0)));
+    }
+  else
+    {
+      lens = convert_to_mode (V4SImode, lens, 1);
+      emit_insn (gen_vec_extractv4sisi (offset, lens, GEN_INT (1)));
+    }
+  force_expand_binop (Pmode, add_optab, addr, offset, dst, 1, OPTAB_DIRECT);
+}
+
+void
+s390_rawmemchrqi (rtx dst, rtx src, rtx pat)
+{  /* QImode elements, searched in V16QImode vectors */
+  s390_rawmemchr<V16QImode, QImode, gen_vec_vfeesv16qi> (dst, src, pat);
+}
+
+void
+s390_rawmemchrhi (rtx dst, rtx src, rtx pat)
+{  /* HImode elements, searched in V8HImode vectors */
+  s390_rawmemchr<V8HImode, HImode, gen_vec_vfeesv8hi> (dst, src, pat);
+}
+
+void
+s390_rawmemchrsi (rtx dst, rtx src, rtx pat)
+{  /* SImode elements, searched in V4SImode vectors */
+  s390_rawmemchr<V4SImode, SImode, gen_vec_vfeesv4si> (dst, src, pat);
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/config/s390/s390.md b/gcc/config/s390/s390.md
index 1b894a926ce..f81bcef86ce 100644
--- a/gcc/config/s390/s390.md
+++ b/gcc/config/s390/s390.md
@@ -12258,3 +12258,23 @@ 
 		    UNSPECV_PPA)]
   "TARGET_ZEC12"
   "")
+
+(define_expand "rawmemchr<SINT:mode>"         ; forwards to the DI or SI pointer variant
+  [(match_operand      0 "register_operand")   ; result: address of the match
+   (match_operand      1 "memory_operand")     ; memory at which the search starts
+   (match_operand:SINT 2 "const_int_operand")] ; constant element value to look for
+  "TARGET_VX"
+{
+  if (TARGET_64BIT)
+    emit_insn (gen_rawmemchr<SINT:mode>di (operands[0], operands[1], operands[2]));
+  else
+    emit_insn (gen_rawmemchr<SINT:mode>si (operands[0], operands[1], operands[2]));
+  DONE;
+})
+
+(define_expand "rawmemchr<SINT:mode><P:mode>" ; one variant per pointer mode P
+  [(match_operand:P    0 "register_operand")   ; result: address of the match
+   (match_operand:BLK  1 "memory_operand")     ; memory at which the search starts
+   (match_operand:SINT 2 "const_int_operand")] ; constant element value to look for
+  "TARGET_VX"
+  "s390_rawmemchr<SINT:mode> (operands[0], operands[1], operands[2]); DONE;")
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md
index 70274a6ab70..0870e2341fc 100644
--- a/gcc/config/s390/vector.md
+++ b/gcc/config/s390/vector.md
@@ -1988,6 +1988,32 @@ 
   "vll\t%v0,%1,%2"
   [(set_attr "op_type" "VRS")])
 
+; Find element equal, also setting CC: vfeebs, vfeehs, vfeefs
+; Same with VSTRING_FLAG_ZS requested: vfeezbs, vfeezhs, vfeezfs
+(define_insn "vec_vfees<mode>"
+  [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
+	(unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
+			   (match_operand:VI_HW_QHS 2 "register_operand" "v")
+			   (match_operand:QI 3 "const_mask_operand" "C")]
+			  UNSPEC_VEC_VFEE))
+   (set (reg:CCRAW CC_REGNUM)
+	(unspec:CCRAW [(match_dup 1)
+		       (match_dup 2)
+		       (match_dup 3)]
+		      UNSPEC_VEC_VFEECC))]
+  "TARGET_VX"
+{
+  unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
+
+  gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
+  flags &= ~VSTRING_FLAG_CS;
+
+  if (flags == VSTRING_FLAG_ZS)
+    return "vfeez<bhfgq>s\t%v0,%v1,%v2";
+  return "vfee<bhfgq>s\t%v0,%v1,%v2";
+}
+  [(set_attr "op_type" "VRR")])
+
 ; vfenebs, vfenehs, vfenefs
 ; vfenezbs, vfenezhs, vfenezfs
 (define_insn "vec_vfenes<mode>"
diff --git a/gcc/config/s390/vx-builtins.md b/gcc/config/s390/vx-builtins.md
index 3e7b8541887..efa77992f31 100644
--- a/gcc/config/s390/vx-builtins.md
+++ b/gcc/config/s390/vx-builtins.md
@@ -1366,32 +1366,6 @@ 
 
 ; Vector find element equal
 
-; vfeebs, vfeehs, vfeefs
-; vfeezbs, vfeezhs, vfeezfs
-(define_insn "*vfees<mode>"
-  [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
-	(unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
-			   (match_operand:VI_HW_QHS 2 "register_operand" "v")
-			   (match_operand:QI 3 "const_mask_operand" "C")]
-			  UNSPEC_VEC_VFEE))
-   (set (reg:CCRAW CC_REGNUM)
-	(unspec:CCRAW [(match_dup 1)
-		       (match_dup 2)
-		       (match_dup 3)]
-		      UNSPEC_VEC_VFEECC))]
-  "TARGET_VX"
-{
-  unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
-
-  gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
-  flags &= ~VSTRING_FLAG_CS;
-
-  if (flags == VSTRING_FLAG_ZS)
-    return "vfeez<bhfgq>s\t%v0,%v1,%v2";
-  return "vfee<bhfgq>s\t%v0,%v1,%v2,%b3";
-}
-  [(set_attr "op_type" "VRR")])
-
 ; vfeeb, vfeeh, vfeef
 (define_insn "vfee<mode>"
   [(set (match_operand:VI_HW_QHS                    0 "register_operand" "=v")
diff --git a/gcc/testsuite/gcc.target/s390/rawmemchr-1.c b/gcc/testsuite/gcc.target/s390/rawmemchr-1.c
new file mode 100644
index 00000000000..a5125702315
--- /dev/null
+++ b/gcc/testsuite/gcc.target/s390/rawmemchr-1.c
@@ -0,0 +1,99 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details -mzarch -march=z13" } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */
+
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#define rawmemchrT(T, pattern)  /* search loop ldist should replace by rawmemchr<MODE> */ \
+__attribute__((noinline,noclone))  \
+T* rawmemchr_##T (T *s)            \
+{                                  \
+  while (*s != pattern)            \
+    ++s;                           \
+  return s;                        \
+}
+
+rawmemchrT(int8_t, (int8_t)0xde)
+rawmemchrT(uint8_t, 0xde)
+rawmemchrT(int16_t, (int16_t)0xdead)
+rawmemchrT(uint16_t, 0xdead)
+rawmemchrT(int32_t, (int32_t)0xdeadbeef)
+rawmemchrT(uint32_t, 0xdeadbeef)
+
+#define runT(T, pattern)                           \
+void run_##T ()                                    \
+{ /* plant, find and erase one match per case */   \
+  T *buf = malloc (4096 * 2 * sizeof(T));          \
+  assert (buf != NULL);                            \
+  memset (buf, 0xa, 4096 * 2 * sizeof(T));         \
+  /* q = next 4096-byte boundary above buf */      \
+  T *q = (T*)((unsigned char *)buf                 \
+              + (4096 - ((uintptr_t)buf & 4095))); \
+  T *p;  /* "Nth load" below varies with T */      \
+  /* unaligned + block boundary + 1st load */      \
+  p = (T *) ((uintptr_t)q - 8);                    \
+  p[2] = pattern;                                  \
+  assert ((rawmemchr_##T (&p[0]) == &p[2]));       \
+  p[2] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + block boundary + 2nd load */      \
+  p = (T *) ((uintptr_t)q - 8);                    \
+  p[6] = pattern;                                  \
+  assert ((rawmemchr_##T (&p[0]) == &p[6]));       \
+  p[6] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + 1st load */                       \
+  q[5] = pattern;                                  \
+  assert ((rawmemchr_##T (&q[2]) == &q[5]));       \
+  q[5] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + 2nd load */                       \
+  q[14] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[14]));      \
+  q[14] = (T) 0xaaaaaaaa;                          \
+  /* unaligned + 3rd load */                       \
+  q[19] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[19]));      \
+  q[19] = (T) 0xaaaaaaaa;                          \
+  /* unaligned + 4th load */                       \
+  q[25] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[25]));      \
+  q[25] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 1st load */                         \
+  q[5] = pattern;                                  \
+  assert ((rawmemchr_##T (&q[0]) == &q[5]));       \
+  q[5] = (T) 0xaaaaaaaa;                           \
+  /* aligned + 2nd load */                         \
+  q[14] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[14]));      \
+  q[14] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 3rd load */                         \
+  q[19] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[19]));      \
+  q[19] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 4th load */                         \
+  q[25] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[25]));      \
+  q[25] = (T) 0xaaaaaaaa;                          \
+  free (buf);                                      \
+}
+
+runT(int8_t, (int8_t)0xde)
+runT(uint8_t, 0xde)
+runT(int16_t, (int16_t)0xdead)
+runT(uint16_t, 0xdead)
+runT(int32_t, (int32_t)0xdeadbeef)
+runT(uint32_t, 0xdeadbeef)
+
+int main (void)
+{  /* each run_T aborts through assert on a wrong result */
+  run_uint8_t ();
+  run_int8_t ();
+  run_uint16_t ();
+  run_int16_t ();
+  run_uint32_t ();
+  run_int32_t ();
+  return 0;
+}