[rs6000] Enable overlap memory store for block memory clear

Message ID d8008a79-5937-4928-931f-1e29938cbf1e@linux.ibm.com
State New
Headers
Series [rs6000] Enable overlap memory store for block memory clear |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed

Commit Message

HAO CHEN GUI Feb. 26, 2024, 2:25 a.m. UTC
  Hi,
  This patch enables overlapping memory stores for block memory clear,
which reduces the number of store instructions. The expander calls
widest_fixed_size_mode_for_block_clear to get the mode for looped block
clear and calls smallest_fixed_size_mode_for_block_clear to get the mode
for last overlapped clear.

    Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
regressions. Is it OK for the trunk or next stage 1?

Thanks
Gui Haochen


ChangeLog
rs6000: Enable overlap memory store for block memory clear

gcc/
	* config/rs6000/rs6000-string.cc
	(widest_fixed_size_mode_for_block_clear): New.
	(smallest_fixed_size_mode_for_block_clear): New.
	(expand_block_clear): Call widest_fixed_size_mode_for_block_clear to
	get the mode for looped memory stores and call
	smallest_fixed_size_mode_for_block_clear to get the mode for the last
	overlapped memory store.

gcc/testsuite
	* gcc.target/powerpc/block-clear-1.c: New.


patch.diff
  

Comments

HAO CHEN GUI May 8, 2024, 1:55 a.m. UTC | #1
Hi,
  As now it's stage 1, gently ping this:
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646478.html

Thanks
Gui Haochen

在 2024/2/26 10:25, HAO CHEN GUI 写道:
> Hi,
>   This patch enables overlap memory store for block memory clear which
> saves the number of store instructions. The expander calls
> widest_fixed_size_mode_for_block_clear to get the mode for looped block
> clear and calls smallest_fixed_size_mode_for_block_clear to get the mode
> for last overlapped clear.
> 
>     Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
> regressions. Is it OK for the trunk or next stage 1?
> 
> Thanks
> Gui Haochen
> 
> 
> ChangeLog
> rs6000: Enable overlap memory store for block memory clear
> 
> gcc/
> 	* config/rs6000/rs6000-string.cc
> 	(widest_fixed_size_mode_for_block_clear): New.
> 	(smallest_fixed_size_mode_for_block_clear): New.
> 	(expand_block_clear): Call widest_fixed_size_mode_for_block_clear to
> 	get the mode for looped memory stores and call
> 	smallest_fixed_size_mode_for_block_clear to get the mode for the last
> 	overlapped memory store.
> 
> gcc/testsuite
> 	* gcc.target/powerpc/block-clear-1.c: New.
> 
> 
> patch.diff
> diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
> index 133e5382af2..c2a6095a586 100644
> --- a/gcc/config/rs6000/rs6000-string.cc
> +++ b/gcc/config/rs6000/rs6000-string.cc
> @@ -38,6 +38,49 @@
>  #include "profile-count.h"
>  #include "predict.h"
> 
> +/* Return the widest mode which mode size is less than or equal to the
> +   size.  */
> +static fixed_size_mode
> +widest_fixed_size_mode_for_block_clear (unsigned int size, unsigned int align,
> +					bool unaligned_vsx_ok)
> +{
> +  machine_mode mode;
> +
> +  if (TARGET_ALTIVEC
> +      && size >= 16
> +      && (align >= 128
> +	  || unaligned_vsx_ok))
> +    mode = V4SImode;
> +  else if (size >= 8
> +	   && TARGET_POWERPC64
> +	   && (align >= 64
> +	       || !STRICT_ALIGNMENT))
> +    mode = DImode;
> +  else if (size >= 4
> +	   && (align >= 32
> +	       || !STRICT_ALIGNMENT))
> +    mode = SImode;
> +  else if (size >= 2
> +	   && (align >= 16
> +	       || !STRICT_ALIGNMENT))
> +    mode = HImode;
> +  else
> +    mode = QImode;
> +
> +  return as_a <fixed_size_mode> (mode);
> +}
> +
> +/* Return the smallest mode whose size is greater than or equal to
> +   the given size.  */
> +static fixed_size_mode
> +smallest_fixed_size_mode_for_block_clear (unsigned int size)
> +{
> +  if (size > UNITS_PER_WORD)
> +    return as_a <fixed_size_mode> (V4SImode);
> +
> +  return smallest_int_mode_for_size (size * BITS_PER_UNIT);
> +}
> +
>  /* Expand a block clear operation, and return 1 if successful.  Return 0
>     if we should let the compiler generate normal code.
> 
> @@ -55,7 +98,6 @@ expand_block_clear (rtx operands[])
>    HOST_WIDE_INT align;
>    HOST_WIDE_INT bytes;
>    int offset;
> -  int clear_bytes;
>    int clear_step;
> 
>    /* If this is not a fixed size move, just call memcpy */
> @@ -89,62 +131,36 @@ expand_block_clear (rtx operands[])
> 
>    bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
> 
> -  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
> +  auto mode = widest_fixed_size_mode_for_block_clear (bytes, align,
> +						      unaligned_vsx_ok);
> +  offset = 0;
> +  rtx dest;
> +
> +  do
>      {
> -      machine_mode mode = BLKmode;
> -      rtx dest;
> +      unsigned int size = GET_MODE_SIZE (mode);
> 
> -      if (TARGET_ALTIVEC
> -	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
> +      while (bytes >= size)
>  	{
> -	  clear_bytes = 16;
> -	  mode = V4SImode;
> -	}
> -      else if (bytes >= 8 && TARGET_POWERPC64
> -	       && (align >= 64 || !STRICT_ALIGNMENT))
> -	{
> -	  clear_bytes = 8;
> -	  mode = DImode;
> -	  if (offset == 0 && align < 64)
> -	    {
> -	      rtx addr;
> +	  dest = adjust_address (orig_dest, mode, offset);
> +	  emit_move_insn (dest, CONST0_RTX (mode));
> 
> -	      /* If the address form is reg+offset with offset not a
> -		 multiple of four, reload into reg indirect form here
> -		 rather than waiting for reload.  This way we get one
> -		 reload, not one per store.  */
> -	      addr = XEXP (orig_dest, 0);
> -	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
> -		  && CONST_INT_P (XEXP (addr, 1))
> -		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
> -		{
> -		  addr = copy_addr_to_reg (addr);
> -		  orig_dest = replace_equiv_address (orig_dest, addr);
> -		}
> -	    }
> -	}
> -      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
> -	{			/* move 4 bytes */
> -	  clear_bytes = 4;
> -	  mode = SImode;
> -	}
> -      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
> -	{			/* move 2 bytes */
> -	  clear_bytes = 2;
> -	  mode = HImode;
> -	}
> -      else /* move 1 byte at a time */
> -	{
> -	  clear_bytes = 1;
> -	  mode = QImode;
> +	  offset += size;
> +	  bytes -= size;
>  	}
> 
> -      dest = adjust_address (orig_dest, mode, offset);
> +      if (bytes == 0)
> +	return 1;
> 
> -      emit_move_insn (dest, CONST0_RTX (mode));
> +      mode = smallest_fixed_size_mode_for_block_clear (bytes);
> +      int gap = GET_MODE_SIZE (mode) - bytes;
> +      if (gap > 0)
> +	{
> +	  offset -= gap;
> +	  bytes += gap;
> +	}
>      }
> -
> -  return 1;
> +  while (1);
>  }
> 
>  /* Figure out the correct instructions to generate to load data for
> diff --git a/gcc/testsuite/gcc.target/powerpc/block-clear-1.c b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
> new file mode 100644
> index 00000000000..5e16c44fea3
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +/* { dg-final { scan-assembler-not {\mst[hb]\M} } } */
> +
> +/* Verify that memclear takes overlap store. */
> +void* foo (char* s1)
> +{
> +  __builtin_memset (s1, 0, 31);
> +}
  
HAO CHEN GUI May 27, 2024, 2:19 a.m. UTC | #2
Hi,
  Gently ping it.
https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646478.html

Thanks
Gui Haochen

在 2024/5/8 9:55, HAO CHEN GUI 写道:
> Hi,
>   As now it's stage 1, gently ping this:
> https://gcc.gnu.org/pipermail/gcc-patches/2024-February/646478.html
> 
> Thanks
> Gui Haochen
> 
> 在 2024/2/26 10:25, HAO CHEN GUI 写道:
>> Hi,
>>   This patch enables overlap memory store for block memory clear which
>> saves the number of store instructions. The expander calls
>> widest_fixed_size_mode_for_block_clear to get the mode for looped block
>> clear and calls smallest_fixed_size_mode_for_block_clear to get the mode
>> for last overlapped clear.
>>
>>     Bootstrapped and tested on x86 and powerpc64-linux BE and LE with no
>> regressions. Is it OK for the trunk or next stage 1?
>>
>> Thanks
>> Gui Haochen
>>
>>
>> ChangeLog
>> rs6000: Enable overlap memory store for block memory clear
>>
>> gcc/
>> 	* config/rs6000/rs6000-string.cc
>> 	(widest_fixed_size_mode_for_block_clear): New.
>> 	(smallest_fixed_size_mode_for_block_clear): New.
>> 	(expand_block_clear): Call widest_fixed_size_mode_for_block_clear to
>> 	get the mode for looped memory stores and call
>> 	smallest_fixed_size_mode_for_block_clear to get the mode for the last
>> 	overlapped memory store.
>>
>> gcc/testsuite
>> 	* gcc.target/powerpc/block-clear-1.c: New.
>>
>>
>> patch.diff
>> diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
>> index 133e5382af2..c2a6095a586 100644
>> --- a/gcc/config/rs6000/rs6000-string.cc
>> +++ b/gcc/config/rs6000/rs6000-string.cc
>> @@ -38,6 +38,49 @@
>>  #include "profile-count.h"
>>  #include "predict.h"
>>
>> +/* Return the widest mode which mode size is less than or equal to the
>> +   size.  */
>> +static fixed_size_mode
>> +widest_fixed_size_mode_for_block_clear (unsigned int size, unsigned int align,
>> +					bool unaligned_vsx_ok)
>> +{
>> +  machine_mode mode;
>> +
>> +  if (TARGET_ALTIVEC
>> +      && size >= 16
>> +      && (align >= 128
>> +	  || unaligned_vsx_ok))
>> +    mode = V4SImode;
>> +  else if (size >= 8
>> +	   && TARGET_POWERPC64
>> +	   && (align >= 64
>> +	       || !STRICT_ALIGNMENT))
>> +    mode = DImode;
>> +  else if (size >= 4
>> +	   && (align >= 32
>> +	       || !STRICT_ALIGNMENT))
>> +    mode = SImode;
>> +  else if (size >= 2
>> +	   && (align >= 16
>> +	       || !STRICT_ALIGNMENT))
>> +    mode = HImode;
>> +  else
>> +    mode = QImode;
>> +
>> +  return as_a <fixed_size_mode> (mode);
>> +}
>> +
>> +/* Return the smallest mode whose size is greater than or equal to
>> +   the given size.  */
>> +static fixed_size_mode
>> +smallest_fixed_size_mode_for_block_clear (unsigned int size)
>> +{
>> +  if (size > UNITS_PER_WORD)
>> +    return as_a <fixed_size_mode> (V4SImode);
>> +
>> +  return smallest_int_mode_for_size (size * BITS_PER_UNIT);
>> +}
>> +
>>  /* Expand a block clear operation, and return 1 if successful.  Return 0
>>     if we should let the compiler generate normal code.
>>
>> @@ -55,7 +98,6 @@ expand_block_clear (rtx operands[])
>>    HOST_WIDE_INT align;
>>    HOST_WIDE_INT bytes;
>>    int offset;
>> -  int clear_bytes;
>>    int clear_step;
>>
>>    /* If this is not a fixed size move, just call memcpy */
>> @@ -89,62 +131,36 @@ expand_block_clear (rtx operands[])
>>
>>    bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);
>>
>> -  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
>> +  auto mode = widest_fixed_size_mode_for_block_clear (bytes, align,
>> +						      unaligned_vsx_ok);
>> +  offset = 0;
>> +  rtx dest;
>> +
>> +  do
>>      {
>> -      machine_mode mode = BLKmode;
>> -      rtx dest;
>> +      unsigned int size = GET_MODE_SIZE (mode);
>>
>> -      if (TARGET_ALTIVEC
>> -	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
>> +      while (bytes >= size)
>>  	{
>> -	  clear_bytes = 16;
>> -	  mode = V4SImode;
>> -	}
>> -      else if (bytes >= 8 && TARGET_POWERPC64
>> -	       && (align >= 64 || !STRICT_ALIGNMENT))
>> -	{
>> -	  clear_bytes = 8;
>> -	  mode = DImode;
>> -	  if (offset == 0 && align < 64)
>> -	    {
>> -	      rtx addr;
>> +	  dest = adjust_address (orig_dest, mode, offset);
>> +	  emit_move_insn (dest, CONST0_RTX (mode));
>>
>> -	      /* If the address form is reg+offset with offset not a
>> -		 multiple of four, reload into reg indirect form here
>> -		 rather than waiting for reload.  This way we get one
>> -		 reload, not one per store.  */
>> -	      addr = XEXP (orig_dest, 0);
>> -	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
>> -		  && CONST_INT_P (XEXP (addr, 1))
>> -		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
>> -		{
>> -		  addr = copy_addr_to_reg (addr);
>> -		  orig_dest = replace_equiv_address (orig_dest, addr);
>> -		}
>> -	    }
>> -	}
>> -      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
>> -	{			/* move 4 bytes */
>> -	  clear_bytes = 4;
>> -	  mode = SImode;
>> -	}
>> -      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
>> -	{			/* move 2 bytes */
>> -	  clear_bytes = 2;
>> -	  mode = HImode;
>> -	}
>> -      else /* move 1 byte at a time */
>> -	{
>> -	  clear_bytes = 1;
>> -	  mode = QImode;
>> +	  offset += size;
>> +	  bytes -= size;
>>  	}
>>
>> -      dest = adjust_address (orig_dest, mode, offset);
>> +      if (bytes == 0)
>> +	return 1;
>>
>> -      emit_move_insn (dest, CONST0_RTX (mode));
>> +      mode = smallest_fixed_size_mode_for_block_clear (bytes);
>> +      int gap = GET_MODE_SIZE (mode) - bytes;
>> +      if (gap > 0)
>> +	{
>> +	  offset -= gap;
>> +	  bytes += gap;
>> +	}
>>      }
>> -
>> -  return 1;
>> +  while (1);
>>  }
>>
>>  /* Figure out the correct instructions to generate to load data for
>> diff --git a/gcc/testsuite/gcc.target/powerpc/block-clear-1.c b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
>> new file mode 100644
>> index 00000000000..5e16c44fea3
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
>> @@ -0,0 +1,9 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-O2" } */
>> +/* { dg-final { scan-assembler-not {\mst[hb]\M} } } */
>> +
>> +/* Verify that memclear takes overlap store. */
>> +void* foo (char* s1)
>> +{
>> +  __builtin_memset (s1, 0, 31);
>> +}
  

Patch

diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
index 133e5382af2..c2a6095a586 100644
--- a/gcc/config/rs6000/rs6000-string.cc
+++ b/gcc/config/rs6000/rs6000-string.cc
@@ -38,6 +38,49 @@ 
 #include "profile-count.h"
 #include "predict.h"

+/* Return the widest mode permitted by the alignment constraints whose
+   size is less than or equal to the given size.  */
+static fixed_size_mode
+widest_fixed_size_mode_for_block_clear (unsigned int size, unsigned int align,
+					bool unaligned_vsx_ok)
+{
+  machine_mode mode;
+
+  if (TARGET_ALTIVEC
+      && size >= 16
+      && (align >= 128
+	  || unaligned_vsx_ok))
+    mode = V4SImode;
+  else if (size >= 8
+	   && TARGET_POWERPC64
+	   && (align >= 64
+	       || !STRICT_ALIGNMENT))
+    mode = DImode;
+  else if (size >= 4
+	   && (align >= 32
+	       || !STRICT_ALIGNMENT))
+    mode = SImode;
+  else if (size >= 2
+	   && (align >= 16
+	       || !STRICT_ALIGNMENT))
+    mode = HImode;
+  else
+    mode = QImode;
+
+  return as_a <fixed_size_mode> (mode);
+}
+
+/* Return the smallest mode whose size is greater than or equal to
+   the given size.  */
+static fixed_size_mode
+smallest_fixed_size_mode_for_block_clear (unsigned int size)
+{
+  if (size > UNITS_PER_WORD)
+    return as_a <fixed_size_mode> (V4SImode);
+
+  return smallest_int_mode_for_size (size * BITS_PER_UNIT);
+}
+
 /* Expand a block clear operation, and return 1 if successful.  Return 0
    if we should let the compiler generate normal code.

@@ -55,7 +98,6 @@  expand_block_clear (rtx operands[])
   HOST_WIDE_INT align;
   HOST_WIDE_INT bytes;
   int offset;
-  int clear_bytes;
   int clear_step;

   /* If this is not a fixed size move, just call memcpy */
@@ -89,62 +131,36 @@  expand_block_clear (rtx operands[])

   bool unaligned_vsx_ok = (bytes >= 32 && TARGET_EFFICIENT_UNALIGNED_VSX);

-  for (offset = 0; bytes > 0; offset += clear_bytes, bytes -= clear_bytes)
+  auto mode = widest_fixed_size_mode_for_block_clear (bytes, align,
+						      unaligned_vsx_ok);
+  offset = 0;
+  rtx dest;
+
+  do
     {
-      machine_mode mode = BLKmode;
-      rtx dest;
+      unsigned int size = GET_MODE_SIZE (mode);

-      if (TARGET_ALTIVEC
-	  && (bytes >= 16 && (align >= 128 || unaligned_vsx_ok)))
+      while (bytes >= size)
 	{
-	  clear_bytes = 16;
-	  mode = V4SImode;
-	}
-      else if (bytes >= 8 && TARGET_POWERPC64
-	       && (align >= 64 || !STRICT_ALIGNMENT))
-	{
-	  clear_bytes = 8;
-	  mode = DImode;
-	  if (offset == 0 && align < 64)
-	    {
-	      rtx addr;
+	  dest = adjust_address (orig_dest, mode, offset);
+	  emit_move_insn (dest, CONST0_RTX (mode));

-	      /* If the address form is reg+offset with offset not a
-		 multiple of four, reload into reg indirect form here
-		 rather than waiting for reload.  This way we get one
-		 reload, not one per store.  */
-	      addr = XEXP (orig_dest, 0);
-	      if ((GET_CODE (addr) == PLUS || GET_CODE (addr) == LO_SUM)
-		  && CONST_INT_P (XEXP (addr, 1))
-		  && (INTVAL (XEXP (addr, 1)) & 3) != 0)
-		{
-		  addr = copy_addr_to_reg (addr);
-		  orig_dest = replace_equiv_address (orig_dest, addr);
-		}
-	    }
-	}
-      else if (bytes >= 4 && (align >= 32 || !STRICT_ALIGNMENT))
-	{			/* move 4 bytes */
-	  clear_bytes = 4;
-	  mode = SImode;
-	}
-      else if (bytes >= 2 && (align >= 16 || !STRICT_ALIGNMENT))
-	{			/* move 2 bytes */
-	  clear_bytes = 2;
-	  mode = HImode;
-	}
-      else /* move 1 byte at a time */
-	{
-	  clear_bytes = 1;
-	  mode = QImode;
+	  offset += size;
+	  bytes -= size;
 	}

-      dest = adjust_address (orig_dest, mode, offset);
+      if (bytes == 0)
+	return 1;

-      emit_move_insn (dest, CONST0_RTX (mode));
+      mode = smallest_fixed_size_mode_for_block_clear (bytes);
+      int gap = GET_MODE_SIZE (mode) - bytes;
+      if (gap > 0)
+	{
+	  offset -= gap;
+	  bytes += gap;
+	}
     }
-
-  return 1;
+  while (1);
 }

 /* Figure out the correct instructions to generate to load data for
diff --git a/gcc/testsuite/gcc.target/powerpc/block-clear-1.c b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
new file mode 100644
index 00000000000..5e16c44fea3
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/block-clear-1.c
@@ -0,0 +1,9 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { scan-assembler-not {\mst[hb]\M} } } */
+
+/* Verify that the block memory clear uses an overlapping store. */
+void* foo (char* s1)
+{
+  __builtin_memset (s1, 0, 31);
+}