powerpc64: strcpy optimization for unaligned string

Message ID 20141218211348.GA16854@domone
State Rejected
Delegated to: Adhemerval Zanella Netto
Headers

Commit Message

Ondrej Bilka Dec. 18, 2014, 9:13 p.m. UTC
  On Wed, Dec 17, 2014 at 09:34:53PM +0530, Rajalakshmi Srinivasaraghavan wrote:
> 
> 
> This patch optimizes strcpy for ppc64 for unaligned source or
> destination address. The source or destination address is aligned
> to doubleword and data is shifted based on the alignment and
> added with the previous loaded data to be written as a doubleword.
> For each load, cmpb instruction is used for faster null check.
> 
> More combinations of unaligned inputs are also added to the benchtest
> to measure the improvement. The new optimization shows a 2 to 80%
> performance improvement for longer strings, though it does not show
> a big difference for string sizes less than 16 due to additional checks.
> 
> This patch is tested on powerpc64 BE and LE and I have also attached
> the benchtest result.
> 
As I wrote, the benchtests are suspect. First, retest what happens if you
do not always call strcpy with the same input and output buffers. What
difference does that make in the benchmark?
  

Comments

Rajalakshmi S Dec. 19, 2014, 3 p.m. UTC | #1
On 12/19/2014 02:43 AM, Ondřej Bílka wrote:
> On Wed, Dec 17, 2014 at 09:34:53PM +0530, Rajalakshmi Srinivasaraghavan wrote:
>>
>> This patch optimizes strcpy for ppc64 for unaligned source or
>> destination address. The source or destination address is aligned
>> to doubleword and data is shifted based on the alignment and
>> added with the previous loaded data to be written as a doubleword.
>> For each load, cmpb instruction is used for faster null check.
>>
>> More combinations of unaligned inputs are also added to the benchtest
>> to measure the improvement. The new optimization shows a 2 to 80%
>> performance improvement for longer strings, though it does not show
>> a big difference for string sizes less than 16 due to additional checks.
>>
>> This patch is tested on powerpc64 BE and LE and I have also attached
>> the benchtest result.
>>
> As I wrote, the benchtests are suspect. First, retest what happens if you
> do not always call strcpy with the same input and output buffers. What
> difference does that make in the benchmark?
>
I applied this patch both with and without my optimization, and I
could not see any decrease in performance. I have attached the results.
> diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
> index c3ab4cf..0329f60 100644
> --- a/benchtests/bench-strcpy.c
> +++ b/benchtests/bench-strcpy.c
> @@ -71,25 +71,25 @@ SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
>   typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
>
>   static void
> -do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
> +do_one_test (impl_t *impl, CHAR **dst, CHAR **src,
>   	     size_t len __attribute__((unused)))
>   {
>     size_t i, iters = INNER_LOOP_ITERS;
>     timing_t start, stop, cur;
>
> -  if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len))
> +  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0]))
I modified this line as follows, because `len` is a plain `size_t` and cannot be indexed:

  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len))

>       {
>         error (0, 0, "Wrong result in function %s %p %p", impl->name,
> -	     CALL (impl, dst, src), STRCPY_RESULT (dst, len));
> +	     CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len));
>         ret = 1;
>         return;
>       }
>
> -  if (STRCMP (dst, src) != 0)
> +  if (STRCMP (dst[0], src[0]) != 0)
>       {
>         error (0, 0,
>   	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
> -	     impl->name, dst, src);
> +	     impl->name, dst[0], src[0]);
>         ret = 1;
>         return;
>       }
> @@ -97,7 +97,7 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>     TIMING_NOW (start);
>     for (i = 0; i < iters; ++i)
>       {
> -	  CALL (impl, dst, src);
> +	  CALL (impl, dst[i % 16], src[i % 16]);
>       }
>     TIMING_NOW (stop);
>
> @@ -109,8 +109,8 @@ do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
>   static void
>   do_test (size_t align1, size_t align2, size_t len, int max_char)
>   {
> -  size_t i;
> -  CHAR *s1, *s2;
> +  size_t i, j;
> +  CHAR **s1, **s2;
>   /* For wcscpy: align1 and align2 here mean alignment not in bytes,
>      but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
>      len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
> @@ -122,12 +122,17 @@ do_test (size_t align1, size_t align2, size_t len, int max_char)
>     if ((align2 + len) * sizeof(CHAR) >= page_size)
>       return;
>
> -  s1 = (CHAR *) (buf1) + align1;
> -  s2 = (CHAR *) (buf2) + align2;
> +  s1 = calloc (sizeof (char *), 16);
> +  s2 = calloc (sizeof (char *), 16);
> +  for (j = 0; j < 16; j++)
> +    {
> +      s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1;
> +      s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2;
>
> -  for (i = 0; i < len; i++)
> -    s1[i] = 32 + 23 * i % (max_char - 32);
> -  s1[len] = 0;
> +      for (i = 0; i < len; i++)
> +        s1[j][i] = 32 + 23 * i % (max_char - 32);
> +      s1[j][len] = 0;
> +    }
>
>     printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR));
>
>
>
  

Patch

diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index c3ab4cf..0329f60 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -71,25 +71,25 @@  SIMPLE_STRCPY (CHAR *dst, const CHAR *src)
 typedef CHAR *(*proto_t) (CHAR *, const CHAR *);
 
 static void
-do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
+do_one_test (impl_t *impl, CHAR **dst, CHAR **src,
 	     size_t len __attribute__((unused)))
 {
   size_t i, iters = INNER_LOOP_ITERS;
   timing_t start, stop, cur;
 
-  if (CALL (impl, dst, src) != STRCPY_RESULT (dst, len))
+  if (CALL (impl, dst[0], src[0]) != STRCPY_RESULT (dst[0], len[0]))
     {
       error (0, 0, "Wrong result in function %s %p %p", impl->name,
-	     CALL (impl, dst, src), STRCPY_RESULT (dst, len));
+	     CALL (impl, dst[0], src[0]), STRCPY_RESULT (dst[0], len));
       ret = 1;
       return;
     }
 
-  if (STRCMP (dst, src) != 0)
+  if (STRCMP (dst[0], src[0]) != 0)
     {
       error (0, 0,
 	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
-	     impl->name, dst, src);
+	     impl->name, dst[0], src[0]);
       ret = 1;
       return;
     }
@@ -97,7 +97,7 @@  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
   TIMING_NOW (start);
   for (i = 0; i < iters; ++i)
     {
-	  CALL (impl, dst, src);
+	  CALL (impl, dst[i % 16], src[i % 16]);
     }
   TIMING_NOW (stop);
 
@@ -109,8 +109,8 @@  do_one_test (impl_t *impl, CHAR *dst, const CHAR *src,
 static void
 do_test (size_t align1, size_t align2, size_t len, int max_char)
 {
-  size_t i;
-  CHAR *s1, *s2;
+  size_t i, j;
+  CHAR **s1, **s2;
 /* For wcscpy: align1 and align2 here mean alignment not in bytes,
    but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
    len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
@@ -122,12 +122,17 @@  do_test (size_t align1, size_t align2, size_t len, int max_char)
   if ((align2 + len) * sizeof(CHAR) >= page_size)
     return;
 
-  s1 = (CHAR *) (buf1) + align1;
-  s2 = (CHAR *) (buf2) + align2;
+  s1 = calloc (sizeof (char *), 16);
+  s2 = calloc (sizeof (char *), 16);
+  for (j = 0; j < 16; j++)
+    {
+      s1[j] = ((CHAR *) calloc (align1 + len + 1, sizeof (CHAR))) + align1;
+      s2[j] = ((CHAR *) calloc (align2 + len + 1, sizeof (CHAR))) + align2;
 
-  for (i = 0; i < len; i++)
-    s1[i] = 32 + 23 * i % (max_char - 32);
-  s1[len] = 0;
+      for (i = 0; i < len; i++)
+        s1[j][i] = 32 + 23 * i % (max_char - 32);
+      s1[j][len] = 0;
+    }
 
   printf ("Length %4zd, alignments in bytes %2zd/%2zd:", len, align1 * sizeof(CHAR), align2 * sizeof(CHAR));