[1/1] x86: Optimizing memcpy for AMD Zen architecture.

Message ID 20201022045005.17371-2-sajan.karumanchi@amd.com
State Committed
Headers
Series Optimizing memcpy for AMD Zen architecture. |

Commit Message

Karumanchi, Sajan Oct. 22, 2020, 4:50 a.m. UTC
  From: Sajan Karumanchi <sajan.karumanchi@amd.com>

Modifying the shareable cache '__x86_shared_cache_size', which is a
factor in computing the non-temporal threshold parameter
'__x86_shared_non_temporal_threshold' to optimize memcpy for AMD Zen
architectures.
In the existing implementation, the shareable cache is computed as 'L3
per thread, L2 per core'. Recomputing this shareable cache as 'L3 per
CCX(Core-Complex)' has brought in performance gains.
As per the large bench variant results, this patch also addresses the
regression problem on AMD Zen architectures.

Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
Signed-off-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
Signed-off-by: Sajan Karumanchi <sajan.karumanchi@amd.com>
---
 sysdeps/x86/cacheinfo.h | 31 +++++++++++++++++++++++++------
 1 file changed, 25 insertions(+), 6 deletions(-)
  

Comments

Florian Weimer Oct. 27, 2020, 10:38 a.m. UTC | #1
* sajan karumanchi:

> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index 7f342fdc23..d6d6877702 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -303,6 +303,8 @@ init_cacheinfo (void)
>        data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
>        long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
>        shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
> +      unsigned int eax;
> +      unsigned int threads_per_ccx = 0;
>  
>        /* Get maximum extended function. */
>        __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
> @@ -320,7 +322,7 @@ init_cacheinfo (void)
>  	      threads = 1 << ((ecx >> 12) & 0x0f);
>  	    }
>  
> -	  if (threads == 0)
> +	  if (threads == 0 || cpu_features->basic.family >= 0x17)
>  	    {
>  	      /* If APIC ID width is not available, use logical
>  		 processor count.  */
> @@ -335,13 +337,27 @@ init_cacheinfo (void)
>  	  if (threads > 0)
>  	    shared /= threads;
>  
> -	  /* Account for exclusive L2 and L3 caches.  */
> -	  shared += core;
> -	}
> +	  /* Get shared cache per ccx for Zen architectures */
> +	  if (cpu_features->basic.family >= 0x17)
> +	    {
> +	      /* Get number of threads share the L3 cache in CCX */
> +	      __cpuid_count(0x8000001D, 0x3, eax, ebx, ecx, edx);
> +	      threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
> +	      shared = shared * threads_per_ccx;
> +	    }
> +	  else
> +	    {
> +	      /* Account for exclusive L2 and L3 caches.  */
> +	      shared += core;
> +            }
> +      }
>      }

Although not visible in the patch, these changes a properly guarded by
an arch_kind_amd check, as expected.

Beyond that, I can't comment on the substance of the patch, but I'd like
to request the follow style changes:

* Move the definitions of eax and threads_per_ccx closer
  to their usage site.  Initialize threads_per_ccx directly with its
  final variable.  (The separate variable is nice for documentation
  purposes.)
* Add a space after __cpuid_count (to follow GNU style).
* Add ".  " (period and two spaces) at the end of all new comments.
* Remove Signed-off-by.  glibc does not use DCO
  <https://developercertificate.org/>.  I assume this patch is covered
  by AMD's copyright assignment instead.

I can make these changes for you and push this, or you can post a new
patch.

Thanks,
Florian
  
H.J. Lu Oct. 28, 2020, 2 p.m. UTC | #2
On Wed, Oct 21, 2020 at 9:51 PM <sajan.karumanchi@amd.com> wrote:
>
> From: Sajan Karumanchi <sajan.karumanchi@amd.com>
>
> Modifying the shareable cache '__x86_shared_cache_size', which is a
> factor in computing the non-temporal threshold parameter
> '__x86_shared_non_temporal_threshold' to optimize memcpy for AMD Zen
> architectures.
> In the existing implementation, the shareable cache is computed as 'L3
> per thread, L2 per core'. Recomputing this shareable cache as 'L3 per
> CCX(Core-Complex)' has brought in performance gains.
> As per the large bench variant results, this patch also addresses the
> regression problem on AMD Zen architectures.
>
> Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
> Signed-off-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
> Signed-off-by: Sajan Karumanchi <sajan.karumanchi@amd.com>
> ---
>  sysdeps/x86/cacheinfo.h | 31 +++++++++++++++++++++++++------
>  1 file changed, 25 insertions(+), 6 deletions(-)
>
> diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
> index 7f342fdc23..d6d6877702 100644
> --- a/sysdeps/x86/cacheinfo.h
> +++ b/sysdeps/x86/cacheinfo.h
> @@ -303,6 +303,8 @@ init_cacheinfo (void)
>        data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
>        long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
>        shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
> +      unsigned int eax;
> +      unsigned int threads_per_ccx = 0;
>
>        /* Get maximum extended function. */
>        __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
> @@ -320,7 +322,7 @@ init_cacheinfo (void)
>               threads = 1 << ((ecx >> 12) & 0x0f);
>             }
>
> -         if (threads == 0)
> +         if (threads == 0 || cpu_features->basic.family >= 0x17)
>             {
>               /* If APIC ID width is not available, use logical
>                  processor count.  */
> @@ -335,13 +337,27 @@ init_cacheinfo (void)
>           if (threads > 0)
>             shared /= threads;
>
> -         /* Account for exclusive L2 and L3 caches.  */
> -         shared += core;
> -       }
> +         /* Get shared cache per ccx for Zen architectures */
> +         if (cpu_features->basic.family >= 0x17)
> +           {
> +             /* Get number of threads share the L3 cache in CCX */
> +             __cpuid_count(0x8000001D, 0x3, eax, ebx, ecx, edx);
> +             threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
> +             shared = shared * threads_per_ccx;
> +           }
> +         else
> +           {
> +             /* Account for exclusive L2 and L3 caches.  */
> +             shared += core;
> +            }
> +      }
>      }
>
>    if (cpu_features->data_cache_size != 0)
> -    data = cpu_features->data_cache_size;
> +    {
> +      if (data == 0 || cpu_features->basic.kind != arch_kind_amd)
> +        data = cpu_features->data_cache_size;
> +    }

This looks wrong.  cpu_features->data_cache_size is set by
GLIBC tunables:

-- Tunable: glibc.cpu.x86_data_cache_size
     The ‘glibc.cpu.x86_data_cache_size’ tunable allows the user to set
     data cache size in bytes for use in memory and string routines.

     This tunable is specific to i386 and x86-64.

Why is it ignored?

>    if (data > 0)
>      {
> @@ -354,7 +370,10 @@ init_cacheinfo (void)
>      }
>
>    if (cpu_features->shared_cache_size != 0)
> -    shared = cpu_features->shared_cache_size;
> +    {
> +      if (shared == 0 || cpu_features->basic.kind != arch_kind_amd)
> +        shared = cpu_features->shared_cache_size;
> +    }

This looks wrong.  cpu_features->shared_cache_size is set by
GLIBC tunables:

-- Tunable: glibc.cpu.x86_shared_cache_size
     The ‘glibc.cpu.x86_shared_cache_size’ tunable allows the user to
     set shared cache size in bytes for use in memory and string
     routines.

Why is it ignored?

>    if (shared > 0)
>      {
> --
> 2.17.1
>

I think they should be reverted.
  
Florian Weimer Oct. 28, 2020, 2:29 p.m. UTC | #3
* H. J. Lu:

>>    if (cpu_features->data_cache_size != 0)
>> -    data = cpu_features->data_cache_size;
>> +    {
>> +      if (data == 0 || cpu_features->basic.kind != arch_kind_amd)
>> +        data = cpu_features->data_cache_size;
>> +    }
>
> This looks wrong.  cpu_features->data_cache_size is set by
> GLIBC tunables:
>
> -- Tunable: glibc.cpu.x86_data_cache_size
>      The ‘glibc.cpu.x86_data_cache_size’ tunable allows the user to set
>      data cache size in bytes for use in memory and string routines.
>
>      This tunable is specific to i386 and x86-64.
>
> Why is it ignored?

So we should revert those two hunks only?

Thanks,
Florian
  
H.J. Lu Oct. 28, 2020, 2:44 p.m. UTC | #4
On Wed, Oct 28, 2020 at 7:29 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu:
>
> >>    if (cpu_features->data_cache_size != 0)
> >> -    data = cpu_features->data_cache_size;
> >> +    {
> >> +      if (data == 0 || cpu_features->basic.kind != arch_kind_amd)
> >> +        data = cpu_features->data_cache_size;
> >> +    }
> >
> > This looks wrong.  cpu_features->data_cache_size is set by
> > GLIBC tunables:
> >
> > -- Tunable: glibc.cpu.x86_data_cache_size
> >      The ‘glibc.cpu.x86_data_cache_size’ tunable allows the user to set
> >      data cache size in bytes for use in memory and string routines.
> >
> >      This tunable is specific to i386 and x86-64.
> >
> > Why is it ignored?
>
> So we should revert those two hunks only?
>

Yes.
  
Karumanchi, Sajan Oct. 30, 2020, 6:35 a.m. UTC | #5
[AMD Public Use]

Hi H. J .Lu,

Thanks for pointing it out and sorry for my ignorance of the Glibc tunables.  I overlooked this tunables part, as it was once reviewed by you for another patch which did not make through.
   Patch:  https://sourceware.org/pipermail/libc-alpha/2020-August/117081.html
   Review: https://sourceware.org/pipermail/libc-alpha/2020-September/117385.html



Thanks & Regards,
Sajan K.

-----Original Message-----
From: H.J. Lu <hjl.tools@gmail.com> 
Sent: Wednesday, October 28, 2020 8:14 PM
To: Florian Weimer <fweimer@redhat.com>
Cc: Karumanchi, Sajan <Sajan.Karumanchi@amd.com>; GNU C Library <libc-alpha@sourceware.org>; Carlos O'Donell <carlos@redhat.com>; Mallappa, Premachandra <Premachandra.Mallappa@amd.com>
Subject: Re: [PATCH 1/1] x86: Optimizing memcpy for AMD Zen architecture.

[CAUTION: External Email]

On Wed, Oct 28, 2020 at 7:29 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu:
>
> >>    if (cpu_features->data_cache_size != 0)
> >> -    data = cpu_features->data_cache_size;
> >> +    {
> >> +      if (data == 0 || cpu_features->basic.kind != arch_kind_amd)
> >> +        data = cpu_features->data_cache_size;
> >> +    }
> >
> > This looks wrong.  cpu_features->data_cache_size is set by GLIBC 
> > tunables:
> >
> > -- Tunable: glibc.cpu.x86_data_cache_size
> >      The ‘glibc.cpu.x86_data_cache_size’ tunable allows the user to set
> >      data cache size in bytes for use in memory and string routines.
> >
> >      This tunable is specific to i386 and x86-64.
> >
> > Why is it ignored?
>
> So we should revert those two hunks only?
>

Yes.

--
H.J.
  

Patch

diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index 7f342fdc23..d6d6877702 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -303,6 +303,8 @@  init_cacheinfo (void)
       data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
       long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+      unsigned int eax;
+      unsigned int threads_per_ccx = 0;
 
       /* Get maximum extended function. */
       __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
@@ -320,7 +322,7 @@  init_cacheinfo (void)
 	      threads = 1 << ((ecx >> 12) & 0x0f);
 	    }
 
-	  if (threads == 0)
+	  if (threads == 0 || cpu_features->basic.family >= 0x17)
 	    {
 	      /* If APIC ID width is not available, use logical
 		 processor count.  */
@@ -335,13 +337,27 @@  init_cacheinfo (void)
 	  if (threads > 0)
 	    shared /= threads;
 
-	  /* Account for exclusive L2 and L3 caches.  */
-	  shared += core;
-	}
+	  /* Get shared cache per ccx for Zen architectures */
+	  if (cpu_features->basic.family >= 0x17)
+	    {
+	      /* Get number of threads share the L3 cache in CCX */
+	      __cpuid_count(0x8000001D, 0x3, eax, ebx, ecx, edx);
+	      threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
+	      shared = shared * threads_per_ccx;
+	    }
+	  else
+	    {
+	      /* Account for exclusive L2 and L3 caches.  */
+	      shared += core;
+            }
+      }
     }
 
   if (cpu_features->data_cache_size != 0)
-    data = cpu_features->data_cache_size;
+    {
+      if (data == 0 || cpu_features->basic.kind != arch_kind_amd)
+        data = cpu_features->data_cache_size;
+    }
 
   if (data > 0)
     {
@@ -354,7 +370,10 @@  init_cacheinfo (void)
     }
 
   if (cpu_features->shared_cache_size != 0)
-    shared = cpu_features->shared_cache_size;
+    {
+      if (shared == 0 || cpu_features->basic.kind != arch_kind_amd)
+        shared = cpu_features->shared_cache_size;
+    }
 
   if (shared > 0)
     {