[v2,2/3] x86: Add cache information support for Zhaoxin processors

Message ID 1585546430-6167-3-git-send-email-MayShao@zhaoxin.com
State Committed
Series x86: Add support for Zhaoxin processors

Commit Message

May Shao(BJ-RD) March 30, 2020, 5:33 a.m. UTC
  To obtain Zhaoxin CPU cache information, add a new function
handle_zhaoxin().

Add a Zhaoxin branch in init_cacheinfo() to initialize variables
such as __x86_shared_cache_size.

---
 sysdeps/x86/cacheinfo.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)

--
2.7.4
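
For reference, handle_zhaoxin() below decodes CPUID leaf 4, the deterministic
cache parameters leaf.  For every sub-leaf that describes a cache, the size in
bytes is the product of four value-minus-one fields (this is exactly the
expression handle_zhaoxin() returns for offset 0):

    ((EBX >> 22) + 1)                /* ways of associativity    */
  * (((EBX >> 12) & 0x3ff) + 1)      /* physical line partitions */
  * ((EBX & 0xfff) + 1)              /* line size in bytes       */
  * (ECX + 1)                        /* number of sets           */

For example, 16 ways * 1 partition * 64-byte lines * 4096 sets = 4 MiB.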




Comments

H.J. Lu April 7, 2020, 12:43 p.m. UTC | #1
On Sun, Mar 29, 2020 at 10:35 PM MayShao <MayShao@zhaoxin.com> wrote:
>
> To obtain Zhaoxin CPU cache information, add a new function
> handle_zhaoxin().
>
> Add a Zhaoxin branch in init_cacheinfo() to initialize variables
> such as __x86_shared_cache_size.
>
> ---
>  sysdeps/x86/cacheinfo.c | 185 ++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 185 insertions(+)
>
> diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
> index e3e8ef2..e5a3284 100644
> --- a/sysdeps/x86/cacheinfo.c
> +++ b/sysdeps/x86/cacheinfo.c
> @@ -436,6 +436,57 @@ handle_amd (int name)
>  }
>
>
> +static long int __attribute__ ((noinline))
> +handle_zhaoxin (int name)
> +{
> +  unsigned int eax;
> +  unsigned int ebx;
> +  unsigned int ecx;
> +  unsigned int edx;
> +
> +  int folded_rel_name = (M(name) / 3) * 3;
> +
> +  unsigned int round = 0;
> +  while (1)
> +    {
> +      __cpuid_count (4, round, eax, ebx, ecx, edx);
> +
> +      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
> +      if (type == null)
> +        break;
> +
> +      unsigned int level = (eax >> 5) & 0x7;
> +
> +      if ((level == 1 && type == data
> +        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
> +        || (level == 1 && type == inst
> +            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
> +        || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
> +        || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
> +        {
> +          unsigned int offset = M(name) - folded_rel_name;
> +
> +          if (offset == 0)
> +            /* Cache size.  */
> +            return (((ebx >> 22) + 1)
> +                * (((ebx >> 12) & 0x3ff) + 1)
> +                * ((ebx & 0xfff) + 1)
> +                * (ecx + 1));
> +          if (offset == 1)
> +            return (ebx >> 22) + 1;
> +
> +          assert (offset == 2);
> +          return (ebx & 0xfff) + 1;
> +        }
> +
> +      ++round;
> +    }
> +
> +  /* Nothing found.  */
> +  return 0;
> +}
> +
> +
>  /* Get the value of the system variable NAME.  */
>  long int
>  attribute_hidden
> @@ -449,6 +500,9 @@ __cache_sysconf (int name)
>    if (cpu_features->basic.kind == arch_kind_amd)
>      return handle_amd (name);
>
> +  if (cpu_features->basic.kind == arch_kind_zhaoxin)
> +    return handle_zhaoxin (name);
> +
>    // XXX Fill in more vendors.
>
>    /* CPU not known, we have no information.  */
> @@ -751,6 +805,137 @@ intel_bug_no_cache_info:
>         }
>  #endif
>      }
> +  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
> +    {
> +      data   = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
> +      long int core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
> +      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
> +
> +      /* Number of logical processors sharing L2 cache.  */
> +      int threads_l2;
> +
> +      /* Number of logical processors sharing L3 cache.  */
> +      int threads_l3;
> +
> +      if (shared <= 0)
> +        {
> +          /* No shared L3 cache.  All we have is the L2 cache.  */
> +          level = 2;
> +          shared = core;
> +          threads_l2 = 0;
> +          threads_l3 = -1;
> +        }
> +      else
> +        {
> +          level = 3;
> +          threads_l2 = 0;
> +          threads_l3 = 0;
> +        }
> +
> +      int i = 0;
> +
> +      /* Query until cache level 2 and 3 are enumerated.  */
> +      int check = 0x1 | (threads_l3 == 0) << 1;
> +      do
> +        {
> +          __cpuid_count (4, i++, eax, ebx, ecx, edx);
> +
> +          switch ((eax >> 5) & 0x7)
> +            {
> +            default:
> +              break;
> +            case 2:
> +              if ((check & 0x1))
> +                {
> +                  /* Get maximum number of logical processors
> +                     sharing L2 cache.  */
> +                  threads_l2 = (eax >> 14) & 0x3ff;
> +                  check &= ~0x1;
> +                }
> +              break;
> +            case 3:
> +              if ((check & (0x1 << 1)))
> +               {
> +                  /* Get maximum number of logical processors
> +                     sharing L3 cache.  */
> +                  threads_l3 = (eax >> 14) & 0x3ff;
> +                  check &= ~(0x1 << 1);
> +                }
> +              break;
> +           }
> +        }
> +      while (check);
> +
> +      /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
> +         numbers of addressable IDs for logical processors sharing
> +         the cache, instead of the maximum number of threads
> +         sharing the cache.  */
> +      if (max_cpuid >= 11)
> +        {
> +          /* Find the number of logical processors shipped in
> +             one core and apply count mask.  */
> +          i = 0;
> +
> +          /* Count SMT only if there is L3 cache.  Always count
> +             core if there is no L3 cache.  */
> +          int count = ((threads_l2 > 0 && level == 3)
> +                       | ((threads_l3 > 0
> +                           || (threads_l2 > 0 && level == 2)) << 1));
> +
> +          while (count)
> +            {
> +              __cpuid_count (11, i++, eax, ebx, ecx, edx);
> +
> +              int shipped = ebx & 0xff;
> +              int type = ecx & 0xff00;
> +              if (shipped == 0 || type == 0)
> +                break;
> +              else if (type == 0x100)
> +                {
> +                  /* Count SMT.  */
> +                  if ((count & 0x1))
> +                    {
> +                      int count_mask;
> +
> +                      /* Compute count mask.  */
> +                      asm ("bsr %1, %0"
> +                           : "=r" (count_mask) : "g" (threads_l2));
> +                      count_mask = ~(-1 << (count_mask + 1));
> +                      threads_l2 = (shipped - 1) & count_mask;
> +                      count &= ~0x1;
> +                    }
> +                }
> +              else if (type == 0x200)
> +                {
> +                  /* Count core.  */
> +                  if ((count & (0x1 << 1)))
> +                    {
> +                      int count_mask;
> +                      int threads_core
> +                        = (level == 2 ? threads_l2 : threads_l3);
> +
> +                      /* Compute count mask.  */
> +                      asm ("bsr %1, %0"
> +                           : "=r" (count_mask) : "g" (threads_core));
> +                      count_mask = ~(-1 << (count_mask + 1));
> +                      threads_core = (shipped - 1) & count_mask;
> +                      if (level == 2)
> +                        threads_l2 = threads_core;
> +                      else
> +                        threads_l3 = threads_core;
> +                      count &= ~(0x1 << 1);
> +                    }
> +                }
> +            }
> +        }
> +      if (level == 2 && threads_l2 > 0)
> +        threads = threads_l2 + 1;
> +      if (level == 3 && threads_l3 > 0)
> +        threads = threads_l3 + 1;
> +
> +      if (shared > 0 && threads > 0)
> +        shared /= threads;
> +    }

This code looks very similar to Intel code.   Can you factor it out and reuse
it for you?

>    if (cpu_features->data_cache_size != 0)
>      data = cpu_features->data_cache_size;
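
The duplication pointed out above is mostly in the topology probing: the Intel
branch of init_cacheinfo() walks CPUID leaf 4 in the same way to find how many
logical processors share the L2 or L3 cache, and then refines that with
leaf 11.  Below is a minimal, self-contained sketch of the leaf 4 part as a
vendor-independent helper; the name and interface are hypothetical and only
illustrate the suggested factoring, they are not the glibc API.

#include <cpuid.h>

/* Return the maximum number of addressable logical-processor IDs
   sharing the cache at LEVEL (e.g. 2 or 3), as enumerated by CPUID
   leaf 4, or 0 if that cache level is not reported.  Nothing in this
   walk is vendor specific.  */
unsigned int
threads_sharing_cache (unsigned int level)
{
  unsigned int eax, ebx, ecx, edx;

  for (unsigned int i = 0; ; ++i)
    {
      __cpuid_count (4, i, eax, ebx, ecx, edx);

      if ((eax & 0x1f) == 0)            /* No more cache levels.  */
        return 0;

      if (((eax >> 5) & 0x7) == level)
        /* EAX bits 25:14 hold the share count minus one (the patch
           masks the field with 0x3ff).  */
        return ((eax >> 14) & 0x3ff) + 1;
    }
}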
  
Mayshao-oc April 10, 2020, 2:34 a.m. UTC | #2
On Tue, April 7, 2020 at 8:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> 
> On Sun, Mar 29, 2020 at 10:35 PM MayShao <MayShao@zhaoxin.com> wrote:
> > [...]
> 
> This code looks very similar to Intel code.   Can you factor it out and reuse
> it for you?

I tried to extract this part of the code, but it didn’t look very clean. 
For example, the case of max_cpuid < 4 does not exist on Zhaoxin processors.
Zhaoxin processors currently have inclusive caches, and the number of threads
sharing L2 cache is not affected by the family or model.  Considering the possible
changes of CPU design in future, it may be more convenient to keep separate branches.

I was wondering if you had any concerns, or if you could give some suggestions,
that would be great.


Best Regards,
May Shao
  
H.J. Lu April 10, 2020, 11:53 a.m. UTC | #3
On Thu, Apr 9, 2020 at 7:34 PM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
>
>
> On Tue, April 7, 2020 at 8:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Mar 29, 2020 at 10:35 PM MayShao <MayShao@zhaoxin.com> wrote:
> > > [...]
> >
> > This code looks very similar to Intel code.   Can you factor it out and reuse
> > it for you?
>
> I tried to extract this part of the code, but it didn’t look very clean.
> For example, the case of max_cpuid < 4 does not exist on Zhaoxin processors.

The same as new Intel processors.

> Zhaoxin processors currently have inclusive caches, and the number of threads

Do Zhaoxin processors use a CPUID bit to indicate inclusive caches?

> sharing L2 cache is not affected by the family or model.  Considering the possible

You can check cpu_features->basic.kind for that.

> changes of CPU design in future, it may be more convenient to keep separate branches.
>
> I was wondering if you had any concerns, or if you could give some suggestions,
> that would be great.

Let's avoid code duplication for now and revisit it in the future.
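
The "check cpu_features->basic.kind" suggestion is what makes the merge
workable: the common CPUID walk stays in one routine and only the genuinely
vendor-specific quirks sit behind a kind test.  A schematic sketch of that
shape follows; the types and names are stand-ins for illustration, not
glibc's actual internals.

enum arch_kind { arch_kind_unknown, arch_kind_intel,
                 arch_kind_amd, arch_kind_zhaoxin };

/* Schematic only: derive the per-thread shared cache size the way both
   branches do, keeping the one Intel-only special case behind a kind
   check instead of duplicating the whole routine.  */
long int
shared_cache_per_thread (enum arch_kind kind, unsigned int max_cpuid,
                         long int shared, long int core, long int threads)
{
  /* Intel-only quirk: CPUs without leaf 4 need the legacy leaf 2
     descriptors (omitted from this sketch); Zhaoxin never hits this.  */
  if (kind == arch_kind_intel && max_cpuid < 4)
    return shared;

  /* Everything below is identical in the Intel and Zhaoxin branches.  */
  if (shared <= 0)
    shared = core;                  /* No L3 cache: use the L2 size.  */

  if (shared > 0 && threads > 0)
    shared /= threads;

  return shared;
}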
  
Mayshao-oc April 10, 2020, 12:49 p.m. UTC | #4
On Fri, Apr 10, 2020 at 7:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> 
> On Thu, Apr 9, 2020 at 7:34 PM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
> >
> >
> > On Tue, April 7, 2020 at 8:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > [...]
> > >
> > > This code looks very similar to Intel code.   Can you factor it out and reuse
> > > it for you?
> >
> > I tried to extract this part of the code, but it didn’t look very clean.
> > For example, the case of max_cpuid < 4 does not exist on Zhaoxin processors.
> 
> The same as new Intel processors.
> 
> > Zhaoxin processors currently have inclusive caches, and the number of threads
> 
> Do Zhaoxin processors use a CPUID bit to indicate inclusive caches?

Yes, Zhaoxin processors use a CPUID bit to indicate it.

> > sharing L2 cache is not affected by the family or model.  Considering the possible
> 
> You can check cpu_features->basic.kind for that.
> 
> > changes of CPU design in future, it may be more convenient to keep separate branches.
> >
> > I was wondering if you had any concerns, or if you could give some suggestions,
> > that would be great.
> 
> Let's avoid code duplication for now and revisit it in the future.

You’re right.  I will try as you suggest and send it as patch v3.

Thanks for your time.


Best Regards,
May Shao
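
On the inclusive-cache question above: the thread does not name the exact bit,
but CPUID leaf 4 defines a per-cache inclusiveness flag in EDX bit 1 of each
sub-leaf, so a check would presumably look like the sketch below (a
hypothetical helper assuming Zhaoxin follows that definition; it is not part
of the patch).

#include <stdbool.h>
#include <cpuid.h>

/* Return true if the cache described by sub-leaf INDEX of CPUID leaf 4
   declares itself inclusive of lower cache levels (EDX bit 1).  */
bool
cache_is_inclusive (unsigned int index)
{
  unsigned int eax, ebx, ecx, edx;

  __cpuid_count (4, index, eax, ebx, ecx, edx);

  if ((eax & 0x1f) == 0)            /* Sub-leaf describes no cache.  */
    return false;

  return (edx & (1u << 1)) != 0;
}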
  
Mayshao-oc April 10, 2020, 1:01 p.m. UTC | #5
On Fri, Apr 10, 2020 at 7:53 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> 
> On Thu, Apr 9, 2020 at 7:34 PM Mayshao-oc <Mayshao-oc@zhaoxin.com> wrote:
> >
> >
> > On Tue, April 7, 2020 at 8:44 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > [...]
> 
> Let's avoid code duplication for now and revisit it in the future.
> 
You are right.  I will try as you suggest and send it as patch v3.

Thank you for your comment.

Best Regards,
May Shao
  

Patch

diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index e3e8ef2..e5a3284 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -436,6 +436,57 @@  handle_amd (int name)
 }


+static long int __attribute__ ((noinline))
+handle_zhaoxin (int name)
+{
+  unsigned int eax;
+  unsigned int ebx;
+  unsigned int ecx;
+  unsigned int edx;
+
+  int folded_rel_name = (M(name) / 3) * 3;
+
+  unsigned int round = 0;
+  while (1)
+    {
+      __cpuid_count (4, round, eax, ebx, ecx, edx);
+
+      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
+      if (type == null)
+        break;
+
+      unsigned int level = (eax >> 5) & 0x7;
+
+      if ((level == 1 && type == data
+        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
+        || (level == 1 && type == inst
+            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
+        || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
+        || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
+        {
+          unsigned int offset = M(name) - folded_rel_name;
+
+          if (offset == 0)
+            /* Cache size.  */
+            return (((ebx >> 22) + 1)
+                * (((ebx >> 12) & 0x3ff) + 1)
+                * ((ebx & 0xfff) + 1)
+                * (ecx + 1));
+          if (offset == 1)
+            return (ebx >> 22) + 1;
+
+          assert (offset == 2);
+          return (ebx & 0xfff) + 1;
+        }
+
+      ++round;
+    }
+
+  /* Nothing found.  */
+  return 0;
+}
+
+
 /* Get the value of the system variable NAME.  */
 long int
 attribute_hidden
@@ -449,6 +500,9 @@  __cache_sysconf (int name)
   if (cpu_features->basic.kind == arch_kind_amd)
     return handle_amd (name);

+  if (cpu_features->basic.kind == arch_kind_zhaoxin)
+    return handle_zhaoxin (name);
+
   // XXX Fill in more vendors.

   /* CPU not known, we have no information.  */
@@ -751,6 +805,137 @@  intel_bug_no_cache_info:
        }
 #endif
     }
+  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
+    {
+      data   = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
+      long int core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
+
+      /* Number of logical processors sharing L2 cache.  */
+      int threads_l2;
+
+      /* Number of logical processors sharing L3 cache.  */
+      int threads_l3;
+
+      if (shared <= 0)
+        {
+          /* No shared L3 cache.  All we have is the L2 cache.  */
+          level = 2;
+          shared = core;
+          threads_l2 = 0;
+          threads_l3 = -1;
+        }
+      else
+        {
+          level = 3;
+          threads_l2 = 0;
+          threads_l3 = 0;
+        }
+
+      int i = 0;
+
+      /* Query until cache level 2 and 3 are enumerated.  */
+      int check = 0x1 | (threads_l3 == 0) << 1;
+      do
+        {
+          __cpuid_count (4, i++, eax, ebx, ecx, edx);
+
+          switch ((eax >> 5) & 0x7)
+            {
+            default:
+              break;
+            case 2:
+              if ((check & 0x1))
+                {
+                  /* Get maximum number of logical processors
+                     sharing L2 cache.  */
+                  threads_l2 = (eax >> 14) & 0x3ff;
+                  check &= ~0x1;
+                }
+              break;
+            case 3:
+              if ((check & (0x1 << 1)))
+               {
+                  /* Get maximum number of logical processors
+                     sharing L3 cache.  */
+                  threads_l3 = (eax >> 14) & 0x3ff;
+                  check &= ~(0x1 << 1);
+                }
+              break;
+           }
+        }
+      while (check);
+
+      /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
+         numbers of addressable IDs for logical processors sharing
+         the cache, instead of the maximum number of threads
+         sharing the cache.  */
+      if (max_cpuid >= 11)
+        {
+          /* Find the number of logical processors shipped in
+             one core and apply count mask.  */
+          i = 0;
+
+          /* Count SMT only if there is L3 cache.  Always count
+             core if there is no L3 cache.  */
+          int count = ((threads_l2 > 0 && level == 3)
+                       | ((threads_l3 > 0
+                           || (threads_l2 > 0 && level == 2)) << 1));
+
+          while (count)
+            {
+              __cpuid_count (11, i++, eax, ebx, ecx, edx);
+
+              int shipped = ebx & 0xff;
+              int type = ecx & 0xff00;
+              if (shipped == 0 || type == 0)
+                break;
+              else if (type == 0x100)
+                {
+                  /* Count SMT.  */
+                  if ((count & 0x1))
+                    {
+                      int count_mask;
+
+                      /* Compute count mask.  */
+                      asm ("bsr %1, %0"
+                           : "=r" (count_mask) : "g" (threads_l2));
+                      count_mask = ~(-1 << (count_mask + 1));
+                      threads_l2 = (shipped - 1) & count_mask;
+                      count &= ~0x1;
+                    }
+                }
+              else if (type == 0x200)
+                {
+                  /* Count core.  */
+                  if ((count & (0x1 << 1)))
+                    {
+                      int count_mask;
+                      int threads_core
+                        = (level == 2 ? threads_l2 : threads_l3);
+
+                      /* Compute count mask.  */
+                      asm ("bsr %1, %0"
+                           : "=r" (count_mask) : "g" (threads_core));
+                      count_mask = ~(-1 << (count_mask + 1));
+                      threads_core = (shipped - 1) & count_mask;
+                      if (level == 2)
+                        threads_l2 = threads_core;
+                      else
+                        threads_l3 = threads_core;
+                      count &= ~(0x1 << 1);
+                    }
+                }
+            }
+        }
+      if (level == 2 && threads_l2 > 0)
+        threads = threads_l2 + 1;
+      if (level == 3 && threads_l3 > 0)
+        threads = threads_l3 + 1;
+
+      if (shared > 0 && threads > 0)
+        shared /= threads;
+    }

   if (cpu_features->data_cache_size != 0)
     data = cpu_features->data_cache_size;
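
A note on the count-mask step inside the leaf 11 loop above, since it is easy
to misread: bsr yields the index of the highest set bit of the leaf 4 value
(an addressable-ID count minus one), count_mask then has all bits up to and
including that index set, and the logical-processor count reported by leaf 11,
minus one, is masked down to that width before the final "+ 1".  For example,
with threads_l2 = 3 from leaf 4 and shipped = 2 from leaf 11:

  bsr (3)    = 1
  count_mask = ~(-1 << (1 + 1)) = 0x3
  threads_l2 = (2 - 1) & 0x3    = 1    /* later: threads = threads_l2 + 1 = 2 */

A portable equivalent of the inline bsr would be something like
31 - __builtin_clz (threads_l2), assuming the value is nonzero.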