aarch64: Add tunable glibc.memset.dc_zva_threshold
Commit Message
For SPEC2017 502.gcc_r (rate=32), which uses quite a few memsets, we can get 2.3%
improvement on emag processor if DC ZVA threshold is changed from 512 to 8M.
> This should be called cache.aarch64_dc_zva_threshold or
> cache.aarch64_dczva_threshold.
I think dc_zva implies aarch64 architecture, so the name "cache.dc_zva_threshold"
seems to be concise a little bit.
> The default threshold initialization needs to take place in the emag
> file and not here, this code is already getting complicated and it won't
> be long until it starts looking like a character soup. That should also
> take care of the unexplained magic number (8M).
Changed, and added comment for this default value, please refer to new patch below.
Thanks,
Feng
--------------
* manual/tunables.texi: Document glibc.cache.dc_zva_threshold.
* sysdeps/aarch64/dl-tunables.list (glibc):
Add cache.dc_zva_threshold.
* sysdeps/aarch64/multiarch/memset_emag.c: New file.
* sysdeps/aarch64/multiarch/memset_base64.S (__memset_base64): Add
conditional compare over __dc_zva_threshold.
* sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD): Change
to a new value.
(HAVE_THRESHOLD_TUNABLE): New macro.
---
ChangeLog | 12 +++++++++
manual/tunables.texi | 9 +++++++
sysdeps/aarch64/dl-tunables.list | 6 +++++
sysdeps/aarch64/multiarch/memset.c | 2 ++
sysdeps/aarch64/multiarch/memset_base64.S | 7 ++++++
sysdeps/aarch64/multiarch/memset_emag.S | 15 +++++++----
sysdeps/aarch64/multiarch/memset_emag.c | 41 +++++++++++++++++++++++++++++++
7 files changed, 87 insertions(+), 5 deletions(-)
create mode 100644 sysdeps/aarch64/multiarch/memset_emag.c
--
1.8.3.1
Comments
On 02/08/19 7:19 AM, Feng Xue OS wrote:
> For SPEC2017 502.gcc_r (rate=32), which uses quite a few memsets, we can get 2.3%
> improvement on emag processor if DC ZVA threshold is changed from 512 to 8M.
That's great, can you test for another part too? Making a case for a
tunable is easier if you can show applicability to a larger set of
processors.
>> This should be called cache.aarch64_dc_zva_threshold or
>> cache.aarch64_dczva_threshold.
> I think dc_zva implies aarch64 architecture, so the name "cache.dc_zva_threshold"
> seems to be concise a little bit.
It's not just about whether the meaning is clear, it is about naming
convention. Not having an architecture name in the tunable implies that
it could be generally applicable.
Siddhesh
On 02/08/19 8:37 AM, Siddhesh Poyarekar wrote:
> On 02/08/19 7:19 AM, Feng Xue OS wrote:
>> For SPEC2017 502.gcc_r (rate=32), which uses quite a few memsets, we can get 2.3%
>> improvement on emag processor if DC ZVA threshold is changed from 512 to 8M.
>
> That's great, can you test for another part too? Making a case for a
> tunable is easier if you can show applicability to a larger set of
> processors.
Oops, I assumed this explanation was for introducing the tunable, I
realize now that it is for the default 8M value on emag. The test is
still desirable to make the case for a tunable stronger, but not for
this specific point :)
Siddhesh
@@ -1,3 +1,15 @@
+2019-07-31 Feng Xue <fxue@os.amperecomputing.com>
+
+ * manual/tunables.texi: Document glibc.cache.dc_zva_threshold.
+ * sysdeps/aarch64/dl-tunables.list (glibc):
+ Add cache.dc_zva_threshold.
+ * sysdeps/aarch64/multiarch/memset_emag.c: New file.
+ * sysdeps/aarch64/multiarch/memset_base64.S (__memset_base64): Add
+ conditional compare over __dc_zva_threshold.
+ * sysdeps/aarch64/multiarch/memset_emag.S (DC_ZVA_THRESHOLD): Change
+ to a new value.
+ (HAVE_THRESHOLD_TUNABLE): New macro.
+
2019-07-25 Florian Weimer <fweimer@redhat.com>
[BZ #24677]
@@ -411,3 +411,12 @@ instead.
This tunable is specific to i386 and x86-64.
@end deftp
+
+@deftp Tunable glibc.cache.dc_zva_threshold
+The @code{glibc.cache.dc_zva_threshold} tunable allows the user to set the
+threshold at which memset starts using DC ZVA on emag processors.  When the
+memset size is less than this threshold, normal store instructions are used;
+otherwise the DC ZVA instruction is used when the fill value is zero.
+
+This tunable is specific to the emag AArch64 processor.
+@end deftp
@@ -22,4 +22,10 @@ glibc {
type: STRING
}
}
+ cache {
+ dc_zva_threshold {
+ type: SIZE_T
+ default: 0
+ }
+ }
}
@@ -41,4 +41,6 @@ libc_ifunc (__libc_memset,
# undef memset
strong_alias (__libc_memset, memset);
+
+# include "./memset_emag.c"
#endif
@@ -91,7 +91,14 @@ L(set96):
.p2align 4
L(set_long):
stp val, val, [dstin]
+#ifdef HAVE_THRESHOLD_TUNABLE
+ adrp tmp1, __dc_zva_threshold
+ add tmp1, tmp1, :lo12:__dc_zva_threshold
+ ldr tmp2, [tmp1] /* Load DC ZVA tunable threshold value. */
+ cmp count, tmp2
+#else
cmp count, DC_ZVA_THRESHOLD
+#endif
ccmp val, 0, 0, cs
bic dst, dstin, 15
b.eq L(zva_64)
@@ -20,13 +20,18 @@
#if IS_IN (libc)
# define MEMSET __memset_emag
+# if HAVE_TUNABLES
+# define HAVE_THRESHOLD_TUNABLE 1
+# endif
+
/*
- * Using dc zva to zero memory does not produce better performance if
+ * Using DC ZVA to zero memory does not produce better performance if
* memory size is not very large, especially when there are multiple
- * processes/threads contending memory/cache. Here we use a somewhat
- * large threshold to trigger usage of dc zva.
-*/
-# define DC_ZVA_THRESHOLD 1024
+ * processes/threads contending memory/cache. Here we use a very
+ * large threshold to trigger usage of DC ZVA, which is good for
+ * multi-process/thread workloads.
+ */
+# define DC_ZVA_THRESHOLD 8*1024*1024
# include "./memset_base64.S"
#endif
new file mode 100644
@@ -0,0 +1,41 @@
+/* Setup threshold to trigger DC ZVA in memset for emag.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if HAVE_TUNABLES
+# include <elf/dl-tunables.h>
+
+/* We assume common workloads on servers are likely to consist of multiple
+ * processes/threads contending for memory/cache. For this scenario, disabling
+ * DC ZVA in memset can achieve better performance on the emag processor.
+ * Therefore, by default, we use a very large threshold (8M), which has
+ * an effect similar to disabling DC ZVA; this is an optimization
+ * for multi-process/thread workloads.
+ */
+uint64_t __dc_zva_threshold = 8 * 1024 * 1024; /* Default: 8M bytes. */
+
+static void
+__attribute__ ((constructor)) /* Reads the tunable into __dc_zva_threshold. */
+init_dc_zva_threshold (void)
+{
+ uint64_t threshold
+ = TUNABLE_GET (glibc, cache, dc_zva_threshold, uint64_t, NULL);
+
+ if (threshold) /* Zero (the tunable's default) keeps the built-in value. */
+ __dc_zva_threshold = threshold;
+}
+#endif