aarch64: optimize the unaligned case of memcmp

Message ID 1498174226-16525-1-git-send-email-s.pop@samsung.com
State New, archived
Headers

Commit Message

Sebastian Pop June 22, 2017, 11:30 p.m. UTC
  This brings to glibc a performance improvement that we developed in Bionic libc.
That change has been submitted for review to Bionic libc:
https://android-review.googlesource.com/418279

This patch has been tested on glibc master with "configure; make; make check" on
an aarch64-linux Juno-r0 with no new fails.  We would appreciate help to test
the performance and correctness of this change.

Patch written by Vikas Sinha and Sebastian Pop.  Both Vikas and I are working
for Samsung Austin R&D Center who has a copyright assignment on file with the
FSF for work in glibc.

The performance was measured on the bionic-benchmarks on a hikey (aarch64 8xA53)
board. There was no performance change to the existing benchmark
and a performance improvement on the new benchmark for memcmp
on the unaligned side. The new benchmark has been submitted for
review at https://android-review.googlesource.com/414860

The overall performance improves by 18% for the small data set 8
and the performance improves by 450% for the large data set 64k.

The base is with the libc from /system/lib64. The bionic libc
with this patch is in /data.

hikey:/data # export LD_LIBRARY_PATH=/system/lib64
hikey:/data # ./bionic-benchmarks --benchmark_filter='BM_string_memcmp*'
Run on (8 X 2.4 MHz CPU s)
Benchmark                                Time           CPU Iterations
----------------------------------------------------------------------
BM_string_memcmp/8                      30 ns         30 ns   22955680    251.07MB/s
BM_string_memcmp/64                     57 ns         57 ns   12349184   1076.99MB/s
BM_string_memcmp/512                   305 ns        305 ns    2297163   1.56496GB/s
BM_string_memcmp/1024                  571 ns        571 ns    1225211   1.66912GB/s
BM_string_memcmp/8k                   4307 ns       4306 ns     162562   1.77177GB/s
BM_string_memcmp/16k                  8676 ns       8675 ns      80676   1.75887GB/s
BM_string_memcmp/32k                 19233 ns      19230 ns      36394   1.58695GB/s
BM_string_memcmp/64k                 36986 ns      36984 ns      18952   1.65029GB/s
BM_string_memcmp_aligned/8             199 ns        199 ns    3519166   38.3336MB/s
BM_string_memcmp_aligned/64            386 ns        386 ns    1810734   158.073MB/s
BM_string_memcmp_aligned/512          1735 ns       1734 ns     403981   281.525MB/s
BM_string_memcmp_aligned/1024         3200 ns       3200 ns     218838   305.151MB/s
BM_string_memcmp_aligned/8k          25084 ns      25080 ns      28180   311.507MB/s
BM_string_memcmp_aligned/16k         51730 ns      51729 ns      13521   302.057MB/s
BM_string_memcmp_aligned/32k        103228 ns     103228 ns       6782   302.727MB/s
BM_string_memcmp_aligned/64k        207117 ns     207087 ns       3450   301.806MB/s
BM_string_memcmp_unaligned/8           339 ns        339 ns    2070998   22.5302MB/s
BM_string_memcmp_unaligned/64         1392 ns       1392 ns     502796   43.8454MB/s
BM_string_memcmp_unaligned/512        9194 ns       9194 ns      76133   53.1104MB/s
BM_string_memcmp_unaligned/1024      18325 ns      18323 ns      38206   53.2963MB/s
BM_string_memcmp_unaligned/8k       148579 ns     148574 ns       4713   52.5831MB/s
BM_string_memcmp_unaligned/16k      298169 ns     298120 ns       2344   52.4118MB/s
BM_string_memcmp_unaligned/32k      598813 ns     598797 ns       1085    52.188MB/s
BM_string_memcmp_unaligned/64k     1196079 ns    1196083 ns        540   52.2539MB/s

hikey:/data # export LD_LIBRARY_PATH=/data
hikey:/data # ./bionic-benchmarks --benchmark_filter='BM_string_memcmp*'
Run on (8 X 2.4 MHz CPU s)
Benchmark                                Time           CPU Iterations
----------------------------------------------------------------------
BM_string_memcmp/8                      30 ns         30 ns   23209918   252.802MB/s
BM_string_memcmp/64                     57 ns         57 ns   12348447   1076.95MB/s
BM_string_memcmp/512                   305 ns        305 ns    2296878   1.56471GB/s
BM_string_memcmp/1024                  572 ns        571 ns    1224426    1.6689GB/s
BM_string_memcmp/8k                   4309 ns       4308 ns     162491   1.77109GB/s
BM_string_memcmp/16k                  9348 ns       9345 ns      74894   1.63285GB/s
BM_string_memcmp/32k                 18329 ns      18322 ns      38249    1.6656GB/s
BM_string_memcmp/64k                 36992 ns      36981 ns      18952   1.65045GB/s
BM_string_memcmp_aligned/8             199 ns        199 ns    3513925   38.3162MB/s
BM_string_memcmp_aligned/64            386 ns        386 ns    1814038   158.192MB/s
BM_string_memcmp_aligned/512          1735 ns       1735 ns     402279   281.502MB/s
BM_string_memcmp_aligned/1024         3204 ns       3202 ns     218761   304.941MB/s
BM_string_memcmp_aligned/8k          25577 ns      25569 ns      27406   305.548MB/s
BM_string_memcmp_aligned/16k         52143 ns      52123 ns      13522   299.769MB/s
BM_string_memcmp_aligned/32k        105169 ns     105127 ns       6637    297.26MB/s
BM_string_memcmp_aligned/64k        206508 ns     206383 ns       3417   302.835MB/s
BM_string_memcmp_unaligned/8           287 ns        287 ns    2441787   26.6141MB/s
BM_string_memcmp_unaligned/64          556 ns        556 ns    1257709   109.764MB/s
BM_string_memcmp_unaligned/512        2167 ns       2166 ns     323159   225.443MB/s
BM_string_memcmp_unaligned/1024       4041 ns       4039 ns     173282   241.797MB/s
BM_string_memcmp_unaligned/8k        32234 ns      32221 ns      21645   242.464MB/s
BM_string_memcmp_unaligned/16k       65715 ns      65684 ns      10573   237.882MB/s
BM_string_memcmp_unaligned/32k      133390 ns     133348 ns       5350   234.349MB/s
BM_string_memcmp_unaligned/64k      264506 ns     264401 ns       2644   236.383MB/s
---
 sysdeps/aarch64/memcmp.S | 59 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)
  

Comments

Ramana Radhakrishnan June 23, 2017, 10:38 p.m. UTC | #1
On Fri, Jun 23, 2017 at 12:30 AM, Sebastian Pop <s.pop@samsung.com> wrote:
> This brings to glibc a performance improvement that we developed in Bionic libc.
> That change has been submitted for review to Bionic libc:
> https://android-review.googlesource.com/418279
>
> This patch has been tested on glibc master with "configure; make; make check" on
> an aarch64-linux Juno-r0 with no new fails.  We would appreciate help to test
> the performance and correctness of this change.
>
> Patch written by Vikas Sinha and Sebastian Pop.  Both Vikas and I are working
> for Samsung Austin R&D Center who has a copyright assignment on file with the
> FSF for work in glibc.
>
> The performance was measured on the bionic-benchmarks on a hikey (aarch64 8xA53)
> board. There was no performance change to the existing benchmark
> and a performance improvement on the new benchmark for memcmp
> on the unaligned side. The new benchmark has been submitted for
> review at https://android-review.googlesource.com/414860
>
> The overall performance improves by 18% for the small data set 8
> and the performance improves by 450% for the large data set 64k.
>
> The base is with the libc from /system/lib64. The bionic libc
> with this patch is in /data.

Is there a chance you can add a benchmark to the glibc benchmarking
framework ? For most changes like this it would be good to see if the
existing benchmarks handle these cases and provide equivalent
improvements so that any changes to this can be rationalized in the
future.

Thanks,
Ramana

>
> hikey:/data # export LD_LIBRARY_PATH=/system/lib64
> hikey:/data # ./bionic-benchmarks --benchmark_filter='BM_string_memcmp*'
> Run on (8 X 2.4 MHz CPU s)
> Benchmark                                Time           CPU Iterations
> ----------------------------------------------------------------------
> BM_string_memcmp/8                      30 ns         30 ns   22955680    251.07MB/s
> BM_string_memcmp/64                     57 ns         57 ns   12349184   1076.99MB/s
> BM_string_memcmp/512                   305 ns        305 ns    2297163   1.56496GB/s
> BM_string_memcmp/1024                  571 ns        571 ns    1225211   1.66912GB/s
> BM_string_memcmp/8k                   4307 ns       4306 ns     162562   1.77177GB/s
> BM_string_memcmp/16k                  8676 ns       8675 ns      80676   1.75887GB/s
> BM_string_memcmp/32k                 19233 ns      19230 ns      36394   1.58695GB/s
> BM_string_memcmp/64k                 36986 ns      36984 ns      18952   1.65029GB/s
> BM_string_memcmp_aligned/8             199 ns        199 ns    3519166   38.3336MB/s
> BM_string_memcmp_aligned/64            386 ns        386 ns    1810734   158.073MB/s
> BM_string_memcmp_aligned/512          1735 ns       1734 ns     403981   281.525MB/s
> BM_string_memcmp_aligned/1024         3200 ns       3200 ns     218838   305.151MB/s
> BM_string_memcmp_aligned/8k          25084 ns      25080 ns      28180   311.507MB/s
> BM_string_memcmp_aligned/16k         51730 ns      51729 ns      13521   302.057MB/s
> BM_string_memcmp_aligned/32k        103228 ns     103228 ns       6782   302.727MB/s
> BM_string_memcmp_aligned/64k        207117 ns     207087 ns       3450   301.806MB/s
> BM_string_memcmp_unaligned/8           339 ns        339 ns    2070998   22.5302MB/s
> BM_string_memcmp_unaligned/64         1392 ns       1392 ns     502796   43.8454MB/s
> BM_string_memcmp_unaligned/512        9194 ns       9194 ns      76133   53.1104MB/s
> BM_string_memcmp_unaligned/1024      18325 ns      18323 ns      38206   53.2963MB/s
> BM_string_memcmp_unaligned/8k       148579 ns     148574 ns       4713   52.5831MB/s
> BM_string_memcmp_unaligned/16k      298169 ns     298120 ns       2344   52.4118MB/s
> BM_string_memcmp_unaligned/32k      598813 ns     598797 ns       1085    52.188MB/s
> BM_string_memcmp_unaligned/64k     1196079 ns    1196083 ns        540   52.2539MB/s
>
> hikey:/data # export LD_LIBRARY_PATH=/data
> hikey:/data # ./bionic-benchmarks --benchmark_filter='BM_string_memcmp*'
> Run on (8 X 2.4 MHz CPU s)
> Benchmark                                Time           CPU Iterations
> ----------------------------------------------------------------------
> BM_string_memcmp/8                      30 ns         30 ns   23209918   252.802MB/s
> BM_string_memcmp/64                     57 ns         57 ns   12348447   1076.95MB/s
> BM_string_memcmp/512                   305 ns        305 ns    2296878   1.56471GB/s
> BM_string_memcmp/1024                  572 ns        571 ns    1224426    1.6689GB/s
> BM_string_memcmp/8k                   4309 ns       4308 ns     162491   1.77109GB/s
> BM_string_memcmp/16k                  9348 ns       9345 ns      74894   1.63285GB/s
> BM_string_memcmp/32k                 18329 ns      18322 ns      38249    1.6656GB/s
> BM_string_memcmp/64k                 36992 ns      36981 ns      18952   1.65045GB/s
> BM_string_memcmp_aligned/8             199 ns        199 ns    3513925   38.3162MB/s
> BM_string_memcmp_aligned/64            386 ns        386 ns    1814038   158.192MB/s
> BM_string_memcmp_aligned/512          1735 ns       1735 ns     402279   281.502MB/s
> BM_string_memcmp_aligned/1024         3204 ns       3202 ns     218761   304.941MB/s
> BM_string_memcmp_aligned/8k          25577 ns      25569 ns      27406   305.548MB/s
> BM_string_memcmp_aligned/16k         52143 ns      52123 ns      13522   299.769MB/s
> BM_string_memcmp_aligned/32k        105169 ns     105127 ns       6637    297.26MB/s
> BM_string_memcmp_aligned/64k        206508 ns     206383 ns       3417   302.835MB/s
> BM_string_memcmp_unaligned/8           287 ns        287 ns    2441787   26.6141MB/s
> BM_string_memcmp_unaligned/64          556 ns        556 ns    1257709   109.764MB/s
> BM_string_memcmp_unaligned/512        2167 ns       2166 ns     323159   225.443MB/s
> BM_string_memcmp_unaligned/1024       4041 ns       4039 ns     173282   241.797MB/s
> BM_string_memcmp_unaligned/8k        32234 ns      32221 ns      21645   242.464MB/s
> BM_string_memcmp_unaligned/16k       65715 ns      65684 ns      10573   237.882MB/s
> BM_string_memcmp_unaligned/32k      133390 ns     133348 ns       5350   234.349MB/s
> BM_string_memcmp_unaligned/64k      264506 ns     264401 ns       2644   236.383MB/s
> ---
>  sysdeps/aarch64/memcmp.S | 59 +++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 58 insertions(+), 1 deletion(-)
>
> diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
> index 4cfcb89..d259831 100644
> --- a/sysdeps/aarch64/memcmp.S
> +++ b/sysdeps/aarch64/memcmp.S
> @@ -138,9 +138,66 @@ L(ret0):
>
>         .p2align 6
>  L(misaligned8):
> +       cmp     limit, #8
> +       b.lo    L(misalignedLt8)
> +
> +L(unalignedGe8):
> +
> +       /* Load the first dword with both src potentially unaligned.  */
> +       ldr     data1, [src1]
> +       ldr     data2, [src2]
> +
> +       eor     diff, data1, data2      /* Non-zero if differences found.  */
> +       cbnz    diff, L(not_limit)
> +
> +       /* Sources are not aligned align one of the sources find max offset
> +          from aligned boundary.  */
> +
> +       and     tmp1, src1, #0x7
> +       orr     tmp3, xzr, #0x8
> +       and     tmp2, src2, #0x7
> +       sub     tmp1, tmp3, tmp1
> +       sub     tmp2, tmp3, tmp2
> +       cmp     tmp1, tmp2
> +       /* Choose the maximum.  */
> +       csel    pos, tmp1, tmp2, hi
> +
> +       /* Increment SRC pointers by POS so one of the SRC pointers is word-aligned.  */
> +       add     src1, src1, pos
> +       add     src2, src2, pos
> +
> +       sub     limit, limit, pos
> +       lsr     limit_wd, limit, #3
> +
> +       cmp limit_wd, #0
> +
> +       /* Save #bytes to go back to be able to read 8byte at end
> +          pos=negative offset position to read 8 bytes when len%8 != 0.  */
> +       and     limit, limit, #7
> +       sub     pos, limit, #8
> +
> +       b       L(start_part_realigned)
> +
> +       .p2align 5
> +L(loop_part_aligned):
> +       ldr     data1, [src1], #8
> +       ldr     data2, [src2], #8
> +       subs    limit_wd, limit_wd, #1
> +L(start_part_realigned):
> +       eor     diff, data1, data2      /* Non-zero if differences found.  */
> +       cbnz    diff, L(not_limit)
> +       b.ne    L(loop_part_aligned)
> +
> +       /* Process leftover bytes: read the leftover bytes, starting with
> +          negative offset - so we can load 8 bytes.  */
> +       ldr     data1, [src1, pos]
> +       ldr     data2, [src2, pos]
> +       eor     diff, data1, data2      /* Non-zero if differences found.  */
> +       b       L(not_limit)
> +
> +L(misalignedLt8):
>         sub     limit, limit, #1
>  1:
> -       /* Perhaps we can do better than this.  */
>         ldrb    data1w, [src1], #1
>         ldrb    data2w, [src2], #1
>         subs    limit, limit, #1
> --
> 2.6.3
>
  
Sebastian Pop June 23, 2017, 10:41 p.m. UTC | #2
On Fri, Jun 23, 2017 at 5:38 PM, Ramana Radhakrishnan
<ramana.gcc@googlemail.com> wrote:
> Is there a chance you can add a benchmark to the glibc benchmarking
> framework ? For most changes like this it would be good to see if the
> existing benchmarks handle these cases and provide equivalent
> improvements so that any changes to this can be rationalized in the
> future.

Sure, I can adapt the benchmark that we submitted to AOSP to fit in
the glibc benchmark.
Do you have a pointer to where memcmp is currently benchmarked?
That would be a good place for the unaligned memcmp testcase.

Thanks,
Sebastian
  
Ramana Radhakrishnan June 23, 2017, 10:50 p.m. UTC | #3
On Fri, Jun 23, 2017 at 11:41 PM, Sebastian Pop <sebpop@gmail.com> wrote:
> On Fri, Jun 23, 2017 at 5:38 PM, Ramana Radhakrishnan
> <ramana.gcc@googlemail.com> wrote:
>> Is there a chance you can add a benchmark to the glibc benchmarking
>> framework ? For most changes like this it would be good to see if the
>> existing benchmarks handle these cases and provide equivalent
>> improvements so that any changes to this can be rationalized in the
>> future.
>
> Sure, I can adapt the benchmark that we submitted to AOSP to fit in
> the glibc benchmark.
> Do you have a pointer to where memcmp is currently benchmarked?
> That would be a good place for the unaligned memcmp testcase.


https://sourceware.org/git/?p=glibc.git;a=blob;f=benchtests/bench-memcmp.c;h=96d01a1e7c6c0188b8fd0cf6d9c91f4a4fa8ace5;hb=96d01a1e7c6c0188b8fd0cf6d9c91f4a4fa8ace5

IIRC you just run make bench or benchtests - haven't done this for a while now.

I couldn't find the information on the wiki easily - is there
something that can link to this in the contribution checklist ?
Maintainers ?

regards
Ramana
>
> Thanks,
> Sebastian
  

Patch

diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 4cfcb89..d259831 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -138,9 +138,66 @@  L(ret0):
 
 	.p2align 6
 L(misaligned8):
+	cmp	limit, #8
+	b.lo	L(misalignedLt8)
+
+L(unalignedGe8):
+
+	/* Load the first dword with both src potentially unaligned.  */
+	ldr	data1, [src1]
+	ldr	data2, [src2]
+
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	cbnz	diff, L(not_limit)
+
+	/* Sources are not aligned align one of the sources find max offset
+	   from aligned boundary.  */
+
+	and	tmp1, src1, #0x7
+	orr	tmp3, xzr, #0x8
+	and	tmp2, src2, #0x7
+	sub	tmp1, tmp3, tmp1
+	sub	tmp2, tmp3, tmp2
+	cmp	tmp1, tmp2
+	/* Choose the maximum.  */
+	csel	pos, tmp1, tmp2, hi
+
+	/* Increment SRC pointers by POS so one of the SRC pointers is word-aligned.  */
+	add	src1, src1, pos
+	add	src2, src2, pos
+
+	sub	limit, limit, pos
+	lsr	limit_wd, limit, #3
+
+	cmp limit_wd, #0
+
+	/* Save #bytes to go back to be able to read 8byte at end
+	   pos=negative offset position to read 8 bytes when len%8 != 0.  */
+	and	limit, limit, #7
+	sub	pos, limit, #8
+
+	b	L(start_part_realigned)
+
+	.p2align 5
+L(loop_part_aligned):
+	ldr	data1, [src1], #8
+	ldr	data2, [src2], #8
+	subs	limit_wd, limit_wd, #1
+L(start_part_realigned):
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	cbnz	diff, L(not_limit)
+	b.ne	L(loop_part_aligned)
+
+	/* Process leftover bytes: read the leftover bytes, starting with
+	   negative offset - so we can load 8 bytes.  */
+	ldr	data1, [src1, pos]
+	ldr	data2, [src2, pos]
+	eor	diff, data1, data2	/* Non-zero if differences found.  */
+	b	L(not_limit)
+
+L(misalignedLt8):
 	sub	limit, limit, #1
 1:
-	/* Perhaps we can do better than this.  */
 	ldrb	data1w, [src1], #1
 	ldrb	data2w, [src2], #1
 	subs	limit, limit, #1