From patchwork Mon Oct  1 16:22:19 2018
X-Patchwork-Submitter: Anton Youdkevitch
X-Patchwork-Id: 29599
Date: Mon, 1 Oct 2018 19:22:19 +0300
From: Anton Youdkevitch
To: Siddhesh Poyarekar, Steve Ellcey, libc-alpha@sourceware.org
Subject: Re: [PATCH] aarch64: optimized memcpy implementation for thunderx2
Message-ID: <20181001162219.GA8242@bell-sw.com>
References: <2063a582-d65f-9e9f-50f5-80e4502edbd8@gotplt.org>
 <1538408223.18948.85.camel@cavium.com>
 <0899c6de-9462-8cca-5283-adc263d4b650@gotplt.org>
In-Reply-To: <0899c6de-9462-8cca-5283-adc263d4b650@gotplt.org>

Below is the benchmark data along with the slightly modified
implementation.

On Mon, Oct 01, 2018 at 09:11:47PM +0530, Siddhesh Poyarekar wrote:
> On 01/10/18 9:07 PM, Steve Ellcey wrote:
> > Anton is doing this work under a contract with Cavium.  Cavium has a
> > corporate copyright assignment on file with FSF so I am hoping that
> > is sufficient for Anton to make contributions.  I don't have my login
> > to fencepost anymore so I can't double check the specifics of the
> > Cavium copyright assignment myself but my understanding is that it is
> > a general one that does not restrict contributions to be from just
> > a list of specific people.
>
> Thanks for the clarification Steve.  That should be sufficient assuming
> that Anton's work is owned by Cavium.
>
> As for the patch itself, I'll leave it to you for a deeper review since
> you're the better judge for tx2 performance.  It would still be nice to
> see the results from memcpy-walk and not just memcpy-large, assuming
> that's what Anton tested with.
>
> The patch needs a ChangeLog too.
>
> Thanks,
> Siddhesh

memcpy-large, the original T2 implementation as the baseline

Length:     Old:        New:
65543: 3592.50 3432.19 ( 4.46%)
65551: 6554.06 4035.31 ( 38.43%)
65567: 6563.75 4028.75 ( 38.62%)
65599: 6727.19 4028.44 ( 40.12%)
131079: 7289.06 7002.81 ( 3.93%)
131087: 12587.80 8230.94 ( 34.61%)
131103: 12563.10 8226.56 ( 34.52%)
131135: 12901.60 8247.50 ( 36.07%)
262151: 20497.20 19074.10 ( 6.94%)
262159: 25104.10 20833.80 ( 17.01%)
262175: 24629.10 20776.60 ( 15.64%)
262207: 25103.80 20922.50 ( 16.66%)
524295: 41298.10 38061.90 ( 7.84%)
524303: 50114.10 41634.70 ( 16.92%)
524319: 49232.80 41417.50 ( 15.87%)
524351: 50357.20 41590.00 ( 17.41%)
1048583: 81622.80 75968.80 ( 6.93%)
1048591: 100746.00 82978.10 ( 17.64%)
1048607: 98254.70 82763.40 ( 15.77%)
1048639: 100326.00 83549.40 ( 16.72%)
2097159: 163283.00 152319.00 ( 6.00%)
2097167: 201279.00 165965.00 ( 17.00%)
2097183: 196811.00 165500.00 ( 15.00%)
2097215: 200769.00 166158.00 ( 17.00%)
4194311: 326463.00 303574.00 ( 7.00%)
4194319: 401840.00 331810.00 ( 17.00%)
4194335: 394120.00 330547.00 ( 16.00%)
4194367: 401803.00 331893.00 ( 17.00%)
8388615: 652982.00 607675.00 ( 6.00%)
8388623: 802170.00 663562.00 ( 17.00%)
8388639: 786795.00 661482.00 ( 15.00%)
8388671: 802130.00 663625.00 ( 17.00%)
16777223: 1404170.00 1272160.00 ( 9.40%)
16777231: 1611260.00 1339180.00 ( 16.89%)
16777247: 1573870.00 1326400.00 ( 15.72%)
16777279: 1604780.00 1330200.00 ( 17.11%)
33554439: 3700280.00 3167250.00 ( 14.41%)
33554447: 4085260.00 3647450.00 ( 10.72%)
33554463: 4006190.00 3636820.00 ( 9.22%)
33554495: 4081450.00 3635760.00 ( 10.92%)

memcpy-walk, the original T2 implementation as the baseline

Length:     Old:        New:
128: 10.56 9.87 ( 6.59%)	144: 10.68 10.35 ( 3.13%)
129: 14.68 12.56 ( 14.47%)	143: 16.75 13.70 ( 18.18%)
130: 14.03 12.19 ( 13.08%)	142: 15.82 13.21 ( 16.50%)
131: 15.05 12.91 ( 14.21%)	141: 16.57 13.63 ( 17.72%)
132: 12.74 11.46 ( 10.05%)	140: 13.92 12.14 ( 12.79%)
133: 15.36 13.07 ( 14.88%)	139: 16.18 13.49 ( 16.63%)
134: 14.67 12.64 ( 13.86%)	138: 15.21 12.99 ( 14.59%)
135: 15.65 13.24 ( 15.43%)	137: 15.91 13.42 ( 15.63%)
256: 32.36 19.82 ( 38.75%)	272: 20.73 20.53 ( 0.97%)
257: 23.91 22.61 ( 5.46%)	271: 25.90 23.71 ( 8.46%)
258: 23.41 21.59 ( 7.77%)	270: 25.10 22.83 ( 9.03%)
259: 23.97 22.08 ( 7.90%)	269: 25.44 23.17 ( 8.89%)
260: 22.24 21.08 ( 5.22%)	268: 23.47 21.72 ( 7.45%)
261: 24.35 22.12 ( 9.14%)	267: 25.22 23.18 ( 8.07%)
262: 23.65 21.83 ( 7.72%)	266: 25.04 22.17 ( 11.45%)
263: 24.60 22.29 ( 9.41%)	265: 25.13 22.53 ( 10.34%)
512: 60.10 39.10 ( 34.94%)	528: 38.97 38.98 ( -0.03%)
513: 43.74 41.41 ( 5.31%)	527: 45.35 42.24 ( 6.85%)
514: 43.55 41.21 ( 5.38%)	526: 44.93 41.88 ( 6.79%)
515: 44.34 41.69 ( 5.98%)	525: 45.37 42.39 ( 6.56%)
516: 42.98 41.34 ( 3.81%)	524: 43.61 41.62 ( 4.57%)
517: 44.76 41.70 ( 6.85%)	523: 45.29 42.03 ( 7.19%)
518: 43.98 41.17 ( 6.39%)	522: 44.47 41.37 ( 6.96%)
519: 44.62 41.90 ( 6.10%)	521: 44.82 41.84 ( 6.65%)
1024: 92.67 78.24 ( 15.57%)	1040: 75.14 75.48 ( -0.46%)
1025: 82.23 78.58 ( 4.44%)	1039: 80.86 78.48 ( 2.95%)
1026: 81.74 78.61 ( 3.83%)	1038: 81.07 78.64 ( 3.00%)
1027: 82.62 78.24 ( 5.30%)	1037: 81.87 78.26 ( 4.41%)
1028: 81.50 76.85 ( 5.71%)	1036: 80.41 77.37 ( 3.78%)
1029: 82.85 77.57 ( 6.38%)	1035: 81.88 77.52 ( 5.33%)
1030: 82.27 76.73 ( 6.73%)	1034: 80.61 76.51 ( 5.08%)
1031: 82.95 77.28 ( 6.84%)	1033: 82.22 77.46 ( 5.79%)
2048: 162.40 143.79 ( 11.46%)	2064: 141.59 139.96 ( 1.16%)
2049: 151.28 143.47 ( 5.16%)	2063: 147.76 143.20 ( 3.09%)
2050: 150.12 143.27 ( 4.57%)	2062: 147.95 142.99 ( 3.35%)
2051: 150.78 143.21 ( 5.02%)	2061: 149.21 142.89 ( 4.24%)
2052: 149.21 141.79 ( 4.97%)	2060: 147.81 142.17 ( 3.82%)
2053: 151.31 142.50 ( 5.82%)	2059: 149.39 142.10 ( 4.88%)
2054: 150.49 141.93 ( 5.69%)	2058: 149.70 141.49 ( 5.49%)
2055: 150.97 142.20 ( 5.81%)	2057: 149.50 142.35 ( 4.79%)
4096: 303.24 281.71 ( 7.10%)	4112: 276.36 272.07 ( 1.55%)
4097: 289.49 279.64 ( 3.40%)	4111: 285.08 278.72 ( 2.23%)
4098: 288.09 279.75 ( 2.90%)	4110: 285.33 277.88 ( 2.61%)
4099: 288.90 279.66 ( 3.20%)	4109: 285.42 277.08 ( 2.92%)
4100: 286.11 276.89 ( 3.22%)	4108: 285.30 275.74 ( 3.35%)
4101: 287.57 276.41 ( 3.88%)	4107: 287.46 275.66 ( 4.11%)
4102: 287.98 274.84 ( 4.56%)	4106: 287.78 276.39 ( 3.96%)
4103: 287.88 275.74 ( 4.21%)	4105: 286.37 275.38 ( 3.84%)
8192: 584.91 546.80 ( 6.52%)	8208: 567.20 548.22 ( 3.35%)
8193: 570.20 550.95 ( 3.38%)	8207: 575.19 552.11 ( 4.01%)
8194: 570.07 552.84 ( 3.02%)	8206: 574.37 551.27 ( 4.02%)
8195: 571.65 550.30 ( 3.74%)	8205: 573.86 550.05 ( 4.15%)
8196: 569.58 550.20 ( 3.40%)	8204: 571.56 548.25 ( 4.08%)
8197: 571.27 549.32 ( 3.84%)	8203: 574.61 549.93 ( 4.29%)
8198: 574.07 549.76 ( 4.23%)	8202: 574.54 548.60 ( 4.51%)
8199: 574.75 550.24 ( 4.26%)	8201: 572.30 549.55 ( 3.97%)
16384: 1119.46 1071.00 ( 4.33%)	16400: 1103.77 1074.23 ( 2.68%)
16385: 1105.45 1078.87 ( 2.40%)	16399: 1106.27 1080.64 ( 2.32%)
16386: 1103.68 1075.02 ( 2.60%)	16398: 1105.78 1073.91 ( 2.88%)
16387: 1105.48 1074.46 ( 2.81%)	16397: 1111.44 1074.85 ( 3.29%)
16388: 1108.83 1074.96 ( 3.05%)	16396: 1110.16 1075.69 ( 3.10%)
16389: 1104.51 1073.80 ( 2.78%)	16395: 1105.41 1075.22 ( 2.73%)
16390: 1104.02 1082.29 ( 1.97%)	16394: 1104.55 1079.86 ( 2.24%)
16391: 1104.67 1079.74 ( 2.26%)	16393: 1106.43 1075.68 ( 2.78%)
32768: 2166.84 2120.03 ( 2.16%)	32784: 2151.61 2123.45 ( 1.31%)
32769: 2159.75 2120.54 ( 1.82%)	32783: 2161.50 2122.79 ( 1.79%)
32770: 2150.59 2123.10 ( 1.28%)	32782: 2151.40 2122.74 ( 1.33%)
32771: 2154.45 2135.24 ( 0.89%)	32781: 2150.22 2132.97 ( 0.80%)
32772: 2151.94 2125.86 ( 1.21%)	32780: 2152.97 2125.27 ( 1.29%)
32773: 2152.49 2123.85 ( 1.33%)	32779: 2163.28 2124.11 ( 1.81%)
32774: 2162.51 2122.53 ( 1.85%)	32778: 2151.93 2123.59 ( 1.32%)
32775: 2151.68 2123.54 ( 1.31%)	32777: 2149.72 2132.82 ( 0.79%)
65536: 4295.72 4239.57 ( 1.31%)	65552: 4283.87 4222.39 ( 1.44%)
65537: 4274.55 4214.65 ( 1.40%)	65551: 4280.03 4221.12 ( 1.38%)
65538: 4297.00 4219.75 ( 1.80%)	65550: 4315.21 4219.55 ( 2.22%)
65539: 4278.71 4220.25 ( 1.37%)	65549: 4277.82 4222.09 ( 1.30%)
65540: 4279.59 4245.00 ( 0.81%)	65548: 4281.26 4247.10 ( 0.80%)
65541: 4280.69 4224.98 ( 1.30%)	65547: 4279.46 4221.36 ( 1.36%)
65542: 4292.06 4232.92 ( 1.38%)	65546: 4299.94 4218.97 ( 1.88%)
65543: 4303.29 4223.02 ( 1.87%)	65545: 4280.18 4221.31 ( 1.38%)
131072: 8539.06 8407.54 ( 1.54%)	131088: 8531.11 8466.35 ( 0.76%)
131073: 8530.33 8455.33 ( 0.88%)	131087: 8533.24 8415.62 ( 1.38%)
131074: 8527.70 8412.99 ( 1.35%)	131086: 8533.75 8413.98 ( 1.40%)
131075: 8570.35 8412.75 ( 1.84%)	131085: 8575.33 8420.96 ( 1.80%)
131076: 8529.16 8414.49 ( 1.34%)	131084: 8530.96 8450.14 ( 0.95%)
131077: 8527.81 8455.23 ( 0.85%)	131083: 8530.29 8412.71 ( 1.38%)
131078: 8530.68 8415.02 ( 1.36%)	131082: 8526.46 8412.15 ( 1.34%)
131079: 8573.24 8415.12 ( 1.84%)	131081: 8563.96 8409.38 ( 1.81%)
262144: 17040.60 16801.50 ( 1.40%)	262160: 17051.10 16815.00 ( 1.38%)
262145: 17047.60 16902.20 ( 0.85%)	262159: 17042.10 16893.20 ( 0.87%)
262146: 17039.40 16800.20 ( 1.40%)	262158: 17042.50 16807.00 ( 1.38%)
262147: 17038.40 16798.30 ( 1.41%)	262157: 17116.00 16808.10 ( 1.80%)
262148: 17109.80 16800.40 ( 1.81%)	262156: 17040.50 16807.30 ( 1.37%)
262149: 17029.20 16878.90 ( 0.88%)	262155: 17029.70 16886.00 ( 0.84%)
262150: 17035.20 16799.50 ( 1.38%)	262154: 17035.60 16803.10 ( 1.36%)
262151: 17037.70 16802.10 ( 1.38%)	262153: 17106.10 16812.70 ( 1.72%)
524288: 34204.90 33576.70 ( 1.84%)	524304: 34076.60 33594.10 ( 1.42%)
524289: 34072.60 33616.60 ( 1.34%)	524303: 34040.40 33733.00 ( 0.90%)
524290: 34064.90 33754.80 ( 0.91%)	524302: 34059.30 33602.60 ( 1.34%)
524291: 34077.50 33598.00 ( 1.41%)	524301: 34060.20 33585.50 ( 1.39%)
524292: 34220.10 33601.80 ( 1.81%)	524300: 34051.60 33600.30 ( 1.33%)
524293: 34071.20 33594.80 ( 1.40%)	524299: 34054.70 33740.30 ( 0.92%)
524294: 34067.20 33762.40 ( 0.89%)	524298: 34066.80 33597.30 ( 1.38%)
524295: 34049.90 33587.70 ( 1.36%)	524297: 34048.30 33578.80 ( 1.38%)
1048576: 68512.70 67215.50 ( 1.89%)	1048592: 68456.10 67137.00 ( 1.93%)
1048577: 68161.20 67191.10 ( 1.42%)	1048591: 68101.60 67146.60 ( 1.40%)
1048578: 68105.50 67527.30 ( 0.85%)	1048590: 68120.60 67532.50 ( 0.86%)
1048579: 68123.30 67158.90 ( 1.42%)	1048589: 68109.40 67128.10 ( 1.44%)
1048580: 68420.80 67141.20 ( 1.87%)	1048588: 68387.70 67105.90 ( 1.87%)
1048581: 68111.40 67163.60 ( 1.39%)	1048587: 68100.20 67156.40 ( 1.39%)
1048582: 68079.20 67464.20 ( 0.90%)	1048586: 68092.00 67579.80 ( 0.75%)
1048583: 68103.40 67150.20 ( 1.40%)	1048585: 68100.30 67154.80 ( 1.39%)
2097152: 135942.00 134168.00 ( 1.00%)	2097168: 136859.00 134261.00 ( 1.00%)
2097153: 137141.00 134278.00 ( 2.00%)	2097167: 136145.00 134289.00 ( 1.00%)
2097154: 136326.00 134327.00 ( 1.00%)	2097166: 136221.00 134941.00 ( 0.00%)
2097155: 136244.00 134299.00 ( 1.00%)	2097165: 136273.00 134367.00 ( 1.00%)
2097156: 136221.00 134286.00 ( 1.00%)	2097164: 136793.00 134281.00 ( 1.00%)
2097157: 136947.00 134346.00 ( 1.00%)	2097163: 136241.00 134288.00 ( 1.00%)
2097158: 136256.00 134288.00 ( 1.00%)	2097162: 136229.00 134982.00 ( 0.00%)
2097159: 136227.00 134913.00 ( 0.00%)	2097161: 136176.00 134265.00 ( 1.00%)
4194304: 271842.00 268390.00 ( 1.00%)	4194320: 272394.00 268479.00 ( 1.00%)
4194305: 274327.00 268534.00 ( 2.00%)	4194319: 273587.00 268584.00 ( 1.00%)
4194306: 272976.00 268614.00 ( 1.00%)	4194318: 272294.00 269878.00 ( 0.00%)
4194307: 272749.00 269894.00 ( 1.00%)	4194317: 272459.00 268622.00 ( 1.00%)
4194308: 272632.00 268565.00 ( 1.00%)	4194316: 272439.00 268609.00 ( 1.00%)
4194309: 273908.00 268734.00 ( 1.00%)	4194315: 273827.00 268492.00 ( 1.00%)
4194310: 272508.00 268573.00 ( 1.00%)	4194314: 272512.00 268566.00 ( 1.00%)
4194311: 272395.00 269761.00 ( 0.00%)	4194313: 272305.00 269994.00 ( 0.00%)
8388608: 543440.00 536600.00 ( 1.00%)	8388624: 544635.00 536972.00 ( 1.00%)
8388609: 548560.00 537180.00 ( 2.00%)	8388623: 547458.00 537086.00 ( 1.00%)
8388610: 545852.00 537179.00 ( 1.00%)	8388622: 544720.00 536976.00 ( 1.00%)
8388611: 545748.00 539724.00 ( 1.00%)	8388621: 544660.00 539579.00 ( 0.00%)
8388612: 545815.00 536950.00 ( 1.00%)	8388620: 544914.00 537276.00 ( 1.00%)
8388613: 545538.00 537080.00 ( 1.00%)	8388619: 549975.00 537315.00 ( 2.00%)
8388614: 548025.00 537379.00 ( 1.00%)	8388618: 544850.00 538511.00 ( 1.00%)
8388615: 546555.00 538226.00 ( 1.00%)	8388617: 544815.00 540246.00 ( 0.00%)
16777216: 1086650.00 1072510.00 ( 1.30%)	16777232: 1090790.00 1073660.00 ( 1.57%)
16777217: 1091930.00 1074220.00 ( 1.62%)	16777231: 1096050.00 1074300.00 ( 1.98%)
16777218: 1096680.00 1074520.00 ( 2.02%)	16777230: 1090850.00 1074100.00 ( 1.54%)
16777219: 1091580.00 1074500.00 ( 1.56%)	16777229: 1090860.00 1079600.00 ( 1.03%)
16777220: 1091380.00 1079380.00 ( 1.10%)	16777228: 1091350.00 1074350.00 ( 1.56%)
16777221: 1091300.00 1074090.00 ( 1.58%)	16777227: 1090770.00 1074680.00 ( 1.48%)
16777222: 1096710.00 1074380.00 ( 2.04%)	16777226: 1095200.00 1073920.00 ( 1.94%)
16777223: 1091620.00 1074550.00 ( 1.56%)	16777225: 1091020.00 1074570.00 ( 1.51%)
33554432: 2174840.00 2156700.00 ( 0.83%)	33554448: 2182480.00 2148180.00 ( 1.57%)
33554433: 2183340.00 2147960.00 ( 1.62%)	33554447: 2184400.00 2149360.00 ( 1.60%)
33554434: 2192560.00 2149040.00 ( 1.98%)	33554446: 2194490.00 2149140.00 ( 2.07%)
33554435: 2183870.00 2146940.00 ( 1.69%)	33554445: 2183470.00 2148720.00 ( 1.59%)
33554436: 2183370.00 2158320.00 ( 1.15%)	33554444: 2182820.00 2159310.00 ( 1.08%)
33554437: 2183770.00 2149160.00 ( 1.58%)	33554443: 2183190.00 2147080.00 ( 1.65%)
33554438: 2183250.00 2148280.00 ( 1.60%)	33554442: 2193750.00 2148190.00 ( 2.08%)
33554439: 2183280.00 2147770.00 ( 1.63%)	33554441: 2183830.00 2148430.00 ( 1.62%)

2018-10-01  Anton Youdkevitch

	* sysdeps/aarch64/multiarch/memcpy_thunderx.S: Remove the
	thunderx2-specific code paths.
	* sysdeps/aarch64/multiarch/memcpy_thunderx2.S: Rewrite the
	implementation to account for ThunderX2 chip specifics.

diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx.S b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
index de494d9..6000365 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx.S
@@ -74,13 +74,10 @@
 #if IS_IN (libc)
 
-# ifndef USE_THUNDERX2
 
 # undef MEMCPY
 # define MEMCPY __memcpy_thunderx
 # undef MEMMOVE
 # define MEMMOVE __memmove_thunderx
-# define USE_THUNDERX
 
-# endif
 
 ENTRY_ALIGN (MEMMOVE, 6)
@@ -182,8 +179,6 @@ L(copy96):
 
 	.p2align 4
 L(copy_long):
-# if defined(USE_THUNDERX) || defined (USE_THUNDERX2)
-
 	/* On thunderx, large memcpy's are helped by software prefetching.
 	   This loop is identical to the one below it but with prefetching
 	   instructions included.  For loops that are less than 32768 bytes,
@@ -196,11 +191,7 @@ L(copy_long):
 	bic	dst, dstin, 15
 	ldp	D_l, D_h, [src]
 	sub	src, src, tmp1
-# if defined(USE_THUNDERX)
 	prfm	pldl1strm, [src, 384]
-# elif defined(USE_THUNDERX2)
-	prfm	pldl1strm, [src, 256]
-# endif
 	add	count, count, tmp1	/* Count is now 16 too large.  */
 	ldp	A_l, A_h, [src, 16]
 	stp	D_l, D_h, [dstin]
@@ -210,13 +201,9 @@
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
 
 L(prefetch_loop64):
-# if defined(USE_THUNDERX)
 	tbz	src, #6, 1f
 	prfm	pldl1strm, [src, 512]
 1:
-# elif defined(USE_THUNDERX2)
-	prfm	pldl1strm, [src, 256]
-# endif
 	stp	A_l, A_h, [dst, 16]
 	ldp	A_l, A_h, [src, 16]
 	stp	B_l, B_h, [dst, 32]
@@ -230,7 +217,6 @@ L(prefetch_loop64):
 	b	L(last64)
 
 L(copy_long_without_prefetch):
-# endif
 
 	and	tmp1, dstin, 15
 	bic	dst, dstin, 15
diff --git a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
index 8501abf..945d1e8 100644
--- a/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
+++ b/sysdeps/aarch64/multiarch/memcpy_thunderx2.S
@@ -20,8 +20,997 @@
 /* The actual code in this memcpy and memmove is in memcpy_thunderx.S.
    The only real differences are with the prefetching instructions.  */
 
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
+ *
+ */
+
+#define dstin	x0
+#define src	x1
+#define count	x2
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define tmp2	x6
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define A_hw	w7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	src
+#define E_h	count
+#define F_l	srcend
+#define F_h	dst
+#define G_l	count
+#define G_h	dst
+#define tmp1	x14
+
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
+#define H_q	q7
+#define I_q	q16
+#define J_q	q17
+
+#define A_v	v0
+#define B_v	v1
+#define C_v	v2
+#define D_v	v3
+#define E_v	v4
+#define F_v	v5
+#define G_v	v6
+#define H_v	v7
+#define I_v	v16
+#define J_v	v17
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+#if IS_IN (libc)
+
+#undef MEMCPY
+#undef MEMMOVE
 #define MEMCPY __memcpy_thunderx2
 #define MEMMOVE __memmove_thunderx2
 
-#define USE_THUNDERX2
-#include "memcpy_thunderx.S"
+
+/* Copies are split into 3 main cases: small copies of up to 16 bytes,
+   medium copies of 17..96 bytes which are fully unrolled, and large
+   copies of more than 96 bytes which align the destination and use an
+   unrolled loop processing 64 bytes per iteration.
+   The current optimized memcpy implementation is not compatible with
+   memmove and is separated from it completely.  See below.
+   Overlapping large forward memmoves use a loop that copies backwards.  */
+
+ENTRY_ALIGN (MEMMOVE, 6)
+
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	sub	tmp1, dstin, src
+	cmp	count, 96
+	ccmp	tmp1, count, 2, hi
+	b.lo	L(move_long)
+
+	prfm	PLDL1KEEP, [src]
+	add	srcend, src, count
+	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
+	cmp	count, 96
+	b.hi	L(copy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..16 bytes.  */
+L(copy16):
+	cmp	count, 8
+	b.lo	1f
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
+1:
+	tbz	count, 2, 1f
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+1:
+	cbz	count, 2f
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+2:	ret
+
+	.p2align 4
+	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
+	   32 bytes from the end.  */
+L(copy96):
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [src, 32]
+	ldp	D_l, D_h, [src, 48]
+	ldp	E_l, E_h, [srcend, -32]
+	ldp	F_l, F_h, [srcend, -16]
+	stp	A_l, A_h, [dstin]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin, 32]
+	stp	D_l, D_h, [dstin, 48]
+	stp	E_l, E_h, [dstend, -32]
+	stp	F_l, F_h, [dstend, -16]
+	ret
+
+	/* Align DST to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	.p2align 4
+L(copy_long):
+	and	tmp1, dstin, 15
+	bic	dst, dstin, 15
+	ldp	D_l, D_h, [src]
+	sub	src, src, tmp1
+	add	count, count, tmp1	/* Count is now 16 too large.  */
+	ldp	A_l, A_h, [src, 16]
+	stp	D_l, D_h, [dstin]
+	ldp	B_l, B_h, [src, 32]
+	ldp	C_l, C_h, [src, 48]
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 128 + 16	/* Test and readjust count.  */
+	b.ls	L(last64)
+L(loop64):
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [src, 16]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [src, 32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [src, 48]
+	stp	D_l, D_h, [dst, 64]!
+	ldp	D_l, D_h, [src, 64]!
+	subs	count, count, 64
+	b.hi	L(loop64)
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the end even if
+	   there is just 1 byte left.  */
+L(last64):
+	ldp	E_l, E_h, [srcend, -64]
+	stp	A_l, A_h, [dst, 16]
+	ldp	A_l, A_h, [srcend, -48]
+	stp	B_l, B_h, [dst, 32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dst, 48]
+	ldp	C_l, C_h, [srcend, -16]
+	stp	D_l, D_h, [dst, 64]
+	stp	E_l, E_h, [dstend, -64]
+	stp	A_l, A_h, [dstend, -48]
+	stp	B_l, B_h, [dstend, -32]
+	stp	C_l, C_h, [dstend, -16]
+	ret
+
+	.p2align 4
+L(move_long):
+	cbz	tmp1, 3f
+
+	add	srcend, src, count
+	add	dstend, dstin, count
+
+	/* Align dstend to 16 byte alignment so that we don't cross cache line
+	   boundaries on both loads and stores.  There are at least 96 bytes
+	   to copy, so copy 16 bytes unaligned and then align.  The loop
+	   copies 64 bytes per iteration and prefetches one iteration ahead.  */
+
+	and	tmp1, dstend, 15
+	ldp	D_l, D_h, [srcend, -16]
+	sub	srcend, srcend, tmp1
+	sub	count, count, tmp1
+	ldp	A_l, A_h, [srcend, -16]
+	stp	D_l, D_h, [dstend, -16]
+	ldp	B_l, B_h, [srcend, -32]
+	ldp	C_l, C_h, [srcend, -48]
+	ldp	D_l, D_h, [srcend, -64]!
+	sub	dstend, dstend, tmp1
+	subs	count, count, 128
+	b.ls	2f
+
+	nop
+1:
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [srcend, -16]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [srcend, -32]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [srcend, -48]
+	stp	D_l, D_h, [dstend, -64]!
+	ldp	D_l, D_h, [srcend, -64]!
+	subs	count, count, 64
+	b.hi	1b
+
+	/* Write the last full set of 64 bytes.  The remainder is at most 64
+	   bytes, so it is safe to always copy 64 bytes from the start even if
+	   there is just 1 byte left.  */
+2:
+	ldp	G_l, G_h, [src, 48]
+	stp	A_l, A_h, [dstend, -16]
+	ldp	A_l, A_h, [src, 32]
+	stp	B_l, B_h, [dstend, -32]
+	ldp	B_l, B_h, [src, 16]
+	stp	C_l, C_h, [dstend, -48]
+	ldp	C_l, C_h, [src]
+	stp	D_l, D_h, [dstend, -64]
+	stp	G_l, G_h, [dstin, 48]
+	stp	A_l, A_h, [dstin, 32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstin]
+3:	ret
+
+END (MEMMOVE)
+libc_hidden_builtin_def (MEMMOVE)
+
+
+/* The memcpy implementation below is not compatible with memmove
+   because of its pipelined loads/stores, which are faster but
+   cannot be used when the source and destination buffers overlap
+   as memmove must allow.  */
+
+#define MEMCPY_PREFETCH_LDR 640
+
+ENTRY (MEMCPY)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+
+	add	srcend, src, count
+	cmp	count, 16
+	b.ls	L(memcopy16)
+	ldr	A_q, [src], #16
+	add	dstend, dstin, count
+	and	tmp1, src, 15
+	cmp	count, 96
+	b.hi	L(memcopy_long)
+
+	/* Medium copies: 17..96 bytes.  */
+	ldr	E_q, [srcend, -16]
+	cmp	count, 64
+	b.gt	L(memcpy_copy96)
+	cmp	count, 48
+	b.le	L(bytes_17_to_48)
+	/* 49..64 bytes */
+	ldp	B_q, C_q, [src]
+	str	E_q, [dstend, -16]
+	stp	A_q, B_q, [dstin]
+	str	C_q, [dstin, 32]
+	ret
+
+L(bytes_17_to_48):
+	/* 17..48 bytes */
+	cmp	count, 32
+	b.gt	L(bytes_32_to_48)
+	/* 17..32 bytes */
+	str	A_q, [dstin]
+	str	E_q, [dstend, -16]
+	ret
+
+L(bytes_32_to_48):
+	/* 32..48 bytes */
+	ldr	B_q, [src]
+	str	A_q, [dstin]
+	str	E_q, [dstend, -16]
+	str	B_q, [dstin, 16]
+	ret
+
+	.p2align 4
+	/* Small copies: 0..16 bytes.  */
+L(memcopy16):
+	cmp	count, 8
+	b.lo	L(bytes_0_to_8)
+	ldr	A_l, [src]
+	ldr	A_h, [srcend, -8]
+	add	dstend, dstin, count
+	str	A_l, [dstin]
+	str	A_h, [dstend, -8]
+	ret
+	.p2align 4
+
+L(bytes_0_to_8):
+	tbz	count, 2, L(bytes_0_to_3)
+	ldr	A_lw, [src]
+	ldr	A_hw, [srcend, -4]
+	add	dstend, dstin, count
+	str	A_lw, [dstin]
+	str	A_hw, [dstend, -4]
+	ret
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
+L(bytes_0_to_3):
+	cbz	count, L(end)
+	lsr	tmp1, count, 1
+	ldrb	A_lw, [src]
+	ldrb	A_hw, [srcend, -1]
+	add	dstend, dstin, count
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
+L(end):	ret
+
+	.p2align 4
+
+L(memcpy_copy96):
+	/* Copying 65..96 bytes.  A_q (first 16 bytes) and
+	   E_q (last 16 bytes) are already loaded.
+
+	   The size is large enough to benefit from aligned
+	   loads.  */
+	bic	src, src, 15
+	ldp	B_q, C_q, [src]
+	str	A_q, [dstin]
+	/* Loaded 64 bytes; the second 16-byte chunk can overlap
+	   the first chunk by tmp1 bytes.  Stored 16 bytes.  */
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1
+	/* The range of count being [65..96] becomes [65..111]
+	   after tmp1 [0..15] gets added to it,
+	   count now is +48.  */
+	cmp	count, 80
+	b.gt	L(copy96_medium)
+	ldr	D_q, [src, 32]
+	stp	B_q, C_q, [dst, 16]
+	str	E_q, [dstend, -16]
+	str	D_q, [dst, 48]
+	ret
+
+	.p2align 4
+L(copy96_medium):
+	ldp	D_q, A_q, [src, 32]
+	str	B_q, [dst, 16]
+	cmp	count, 96
+	b.gt	L(copy96_large)
+	str	E_q, [dstend, -16]
+	stp	C_q, D_q, [dst, 32]
+	str	A_q, [dst, 64]
+	ret
+
+L(copy96_large):
+	ldr	F_q, [src, 64]
+	stp	C_q, D_q, [dst, 32]
+	str	E_q, [dstend, -16]
+	stp	A_q, F_q, [dst, 64]
+	ret
+
+	.p2align 4
+L(memcopy_long):
+	bic	src, src, 15
+	ldp	B_q, C_q, [src], #32
+	str	A_q, [dstin]
+	sub	dst, dstin, tmp1
+	add	count, count, tmp1
+	add	dst, dst, 16
+	and	tmp1, dst, 15
+	ldp	D_q, E_q, [src], #32
+	str	B_q, [dst], #16
+
+	/* Already loaded 64+16 bytes.  Check if at
+	   least 64 more bytes are left.  */
+	subs	count, count, 64+64+16
+	b.lt	L(loop128_exit2)
+	cmp	count, MEMCPY_PREFETCH_LDR + 64 + 32
+	b.lt	L(loop128)
+	cbnz	tmp1, L(dst_unaligned)
+	sub	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+
+	.p2align 4
+
+L(loop128_prefetch):
+	str	C_q, [dst], #16
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	str	D_q, [dst], #16
+	ldp	F_q, G_q, [src], #32
+	str	E_q, [dst], #16
+	ldp	H_q, A_q, [src], #32
+	str	F_q, [dst], #16
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	str	G_q, [dst], #16
+	ldp	B_q, C_q, [src], #32
+	str	H_q, [dst], #16
+	ldp	D_q, E_q, [src], #32
+	stp	A_q, B_q, [dst], #32
+	subs	count, count, 128
+	b.ge	L(loop128_prefetch)
+
+L(preloop128):
+	add	count, count, MEMCPY_PREFETCH_LDR + 64 + 32
+	.p2align 4
+L(loop128):
+	ldp	F_q, G_q, [src], #32
+	str	C_q, [dst], #16
+	ldp	B_q, A_q, [src], #32
+	str	D_q, [dst], #16
+	stp	E_q, F_q, [dst], #32
+	stp	G_q, B_q, [dst], #32
+	subs	count, count, 64
+	b.lt	L(loop128_exit1)
+L(loop128_proceed):
+	ldp	B_q, C_q, [src], #32
+	str	A_q, [dst], #16
+	ldp	D_q, E_q, [src], #32
+	str	B_q, [dst], #16
+	subs	count, count, 64
+	b.ge	L(loop128)
+
+	.p2align 4
+L(loop128_exit2):
+	stp	C_q, D_q, [dst], #32
+	str	E_q, [dst], #16
+	b	L(copy_long_check32);
+
+L(loop128_exit1):
+	/* A_q is still not stored and 0..63 bytes are left,
+	   so count is -64..-1.
+	   Check if fewer than 32 bytes are left (count < -32).  */
+	str	A_q, [dst], #16
+L(copy_long_check32):
+	cmn	count, 64
+	b.eq	L(copy_long_done)
+	cmn	count, 32
+	b.le	L(copy_long_last32)
+	ldp	B_q, C_q, [src]
+	stp	B_q, C_q, [dst]
+
+L(copy_long_last32):
+	ldp	F_q, G_q, [srcend, -32]
+	stp	F_q, G_q, [dstend, -32]
+
+L(copy_long_done):
+	ret
+
+L(dst_unaligned):
+	/* For the unaligned store case the code loads two
+	   aligned chunks and then merges them using the ext
+	   instruction.  This can be up to 30% faster than
+	   the simple unaligned store access.
+
+	   Current state: tmp1 = dst % 16; C_q, D_q, E_q
+	   contain data yet to be stored.  src and dst point
+	   to the next-to-be-processed data.  A_q, B_q contain
+	   data already stored before; count = bytes left to
+	   be loaded, decremented by 64.
+
+	   Control is passed here if at least 64 bytes are left
+	   to be loaded.  The code does two aligned loads and then
+	   extracts (16-tmp1) bytes from the first register and
+	   tmp1 bytes from the next register, forming the value
+	   for the aligned store.
+
+	   As the ext instruction can only have its index encoded
+	   as an immediate, 15 code chunks process each possible
+	   index value.  A computed goto is used to reach the
+	   required code.  */
+
+	/* Store the 16 bytes to dst and align dst for further
+	   operations; several bytes will be stored at this
+	   address once more.  */
+	str	C_q, [dst], #16
+	ldp	F_q, G_q, [src], #32
+	bic	dst, dst, 15
+	adr	tmp2, L(load_and_merge)
+	add	tmp2, tmp2, tmp1, LSL 7
+	sub	tmp2, tmp2, 128
+	br	tmp2
+
+.p2align 7
+L(load_and_merge):
+#define EXT_SIZE 1
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 2
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+2:
+
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 3
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 4
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 5
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 6
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 7
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 8
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 9
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 10
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 11
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 12
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 13
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 14
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+.p2align 7
+#define EXT_SIZE 15
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, E_v.16b, 16-EXT_SIZE
+	subs	count, count, 32
+	b.ge	2f
+1:
+	stp	A_q, B_q, [dst], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	stp	H_q, I_q, [dst], #16
+	add	dst, dst, tmp1
+	str	G_q, [dst], #16
+	b	L(copy_long_check32)
+2:
+	stp	A_q, B_q, [dst], #32
+	prfm	pldl1strm, [src, MEMCPY_PREFETCH_LDR]
+	ldp	D_q, J_q, [src], #32
+	ext	H_v.16b, E_v.16b, F_v.16b, 16-EXT_SIZE
+	ext	I_v.16b, F_v.16b, G_v.16b, 16-EXT_SIZE
+	mov	C_v.16b, G_v.16b
+	stp	H_q, I_q, [dst], #32
+	ldp	F_q, G_q, [src], #32
+	ext	A_v.16b, C_v.16b, D_v.16b, 16-EXT_SIZE
+	ext	B_v.16b, D_v.16b, J_v.16b, 16-EXT_SIZE
+	mov	E_v.16b, J_v.16b
+	subs	count, count, 64
+	b.ge	2b
+	b	1b
+#undef EXT_SIZE
+
+END (MEMCPY)
+libc_hidden_builtin_def (MEMCPY)
+#endif
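
[Editor's note, not part of the patch.]  The L(dst_unaligned) comment above
describes merging two 16-byte chunks loaded from aligned addresses with the
ext instruction so that both loads and stores stay aligned.  The small,
hypothetical C model below (the helper name ext16 is mine, not glibc's) is a
sketch of the byte selection that "ext Vd.16b, Vn.16b, Vm.16b, #idx" performs,
checked against a plain unaligned 16-byte load; it illustrates the idea only
and makes no claim about the real code generation.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Model of "ext Vd.16b, Vn.16b, Vm.16b, #idx": the result is bytes
       idx..15 of FIRST followed by bytes 0..idx-1 of SECOND, i.e. a
       16-byte window starting idx bytes into the concatenated pair.  */
    static void
    ext16 (uint8_t out[16], const uint8_t first[16],
           const uint8_t second[16], unsigned idx)
    {
      memcpy (out, first + idx, 16 - idx);
      memcpy (out + (16 - idx), second, idx);
    }

    int
    main (void)
    {
      uint8_t buf[32], win[16];
      for (unsigned i = 0; i < 32; i++)
        buf[i] = (uint8_t) i;

      /* For every index the merged window must equal what a plain
         unaligned 16-byte load starting at buf+idx would fetch; the
         patch uses index 16-EXT_SIZE for each destination offset.  */
      for (unsigned idx = 1; idx < 16; idx++)
        {
          ext16 (win, buf, buf + 16, idx);
          assert (memcmp (win, buf + idx, 16) == 0);
        }
      puts ("ext merge model: OK");
      return 0;
    }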