[v8,1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S

Message ID 20210403081215.2309505-1-goldstein.w.n@gmail.com
State Committed
Commit 1a8605b6cd257e8a74e29b5b71c057211f5fb847
Headers
Series [v8,1/2] x86: Update large memcpy case in memmove-vec-unaligned-erms.S |

Commit Message

Noah Goldstein April 3, 2021, 8:12 a.m. UTC
  From: noah <goldstein.w.n@gmail.com>

No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the effects of false memory aliasing
when destination and source have a close 4k alignment and 2) in most
cases, and for most DRAM units, is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Issue was alignment-related AFAICT. Added `.p2align 4` in front of the
loops and no longer see any meaningful regression.

Also added back the temporal stores for the tail. Saw a regression
when doing these tests.

The two tables below give the Skylake and Icelake numbers for the areas
around where you saw the regression. Following them is all of the data
from the tests.

N = 10.

Skylake
Len         ,align1      ,align2      ,new mean    ,old mean  
4103        ,0           ,64          ,84.5        ,88.6
4111        ,0           ,3           ,99.0        ,99.9
4127        ,3           ,0           ,102.1       ,102.3
4159        ,3           ,7           ,88.7        ,90.9
4223        ,9           ,5           ,88.1        ,87.4
8199        ,0           ,64          ,146.7       ,150.2
8207        ,0           ,3           ,167.9       ,168.5
8223        ,3           ,0           ,168.5       ,168.1
8255        ,3           ,7           ,157.0       ,159.2
8319        ,9           ,5           ,155.5       ,155.7
16391       ,0           ,64          ,286.2       ,288.8
16399       ,0           ,3           ,307.0       ,308.7
16415       ,3           ,0           ,307.4       ,307.6
16447       ,3           ,7           ,294.6       ,295.5
16511       ,9           ,5           ,291.5       ,462.1
32775       ,0           ,64          ,603.4       ,601.5
32783       ,0           ,3           ,604.8       ,606.4
32799       ,3           ,0           ,603.0       ,604.1
32831       ,3           ,7           ,600.2       ,737.3
32895       ,9           ,5           ,604.4       ,599.5
65543       ,0           ,64          ,1873.5      ,1854.3
65551       ,0           ,3           ,1862.9      ,1846.6
65567       ,3           ,0           ,1885.5      ,1966.0
65599       ,3           ,7           ,1833.2      ,1833.1
65663       ,9           ,5           ,1884.9      ,1887.4
131079      ,0           ,64          ,3944.3      ,3949.4
131087      ,0           ,3           ,3927.3      ,3913.3
131103      ,3           ,0           ,4415.8      ,4169.4
131135      ,3           ,7           ,4224.5      ,4157.6
131199      ,9           ,5           ,5974.0      ,4983.8
262151      ,0           ,64          ,11050.2     ,10620.6
262159      ,0           ,3           ,9932.8      ,10037.3
262175      ,3           ,0           ,10188.8     ,9206.6
262207      ,3           ,7           ,9633.3      ,9216.7
262271      ,9           ,5           ,9732.7      ,9345.3
524295      ,0           ,64          ,24823.9     ,24880.7
524303      ,0           ,3           ,24514.0     ,24556.7
524319      ,3           ,0           ,23974.4     ,24219.9
524351      ,3           ,7           ,24159.7     ,24207.0
524415      ,9           ,5           ,23946.5     ,24142.8

Icelake:
Len         ,align1      ,align2      ,new mean    ,old mean  
4103        ,0           ,64          ,50.2        ,63.7
4111        ,0           ,3           ,63.7        ,65.1
4127        ,3           ,0           ,68.2        ,69.4
4159        ,3           ,7           ,59.6        ,68.0
4223        ,9           ,5           ,68.2        ,66.8
8199        ,0           ,64          ,92.1        ,89.9
8207        ,0           ,3           ,119.7       ,118.3
8223        ,3           ,0           ,119.1       ,120.9
8255        ,3           ,7           ,122.9       ,123.7
8319        ,9           ,5           ,122.1       ,121.8
16391       ,0           ,64          ,162.7       ,158.0
16399       ,0           ,3           ,227.6       ,234.1
16415       ,3           ,0           ,230.8       ,232.7
16447       ,3           ,7           ,226.8       ,232.6
16511       ,9           ,5           ,233.4       ,233.8
32775       ,0           ,64          ,312.2       ,301.8
32783       ,0           ,3           ,449.7       ,450.0
32799       ,3           ,0           ,452.7       ,455.9
32831       ,3           ,7           ,449.8       ,458.0
32895       ,9           ,5           ,456.3       ,459.4
65543       ,0           ,64          ,1460.6      ,1463.9
65551       ,0           ,3           ,1462.0      ,1465.4
65567       ,3           ,0           ,1466.6      ,1480.4
65599       ,3           ,7           ,1488.0      ,1488.9
65663       ,9           ,5           ,1680.8      ,1499.5
131079      ,0           ,64          ,2988.5      ,3010.1
131087      ,0           ,3           ,2995.5      ,2996.4
131103      ,3           ,0           ,3006.2      ,3000.5
131135      ,3           ,7           ,3032.4      ,3073.7
131199      ,9           ,5           ,3010.4      ,3027.4
262151      ,0           ,64          ,6143.2      ,6079.1
262159      ,0           ,3           ,6085.1      ,6075.8
262175      ,3           ,0           ,6088.0      ,6064.9
262207      ,3           ,7           ,6018.7      ,6023.5
262271      ,9           ,5           ,6019.8      ,5959.2
524295      ,0           ,64          ,14464.2     ,14095.1
524303      ,0           ,3           ,14761.6     ,14050.2
524319      ,3           ,0           ,14534.1     ,14087.5
524351      ,3           ,7           ,14147.7     ,13903.8
524415      ,9           ,5           ,14157.0     ,13982.9



cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean  
skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0

        
 .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
 1 file changed, 265 insertions(+), 73 deletions(-)
  

Comments

H.J. Lu April 3, 2021, 5:45 p.m. UTC | #1
On Sat, Apr 3, 2021 at 1:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> From: noah <goldstein.w.n@gmail.com>
>
> No Bug. This commit updates the large memcpy case (no overlap). The
> update is to perform memcpy on either 2 or 4 contiguous pages at
> once. This 1) helps to alleviate the effects of false memory aliasing
> when destination and source have a close 4k alignment and 2) in most
> cases and for most DRAM units, is a modestly more efficient access
> pattern. These changes are a clear performance improvement for
> VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> pass.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Issue was alignment related AFAICT. Added `.p2align 4` in front of the
> loops and no longer see any meaningful regression.
>
> Also added back the temporal stores for the tail. Saw a regression
> when doing these tests.
>
> Two tables below for skylake and icelake numbers for the areas around
> where you saw the regression. Below those is all the data from the tests.
>
> N = 10.
>
> Skylake
> Len         ,align1      ,align2      ,new mean    ,old mean
> 4103        ,0           ,64          ,84.5        ,88.6
> 4111        ,0           ,3           ,99.0        ,99.9
> 4127        ,3           ,0           ,102.1       ,102.3
> 4159        ,3           ,7           ,88.7        ,90.9
> 4223        ,9           ,5           ,88.1        ,87.4
> 8199        ,0           ,64          ,146.7       ,150.2
> 8207        ,0           ,3           ,167.9       ,168.5
> 8223        ,3           ,0           ,168.5       ,168.1
> 8255        ,3           ,7           ,157.0       ,159.2
> 8319        ,9           ,5           ,155.5       ,155.7
> 16391       ,0           ,64          ,286.2       ,288.8
> 16399       ,0           ,3           ,307.0       ,308.7
> 16415       ,3           ,0           ,307.4       ,307.6
> 16447       ,3           ,7           ,294.6       ,295.5
> 16511       ,9           ,5           ,291.5       ,462.1
> 32775       ,0           ,64          ,603.4       ,601.5
> 32783       ,0           ,3           ,604.8       ,606.4
> 32799       ,3           ,0           ,603.0       ,604.1
> 32831       ,3           ,7           ,600.2       ,737.3
> 32895       ,9           ,5           ,604.4       ,599.5
> 65543       ,0           ,64          ,1873.5      ,1854.3
> 65551       ,0           ,3           ,1862.9      ,1846.6
> 65567       ,3           ,0           ,1885.5      ,1966.0
> 65599       ,3           ,7           ,1833.2      ,1833.1
> 65663       ,9           ,5           ,1884.9      ,1887.4
> 131079      ,0           ,64          ,3944.3      ,3949.4
> 131087      ,0           ,3           ,3927.3      ,3913.3
> 131103      ,3           ,0           ,4415.8      ,4169.4
> 131135      ,3           ,7           ,4224.5      ,4157.6
> 131199      ,9           ,5           ,5974.0      ,4983.8
> 262151      ,0           ,64          ,11050.2     ,10620.6
> 262159      ,0           ,3           ,9932.8      ,10037.3
> 262175      ,3           ,0           ,10188.8     ,9206.6
> 262207      ,3           ,7           ,9633.3      ,9216.7
> 262271      ,9           ,5           ,9732.7      ,9345.3
> 524295      ,0           ,64          ,24823.9     ,24880.7
> 524303      ,0           ,3           ,24514.0     ,24556.7
> 524319      ,3           ,0           ,23974.4     ,24219.9
> 524351      ,3           ,7           ,24159.7     ,24207.0
> 524415      ,9           ,5           ,23946.5     ,24142.8
>
> Icelake:
> Len         ,align1      ,align2      ,new mean    ,old mean
> 4103        ,0           ,64          ,50.2        ,63.7
> 4111        ,0           ,3           ,63.7        ,65.1
> 4127        ,3           ,0           ,68.2        ,69.4
> 4159        ,3           ,7           ,59.6        ,68.0
> 4223        ,9           ,5           ,68.2        ,66.8
> 8199        ,0           ,64          ,92.1        ,89.9
> 8207        ,0           ,3           ,119.7       ,118.3
> 8223        ,3           ,0           ,119.1       ,120.9
> 8255        ,3           ,7           ,122.9       ,123.7
> 8319        ,9           ,5           ,122.1       ,121.8
> 16391       ,0           ,64          ,162.7       ,158.0
> 16399       ,0           ,3           ,227.6       ,234.1
> 16415       ,3           ,0           ,230.8       ,232.7
> 16447       ,3           ,7           ,226.8       ,232.6
> 16511       ,9           ,5           ,233.4       ,233.8
> 32775       ,0           ,64          ,312.2       ,301.8
> 32783       ,0           ,3           ,449.7       ,450.0
> 32799       ,3           ,0           ,452.7       ,455.9
> 32831       ,3           ,7           ,449.8       ,458.0
> 32895       ,9           ,5           ,456.3       ,459.4
> 65543       ,0           ,64          ,1460.6      ,1463.9
> 65551       ,0           ,3           ,1462.0      ,1465.4
> 65567       ,3           ,0           ,1466.6      ,1480.4
> 65599       ,3           ,7           ,1488.0      ,1488.9
> 65663       ,9           ,5           ,1680.8      ,1499.5
> 131079      ,0           ,64          ,2988.5      ,3010.1
> 131087      ,0           ,3           ,2995.5      ,2996.4
> 131103      ,3           ,0           ,3006.2      ,3000.5
> 131135      ,3           ,7           ,3032.4      ,3073.7
> 131199      ,9           ,5           ,3010.4      ,3027.4
> 262151      ,0           ,64          ,6143.2      ,6079.1
> 262159      ,0           ,3           ,6085.1      ,6075.8
> 262175      ,3           ,0           ,6088.0      ,6064.9
> 262207      ,3           ,7           ,6018.7      ,6023.5
> 262271      ,9           ,5           ,6019.8      ,5959.2
> 524295      ,0           ,64          ,14464.2     ,14095.1
> 524303      ,0           ,3           ,14761.6     ,14050.2
> 524319      ,3           ,0           ,14534.1     ,14087.5
> 524351      ,3           ,7           ,14147.7     ,13903.8
> 524415      ,9           ,5           ,14157.0     ,13982.9
>
>
>
> cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean
> skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
>

I am having a hard time convincing myself that this patch is really necessary.
What are the geomeans of all the different cases for each processor?
  
Noah Goldstein April 3, 2021, 7:41 p.m. UTC | #2
On Sat, Apr 3, 2021 at 1:46 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Apr 3, 2021 at 1:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > From: noah <goldstein.w.n@gmail.com>
> >
> > No Bug. This commit updates the large memcpy case (no overlap). The
> > update is to perform memcpy on either 2 or 4 contiguous pages at
> > once. This 1) helps to alleviate the effects of false memory aliasing
> > when destination and source have a close 4k alignment and 2) In most
> > cases and for most DRAM units is a modestly more efficient access
> > pattern. These changes are a clear performance improvement for
> > VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > pass.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > Issue was alignment related AFAICT. Added `.p2align 4` in front of the
> > loops and no longer see any meaningful regression.
> >
> > Also added back the temporal stores for the tail. Saw a regression
> > when doing these tests.
> >
> > Two tables below for skylake and icelake numbers for the areas around
> > where you saw the regression. Below is all data from the tests.
> >
> > N = 10.
> >
> > Skylake
> > Len         ,align1      ,align2      ,new mean    ,old mean
> > 4103        ,0           ,64          ,84.5        ,88.6
> > 4111        ,0           ,3           ,99.0        ,99.9
> > 4127        ,3           ,0           ,102.1       ,102.3
> > 4159        ,3           ,7           ,88.7        ,90.9
> > 4223        ,9           ,5           ,88.1        ,87.4
> > 8199        ,0           ,64          ,146.7       ,150.2
> > 8207        ,0           ,3           ,167.9       ,168.5
> > 8223        ,3           ,0           ,168.5       ,168.1
> > 8255        ,3           ,7           ,157.0       ,159.2
> > 8319        ,9           ,5           ,155.5       ,155.7
> > 16391       ,0           ,64          ,286.2       ,288.8
> > 16399       ,0           ,3           ,307.0       ,308.7
> > 16415       ,3           ,0           ,307.4       ,307.6
> > 16447       ,3           ,7           ,294.6       ,295.5
> > 16511       ,9           ,5           ,291.5       ,462.1
> > 32775       ,0           ,64          ,603.4       ,601.5
> > 32783       ,0           ,3           ,604.8       ,606.4
> > 32799       ,3           ,0           ,603.0       ,604.1
> > 32831       ,3           ,7           ,600.2       ,737.3
> > 32895       ,9           ,5           ,604.4       ,599.5
> > 65543       ,0           ,64          ,1873.5      ,1854.3
> > 65551       ,0           ,3           ,1862.9      ,1846.6
> > 65567       ,3           ,0           ,1885.5      ,1966.0
> > 65599       ,3           ,7           ,1833.2      ,1833.1
> > 65663       ,9           ,5           ,1884.9      ,1887.4
> > 131079      ,0           ,64          ,3944.3      ,3949.4
> > 131087      ,0           ,3           ,3927.3      ,3913.3
> > 131103      ,3           ,0           ,4415.8      ,4169.4
> > 131135      ,3           ,7           ,4224.5      ,4157.6
> > 131199      ,9           ,5           ,5974.0      ,4983.8
> > 262151      ,0           ,64          ,11050.2     ,10620.6
> > 262159      ,0           ,3           ,9932.8      ,10037.3
> > 262175      ,3           ,0           ,10188.8     ,9206.6
> > 262207      ,3           ,7           ,9633.3      ,9216.7
> > 262271      ,9           ,5           ,9732.7      ,9345.3
> > 524295      ,0           ,64          ,24823.9     ,24880.7
> > 524303      ,0           ,3           ,24514.0     ,24556.7
> > 524319      ,3           ,0           ,23974.4     ,24219.9
> > 524351      ,3           ,7           ,24159.7     ,24207.0
> > 524415      ,9           ,5           ,23946.5     ,24142.8
> >
> > Icelake:
> > Len         ,align1      ,align2      ,new mean    ,old mean
> > 4103        ,0           ,64          ,50.2        ,63.7
> > 4111        ,0           ,3           ,63.7        ,65.1
> > 4127        ,3           ,0           ,68.2        ,69.4
> > 4159        ,3           ,7           ,59.6        ,68.0
> > 4223        ,9           ,5           ,68.2        ,66.8
> > 8199        ,0           ,64          ,92.1        ,89.9
> > 8207        ,0           ,3           ,119.7       ,118.3
> > 8223        ,3           ,0           ,119.1       ,120.9
> > 8255        ,3           ,7           ,122.9       ,123.7
> > 8319        ,9           ,5           ,122.1       ,121.8
> > 16391       ,0           ,64          ,162.7       ,158.0
> > 16399       ,0           ,3           ,227.6       ,234.1
> > 16415       ,3           ,0           ,230.8       ,232.7
> > 16447       ,3           ,7           ,226.8       ,232.6
> > 16511       ,9           ,5           ,233.4       ,233.8
> > 32775       ,0           ,64          ,312.2       ,301.8
> > 32783       ,0           ,3           ,449.7       ,450.0
> > 32799       ,3           ,0           ,452.7       ,455.9
> > 32831       ,3           ,7           ,449.8       ,458.0
> > 32895       ,9           ,5           ,456.3       ,459.4
> > 65543       ,0           ,64          ,1460.6      ,1463.9
> > 65551       ,0           ,3           ,1462.0      ,1465.4
> > 65567       ,3           ,0           ,1466.6      ,1480.4
> > 65599       ,3           ,7           ,1488.0      ,1488.9
> > 65663       ,9           ,5           ,1680.8      ,1499.5
> > 131079      ,0           ,64          ,2988.5      ,3010.1
> > 131087      ,0           ,3           ,2995.5      ,2996.4
> > 131103      ,3           ,0           ,3006.2      ,3000.5
> > 131135      ,3           ,7           ,3032.4      ,3073.7
> > 131199      ,9           ,5           ,3010.4      ,3027.4
> > 262151      ,0           ,64          ,6143.2      ,6079.1
> > 262159      ,0           ,3           ,6085.1      ,6075.8
> > 262175      ,3           ,0           ,6088.0      ,6064.9
> > 262207      ,3           ,7           ,6018.7      ,6023.5
> > 262271      ,9           ,5           ,6019.8      ,5959.2
> > 524295      ,0           ,64          ,14464.2     ,14095.1
> > 524303      ,0           ,3           ,14761.6     ,14050.2
> > 524319      ,3           ,0           ,14534.1     ,14087.5
> > 524351      ,3           ,7           ,14147.7     ,13903.8
> > 524415      ,9           ,5           ,14157.0     ,13982.9
> >
> >
> >
> > cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean
> > skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> > skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> > skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> > skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> > skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> > skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> > skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> > skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> > skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> > skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> > skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> > skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> > skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> > skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> > skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> > skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> > skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> > skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> > skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> > skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> > skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> > skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> > skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> > skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> > skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> > skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> > skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> > skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> > skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> > skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> > skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> > skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> > skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> > skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> > skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> > skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> > skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> > skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> > skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> > skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> > skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> > skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> > skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> > skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> > skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> > skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> > skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> > skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> > skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> > skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> > skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> > skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> > skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> > skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> > skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> > skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> > skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> > skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> > skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> > skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> > skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> > skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> > skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> > skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> > skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> > skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> > skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> > skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> > skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> > skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> > skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> > skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> > skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> > skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> > skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> > skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> > skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> > skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> > skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> > skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> > skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> > skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> > skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> > skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> > skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> > skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> > skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> > skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> > skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> > skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> > skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> > skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> > skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> > skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> > skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> > skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> > skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> > skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> > skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> > skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> > skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> > skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> > skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> > skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> > skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> > skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> > skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> > skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> > skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> > skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> > skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> > skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> > skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> > skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> > skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> > skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> > skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> > skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> > skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> > skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> > skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> > skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> > skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> > skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> > skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> > skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> > skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> > skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> > skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> > skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> > icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> > icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> > icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> > icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> > icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> > icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> > icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> > icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> > icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> > icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> > icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> > icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> > icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> > icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> > icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> > icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> > icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> > icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> > icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> > icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> > icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> > icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> > icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> > icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> > icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> > icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> > icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> > icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> > icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> > icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> > icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> > icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> > icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> > icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> > icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> > icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> > icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> > icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> > icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> > icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> > icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> > icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> > icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> > icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> > icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> > icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> > icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> > icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> > icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> > icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> > icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> > icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> > icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> > icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> > icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> > icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> > icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> > icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> > icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> > icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> > icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> > icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> > icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> > icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> > icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> > icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> > icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> > icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> > icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> > icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> > icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> > icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> > icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> > icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> > icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> > icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> > icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> > icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> > icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> > icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> > icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> > icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> > icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> > icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> > icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> > icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> > icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> > icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> > icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> > icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> > icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> > icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> > icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> > icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> > icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> > icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> > icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> > icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> > icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> > icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> > icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> > icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> > icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> > icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> > icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> > icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> > icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> > icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> > icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> > icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> > icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> > icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> > icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> > icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> > icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> > icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> > icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> > icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> > icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> > icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> > icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> > icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> > icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> > icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> > icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> > icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> > icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> > icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> > icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> > icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> > icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> > icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> > icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> > icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> > icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> > icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> > icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> > icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> > icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> > icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> > icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> > icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> > icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> > icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> > icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> > icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> > icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> > icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> > icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> > icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> > icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> > icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> > icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> > icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> > icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> > icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> > icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> > icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> > icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> > icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> > icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> > icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> > icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> > icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> > icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> > icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> > icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> > icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> > icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> > icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> > icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> > icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> > icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> > icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> > icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> > icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> > icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> > icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> > icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> > icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> > icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> > icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> > icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> > icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> > icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> > icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> > icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> > icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> > icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> > icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> > icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> > icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> > icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> > icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> > icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
> >
>
> I am having a hard time convincing myself that this patch is really necessary.
> What are the geomeans of all the different cases for each processor?

N = 100. Geometric means of current vs. new for bench-memcpy-large. Note that
the bench-memmove-large numbers should be unaffected by this patch, as the new
logic only applies to the no-overlap case.

cpu         ,inst        ,Len         ,align1      ,align2
,new geomean ,cur geomean ,New/Cur
icelake     ,sse2        ,65543       ,0           ,0
,5566.1      ,5564.7      ,1.0
icelake     ,sse2        ,65551       ,0           ,3
,5856.4      ,5725.7      ,1.02
icelake     ,sse2        ,65567       ,3           ,0
,5622.8      ,5892.9      ,0.95
icelake     ,sse2        ,65599       ,3           ,5
,5857.3      ,5723.8      ,1.02
icelake     ,sse2        ,65536       ,0           ,127
,5953.3      ,5831.1      ,1.02
icelake     ,sse2        ,65536       ,0           ,255
,5811.7      ,5789.5      ,1.0
icelake     ,sse2        ,65536       ,0           ,256
,5373.5      ,5284.1      ,1.02
icelake     ,sse2        ,65536       ,0           ,4064
,5820.1      ,5761.6      ,1.01
icelake     ,sse2        ,131079      ,0           ,0
,12421.5     ,12424.1     ,1.0
icelake     ,sse2        ,131087      ,0           ,3
,12389.5     ,12276.4     ,1.01
icelake     ,sse2        ,131103      ,3           ,0
,11587.0     ,12607.6     ,0.92
icelake     ,sse2        ,131135      ,3           ,5
,11596.9     ,11896.2     ,0.97
icelake     ,sse2        ,131072      ,0           ,127
,11746.4     ,12490.1     ,0.94
icelake     ,sse2        ,131072      ,0           ,255
,11486.8     ,11831.7     ,0.97
icelake     ,sse2        ,131072      ,0           ,256
,10453.5     ,10451.7     ,1.0
icelake     ,sse2        ,131072      ,0           ,4064
,11231.7     ,11223.6     ,1.0
icelake     ,sse2        ,262151      ,0           ,0
,29408.5     ,30831.2     ,0.95
icelake     ,sse2        ,262159      ,0           ,3
,30813.6     ,32235.6     ,0.96
icelake     ,sse2        ,262175      ,3           ,0
,30245.0     ,31392.5     ,0.96
icelake     ,sse2        ,262207      ,3           ,5
,30775.6     ,32298.6     ,0.95
icelake     ,sse2        ,262144      ,0           ,127
,31784.7     ,32791.5     ,0.97
icelake     ,sse2        ,262144      ,0           ,255
,30726.0     ,31997.5     ,0.96
icelake     ,sse2        ,262144      ,0           ,256
,28418.9     ,29440.9     ,0.97
icelake     ,sse2        ,262144      ,0           ,4064
,29984.1     ,31048.9     ,0.97
icelake     ,sse2        ,524295      ,0           ,0
,76079.0     ,75752.0     ,1.0
icelake     ,sse2        ,524303      ,0           ,3
,79939.3     ,80796.4     ,0.99
icelake     ,sse2        ,524319      ,3           ,0
,79018.1     ,79928.5     ,0.99
icelake     ,sse2        ,524351      ,3           ,5
,81219.4     ,81053.8     ,1.0
icelake     ,sse2        ,524288      ,0           ,127
,80111.8     ,80087.2     ,1.0
icelake     ,sse2        ,524288      ,0           ,255
,79334.0     ,79525.6     ,1.0
icelake     ,sse2        ,524288      ,0           ,256
,75766.9     ,75918.9     ,1.0
icelake     ,sse2        ,524288      ,0           ,4064
,78907.9     ,79550.8     ,0.99
icelake     ,sse2        ,1048583     ,0           ,0
,144672.6    ,147457.7    ,0.98
icelake     ,sse2        ,1048591     ,0           ,3
,173803.9    ,400563.2    ,0.43
icelake     ,sse2        ,1048607     ,3           ,0
,149391.9    ,151772.1    ,0.98
icelake     ,sse2        ,1048639     ,3           ,5
,174774.1    ,400657.4    ,0.44
icelake     ,sse2        ,1048576     ,0           ,127
,175350.9    ,347110.6    ,0.51
icelake     ,sse2        ,1048576     ,0           ,255
,150152.6    ,144242.9    ,1.04
icelake     ,sse2        ,1048576     ,0           ,256
,145869.7    ,147489.6    ,0.99
icelake     ,sse2        ,1048576     ,0           ,4064
,145814.7    ,147497.7    ,0.99
icelake     ,sse2        ,2097159     ,0           ,0
,289460.6    ,295574.6    ,0.98
icelake     ,sse2        ,2097167     ,0           ,3
,347057.0    ,799549.1    ,0.43
icelake     ,sse2        ,2097183     ,3           ,0
,298565.7    ,301424.3    ,0.99
icelake     ,sse2        ,2097215     ,3           ,5
,348620.4    ,797557.4    ,0.44
icelake     ,sse2        ,2097152     ,0           ,127
,348751.4    ,695260.9    ,0.5
icelake     ,sse2        ,2097152     ,0           ,255
,298960.5    ,286590.0    ,1.04
icelake     ,sse2        ,2097152     ,0           ,256
,290978.4    ,293225.6    ,0.99
icelake     ,sse2        ,2097152     ,0           ,4064
,290476.0    ,292283.2    ,0.99
icelake     ,sse2        ,4194311     ,0           ,0
,583386.3    ,588284.3    ,0.99
icelake     ,sse2        ,4194319     ,0           ,3
,703870.5    ,1595268.0   ,0.44
icelake     ,sse2        ,4194335     ,3           ,0
,599400.2    ,601591.6    ,1.0
icelake     ,sse2        ,4194367     ,3           ,5
,694569.7    ,1595608.0   ,0.44
icelake     ,sse2        ,4194304     ,0           ,127
,700229.1    ,1389061.9   ,0.5
icelake     ,sse2        ,4194304     ,0           ,255
,600779.0    ,573361.2    ,1.05
icelake     ,sse2        ,4194304     ,0           ,256
,586610.7    ,589269.6    ,1.0
icelake     ,sse2        ,4194304     ,0           ,4064
,583616.3    ,584806.4    ,1.0
icelake     ,sse2        ,8388615     ,0           ,0
,1214632.8   ,1266616.0   ,0.96
icelake     ,sse2        ,8388623     ,0           ,3
,1405136.9   ,3198827.1   ,0.44
icelake     ,sse2        ,8388639     ,3           ,0
,1244302.6   ,1297425.9   ,0.96
icelake     ,sse2        ,8388671     ,3           ,5
,1404685.1   ,3196389.9   ,0.44
icelake     ,sse2        ,8388608     ,0           ,127
,1419888.5   ,2792729.4   ,0.51
icelake     ,sse2        ,8388608     ,0           ,255
,1249044.6   ,1259726.7   ,0.99
icelake     ,sse2        ,8388608     ,0           ,256
,1234471.9   ,1300463.6   ,0.95
icelake     ,sse2        ,8388608     ,0           ,4064
,1220102.2   ,1265190.5   ,0.96
icelake     ,sse2        ,16777223    ,0           ,0
,2689516.3   ,2846521.1   ,0.94
icelake     ,sse2        ,16777231    ,0           ,3
,3001317.4   ,6428733.7   ,0.47
icelake     ,sse2        ,16777247    ,3           ,0
,2770040.8   ,2910434.9   ,0.95
icelake     ,sse2        ,16777279    ,3           ,5
,3002076.1   ,6415835.9   ,0.47
icelake     ,sse2        ,16777216    ,0           ,127
,3063786.3   ,5609895.3   ,0.55
icelake     ,sse2        ,16777216    ,0           ,255
,2821606.1   ,2833843.6   ,1.0
icelake     ,sse2        ,16777216    ,0           ,256
,2719765.5   ,2925344.2   ,0.93
icelake     ,sse2        ,16777216    ,0           ,4064
,2686189.2   ,2848017.5   ,0.94
icelake     ,sse2        ,33554439    ,0           ,0
,5577945.0   ,5913674.6   ,0.94
icelake     ,sse2        ,33554447    ,0           ,3
,6152758.8   ,12863855.0  ,0.48
icelake     ,sse2        ,33554463    ,3           ,0
,5773351.4   ,6035289.3   ,0.96
icelake     ,sse2        ,33554495    ,3           ,5
,6160006.2   ,12878153.9  ,0.48
icelake     ,sse2        ,33554432    ,0           ,127
,6303495.4   ,11221070.2  ,0.56
icelake     ,sse2        ,33554432    ,0           ,255
,5830879.6   ,5944978.6   ,0.98
icelake     ,sse2        ,33554432    ,0           ,256
,5611968.2   ,6068255.4   ,0.92
icelake     ,sse2        ,33554432    ,0           ,4064
,5570321.0   ,5964542.6   ,0.93
icelake     ,avx         ,65543       ,0           ,0
,5561.1      ,5659.7      ,0.98
icelake     ,avx         ,65551       ,0           ,3
,5859.9      ,5724.8      ,1.02
icelake     ,avx         ,65567       ,3           ,0
,5636.7      ,5623.3      ,1.0
icelake     ,avx         ,65599       ,3           ,5
,5856.3      ,5720.2      ,1.02
icelake     ,avx         ,65536       ,0           ,127
,6011.1      ,5910.0      ,1.02
icelake     ,avx         ,65536       ,0           ,255
,5854.5      ,5792.3      ,1.01
icelake     ,avx         ,65536       ,0           ,256
,5213.0      ,5273.9      ,0.99
icelake     ,avx         ,65536       ,0           ,4064
,5760.7      ,5661.1      ,1.02
icelake     ,avx         ,131079      ,0           ,0
,12371.4     ,12707.0     ,0.97
icelake     ,avx         ,131087      ,0           ,3
,13220.1     ,12515.7     ,1.06
icelake     ,avx         ,131103      ,3           ,0
,11628.2     ,11546.9     ,1.01
icelake     ,avx         ,131135      ,3           ,5
,13025.7     ,13967.6     ,0.93
icelake     ,avx         ,131072      ,0           ,127
,11781.7     ,11936.4     ,0.99
icelake     ,avx         ,131072      ,0           ,255
,11802.2     ,11583.9     ,1.02
icelake     ,avx         ,131072      ,0           ,256
,10436.9     ,10693.1     ,0.98
icelake     ,avx         ,131072      ,0           ,4064
,11880.9     ,11395.6     ,1.04
icelake     ,avx         ,262151      ,0           ,0
,29132.6     ,30542.8     ,0.95
icelake     ,avx         ,262159      ,0           ,3
,30533.5     ,31468.8     ,0.97
icelake     ,avx         ,262175      ,3           ,0
,29879.5     ,30933.7     ,0.97
icelake     ,avx         ,262207      ,3           ,5
,30263.1     ,31445.0     ,0.96
icelake     ,avx         ,262144      ,0           ,127
,30180.9     ,31405.3     ,0.96
icelake     ,avx         ,262144      ,0           ,255
,30152.9     ,31372.5     ,0.96
icelake     ,avx         ,262144      ,0           ,256
,28121.9     ,28990.9     ,0.97
icelake     ,avx         ,262144      ,0           ,4064
,29785.2     ,31078.4     ,0.96
icelake     ,avx         ,524295      ,0           ,0
,76045.7     ,75824.3     ,1.0
icelake     ,avx         ,524303      ,0           ,3
,79303.7     ,80433.3     ,0.99
icelake     ,avx         ,524319      ,3           ,0
,79323.8     ,79411.3     ,1.0
icelake     ,avx         ,524351      ,3           ,5
,79797.9     ,80179.4     ,1.0
icelake     ,avx         ,524288      ,0           ,127
,80046.7     ,80254.1     ,1.0
icelake     ,avx         ,524288      ,0           ,255
,78580.6     ,79210.4     ,0.99
icelake     ,avx         ,524288      ,0           ,256
,75464.4     ,75184.2     ,1.0
icelake     ,avx         ,524288      ,0           ,4064
,78863.6     ,78677.9     ,1.0
icelake     ,avx         ,1048583     ,0           ,0
,131017.9    ,133962.4    ,0.98
icelake     ,avx         ,1048591     ,0           ,3
,143451.3    ,210311.7    ,0.68
icelake     ,avx         ,1048607     ,3           ,0
,136944.0    ,138426.4    ,0.99
icelake     ,avx         ,1048639     ,3           ,5
,143594.3    ,209887.9    ,0.68
icelake     ,avx         ,1048576     ,0           ,127
,156462.0    ,218873.2    ,0.71
icelake     ,avx         ,1048576     ,0           ,255
,148026.3    ,179419.0    ,0.83
icelake     ,avx         ,1048576     ,0           ,256
,143365.7    ,137816.3    ,1.04
icelake     ,avx         ,1048576     ,0           ,4064
,131683.4    ,132731.6    ,0.99
icelake     ,avx         ,2097159     ,0           ,0
,263807.1    ,267984.5    ,0.98
icelake     ,avx         ,2097167     ,0           ,3
,286949.8    ,422279.2    ,0.68
icelake     ,avx         ,2097183     ,3           ,0
,274675.6    ,276702.2    ,0.99
icelake     ,avx         ,2097215     ,3           ,5
,286681.7    ,420176.7    ,0.68
icelake     ,avx         ,2097152     ,0           ,127
,314499.2    ,437864.2    ,0.72
icelake     ,avx         ,2097152     ,0           ,255
,297458.4    ,359520.9    ,0.83
icelake     ,avx         ,2097152     ,0           ,256
,285883.2    ,276043.2    ,1.04
icelake     ,avx         ,2097152     ,0           ,4064
,263436.6    ,265516.6    ,0.99
icelake     ,avx         ,4194311     ,0           ,0
,529119.4    ,536745.2    ,0.99
icelake     ,avx         ,4194319     ,0           ,3
,573960.0    ,839002.3    ,0.68
icelake     ,avx         ,4194335     ,3           ,0
,550617.2    ,553117.5    ,1.0
icelake     ,avx         ,4194367     ,3           ,5
,572742.8    ,838784.5    ,0.68
icelake     ,avx         ,4194304     ,0           ,127
,629413.6    ,876512.1    ,0.72
icelake     ,avx         ,4194304     ,0           ,255
,594224.1    ,717425.1    ,0.83
icelake     ,avx         ,4194304     ,0           ,256
,573365.0    ,552538.3    ,1.04
icelake     ,avx         ,4194304     ,0           ,4064
,527459.3    ,531907.1    ,0.99
icelake     ,avx         ,8388615     ,0           ,0
,1094256.8   ,1145619.9   ,0.96
icelake     ,avx         ,8388623     ,0           ,3
,1170367.1   ,1700076.4   ,0.69
icelake     ,avx         ,8388639     ,3           ,0
,1136168.1   ,1174752.4   ,0.97
icelake     ,avx         ,8388671     ,3           ,5
,1172015.6   ,1703032.8   ,0.69
icelake     ,avx         ,8388608     ,0           ,127
,1276748.6   ,1771351.9   ,0.72
icelake     ,avx         ,8388608     ,0           ,255
,1207712.0   ,1449267.0   ,0.83
icelake     ,avx         ,8388608     ,0           ,256
,1167958.9   ,1178243.1   ,0.99
icelake     ,avx         ,8388608     ,0           ,4064
,1106155.9   ,1145128.6   ,0.97
icelake     ,avx         ,16777223    ,0           ,0
,2479317.5   ,2630301.0   ,0.94
icelake     ,avx         ,16777231    ,0           ,3
,2643303.6   ,3536980.7   ,0.75
icelake     ,avx         ,16777247    ,3           ,0
,2571967.0   ,2672246.4   ,0.96
icelake     ,avx         ,16777279    ,3           ,5
,2641320.5   ,3538388.9   ,0.75
icelake     ,avx         ,16777216    ,0           ,127
,2832921.6   ,3593702.5   ,0.79
icelake     ,avx         ,16777216    ,0           ,255
,2700272.1   ,3025346.1   ,0.89
icelake     ,avx         ,16777216    ,0           ,256
,2622133.7   ,2709087.6   ,0.97
icelake     ,avx         ,16777216    ,0           ,4064
,2475020.7   ,2610977.8   ,0.95
icelake     ,avx         ,33554439    ,0           ,0
,5190103.1   ,5576047.9   ,0.93
icelake     ,avx         ,33554447    ,0           ,3
,5477752.1   ,7215479.2   ,0.76
icelake     ,avx         ,33554463    ,3           ,0
,5338711.7   ,5625026.7   ,0.95
icelake     ,avx         ,33554495    ,3           ,5
,5505164.8   ,7223660.8   ,0.76
icelake     ,avx         ,33554432    ,0           ,127
,5859232.3   ,7279581.9   ,0.8
icelake     ,avx         ,33554432    ,0           ,255
,5681634.7   ,6156488.6   ,0.92
icelake     ,avx         ,33554432    ,0           ,256
,5440721.4   ,5728347.4   ,0.95
icelake     ,avx         ,33554432    ,0           ,4064
,5191213.2   ,5538716.4   ,0.94
icelake     ,avx512      ,65543       ,0           ,0
,5563.5      ,5634.1      ,0.99
icelake     ,avx512      ,65551       ,0           ,3
,5864.1      ,5728.4      ,1.02
icelake     ,avx512      ,65567       ,3           ,0
,5720.2      ,5625.3      ,1.02
icelake     ,avx512      ,65599       ,3           ,5
,5857.2      ,5722.0      ,1.02
icelake     ,avx512      ,65536       ,0           ,127
,6040.7      ,5844.0      ,1.03
icelake     ,avx512      ,65536       ,0           ,255
,5826.5      ,5799.6      ,1.0
icelake     ,avx512      ,65536       ,0           ,256
,5234.4      ,5230.0      ,1.0
icelake     ,avx512      ,65536       ,0           ,4064
,5800.7      ,5655.4      ,1.03
icelake     ,avx512      ,131079      ,0           ,0
,12591.4     ,11767.1     ,1.07
icelake     ,avx512      ,131087      ,0           ,3
,12694.9     ,12292.1     ,1.03
icelake     ,avx512      ,131103      ,3           ,0
,11374.7     ,12236.3     ,0.93
icelake     ,avx512      ,131135      ,3           ,5
,11958.2     ,11745.5     ,1.02
icelake     ,avx512      ,131072      ,0           ,127
,11803.4     ,11908.6     ,0.99
icelake     ,avx512      ,131072      ,0           ,255
,11569.0     ,11487.9     ,1.01
icelake     ,avx512      ,131072      ,0           ,256
,11087.6     ,10456.4     ,1.06
icelake     ,avx512      ,131072      ,0           ,4064
,11166.0     ,11248.2     ,0.99
icelake     ,avx512      ,262151      ,0           ,0
,30232.1     ,29932.7     ,1.01
icelake     ,avx512      ,262159      ,0           ,3
,30093.8     ,31315.1     ,0.96
icelake     ,avx512      ,262175      ,3           ,0
,30147.7     ,30643.4     ,0.98
icelake     ,avx512      ,262207      ,3           ,5
,29985.9     ,31479.8     ,0.95
icelake     ,avx512      ,262144      ,0           ,127
,30099.7     ,31552.9     ,0.95
icelake     ,avx512      ,262144      ,0           ,255
,29772.8     ,30698.1     ,0.97
icelake     ,avx512      ,262144      ,0           ,256
,28109.3     ,28957.9     ,0.97
icelake     ,avx512      ,262144      ,0           ,4064
,29787.5     ,30637.2     ,0.97
icelake     ,avx512      ,524295      ,0           ,0
,75920.7     ,75047.1     ,1.01
icelake     ,avx512      ,524303      ,0           ,3
,79218.6     ,79529.2     ,1.0
icelake     ,avx512      ,524319      ,3           ,0
,78446.9     ,78550.7     ,1.0
icelake     ,avx512      ,524351      ,3           ,5
,79055.0     ,79425.2     ,1.0
icelake     ,avx512      ,524288      ,0           ,127
,79070.6     ,79626.7     ,0.99
icelake     ,avx512      ,524288      ,0           ,255
,77891.8     ,78078.3     ,1.0
icelake     ,avx512      ,524288      ,0           ,256
,74797.3     ,74436.9     ,1.0
icelake     ,avx512      ,524288      ,0           ,4064
,78339.3     ,78337.2     ,1.0
icelake     ,avx512      ,1048583     ,0           ,0
,131427.6    ,133891.3    ,0.98
icelake     ,avx512      ,1048591     ,0           ,3
,143984.1    ,142003.7    ,1.01
icelake     ,avx512      ,1048607     ,3           ,0
,137547.9    ,134450.1    ,1.02
icelake     ,avx512      ,1048639     ,3           ,5
,144630.4    ,142174.6    ,1.02
icelake     ,avx512      ,1048576     ,0           ,127
,149810.7    ,142684.9    ,1.05
icelake     ,avx512      ,1048576     ,0           ,255
,156212.6    ,143509.2    ,1.09
icelake     ,avx512      ,1048576     ,0           ,256
,153776.9    ,139788.0    ,1.1
icelake     ,avx512      ,1048576     ,0           ,4064
,137926.6    ,134832.8    ,1.02
icelake     ,avx512      ,2097159     ,0           ,0
,263465.3    ,267681.6    ,0.98
icelake     ,avx512      ,2097167     ,0           ,3
,288947.7    ,284129.9    ,1.02
icelake     ,avx512      ,2097183     ,3           ,0
,275395.5    ,269216.0    ,1.02
icelake     ,avx512      ,2097215     ,3           ,5
,289131.5    ,284475.3    ,1.02
icelake     ,avx512      ,2097152     ,0           ,127
,299404.5    ,286193.2    ,1.05
icelake     ,avx512      ,2097152     ,0           ,255
,312913.2    ,286785.6    ,1.09
icelake     ,avx512      ,2097152     ,0           ,256
,307882.7    ,279708.7    ,1.1
icelake     ,avx512      ,2097152     ,0           ,4064
,275552.3    ,269867.0    ,1.02
icelake     ,avx512      ,4194311     ,0           ,0
,526480.1    ,536038.9    ,0.98
icelake     ,avx512      ,4194319     ,0           ,3
,579122.9    ,569512.5    ,1.02
icelake     ,avx512      ,4194335     ,3           ,0
,551658.1    ,542973.3    ,1.02
icelake     ,avx512      ,4194367     ,3           ,5
,578575.2    ,569497.2    ,1.02
icelake     ,avx512      ,4194304     ,0           ,127
,599943.6    ,569138.2    ,1.05
icelake     ,avx512      ,4194304     ,0           ,255
,628419.2    ,575908.4    ,1.09
icelake     ,avx512      ,4194304     ,0           ,256
,617242.8    ,561417.7    ,1.1
icelake     ,avx512      ,4194304     ,0           ,4064
,552012.3    ,540617.2    ,1.02
icelake     ,avx512      ,8388615     ,0           ,0
,1092471.4   ,1133834.9   ,0.96
icelake     ,avx512      ,8388623     ,0           ,3
,1185623.5   ,1218150.0   ,0.97
icelake     ,avx512      ,8388639     ,3           ,0
,1142647.1   ,1139201.6   ,1.0
icelake     ,avx512      ,8388671     ,3           ,5
,1183702.5   ,1225474.6   ,0.97
icelake     ,avx512      ,8388608     ,0           ,127
,1231862.8   ,1221685.1   ,1.01
icelake     ,avx512      ,8388608     ,0           ,255
,1290816.7   ,1221576.2   ,1.06
icelake     ,avx512      ,8388608     ,0           ,256
,1299047.6   ,1195021.2   ,1.09
icelake     ,avx512      ,8388608     ,0           ,4064
,1139648.9   ,1140113.0   ,1.0
icelake     ,avx512      ,16777223    ,0           ,0
,2464861.2   ,2599120.4   ,0.95
icelake     ,avx512      ,16777231    ,0           ,3
,2651029.7   ,2758867.1   ,0.96
icelake     ,avx512      ,16777247    ,3           ,0
,2570099.8   ,2601099.4   ,0.99
icelake     ,avx512      ,16777279    ,3           ,5
,2660529.4   ,2762598.6   ,0.96
icelake     ,avx512      ,16777216    ,0           ,127
,2759531.7   ,2756811.1   ,1.0
icelake     ,avx512      ,16777216    ,0           ,255
,2878568.5   ,2777650.3   ,1.04
icelake     ,avx512      ,16777216    ,0           ,256
,2931879.3   ,2709687.7   ,1.08
icelake     ,avx512      ,16777216    ,0           ,4064
,2587161.1   ,2632011.2   ,0.98
icelake     ,avx512      ,33554439    ,0           ,0
,5175406.0   ,5528857.2   ,0.94
icelake     ,avx512      ,33554447    ,0           ,3
,5537561.9   ,5818119.1   ,0.95
icelake     ,avx512      ,33554463    ,3           ,0
,5435099.5   ,5560442.2   ,0.98
icelake     ,avx512      ,33554495    ,3           ,5
,5546314.9   ,5800995.0   ,0.96
icelake     ,avx512      ,33554432    ,0           ,127
,5770248.0   ,5781104.9   ,1.0
icelake     ,avx512      ,33554432    ,0           ,255
,6019120.7   ,5836023.3   ,1.03
icelake     ,avx512      ,33554432    ,0           ,256
,6107033.4   ,5681798.8   ,1.07
icelake     ,avx512      ,33554432    ,0           ,4064
,5356238.5   ,5598521.5   ,0.96
skylake     ,sse2        ,65543       ,0           ,0
,3091.4      ,2940.2      ,1.05
skylake     ,sse2        ,65551       ,0           ,3
,3682.6      ,3403.7      ,1.08
skylake     ,sse2        ,65567       ,3           ,0
,3031.3      ,3070.2      ,0.99
skylake     ,sse2        ,65599       ,3           ,5
,3731.2      ,3718.7      ,1.0
skylake     ,sse2        ,65536       ,0           ,127
,3642.3      ,3390.5      ,1.07
skylake     ,sse2        ,65536       ,0           ,255
,3493.9      ,3333.0      ,1.05
skylake     ,sse2        ,65536       ,0           ,256
,3043.2      ,2981.0      ,1.02
skylake     ,sse2        ,65536       ,0           ,4064
,2796.6      ,2843.9      ,0.98
skylake     ,sse2        ,131079      ,0           ,0
,6347.4      ,6309.8      ,1.01
skylake     ,sse2        ,131087      ,0           ,3
,7318.4      ,7486.2      ,0.98
skylake     ,sse2        ,131103      ,3           ,0
,6297.4      ,6516.8      ,0.97
skylake     ,sse2        ,131135      ,3           ,5
,7544.5      ,7823.5      ,0.96
skylake     ,sse2        ,131072      ,0           ,127
,7426.4      ,7554.3      ,0.98
skylake     ,sse2        ,131072      ,0           ,255
,7349.0      ,7195.4      ,1.02
skylake     ,sse2        ,131072      ,0           ,256
,7068.1      ,6804.8      ,1.04
skylake     ,sse2        ,131072      ,0           ,4064
,6884.6      ,7566.7      ,0.91
skylake     ,sse2        ,262151      ,0           ,0
,15848.1     ,15552.2     ,1.02
skylake     ,sse2        ,262159      ,0           ,3
,17864.6     ,16787.9     ,1.06
skylake     ,sse2        ,262175      ,3           ,0
,15748.1     ,16266.0     ,0.97
skylake     ,sse2        ,262207      ,3           ,5
,17022.3     ,17229.8     ,0.99
skylake     ,sse2        ,262144      ,0           ,127
,16158.7     ,16093.6     ,1.0
skylake     ,sse2        ,262144      ,0           ,255
,15670.7     ,15949.2     ,0.98
skylake     ,sse2        ,262144      ,0           ,256
,14806.3     ,14970.3     ,0.99
skylake     ,sse2        ,262144      ,0           ,4064
,14751.7     ,15008.2     ,0.98
skylake     ,sse2        ,524295      ,0           ,0
,32874.8     ,33731.2     ,0.97
skylake     ,sse2        ,524303      ,0           ,3
,34035.1     ,34777.8     ,0.98
skylake     ,sse2        ,524319      ,3           ,0
,34325.6     ,34108.9     ,1.01
skylake     ,sse2        ,524351      ,3           ,5
,34853.5     ,35624.4     ,0.98
skylake     ,sse2        ,524288      ,0           ,127
,33437.4     ,33816.7     ,0.99
skylake     ,sse2        ,524288      ,0           ,255
,33256.1     ,33664.7     ,0.99
skylake     ,sse2        ,524288      ,0           ,256
,32006.3     ,32396.3     ,0.99
skylake     ,sse2        ,524288      ,0           ,4064
,32284.7     ,32713.9     ,0.99
skylake     ,sse2        ,1048583     ,0           ,0
,71891.7     ,73858.4     ,0.97
skylake     ,sse2        ,1048591     ,0           ,3
,74621.3     ,74389.7     ,1.0
skylake     ,sse2        ,1048607     ,3           ,0
,72515.0     ,73573.2     ,0.99
skylake     ,sse2        ,1048639     ,3           ,5
,72471.7     ,73782.6     ,0.98
skylake     ,sse2        ,1048576     ,0           ,127
,77638.6     ,82474.6     ,0.94
skylake     ,sse2        ,1048576     ,0           ,255
,71870.0     ,71933.6     ,1.0
skylake     ,sse2        ,1048576     ,0           ,256
,70410.0     ,73243.6     ,0.96
skylake     ,sse2        ,1048576     ,0           ,4064
,71267.1     ,72274.6     ,0.99
skylake     ,sse2        ,2097159     ,0           ,0
,140052.6    ,144880.1    ,0.97
skylake     ,sse2        ,2097167     ,0           ,3
,146626.5    ,147972.6    ,0.99
skylake     ,sse2        ,2097183     ,3           ,0
,141750.1    ,146353.6    ,0.97
skylake     ,sse2        ,2097215     ,3           ,5
,144169.0    ,148120.1    ,0.97
skylake     ,sse2        ,2097152     ,0           ,127
,156575.9    ,165844.4    ,0.94
skylake     ,sse2        ,2097152     ,0           ,255
,144277.7    ,146971.5    ,0.98
skylake     ,sse2        ,2097152     ,0           ,256
,143047.4    ,146810.9    ,0.97
skylake     ,sse2        ,2097152     ,0           ,4064
,142795.6    ,145805.8    ,0.98
skylake     ,sse2        ,4194311     ,0           ,0
,284353.3    ,298092.5    ,0.95
skylake     ,sse2        ,4194319     ,0           ,3
,296656.4    ,311960.2    ,0.95
skylake     ,sse2        ,4194335     ,3           ,0
,285922.6    ,304100.5    ,0.94
skylake     ,sse2        ,4194367     ,3           ,5
,297135.4    ,312532.5    ,0.95
skylake     ,sse2        ,4194304     ,0           ,127
,323938.6    ,340414.3    ,0.95
skylake     ,sse2        ,4194304     ,0           ,255
,301460.9    ,310042.7    ,0.97
skylake     ,sse2        ,4194304     ,0           ,256
,287155.8    ,303580.6    ,0.95
skylake     ,sse2        ,4194304     ,0           ,4064
,291006.2    ,302441.3    ,0.96
skylake     ,sse2        ,8388615     ,0           ,0
,714424.7    ,747484.3    ,0.96
skylake     ,sse2        ,8388623     ,0           ,3
,748995.5    ,774116.5    ,0.97
skylake     ,sse2        ,8388639     ,3           ,0
,720563.4    ,757386.9    ,0.95
skylake     ,sse2        ,8388671     ,3           ,5
,748028.7    ,773907.8    ,0.97
skylake     ,sse2        ,8388608     ,0           ,127
,750775.3    ,780245.2    ,0.96
skylake     ,sse2        ,8388608     ,0           ,255
,724940.3    ,764197.8    ,0.95
skylake     ,sse2        ,8388608     ,0           ,256
,722035.0    ,759408.9    ,0.95
skylake     ,sse2        ,8388608     ,0           ,4064
,756977.8    ,755532.4    ,1.0
skylake     ,sse2        ,16777223    ,0           ,0
,1971686.0   ,2111263.4   ,0.93
skylake     ,sse2        ,16777231    ,0           ,3
,1953608.9   ,2128493.8   ,0.92
skylake     ,sse2        ,16777247    ,3           ,0
,1967075.6   ,2103772.3   ,0.94
skylake     ,sse2        ,16777279    ,3           ,5
,1950851.6   ,2133601.6   ,0.91
skylake     ,sse2        ,16777216    ,0           ,127
,1991168.2   ,2078249.3   ,0.96
skylake     ,sse2        ,16777216    ,0           ,255
,1958502.9   ,2111955.5   ,0.93
skylake     ,sse2        ,16777216    ,0           ,256
,1965103.7   ,2114293.0   ,0.93
skylake     ,sse2        ,16777216    ,0           ,4064
,1958381.3   ,2103438.6   ,0.93
skylake     ,sse2        ,33554439    ,0           ,0
,4456144.2   ,4660837.1   ,0.96
skylake     ,sse2        ,33554447    ,0           ,3
,4431097.0   ,4679042.6   ,0.95
skylake     ,sse2        ,33554463    ,3           ,0
,4448225.6   ,4648538.3   ,0.96
skylake     ,sse2        ,33554495    ,3           ,5
,4427743.0   ,4678340.1   ,0.95
skylake     ,sse2        ,33554432    ,0           ,127
,4437517.3   ,4552005.9   ,0.97
skylake     ,sse2        ,33554432    ,0           ,255
,4427135.1   ,4543412.0   ,0.97
skylake     ,sse2        ,33554432    ,0           ,256
,4441311.2   ,4658315.5   ,0.95
skylake     ,sse2        ,33554432    ,0           ,4064
,4429798.4   ,4659499.6   ,0.95
skylake     ,avx         ,65543       ,0           ,0
,3115.8      ,3043.7      ,1.02
skylake     ,avx         ,65551       ,0           ,3
,3673.2      ,3551.7      ,1.03
skylake     ,avx         ,65567       ,3           ,0
,3024.6      ,2887.4      ,1.05
skylake     ,avx         ,65599       ,3           ,5
,3907.8      ,3636.4      ,1.07
skylake     ,avx         ,65536       ,0           ,127
,3539.2      ,3372.3      ,1.05
skylake     ,avx         ,65536       ,0           ,255
,3489.9      ,3344.0      ,1.04
skylake     ,avx         ,65536       ,0           ,256
,3059.0      ,2924.4      ,1.05
skylake     ,avx         ,65536       ,0           ,4064
,2805.0      ,2869.3      ,0.98
skylake     ,avx         ,131079      ,0           ,0
,6129.2      ,6263.4      ,0.98
skylake     ,avx         ,131087      ,0           ,3
,7096.8      ,7570.0      ,0.94
skylake     ,avx         ,131103      ,3           ,0
,6394.5      ,6842.5      ,0.93
skylake     ,avx         ,131135      ,3           ,5
,7462.8      ,7776.0      ,0.96
skylake     ,avx         ,131072      ,0           ,127
,7726.9      ,7428.5      ,1.04
skylake     ,avx         ,131072      ,0           ,255
,7167.4      ,7278.9      ,0.98
skylake     ,avx         ,131072      ,0           ,256
,7197.9      ,6284.3      ,1.15
skylake     ,avx         ,131072      ,0           ,4064
,6984.0      ,6940.4      ,1.01
skylake     ,avx         ,262151      ,0           ,0
,15787.3     ,16403.1     ,0.96
skylake     ,avx         ,262159      ,0           ,3
,17800.1     ,17628.1     ,1.01
skylake     ,avx         ,262175      ,3           ,0
,16622.8     ,16244.3     ,1.02
skylake     ,avx         ,262207      ,3           ,5
,16989.7     ,17509.0     ,0.97
skylake     ,avx         ,262144      ,0           ,127
,16190.8     ,15971.8     ,1.01
skylake     ,avx         ,262144      ,0           ,255
,15787.1     ,15876.7     ,0.99
skylake     ,avx         ,262144      ,0           ,256
,14840.1     ,14997.0     ,0.99
skylake     ,avx         ,262144      ,0           ,4064
,15743.0     ,14976.2     ,1.05
skylake     ,avx         ,524295      ,0           ,0
,32848.5     ,33397.8     ,0.98
skylake     ,avx         ,524303      ,0           ,3
,34872.1     ,34862.2     ,1.0
skylake     ,avx         ,524319      ,3           ,0
,33784.6     ,34023.8     ,0.99
skylake     ,avx         ,524351      ,3           ,5
,35337.1     ,35364.5     ,1.0
skylake     ,avx         ,524288      ,0           ,127
,33624.5     ,33596.5     ,1.0
skylake     ,avx         ,524288      ,0           ,255
,33390.7     ,33842.8     ,0.99
skylake     ,avx         ,524288      ,0           ,256
,31937.0     ,32357.2     ,0.99
skylake     ,avx         ,524288      ,0           ,4064
,32233.5     ,32267.3     ,1.0
skylake     ,avx         ,1048583     ,0           ,0
,100354.7    ,105840.6    ,0.95
skylake     ,avx         ,1048591     ,0           ,3
,68102.5     ,67496.0     ,1.01
skylake     ,avx         ,1048607     ,3           ,0
,66146.1     ,67540.0     ,0.98
skylake     ,avx         ,1048639     ,3           ,5
,67530.8     ,67726.4     ,1.0
skylake     ,avx         ,1048576     ,0           ,127
,67105.6     ,66533.5     ,1.01
skylake     ,avx         ,1048576     ,0           ,255
,67101.8     ,65666.7     ,1.02
skylake     ,avx         ,1048576     ,0           ,256
,65092.6     ,67103.0     ,0.97
skylake     ,avx         ,1048576     ,0           ,4064
,65700.0     ,67031.5     ,0.98
skylake     ,avx         ,2097159     ,0           ,0
,133101.0    ,135171.6    ,0.98
skylake     ,avx         ,2097167     ,0           ,3
,134174.4    ,135782.1    ,0.99
skylake     ,avx         ,2097183     ,3           ,0
,132056.4    ,134170.0    ,0.98
skylake     ,avx         ,2097215     ,3           ,5
,134413.5    ,136341.1    ,0.99
skylake     ,avx         ,2097152     ,0           ,127
,133003.9    ,132992.1    ,1.0
skylake     ,avx         ,2097152     ,0           ,255
,133344.3    ,132883.1    ,1.0
skylake     ,avx         ,2097152     ,0           ,256
,134051.7    ,136185.8    ,0.98
skylake     ,avx         ,2097152     ,0           ,4064
,132976.3    ,135029.4    ,0.98
skylake     ,avx         ,4194311     ,0           ,0
,268004.1    ,282650.3    ,0.95
skylake     ,avx         ,4194319     ,0           ,3
,270270.0    ,286700.3    ,0.94
skylake     ,avx         ,4194335     ,3           ,0
,264288.5    ,279582.4    ,0.95
skylake     ,avx         ,4194367     ,3           ,5
,270498.4    ,286294.5    ,0.94
skylake     ,avx         ,4194304     ,0           ,127
,271219.3    ,275129.8    ,0.99
skylake     ,avx         ,4194304     ,0           ,255
,269996.5    ,270227.6    ,1.0
skylake     ,avx         ,4194304     ,0           ,256
,267901.1    ,281673.1    ,0.95
skylake     ,avx         ,4194304     ,0           ,4064
,268390.0    ,279100.3    ,0.96
skylake     ,avx         ,8388615     ,0           ,0
,803547.9    ,813229.9    ,0.99
skylake     ,avx         ,8388623     ,0           ,3
,828872.4    ,869413.0    ,0.95
skylake     ,avx         ,8388639     ,3           ,0
,818000.0    ,873781.7    ,0.94
skylake     ,avx         ,8388671     ,3           ,5
,824679.0    ,863561.5    ,0.95
skylake     ,avx         ,8388608     ,0           ,127
,800728.5    ,779000.8    ,1.03
skylake     ,avx         ,8388608     ,0           ,255
,820071.4    ,770113.2    ,1.06
skylake     ,avx         ,8388608     ,0           ,256
,825624.6    ,867247.7    ,0.95
skylake     ,avx         ,8388608     ,0           ,4064
,830209.7    ,894086.6    ,0.93
skylake     ,avx         ,16777223    ,0           ,0
,1989391.3   ,2132829.8   ,0.93
skylake     ,avx         ,16777231    ,0           ,3
,1994225.1   ,2211556.0   ,0.9
skylake     ,avx         ,16777247    ,3           ,0
,1993572.9   ,2213029.9   ,0.9
skylake     ,avx         ,16777279    ,3           ,5
,2001956.9   ,2211769.7   ,0.91
skylake     ,avx         ,16777216    ,0           ,127
,1968155.9   ,2127764.7   ,0.92
skylake     ,avx         ,16777216    ,0           ,255
,1978305.1   ,2121371.3   ,0.93
skylake     ,avx         ,16777216    ,0           ,256
,1993261.9   ,2206494.1   ,0.9
skylake     ,avx         ,16777216    ,0           ,4064
,1993808.3   ,2198137.4   ,0.91
skylake     ,avx         ,33554439    ,0           ,0
,4540216.7   ,4870021.8   ,0.93
skylake     ,avx         ,33554447    ,0           ,3
,4483505.3   ,4850545.5   ,0.92
skylake     ,avx         ,33554463    ,3           ,0
,4501944.5   ,4870922.4   ,0.92
skylake     ,avx         ,33554495    ,3           ,5
,4484565.5   ,4845392.4   ,0.93
skylake     ,avx         ,33554432    ,0           ,127
,4408639.3   ,4701698.6   ,0.94
skylake     ,avx         ,33554432    ,0           ,255
,4445826.0   ,4678142.9   ,0.95
skylake     ,avx         ,33554432    ,0           ,256
,4497953.2   ,4844498.6   ,0.93
skylake     ,avx         ,33554432    ,0           ,4064
,4501572.4   ,4839209.4   ,0.93

>
> --
> H.J.
  
Noah Goldstein April 3, 2021, 7:44 p.m. UTC | #3
The last message got formatted weirdly. Here is a file with the data.

On Sat, Apr 3, 2021 at 3:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Sat, Apr 3, 2021 at 1:46 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sat, Apr 3, 2021 at 1:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > From: noah <goldstein.w.n@gmail.com>
> > >
> > > No Bug. This commit updates the large memcpy case (no overlap). The
> > > update is to perform memcpy on either 2 or 4 contiguous pages at
> > > once. This 1) helps to alleviate the effects of false memory aliasing
> > > when destination and source have a close 4k alignment and 2) In most
> > > cases and for most DRAM units is a modestly more efficient access
> > > pattern. These changes are a clear performance improvement for
> > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > > pass.
> > >
> > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > ---
> > > Issue was alignment-related AFAICT. Added `.p2align 4` in front of the
> > > loops and no longer see any meaningful regression.
> > >
> > > Also added back the temporal stores for the tail. Saw a regression
> > > when doing these tests.
> > >
> > > Two tables below for skylake and icelake numbers for the areas around
> > > where you saw the regression. Below is all data from the tests.
> > >
> > > N = 10.
> > >
> > > Skylake
> > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > 4103        ,0           ,64          ,84.5        ,88.6
> > > 4111        ,0           ,3           ,99.0        ,99.9
> > > 4127        ,3           ,0           ,102.1       ,102.3
> > > 4159        ,3           ,7           ,88.7        ,90.9
> > > 4223        ,9           ,5           ,88.1        ,87.4
> > > 8199        ,0           ,64          ,146.7       ,150.2
> > > 8207        ,0           ,3           ,167.9       ,168.5
> > > 8223        ,3           ,0           ,168.5       ,168.1
> > > 8255        ,3           ,7           ,157.0       ,159.2
> > > 8319        ,9           ,5           ,155.5       ,155.7
> > > 16391       ,0           ,64          ,286.2       ,288.8
> > > 16399       ,0           ,3           ,307.0       ,308.7
> > > 16415       ,3           ,0           ,307.4       ,307.6
> > > 16447       ,3           ,7           ,294.6       ,295.5
> > > 16511       ,9           ,5           ,291.5       ,462.1
> > > 32775       ,0           ,64          ,603.4       ,601.5
> > > 32783       ,0           ,3           ,604.8       ,606.4
> > > 32799       ,3           ,0           ,603.0       ,604.1
> > > 32831       ,3           ,7           ,600.2       ,737.3
> > > 32895       ,9           ,5           ,604.4       ,599.5
> > > 65543       ,0           ,64          ,1873.5      ,1854.3
> > > 65551       ,0           ,3           ,1862.9      ,1846.6
> > > 65567       ,3           ,0           ,1885.5      ,1966.0
> > > 65599       ,3           ,7           ,1833.2      ,1833.1
> > > 65663       ,9           ,5           ,1884.9      ,1887.4
> > > 131079      ,0           ,64          ,3944.3      ,3949.4
> > > 131087      ,0           ,3           ,3927.3      ,3913.3
> > > 131103      ,3           ,0           ,4415.8      ,4169.4
> > > 131135      ,3           ,7           ,4224.5      ,4157.6
> > > 131199      ,9           ,5           ,5974.0      ,4983.8
> > > 262151      ,0           ,64          ,11050.2     ,10620.6
> > > 262159      ,0           ,3           ,9932.8      ,10037.3
> > > 262175      ,3           ,0           ,10188.8     ,9206.6
> > > 262207      ,3           ,7           ,9633.3      ,9216.7
> > > 262271      ,9           ,5           ,9732.7      ,9345.3
> > > 524295      ,0           ,64          ,24823.9     ,24880.7
> > > 524303      ,0           ,3           ,24514.0     ,24556.7
> > > 524319      ,3           ,0           ,23974.4     ,24219.9
> > > 524351      ,3           ,7           ,24159.7     ,24207.0
> > > 524415      ,9           ,5           ,23946.5     ,24142.8
> > >
> > > Icelake:
> > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > 4103        ,0           ,64          ,50.2        ,63.7
> > > 4111        ,0           ,3           ,63.7        ,65.1
> > > 4127        ,3           ,0           ,68.2        ,69.4
> > > 4159        ,3           ,7           ,59.6        ,68.0
> > > 4223        ,9           ,5           ,68.2        ,66.8
> > > 8199        ,0           ,64          ,92.1        ,89.9
> > > 8207        ,0           ,3           ,119.7       ,118.3
> > > 8223        ,3           ,0           ,119.1       ,120.9
> > > 8255        ,3           ,7           ,122.9       ,123.7
> > > 8319        ,9           ,5           ,122.1       ,121.8
> > > 16391       ,0           ,64          ,162.7       ,158.0
> > > 16399       ,0           ,3           ,227.6       ,234.1
> > > 16415       ,3           ,0           ,230.8       ,232.7
> > > 16447       ,3           ,7           ,226.8       ,232.6
> > > 16511       ,9           ,5           ,233.4       ,233.8
> > > 32775       ,0           ,64          ,312.2       ,301.8
> > > 32783       ,0           ,3           ,449.7       ,450.0
> > > 32799       ,3           ,0           ,452.7       ,455.9
> > > 32831       ,3           ,7           ,449.8       ,458.0
> > > 32895       ,9           ,5           ,456.3       ,459.4
> > > 65543       ,0           ,64          ,1460.6      ,1463.9
> > > 65551       ,0           ,3           ,1462.0      ,1465.4
> > > 65567       ,3           ,0           ,1466.6      ,1480.4
> > > 65599       ,3           ,7           ,1488.0      ,1488.9
> > > 65663       ,9           ,5           ,1680.8      ,1499.5
> > > 131079      ,0           ,64          ,2988.5      ,3010.1
> > > 131087      ,0           ,3           ,2995.5      ,2996.4
> > > 131103      ,3           ,0           ,3006.2      ,3000.5
> > > 131135      ,3           ,7           ,3032.4      ,3073.7
> > > 131199      ,9           ,5           ,3010.4      ,3027.4
> > > 262151      ,0           ,64          ,6143.2      ,6079.1
> > > 262159      ,0           ,3           ,6085.1      ,6075.8
> > > 262175      ,3           ,0           ,6088.0      ,6064.9
> > > 262207      ,3           ,7           ,6018.7      ,6023.5
> > > 262271      ,9           ,5           ,6019.8      ,5959.2
> > > 524295      ,0           ,64          ,14464.2     ,14095.1
> > > 524303      ,0           ,3           ,14761.6     ,14050.2
> > > 524319      ,3           ,0           ,14534.1     ,14087.5
> > > 524351      ,3           ,7           ,14147.7     ,13903.8
> > > 524415      ,9           ,5           ,14157.0     ,13982.9
> > >
> > >
> > >
> > > cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean
> > > skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> > > skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> > > skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> > > skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> > > skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> > > skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> > > skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> > > skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> > > skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> > > skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> > > skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> > > skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> > > skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> > > skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> > > skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> > > skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> > > skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> > > skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> > > skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> > > skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> > > skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> > > skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> > > skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> > > skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> > > skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> > > skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> > > skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> > > skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> > > skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> > > skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> > > skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> > > skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> > > skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> > > skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> > > skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> > > skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> > > skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> > > skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> > > skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> > > skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> > > skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> > > skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> > > skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> > > skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> > > skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> > > skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> > > skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> > > skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> > > skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> > > skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> > > skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> > > skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> > > skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> > > skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> > > skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> > > skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> > > skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> > > skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> > > skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> > > skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> > > skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> > > skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> > > skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> > > skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> > > skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> > > skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> > > skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> > > skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> > > skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> > > skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> > > skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> > > skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> > > skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> > > skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> > > skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> > > skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> > > skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> > > skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> > > skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> > > skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> > > skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> > > skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> > > skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> > > skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> > > skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> > > skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> > > skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> > > skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> > > skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> > > skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> > > skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> > > skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> > > skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> > > skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> > > skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> > > skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> > > skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> > > skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> > > skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> > > skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> > > skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> > > skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> > > skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> > > skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> > > skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> > > skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> > > skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> > > skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> > > skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> > > skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> > > skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> > > skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> > > skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> > > skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> > > skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> > > skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> > > skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> > > skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> > > skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> > > skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> > > skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> > > skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> > > skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> > > skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> > > skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> > > skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> > > skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> > > skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> > > skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> > > skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> > > icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> > > icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> > > icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> > > icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> > > icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> > > icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> > > icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> > > icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> > > icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> > > icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> > > icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> > > icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> > > icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> > > icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> > > icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> > > icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> > > icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> > > icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> > > icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> > > icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> > > icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> > > icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> > > icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> > > icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> > > icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> > > icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> > > icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> > > icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> > > icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> > > icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> > > icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> > > icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> > > icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> > > icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> > > icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> > > icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> > > icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> > > icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> > > icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> > > icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> > > icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> > > icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> > > icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> > > icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> > > icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> > > icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> > > icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> > > icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> > > icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> > > icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> > > icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> > > icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> > > icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> > > icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> > > icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> > > icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> > > icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> > > icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> > > icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> > > icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> > > icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> > > icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> > > icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> > > icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> > > icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> > > icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> > > icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> > > icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> > > icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> > > icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> > > icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> > > icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> > > icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> > > icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> > > icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> > > icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> > > icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> > > icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> > > icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> > > icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> > > icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> > > icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> > > icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> > > icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> > > icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> > > icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> > > icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> > > icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> > > icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> > > icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> > > icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> > > icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> > > icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> > > icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> > > icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> > > icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> > > icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> > > icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> > > icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> > > icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> > > icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> > > icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> > > icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> > > icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> > > icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> > > icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> > > icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> > > icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> > > icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> > > icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> > > icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> > > icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> > > icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> > > icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> > > icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> > > icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> > > icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> > > icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> > > icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> > > icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> > > icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> > > icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> > > icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> > > icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> > > icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> > > icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> > > icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> > > icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> > > icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> > > icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> > > icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> > > icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> > > icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> > > icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> > > icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> > > icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> > > icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> > > icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> > > icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> > > icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> > > icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> > > icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> > > icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> > > icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> > > icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> > > icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> > > icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> > > icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> > > icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> > > icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> > > icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> > > icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> > > icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> > > icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> > > icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> > > icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> > > icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> > > icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> > > icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> > > icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> > > icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> > > icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> > > icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> > > icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> > > icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> > > icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> > > icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> > > icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> > > icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> > > icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> > > icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> > > icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> > > icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> > > icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> > > icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> > > icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> > > icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> > > icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> > > icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> > > icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> > > icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> > > icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> > > icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> > > icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> > > icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> > > icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> > > icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> > > icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> > > icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> > > icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> > > icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> > > icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> > > icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> > > icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> > > icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
> > >
> >
> > I am having a hard time convincing myself that this patch is really necessary.
> > What are the geomeans of all the different cases for each processor?
>
> N = 100, geometric mean of Current vs. New for bench-memcpy-large. Note that
> the bench-memmove-large numbers should be unaffected by this patch, as the
> new logic only applies to the no-overlap case.
>
> cpu         ,inst        ,Len         ,align1      ,align2
> ,new geomean ,cur geomean ,New/Cur
> icelake     ,sse2        ,65543       ,0           ,0
> ,5566.1      ,5564.7      ,1.0
> icelake     ,sse2        ,65551       ,0           ,3
> ,5856.4      ,5725.7      ,1.02
> icelake     ,sse2        ,65567       ,3           ,0
> ,5622.8      ,5892.9      ,0.95
> icelake     ,sse2        ,65599       ,3           ,5
> ,5857.3      ,5723.8      ,1.02
> icelake     ,sse2        ,65536       ,0           ,127
> ,5953.3      ,5831.1      ,1.02
> icelake     ,sse2        ,65536       ,0           ,255
> ,5811.7      ,5789.5      ,1.0
> icelake     ,sse2        ,65536       ,0           ,256
> ,5373.5      ,5284.1      ,1.02
> icelake     ,sse2        ,65536       ,0           ,4064
> ,5820.1      ,5761.6      ,1.01
> icelake     ,sse2        ,131079      ,0           ,0
> ,12421.5     ,12424.1     ,1.0
> icelake     ,sse2        ,131087      ,0           ,3
> ,12389.5     ,12276.4     ,1.01
> icelake     ,sse2        ,131103      ,3           ,0
> ,11587.0     ,12607.6     ,0.92
> icelake     ,sse2        ,131135      ,3           ,5
> ,11596.9     ,11896.2     ,0.97
> icelake     ,sse2        ,131072      ,0           ,127
> ,11746.4     ,12490.1     ,0.94
> icelake     ,sse2        ,131072      ,0           ,255
> ,11486.8     ,11831.7     ,0.97
> icelake     ,sse2        ,131072      ,0           ,256
> ,10453.5     ,10451.7     ,1.0
> icelake     ,sse2        ,131072      ,0           ,4064
> ,11231.7     ,11223.6     ,1.0
> icelake     ,sse2        ,262151      ,0           ,0
> ,29408.5     ,30831.2     ,0.95
> icelake     ,sse2        ,262159      ,0           ,3
> ,30813.6     ,32235.6     ,0.96
> icelake     ,sse2        ,262175      ,3           ,0
> ,30245.0     ,31392.5     ,0.96
> icelake     ,sse2        ,262207      ,3           ,5
> ,30775.6     ,32298.6     ,0.95
> icelake     ,sse2        ,262144      ,0           ,127
> ,31784.7     ,32791.5     ,0.97
> icelake     ,sse2        ,262144      ,0           ,255
> ,30726.0     ,31997.5     ,0.96
> icelake     ,sse2        ,262144      ,0           ,256
> ,28418.9     ,29440.9     ,0.97
> icelake     ,sse2        ,262144      ,0           ,4064
> ,29984.1     ,31048.9     ,0.97
> icelake     ,sse2        ,524295      ,0           ,0
> ,76079.0     ,75752.0     ,1.0
> icelake     ,sse2        ,524303      ,0           ,3
> ,79939.3     ,80796.4     ,0.99
> icelake     ,sse2        ,524319      ,3           ,0
> ,79018.1     ,79928.5     ,0.99
> icelake     ,sse2        ,524351      ,3           ,5
> ,81219.4     ,81053.8     ,1.0
> icelake     ,sse2        ,524288      ,0           ,127
> ,80111.8     ,80087.2     ,1.0
> icelake     ,sse2        ,524288      ,0           ,255
> ,79334.0     ,79525.6     ,1.0
> icelake     ,sse2        ,524288      ,0           ,256
> ,75766.9     ,75918.9     ,1.0
> icelake     ,sse2        ,524288      ,0           ,4064
> ,78907.9     ,79550.8     ,0.99
> icelake     ,sse2        ,1048583     ,0           ,0
> ,144672.6    ,147457.7    ,0.98
> icelake     ,sse2        ,1048591     ,0           ,3
> ,173803.9    ,400563.2    ,0.43
> icelake     ,sse2        ,1048607     ,3           ,0
> ,149391.9    ,151772.1    ,0.98
> icelake     ,sse2        ,1048639     ,3           ,5
> ,174774.1    ,400657.4    ,0.44
> icelake     ,sse2        ,1048576     ,0           ,127
> ,175350.9    ,347110.6    ,0.51
> icelake     ,sse2        ,1048576     ,0           ,255
> ,150152.6    ,144242.9    ,1.04
> icelake     ,sse2        ,1048576     ,0           ,256
> ,145869.7    ,147489.6    ,0.99
> icelake     ,sse2        ,1048576     ,0           ,4064
> ,145814.7    ,147497.7    ,0.99
> icelake     ,sse2        ,2097159     ,0           ,0
> ,289460.6    ,295574.6    ,0.98
> icelake     ,sse2        ,2097167     ,0           ,3
> ,347057.0    ,799549.1    ,0.43
> icelake     ,sse2        ,2097183     ,3           ,0
> ,298565.7    ,301424.3    ,0.99
> icelake     ,sse2        ,2097215     ,3           ,5
> ,348620.4    ,797557.4    ,0.44
> icelake     ,sse2        ,2097152     ,0           ,127
> ,348751.4    ,695260.9    ,0.5
> icelake     ,sse2        ,2097152     ,0           ,255
> ,298960.5    ,286590.0    ,1.04
> icelake     ,sse2        ,2097152     ,0           ,256
> ,290978.4    ,293225.6    ,0.99
> icelake     ,sse2        ,2097152     ,0           ,4064
> ,290476.0    ,292283.2    ,0.99
> icelake     ,sse2        ,4194311     ,0           ,0
> ,583386.3    ,588284.3    ,0.99
> icelake     ,sse2        ,4194319     ,0           ,3
> ,703870.5    ,1595268.0   ,0.44
> icelake     ,sse2        ,4194335     ,3           ,0
> ,599400.2    ,601591.6    ,1.0
> icelake     ,sse2        ,4194367     ,3           ,5
> ,694569.7    ,1595608.0   ,0.44
> icelake     ,sse2        ,4194304     ,0           ,127
> ,700229.1    ,1389061.9   ,0.5
> icelake     ,sse2        ,4194304     ,0           ,255
> ,600779.0    ,573361.2    ,1.05
> icelake     ,sse2        ,4194304     ,0           ,256
> ,586610.7    ,589269.6    ,1.0
> icelake     ,sse2        ,4194304     ,0           ,4064
> ,583616.3    ,584806.4    ,1.0
> icelake     ,sse2        ,8388615     ,0           ,0
> ,1214632.8   ,1266616.0   ,0.96
> icelake     ,sse2        ,8388623     ,0           ,3
> ,1405136.9   ,3198827.1   ,0.44
> icelake     ,sse2        ,8388639     ,3           ,0
> ,1244302.6   ,1297425.9   ,0.96
> icelake     ,sse2        ,8388671     ,3           ,5
> ,1404685.1   ,3196389.9   ,0.44
> icelake     ,sse2        ,8388608     ,0           ,127
> ,1419888.5   ,2792729.4   ,0.51
> icelake     ,sse2        ,8388608     ,0           ,255
> ,1249044.6   ,1259726.7   ,0.99
> icelake     ,sse2        ,8388608     ,0           ,256
> ,1234471.9   ,1300463.6   ,0.95
> icelake     ,sse2        ,8388608     ,0           ,4064
> ,1220102.2   ,1265190.5   ,0.96
> icelake     ,sse2        ,16777223    ,0           ,0
> ,2689516.3   ,2846521.1   ,0.94
> icelake     ,sse2        ,16777231    ,0           ,3
> ,3001317.4   ,6428733.7   ,0.47
> icelake     ,sse2        ,16777247    ,3           ,0
> ,2770040.8   ,2910434.9   ,0.95
> icelake     ,sse2        ,16777279    ,3           ,5
> ,3002076.1   ,6415835.9   ,0.47
> icelake     ,sse2        ,16777216    ,0           ,127
> ,3063786.3   ,5609895.3   ,0.55
> icelake     ,sse2        ,16777216    ,0           ,255
> ,2821606.1   ,2833843.6   ,1.0
> icelake     ,sse2        ,16777216    ,0           ,256
> ,2719765.5   ,2925344.2   ,0.93
> icelake     ,sse2        ,16777216    ,0           ,4064
> ,2686189.2   ,2848017.5   ,0.94
> icelake     ,sse2        ,33554439    ,0           ,0
> ,5577945.0   ,5913674.6   ,0.94
> icelake     ,sse2        ,33554447    ,0           ,3
> ,6152758.8   ,12863855.0  ,0.48
> icelake     ,sse2        ,33554463    ,3           ,0
> ,5773351.4   ,6035289.3   ,0.96
> icelake     ,sse2        ,33554495    ,3           ,5
> ,6160006.2   ,12878153.9  ,0.48
> icelake     ,sse2        ,33554432    ,0           ,127
> ,6303495.4   ,11221070.2  ,0.56
> icelake     ,sse2        ,33554432    ,0           ,255
> ,5830879.6   ,5944978.6   ,0.98
> icelake     ,sse2        ,33554432    ,0           ,256
> ,5611968.2   ,6068255.4   ,0.92
> icelake     ,sse2        ,33554432    ,0           ,4064
> ,5570321.0   ,5964542.6   ,0.93
> icelake     ,avx         ,65543       ,0           ,0
> ,5561.1      ,5659.7      ,0.98
> icelake     ,avx         ,65551       ,0           ,3
> ,5859.9      ,5724.8      ,1.02
> icelake     ,avx         ,65567       ,3           ,0
> ,5636.7      ,5623.3      ,1.0
> icelake     ,avx         ,65599       ,3           ,5
> ,5856.3      ,5720.2      ,1.02
> icelake     ,avx         ,65536       ,0           ,127
> ,6011.1      ,5910.0      ,1.02
> icelake     ,avx         ,65536       ,0           ,255
> ,5854.5      ,5792.3      ,1.01
> icelake     ,avx         ,65536       ,0           ,256
> ,5213.0      ,5273.9      ,0.99
> icelake     ,avx         ,65536       ,0           ,4064
> ,5760.7      ,5661.1      ,1.02
> icelake     ,avx         ,131079      ,0           ,0
> ,12371.4     ,12707.0     ,0.97
> icelake     ,avx         ,131087      ,0           ,3
> ,13220.1     ,12515.7     ,1.06
> icelake     ,avx         ,131103      ,3           ,0
> ,11628.2     ,11546.9     ,1.01
> icelake     ,avx         ,131135      ,3           ,5
> ,13025.7     ,13967.6     ,0.93
> icelake     ,avx         ,131072      ,0           ,127
> ,11781.7     ,11936.4     ,0.99
> icelake     ,avx         ,131072      ,0           ,255
> ,11802.2     ,11583.9     ,1.02
> icelake     ,avx         ,131072      ,0           ,256
> ,10436.9     ,10693.1     ,0.98
> icelake     ,avx         ,131072      ,0           ,4064
> ,11880.9     ,11395.6     ,1.04
> icelake     ,avx         ,262151      ,0           ,0
> ,29132.6     ,30542.8     ,0.95
> icelake     ,avx         ,262159      ,0           ,3
> ,30533.5     ,31468.8     ,0.97
> icelake     ,avx         ,262175      ,3           ,0
> ,29879.5     ,30933.7     ,0.97
> icelake     ,avx         ,262207      ,3           ,5
> ,30263.1     ,31445.0     ,0.96
> icelake     ,avx         ,262144      ,0           ,127
> ,30180.9     ,31405.3     ,0.96
> icelake     ,avx         ,262144      ,0           ,255
> ,30152.9     ,31372.5     ,0.96
> icelake     ,avx         ,262144      ,0           ,256
> ,28121.9     ,28990.9     ,0.97
> icelake     ,avx         ,262144      ,0           ,4064
> ,29785.2     ,31078.4     ,0.96
> icelake     ,avx         ,524295      ,0           ,0
> ,76045.7     ,75824.3     ,1.0
> icelake     ,avx         ,524303      ,0           ,3
> ,79303.7     ,80433.3     ,0.99
> icelake     ,avx         ,524319      ,3           ,0
> ,79323.8     ,79411.3     ,1.0
> icelake     ,avx         ,524351      ,3           ,5
> ,79797.9     ,80179.4     ,1.0
> icelake     ,avx         ,524288      ,0           ,127
> ,80046.7     ,80254.1     ,1.0
> icelake     ,avx         ,524288      ,0           ,255
> ,78580.6     ,79210.4     ,0.99
> icelake     ,avx         ,524288      ,0           ,256
> ,75464.4     ,75184.2     ,1.0
> icelake     ,avx         ,524288      ,0           ,4064
> ,78863.6     ,78677.9     ,1.0
> icelake     ,avx         ,1048583     ,0           ,0
> ,131017.9    ,133962.4    ,0.98
> icelake     ,avx         ,1048591     ,0           ,3
> ,143451.3    ,210311.7    ,0.68
> icelake     ,avx         ,1048607     ,3           ,0
> ,136944.0    ,138426.4    ,0.99
> icelake     ,avx         ,1048639     ,3           ,5
> ,143594.3    ,209887.9    ,0.68
> icelake     ,avx         ,1048576     ,0           ,127
> ,156462.0    ,218873.2    ,0.71
> icelake     ,avx         ,1048576     ,0           ,255
> ,148026.3    ,179419.0    ,0.83
> icelake     ,avx         ,1048576     ,0           ,256
> ,143365.7    ,137816.3    ,1.04
> icelake     ,avx         ,1048576     ,0           ,4064
> ,131683.4    ,132731.6    ,0.99
> icelake     ,avx         ,2097159     ,0           ,0
> ,263807.1    ,267984.5    ,0.98
> icelake     ,avx         ,2097167     ,0           ,3
> ,286949.8    ,422279.2    ,0.68
> icelake     ,avx         ,2097183     ,3           ,0
> ,274675.6    ,276702.2    ,0.99
> icelake     ,avx         ,2097215     ,3           ,5
> ,286681.7    ,420176.7    ,0.68
> icelake     ,avx         ,2097152     ,0           ,127
> ,314499.2    ,437864.2    ,0.72
> icelake     ,avx         ,2097152     ,0           ,255
> ,297458.4    ,359520.9    ,0.83
> icelake     ,avx         ,2097152     ,0           ,256
> ,285883.2    ,276043.2    ,1.04
> icelake     ,avx         ,2097152     ,0           ,4064
> ,263436.6    ,265516.6    ,0.99
> icelake     ,avx         ,4194311     ,0           ,0
> ,529119.4    ,536745.2    ,0.99
> icelake     ,avx         ,4194319     ,0           ,3
> ,573960.0    ,839002.3    ,0.68
> icelake     ,avx         ,4194335     ,3           ,0
> ,550617.2    ,553117.5    ,1.0
> icelake     ,avx         ,4194367     ,3           ,5
> ,572742.8    ,838784.5    ,0.68
> icelake     ,avx         ,4194304     ,0           ,127
> ,629413.6    ,876512.1    ,0.72
> icelake     ,avx         ,4194304     ,0           ,255
> ,594224.1    ,717425.1    ,0.83
> icelake     ,avx         ,4194304     ,0           ,256
> ,573365.0    ,552538.3    ,1.04
> icelake     ,avx         ,4194304     ,0           ,4064
> ,527459.3    ,531907.1    ,0.99
> icelake     ,avx         ,8388615     ,0           ,0
> ,1094256.8   ,1145619.9   ,0.96
> icelake     ,avx         ,8388623     ,0           ,3
> ,1170367.1   ,1700076.4   ,0.69
> icelake     ,avx         ,8388639     ,3           ,0
> ,1136168.1   ,1174752.4   ,0.97
> icelake     ,avx         ,8388671     ,3           ,5
> ,1172015.6   ,1703032.8   ,0.69
> icelake     ,avx         ,8388608     ,0           ,127
> ,1276748.6   ,1771351.9   ,0.72
> icelake     ,avx         ,8388608     ,0           ,255
> ,1207712.0   ,1449267.0   ,0.83
> icelake     ,avx         ,8388608     ,0           ,256
> ,1167958.9   ,1178243.1   ,0.99
> icelake     ,avx         ,8388608     ,0           ,4064
> ,1106155.9   ,1145128.6   ,0.97
> icelake     ,avx         ,16777223    ,0           ,0
> ,2479317.5   ,2630301.0   ,0.94
> icelake     ,avx         ,16777231    ,0           ,3
> ,2643303.6   ,3536980.7   ,0.75
> icelake     ,avx         ,16777247    ,3           ,0
> ,2571967.0   ,2672246.4   ,0.96
> icelake     ,avx         ,16777279    ,3           ,5
> ,2641320.5   ,3538388.9   ,0.75
> icelake     ,avx         ,16777216    ,0           ,127
> ,2832921.6   ,3593702.5   ,0.79
> icelake     ,avx         ,16777216    ,0           ,255
> ,2700272.1   ,3025346.1   ,0.89
> icelake     ,avx         ,16777216    ,0           ,256
> ,2622133.7   ,2709087.6   ,0.97
> icelake     ,avx         ,16777216    ,0           ,4064
> ,2475020.7   ,2610977.8   ,0.95
> icelake     ,avx         ,33554439    ,0           ,0
> ,5190103.1   ,5576047.9   ,0.93
> icelake     ,avx         ,33554447    ,0           ,3
> ,5477752.1   ,7215479.2   ,0.76
> icelake     ,avx         ,33554463    ,3           ,0
> ,5338711.7   ,5625026.7   ,0.95
> icelake     ,avx         ,33554495    ,3           ,5
> ,5505164.8   ,7223660.8   ,0.76
> icelake     ,avx         ,33554432    ,0           ,127
> ,5859232.3   ,7279581.9   ,0.8
> icelake     ,avx         ,33554432    ,0           ,255
> ,5681634.7   ,6156488.6   ,0.92
> icelake     ,avx         ,33554432    ,0           ,256
> ,5440721.4   ,5728347.4   ,0.95
> icelake     ,avx         ,33554432    ,0           ,4064
> ,5191213.2   ,5538716.4   ,0.94
> icelake     ,avx512      ,65543       ,0           ,0
> ,5563.5      ,5634.1      ,0.99
> icelake     ,avx512      ,65551       ,0           ,3
> ,5864.1      ,5728.4      ,1.02
> icelake     ,avx512      ,65567       ,3           ,0
> ,5720.2      ,5625.3      ,1.02
> icelake     ,avx512      ,65599       ,3           ,5
> ,5857.2      ,5722.0      ,1.02
> icelake     ,avx512      ,65536       ,0           ,127
> ,6040.7      ,5844.0      ,1.03
> icelake     ,avx512      ,65536       ,0           ,255
> ,5826.5      ,5799.6      ,1.0
> icelake     ,avx512      ,65536       ,0           ,256
> ,5234.4      ,5230.0      ,1.0
> icelake     ,avx512      ,65536       ,0           ,4064
> ,5800.7      ,5655.4      ,1.03
> icelake     ,avx512      ,131079      ,0           ,0
> ,12591.4     ,11767.1     ,1.07
> icelake     ,avx512      ,131087      ,0           ,3
> ,12694.9     ,12292.1     ,1.03
> icelake     ,avx512      ,131103      ,3           ,0
> ,11374.7     ,12236.3     ,0.93
> icelake     ,avx512      ,131135      ,3           ,5
> ,11958.2     ,11745.5     ,1.02
> icelake     ,avx512      ,131072      ,0           ,127
> ,11803.4     ,11908.6     ,0.99
> icelake     ,avx512      ,131072      ,0           ,255
> ,11569.0     ,11487.9     ,1.01
> icelake     ,avx512      ,131072      ,0           ,256
> ,11087.6     ,10456.4     ,1.06
> icelake     ,avx512      ,131072      ,0           ,4064
> ,11166.0     ,11248.2     ,0.99
> icelake     ,avx512      ,262151      ,0           ,0
> ,30232.1     ,29932.7     ,1.01
> icelake     ,avx512      ,262159      ,0           ,3
> ,30093.8     ,31315.1     ,0.96
> icelake     ,avx512      ,262175      ,3           ,0
> ,30147.7     ,30643.4     ,0.98
> icelake     ,avx512      ,262207      ,3           ,5
> ,29985.9     ,31479.8     ,0.95
> icelake     ,avx512      ,262144      ,0           ,127
> ,30099.7     ,31552.9     ,0.95
> icelake     ,avx512      ,262144      ,0           ,255
> ,29772.8     ,30698.1     ,0.97
> icelake     ,avx512      ,262144      ,0           ,256
> ,28109.3     ,28957.9     ,0.97
> icelake     ,avx512      ,262144      ,0           ,4064
> ,29787.5     ,30637.2     ,0.97
> icelake     ,avx512      ,524295      ,0           ,0
> ,75920.7     ,75047.1     ,1.01
> icelake     ,avx512      ,524303      ,0           ,3
> ,79218.6     ,79529.2     ,1.0
> icelake     ,avx512      ,524319      ,3           ,0
> ,78446.9     ,78550.7     ,1.0
> icelake     ,avx512      ,524351      ,3           ,5
> ,79055.0     ,79425.2     ,1.0
> icelake     ,avx512      ,524288      ,0           ,127
> ,79070.6     ,79626.7     ,0.99
> icelake     ,avx512      ,524288      ,0           ,255
> ,77891.8     ,78078.3     ,1.0
> icelake     ,avx512      ,524288      ,0           ,256
> ,74797.3     ,74436.9     ,1.0
> icelake     ,avx512      ,524288      ,0           ,4064
> ,78339.3     ,78337.2     ,1.0
> icelake     ,avx512      ,1048583     ,0           ,0
> ,131427.6    ,133891.3    ,0.98
> icelake     ,avx512      ,1048591     ,0           ,3
> ,143984.1    ,142003.7    ,1.01
> icelake     ,avx512      ,1048607     ,3           ,0
> ,137547.9    ,134450.1    ,1.02
> icelake     ,avx512      ,1048639     ,3           ,5
> ,144630.4    ,142174.6    ,1.02
> icelake     ,avx512      ,1048576     ,0           ,127
> ,149810.7    ,142684.9    ,1.05
> icelake     ,avx512      ,1048576     ,0           ,255
> ,156212.6    ,143509.2    ,1.09
> icelake     ,avx512      ,1048576     ,0           ,256
> ,153776.9    ,139788.0    ,1.1
> icelake     ,avx512      ,1048576     ,0           ,4064
> ,137926.6    ,134832.8    ,1.02
> icelake     ,avx512      ,2097159     ,0           ,0
> ,263465.3    ,267681.6    ,0.98
> icelake     ,avx512      ,2097167     ,0           ,3
> ,288947.7    ,284129.9    ,1.02
> icelake     ,avx512      ,2097183     ,3           ,0
> ,275395.5    ,269216.0    ,1.02
> icelake     ,avx512      ,2097215     ,3           ,5
> ,289131.5    ,284475.3    ,1.02
> icelake     ,avx512      ,2097152     ,0           ,127
> ,299404.5    ,286193.2    ,1.05
> icelake     ,avx512      ,2097152     ,0           ,255
> ,312913.2    ,286785.6    ,1.09
> icelake     ,avx512      ,2097152     ,0           ,256
> ,307882.7    ,279708.7    ,1.1
> icelake     ,avx512      ,2097152     ,0           ,4064
> ,275552.3    ,269867.0    ,1.02
> icelake     ,avx512      ,4194311     ,0           ,0
> ,526480.1    ,536038.9    ,0.98
> icelake     ,avx512      ,4194319     ,0           ,3
> ,579122.9    ,569512.5    ,1.02
> icelake     ,avx512      ,4194335     ,3           ,0
> ,551658.1    ,542973.3    ,1.02
> icelake     ,avx512      ,4194367     ,3           ,5
> ,578575.2    ,569497.2    ,1.02
> icelake     ,avx512      ,4194304     ,0           ,127
> ,599943.6    ,569138.2    ,1.05
> icelake     ,avx512      ,4194304     ,0           ,255
> ,628419.2    ,575908.4    ,1.09
> icelake     ,avx512      ,4194304     ,0           ,256
> ,617242.8    ,561417.7    ,1.1
> icelake     ,avx512      ,4194304     ,0           ,4064
> ,552012.3    ,540617.2    ,1.02
> icelake     ,avx512      ,8388615     ,0           ,0
> ,1092471.4   ,1133834.9   ,0.96
> icelake     ,avx512      ,8388623     ,0           ,3
> ,1185623.5   ,1218150.0   ,0.97
> icelake     ,avx512      ,8388639     ,3           ,0
> ,1142647.1   ,1139201.6   ,1.0
> icelake     ,avx512      ,8388671     ,3           ,5
> ,1183702.5   ,1225474.6   ,0.97
> icelake     ,avx512      ,8388608     ,0           ,127
> ,1231862.8   ,1221685.1   ,1.01
> icelake     ,avx512      ,8388608     ,0           ,255
> ,1290816.7   ,1221576.2   ,1.06
> icelake     ,avx512      ,8388608     ,0           ,256
> ,1299047.6   ,1195021.2   ,1.09
> icelake     ,avx512      ,8388608     ,0           ,4064
> ,1139648.9   ,1140113.0   ,1.0
> icelake     ,avx512      ,16777223    ,0           ,0
> ,2464861.2   ,2599120.4   ,0.95
> icelake     ,avx512      ,16777231    ,0           ,3
> ,2651029.7   ,2758867.1   ,0.96
> icelake     ,avx512      ,16777247    ,3           ,0
> ,2570099.8   ,2601099.4   ,0.99
> icelake     ,avx512      ,16777279    ,3           ,5
> ,2660529.4   ,2762598.6   ,0.96
> icelake     ,avx512      ,16777216    ,0           ,127
> ,2759531.7   ,2756811.1   ,1.0
> icelake     ,avx512      ,16777216    ,0           ,255
> ,2878568.5   ,2777650.3   ,1.04
> icelake     ,avx512      ,16777216    ,0           ,256
> ,2931879.3   ,2709687.7   ,1.08
> icelake     ,avx512      ,16777216    ,0           ,4064
> ,2587161.1   ,2632011.2   ,0.98
> icelake     ,avx512      ,33554439    ,0           ,0
> ,5175406.0   ,5528857.2   ,0.94
> icelake     ,avx512      ,33554447    ,0           ,3
> ,5537561.9   ,5818119.1   ,0.95
> icelake     ,avx512      ,33554463    ,3           ,0
> ,5435099.5   ,5560442.2   ,0.98
> icelake     ,avx512      ,33554495    ,3           ,5
> ,5546314.9   ,5800995.0   ,0.96
> icelake     ,avx512      ,33554432    ,0           ,127
> ,5770248.0   ,5781104.9   ,1.0
> icelake     ,avx512      ,33554432    ,0           ,255
> ,6019120.7   ,5836023.3   ,1.03
> icelake     ,avx512      ,33554432    ,0           ,256
> ,6107033.4   ,5681798.8   ,1.07
> icelake     ,avx512      ,33554432    ,0           ,4064
> ,5356238.5   ,5598521.5   ,0.96
> skylake     ,sse2        ,65543       ,0           ,0
> ,3091.4      ,2940.2      ,1.05
> skylake     ,sse2        ,65551       ,0           ,3
> ,3682.6      ,3403.7      ,1.08
> skylake     ,sse2        ,65567       ,3           ,0
> ,3031.3      ,3070.2      ,0.99
> skylake     ,sse2        ,65599       ,3           ,5
> ,3731.2      ,3718.7      ,1.0
> skylake     ,sse2        ,65536       ,0           ,127
> ,3642.3      ,3390.5      ,1.07
> skylake     ,sse2        ,65536       ,0           ,255
> ,3493.9      ,3333.0      ,1.05
> skylake     ,sse2        ,65536       ,0           ,256
> ,3043.2      ,2981.0      ,1.02
> skylake     ,sse2        ,65536       ,0           ,4064
> ,2796.6      ,2843.9      ,0.98
> skylake     ,sse2        ,131079      ,0           ,0
> ,6347.4      ,6309.8      ,1.01
> skylake     ,sse2        ,131087      ,0           ,3
> ,7318.4      ,7486.2      ,0.98
> skylake     ,sse2        ,131103      ,3           ,0
> ,6297.4      ,6516.8      ,0.97
> skylake     ,sse2        ,131135      ,3           ,5
> ,7544.5      ,7823.5      ,0.96
> skylake     ,sse2        ,131072      ,0           ,127
> ,7426.4      ,7554.3      ,0.98
> skylake     ,sse2        ,131072      ,0           ,255
> ,7349.0      ,7195.4      ,1.02
> skylake     ,sse2        ,131072      ,0           ,256
> ,7068.1      ,6804.8      ,1.04
> skylake     ,sse2        ,131072      ,0           ,4064
> ,6884.6      ,7566.7      ,0.91
> skylake     ,sse2        ,262151      ,0           ,0
> ,15848.1     ,15552.2     ,1.02
> skylake     ,sse2        ,262159      ,0           ,3
> ,17864.6     ,16787.9     ,1.06
> skylake     ,sse2        ,262175      ,3           ,0
> ,15748.1     ,16266.0     ,0.97
> skylake     ,sse2        ,262207      ,3           ,5
> ,17022.3     ,17229.8     ,0.99
> skylake     ,sse2        ,262144      ,0           ,127
> ,16158.7     ,16093.6     ,1.0
> skylake     ,sse2        ,262144      ,0           ,255
> ,15670.7     ,15949.2     ,0.98
> skylake     ,sse2        ,262144      ,0           ,256
> ,14806.3     ,14970.3     ,0.99
> skylake     ,sse2        ,262144      ,0           ,4064
> ,14751.7     ,15008.2     ,0.98
> skylake     ,sse2        ,524295      ,0           ,0
> ,32874.8     ,33731.2     ,0.97
> skylake     ,sse2        ,524303      ,0           ,3
> ,34035.1     ,34777.8     ,0.98
> skylake     ,sse2        ,524319      ,3           ,0
> ,34325.6     ,34108.9     ,1.01
> skylake     ,sse2        ,524351      ,3           ,5
> ,34853.5     ,35624.4     ,0.98
> skylake     ,sse2        ,524288      ,0           ,127
> ,33437.4     ,33816.7     ,0.99
> skylake     ,sse2        ,524288      ,0           ,255
> ,33256.1     ,33664.7     ,0.99
> skylake     ,sse2        ,524288      ,0           ,256
> ,32006.3     ,32396.3     ,0.99
> skylake     ,sse2        ,524288      ,0           ,4064
> ,32284.7     ,32713.9     ,0.99
> skylake     ,sse2        ,1048583     ,0           ,0
> ,71891.7     ,73858.4     ,0.97
> skylake     ,sse2        ,1048591     ,0           ,3
> ,74621.3     ,74389.7     ,1.0
> skylake     ,sse2        ,1048607     ,3           ,0
> ,72515.0     ,73573.2     ,0.99
> skylake     ,sse2        ,1048639     ,3           ,5
> ,72471.7     ,73782.6     ,0.98
> skylake     ,sse2        ,1048576     ,0           ,127
> ,77638.6     ,82474.6     ,0.94
> skylake     ,sse2        ,1048576     ,0           ,255
> ,71870.0     ,71933.6     ,1.0
> skylake     ,sse2        ,1048576     ,0           ,256
> ,70410.0     ,73243.6     ,0.96
> skylake     ,sse2        ,1048576     ,0           ,4064
> ,71267.1     ,72274.6     ,0.99
> skylake     ,sse2        ,2097159     ,0           ,0
> ,140052.6    ,144880.1    ,0.97
> skylake     ,sse2        ,2097167     ,0           ,3
> ,146626.5    ,147972.6    ,0.99
> skylake     ,sse2        ,2097183     ,3           ,0
> ,141750.1    ,146353.6    ,0.97
> skylake     ,sse2        ,2097215     ,3           ,5
> ,144169.0    ,148120.1    ,0.97
> skylake     ,sse2        ,2097152     ,0           ,127
> ,156575.9    ,165844.4    ,0.94
> skylake     ,sse2        ,2097152     ,0           ,255
> ,144277.7    ,146971.5    ,0.98
> skylake     ,sse2        ,2097152     ,0           ,256
> ,143047.4    ,146810.9    ,0.97
> skylake     ,sse2        ,2097152     ,0           ,4064
> ,142795.6    ,145805.8    ,0.98
> skylake     ,sse2        ,4194311     ,0           ,0
> ,284353.3    ,298092.5    ,0.95
> skylake     ,sse2        ,4194319     ,0           ,3
> ,296656.4    ,311960.2    ,0.95
> skylake     ,sse2        ,4194335     ,3           ,0
> ,285922.6    ,304100.5    ,0.94
> skylake     ,sse2        ,4194367     ,3           ,5
> ,297135.4    ,312532.5    ,0.95
> skylake     ,sse2        ,4194304     ,0           ,127
> ,323938.6    ,340414.3    ,0.95
> skylake     ,sse2        ,4194304     ,0           ,255
> ,301460.9    ,310042.7    ,0.97
> skylake     ,sse2        ,4194304     ,0           ,256
> ,287155.8    ,303580.6    ,0.95
> skylake     ,sse2        ,4194304     ,0           ,4064
> ,291006.2    ,302441.3    ,0.96
> skylake     ,sse2        ,8388615     ,0           ,0
> ,714424.7    ,747484.3    ,0.96
> skylake     ,sse2        ,8388623     ,0           ,3
> ,748995.5    ,774116.5    ,0.97
> skylake     ,sse2        ,8388639     ,3           ,0
> ,720563.4    ,757386.9    ,0.95
> skylake     ,sse2        ,8388671     ,3           ,5
> ,748028.7    ,773907.8    ,0.97
> skylake     ,sse2        ,8388608     ,0           ,127
> ,750775.3    ,780245.2    ,0.96
> skylake     ,sse2        ,8388608     ,0           ,255
> ,724940.3    ,764197.8    ,0.95
> skylake     ,sse2        ,8388608     ,0           ,256
> ,722035.0    ,759408.9    ,0.95
> skylake     ,sse2        ,8388608     ,0           ,4064
> ,756977.8    ,755532.4    ,1.0
> skylake     ,sse2        ,16777223    ,0           ,0
> ,1971686.0   ,2111263.4   ,0.93
> skylake     ,sse2        ,16777231    ,0           ,3
> ,1953608.9   ,2128493.8   ,0.92
> skylake     ,sse2        ,16777247    ,3           ,0
> ,1967075.6   ,2103772.3   ,0.94
> skylake     ,sse2        ,16777279    ,3           ,5
> ,1950851.6   ,2133601.6   ,0.91
> skylake     ,sse2        ,16777216    ,0           ,127
> ,1991168.2   ,2078249.3   ,0.96
> skylake     ,sse2        ,16777216    ,0           ,255
> ,1958502.9   ,2111955.5   ,0.93
> skylake     ,sse2        ,16777216    ,0           ,256
> ,1965103.7   ,2114293.0   ,0.93
> skylake     ,sse2        ,16777216    ,0           ,4064
> ,1958381.3   ,2103438.6   ,0.93
> skylake     ,sse2        ,33554439    ,0           ,0
> ,4456144.2   ,4660837.1   ,0.96
> skylake     ,sse2        ,33554447    ,0           ,3
> ,4431097.0   ,4679042.6   ,0.95
> skylake     ,sse2        ,33554463    ,3           ,0
> ,4448225.6   ,4648538.3   ,0.96
> skylake     ,sse2        ,33554495    ,3           ,5
> ,4427743.0   ,4678340.1   ,0.95
> skylake     ,sse2        ,33554432    ,0           ,127
> ,4437517.3   ,4552005.9   ,0.97
> skylake     ,sse2        ,33554432    ,0           ,255
> ,4427135.1   ,4543412.0   ,0.97
> skylake     ,sse2        ,33554432    ,0           ,256
> ,4441311.2   ,4658315.5   ,0.95
> skylake     ,sse2        ,33554432    ,0           ,4064
> ,4429798.4   ,4659499.6   ,0.95
> skylake     ,avx         ,65543       ,0           ,0
> ,3115.8      ,3043.7      ,1.02
> skylake     ,avx         ,65551       ,0           ,3
> ,3673.2      ,3551.7      ,1.03
> skylake     ,avx         ,65567       ,3           ,0
> ,3024.6      ,2887.4      ,1.05
> skylake     ,avx         ,65599       ,3           ,5
> ,3907.8      ,3636.4      ,1.07
> skylake     ,avx         ,65536       ,0           ,127
> ,3539.2      ,3372.3      ,1.05
> skylake     ,avx         ,65536       ,0           ,255
> ,3489.9      ,3344.0      ,1.04
> skylake     ,avx         ,65536       ,0           ,256
> ,3059.0      ,2924.4      ,1.05
> skylake     ,avx         ,65536       ,0           ,4064
> ,2805.0      ,2869.3      ,0.98
> skylake     ,avx         ,131079      ,0           ,0
> ,6129.2      ,6263.4      ,0.98
> skylake     ,avx         ,131087      ,0           ,3
> ,7096.8      ,7570.0      ,0.94
> skylake     ,avx         ,131103      ,3           ,0
> ,6394.5      ,6842.5      ,0.93
> skylake     ,avx         ,131135      ,3           ,5
> ,7462.8      ,7776.0      ,0.96
> skylake     ,avx         ,131072      ,0           ,127
> ,7726.9      ,7428.5      ,1.04
> skylake     ,avx         ,131072      ,0           ,255
> ,7167.4      ,7278.9      ,0.98
> skylake     ,avx         ,131072      ,0           ,256
> ,7197.9      ,6284.3      ,1.15
> skylake     ,avx         ,131072      ,0           ,4064
> ,6984.0      ,6940.4      ,1.01
> skylake     ,avx         ,262151      ,0           ,0
> ,15787.3     ,16403.1     ,0.96
> skylake     ,avx         ,262159      ,0           ,3
> ,17800.1     ,17628.1     ,1.01
> skylake     ,avx         ,262175      ,3           ,0
> ,16622.8     ,16244.3     ,1.02
> skylake     ,avx         ,262207      ,3           ,5
> ,16989.7     ,17509.0     ,0.97
> skylake     ,avx         ,262144      ,0           ,127
> ,16190.8     ,15971.8     ,1.01
> skylake     ,avx         ,262144      ,0           ,255
> ,15787.1     ,15876.7     ,0.99
> skylake     ,avx         ,262144      ,0           ,256
> ,14840.1     ,14997.0     ,0.99
> skylake     ,avx         ,262144      ,0           ,4064
> ,15743.0     ,14976.2     ,1.05
> skylake     ,avx         ,524295      ,0           ,0
> ,32848.5     ,33397.8     ,0.98
> skylake     ,avx         ,524303      ,0           ,3
> ,34872.1     ,34862.2     ,1.0
> skylake     ,avx         ,524319      ,3           ,0
> ,33784.6     ,34023.8     ,0.99
> skylake     ,avx         ,524351      ,3           ,5
> ,35337.1     ,35364.5     ,1.0
> skylake     ,avx         ,524288      ,0           ,127
> ,33624.5     ,33596.5     ,1.0
> skylake     ,avx         ,524288      ,0           ,255
> ,33390.7     ,33842.8     ,0.99
> skylake     ,avx         ,524288      ,0           ,256
> ,31937.0     ,32357.2     ,0.99
> skylake     ,avx         ,524288      ,0           ,4064
> ,32233.5     ,32267.3     ,1.0
> skylake     ,avx         ,1048583     ,0           ,0
> ,100354.7    ,105840.6    ,0.95
> skylake     ,avx         ,1048591     ,0           ,3
> ,68102.5     ,67496.0     ,1.01
> skylake     ,avx         ,1048607     ,3           ,0
> ,66146.1     ,67540.0     ,0.98
> skylake     ,avx         ,1048639     ,3           ,5
> ,67530.8     ,67726.4     ,1.0
> skylake     ,avx         ,1048576     ,0           ,127
> ,67105.6     ,66533.5     ,1.01
> skylake     ,avx         ,1048576     ,0           ,255
> ,67101.8     ,65666.7     ,1.02
> skylake     ,avx         ,1048576     ,0           ,256
> ,65092.6     ,67103.0     ,0.97
> skylake     ,avx         ,1048576     ,0           ,4064
> ,65700.0     ,67031.5     ,0.98
> skylake     ,avx         ,2097159     ,0           ,0
> ,133101.0    ,135171.6    ,0.98
> skylake     ,avx         ,2097167     ,0           ,3
> ,134174.4    ,135782.1    ,0.99
> skylake     ,avx         ,2097183     ,3           ,0
> ,132056.4    ,134170.0    ,0.98
> skylake     ,avx         ,2097215     ,3           ,5
> ,134413.5    ,136341.1    ,0.99
> skylake     ,avx         ,2097152     ,0           ,127
> ,133003.9    ,132992.1    ,1.0
> skylake     ,avx         ,2097152     ,0           ,255
> ,133344.3    ,132883.1    ,1.0
> skylake     ,avx         ,2097152     ,0           ,256
> ,134051.7    ,136185.8    ,0.98
> skylake     ,avx         ,2097152     ,0           ,4064
> ,132976.3    ,135029.4    ,0.98
> skylake     ,avx         ,4194311     ,0           ,0
> ,268004.1    ,282650.3    ,0.95
> skylake     ,avx         ,4194319     ,0           ,3
> ,270270.0    ,286700.3    ,0.94
> skylake     ,avx         ,4194335     ,3           ,0
> ,264288.5    ,279582.4    ,0.95
> skylake     ,avx         ,4194367     ,3           ,5
> ,270498.4    ,286294.5    ,0.94
> skylake     ,avx         ,4194304     ,0           ,127
> ,271219.3    ,275129.8    ,0.99
> skylake     ,avx         ,4194304     ,0           ,255
> ,269996.5    ,270227.6    ,1.0
> skylake     ,avx         ,4194304     ,0           ,256
> ,267901.1    ,281673.1    ,0.95
> skylake     ,avx         ,4194304     ,0           ,4064
> ,268390.0    ,279100.3    ,0.96
> skylake     ,avx         ,8388615     ,0           ,0
> ,803547.9    ,813229.9    ,0.99
> skylake     ,avx         ,8388623     ,0           ,3
> ,828872.4    ,869413.0    ,0.95
> skylake     ,avx         ,8388639     ,3           ,0
> ,818000.0    ,873781.7    ,0.94
> skylake     ,avx         ,8388671     ,3           ,5
> ,824679.0    ,863561.5    ,0.95
> skylake     ,avx         ,8388608     ,0           ,127
> ,800728.5    ,779000.8    ,1.03
> skylake     ,avx         ,8388608     ,0           ,255
> ,820071.4    ,770113.2    ,1.06
> skylake     ,avx         ,8388608     ,0           ,256
> ,825624.6    ,867247.7    ,0.95
> skylake     ,avx         ,8388608     ,0           ,4064
> ,830209.7    ,894086.6    ,0.93
> skylake     ,avx         ,16777223    ,0           ,0
> ,1989391.3   ,2132829.8   ,0.93
> skylake     ,avx         ,16777231    ,0           ,3
> ,1994225.1   ,2211556.0   ,0.9
> skylake     ,avx         ,16777247    ,3           ,0
> ,1993572.9   ,2213029.9   ,0.9
> skylake     ,avx         ,16777279    ,3           ,5
> ,2001956.9   ,2211769.7   ,0.91
> skylake     ,avx         ,16777216    ,0           ,127
> ,1968155.9   ,2127764.7   ,0.92
> skylake     ,avx         ,16777216    ,0           ,255
> ,1978305.1   ,2121371.3   ,0.93
> skylake     ,avx         ,16777216    ,0           ,256
> ,1993261.9   ,2206494.1   ,0.9
> skylake     ,avx         ,16777216    ,0           ,4064
> ,1993808.3   ,2198137.4   ,0.91
> skylake     ,avx         ,33554439    ,0           ,0
> ,4540216.7   ,4870021.8   ,0.93
> skylake     ,avx         ,33554447    ,0           ,3
> ,4483505.3   ,4850545.5   ,0.92
> skylake     ,avx         ,33554463    ,3           ,0
> ,4501944.5   ,4870922.4   ,0.92
> skylake     ,avx         ,33554495    ,3           ,5
> ,4484565.5   ,4845392.4   ,0.93
> skylake     ,avx         ,33554432    ,0           ,127
> ,4408639.3   ,4701698.6   ,0.94
> skylake     ,avx         ,33554432    ,0           ,255
> ,4445826.0   ,4678142.9   ,0.95
> skylake     ,avx         ,33554432    ,0           ,256
> ,4497953.2   ,4844498.6   ,0.93
> skylake     ,avx         ,33554432    ,0           ,4064
> ,4501572.4   ,4839209.4   ,0.93
>
> >
> > --
> > H.J.
  
Noah Goldstein April 3, 2021, 7:52 p.m. UTC | #4
Sorry, there was a mistake in the last set of data. It was truncated
to N = 10. Here is the N = 100 data. Sorry for the spam!

On Sat, Apr 3, 2021 at 3:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Last message got formatted weirdly. Here is a file with the data.
>
> On Sat, Apr 3, 2021 at 3:41 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Sat, Apr 3, 2021 at 1:46 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sat, Apr 3, 2021 at 1:12 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > From: noah <goldstein.w.n@gmail.com>
> > > >
> > > > > No Bug. This commit updates the large memcpy case (no overlap). The
> > > > > update is to perform memcpy on either 2 or 4 contiguous pages at
> > > > > once. This 1) helps to alleviate the effects of false memory aliasing
> > > > > when destination and source have a close 4k alignment and 2) in most
> > > > > cases and for most DRAM units is a modestly more efficient access
> > > > > pattern. These changes are a clear performance improvement for
> > > > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > > > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > > > > pass.
> > > >
> > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > ---
> > > > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the
> > > > > loops and no longer see any meaningful regression.
> > > >
> > > > Also added back the temporal stores for the tail. Saw a regression
> > > > when doing these tests.
> > > >
> > > > Two tables below for skylake and icelake numbers for the areas around
> > > > where you saw the regression. Below is all data from the tests.
> > > >
> > > > N = 10.
> > > >
> > > > Skylake
> > > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > > 4103        ,0           ,64          ,84.5        ,88.6
> > > > 4111        ,0           ,3           ,99.0        ,99.9
> > > > 4127        ,3           ,0           ,102.1       ,102.3
> > > > 4159        ,3           ,7           ,88.7        ,90.9
> > > > 4223        ,9           ,5           ,88.1        ,87.4
> > > > 8199        ,0           ,64          ,146.7       ,150.2
> > > > 8207        ,0           ,3           ,167.9       ,168.5
> > > > 8223        ,3           ,0           ,168.5       ,168.1
> > > > 8255        ,3           ,7           ,157.0       ,159.2
> > > > 8319        ,9           ,5           ,155.5       ,155.7
> > > > 16391       ,0           ,64          ,286.2       ,288.8
> > > > 16399       ,0           ,3           ,307.0       ,308.7
> > > > 16415       ,3           ,0           ,307.4       ,307.6
> > > > 16447       ,3           ,7           ,294.6       ,295.5
> > > > 16511       ,9           ,5           ,291.5       ,462.1
> > > > 32775       ,0           ,64          ,603.4       ,601.5
> > > > 32783       ,0           ,3           ,604.8       ,606.4
> > > > 32799       ,3           ,0           ,603.0       ,604.1
> > > > 32831       ,3           ,7           ,600.2       ,737.3
> > > > 32895       ,9           ,5           ,604.4       ,599.5
> > > > 65543       ,0           ,64          ,1873.5      ,1854.3
> > > > 65551       ,0           ,3           ,1862.9      ,1846.6
> > > > 65567       ,3           ,0           ,1885.5      ,1966.0
> > > > 65599       ,3           ,7           ,1833.2      ,1833.1
> > > > 65663       ,9           ,5           ,1884.9      ,1887.4
> > > > 131079      ,0           ,64          ,3944.3      ,3949.4
> > > > 131087      ,0           ,3           ,3927.3      ,3913.3
> > > > 131103      ,3           ,0           ,4415.8      ,4169.4
> > > > 131135      ,3           ,7           ,4224.5      ,4157.6
> > > > 131199      ,9           ,5           ,5974.0      ,4983.8
> > > > 262151      ,0           ,64          ,11050.2     ,10620.6
> > > > 262159      ,0           ,3           ,9932.8      ,10037.3
> > > > 262175      ,3           ,0           ,10188.8     ,9206.6
> > > > 262207      ,3           ,7           ,9633.3      ,9216.7
> > > > 262271      ,9           ,5           ,9732.7      ,9345.3
> > > > 524295      ,0           ,64          ,24823.9     ,24880.7
> > > > 524303      ,0           ,3           ,24514.0     ,24556.7
> > > > 524319      ,3           ,0           ,23974.4     ,24219.9
> > > > 524351      ,3           ,7           ,24159.7     ,24207.0
> > > > 524415      ,9           ,5           ,23946.5     ,24142.8
> > > >
> > > > Icelake:
> > > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > > 4103        ,0           ,64          ,50.2        ,63.7
> > > > 4111        ,0           ,3           ,63.7        ,65.1
> > > > 4127        ,3           ,0           ,68.2        ,69.4
> > > > 4159        ,3           ,7           ,59.6        ,68.0
> > > > 4223        ,9           ,5           ,68.2        ,66.8
> > > > 8199        ,0           ,64          ,92.1        ,89.9
> > > > 8207        ,0           ,3           ,119.7       ,118.3
> > > > 8223        ,3           ,0           ,119.1       ,120.9
> > > > 8255        ,3           ,7           ,122.9       ,123.7
> > > > 8319        ,9           ,5           ,122.1       ,121.8
> > > > 16391       ,0           ,64          ,162.7       ,158.0
> > > > 16399       ,0           ,3           ,227.6       ,234.1
> > > > 16415       ,3           ,0           ,230.8       ,232.7
> > > > 16447       ,3           ,7           ,226.8       ,232.6
> > > > 16511       ,9           ,5           ,233.4       ,233.8
> > > > 32775       ,0           ,64          ,312.2       ,301.8
> > > > 32783       ,0           ,3           ,449.7       ,450.0
> > > > 32799       ,3           ,0           ,452.7       ,455.9
> > > > 32831       ,3           ,7           ,449.8       ,458.0
> > > > 32895       ,9           ,5           ,456.3       ,459.4
> > > > 65543       ,0           ,64          ,1460.6      ,1463.9
> > > > 65551       ,0           ,3           ,1462.0      ,1465.4
> > > > 65567       ,3           ,0           ,1466.6      ,1480.4
> > > > 65599       ,3           ,7           ,1488.0      ,1488.9
> > > > 65663       ,9           ,5           ,1680.8      ,1499.5
> > > > 131079      ,0           ,64          ,2988.5      ,3010.1
> > > > 131087      ,0           ,3           ,2995.5      ,2996.4
> > > > 131103      ,3           ,0           ,3006.2      ,3000.5
> > > > 131135      ,3           ,7           ,3032.4      ,3073.7
> > > > 131199      ,9           ,5           ,3010.4      ,3027.4
> > > > 262151      ,0           ,64          ,6143.2      ,6079.1
> > > > 262159      ,0           ,3           ,6085.1      ,6075.8
> > > > 262175      ,3           ,0           ,6088.0      ,6064.9
> > > > 262207      ,3           ,7           ,6018.7      ,6023.5
> > > > 262271      ,9           ,5           ,6019.8      ,5959.2
> > > > 524295      ,0           ,64          ,14464.2     ,14095.1
> > > > 524303      ,0           ,3           ,14761.6     ,14050.2
> > > > 524319      ,3           ,0           ,14534.1     ,14087.5
> > > > 524351      ,3           ,7           ,14147.7     ,13903.8
> > > > 524415      ,9           ,5           ,14157.0     ,13982.9
> > > >
> > > >
> > > >
> > > > cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean
> > > > skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> > > > skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> > > > skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> > > > skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> > > > skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> > > > skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> > > > skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> > > > skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> > > > skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> > > > skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> > > > skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> > > > skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> > > > skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> > > > skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> > > > skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> > > > skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> > > > skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> > > > skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> > > > skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> > > > skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> > > > skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> > > > skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> > > > skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> > > > skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> > > > skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> > > > skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> > > > skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> > > > skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> > > > skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> > > > skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> > > > skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> > > > skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> > > > skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> > > > skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> > > > skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> > > > skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> > > > skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> > > > skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> > > > skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> > > > skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> > > > skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> > > > skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> > > > skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> > > > skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> > > > skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> > > > skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> > > > skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> > > > skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> > > > skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> > > > skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> > > > skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> > > > skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> > > > skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> > > > skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> > > > skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> > > > skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> > > > skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> > > > skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> > > > skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> > > > skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> > > > skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> > > > skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> > > > skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> > > > skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> > > > skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> > > > skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> > > > skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> > > > skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> > > > skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> > > > skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> > > > skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> > > > skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> > > > skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> > > > skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> > > > skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> > > > skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> > > > skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> > > > skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> > > > skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> > > > skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> > > > skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> > > > skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> > > > skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> > > > skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> > > > skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> > > > skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> > > > skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> > > > skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> > > > skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> > > > skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> > > > skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> > > > skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> > > > skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> > > > skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> > > > skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> > > > skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> > > > skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> > > > skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> > > > skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> > > > skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> > > > skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> > > > skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> > > > skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> > > > skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> > > > skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> > > > skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> > > > skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> > > > skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> > > > skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> > > > skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> > > > skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> > > > skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> > > > skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> > > > skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> > > > skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> > > > skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> > > > skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> > > > skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> > > > skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> > > > skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> > > > skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> > > > skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> > > > skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> > > > skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> > > > skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> > > > skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> > > > skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> > > > skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> > > > skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> > > > skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> > > > icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> > > > icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> > > > icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> > > > icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> > > > icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> > > > icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> > > > icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> > > > icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> > > > icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> > > > icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> > > > icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> > > > icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> > > > icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> > > > icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> > > > icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> > > > icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> > > > icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> > > > icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> > > > icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> > > > icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> > > > icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> > > > icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> > > > icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> > > > icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> > > > icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> > > > icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> > > > icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> > > > icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> > > > icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> > > > icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> > > > icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> > > > icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> > > > icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> > > > icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> > > > icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> > > > icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> > > > icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> > > > icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> > > > icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> > > > icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> > > > icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> > > > icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> > > > icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> > > > icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> > > > icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> > > > icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> > > > icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> > > > icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> > > > icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> > > > icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> > > > icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> > > > icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> > > > icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> > > > icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> > > > icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> > > > icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> > > > icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> > > > icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> > > > icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> > > > icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> > > > icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> > > > icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> > > > icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> > > > icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> > > > icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> > > > icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> > > > icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> > > > icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> > > > icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> > > > icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> > > > icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> > > > icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> > > > icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> > > > icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> > > > icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> > > > icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> > > > icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> > > > icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> > > > icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> > > > icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> > > > icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> > > > icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> > > > icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> > > > icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> > > > icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> > > > icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> > > > icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> > > > icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> > > > icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> > > > icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> > > > icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> > > > icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> > > > icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> > > > icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> > > > icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> > > > icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> > > > icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> > > > icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> > > > icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> > > > icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> > > > icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> > > > icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> > > > icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> > > > icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> > > > icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> > > > icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> > > > icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> > > > icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> > > > icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> > > > icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> > > > icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> > > > icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> > > > icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> > > > icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> > > > icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> > > > icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> > > > icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> > > > icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> > > > icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> > > > icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> > > > icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> > > > icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> > > > icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> > > > icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> > > > icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> > > > icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> > > > icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> > > > icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> > > > icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> > > > icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> > > > icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> > > > icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> > > > icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> > > > icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> > > > icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> > > > icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> > > > icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> > > > icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> > > > icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> > > > icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> > > > icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> > > > icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> > > > icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> > > > icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> > > > icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> > > > icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> > > > icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> > > > icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> > > > icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> > > > icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> > > > icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> > > > icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> > > > icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> > > > icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> > > > icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> > > > icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> > > > icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> > > > icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> > > > icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> > > > icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> > > > icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> > > > icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> > > > icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> > > > icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> > > > icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> > > > icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> > > > icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> > > > icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> > > > icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> > > > icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> > > > icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> > > > icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> > > > icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> > > > icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> > > > icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> > > > icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> > > > icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> > > > icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> > > > icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> > > > icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> > > > icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> > > > icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> > > > icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> > > > icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> > > > icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> > > > icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> > > > icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> > > > icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> > > > icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> > > > icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> > > > icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> > > > icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> > > > icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> > > > icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> > > > icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
> > > >
> > >
> > > I am having a hard time convincing myself that this patch is really necessary.
> > > What are the geomeans of all the different cases for each processor?
> >
> > N = 100. Geometric mean of Current vs. New for bench-memcpy-large. Note that
> > the bench-memmove-large numbers should be unaffected by this patch, as the
> > new logic only applies to the no-overlap case.
> >
> > cpu         ,inst        ,Len         ,align1      ,align2      ,new
> > geomean ,cur geomean ,New/Cur
> > icelake     ,sse2        ,65543       ,0           ,0
> > ,5566.1      ,5564.7      ,1.0
> > icelake     ,sse2        ,65551       ,0           ,3
> > ,5856.4      ,5725.7      ,1.02
> > icelake     ,sse2        ,65567       ,3           ,0
> > ,5622.8      ,5892.9      ,0.95
> > icelake     ,sse2        ,65599       ,3           ,5
> > ,5857.3      ,5723.8      ,1.02
> > icelake     ,sse2        ,65536       ,0           ,127
> > ,5953.3      ,5831.1      ,1.02
> > icelake     ,sse2        ,65536       ,0           ,255
> > ,5811.7      ,5789.5      ,1.0
> > icelake     ,sse2        ,65536       ,0           ,256
> > ,5373.5      ,5284.1      ,1.02
> > icelake     ,sse2        ,65536       ,0           ,4064
> > ,5820.1      ,5761.6      ,1.01
> > icelake     ,sse2        ,131079      ,0           ,0
> > ,12421.5     ,12424.1     ,1.0
> > icelake     ,sse2        ,131087      ,0           ,3
> > ,12389.5     ,12276.4     ,1.01
> > icelake     ,sse2        ,131103      ,3           ,0
> > ,11587.0     ,12607.6     ,0.92
> > icelake     ,sse2        ,131135      ,3           ,5
> > ,11596.9     ,11896.2     ,0.97
> > icelake     ,sse2        ,131072      ,0           ,127
> > ,11746.4     ,12490.1     ,0.94
> > icelake     ,sse2        ,131072      ,0           ,255
> > ,11486.8     ,11831.7     ,0.97
> > icelake     ,sse2        ,131072      ,0           ,256
> > ,10453.5     ,10451.7     ,1.0
> > icelake     ,sse2        ,131072      ,0           ,4064
> > ,11231.7     ,11223.6     ,1.0
> > icelake     ,sse2        ,262151      ,0           ,0
> > ,29408.5     ,30831.2     ,0.95
> > icelake     ,sse2        ,262159      ,0           ,3
> > ,30813.6     ,32235.6     ,0.96
> > icelake     ,sse2        ,262175      ,3           ,0
> > ,30245.0     ,31392.5     ,0.96
> > icelake     ,sse2        ,262207      ,3           ,5
> > ,30775.6     ,32298.6     ,0.95
> > icelake     ,sse2        ,262144      ,0           ,127
> > ,31784.7     ,32791.5     ,0.97
> > icelake     ,sse2        ,262144      ,0           ,255
> > ,30726.0     ,31997.5     ,0.96
> > icelake     ,sse2        ,262144      ,0           ,256
> > ,28418.9     ,29440.9     ,0.97
> > icelake     ,sse2        ,262144      ,0           ,4064
> > ,29984.1     ,31048.9     ,0.97
> > icelake     ,sse2        ,524295      ,0           ,0
> > ,76079.0     ,75752.0     ,1.0
> > icelake     ,sse2        ,524303      ,0           ,3
> > ,79939.3     ,80796.4     ,0.99
> > icelake     ,sse2        ,524319      ,3           ,0
> > ,79018.1     ,79928.5     ,0.99
> > icelake     ,sse2        ,524351      ,3           ,5
> > ,81219.4     ,81053.8     ,1.0
> > icelake     ,sse2        ,524288      ,0           ,127
> > ,80111.8     ,80087.2     ,1.0
> > icelake     ,sse2        ,524288      ,0           ,255
> > ,79334.0     ,79525.6     ,1.0
> > icelake     ,sse2        ,524288      ,0           ,256
> > ,75766.9     ,75918.9     ,1.0
> > icelake     ,sse2        ,524288      ,0           ,4064
> > ,78907.9     ,79550.8     ,0.99
> > icelake     ,sse2        ,1048583     ,0           ,0
> > ,144672.6    ,147457.7    ,0.98
> > icelake     ,sse2        ,1048591     ,0           ,3
> > ,173803.9    ,400563.2    ,0.43
> > icelake     ,sse2        ,1048607     ,3           ,0
> > ,149391.9    ,151772.1    ,0.98
> > icelake     ,sse2        ,1048639     ,3           ,5
> > ,174774.1    ,400657.4    ,0.44
> > icelake     ,sse2        ,1048576     ,0           ,127
> > ,175350.9    ,347110.6    ,0.51
> > icelake     ,sse2        ,1048576     ,0           ,255
> > ,150152.6    ,144242.9    ,1.04
> > icelake     ,sse2        ,1048576     ,0           ,256
> > ,145869.7    ,147489.6    ,0.99
> > icelake     ,sse2        ,1048576     ,0           ,4064
> > ,145814.7    ,147497.7    ,0.99
> > icelake     ,sse2        ,2097159     ,0           ,0
> > ,289460.6    ,295574.6    ,0.98
> > icelake     ,sse2        ,2097167     ,0           ,3
> > ,347057.0    ,799549.1    ,0.43
> > icelake     ,sse2        ,2097183     ,3           ,0
> > ,298565.7    ,301424.3    ,0.99
> > icelake     ,sse2        ,2097215     ,3           ,5
> > ,348620.4    ,797557.4    ,0.44
> > icelake     ,sse2        ,2097152     ,0           ,127
> > ,348751.4    ,695260.9    ,0.5
> > icelake     ,sse2        ,2097152     ,0           ,255
> > ,298960.5    ,286590.0    ,1.04
> > icelake     ,sse2        ,2097152     ,0           ,256
> > ,290978.4    ,293225.6    ,0.99
> > icelake     ,sse2        ,2097152     ,0           ,4064
> > ,290476.0    ,292283.2    ,0.99
> > icelake     ,sse2        ,4194311     ,0           ,0
> > ,583386.3    ,588284.3    ,0.99
> > icelake     ,sse2        ,4194319     ,0           ,3
> > ,703870.5    ,1595268.0   ,0.44
> > icelake     ,sse2        ,4194335     ,3           ,0
> > ,599400.2    ,601591.6    ,1.0
> > icelake     ,sse2        ,4194367     ,3           ,5
> > ,694569.7    ,1595608.0   ,0.44
> > icelake     ,sse2        ,4194304     ,0           ,127
> > ,700229.1    ,1389061.9   ,0.5
> > icelake     ,sse2        ,4194304     ,0           ,255
> > ,600779.0    ,573361.2    ,1.05
> > icelake     ,sse2        ,4194304     ,0           ,256
> > ,586610.7    ,589269.6    ,1.0
> > icelake     ,sse2        ,4194304     ,0           ,4064
> > ,583616.3    ,584806.4    ,1.0
> > icelake     ,sse2        ,8388615     ,0           ,0
> > ,1214632.8   ,1266616.0   ,0.96
> > icelake     ,sse2        ,8388623     ,0           ,3
> > ,1405136.9   ,3198827.1   ,0.44
> > icelake     ,sse2        ,8388639     ,3           ,0
> > ,1244302.6   ,1297425.9   ,0.96
> > icelake     ,sse2        ,8388671     ,3           ,5
> > ,1404685.1   ,3196389.9   ,0.44
> > icelake     ,sse2        ,8388608     ,0           ,127
> > ,1419888.5   ,2792729.4   ,0.51
> > icelake     ,sse2        ,8388608     ,0           ,255
> > ,1249044.6   ,1259726.7   ,0.99
> > icelake     ,sse2        ,8388608     ,0           ,256
> > ,1234471.9   ,1300463.6   ,0.95
> > icelake     ,sse2        ,8388608     ,0           ,4064
> > ,1220102.2   ,1265190.5   ,0.96
> > icelake     ,sse2        ,16777223    ,0           ,0
> > ,2689516.3   ,2846521.1   ,0.94
> > icelake     ,sse2        ,16777231    ,0           ,3
> > ,3001317.4   ,6428733.7   ,0.47
> > icelake     ,sse2        ,16777247    ,3           ,0
> > ,2770040.8   ,2910434.9   ,0.95
> > icelake     ,sse2        ,16777279    ,3           ,5
> > ,3002076.1   ,6415835.9   ,0.47
> > icelake     ,sse2        ,16777216    ,0           ,127
> > ,3063786.3   ,5609895.3   ,0.55
> > icelake     ,sse2        ,16777216    ,0           ,255
> > ,2821606.1   ,2833843.6   ,1.0
> > icelake     ,sse2        ,16777216    ,0           ,256
> > ,2719765.5   ,2925344.2   ,0.93
> > icelake     ,sse2        ,16777216    ,0           ,4064
> > ,2686189.2   ,2848017.5   ,0.94
> > icelake     ,sse2        ,33554439    ,0           ,0
> > ,5577945.0   ,5913674.6   ,0.94
> > icelake     ,sse2        ,33554447    ,0           ,3
> > ,6152758.8   ,12863855.0  ,0.48
> > icelake     ,sse2        ,33554463    ,3           ,0
> > ,5773351.4   ,6035289.3   ,0.96
> > icelake     ,sse2        ,33554495    ,3           ,5
> > ,6160006.2   ,12878153.9  ,0.48
> > icelake     ,sse2        ,33554432    ,0           ,127
> > ,6303495.4   ,11221070.2  ,0.56
> > icelake     ,sse2        ,33554432    ,0           ,255
> > ,5830879.6   ,5944978.6   ,0.98
> > icelake     ,sse2        ,33554432    ,0           ,256
> > ,5611968.2   ,6068255.4   ,0.92
> > icelake     ,sse2        ,33554432    ,0           ,4064
> > ,5570321.0   ,5964542.6   ,0.93
> > icelake     ,avx         ,65543       ,0           ,0
> > ,5561.1      ,5659.7      ,0.98
> > icelake     ,avx         ,65551       ,0           ,3
> > ,5859.9      ,5724.8      ,1.02
> > icelake     ,avx         ,65567       ,3           ,0
> > ,5636.7      ,5623.3      ,1.0
> > icelake     ,avx         ,65599       ,3           ,5
> > ,5856.3      ,5720.2      ,1.02
> > icelake     ,avx         ,65536       ,0           ,127
> > ,6011.1      ,5910.0      ,1.02
> > icelake     ,avx         ,65536       ,0           ,255
> > ,5854.5      ,5792.3      ,1.01
> > icelake     ,avx         ,65536       ,0           ,256
> > ,5213.0      ,5273.9      ,0.99
> > icelake     ,avx         ,65536       ,0           ,4064
> > ,5760.7      ,5661.1      ,1.02
> > icelake     ,avx         ,131079      ,0           ,0
> > ,12371.4     ,12707.0     ,0.97
> > icelake     ,avx         ,131087      ,0           ,3
> > ,13220.1     ,12515.7     ,1.06
> > icelake     ,avx         ,131103      ,3           ,0
> > ,11628.2     ,11546.9     ,1.01
> > icelake     ,avx         ,131135      ,3           ,5
> > ,13025.7     ,13967.6     ,0.93
> > icelake     ,avx         ,131072      ,0           ,127
> > ,11781.7     ,11936.4     ,0.99
> > icelake     ,avx         ,131072      ,0           ,255
> > ,11802.2     ,11583.9     ,1.02
> > icelake     ,avx         ,131072      ,0           ,256
> > ,10436.9     ,10693.1     ,0.98
> > icelake     ,avx         ,131072      ,0           ,4064
> > ,11880.9     ,11395.6     ,1.04
> > icelake     ,avx         ,262151      ,0           ,0
> > ,29132.6     ,30542.8     ,0.95
> > icelake     ,avx         ,262159      ,0           ,3
> > ,30533.5     ,31468.8     ,0.97
> > icelake     ,avx         ,262175      ,3           ,0
> > ,29879.5     ,30933.7     ,0.97
> > icelake     ,avx         ,262207      ,3           ,5
> > ,30263.1     ,31445.0     ,0.96
> > icelake     ,avx         ,262144      ,0           ,127
> > ,30180.9     ,31405.3     ,0.96
> > icelake     ,avx         ,262144      ,0           ,255
> > ,30152.9     ,31372.5     ,0.96
> > icelake     ,avx         ,262144      ,0           ,256
> > ,28121.9     ,28990.9     ,0.97
> > icelake     ,avx         ,262144      ,0           ,4064
> > ,29785.2     ,31078.4     ,0.96
> > icelake     ,avx         ,524295      ,0           ,0
> > ,76045.7     ,75824.3     ,1.0
> > icelake     ,avx         ,524303      ,0           ,3
> > ,79303.7     ,80433.3     ,0.99
> > icelake     ,avx         ,524319      ,3           ,0
> > ,79323.8     ,79411.3     ,1.0
> > icelake     ,avx         ,524351      ,3           ,5
> > ,79797.9     ,80179.4     ,1.0
> > icelake     ,avx         ,524288      ,0           ,127
> > ,80046.7     ,80254.1     ,1.0
> > icelake     ,avx         ,524288      ,0           ,255
> > ,78580.6     ,79210.4     ,0.99
> > icelake     ,avx         ,524288      ,0           ,256
> > ,75464.4     ,75184.2     ,1.0
> > icelake     ,avx         ,524288      ,0           ,4064
> > ,78863.6     ,78677.9     ,1.0
> > icelake     ,avx         ,1048583     ,0           ,0
> > ,131017.9    ,133962.4    ,0.98
> > icelake     ,avx         ,1048591     ,0           ,3
> > ,143451.3    ,210311.7    ,0.68
> > icelake     ,avx         ,1048607     ,3           ,0
> > ,136944.0    ,138426.4    ,0.99
> > icelake     ,avx         ,1048639     ,3           ,5
> > ,143594.3    ,209887.9    ,0.68
> > icelake     ,avx         ,1048576     ,0           ,127
> > ,156462.0    ,218873.2    ,0.71
> > icelake     ,avx         ,1048576     ,0           ,255
> > ,148026.3    ,179419.0    ,0.83
> > icelake     ,avx         ,1048576     ,0           ,256
> > ,143365.7    ,137816.3    ,1.04
> > icelake     ,avx         ,1048576     ,0           ,4064
> > ,131683.4    ,132731.6    ,0.99
> > icelake     ,avx         ,2097159     ,0           ,0
> > ,263807.1    ,267984.5    ,0.98
> > icelake     ,avx         ,2097167     ,0           ,3
> > ,286949.8    ,422279.2    ,0.68
> > icelake     ,avx         ,2097183     ,3           ,0
> > ,274675.6    ,276702.2    ,0.99
> > icelake     ,avx         ,2097215     ,3           ,5
> > ,286681.7    ,420176.7    ,0.68
> > icelake     ,avx         ,2097152     ,0           ,127
> > ,314499.2    ,437864.2    ,0.72
> > icelake     ,avx         ,2097152     ,0           ,255
> > ,297458.4    ,359520.9    ,0.83
> > icelake     ,avx         ,2097152     ,0           ,256
> > ,285883.2    ,276043.2    ,1.04
> > icelake     ,avx         ,2097152     ,0           ,4064
> > ,263436.6    ,265516.6    ,0.99
> > icelake     ,avx         ,4194311     ,0           ,0
> > ,529119.4    ,536745.2    ,0.99
> > icelake     ,avx         ,4194319     ,0           ,3
> > ,573960.0    ,839002.3    ,0.68
> > icelake     ,avx         ,4194335     ,3           ,0
> > ,550617.2    ,553117.5    ,1.0
> > icelake     ,avx         ,4194367     ,3           ,5
> > ,572742.8    ,838784.5    ,0.68
> > icelake     ,avx         ,4194304     ,0           ,127
> > ,629413.6    ,876512.1    ,0.72
> > icelake     ,avx         ,4194304     ,0           ,255
> > ,594224.1    ,717425.1    ,0.83
> > icelake     ,avx         ,4194304     ,0           ,256
> > ,573365.0    ,552538.3    ,1.04
> > icelake     ,avx         ,4194304     ,0           ,4064
> > ,527459.3    ,531907.1    ,0.99
> > icelake     ,avx         ,8388615     ,0           ,0
> > ,1094256.8   ,1145619.9   ,0.96
> > icelake     ,avx         ,8388623     ,0           ,3
> > ,1170367.1   ,1700076.4   ,0.69
> > icelake     ,avx         ,8388639     ,3           ,0
> > ,1136168.1   ,1174752.4   ,0.97
> > icelake     ,avx         ,8388671     ,3           ,5
> > ,1172015.6   ,1703032.8   ,0.69
> > icelake     ,avx         ,8388608     ,0           ,127
> > ,1276748.6   ,1771351.9   ,0.72
> > icelake     ,avx         ,8388608     ,0           ,255
> > ,1207712.0   ,1449267.0   ,0.83
> > icelake     ,avx         ,8388608     ,0           ,256
> > ,1167958.9   ,1178243.1   ,0.99
> > icelake     ,avx         ,8388608     ,0           ,4064
> > ,1106155.9   ,1145128.6   ,0.97
> > icelake     ,avx         ,16777223    ,0           ,0
> > ,2479317.5   ,2630301.0   ,0.94
> > icelake     ,avx         ,16777231    ,0           ,3
> > ,2643303.6   ,3536980.7   ,0.75
> > icelake     ,avx         ,16777247    ,3           ,0
> > ,2571967.0   ,2672246.4   ,0.96
> > icelake     ,avx         ,16777279    ,3           ,5
> > ,2641320.5   ,3538388.9   ,0.75
> > icelake     ,avx         ,16777216    ,0           ,127
> > ,2832921.6   ,3593702.5   ,0.79
> > icelake     ,avx         ,16777216    ,0           ,255
> > ,2700272.1   ,3025346.1   ,0.89
> > icelake     ,avx         ,16777216    ,0           ,256
> > ,2622133.7   ,2709087.6   ,0.97
> > icelake     ,avx         ,16777216    ,0           ,4064
> > ,2475020.7   ,2610977.8   ,0.95
> > icelake     ,avx         ,33554439    ,0           ,0
> > ,5190103.1   ,5576047.9   ,0.93
> > icelake     ,avx         ,33554447    ,0           ,3
> > ,5477752.1   ,7215479.2   ,0.76
> > icelake     ,avx         ,33554463    ,3           ,0
> > ,5338711.7   ,5625026.7   ,0.95
> > icelake     ,avx         ,33554495    ,3           ,5
> > ,5505164.8   ,7223660.8   ,0.76
> > icelake     ,avx         ,33554432    ,0           ,127
> > ,5859232.3   ,7279581.9   ,0.8
> > icelake     ,avx         ,33554432    ,0           ,255
> > ,5681634.7   ,6156488.6   ,0.92
> > icelake     ,avx         ,33554432    ,0           ,256
> > ,5440721.4   ,5728347.4   ,0.95
> > icelake     ,avx         ,33554432    ,0           ,4064
> > ,5191213.2   ,5538716.4   ,0.94
> > icelake     ,avx512      ,65543       ,0           ,0
> > ,5563.5      ,5634.1      ,0.99
> > icelake     ,avx512      ,65551       ,0           ,3
> > ,5864.1      ,5728.4      ,1.02
> > icelake     ,avx512      ,65567       ,3           ,0
> > ,5720.2      ,5625.3      ,1.02
> > icelake     ,avx512      ,65599       ,3           ,5
> > ,5857.2      ,5722.0      ,1.02
> > icelake     ,avx512      ,65536       ,0           ,127
> > ,6040.7      ,5844.0      ,1.03
> > icelake     ,avx512      ,65536       ,0           ,255
> > ,5826.5      ,5799.6      ,1.0
> > icelake     ,avx512      ,65536       ,0           ,256
> > ,5234.4      ,5230.0      ,1.0
> > icelake     ,avx512      ,65536       ,0           ,4064
> > ,5800.7      ,5655.4      ,1.03
> > icelake     ,avx512      ,131079      ,0           ,0
> > ,12591.4     ,11767.1     ,1.07
> > icelake     ,avx512      ,131087      ,0           ,3
> > ,12694.9     ,12292.1     ,1.03
> > icelake     ,avx512      ,131103      ,3           ,0
> > ,11374.7     ,12236.3     ,0.93
> > icelake     ,avx512      ,131135      ,3           ,5
> > ,11958.2     ,11745.5     ,1.02
> > icelake     ,avx512      ,131072      ,0           ,127
> > ,11803.4     ,11908.6     ,0.99
> > icelake     ,avx512      ,131072      ,0           ,255
> > ,11569.0     ,11487.9     ,1.01
> > icelake     ,avx512      ,131072      ,0           ,256
> > ,11087.6     ,10456.4     ,1.06
> > icelake     ,avx512      ,131072      ,0           ,4064
> > ,11166.0     ,11248.2     ,0.99
> > icelake     ,avx512      ,262151      ,0           ,0
> > ,30232.1     ,29932.7     ,1.01
> > icelake     ,avx512      ,262159      ,0           ,3
> > ,30093.8     ,31315.1     ,0.96
> > icelake     ,avx512      ,262175      ,3           ,0
> > ,30147.7     ,30643.4     ,0.98
> > icelake     ,avx512      ,262207      ,3           ,5
> > ,29985.9     ,31479.8     ,0.95
> > icelake     ,avx512      ,262144      ,0           ,127
> > ,30099.7     ,31552.9     ,0.95
> > icelake     ,avx512      ,262144      ,0           ,255
> > ,29772.8     ,30698.1     ,0.97
> > icelake     ,avx512      ,262144      ,0           ,256
> > ,28109.3     ,28957.9     ,0.97
> > icelake     ,avx512      ,262144      ,0           ,4064
> > ,29787.5     ,30637.2     ,0.97
> > icelake     ,avx512      ,524295      ,0           ,0
> > ,75920.7     ,75047.1     ,1.01
> > icelake     ,avx512      ,524303      ,0           ,3
> > ,79218.6     ,79529.2     ,1.0
> > icelake     ,avx512      ,524319      ,3           ,0
> > ,78446.9     ,78550.7     ,1.0
> > icelake     ,avx512      ,524351      ,3           ,5
> > ,79055.0     ,79425.2     ,1.0
> > icelake     ,avx512      ,524288      ,0           ,127
> > ,79070.6     ,79626.7     ,0.99
> > icelake     ,avx512      ,524288      ,0           ,255
> > ,77891.8     ,78078.3     ,1.0
> > icelake     ,avx512      ,524288      ,0           ,256
> > ,74797.3     ,74436.9     ,1.0
> > icelake     ,avx512      ,524288      ,0           ,4064
> > ,78339.3     ,78337.2     ,1.0
> > icelake     ,avx512      ,1048583     ,0           ,0
> > ,131427.6    ,133891.3    ,0.98
> > icelake     ,avx512      ,1048591     ,0           ,3
> > ,143984.1    ,142003.7    ,1.01
> > icelake     ,avx512      ,1048607     ,3           ,0
> > ,137547.9    ,134450.1    ,1.02
> > icelake     ,avx512      ,1048639     ,3           ,5
> > ,144630.4    ,142174.6    ,1.02
> > icelake     ,avx512      ,1048576     ,0           ,127
> > ,149810.7    ,142684.9    ,1.05
> > icelake     ,avx512      ,1048576     ,0           ,255
> > ,156212.6    ,143509.2    ,1.09
> > icelake     ,avx512      ,1048576     ,0           ,256
> > ,153776.9    ,139788.0    ,1.1
> > icelake     ,avx512      ,1048576     ,0           ,4064
> > ,137926.6    ,134832.8    ,1.02
> > icelake     ,avx512      ,2097159     ,0           ,0
> > ,263465.3    ,267681.6    ,0.98
> > icelake     ,avx512      ,2097167     ,0           ,3
> > ,288947.7    ,284129.9    ,1.02
> > icelake     ,avx512      ,2097183     ,3           ,0
> > ,275395.5    ,269216.0    ,1.02
> > icelake     ,avx512      ,2097215     ,3           ,5
> > ,289131.5    ,284475.3    ,1.02
> > icelake     ,avx512      ,2097152     ,0           ,127
> > ,299404.5    ,286193.2    ,1.05
> > icelake     ,avx512      ,2097152     ,0           ,255
> > ,312913.2    ,286785.6    ,1.09
> > icelake     ,avx512      ,2097152     ,0           ,256
> > ,307882.7    ,279708.7    ,1.1
> > icelake     ,avx512      ,2097152     ,0           ,4064
> > ,275552.3    ,269867.0    ,1.02
> > icelake     ,avx512      ,4194311     ,0           ,0
> > ,526480.1    ,536038.9    ,0.98
> > icelake     ,avx512      ,4194319     ,0           ,3
> > ,579122.9    ,569512.5    ,1.02
> > icelake     ,avx512      ,4194335     ,3           ,0
> > ,551658.1    ,542973.3    ,1.02
> > icelake     ,avx512      ,4194367     ,3           ,5
> > ,578575.2    ,569497.2    ,1.02
> > icelake     ,avx512      ,4194304     ,0           ,127
> > ,599943.6    ,569138.2    ,1.05
> > icelake     ,avx512      ,4194304     ,0           ,255
> > ,628419.2    ,575908.4    ,1.09
> > icelake     ,avx512      ,4194304     ,0           ,256
> > ,617242.8    ,561417.7    ,1.1
> > icelake     ,avx512      ,4194304     ,0           ,4064
> > ,552012.3    ,540617.2    ,1.02
> > icelake     ,avx512      ,8388615     ,0           ,0
> > ,1092471.4   ,1133834.9   ,0.96
> > icelake     ,avx512      ,8388623     ,0           ,3
> > ,1185623.5   ,1218150.0   ,0.97
> > icelake     ,avx512      ,8388639     ,3           ,0
> > ,1142647.1   ,1139201.6   ,1.0
> > icelake     ,avx512      ,8388671     ,3           ,5
> > ,1183702.5   ,1225474.6   ,0.97
> > icelake     ,avx512      ,8388608     ,0           ,127
> > ,1231862.8   ,1221685.1   ,1.01
> > icelake     ,avx512      ,8388608     ,0           ,255
> > ,1290816.7   ,1221576.2   ,1.06
> > icelake     ,avx512      ,8388608     ,0           ,256
> > ,1299047.6   ,1195021.2   ,1.09
> > icelake     ,avx512      ,8388608     ,0           ,4064
> > ,1139648.9   ,1140113.0   ,1.0
> > icelake     ,avx512      ,16777223    ,0           ,0
> > ,2464861.2   ,2599120.4   ,0.95
> > icelake     ,avx512      ,16777231    ,0           ,3
> > ,2651029.7   ,2758867.1   ,0.96
> > icelake     ,avx512      ,16777247    ,3           ,0
> > ,2570099.8   ,2601099.4   ,0.99
> > icelake     ,avx512      ,16777279    ,3           ,5
> > ,2660529.4   ,2762598.6   ,0.96
> > icelake     ,avx512      ,16777216    ,0           ,127
> > ,2759531.7   ,2756811.1   ,1.0
> > icelake     ,avx512      ,16777216    ,0           ,255
> > ,2878568.5   ,2777650.3   ,1.04
> > icelake     ,avx512      ,16777216    ,0           ,256
> > ,2931879.3   ,2709687.7   ,1.08
> > icelake     ,avx512      ,16777216    ,0           ,4064
> > ,2587161.1   ,2632011.2   ,0.98
> > icelake     ,avx512      ,33554439    ,0           ,0
> > ,5175406.0   ,5528857.2   ,0.94
> > icelake     ,avx512      ,33554447    ,0           ,3
> > ,5537561.9   ,5818119.1   ,0.95
> > icelake     ,avx512      ,33554463    ,3           ,0
> > ,5435099.5   ,5560442.2   ,0.98
> > icelake     ,avx512      ,33554495    ,3           ,5
> > ,5546314.9   ,5800995.0   ,0.96
> > icelake     ,avx512      ,33554432    ,0           ,127
> > ,5770248.0   ,5781104.9   ,1.0
> > icelake     ,avx512      ,33554432    ,0           ,255
> > ,6019120.7   ,5836023.3   ,1.03
> > icelake     ,avx512      ,33554432    ,0           ,256
> > ,6107033.4   ,5681798.8   ,1.07
> > icelake     ,avx512      ,33554432    ,0           ,4064
> > ,5356238.5   ,5598521.5   ,0.96
> > skylake     ,sse2        ,65543       ,0           ,0
> > ,3091.4      ,2940.2      ,1.05
> > skylake     ,sse2        ,65551       ,0           ,3
> > ,3682.6      ,3403.7      ,1.08
> > skylake     ,sse2        ,65567       ,3           ,0
> > ,3031.3      ,3070.2      ,0.99
> > skylake     ,sse2        ,65599       ,3           ,5
> > ,3731.2      ,3718.7      ,1.0
> > skylake     ,sse2        ,65536       ,0           ,127
> > ,3642.3      ,3390.5      ,1.07
> > skylake     ,sse2        ,65536       ,0           ,255
> > ,3493.9      ,3333.0      ,1.05
> > skylake     ,sse2        ,65536       ,0           ,256
> > ,3043.2      ,2981.0      ,1.02
> > skylake     ,sse2        ,65536       ,0           ,4064
> > ,2796.6      ,2843.9      ,0.98
> > skylake     ,sse2        ,131079      ,0           ,0
> > ,6347.4      ,6309.8      ,1.01
> > skylake     ,sse2        ,131087      ,0           ,3
> > ,7318.4      ,7486.2      ,0.98
> > skylake     ,sse2        ,131103      ,3           ,0
> > ,6297.4      ,6516.8      ,0.97
> > skylake     ,sse2        ,131135      ,3           ,5
> > ,7544.5      ,7823.5      ,0.96
> > skylake     ,sse2        ,131072      ,0           ,127
> > ,7426.4      ,7554.3      ,0.98
> > skylake     ,sse2        ,131072      ,0           ,255
> > ,7349.0      ,7195.4      ,1.02
> > skylake     ,sse2        ,131072      ,0           ,256
> > ,7068.1      ,6804.8      ,1.04
> > skylake     ,sse2        ,131072      ,0           ,4064
> > ,6884.6      ,7566.7      ,0.91
> > skylake     ,sse2        ,262151      ,0           ,0
> > ,15848.1     ,15552.2     ,1.02
> > skylake     ,sse2        ,262159      ,0           ,3
> > ,17864.6     ,16787.9     ,1.06
> > skylake     ,sse2        ,262175      ,3           ,0
> > ,15748.1     ,16266.0     ,0.97
> > skylake     ,sse2        ,262207      ,3           ,5
> > ,17022.3     ,17229.8     ,0.99
> > skylake     ,sse2        ,262144      ,0           ,127
> > ,16158.7     ,16093.6     ,1.0
> > skylake     ,sse2        ,262144      ,0           ,255
> > ,15670.7     ,15949.2     ,0.98
> > skylake     ,sse2        ,262144      ,0           ,256
> > ,14806.3     ,14970.3     ,0.99
> > skylake     ,sse2        ,262144      ,0           ,4064
> > ,14751.7     ,15008.2     ,0.98
> > skylake     ,sse2        ,524295      ,0           ,0
> > ,32874.8     ,33731.2     ,0.97
> > skylake     ,sse2        ,524303      ,0           ,3
> > ,34035.1     ,34777.8     ,0.98
> > skylake     ,sse2        ,524319      ,3           ,0
> > ,34325.6     ,34108.9     ,1.01
> > skylake     ,sse2        ,524351      ,3           ,5
> > ,34853.5     ,35624.4     ,0.98
> > skylake     ,sse2        ,524288      ,0           ,127
> > ,33437.4     ,33816.7     ,0.99
> > skylake     ,sse2        ,524288      ,0           ,255
> > ,33256.1     ,33664.7     ,0.99
> > skylake     ,sse2        ,524288      ,0           ,256
> > ,32006.3     ,32396.3     ,0.99
> > skylake     ,sse2        ,524288      ,0           ,4064
> > ,32284.7     ,32713.9     ,0.99
> > skylake     ,sse2        ,1048583     ,0           ,0
> > ,71891.7     ,73858.4     ,0.97
> > skylake     ,sse2        ,1048591     ,0           ,3
> > ,74621.3     ,74389.7     ,1.0
> > skylake     ,sse2        ,1048607     ,3           ,0
> > ,72515.0     ,73573.2     ,0.99
> > skylake     ,sse2        ,1048639     ,3           ,5
> > ,72471.7     ,73782.6     ,0.98
> > skylake     ,sse2        ,1048576     ,0           ,127
> > ,77638.6     ,82474.6     ,0.94
> > skylake     ,sse2        ,1048576     ,0           ,255
> > ,71870.0     ,71933.6     ,1.0
> > skylake     ,sse2        ,1048576     ,0           ,256
> > ,70410.0     ,73243.6     ,0.96
> > skylake     ,sse2        ,1048576     ,0           ,4064
> > ,71267.1     ,72274.6     ,0.99
> > skylake     ,sse2        ,2097159     ,0           ,0
> > ,140052.6    ,144880.1    ,0.97
> > skylake     ,sse2        ,2097167     ,0           ,3
> > ,146626.5    ,147972.6    ,0.99
> > skylake     ,sse2        ,2097183     ,3           ,0
> > ,141750.1    ,146353.6    ,0.97
> > skylake     ,sse2        ,2097215     ,3           ,5
> > ,144169.0    ,148120.1    ,0.97
> > skylake     ,sse2        ,2097152     ,0           ,127
> > ,156575.9    ,165844.4    ,0.94
> > skylake     ,sse2        ,2097152     ,0           ,255
> > ,144277.7    ,146971.5    ,0.98
> > skylake     ,sse2        ,2097152     ,0           ,256
> > ,143047.4    ,146810.9    ,0.97
> > skylake     ,sse2        ,2097152     ,0           ,4064
> > ,142795.6    ,145805.8    ,0.98
> > skylake     ,sse2        ,4194311     ,0           ,0
> > ,284353.3    ,298092.5    ,0.95
> > skylake     ,sse2        ,4194319     ,0           ,3
> > ,296656.4    ,311960.2    ,0.95
> > skylake     ,sse2        ,4194335     ,3           ,0
> > ,285922.6    ,304100.5    ,0.94
> > skylake     ,sse2        ,4194367     ,3           ,5
> > ,297135.4    ,312532.5    ,0.95
> > skylake     ,sse2        ,4194304     ,0           ,127
> > ,323938.6    ,340414.3    ,0.95
> > skylake     ,sse2        ,4194304     ,0           ,255
> > ,301460.9    ,310042.7    ,0.97
> > skylake     ,sse2        ,4194304     ,0           ,256
> > ,287155.8    ,303580.6    ,0.95
> > skylake     ,sse2        ,4194304     ,0           ,4064
> > ,291006.2    ,302441.3    ,0.96
> > skylake     ,sse2        ,8388615     ,0           ,0
> > ,714424.7    ,747484.3    ,0.96
> > skylake     ,sse2        ,8388623     ,0           ,3
> > ,748995.5    ,774116.5    ,0.97
> > skylake     ,sse2        ,8388639     ,3           ,0
> > ,720563.4    ,757386.9    ,0.95
> > skylake     ,sse2        ,8388671     ,3           ,5
> > ,748028.7    ,773907.8    ,0.97
> > skylake     ,sse2        ,8388608     ,0           ,127
> > ,750775.3    ,780245.2    ,0.96
> > skylake     ,sse2        ,8388608     ,0           ,255
> > ,724940.3    ,764197.8    ,0.95
> > skylake     ,sse2        ,8388608     ,0           ,256
> > ,722035.0    ,759408.9    ,0.95
> > skylake     ,sse2        ,8388608     ,0           ,4064
> > ,756977.8    ,755532.4    ,1.0
> > skylake     ,sse2        ,16777223    ,0           ,0
> > ,1971686.0   ,2111263.4   ,0.93
> > skylake     ,sse2        ,16777231    ,0           ,3
> > ,1953608.9   ,2128493.8   ,0.92
> > skylake     ,sse2        ,16777247    ,3           ,0
> > ,1967075.6   ,2103772.3   ,0.94
> > skylake     ,sse2        ,16777279    ,3           ,5
> > ,1950851.6   ,2133601.6   ,0.91
> > skylake     ,sse2        ,16777216    ,0           ,127
> > ,1991168.2   ,2078249.3   ,0.96
> > skylake     ,sse2        ,16777216    ,0           ,255
> > ,1958502.9   ,2111955.5   ,0.93
> > skylake     ,sse2        ,16777216    ,0           ,256
> > ,1965103.7   ,2114293.0   ,0.93
> > skylake     ,sse2        ,16777216    ,0           ,4064
> > ,1958381.3   ,2103438.6   ,0.93
> > skylake     ,sse2        ,33554439    ,0           ,0
> > ,4456144.2   ,4660837.1   ,0.96
> > skylake     ,sse2        ,33554447    ,0           ,3
> > ,4431097.0   ,4679042.6   ,0.95
> > skylake     ,sse2        ,33554463    ,3           ,0
> > ,4448225.6   ,4648538.3   ,0.96
> > skylake     ,sse2        ,33554495    ,3           ,5
> > ,4427743.0   ,4678340.1   ,0.95
> > skylake     ,sse2        ,33554432    ,0           ,127
> > ,4437517.3   ,4552005.9   ,0.97
> > skylake     ,sse2        ,33554432    ,0           ,255
> > ,4427135.1   ,4543412.0   ,0.97
> > skylake     ,sse2        ,33554432    ,0           ,256
> > ,4441311.2   ,4658315.5   ,0.95
> > skylake     ,sse2        ,33554432    ,0           ,4064
> > ,4429798.4   ,4659499.6   ,0.95
> > skylake     ,avx         ,65543       ,0           ,0
> > ,3115.8      ,3043.7      ,1.02
> > skylake     ,avx         ,65551       ,0           ,3
> > ,3673.2      ,3551.7      ,1.03
> > skylake     ,avx         ,65567       ,3           ,0
> > ,3024.6      ,2887.4      ,1.05
> > skylake     ,avx         ,65599       ,3           ,5
> > ,3907.8      ,3636.4      ,1.07
> > skylake     ,avx         ,65536       ,0           ,127
> > ,3539.2      ,3372.3      ,1.05
> > skylake     ,avx         ,65536       ,0           ,255
> > ,3489.9      ,3344.0      ,1.04
> > skylake     ,avx         ,65536       ,0           ,256
> > ,3059.0      ,2924.4      ,1.05
> > skylake     ,avx         ,65536       ,0           ,4064
> > ,2805.0      ,2869.3      ,0.98
> > skylake     ,avx         ,131079      ,0           ,0
> > ,6129.2      ,6263.4      ,0.98
> > skylake     ,avx         ,131087      ,0           ,3
> > ,7096.8      ,7570.0      ,0.94
> > skylake     ,avx         ,131103      ,3           ,0
> > ,6394.5      ,6842.5      ,0.93
> > skylake     ,avx         ,131135      ,3           ,5
> > ,7462.8      ,7776.0      ,0.96
> > skylake     ,avx         ,131072      ,0           ,127
> > ,7726.9      ,7428.5      ,1.04
> > skylake     ,avx         ,131072      ,0           ,255
> > ,7167.4      ,7278.9      ,0.98
> > skylake     ,avx         ,131072      ,0           ,256
> > ,7197.9      ,6284.3      ,1.15
> > skylake     ,avx         ,131072      ,0           ,4064
> > ,6984.0      ,6940.4      ,1.01
> > skylake     ,avx         ,262151      ,0           ,0
> > ,15787.3     ,16403.1     ,0.96
> > skylake     ,avx         ,262159      ,0           ,3
> > ,17800.1     ,17628.1     ,1.01
> > skylake     ,avx         ,262175      ,3           ,0
> > ,16622.8     ,16244.3     ,1.02
> > skylake     ,avx         ,262207      ,3           ,5
> > ,16989.7     ,17509.0     ,0.97
> > skylake     ,avx         ,262144      ,0           ,127
> > ,16190.8     ,15971.8     ,1.01
> > skylake     ,avx         ,262144      ,0           ,255
> > ,15787.1     ,15876.7     ,0.99
> > skylake     ,avx         ,262144      ,0           ,256
> > ,14840.1     ,14997.0     ,0.99
> > skylake     ,avx         ,262144      ,0           ,4064
> > ,15743.0     ,14976.2     ,1.05
> > skylake     ,avx         ,524295      ,0           ,0
> > ,32848.5     ,33397.8     ,0.98
> > skylake     ,avx         ,524303      ,0           ,3
> > ,34872.1     ,34862.2     ,1.0
> > skylake     ,avx         ,524319      ,3           ,0
> > ,33784.6     ,34023.8     ,0.99
> > skylake     ,avx         ,524351      ,3           ,5
> > ,35337.1     ,35364.5     ,1.0
> > skylake     ,avx         ,524288      ,0           ,127
> > ,33624.5     ,33596.5     ,1.0
> > skylake     ,avx         ,524288      ,0           ,255
> > ,33390.7     ,33842.8     ,0.99
> > skylake     ,avx         ,524288      ,0           ,256
> > ,31937.0     ,32357.2     ,0.99
> > skylake     ,avx         ,524288      ,0           ,4064
> > ,32233.5     ,32267.3     ,1.0
> > skylake     ,avx         ,1048583     ,0           ,0
> > ,100354.7    ,105840.6    ,0.95
> > skylake     ,avx         ,1048591     ,0           ,3
> > ,68102.5     ,67496.0     ,1.01
> > skylake     ,avx         ,1048607     ,3           ,0
> > ,66146.1     ,67540.0     ,0.98
> > skylake     ,avx         ,1048639     ,3           ,5
> > ,67530.8     ,67726.4     ,1.0
> > skylake     ,avx         ,1048576     ,0           ,127
> > ,67105.6     ,66533.5     ,1.01
> > skylake     ,avx         ,1048576     ,0           ,255
> > ,67101.8     ,65666.7     ,1.02
> > skylake     ,avx         ,1048576     ,0           ,256
> > ,65092.6     ,67103.0     ,0.97
> > skylake     ,avx         ,1048576     ,0           ,4064
> > ,65700.0     ,67031.5     ,0.98
> > skylake     ,avx         ,2097159     ,0           ,0
> > ,133101.0    ,135171.6    ,0.98
> > skylake     ,avx         ,2097167     ,0           ,3
> > ,134174.4    ,135782.1    ,0.99
> > skylake     ,avx         ,2097183     ,3           ,0
> > ,132056.4    ,134170.0    ,0.98
> > skylake     ,avx         ,2097215     ,3           ,5
> > ,134413.5    ,136341.1    ,0.99
> > skylake     ,avx         ,2097152     ,0           ,127
> > ,133003.9    ,132992.1    ,1.0
> > skylake     ,avx         ,2097152     ,0           ,255
> > ,133344.3    ,132883.1    ,1.0
> > skylake     ,avx         ,2097152     ,0           ,256
> > ,134051.7    ,136185.8    ,0.98
> > skylake     ,avx         ,2097152     ,0           ,4064
> > ,132976.3    ,135029.4    ,0.98
> > skylake     ,avx         ,4194311     ,0           ,0
> > ,268004.1    ,282650.3    ,0.95
> > skylake     ,avx         ,4194319     ,0           ,3
> > ,270270.0    ,286700.3    ,0.94
> > skylake     ,avx         ,4194335     ,3           ,0
> > ,264288.5    ,279582.4    ,0.95
> > skylake     ,avx         ,4194367     ,3           ,5
> > ,270498.4    ,286294.5    ,0.94
> > skylake     ,avx         ,4194304     ,0           ,127
> > ,271219.3    ,275129.8    ,0.99
> > skylake     ,avx         ,4194304     ,0           ,255
> > ,269996.5    ,270227.6    ,1.0
> > skylake     ,avx         ,4194304     ,0           ,256
> > ,267901.1    ,281673.1    ,0.95
> > skylake     ,avx         ,4194304     ,0           ,4064
> > ,268390.0    ,279100.3    ,0.96
> > skylake     ,avx         ,8388615     ,0           ,0
> > ,803547.9    ,813229.9    ,0.99
> > skylake     ,avx         ,8388623     ,0           ,3
> > ,828872.4    ,869413.0    ,0.95
> > skylake     ,avx         ,8388639     ,3           ,0
> > ,818000.0    ,873781.7    ,0.94
> > skylake     ,avx         ,8388671     ,3           ,5
> > ,824679.0    ,863561.5    ,0.95
> > skylake     ,avx         ,8388608     ,0           ,127
> > ,800728.5    ,779000.8    ,1.03
> > skylake     ,avx         ,8388608     ,0           ,255
> > ,820071.4    ,770113.2    ,1.06
> > skylake     ,avx         ,8388608     ,0           ,256
> > ,825624.6    ,867247.7    ,0.95
> > skylake     ,avx         ,8388608     ,0           ,4064
> > ,830209.7    ,894086.6    ,0.93
> > skylake     ,avx         ,16777223    ,0           ,0
> > ,1989391.3   ,2132829.8   ,0.93
> > skylake     ,avx         ,16777231    ,0           ,3
> > ,1994225.1   ,2211556.0   ,0.9
> > skylake     ,avx         ,16777247    ,3           ,0
> > ,1993572.9   ,2213029.9   ,0.9
> > skylake     ,avx         ,16777279    ,3           ,5
> > ,2001956.9   ,2211769.7   ,0.91
> > skylake     ,avx         ,16777216    ,0           ,127
> > ,1968155.9   ,2127764.7   ,0.92
> > skylake     ,avx         ,16777216    ,0           ,255
> > ,1978305.1   ,2121371.3   ,0.93
> > skylake     ,avx         ,16777216    ,0           ,256
> > ,1993261.9   ,2206494.1   ,0.9
> > skylake     ,avx         ,16777216    ,0           ,4064
> > ,1993808.3   ,2198137.4   ,0.91
> > skylake     ,avx         ,33554439    ,0           ,0
> > ,4540216.7   ,4870021.8   ,0.93
> > skylake     ,avx         ,33554447    ,0           ,3
> > ,4483505.3   ,4850545.5   ,0.92
> > skylake     ,avx         ,33554463    ,3           ,0
> > ,4501944.5   ,4870922.4   ,0.92
> > skylake     ,avx         ,33554495    ,3           ,5
> > ,4484565.5   ,4845392.4   ,0.93
> > skylake     ,avx         ,33554432    ,0           ,127
> > ,4408639.3   ,4701698.6   ,0.94
> > skylake     ,avx         ,33554432    ,0           ,255
> > ,4445826.0   ,4678142.9   ,0.95
> > skylake     ,avx         ,33554432    ,0           ,256
> > ,4497953.2   ,4844498.6   ,0.93
> > skylake     ,avx         ,33554432    ,0           ,4064
> > ,4501572.4   ,4839209.4   ,0.93
> >
> > >
> > > --
> > > H.J.
  
H.J. Lu April 16, 2021, 12:59 p.m. UTC | #5
On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote:
> From: noah <goldstein.w.n@gmail.com>
> 
> No Bug. This commit updates the large memcpy case (no overlap). The
> update is to perform memcpy on either 2 or 4 contiguous pages at
> once. This 1) helps to alleviate the effects of false memory aliasing
> when destination and source have a close 4k alignment and 2) in most
> cases and for most DRAM units is a modestly more efficient access
> pattern. These changes are a clear performance improvement for
> VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> pass.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Issue was alignment related AFAICT. Added `.p2align 4` in front of the
> loops and no longer see any meaningful regression.
> 
> Also added back the temporal stores for the tail. Saw a regression
> when doing these tests.
> 
> Two tables below for skylake and icelake numbers for the areas around
> where you saw the regression. Below is all data from the tests.
> 
> N = 10.
> 
> Skylake
> Len         ,align1      ,align2      ,new mean    ,old mean  
> 4103        ,0           ,64          ,84.5        ,88.6
> 4111        ,0           ,3           ,99.0        ,99.9
> 4127        ,3           ,0           ,102.1       ,102.3
> 4159        ,3           ,7           ,88.7        ,90.9
> 4223        ,9           ,5           ,88.1        ,87.4
> 8199        ,0           ,64          ,146.7       ,150.2
> 8207        ,0           ,3           ,167.9       ,168.5
> 8223        ,3           ,0           ,168.5       ,168.1
> 8255        ,3           ,7           ,157.0       ,159.2
> 8319        ,9           ,5           ,155.5       ,155.7
> 16391       ,0           ,64          ,286.2       ,288.8
> 16399       ,0           ,3           ,307.0       ,308.7
> 16415       ,3           ,0           ,307.4       ,307.6
> 16447       ,3           ,7           ,294.6       ,295.5
> 16511       ,9           ,5           ,291.5       ,462.1
> 32775       ,0           ,64          ,603.4       ,601.5
> 32783       ,0           ,3           ,604.8       ,606.4
> 32799       ,3           ,0           ,603.0       ,604.1
> 32831       ,3           ,7           ,600.2       ,737.3
> 32895       ,9           ,5           ,604.4       ,599.5
> 65543       ,0           ,64          ,1873.5      ,1854.3
> 65551       ,0           ,3           ,1862.9      ,1846.6
> 65567       ,3           ,0           ,1885.5      ,1966.0
> 65599       ,3           ,7           ,1833.2      ,1833.1
> 65663       ,9           ,5           ,1884.9      ,1887.4
> 131079      ,0           ,64          ,3944.3      ,3949.4
> 131087      ,0           ,3           ,3927.3      ,3913.3
> 131103      ,3           ,0           ,4415.8      ,4169.4
> 131135      ,3           ,7           ,4224.5      ,4157.6
> 131199      ,9           ,5           ,5974.0      ,4983.8
> 262151      ,0           ,64          ,11050.2     ,10620.6
> 262159      ,0           ,3           ,9932.8      ,10037.3
> 262175      ,3           ,0           ,10188.8     ,9206.6
> 262207      ,3           ,7           ,9633.3      ,9216.7
> 262271      ,9           ,5           ,9732.7      ,9345.3
> 524295      ,0           ,64          ,24823.9     ,24880.7
> 524303      ,0           ,3           ,24514.0     ,24556.7
> 524319      ,3           ,0           ,23974.4     ,24219.9
> 524351      ,3           ,7           ,24159.7     ,24207.0
> 524415      ,9           ,5           ,23946.5     ,24142.8
> 
> Icelake:
> Len         ,align1      ,align2      ,new mean    ,old mean  
> 4103        ,0           ,64          ,50.2        ,63.7
> 4111        ,0           ,3           ,63.7        ,65.1
> 4127        ,3           ,0           ,68.2        ,69.4
> 4159        ,3           ,7           ,59.6        ,68.0
> 4223        ,9           ,5           ,68.2        ,66.8
> 8199        ,0           ,64          ,92.1        ,89.9
> 8207        ,0           ,3           ,119.7       ,118.3
> 8223        ,3           ,0           ,119.1       ,120.9
> 8255        ,3           ,7           ,122.9       ,123.7
> 8319        ,9           ,5           ,122.1       ,121.8
> 16391       ,0           ,64          ,162.7       ,158.0
> 16399       ,0           ,3           ,227.6       ,234.1
> 16415       ,3           ,0           ,230.8       ,232.7
> 16447       ,3           ,7           ,226.8       ,232.6
> 16511       ,9           ,5           ,233.4       ,233.8
> 32775       ,0           ,64          ,312.2       ,301.8
> 32783       ,0           ,3           ,449.7       ,450.0
> 32799       ,3           ,0           ,452.7       ,455.9
> 32831       ,3           ,7           ,449.8       ,458.0
> 32895       ,9           ,5           ,456.3       ,459.4
> 65543       ,0           ,64          ,1460.6      ,1463.9
> 65551       ,0           ,3           ,1462.0      ,1465.4
> 65567       ,3           ,0           ,1466.6      ,1480.4
> 65599       ,3           ,7           ,1488.0      ,1488.9
> 65663       ,9           ,5           ,1680.8      ,1499.5
> 131079      ,0           ,64          ,2988.5      ,3010.1
> 131087      ,0           ,3           ,2995.5      ,2996.4
> 131103      ,3           ,0           ,3006.2      ,3000.5
> 131135      ,3           ,7           ,3032.4      ,3073.7
> 131199      ,9           ,5           ,3010.4      ,3027.4
> 262151      ,0           ,64          ,6143.2      ,6079.1
> 262159      ,0           ,3           ,6085.1      ,6075.8
> 262175      ,3           ,0           ,6088.0      ,6064.9
> 262207      ,3           ,7           ,6018.7      ,6023.5
> 262271      ,9           ,5           ,6019.8      ,5959.2
> 524295      ,0           ,64          ,14464.2     ,14095.1
> 524303      ,0           ,3           ,14761.6     ,14050.2
> 524319      ,3           ,0           ,14534.1     ,14087.5
> 524351      ,3           ,7           ,14147.7     ,13903.8
> 524415      ,9           ,5           ,14157.0     ,13982.9
> 
> 
> 
> cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean  
> skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
> 
>         
>  .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
>  1 file changed, 265 insertions(+), 73 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> index 897a3d9762..5e4a071f16 100644
> --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> @@ -35,7 +35,16 @@
>        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
>     7. If size >= __x86_shared_non_temporal_threshold and there is no
>        overlap between destination and source, use non-temporal store
> -      instead of aligned store.  */
> +      instead of aligned store copying from either 2 or 4 pages at
> +      once.
> +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> +      and source and destination do not page alias, copy from 2 pages
> +      at once using non-temporal stores. Page aliasing in this case is
> +      considered true if destination's page alignment - source's page
> +      alignment is less than 8 * VEC_SIZE.
> +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> +      and destination do page alias copy from 4 pages at once using
> +      non-temporal stores.  */
>  
>  #include <sysdep.h>
>  
> @@ -67,6 +76,34 @@
>  # endif
>  #endif
>  
> +#ifndef PAGE_SIZE
> +# define PAGE_SIZE 4096
> +#endif
> +
> +#if PAGE_SIZE != 4096
> +# error Unsupported PAGE_SIZE
> +#endif
> +
> +#ifndef LOG_PAGE_SIZE
> +# define LOG_PAGE_SIZE 12
> +#endif
> +
> +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> +# error Invalid LOG_PAGE_SIZE
> +#endif
> +
> +/* Bytes loaded from each page per large_memcpy inner loop iteration.  */
> +#if VEC_SIZE == 64
> +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> +#else
> +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> +#endif
> +
> +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> +#ifndef LOG_4X_MEMCPY_THRESH
> +# define LOG_4X_MEMCPY_THRESH 4
> +#endif
> +
>  /* Avoid short distance rep movsb only with non-SSE vector.  */
>  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
>  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> @@ -106,6 +143,28 @@
>  # error Unsupported PREFETCH_SIZE!
>  #endif
>  
> +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> +	VMOVU	(offset)base, vec0; \
> +	VMOVU	((offset) + VEC_SIZE)base, vec1;
> +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> +	VMOVNT  vec0, (offset)base; \
> +	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> +	VMOVU	(offset)base, vec0; \
> +	VMOVU	((offset) + VEC_SIZE)base, vec1; \
> +	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
> +	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
> +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> +	VMOVNT	vec0, (offset)base; \
> +	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
> +	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
> +	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
> +#else
> +# error Invalid LARGE_LOAD_SIZE
> +#endif
> +
>  #ifndef SECTION
>  # error SECTION is not defined!
>  #endif
> @@ -393,6 +452,15 @@ L(last_4x_vec):
>  	VZEROUPPER_RETURN
>  
>  L(more_8x_vec):
> +	/* Check if non-temporal move candidate.  */
> +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> +	/* Check non-temporal store threshold.  */
> +	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> +	ja	L(large_memcpy_2x)
> +#endif
> +	/* Entry if rdx is greater than non-temporal threshold but there
> +       is overlap.  */
> +L(more_8x_vec_check):
>  	cmpq	%rsi, %rdi
>  	ja	L(more_8x_vec_backward)
>  	/* Source == destination is less common.  */
> @@ -419,24 +487,21 @@ L(more_8x_vec):
>  	subq	%r8, %rdi
>  	/* Adjust length.  */
>  	addq	%r8, %rdx
> -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -	/* Check non-temporal store threshold.  */
> -	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
> -	ja	L(large_forward)
> -#endif
> +
> +	.p2align 4
>  L(loop_4x_vec_forward):
>  	/* Copy 4 * VEC a time forward.  */
>  	VMOVU	(%rsi), %VEC(0)
>  	VMOVU	VEC_SIZE(%rsi), %VEC(1)
>  	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
>  	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
> -	addq	$(VEC_SIZE * 4), %rsi
> -	subq	$(VEC_SIZE * 4), %rdx
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	addq	$-(VEC_SIZE * 4), %rdx
>  	VMOVA	%VEC(0), (%rdi)
>  	VMOVA	%VEC(1), VEC_SIZE(%rdi)
>  	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
>  	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
> -	addq	$(VEC_SIZE * 4), %rdi
> +	subq	$-(VEC_SIZE * 4), %rdi
>  	cmpq	$(VEC_SIZE * 4), %rdx
>  	ja	L(loop_4x_vec_forward)
>  	/* Store the last 4 * VEC.  */
> @@ -470,24 +535,21 @@ L(more_8x_vec_backward):
>  	subq	%r8, %r9
>  	/* Adjust length.  */
>  	subq	%r8, %rdx
> -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -	/* Check non-temporal store threshold.  */
> -	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
> -	ja	L(large_backward)
> -#endif
> +
> +	.p2align 4
>  L(loop_4x_vec_backward):
>  	/* Copy 4 * VEC a time backward.  */
>  	VMOVU	(%rcx), %VEC(0)
>  	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
>  	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
>  	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
> -	subq	$(VEC_SIZE * 4), %rcx
> -	subq	$(VEC_SIZE * 4), %rdx
> +	addq	$-(VEC_SIZE * 4), %rcx
> +	addq	$-(VEC_SIZE * 4), %rdx
>  	VMOVA	%VEC(0), (%r9)
>  	VMOVA	%VEC(1), -VEC_SIZE(%r9)
>  	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
>  	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
> -	subq	$(VEC_SIZE * 4), %r9
> +	addq	$-(VEC_SIZE * 4), %r9
>  	cmpq	$(VEC_SIZE * 4), %rdx
>  	ja	L(loop_4x_vec_backward)
>  	/* Store the first 4 * VEC.  */
> @@ -500,72 +562,202 @@ L(loop_4x_vec_backward):
>  	VZEROUPPER_RETURN
>  
>  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> -L(large_forward):
> +	.p2align 4
> +L(large_memcpy_2x):
> +	/* Compute absolute value of difference between source and
> +	   destination.  */
> +	movq	%rdi, %r9
> +	subq	%rsi, %r9
> +	movq	%r9, %r8
> +	leaq	-1(%r9), %rcx
> +	sarq	$63, %r8
> +	xorq	%r8, %r9
> +	subq	%r8, %r9
>  	/* Don't use non-temporal store if there is overlap between
> -	   destination and source since destination may be in cache
> -	   when source is loaded.  */
> -	leaq    (%rdi, %rdx), %r10
> -	cmpq    %r10, %rsi
> -	jb	L(loop_4x_vec_forward)
> -L(loop_large_forward):
> +	   destination and source since destination may be in cache when
> +	   source is loaded.  */
> +	cmpq	%r9, %rdx
> +	ja	L(more_8x_vec_check)
> +
> +	/* Cache align destination. First store the first 64 bytes then
> +	   adjust alignments.  */
> +	VMOVU	(%rsi), %VEC(8)
> +#if VEC_SIZE < 64
> +	VMOVU	VEC_SIZE(%rsi), %VEC(9)
> +#if VEC_SIZE < 32
> +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
> +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
> +#endif
> +#endif
> +	VMOVU	%VEC(8), (%rdi)
> +#if VEC_SIZE < 64
> +	VMOVU	%VEC(9), VEC_SIZE(%rdi)
> +#if VEC_SIZE < 32
> +	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
> +#endif
> +#endif
> +	/* Adjust source, destination, and size.  */
> +	movq	%rdi, %r8
> +	andq	$63, %r8
> +	/* Get the negative of offset for alignment.  */
> +	subq	$64, %r8
> +	/* Adjust source.  */
> +	subq	%r8, %rsi
> +	/* Adjust destination which should be aligned now.  */
> +	subq	%r8, %rdi
> +	/* Adjust length.  */
> +	addq	%r8, %rdx
> +
> +	/* Test if source and destination addresses will alias. If they do
> +	   the larger pipeline in large_memcpy_4x alleviates the
> +	   performance drop.  */
> +	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
> +	jz	L(large_memcpy_4x)
> +
> +	movq	%rdx, %r10
> +	shrq	$LOG_4X_MEMCPY_THRESH, %r10
> +	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
> +	jae	L(large_memcpy_4x)
> +
> +	/* edx will store remainder size for copying tail.  */
> +	andl	$(PAGE_SIZE * 2 - 1), %edx
> +	/* r10 stores outer loop counter.  */
> +	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> +	/* Copy 4x VEC at a time from 2 pages.  */
> +	.p2align 4
> +L(loop_large_memcpy_2x_outer):
> +	/* ecx stores inner loop counter.  */
> +	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> +L(loop_large_memcpy_2x_inner):
> +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> +	/* Load vectors from rsi.  */
> +	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +	subq	$-LARGE_LOAD_SIZE, %rsi
> +	/* Non-temporal store vectors to rdi.  */
> +	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +	subq	$-LARGE_LOAD_SIZE, %rdi
> +	decl	%ecx
> +	jnz	L(loop_large_memcpy_2x_inner)
> +	addq	$PAGE_SIZE, %rdi
> +	addq	$PAGE_SIZE, %rsi
> +	decq	%r10
> +	jne	L(loop_large_memcpy_2x_outer)
> +	sfence
> +
> +	/* Check if only last 4 loads are needed.  */
> +	cmpl	$(VEC_SIZE * 4), %edx
> +	jbe	L(large_memcpy_2x_end)
> +
> +	/* Handle the last 2 * PAGE_SIZE bytes.  */
> +L(loop_large_memcpy_2x_tail):
>  	/* Copy 4 * VEC a time forward with non-temporal stores.  */
> -	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> -	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> +	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
>  	VMOVU	(%rsi), %VEC(0)
>  	VMOVU	VEC_SIZE(%rsi), %VEC(1)
>  	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
>  	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
> -	addq	$PREFETCHED_LOAD_SIZE, %rsi
> -	subq	$PREFETCHED_LOAD_SIZE, %rdx
> -	VMOVNT	%VEC(0), (%rdi)
> -	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
> -	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
> -	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
> -	addq	$PREFETCHED_LOAD_SIZE, %rdi
> -	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
> -	ja	L(loop_large_forward)
> -	sfence
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	addl	$-(VEC_SIZE * 4), %edx
> +	VMOVA	%VEC(0), (%rdi)
> +	VMOVA	%VEC(1), VEC_SIZE(%rdi)
> +	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
> +	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpl	$(VEC_SIZE * 4), %edx
> +	ja	L(loop_large_memcpy_2x_tail)
> +
> +L(large_memcpy_2x_end):
>  	/* Store the last 4 * VEC.  */
> -	VMOVU	%VEC(5), (%rcx)
> -	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
> -	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
> -	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
> -	/* Store the first VEC.  */
> -	VMOVU	%VEC(4), (%r11)
> +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
> +
> +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
>  	VZEROUPPER_RETURN
>  
> -L(large_backward):
> -	/* Don't use non-temporal store if there is overlap between
> -	   destination and source since destination may be in cache
> -	   when source is loaded.  */
> -	leaq    (%rcx, %rdx), %r10
> -	cmpq    %r10, %r9
> -	jb	L(loop_4x_vec_backward)
> -L(loop_large_backward):
> -	/* Copy 4 * VEC a time backward with non-temporal stores.  */
> -	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> -	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> -	VMOVU	(%rcx), %VEC(0)
> -	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
> -	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
> -	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
> -	subq	$PREFETCHED_LOAD_SIZE, %rcx
> -	subq	$PREFETCHED_LOAD_SIZE, %rdx
> -	VMOVNT	%VEC(0), (%r9)
> -	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
> -	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
> -	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
> -	subq	$PREFETCHED_LOAD_SIZE, %r9
> -	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
> -	ja	L(loop_large_backward)
> +	.p2align 4
> +L(large_memcpy_4x):
> +	movq	%rdx, %r10
> +	/* edx will store remainder size for copying tail.  */
> +	andl	$(PAGE_SIZE * 4 - 1), %edx
> +	/* r10 stores outer loop counter.  */
> +	shrq	$(LOG_PAGE_SIZE + 2), %r10
> +	/* Copy 4x VEC at a time from 4 pages.  */
> +	.p2align 4
> +L(loop_large_memcpy_4x_outer):
> +	/* ecx stores inner loop counter.  */
> +	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> +L(loop_large_memcpy_4x_inner):
> +	/* Only one prefetch set per page as doing 4 pages give more time
> +	   for prefetcher to keep up.  */
> +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> +	/* Load vectors from rsi.  */
> +	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> +	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +	subq	$-LARGE_LOAD_SIZE, %rsi
> +	/* Non-temporal store vectors to rdi.  */
> +	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> +	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> +	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> +	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> +	subq	$-LARGE_LOAD_SIZE, %rdi
> +	decl	%ecx
> +	jnz	L(loop_large_memcpy_4x_inner)
> +	addq	$(PAGE_SIZE * 3), %rdi
> +	addq	$(PAGE_SIZE * 3), %rsi
> +	decq	%r10
> +	jne	L(loop_large_memcpy_4x_outer)
>  	sfence
> -	/* Store the first 4 * VEC.  */
> -	VMOVU	%VEC(4), (%rdi)
> -	VMOVU	%VEC(5), VEC_SIZE(%rdi)
> -	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
> -	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
> -	/* Store the last VEC.  */
> -	VMOVU	%VEC(8), (%r11)
> +	/* Check if only last 4 loads are needed.  */
> +	cmpl	$(VEC_SIZE * 4), %edx
> +	jbe	L(large_memcpy_4x_end)
> +
> +	/* Handle the last 4  * PAGE_SIZE bytes.  */
> +L(loop_large_memcpy_4x_tail):
> +	/* Copy 4 * VEC a time forward with temporal (aligned) stores.  */
> +	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> +	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> +	VMOVU	(%rsi), %VEC(0)
> +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
> +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
> +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
> +	subq	$-(VEC_SIZE * 4), %rsi
> +	addl	$-(VEC_SIZE * 4), %edx
> +	VMOVA	%VEC(0), (%rdi)
> +	VMOVA	%VEC(1), VEC_SIZE(%rdi)
> +	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
> +	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	cmpl	$(VEC_SIZE * 4), %edx
> +	ja	L(loop_large_memcpy_4x_tail)
> +
> +L(large_memcpy_4x_end):
> +	/* Store the last 4 * VEC.  */
> +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
> +
> +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> +	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> +	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> +	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
>  	VZEROUPPER_RETURN
>  #endif
>  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> -- 
> 2.29.2
> 

LGTM.  Please commit it.

Thanks.


H.J.
  
Noah Goldstein April 16, 2021, 4:34 p.m. UTC | #6
> LGTM.  Please commit it.

Are you saying that to me or someone else? If it's to me, what do you
mean? Is the patch not enough?

> Thanks.

On Fri, Apr 16, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote:
> > From: noah <goldstein.w.n@gmail.com>
> >
> > No Bug. This commit updates the large memcpy case (no overlap). The
> > update is to perform memcpy on either 2 or 4 contiguous pages at
> > once. This 1) helps to alleviate the effects of false memory aliasing
> > when destination and source have a close 4k alignment and 2) In most
> > cases and for most DRAM units is a modestly more efficient access
> > pattern. These changes are a clear performance improvement for
> > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > pass.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > Issue was alignment related AFAICT. Added `.p2align 4` in front of the
> > loops and no longer see any meaningful regression.
> >
> > Also added back the temporal stores for the tail. Saw a regression
> > when doing these tests.
> >
> > Two tables below for skylake and icelake numbers for the areas around
> > where you saw the regression. Below is all data from the tests.
> >
> > N = 10.
> >
> > Skylake
> > Len         ,align1      ,align2      ,new mean    ,old mean
> > 4103        ,0           ,64          ,84.5        ,88.6
> > 4111        ,0           ,3           ,99.0        ,99.9
> > 4127        ,3           ,0           ,102.1       ,102.3
> > 4159        ,3           ,7           ,88.7        ,90.9
> > 4223        ,9           ,5           ,88.1        ,87.4
> > 8199        ,0           ,64          ,146.7       ,150.2
> > 8207        ,0           ,3           ,167.9       ,168.5
> > 8223        ,3           ,0           ,168.5       ,168.1
> > 8255        ,3           ,7           ,157.0       ,159.2
> > 8319        ,9           ,5           ,155.5       ,155.7
> > 16391       ,0           ,64          ,286.2       ,288.8
> > 16399       ,0           ,3           ,307.0       ,308.7
> > 16415       ,3           ,0           ,307.4       ,307.6
> > 16447       ,3           ,7           ,294.6       ,295.5
> > 16511       ,9           ,5           ,291.5       ,462.1
> > 32775       ,0           ,64          ,603.4       ,601.5
> > 32783       ,0           ,3           ,604.8       ,606.4
> > 32799       ,3           ,0           ,603.0       ,604.1
> > 32831       ,3           ,7           ,600.2       ,737.3
> > 32895       ,9           ,5           ,604.4       ,599.5
> > 65543       ,0           ,64          ,1873.5      ,1854.3
> > 65551       ,0           ,3           ,1862.9      ,1846.6
> > 65567       ,3           ,0           ,1885.5      ,1966.0
> > 65599       ,3           ,7           ,1833.2      ,1833.1
> > 65663       ,9           ,5           ,1884.9      ,1887.4
> > 131079      ,0           ,64          ,3944.3      ,3949.4
> > 131087      ,0           ,3           ,3927.3      ,3913.3
> > 131103      ,3           ,0           ,4415.8      ,4169.4
> > 131135      ,3           ,7           ,4224.5      ,4157.6
> > 131199      ,9           ,5           ,5974.0      ,4983.8
> > 262151      ,0           ,64          ,11050.2     ,10620.6
> > 262159      ,0           ,3           ,9932.8      ,10037.3
> > 262175      ,3           ,0           ,10188.8     ,9206.6
> > 262207      ,3           ,7           ,9633.3      ,9216.7
> > 262271      ,9           ,5           ,9732.7      ,9345.3
> > 524295      ,0           ,64          ,24823.9     ,24880.7
> > 524303      ,0           ,3           ,24514.0     ,24556.7
> > 524319      ,3           ,0           ,23974.4     ,24219.9
> > 524351      ,3           ,7           ,24159.7     ,24207.0
> > 524415      ,9           ,5           ,23946.5     ,24142.8
> >
> > Icelake:
> > Len         ,align1      ,align2      ,new mean    ,old mean
> > 4103        ,0           ,64          ,50.2        ,63.7
> > 4111        ,0           ,3           ,63.7        ,65.1
> > 4127        ,3           ,0           ,68.2        ,69.4
> > 4159        ,3           ,7           ,59.6        ,68.0
> > 4223        ,9           ,5           ,68.2        ,66.8
> > 8199        ,0           ,64          ,92.1        ,89.9
> > 8207        ,0           ,3           ,119.7       ,118.3
> > 8223        ,3           ,0           ,119.1       ,120.9
> > 8255        ,3           ,7           ,122.9       ,123.7
> > 8319        ,9           ,5           ,122.1       ,121.8
> > 16391       ,0           ,64          ,162.7       ,158.0
> > 16399       ,0           ,3           ,227.6       ,234.1
> > 16415       ,3           ,0           ,230.8       ,232.7
> > 16447       ,3           ,7           ,226.8       ,232.6
> > 16511       ,9           ,5           ,233.4       ,233.8
> > 32775       ,0           ,64          ,312.2       ,301.8
> > 32783       ,0           ,3           ,449.7       ,450.0
> > 32799       ,3           ,0           ,452.7       ,455.9
> > 32831       ,3           ,7           ,449.8       ,458.0
> > 32895       ,9           ,5           ,456.3       ,459.4
> > 65543       ,0           ,64          ,1460.6      ,1463.9
> > 65551       ,0           ,3           ,1462.0      ,1465.4
> > 65567       ,3           ,0           ,1466.6      ,1480.4
> > 65599       ,3           ,7           ,1488.0      ,1488.9
> > 65663       ,9           ,5           ,1680.8      ,1499.5
> > 131079      ,0           ,64          ,2988.5      ,3010.1
> > 131087      ,0           ,3           ,2995.5      ,2996.4
> > 131103      ,3           ,0           ,3006.2      ,3000.5
> > 131135      ,3           ,7           ,3032.4      ,3073.7
> > 131199      ,9           ,5           ,3010.4      ,3027.4
> > 262151      ,0           ,64          ,6143.2      ,6079.1
> > 262159      ,0           ,3           ,6085.1      ,6075.8
> > 262175      ,3           ,0           ,6088.0      ,6064.9
> > 262207      ,3           ,7           ,6018.7      ,6023.5
> > 262271      ,9           ,5           ,6019.8      ,5959.2
> > 524295      ,0           ,64          ,14464.2     ,14095.1
> > 524303      ,0           ,3           ,14761.6     ,14050.2
> > 524319      ,3           ,0           ,14534.1     ,14087.5
> > 524351      ,3           ,7           ,14147.7     ,13903.8
> > 524415      ,9           ,5           ,14157.0     ,13982.9
> >
> >
> >
> > cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean
> > skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> > skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> > skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> > skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> > skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> > skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> > skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> > skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> > skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> > skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> > skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> > skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> > skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> > skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> > skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> > skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> > skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> > skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> > skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> > skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> > skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> > skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> > skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> > skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> > skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> > skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> > skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> > skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> > skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> > skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> > skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> > skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> > skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> > skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> > skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> > skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> > skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> > skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> > skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> > skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> > skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> > skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> > skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> > skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> > skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> > skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> > skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> > skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> > skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> > skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> > skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> > skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> > skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> > skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> > skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> > skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> > skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> > skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> > skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> > skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> > skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> > skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> > skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> > skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> > skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> > skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> > skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> > skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> > skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> > skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> > skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> > skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> > skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> > skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> > skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> > skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> > skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> > skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> > skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> > skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> > skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> > skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> > skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> > skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> > skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> > skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> > skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> > skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> > skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> > skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> > skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> > skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> > skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> > skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> > skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> > skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> > skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> > skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> > skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> > skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> > skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> > skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> > skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> > skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> > skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> > skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> > skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> > skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> > skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> > skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> > skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> > skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> > skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> > skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> > skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> > skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> > skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> > skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> > skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> > skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> > skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> > skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> > skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> > skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> > skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> > skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> > skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> > skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> > skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> > skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> > icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> > icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> > icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> > icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> > icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> > icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> > icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> > icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> > icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> > icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> > icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> > icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> > icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> > icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> > icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> > icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> > icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> > icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> > icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> > icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> > icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> > icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> > icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> > icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> > icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> > icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> > icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> > icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> > icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> > icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> > icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> > icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> > icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> > icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> > icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> > icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> > icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> > icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> > icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> > icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> > icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> > icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> > icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> > icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> > icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> > icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> > icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> > icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> > icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> > icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> > icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> > icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> > icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> > icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> > icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> > icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> > icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> > icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> > icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> > icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> > icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> > icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> > icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> > icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> > icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> > icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> > icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> > icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> > icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> > icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> > icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> > icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> > icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> > icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> > icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> > icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> > icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> > icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> > icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> > icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> > icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> > icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> > icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> > icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> > icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> > icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> > icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> > icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> > icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> > icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> > icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> > icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> > icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> > icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> > icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> > icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> > icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> > icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> > icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> > icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> > icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> > icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> > icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> > icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> > icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> > icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> > icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> > icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> > icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> > icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> > icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> > icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> > icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> > icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> > icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> > icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> > icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> > icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> > icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> > icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> > icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> > icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> > icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> > icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> > icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> > icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> > icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> > icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> > icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> > icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> > icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> > icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> > icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> > icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> > icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> > icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> > icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> > icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> > icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> > icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> > icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> > icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> > icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> > icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> > icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> > icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> > icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> > icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> > icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> > icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> > icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> > icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> > icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> > icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> > icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> > icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> > icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> > icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> > icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> > icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> > icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> > icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> > icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> > icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> > icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> > icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> > icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> > icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> > icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> > icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> > icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> > icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> > icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> > icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> > icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> > icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> > icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> > icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> > icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> > icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> > icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> > icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> > icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> > icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> > icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> > icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> > icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> > icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> > icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> > icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> > icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> > icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> > icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> > icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> > icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
> >
> >
> >  .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
> >  1 file changed, 265 insertions(+), 73 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > index 897a3d9762..5e4a071f16 100644
> > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > @@ -35,7 +35,16 @@
> >        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
> >     7. If size >= __x86_shared_non_temporal_threshold and there is no
> >        overlap between destination and source, use non-temporal store
> > -      instead of aligned store.  */
> > +      instead of aligned store copying from either 2 or 4 pages at
> > +      once.
> > +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> > +      and source and destination do not page alias, copy from 2 pages
> > +      at once using non-temporal stores. Page aliasing in this case is
> > +      considered true if destination's page alignment - source's page
> > +      alignment is less than 8 * VEC_SIZE.
> > +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> > +      and destination do page alias, copy from 4 pages at once using
> > +      non-temporal stores.  */
> >
> >  #include <sysdep.h>
> >
> > @@ -67,6 +76,34 @@
> >  # endif
> >  #endif
> >
> > +#ifndef PAGE_SIZE
> > +# define PAGE_SIZE 4096
> > +#endif
> > +
> > +#if PAGE_SIZE != 4096
> > +# error Unsupported PAGE_SIZE
> > +#endif
> > +
> > +#ifndef LOG_PAGE_SIZE
> > +# define LOG_PAGE_SIZE 12
> > +#endif
> > +
> > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> > +# error Invalid LOG_PAGE_SIZE
> > +#endif
> > +
> > +/* Bytes per page for large_memcpy inner loop.  */
> > +#if VEC_SIZE == 64
> > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> > +#else
> > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> > +#endif
> > +
> > +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> > +#ifndef LOG_4X_MEMCPY_THRESH
> > +# define LOG_4X_MEMCPY_THRESH 4
> > +#endif
> > +
> >  /* Avoid short distance rep movsb only with non-SSE vector.  */
> >  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
> >  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> > @@ -106,6 +143,28 @@
> >  # error Unsupported PREFETCH_SIZE!
> >  #endif
> >
> > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> > +     VMOVU   (offset)base, vec0; \
> > +     VMOVU   ((offset) + VEC_SIZE)base, vec1;
> > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> > +     VMOVNT  vec0, (offset)base; \
> > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > +     VMOVU   (offset)base, vec0; \
> > +     VMOVU   ((offset) + VEC_SIZE)base, vec1; \
> > +     VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
> > +     VMOVU   ((offset) + VEC_SIZE * 3)base, vec3;
> > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > +     VMOVNT  vec0, (offset)base; \
> > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
> > +     VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
> > +     VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
> > +#else
> > +# error Invalid LARGE_LOAD_SIZE
> > +#endif
> > +
> >  #ifndef SECTION
> >  # error SECTION is not defined!
> >  #endif
> > @@ -393,6 +452,15 @@ L(last_4x_vec):
> >       VZEROUPPER_RETURN
> >
> >  L(more_8x_vec):
> > +     /* Check if non-temporal move candidate.  */
> > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > +     /* Check non-temporal store threshold.  */
> > +     cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > +     ja      L(large_memcpy_2x)
> > +#endif
> > +     /* Entry if rdx is greater than non-temporal threshold but there
> > +       is overlap.  */
> > +L(more_8x_vec_check):
> >       cmpq    %rsi, %rdi
> >       ja      L(more_8x_vec_backward)
> >       /* Source == destination is less common.  */
> > @@ -419,24 +487,21 @@ L(more_8x_vec):
> >       subq    %r8, %rdi
> >       /* Adjust length.  */
> >       addq    %r8, %rdx
> > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -     /* Check non-temporal store threshold.  */
> > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > -     ja      L(large_forward)
> > -#endif
> > +
> > +     .p2align 4
> >  L(loop_4x_vec_forward):
> >       /* Copy 4 * VEC a time forward.  */
> >       VMOVU   (%rsi), %VEC(0)
> >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > -     addq    $(VEC_SIZE * 4), %rsi
> > -     subq    $(VEC_SIZE * 4), %rdx
> > +     subq    $-(VEC_SIZE * 4), %rsi
> > +     addq    $-(VEC_SIZE * 4), %rdx
> >       VMOVA   %VEC(0), (%rdi)
> >       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> >       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> >       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > -     addq    $(VEC_SIZE * 4), %rdi
> > +     subq    $-(VEC_SIZE * 4), %rdi
> >       cmpq    $(VEC_SIZE * 4), %rdx
> >       ja      L(loop_4x_vec_forward)
> >       /* Store the last 4 * VEC.  */
> > @@ -470,24 +535,21 @@ L(more_8x_vec_backward):
> >       subq    %r8, %r9
> >       /* Adjust length.  */
> >       subq    %r8, %rdx
> > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -     /* Check non-temporal store threshold.  */
> > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > -     ja      L(large_backward)
> > -#endif
> > +
> > +     .p2align 4
> >  L(loop_4x_vec_backward):
> >       /* Copy 4 * VEC a time backward.  */
> >       VMOVU   (%rcx), %VEC(0)
> >       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> >       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> >       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > -     subq    $(VEC_SIZE * 4), %rcx
> > -     subq    $(VEC_SIZE * 4), %rdx
> > +     addq    $-(VEC_SIZE * 4), %rcx
> > +     addq    $-(VEC_SIZE * 4), %rdx
> >       VMOVA   %VEC(0), (%r9)
> >       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> >       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> >       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> > -     subq    $(VEC_SIZE * 4), %r9
> > +     addq    $-(VEC_SIZE * 4), %r9
> >       cmpq    $(VEC_SIZE * 4), %rdx
> >       ja      L(loop_4x_vec_backward)
> >       /* Store the first 4 * VEC.  */
> > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward):
> >       VZEROUPPER_RETURN
> >
> >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > -L(large_forward):
> > +     .p2align 4
> > +L(large_memcpy_2x):
> > +     /* Compute absolute value of difference between source and
> > +        destination.  */
> > +     movq    %rdi, %r9
> > +     subq    %rsi, %r9
> > +     movq    %r9, %r8
> > +     leaq    -1(%r9), %rcx
> > +     sarq    $63, %r8
> > +     xorq    %r8, %r9
> > +     subq    %r8, %r9
> >       /* Don't use non-temporal store if there is overlap between
> > -        destination and source since destination may be in cache
> > -        when source is loaded.  */
> > -     leaq    (%rdi, %rdx), %r10
> > -     cmpq    %r10, %rsi
> > -     jb      L(loop_4x_vec_forward)
> > -L(loop_large_forward):
> > +        destination and source since destination may be in cache when
> > +        source is loaded.  */
> > +     cmpq    %r9, %rdx
> > +     ja      L(more_8x_vec_check)
> > +
> > +     /* Cache align destination. First store the first 64 bytes then
> > +        adjust alignments.  */
> > +     VMOVU   (%rsi), %VEC(8)
> > +#if VEC_SIZE < 64
> > +     VMOVU   VEC_SIZE(%rsi), %VEC(9)
> > +#if VEC_SIZE < 32
> > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> > +#endif
> > +#endif
> > +     VMOVU   %VEC(8), (%rdi)
> > +#if VEC_SIZE < 64
> > +     VMOVU   %VEC(9), VEC_SIZE(%rdi)
> > +#if VEC_SIZE < 32
> > +     VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> > +     VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> > +#endif
> > +#endif
> > +     /* Adjust source, destination, and size.  */
> > +     movq    %rdi, %r8
> > +     andq    $63, %r8
> > +     /* Get the negative of offset for alignment.  */
> > +     subq    $64, %r8
> > +     /* Adjust source.  */
> > +     subq    %r8, %rsi
> > +     /* Adjust destination which should be aligned now.  */
> > +     subq    %r8, %rdi
> > +     /* Adjust length.  */
> > +     addq    %r8, %rdx
> > +
> > +     /* Test if source and destination addresses will alias. If they do,
> > +        the larger pipeline in large_memcpy_4x alleviates the
> > +        performance drop.  */
> > +     testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> > +     jz      L(large_memcpy_4x)
> > +
> > +     movq    %rdx, %r10
> > +     shrq    $LOG_4X_MEMCPY_THRESH, %r10
> > +     cmp     __x86_shared_non_temporal_threshold(%rip), %r10
> > +     jae     L(large_memcpy_4x)
> > +
> > +     /* edx will store remainder size for copying tail.  */
> > +     andl    $(PAGE_SIZE * 2 - 1), %edx
> > +     /* r10 stores outer loop counter.  */
> > +     shrq    $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> > +     /* Copy 4x VEC at a time from 2 pages.  */
> > +     .p2align 4
> > +L(loop_large_memcpy_2x_outer):
> > +     /* ecx stores inner loop counter.  */
> > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > +L(loop_large_memcpy_2x_inner):
> > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> > +     /* Load vectors from rsi.  */
> > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +     subq    $-LARGE_LOAD_SIZE, %rsi
> > +     /* Non-temporal store vectors to rdi.  */
> > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +     subq    $-LARGE_LOAD_SIZE, %rdi
> > +     decl    %ecx
> > +     jnz     L(loop_large_memcpy_2x_inner)
> > +     addq    $PAGE_SIZE, %rdi
> > +     addq    $PAGE_SIZE, %rsi
> > +     decq    %r10
> > +     jne     L(loop_large_memcpy_2x_outer)
> > +     sfence
> > +
> > +     /* Check if only last 4 loads are needed.  */
> > +     cmpl    $(VEC_SIZE * 4), %edx
> > +     jbe     L(large_memcpy_2x_end)
> > +
> > +     /* Handle the last 2 * PAGE_SIZE bytes.  */
> > +L(loop_large_memcpy_2x_tail):
> >       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> >       VMOVU   (%rsi), %VEC(0)
> >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > -     addq    $PREFETCHED_LOAD_SIZE, %rsi
> > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > -     VMOVNT  %VEC(0), (%rdi)
> > -     VMOVNT  %VEC(1), VEC_SIZE(%rdi)
> > -     VMOVNT  %VEC(2), (VEC_SIZE * 2)(%rdi)
> > -     VMOVNT  %VEC(3), (VEC_SIZE * 3)(%rdi)
> > -     addq    $PREFETCHED_LOAD_SIZE, %rdi
> > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > -     ja      L(loop_large_forward)
> > -     sfence
> > +     subq    $-(VEC_SIZE * 4), %rsi
> > +     addl    $-(VEC_SIZE * 4), %edx
> > +     VMOVA   %VEC(0), (%rdi)
> > +     VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > +     VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > +     VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     cmpl    $(VEC_SIZE * 4), %edx
> > +     ja      L(loop_large_memcpy_2x_tail)
> > +
> > +L(large_memcpy_2x_end):
> >       /* Store the last 4 * VEC.  */
> > -     VMOVU   %VEC(5), (%rcx)
> > -     VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> > -     VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > -     VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > -     /* Store the first VEC.  */
> > -     VMOVU   %VEC(4), (%r11)
> > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > +
> > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> >       VZEROUPPER_RETURN
> >
> > -L(large_backward):
> > -     /* Don't use non-temporal store if there is overlap between
> > -        destination and source since destination may be in cache
> > -        when source is loaded.  */
> > -     leaq    (%rcx, %rdx), %r10
> > -     cmpq    %r10, %r9
> > -     jb      L(loop_4x_vec_backward)
> > -L(loop_large_backward):
> > -     /* Copy 4 * VEC a time backward with non-temporal stores.  */
> > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> > -     VMOVU   (%rcx), %VEC(0)
> > -     VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > -     VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > -     VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > -     subq    $PREFETCHED_LOAD_SIZE, %rcx
> > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > -     VMOVNT  %VEC(0), (%r9)
> > -     VMOVNT  %VEC(1), -VEC_SIZE(%r9)
> > -     VMOVNT  %VEC(2), -(VEC_SIZE * 2)(%r9)
> > -     VMOVNT  %VEC(3), -(VEC_SIZE * 3)(%r9)
> > -     subq    $PREFETCHED_LOAD_SIZE, %r9
> > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > -     ja      L(loop_large_backward)
> > +     .p2align 4
> > +L(large_memcpy_4x):
> > +     movq    %rdx, %r10
> > +     /* edx will store remainder size for copying tail.  */
> > +     andl    $(PAGE_SIZE * 4 - 1), %edx
> > +     /* r10 stores outer loop counter.  */
> > +     shrq    $(LOG_PAGE_SIZE + 2), %r10
> > +     /* Copy 4x VEC at a time from 4 pages.  */
> > +     .p2align 4
> > +L(loop_large_memcpy_4x_outer):
> > +     /* ecx stores inner loop counter.  */
> > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > +L(loop_large_memcpy_4x_inner):
> > +     /* Only one prefetch set per page as doing 4 pages gives more time
> > +        for the prefetcher to keep up.  */
> > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> > +     /* Load vectors from rsi.  */
> > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > +     subq    $-LARGE_LOAD_SIZE, %rsi
> > +     /* Non-temporal store vectors to rdi.  */
> > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > +     subq    $-LARGE_LOAD_SIZE, %rdi
> > +     decl    %ecx
> > +     jnz     L(loop_large_memcpy_4x_inner)
> > +     addq    $(PAGE_SIZE * 3), %rdi
> > +     addq    $(PAGE_SIZE * 3), %rsi
> > +     decq    %r10
> > +     jne     L(loop_large_memcpy_4x_outer)
> >       sfence
> > -     /* Store the first 4 * VEC.  */
> > -     VMOVU   %VEC(4), (%rdi)
> > -     VMOVU   %VEC(5), VEC_SIZE(%rdi)
> > -     VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> > -     VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> > -     /* Store the last VEC.  */
> > -     VMOVU   %VEC(8), (%r11)
> > +     /* Check if only last 4 loads are needed.  */
> > +     cmpl    $(VEC_SIZE * 4), %edx
> > +     jbe     L(large_memcpy_4x_end)
> > +
> > +     /* Handle the last 4 * PAGE_SIZE bytes.  */
> > +L(loop_large_memcpy_4x_tail):
> > +     /* Copy 4 * VEC at a time forward with temporal (aligned) stores.  */
> > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > +     PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > +     VMOVU   (%rsi), %VEC(0)
> > +     VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > +     subq    $-(VEC_SIZE * 4), %rsi
> > +     addl    $-(VEC_SIZE * 4), %edx
> > +     VMOVA   %VEC(0), (%rdi)
> > +     VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > +     VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > +     VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     cmpl    $(VEC_SIZE * 4), %edx
> > +     ja      L(loop_large_memcpy_4x_tail)
> > +
> > +L(large_memcpy_4x_end):
> > +     /* Store the last 4 * VEC.  */
> > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > +
> > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> >       VZEROUPPER_RETURN
> >  #endif
> >  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > --
> > 2.29.2
> >
>
> LGTM.  Please commit it.
>
> Thanks.
>
>
> H.J.
  
H.J. Lu April 16, 2021, 5:04 p.m. UTC | #7
On Fri, Apr 16, 2021 at 9:35 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> > LGTM.  Please commit it.
>
> Are you saying that to me or someone else? If it's to me, what do you
> mean? Is the patch not enough?

I will commit it for you.

> > Thanks.
>
> On Fri, Apr 16, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote:
> > > From: noah <goldstein.w.n@gmail.com>
> > >
> > > No Bug. This commit updates the large memcpy case (no overlap). The
> > > update is to perform memcpy on either 2 or 4 contiguous pages at
> > > once. This 1) helps to alleviate the effects of false memory aliasing
> > > when destination and source have a close 4k alignment and 2) in most
> > > cases, and for most DRAM units, is a modestly more efficient access
> > > pattern. These changes are a clear performance improvement for
> > > VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > > pass.
> > >
> > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > ---
> > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the
> > > loops and no longer see any meaningful regression.
> > >
> > > Also added back the temporal stores for the tail. Saw a regression
> > > when doing these tests.
> > >
> > > Two tables below for skylake and icelake numbers for the areas around
> > > where you saw the regression. Below is all data from the tests.
> > >
> > > N = 10.
> > >
> > > Skylake
> > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > 4103        ,0           ,64          ,84.5        ,88.6
> > > 4111        ,0           ,3           ,99.0        ,99.9
> > > 4127        ,3           ,0           ,102.1       ,102.3
> > > 4159        ,3           ,7           ,88.7        ,90.9
> > > 4223        ,9           ,5           ,88.1        ,87.4
> > > 8199        ,0           ,64          ,146.7       ,150.2
> > > 8207        ,0           ,3           ,167.9       ,168.5
> > > 8223        ,3           ,0           ,168.5       ,168.1
> > > 8255        ,3           ,7           ,157.0       ,159.2
> > > 8319        ,9           ,5           ,155.5       ,155.7
> > > 16391       ,0           ,64          ,286.2       ,288.8
> > > 16399       ,0           ,3           ,307.0       ,308.7
> > > 16415       ,3           ,0           ,307.4       ,307.6
> > > 16447       ,3           ,7           ,294.6       ,295.5
> > > 16511       ,9           ,5           ,291.5       ,462.1
> > > 32775       ,0           ,64          ,603.4       ,601.5
> > > 32783       ,0           ,3           ,604.8       ,606.4
> > > 32799       ,3           ,0           ,603.0       ,604.1
> > > 32831       ,3           ,7           ,600.2       ,737.3
> > > 32895       ,9           ,5           ,604.4       ,599.5
> > > 65543       ,0           ,64          ,1873.5      ,1854.3
> > > 65551       ,0           ,3           ,1862.9      ,1846.6
> > > 65567       ,3           ,0           ,1885.5      ,1966.0
> > > 65599       ,3           ,7           ,1833.2      ,1833.1
> > > 65663       ,9           ,5           ,1884.9      ,1887.4
> > > 131079      ,0           ,64          ,3944.3      ,3949.4
> > > 131087      ,0           ,3           ,3927.3      ,3913.3
> > > 131103      ,3           ,0           ,4415.8      ,4169.4
> > > 131135      ,3           ,7           ,4224.5      ,4157.6
> > > 131199      ,9           ,5           ,5974.0      ,4983.8
> > > 262151      ,0           ,64          ,11050.2     ,10620.6
> > > 262159      ,0           ,3           ,9932.8      ,10037.3
> > > 262175      ,3           ,0           ,10188.8     ,9206.6
> > > 262207      ,3           ,7           ,9633.3      ,9216.7
> > > 262271      ,9           ,5           ,9732.7      ,9345.3
> > > 524295      ,0           ,64          ,24823.9     ,24880.7
> > > 524303      ,0           ,3           ,24514.0     ,24556.7
> > > 524319      ,3           ,0           ,23974.4     ,24219.9
> > > 524351      ,3           ,7           ,24159.7     ,24207.0
> > > 524415      ,9           ,5           ,23946.5     ,24142.8
> > >
> > > Icelake:
> > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > 4103        ,0           ,64          ,50.2        ,63.7
> > > 4111        ,0           ,3           ,63.7        ,65.1
> > > 4127        ,3           ,0           ,68.2        ,69.4
> > > 4159        ,3           ,7           ,59.6        ,68.0
> > > 4223        ,9           ,5           ,68.2        ,66.8
> > > 8199        ,0           ,64          ,92.1        ,89.9
> > > 8207        ,0           ,3           ,119.7       ,118.3
> > > 8223        ,3           ,0           ,119.1       ,120.9
> > > 8255        ,3           ,7           ,122.9       ,123.7
> > > 8319        ,9           ,5           ,122.1       ,121.8
> > > 16391       ,0           ,64          ,162.7       ,158.0
> > > 16399       ,0           ,3           ,227.6       ,234.1
> > > 16415       ,3           ,0           ,230.8       ,232.7
> > > 16447       ,3           ,7           ,226.8       ,232.6
> > > 16511       ,9           ,5           ,233.4       ,233.8
> > > 32775       ,0           ,64          ,312.2       ,301.8
> > > 32783       ,0           ,3           ,449.7       ,450.0
> > > 32799       ,3           ,0           ,452.7       ,455.9
> > > 32831       ,3           ,7           ,449.8       ,458.0
> > > 32895       ,9           ,5           ,456.3       ,459.4
> > > 65543       ,0           ,64          ,1460.6      ,1463.9
> > > 65551       ,0           ,3           ,1462.0      ,1465.4
> > > 65567       ,3           ,0           ,1466.6      ,1480.4
> > > 65599       ,3           ,7           ,1488.0      ,1488.9
> > > 65663       ,9           ,5           ,1680.8      ,1499.5
> > > 131079      ,0           ,64          ,2988.5      ,3010.1
> > > 131087      ,0           ,3           ,2995.5      ,2996.4
> > > 131103      ,3           ,0           ,3006.2      ,3000.5
> > > 131135      ,3           ,7           ,3032.4      ,3073.7
> > > 131199      ,9           ,5           ,3010.4      ,3027.4
> > > 262151      ,0           ,64          ,6143.2      ,6079.1
> > > 262159      ,0           ,3           ,6085.1      ,6075.8
> > > 262175      ,3           ,0           ,6088.0      ,6064.9
> > > 262207      ,3           ,7           ,6018.7      ,6023.5
> > > 262271      ,9           ,5           ,6019.8      ,5959.2
> > > 524295      ,0           ,64          ,14464.2     ,14095.1
> > > 524303      ,0           ,3           ,14761.6     ,14050.2
> > > 524319      ,3           ,0           ,14534.1     ,14087.5
> > > 524351      ,3           ,7           ,14147.7     ,13903.8
> > > 524415      ,9           ,5           ,14157.0     ,13982.9
> > >
> > >
> > >
> > > cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean
> > > skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> > > skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> > > skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> > > skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> > > skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> > > skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> > > skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> > > skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> > > skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> > > skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> > > skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> > > skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> > > skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> > > skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> > > skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> > > skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> > > skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> > > skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> > > skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> > > skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> > > skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> > > skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> > > skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> > > skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> > > skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> > > skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> > > skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> > > skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> > > skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> > > skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> > > skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> > > skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> > > skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> > > skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> > > skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> > > skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> > > skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> > > skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> > > skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> > > skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> > > skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> > > skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> > > skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> > > skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> > > skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> > > skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> > > skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> > > skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> > > skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> > > skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> > > skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> > > skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> > > skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> > > skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> > > skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> > > skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> > > skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> > > skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> > > skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> > > skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> > > skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> > > skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> > > skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> > > skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> > > skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> > > skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> > > skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> > > skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> > > skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> > > skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> > > skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> > > skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> > > skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> > > skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> > > skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> > > skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> > > skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> > > skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> > > skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> > > skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> > > skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> > > skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> > > skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> > > skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> > > skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> > > skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> > > skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> > > skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> > > skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> > > skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> > > skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> > > skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> > > skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> > > skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> > > skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> > > skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> > > skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> > > skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> > > skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> > > skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> > > skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> > > skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> > > skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> > > skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> > > skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> > > skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> > > skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> > > skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> > > skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> > > skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> > > skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> > > skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> > > skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> > > skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> > > skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> > > skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> > > skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> > > skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> > > skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> > > skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> > > skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> > > skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> > > skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> > > skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> > > skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> > > skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> > > skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> > > skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> > > skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> > > skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> > > icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> > > icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> > > icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> > > icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> > > icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> > > icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> > > icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> > > icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> > > icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> > > icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> > > icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> > > icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> > > icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> > > icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> > > icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> > > icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> > > icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> > > icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> > > icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> > > icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> > > icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> > > icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> > > icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> > > icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> > > icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> > > icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> > > icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> > > icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> > > icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> > > icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> > > icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> > > icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> > > icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> > > icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> > > icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> > > icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> > > icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> > > icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> > > icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> > > icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> > > icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> > > icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> > > icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> > > icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> > > icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> > > icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> > > icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> > > icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> > > icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> > > icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> > > icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> > > icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> > > icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> > > icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> > > icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> > > icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> > > icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> > > icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> > > icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> > > icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> > > icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> > > icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> > > icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> > > icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> > > icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> > > icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> > > icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> > > icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> > > icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> > > icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> > > icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> > > icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> > > icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> > > icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> > > icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> > > icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> > > icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> > > icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> > > icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> > > icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> > > icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> > > icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> > > icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> > > icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> > > icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> > > icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> > > icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> > > icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> > > icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> > > icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> > > icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> > > icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> > > icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> > > icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> > > icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> > > icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> > > icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> > > icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> > > icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> > > icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> > > icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> > > icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> > > icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> > > icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> > > icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> > > icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> > > icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> > > icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> > > icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> > > icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> > > icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> > > icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> > > icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> > > icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> > > icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> > > icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> > > icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> > > icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> > > icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> > > icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> > > icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> > > icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> > > icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> > > icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> > > icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> > > icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> > > icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> > > icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> > > icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> > > icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> > > icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> > > icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> > > icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> > > icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> > > icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> > > icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> > > icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> > > icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> > > icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> > > icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> > > icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> > > icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> > > icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> > > icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> > > icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> > > icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> > > icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> > > icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> > > icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> > > icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> > > icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> > > icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> > > icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> > > icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> > > icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> > > icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> > > icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> > > icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> > > icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> > > icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> > > icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> > > icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> > > icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> > > icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> > > icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> > > icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> > > icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> > > icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> > > icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> > > icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> > > icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> > > icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> > > icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> > > icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> > > icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> > > icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> > > icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> > > icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> > > icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> > > icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> > > icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> > > icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> > > icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> > > icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> > > icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> > > icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> > > icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> > > icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> > > icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> > > icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> > > icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> > > icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> > > icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> > > icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> > > icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
> > >
> > >
> > >  .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
> > >  1 file changed, 265 insertions(+), 73 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > index 897a3d9762..5e4a071f16 100644
> > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > @@ -35,7 +35,16 @@
> > >        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
> > >     7. If size >= __x86_shared_non_temporal_threshold and there is no
> > >        overlap between destination and source, use non-temporal store
> > > -      instead of aligned store.  */
> > > +      instead of aligned store copying from either 2 or 4 pages at
> > > +      once.
> > > +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> > > +      and source and destination do not page alias, copy from 2 pages
> > > +      at once using non-temporal stores. Page aliasing in this case is
> > > +      considered true if destination's page alignment - source's page
> > > +      alignment is less than 8 * VEC_SIZE.
> > > +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> > > +      and destination do page alias, copy from 4 pages at once using
> > > +      non-temporal stores.  */
> > >
> > >  #include <sysdep.h>
> > >
> > > @@ -67,6 +76,34 @@
> > >  # endif
> > >  #endif
> > >
> > > +#ifndef PAGE_SIZE
> > > +# define PAGE_SIZE 4096
> > > +#endif
> > > +
> > > +#if PAGE_SIZE != 4096
> > > +# error Unsupported PAGE_SIZE
> > > +#endif
> > > +
> > > +#ifndef LOG_PAGE_SIZE
> > > +# define LOG_PAGE_SIZE 12
> > > +#endif
> > > +
> > > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> > > +# error Invalid LOG_PAGE_SIZE
> > > +#endif
> > > +
> > > +/* Bytes per page for large_memcpy inner loop.  */
> > > +#if VEC_SIZE == 64
> > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> > > +#else
> > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> > > +#endif
> > > +
> > > +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> > > +#ifndef LOG_4X_MEMCPY_THRESH
> > > +# define LOG_4X_MEMCPY_THRESH 4
> > > +#endif
> > > +
> > >  /* Avoid short distance rep movsb only with non-SSE vector.  */
> > >  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
> > >  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> > > @@ -106,6 +143,28 @@
> > >  # error Unsupported PREFETCH_SIZE!
> > >  #endif
> > >
> > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> > > +     VMOVU   (offset)base, vec0; \
> > > +     VMOVU   ((offset) + VEC_SIZE)base, vec1;
> > > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> > > +     VMOVNT  vec0, (offset)base; \
> > > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> > > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > > +     VMOVU   (offset)base, vec0; \
> > > +     VMOVU   ((offset) + VEC_SIZE)base, vec1; \
> > > +     VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
> > > +     VMOVU   ((offset) + VEC_SIZE * 3)base, vec3;
> > > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > > +     VMOVNT  vec0, (offset)base; \
> > > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
> > > +     VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
> > > +     VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
> > > +#else
> > > +# error Invalid LARGE_LOAD_SIZE
> > > +#endif
> > > +
> > >  #ifndef SECTION
> > >  # error SECTION is not defined!
> > >  #endif
> > > @@ -393,6 +452,15 @@ L(last_4x_vec):
> > >       VZEROUPPER_RETURN
> > >
> > >  L(more_8x_vec):
> > > +     /* Check if non-temporal move candidate.  */
> > > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > +     /* Check non-temporal store threshold.  */
> > > +     cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > +     ja      L(large_memcpy_2x)
> > > +#endif
> > > +     /* Entry if rdx is greater than non-temporal threshold but there
> > > +       is overlap.  */
> > > +L(more_8x_vec_check):
> > >       cmpq    %rsi, %rdi
> > >       ja      L(more_8x_vec_backward)
> > >       /* Source == destination is less common.  */
> > > @@ -419,24 +487,21 @@ L(more_8x_vec):
> > >       subq    %r8, %rdi
> > >       /* Adjust length.  */
> > >       addq    %r8, %rdx
> > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > -     /* Check non-temporal store threshold.  */
> > > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > -     ja      L(large_forward)
> > > -#endif
> > > +
> > > +     .p2align 4
> > >  L(loop_4x_vec_forward):
> > >       /* Copy 4 * VEC a time forward.  */
> > >       VMOVU   (%rsi), %VEC(0)
> > >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > -     addq    $(VEC_SIZE * 4), %rsi
> > > -     subq    $(VEC_SIZE * 4), %rdx
> > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > +     addq    $-(VEC_SIZE * 4), %rdx
> > >       VMOVA   %VEC(0), (%rdi)
> > >       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > >       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > >       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > -     addq    $(VEC_SIZE * 4), %rdi
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > >       cmpq    $(VEC_SIZE * 4), %rdx
> > >       ja      L(loop_4x_vec_forward)
> > >       /* Store the last 4 * VEC.  */
> > > @@ -470,24 +535,21 @@ L(more_8x_vec_backward):
> > >       subq    %r8, %r9
> > >       /* Adjust length.  */
> > >       subq    %r8, %rdx
> > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > -     /* Check non-temporal store threshold.  */
> > > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > -     ja      L(large_backward)
> > > -#endif
> > > +
> > > +     .p2align 4
> > >  L(loop_4x_vec_backward):
> > >       /* Copy 4 * VEC a time backward.  */
> > >       VMOVU   (%rcx), %VEC(0)
> > >       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > >       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > >       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > > -     subq    $(VEC_SIZE * 4), %rcx
> > > -     subq    $(VEC_SIZE * 4), %rdx
> > > +     addq    $-(VEC_SIZE * 4), %rcx
> > > +     addq    $-(VEC_SIZE * 4), %rdx
> > >       VMOVA   %VEC(0), (%r9)
> > >       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> > >       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> > >       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> > > -     subq    $(VEC_SIZE * 4), %r9
> > > +     addq    $-(VEC_SIZE * 4), %r9
> > >       cmpq    $(VEC_SIZE * 4), %rdx
> > >       ja      L(loop_4x_vec_backward)
> > >       /* Store the first 4 * VEC.  */
> > > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward):
> > >       VZEROUPPER_RETURN
> > >
> > >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > -L(large_forward):
> > > +     .p2align 4
> > > +L(large_memcpy_2x):
> > > +     /* Compute absolute value of difference between source and
> > > +        destination.  */
> > > +     movq    %rdi, %r9
> > > +     subq    %rsi, %r9
> > > +     movq    %r9, %r8
> > > +     leaq    -1(%r9), %rcx
> > > +     sarq    $63, %r8
> > > +     xorq    %r8, %r9
> > > +     subq    %r8, %r9
> > >       /* Don't use non-temporal store if there is overlap between
> > > -        destination and source since destination may be in cache
> > > -        when source is loaded.  */
> > > -     leaq    (%rdi, %rdx), %r10
> > > -     cmpq    %r10, %rsi
> > > -     jb      L(loop_4x_vec_forward)
> > > -L(loop_large_forward):
> > > +        destination and source since destination may be in cache when
> > > +        source is loaded.  */
> > > +     cmpq    %r9, %rdx
> > > +     ja      L(more_8x_vec_check)
> > > +
> > > +     /* Cache align destination. First store the first 64 bytes then
> > > +        adjust alignments.  */
> > > +     VMOVU   (%rsi), %VEC(8)
> > > +#if VEC_SIZE < 64
> > > +     VMOVU   VEC_SIZE(%rsi), %VEC(9)
> > > +#if VEC_SIZE < 32
> > > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> > > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> > > +#endif
> > > +#endif
> > > +     VMOVU   %VEC(8), (%rdi)
> > > +#if VEC_SIZE < 64
> > > +     VMOVU   %VEC(9), VEC_SIZE(%rdi)
> > > +#if VEC_SIZE < 32
> > > +     VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> > > +     VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> > > +#endif
> > > +#endif
> > > +     /* Adjust source, destination, and size.  */
> > > +     movq    %rdi, %r8
> > > +     andq    $63, %r8
> > > +     /* Get the negative of offset for alignment.  */
> > > +     subq    $64, %r8
> > > +     /* Adjust source.  */
> > > +     subq    %r8, %rsi
> > > +     /* Adjust destination which should be aligned now.  */
> > > +     subq    %r8, %rdi
> > > +     /* Adjust length.  */
> > > +     addq    %r8, %rdx
> > > +
> > > +     /* Test if source and destination addresses will alias. If they do
> > > +        the larger pipeline in large_memcpy_4x alleviates the
> > > +        performance drop.  */
> > > +     testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> > > +     jz      L(large_memcpy_4x)
> > > +
> > > +     movq    %rdx, %r10
> > > +     shrq    $LOG_4X_MEMCPY_THRESH, %r10
> > > +     cmp     __x86_shared_non_temporal_threshold(%rip), %r10
> > > +     jae     L(large_memcpy_4x)
> > > +
> > > +     /* edx will store remainder size for copying tail.  */
> > > +     andl    $(PAGE_SIZE * 2 - 1), %edx
> > > +     /* r10 stores outer loop counter.  */
> > > +     shrq    $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> > > +     /* Copy 4x VEC at a time from 2 pages.  */
> > > +     .p2align 4
> > > +L(loop_large_memcpy_2x_outer):
> > > +     /* ecx stores inner loop counter.  */
> > > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > > +L(loop_large_memcpy_2x_inner):
> > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> > > +     /* Load vectors from rsi.  */
> > > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > +     subq    $-LARGE_LOAD_SIZE, %rsi
> > > +     /* Non-temporal store vectors to rdi.  */
> > > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > +     subq    $-LARGE_LOAD_SIZE, %rdi
> > > +     decl    %ecx
> > > +     jnz     L(loop_large_memcpy_2x_inner)
> > > +     addq    $PAGE_SIZE, %rdi
> > > +     addq    $PAGE_SIZE, %rsi
> > > +     decq    %r10
> > > +     jne     L(loop_large_memcpy_2x_outer)
> > > +     sfence
> > > +
> > > +     /* Check if only last 4 loads are needed.  */
> > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > +     jbe     L(large_memcpy_2x_end)
> > > +
> > > +     /* Handle the last 2 * PAGE_SIZE bytes.  */
> > > +L(loop_large_memcpy_2x_tail):
> > >       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> > > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > +     PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > >       VMOVU   (%rsi), %VEC(0)
> > >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > -     addq    $PREFETCHED_LOAD_SIZE, %rsi
> > > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > > -     VMOVNT  %VEC(0), (%rdi)
> > > -     VMOVNT  %VEC(1), VEC_SIZE(%rdi)
> > > -     VMOVNT  %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > -     VMOVNT  %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > -     addq    $PREFETCHED_LOAD_SIZE, %rdi
> > > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > > -     ja      L(loop_large_forward)
> > > -     sfence
> > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > +     addl    $-(VEC_SIZE * 4), %edx
> > > +     VMOVA   %VEC(0), (%rdi)
> > > +     VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > > +     VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > +     VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > +     ja      L(loop_large_memcpy_2x_tail)
> > > +
> > > +L(large_memcpy_2x_end):
> > >       /* Store the last 4 * VEC.  */
> > > -     VMOVU   %VEC(5), (%rcx)
> > > -     VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> > > -     VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > > -     VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > > -     /* Store the first VEC.  */
> > > -     VMOVU   %VEC(4), (%r11)
> > > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > > +
> > > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > >       VZEROUPPER_RETURN
> > >
> > > -L(large_backward):
> > > -     /* Don't use non-temporal store if there is overlap between
> > > -        destination and source since destination may be in cache
> > > -        when source is loaded.  */
> > > -     leaq    (%rcx, %rdx), %r10
> > > -     cmpq    %r10, %r9
> > > -     jb      L(loop_4x_vec_backward)
> > > -L(loop_large_backward):
> > > -     /* Copy 4 * VEC a time backward with non-temporal stores.  */
> > > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> > > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> > > -     VMOVU   (%rcx), %VEC(0)
> > > -     VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > > -     VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > > -     VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > > -     subq    $PREFETCHED_LOAD_SIZE, %rcx
> > > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > > -     VMOVNT  %VEC(0), (%r9)
> > > -     VMOVNT  %VEC(1), -VEC_SIZE(%r9)
> > > -     VMOVNT  %VEC(2), -(VEC_SIZE * 2)(%r9)
> > > -     VMOVNT  %VEC(3), -(VEC_SIZE * 3)(%r9)
> > > -     subq    $PREFETCHED_LOAD_SIZE, %r9
> > > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > > -     ja      L(loop_large_backward)
> > > +     .p2align 4
> > > +L(large_memcpy_4x):
> > > +     movq    %rdx, %r10
> > > +     /* edx will store remainder size for copying tail.  */
> > > +     andl    $(PAGE_SIZE * 4 - 1), %edx
> > > +     /* r10 stores outer loop counter.  */
> > > +     shrq    $(LOG_PAGE_SIZE + 2), %r10
> > > +     /* Copy 4x VEC at a time from 4 pages.  */
> > > +     .p2align 4
> > > +L(loop_large_memcpy_4x_outer):
> > > +     /* ecx stores inner loop counter.  */
> > > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > > +L(loop_large_memcpy_4x_inner):
> > > +     /* Only one prefetch set per page as doing 4 pages give more time
> > > +        for prefetcher to keep up.  */
> > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> > > +     /* Load vectors from rsi.  */
> > > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > > +     subq    $-LARGE_LOAD_SIZE, %rsi
> > > +     /* Non-temporal store vectors to rdi.  */
> > > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > > +     subq    $-LARGE_LOAD_SIZE, %rdi
> > > +     decl    %ecx
> > > +     jnz     L(loop_large_memcpy_4x_inner)
> > > +     addq    $(PAGE_SIZE * 3), %rdi
> > > +     addq    $(PAGE_SIZE * 3), %rsi
> > > +     decq    %r10
> > > +     jne     L(loop_large_memcpy_4x_outer)
> > >       sfence
> > > -     /* Store the first 4 * VEC.  */
> > > -     VMOVU   %VEC(4), (%rdi)
> > > -     VMOVU   %VEC(5), VEC_SIZE(%rdi)
> > > -     VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> > > -     VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> > > -     /* Store the last VEC.  */
> > > -     VMOVU   %VEC(8), (%r11)
> > > +     /* Check if only last 4 loads are needed.  */
> > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > +     jbe     L(large_memcpy_4x_end)
> > > +
> > > +     /* Handle the last 4 * PAGE_SIZE bytes.  */
> > > +L(loop_large_memcpy_4x_tail):
> > > +     /* Copy 4 * VEC a time forward with temporal (aligned) stores.  */
> > > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > +     PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > > +     VMOVU   (%rsi), %VEC(0)
> > > +     VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > +     addl    $-(VEC_SIZE * 4), %edx
> > > +     VMOVA   %VEC(0), (%rdi)
> > > +     VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > > +     VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > +     VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > +     ja      L(loop_large_memcpy_4x_tail)
> > > +
> > > +L(large_memcpy_4x_end):
> > > +     /* Store the last 4 * VEC.  */
> > > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > > +
> > > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > >       VZEROUPPER_RETURN
> > >  #endif
> > >  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > > --
> > > 2.29.2
> > >
> >
> > LGTM.  Please commit it.
> >
> > Thanks.
> >
> >
> > H.J.
  
Noah Goldstein April 16, 2021, 6:12 p.m. UTC | #8
On Fri, Apr 16, 2021 at 1:05 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Apr 16, 2021 at 9:35 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > > LGTM.  Please commit it.
> >
> > Are you saying that to me or someone else? If it's to me, what do you
> > mean? Is the patch not enough?
>
> I will commit it for you.

Thanks! Are you planning on accepting the bench / testing changes as well?

>
> > > Thanks.
> >
> > On Fri, Apr 16, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote:
> > > > From: noah <goldstein.w.n@gmail.com>
> > > >
> > > > No Bug. This commit updates the large memcpy case (no overlap). The
> > > > update is to perform memcpy on either 2 or 4 contiguous pages at
> > > > once. This 1) helps to alleviate the effects of false memory aliasing
> > > > when destination and source have a close 4k alignment and 2) In most
> > > > cases and for most DRAM units is a modestly more efficient access
> > > > pattern. These changes are a clear performance improvement for
> > > > VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > > > pass.
> > > >
> > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > ---
> > > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the
> > > > loops and no longer see any meaningful regression.
> > > >
> > > > Also added back the temporal stores for the tail. Saw a regression
> > > > when doing these tests.
> > > >
> > > > Two tables below for skylake and icelake numbers for the areas around
> > > > where you saw the regression. Below is all data from the tests.
> > > >
> > > > N = 10.
> > > >
> > > > Skylake
> > > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > > 4103        ,0           ,64          ,84.5        ,88.6
> > > > 4111        ,0           ,3           ,99.0        ,99.9
> > > > 4127        ,3           ,0           ,102.1       ,102.3
> > > > 4159        ,3           ,7           ,88.7        ,90.9
> > > > 4223        ,9           ,5           ,88.1        ,87.4
> > > > 8199        ,0           ,64          ,146.7       ,150.2
> > > > 8207        ,0           ,3           ,167.9       ,168.5
> > > > 8223        ,3           ,0           ,168.5       ,168.1
> > > > 8255        ,3           ,7           ,157.0       ,159.2
> > > > 8319        ,9           ,5           ,155.5       ,155.7
> > > > 16391       ,0           ,64          ,286.2       ,288.8
> > > > 16399       ,0           ,3           ,307.0       ,308.7
> > > > 16415       ,3           ,0           ,307.4       ,307.6
> > > > 16447       ,3           ,7           ,294.6       ,295.5
> > > > 16511       ,9           ,5           ,291.5       ,462.1
> > > > 32775       ,0           ,64          ,603.4       ,601.5
> > > > 32783       ,0           ,3           ,604.8       ,606.4
> > > > 32799       ,3           ,0           ,603.0       ,604.1
> > > > 32831       ,3           ,7           ,600.2       ,737.3
> > > > 32895       ,9           ,5           ,604.4       ,599.5
> > > > 65543       ,0           ,64          ,1873.5      ,1854.3
> > > > 65551       ,0           ,3           ,1862.9      ,1846.6
> > > > 65567       ,3           ,0           ,1885.5      ,1966.0
> > > > 65599       ,3           ,7           ,1833.2      ,1833.1
> > > > 65663       ,9           ,5           ,1884.9      ,1887.4
> > > > 131079      ,0           ,64          ,3944.3      ,3949.4
> > > > 131087      ,0           ,3           ,3927.3      ,3913.3
> > > > 131103      ,3           ,0           ,4415.8      ,4169.4
> > > > 131135      ,3           ,7           ,4224.5      ,4157.6
> > > > 131199      ,9           ,5           ,5974.0      ,4983.8
> > > > 262151      ,0           ,64          ,11050.2     ,10620.6
> > > > 262159      ,0           ,3           ,9932.8      ,10037.3
> > > > 262175      ,3           ,0           ,10188.8     ,9206.6
> > > > 262207      ,3           ,7           ,9633.3      ,9216.7
> > > > 262271      ,9           ,5           ,9732.7      ,9345.3
> > > > 524295      ,0           ,64          ,24823.9     ,24880.7
> > > > 524303      ,0           ,3           ,24514.0     ,24556.7
> > > > 524319      ,3           ,0           ,23974.4     ,24219.9
> > > > 524351      ,3           ,7           ,24159.7     ,24207.0
> > > > 524415      ,9           ,5           ,23946.5     ,24142.8
> > > >
> > > > Icelake:
> > > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > > 4103        ,0           ,64          ,50.2        ,63.7
> > > > 4111        ,0           ,3           ,63.7        ,65.1
> > > > 4127        ,3           ,0           ,68.2        ,69.4
> > > > 4159        ,3           ,7           ,59.6        ,68.0
> > > > 4223        ,9           ,5           ,68.2        ,66.8
> > > > 8199        ,0           ,64          ,92.1        ,89.9
> > > > 8207        ,0           ,3           ,119.7       ,118.3
> > > > 8223        ,3           ,0           ,119.1       ,120.9
> > > > 8255        ,3           ,7           ,122.9       ,123.7
> > > > 8319        ,9           ,5           ,122.1       ,121.8
> > > > 16391       ,0           ,64          ,162.7       ,158.0
> > > > 16399       ,0           ,3           ,227.6       ,234.1
> > > > 16415       ,3           ,0           ,230.8       ,232.7
> > > > 16447       ,3           ,7           ,226.8       ,232.6
> > > > 16511       ,9           ,5           ,233.4       ,233.8
> > > > 32775       ,0           ,64          ,312.2       ,301.8
> > > > 32783       ,0           ,3           ,449.7       ,450.0
> > > > 32799       ,3           ,0           ,452.7       ,455.9
> > > > 32831       ,3           ,7           ,449.8       ,458.0
> > > > 32895       ,9           ,5           ,456.3       ,459.4
> > > > 65543       ,0           ,64          ,1460.6      ,1463.9
> > > > 65551       ,0           ,3           ,1462.0      ,1465.4
> > > > 65567       ,3           ,0           ,1466.6      ,1480.4
> > > > 65599       ,3           ,7           ,1488.0      ,1488.9
> > > > 65663       ,9           ,5           ,1680.8      ,1499.5
> > > > 131079      ,0           ,64          ,2988.5      ,3010.1
> > > > 131087      ,0           ,3           ,2995.5      ,2996.4
> > > > 131103      ,3           ,0           ,3006.2      ,3000.5
> > > > 131135      ,3           ,7           ,3032.4      ,3073.7
> > > > 131199      ,9           ,5           ,3010.4      ,3027.4
> > > > 262151      ,0           ,64          ,6143.2      ,6079.1
> > > > 262159      ,0           ,3           ,6085.1      ,6075.8
> > > > 262175      ,3           ,0           ,6088.0      ,6064.9
> > > > 262207      ,3           ,7           ,6018.7      ,6023.5
> > > > 262271      ,9           ,5           ,6019.8      ,5959.2
> > > > 524295      ,0           ,64          ,14464.2     ,14095.1
> > > > 524303      ,0           ,3           ,14761.6     ,14050.2
> > > > 524319      ,3           ,0           ,14534.1     ,14087.5
> > > > 524351      ,3           ,7           ,14147.7     ,13903.8
> > > > 524415      ,9           ,5           ,14157.0     ,13982.9
> > > >
> > > >
> > > >
> > > > cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean
> > > > skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> > > > skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> > > > skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> > > > skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> > > > skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> > > > skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> > > > skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> > > > skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> > > > skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> > > > skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> > > > skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> > > > skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> > > > skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> > > > skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> > > > skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> > > > skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> > > > skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> > > > skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> > > > skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> > > > skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> > > > skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> > > > skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> > > > skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> > > > skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> > > > skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> > > > skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> > > > skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> > > > skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> > > > skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> > > > skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> > > > skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> > > > skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> > > > skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> > > > skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> > > > skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> > > > skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> > > > skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> > > > skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> > > > skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> > > > skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> > > > skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> > > > skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> > > > skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> > > > skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> > > > skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> > > > skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> > > > skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> > > > skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> > > > skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> > > > skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> > > > skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> > > > skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> > > > skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> > > > skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> > > > skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> > > > skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> > > > skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> > > > skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> > > > skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> > > > skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> > > > skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> > > > skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> > > > skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> > > > skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> > > > skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> > > > skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> > > > skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> > > > skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> > > > skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> > > > skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> > > > skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> > > > skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> > > > skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> > > > skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> > > > skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> > > > skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> > > > skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> > > > skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> > > > skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> > > > skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> > > > skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> > > > skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> > > > skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> > > > skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> > > > skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> > > > skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> > > > skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> > > > skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> > > > skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> > > > skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> > > > skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> > > > skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> > > > skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> > > > skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> > > > skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> > > > skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> > > > skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> > > > skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> > > > skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> > > > skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> > > > skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> > > > skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> > > > skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> > > > skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> > > > skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> > > > skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> > > > skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> > > > skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> > > > skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> > > > skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> > > > skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> > > > skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> > > > skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> > > > skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> > > > skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> > > > skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> > > > skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> > > > skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> > > > skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> > > > skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> > > > skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> > > > skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> > > > skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> > > > skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> > > > skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> > > > skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> > > > skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> > > > skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> > > > skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> > > > skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> > > > icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> > > > icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> > > > icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> > > > icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> > > > icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> > > > icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> > > > icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> > > > icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> > > > icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> > > > icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> > > > icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> > > > icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> > > > icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> > > > icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> > > > icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> > > > icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> > > > icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> > > > icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> > > > icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> > > > icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> > > > icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> > > > icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> > > > icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> > > > icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> > > > icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> > > > icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> > > > icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> > > > icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> > > > icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> > > > icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> > > > icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> > > > icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> > > > icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> > > > icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> > > > icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> > > > icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> > > > icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> > > > icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> > > > icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> > > > icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> > > > icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> > > > icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> > > > icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> > > > icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> > > > icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> > > > icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> > > > icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> > > > icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> > > > icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> > > > icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> > > > icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> > > > icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> > > > icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> > > > icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> > > > icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> > > > icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> > > > icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> > > > icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> > > > icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> > > > icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> > > > icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> > > > icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> > > > icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> > > > icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> > > > icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> > > > icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> > > > icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> > > > icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> > > > icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> > > > icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> > > > icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> > > > icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> > > > icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> > > > icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> > > > icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> > > > icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> > > > icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> > > > icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> > > > icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> > > > icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> > > > icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> > > > icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> > > > icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> > > > icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> > > > icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> > > > icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> > > > icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> > > > icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> > > > icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> > > > icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> > > > icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> > > > icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> > > > icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> > > > icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> > > > icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> > > > icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> > > > icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> > > > icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> > > > icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> > > > icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> > > > icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> > > > icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> > > > icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> > > > icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> > > > icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> > > > icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> > > > icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> > > > icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> > > > icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> > > > icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> > > > icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> > > > icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> > > > icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> > > > icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> > > > icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> > > > icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> > > > icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> > > > icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> > > > icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> > > > icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> > > > icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> > > > icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> > > > icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> > > > icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> > > > icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> > > > icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> > > > icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> > > > icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> > > > icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> > > > icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> > > > icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> > > > icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> > > > icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> > > > icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> > > > icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> > > > icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> > > > icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> > > > icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> > > > icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> > > > icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> > > > icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> > > > icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> > > > icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> > > > icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> > > > icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> > > > icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> > > > icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> > > > icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> > > > icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> > > > icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> > > > icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> > > > icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> > > > icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> > > > icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> > > > icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> > > > icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> > > > icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> > > > icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> > > > icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> > > > icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> > > > icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> > > > icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> > > > icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> > > > icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> > > > icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> > > > icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> > > > icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> > > > icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> > > > icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> > > > icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> > > > icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> > > > icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> > > > icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> > > > icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> > > > icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> > > > icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> > > > icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> > > > icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> > > > icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> > > > icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> > > > icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> > > > icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> > > > icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> > > > icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> > > > icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> > > > icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> > > > icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> > > > icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> > > > icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> > > > icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> > > > icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> > > > icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> > > > icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> > > > icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> > > > icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
> > > >
> > > >
> > > >  .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
> > > >  1 file changed, 265 insertions(+), 73 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > > index 897a3d9762..5e4a071f16 100644
> > > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > > @@ -35,7 +35,16 @@
> > > >        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
> > > >     7. If size >= __x86_shared_non_temporal_threshold and there is no
> > > >        overlap between destination and source, use non-temporal store
> > > > -      instead of aligned store.  */
> > > > +      instead of aligned store copying from either 2 or 4 pages at
> > > > +      once.
> > > > +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> > > > +      and source and destination do not page alias, copy from 2 pages
> > > > +      at once using non-temporal stores. Page aliasing in this case is
> > > > +      considered true if destination's page alignment - source's page
> > > > +      alignment is less than 8 * VEC_SIZE.
> > > > +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> > > > +      and destination do page alias copy from 4 pages at once using
> > > > +      non-temporal stores.  */
> > > >
> > > >  #include <sysdep.h>
> > > >
> > > > @@ -67,6 +76,34 @@
> > > >  # endif
> > > >  #endif
> > > >
> > > > +#ifndef PAGE_SIZE
> > > > +# define PAGE_SIZE 4096
> > > > +#endif
> > > > +
> > > > +#if PAGE_SIZE != 4096
> > > > +# error Unsupported PAGE_SIZE
> > > > +#endif
> > > > +
> > > > +#ifndef LOG_PAGE_SIZE
> > > > +# define LOG_PAGE_SIZE 12
> > > > +#endif
> > > > +
> > > > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> > > > +# error Invalid LOG_PAGE_SIZE
> > > > +#endif
> > > > +
> > > > +/* Bytes per page for large_memcpy inner loop.  */
> > > > +#if VEC_SIZE == 64
> > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> > > > +#else
> > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> > > > +#endif
> > > > +
> > > > +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> > > > +#ifndef LOG_4X_MEMCPY_THRESH
> > > > +# define LOG_4X_MEMCPY_THRESH 4
> > > > +#endif
> > > > +
> > > >  /* Avoid short distance rep movsb only with non-SSE vector.  */
> > > >  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
> > > >  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> > > > @@ -106,6 +143,28 @@
> > > >  # error Unsupported PREFETCH_SIZE!
> > > >  #endif
> > > >
> > > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> > > > +     VMOVU   (offset)base, vec0; \
> > > > +     VMOVU   ((offset) + VEC_SIZE)base, vec1;
> > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> > > > +     VMOVNT  vec0, (offset)base; \
> > > > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> > > > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > > > +     VMOVU   (offset)base, vec0; \
> > > > +     VMOVU   ((offset) + VEC_SIZE)base, vec1; \
> > > > +     VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
> > > > +     VMOVU   ((offset) + VEC_SIZE * 3)base, vec3;
> > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > > > +     VMOVNT  vec0, (offset)base; \
> > > > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
> > > > +     VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
> > > > +     VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
> > > > +#else
> > > > +# error Invalid LARGE_LOAD_SIZE
> > > > +#endif
> > > > +
> > > >  #ifndef SECTION
> > > >  # error SECTION is not defined!
> > > >  #endif
> > > > @@ -393,6 +452,15 @@ L(last_4x_vec):
> > > >       VZEROUPPER_RETURN
> > > >
> > > >  L(more_8x_vec):
> > > > +     /* Check if non-temporal move candidate.  */
> > > > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > > +     /* Check non-temporal store threshold.  */
> > > > +     cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > > +     ja      L(large_memcpy_2x)
> > > > +#endif
> > > > +     /* Entry if rdx is greater than non-temporal threshold but there
> > > > +       is overlap.  */
> > > > +L(more_8x_vec_check):
> > > >       cmpq    %rsi, %rdi
> > > >       ja      L(more_8x_vec_backward)
> > > >       /* Source == destination is less common.  */
> > > > @@ -419,24 +487,21 @@ L(more_8x_vec):
> > > >       subq    %r8, %rdi
> > > >       /* Adjust length.  */
> > > >       addq    %r8, %rdx
> > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > > -     /* Check non-temporal store threshold.  */
> > > > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > > -     ja      L(large_forward)
> > > > -#endif
> > > > +
> > > > +     .p2align 4
> > > >  L(loop_4x_vec_forward):
> > > >       /* Copy 4 * VEC a time forward.  */
> > > >       VMOVU   (%rsi), %VEC(0)
> > > >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > > >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > > >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > > -     addq    $(VEC_SIZE * 4), %rsi
> > > > -     subq    $(VEC_SIZE * 4), %rdx
> > > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > > +     addq    $-(VEC_SIZE * 4), %rdx
> > > >       VMOVA   %VEC(0), (%rdi)
> > > >       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > > >       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > >       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > >       cmpq    $(VEC_SIZE * 4), %rdx
> > > >       ja      L(loop_4x_vec_forward)
> > > >       /* Store the last 4 * VEC.  */
> > > > @@ -470,24 +535,21 @@ L(more_8x_vec_backward):
> > > >       subq    %r8, %r9
> > > >       /* Adjust length.  */
> > > >       subq    %r8, %rdx
> > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > > -     /* Check non-temporal store threshold.  */
> > > > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > > -     ja      L(large_backward)
> > > > -#endif
> > > > +
> > > > +     .p2align 4
> > > >  L(loop_4x_vec_backward):
> > > >       /* Copy 4 * VEC a time backward.  */
> > > >       VMOVU   (%rcx), %VEC(0)
> > > >       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > > >       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > > >       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > > > -     subq    $(VEC_SIZE * 4), %rcx
> > > > -     subq    $(VEC_SIZE * 4), %rdx
> > > > +     addq    $-(VEC_SIZE * 4), %rcx
> > > > +     addq    $-(VEC_SIZE * 4), %rdx
> > > >       VMOVA   %VEC(0), (%r9)
> > > >       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> > > >       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> > > >       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> > > > -     subq    $(VEC_SIZE * 4), %r9
> > > > +     addq    $-(VEC_SIZE * 4), %r9
> > > >       cmpq    $(VEC_SIZE * 4), %rdx
> > > >       ja      L(loop_4x_vec_backward)
> > > >       /* Store the first 4 * VEC.  */
> > > > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward):
> > > >       VZEROUPPER_RETURN
> > > >
> > > >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > > -L(large_forward):
> > > > +     .p2align 4
> > > > +L(large_memcpy_2x):
> > > > +     /* Compute absolute value of difference between source and
> > > > +        destination.  */
> > > > +     movq    %rdi, %r9
> > > > +     subq    %rsi, %r9
> > > > +     movq    %r9, %r8
> > > > +     leaq    -1(%r9), %rcx
> > > > +     sarq    $63, %r8
> > > > +     xorq    %r8, %r9
> > > > +     subq    %r8, %r9
> > > >       /* Don't use non-temporal store if there is overlap between
> > > > -        destination and source since destination may be in cache
> > > > -        when source is loaded.  */
> > > > -     leaq    (%rdi, %rdx), %r10
> > > > -     cmpq    %r10, %rsi
> > > > -     jb      L(loop_4x_vec_forward)
> > > > -L(loop_large_forward):
> > > > +        destination and source since destination may be in cache when
> > > > +        source is loaded.  */
> > > > +     cmpq    %r9, %rdx
> > > > +     ja      L(more_8x_vec_check)
> > > > +
> > > > +     /* Cache align destination. First store the first 64 bytes then
> > > > +        adjust alignments.  */
> > > > +     VMOVU   (%rsi), %VEC(8)
> > > > +#if VEC_SIZE < 64
> > > > +     VMOVU   VEC_SIZE(%rsi), %VEC(9)
> > > > +#if VEC_SIZE < 32
> > > > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> > > > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> > > > +#endif
> > > > +#endif
> > > > +     VMOVU   %VEC(8), (%rdi)
> > > > +#if VEC_SIZE < 64
> > > > +     VMOVU   %VEC(9), VEC_SIZE(%rdi)
> > > > +#if VEC_SIZE < 32
> > > > +     VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> > > > +     VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> > > > +#endif
> > > > +#endif
> > > > +     /* Adjust source, destination, and size.  */
> > > > +     movq    %rdi, %r8
> > > > +     andq    $63, %r8
> > > > +     /* Get the negative of offset for alignment.  */
> > > > +     subq    $64, %r8
> > > > +     /* Adjust source.  */
> > > > +     subq    %r8, %rsi
> > > > +     /* Adjust destination which should be aligned now.  */
> > > > +     subq    %r8, %rdi
> > > > +     /* Adjust length.  */
> > > > +     addq    %r8, %rdx
> > > > +
> > > > +     /* Test if source and destination addresses will alias. If they do,
> > > > +        the larger pipeline in large_memcpy_4x alleviates the
> > > > +        performance drop.  */
> > > > +     testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> > > > +     jz      L(large_memcpy_4x)
> > > > +
> > > > +     movq    %rdx, %r10
> > > > +     shrq    $LOG_4X_MEMCPY_THRESH, %r10
> > > > +     cmp     __x86_shared_non_temporal_threshold(%rip), %r10
> > > > +     jae     L(large_memcpy_4x)
> > > > +
> > > > +     /* edx will store remainder size for copying tail.  */
> > > > +     andl    $(PAGE_SIZE * 2 - 1), %edx
> > > > +     /* r10 stores outer loop counter.  */
> > > > +     shrq    $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> > > > +     /* Copy 4x VEC at a time from 2 pages.  */
> > > > +     .p2align 4
> > > > +L(loop_large_memcpy_2x_outer):
> > > > +     /* ecx stores inner loop counter.  */
> > > > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > > > +L(loop_large_memcpy_2x_inner):
> > > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> > > > +     /* Load vectors from rsi.  */
> > > > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > > +     subq    $-LARGE_LOAD_SIZE, %rsi
> > > > +     /* Non-temporal store vectors to rdi.  */
> > > > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > > +     subq    $-LARGE_LOAD_SIZE, %rdi
> > > > +     decl    %ecx
> > > > +     jnz     L(loop_large_memcpy_2x_inner)
> > > > +     addq    $PAGE_SIZE, %rdi
> > > > +     addq    $PAGE_SIZE, %rsi
> > > > +     decq    %r10
> > > > +     jne     L(loop_large_memcpy_2x_outer)
> > > > +     sfence
> > > > +
> > > > +     /* Check if only last 4 loads are needed.  */
> > > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > > +     jbe     L(large_memcpy_2x_end)
> > > > +
> > > > +     /* Handle the last 2 * PAGE_SIZE bytes.  */
> > > > +L(loop_large_memcpy_2x_tail):
> > > >       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > > > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > > > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> > > > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > > +     PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > > >       VMOVU   (%rsi), %VEC(0)
> > > >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > > >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > > >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > > -     addq    $PREFETCHED_LOAD_SIZE, %rsi
> > > > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > > > -     VMOVNT  %VEC(0), (%rdi)
> > > > -     VMOVNT  %VEC(1), VEC_SIZE(%rdi)
> > > > -     VMOVNT  %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > > -     VMOVNT  %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > > -     addq    $PREFETCHED_LOAD_SIZE, %rdi
> > > > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > > > -     ja      L(loop_large_forward)
> > > > -     sfence
> > > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > > +     addl    $-(VEC_SIZE * 4), %edx
> > > > +     VMOVA   %VEC(0), (%rdi)
> > > > +     VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > > > +     VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > > +     VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > > +     ja      L(loop_large_memcpy_2x_tail)
> > > > +
> > > > +L(large_memcpy_2x_end):
> > > >       /* Store the last 4 * VEC.  */
> > > > -     VMOVU   %VEC(5), (%rcx)
> > > > -     VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> > > > -     VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > > > -     VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > > > -     /* Store the first VEC.  */
> > > > -     VMOVU   %VEC(4), (%r11)
> > > > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > > > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > > > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > > > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > > > +
> > > > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > > > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > > > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > > > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > > >       VZEROUPPER_RETURN
> > > >
> > > > -L(large_backward):
> > > > -     /* Don't use non-temporal store if there is overlap between
> > > > -        destination and source since destination may be in cache
> > > > -        when source is loaded.  */
> > > > -     leaq    (%rcx, %rdx), %r10
> > > > -     cmpq    %r10, %r9
> > > > -     jb      L(loop_4x_vec_backward)
> > > > -L(loop_large_backward):
> > > > -     /* Copy 4 * VEC a time backward with non-temporal stores.  */
> > > > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> > > > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> > > > -     VMOVU   (%rcx), %VEC(0)
> > > > -     VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > > > -     VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > > > -     VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > > > -     subq    $PREFETCHED_LOAD_SIZE, %rcx
> > > > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > > > -     VMOVNT  %VEC(0), (%r9)
> > > > -     VMOVNT  %VEC(1), -VEC_SIZE(%r9)
> > > > -     VMOVNT  %VEC(2), -(VEC_SIZE * 2)(%r9)
> > > > -     VMOVNT  %VEC(3), -(VEC_SIZE * 3)(%r9)
> > > > -     subq    $PREFETCHED_LOAD_SIZE, %r9
> > > > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > > > -     ja      L(loop_large_backward)
> > > > +     .p2align 4
> > > > +L(large_memcpy_4x):
> > > > +     movq    %rdx, %r10
> > > > +     /* edx will store remainder size for copying tail.  */
> > > > +     andl    $(PAGE_SIZE * 4 - 1), %edx
> > > > +     /* r10 stores outer loop counter.  */
> > > > +     shrq    $(LOG_PAGE_SIZE + 2), %r10
> > > > +     /* Copy 4x VEC at a time from 4 pages.  */
> > > > +     .p2align 4
> > > > +L(loop_large_memcpy_4x_outer):
> > > > +     /* ecx stores inner loop counter.  */
> > > > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > > > +L(loop_large_memcpy_4x_inner):
> > > > +     /* Only one prefetch set per page as doing 4 pages give more time
> > > > +        for prefetcher to keep up.  */
> > > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> > > > +     /* Load vectors from rsi.  */
> > > > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > > > +     subq    $-LARGE_LOAD_SIZE, %rsi
> > > > +     /* Non-temporal store vectors to rdi.  */
> > > > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > > > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > > > +     subq    $-LARGE_LOAD_SIZE, %rdi
> > > > +     decl    %ecx
> > > > +     jnz     L(loop_large_memcpy_4x_inner)
> > > > +     addq    $(PAGE_SIZE * 3), %rdi
> > > > +     addq    $(PAGE_SIZE * 3), %rsi
> > > > +     decq    %r10
> > > > +     jne     L(loop_large_memcpy_4x_outer)
> > > >       sfence
> > > > -     /* Store the first 4 * VEC.  */
> > > > -     VMOVU   %VEC(4), (%rdi)
> > > > -     VMOVU   %VEC(5), VEC_SIZE(%rdi)
> > > > -     VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> > > > -     VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> > > > -     /* Store the last VEC.  */
> > > > -     VMOVU   %VEC(8), (%r11)
> > > > +     /* Check if only last 4 loads are needed.  */
> > > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > > +     jbe     L(large_memcpy_4x_end)
> > > > +
> > > > +     /* Handle the last 4 * PAGE_SIZE bytes.  */
> > > > +L(loop_large_memcpy_4x_tail):
> > > > +     /* Copy 4 * VEC a time forward with aligned (temporal) stores.  */
> > > > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > > +     PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > > > +     VMOVU   (%rsi), %VEC(0)
> > > > +     VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > > > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > > > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > > +     addl    $-(VEC_SIZE * 4), %edx
> > > > +     VMOVA   %VEC(0), (%rdi)
> > > > +     VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > > > +     VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > > +     VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > > +     ja      L(loop_large_memcpy_4x_tail)
> > > > +
> > > > +L(large_memcpy_4x_end):
> > > > +     /* Store the last 4 * VEC.  */
> > > > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > > > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > > > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > > > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > > > +
> > > > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > > > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > > > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > > > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > > >       VZEROUPPER_RETURN
> > > >  #endif
> > > >  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > > > --
> > > > 2.29.2
> > > >
> > >
> > > LGTM.  Please commit it.
> > >
> > > Thanks.
> > >
> > >
> > > H.J.
>
>
>
> --
> H.J.
  
Sunil Pandey April 27, 2022, 11:46 p.m. UTC | #9
On Fri, Apr 16, 2021 at 12:25 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Apr 16, 2021 at 1:05 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Apr 16, 2021 at 9:35 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > > LGTM.  Please commit it.
> > >
> > > > Are you saying that to me or someone else? If it's to me, what do you
> > > > mean? Is the patch not enough?
> >
> > I will commit it for you.
>
> Thanks! Are you planning on accepting the bench / testing changes as well?
>
> >
> > > > Thanks.
> > >
> > > On Fri, Apr 16, 2021 at 8:59 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Sat, Apr 03, 2021 at 04:12:15AM -0400, Noah Goldstein wrote:
> > > > > From: noah <goldstein.w.n@gmail.com>
> > > > >
> > > > > No Bug. This commit updates the large memcpy case (no overlap). The
> > > > > update is to perform memcpy on either 2 or 4 contiguous pages at
> > > > > > once. This 1) helps to alleviate the effects of false memory aliasing
> > > > > when destination and source have a close 4k alignment and 2) In most
> > > > > cases and for most DRAM units is a modestly more efficient access
> > > > > pattern. These changes are a clear performance improvement for
> > > > > VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
> > > > > test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
> > > > > pass.
> > > > >
> > > > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > > > ---
> > > > > > Issue was alignment related AFAICT. Added `.p2align 4` in front of the
> > > > > loops and no longer see any meaningful regression.
> > > > >
> > > > > Also added back the temporal stores for the tail. Saw a regression
> > > > > when doing these tests.
> > > > >
> > > > > Two tables below for skylake and icelake numbers for the areas around
> > > > > where you saw the regression. Below is all data from the tests.
> > > > >
> > > > > N = 10.
> > > > >
> > > > > Skylake
> > > > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > > > 4103        ,0           ,64          ,84.5        ,88.6
> > > > > 4111        ,0           ,3           ,99.0        ,99.9
> > > > > 4127        ,3           ,0           ,102.1       ,102.3
> > > > > 4159        ,3           ,7           ,88.7        ,90.9
> > > > > 4223        ,9           ,5           ,88.1        ,87.4
> > > > > 8199        ,0           ,64          ,146.7       ,150.2
> > > > > 8207        ,0           ,3           ,167.9       ,168.5
> > > > > 8223        ,3           ,0           ,168.5       ,168.1
> > > > > 8255        ,3           ,7           ,157.0       ,159.2
> > > > > 8319        ,9           ,5           ,155.5       ,155.7
> > > > > 16391       ,0           ,64          ,286.2       ,288.8
> > > > > 16399       ,0           ,3           ,307.0       ,308.7
> > > > > 16415       ,3           ,0           ,307.4       ,307.6
> > > > > 16447       ,3           ,7           ,294.6       ,295.5
> > > > > 16511       ,9           ,5           ,291.5       ,462.1
> > > > > 32775       ,0           ,64          ,603.4       ,601.5
> > > > > 32783       ,0           ,3           ,604.8       ,606.4
> > > > > 32799       ,3           ,0           ,603.0       ,604.1
> > > > > 32831       ,3           ,7           ,600.2       ,737.3
> > > > > 32895       ,9           ,5           ,604.4       ,599.5
> > > > > 65543       ,0           ,64          ,1873.5      ,1854.3
> > > > > 65551       ,0           ,3           ,1862.9      ,1846.6
> > > > > 65567       ,3           ,0           ,1885.5      ,1966.0
> > > > > 65599       ,3           ,7           ,1833.2      ,1833.1
> > > > > 65663       ,9           ,5           ,1884.9      ,1887.4
> > > > > 131079      ,0           ,64          ,3944.3      ,3949.4
> > > > > 131087      ,0           ,3           ,3927.3      ,3913.3
> > > > > 131103      ,3           ,0           ,4415.8      ,4169.4
> > > > > 131135      ,3           ,7           ,4224.5      ,4157.6
> > > > > 131199      ,9           ,5           ,5974.0      ,4983.8
> > > > > 262151      ,0           ,64          ,11050.2     ,10620.6
> > > > > 262159      ,0           ,3           ,9932.8      ,10037.3
> > > > > 262175      ,3           ,0           ,10188.8     ,9206.6
> > > > > 262207      ,3           ,7           ,9633.3      ,9216.7
> > > > > 262271      ,9           ,5           ,9732.7      ,9345.3
> > > > > 524295      ,0           ,64          ,24823.9     ,24880.7
> > > > > 524303      ,0           ,3           ,24514.0     ,24556.7
> > > > > 524319      ,3           ,0           ,23974.4     ,24219.9
> > > > > 524351      ,3           ,7           ,24159.7     ,24207.0
> > > > > 524415      ,9           ,5           ,23946.5     ,24142.8
> > > > >
> > > > > Icelake:
> > > > > Len         ,align1      ,align2      ,new mean    ,old mean
> > > > > 4103        ,0           ,64          ,50.2        ,63.7
> > > > > 4111        ,0           ,3           ,63.7        ,65.1
> > > > > 4127        ,3           ,0           ,68.2        ,69.4
> > > > > 4159        ,3           ,7           ,59.6        ,68.0
> > > > > 4223        ,9           ,5           ,68.2        ,66.8
> > > > > 8199        ,0           ,64          ,92.1        ,89.9
> > > > > 8207        ,0           ,3           ,119.7       ,118.3
> > > > > 8223        ,3           ,0           ,119.1       ,120.9
> > > > > 8255        ,3           ,7           ,122.9       ,123.7
> > > > > 8319        ,9           ,5           ,122.1       ,121.8
> > > > > 16391       ,0           ,64          ,162.7       ,158.0
> > > > > 16399       ,0           ,3           ,227.6       ,234.1
> > > > > 16415       ,3           ,0           ,230.8       ,232.7
> > > > > 16447       ,3           ,7           ,226.8       ,232.6
> > > > > 16511       ,9           ,5           ,233.4       ,233.8
> > > > > 32775       ,0           ,64          ,312.2       ,301.8
> > > > > 32783       ,0           ,3           ,449.7       ,450.0
> > > > > 32799       ,3           ,0           ,452.7       ,455.9
> > > > > 32831       ,3           ,7           ,449.8       ,458.0
> > > > > 32895       ,9           ,5           ,456.3       ,459.4
> > > > > 65543       ,0           ,64          ,1460.6      ,1463.9
> > > > > 65551       ,0           ,3           ,1462.0      ,1465.4
> > > > > 65567       ,3           ,0           ,1466.6      ,1480.4
> > > > > 65599       ,3           ,7           ,1488.0      ,1488.9
> > > > > 65663       ,9           ,5           ,1680.8      ,1499.5
> > > > > 131079      ,0           ,64          ,2988.5      ,3010.1
> > > > > 131087      ,0           ,3           ,2995.5      ,2996.4
> > > > > 131103      ,3           ,0           ,3006.2      ,3000.5
> > > > > 131135      ,3           ,7           ,3032.4      ,3073.7
> > > > > 131199      ,9           ,5           ,3010.4      ,3027.4
> > > > > 262151      ,0           ,64          ,6143.2      ,6079.1
> > > > > 262159      ,0           ,3           ,6085.1      ,6075.8
> > > > > 262175      ,3           ,0           ,6088.0      ,6064.9
> > > > > 262207      ,3           ,7           ,6018.7      ,6023.5
> > > > > 262271      ,9           ,5           ,6019.8      ,5959.2
> > > > > 524295      ,0           ,64          ,14464.2     ,14095.1
> > > > > 524303      ,0           ,3           ,14761.6     ,14050.2
> > > > > 524319      ,3           ,0           ,14534.1     ,14087.5
> > > > > 524351      ,3           ,7           ,14147.7     ,13903.8
> > > > > 524415      ,9           ,5           ,14157.0     ,13982.9
> > > > >
> > > > >
> > > > >
> > > > > cpu         ,version     ,Len         ,align1      ,align2      ,new mean    ,old mean
> > > > > skylake     ,avx         ,4103        ,0           ,64          ,84.5        ,88.6
> > > > > skylake     ,avx         ,4111        ,0           ,3           ,99.0        ,99.9
> > > > > skylake     ,avx         ,4127        ,3           ,0           ,102.1       ,102.3
> > > > > skylake     ,avx         ,4159        ,3           ,7           ,88.7        ,90.9
> > > > > skylake     ,avx         ,4223        ,9           ,5           ,88.1        ,87.4
> > > > > skylake     ,avx         ,8199        ,0           ,64          ,146.7       ,150.2
> > > > > skylake     ,avx         ,8207        ,0           ,3           ,167.9       ,168.5
> > > > > skylake     ,avx         ,8223        ,3           ,0           ,168.5       ,168.1
> > > > > skylake     ,avx         ,8255        ,3           ,7           ,157.0       ,159.2
> > > > > skylake     ,avx         ,8319        ,9           ,5           ,155.5       ,155.7
> > > > > skylake     ,avx         ,16391       ,0           ,64          ,286.2       ,288.8
> > > > > skylake     ,avx         ,16399       ,0           ,3           ,307.0       ,308.7
> > > > > skylake     ,avx         ,16415       ,3           ,0           ,307.4       ,307.6
> > > > > skylake     ,avx         ,16447       ,3           ,7           ,294.6       ,295.5
> > > > > skylake     ,avx         ,16511       ,9           ,5           ,291.5       ,462.1
> > > > > skylake     ,avx         ,32775       ,0           ,64          ,603.4       ,601.5
> > > > > skylake     ,avx         ,32783       ,0           ,3           ,604.8       ,606.4
> > > > > skylake     ,avx         ,32799       ,3           ,0           ,603.0       ,604.1
> > > > > skylake     ,avx         ,32831       ,3           ,7           ,600.2       ,737.3
> > > > > skylake     ,avx         ,32895       ,9           ,5           ,604.4       ,599.5
> > > > > skylake     ,avx         ,65543       ,0           ,64          ,1873.5      ,1854.3
> > > > > skylake     ,avx         ,65551       ,0           ,3           ,1862.9      ,1846.6
> > > > > skylake     ,avx         ,65567       ,3           ,0           ,1885.5      ,1966.0
> > > > > skylake     ,avx         ,65599       ,3           ,7           ,1833.2      ,1833.1
> > > > > skylake     ,avx         ,65663       ,9           ,5           ,1884.9      ,1887.4
> > > > > skylake     ,avx         ,131079      ,0           ,64          ,3944.3      ,3949.4
> > > > > skylake     ,avx         ,131087      ,0           ,3           ,3927.3      ,3913.3
> > > > > skylake     ,avx         ,131103      ,3           ,0           ,4415.8      ,4169.4
> > > > > skylake     ,avx         ,131135      ,3           ,7           ,4224.5      ,4157.6
> > > > > skylake     ,avx         ,131199      ,9           ,5           ,5974.0      ,4983.8
> > > > > skylake     ,avx         ,262151      ,0           ,64          ,11050.2     ,10620.6
> > > > > skylake     ,avx         ,262159      ,0           ,3           ,9932.8      ,10037.3
> > > > > skylake     ,avx         ,262175      ,3           ,0           ,10188.8     ,9206.6
> > > > > skylake     ,avx         ,262207      ,3           ,7           ,9633.3      ,9216.7
> > > > > skylake     ,avx         ,262271      ,9           ,5           ,9732.7      ,9345.3
> > > > > skylake     ,avx         ,524295      ,0           ,64          ,24823.9     ,24880.7
> > > > > skylake     ,avx         ,524303      ,0           ,3           ,24514.0     ,24556.7
> > > > > skylake     ,avx         ,524319      ,3           ,0           ,23974.4     ,24219.9
> > > > > skylake     ,avx         ,524351      ,3           ,7           ,24159.7     ,24207.0
> > > > > skylake     ,avx         ,524415      ,9           ,5           ,23946.5     ,24142.8
> > > > > skylake     ,avx         ,1048583     ,0           ,64          ,49163.9     ,49454.6
> > > > > skylake     ,avx         ,1048591     ,0           ,3           ,49879.3     ,49400.8
> > > > > skylake     ,avx         ,1048607     ,3           ,0           ,49738.0     ,48864.6
> > > > > skylake     ,avx         ,1048639     ,3           ,7           ,48804.0     ,47588.5
> > > > > skylake     ,avx         ,1048703     ,9           ,5           ,49629.4     ,49796.3
> > > > > skylake     ,avx         ,2097159     ,0           ,64          ,98271.7     ,96330.6
> > > > > skylake     ,avx         ,2097167     ,0           ,3           ,97801.8     ,98638.1
> > > > > skylake     ,avx         ,2097183     ,3           ,0           ,98041.1     ,99287.6
> > > > > skylake     ,avx         ,2097215     ,3           ,7           ,96629.5     ,96521.9
> > > > > skylake     ,avx         ,2097279     ,9           ,5           ,98961.8     ,98909.8
> > > > > skylake     ,avx         ,4194311     ,0           ,64          ,194667.7    ,195377.1
> > > > > skylake     ,avx         ,4194319     ,0           ,3           ,194919.5    ,198576.2
> > > > > skylake     ,avx         ,4194335     ,3           ,0           ,192949.8    ,194584.7
> > > > > skylake     ,avx         ,4194367     ,3           ,7           ,189943.5    ,189177.9
> > > > > skylake     ,avx         ,4194431     ,9           ,5           ,192479.1    ,196494.2
> > > > > skylake     ,avx         ,8388615     ,0           ,64          ,588671.6    ,587215.4
> > > > > skylake     ,avx         ,8388623     ,0           ,3           ,581640.7    ,582812.5
> > > > > skylake     ,avx         ,8388639     ,3           ,0           ,549811.9    ,544697.6
> > > > > skylake     ,avx         ,8388671     ,3           ,7           ,591155.0    ,577951.8
> > > > > skylake     ,avx         ,8388735     ,9           ,5           ,547583.2    ,545133.3
> > > > > skylake     ,avx         ,16777223    ,0           ,64          ,1787503.0   ,1811146.0
> > > > > skylake     ,avx         ,16777231    ,0           ,3           ,1758671.0   ,1756343.0
> > > > > skylake     ,avx         ,16777247    ,3           ,0           ,1691781.0   ,1694661.0
> > > > > skylake     ,avx         ,16777279    ,3           ,7           ,1768150.0   ,1754785.0
> > > > > skylake     ,avx         ,16777343    ,9           ,5           ,1695179.0   ,1710794.0
> > > > > skylake     ,sse2        ,4103        ,0           ,64          ,150.8       ,150.5
> > > > > skylake     ,sse2        ,4111        ,0           ,3           ,156.8       ,158.4
> > > > > skylake     ,sse2        ,4127        ,3           ,0           ,99.7        ,99.4
> > > > > skylake     ,sse2        ,4159        ,3           ,7           ,154.8       ,154.5
> > > > > skylake     ,sse2        ,4223        ,9           ,5           ,137.3       ,137.2
> > > > > skylake     ,sse2        ,8199        ,0           ,64          ,284.8       ,285.5
> > > > > skylake     ,sse2        ,8207        ,0           ,3           ,296.0       ,296.1
> > > > > skylake     ,sse2        ,8223        ,3           ,0           ,168.0       ,168.2
> > > > > skylake     ,sse2        ,8255        ,3           ,7           ,293.0       ,292.4
> > > > > skylake     ,sse2        ,8319        ,9           ,5           ,251.3       ,250.7
> > > > > skylake     ,sse2        ,16391       ,0           ,64          ,561.3       ,608.3
> > > > > skylake     ,sse2        ,16399       ,0           ,3           ,571.0       ,574.8
> > > > > skylake     ,sse2        ,16415       ,3           ,0           ,305.4       ,305.0
> > > > > skylake     ,sse2        ,16447       ,3           ,7           ,563.2       ,565.0
> > > > > skylake     ,sse2        ,16511       ,9           ,5           ,477.1       ,475.1
> > > > > skylake     ,sse2        ,32775       ,0           ,64          ,1128.2      ,1131.7
> > > > > skylake     ,sse2        ,32783       ,0           ,3           ,1126.6      ,1131.0
> > > > > skylake     ,sse2        ,32799       ,3           ,0           ,587.6       ,590.8
> > > > > skylake     ,sse2        ,32831       ,3           ,7           ,1130.6      ,1126.2
> > > > > skylake     ,sse2        ,32895       ,9           ,5           ,957.6       ,953.0
> > > > > skylake     ,sse2        ,65543       ,0           ,64          ,2718.9      ,2704.2
> > > > > skylake     ,sse2        ,65551       ,0           ,3           ,2724.1      ,2725.0
> > > > > skylake     ,sse2        ,65567       ,3           ,0           ,1888.4      ,1914.3
> > > > > skylake     ,sse2        ,65599       ,3           ,7           ,2787.6      ,2748.7
> > > > > skylake     ,sse2        ,65663       ,9           ,5           ,2400.5      ,2369.4
> > > > > skylake     ,sse2        ,131079      ,0           ,64          ,5603.3      ,5654.9
> > > > > skylake     ,sse2        ,131087      ,0           ,3           ,5939.3      ,5871.4
> > > > > skylake     ,sse2        ,131103      ,3           ,0           ,4272.4      ,4190.0
> > > > > skylake     ,sse2        ,131135      ,3           ,7           ,7601.4      ,7524.6
> > > > > skylake     ,sse2        ,131199      ,9           ,5           ,7022.1      ,6864.7
> > > > > skylake     ,sse2        ,262151      ,0           ,64          ,13736.2     ,14030.0
> > > > > skylake     ,sse2        ,262159      ,0           ,3           ,12407.3     ,12334.1
> > > > > skylake     ,sse2        ,262175      ,3           ,0           ,9661.1      ,9249.4
> > > > > skylake     ,sse2        ,262207      ,3           ,7           ,12850.2     ,12351.6
> > > > > skylake     ,sse2        ,262271      ,9           ,5           ,10792.6     ,10435.8
> > > > > skylake     ,sse2        ,524295      ,0           ,64          ,27754.5     ,28177.7
> > > > > skylake     ,sse2        ,524303      ,0           ,3           ,27766.2     ,28152.0
> > > > > skylake     ,sse2        ,524319      ,3           ,0           ,24030.9     ,24438.3
> > > > > skylake     ,sse2        ,524351      ,3           ,7           ,27787.5     ,27933.0
> > > > > skylake     ,sse2        ,524415      ,9           ,5           ,24263.2     ,25249.1
> > > > > skylake     ,sse2        ,1048583     ,0           ,64          ,56199.9     ,56039.8
> > > > > skylake     ,sse2        ,1048591     ,0           ,3           ,56750.2     ,58889.7
> > > > > skylake     ,sse2        ,1048607     ,3           ,0           ,56394.0     ,55115.3
> > > > > skylake     ,sse2        ,1048639     ,3           ,7           ,57233.1     ,57473.8
> > > > > skylake     ,sse2        ,1048703     ,9           ,5           ,56324.3     ,55917.9
> > > > > skylake     ,sse2        ,2097159     ,0           ,64          ,113234.8    ,114346.4
> > > > > skylake     ,sse2        ,2097167     ,0           ,3           ,114373.1    ,115522.5
> > > > > skylake     ,sse2        ,2097183     ,3           ,0           ,108113.3    ,108513.3
> > > > > skylake     ,sse2        ,2097215     ,3           ,7           ,116863.6    ,116549.9
> > > > > skylake     ,sse2        ,2097279     ,9           ,5           ,108945.1    ,108843.7
> > > > > skylake     ,sse2        ,4194311     ,0           ,64          ,230250.1    ,232350.0
> > > > > skylake     ,sse2        ,4194319     ,0           ,3           ,231895.3    ,235055.6
> > > > > skylake     ,sse2        ,4194335     ,3           ,0           ,218442.8    ,219199.8
> > > > > skylake     ,sse2        ,4194367     ,3           ,7           ,242564.2    ,235587.7
> > > > > skylake     ,sse2        ,4194431     ,9           ,5           ,224167.4    ,215261.8
> > > > > skylake     ,sse2        ,8388615     ,0           ,64          ,679801.8    ,674832.0
> > > > > skylake     ,sse2        ,8388623     ,0           ,3           ,684913.2    ,685238.7
> > > > > skylake     ,sse2        ,8388639     ,3           ,0           ,644865.4    ,631388.6
> > > > > skylake     ,sse2        ,8388671     ,3           ,7           ,698700.9    ,689316.1
> > > > > skylake     ,sse2        ,8388735     ,9           ,5           ,644820.2    ,631366.8
> > > > > skylake     ,sse2        ,16777223    ,0           ,64          ,1877984.0   ,1876437.0
> > > > > skylake     ,sse2        ,16777231    ,0           ,3           ,1898086.0   ,1913053.0
> > > > > skylake     ,sse2        ,16777247    ,3           ,0           ,1857018.0   ,1866949.0
> > > > > skylake     ,sse2        ,16777279    ,3           ,7           ,1914905.0   ,1897134.0
> > > > > skylake     ,sse2        ,16777343    ,9           ,5           ,1859937.0   ,1881939.0
> > > > > icelake     ,avx512      ,4103        ,0           ,64          ,75.2        ,75.8
> > > > > icelake     ,avx512      ,4111        ,0           ,3           ,56.9        ,56.4
> > > > > icelake     ,avx512      ,4127        ,3           ,0           ,59.1        ,59.6
> > > > > icelake     ,avx512      ,4159        ,3           ,7           ,50.7        ,51.3
> > > > > icelake     ,avx512      ,4223        ,9           ,5           ,59.2        ,58.9
> > > > > icelake     ,avx512      ,8199        ,0           ,64          ,67.8        ,63.9
> > > > > icelake     ,avx512      ,8207        ,0           ,3           ,89.0        ,89.9
> > > > > icelake     ,avx512      ,8223        ,3           ,0           ,90.2        ,90.1
> > > > > icelake     ,avx512      ,8255        ,3           ,7           ,82.6        ,84.9
> > > > > icelake     ,avx512      ,8319        ,9           ,5           ,91.5        ,92.8
> > > > > icelake     ,avx512      ,16391       ,0           ,64          ,118.0       ,117.6
> > > > > icelake     ,avx512      ,16399       ,0           ,3           ,156.5       ,157.0
> > > > > icelake     ,avx512      ,16415       ,3           ,0           ,157.4       ,157.3
> > > > > icelake     ,avx512      ,16447       ,3           ,7           ,151.0       ,151.6
> > > > > icelake     ,avx512      ,16511       ,9           ,5           ,159.1       ,159.6
> > > > > icelake     ,avx512      ,32775       ,0           ,64          ,231.8       ,230.8
> > > > > icelake     ,avx512      ,32783       ,0           ,3           ,297.8       ,299.3
> > > > > icelake     ,avx512      ,32799       ,3           ,0           ,299.1       ,299.0
> > > > > icelake     ,avx512      ,32831       ,3           ,7           ,293.5       ,295.4
> > > > > icelake     ,avx512      ,32895       ,9           ,5           ,300.3       ,302.5
> > > > > icelake     ,avx512      ,65543       ,0           ,64          ,1473.4      ,1479.2
> > > > > icelake     ,avx512      ,65551       ,0           ,3           ,1438.2      ,1445.3
> > > > > icelake     ,avx512      ,65567       ,3           ,0           ,1450.3      ,1463.8
> > > > > icelake     ,avx512      ,65599       ,3           ,7           ,1469.0      ,1473.8
> > > > > icelake     ,avx512      ,65663       ,9           ,5           ,1480.0      ,1483.5
> > > > > icelake     ,avx512      ,131079      ,0           ,64          ,3015.1      ,3037.5
> > > > > icelake     ,avx512      ,131087      ,0           ,3           ,2952.3      ,2960.4
> > > > > icelake     ,avx512      ,131103      ,3           ,0           ,2966.2      ,2964.4
> > > > > icelake     ,avx512      ,131135      ,3           ,7           ,2961.6      ,3047.9
> > > > > icelake     ,avx512      ,131199      ,9           ,5           ,2967.4      ,3183.8
> > > > > icelake     ,avx512      ,262151      ,0           ,64          ,6206.0      ,6141.5
> > > > > icelake     ,avx512      ,262159      ,0           ,3           ,5990.8      ,5959.2
> > > > > icelake     ,avx512      ,262175      ,3           ,0           ,5976.7      ,5963.8
> > > > > icelake     ,avx512      ,262207      ,3           ,7           ,5939.5      ,5924.3
> > > > > icelake     ,avx512      ,262271      ,9           ,5           ,5944.6      ,5990.3
> > > > > icelake     ,avx512      ,524295      ,0           ,64          ,14726.7     ,14307.0
> > > > > icelake     ,avx512      ,524303      ,0           ,3           ,14344.2     ,14040.5
> > > > > icelake     ,avx512      ,524319      ,3           ,0           ,14175.0     ,13862.2
> > > > > icelake     ,avx512      ,524351      ,3           ,7           ,14261.4     ,13821.5
> > > > > icelake     ,avx512      ,524415      ,9           ,5           ,14266.5     ,14064.7
> > > > > icelake     ,avx512      ,1048583     ,0           ,64          ,35211.4     ,35414.6
> > > > > icelake     ,avx512      ,1048591     ,0           ,3           ,35156.8     ,35591.2
> > > > > icelake     ,avx512      ,1048607     ,3           ,0           ,35273.1     ,35503.3
> > > > > icelake     ,avx512      ,1048639     ,3           ,7           ,35255.8     ,35725.0
> > > > > icelake     ,avx512      ,1048703     ,9           ,5           ,35703.6     ,36289.9
> > > > > icelake     ,avx512      ,2097159     ,0           ,64          ,72613.9     ,72063.2
> > > > > icelake     ,avx512      ,2097167     ,0           ,3           ,72301.6     ,73504.2
> > > > > icelake     ,avx512      ,2097183     ,3           ,0           ,73448.8     ,72133.6
> > > > > icelake     ,avx512      ,2097215     ,3           ,7           ,73762.9     ,72825.8
> > > > > icelake     ,avx512      ,2097279     ,9           ,5           ,72097.3     ,72914.6
> > > > > icelake     ,avx512      ,4194311     ,0           ,64          ,144793.4    ,144182.1
> > > > > icelake     ,avx512      ,4194319     ,0           ,3           ,143710.3    ,145063.3
> > > > > icelake     ,avx512      ,4194335     ,3           ,0           ,146722.1    ,144046.4
> > > > > icelake     ,avx512      ,4194367     ,3           ,7           ,144267.0    ,144874.6
> > > > > icelake     ,avx512      ,4194431     ,9           ,5           ,143808.2    ,144560.0
> > > > > icelake     ,avx512      ,8388615     ,0           ,64          ,427993.4    ,424521.5
> > > > > icelake     ,avx512      ,8388623     ,0           ,3           ,470267.1    ,473290.8
> > > > > icelake     ,avx512      ,8388639     ,3           ,0           ,457179.7    ,461797.7
> > > > > icelake     ,avx512      ,8388671     ,3           ,7           ,472507.9    ,481561.4
> > > > > icelake     ,avx512      ,8388735     ,9           ,5           ,463611.9    ,467388.7
> > > > > icelake     ,avx512      ,16777223    ,0           ,64          ,1490426.0   ,1526996.0
> > > > > icelake     ,avx512      ,16777231    ,0           ,3           ,1516687.0   ,1517095.0
> > > > > icelake     ,avx512      ,16777247    ,3           ,0           ,1497688.0   ,1512766.0
> > > > > icelake     ,avx512      ,16777279    ,3           ,7           ,1512331.0   ,1524317.0
> > > > > icelake     ,avx512      ,16777343    ,9           ,5           ,1498908.0   ,1500526.0
> > > > > icelake     ,avx         ,4103        ,0           ,64          ,50.2        ,63.7
> > > > > icelake     ,avx         ,4111        ,0           ,3           ,63.7        ,65.1
> > > > > icelake     ,avx         ,4127        ,3           ,0           ,68.2        ,69.4
> > > > > icelake     ,avx         ,4159        ,3           ,7           ,59.6        ,68.0
> > > > > icelake     ,avx         ,4223        ,9           ,5           ,68.2        ,66.8
> > > > > icelake     ,avx         ,8199        ,0           ,64          ,92.1        ,89.9
> > > > > icelake     ,avx         ,8207        ,0           ,3           ,119.7       ,118.3
> > > > > icelake     ,avx         ,8223        ,3           ,0           ,119.1       ,120.9
> > > > > icelake     ,avx         ,8255        ,3           ,7           ,122.9       ,123.7
> > > > > icelake     ,avx         ,8319        ,9           ,5           ,122.1       ,121.8
> > > > > icelake     ,avx         ,16391       ,0           ,64          ,162.7       ,158.0
> > > > > icelake     ,avx         ,16399       ,0           ,3           ,227.6       ,234.1
> > > > > icelake     ,avx         ,16415       ,3           ,0           ,230.8       ,232.7
> > > > > icelake     ,avx         ,16447       ,3           ,7           ,226.8       ,232.6
> > > > > icelake     ,avx         ,16511       ,9           ,5           ,233.4       ,233.8
> > > > > icelake     ,avx         ,32775       ,0           ,64          ,312.2       ,301.8
> > > > > icelake     ,avx         ,32783       ,0           ,3           ,449.7       ,450.0
> > > > > icelake     ,avx         ,32799       ,3           ,0           ,452.7       ,455.9
> > > > > icelake     ,avx         ,32831       ,3           ,7           ,449.8       ,458.0
> > > > > icelake     ,avx         ,32895       ,9           ,5           ,456.3       ,459.4
> > > > > icelake     ,avx         ,65543       ,0           ,64          ,1460.6      ,1463.9
> > > > > icelake     ,avx         ,65551       ,0           ,3           ,1462.0      ,1465.4
> > > > > icelake     ,avx         ,65567       ,3           ,0           ,1466.6      ,1480.4
> > > > > icelake     ,avx         ,65599       ,3           ,7           ,1488.0      ,1488.9
> > > > > icelake     ,avx         ,65663       ,9           ,5           ,1680.8      ,1499.5
> > > > > icelake     ,avx         ,131079      ,0           ,64          ,2988.5      ,3010.1
> > > > > icelake     ,avx         ,131087      ,0           ,3           ,2995.5      ,2996.4
> > > > > icelake     ,avx         ,131103      ,3           ,0           ,3006.2      ,3000.5
> > > > > icelake     ,avx         ,131135      ,3           ,7           ,3032.4      ,3073.7
> > > > > icelake     ,avx         ,131199      ,9           ,5           ,3010.4      ,3027.4
> > > > > icelake     ,avx         ,262151      ,0           ,64          ,6143.2      ,6079.1
> > > > > icelake     ,avx         ,262159      ,0           ,3           ,6085.1      ,6075.8
> > > > > icelake     ,avx         ,262175      ,3           ,0           ,6088.0      ,6064.9
> > > > > icelake     ,avx         ,262207      ,3           ,7           ,6018.7      ,6023.5
> > > > > icelake     ,avx         ,262271      ,9           ,5           ,6019.8      ,5959.2
> > > > > icelake     ,avx         ,524295      ,0           ,64          ,14464.2     ,14095.1
> > > > > icelake     ,avx         ,524303      ,0           ,3           ,14761.6     ,14050.2
> > > > > icelake     ,avx         ,524319      ,3           ,0           ,14534.1     ,14087.5
> > > > > icelake     ,avx         ,524351      ,3           ,7           ,14147.7     ,13903.8
> > > > > icelake     ,avx         ,524415      ,9           ,5           ,14157.0     ,13982.9
> > > > > icelake     ,avx         ,1048583     ,0           ,64          ,36599.0     ,37461.4
> > > > > icelake     ,avx         ,1048591     ,0           ,3           ,36717.8     ,37454.9
> > > > > icelake     ,avx         ,1048607     ,3           ,0           ,36821.2     ,37343.3
> > > > > icelake     ,avx         ,1048639     ,3           ,7           ,36958.0     ,37507.2
> > > > > icelake     ,avx         ,1048703     ,9           ,5           ,36869.2     ,37413.1
> > > > > icelake     ,avx         ,2097159     ,0           ,64          ,74765.8     ,75330.9
> > > > > icelake     ,avx         ,2097167     ,0           ,3           ,75175.4     ,74891.9
> > > > > icelake     ,avx         ,2097183     ,3           ,0           ,75451.4     ,74787.7
> > > > > icelake     ,avx         ,2097215     ,3           ,7           ,75394.8     ,75839.1
> > > > > icelake     ,avx         ,2097279     ,9           ,5           ,75099.2     ,75421.2
> > > > > icelake     ,avx         ,4194311     ,0           ,64          ,146809.6    ,146619.4
> > > > > icelake     ,avx         ,4194319     ,0           ,3           ,148866.4    ,149898.2
> > > > > icelake     ,avx         ,4194335     ,3           ,0           ,148719.7    ,150165.4
> > > > > icelake     ,avx         ,4194367     ,3           ,7           ,150600.1    ,150925.9
> > > > > icelake     ,avx         ,4194431     ,9           ,5           ,149457.3    ,150519.2
> > > > > icelake     ,avx         ,8388615     ,0           ,64          ,412709.8    ,423666.1
> > > > > icelake     ,avx         ,8388623     ,0           ,3           ,423717.4    ,424418.2
> > > > > icelake     ,avx         ,8388639     ,3           ,0           ,414387.5    ,413445.6
> > > > > icelake     ,avx         ,8388671     ,3           ,7           ,449010.7    ,417553.5
> > > > > icelake     ,avx         ,8388735     ,9           ,5           ,414128.6    ,411815.3
> > > > > icelake     ,avx         ,16777223    ,0           ,64          ,1490032.0   ,1510004.0
> > > > > icelake     ,avx         ,16777231    ,0           ,3           ,1379638.0   ,1422097.0
> > > > > icelake     ,avx         ,16777247    ,3           ,0           ,1418930.0   ,1367557.0
> > > > > icelake     ,avx         ,16777279    ,3           ,7           ,1515152.0   ,1500176.0
> > > > > icelake     ,avx         ,16777343    ,9           ,5           ,1344117.0   ,1411795.0
> > > > > icelake     ,sse2        ,4103        ,0           ,64          ,113.2       ,114.6
> > > > > icelake     ,sse2        ,4111        ,0           ,3           ,121.5       ,120.4
> > > > > icelake     ,sse2        ,4127        ,3           ,0           ,1700.5      ,1771.5
> > > > > icelake     ,sse2        ,4159        ,3           ,7           ,119.3       ,118.8
> > > > > icelake     ,sse2        ,4223        ,9           ,5           ,1739.7      ,1735.2
> > > > > icelake     ,sse2        ,8199        ,0           ,64          ,207.0       ,203.9
> > > > > icelake     ,sse2        ,8207        ,0           ,3           ,225.5       ,220.8
> > > > > icelake     ,sse2        ,8223        ,3           ,0           ,3444.3      ,3743.5
> > > > > icelake     ,sse2        ,8255        ,3           ,7           ,219.9       ,216.8
> > > > > icelake     ,sse2        ,8319        ,9           ,5           ,4117.1      ,3487.3
> > > > > icelake     ,sse2        ,16391       ,0           ,64          ,397.1       ,394.3
> > > > > icelake     ,sse2        ,16399       ,0           ,3           ,439.6       ,428.6
> > > > > icelake     ,sse2        ,16415       ,3           ,0           ,6997.0      ,7031.2
> > > > > icelake     ,sse2        ,16447       ,3           ,7           ,426.8       ,421.8
> > > > > icelake     ,sse2        ,16511       ,9           ,5           ,7037.6      ,7038.3
> > > > > icelake     ,sse2        ,32775       ,0           ,64          ,790.9       ,779.0
> > > > > icelake     ,sse2        ,32783       ,0           ,3           ,863.1       ,849.6
> > > > > icelake     ,sse2        ,32799       ,3           ,0           ,14043.0     ,14390.9
> > > > > icelake     ,sse2        ,32831       ,3           ,7           ,841.6       ,833.1
> > > > > icelake     ,sse2        ,32895       ,9           ,5           ,14277.6     ,14344.2
> > > > > icelake     ,sse2        ,65543       ,0           ,64          ,1897.0      ,1897.3
> > > > > icelake     ,sse2        ,65551       ,0           ,3           ,1927.1      ,1955.4
> > > > > icelake     ,sse2        ,65567       ,3           ,0           ,28834.7     ,28727.8
> > > > > icelake     ,sse2        ,65599       ,3           ,7           ,1961.4      ,1969.7
> > > > > icelake     ,sse2        ,65663       ,9           ,5           ,28867.6     ,29019.8
> > > > > icelake     ,sse2        ,131079      ,0           ,64          ,3879.3      ,3872.6
> > > > > icelake     ,sse2        ,131087      ,0           ,3           ,3955.3      ,3990.7
> > > > > icelake     ,sse2        ,131103      ,3           ,0           ,58001.8     ,60567.9
> > > > > icelake     ,sse2        ,131135      ,3           ,7           ,3951.5      ,4002.6
> > > > > icelake     ,sse2        ,131199      ,9           ,5           ,57886.7     ,58391.4
> > > > > icelake     ,sse2        ,262151      ,0           ,64          ,7851.4      ,7894.7
> > > > > icelake     ,sse2        ,262159      ,0           ,3           ,7947.5      ,8016.2
> > > > > icelake     ,sse2        ,262175      ,3           ,0           ,115036.2    ,115968.6
> > > > > icelake     ,sse2        ,262207      ,3           ,7           ,7883.9      ,7814.1
> > > > > icelake     ,sse2        ,262271      ,9           ,5           ,113776.4    ,119733.6
> > > > > icelake     ,sse2        ,524295      ,0           ,64          ,17198.1     ,16974.9
> > > > > icelake     ,sse2        ,524303      ,0           ,3           ,17402.2     ,17096.3
> > > > > icelake     ,sse2        ,524319      ,3           ,0           ,223980.4    ,225889.9
> > > > > icelake     ,sse2        ,524351      ,3           ,7           ,17034.9     ,16910.3
> > > > > icelake     ,sse2        ,524415      ,9           ,5           ,224027.7    ,224962.5
> > > > > icelake     ,sse2        ,1048583     ,0           ,64          ,38822.3     ,39178.6
> > > > > icelake     ,sse2        ,1048591     ,0           ,3           ,41686.7     ,40247.4
> > > > > icelake     ,sse2        ,1048607     ,3           ,0           ,38814.8     ,39323.3
> > > > > icelake     ,sse2        ,1048639     ,3           ,7           ,39568.3     ,41325.7
> > > > > icelake     ,sse2        ,1048703     ,9           ,5           ,39354.2     ,39637.9
> > > > > icelake     ,sse2        ,2097159     ,0           ,64          ,84074.7     ,84543.1
> > > > > icelake     ,sse2        ,2097167     ,0           ,3           ,83665.7     ,82358.2
> > > > > icelake     ,sse2        ,2097183     ,3           ,0           ,81817.8     ,79638.9
> > > > > icelake     ,sse2        ,2097215     ,3           ,7           ,83649.1     ,83497.6
> > > > > icelake     ,sse2        ,2097279     ,9           ,5           ,80287.6     ,79980.9
> > > > > icelake     ,sse2        ,4194311     ,0           ,64          ,165409.8    ,168343.1
> > > > > icelake     ,sse2        ,4194319     ,0           ,3           ,165216.7    ,177632.0
> > > > > icelake     ,sse2        ,4194335     ,3           ,0           ,158718.7    ,160342.2
> > > > > icelake     ,sse2        ,4194367     ,3           ,7           ,167944.9    ,167204.4
> > > > > icelake     ,sse2        ,4194431     ,9           ,5           ,161530.1    ,164839.7
> > > > > icelake     ,sse2        ,8388615     ,0           ,64          ,626504.3    ,629858.5
> > > > > icelake     ,sse2        ,8388623     ,0           ,3           ,623969.5    ,631509.1
> > > > > icelake     ,sse2        ,8388639     ,3           ,0           ,599366.7    ,600016.0
> > > > > icelake     ,sse2        ,8388671     ,3           ,7           ,619964.2    ,619113.2
> > > > > icelake     ,sse2        ,8388735     ,9           ,5           ,595338.1    ,604172.4
> > > > > icelake     ,sse2        ,16777223    ,0           ,64          ,1709597.0   ,1725184.0
> > > > > icelake     ,sse2        ,16777231    ,0           ,3           ,1725452.0   ,1719746.0
> > > > > icelake     ,sse2        ,16777247    ,3           ,0           ,1614269.0   ,1607164.0
> > > > > icelake     ,sse2        ,16777279    ,3           ,7           ,1705295.0   ,1733018.0
> > > > > icelake     ,sse2        ,16777343    ,9           ,5           ,1604197.0   ,1595690.0
> > > > >
> > > > >
> > > > >  .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
> > > > >  1 file changed, 265 insertions(+), 73 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > > > index 897a3d9762..5e4a071f16 100644
> > > > > --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > > > +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
> > > > > @@ -35,7 +35,16 @@
> > > > >        __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
> > > > >     7. If size >= __x86_shared_non_temporal_threshold and there is no
> > > > >        overlap between destination and source, use non-temporal store
> > > > > -      instead of aligned store.  */
> > > > > +      instead of aligned store copying from either 2 or 4 pages at
> > > > > +      once.
> > > > > +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
> > > > > +      and source and destination do not page alias, copy from 2 pages
> > > > > +      at once using non-temporal stores. Page aliasing in this case is
> > > > > +      considered true if destination's page alignment - sources' page
> > > > > +      alignment is less than 8 * VEC_SIZE.
> > > > > +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
> > > > > +      and destination do page alias copy from 4 pages at once using
> > > > > +      non-temporal stores.  */
> > > > >
> > > > >  #include <sysdep.h>
> > > > >
> > > > > @@ -67,6 +76,34 @@
> > > > >  # endif
> > > > >  #endif
> > > > >
> > > > > +#ifndef PAGE_SIZE
> > > > > +# define PAGE_SIZE 4096
> > > > > +#endif
> > > > > +
> > > > > +#if PAGE_SIZE != 4096
> > > > > +# error Unsupported PAGE_SIZE
> > > > > +#endif
> > > > > +
> > > > > +#ifndef LOG_PAGE_SIZE
> > > > > +# define LOG_PAGE_SIZE 12
> > > > > +#endif
> > > > > +
> > > > > +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
> > > > > +# error Invalid LOG_PAGE_SIZE
> > > > > +#endif
> > > > > +
> > > > > +/* Byte per page for large_memcpy inner loop.  */
> > > > > +#if VEC_SIZE == 64
> > > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
> > > > > +#else
> > > > > +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
> > > > > +#endif
> > > > > +
> > > > > +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
> > > > > +#ifndef LOG_4X_MEMCPY_THRESH
> > > > > +# define LOG_4X_MEMCPY_THRESH 4
> > > > > +#endif
> > > > > +
> > > > >  /* Avoid short distance rep movsb only with non-SSE vector.  */
> > > > >  #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
> > > > >  # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
> > > > > @@ -106,6 +143,28 @@
> > > > >  # error Unsupported PREFETCH_SIZE!
> > > > >  #endif
> > > > >
> > > > > +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
> > > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
> > > > > +     VMOVU   (offset)base, vec0; \
> > > > > +     VMOVU   ((offset) + VEC_SIZE)base, vec1;
> > > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
> > > > > +     VMOVNT  vec0, (offset)base; \
> > > > > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base;
> > > > > +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
> > > > > +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > > > > +     VMOVU   (offset)base, vec0; \
> > > > > +     VMOVU   ((offset) + VEC_SIZE)base, vec1; \
> > > > > +     VMOVU   ((offset) + VEC_SIZE * 2)base, vec2; \
> > > > > +     VMOVU   ((offset) + VEC_SIZE * 3)base, vec3;
> > > > > +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
> > > > > +     VMOVNT  vec0, (offset)base; \
> > > > > +     VMOVNT  vec1, ((offset) + VEC_SIZE)base; \
> > > > > +     VMOVNT  vec2, ((offset) + VEC_SIZE * 2)base; \
> > > > > +     VMOVNT  vec3, ((offset) + VEC_SIZE * 3)base;
> > > > > +#else
> > > > > +# error Invalid LARGE_LOAD_SIZE
> > > > > +#endif
> > > > > +
> > > > >  #ifndef SECTION
> > > > >  # error SECTION is not defined!
> > > > >  #endif
> > > > > @@ -393,6 +452,15 @@ L(last_4x_vec):
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > >  L(more_8x_vec):
> > > > > +     /* Check if non-temporal move candidate.  */
> > > > > +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > > > +     /* Check non-temporal store threshold.  */
> > > > > +     cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > > > +     ja      L(large_memcpy_2x)
> > > > > +#endif
> > > > > +     /* Entry if rdx is greater than non-temporal threshold but there
> > > > > +       is overlap.  */
> > > > > +L(more_8x_vec_check):
> > > > >       cmpq    %rsi, %rdi
> > > > >       ja      L(more_8x_vec_backward)
> > > > >       /* Source == destination is less common.  */
> > > > > @@ -419,24 +487,21 @@ L(more_8x_vec):
> > > > >       subq    %r8, %rdi
> > > > >       /* Adjust length.  */
> > > > >       addq    %r8, %rdx
> > > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > > > -     /* Check non-temporal store threshold.  */
> > > > > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > > > -     ja      L(large_forward)
> > > > > -#endif
> > > > > +
> > > > > +     .p2align 4
> > > > >  L(loop_4x_vec_forward):
> > > > >       /* Copy 4 * VEC a time forward.  */
> > > > >       VMOVU   (%rsi), %VEC(0)
> > > > >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > > > >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > > > >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > > > -     addq    $(VEC_SIZE * 4), %rsi
> > > > > -     subq    $(VEC_SIZE * 4), %rdx
> > > > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > > > +     addq    $-(VEC_SIZE * 4), %rdx
> > > > >       VMOVA   %VEC(0), (%rdi)
> > > > >       VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > > > >       VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > > >       VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > >       cmpq    $(VEC_SIZE * 4), %rdx
> > > > >       ja      L(loop_4x_vec_forward)
> > > > >       /* Store the last 4 * VEC.  */
> > > > > @@ -470,24 +535,21 @@ L(more_8x_vec_backward):
> > > > >       subq    %r8, %r9
> > > > >       /* Adjust length.  */
> > > > >       subq    %r8, %rdx
> > > > > -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > > > -     /* Check non-temporal store threshold.  */
> > > > > -     cmp     __x86_shared_non_temporal_threshold(%rip), %RDX_LP
> > > > > -     ja      L(large_backward)
> > > > > -#endif
> > > > > +
> > > > > +     .p2align 4
> > > > >  L(loop_4x_vec_backward):
> > > > >       /* Copy 4 * VEC a time backward.  */
> > > > >       VMOVU   (%rcx), %VEC(0)
> > > > >       VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > > > >       VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > > > >       VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > > > > -     subq    $(VEC_SIZE * 4), %rcx
> > > > > -     subq    $(VEC_SIZE * 4), %rdx
> > > > > +     addq    $-(VEC_SIZE * 4), %rcx
> > > > > +     addq    $-(VEC_SIZE * 4), %rdx
> > > > >       VMOVA   %VEC(0), (%r9)
> > > > >       VMOVA   %VEC(1), -VEC_SIZE(%r9)
> > > > >       VMOVA   %VEC(2), -(VEC_SIZE * 2)(%r9)
> > > > >       VMOVA   %VEC(3), -(VEC_SIZE * 3)(%r9)
> > > > > -     subq    $(VEC_SIZE * 4), %r9
> > > > > +     addq    $-(VEC_SIZE * 4), %r9
> > > > >       cmpq    $(VEC_SIZE * 4), %rdx
> > > > >       ja      L(loop_4x_vec_backward)
> > > > >       /* Store the first 4 * VEC.  */
> > > > > @@ -500,72 +562,202 @@ L(loop_4x_vec_backward):
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > >  #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
> > > > > -L(large_forward):
> > > > > +     .p2align 4
> > > > > +L(large_memcpy_2x):
> > > > > +     /* Compute absolute value of difference between source and
> > > > > +        destination.  */
> > > > > +     movq    %rdi, %r9
> > > > > +     subq    %rsi, %r9
> > > > > +     movq    %r9, %r8
> > > > > +     leaq    -1(%r9), %rcx
> > > > > +     sarq    $63, %r8
> > > > > +     xorq    %r8, %r9
> > > > > +     subq    %r8, %r9
> > > > >       /* Don't use non-temporal store if there is overlap between
> > > > > -        destination and source since destination may be in cache
> > > > > -        when source is loaded.  */
> > > > > -     leaq    (%rdi, %rdx), %r10
> > > > > -     cmpq    %r10, %rsi
> > > > > -     jb      L(loop_4x_vec_forward)
> > > > > -L(loop_large_forward):
> > > > > +        destination and source since destination may be in cache when
> > > > > +        source is loaded.  */
> > > > > +     cmpq    %r9, %rdx
> > > > > +     ja      L(more_8x_vec_check)
> > > > > +
> > > > > +     /* Cache align destination. First store the first 64 bytes then
> > > > > +        adjust alignments.  */
> > > > > +     VMOVU   (%rsi), %VEC(8)
> > > > > +#if VEC_SIZE < 64
> > > > > +     VMOVU   VEC_SIZE(%rsi), %VEC(9)
> > > > > +#if VEC_SIZE < 32
> > > > > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(10)
> > > > > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(11)
> > > > > +#endif
> > > > > +#endif
> > > > > +     VMOVU   %VEC(8), (%rdi)
> > > > > +#if VEC_SIZE < 64
> > > > > +     VMOVU   %VEC(9), VEC_SIZE(%rdi)
> > > > > +#if VEC_SIZE < 32
> > > > > +     VMOVU   %VEC(10), (VEC_SIZE * 2)(%rdi)
> > > > > +     VMOVU   %VEC(11), (VEC_SIZE * 3)(%rdi)
> > > > > +#endif
> > > > > +#endif
> > > > > +     /* Adjust source, destination, and size.  */
> > > > > +     movq    %rdi, %r8
> > > > > +     andq    $63, %r8
> > > > > +     /* Get the negative of offset for alignment.  */
> > > > > +     subq    $64, %r8
> > > > > +     /* Adjust source.  */
> > > > > +     subq    %r8, %rsi
> > > > > +     /* Adjust destination which should be aligned now.  */
> > > > > +     subq    %r8, %rdi
> > > > > +     /* Adjust length.  */
> > > > > +     addq    %r8, %rdx
> > > > > +
> > > > > +     /* Test if source and destination addresses will alias. If they do
> > > > > +        the larger pipeline in large_memcpy_4x alleviated the
> > > > > +        performance drop.  */
> > > > > +     testl   $(PAGE_SIZE - VEC_SIZE * 8), %ecx
> > > > > +     jz      L(large_memcpy_4x)
> > > > > +
> > > > > +     movq    %rdx, %r10
> > > > > +     shrq    $LOG_4X_MEMCPY_THRESH, %r10
> > > > > +     cmp     __x86_shared_non_temporal_threshold(%rip), %r10
> > > > > +     jae     L(large_memcpy_4x)
> > > > > +
> > > > > +     /* edx will store remainder size for copying tail.  */
> > > > > +     andl    $(PAGE_SIZE * 2 - 1), %edx
> > > > > +     /* r10 stores outer loop counter.  */
> > > > > +     shrq    $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
> > > > > +     /* Copy 4x VEC at a time from 2 pages.  */
> > > > > +     .p2align 4
> > > > > +L(loop_large_memcpy_2x_outer):
> > > > > +     /* ecx stores inner loop counter.  */
> > > > > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > > > > +L(loop_large_memcpy_2x_inner):
> > > > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
> > > > > +     /* Load vectors from rsi.  */
> > > > > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > > > +     subq    $-LARGE_LOAD_SIZE, %rsi
> > > > > +     /* Non-temporal store vectors to rdi.  */
> > > > > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > > > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > > > +     subq    $-LARGE_LOAD_SIZE, %rdi
> > > > > +     decl    %ecx
> > > > > +     jnz     L(loop_large_memcpy_2x_inner)
> > > > > +     addq    $PAGE_SIZE, %rdi
> > > > > +     addq    $PAGE_SIZE, %rsi
> > > > > +     decq    %r10
> > > > > +     jne     L(loop_large_memcpy_2x_outer)
> > > > > +     sfence
> > > > > +
> > > > > +     /* Check if only last 4 loads are needed.  */
> > > > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > > > +     jbe     L(large_memcpy_2x_end)
> > > > > +
> > > > > +     /* Handle the last 2 * PAGE_SIZE bytes.  */
> > > > > +L(loop_large_memcpy_2x_tail):
> > > > >       /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > > > > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
> > > > > -     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
> > > > > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > > > +     PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > > > >       VMOVU   (%rsi), %VEC(0)
> > > > >       VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > > > >       VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > > > >       VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > > > -     addq    $PREFETCHED_LOAD_SIZE, %rsi
> > > > > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > > > > -     VMOVNT  %VEC(0), (%rdi)
> > > > > -     VMOVNT  %VEC(1), VEC_SIZE(%rdi)
> > > > > -     VMOVNT  %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > > > -     VMOVNT  %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > > > -     addq    $PREFETCHED_LOAD_SIZE, %rdi
> > > > > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > > > > -     ja      L(loop_large_forward)
> > > > > -     sfence
> > > > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > > > +     addl    $-(VEC_SIZE * 4), %edx
> > > > > +     VMOVA   %VEC(0), (%rdi)
> > > > > +     VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > > > > +     VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > > > +     VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > > > +     ja      L(loop_large_memcpy_2x_tail)
> > > > > +
> > > > > +L(large_memcpy_2x_end):
> > > > >       /* Store the last 4 * VEC.  */
> > > > > -     VMOVU   %VEC(5), (%rcx)
> > > > > -     VMOVU   %VEC(6), -VEC_SIZE(%rcx)
> > > > > -     VMOVU   %VEC(7), -(VEC_SIZE * 2)(%rcx)
> > > > > -     VMOVU   %VEC(8), -(VEC_SIZE * 3)(%rcx)
> > > > > -     /* Store the first VEC.  */
> > > > > -     VMOVU   %VEC(4), (%r11)
> > > > > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > > > > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > > > > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > > > > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > > > > +
> > > > > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > > > > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > > > > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > > > > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > -L(large_backward):
> > > > > -     /* Don't use non-temporal store if there is overlap between
> > > > > -        destination and source since destination may be in cache
> > > > > -        when source is loaded.  */
> > > > > -     leaq    (%rcx, %rdx), %r10
> > > > > -     cmpq    %r10, %r9
> > > > > -     jb      L(loop_4x_vec_backward)
> > > > > -L(loop_large_backward):
> > > > > -     /* Copy 4 * VEC a time backward with non-temporal stores.  */
> > > > > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
> > > > > -     PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
> > > > > -     VMOVU   (%rcx), %VEC(0)
> > > > > -     VMOVU   -VEC_SIZE(%rcx), %VEC(1)
> > > > > -     VMOVU   -(VEC_SIZE * 2)(%rcx), %VEC(2)
> > > > > -     VMOVU   -(VEC_SIZE * 3)(%rcx), %VEC(3)
> > > > > -     subq    $PREFETCHED_LOAD_SIZE, %rcx
> > > > > -     subq    $PREFETCHED_LOAD_SIZE, %rdx
> > > > > -     VMOVNT  %VEC(0), (%r9)
> > > > > -     VMOVNT  %VEC(1), -VEC_SIZE(%r9)
> > > > > -     VMOVNT  %VEC(2), -(VEC_SIZE * 2)(%r9)
> > > > > -     VMOVNT  %VEC(3), -(VEC_SIZE * 3)(%r9)
> > > > > -     subq    $PREFETCHED_LOAD_SIZE, %r9
> > > > > -     cmpq    $PREFETCHED_LOAD_SIZE, %rdx
> > > > > -     ja      L(loop_large_backward)
> > > > > +     .p2align 4
> > > > > +L(large_memcpy_4x):
> > > > > +     movq    %rdx, %r10
> > > > > +     /* edx will store remainder size for copying tail.  */
> > > > > +     andl    $(PAGE_SIZE * 4 - 1), %edx
> > > > > +     /* r10 stores outer loop counter.  */
> > > > > +     shrq    $(LOG_PAGE_SIZE + 2), %r10
> > > > > +     /* Copy 4x VEC at a time from 4 pages.  */
> > > > > +     .p2align 4
> > > > > +L(loop_large_memcpy_4x_outer):
> > > > > +     /* ecx stores inner loop counter.  */
> > > > > +     movl    $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
> > > > > +L(loop_large_memcpy_4x_inner):
> > > > > +     /* Only one prefetch set per page as doing 4 pages give more time
> > > > > +        for prefetcher to keep up.  */
> > > > > +     PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
> > > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
> > > > > +     PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
> > > > > +     /* Load vectors from rsi.  */
> > > > > +     LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > > > > +     LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > > > > +     subq    $-LARGE_LOAD_SIZE, %rsi
> > > > > +     /* Non-temporal store vectors to rdi.  */
> > > > > +     STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
> > > > > +     STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
> > > > > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
> > > > > +     STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
> > > > > +     subq    $-LARGE_LOAD_SIZE, %rdi
> > > > > +     decl    %ecx
> > > > > +     jnz     L(loop_large_memcpy_4x_inner)
> > > > > +     addq    $(PAGE_SIZE * 3), %rdi
> > > > > +     addq    $(PAGE_SIZE * 3), %rsi
> > > > > +     decq    %r10
> > > > > +     jne     L(loop_large_memcpy_4x_outer)
> > > > >       sfence
> > > > > -     /* Store the first 4 * VEC.  */
> > > > > -     VMOVU   %VEC(4), (%rdi)
> > > > > -     VMOVU   %VEC(5), VEC_SIZE(%rdi)
> > > > > -     VMOVU   %VEC(6), (VEC_SIZE * 2)(%rdi)
> > > > > -     VMOVU   %VEC(7), (VEC_SIZE * 3)(%rdi)
> > > > > -     /* Store the last VEC.  */
> > > > > -     VMOVU   %VEC(8), (%r11)
> > > > > +     /* Check if only last 4 loads are needed.  */
> > > > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > > > +     jbe     L(large_memcpy_4x_end)
> > > > > +
> > > > > +     /* Handle the last 4  * PAGE_SIZE bytes.  */
> > > > > +L(loop_large_memcpy_4x_tail):
> > > > > +     /* Copy 4 * VEC a time forward with non-temporal stores.  */
> > > > > +     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
> > > > > +     PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
> > > > > +     VMOVU   (%rsi), %VEC(0)
> > > > > +     VMOVU   VEC_SIZE(%rsi), %VEC(1)
> > > > > +     VMOVU   (VEC_SIZE * 2)(%rsi), %VEC(2)
> > > > > +     VMOVU   (VEC_SIZE * 3)(%rsi), %VEC(3)
> > > > > +     subq    $-(VEC_SIZE * 4), %rsi
> > > > > +     addl    $-(VEC_SIZE * 4), %edx
> > > > > +     VMOVA   %VEC(0), (%rdi)
> > > > > +     VMOVA   %VEC(1), VEC_SIZE(%rdi)
> > > > > +     VMOVA   %VEC(2), (VEC_SIZE * 2)(%rdi)
> > > > > +     VMOVA   %VEC(3), (VEC_SIZE * 3)(%rdi)
> > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > > +     cmpl    $(VEC_SIZE * 4), %edx
> > > > > +     ja      L(loop_large_memcpy_4x_tail)
> > > > > +
> > > > > +L(large_memcpy_4x_end):
> > > > > +     /* Store the last 4 * VEC.  */
> > > > > +     VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
> > > > > +     VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
> > > > > +     VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
> > > > > +     VMOVU   -VEC_SIZE(%rsi, %rdx), %VEC(3)
> > > > > +
> > > > > +     VMOVU   %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
> > > > > +     VMOVU   %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
> > > > > +     VMOVU   %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
> > > > > +     VMOVU   %VEC(3), -VEC_SIZE(%rdi, %rdx)
> > > > >       VZEROUPPER_RETURN
> > > > >  #endif
> > > > >  END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
> > > > > --
> > > > > 2.29.2
> > > > >
> > > >
> > > > LGTM.  Please commit it.
> > > >
> > > > Thanks.
> > > >
> > > >
> > > > H.J.
> >
> >
> >
> > --
> > H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
  

Patch

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 897a3d9762..5e4a071f16 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -35,7 +35,16 @@ 
       __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
    7. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - source's page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */
 
 #include <sysdep.h>
 
@@ -67,6 +76,34 @@ 
 # endif
 #endif
 
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Bytes per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
 /* Avoid short distance rep movsb only with non-SSE vector.  */
 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -106,6 +143,28 @@ 
 # error Unsupported PREFETCH_SIZE!
 #endif
 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT  vec0, (offset)base; \
+	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1; \
+	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -393,6 +452,15 @@  L(last_4x_vec):
 	VZEROUPPER_RETURN
 
 L(more_8x_vec):
+	/* Check if non-temporal move candidate.  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja	L(large_memcpy_2x)
+#endif
+	/* Entry from L(large_memcpy_2x) when rdx exceeds the non-temporal
+	   threshold but source and destination overlap.  */
+L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
@@ -419,24 +487,21 @@  L(more_8x_vec):
 	subq	%r8, %rdi
 	/* Adjust length.  */
 	addq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_forward)
-#endif
+
+	.p2align 4
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$(VEC_SIZE * 4), %rsi
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$-(VEC_SIZE * 4), %rsi
+	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%rdi)
 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	$(VEC_SIZE * 4), %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
@@ -470,24 +535,21 @@  L(more_8x_vec_backward):
 	subq	%r8, %r9
 	/* Adjust length.  */
 	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_backward)
-#endif
+
+	.p2align 4
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
 	VMOVU	(%rcx), %VEC(0)
 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$(VEC_SIZE * 4), %rcx
-	subq	$(VEC_SIZE * 4), %rdx
+	addq	$-(VEC_SIZE * 4), %rcx
+	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%r9)
 	VMOVA	%VEC(1), -VEC_SIZE(%r9)
 	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
 	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$(VEC_SIZE * 4), %r9
+	addq	$-(VEC_SIZE * 4), %r9
 	cmpq	$(VEC_SIZE * 4), %rdx
 	ja	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
@@ -500,72 +562,202 @@  L(loop_4x_vec_backward):
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+	.p2align 4
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq	%rdi, %r9
+	subq	%rsi, %r9
+	movq	%r9, %r8
+	leaq	-1(%r9), %rcx
+	sarq	$63, %r8
+	xorq	%r8, %r9
+	subq	%r8, %r9
 	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rdi, %rdx), %r10
-	cmpq    %r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq	%r9, %rdx
+	ja	L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU	(%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU	%VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	movq	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias.  If they do,
+	   the larger pipeline in large_memcpy_4x alleviates the
+	   performance drop.  */
+	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz	L(large_memcpy_4x)
+
+	movq	%rdx, %r10
+	shrq	$LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae	L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+	.p2align 4
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_2x_inner)
+	addq	$PAGE_SIZE, %rdi
+	addq	$PAGE_SIZE, %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_2x_outer)
+	sfence
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$PREFETCHED_LOAD_SIZE, %rsi
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%rdi)
-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$PREFETCHED_LOAD_SIZE, %rdi
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_forward)
-	sfence
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rcx, %rdx), %r10
-	cmpq    %r10, %r9
-	jb	L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$PREFETCHED_LOAD_SIZE, %rcx
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%r9)
-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$PREFETCHED_LOAD_SIZE, %r9
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_backward)
+	.p2align 4
+L(large_memcpy_4x):
+	movq	%rdx, %r10
+	/* edx will store the remainder size for copying the tail, i.e.
+	   the byte count in rdx masked with (PAGE_SIZE * 4 - 1).  The
+	   32-bit andl also zero-extends into rdx, which is used as a
+	   64-bit index at L(large_memcpy_4x_end).  */
+	andl	$(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores the outer loop counter: the byte count shifted
+	   right by (LOG_PAGE_SIZE + 2), i.e. the number of whole
+	   4 * PAGE_SIZE blocks (assumes PAGE_SIZE == 1 << LOG_PAGE_SIZE).
+	   NOTE(review): the outer loop decrements before testing, so r10
+	   must be non-zero here -- presumably guaranteed by the size
+	   threshold checked before jumping to this label; confirm.  */
+	shrq	$(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from each of 4 consecutive pages.  The
+	   inner loop walks rsi/rdi across one page; the outer loop then
+	   skips the 3 following pages, which were copied in the same
+	   pass, so each outer iteration advances by 4 * PAGE_SIZE.  */
+	.p2align 4
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores the inner loop counter: the number of
+	   LARGE_LOAD_SIZE steps needed to cover one page.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page, as doing 4 pages gives more
+	   time for the prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load 4 vectors from each of the 4 pages, relative to rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store the vectors to the same offsets from rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_4x_inner)
+	addq	$(PAGE_SIZE * 3), %rdi
+	addq	$(PAGE_SIZE * 3), %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_4x_outer)
 	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	/* If at most 4 * VEC remain, only the final 4 * VEC copy at
+	   L(large_memcpy_4x_end) is needed; skip the tail loop.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_4x_end)
+
+	/* Handle the remaining bytes (fewer than 4 * PAGE_SIZE).  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC at a time forward.  Loads are unaligned (VMOVU);
+	   stores use VMOVA rather than the non-temporal stores of the
+	   main loop.  NOTE(review): VMOVA presumably requires rdi to be
+	   VEC_SIZE-aligned here, established before this label; confirm.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+	/* Copy the last 4 * VEC, addressed backward from the end of the
+	   remaining region (rsi + rdx / rdi + rdx) with unaligned loads
+	   and stores.  As rdx is at most 4 * VEC_SIZE here, this may
+	   re-copy bytes that were already copied above.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))