Patchwork [ARM] Optimise memchr for NEON-enabled processors

login
register
mail settings
Submitter Prakhar Bahuguna
Date June 13, 2017, 9:25 a.m.
Message ID <20170613092505.rxuxbujpgfq6fbqo@e107464-lin.cambridge.arm.com>
Download mbox | patch
Permalink /patch/20982/
State New
Headers show

Comments

Prakhar Bahuguna - June 13, 2017, 9:25 a.m.
This patch provides an optimised implementation of memchr using NEON
instructions to improve its performance, especially with longer search regions.
This gave an improvement in performance against the Thumb2+DSP optimised code,
with more significant gains for larger inputs. The NEON code also wins in cases
where the input is small (less than 8 bytes) by defaulting to a simple
byte-by-byte search. This avoids the overhead imposed by filling two quadword
registers from memory.

Results from the glibc bench-memchr benchmark are as follows:

Cortex-A53:
-----------

	vs simple_memchr	vs __memchr_noneon
Length 2048, position   32, alignment  0:	297.51%	120.87%
Length  256, position   64, alignment  1:	406.70%	153.35%
Length 2048, position   32, alignment  0:	292.97%	120.77%
Length  256, position   64, alignment  1:	406.95%	152.61%
Length 2048, position   64, alignment  0:	450.82%	138.74%
Length  256, position   64, alignment  2:	408.46%	147.51%
Length 2048, position   64, alignment  0:	440.32%	133.33%
Length  256, position   64, alignment  2:	405.45%	147.28%
Length 2048, position  128, alignment  0:	633.26%	152.98%
Length  256, position   64, alignment  3:	405.71%	136.48%
Length 2048, position  128, alignment  0:	634.77%	152.88%
Length  256, position   64, alignment  3:	405.45%	136.39%
Length 2048, position  256, alignment  0:	872.41%	178.25%
Length  256, position   64, alignment  4:	408.23%	132.42%
Length 2048, position  256, alignment  0:	867.49%	177.65%
Length  256, position   64, alignment  4:	405.94%	130.69%
Length 2048, position  512, alignment  0:	1089.90%	202.59%
Length  256, position   64, alignment  5:	406.19%	129.70%
Length 2048, position  512, alignment  0:	1089.43%	202.78%
Length  256, position   64, alignment  5:	407.21%	130.60%
Length 2048, position 1024, alignment  0:	1254.09%	221.24%
Length  256, position   64, alignment  6:	407.21%	123.13%
Length 2048, position 1024, alignment  0:	1253.20%	221.12%
Length  256, position   64, alignment  6:	406.45%	122.58%
Length 2048, position 2048, alignment  0:	1388.94%	237.35%
Length  256, position   64, alignment  7:	407.21%	117.16%
Length 2048, position 2048, alignment  0:	1387.31%	237.04%
Length  256, position   64, alignment  7:	407.96%	325.87%
Length    2, position    1, alignment  0:	118.47%	115.29%
Length    2, position    1, alignment  0:	109.68%	116.13%
Length    2, position    1, alignment  1:	112.50%	117.76%
Length    2, position    1, alignment  1:	119.48%	114.94%
Length    3, position    2, alignment  0:	116.56%	119.02%
Length    3, position    2, alignment  0:	122.02%	117.26%
Length    3, position    2, alignment  2:	123.35%	117.96%
Length    3, position    2, alignment  2:	123.53%	114.71%
Length    4, position    3, alignment  0:	138.59%	119.02%
Length    4, position    3, alignment  0:	147.98%	124.86%
Length    4, position    3, alignment  3:	113.64%	125.00%
Length    4, position    3, alignment  3:	111.73%	123.46%
Length    5, position    4, alignment  0:	124.34%	139.68%
Length    5, position    4, alignment  0:	120.97%	124.73%
Length    5, position    4, alignment  4:	118.62%	121.28%
Length    5, position    4, alignment  4:	116.84%	138.42%
Length    6, position    5, alignment  0:	118.36%	110.16%
Length    6, position    5, alignment  0:	119.12%	111.95%
Length    6, position    5, alignment  5:	118.90%	112.20%
Length    6, position    5, alignment  5:	121.03%	111.90%
Length    7, position    6, alignment  0:	120.51%	109.52%
Length    7, position    6, alignment  0:	121.56%	110.41%
Length    7, position    6, alignment  6:	120.15%	109.16%
Length    7, position    6, alignment  6:	120.66%	109.59%
Length    8, position    7, alignment  0:	129.26%	115.56%
Length    8, position    7, alignment  0:	129.93%	115.33%
Length    8, position    7, alignment  7:	140.56%	126.51%
Length    8, position    7, alignment  7:	144.63%	128.51%
Length    9, position    8, alignment  0:	138.01%	121.40%
Length    9, position    8, alignment  0:	138.66%	122.68%
Length    9, position    8, alignment  0:	135.90%	119.78%
Length    9, position    8, alignment  0:	138.38%	122.51%
Length   10, position    9, alignment  0:	147.78%	126.30%
Length   10, position    9, alignment  0:	146.86%	125.83%
Length   10, position    9, alignment  1:	165.42%	143.33%
Length   10, position    9, alignment  1:	163.93%	140.16%
Length   11, position   10, alignment  0:	154.61%	129.89%
Length   11, position   10, alignment  0:	155.39%	133.46%
Length   11, position   10, alignment  2:	173.75%	148.75%
Length   11, position   10, alignment  2:	173.55%	147.11%
Length   12, position   11, alignment  0:	165.54%	139.70%
Length   12, position   11, alignment  0:	163.94%	137.55%
Length   12, position   11, alignment  3:	180.66%	153.91%
Length   12, position   11, alignment  3:	184.17%	157.08%
Length   13, position   12, alignment  0:	172.12%	144.61%
Length   13, position   12, alignment  0:	175.56%	146.62%
Length   13, position   12, alignment  4:	192.89%	162.76%
Length   13, position   12, alignment  4:	194.14%	163.18%
Length   14, position   13, alignment  0:	180.67%	149.44%
Length   14, position   13, alignment  0:	180.74%	151.11%
Length   14, position   13, alignment  5:	199.59%	164.23%
Length   14, position   13, alignment  5:	202.49%	166.80%
Length   15, position   14, alignment  0:	189.92%	157.46%
Length   15, position   14, alignment  0:	189.85%	157.14%
Length   15, position   14, alignment  6:	206.88%	169.64%
Length   15, position   14, alignment  6:	206.91%	169.92%
Length   16, position   15, alignment  0:	197.03%	89.59%
Length   16, position   15, alignment  0:	198.88%	89.55%
Length   16, position   15, alignment  7:	223.01%	151.46%
Length   16, position   15, alignment  7:	219.75%	148.15%
Length   17, position   16, alignment  0:	203.32%	83.39%
Length   17, position   16, alignment  0:	205.58%	86.25%
Length   17, position   16, alignment  0:	208.24%	86.52%
Length   17, position   16, alignment  0:	204.40%	83.88%
Length   18, position   17, alignment  0:	213.33%	92.22%
Length   18, position   17, alignment  0:	215.41%	92.86%
Length   18, position   17, alignment  1:	239.09%	183.54%
Length   18, position   17, alignment  1:	231.20%	175.60%
Length   19, position   18, alignment  0:	219.48%	98.16%
Length   19, position   18, alignment  0:	223.59%	98.88%
Length   19, position   18, alignment  2:	240.00%	188.00%
Length   19, position   18, alignment  2:	251.05%	194.14%
Length   20, position   19, alignment  0:	230.97%	106.34%
Length   20, position   19, alignment  0:	226.18%	104.00%
Length   20, position   19, alignment  3:	255.33%	180.33%
Length   20, position   19, alignment  3:	260.25%	182.84%
Length   21, position   20, alignment  0:	239.93%	129.48%
Length   21, position   20, alignment  0:	241.04%	112.31%
Length   21, position   20, alignment  4:	258.87%	116.53%
Length   21, position   20, alignment  4:	264.20%	116.46%
Length   22, position   21, alignment  0:	245.76%	134.32%
Length   22, position   21, alignment  0:	251.32%	140.00%
Length   22, position   21, alignment  5:	275.62%	128.93%
Length   22, position   21, alignment  5:	276.03%	128.10%
Length   23, position   22, alignment  0:	258.21%	142.16%
Length   23, position   22, alignment  0:	257.09%	143.66%
Length   23, position   22, alignment  6:	277.82%	150.00%
Length   23, position   22, alignment  6:	285.95%	135.95%
Length   24, position   23, alignment  0:	264.68%	101.12%
Length   24, position   23, alignment  0:	266.67%	100.75%
Length   24, position   23, alignment  7:	288.71%	158.47%
Length   24, position   23, alignment  7:	290.20%	483.27%
Length   25, position   24, alignment  0:	275.56%	115.04%
Length   25, position   24, alignment  0:	272.86%	100.00%
Length   25, position   24, alignment  0:	270.85%	97.79%
Length   25, position   24, alignment  0:	278.03%	99.24%
Length   26, position   25, alignment  0:	284.21%	106.77%
Length   26, position   25, alignment  0:	283.21%	103.73%
Length   26, position   25, alignment  1:	300.00%	160.32%
Length   26, position   25, alignment  1:	314.46%	166.53%
Length   27, position   26, alignment  0:	291.39%	111.24%
Length   27, position   26, alignment  0:	289.96%	110.41%
Length   27, position   26, alignment  2:	311.15%	190.44%
Length   27, position   26, alignment  2:	324.07%	181.33%
Length   28, position   27, alignment  0:	295.22%	118.75%
Length   28, position   27, alignment  0:	300.75%	117.98%
Length   28, position   27, alignment  3:	322.49%	187.55%
Length   28, position   27, alignment  3:	335.98%	195.40%
Length   29, position   28, alignment  0:	303.69%	124.72%
Length   29, position   28, alignment  0:	305.58%	126.02%
Length   29, position   28, alignment  4:	236.78%	91.95%
Length   29, position   28, alignment  4:	238.44%	90.75%
Length   30, position   29, alignment  0:	317.29%	177.07%
Length   30, position   29, alignment  0:	314.13%	147.58%
Length   30, position   29, alignment  5:	236.59%	94.13%
Length   30, position   29, alignment  5:	244.80%	100.58%
Length   31, position   30, alignment  0:	328.19%	156.02%
Length   31, position   30, alignment  0:	321.03%	155.35%
Length   31, position   30, alignment  6:	241.94%	100.28%
Length   31, position   30, alignment  6:	246.02%	103.69%
Length   32, position   31, alignment  0:	333.58%	156.34%
Length   32, position   31, alignment  0:	330.15%	125.37%
Length   32, position   31, alignment  7:	252.69%	117.85%
Length   32, position   31, alignment  7:	260.35%	120.70%

Cortex-A57:
-----------

	vs simple_memchr	vs __memchr_noneon
Length 2048, position   32, alignment  0:	192.83%	68.30%
Length  256, position   64, alignment  1:	288.73%	116.90%
Length 2048, position   32, alignment  0:	185.02%	64.79%
Length  256, position   64, alignment  1:	292.12%	118.28%
Length 2048, position   64, alignment  0:	449.72%	157.46%
Length  256, position   64, alignment  2:	293.53%	116.55%
Length 2048, position   64, alignment  0:	468.39%	163.22%
Length  256, position   64, alignment  2:	293.53%	115.83%
Length 2048, position  128, alignment  0:	577.25%	148.24%
Length  256, position   64, alignment  3:	291.43%	113.57%
Length 2048, position  128, alignment  0:	645.61%	165.35%
Length  256, position   64, alignment  3:	294.24%	112.95%
Length 2048, position  256, alignment  0:	919.87%	189.73%
Length  256, position   64, alignment  4:	292.81%	114.39%
Length 2048, position  256, alignment  0:	960.55%	195.16%
Length  256, position   64, alignment  4:	294.22%	114.80%
Length 2048, position  512, alignment  0:	974.82%	169.75%
Length  256, position   64, alignment  5:	291.43%	108.93%
Length 2048, position  512, alignment  0:	977.45%	170.36%
Length  256, position   64, alignment  5:	292.47%	107.89%
Length 2048, position 1024, alignment  0:	1215.38%	192.88%
Length  256, position   64, alignment  6:	294.93%	106.16%
Length 2048, position 1024, alignment  0:	1216.78%	193.22%
Length  256, position   64, alignment  6:	292.12%	103.23%
Length 2048, position 2048, alignment  0:	1442.14%	215.99%
Length  256, position   64, alignment  7:	285.97%	99.30%
Length 2048, position 2048, alignment  0:	1449.97%	216.84%
Length  256, position   64, alignment  7:	289.68%	98.93%
Length    2, position    1, alignment  0:	108.96%	92.54%
Length    2, position    1, alignment  0:	107.09%	97.64%
Length    2, position    1, alignment  1:	108.06%	98.39%
Length    2, position    1, alignment  1:	109.02%	97.54%
Length    3, position    2, alignment  0:	103.52%	133.80%
Length    3, position    2, alignment  0:	108.09%	136.03%
Length    3, position    2, alignment  2:	107.52%	140.60%
Length    3, position    2, alignment  2:	109.09%	140.91%
Length    4, position    3, alignment  0:	101.32%	92.76%
Length    4, position    3, alignment  0:	109.22%	102.13%
Length    4, position    3, alignment  3:	109.42%	101.45%
Length    4, position    3, alignment  3:	110.22%	100.73%
Length    5, position    4, alignment  0:	109.74%	101.95%
Length    5, position    4, alignment  0:	110.27%	100.68%
Length    5, position    4, alignment  4:	112.59%	101.40%
Length    5, position    4, alignment  4:	113.38%	101.41%
Length    6, position    5, alignment  0:	110.83%	100.64%
Length    6, position    5, alignment  0:	111.92%	100.66%
Length    6, position    5, alignment  5:	112.75%	100.00%
Length    6, position    5, alignment  5:	114.19%	101.35%
Length    7, position    6, alignment  0:	113.84%	101.26%
Length    7, position    6, alignment  0:	113.46%	100.64%
Length    7, position    6, alignment  6:	112.03%	96.84%
Length    7, position    6, alignment  6:	114.19%	99.35%
Length    8, position    7, alignment  0:	187.41%	122.22%
Length    8, position    7, alignment  0:	191.67%	121.21%
Length    8, position    7, alignment  7:	182.01%	114.39%
Length    8, position    7, alignment  7:	194.62%	123.08%
Length    9, position    8, alignment  0:	176.87%	126.12%
Length    9, position    8, alignment  0:	178.03%	125.76%
Length    9, position    8, alignment  0:	180.15%	127.48%
Length    9, position    8, alignment  0:	178.20%	126.32%
Length   10, position    9, alignment  0:	187.88%	178.79%
Length   10, position    9, alignment  0:	187.12%	178.03%
Length   10, position    9, alignment  1:	192.25%	175.19%
Length   10, position    9, alignment  1:	187.88%	165.91%
Length   11, position   10, alignment  0:	194.70%	172.73%
Length   11, position   10, alignment  0:	194.70%	171.21%
Length   11, position   10, alignment  2:	194.70%	171.97%
Length   11, position   10, alignment  2:	199.22%	178.13%
Length   12, position   11, alignment  0:	201.50%	175.19%
Length   12, position   11, alignment  0:	203.03%	175.76%
Length   12, position   11, alignment  3:	205.38%	179.23%
Length   12, position   11, alignment  3:	205.38%	179.23%
Length   13, position   12, alignment  0:	209.85%	181.06%
Length   13, position   12, alignment  0:	209.09%	181.06%
Length   13, position   12, alignment  4:	209.09%	180.30%
Length   13, position   12, alignment  4:	214.73%	185.27%
Length   14, position   13, alignment  0:	217.29%	184.21%
Length   14, position   13, alignment  0:	215.79%	184.21%
Length   14, position   13, alignment  5:	218.18%	186.36%
Length   14, position   13, alignment  5:	224.03%	189.15%
Length   15, position   14, alignment  0:	225.76%	188.64%
Length   15, position   14, alignment  0:	225.00%	187.12%
Length   15, position   14, alignment  6:	225.00%	187.88%
Length   15, position   14, alignment  6:	230.23%	193.02%
Length   16, position   15, alignment  0:	235.11%	114.50%
Length   16, position   15, alignment  0:	233.33%	107.58%
Length   16, position   15, alignment  7:	238.76%	132.56%
Length   16, position   15, alignment  7:	237.69%	126.15%
Length   17, position   16, alignment  0:	242.75%	118.32%
Length   17, position   16, alignment  0:	240.15%	122.73%
Length   17, position   16, alignment  0:	239.39%	112.88%
Length   17, position   16, alignment  0:	241.22%	110.69%
Length   18, position   17, alignment  0:	254.96%	173.28%
Length   18, position   17, alignment  0:	256.49%	165.65%
Length   18, position   17, alignment  1:	256.92%	163.85%
Length   18, position   17, alignment  1:	256.92%	154.62%
Length   19, position   18, alignment  0:	257.90%	127.07%
Length   19, position   18, alignment  0:	262.60%	125.95%
Length   19, position   18, alignment  2:	263.08%	156.15%
Length   19, position   18, alignment  2:	266.67%	155.04%
Length   20, position   19, alignment  0:	264.66%	138.35%
Length   20, position   19, alignment  0:	264.66%	133.08%
Length   20, position   19, alignment  3:	272.09%	164.34%
Length   20, position   19, alignment  3:	270.77%	160.00%
Length   21, position   20, alignment  0:	277.10%	145.80%
Length   21, position   20, alignment  0:	275.76%	133.33%
Length   21, position   20, alignment  4:	280.77%	147.69%
Length   21, position   20, alignment  4:	279.23%	138.46%
Length   22, position   21, alignment  0:	279.70%	147.37%
Length   22, position   21, alignment  0:	279.10%	138.06%
Length   22, position   21, alignment  5:	283.97%	155.73%
Length   22, position   21, alignment  5:	283.97%	148.85%
Length   23, position   22, alignment  0:	291.67%	145.45%
Length   23, position   22, alignment  0:	291.67%	143.94%
Length   23, position   22, alignment  6:	293.13%	163.36%
Length   23, position   22, alignment  6:	296.15%	157.69%
Length   24, position   23, alignment  0:	299.25%	123.31%
Length   24, position   23, alignment  0:	301.52%	120.45%
Length   24, position   23, alignment  7:	306.15%	153.08%
Length   24, position   23, alignment  7:	306.15%	145.38%
Length   25, position   24, alignment  0:	309.09%	124.24%
Length   25, position   24, alignment  0:	310.69%	119.08%
Length   25, position   24, alignment  0:	304.48%	116.42%
Length   25, position   24, alignment  0:	310.69%	117.56%
Length   26, position   25, alignment  0:	315.91%	180.30%
Length   26, position   25, alignment  0:	315.15%	171.97%
Length   26, position   25, alignment  1:	320.77%	175.38%
Length   26, position   25, alignment  1:	322.48%	170.54%
Length   27, position   26, alignment  0:	324.24%	139.39%
Length   27, position   26, alignment  0:	326.72%	132.82%
Length   27, position   26, alignment  2:	329.23%	176.15%
Length   27, position   26, alignment  2:	331.78%	172.87%
Length   28, position   27, alignment  0:	328.57%	144.36%
Length   28, position   27, alignment  0:	330.30%	137.12%
Length   28, position   27, alignment  3:	333.59%	182.44%
Length   28, position   27, alignment  3:	334.35%	175.57%
Length   29, position   28, alignment  0:	341.98%	152.67%
Length   29, position   28, alignment  0:	339.39%	143.94%
Length   29, position   28, alignment  4:	268.86%	124.55%
Length   29, position   28, alignment  4:	282.39%	118.87%
Length   30, position   29, alignment  0:	345.86%	152.63%
Length   30, position   29, alignment  0:	345.86%	146.62%
Length   30, position   29, alignment  5:	285.71%	136.65%
Length   30, position   29, alignment  5:	288.75%	131.25%
Length   31, position   30, alignment  0:	357.58%	153.03%
Length   31, position   30, alignment  0:	356.72%	150.75%
Length   31, position   30, alignment  6:	286.06%	141.21%
Length   31, position   30, alignment  6:	287.80%	128.66%
Length   32, position   31, alignment  0:	363.16%	130.83%
Length   32, position   31, alignment  0:	365.91%	127.27%
Length   32, position   31, alignment  7:	300.00%	136.02%
Length   32, position   31, alignment  7:	301.88%	126.88%

glibc/ChangeLog:

2017-06-13  Prakhar Bahuguna  <prakhar.bahuguna@arm.com>

	* sysdeps/arm/armv7/multiarch/Makefile: Add memchr_neon to
	sysdep_routines.
	* sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Add define for
	__memchr_neon.
	Add ifunc definitions for __memchr_neon and __memchr_noneon.
	* sysdeps/arm/armv7/multiarch/memchr.S: New file.
	* sysdeps/arm/armv7/multiarch/memchr_impl.S: Likewise.
	* sysdeps/arm/armv7/multiarch/memchr_neon.S: Likewise.


Testing done: Ran regression tests for arm-none-linux-gnueabihf as well as a
full toolchain bootstrap. Benchmark tests were run on ARMv7-A and ARMv8-A
hardware targets.
Joseph Myers - June 13, 2017, 2:22 p.m.
On Tue, 13 Jun 2017, Prakhar Bahuguna wrote:

> Testing done: Ran regression tests for arm-none-linux-gnueabihf as well as a
> full toolchain bootstrap. Benchmark tests were ran on ARMv7-A and ARMv8-A
> hardware targets.

It's important to test string functions for both endiannesses, since they 
can easily have endian-specific bugs.  You should be able to run string/ 
tests for big-endian with QEMU userspace emulation, for example, if you 
don't have an actual big-endian system to run tests on (some parts of the 
glibc testsuite, such as threading tests, may have problems with QEMU 
userspace emulation, but for string tests it should be fine).
Prakhar Bahuguna - June 13, 2017, 4:11 p.m.
On 13/06/2017 14:22:52, Joseph Myers wrote:
> On Tue, 13 Jun 2017, Prakhar Bahuguna wrote:
> 
> > Testing done: Ran regression tests for arm-none-linux-gnueabihf as well as a
> > full toolchain bootstrap. Benchmark tests were ran on ARMv7-A and ARMv8-A
> > hardware targets.
> 
> It's important to test string functions for both endiannesses, since they 
> can easily have endian-specific bugs.  You should be able to run string/ 
> tests for big-endian with QEMU userspace emulation, for example, if you 
> don't have an actual big-endian system to run tests on (some parts of the 
> glibc testsuite, such as threading tests, may have problems with QEMU 
> userspace emulation, but for string tests it should be fine).
> 
> -- 
> Joseph S. Myers
> joseph@codesourcery.com

This implementation was tested for big-endian targets as well (via qemu) and it
also passed regression tests for armeb-none-linux-gnueabihf. No bootstrap or
benchmarking was performed for big-endian though.
Joseph Myers - June 27, 2017, 3:45 p.m.
Thanks, committed.

Patch

diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile
index e834cc937f..9e1e61c21a 100644
--- a/sysdeps/arm/armv7/multiarch/Makefile
+++ b/sysdeps/arm/armv7/multiarch/Makefile
@@ -1,3 +1,3 @@ 
 ifeq ($(subdir),string)
-sysdep_routines += memcpy_neon memcpy_vfp
+sysdep_routines += memcpy_neon memcpy_vfp memchr_neon
 endif
diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
index b8094fd393..8f33156317 100644
--- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
+++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c
@@ -34,6 +34,7 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   bool use_neon = true;
 #ifdef __ARM_NEON__
 # define __memcpy_neon	memcpy
+# define __memchr_neon	memchr
 #else
   use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
 #endif
@@ -52,5 +53,9 @@  __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
 
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon));
+
   return i;
 }
diff --git a/sysdeps/arm/armv7/multiarch/memchr.S b/sysdeps/arm/armv7/multiarch/memchr.S
new file mode 100644
index 0000000000..f1d0eda9b1
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr.S
@@ -0,0 +1,59 @@ 
+/* Multiple versions of memchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2013-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <rtld-global-offsets.h>
+
+#if IS_IN (libc)
+/* Under __ARM_NEON__, memchr_neon.S defines the name memchr.  */
+# ifndef __ARM_NEON__
+	.text
+	.arm
+ENTRY(memchr)
+	.type	memchr, %gnu_indirect_function
+	ldr	r1, .Lmemchr_noneon	/* Default: offset of __memchr_noneon.  */
+	tst	r0, #HWCAP_ARM_NEON	/* r0 is presumably the hwcap word -- TODO confirm.  */
+	ldrne	r1, .Lmemchr_neon	/* NEON bit set: pick __memchr_neon instead.  */
+1:
+	add	r0, r1, pc	/* pc reads as 1b + 8 in ARM state, hence the "- 8" below.  */
+	DO_RET(lr)
+
+.Lmemchr_noneon:
+	.long	C_SYMBOL_NAME(__memchr_noneon) - 1b - 8
+.Lmemchr_neon:
+	.long	C_SYMBOL_NAME(__memchr_neon) - 1b - 8
+
+END(memchr)
+
+libc_hidden_builtin_def (memchr)
+#endif  /* Not __ARM_NEON__.  */
+libc_hidden_def (__memchr_noneon)
+
+#undef libc_hidden_builtin_def	/* These macros expand to nothing from here on...  */
+#define libc_hidden_builtin_def(name)
+#undef weak_alias
+#define weak_alias(x, y)
+#undef libc_hidden_def
+#define libc_hidden_def(name)	/* ...including inside memchr_impl.S below.  */
+
+#define memchr __memchr_noneon	/* The included code is assembled under this name.  */
+
+#endif  /* IS_IN (libc).  */
+
+#include "memchr_impl.S"
diff --git a/sysdeps/arm/armv7/multiarch/memchr_impl.S b/sysdeps/arm/armv7/multiarch/memchr_impl.S
new file mode 100644
index 0000000000..df8647ccf8
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_impl.S
@@ -0,0 +1,218 @@ 
+/* Copyright (C) 2011-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef MEMCHR_NEON
+
+#include <sysdep.h>
+
+	.arch	armv7-a
+	.fpu	neon
+
+
+/* Arguments */
+#define srcin		r0
+#define chrin		r1
+#define cntin		r2
+
+/* Retval */
+#define result		r0	/* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src		r1	/* Live range does not overlap with chrin */
+#define tmp		r3
+#define synd		r0	/* No overlap with srcin or result */
+#define soff		r12
+
+/* Working NEON registers */
+#define vrepchr		q0
+#define vdata0		q1
+#define vdata0_0	d2	/* Lower half of vdata0 */
+#define vdata0_1	d3	/* Upper half of vdata0 */
+#define vdata1		q2
+#define vdata1_0	d4	/* Lower half of vdata1 */
+#define vdata1_1	d5	/* Upper half of vdata1 */
+#define vrepmask	q3
+#define vrepmask0	d6
+#define vrepmask1	d7
+#define vend		q4
+#define vend0		d8
+#define vend1		d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which things occur in the original string, counting trailing zeros
+ * allows us to identify exactly which byte has matched.
+ */
+
+#ifndef NO_THUMB
+	.thumb_func
+#else
+	.arm
+#endif
+	.p2align 4,,15
+
+ENTRY(memchr)
+	/* Use a simple loop if there are fewer than 8 bytes to search.  */
+	cmp	cntin, #7
+	bhi	.Llargestr
+	and	chrin, chrin, #0xff
+
+.Lsmallstr:
+	subs	cntin, cntin, #1
+	blo	.Lnotfound	/* Return not found if reached end.  */
+	ldrb	tmp, [srcin], #1
+	cmp	tmp, chrin
+	bne	.Lsmallstr	/* Loop again if not found.  */
+	/* Otherwise undo the post-increment of srcin and return.  */
+	sub	result, srcin, #1
+	bx	lr
+
+
+.Llargestr:
+	vdup.8	vrepchr, chrin	/* Duplicate char across all lanes. */
+	/*
+	 * Magic constant 0x8040201008040201 allows us to identify which lane
+	 * matches the requested byte (tmp = low word, soff = tmp << 4 = high).
+	 */
+	movw	tmp, #0x0201
+	movt	tmp, #0x0804
+	lsl	soff, tmp, #4
+	vmov	vrepmask0, tmp, soff
+	vmov	vrepmask1, tmp, soff
+	/* Work with aligned 32-byte chunks: src = block base, soff = offset. */
+	bic	src, srcin, #31
+	ands	soff, srcin, #31
+	beq	.Lloopintro	/* Go straight to main loop if it's aligned. */
+
+	/*
+	 * Input string is not 32-byte aligned. We calculate the syndrome
+	 * value for the aligned 32 bytes block containing the first bytes
+	 * and mask the irrelevant part.
+	 */
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	sub		tmp, soff, #32
+	adds		cntin, cntin, tmp
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+
+	/* Clear the soff lower bits (bytes located before srcin) */
+	lsr		synd, synd, soff
+	lsl		synd, synd, soff
+	/* The first block can also be the last (flags from adds above) */
+	bls		.Lmasklast
+	/* Have we found something already? */
+#ifndef NO_THUMB
+	cbnz		synd, .Ltail
+#else
+	cmp		synd, #0
+	bne		.Ltail
+#endif
+
+
+.Lloopintro:
+	vpush	{vend}
+	/* 264/265 correspond to d8/d9 for q4 */
+	cfi_adjust_cfa_offset (16)
+	cfi_rel_offset (264, 0)
+	cfi_rel_offset (265, 8)
+	.p2align 3,,7
+.Lloop:
+	vld1.8		{vdata0, vdata1}, [src:256]!
+	subs		cntin, cntin, #32
+	vceq.i8		vdata0, vdata0, vrepchr
+	vceq.i8		vdata1, vdata1, vrepchr
+	/* If we're out of data we finish regardless of the result. */
+	bls		.Lend
+	/* Use a fast check for the termination condition. */
+	vorr		vend, vdata0, vdata1
+	vorr		vend0, vend0, vend1
+	vmov		synd, tmp, vend0
+	orrs		synd, synd, tmp
+	/* We're not out of data, loop if we haven't found the character. */
+	beq		.Lloop
+
+.Lend:
+	vpop		{vend}
+	cfi_adjust_cfa_offset (-16)
+	cfi_restore (264)
+	cfi_restore (265)
+
+	/* Termination condition found, let's calculate the syndrome value. */
+	vand		vdata0, vdata0, vrepmask
+	vand		vdata1, vdata1, vrepmask
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_1
+	vpadd.i8	vdata1_0, vdata1_0, vdata1_1
+	vpadd.i8	vdata0_0, vdata0_0, vdata1_0
+	vpadd.i8	vdata0_0, vdata0_0, vdata0_0
+	vmov		synd, vdata0_0[0]
+#ifndef NO_THUMB
+	cbz		synd, .Lnotfound
+	bhi		.Ltail	/* Uses the condition code from
+				   subs cntin, cntin, #32 above.  */
+#else
+	/* teq #0 keeps C from the subs above; cmp cntin, #0 always set it.  */
+	teq		synd, #0
+	beq		.Lnotfound
+	bcs		.Ltail	/* No borrow: the whole block was in bounds.  */
+#endif
+
+
+.Lmasklast:
+	/* Clear the (-cntin) upper bits to avoid out-of-bounds matches. */
+	neg	cntin, cntin
+	lsl	synd, synd, cntin
+	lsrs	synd, synd, cntin
+	it	eq
+	moveq	src, #0	/* If no match, set src to 0 so the retval is 0. */
+
+
+.Ltail:
+	/* Count the trailing zeros using bit reversing */
+	rbit	synd, synd
+	/* Compensate the last post-increment */
+	sub	src, src, #32
+	/* Count the leading zeros */
+	clz	synd, synd
+	/* Compute the potential result and return */
+	add	result, src, synd
+	bx	lr
+
+
+.Lnotfound:
+	/* Set result to NULL if not found and return */
+	mov	result, #0
+	bx	lr
+
+END(memchr)
+libc_hidden_builtin_def (memchr)
+
+#else
+
+#include "../../armv6t2/memchr.S"
+
+#endif
diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S
new file mode 100644
index 0000000000..ee21818f10
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S
@@ -0,0 +1,9 @@ 
+#ifdef __ARM_NEON__
+/* Under __ARM_NEON__, this file defines memchr directly.  */
+libc_hidden_builtin_def (memchr)
+#else
+# define memchr __memchr_neon	/* Otherwise the code is named __memchr_neon.  */
+#endif
+
+#define MEMCHR_NEON	/* Selects the NEON branch of memchr_impl.S.  */
+#include "memchr_impl.S"