This patch provides an optimised implementation of memchr using NEON
instructions to improve its performance, especially with longer search regions.
This gave an improvement in performance against the Thumb2+DSP optimised code,
with more significant gains for larger inputs. The NEON code also wins in cases
where the input is small (fewer than 8 bytes) by defaulting to a simple
byte-by-byte search. This avoids the overhead imposed by filling two quadword
registers from memory.
Results from the glibc bench-memchr benchmark are as follows:
Cortex-A53:
-----------
vs simple_memchr vs __memchr_noneon
Length 2048, position 32, alignment 0: 297.51% 120.87%
Length 256, position 64, alignment 1: 406.70% 153.35%
Length 2048, position 32, alignment 0: 292.97% 120.77%
Length 256, position 64, alignment 1: 406.95% 152.61%
Length 2048, position 64, alignment 0: 450.82% 138.74%
Length 256, position 64, alignment 2: 408.46% 147.51%
Length 2048, position 64, alignment 0: 440.32% 133.33%
Length 256, position 64, alignment 2: 405.45% 147.28%
Length 2048, position 128, alignment 0: 633.26% 152.98%
Length 256, position 64, alignment 3: 405.71% 136.48%
Length 2048, position 128, alignment 0: 634.77% 152.88%
Length 256, position 64, alignment 3: 405.45% 136.39%
Length 2048, position 256, alignment 0: 872.41% 178.25%
Length 256, position 64, alignment 4: 408.23% 132.42%
Length 2048, position 256, alignment 0: 867.49% 177.65%
Length 256, position 64, alignment 4: 405.94% 130.69%
Length 2048, position 512, alignment 0: 1089.90% 202.59%
Length 256, position 64, alignment 5: 406.19% 129.70%
Length 2048, position 512, alignment 0: 1089.43% 202.78%
Length 256, position 64, alignment 5: 407.21% 130.60%
Length 2048, position 1024, alignment 0: 1254.09% 221.24%
Length 256, position 64, alignment 6: 407.21% 123.13%
Length 2048, position 1024, alignment 0: 1253.20% 221.12%
Length 256, position 64, alignment 6: 406.45% 122.58%
Length 2048, position 2048, alignment 0: 1388.94% 237.35%
Length 256, position 64, alignment 7: 407.21% 117.16%
Length 2048, position 2048, alignment 0: 1387.31% 237.04%
Length 256, position 64, alignment 7: 407.96% 325.87%
Length 2, position 1, alignment 0: 118.47% 115.29%
Length 2, position 1, alignment 0: 109.68% 116.13%
Length 2, position 1, alignment 1: 112.50% 117.76%
Length 2, position 1, alignment 1: 119.48% 114.94%
Length 3, position 2, alignment 0: 116.56% 119.02%
Length 3, position 2, alignment 0: 122.02% 117.26%
Length 3, position 2, alignment 2: 123.35% 117.96%
Length 3, position 2, alignment 2: 123.53% 114.71%
Length 4, position 3, alignment 0: 138.59% 119.02%
Length 4, position 3, alignment 0: 147.98% 124.86%
Length 4, position 3, alignment 3: 113.64% 125.00%
Length 4, position 3, alignment 3: 111.73% 123.46%
Length 5, position 4, alignment 0: 124.34% 139.68%
Length 5, position 4, alignment 0: 120.97% 124.73%
Length 5, position 4, alignment 4: 118.62% 121.28%
Length 5, position 4, alignment 4: 116.84% 138.42%
Length 6, position 5, alignment 0: 118.36% 110.16%
Length 6, position 5, alignment 0: 119.12% 111.95%
Length 6, position 5, alignment 5: 118.90% 112.20%
Length 6, position 5, alignment 5: 121.03% 111.90%
Length 7, position 6, alignment 0: 120.51% 109.52%
Length 7, position 6, alignment 0: 121.56% 110.41%
Length 7, position 6, alignment 6: 120.15% 109.16%
Length 7, position 6, alignment 6: 120.66% 109.59%
Length 8, position 7, alignment 0: 129.26% 115.56%
Length 8, position 7, alignment 0: 129.93% 115.33%
Length 8, position 7, alignment 7: 140.56% 126.51%
Length 8, position 7, alignment 7: 144.63% 128.51%
Length 9, position 8, alignment 0: 138.01% 121.40%
Length 9, position 8, alignment 0: 138.66% 122.68%
Length 9, position 8, alignment 0: 135.90% 119.78%
Length 9, position 8, alignment 0: 138.38% 122.51%
Length 10, position 9, alignment 0: 147.78% 126.30%
Length 10, position 9, alignment 0: 146.86% 125.83%
Length 10, position 9, alignment 1: 165.42% 143.33%
Length 10, position 9, alignment 1: 163.93% 140.16%
Length 11, position 10, alignment 0: 154.61% 129.89%
Length 11, position 10, alignment 0: 155.39% 133.46%
Length 11, position 10, alignment 2: 173.75% 148.75%
Length 11, position 10, alignment 2: 173.55% 147.11%
Length 12, position 11, alignment 0: 165.54% 139.70%
Length 12, position 11, alignment 0: 163.94% 137.55%
Length 12, position 11, alignment 3: 180.66% 153.91%
Length 12, position 11, alignment 3: 184.17% 157.08%
Length 13, position 12, alignment 0: 172.12% 144.61%
Length 13, position 12, alignment 0: 175.56% 146.62%
Length 13, position 12, alignment 4: 192.89% 162.76%
Length 13, position 12, alignment 4: 194.14% 163.18%
Length 14, position 13, alignment 0: 180.67% 149.44%
Length 14, position 13, alignment 0: 180.74% 151.11%
Length 14, position 13, alignment 5: 199.59% 164.23%
Length 14, position 13, alignment 5: 202.49% 166.80%
Length 15, position 14, alignment 0: 189.92% 157.46%
Length 15, position 14, alignment 0: 189.85% 157.14%
Length 15, position 14, alignment 6: 206.88% 169.64%
Length 15, position 14, alignment 6: 206.91% 169.92%
Length 16, position 15, alignment 0: 197.03% 89.59%
Length 16, position 15, alignment 0: 198.88% 89.55%
Length 16, position 15, alignment 7: 223.01% 151.46%
Length 16, position 15, alignment 7: 219.75% 148.15%
Length 17, position 16, alignment 0: 203.32% 83.39%
Length 17, position 16, alignment 0: 205.58% 86.25%
Length 17, position 16, alignment 0: 208.24% 86.52%
Length 17, position 16, alignment 0: 204.40% 83.88%
Length 18, position 17, alignment 0: 213.33% 92.22%
Length 18, position 17, alignment 0: 215.41% 92.86%
Length 18, position 17, alignment 1: 239.09% 183.54%
Length 18, position 17, alignment 1: 231.20% 175.60%
Length 19, position 18, alignment 0: 219.48% 98.16%
Length 19, position 18, alignment 0: 223.59% 98.88%
Length 19, position 18, alignment 2: 240.00% 188.00%
Length 19, position 18, alignment 2: 251.05% 194.14%
Length 20, position 19, alignment 0: 230.97% 106.34%
Length 20, position 19, alignment 0: 226.18% 104.00%
Length 20, position 19, alignment 3: 255.33% 180.33%
Length 20, position 19, alignment 3: 260.25% 182.84%
Length 21, position 20, alignment 0: 239.93% 129.48%
Length 21, position 20, alignment 0: 241.04% 112.31%
Length 21, position 20, alignment 4: 258.87% 116.53%
Length 21, position 20, alignment 4: 264.20% 116.46%
Length 22, position 21, alignment 0: 245.76% 134.32%
Length 22, position 21, alignment 0: 251.32% 140.00%
Length 22, position 21, alignment 5: 275.62% 128.93%
Length 22, position 21, alignment 5: 276.03% 128.10%
Length 23, position 22, alignment 0: 258.21% 142.16%
Length 23, position 22, alignment 0: 257.09% 143.66%
Length 23, position 22, alignment 6: 277.82% 150.00%
Length 23, position 22, alignment 6: 285.95% 135.95%
Length 24, position 23, alignment 0: 264.68% 101.12%
Length 24, position 23, alignment 0: 266.67% 100.75%
Length 24, position 23, alignment 7: 288.71% 158.47%
Length 24, position 23, alignment 7: 290.20% 483.27%
Length 25, position 24, alignment 0: 275.56% 115.04%
Length 25, position 24, alignment 0: 272.86% 100.00%
Length 25, position 24, alignment 0: 270.85% 97.79%
Length 25, position 24, alignment 0: 278.03% 99.24%
Length 26, position 25, alignment 0: 284.21% 106.77%
Length 26, position 25, alignment 0: 283.21% 103.73%
Length 26, position 25, alignment 1: 300.00% 160.32%
Length 26, position 25, alignment 1: 314.46% 166.53%
Length 27, position 26, alignment 0: 291.39% 111.24%
Length 27, position 26, alignment 0: 289.96% 110.41%
Length 27, position 26, alignment 2: 311.15% 190.44%
Length 27, position 26, alignment 2: 324.07% 181.33%
Length 28, position 27, alignment 0: 295.22% 118.75%
Length 28, position 27, alignment 0: 300.75% 117.98%
Length 28, position 27, alignment 3: 322.49% 187.55%
Length 28, position 27, alignment 3: 335.98% 195.40%
Length 29, position 28, alignment 0: 303.69% 124.72%
Length 29, position 28, alignment 0: 305.58% 126.02%
Length 29, position 28, alignment 4: 236.78% 91.95%
Length 29, position 28, alignment 4: 238.44% 90.75%
Length 30, position 29, alignment 0: 317.29% 177.07%
Length 30, position 29, alignment 0: 314.13% 147.58%
Length 30, position 29, alignment 5: 236.59% 94.13%
Length 30, position 29, alignment 5: 244.80% 100.58%
Length 31, position 30, alignment 0: 328.19% 156.02%
Length 31, position 30, alignment 0: 321.03% 155.35%
Length 31, position 30, alignment 6: 241.94% 100.28%
Length 31, position 30, alignment 6: 246.02% 103.69%
Length 32, position 31, alignment 0: 333.58% 156.34%
Length 32, position 31, alignment 0: 330.15% 125.37%
Length 32, position 31, alignment 7: 252.69% 117.85%
Length 32, position 31, alignment 7: 260.35% 120.70%
Cortex-A57:
-----------
vs simple_memchr vs __memchr_noneon
Length 2048, position 32, alignment 0: 192.83% 68.30%
Length 256, position 64, alignment 1: 288.73% 116.90%
Length 2048, position 32, alignment 0: 185.02% 64.79%
Length 256, position 64, alignment 1: 292.12% 118.28%
Length 2048, position 64, alignment 0: 449.72% 157.46%
Length 256, position 64, alignment 2: 293.53% 116.55%
Length 2048, position 64, alignment 0: 468.39% 163.22%
Length 256, position 64, alignment 2: 293.53% 115.83%
Length 2048, position 128, alignment 0: 577.25% 148.24%
Length 256, position 64, alignment 3: 291.43% 113.57%
Length 2048, position 128, alignment 0: 645.61% 165.35%
Length 256, position 64, alignment 3: 294.24% 112.95%
Length 2048, position 256, alignment 0: 919.87% 189.73%
Length 256, position 64, alignment 4: 292.81% 114.39%
Length 2048, position 256, alignment 0: 960.55% 195.16%
Length 256, position 64, alignment 4: 294.22% 114.80%
Length 2048, position 512, alignment 0: 974.82% 169.75%
Length 256, position 64, alignment 5: 291.43% 108.93%
Length 2048, position 512, alignment 0: 977.45% 170.36%
Length 256, position 64, alignment 5: 292.47% 107.89%
Length 2048, position 1024, alignment 0: 1215.38% 192.88%
Length 256, position 64, alignment 6: 294.93% 106.16%
Length 2048, position 1024, alignment 0: 1216.78% 193.22%
Length 256, position 64, alignment 6: 292.12% 103.23%
Length 2048, position 2048, alignment 0: 1442.14% 215.99%
Length 256, position 64, alignment 7: 285.97% 99.30%
Length 2048, position 2048, alignment 0: 1449.97% 216.84%
Length 256, position 64, alignment 7: 289.68% 98.93%
Length 2, position 1, alignment 0: 108.96% 92.54%
Length 2, position 1, alignment 0: 107.09% 97.64%
Length 2, position 1, alignment 1: 108.06% 98.39%
Length 2, position 1, alignment 1: 109.02% 97.54%
Length 3, position 2, alignment 0: 103.52% 133.80%
Length 3, position 2, alignment 0: 108.09% 136.03%
Length 3, position 2, alignment 2: 107.52% 140.60%
Length 3, position 2, alignment 2: 109.09% 140.91%
Length 4, position 3, alignment 0: 101.32% 92.76%
Length 4, position 3, alignment 0: 109.22% 102.13%
Length 4, position 3, alignment 3: 109.42% 101.45%
Length 4, position 3, alignment 3: 110.22% 100.73%
Length 5, position 4, alignment 0: 109.74% 101.95%
Length 5, position 4, alignment 0: 110.27% 100.68%
Length 5, position 4, alignment 4: 112.59% 101.40%
Length 5, position 4, alignment 4: 113.38% 101.41%
Length 6, position 5, alignment 0: 110.83% 100.64%
Length 6, position 5, alignment 0: 111.92% 100.66%
Length 6, position 5, alignment 5: 112.75% 100.00%
Length 6, position 5, alignment 5: 114.19% 101.35%
Length 7, position 6, alignment 0: 113.84% 101.26%
Length 7, position 6, alignment 0: 113.46% 100.64%
Length 7, position 6, alignment 6: 112.03% 96.84%
Length 7, position 6, alignment 6: 114.19% 99.35%
Length 8, position 7, alignment 0: 187.41% 122.22%
Length 8, position 7, alignment 0: 191.67% 121.21%
Length 8, position 7, alignment 7: 182.01% 114.39%
Length 8, position 7, alignment 7: 194.62% 123.08%
Length 9, position 8, alignment 0: 176.87% 126.12%
Length 9, position 8, alignment 0: 178.03% 125.76%
Length 9, position 8, alignment 0: 180.15% 127.48%
Length 9, position 8, alignment 0: 178.20% 126.32%
Length 10, position 9, alignment 0: 187.88% 178.79%
Length 10, position 9, alignment 0: 187.12% 178.03%
Length 10, position 9, alignment 1: 192.25% 175.19%
Length 10, position 9, alignment 1: 187.88% 165.91%
Length 11, position 10, alignment 0: 194.70% 172.73%
Length 11, position 10, alignment 0: 194.70% 171.21%
Length 11, position 10, alignment 2: 194.70% 171.97%
Length 11, position 10, alignment 2: 199.22% 178.13%
Length 12, position 11, alignment 0: 201.50% 175.19%
Length 12, position 11, alignment 0: 203.03% 175.76%
Length 12, position 11, alignment 3: 205.38% 179.23%
Length 12, position 11, alignment 3: 205.38% 179.23%
Length 13, position 12, alignment 0: 209.85% 181.06%
Length 13, position 12, alignment 0: 209.09% 181.06%
Length 13, position 12, alignment 4: 209.09% 180.30%
Length 13, position 12, alignment 4: 214.73% 185.27%
Length 14, position 13, alignment 0: 217.29% 184.21%
Length 14, position 13, alignment 0: 215.79% 184.21%
Length 14, position 13, alignment 5: 218.18% 186.36%
Length 14, position 13, alignment 5: 224.03% 189.15%
Length 15, position 14, alignment 0: 225.76% 188.64%
Length 15, position 14, alignment 0: 225.00% 187.12%
Length 15, position 14, alignment 6: 225.00% 187.88%
Length 15, position 14, alignment 6: 230.23% 193.02%
Length 16, position 15, alignment 0: 235.11% 114.50%
Length 16, position 15, alignment 0: 233.33% 107.58%
Length 16, position 15, alignment 7: 238.76% 132.56%
Length 16, position 15, alignment 7: 237.69% 126.15%
Length 17, position 16, alignment 0: 242.75% 118.32%
Length 17, position 16, alignment 0: 240.15% 122.73%
Length 17, position 16, alignment 0: 239.39% 112.88%
Length 17, position 16, alignment 0: 241.22% 110.69%
Length 18, position 17, alignment 0: 254.96% 173.28%
Length 18, position 17, alignment 0: 256.49% 165.65%
Length 18, position 17, alignment 1: 256.92% 163.85%
Length 18, position 17, alignment 1: 256.92% 154.62%
Length 19, position 18, alignment 0: 257.90% 127.07%
Length 19, position 18, alignment 0: 262.60% 125.95%
Length 19, position 18, alignment 2: 263.08% 156.15%
Length 19, position 18, alignment 2: 266.67% 155.04%
Length 20, position 19, alignment 0: 264.66% 138.35%
Length 20, position 19, alignment 0: 264.66% 133.08%
Length 20, position 19, alignment 3: 272.09% 164.34%
Length 20, position 19, alignment 3: 270.77% 160.00%
Length 21, position 20, alignment 0: 277.10% 145.80%
Length 21, position 20, alignment 0: 275.76% 133.33%
Length 21, position 20, alignment 4: 280.77% 147.69%
Length 21, position 20, alignment 4: 279.23% 138.46%
Length 22, position 21, alignment 0: 279.70% 147.37%
Length 22, position 21, alignment 0: 279.10% 138.06%
Length 22, position 21, alignment 5: 283.97% 155.73%
Length 22, position 21, alignment 5: 283.97% 148.85%
Length 23, position 22, alignment 0: 291.67% 145.45%
Length 23, position 22, alignment 0: 291.67% 143.94%
Length 23, position 22, alignment 6: 293.13% 163.36%
Length 23, position 22, alignment 6: 296.15% 157.69%
Length 24, position 23, alignment 0: 299.25% 123.31%
Length 24, position 23, alignment 0: 301.52% 120.45%
Length 24, position 23, alignment 7: 306.15% 153.08%
Length 24, position 23, alignment 7: 306.15% 145.38%
Length 25, position 24, alignment 0: 309.09% 124.24%
Length 25, position 24, alignment 0: 310.69% 119.08%
Length 25, position 24, alignment 0: 304.48% 116.42%
Length 25, position 24, alignment 0: 310.69% 117.56%
Length 26, position 25, alignment 0: 315.91% 180.30%
Length 26, position 25, alignment 0: 315.15% 171.97%
Length 26, position 25, alignment 1: 320.77% 175.38%
Length 26, position 25, alignment 1: 322.48% 170.54%
Length 27, position 26, alignment 0: 324.24% 139.39%
Length 27, position 26, alignment 0: 326.72% 132.82%
Length 27, position 26, alignment 2: 329.23% 176.15%
Length 27, position 26, alignment 2: 331.78% 172.87%
Length 28, position 27, alignment 0: 328.57% 144.36%
Length 28, position 27, alignment 0: 330.30% 137.12%
Length 28, position 27, alignment 3: 333.59% 182.44%
Length 28, position 27, alignment 3: 334.35% 175.57%
Length 29, position 28, alignment 0: 341.98% 152.67%
Length 29, position 28, alignment 0: 339.39% 143.94%
Length 29, position 28, alignment 4: 268.86% 124.55%
Length 29, position 28, alignment 4: 282.39% 118.87%
Length 30, position 29, alignment 0: 345.86% 152.63%
Length 30, position 29, alignment 0: 345.86% 146.62%
Length 30, position 29, alignment 5: 285.71% 136.65%
Length 30, position 29, alignment 5: 288.75% 131.25%
Length 31, position 30, alignment 0: 357.58% 153.03%
Length 31, position 30, alignment 0: 356.72% 150.75%
Length 31, position 30, alignment 6: 286.06% 141.21%
Length 31, position 30, alignment 6: 287.80% 128.66%
Length 32, position 31, alignment 0: 363.16% 130.83%
Length 32, position 31, alignment 0: 365.91% 127.27%
Length 32, position 31, alignment 7: 300.00% 136.02%
Length 32, position 31, alignment 7: 301.88% 126.88%
glibc/ChangeLog:
2017-06-13 Prakhar Bahuguna <prakhar.bahuguna@arm.com>
* sysdeps/arm/armv7/multiarch/Makefile: Add memchr_neon to
sysdep_routines.
* sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Add define for
__memchr_neon.
Add ifunc definitions for __memchr_neon and __memchr_noneon.
* sysdeps/arm/armv7/multiarch/memchr.S: New file.
* sysdeps/arm/armv7/multiarch/memchr_impl.S: Likewise.
* sysdeps/arm/armv7/multiarch/memchr_neon.S: Likewise.
Testing done: Ran regression tests for arm-none-linux-gnueabihf as well as a
full toolchain bootstrap. Benchmark tests were run on ARMv7-A and ARMv8-A
hardware targets.
@@ -1,3 +1,3 @@
ifeq ($(subdir),string)
-sysdep_routines += memcpy_neon memcpy_vfp
+sysdep_routines += memcpy_neon memcpy_vfp memchr_neon
endif
@@ -34,6 +34,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
bool use_neon = true;
#ifdef __ARM_NEON__
# define __memcpy_neon memcpy
+# define __memchr_neon memchr
#else
use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0;
#endif
@@ -52,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm));
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_noneon));
+
return i;
}
new file mode 100644
@@ -0,0 +1,59 @@
+/* Multiple versions of memchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2013-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <rtld-global-offsets.h>
+
+#if IS_IN (libc)
+/* Under __ARM_NEON__, memchr_neon.S defines the name memchr. */
+# ifndef __ARM_NEON__
+ .text
+ .arm
+/* IFUNC resolver for memchr. Returns in r0 the address of the
+ implementation to bind: __memchr_neon when the HWCAP_ARM_NEON bit is set
+ in r0 on entry, __memchr_noneon otherwise. (r0 is presumably the hwcap
+ word handed to IFUNC resolvers by the dynamic linker -- confirm against
+ the resolver calling convention.) Clobbers r1 and the flags. */
+ENTRY(memchr)
+ .type memchr, %gnu_indirect_function
+ ldr r1, .Lmemchr_noneon /* Default: offset of __memchr_noneon. */
+ tst r0, #HWCAP_ARM_NEON
+ ldrne r1, .Lmemchr_neon /* NEON bit set: offset of __memchr_neon. */
+1:
+ /* In ARM state pc reads as the address of this instruction plus 8,
+ i.e. 1b + 8, which is what the "- 1b - 8" in the literals below
+ cancels out. The sum is therefore the absolute address of the
+ selected symbol, computed without an absolute relocation. */
+ add r0, r1, pc
+ DO_RET(lr)
+
+/* PC-relative offsets of the two candidate implementations. */
+.Lmemchr_noneon:
+ .long C_SYMBOL_NAME(__memchr_noneon) - 1b - 8
+.Lmemchr_neon:
+ .long C_SYMBOL_NAME(__memchr_neon) - 1b - 8
+
+END(memchr)
+
+libc_hidden_builtin_def (memchr)
+#endif /* Not __ARM_NEON__. */
+libc_hidden_def (__memchr_noneon)
+
+/* From here on the symbol-management macros expand to nothing, so that
+ any use of them inside the implementation included below emits no
+ further aliases or hidden definitions. */
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+#undef weak_alias
+#define weak_alias(x, y)
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+/* Assemble the included implementation under the name __memchr_noneon
+ instead of memchr. */
+#define memchr __memchr_noneon
+
+#endif /* IS_IN (libc) */
+
+/* MEMCHR_NEON is not defined in this file, so memchr_impl.S takes its
+ #else branch and includes ../../armv6t2/memchr.S. */
+#include "memchr_impl.S"
new file mode 100644
@@ -0,0 +1,218 @@
+/* Copyright (C) 2011-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef MEMCHR_NEON
+
+#include <sysdep.h>
+
+ .arch armv7-a
+ .fpu neon
+
+
+/* Arguments */
+#define srcin r0 /* Start of the region to search */
+#define chrin r1 /* Byte value to look for */
+#define cntin r2 /* Number of bytes to search */
+
+/* Retval */
+#define result r0 /* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src r1 /* Live range does not overlap with chrin */
+#define tmp r3
+#define synd r0 /* No overlap with srcin or result */
+#define soff r12 /* Offset of srcin within its 32-byte chunk */
+
+/* Working NEON registers */
+#define vrepchr q0 /* chrin replicated into every byte lane */
+#define vdata0 q1 /* First 16 bytes of the chunk, then its compare result */
+#define vdata0_0 d2 /* Lower half of vdata0 */
+#define vdata0_1 d3 /* Upper half of vdata0 */
+#define vdata1 q2 /* Second 16 bytes of the chunk, then its compare result */
+#define vdata1_0 d4 /* Lower half of vdata1 */
+#define vdata1_1 d5 /* Upper half of vdata1 */
+#define vrepmask q3 /* 0x8040201008040201 in both halves */
+#define vrepmask0 d6 /* Lower half of vrepmask */
+#define vrepmask1 d7 /* Upper half of vrepmask */
+#define vend q4 /* Scratch for the main loop's quick match test; saved with vpush before use */
+#define vend0 d8 /* Lower half of vend */
+#define vend1 d9 /* Upper half of vend */
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which things occur in the original string, counting trailing zeros
+ * allows us to identify exactly which byte has matched.
+ */
+
+/* Select the instruction set for the function that follows: with
+ NO_THUMB defined it is assembled as ARM code, otherwise its entry symbol
+ is marked as a Thumb function. */
+#ifndef NO_THUMB
+ .thumb_func
+#else
+ .arm
+#endif
+ /* Align the entry point to 16 bytes, using at most 15 bytes of padding. */
+ .p2align 4,,15
+
+/* void *memchr (const void *srcin, int chrin, size_t cntin)
+
+ Return the address of the first byte equal to (chrin & 0xff) among the
+ cntin bytes starting at srcin, or 0 if there is none.
+
+ In: r0 = srcin, r1 = chrin, r2 = cntin.
+ Out: r0 = result.
+ Uses: r1-r3, r12, q0-q3 and the flags. q4 (d8/d9) is pushed before the
+ main loop and popped again before the function returns.
+
+ Buffers of 8 bytes or more are read in whole, 32-byte aligned chunks, so
+ bytes before srcin and after srcin + cntin within those chunks are
+ loaded; matches found there are masked out of the syndrome. */
+ENTRY(memchr)
+ /* Use a simple loop if there are fewer than 8 bytes to search. */
+ cmp cntin, #7
+ bhi .Llargestr
+ /* ldrb zero-extends, so only the low byte of chrin may take part in
+ the comparison below. */
+ and chrin, chrin, #0xff
+
+.Lsmallstr:
+ subs cntin, cntin, #1
+ blo .Lnotfound /* Return not found if reached end. */
+ ldrb tmp, [srcin], #1
+ cmp tmp, chrin
+ bne .Lsmallstr /* Loop again if not found. */
+ /* Otherwise fixup address and return. */
+ sub result, srcin, #1
+ bx lr
+
+
+.Llargestr:
+ vdup.8 vrepchr, chrin /* Duplicate char across all lanes. */
+ /*
+ * Magic constant 0x8040201008040201 allows us to identify which lane
+ * matches the requested byte. tmp holds its low word and soff
+ * (tmp << 4) its high word; both halves of vrepmask receive it.
+ */
+ movw tmp, #0x0201
+ movt tmp, #0x0804
+ lsl soff, tmp, #4
+ vmov vrepmask0, tmp, soff
+ vmov vrepmask1, tmp, soff
+ /* Work with aligned 32-byte chunks */
+ bic src, srcin, #31
+ ands soff, srcin, #31
+ beq .Lloopintro /* Go straight to main loop if it's aligned. */
+
+ /*
+ * Input string is not 32-byte aligned. We calculate the syndrome
+ * value for the aligned 32 bytes block containing the first bytes
+ * and mask the irrelevant part.
+ */
+ vld1.8 {vdata0, vdata1}, [src:256]!
+ /* cntin -= (32 - soff), i.e. the bytes still to search after this
+ block. The flags set by the adds are consumed by the bls further
+ down; nothing in between modifies them. */
+ sub tmp, soff, #32
+ adds cntin, cntin, tmp
+ vceq.i8 vdata0, vdata0, vrepchr
+ vceq.i8 vdata1, vdata1, vrepchr
+ vand vdata0, vdata0, vrepmask
+ vand vdata1, vdata1, vrepmask
+ /* Three rounds of pairwise adds fold the 32 masked bytes into one
+ 32-bit syndrome, one bit per input byte. */
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_1
+ vpadd.i8 vdata1_0, vdata1_0, vdata1_1
+ vpadd.i8 vdata0_0, vdata0_0, vdata1_0
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_0
+ vmov synd, vdata0_0[0]
+
+ /* Clear the soff lower bits */
+ lsr synd, synd, soff
+ lsl synd, synd, soff
+ /* The first block can also be the last */
+ bls .Lmasklast
+ /* Have we found something already? */
+#ifndef NO_THUMB
+ cbnz synd, .Ltail
+#else
+ cmp synd, #0
+ bne .Ltail
+#endif
+
+
+.Lloopintro:
+ /* vend is q4, which is saved here and restored at .Lend. */
+ vpush {vend}
+ /* 264/265 correspond to d8/d9 for q4 */
+ cfi_adjust_cfa_offset (16)
+ cfi_rel_offset (264, 0)
+ cfi_rel_offset (265, 8)
+ .p2align 3,,7
+.Lloop:
+ vld1.8 {vdata0, vdata1}, [src:256]!
+ subs cntin, cntin, #32
+ vceq.i8 vdata0, vdata0, vrepchr
+ vceq.i8 vdata1, vdata1, vrepchr
+ /* If we're out of data we finish regardless of the result. */
+ bls .Lend
+ /* Use a fast check for the termination condition. */
+ vorr vend, vdata0, vdata1
+ vorr vend0, vend0, vend1
+ vmov synd, tmp, vend0
+ orrs synd, synd, tmp
+ /* We're not out of data, loop if we haven't found the character. */
+ beq .Lloop
+
+.Lend:
+ vpop {vend}
+ cfi_adjust_cfa_offset (-16)
+ cfi_restore (264)
+ cfi_restore (265)
+
+ /* Termination condition found, let's calculate the syndrome value. */
+ vand vdata0, vdata0, vrepmask
+ vand vdata1, vdata1, vrepmask
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_1
+ vpadd.i8 vdata1_0, vdata1_0, vdata1_1
+ vpadd.i8 vdata0_0, vdata0_0, vdata1_0
+ vpadd.i8 vdata0_0, vdata0_0, vdata0_0
+ vmov synd, vdata0_0[0]
+#ifndef NO_THUMB
+ cbz synd, .Lnotfound
+ bhi .Ltail /* Uses the condition code from
+ subs cntin, cntin, #32 above. */
+#else
+ /* ARM state has no cbz, and a cmp would destroy the carry left by
+ "subs cntin, cntin, #32". teq against #0 updates N and Z only, and
+ the orrs in the loop leaves C alone as well, so C still tells the
+ two cases apart: set means the chunk lay entirely inside the
+ buffer, clear means it ran past the end and needs .Lmasklast.
+ Testing cntin against zero with bhi instead is wrong: on a partial
+ last chunk cntin has wrapped to a large unsigned value, so the
+ branch is always taken and a match beyond the end of the buffer
+ would be returned. */
+ teq synd, #0
+ beq .Lnotfound
+ bcs .Ltail
+#endif
+
+
+.Lmasklast:
+ /* Clear the (-cntin) upper bits to avoid out-of-bounds matches.
+ cntin is zero or negative here: its magnitude is the number of
+ bytes of this chunk that lie past the end of the buffer. */
+ neg cntin, cntin
+ lsl synd, synd, cntin
+ lsrs synd, synd, cntin
+ it eq
+ moveq src, #0 /* If no match, set src to 0 so the retval is 0. */
+
+
+.Ltail:
+ /* Count the trailing zeros using bit reversing */
+ rbit synd, synd
+ /* Compensate the last post-increment */
+ sub src, src, #32
+ /* Count the leading zeros. If .Lmasklast cleared synd this yields
+ 32, which cancels the -32 above and leaves the 0 placed in src. */
+ clz synd, synd
+ /* Compute the potential result and return */
+ add result, src, synd
+ bx lr
+
+
+.Lnotfound:
+ /* Set result to NULL if not found and return */
+ mov result, #0
+ bx lr
+
+END(memchr)
+libc_hidden_builtin_def (memchr)
+
+#else
+
+#include "../../armv6t2/memchr.S"
+
+#endif
new file mode 100644
@@ -0,0 +1,9 @@
+/* NEON build of memchr. The code itself lives in memchr_impl.S; this
+ file only chooses the symbol name it is assembled under and selects the
+ NEON variant. */
+#ifdef __ARM_NEON__
+/* Under __ARM_NEON__, this file defines memchr directly. */
+libc_hidden_builtin_def (memchr)
+#else
+/* Otherwise the implementation is assembled as __memchr_neon. */
+# define memchr __memchr_neon
+#endif
+
+/* MEMCHR_NEON makes memchr_impl.S assemble its NEON implementation rather
+ than include ../../armv6t2/memchr.S. */
+#define MEMCHR_NEON
+#include "memchr_impl.S"