From patchwork Tue Jun 13 09:25:05 2017 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Prakhar Bahuguna X-Patchwork-Id: 20982 Received: (qmail 51855 invoked by alias); 13 Jun 2017 09:25:16 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Unsubscribe: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Delivered-To: mailing list libc-alpha@sourceware.org Received: (qmail 51825 invoked by uid 89); 13 Jun 2017 09:25:13 -0000 Authentication-Results: sourceware.org; auth=none X-Virus-Found: No X-Spam-SWARE-Status: No, score=-25.6 required=5.0 tests=AWL, BAYES_00, GIT_PATCH_0, GIT_PATCH_1, GIT_PATCH_2, GIT_PATCH_3, KAM_STOCKGEN, RCVD_IN_DNSWL_NONE, SPF_HELO_PASS, SPF_PASS autolearn=ham version=3.3.2 spammy=imposed, 24000, 17121, 16000 X-HELO: EUR01-VE1-obe.outbound.protection.outlook.com Authentication-Results: spf=none (sender IP is ) smtp.mailfrom=Prakhar.Bahuguna@arm.com; Date: Tue, 13 Jun 2017 10:25:05 +0100 From: Prakhar Bahuguna To: libc-alpha@sourceware.org Cc: nd@arm.com Subject: [PATCH, ARM] Optimise memchr for NEON-enabled processors Message-ID: <20170613092505.rxuxbujpgfq6fbqo@e107464-lin.cambridge.arm.com> MIME-Version: 1.0 Content-Disposition: inline X-ClientProxiedBy: VI1PR09CA0047.eurprd09.prod.outlook.com (2603:10a6:802:28::15) To AM5PR0802MB2450.eurprd08.prod.outlook.com (2603:10a6:203:9f::19) X-MS-PublicTrafficType: Email X-MS-TrafficTypeDiagnostic: AM5PR0802MB2450: X-MS-Office365-Filtering-Correlation-Id: 13d9a8ae-d289-48d7-be73-08d4b23e15b8 X-MS-Office365-Filtering-HT: Tenant X-Microsoft-Antispam: UriScan:; BCL:0; PCL:0; RULEID:(22001)(48565401081)(201703131423075)(201703031133081); SRVR:AM5PR0802MB2450; X-Microsoft-Exchange-Diagnostics: 1; AM5PR0802MB2450; 
3:il3qme2z8Vle9obXt6we4wD6Q3rh4xeZBbWTu2iFitl6hpgheLl4P89/3l+6fF7xbbWac49nHxRmQsrSc1kthCgo+WrtZ6e6ZwAjgFqfjqIxMrfJ0zbKkbaWrP/oVJJdY7v8+kmGGtDAaXXiefOWxxYgQ2WmOt6gZZH2tdTEYWjbQ4L3sJ4dr5zFCKr4TVwO7gvcewrUPycPy7Oi7AA7cePGp+70yIVyR2zvZtWnvbz7hqKjrKDZ2oYUVJ/Gpd72zLPImLr9RQWAmmkhMpqrK5uQ4fr0Vj/AleiyA6oc6etNIuZ8n5HZfyQPsDTFRnG1jnTk7v8VZv2mONjjB55txGECwAwqIWDNyxfxkJrZlNc=; 25:q8jjoG9DTt4HleNhLPK1wyDQbB5Bdl1vjBMuGLU23e9O6njI7OGxks94z0xuIk0cbDdDwDieBo9Hhi1WK24Y/0yYZOXiuvL6zydqA230Ok9MYipSAHt/D04YBAuM4G6Okml6Au5tJwQej0TiXvXsFK2aUQQE/NKMzVrm+1W8FXlv5aNTE7Xy337b/HyHzGAiGxxTMIpcX8SMPuiBgjkKNgLY4Yb4V761UXNc6aLe+Dpg1QMGOByPOMrhuJmOUTl7UbQCyLRliikBFrQ6FUFh0TaOhbJ59hGQ8CPsV7iP68+GY6oenhvHRFHd97XTqlIon0cGcy/zSI0Yn8DVeGwIJNjWGVu5r1t8koSn96pYMUaXy+t+43S467dLLerlxSCq/11rUPyz929krNlaaGlqj1omRUGtlPEBQTPGzNpPd9QF+gadjSYnYcdEzf6Fg1bYztrukgL+KKiooaaPtTalP5IIvVSN4rmod5cljE+iZcTAdOv1P+vBazkHIVnVbBG9SN6M7LRnROCrFsUtF6bPbA== X-Microsoft-Exchange-Diagnostics: 1; AM5PR0802MB2450; 31:+R2KQ894la2ZQhJVvhuoUUcSrT8vKBgwEL6b+1pMK2wBpJ47t0kiyP/hluzFvcVqbr+Mz/hCFmmjB57Ai7IumQQsRcPxsM9F7+nMlCq3+A4kA1MXxZUePXaZJdM4VKhFaQhXIJARx5zTnaOc748pVUp1xXuqOcRSpeVJlQI+vvk6jOHNP19kjcvSWUhW6/otH8wa5dA9oxq7EndnOAORmfrJzQGI9Gh655TFW5VWWsHebs5W78eIYWzbb4PqXbzpg5RME8TH4fJW633Wi1XByg==; 20:K6WKegCTx1eMI4CmWml3C26OWyPIH1UHDBA+lxUkr0x9stjvNtO/NTss6S9aVyo5V7sN6vcc5mKAbJh7ZXv5jPkl5EtF17f3gOR4N5ZiYDBJZ/vqX0XIr6TO/yWHDDnRL1ZwT1T2WvQ88RGbRd+2ghx5vKjPgD6OdMJFTgGvZbU= NoDisclaimer: True X-Microsoft-Antispam-PRVS: X-Exchange-Antispam-Report-Test: UriScan:(180628864354917); X-Exchange-Antispam-Report-CFA-Test: BCL:0; PCL:0; 
RULEID:(100000700101)(100105000095)(100000701101)(100105300095)(100000702101)(100105100095)(102415395)(6040450)(601004)(2401047)(8121501046)(5005006)(93006095)(93001095)(100000703101)(100105400095)(10201501046)(3002001)(6055026)(6041248)(20161123558100)(201703131423075)(201702281528075)(201703061421075)(201703061406153)(20161123555025)(20161123564025)(20161123560025)(20161123562025)(6072148)(100000704101)(100105200095)(100000705101)(100105500095); SRVR:AM5PR0802MB2450; BCL:0; PCL:0; RULEID:(100000800101)(100110000095)(100000801101)(100110300095)(100000802101)(100110100095)(100000803101)(100110400095)(100000804101)(100110200095)(100000805101)(100110500095); SRVR:AM5PR0802MB2450; X-Microsoft-Exchange-Diagnostics: =?us-ascii?Q?1; AM5PR0802MB2450; 4:iNXoHVNstagr95wkauCbrnS+3trekXZ0CdoLVWjF?= =?us-ascii?Q?OH3FHqV9eocqw8xZjcgWKkLk2Xvlw80e6vjCVHnOO+D6Q2AXe3unIbI7jT9/?= =?us-ascii?Q?+A7TrJJgSH2x1Z8s2A51/sYXlIUhUVojF5v5nQyl39FhUFainV8tGjG3qEue?= =?us-ascii?Q?7VaRLQwjO9ILMrG6ey8XGRZ6PJuNIWh7O7R11fHq6lbE0zJt4uaySYI357MV?= =?us-ascii?Q?E3gFqbJfPVLKzxUKlDsxEO/weG8CZccuSqbplrkO+bTPHhqRhUy++h5pg77K?= =?us-ascii?Q?YJBsS7sOwxcCEFICADtN/a3rgSr+Mwya6h6DtA3Prx202XM7MZOGpnSzoaOZ?= =?us-ascii?Q?Pc2QDjPBgH609y8pp8l98sYqqgZmdKVX6xXTkaE62ulfHoSXhhAxDXQIN4GU?= =?us-ascii?Q?x25Rvdw8KbF9Gn/HQ3htHvZc2sywelGFBa9JlQNX6BfXF4qP+uWnoNdrAbeU?= =?us-ascii?Q?maIS/6ZRFMgJ1svim51kZ45Nm1i8Qzd/aJVKSNBkPUC7xcJOAU3v4b+HjOop?= =?us-ascii?Q?jGM2qfMCiW4NztPcwfZZP5oPzO4n4UdV1uUM7LH8nmPHbOmWt3g6bV9/wBvn?= =?us-ascii?Q?GMzoBxGCdQZX1qp/CEPvMTvAroHoK18K/DYY1QSv932zgB/k4R8+/9KOsiF3?= =?us-ascii?Q?r8daN67iekHY8U67YV32YJOj/C6AKLs326sQ95G5G1qECcEMuNBttU08Gl0b?= =?us-ascii?Q?bbkd3lu2UyztlRRS2ihmQJ2Nq89bGKIzESSKVh/+55qo9uHtCW2WW4eNMfrY?= =?us-ascii?Q?9MBDb7dB2qvBn1jcpgm2RvG5cce9Qa3glK8ruGPOFgR7vBMYhAWdiMYndGzW?= =?us-ascii?Q?9lZD6rZAJ7Cz9deF9oF/WQprn6nn6xl1erL+MPCWBdCelp41WHEmdVVdnHPS?= =?us-ascii?Q?LCJQeceSXct9FGNlU2GbYe7m9sqnuK0XbKTbpltAGw2E7gjxVaO2rQliQHxC?= 
=?us-ascii?Q?e0CBOVTZyJ3VzkicPanJ4mpqgDw1G27U5mHS5ejY3+9ZvnKenWCGrjXXiHj8?= =?us-ascii?Q?RDnQY3nH7Jg7bjeNCgMbErf8d/ep3Sg65a3UEDh+npc7Ho900Q26d/zzqM1G?= =?us-ascii?Q?4+HYndeui2TDwnPFebJQASN1rDe3XSg/iS5DfZdKww0VdFSsJhZyir/xrF3i?= =?us-ascii?Q?54Lr6fjY8Z219ietGxbJiIeRb1FogJh+aZY/ej94UCl763QpCCU0++7JGr/H?= =?us-ascii?Q?cMJNgUYnFWCY5fMnqqnt05+7uBZqh1FCDot0WP8wvMw1qQ+MZ+XF60Bc5Q?= =?us-ascii?Q?=3D=3D?= X-Forefront-PRVS: 0337AFFE9A X-Forefront-Antispam-Report: SFV:NSPM; SFS:(10009020)(4630300001)(6069001)(6009001)(39850400002)(39410400002)(39450400003)(39400400002)(39840400002)(39860400002)(199003)(377424004)(189002)(54534003)(54356999)(101416001)(2906002)(50986999)(5660300001)(8676002)(81166006)(81156014)(53936002)(7736002)(68736007)(86362001)(305945005)(42186005)(66066001)(4610100001)(97736004)(25786009)(76506005)(4326008)(105586002)(2476003)(106356001)(33646002)(2351001)(2361001)(72206003)(6486002)(478600001)(6496005)(189998001)(3846002)(568964002)(6116002)(38730400002)(110136004)(5890100001)(6916009)(6666003)(84326002)(1076002)(2700100001); DIR:OUT; SFP:1101; SCL:1; SRVR:AM5PR0802MB2450; H:localhost; FPR:; SPF:None; PTR:InfoNoRecords; A:1; MX:1; LANG:en; Received-SPF: None (protection.outlook.com: arm.com does not designate permitted sender hosts) X-Microsoft-Exchange-Diagnostics: =?us-ascii?Q?1; AM5PR0802MB2450; 23:luUID7ECpgUZgbe+Gh69soR4XZPYFMamAiRBGN2?= =?us-ascii?Q?IFqzD9Dup1lkn3j5NRkjuUyziFeEaXA37EKngqJbpFUxjwjVDJJJ4c/4Irr4?= =?us-ascii?Q?B13XWz6+7R3GtUng8PdhPWYf9OXJeBjdXMdUnDW2PYsMKMCOgPfVsH6fPc/x?= =?us-ascii?Q?2DAivAX+9yH32dFB2ZEQgW+ov4XgAPDOvddNj5xOcilnXVjRU4evb8L0gasQ?= =?us-ascii?Q?CABIgZyz1r1yDhFX52GZ1pbi94X0TYNXMtaBQTWRetTHSNV33e0btihBPNAt?= =?us-ascii?Q?eCHE9ZHTyMVXnJ2Yu4aXCYetMZBt11AlQfz0RPxZWoWVjoHI30N6mff9p0sZ?= =?us-ascii?Q?jN453oT8JBd3ZzzjFvMVw4DnbeNtGliEGWEOeRcabagc4Z8KxcjmbUMKPqR5?= =?us-ascii?Q?4zf70PWm/psyZhqA6/EzVnwOvNYTyC6Mk0Au6aX7iji7WtcifbtIe90wuIQk?= =?us-ascii?Q?OTZ4NZuVZEqrwItX7/oKGb/HZoRMFFROd6CnAkPxSrzrgZZrKrDtRa9+xlVb?= 
=?us-ascii?Q?ibNblH1gBZUc7sLV12EPfjwd2xO6JK8XqOGuiHMHETCJDMRyguy09YdOFCj9?= =?us-ascii?Q?dXggYGj+G336ZfBspE8e1e6kYcZ6fY3FGkDieglcEMQcGst3yrnbqyljTXug?= =?us-ascii?Q?L48CiH/GxhngghG94h7S5MiiNghbaZ2YePrhn0cCmf3vurzuJ2mP7CJ1+Bgv?= =?us-ascii?Q?em13kZx19d0YLoCwquDjwaBZeOe5VZfqHA5PKpgYuNldOBURtDNarzVmY1wS?= =?us-ascii?Q?uBu7/nSJCAqzRMUWCyZPp/EB95BBu5/o0wXfXwj2WDeSU7B9kT9jOza+/2eJ?= =?us-ascii?Q?L4e2EUH6SHdQJYoh9trkIi61utWyDB8HMO5naw7jLHxxiZH+8U8UAeFtyIt8?= =?us-ascii?Q?0IX4zfK40RvuxWbofrQxXGfj4ljxPlZmSjHmfAZR9YQ8FPRpdxFc9X+nN2VI?= =?us-ascii?Q?TtdEqC3rX1Ej6nONKkQp5aaTudgaJSWH0mNzAAilaCMVrsYXpKkik78WkrIX?= =?us-ascii?Q?Oh5pOWhmLI/W/3R5k6LwzvPgpMjadyIZVfrm67INeAiXtGEmvX5SFXVHpSvP?= =?us-ascii?Q?f18LB+vwODtXUe/4hcDklng+e42OSYHMYQkmWP9MhRg7CpzHuIPKMlkBslvc?= =?us-ascii?Q?ZGAF4SmTtUX0XKYXU81pVzPL40yFS90GH64vHMH8yzsU7PO05uQByEVQkBWd?= =?us-ascii?Q?GcCyXpQQ6LBskn84dWpzFhz8WR6lAeYFmzF/E0uszZUn8atoV6Izov4NyCsT?= =?us-ascii?Q?a2fg9cbnszTvU149GUHH5mqiHWEXq2TwbJhGvCdLaNetNzx3inAZG3o8HwmO?= =?us-ascii?Q?t7iIpJdB1Gab+zOh+J/a7lMFpEWejt/w8Er6EfWkknk5m?= X-Microsoft-Exchange-Diagnostics: 1; AM5PR0802MB2450; 6:tGPSB4gIKRmc8Pglfgz8Xque1Q3AFe0PACx8+jt2oj3Fh1wmZTDqE/O12aklYdWYVzp++jHWllQ96LY2PiP8J9AYYIftXdXl0Y2LZ0zSsfl92kFuwFPKaFkb5+ja5hiq2S+D8TNK2Ouikv9ZK2Rd3yu3M/dzhHjXiYVWn7oKteliFhgYJKHzA/Jol/h9sqihcBJsOdDuDkhzO2mE91YnownZODM1Gn29EyNRK4p/jY/aPyQwM3wP4JdL9nOaqVv685Z8Rf+iJrSGa6H2AtVqy9P+mupUeQELMz+RgmE+EI59Eix0FnDy8pdvVD8kCW7zrMT0W97oMSMOT5DRDIupvYAe96rYmdw1+O0mtSD+IwIBwvHUZVj07Ro/jQ8jynfLJMa+5E7Gz6nL7qQtNYk3cktc9cccH1Yj/DFNu1Y9Y6NzZjb4jiRy92qJLPM5RUbjDMxy/ONKKVI/DFxAcNSliZJ7zJwB5dE6cmNqzFzqtMg58lDQpqWrWEFBXR6nNRq8uK8Y0TeoJCdCaQdGFSFOE1x3NhW+rtHd0pjlyqmWqs0= X-Microsoft-Exchange-Diagnostics: 1; AM5PR0802MB2450; 
5:VxvFCcJ9nzih4wkLfYFDIMHY2AOVTZ8afld6RilaNN4zF4JYaB1cyZp9C9H2QeR9DKURdfs7aLaNOpRArBoYT14qRyUXrbimMDYuwlgByY128KxqtkuoIHG0Gn2oHdsBwVxt0jTaozgJLlb/Sz7ipuf8jyjbkicnNK05OZsMrCzBZPdagFMve19lgeg6xelf7pJ4Dq0hixJtzlZF22pNF67aMCEJTFdB88LQU6WBP1MEGX44g7mRSsI6fhlzwHEdsB0MyPLp78aCNe84wZLyJwT7RjZLhBO6Gc25Wjq93H7qiKl3bF5+xv0UolqGiVaYOyYRXJs2NWHsVFjs3TEkzkk5NqFvE82L6nc58u/QL6i7RxE/OuZJq3o2PFB/yJrI5Rfvg25wXFlRjWvoP03jar/FrcUSEBTDnNGN/BY72acT6XE0ObKm9ruopVzu5aqFKzcj3zJQjJ97GjhqZPbkAll2Ol4aB4rOJ3jsQHuIN9L6/knm99yOn1h98JRA/LE1; 24:kV/gpmoOs4lGSgEJ3vwPG3EmujAnopsSy/IDckHZHYUNtZykACgyAoZNrF/e/Wgr2wIXvU43jLlIIt86JiEDJuRpMG88h0N5AMFGf3lynCE= SpamDiagnosticOutput: 1:99 SpamDiagnosticMetadata: NSPM X-Microsoft-Exchange-Diagnostics: 1; AM5PR0802MB2450; 7:xa7y19+rl4AsH727wkNq0tASLfhOZIYbqztee04jBByIqN8JimRacqye5Qw4rbovHwluCSSEevHYOc1hZkpHadJDasJHRtQM10B8IU33giR97GVUeAqETiv+Zo3FFKA1+w+UxFW0zRTiZKSjSdJ6dVRgKwNOfrPBANmBai4/s9AE2F68RN+KwficexUDNl2XzyyOJa+tPKefHkznM6PFuEQL4AwsoHiQQveRMAvOsEoyJWs+s0atcfpbC+EWNwVYkZhHp1pJqgL7hMbUqpFfxEeB1+k7UkH1OjOOE87jDQ+JSfAQAypt5bgbM4Y2nS/21H5Glre5KAsmkiNRUTFXSQ== X-OriginatorOrg: arm.com X-MS-Exchange-CrossTenant-OriginalArrivalTime: 13 Jun 2017 09:25:08.6509 (UTC) X-MS-Exchange-CrossTenant-FromEntityHeader: Hosted X-MS-Exchange-Transport-CrossTenantHeadersStamped: AM5PR0802MB2450 This patch provides an optimised implementation of memchr using NEON instructions to improve its performance, especially with longer search regions. This gave an improvement in performance against the Thumb2+DSP optimised code, with more significant gains for larger inputs. The NEON code also wins in cases where the input is small (less than 8 bytes) by defaulting to a simple byte-by-byte search. This avoids the overhead imposed by filling two quadword registers from memory. 
Results from the glibc bench-memchr benchmark are as follows: Cortex-A53: ----------- vs simple_memchr vs __memchr_noneon Length 2048, position 32, alignment 0: 297.51% 120.87% Length 256, position 64, alignment 1: 406.70% 153.35% Length 2048, position 32, alignment 0: 292.97% 120.77% Length 256, position 64, alignment 1: 406.95% 152.61% Length 2048, position 64, alignment 0: 450.82% 138.74% Length 256, position 64, alignment 2: 408.46% 147.51% Length 2048, position 64, alignment 0: 440.32% 133.33% Length 256, position 64, alignment 2: 405.45% 147.28% Length 2048, position 128, alignment 0: 633.26% 152.98% Length 256, position 64, alignment 3: 405.71% 136.48% Length 2048, position 128, alignment 0: 634.77% 152.88% Length 256, position 64, alignment 3: 405.45% 136.39% Length 2048, position 256, alignment 0: 872.41% 178.25% Length 256, position 64, alignment 4: 408.23% 132.42% Length 2048, position 256, alignment 0: 867.49% 177.65% Length 256, position 64, alignment 4: 405.94% 130.69% Length 2048, position 512, alignment 0: 1089.90% 202.59% Length 256, position 64, alignment 5: 406.19% 129.70% Length 2048, position 512, alignment 0: 1089.43% 202.78% Length 256, position 64, alignment 5: 407.21% 130.60% Length 2048, position 1024, alignment 0: 1254.09% 221.24% Length 256, position 64, alignment 6: 407.21% 123.13% Length 2048, position 1024, alignment 0: 1253.20% 221.12% Length 256, position 64, alignment 6: 406.45% 122.58% Length 2048, position 2048, alignment 0: 1388.94% 237.35% Length 256, position 64, alignment 7: 407.21% 117.16% Length 2048, position 2048, alignment 0: 1387.31% 237.04% Length 256, position 64, alignment 7: 407.96% 325.87% Length 2, position 1, alignment 0: 118.47% 115.29% Length 2, position 1, alignment 0: 109.68% 116.13% Length 2, position 1, alignment 1: 112.50% 117.76% Length 2, position 1, alignment 1: 119.48% 114.94% Length 3, position 2, alignment 0: 116.56% 119.02% Length 3, position 2, alignment 0: 122.02% 117.26% Length 3, position 2, 
alignment 2: 123.35% 117.96% Length 3, position 2, alignment 2: 123.53% 114.71% Length 4, position 3, alignment 0: 138.59% 119.02% Length 4, position 3, alignment 0: 147.98% 124.86% Length 4, position 3, alignment 3: 113.64% 125.00% Length 4, position 3, alignment 3: 111.73% 123.46% Length 5, position 4, alignment 0: 124.34% 139.68% Length 5, position 4, alignment 0: 120.97% 124.73% Length 5, position 4, alignment 4: 118.62% 121.28% Length 5, position 4, alignment 4: 116.84% 138.42% Length 6, position 5, alignment 0: 118.36% 110.16% Length 6, position 5, alignment 0: 119.12% 111.95% Length 6, position 5, alignment 5: 118.90% 112.20% Length 6, position 5, alignment 5: 121.03% 111.90% Length 7, position 6, alignment 0: 120.51% 109.52% Length 7, position 6, alignment 0: 121.56% 110.41% Length 7, position 6, alignment 6: 120.15% 109.16% Length 7, position 6, alignment 6: 120.66% 109.59% Length 8, position 7, alignment 0: 129.26% 115.56% Length 8, position 7, alignment 0: 129.93% 115.33% Length 8, position 7, alignment 7: 140.56% 126.51% Length 8, position 7, alignment 7: 144.63% 128.51% Length 9, position 8, alignment 0: 138.01% 121.40% Length 9, position 8, alignment 0: 138.66% 122.68% Length 9, position 8, alignment 0: 135.90% 119.78% Length 9, position 8, alignment 0: 138.38% 122.51% Length 10, position 9, alignment 0: 147.78% 126.30% Length 10, position 9, alignment 0: 146.86% 125.83% Length 10, position 9, alignment 1: 165.42% 143.33% Length 10, position 9, alignment 1: 163.93% 140.16% Length 11, position 10, alignment 0: 154.61% 129.89% Length 11, position 10, alignment 0: 155.39% 133.46% Length 11, position 10, alignment 2: 173.75% 148.75% Length 11, position 10, alignment 2: 173.55% 147.11% Length 12, position 11, alignment 0: 165.54% 139.70% Length 12, position 11, alignment 0: 163.94% 137.55% Length 12, position 11, alignment 3: 180.66% 153.91% Length 12, position 11, alignment 3: 184.17% 157.08% Length 13, position 12, alignment 0: 172.12% 144.61% Length 13, 
position 12, alignment 0: 175.56% 146.62% Length 13, position 12, alignment 4: 192.89% 162.76% Length 13, position 12, alignment 4: 194.14% 163.18% Length 14, position 13, alignment 0: 180.67% 149.44% Length 14, position 13, alignment 0: 180.74% 151.11% Length 14, position 13, alignment 5: 199.59% 164.23% Length 14, position 13, alignment 5: 202.49% 166.80% Length 15, position 14, alignment 0: 189.92% 157.46% Length 15, position 14, alignment 0: 189.85% 157.14% Length 15, position 14, alignment 6: 206.88% 169.64% Length 15, position 14, alignment 6: 206.91% 169.92% Length 16, position 15, alignment 0: 197.03% 89.59% Length 16, position 15, alignment 0: 198.88% 89.55% Length 16, position 15, alignment 7: 223.01% 151.46% Length 16, position 15, alignment 7: 219.75% 148.15% Length 17, position 16, alignment 0: 203.32% 83.39% Length 17, position 16, alignment 0: 205.58% 86.25% Length 17, position 16, alignment 0: 208.24% 86.52% Length 17, position 16, alignment 0: 204.40% 83.88% Length 18, position 17, alignment 0: 213.33% 92.22% Length 18, position 17, alignment 0: 215.41% 92.86% Length 18, position 17, alignment 1: 239.09% 183.54% Length 18, position 17, alignment 1: 231.20% 175.60% Length 19, position 18, alignment 0: 219.48% 98.16% Length 19, position 18, alignment 0: 223.59% 98.88% Length 19, position 18, alignment 2: 240.00% 188.00% Length 19, position 18, alignment 2: 251.05% 194.14% Length 20, position 19, alignment 0: 230.97% 106.34% Length 20, position 19, alignment 0: 226.18% 104.00% Length 20, position 19, alignment 3: 255.33% 180.33% Length 20, position 19, alignment 3: 260.25% 182.84% Length 21, position 20, alignment 0: 239.93% 129.48% Length 21, position 20, alignment 0: 241.04% 112.31% Length 21, position 20, alignment 4: 258.87% 116.53% Length 21, position 20, alignment 4: 264.20% 116.46% Length 22, position 21, alignment 0: 245.76% 134.32% Length 22, position 21, alignment 0: 251.32% 140.00% Length 22, position 21, alignment 5: 275.62% 128.93% Length 
22, position 21, alignment 5: 276.03% 128.10% Length 23, position 22, alignment 0: 258.21% 142.16% Length 23, position 22, alignment 0: 257.09% 143.66% Length 23, position 22, alignment 6: 277.82% 150.00% Length 23, position 22, alignment 6: 285.95% 135.95% Length 24, position 23, alignment 0: 264.68% 101.12% Length 24, position 23, alignment 0: 266.67% 100.75% Length 24, position 23, alignment 7: 288.71% 158.47% Length 24, position 23, alignment 7: 290.20% 483.27% Length 25, position 24, alignment 0: 275.56% 115.04% Length 25, position 24, alignment 0: 272.86% 100.00% Length 25, position 24, alignment 0: 270.85% 97.79% Length 25, position 24, alignment 0: 278.03% 99.24% Length 26, position 25, alignment 0: 284.21% 106.77% Length 26, position 25, alignment 0: 283.21% 103.73% Length 26, position 25, alignment 1: 300.00% 160.32% Length 26, position 25, alignment 1: 314.46% 166.53% Length 27, position 26, alignment 0: 291.39% 111.24% Length 27, position 26, alignment 0: 289.96% 110.41% Length 27, position 26, alignment 2: 311.15% 190.44% Length 27, position 26, alignment 2: 324.07% 181.33% Length 28, position 27, alignment 0: 295.22% 118.75% Length 28, position 27, alignment 0: 300.75% 117.98% Length 28, position 27, alignment 3: 322.49% 187.55% Length 28, position 27, alignment 3: 335.98% 195.40% Length 29, position 28, alignment 0: 303.69% 124.72% Length 29, position 28, alignment 0: 305.58% 126.02% Length 29, position 28, alignment 4: 236.78% 91.95% Length 29, position 28, alignment 4: 238.44% 90.75% Length 30, position 29, alignment 0: 317.29% 177.07% Length 30, position 29, alignment 0: 314.13% 147.58% Length 30, position 29, alignment 5: 236.59% 94.13% Length 30, position 29, alignment 5: 244.80% 100.58% Length 31, position 30, alignment 0: 328.19% 156.02% Length 31, position 30, alignment 0: 321.03% 155.35% Length 31, position 30, alignment 6: 241.94% 100.28% Length 31, position 30, alignment 6: 246.02% 103.69% Length 32, position 31, alignment 0: 333.58% 
156.34% Length 32, position 31, alignment 0: 330.15% 125.37% Length 32, position 31, alignment 7: 252.69% 117.85% Length 32, position 31, alignment 7: 260.35% 120.70% Cortex-A57: ----------- vs simple_memchr vs __memchr_noneon Length 2048, position 32, alignment 0: 192.83% 68.30% Length 256, position 64, alignment 1: 288.73% 116.90% Length 2048, position 32, alignment 0: 185.02% 64.79% Length 256, position 64, alignment 1: 292.12% 118.28% Length 2048, position 64, alignment 0: 449.72% 157.46% Length 256, position 64, alignment 2: 293.53% 116.55% Length 2048, position 64, alignment 0: 468.39% 163.22% Length 256, position 64, alignment 2: 293.53% 115.83% Length 2048, position 128, alignment 0: 577.25% 148.24% Length 256, position 64, alignment 3: 291.43% 113.57% Length 2048, position 128, alignment 0: 645.61% 165.35% Length 256, position 64, alignment 3: 294.24% 112.95% Length 2048, position 256, alignment 0: 919.87% 189.73% Length 256, position 64, alignment 4: 292.81% 114.39% Length 2048, position 256, alignment 0: 960.55% 195.16% Length 256, position 64, alignment 4: 294.22% 114.80% Length 2048, position 512, alignment 0: 974.82% 169.75% Length 256, position 64, alignment 5: 291.43% 108.93% Length 2048, position 512, alignment 0: 977.45% 170.36% Length 256, position 64, alignment 5: 292.47% 107.89% Length 2048, position 1024, alignment 0: 1215.38% 192.88% Length 256, position 64, alignment 6: 294.93% 106.16% Length 2048, position 1024, alignment 0: 1216.78% 193.22% Length 256, position 64, alignment 6: 292.12% 103.23% Length 2048, position 2048, alignment 0: 1442.14% 215.99% Length 256, position 64, alignment 7: 285.97% 99.30% Length 2048, position 2048, alignment 0: 1449.97% 216.84% Length 256, position 64, alignment 7: 289.68% 98.93% Length 2, position 1, alignment 0: 108.96% 92.54% Length 2, position 1, alignment 0: 107.09% 97.64% Length 2, position 1, alignment 1: 108.06% 98.39% Length 2, position 1, alignment 1: 109.02% 97.54% Length 3, position 2, alignment 
0: 103.52% 133.80% Length 3, position 2, alignment 0: 108.09% 136.03% Length 3, position 2, alignment 2: 107.52% 140.60% Length 3, position 2, alignment 2: 109.09% 140.91% Length 4, position 3, alignment 0: 101.32% 92.76% Length 4, position 3, alignment 0: 109.22% 102.13% Length 4, position 3, alignment 3: 109.42% 101.45% Length 4, position 3, alignment 3: 110.22% 100.73% Length 5, position 4, alignment 0: 109.74% 101.95% Length 5, position 4, alignment 0: 110.27% 100.68% Length 5, position 4, alignment 4: 112.59% 101.40% Length 5, position 4, alignment 4: 113.38% 101.41% Length 6, position 5, alignment 0: 110.83% 100.64% Length 6, position 5, alignment 0: 111.92% 100.66% Length 6, position 5, alignment 5: 112.75% 100.00% Length 6, position 5, alignment 5: 114.19% 101.35% Length 7, position 6, alignment 0: 113.84% 101.26% Length 7, position 6, alignment 0: 113.46% 100.64% Length 7, position 6, alignment 6: 112.03% 96.84% Length 7, position 6, alignment 6: 114.19% 99.35% Length 8, position 7, alignment 0: 187.41% 122.22% Length 8, position 7, alignment 0: 191.67% 121.21% Length 8, position 7, alignment 7: 182.01% 114.39% Length 8, position 7, alignment 7: 194.62% 123.08% Length 9, position 8, alignment 0: 176.87% 126.12% Length 9, position 8, alignment 0: 178.03% 125.76% Length 9, position 8, alignment 0: 180.15% 127.48% Length 9, position 8, alignment 0: 178.20% 126.32% Length 10, position 9, alignment 0: 187.88% 178.79% Length 10, position 9, alignment 0: 187.12% 178.03% Length 10, position 9, alignment 1: 192.25% 175.19% Length 10, position 9, alignment 1: 187.88% 165.91% Length 11, position 10, alignment 0: 194.70% 172.73% Length 11, position 10, alignment 0: 194.70% 171.21% Length 11, position 10, alignment 2: 194.70% 171.97% Length 11, position 10, alignment 2: 199.22% 178.13% Length 12, position 11, alignment 0: 201.50% 175.19% Length 12, position 11, alignment 0: 203.03% 175.76% Length 12, position 11, alignment 3: 205.38% 179.23% Length 12, position 11, 
alignment 3: 205.38% 179.23% Length 13, position 12, alignment 0: 209.85% 181.06% Length 13, position 12, alignment 0: 209.09% 181.06% Length 13, position 12, alignment 4: 209.09% 180.30% Length 13, position 12, alignment 4: 214.73% 185.27% Length 14, position 13, alignment 0: 217.29% 184.21% Length 14, position 13, alignment 0: 215.79% 184.21% Length 14, position 13, alignment 5: 218.18% 186.36% Length 14, position 13, alignment 5: 224.03% 189.15% Length 15, position 14, alignment 0: 225.76% 188.64% Length 15, position 14, alignment 0: 225.00% 187.12% Length 15, position 14, alignment 6: 225.00% 187.88% Length 15, position 14, alignment 6: 230.23% 193.02% Length 16, position 15, alignment 0: 235.11% 114.50% Length 16, position 15, alignment 0: 233.33% 107.58% Length 16, position 15, alignment 7: 238.76% 132.56% Length 16, position 15, alignment 7: 237.69% 126.15% Length 17, position 16, alignment 0: 242.75% 118.32% Length 17, position 16, alignment 0: 240.15% 122.73% Length 17, position 16, alignment 0: 239.39% 112.88% Length 17, position 16, alignment 0: 241.22% 110.69% Length 18, position 17, alignment 0: 254.96% 173.28% Length 18, position 17, alignment 0: 256.49% 165.65% Length 18, position 17, alignment 1: 256.92% 163.85% Length 18, position 17, alignment 1: 256.92% 154.62% Length 19, position 18, alignment 0: 257.90% 127.07% Length 19, position 18, alignment 0: 262.60% 125.95% Length 19, position 18, alignment 2: 263.08% 156.15% Length 19, position 18, alignment 2: 266.67% 155.04% Length 20, position 19, alignment 0: 264.66% 138.35% Length 20, position 19, alignment 0: 264.66% 133.08% Length 20, position 19, alignment 3: 272.09% 164.34% Length 20, position 19, alignment 3: 270.77% 160.00% Length 21, position 20, alignment 0: 277.10% 145.80% Length 21, position 20, alignment 0: 275.76% 133.33% Length 21, position 20, alignment 4: 280.77% 147.69% Length 21, position 20, alignment 4: 279.23% 138.46% Length 22, position 21, alignment 0: 279.70% 147.37% Length 
22, position 21, alignment 0: 279.10% 138.06% Length 22, position 21, alignment 5: 283.97% 155.73% Length 22, position 21, alignment 5: 283.97% 148.85% Length 23, position 22, alignment 0: 291.67% 145.45% Length 23, position 22, alignment 0: 291.67% 143.94% Length 23, position 22, alignment 6: 293.13% 163.36% Length 23, position 22, alignment 6: 296.15% 157.69% Length 24, position 23, alignment 0: 299.25% 123.31% Length 24, position 23, alignment 0: 301.52% 120.45% Length 24, position 23, alignment 7: 306.15% 153.08% Length 24, position 23, alignment 7: 306.15% 145.38% Length 25, position 24, alignment 0: 309.09% 124.24% Length 25, position 24, alignment 0: 310.69% 119.08% Length 25, position 24, alignment 0: 304.48% 116.42% Length 25, position 24, alignment 0: 310.69% 117.56% Length 26, position 25, alignment 0: 315.91% 180.30% Length 26, position 25, alignment 0: 315.15% 171.97% Length 26, position 25, alignment 1: 320.77% 175.38% Length 26, position 25, alignment 1: 322.48% 170.54% Length 27, position 26, alignment 0: 324.24% 139.39% Length 27, position 26, alignment 0: 326.72% 132.82% Length 27, position 26, alignment 2: 329.23% 176.15% Length 27, position 26, alignment 2: 331.78% 172.87% Length 28, position 27, alignment 0: 328.57% 144.36% Length 28, position 27, alignment 0: 330.30% 137.12% Length 28, position 27, alignment 3: 333.59% 182.44% Length 28, position 27, alignment 3: 334.35% 175.57% Length 29, position 28, alignment 0: 341.98% 152.67% Length 29, position 28, alignment 0: 339.39% 143.94% Length 29, position 28, alignment 4: 268.86% 124.55% Length 29, position 28, alignment 4: 282.39% 118.87% Length 30, position 29, alignment 0: 345.86% 152.63% Length 30, position 29, alignment 0: 345.86% 146.62% Length 30, position 29, alignment 5: 285.71% 136.65% Length 30, position 29, alignment 5: 288.75% 131.25% Length 31, position 30, alignment 0: 357.58% 153.03% Length 31, position 30, alignment 0: 356.72% 150.75% Length 31, position 30, alignment 6: 286.06% 
141.21% Length 31, position 30, alignment 6: 287.80% 128.66% Length 32, position 31, alignment 0: 363.16% 130.83% Length 32, position 31, alignment 0: 365.91% 127.27% Length 32, position 31, alignment 7: 300.00% 136.02% Length 32, position 31, alignment 7: 301.88% 126.88% glibc/ChangeLog: 2017-06-13 Prakhar Bahuguna * sysdeps/arm/armv7/multiarch/Makefile: Add memchr_neon to sysdep_routines. * sysdeps/arm/armv7/multiarch/ifunc-impl-list.c: Add define for __memchr_neon. Add ifunc definitions for __memchr_neon and __memchr_noneon. * sysdeps/arm/armv7/multiarch/memchr.S: New file. * sysdeps/arm/armv7/multiarch/memchr_impl.S: Likewise. * sysdeps/arm/armv7/multiarch/memchr_neon.S: Likewise. Testing done: Ran regression tests for arm-none-linux-gnueabihf as well as a full toolchain bootstrap. Benchmark tests were ran on ARMv7-A and ARMv8-A hardware targets. diff --git a/sysdeps/arm/armv7/multiarch/Makefile b/sysdeps/arm/armv7/multiarch/Makefile index e834cc937f..9e1e61c21a 100644 --- a/sysdeps/arm/armv7/multiarch/Makefile +++ b/sysdeps/arm/armv7/multiarch/Makefile @@ -1,3 +1,3 @@ ifeq ($(subdir),string) -sysdep_routines += memcpy_neon memcpy_vfp +sysdep_routines += memcpy_neon memcpy_vfp memchr_neon endif diff --git a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c index b8094fd393..8f33156317 100644 --- a/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c +++ b/sysdeps/arm/armv7/multiarch/ifunc-impl-list.c @@ -34,6 +34,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, bool use_neon = true; #ifdef __ARM_NEON__ # define __memcpy_neon memcpy +# define __memchr_neon memchr #else use_neon = (GLRO(dl_hwcap) & HWCAP_ARM_NEON) != 0; #endif @@ -52,5 +53,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, #endif IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_arm)); + IFUNC_IMPL (i, name, memchr, + IFUNC_IMPL_ADD (array, i, memchr, use_neon, __memchr_neon) + IFUNC_IMPL_ADD (array, i, 
memchr, 1, __memchr_noneon)); + return i; } diff --git a/sysdeps/arm/armv7/multiarch/memchr.S b/sysdeps/arm/armv7/multiarch/memchr.S new file mode 100644 index 0000000000..f1d0eda9b1 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr.S @@ -0,0 +1,59 @@ +/* Multiple versions of memchr + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2013-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + /* NOTE(review): both #include lines above have lost their header names; restore them from the original submission before assembling. */ +#if IS_IN (libc) +/* Under __ARM_NEON__, memchr_neon.S defines the name memchr. */ +# ifndef __ARM_NEON__ + .text + .arm +ENTRY(memchr) + .type memchr, %gnu_indirect_function /* memchr is an indirect function: the code below selects an implementation and returns its address in r0. */ + ldr r1, .Lmemchr_noneon /* Default choice: relative offset of __memchr_noneon. */ + tst r0, #HWCAP_ARM_NEON /* NOTE(review): presumably r0 holds the hwcap word handed to the resolver -- confirm. */ + ldrne r1, .Lmemchr_neon /* NEON bit set: take the offset of __memchr_neon instead. */ +1: + add r0, r1, pc /* In ARM state pc reads 8 bytes past label 1, which cancels the "- 1b - 8" in the literals below and leaves the absolute address. */ + DO_RET(lr) + +.Lmemchr_noneon: + .long C_SYMBOL_NAME(__memchr_noneon) - 1b - 8 +.Lmemchr_neon: + .long C_SYMBOL_NAME(__memchr_neon) - 1b - 8 + +END(memchr) + +libc_hidden_builtin_def (memchr) +#endif /* Not __ARM_NEON__.
*/ +libc_hidden_def (__memchr_noneon) + /* Blank out the symbol-definition macros so the implementation included below does not repeat them. */ +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) +#undef weak_alias +#define weak_alias(x, y) +#undef libc_hidden_def +#define libc_hidden_def(name) + /* Rename memchr so the included implementation is emitted as __memchr_noneon. */ +#define memchr __memchr_noneon + +#endif + +#include "memchr_impl.S" diff --git a/sysdeps/arm/armv7/multiarch/memchr_impl.S b/sysdeps/arm/armv7/multiarch/memchr_impl.S new file mode 100644 index 0000000000..df8647ccf8 --- /dev/null +++ b/sysdeps/arm/armv7/multiarch/memchr_impl.S @@ -0,0 +1,218 @@ +/* Copyright (C) 2011-2017 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + .
*/
+
+#ifdef MEMCHR_NEON
+
+#include <sysdep.h>     /* ENTRY/END, cfi_* and libc_hidden_builtin_def.  */
+
+        .arch   armv7-a
+        .fpu    neon
+
+
+/* Arguments */
+#define srcin           r0
+#define chrin           r1
+#define cntin           r2
+
+/* Retval */
+#define result          r0      /* Live range does not overlap with srcin */
+
+/* Working registers */
+#define src             r1      /* Live range does not overlap with chrin */
+#define tmp             r3
+#define synd            r0      /* No overlap with srcin or result */
+#define soff            r12
+
+/* Working NEON registers */
+#define vrepchr         q0
+#define vdata0          q1
+#define vdata0_0        d2      /* Lower half of vdata0 */
+#define vdata0_1        d3      /* Upper half of vdata0 */
+#define vdata1          q2
+#define vdata1_0        d4      /* Lower half of vdata1 */
+#define vdata1_1        d5      /* Upper half of vdata1 */
+#define vrepmask        q3
+#define vrepmask0       d6
+#define vrepmask1       d7
+#define vend            q4
+#define vend0           d8
+#define vend1           d9
+
+/*
+ * Core algorithm:
+ *
+ * For each 32-byte chunk we calculate a 32-bit syndrome value, with one bit per
+ * byte. Each bit is set if the relevant byte matched the requested character
+ * and cleared otherwise. Since the bits in the syndrome reflect exactly the
+ * order in which things occur in the original string, counting trailing zeros
+ * allows us to identify exactly which byte has matched.
+ */
+
+/*
+ * memchr, NEON variant (emitted under whatever name the memchr macro
+ * expands to at the point of inclusion).
+ *
+ * In:   srcin (r0) = start of the region to search
+ *       chrin (r1) = byte to look for; only the low 8 bits are used
+ *       cntin (r2) = number of bytes to search
+ * Out:  result (r0) = address of the first matching byte, or 0 if none
+ * Uses: r1-r3, r12, q0-q3 and the flags as scratch.  q4 (d8/d9) is pushed
+ *       before the main loop and popped again at .Lend.
+ *
+ * Regions of 8 bytes or more are read in whole 32-byte aligned blocks, so
+ * bytes before srcin and after srcin + cntin that share a block with the
+ * region are loaded; their syndrome bits are masked off before use.
+ */
+#ifndef NO_THUMB
+        .thumb_func
+#else
+        .arm
+#endif
+        .p2align 4,,15
+
+ENTRY(memchr)
+        /* Use a simple loop if there are fewer than 8 bytes to search.  */
+        cmp     cntin, #7
+        bhi     .Llargestr
+        and     chrin, chrin, #0xff     /* ldrb zero-extends, so compare against the bare byte.  */
+
+.Lsmallstr:
+        subs    cntin, cntin, #1
+        blo     .Lnotfound      /* Return not found if reached end.  */
+        ldrb    tmp, [srcin], #1
+        cmp     tmp, chrin
+        bne     .Lsmallstr      /* Loop again if not found.  */
+        /* Otherwise fixup address and return.  */
+        sub     result, srcin, #1
+        bx      lr
+
+
+.Llargestr:
+        vdup.8  vrepchr, chrin  /* Duplicate char across all lanes.  */
+        /*
+         * Magic constant 0x8040201008040201 allows us to identify which lane
+         * matches the requested byte.
+         */
+        movw    tmp, #0x0201
+        movt    tmp, #0x0804            /* tmp  = 0x08040201 */
+        lsl     soff, tmp, #4           /* soff = 0x80402010 */
+        vmov    vrepmask0, tmp, soff
+        vmov    vrepmask1, tmp, soff
+        /* Work with aligned 32-byte chunks */
+        bic     src, srcin, #31
+        ands    soff, srcin, #31        /* soff = offset of srcin inside its block */
+        beq     .Lloopintro     /* Go straight to main loop if it's aligned.  */
+
+        /*
+         * Input string is not 32-byte aligned. We calculate the syndrome
+         * value for the aligned 32 bytes block containing the first bytes
+         * and mask the irrelevant part.
+         */
+        vld1.8  {vdata0, vdata1}, [src:256]!
+        sub     tmp, soff, #32
+        adds    cntin, cntin, tmp       /* cntin -= bytes of the region in this block;
+                                           LS afterwards means the region ends here.  */
+        vceq.i8 vdata0, vdata0, vrepchr
+        vceq.i8 vdata1, vdata1, vrepchr
+        vand    vdata0, vdata0, vrepmask
+        vand    vdata1, vdata1, vrepmask
+        vpadd.i8 vdata0_0, vdata0_0, vdata0_1
+        vpadd.i8 vdata1_0, vdata1_0, vdata1_1
+        vpadd.i8 vdata0_0, vdata0_0, vdata1_0
+        vpadd.i8 vdata0_0, vdata0_0, vdata0_0
+        vmov    synd, vdata0_0[0]
+
+        /* Clear the soff lower bits */
+        lsr     synd, synd, soff
+        lsl     synd, synd, soff
+        /* The first block can also be the last (flags are still those of
+           the adds above: none of the instructions in between set them).  */
+        bls     .Lmasklast
+        /* Have we found something already?  */
+#ifndef NO_THUMB
+        cbnz    synd, .Ltail
+#else
+        cmp     synd, #0
+        bne     .Ltail
+#endif
+
+
+.Lloopintro:
+        vpush   {vend}
+        /* 264/265 correspond to d8/d9 for q4 */
+        cfi_adjust_cfa_offset (16)
+        cfi_rel_offset (264, 0)
+        cfi_rel_offset (265, 8)
+        .p2align 3,,7
+.Lloop:
+        vld1.8  {vdata0, vdata1}, [src:256]!
+        subs    cntin, cntin, #32
+        vceq.i8 vdata0, vdata0, vrepchr
+        vceq.i8 vdata1, vdata1, vrepchr
+        /* If we're out of data we finish regardless of the result.  */
+        bls     .Lend
+        /* Use a fast check for the termination condition.  */
+        vorr    vend, vdata0, vdata1
+        vorr    vend0, vend0, vend1
+        vmov    synd, tmp, vend0
+        orrs    synd, synd, tmp         /* Updates N and Z only; C from the subs survives.  */
+        /* We're not out of data, loop if we haven't found the character.  */
+        beq     .Lloop
+
+.Lend:
+        vpop    {vend}
+        cfi_adjust_cfa_offset (-16)
+        cfi_restore (264)
+        cfi_restore (265)
+
+        /* Termination condition found, let's calculate the syndrome value.  */
+        vand    vdata0, vdata0, vrepmask
+        vand    vdata1, vdata1, vrepmask
+        vpadd.i8 vdata0_0, vdata0_0, vdata0_1
+        vpadd.i8 vdata1_0, vdata1_0, vdata1_1
+        vpadd.i8 vdata0_0, vdata0_0, vdata1_0
+        vpadd.i8 vdata0_0, vdata0_0, vdata0_0
+        vmov    synd, vdata0_0[0]
+        /*
+         * At this point HI means "more data follows this block" and LS means
+         * "this was the last block".  The flags come from the subs in the
+         * loop (on the fall-through path Z was rewritten by the orrs, which
+         * saw a non-zero value, and C is unchanged), and nothing since .Lend
+         * has modified them.
+         */
+#ifndef NO_THUMB
+        cbz     synd, .Lnotfound        /* cbz does not touch the flags.  */
+        bhi     .Ltail
+#else
+        /* No cbz in ARM state, and cmp overwrites the flags, so act on them
+           first.  Re-deriving them with "cmp cntin, #0; bhi" is wrong: it is
+           taken for the negative cntin of a partial last block and skips the
+           masking.  .Lmasklast returns 0 by itself when synd is zero.  */
+        bls     .Lmasklast
+        cmp     synd, #0
+        bne     .Ltail
+        b       .Lnotfound
+#endif
+
+
+.Lmasklast:
+        /* Clear the (-cntin) upper bits to avoid out-of-bounds matches.  */
+        neg     cntin, cntin
+        lsl     synd, synd, cntin
+        lsrs    synd, synd, cntin
+        it      eq
+        moveq   src, #0 /* If no match, set src to 0 so the retval is 0
+                           (0 - 32 + clz (0) below).  */
+
+
+.Ltail:
+        /* Count the trailing zeros using bit reversing */
+        rbit    synd, synd
+        /* Compensate the last post-increment */
+        sub     src, src, #32
+        /* Count the leading zeros */
+        clz     synd, synd
+        /* Compute the potential result and return */
+        add     result, src, synd
+        bx      lr
+
+
+.Lnotfound:
+        /* Set result to NULL if not found and return */
+        mov     result, #0
+        bx      lr
+
+END(memchr)
+libc_hidden_builtin_def (memchr)
+
+#else
+
+#include "../../armv6t2/memchr.S"
+
+#endif
diff --git a/sysdeps/arm/armv7/multiarch/memchr_neon.S b/sysdeps/arm/armv7/multiarch/memchr_neon.S
new file mode 100644
index 0000000000..ee21818f10
--- /dev/null
+++ b/sysdeps/arm/armv7/multiarch/memchr_neon.S
@@ -0,0 +1,9 @@
+#ifdef __ARM_NEON__
+/* Under __ARM_NEON__, this file defines memchr directly.  */
+libc_hidden_builtin_def (memchr)
+#else
+# define memchr __memchr_neon
+#endif
+
+#define MEMCHR_NEON
+#include "memchr_impl.S"