aarch64: MTE compatible strchrnul

Message ID gkrv9k88oxu.fsf@arm.com
State Committed
Commit f7de454f20c05a748b5d421ed22d96a5232b6093
Headers
Series aarch64: MTE compatible strchrnul |

Commit Message

Andrea Corallo June 3, 2020, 9:43 a.m. UTC
  Hi all,

I'd like to submit this patch introducing an Arm MTE compatible
strchrnul implementation.

Follows a performance comparison of the strchrnul benchmark run on 
Cortex-A72, Cortex-A53, Neoverse N1.

| length | alignment | perf-uplift A72 | perf-uplift A53 | perf-uplift N1 |
|--------+-----------+-----------------+-----------------+----------------|
|     32 |         0 |           1.16x |           1.07x |          1.35x |
|     32 |         1 |           1.25x |           1.16x |          1.15x |
|     64 |         0 |           1.26x |           0.97x |          1.20x |
|     64 |         2 |           1.35x |           1.04x |          1.30x |
|    128 |         0 |           1.12x |           0.84x |          1.22x |
|    128 |         3 |           1.25x |           0.87x |          1.30x |
|    256 |         0 |           1.14x |           0.84x |          1.16x |
|    256 |         4 |           1.24x |           0.81x |          1.16x |
|    512 |         0 |           1.15x |           0.80x |          1.13x |
|    512 |         5 |           1.17x |           0.81x |          1.14x |
|   1024 |         0 |           1.14x |           0.78x |          1.08x |
|   1024 |         6 |           1.03x |           0.78x |          1.10x |
|   2048 |         0 |           1.12x |           0.76x |          1.08x |
|   2048 |         7 |           1.14x |           0.77x |          1.09x |
|     64 |         1 |           1.35x |           1.04x |          1.37x |
|     64 |         1 |           1.36x |           1.04x |          1.37x |
|     64 |         2 |           1.36x |           1.04x |          1.37x |
|     64 |         2 |           1.37x |           1.04x |          1.38x |
|     64 |         3 |           1.38x |           1.04x |          1.36x |
|     64 |         3 |           1.40x |           1.04x |          1.36x |
|     64 |         4 |           1.41x |           1.04x |          1.36x |
|     64 |         4 |           1.36x |           1.04x |          1.36x |
|     64 |         5 |           1.34x |           1.04x |          1.40x |
|     64 |         5 |           1.35x |           1.04x |          1.36x |
|     64 |         6 |           1.34x |           1.04x |          1.37x |
|     64 |         6 |           1.41x |           1.04x |          1.37x |
|     64 |         7 |           1.39x |           1.04x |          1.36x |
|     64 |         7 |           1.34x |           1.04x |          1.37x |
|      0 |         0 |           1.18x |           1.63x |          1.66x |
|      0 |         0 |           1.18x |           1.63x |          1.66x |
|      1 |         0 |           1.18x |           1.63x |          1.66x |
|      1 |         0 |           1.18x |           1.63x |          1.67x |
|      2 |         0 |           1.18x |           1.63x |          1.66x |
|      2 |         0 |           1.18x |           1.63x |          1.65x |
|      3 |         0 |           1.18x |           1.63x |          1.66x |
|      3 |         0 |           1.18x |           1.63x |          1.66x |
|      4 |         0 |           1.18x |           1.63x |          1.65x |
|      4 |         0 |           1.18x |           1.63x |          1.66x |
|      5 |         0 |           1.18x |           1.63x |          1.66x |
|      5 |         0 |           1.18x |           1.63x |          1.66x |
|      6 |         0 |           1.18x |           1.63x |          1.66x |
|      6 |         0 |           1.18x |           1.63x |          1.66x |
|      7 |         0 |           1.18x |           1.63x |          1.66x |
|      7 |         0 |           1.18x |           1.63x |          1.64x |
|      8 |         0 |           1.18x |           1.63x |          1.66x |
|      8 |         0 |           1.18x |           1.63x |          1.66x |
|      9 |         0 |           1.18x |           1.63x |          1.65x |
|      9 |         0 |           1.18x |           1.63x |          1.66x |
|     10 |         0 |           1.18x |           1.63x |          1.66x |
|     10 |         0 |           1.18x |           1.63x |          1.66x |
|     11 |         0 |           1.18x |           1.63x |          1.64x |
|     11 |         0 |           1.18x |           1.63x |          1.63x |
|     12 |         0 |           1.18x |           1.63x |          1.63x |
|     12 |         0 |           1.18x |           1.63x |          1.66x |
|     13 |         0 |           1.18x |           1.63x |          1.63x |
|     13 |         0 |           1.18x |           1.63x |          1.63x |
|     14 |         0 |           1.18x |           1.63x |          1.63x |
|     14 |         0 |           1.18x |           1.63x |          1.22x |
|     15 |         0 |           1.19x |           1.63x |          1.22x |
|     15 |         0 |           1.18x |           1.63x |          1.63x |
|     16 |         0 |           1.03x |           0.96x |          1.15x |
|     16 |         0 |           1.03x |           0.96x |          1.13x |
|     17 |         0 |           1.03x |           0.96x |          0.98x |
|     17 |         0 |           1.03x |           0.96x |          0.98x |
|     18 |         0 |           1.03x |           0.96x |          0.98x |
|     18 |         0 |           1.03x |           0.96x |          0.98x |
|     19 |         0 |           1.04x |           0.96x |          0.98x |
|     19 |         0 |           1.04x |           0.96x |          0.98x |
|     20 |         0 |           1.04x |           0.96x |          1.00x |
|     20 |         0 |           1.03x |           0.96x |          0.99x |
|     21 |         0 |           1.04x |           0.96x |          0.99x |
|     21 |         0 |           1.03x |           0.96x |          1.14x |
|     22 |         0 |           1.04x |           0.96x |          1.14x |
|     22 |         0 |           1.03x |           0.96x |          1.14x |
|     23 |         0 |           1.03x |           0.96x |          1.13x |
|     23 |         0 |           1.03x |           0.96x |          1.15x |
|     24 |         0 |           1.04x |           0.96x |          1.13x |
|     24 |         0 |           1.04x |           0.95x |          1.13x |
|     25 |         0 |           1.03x |           0.96x |          1.15x |
|     25 |         0 |           1.04x |           0.96x |          1.12x |
|     26 |         0 |           1.04x |           0.96x |          1.13x |
|     26 |         0 |           1.02x |           0.96x |          1.13x |
|     27 |         0 |           1.04x |           0.96x |          1.13x |
|     27 |         0 |           1.03x |           0.96x |          1.13x |
|     28 |         0 |           1.03x |           0.96x |          0.98x |
|     28 |         0 |           1.04x |           0.96x |          1.05x |
|     29 |         0 |           1.02x |           0.96x |          1.00x |
|     29 |         0 |           1.03x |           0.96x |          1.00x |
|     30 |         0 |           1.04x |           0.96x |          1.00x |
|     30 |         0 |           1.04x |           0.96x |          1.00x |
|     31 |         0 |           1.04x |           0.96x |          0.99x |
|     31 |         0 |           1.03x |           0.96x |          0.99x |
|     32 |         0 |           1.09x |           1.07x |          1.09x |
|     32 |         1 |           1.25x |           1.15x |          1.38x |
|     64 |         0 |           1.27x |           0.98x |          1.20x |
|     64 |         2 |           1.41x |           1.04x |          1.30x |
|    128 |         0 |           1.15x |           0.84x |          1.22x |
|    128 |         3 |           1.23x |           0.87x |          1.30x |
|    256 |         0 |           1.16x |           0.84x |          1.16x |
|    256 |         4 |           1.23x |           0.81x |          1.17x |
|    512 |         0 |           1.14x |           0.80x |          1.12x |
|    512 |         5 |           1.18x |           0.81x |          1.14x |
|   1024 |         0 |           1.16x |           0.78x |          1.09x |
|   1024 |         6 |           1.03x |           0.78x |          1.11x |
|   2048 |         0 |           1.14x |           0.76x |          1.08x |
|   2048 |         7 |           1.14x |           0.77x |          1.09x |
|     64 |         1 |           1.40x |           1.04x |          1.37x |
|     64 |         1 |           1.40x |           1.04x |          1.37x |
|     64 |         2 |           1.35x |           1.04x |          1.37x |
|     64 |         2 |           1.38x |           1.04x |          1.37x |
|     64 |         3 |           1.36x |           1.04x |          1.37x |
|     64 |         3 |           1.34x |           1.04x |          1.37x |
|     64 |         4 |           1.41x |           1.04x |          1.37x |
|     64 |         4 |           1.38x |           1.04x |          1.37x |
|     64 |         5 |           1.36x |           1.04x |          1.37x |
|     64 |         5 |           1.36x |           1.04x |          1.37x |
|     64 |         6 |           1.35x |           1.04x |          1.37x |
|     64 |         6 |           1.40x |           1.04x |          1.37x |
|     64 |         7 |           1.35x |           1.04x |          1.37x |
|     64 |         7 |           1.40x |           1.04x |          1.37x |
|      0 |         0 |           1.19x |           1.63x |          1.66x |
|      0 |         0 |           1.19x |           1.63x |          1.66x |
|      1 |         0 |           1.19x |           1.63x |          1.66x |
|      1 |         0 |           1.19x |           1.63x |          1.66x |
|      2 |         0 |           1.18x |           1.63x |          1.63x |
|      2 |         0 |           1.18x |           1.63x |          1.66x |
|      3 |         0 |           1.18x |           1.63x |          1.66x |
|      3 |         0 |           1.20x |           1.63x |          1.63x |
|      4 |         0 |           1.18x |           1.63x |          1.63x |
|      4 |         0 |           1.18x |           1.63x |          1.66x |
|      5 |         0 |           1.18x |           1.63x |          1.66x |
|      5 |         0 |           1.18x |           1.63x |          1.66x |
|      6 |         0 |           1.18x |           1.63x |          1.66x |
|      6 |         0 |           1.18x |           1.63x |          1.66x |
|      7 |         0 |           1.18x |           1.63x |          1.66x |
|      7 |         0 |           1.18x |           1.63x |          1.66x |
|      8 |         0 |           1.18x |           1.63x |          1.25x |
|      8 |         0 |           1.18x |           1.63x |          1.66x |
|      9 |         0 |           1.18x |           1.63x |          1.66x |
|      9 |         0 |           1.18x |           1.63x |          1.66x |
|     10 |         0 |           1.18x |           1.63x |          1.66x |
|     10 |         0 |           1.18x |           1.63x |          1.66x |
|     11 |         0 |           1.18x |           1.63x |          1.66x |
|     11 |         0 |           1.18x |           1.63x |          1.66x |
|     12 |         0 |           1.18x |           1.63x |          1.66x |
|     12 |         0 |           1.19x |           1.63x |          1.66x |
|     13 |         0 |           1.18x |           1.63x |          1.66x |
|     13 |         0 |           1.18x |           1.63x |          1.66x |
|     14 |         0 |           1.19x |           1.63x |          1.66x |
|     14 |         0 |           1.19x |           1.63x |          1.66x |
|     15 |         0 |           1.18x |           1.63x |          1.66x |
|     15 |         0 |           1.18x |           1.63x |          1.66x |
|     16 |         0 |           1.03x |           0.96x |          1.00x |
|     16 |         0 |           1.03x |           0.96x |          1.00x |
|     17 |         0 |           1.03x |           0.96x |          1.00x |
|     17 |         0 |           1.03x |           0.96x |          1.15x |
|     18 |         0 |           1.03x |           0.96x |          1.14x |
|     18 |         0 |           1.04x |           0.96x |          1.15x |
|     19 |         0 |           1.04x |           0.96x |          1.15x |
|     19 |         0 |           1.04x |           0.96x |          1.15x |
|     20 |         0 |           1.04x |           0.96x |          1.15x |
|     20 |         0 |           1.03x |           0.96x |          1.15x |
|     21 |         0 |           1.04x |           0.96x |          1.15x |
|     21 |         0 |           1.03x |           0.96x |          1.15x |
|     22 |         0 |           1.02x |           0.96x |          1.15x |
|     22 |         0 |           1.03x |           0.96x |          1.15x |
|     23 |         0 |           1.03x |           0.96x |          1.15x |
|     23 |         0 |           1.03x |           0.96x |          1.15x |
|     24 |         0 |           1.03x |           0.96x |          1.00x |
|     24 |         0 |           1.02x |           0.96x |          1.00x |
|     25 |         0 |           1.04x |           0.96x |          1.00x |
|     25 |         0 |           1.03x |           0.96x |          1.16x |
|     26 |         0 |           1.04x |           0.96x |          1.15x |
|     26 |         0 |           1.03x |           0.96x |          1.15x |
|     27 |         0 |           1.04x |           0.96x |          1.00x |
|     27 |         0 |           1.03x |           0.96x |          1.00x |
|     28 |         0 |           1.04x |           0.96x |          1.00x |
|     28 |         0 |           1.04x |           0.96x |          1.00x |
|     29 |         0 |           1.03x |           0.96x |          1.00x |
|     29 |         0 |           1.04x |           0.96x |          1.15x |
|     30 |         0 |           1.04x |           0.96x |          1.15x |
|     30 |         0 |           1.03x |           0.95x |          1.00x |
|     31 |         0 |           1.03x |           0.96x |          1.00x |
|     31 |         0 |           1.04x |           0.96x |          1.00x |

This patch is passing GLIBC tests.

Regards

  Andrea

8< --- 8< --- 8<
Introduce an Arm MTE compatible strchrnul implementation.

Benchmarked on Cortex-A72, Cortex-A53, Neoverse N1 does not show
performance regressions.

Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
  

Comments

Florian Weimer June 3, 2020, 11:27 a.m. UTC | #1
* Andrea Corallo:

> Introduce an Arm MTE compatible strchrnul implementation.
>
> Benchmarked on Cortex-A72, Cortex-A53, Neoverse N1 does not show
> performance regressions.
>
> Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>

As a very high-level comment, I would expect some sort of markup in the
file that this implementation is now MTE-safe, similar to what we have
for executable stacks.

Or do you plan to handle that in some other fashion?

Thanks,
Florian
  
Andrea Corallo June 3, 2020, 2:14 p.m. UTC | #2
Florian Weimer <fweimer@redhat.com> writes:

> * Andrea Corallo:
>
>> Introduce an Arm MTE compatible strchrnul implementation.
>>
>> Benchmarked on Cortex-A72, Cortex-A53, Neoverse N1 does not show
>> performance regressions.
>>
>> Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
>
> As a very high-level comment, I would expect some sort of markup in the
> file that this implementation is now MTE-safe, similar to what we have
> for executable stacks.
>
> Or do you plan to handle that in some other fashion?
>
> Thanks,
> Florian

Hi Florian,

Now the only markup is the comment on the top of the file stating the
MTE compatibility of the routine.

I'm not aware of how this marking is done for executable stacks, perhaps
could you give an hook on where to look for?

Just to make sure we are one the same page wanted to add: these
functions are supposed to be backward compatible with what they are
replacing, so I'm not sure a marking is necessary.

Thanks

  Andrea
  
Florian Weimer June 3, 2020, 2:24 p.m. UTC | #3
* Andrea Corallo:

> Florian Weimer <fweimer@redhat.com> writes:
>
>> * Andrea Corallo:
>>
>>> Introduce an Arm MTE compatible strchrnul implementation.
>>>
>>> Benchmarked on Cortex-A72, Cortex-A53, Neoverse N1 does not show
>>> performance regressions.
>>>
>>> Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
>>
>> As a very high-level comment, I would expect some sort of markup in the
>> file that this implementation is now MTE-safe, similar to what we have
>> for executable stacks.
>>
>> Or do you plan to handle that in some other fashion?
>>
>> Thanks,
>> Florian
>
> Hi Florian,
>
> Now the only markup is the comment on the top of the file stating the
> MTE compatibility of the routine.
>
> I'm not aware of how this marking is done for executable stacks, perhaps
> could you give an hook on where to look for?

Typically, the -z noexecstack flag or a special .note.GNU-stack section
is used for that.

> Just to make sure we are one the same page wanted to add: these
> functions are supposed to be backward compatible with what they are
> replacing, so I'm not sure a marking is necessary.

It's MTE that isn't backwards-compatible without such markup.

Thanks,
Florian
  
Adhemerval Zanella Netto June 3, 2020, 2:33 p.m. UTC | #4
On 03/06/2020 11:24, Florian Weimer via Libc-alpha wrote:
> * Andrea Corallo:
> 
>> Florian Weimer <fweimer@redhat.com> writes:
>>
>>> * Andrea Corallo:
>>>
>>>> Introduce an Arm MTE compatible strchrnul implementation.
>>>>
>>>> Benchmarked on Cortex-A72, Cortex-A53, Neoverse N1 does not show
>>>> performance regressions.
>>>>
>>>> Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
>>>
>>> As a very high-level comment, I would expect some sort of markup in the
>>> file that this implementation is now MTE-safe, similar to what we have
>>> for executable stacks.
>>>
>>> Or do you plan to handle that in some other fashion?
>>>
>>> Thanks,
>>> Florian
>>
>> Hi Florian,
>>
>> Now the only markup is the comment on the top of the file stating the
>> MTE compatibility of the routine.
>>
>> I'm not aware of how this marking is done for executable stacks, perhaps
>> could you give an hook on where to look for?
> 
> Typically, the -z noexecstack flag or a special .note.GNU-stack section
> is used for that.
> 
>> Just to make sure we are one the same page wanted to add: these
>> functions are supposed to be backward compatible with what they are
>> replacing, so I'm not sure a marking is necessary.
> 
> It's MTE that isn't backwards-compatible without such markup.

Afaiu there is no need to add any marking for MTE, the main difference
it enforce 16-byte granularity read. I think you are confusing with
BTI, which does require the GNU note.
  
Adhemerval Zanella Netto June 3, 2020, 2:40 p.m. UTC | #5
On 03/06/2020 06:43, Andrea Corallo wrote:
> Hi all,
> 
> I'd like to submit this patch introducing an Arm MTE compatible
> strchrnul implementation.
> 
> Follows a performance comparison of the strchrnul benchmark run on 
> Cortex-A72, Cortex-A53, Neoverse N1.

How these performance numbers were calculated? Did you use glibc benchtests
or an external one?

Besides it the commit message does not give an overall description of 
which changes it does to the generic implementation neither the requirements
of MTE support.

You could use the description on optimized-routines to state it changes the
granularity of the read instruction to 16 bytes from the 32 bytes strategy 
and briefly describe the MTE requirements for string routines.

I would like to ask you to send patches as inline instead of attachment (it
make easier to review in most email clients).

> 
> | length | alignment | perf-uplift A72 | perf-uplift A53 | perf-uplift N1 |
> |--------+-----------+-----------------+-----------------+----------------|
> |     32 |         0 |           1.16x |           1.07x |          1.35x |
> |     32 |         1 |           1.25x |           1.16x |          1.15x |
> |     64 |         0 |           1.26x |           0.97x |          1.20x |
> |     64 |         2 |           1.35x |           1.04x |          1.30x |
> |    128 |         0 |           1.12x |           0.84x |          1.22x |
> |    128 |         3 |           1.25x |           0.87x |          1.30x |
> |    256 |         0 |           1.14x |           0.84x |          1.16x |
> |    256 |         4 |           1.24x |           0.81x |          1.16x |
> |    512 |         0 |           1.15x |           0.80x |          1.13x |
> |    512 |         5 |           1.17x |           0.81x |          1.14x |
> |   1024 |         0 |           1.14x |           0.78x |          1.08x |
> |   1024 |         6 |           1.03x |           0.78x |          1.10x |
> |   2048 |         0 |           1.12x |           0.76x |          1.08x |
> |   2048 |         7 |           1.14x |           0.77x |          1.09x |
> |     64 |         1 |           1.35x |           1.04x |          1.37x |
> |     64 |         1 |           1.36x |           1.04x |          1.37x |
> |     64 |         2 |           1.36x |           1.04x |          1.37x |
> |     64 |         2 |           1.37x |           1.04x |          1.38x |
> |     64 |         3 |           1.38x |           1.04x |          1.36x |
> |     64 |         3 |           1.40x |           1.04x |          1.36x |
> |     64 |         4 |           1.41x |           1.04x |          1.36x |
> |     64 |         4 |           1.36x |           1.04x |          1.36x |
> |     64 |         5 |           1.34x |           1.04x |          1.40x |
> |     64 |         5 |           1.35x |           1.04x |          1.36x |
> |     64 |         6 |           1.34x |           1.04x |          1.37x |
> |     64 |         6 |           1.41x |           1.04x |          1.37x |
> |     64 |         7 |           1.39x |           1.04x |          1.36x |
> |     64 |         7 |           1.34x |           1.04x |          1.37x |
> |      0 |         0 |           1.18x |           1.63x |          1.66x |
> |      0 |         0 |           1.18x |           1.63x |          1.66x |
> |      1 |         0 |           1.18x |           1.63x |          1.66x |
> |      1 |         0 |           1.18x |           1.63x |          1.67x |
> |      2 |         0 |           1.18x |           1.63x |          1.66x |
> |      2 |         0 |           1.18x |           1.63x |          1.65x |
> |      3 |         0 |           1.18x |           1.63x |          1.66x |
> |      3 |         0 |           1.18x |           1.63x |          1.66x |
> |      4 |         0 |           1.18x |           1.63x |          1.65x |
> |      4 |         0 |           1.18x |           1.63x |          1.66x |
> |      5 |         0 |           1.18x |           1.63x |          1.66x |
> |      5 |         0 |           1.18x |           1.63x |          1.66x |
> |      6 |         0 |           1.18x |           1.63x |          1.66x |
> |      6 |         0 |           1.18x |           1.63x |          1.66x |
> |      7 |         0 |           1.18x |           1.63x |          1.66x |
> |      7 |         0 |           1.18x |           1.63x |          1.64x |
> |      8 |         0 |           1.18x |           1.63x |          1.66x |
> |      8 |         0 |           1.18x |           1.63x |          1.66x |
> |      9 |         0 |           1.18x |           1.63x |          1.65x |
> |      9 |         0 |           1.18x |           1.63x |          1.66x |
> |     10 |         0 |           1.18x |           1.63x |          1.66x |
> |     10 |         0 |           1.18x |           1.63x |          1.66x |
> |     11 |         0 |           1.18x |           1.63x |          1.64x |
> |     11 |         0 |           1.18x |           1.63x |          1.63x |
> |     12 |         0 |           1.18x |           1.63x |          1.63x |
> |     12 |         0 |           1.18x |           1.63x |          1.66x |
> |     13 |         0 |           1.18x |           1.63x |          1.63x |
> |     13 |         0 |           1.18x |           1.63x |          1.63x |
> |     14 |         0 |           1.18x |           1.63x |          1.63x |
> |     14 |         0 |           1.18x |           1.63x |          1.22x |
> |     15 |         0 |           1.19x |           1.63x |          1.22x |
> |     15 |         0 |           1.18x |           1.63x |          1.63x |
> |     16 |         0 |           1.03x |           0.96x |          1.15x |
> |     16 |         0 |           1.03x |           0.96x |          1.13x |
> |     17 |         0 |           1.03x |           0.96x |          0.98x |
> |     17 |         0 |           1.03x |           0.96x |          0.98x |
> |     18 |         0 |           1.03x |           0.96x |          0.98x |
> |     18 |         0 |           1.03x |           0.96x |          0.98x |
> |     19 |         0 |           1.04x |           0.96x |          0.98x |
> |     19 |         0 |           1.04x |           0.96x |          0.98x |
> |     20 |         0 |           1.04x |           0.96x |          1.00x |
> |     20 |         0 |           1.03x |           0.96x |          0.99x |
> |     21 |         0 |           1.04x |           0.96x |          0.99x |
> |     21 |         0 |           1.03x |           0.96x |          1.14x |
> |     22 |         0 |           1.04x |           0.96x |          1.14x |
> |     22 |         0 |           1.03x |           0.96x |          1.14x |
> |     23 |         0 |           1.03x |           0.96x |          1.13x |
> |     23 |         0 |           1.03x |           0.96x |          1.15x |
> |     24 |         0 |           1.04x |           0.96x |          1.13x |
> |     24 |         0 |           1.04x |           0.95x |          1.13x |
> |     25 |         0 |           1.03x |           0.96x |          1.15x |
> |     25 |         0 |           1.04x |           0.96x |          1.12x |
> |     26 |         0 |           1.04x |           0.96x |          1.13x |
> |     26 |         0 |           1.02x |           0.96x |          1.13x |
> |     27 |         0 |           1.04x |           0.96x |          1.13x |
> |     27 |         0 |           1.03x |           0.96x |          1.13x |
> |     28 |         0 |           1.03x |           0.96x |          0.98x |
> |     28 |         0 |           1.04x |           0.96x |          1.05x |
> |     29 |         0 |           1.02x |           0.96x |          1.00x |
> |     29 |         0 |           1.03x |           0.96x |          1.00x |
> |     30 |         0 |           1.04x |           0.96x |          1.00x |
> |     30 |         0 |           1.04x |           0.96x |          1.00x |
> |     31 |         0 |           1.04x |           0.96x |          0.99x |
> |     31 |         0 |           1.03x |           0.96x |          0.99x |
> |     32 |         0 |           1.09x |           1.07x |          1.09x |
> |     32 |         1 |           1.25x |           1.15x |          1.38x |
> |     64 |         0 |           1.27x |           0.98x |          1.20x |
> |     64 |         2 |           1.41x |           1.04x |          1.30x |
> |    128 |         0 |           1.15x |           0.84x |          1.22x |
> |    128 |         3 |           1.23x |           0.87x |          1.30x |
> |    256 |         0 |           1.16x |           0.84x |          1.16x |
> |    256 |         4 |           1.23x |           0.81x |          1.17x |
> |    512 |         0 |           1.14x |           0.80x |          1.12x |
> |    512 |         5 |           1.18x |           0.81x |          1.14x |
> |   1024 |         0 |           1.16x |           0.78x |          1.09x |
> |   1024 |         6 |           1.03x |           0.78x |          1.11x |
> |   2048 |         0 |           1.14x |           0.76x |          1.08x |
> |   2048 |         7 |           1.14x |           0.77x |          1.09x |
> |     64 |         1 |           1.40x |           1.04x |          1.37x |
> |     64 |         1 |           1.40x |           1.04x |          1.37x |
> |     64 |         2 |           1.35x |           1.04x |          1.37x |
> |     64 |         2 |           1.38x |           1.04x |          1.37x |
> |     64 |         3 |           1.36x |           1.04x |          1.37x |
> |     64 |         3 |           1.34x |           1.04x |          1.37x |
> |     64 |         4 |           1.41x |           1.04x |          1.37x |
> |     64 |         4 |           1.38x |           1.04x |          1.37x |
> |     64 |         5 |           1.36x |           1.04x |          1.37x |
> |     64 |         5 |           1.36x |           1.04x |          1.37x |
> |     64 |         6 |           1.35x |           1.04x |          1.37x |
> |     64 |         6 |           1.40x |           1.04x |          1.37x |
> |     64 |         7 |           1.35x |           1.04x |          1.37x |
> |     64 |         7 |           1.40x |           1.04x |          1.37x |
> |      0 |         0 |           1.19x |           1.63x |          1.66x |
> |      0 |         0 |           1.19x |           1.63x |          1.66x |
> |      1 |         0 |           1.19x |           1.63x |          1.66x |
> |      1 |         0 |           1.19x |           1.63x |          1.66x |
> |      2 |         0 |           1.18x |           1.63x |          1.63x |
> |      2 |         0 |           1.18x |           1.63x |          1.66x |
> |      3 |         0 |           1.18x |           1.63x |          1.66x |
> |      3 |         0 |           1.20x |           1.63x |          1.63x |
> |      4 |         0 |           1.18x |           1.63x |          1.63x |
> |      4 |         0 |           1.18x |           1.63x |          1.66x |
> |      5 |         0 |           1.18x |           1.63x |          1.66x |
> |      5 |         0 |           1.18x |           1.63x |          1.66x |
> |      6 |         0 |           1.18x |           1.63x |          1.66x |
> |      6 |         0 |           1.18x |           1.63x |          1.66x |
> |      7 |         0 |           1.18x |           1.63x |          1.66x |
> |      7 |         0 |           1.18x |           1.63x |          1.66x |
> |      8 |         0 |           1.18x |           1.63x |          1.25x |
> |      8 |         0 |           1.18x |           1.63x |          1.66x |
> |      9 |         0 |           1.18x |           1.63x |          1.66x |
> |      9 |         0 |           1.18x |           1.63x |          1.66x |
> |     10 |         0 |           1.18x |           1.63x |          1.66x |
> |     10 |         0 |           1.18x |           1.63x |          1.66x |
> |     11 |         0 |           1.18x |           1.63x |          1.66x |
> |     11 |         0 |           1.18x |           1.63x |          1.66x |
> |     12 |         0 |           1.18x |           1.63x |          1.66x |
> |     12 |         0 |           1.19x |           1.63x |          1.66x |
> |     13 |         0 |           1.18x |           1.63x |          1.66x |
> |     13 |         0 |           1.18x |           1.63x |          1.66x |
> |     14 |         0 |           1.19x |           1.63x |          1.66x |
> |     14 |         0 |           1.19x |           1.63x |          1.66x |
> |     15 |         0 |           1.18x |           1.63x |          1.66x |
> |     15 |         0 |           1.18x |           1.63x |          1.66x |
> |     16 |         0 |           1.03x |           0.96x |          1.00x |
> |     16 |         0 |           1.03x |           0.96x |          1.00x |
> |     17 |         0 |           1.03x |           0.96x |          1.00x |
> |     17 |         0 |           1.03x |           0.96x |          1.15x |
> |     18 |         0 |           1.03x |           0.96x |          1.14x |
> |     18 |         0 |           1.04x |           0.96x |          1.15x |
> |     19 |         0 |           1.04x |           0.96x |          1.15x |
> |     19 |         0 |           1.04x |           0.96x |          1.15x |
> |     20 |         0 |           1.04x |           0.96x |          1.15x |
> |     20 |         0 |           1.03x |           0.96x |          1.15x |
> |     21 |         0 |           1.04x |           0.96x |          1.15x |
> |     21 |         0 |           1.03x |           0.96x |          1.15x |
> |     22 |         0 |           1.02x |           0.96x |          1.15x |
> |     22 |         0 |           1.03x |           0.96x |          1.15x |
> |     23 |         0 |           1.03x |           0.96x |          1.15x |
> |     23 |         0 |           1.03x |           0.96x |          1.15x |
> |     24 |         0 |           1.03x |           0.96x |          1.00x |
> |     24 |         0 |           1.02x |           0.96x |          1.00x |
> |     25 |         0 |           1.04x |           0.96x |          1.00x |
> |     25 |         0 |           1.03x |           0.96x |          1.16x |
> |     26 |         0 |           1.04x |           0.96x |          1.15x |
> |     26 |         0 |           1.03x |           0.96x |          1.15x |
> |     27 |         0 |           1.04x |           0.96x |          1.00x |
> |     27 |         0 |           1.03x |           0.96x |          1.00x |
> |     28 |         0 |           1.04x |           0.96x |          1.00x |
> |     28 |         0 |           1.04x |           0.96x |          1.00x |
> |     29 |         0 |           1.03x |           0.96x |          1.00x |
> |     29 |         0 |           1.04x |           0.96x |          1.15x |
> |     30 |         0 |           1.04x |           0.96x |          1.15x |
> |     30 |         0 |           1.03x |           0.95x |          1.00x |
> |     31 |         0 |           1.03x |           0.96x |          1.00x |
> |     31 |         0 |           1.04x |           0.96x |          1.00x |
> 
> This patch is passing GLIBC tests.
> 
> Regards
> 
>   Andrea
> 
> 8< --- 8< --- 8<
> Introduce an Arm MTE compatible strchrnul implementation.
> 
> Benchmarked on Cortex-A72, Cortex-A53, Neoverse N1 does not show
> performance regressions.
> 
> Co-authored-by: Wilco Dijkstra <wilco.dijkstra@arm.com>
>
  
Szabolcs Nagy June 3, 2020, 2:53 p.m. UTC | #6
The 06/03/2020 11:33, Adhemerval Zanella via Libc-alpha wrote:
> On 03/06/2020 11:24, Florian Weimer via Libc-alpha wrote:
> > * Andrea Corallo:
> >> Florian Weimer <fweimer@redhat.com> writes:
> >>> As a very high-level comment, I would expect some sort of markup in the
> >>> file that this implementation is now MTE-safe, similar to what we have
> >>> for executable stacks.
> >>>
> >>> Or do you plan to handle that in some other fashion?
> >>>
> >>> Thanks,
> >>> Florian
> >>
> >> Hi Florian,
> >>
> >> Now the only markup is the comment on the top of the file stating the
> >> MTE compatibility of the routine.
> >>
> >> I'm not aware of how this marking is done for executable stacks, perhaps
> >> could you give an hook on where to look for?
> > 
> > Typically, the -z noexecstack flag or a special .note.GNU-stack section
> > is used for that.
> > 
> >> Just to make sure we are one the same page wanted to add: these
> >> functions are supposed to be backward compatible with what they are
> >> replacing, so I'm not sure a marking is necessary.
> > 
> > It's MTE that isn't backwards-compatible without such markup.
> 
> Afaiu there is no need to add any marking for MTE, the main difference
> it enforce 16-byte granularity read. I think you are confusing with
> BTI, which does require the GNU note.

in principle existing binaries may not be mte safe:

(1) the top byte may be used by user code,
(2) page size granularity may be assumed for memory protection.

so it is a valid question if we want to mark binaries that
are mte-safe to make mte an opt-in feature.

the problem is that then we cannot use heap tagging for debugging
heap corruption problems in existing binaries. so even if we
apply an mte markup we have to allow the user to override that.

we probably don't want heap tagging on by default since there
is an overhead so in the end it will be a user choice if mte
is enabled or not.

the difference compared to noexecstack is that the runtime can
fall back to executable stack if there are non-marked binaries,
but with mte once it's on the runtime cannot fall back, just
reject incompatible binaries (and it has to be on very early:
before malloc calls are made).

i don't know if it makes sense to introduce an object markup just
for aborting / rejecting loading libraries when the user asked
for mte.
  
Szabolcs Nagy June 3, 2020, 3:03 p.m. UTC | #7
The 06/03/2020 15:53, Szabolcs Nagy wrote:
> The 06/03/2020 11:33, Adhemerval Zanella via Libc-alpha wrote:
> > On 03/06/2020 11:24, Florian Weimer via Libc-alpha wrote:
> > > * Andrea Corallo:
> > >> Florian Weimer <fweimer@redhat.com> writes:
> > >>> As a very high-level comment, I would expect some sort of markup in the
> > >>> file that this implementation is now MTE-safe, similar to what we have
> > >>> for executable stacks.
> > >>>
> > >>> Or do you plan to handle that in some other fashion?
> > >>>
> > >>> Thanks,
> > >>> Florian
> > >>
> > >> Hi Florian,
> > >>
> > >> Now the only markup is the comment on the top of the file stating the
> > >> MTE compatibility of the routine.
> > >>
> > >> I'm not aware of how this marking is done for executable stacks, perhaps
> > >> could you give an hook on where to look for?
> > > 
> > > Typically, the -z noexecstack flag or a special .note.GNU-stack section
> > > is used for that.
> > > 
> > >> Just to make sure we are one the same page wanted to add: these
> > >> functions are supposed to be backward compatible with what they are
> > >> replacing, so I'm not sure a marking is necessary.
> > > 
> > > It's MTE that isn't backwards-compatible without such markup.
> > 
> > Afaiu there is no need to add any marking for MTE, the main difference
> > it enforce 16-byte granularity read. I think you are confusing with
> > BTI, which does require the GNU note.
> 
> in principle existing binaries may not be mte safe:
> 
> (1) the top byte may be used by user code,
> (2) page size granularity may be assumed for memory protection.
> 
> so it is a valid question if we want to mark binaries that
> are mte-safe to make mte an opt-in feature.
> 
> the problem is that then we cannot use heap tagging for debugging
> heap corruption problems in existing binaries. so even if we
> apply an mte markup we have to allow the user to override that.
> 
> we probably don't want heap tagging on by default since there
> is an overhead so in the end it will be a user choice if mte
> is enabled or not.
> 
> the difference compared to noexecstack is that the runtime can
> fall back to executable stack if there are non-marked binaries,
> but with mte once it's on the runtime cannot fall back, just
> reject incompatible binaries (and it has to be on very early:
> before malloc calls are made).

this is not entirely true: (2) can be fixed at runtime
by keeping tags but turning tag checks off (it's a per
thread setting so a bit tricky to do across all threads),
but (1) cannot be fixed at runtime: if there are tagged
pointers already being passed around we cannot get rid
of them, so we have to reject the incompatible library.

we could have separate markup for problems (1) and (2)
but neither of them is easily discoverable by the compiler
(well conforming c code is not supposed to do either),
so i don't know how the markup would be added to object
files in a reliable way.

> 
> i don't know if it makes sense to introduce an object markup just
> for aborting / rejecting loading libraries when the user asked
> for mte.

--
  
Florian Weimer June 3, 2020, 3:04 p.m. UTC | #8
* Adhemerval Zanella via Libc-alpha:

> Afaiu there is no need to add any marking for MTE, the main difference
> it enforce 16-byte granularity read. I think you are confusing with
> BTI, which does require the GNU note.

If there is no compatibility issue, then why are these changes to the
glibc string functions needed?

Clearly I'm confused.

Thanks,
Florian
  
Adhemerval Zanella Netto June 3, 2020, 3:04 p.m. UTC | #9
On 03/06/2020 11:53, Szabolcs Nagy wrote:
> The 06/03/2020 11:33, Adhemerval Zanella via Libc-alpha wrote:
>> On 03/06/2020 11:24, Florian Weimer via Libc-alpha wrote:
>>> * Andrea Corallo:
>>>> Florian Weimer <fweimer@redhat.com> writes:
>>>>> As a very high-level comment, I would expect some sort of markup in the
>>>>> file that this implementation is now MTE-safe, similar to what we have
>>>>> for executable stacks.
>>>>>
>>>>> Or do you plan to handle that in some other fashion?
>>>>>
>>>>> Thanks,
>>>>> Florian
>>>>
>>>> Hi Florian,
>>>>
>>>> Now the only markup is the comment on the top of the file stating the
>>>> MTE compatibility of the routine.
>>>>
>>>> I'm not aware of how this marking is done for executable stacks, perhaps
>>>> could you give an hook on where to look for?
>>>
>>> Typically, the -z noexecstack flag or a special .note.GNU-stack section
>>> is used for that.
>>>
>>>> Just to make sure we are one the same page wanted to add: these
>>>> functions are supposed to be backward compatible with what they are
>>>> replacing, so I'm not sure a marking is necessary.
>>>
>>> It's MTE that isn't backwards-compatible without such markup.
>>
>> Afaiu there is no need to add any marking for MTE, the main difference
>> it enforce 16-byte granularity read. I think you are confusing with
>> BTI, which does require the GNU note.
> 
> in principle existing binaries may not be mte safe:
> 
> (1) the top byte may be used by user code,
> (2) page size granularity may be assumed for memory protection.
> 
> so it is a valid question if we want to mark binaries that
> are mte-safe to make mte an opt-in feature.

My understanding is MTE binary tagging support is a orthogonal 
discussion, although related. This change afaiu is mainly a algorithm
change it is backwards compatible, it was motivated by MTE support but
it could also be reviewed independently from its support.

> 
> the problem is that then we cannot use heap tagging for debugging
> heap corruption problems in existing binaries. so even if we
> apply an mte markup we have to allow the user to override that.
> 
> we probably don't want heap tagging on by default since there
> is an overhead so in the end it will be a user choice if mte
> is enabled or not.
> 
> the difference compared to noexecstack is that the runtime can
> fall back to executable stack if there are non-marked binaries,
> but with mte once it's on the runtime cannot fall back, just
> reject incompatible binaries (and it has to be on very early:
> before malloc calls are made).
> 
> i don't know if it makes sense to introduce an object markup just
> for aborting / rejecting loading libraries when the user asked
> for mte.
  
Szabolcs Nagy June 3, 2020, 3:18 p.m. UTC | #10
The 06/03/2020 17:04, Florian Weimer via Libc-alpha wrote:
> * Adhemerval Zanella via Libc-alpha:
> 
> > Afaiu there is no need to add any marking for MTE, the main difference
> > it enforce 16-byte granularity read. I think you are confusing with
> > BTI, which does require the GNU note.
> 
> If there is no compatibility issue, then why are these changes to the
> glibc string functions needed?
> 
> Clearly I'm confused.

string functions have problem (2): they assume that
if an address is accessible then everything on that
page is accessible via the same pointer (which
is no longer true with mte).

i think we can add the mte-safe string functions and
tackle the abi compatibility issues separately: if
we introduce mte-safe markups we will have to add
that to all asm code to make libc.so marked.
(which will be tricky since we have non-mte-safe
ifunc variants that are only selected on non-mte hw
so the code is not safe but way it is used is safe)
  
Florian Weimer June 3, 2020, 3:24 p.m. UTC | #11
* Szabolcs Nagy:

> The 06/03/2020 17:04, Florian Weimer via Libc-alpha wrote:
>> * Adhemerval Zanella via Libc-alpha:
>> 
>> > Afaiu there is no need to add any marking for MTE, the main difference
>> > it enforce 16-byte granularity read. I think you are confusing with
>> > BTI, which does require the GNU note.
>> 
>> If there is no compatibility issue, then why are these changes to the
>> glibc string functions needed?
>> 
>> Clearly I'm confused.
>
> string functions have problem (2): they assume that
> if an address is accessible then everything on that
> page is accessible via the same pointer (which
> is no longer true with mte).

Ahh.

> i think we can add the mte-safe string functions and
> tackle the abi compatibility issues separately:

I agree that markup is a separate issue.

Thanks,
Florian
  
Andrea Corallo June 3, 2020, 4:01 p.m. UTC | #12
Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:

> On 03/06/2020 06:43, Andrea Corallo wrote:
>> Hi all,
>> 
>> I'd like to submit this patch introducing an Arm MTE compatible
>> strchrnul implementation.
>> 
>> Follows a performance comparison of the strchrnul benchmark run on 
>> Cortex-A72, Cortex-A53, Neoverse N1.
>
> How these performance numbers were calculated? Did you use glibc benchtests
> or an external one?

Yes the glibc benchtests has been used, forgot to mention sorry.

> Besides it the commit message does not give an overall description of 
> which changes it does to the generic implementation neither the requirements
> of MTE support.

I'll improve the commit message as suggested.  Regarding the MTE
requirements given the function is backward compatible not sure what
should be mentioned.

Thanks

  Andrea
  
H.J. Lu June 4, 2020, 12:04 p.m. UTC | #13
On Wed, Jun 3, 2020 at 9:02 AM Andrea Corallo <andrea.corallo@arm.com> wrote:
>
> Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
>
> > On 03/06/2020 06:43, Andrea Corallo wrote:
> >> Hi all,
> >>
> >> I'd like to submit this patch introducing an Arm MTE compatible
> >> strchrnul implementation.
> >>
> >> Follows a performance comparison of the strchrnul benchmark run on
> >> Cortex-A72, Cortex-A53, Neoverse N1.
> >
> > How these performance numbers were calculated? Did you use glibc benchtests
> > or an external one?
>
> Yes the glibc benchtests has been used, forgot to mention sorry.
>
> > Besides it the commit message does not give an overall description of
> > which changes it does to the generic implementation neither the requirements
> > of MTE support.
>
> I'll improve the commit message as suggested.  Regarding the MTE
> requirements given the function is backward compatible not sure what
> should be mentioned.
>

My impression is if some object files aren't MTE compatible, you can't enable
MTE in the executable.  Is that correct?
  
Szabolcs Nagy June 4, 2020, 7:48 p.m. UTC | #14
* H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> [2020-06-04 05:04:23 -0700]:
> On Wed, Jun 3, 2020 at 9:02 AM Andrea Corallo <andrea.corallo@arm.com> wrote:
> > Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
> > > Besides it the commit message does not give an overall description of
> > > which changes it does to the generic implementation neither the requirements
> > > of MTE support.
> >
> > I'll improve the commit message as suggested.  Regarding the MTE
> > requirements given the function is backward compatible not sure what
> > should be mentioned.
> >
> 
> My impression is if some object files aren't MTE compatible, you can't enable
> MTE in the executable.  Is that correct?

there can be code that is not mte compatible that would
crash when memory tagging and tag checking are enabled.
currently there is no object file marking for compatibility.

i think we should mention in the commit message in what
way the old code is incompatible with mte (e.g. it does
out of bound loads that can go across a tag granule
boundaries which can cause tag check failures with mte)
  
H.J. Lu June 4, 2020, 8:03 p.m. UTC | #15
On Thu, Jun 4, 2020 at 12:49 PM Szabolcs Nagy <nsz@port70.net> wrote:
>
> * H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> [2020-06-04 05:04:23 -0700]:
> > On Wed, Jun 3, 2020 at 9:02 AM Andrea Corallo <andrea.corallo@arm.com> wrote:
> > > Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
> > > > Besides it the commit message does not give an overall description of
> > > > which changes it does to the generic implementation neither the requirements
> > > > of MTE support.
> > >
> > > I'll improve the commit message as suggested.  Regarding the MTE
> > > requirements given the function is backward compatible not sure what
> > > should be mentioned.
> > >
> >
> > My impression is if some object files aren't MTE compatible, you can't enable
> > MTE in the executable.  Is that correct?
>
> there can be code that is not mte compatible that would
> crash when memory tagging and tag checking are enabled.
> currently there is no object file marking for compatibility.
>
> i think we should mention in the commit message in what
> way the old code is incompatible with mte (e.g. it does
> out of bound loads that can go across a tag granule
> boundaries which can cause tag check failures with mte)

Should we add a marker to indicate that an object file is
mte compatible?
  
Szabolcs Nagy June 5, 2020, 7:38 a.m. UTC | #16
The 06/04/2020 13:03, H.J. Lu via Libc-alpha wrote:
> On Thu, Jun 4, 2020 at 12:49 PM Szabolcs Nagy <nsz@port70.net> wrote:
> >
> > * H.J. Lu via Libc-alpha <libc-alpha@sourceware.org> [2020-06-04 05:04:23 -0700]:
> > > On Wed, Jun 3, 2020 at 9:02 AM Andrea Corallo <andrea.corallo@arm.com> wrote:
> > > > Adhemerval Zanella <adhemerval.zanella@linaro.org> writes:
> > > > > Besides it the commit message does not give an overall description of
> > > > > which changes it does to the generic implementation neither the requirements
> > > > > of MTE support.
> > > >
> > > > I'll improve the commit message as suggested.  Regarding the MTE
> > > > requirements given the function is backward compatible not sure what
> > > > should be mentioned.
> > > >
> > >
> > > My impression is if some object files aren't MTE compatible, you can't enable
> > > MTE in the executable.  Is that correct?
> >
> > there can be code that is not mte compatible that would
> > crash when memory tagging and tag checking are enabled.
> > currently there is no object file marking for compatibility.
> >
> > i think we should mention in the commit message in what
> > way the old code is incompatible with mte (e.g. it does
> > out of bound loads that can go across a tag granule
> > boundaries which can cause tag check failures with mte)
> 
> Should we add a marker to indicate that an object file is
> mte compatible?

i think we will need a marking for 'compatible with tagged
pointers' (on aarch64 pointer that's a new opt-in kernel abi
and user code may use the top byte of pointers) and another
one for 'compatible with 16byte granules'.

e.g. the old string asm would have the first marking but not
the second one.

the details of the semantics should be discussed once we
post the patches that turn mte on.
  
Wilco Dijkstra June 5, 2020, 8:45 a.m. UTC | #17
Hi,

>> Should we add a marker to indicate that an object file is
>> mte compatible?
>
> i think we will need a marking for 'compatible with tagged
> pointers' (on aarch64 pointer that's a new opt-in kernel abi
> and user code may use the top byte of pointers) and another
> one for 'compatible with 16byte granules'.
>
> e.g. the old string asm would have the first marking but not
> the second one.

I don't see how adding different markings would help. String functions
which are not MTE compatible simply cannot be used if MTE is enabled,
and if they are not fixed or ifunced then all of GLIBC is not compatible
with MTE.

Compatibility is a dynamic property, ie. an incompatible string function is
perfectly fine if it is ifunced. So having various different markings is not useful.
Either way all of this is unrelated to these patches.

Cheers,
Wilco
  
Szabolcs Nagy June 5, 2020, 4:25 p.m. UTC | #18
The 06/05/2020 09:45, Wilco Dijkstra wrote:
> Hi,
> 
> >> Should we add a marker to indicate that an object file is
> >> mte compatible?
> >
> > i think we will need a marking for 'compatible with tagged
> > pointers' (on aarch64 pointer that's a new opt-in kernel abi
> > and user code may use the top byte of pointers) and another
> > one for 'compatible with 16byte granules'.
> >
> > e.g. the old string asm would have the first marking but not
> > the second one.
> 
> I don't see how adding different markings would help. String functions
> which are not MTE compatible simply cannot be used if MTE is enabled,
> and if they are not fixed or ifunced then all of GLIBC is not compatible
> with MTE.
> 
> Compatibility is a dynamic property, ie. an incompatible string function is
> perfectly fine if it is ifunced. So having various different markings is not useful.
> Either way all of this is unrelated to these patches.

this is for user objects so we can print reasonable
error when an incompatible lib is loaded or disable
tag checking at startup time based on the marking.

not for string functions
  

Patch

diff --git a/sysdeps/aarch64/strchrnul.S b/sysdeps/aarch64/strchrnul.S
index a65be6cba8..1ae4598f82 100644
--- a/sysdeps/aarch64/strchrnul.S
+++ b/sysdeps/aarch64/strchrnul.S
@@ -22,109 +22,75 @@ 
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64
- * Neon Available.
+ * ARMv8-a, AArch64, Advanced SIMD.
+ * MTE compatible.
  */
 
-/* Arguments and results.  */
 #define srcin		x0
 #define chrin		w1
-
 #define result		x0
 
-/* Locals and temporaries.  */
-
 #define src		x2
-#define tmp1		x3
-#define wtmp2		w4
-#define tmp3		x5
+#define tmp1		x1
+#define tmp2		x3
+#define tmp2w		w3
 
 #define vrepchr		v0
-#define vdata1		v1
-#define vdata2		v2
-#define vhas_nul1	v3
-#define vhas_nul2	v4
-#define vhas_chr1	v5
-#define vhas_chr2	v6
-#define vrepmask	v7
-#define vend1		v16
-
-/* Core algorithm.
-
-   For each 32-byte hunk we calculate a 64-bit syndrome value, with
-   two bits per byte (LSB is always in bits 0 and 1, for both big
-   and little-endian systems).  For each tuple, bit 0 is set iff
-   the relevant byte matched the requested character or nul.  Since the
-   bits in the syndrome reflect exactly the order in which things occur
-   in the original string a count_trailing_zeros() operation will
-   identify exactly which byte is causing the termination.  */
+#define vdata		v1
+#define qdata		q1
+#define vhas_nul	v2
+#define vhas_chr	v3
+#define vrepmask	v4
+#define vend		v5
+#define dend		d5
+
+/* Core algorithm:
+
+   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
+   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
+   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
+   set likewise for odd bytes so that adjacent bytes can be merged. Since the
+   bits in the syndrome reflect the order in which things occur in the original
+   string, counting trailing zeros identifies exactly which byte matched.  */
 
 ENTRY (__strchrnul)
 	DELOUSE (0)
-	/* Magic constant 0x40100401 to allow us to identify which lane
-	   matches the termination condition.  */
-	mov	wtmp2, #0x0401
-	movk	wtmp2, #0x4010, lsl #16
+	bic	src, srcin, 15
 	dup	vrepchr.16b, chrin
-	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
-	dup	vrepmask.4s, wtmp2
-	ands	tmp1, srcin, #31
-	b.eq	L(loop)
-
-	/* Input string is not 32-byte aligned.  Rather than forcing
-	   the padding bytes to a safe value, we calculate the syndrome
-	   for all the bytes, but then mask off those bits of the
-	   syndrome that are related to the padding.  */
-	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	neg	tmp1, tmp1
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
-	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
-	lsl	tmp1, tmp1, #1
-	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
-	mov	tmp3, #~0
-	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
-	lsr	tmp1, tmp3, tmp1
-
-	mov	tmp3, vend1.2d[0]
-	bic	tmp1, tmp3, tmp1	// Mask padding bits.
-	cbnz	tmp1, L(tail)
+	ld1	{vdata.16b}, [src]
+	mov	tmp2w, 0xf00f
+	dup	vrepmask.8h, tmp2w
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	lsl	tmp2, srcin, 2
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	tmp1, dend
+	lsr	tmp1, tmp1, tmp2	/* Mask padding bits.  */
+	cbz	tmp1, L(loop)
 
+	rbit	tmp1, tmp1
+	clz	tmp1, tmp1
+	add	result, srcin, tmp1, lsr 2
+	ret
+
+	.p2align 4
 L(loop):
-	ld1	{vdata1.16b, vdata2.16b}, [src], #32
-	cmeq	vhas_nul1.16b, vdata1.16b, #0
-	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
-	cmeq	vhas_nul2.16b, vdata2.16b, #0
-	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
-	/* Use a fast check for the termination condition.  */
-	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
-	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
-	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
-	addp	vend1.2d, vend1.2d, vend1.2d
-	mov	tmp1, vend1.2d[0]
+	ldr	qdata, [src, 16]!
+	cmeq	vhas_chr.16b, vdata.16b, vrepchr.16b
+	cmhs	vhas_chr.16b, vhas_chr.16b, vdata.16b
+	umaxp	vend.16b, vhas_chr.16b, vhas_chr.16b
+	fmov	tmp1, dend
 	cbz	tmp1, L(loop)
 
-	/* Termination condition found.  Now need to establish exactly why
-	   we terminated.  */
-	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
-	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
-	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b		// 256->128
-	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
-
-	mov	tmp1, vend1.2d[0]
-L(tail):
-	/* Count the trailing zeros, by bit reversing...  */
+	and	vhas_chr.16b, vhas_chr.16b, vrepmask.16b
+	addp	vend.16b, vhas_chr.16b, vhas_chr.16b		/* 128->64 */
+	fmov	tmp1, dend
+#ifndef __AARCH64EB__
 	rbit	tmp1, tmp1
-	/* Re-bias source.  */
-	sub	src, src, #32
-	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
-	/* tmp1 is twice the offset into the fragment.  */
-	add	result, src, tmp1, lsr #1
+#endif
+	clz	tmp1, tmp1
+	add	result, src, tmp1, lsr 2
 	ret
 
 END(__strchrnul)