[v8,3/6] elf: Introduce _dl_map_segment_align hook for segment alignment tuning

Message ID 20260405035323.558335-4-wangrui@loongson.cn (mailing list archive)
State Superseded
Headers
Series elf: THP-aware load segment alignment |

Checks

Context Check Description
redhat-pt-bot/TryBot-apply_patch success Patch applied to master at the time it was sent
linaro-tcwg-bot/tcwg_glibc_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_glibc_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_glibc_check--master-arm success Test passed
linaro-tcwg-bot/tcwg_glibc_check--master-aarch64 success Test passed

Commit Message

WANG Rui April 5, 2026, 3:53 a.m. UTC
  Introduce a new helper function, _dl_map_segment_align, to allow
architecture-specific adjustment of ELF load segment alignment during
object mapping.

The generic ELF loader now calls this hook when determining the maximum
segment alignment.  The generic implementation is a no-op and preserves
existing behavior.

This provides a well-defined extension point for architectures that
need to adjust segment alignment policies (for example, to improve
mapping efficiency or enable platform-specific optimizations) without
embedding such logic directly in the generic loader.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
Signed-off-by: WANG Rui <wangrui@loongson.cn>
---
 elf/dl-load.c                          |  4 ++++
 sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 sysdeps/generic/dl-map-segment-align.h
  

Comments

H.J. Lu April 6, 2026, 8:02 a.m. UTC | #1
On Sun, Apr 5, 2026 at 11:54 AM WANG Rui <wangrui@loongson.cn> wrote:
>
> Introduce a new helper function, _dl_map_segment_align, to allow
> architecture-specific adjustment of ELF load segment alignment during
> object mapping.
>
> The generic ELF loader now calls this hook when determining the maximum
> segment alignment.  The generic implementation is a no-op and preserves
> existing behavior.
>
> This provides a well-defined extension point for architectures that
> need to adjust segment alignment policies (for example, to improve
> mapping efficiency or enable platform-specific optimizations) without
> embedding such logic directly in the generic loader.
>
> Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> Signed-off-by: WANG Rui <wangrui@loongson.cn>
> ---
>  elf/dl-load.c                          |  4 ++++
>  sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
>  2 files changed, 30 insertions(+)
>  create mode 100644 sysdeps/generic/dl-map-segment-align.h
>
> diff --git a/elf/dl-load.c b/elf/dl-load.c
> index 7355eef8e76..f3d943e99c0 100644
> --- a/elf/dl-load.c
> +++ b/elf/dl-load.c
> @@ -71,6 +71,7 @@ struct filebuf
>  #include <dl-dst.h>
>  #include <dl-load.h>
>  #include <dl-map-segments.h>
> +#include <dl-map-segment-align.h>
>  #include <dl-unmap-segments.h>
>  #include <dl-machine-reject-phdr.h>
>  #include <dl-prop.h>
> @@ -1171,6 +1172,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
>
>           /* Optimize a common case.  */
>           c->prot = pf_to_prot (ph->p_flags);
> +
> +         /* Architecture-specific adjustment of segment alignment. */
> +         p_align_max = _dl_map_segment_align (c, p_align_max);
>           break;

Can you make THP work for

Program Headers:
  Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
  PHDR           0x000034 0x00400034 0x00400034 0x001a0 0x001a0 R   0x4
  INTERP         0x0001f8 0x004001f8 0x004001f8 0x0001a 0x0001a R   0x1
      [Requesting program interpreter: /libx32/ld-linux-x32.so.2]
  LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
  LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
  LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000
  DYNAMIC        0x273ec8 0x00674ec8 0x00674ec8 0x00110 0x00110 RW  0x4
  NOTE           0x0001d4 0x004001d4 0x004001d4 0x00024 0x00024 R   0x4
  NOTE           0x16d0b0 0x0056d0b0 0x0056d0b0 0x00060 0x00060 R   0x4
  TLS            0x273ebc 0x00674ebc 0x00674ebc 0x00000 0x0000c R   0x4
  GNU_PROPERTY   0x16d0b0 0x0056d0b0 0x0056d0b0 0x00040 0x00040 R   0x4
  GNU_EH_FRAME   0x1493e0 0x005493e0 0x005493e0 0x05b94 0x05b94 R   0x4
  GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RW  0x10
  GNU_RELRO      0x273ebc 0x00674ebc 0x00674ebc 0x00144 0x00144 R   0x1

As an option, we can combine

  LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
  LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000

into a single RE THP map.

>         case PT_TLS:
> diff --git a/sysdeps/generic/dl-map-segment-align.h b/sysdeps/generic/dl-map-segment-align.h
> new file mode 100644
> index 00000000000..f4a671f25f8
> --- /dev/null
> +++ b/sysdeps/generic/dl-map-segment-align.h
> @@ -0,0 +1,26 @@
> +/* _dl_map_segment_align.  Generic version.
> +   Copyright (C) 2026 Free Software Foundation, Inc.
> +   Copyright The GNU Toolchain Authors.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <dl-load.h>
> +
> +static inline ElfW(Addr)
> +_dl_map_segment_align (const struct loadcmd *c, ElfW(Addr) p_align_max)
> +{
> +  return p_align_max;
> +}
> --
> 2.53.0
>
  
H.J. Lu April 6, 2026, 10:56 p.m. UTC | #2
On Mon, Apr 6, 2026 at 4:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Apr 5, 2026 at 11:54 AM WANG Rui <wangrui@loongson.cn> wrote:
> >
> > Introduce a new helper function, _dl_map_segment_align, to allow
> > architecture-specific adjustment of ELF load segment alignment during
> > object mapping.
> >
> > The generic ELF loader now calls this hook when determining the maximum
> > segment alignment.  The generic implementation is a no-op and preserves
> > existing behavior.
> >
> > This provides a well-defined extension point for architectures that
> > need to adjust segment alignment policies (for example, to improve
> > mapping efficiency or enable platform-specific optimizations) without
> > embedding such logic directly in the generic loader.
> >
> > Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> > Signed-off-by: WANG Rui <wangrui@loongson.cn>
> > ---
> >  elf/dl-load.c                          |  4 ++++
> >  sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
> >  2 files changed, 30 insertions(+)
> >  create mode 100644 sysdeps/generic/dl-map-segment-align.h
> >
> > diff --git a/elf/dl-load.c b/elf/dl-load.c
> > index 7355eef8e76..f3d943e99c0 100644
> > --- a/elf/dl-load.c
> > +++ b/elf/dl-load.c
> > @@ -71,6 +71,7 @@ struct filebuf
> >  #include <dl-dst.h>
> >  #include <dl-load.h>
> >  #include <dl-map-segments.h>
> > +#include <dl-map-segment-align.h>
> >  #include <dl-unmap-segments.h>
> >  #include <dl-machine-reject-phdr.h>
> >  #include <dl-prop.h>
> > @@ -1171,6 +1172,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
> >
> >           /* Optimize a common case.  */
> >           c->prot = pf_to_prot (ph->p_flags);
> > +
> > +         /* Architecture-specific adjustment of segment alignment. */
> > +         p_align_max = _dl_map_segment_align (c, p_align_max);
> >           break;
>
> Can you make THP to work for
>
> Program Headers:
>   Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
>   PHDR           0x000034 0x00400034 0x00400034 0x001a0 0x001a0 R   0x4
>   INTERP         0x0001f8 0x004001f8 0x004001f8 0x0001a 0x0001a R   0x1
>       [Requesting program interpreter: /libx32/ld-linux-x32.so.2]
>   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
>   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
>   LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000
>   DYNAMIC        0x273ec8 0x00674ec8 0x00674ec8 0x00110 0x00110 RW  0x4
>   NOTE           0x0001d4 0x004001d4 0x004001d4 0x00024 0x00024 R   0x4
>   NOTE           0x16d0b0 0x0056d0b0 0x0056d0b0 0x00060 0x00060 R   0x4
>   TLS            0x273ebc 0x00674ebc 0x00674ebc 0x00000 0x0000c R   0x4
>   GNU_PROPERTY   0x16d0b0 0x0056d0b0 0x0056d0b0 0x00040 0x00040 R   0x4
>   GNU_EH_FRAME   0x1493e0 0x005493e0 0x005493e0 0x05b94 0x05b94 R   0x4
>   GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RW  0x10
>   GNU_RELRO      0x273ebc 0x00674ebc 0x00674ebc 0x00144 0x00144 R   0x1
>
> As an option, we can combine
>
>   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
>   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
>
> into a single RE THP map.
>

Something like this.
  
H.J. Lu April 6, 2026, 11:09 p.m. UTC | #3
On Tue, Apr 7, 2026 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Apr 6, 2026 at 4:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Apr 5, 2026 at 11:54 AM WANG Rui <wangrui@loongson.cn> wrote:
> > >
> > > Introduce a new helper function, _dl_map_segment_align, to allow
> > > architecture-specific adjustment of ELF load segment alignment during
> > > object mapping.
> > >
> > > The generic ELF loader now calls this hook when determining the maximum
> > > segment alignment.  The generic implementation is a no-op and preserves
> > > existing behavior.
> > >
> > > This provides a well-defined extension point for architectures that
> > > need to adjust segment alignment policies (for example, to improve
> > > mapping efficiency or enable platform-specific optimizations) without
> > > embedding such logic directly in the generic loader.
> > >
> > > Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> > > Signed-off-by: WANG Rui <wangrui@loongson.cn>
> > > ---
> > >  elf/dl-load.c                          |  4 ++++
> > >  sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
> > >  2 files changed, 30 insertions(+)
> > >  create mode 100644 sysdeps/generic/dl-map-segment-align.h
> > >
> > > diff --git a/elf/dl-load.c b/elf/dl-load.c
> > > index 7355eef8e76..f3d943e99c0 100644
> > > --- a/elf/dl-load.c
> > > +++ b/elf/dl-load.c
> > > @@ -71,6 +71,7 @@ struct filebuf
> > >  #include <dl-dst.h>
> > >  #include <dl-load.h>
> > >  #include <dl-map-segments.h>
> > > +#include <dl-map-segment-align.h>
> > >  #include <dl-unmap-segments.h>
> > >  #include <dl-machine-reject-phdr.h>
> > >  #include <dl-prop.h>
> > > @@ -1171,6 +1172,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
> > >
> > >           /* Optimize a common case.  */
> > >           c->prot = pf_to_prot (ph->p_flags);
> > > +
> > > +         /* Architecture-specific adjustment of segment alignment. */
> > > +         p_align_max = _dl_map_segment_align (c, p_align_max);
> > >           break;
> >
> > Can you make THP to work for
> >
> > Program Headers:
> >   Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
> >   PHDR           0x000034 0x00400034 0x00400034 0x001a0 0x001a0 R   0x4
> >   INTERP         0x0001f8 0x004001f8 0x004001f8 0x0001a 0x0001a R   0x1
> >       [Requesting program interpreter: /libx32/ld-linux-x32.so.2]
> >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> >   LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000
> >   DYNAMIC        0x273ec8 0x00674ec8 0x00674ec8 0x00110 0x00110 RW  0x4
> >   NOTE           0x0001d4 0x004001d4 0x004001d4 0x00024 0x00024 R   0x4
> >   NOTE           0x16d0b0 0x0056d0b0 0x0056d0b0 0x00060 0x00060 R   0x4
> >   TLS            0x273ebc 0x00674ebc 0x00674ebc 0x00000 0x0000c R   0x4
> >   GNU_PROPERTY   0x16d0b0 0x0056d0b0 0x0056d0b0 0x00040 0x00040 R   0x4
> >   GNU_EH_FRAME   0x1493e0 0x005493e0 0x005493e0 0x05b94 0x05b94 R   0x4
> >   GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RW  0x10
> >   GNU_RELRO      0x273ebc 0x00674ebc 0x00674ebc 0x00144 0x00144 R   0x1
> >
> > As an option, we can combine
> >
> >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> >
> > into a single RE THP map.
> >
>
> Something like this.
>
>
> --
> H.J.

A better _dl_map_segment_adjust hook.
  
H.J. Lu April 7, 2026, 12:38 a.m. UTC | #4
On Tue, Apr 7, 2026 at 7:09 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Apr 7, 2026 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Apr 6, 2026 at 4:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Sun, Apr 5, 2026 at 11:54 AM WANG Rui <wangrui@loongson.cn> wrote:
> > > >
> > > > Introduce a new helper function, _dl_map_segment_align, to allow
> > > > architecture-specific adjustment of ELF load segment alignment during
> > > > object mapping.
> > > >
> > > > The generic ELF loader now calls this hook when determining the maximum
> > > > segment alignment.  The generic implementation is a no-op and preserves
> > > > existing behavior.
> > > >
> > > > This provides a well-defined extension point for architectures that
> > > > need to adjust segment alignment policies (for example, to improve
> > > > mapping efficiency or enable platform-specific optimizations) without
> > > > embedding such logic directly in the generic loader.
> > > >
> > > > Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> > > > Signed-off-by: WANG Rui <wangrui@loongson.cn>
> > > > ---
> > > >  elf/dl-load.c                          |  4 ++++
> > > >  sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
> > > >  2 files changed, 30 insertions(+)
> > > >  create mode 100644 sysdeps/generic/dl-map-segment-align.h
> > > >
> > > > diff --git a/elf/dl-load.c b/elf/dl-load.c
> > > > index 7355eef8e76..f3d943e99c0 100644
> > > > --- a/elf/dl-load.c
> > > > +++ b/elf/dl-load.c
> > > > @@ -71,6 +71,7 @@ struct filebuf
> > > >  #include <dl-dst.h>
> > > >  #include <dl-load.h>
> > > >  #include <dl-map-segments.h>
> > > > +#include <dl-map-segment-align.h>
> > > >  #include <dl-unmap-segments.h>
> > > >  #include <dl-machine-reject-phdr.h>
> > > >  #include <dl-prop.h>
> > > > @@ -1171,6 +1172,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
> > > >
> > > >           /* Optimize a common case.  */
> > > >           c->prot = pf_to_prot (ph->p_flags);
> > > > +
> > > > +         /* Architecture-specific adjustment of segment alignment. */
> > > > +         p_align_max = _dl_map_segment_align (c, p_align_max);
> > > >           break;
> > >
> > > Can you make THP to work for
> > >
> > > Program Headers:
> > >   Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
> > >   PHDR           0x000034 0x00400034 0x00400034 0x001a0 0x001a0 R   0x4
> > >   INTERP         0x0001f8 0x004001f8 0x004001f8 0x0001a 0x0001a R   0x1
> > >       [Requesting program interpreter: /libx32/ld-linux-x32.so.2]
> > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > >   LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000
> > >   DYNAMIC        0x273ec8 0x00674ec8 0x00674ec8 0x00110 0x00110 RW  0x4
> > >   NOTE           0x0001d4 0x004001d4 0x004001d4 0x00024 0x00024 R   0x4
> > >   NOTE           0x16d0b0 0x0056d0b0 0x0056d0b0 0x00060 0x00060 R   0x4
> > >   TLS            0x273ebc 0x00674ebc 0x00674ebc 0x00000 0x0000c R   0x4
> > >   GNU_PROPERTY   0x16d0b0 0x0056d0b0 0x0056d0b0 0x00040 0x00040 R   0x4
> > >   GNU_EH_FRAME   0x1493e0 0x005493e0 0x005493e0 0x05b94 0x05b94 R   0x4
> > >   GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RW  0x10
> > >   GNU_RELRO      0x273ebc 0x00674ebc 0x00674ebc 0x00144 0x00144 R   0x1
> > >
> > > As an option, we can combine
> > >
> > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > >
> > > into a single RE THP map.
> > >
> >
> > Something like this.
> >
> >
> > --
> > H.J.
>
> A better _dl_map_segment_adjust hook.
>
>

Here is the actual patch on top of yours to implement PT_LOAD
segment merging for THP:

mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x55fe63888000
mmap(NULL, 4271952, PROT_NONE,
MAP_PRIVATE|MAP_ANONYMOUS|MAP_DENYWRITE, -1, 0) = 0x55fe63475000
mmap(0x55fe63600000, 2174800, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0) = 0x55fe63600000
mmap(0x55fe63800000, 24576, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x55fe63800000
mmap(0x55fe63806000, 53072, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x55fe63806000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x55fe63886000
mmap(NULL, 12, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)
= 0x55fe63885000

vs

mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x55d27fbd7000
mmap(NULL, 2174832, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x55d27f800000
mmap(0x55d27f879000, 1601536, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x79000) = 0x55d27f879000
mmap(0x55d27fa00000, 24576, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x55d27fa00000
mmap(0x55d27fa06000, 53104, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x55d27fa06000
mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x55d27fbd5000
mmap(NULL, 12, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)
= 0x55d27fbd4000
  
H.J. Lu April 7, 2026, 1:14 a.m. UTC | #5
On Tue, Apr 7, 2026 at 8:38 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Apr 7, 2026 at 7:09 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Apr 7, 2026 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Apr 6, 2026 at 4:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Sun, Apr 5, 2026 at 11:54 AM WANG Rui <wangrui@loongson.cn> wrote:
> > > > >
> > > > > Introduce a new helper function, _dl_map_segment_align, to allow
> > > > > architecture-specific adjustment of ELF load segment alignment during
> > > > > object mapping.
> > > > >
> > > > > The generic ELF loader now calls this hook when determining the maximum
> > > > > segment alignment.  The generic implementation is a no-op and preserves
> > > > > existing behavior.
> > > > >
> > > > > This provides a well-defined extension point for architectures that
> > > > > need to adjust segment alignment policies (for example, to improve
> > > > > mapping efficiency or enable platform-specific optimizations) without
> > > > > embedding such logic directly in the generic loader.
> > > > >
> > > > > Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> > > > > Signed-off-by: WANG Rui <wangrui@loongson.cn>
> > > > > ---
> > > > >  elf/dl-load.c                          |  4 ++++
> > > > >  sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
> > > > >  2 files changed, 30 insertions(+)
> > > > >  create mode 100644 sysdeps/generic/dl-map-segment-align.h
> > > > >
> > > > > diff --git a/elf/dl-load.c b/elf/dl-load.c
> > > > > index 7355eef8e76..f3d943e99c0 100644
> > > > > --- a/elf/dl-load.c
> > > > > +++ b/elf/dl-load.c
> > > > > @@ -71,6 +71,7 @@ struct filebuf
> > > > >  #include <dl-dst.h>
> > > > >  #include <dl-load.h>
> > > > >  #include <dl-map-segments.h>
> > > > > +#include <dl-map-segment-align.h>
> > > > >  #include <dl-unmap-segments.h>
> > > > >  #include <dl-machine-reject-phdr.h>
> > > > >  #include <dl-prop.h>
> > > > > @@ -1171,6 +1172,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
> > > > >
> > > > >           /* Optimize a common case.  */
> > > > >           c->prot = pf_to_prot (ph->p_flags);
> > > > > +
> > > > > +         /* Architecture-specific adjustment of segment alignment. */
> > > > > +         p_align_max = _dl_map_segment_align (c, p_align_max);
> > > > >           break;
> > > >
> > > > Can you make THP to work for
> > > >
> > > > Program Headers:
> > > >   Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
> > > >   PHDR           0x000034 0x00400034 0x00400034 0x001a0 0x001a0 R   0x4
> > > >   INTERP         0x0001f8 0x004001f8 0x004001f8 0x0001a 0x0001a R   0x1
> > > >       [Requesting program interpreter: /libx32/ld-linux-x32.so.2]
> > > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > > >   LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000
> > > >   DYNAMIC        0x273ec8 0x00674ec8 0x00674ec8 0x00110 0x00110 RW  0x4
> > > >   NOTE           0x0001d4 0x004001d4 0x004001d4 0x00024 0x00024 R   0x4
> > > >   NOTE           0x16d0b0 0x0056d0b0 0x0056d0b0 0x00060 0x00060 R   0x4
> > > >   TLS            0x273ebc 0x00674ebc 0x00674ebc 0x00000 0x0000c R   0x4
> > > >   GNU_PROPERTY   0x16d0b0 0x0056d0b0 0x0056d0b0 0x00040 0x00040 R   0x4
> > > >   GNU_EH_FRAME   0x1493e0 0x005493e0 0x005493e0 0x05b94 0x05b94 R   0x4
> > > >   GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RW  0x10
> > > >   GNU_RELRO      0x273ebc 0x00674ebc 0x00674ebc 0x00144 0x00144 R   0x1
> > > >
> > > > As an option, we can combine
> > > >
> > > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > > >
> > > > into a single RE THP map.
> > > >
> > >
> > > Something like this.
> > >
> > >
> > > --
> > > H.J.
> >
> > A better _dl_map_segment_adjust hook.
> >
> >
>
> Here is the actual patch on top of yours to implement PT_LOAD
> segment merging for THP:
>
> mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
> 0) = 0x55fe63888000
> mmap(NULL, 4271952, PROT_NONE,
> MAP_PRIVATE|MAP_ANONYMOUS|MAP_DENYWRITE, -1, 0) = 0x55fe63475000
> mmap(0x55fe63600000, 2174800, PROT_READ|PROT_EXEC,
> MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0) = 0x55fe63600000
> mmap(0x55fe63800000, 24576, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x55fe63800000
> mmap(0x55fe63806000, 53072, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x55fe63806000
> mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
> 0) = 0x55fe63886000
> mmap(NULL, 12, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)
> = 0x55fe63885000
>
> vs
>
> mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
> 0) = 0x55d27fbd7000
> mmap(NULL, 2174832, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x55d27f800000
> mmap(0x55d27f879000, 1601536, PROT_READ|PROT_EXEC,
> MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x79000) = 0x55d27f879000
> mmap(0x55d27fa00000, 24576, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x55d27fa00000
> mmap(0x55d27fa06000, 53104, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x55d27fa06000
> mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
> 0) = 0x55d27fbd5000
> mmap(NULL, 12, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)
> = 0x55d27fbd4000
>
>

I got

FAIL: elf/tst-valgrind-smoke

on x86-64:

https://sourceware.org/bugzilla/show_bug.cgi?id=34050
  
H.J. Lu April 7, 2026, 6:54 a.m. UTC | #6
On Tue, Apr 7, 2026 at 9:14 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Apr 7, 2026 at 8:38 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Apr 7, 2026 at 7:09 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Tue, Apr 7, 2026 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > On Mon, Apr 6, 2026 at 4:02 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > On Sun, Apr 5, 2026 at 11:54 AM WANG Rui <wangrui@loongson.cn> wrote:
> > > > > >
> > > > > > Introduce a new helper function, _dl_map_segment_align, to allow
> > > > > > architecture-specific adjustment of ELF load segment alignment during
> > > > > > object mapping.
> > > > > >
> > > > > > The generic ELF loader now calls this hook when determining the maximum
> > > > > > segment alignment.  The generic implementation is a no-op and preserves
> > > > > > existing behavior.
> > > > > >
> > > > > > This provides a well-defined extension point for architectures that
> > > > > > need to adjust segment alignment policies (for example, to improve
> > > > > > mapping efficiency or enable platform-specific optimizations) without
> > > > > > embedding such logic directly in the generic loader.
> > > > > >
> > > > > > Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> > > > > > Signed-off-by: WANG Rui <wangrui@loongson.cn>
> > > > > > ---
> > > > > >  elf/dl-load.c                          |  4 ++++
> > > > > >  sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
> > > > > >  2 files changed, 30 insertions(+)
> > > > > >  create mode 100644 sysdeps/generic/dl-map-segment-align.h
> > > > > >
> > > > > > diff --git a/elf/dl-load.c b/elf/dl-load.c
> > > > > > index 7355eef8e76..f3d943e99c0 100644
> > > > > > --- a/elf/dl-load.c
> > > > > > +++ b/elf/dl-load.c
> > > > > > @@ -71,6 +71,7 @@ struct filebuf
> > > > > >  #include <dl-dst.h>
> > > > > >  #include <dl-load.h>
> > > > > >  #include <dl-map-segments.h>
> > > > > > +#include <dl-map-segment-align.h>
> > > > > >  #include <dl-unmap-segments.h>
> > > > > >  #include <dl-machine-reject-phdr.h>
> > > > > >  #include <dl-prop.h>
> > > > > > @@ -1171,6 +1172,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
> > > > > >
> > > > > >           /* Optimize a common case.  */
> > > > > >           c->prot = pf_to_prot (ph->p_flags);
> > > > > > +
> > > > > > +         /* Architecture-specific adjustment of segment alignment. */
> > > > > > +         p_align_max = _dl_map_segment_align (c, p_align_max);
> > > > > >           break;
> > > > >
> > > > > Can you make THP to work for
> > > > >
> > > > > Program Headers:
> > > > >   Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
> > > > >   PHDR           0x000034 0x00400034 0x00400034 0x001a0 0x001a0 R   0x4
> > > > >   INTERP         0x0001f8 0x004001f8 0x004001f8 0x0001a 0x0001a R   0x1
> > > > >       [Requesting program interpreter: /libx32/ld-linux-x32.so.2]
> > > > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > > > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > > > >   LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000
> > > > >   DYNAMIC        0x273ec8 0x00674ec8 0x00674ec8 0x00110 0x00110 RW  0x4
> > > > >   NOTE           0x0001d4 0x004001d4 0x004001d4 0x00024 0x00024 R   0x4
> > > > >   NOTE           0x16d0b0 0x0056d0b0 0x0056d0b0 0x00060 0x00060 R   0x4
> > > > >   TLS            0x273ebc 0x00674ebc 0x00674ebc 0x00000 0x0000c R   0x4
> > > > >   GNU_PROPERTY   0x16d0b0 0x0056d0b0 0x0056d0b0 0x00040 0x00040 R   0x4
> > > > >   GNU_EH_FRAME   0x1493e0 0x005493e0 0x005493e0 0x05b94 0x05b94 R   0x4
> > > > >   GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RW  0x10
> > > > >   GNU_RELRO      0x273ebc 0x00674ebc 0x00674ebc 0x00144 0x00144 R   0x1
> > > > >
> > > > > As an option, we can combine
> > > > >
> > > > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > > > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > > > >
> > > > > into a single RE THP map.
> > > > >
> > > >
> > > > Something like this.
> > > >
> > > >
> > > > --
> > > > H.J.
> > >
> > > A better _dl_map_segment_adjust hook.
> > >
> > >
> >
> > Here is the actual patch on top of yours to implement PT_LOAD
> > segment merging for THP:
> >
> > mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
> > 0) = 0x55fe63888000
> > mmap(NULL, 4271952, PROT_NONE,
> > MAP_PRIVATE|MAP_ANONYMOUS|MAP_DENYWRITE, -1, 0) = 0x55fe63475000
> > mmap(0x55fe63600000, 2174800, PROT_READ|PROT_EXEC,
> > MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0) = 0x55fe63600000
> > mmap(0x55fe63800000, 24576, PROT_READ|PROT_WRITE,
> > MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x55fe63800000
> > mmap(0x55fe63806000, 53072, PROT_READ|PROT_WRITE,
> > MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x55fe63806000
> > mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
> > 0) = 0x55fe63886000
> > mmap(NULL, 12, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)
> > = 0x55fe63885000
> >
> > vs
> >
> > mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
> > 0) = 0x55d27fbd7000
> > mmap(NULL, 2174832, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x55d27f800000
> > mmap(0x55d27f879000, 1601536, PROT_READ|PROT_EXEC,
> > MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x79000) = 0x55d27f879000
> > mmap(0x55d27fa00000, 24576, PROT_READ|PROT_WRITE,
> > MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x200000) = 0x55d27fa00000
> > mmap(0x55d27fa06000, 53104, PROT_READ|PROT_WRITE,
> > MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x55d27fa06000
> > mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
> > 0) = 0x55d27fbd5000
> > mmap(NULL, 12, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0)
> > = 0x55d27fbd4000
> >
> >
>
> I got
>
> FAIL: elf/tst-valgrind-smoke
>
> on x86-64:
>
> https://sourceware.org/bugzilla/show_bug.cgi?id=34050
>
> --
> H.J.

A small fix for the gap check.  The kernel also needs a similar change
to combine 2 PT_LOAD segments.
  
Wilco Dijkstra April 8, 2026, 2:14 p.m. UTC | #7
Hi HJ,

> > > > >
> > > > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > > > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > > > >
> > > > > into a single RE THP map.

So my question is why do we need to do this in both loaders?

Shouldn't the linker just do better here? Eg. it can merge program headers
and related read-only data into the text segment even with -zseparate-code.

Cheers,
Wilco
  
H.J. Lu April 8, 2026, 8:27 p.m. UTC | #8
On Wed, Apr 8, 2026 at 10:15 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi HJ,
>
> > > > > >
> > > > > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > > > > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > > > > >
> > > > > > into a single RE THP map.
>
> So my question is why do we need to do this in both loaders?
>
> Shouldn't the linker just do better here? Eg. it can merge program headers
> and related read-only data into the text segment even with -zseparate-code.

x86 linker on master branch no longer does that after the fix for

https://sourceware.org/bugzilla/show_bug.cgi?id=34003
  
Wilco Dijkstra April 9, 2026, 10:20 a.m. UTC | #9
Hi HJ,

> > > > > > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > > > > > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > > > > > >
> > > > > > > into a single RE THP map.
> >
> > So my question is why do we need to do this in both loaders?
> >
> > Shouldn't the linker just do better here? Eg. it can merge program headers
> > and related read-only data into the text segment even with -zseparate-code.
> 
> x86 linker on master branch no longer does that after the fix for
> 
> https://sourceware.org/bugzilla/show_bug.cgi?id=34003

So x86 now starts with R+X load segment by default?

However I'm still not convinced that what you're trying to do is correct - it seems to me you can't
just decide to merge segments when the user requested -zseparate-code or --rosegment. Those
segments exist for a reason.

The loader could support R, R+X load segments and still align for THP without merging them.

Cheers,
Wilco
  
H.J. Lu April 9, 2026, 11:47 a.m. UTC | #10
On Thu, Apr 9, 2026 at 6:21 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi HJ,
>
> > > > > > > >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> > > > > > > >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> > > > > > > >
> > > > > > > > into a single RE THP map.
> > >
> > > So my question is why do we need to do this in both loaders?
> > >
> > > Shouldn't the linker just do better here? Eg. it can merge program headers
> > > and related read-only data into the text segment even with -zseparate-code.
> >
> > x86 linker on master branch no longer does that after the fix for
> >
> > https://sourceware.org/bugzilla/show_bug.cgi?id=34003
>
> So x86 now starts with R+X load segment by default?

That was before the bug fix.  Now we get:

  LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
  LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
  LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000

> However I'm still not convinced that what you're trying to do is correct - it seems to me you can't
> just decide to merge segments when the user requested -zseparate-code or --rosegment. Those
> segments exist for a reason.

After the bug fix, the x86 linker no longer puts read-only data, including ELF
headers, in the R+X load segment with -z separate-code --rosegment.

> The loader could support R, R+X load segments and still align for THP without merging them.
>

I was told that THP needed to start from the file offset 0.  So

  LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000

doesn't work for THP.

> Cheers,
> Wilco
  
Wilco Dijkstra April 9, 2026, 1:17 p.m. UTC | #11
Hi HJ,

> I was told that THP needed to start from the file offset 0.  So
>
>  LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
>
> doesn't work for THP.

For it to work you'd need to align the file start. Rui's current patch doesn't
do the alignment for this case, but we could generalize it and do it if either
R or R+W load segments are large enough.

But that is different from changing the segments in the loader.

Cheers,
Wilco
  
H.J. Lu April 9, 2026, 8:43 p.m. UTC | #12
On Thu, Apr 9, 2026 at 9:18 PM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi HJ,
>
> > I was told that THP needed to start from the file offset 0.  So
> >
> >  LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> >
> > doesn't work for THP.
>
> For it to work you'd need to align the file start. Rui's current patch doesn't
> do the alignment for this case, but we could generalize it and do it if either

Unless the first R segment is bigger than 2MB,  we need to align from the
file offset 0 anyway.   That is what my proposal does.

> R or R+W load segments are large enough.
>
> But that is different from changing the segments in the loader.
>
> Cheers,
> Wilco
  
Wilco Dijkstra April 9, 2026, 9:04 p.m. UTC | #13
Hi HJ,

> Unless the first R segment is bigger than 2MB,  we need to align from the
> file offset 0 anyway.   That is what my proposal does.

But it doesn't, that's my issue. It would be perfectly fine to generalize Rui's patch
to remove the condition that the R+X segment must start at zero offset and check
any segment before it is contiguous and R.

Merging R with R+W should be done in the linker.

Cheers,
Wilco
  
H.J. Lu April 9, 2026, 9:11 p.m. UTC | #14
On Fri, Apr 10, 2026 at 5:06 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi HJ,
>
> > Unless the first R segment is bigger than 2MB,  we need to align from the
> > file offset 0 anyway.   That is what my proposal does.
>
> But it doesn't, that's my issue. It would be perfectly fine to generalize Rui's patch

What did you mean by "doesn't"?

> to remove the condition that the R+X segment must start at zero offset and check

The easiest way to implement it is to combine R + RX segments in one segment
from the file offset 0.  We only need 1 load, instead of 2.

> any segment before it is contiguous and R.
>
> Merging R with R+W should be done in the linker.
>
> Cheers,
> Wilco
  
Wilco Dijkstra April 9, 2026, 11:18 p.m. UTC | #15
Hi HJ,

>> > Unless the first R segment is bigger than 2MB,  we need to align from the
>> > file offset 0 anyway.   That is what my proposal does.
>>
>> But it doesn't, that's my issue. It would be perfectly fine to generalize Rui's patch
>
> What did you mean by "doesn't"?

It merges load segments which is not the same as aligning.

>> to remove the condition that the R+X segment must start at zero offset and check
>
> The easiest way to implement it is to combine R + RX segments in one segment
> from the file offset 0.  We only need 1 load, instead of 2.

How? If you want to ignore -zseparate-code/--(no-)rosegment, do it in the linker and
warn the user that you're merging sections that they explicitly requested to be separate.

Cheers,
Wilco
  
H.J. Lu April 9, 2026, 11:44 p.m. UTC | #16
On Fri, Apr 10, 2026 at 7:19 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi HJ,
>
> >> > Unless the first R segment is bigger than 2MB,  we need to align from the
> >> > file offset 0 anyway.   That is what my proposal does.
> >>
> >> But it doesn't, that's my issue. It would be perfectly fine to generalize Rui's patch
> >
> > What did you mean by "doesn't"?
>
> It merges load segments which is not the same as aligning.

If the first R segment size is smaller than 2MB, don't you need to
load it from the offset 0 to align the second segment to 2MB?

> >> to remove the condition that the R+X segment must start at zero offset and check
> >
> > The easiest way to implement it is to combine R + RX segments in one segment
> > from the file offset 0.  We only need 1 load, instead of 2.
>
> How? If you want to ignore -zseparate-code/--(no-)rosegment, do it in the linker and
> warn the user that you're merging sections that they explicitly requested to be separate.
>

It goes back to the question of whether THP should be optional or
on by default.
  
WANG Rui April 10, 2026, 3:18 a.m. UTC | #17
On Mon, Apr 6, 2026 at 4:03 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Apr 5, 2026 at 11:54 AM WANG Rui <wangrui@loongson.cn> wrote:
> >
> > Introduce a new helper function, _dl_map_segment_align, to allow
> > architecture-specific adjustment of ELF load segment alignment during
> > object mapping.
> >
> > The generic ELF loader now calls this hook when determining the maximum
> > segment alignment.  The generic implementation is a no-op and preserves
> > existing behavior.
> >
> > This provides a well-defined extension point for architectures that
> > need to adjust segment alignment policies (for example, to improve
> > mapping efficiency or enable platform-specific optimizations) without
> > embedding such logic directly in the generic loader.
> >
> > Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> > Signed-off-by: WANG Rui <wangrui@loongson.cn>
> > ---
> >  elf/dl-load.c                          |  4 ++++
> >  sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
> >  2 files changed, 30 insertions(+)
> >  create mode 100644 sysdeps/generic/dl-map-segment-align.h
> >
> > diff --git a/elf/dl-load.c b/elf/dl-load.c
> > index 7355eef8e76..f3d943e99c0 100644
> > --- a/elf/dl-load.c
> > +++ b/elf/dl-load.c
> > @@ -71,6 +71,7 @@ struct filebuf
> >  #include <dl-dst.h>
> >  #include <dl-load.h>
> >  #include <dl-map-segments.h>
> > +#include <dl-map-segment-align.h>
> >  #include <dl-unmap-segments.h>
> >  #include <dl-machine-reject-phdr.h>
> >  #include <dl-prop.h>
> > @@ -1171,6 +1172,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
> >
> >           /* Optimize a common case.  */
> >           c->prot = pf_to_prot (ph->p_flags);
> > +
> > +         /* Architecture-specific adjustment of segment alignment. */
> > +         p_align_max = _dl_map_segment_align (c, p_align_max);
> >           break;
>
> Can you make THP work for
>
> Program Headers:
>   Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
>   PHDR           0x000034 0x00400034 0x00400034 0x001a0 0x001a0 R   0x4
>   INTERP         0x0001f8 0x004001f8 0x004001f8 0x0001a 0x0001a R   0x1
>       [Requesting program interpreter: /libx32/ld-linux-x32.so.2]
>   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
>   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
>   LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000
>   DYNAMIC        0x273ec8 0x00674ec8 0x00674ec8 0x00110 0x00110 RW  0x4
>   NOTE           0x0001d4 0x004001d4 0x004001d4 0x00024 0x00024 R   0x4
>   NOTE           0x16d0b0 0x0056d0b0 0x0056d0b0 0x00060 0x00060 R   0x4
>   TLS            0x273ebc 0x00674ebc 0x00674ebc 0x00000 0x0000c R   0x4
>   GNU_PROPERTY   0x16d0b0 0x0056d0b0 0x0056d0b0 0x00040 0x00040 R   0x4
>   GNU_EH_FRAME   0x1493e0 0x005493e0 0x005493e0 0x05b94 0x05b94 R   0x4
>   GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RW  0x10
>   GNU_RELRO      0x273ebc 0x00674ebc 0x00674ebc 0x00144 0x00144 R   0x1
>
> As an option, we can combine
>
>   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
>   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
>
> into a single RE THP map.
>

In GNU ld, x86 is the only target where separate-code is enabled by
default. As a result, there is a large body of existing binaries that
were built with this configuration, and their RE LOAD segments are not
2MB-aligned.

The condition for enabling the opt does not require the file offset or
virtual address to be zero, it only requires hugepage alignment. Zero
just happens to satisfy any alignment requirement perfectly. So
technically, merging the first R segment with the following RE segment
could create such an opportunity.

But that would obviously change the loading semantics expressed by the
ELF itself, since we would be merging the access permission of two
segments. At that point, it is no longer just about adjusting
alignment. So the question is whether the loader should or even can do
that.

Thanks,
Rui
  
H.J. Lu April 10, 2026, 4:25 a.m. UTC | #18
On Fri, Apr 10, 2026 at 11:28 AM WANG Rui <wangrui@loongson.cn> wrote:
>
> On Mon, Apr 6, 2026 at 4:03 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Sun, Apr 5, 2026 at 11:54 AM WANG Rui <wangrui@loongson.cn> wrote:
> > >
> > > Introduce a new helper function, _dl_map_segment_align, to allow
> > > architecture-specific adjustment of ELF load segment alignment during
> > > object mapping.
> > >
> > > The generic ELF loader now calls this hook when determining the maximum
> > > segment alignment.  The generic implementation is a no-op and preserves
> > > existing behavior.
> > >
> > > This provides a well-defined extension point for architectures that
> > > need to adjust segment alignment policies (for example, to improve
> > > mapping efficiency or enable platform-specific optimizations) without
> > > embedding such logic directly in the generic loader.
> > >
> > > Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
> > > Signed-off-by: WANG Rui <wangrui@loongson.cn>
> > > ---
> > >  elf/dl-load.c                          |  4 ++++
> > >  sysdeps/generic/dl-map-segment-align.h | 26 ++++++++++++++++++++++++++
> > >  2 files changed, 30 insertions(+)
> > >  create mode 100644 sysdeps/generic/dl-map-segment-align.h
> > >
> > > diff --git a/elf/dl-load.c b/elf/dl-load.c
> > > index 7355eef8e76..f3d943e99c0 100644
> > > --- a/elf/dl-load.c
> > > +++ b/elf/dl-load.c
> > > @@ -71,6 +71,7 @@ struct filebuf
> > >  #include <dl-dst.h>
> > >  #include <dl-load.h>
> > >  #include <dl-map-segments.h>
> > > +#include <dl-map-segment-align.h>
> > >  #include <dl-unmap-segments.h>
> > >  #include <dl-machine-reject-phdr.h>
> > >  #include <dl-prop.h>
> > > @@ -1171,6 +1172,9 @@ _dl_map_object_from_fd (const char *name, const char *origname, int fd,
> > >
> > >           /* Optimize a common case.  */
> > >           c->prot = pf_to_prot (ph->p_flags);
> > > +
> > > +         /* Architecture-specific adjustment of segment alignment. */
> > > +         p_align_max = _dl_map_segment_align (c, p_align_max);
> > >           break;
> >
> > Can you make THP work for
> >
> > Program Headers:
> >   Type           Offset   VirtAddr   PhysAddr   FileSiz MemSiz  Flg Align
> >   PHDR           0x000034 0x00400034 0x00400034 0x001a0 0x001a0 R   0x4
> >   INTERP         0x0001f8 0x004001f8 0x004001f8 0x0001a 0x0001a R   0x1
> >       [Requesting program interpreter: /libx32/ld-linux-x32.so.2]
> >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> >   LOAD           0x273ebc 0x00674ebc 0x00674ebc 0x02540 0x039fc RW  0x1000
> >   DYNAMIC        0x273ec8 0x00674ec8 0x00674ec8 0x00110 0x00110 RW  0x4
> >   NOTE           0x0001d4 0x004001d4 0x004001d4 0x00024 0x00024 R   0x4
> >   NOTE           0x16d0b0 0x0056d0b0 0x0056d0b0 0x00060 0x00060 R   0x4
> >   TLS            0x273ebc 0x00674ebc 0x00674ebc 0x00000 0x0000c R   0x4
> >   GNU_PROPERTY   0x16d0b0 0x0056d0b0 0x0056d0b0 0x00040 0x00040 R   0x4
> >   GNU_EH_FRAME   0x1493e0 0x005493e0 0x005493e0 0x05b94 0x05b94 R   0x4
> >   GNU_STACK      0x000000 0x00000000 0x00000000 0x00000 0x00000 RW  0x10
> >   GNU_RELRO      0x273ebc 0x00674ebc 0x00674ebc 0x00144 0x00144 R   0x1
> >
> > As an option, we can combine
> >
> >   LOAD           0x000000 0x00400000 0x00400000 0x16d110 0x16d110 R   0x1000
> >   LOAD           0x16e000 0x0056e000 0x0056e000 0x1055d9 0x1055d9 R E 0x1000
> >
> > into a single RE THP map.
> >
>
> In GNU ld, x86 is the only target where separate-code is enabled by
> default. As a result, there is a large body of existing binaries that
> were built with this configuration, and their RE LOAD segments are not
> 2MB-aligned.
>
> The condition for enabling the opt does not require the file offset or
> virtual address to be zero, it only requires hugepage alignment. Zero
> just happens to satisfy any alignment requirement perfectly. So
> technically, merging the first R segment with the following RE segment
> could create such an opportunity.
>
> But that would obviously change the loading semantics expressed by the
> ELF itself, since we would be merging the access permission of two
> segments. At that point, it is no longer just about adjusting
> alignment. So the question is whether the loader should or even can do
> that.

This is how a segment is aligned by ld.so.  It is very natural.   In this
case, the alignment padding happens to cover the whole first R segment
completely.   It just kills 2 birds with one stone.
  
Wilco Dijkstra April 10, 2026, 12:37 p.m. UTC | #19
Hi HJ,

> If the first R segment size is smaller than 2MB, don't you need to
> load it from the offset 0 to align the second segment to 2MB?

Alignment only applies to the *start* of the binary. So if you want the second
segment to be aligned to 2MB, you must do that in the linker.

Remember all offsets have been determined during linking and are thus fixed.

>> How? If you want to ignore -zseparate-code/--(no-)rosegment, do it in the linker and
>> warn the user that you're merging sections that they explicitly requested to be separate.
>
> It goes back to the question of whether THP should be optional or
> on by default.

No idea what you mean - THP has been on by default for a decade in some distros.

Cheers,
Wilco
  

Patch

diff --git a/elf/dl-load.c b/elf/dl-load.c
index 7355eef8e76..f3d943e99c0 100644
--- a/elf/dl-load.c
+++ b/elf/dl-load.c
@@ -71,6 +71,7 @@  struct filebuf
 #include <dl-dst.h>
 #include <dl-load.h>
 #include <dl-map-segments.h>
+#include <dl-map-segment-align.h>
 #include <dl-unmap-segments.h>
 #include <dl-machine-reject-phdr.h>
 #include <dl-prop.h>
@@ -1171,6 +1172,9 @@  _dl_map_object_from_fd (const char *name, const char *origname, int fd,
 
 	  /* Optimize a common case.  */
 	  c->prot = pf_to_prot (ph->p_flags);
+
+	  /* Architecture-specific adjustment of segment alignment. */
+	  p_align_max = _dl_map_segment_align (c, p_align_max);
 	  break;
 
 	case PT_TLS:
diff --git a/sysdeps/generic/dl-map-segment-align.h b/sysdeps/generic/dl-map-segment-align.h
new file mode 100644
index 00000000000..f4a671f25f8
--- /dev/null
+++ b/sysdeps/generic/dl-map-segment-align.h
@@ -0,0 +1,26 @@ 
+/* _dl_map_segment_align.  Generic version.
+   Copyright (C) 2026 Free Software Foundation, Inc.
+   Copyright The GNU Toolchain Authors.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dl-load.h>
+
+static inline ElfW(Addr)
+_dl_map_segment_align (const struct loadcmd *c, ElfW(Addr) p_align_max)
+{
+  return p_align_max;
+}