[v2] Mark more functions as __COLD

Message ID 20230518170648.93316-1-bugaevc@gmail.com
State New
Headers
Series [v2] Mark more functions as __COLD |

Checks

Context Check Description
dj/TryBot-apply_patch success Patch applied to master at the time it was sent
dj/TryBot-32bit success Build for i686
redhat-pt-bot/TryBot-still_applies warning Patch no longer applies to master

Commit Message

Sergey Bugaev May 18, 2023, 5:06 p.m. UTC
  The various error reporting functions are unlikely to be called; let the
compiler know about this. This will help GCC optimize both these
functions and their callers better.

In particular, GCC will place the code generated for these functions
into a .text.unlikely section in the object files. The linker will then
group the .text.unlikely sections together inside the .text section of
the resulting binary. This improves code locality.

In some cases the compiler may even decide that it's beneficial to
separate out the code paths in other functions that lead to a call of a
cold function, and also place them into .text.unlikely section. This
works both within glibc, and in any external code as long as it's
compiled against the new headers containing the __COLD annotations, with
a compiler that can understand and make use of them.

Signed-off-by: Sergey Bugaev <bugaevc@gmail.com>
---

Compared to v1:
* got rid of the unintentional _Noreturn vs __attribute__ ((noreturn))
  changes;
* verified that this commit no longer shows up in 'git log -S' output
  for either _Noreturn or noreturn.

 assert/assert.h                       |  4 ++--
 debug/chk_fail.c                      |  2 +-
 elf/dl-exception.c                    |  2 +-
 elf/dl-minimal.c                      |  8 ++++----
 elf/dl-open.c                         |  2 +-
 elf/dl-tls.c                          |  2 +-
 include/assert.h                      |  6 +++---
 include/stdio.h                       |  8 +++++---
 include/sys/cdefs.h                   |  3 +++
 malloc/dynarray.h                     |  2 +-
 malloc/malloc.c                       |  3 ++-
 misc/bits/error.h                     | 12 ++++++------
 stdlib/stdlib.h                       |  2 +-
 sysdeps/generic/dl-call_tls_init_tp.h |  2 +-
 sysdeps/generic/ldsodefs.h            | 23 ++++++++++++-----------
 sysdeps/mach/hurd/dl-sysdep.c         |  2 +-
 16 files changed, 45 insertions(+), 38 deletions(-)
  

Comments

Adhemerval Zanella Netto May 18, 2023, 7:43 p.m. UTC | #1
On 18/05/23 14:06, Sergey Bugaev via Libc-alpha wrote:
> The various error reporting functions are unlikely to be called; let the
> compiler know about this. This will help GCC optimize both these
> functions and their callers better.
> 
> In particular, GCC will place the code generated for these functions
> into a .text.unlikely section in the object files. The linker will then
> group the .text.unlikely sections together inside the .text section of
> the resulting binary. This improves code locality.
> 
> In some cases the compiler may even decide that it's beneficial to
> separate out the code paths in other functions that lead to a call of a
> cold function, and also place them into .text.unlikely section. This
> works both within glibc, and in any external code as long as it's
> compiled against the new headers containing the __COLD annotations, with
> a compiler that can understand and make use of them.
> 
> Signed-off-by: Sergey Bugaev <bugaevc@gmail.com>

The rationale seems ok, some comments below.
> ---
> 
> Compared to v1:
> * got rid of the unintentional _Noreturn vs __attribute__ ((noreturn))
>   changes;
> * verified that this commit no longer shows up in 'git log -S' output
>   for either _Noreturn or noreturn.
> 
>  assert/assert.h                       |  4 ++--
>  debug/chk_fail.c                      |  2 +-
>  elf/dl-exception.c                    |  2 +-
>  elf/dl-minimal.c                      |  8 ++++----
>  elf/dl-open.c                         |  2 +-
>  elf/dl-tls.c                          |  2 +-
>  include/assert.h                      |  6 +++---
>  include/stdio.h                       |  8 +++++---
>  include/sys/cdefs.h                   |  3 +++
>  malloc/dynarray.h                     |  2 +-
>  malloc/malloc.c                       |  3 ++-
>  misc/bits/error.h                     | 12 ++++++------
>  stdlib/stdlib.h                       |  2 +-
>  sysdeps/generic/dl-call_tls_init_tp.h |  2 +-
>  sysdeps/generic/ldsodefs.h            | 23 ++++++++++++-----------
>  sysdeps/mach/hurd/dl-sysdep.c         |  2 +-
>  16 files changed, 45 insertions(+), 38 deletions(-)
> 
> diff --git a/assert/assert.h b/assert/assert.h
> index 62670e4b..1c472e16 100644
> --- a/assert/assert.h
> +++ b/assert/assert.h
> @@ -66,12 +66,12 @@ __BEGIN_DECLS
>  /* This prints an "Assertion failed" message and aborts.  */
>  extern void __assert_fail (const char *__assertion, const char *__file,
>  			   unsigned int __line, const char *__function)
> -     __THROW __attribute__ ((__noreturn__));
> +     __THROW __attribute__ ((__noreturn__)) __COLD;
>  
>  /* Likewise, but prints the error text for ERRNUM.  */
>  extern void __assert_perror_fail (int __errnum, const char *__file,
>  				  unsigned int __line, const char *__function)
> -     __THROW __attribute__ ((__noreturn__));
> +     __THROW __attribute__ ((__noreturn__)) __COLD;
>  
>  
>  /* The following is not at all used here but needed for standard
> diff --git a/debug/chk_fail.c b/debug/chk_fail.c
> index 299e95b6..97037972 100644
> --- a/debug/chk_fail.c
> +++ b/debug/chk_fail.c
> @@ -22,7 +22,7 @@
>  extern char **__libc_argv attribute_hidden;
>  
>  void
> -__attribute__ ((noreturn))
> +__attribute__ ((noreturn)) __COLD
>  __chk_fail (void)
>  {
>    __fortify_fail ("buffer overflow detected");
> diff --git a/elf/dl-exception.c b/elf/dl-exception.c
> index 06a27cd7..1e1309fb 100644
> --- a/elf/dl-exception.c
> +++ b/elf/dl-exception.c
> @@ -52,7 +52,7 @@ oom_exception (struct dl_exception *exception)
>  }
>  
>  static void
> -__attribute__ ((noreturn))
> +__attribute__ ((noreturn)) __COLD
>  length_mismatch (void)
>  {
>    _dl_fatal_printf ("Fatal error: "
> diff --git a/elf/dl-minimal.c b/elf/dl-minimal.c
> index abda453c..dc267519 100644
> --- a/elf/dl-minimal.c
> +++ b/elf/dl-minimal.c
> @@ -156,14 +156,14 @@ __strerror_r (int errnum, char *buf, size_t buflen)
>    return msg;
>  }
>  
> -void
> +void __COLD
>  __libc_fatal (const char *message)
>  {
>    _dl_fatal_printf ("%s", message);
>  }
>  rtld_hidden_def (__libc_fatal)
>  

Can't you just add on the function prototype at include/stdio.h? Same
question for the __assert_fail and __assert_perror_fail below.

> -void
> +void __COLD
>  __attribute__ ((noreturn))
>  __chk_fail (void)
>  {
> @@ -176,7 +176,7 @@ rtld_hidden_def (__chk_fail)
>     If we are linked into the user program (-ldl), the normal __assert_fail
>     defn can override this one.  */
>  
> -void weak_function
> +void weak_function __COLD
>  __assert_fail (const char *assertion,
>  	       const char *file, unsigned int line, const char *function)
>  {
> @@ -190,7 +190,7 @@ Inconsistency detected by ld.so: %s: %u: %s%sAssertion `%s' failed!\n",
>  rtld_hidden_weak (__assert_fail)
>  # endif
>  
> -void weak_function
> +void weak_function __COLD
>  __assert_perror_fail (int errnum,
>  		      const char *file, unsigned int line,
>  		      const char *function)
> diff --git a/elf/dl-open.c b/elf/dl-open.c
> index 2d985e21..47bc5cac 100644
> --- a/elf/dl-open.c
> +++ b/elf/dl-open.c
> @@ -77,7 +77,7 @@ struct dl_open_args
>  };
>  
>  /* Called in case the global scope cannot be extended.  */
> -static void __attribute__ ((noreturn))
> +static void __attribute__ ((noreturn)) __COLD
>  add_to_global_resize_failure (struct link_map *new)
>  {
>    _dl_signal_error (ENOMEM, new->l_libname->name, NULL,
> diff --git a/elf/dl-tls.c b/elf/dl-tls.c
> index 4ef7bc3f..5e198499 100644
> --- a/elf/dl-tls.c
> +++ b/elf/dl-tls.c
> @@ -113,7 +113,7 @@ _dl_tls_static_surplus_init (size_t naudit)
>  
>  /* Out-of-memory handler.  */
>  static void
> -__attribute__ ((__noreturn__))
> +__attribute__ ((__noreturn__)) __COLD
>  oom (void)
>  {
>    _dl_fatal_printf ("cannot allocate memory for thread-local data: ABORT\n");
> diff --git a/include/assert.h b/include/assert.h
> index c812808f..9f0a7f8a 100644
> --- a/include/assert.h
> +++ b/include/assert.h
> @@ -6,19 +6,19 @@
>     so it has to be repeated here.  */
>  extern void __assert_fail (const char *__assertion, const char *__file,
>  			   unsigned int __line, const char *__function)
> -     __THROW __attribute__ ((__noreturn__));
> +     __THROW __attribute__ ((__noreturn__)) __COLD;
>  
>  /* Likewise, but prints the error text for ERRNUM.  */
>  extern void __assert_perror_fail (int __errnum, const char *__file,
>  				  unsigned int __line,
>  				  const char *__function)
> -     __THROW __attribute__ ((__noreturn__));
> +     __THROW __attribute__ ((__noreturn__)) __COLD;
>  
>  /* The real implementation of the two functions above.  */
>  extern void __assert_fail_base (const char *fmt, const char *assertion,
>  				const char *file, unsigned int line,
>  				const char *function)
> -     __THROW  __attribute__ ((__noreturn__)) attribute_hidden;
> +     __THROW  __attribute__ ((__noreturn__)) attribute_hidden __COLD;
>  
>  rtld_hidden_proto (__assert_fail)
>  rtld_hidden_proto (__assert_perror_fail)
> diff --git a/include/stdio.h b/include/stdio.h
> index da47d1ce..7f4c33e2 100644
> --- a/include/stdio.h
> +++ b/include/stdio.h
> @@ -171,9 +171,11 @@ extern int __gen_tempname (char *__tmpl, int __suffixlen, int __flags,
>  /* Print out MESSAGE (which should end with a newline) on the error output
>     and abort.  */
>  extern void __libc_fatal (const char *__message)
> -     __attribute__ ((__noreturn__));
> -_Noreturn void __libc_message (const char *__fnt, ...) attribute_hidden;
> -extern void __fortify_fail (const char *msg) __attribute__ ((__noreturn__));
> +     __attribute__ ((__noreturn__)) __COLD;
> +_Noreturn void __libc_message (const char *__fnt, ...)
> +     attribute_hidden __COLD;
> +extern void __fortify_fail (const char *msg)
> +     __attribute__ ((__noreturn__)) __COLD;
>  libc_hidden_proto (__fortify_fail)
>  
>  /* Acquire ownership of STREAM.  */
> diff --git a/include/sys/cdefs.h b/include/sys/cdefs.h
> index 56adb231..ef78edda 100644
> --- a/include/sys/cdefs.h
> +++ b/include/sys/cdefs.h
> @@ -16,6 +16,9 @@
>  # undef __nonnull
>  # define __nonnull(params)
>  
> +/* Intentionally not marked __COLD in the header, since this only causes GCC
> +   to create a bunch of useless __foo_chk.cold symbols containing only a call
> +   to this function; better just keep calling it directly.  */
>  extern void __chk_fail (void) __attribute__ ((__noreturn__));
>  libc_hidden_proto (__chk_fail)
>  rtld_hidden_proto (__chk_fail)

Why exactly gcc generates the useless __foo_chk.cold for this case? Is this a
bug or a limitation?

> diff --git a/malloc/dynarray.h b/malloc/dynarray.h
> index a2d1fd26..5093cc33 100644
> --- a/malloc/dynarray.h
> +++ b/malloc/dynarray.h
> @@ -165,7 +165,7 @@ bool __libc_dynarray_finalize (struct dynarray_header *list, void *scratch,
>  /* Internal function.  Terminate the process after an index error.
>     SIZE is the number of elements of the dynamic array.  INDEX is the
>     lookup index which triggered the failure.  */
> -_Noreturn void __libc_dynarray_at_failure (size_t size, size_t index);
> +_Noreturn void __libc_dynarray_at_failure (size_t size, size_t index) __COLD;
>  
>  #ifndef _ISOMAC
>  libc_hidden_proto (__libc_dynarray_emplace_enlarge)
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index 5d8b61d6..6f66813a 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -1093,7 +1093,8 @@ static void*  _int_memalign(mstate, size_t, size_t);
>  static void*  _mid_memalign(size_t, size_t, void *);
>  #endif
>  
> -static void malloc_printerr(const char *str) __attribute__ ((noreturn));
> +static void malloc_printerr(const char *str)
> +    __attribute__ ((noreturn)) __COLD;
>  
>  static void munmap_chunk(mchunkptr p);
>  #if HAVE_MREMAP
> diff --git a/misc/bits/error.h b/misc/bits/error.h
> index b3fd5020..46ec0559 100644
> --- a/misc/bits/error.h
> +++ b/misc/bits/error.h
> @@ -24,16 +24,16 @@
>  extern void __REDIRECT (__error_alias, (int __status, int __errnum,
>  					const char *__format, ...),
>  			error)
> -  __attribute__ ((__format__ (__printf__, 3, 4)));
> +  __attribute__ ((__format__ (__printf__, 3, 4))) __COLD;
>  extern void __REDIRECT (__error_noreturn, (int __status, int __errnum,
>  					   const char *__format, ...),
>  			error)
> -  __attribute__ ((__noreturn__, __format__ (__printf__, 3, 4)));
> +  __attribute__ ((__noreturn__, __format__ (__printf__, 3, 4))) __COLD;
>  
>  
>  /* If we know the function will never return make sure the compiler
>     realizes that, too.  */
> -__extern_always_inline void
> +__extern_always_inline __COLD void
>  error (int __status, int __errnum, const char *__format, ...)
>  {
>    if (__builtin_constant_p (__status) && __status != 0)
> @@ -48,19 +48,19 @@ extern void __REDIRECT (__error_at_line_alias, (int __status, int __errnum,
>  						unsigned int __line,
>  						const char *__format, ...),
>  			error_at_line)
> -  __attribute__ ((__format__ (__printf__, 5, 6)));
> +  __attribute__ ((__format__ (__printf__, 5, 6))) __COLD;
>  extern void __REDIRECT (__error_at_line_noreturn, (int __status, int __errnum,
>  						   const char *__fname,
>  						   unsigned int __line,
>  						   const char *__format,
>  						   ...),
>  			error_at_line)
> -  __attribute__ ((__noreturn__, __format__ (__printf__, 5, 6)));
> +  __attribute__ ((__noreturn__, __format__ (__printf__, 5, 6))) __COLD;
>  
>  
>  /* If we know the function will never return make sure the compiler
>     realizes that, too.  */
> -__extern_always_inline void
> +__extern_always_inline __COLD void
>  error_at_line (int __status, int __errnum, const char *__fname,
>  	       unsigned int __line, const char *__format, ...)
>  {
> diff --git a/stdlib/stdlib.h b/stdlib/stdlib.h
> index 631b0cbb..84afc059 100644
> --- a/stdlib/stdlib.h
> +++ b/stdlib/stdlib.h
> @@ -727,7 +727,7 @@ extern void *aligned_alloc (size_t __alignment, size_t __size)
>  #endif
>  
>  /* Abort execution and generate a core-dump.  */
> -extern void abort (void) __THROW __attribute__ ((__noreturn__));
> +extern void abort (void) __THROW __attribute__ ((__noreturn__)) __COLD;
>  
>  
>  /* Register a function to be called when `exit' is called.  */
> diff --git a/sysdeps/generic/dl-call_tls_init_tp.h b/sysdeps/generic/dl-call_tls_init_tp.h
> index 208f91e2..34f0959f 100644
> --- a/sysdeps/generic/dl-call_tls_init_tp.h
> +++ b/sysdeps/generic/dl-call_tls_init_tp.h
> @@ -19,7 +19,7 @@
>  #include <startup.h>
>  #include <tls.h>
>  
> -static inline void
> +static inline void __COLD
>  _startup_fatal_tls_error (void)
>  {
>    _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
> diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
> index ba531762..a4a0d307 100644
> --- a/sysdeps/generic/ldsodefs.h
> +++ b/sysdeps/generic/ldsodefs.h
> @@ -785,12 +785,12 @@ void _dl_printf (const char *fmt, ...)
>  /* Write a message on the specified descriptor standard error.  The
>     parameters are interpreted as for a `printf' call.  */
>  void _dl_error_printf (const char *fmt, ...)
> -  attribute_hidden __attribute__ ((__format__ (__printf__, 1, 2)));
> +  attribute_hidden __attribute__ ((__format__ (__printf__, 1, 2))) __COLD;
>  
>  /* Write a message on the specified descriptor standard error and exit
>     the program.  The parameters are interpreted as for a `printf' call.  */
>  void _dl_fatal_printf (const char *fmt, ...)
> -  __attribute__ ((__format__ (__printf__, 1, 2), __noreturn__));
> +  __attribute__ ((__format__ (__printf__, 1, 2), __noreturn__)) __COLD;
>  rtld_hidden_proto (_dl_fatal_printf)
>  
>  /* An exception raised by the _dl_signal_error function family and
> @@ -813,25 +813,25 @@ struct dl_exception
>     string describing the specific problem.  */
>  void _dl_exception_create (struct dl_exception *, const char *object,
>  			   const char *errstring)
> -  __attribute__ ((nonnull (1, 3)));
> +  __attribute__ ((nonnull (1, 3))) __COLD;
>  rtld_hidden_proto (_dl_exception_create)
>  
>  /* Used internally to implement dlerror message freeing.  See
>     include/dlfcn.h and dlfcn/dlerror.c.  */
> -void _dl_error_free (void *ptr) attribute_hidden;
> +void _dl_error_free (void *ptr) attribute_hidden __COLD;
>  
>  /* Like _dl_exception_create, but create errstring from a format
>     string FMT.  Currently, only "%s" and "%%" are supported as format
>     directives.  */
>  void _dl_exception_create_format (struct dl_exception *, const char *objname,
>  				  const char *fmt, ...)
> -  __attribute__ ((nonnull (1, 3), format (printf, 3, 4)));
> +  __attribute__ ((nonnull (1, 3), format (printf, 3, 4))) __COLD;
>  rtld_hidden_proto (_dl_exception_create_format)
>  
>  /* Deallocate the exception, freeing allocated buffers (if
>     possible).  */
>  void _dl_exception_free (struct dl_exception *)
> -  __attribute__ ((nonnull (1)));
> +  __attribute__ ((nonnull (1))) __COLD;
>  rtld_hidden_proto (_dl_exception_free)
>  
>  /* This function is called by all the internal dynamic linker
> @@ -841,13 +841,13 @@ rtld_hidden_proto (_dl_exception_free)
>     process is terminated immediately.  */
>  void _dl_signal_exception (int errcode, struct dl_exception *,
>  			   const char *occasion)
> -  __attribute__ ((__noreturn__));
> +  __attribute__ ((__noreturn__)) __COLD;
>  rtld_hidden_proto (_dl_signal_exception)
>  
>  /* Like _dl_signal_exception, but creates the exception first.  */
>  extern void _dl_signal_error (int errcode, const char *object,
>  			      const char *occasion, const char *errstring)
> -     __attribute__ ((__noreturn__));
> +     __attribute__ ((__noreturn__)) __COLD;
>  rtld_hidden_proto (_dl_signal_error)
>  
>  /* Like _dl_signal_exception, but may return when called in the
> @@ -856,7 +856,8 @@ rtld_hidden_proto (_dl_signal_error)
>     _dl_signal_exception.  */
>  #if IS_IN (rtld)
>  extern void _dl_signal_cexception (int errcode, struct dl_exception *,
> -				   const char *occasion) attribute_hidden;
> +				   const char *occasion)
> +  attribute_hidden __COLD;
>  #else
>  __attribute__ ((always_inline))
>  static inline void
> @@ -871,7 +872,7 @@ _dl_signal_cexception (int errcode, struct dl_exception *exception,
>  #if IS_IN (rtld)
>  extern void _dl_signal_cerror (int errcode, const char *object,
>  			       const char *occasion, const char *errstring)
> -     attribute_hidden;
> +     attribute_hidden __COLD;
>  #else
>  __attribute__ ((always_inline))
>  static inline void
> @@ -1020,7 +1021,7 @@ extern void _dl_protect_relro (struct link_map *map) attribute_hidden;
>     PLT is nonzero if this was a PLT reloc; it just affects the message.  */
>  extern void _dl_reloc_bad_type (struct link_map *map,
>  				unsigned int type, int plt)
> -     attribute_hidden __attribute__ ((__noreturn__));
> +     attribute_hidden __attribute__ ((__noreturn__)) __COLD;
>  
>  /* Check the version dependencies of all objects available through
>     MAP.  If VERBOSE print some more diagnostics.  */
> diff --git a/sysdeps/mach/hurd/dl-sysdep.c b/sysdeps/mach/hurd/dl-sysdep.c
> index 79ebb0ce..01a16570 100644
> --- a/sysdeps/mach/hurd/dl-sysdep.c
> +++ b/sysdeps/mach/hurd/dl-sysdep.c
> @@ -753,7 +753,7 @@ strong_alias (_exit, __GI__exit)
>  #endif
>  
>  check_no_hidden(abort);
> -void weak_function
> +void weak_function __COLD
>  abort (void)
>  {
>    /* Try to abort using the system specific command.  */
  
Sergey Bugaev May 19, 2023, 10:35 a.m. UTC | #2
On Thu, May 18, 2023 at 10:43 PM Adhemerval Zanella Netto
<adhemerval.zanella@linaro.org> wrote:
> The rationale seems ok, some comments below.

Thanks. Any thoughts on the .text.{startup,exit} part?

> > -void
> > +void __COLD
> >  __libc_fatal (const char *message)
> >  {
> >    _dl_fatal_printf ("%s", message);
> >  }
> >  rtld_hidden_def (__libc_fatal)
> >
>
> Can't you just add on the function prototype at include/stdio.h? Same
> question for the __assert_fail and __assert_perror_fail below.

But I did just that (added __COLD to the prototypes in include/stdio.h
and include/assert.h), didn't I?

If you're saying that it's not worth repeating __COLD on the
definition, then sure, I could remove that if you prefer.

> > +/* Intentionally not marked __COLD in the header, since this only causes GCC
> > +   to create a bunch of useless __foo_chk.cold symbols containing only a call
> > +   to this function; better just keep calling it directly.  */
> >  extern void __chk_fail (void) __attribute__ ((__noreturn__));
> >  libc_hidden_proto (__chk_fail)
> >  rtld_hidden_proto (__chk_fail)
>
> Why exactly gcc generates the useless __foo_chk.cold for this case? Is this a
> bug or a limitation?

I don't know; your guess is as good as mine (actually yours would be
better than mine). But my guess would be that they just didn't think
to add a check that whatever code size savings they're getting by
moving the cold path into a separate section outweigh the jump
instruction to get there.

Here's what I'm getting specifically, on i686-gnu:

Dump of assembler code for function __ppoll_chk:
Address range 0x198760 to 0x19879e:
   0x00198760 <+0>: 56                 push   %esi
   0x00198761 <+1>: 53                 push   %ebx
   0x00198762 <+2>: 83 ec 04           sub    $0x4,%esp
   0x00198765 <+5>: 8b 44 24 20         mov    0x20(%esp),%eax
   0x00198769 <+9>: 8b 54 24 14         mov    0x14(%esp),%edx
   0x0019876d <+13>: 8b 4c 24 10         mov    0x10(%esp),%ecx
   0x00198771 <+17>: 8b 5c 24 18         mov    0x18(%esp),%ebx
   0x00198775 <+21>: c1 e8 03           shr    $0x3,%eax
   0x00198778 <+24>: 8b 74 24 1c         mov    0x1c(%esp),%esi
   0x0019877c <+28>: 39 d0               cmp    %edx,%eax
   0x0019877e <+30>: 0f 82 9d bb e8 ff   jb     0x24321 <__ppoll_chk.cold>
   0x00198784 <+36>: 89 74 24 1c         mov    %esi,0x1c(%esp)
   0x00198788 <+40>: 89 5c 24 18         mov    %ebx,0x18(%esp)
   0x0019878c <+44>: 89 54 24 14         mov    %edx,0x14(%esp)
   0x00198790 <+48>: 89 4c 24 10         mov    %ecx,0x10(%esp)
   0x00198794 <+52>: 83 c4 04           add    $0x4,%esp
   0x00198797 <+55>: 5b                 pop    %ebx
   0x00198798 <+56>: 5e                 pop    %esi
   0x00198799 <+57>: e9 b2 b9 fb ff     jmp    0x154150 <__GI_ppoll>
Address range 0x24321 to 0x24326:
   0x00024321 <-1524799>: e8 5c ff ff ff     call   0x24282 <__GI___chk_fail>
End of assembler dump.

It's spending 6 bytes for the 'jb __ppoll_chk.cold', only to jump to
'call __GI___chk_fail' which takes 5 bytes. That's negative space
savings, both overall and inside .text.

And actually frankly that's bad codegen altogether, unless I'm missing
something. Why not

mov 20(%esp), %eax
shr $3, %eax
cmp 8(%esp), %eax
jnb __GI_ppoll
push %ebp
mov %esp, %ebp
call __GI___chk_fail

Then maybe it'd make sense to move the "push, mov, call" into
.text.unlikely, adding a jmp.

Sergey
  
Adhemerval Zanella Netto May 22, 2023, 8:41 p.m. UTC | #3
On 19/05/23 07:35, Sergey Bugaev wrote:
> On Thu, May 18, 2023 at 10:43 PM Adhemerval Zanella Netto
> <adhemerval.zanella@linaro.org> wrote:
>> The rationale seems ok, some comments below.
> 
> Thanks. Any thoughts on the .text.{startup,exit} part?
> 
>>> -void
>>> +void __COLD
>>>  __libc_fatal (const char *message)
>>>  {
>>>    _dl_fatal_printf ("%s", message);
>>>  }
>>>  rtld_hidden_def (__libc_fatal)
>>>
>>
>> Can't you just add on the function prototype at include/stdio.h? Same
>> question for the __assert_fail and __assert_perror_fail below.
> 
> But I did just that (added __COLD to the prototypes in include/stdio.h
> and include/assert.h), didn't I?
> 
> If you're saying that it's not worth repeating __COLD on the
> definition, then sure, I could remove that if you prefer.

The later, specially because for __chk_fail you do add an specific comment.

> 
>>> +/* Intentionally not marked __COLD in the header, since this only causes GCC
>>> +   to create a bunch of useless __foo_chk.cold symbols containing only a call
>>> +   to this function; better just keep calling it directly.  */
>>>  extern void __chk_fail (void) __attribute__ ((__noreturn__));
>>>  libc_hidden_proto (__chk_fail)
>>>  rtld_hidden_proto (__chk_fail)
>>
>> Why exactly gcc generates the useless __foo_chk.cold for this case? Is this a
>> bug or a limitation?
> 
> I don't know; your guess is as good as mine (actually yours would be
> better than mine). But my guess would be that they just didn't think
> to add a check that whatever code size savings they're getting by
> moving the cold path into a separate section outweigh the jump
> instruction to get there.
> 
> Here's what I'm getting specifically, on i686-gnu:
> 
> Dump of assembler code for function __ppoll_chk:
> Address range 0x198760 to 0x19879e:
>    0x00198760 <+0>: 56                 push   %esi
>    0x00198761 <+1>: 53                 push   %ebx
>    0x00198762 <+2>: 83 ec 04           sub    $0x4,%esp
>    0x00198765 <+5>: 8b 44 24 20         mov    0x20(%esp),%eax
>    0x00198769 <+9>: 8b 54 24 14         mov    0x14(%esp),%edx
>    0x0019876d <+13>: 8b 4c 24 10         mov    0x10(%esp),%ecx
>    0x00198771 <+17>: 8b 5c 24 18         mov    0x18(%esp),%ebx
>    0x00198775 <+21>: c1 e8 03           shr    $0x3,%eax
>    0x00198778 <+24>: 8b 74 24 1c         mov    0x1c(%esp),%esi
>    0x0019877c <+28>: 39 d0               cmp    %edx,%eax
>    0x0019877e <+30>: 0f 82 9d bb e8 ff   jb     0x24321 <__ppoll_chk.cold>
>    0x00198784 <+36>: 89 74 24 1c         mov    %esi,0x1c(%esp)
>    0x00198788 <+40>: 89 5c 24 18         mov    %ebx,0x18(%esp)
>    0x0019878c <+44>: 89 54 24 14         mov    %edx,0x14(%esp)
>    0x00198790 <+48>: 89 4c 24 10         mov    %ecx,0x10(%esp)
>    0x00198794 <+52>: 83 c4 04           add    $0x4,%esp
>    0x00198797 <+55>: 5b                 pop    %ebx
>    0x00198798 <+56>: 5e                 pop    %esi
>    0x00198799 <+57>: e9 b2 b9 fb ff     jmp    0x154150 <__GI_ppoll>
> Address range 0x24321 to 0x24326:
>    0x00024321 <-1524799>: e8 5c ff ff ff     call   0x24282 <__GI___chk_fail>
> End of assembler dump.
> 
> It's spending 6 bytes for the 'jb __ppoll_chk.cold', only to jump to
> 'call __GI___chk_fail' which takes 5 bytes. That's negative space
> savings, both overall and inside .text.

My guess this is arch-specific, since for aarch64-linux I am not seeing
any '.cold' sections being generated:

00000000000f4950 <__ppoll_chk>:
   f4950:       eb440c3f        cmp     x1, x4, lsr #3
   f4954:       54000048        b.hi    f495c <__ppoll_chk+0xc>  // b.pmore
   f4958:       17ffa1a6        b       dcff0 <ppoll>
   f495c:       a9bf7bfd        stp     x29, x30, [sp, #-16]!
   f4960:       910003fd        mov     x29, sp
   f4964:       97fcdae1        bl      2b4e8 <__chk_fail>
   f4968:       d503201f        nop
   f496c:       d503201f        nop

So I don't have a strong opinion about it.  It does seems to generate
better code for x86, although not as much for aarch64:

With this patch:

   text    data     bss     dec     hex filename
1867381  411832   55080 2334293  239e55 x86_64-linux-gnu-patch/libc.so
2147360  129084   39524 2315968  2356c0 i686-linux-gnu-patch/libc.so
1574355  410624   51704 2036683  1f13cb aarch64-linux-gnu-patch/libc.so

With this patch with __COLD for __chk_fail prototype:

   text    data     bss     dec     hex filename
1868824  411832   55080 2335736  23a3f8 x86_64-linux-gnu/libc.so
2149056  129084   39524 2317664  235d60 i686-linux-gnu/libc.so
1574256  410624   51704 2036584  1f1368 aarch64-linux-gnu/libc.so

> 
> And actually frankly that's bad codegen altogether, unless I'm missing
> something. Why not
> 
> mov 20(%esp), %eax
> shr $3, %eax
> cmp 8(%esp), %eax
> jnb __GI_ppoll
> push %ebp
> mov %esp, %ebp
> call __GI___chk_fail
> 
> Then maybe it'd make sense to move the "push, mov, call" into
> .text.unlikely, adding a jmp.

It might be worth to open a bug report on GCC.
  

Patch

diff --git a/assert/assert.h b/assert/assert.h
index 62670e4b..1c472e16 100644
--- a/assert/assert.h
+++ b/assert/assert.h
@@ -66,12 +66,12 @@  __BEGIN_DECLS
 /* This prints an "Assertion failed" message and aborts.  */
 extern void __assert_fail (const char *__assertion, const char *__file,
 			   unsigned int __line, const char *__function)
-     __THROW __attribute__ ((__noreturn__));
+     __THROW __attribute__ ((__noreturn__)) __COLD;
 
 /* Likewise, but prints the error text for ERRNUM.  */
 extern void __assert_perror_fail (int __errnum, const char *__file,
 				  unsigned int __line, const char *__function)
-     __THROW __attribute__ ((__noreturn__));
+     __THROW __attribute__ ((__noreturn__)) __COLD;
 
 
 /* The following is not at all used here but needed for standard
diff --git a/debug/chk_fail.c b/debug/chk_fail.c
index 299e95b6..97037972 100644
--- a/debug/chk_fail.c
+++ b/debug/chk_fail.c
@@ -22,7 +22,7 @@ 
 extern char **__libc_argv attribute_hidden;
 
 void
-__attribute__ ((noreturn))
+__attribute__ ((noreturn)) __COLD
 __chk_fail (void)
 {
   __fortify_fail ("buffer overflow detected");
diff --git a/elf/dl-exception.c b/elf/dl-exception.c
index 06a27cd7..1e1309fb 100644
--- a/elf/dl-exception.c
+++ b/elf/dl-exception.c
@@ -52,7 +52,7 @@  oom_exception (struct dl_exception *exception)
 }
 
 static void
-__attribute__ ((noreturn))
+__attribute__ ((noreturn)) __COLD
 length_mismatch (void)
 {
   _dl_fatal_printf ("Fatal error: "
diff --git a/elf/dl-minimal.c b/elf/dl-minimal.c
index abda453c..dc267519 100644
--- a/elf/dl-minimal.c
+++ b/elf/dl-minimal.c
@@ -156,14 +156,14 @@  __strerror_r (int errnum, char *buf, size_t buflen)
   return msg;
 }
 
-void
+void __COLD
 __libc_fatal (const char *message)
 {
   _dl_fatal_printf ("%s", message);
 }
 rtld_hidden_def (__libc_fatal)
 
-void
+void __COLD
 __attribute__ ((noreturn))
 __chk_fail (void)
 {
@@ -176,7 +176,7 @@  rtld_hidden_def (__chk_fail)
    If we are linked into the user program (-ldl), the normal __assert_fail
    defn can override this one.  */
 
-void weak_function
+void weak_function __COLD
 __assert_fail (const char *assertion,
 	       const char *file, unsigned int line, const char *function)
 {
@@ -190,7 +190,7 @@  Inconsistency detected by ld.so: %s: %u: %s%sAssertion `%s' failed!\n",
 rtld_hidden_weak (__assert_fail)
 # endif
 
-void weak_function
+void weak_function __COLD
 __assert_perror_fail (int errnum,
 		      const char *file, unsigned int line,
 		      const char *function)
diff --git a/elf/dl-open.c b/elf/dl-open.c
index 2d985e21..47bc5cac 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -77,7 +77,7 @@  struct dl_open_args
 };
 
 /* Called in case the global scope cannot be extended.  */
-static void __attribute__ ((noreturn))
+static void __attribute__ ((noreturn)) __COLD
 add_to_global_resize_failure (struct link_map *new)
 {
   _dl_signal_error (ENOMEM, new->l_libname->name, NULL,
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 4ef7bc3f..5e198499 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -113,7 +113,7 @@  _dl_tls_static_surplus_init (size_t naudit)
 
 /* Out-of-memory handler.  */
 static void
-__attribute__ ((__noreturn__))
+__attribute__ ((__noreturn__)) __COLD
 oom (void)
 {
   _dl_fatal_printf ("cannot allocate memory for thread-local data: ABORT\n");
diff --git a/include/assert.h b/include/assert.h
index c812808f..9f0a7f8a 100644
--- a/include/assert.h
+++ b/include/assert.h
@@ -6,19 +6,19 @@ 
    so it has to be repeated here.  */
 extern void __assert_fail (const char *__assertion, const char *__file,
 			   unsigned int __line, const char *__function)
-     __THROW __attribute__ ((__noreturn__));
+     __THROW __attribute__ ((__noreturn__)) __COLD;
 
 /* Likewise, but prints the error text for ERRNUM.  */
 extern void __assert_perror_fail (int __errnum, const char *__file,
 				  unsigned int __line,
 				  const char *__function)
-     __THROW __attribute__ ((__noreturn__));
+     __THROW __attribute__ ((__noreturn__)) __COLD;
 
 /* The real implementation of the two functions above.  */
 extern void __assert_fail_base (const char *fmt, const char *assertion,
 				const char *file, unsigned int line,
 				const char *function)
-     __THROW  __attribute__ ((__noreturn__)) attribute_hidden;
+     __THROW  __attribute__ ((__noreturn__)) attribute_hidden __COLD;
 
 rtld_hidden_proto (__assert_fail)
 rtld_hidden_proto (__assert_perror_fail)
diff --git a/include/stdio.h b/include/stdio.h
index da47d1ce..7f4c33e2 100644
--- a/include/stdio.h
+++ b/include/stdio.h
@@ -171,9 +171,11 @@  extern int __gen_tempname (char *__tmpl, int __suffixlen, int __flags,
 /* Print out MESSAGE (which should end with a newline) on the error output
    and abort.  */
 extern void __libc_fatal (const char *__message)
-     __attribute__ ((__noreturn__));
-_Noreturn void __libc_message (const char *__fnt, ...) attribute_hidden;
-extern void __fortify_fail (const char *msg) __attribute__ ((__noreturn__));
+     __attribute__ ((__noreturn__)) __COLD;
+_Noreturn void __libc_message (const char *__fnt, ...)
+     attribute_hidden __COLD;
+extern void __fortify_fail (const char *msg)
+     __attribute__ ((__noreturn__)) __COLD;
 libc_hidden_proto (__fortify_fail)
 
 /* Acquire ownership of STREAM.  */
diff --git a/include/sys/cdefs.h b/include/sys/cdefs.h
index 56adb231..ef78edda 100644
--- a/include/sys/cdefs.h
+++ b/include/sys/cdefs.h
@@ -16,6 +16,9 @@ 
 # undef __nonnull
 # define __nonnull(params)
 
+/* Intentionally not marked __COLD in the header, since this only causes GCC
+   to create a bunch of useless __foo_chk.cold symbols containing only a call
+   to this function; better just keep calling it directly.  */
 extern void __chk_fail (void) __attribute__ ((__noreturn__));
 libc_hidden_proto (__chk_fail)
 rtld_hidden_proto (__chk_fail)
diff --git a/malloc/dynarray.h b/malloc/dynarray.h
index a2d1fd26..5093cc33 100644
--- a/malloc/dynarray.h
+++ b/malloc/dynarray.h
@@ -165,7 +165,7 @@  bool __libc_dynarray_finalize (struct dynarray_header *list, void *scratch,
 /* Internal function.  Terminate the process after an index error.
    SIZE is the number of elements of the dynamic array.  INDEX is the
    lookup index which triggered the failure.  */
-_Noreturn void __libc_dynarray_at_failure (size_t size, size_t index);
+_Noreturn void __libc_dynarray_at_failure (size_t size, size_t index) __COLD;
 
 #ifndef _ISOMAC
 libc_hidden_proto (__libc_dynarray_emplace_enlarge)
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 5d8b61d6..6f66813a 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -1093,7 +1093,8 @@  static void*  _int_memalign(mstate, size_t, size_t);
 static void*  _mid_memalign(size_t, size_t, void *);
 #endif
 
-static void malloc_printerr(const char *str) __attribute__ ((noreturn));
+static void malloc_printerr(const char *str)
+    __attribute__ ((noreturn)) __COLD;
 
 static void munmap_chunk(mchunkptr p);
 #if HAVE_MREMAP
diff --git a/misc/bits/error.h b/misc/bits/error.h
index b3fd5020..46ec0559 100644
--- a/misc/bits/error.h
+++ b/misc/bits/error.h
@@ -24,16 +24,16 @@ 
 extern void __REDIRECT (__error_alias, (int __status, int __errnum,
 					const char *__format, ...),
 			error)
-  __attribute__ ((__format__ (__printf__, 3, 4)));
+  __attribute__ ((__format__ (__printf__, 3, 4))) __COLD;
 extern void __REDIRECT (__error_noreturn, (int __status, int __errnum,
 					   const char *__format, ...),
 			error)
-  __attribute__ ((__noreturn__, __format__ (__printf__, 3, 4)));
+  __attribute__ ((__noreturn__, __format__ (__printf__, 3, 4))) __COLD;
 
 
 /* If we know the function will never return make sure the compiler
    realizes that, too.  */
-__extern_always_inline void
+__extern_always_inline __COLD void
 error (int __status, int __errnum, const char *__format, ...)
 {
   if (__builtin_constant_p (__status) && __status != 0)
@@ -48,19 +48,19 @@  extern void __REDIRECT (__error_at_line_alias, (int __status, int __errnum,
 						unsigned int __line,
 						const char *__format, ...),
 			error_at_line)
-  __attribute__ ((__format__ (__printf__, 5, 6)));
+  __attribute__ ((__format__ (__printf__, 5, 6))) __COLD;
 extern void __REDIRECT (__error_at_line_noreturn, (int __status, int __errnum,
 						   const char *__fname,
 						   unsigned int __line,
 						   const char *__format,
 						   ...),
 			error_at_line)
-  __attribute__ ((__noreturn__, __format__ (__printf__, 5, 6)));
+  __attribute__ ((__noreturn__, __format__ (__printf__, 5, 6))) __COLD;
 
 
 /* If we know the function will never return make sure the compiler
    realizes that, too.  */
-__extern_always_inline void
+__extern_always_inline __COLD void
 error_at_line (int __status, int __errnum, const char *__fname,
 	       unsigned int __line, const char *__format, ...)
 {
diff --git a/stdlib/stdlib.h b/stdlib/stdlib.h
index 631b0cbb..84afc059 100644
--- a/stdlib/stdlib.h
+++ b/stdlib/stdlib.h
@@ -727,7 +727,7 @@  extern void *aligned_alloc (size_t __alignment, size_t __size)
 #endif
 
 /* Abort execution and generate a core-dump.  */
-extern void abort (void) __THROW __attribute__ ((__noreturn__));
+extern void abort (void) __THROW __attribute__ ((__noreturn__)) __COLD;
 
 
 /* Register a function to be called when `exit' is called.  */
diff --git a/sysdeps/generic/dl-call_tls_init_tp.h b/sysdeps/generic/dl-call_tls_init_tp.h
index 208f91e2..34f0959f 100644
--- a/sysdeps/generic/dl-call_tls_init_tp.h
+++ b/sysdeps/generic/dl-call_tls_init_tp.h
@@ -19,7 +19,7 @@ 
 #include <startup.h>
 #include <tls.h>
 
-static inline void
+static inline void __COLD
 _startup_fatal_tls_error (void)
 {
   _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n");
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index ba531762..a4a0d307 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -785,12 +785,12 @@  void _dl_printf (const char *fmt, ...)
 /* Write a message on the specified descriptor standard error.  The
    parameters are interpreted as for a `printf' call.  */
 void _dl_error_printf (const char *fmt, ...)
-  attribute_hidden __attribute__ ((__format__ (__printf__, 1, 2)));
+  attribute_hidden __attribute__ ((__format__ (__printf__, 1, 2))) __COLD;
 
 /* Write a message on the specified descriptor standard error and exit
    the program.  The parameters are interpreted as for a `printf' call.  */
 void _dl_fatal_printf (const char *fmt, ...)
-  __attribute__ ((__format__ (__printf__, 1, 2), __noreturn__));
+  __attribute__ ((__format__ (__printf__, 1, 2), __noreturn__)) __COLD;
 rtld_hidden_proto (_dl_fatal_printf)
 
 /* An exception raised by the _dl_signal_error function family and
@@ -813,25 +813,25 @@  struct dl_exception
    string describing the specific problem.  */
 void _dl_exception_create (struct dl_exception *, const char *object,
 			   const char *errstring)
-  __attribute__ ((nonnull (1, 3)));
+  __attribute__ ((nonnull (1, 3))) __COLD;
 rtld_hidden_proto (_dl_exception_create)
 
 /* Used internally to implement dlerror message freeing.  See
    include/dlfcn.h and dlfcn/dlerror.c.  */
-void _dl_error_free (void *ptr) attribute_hidden;
+void _dl_error_free (void *ptr) attribute_hidden __COLD;
 
 /* Like _dl_exception_create, but create errstring from a format
    string FMT.  Currently, only "%s" and "%%" are supported as format
    directives.  */
 void _dl_exception_create_format (struct dl_exception *, const char *objname,
 				  const char *fmt, ...)
-  __attribute__ ((nonnull (1, 3), format (printf, 3, 4)));
+  __attribute__ ((nonnull (1, 3), format (printf, 3, 4))) __COLD;
 rtld_hidden_proto (_dl_exception_create_format)
 
 /* Deallocate the exception, freeing allocated buffers (if
    possible).  */
 void _dl_exception_free (struct dl_exception *)
-  __attribute__ ((nonnull (1)));
+  __attribute__ ((nonnull (1))) __COLD;
 rtld_hidden_proto (_dl_exception_free)
 
 /* This function is called by all the internal dynamic linker
@@ -841,13 +841,13 @@  rtld_hidden_proto (_dl_exception_free)
    process is terminated immediately.  */
 void _dl_signal_exception (int errcode, struct dl_exception *,
 			   const char *occasion)
-  __attribute__ ((__noreturn__));
+  __attribute__ ((__noreturn__)) __COLD;
 rtld_hidden_proto (_dl_signal_exception)
 
 /* Like _dl_signal_exception, but creates the exception first.  */
 extern void _dl_signal_error (int errcode, const char *object,
 			      const char *occasion, const char *errstring)
-     __attribute__ ((__noreturn__));
+     __attribute__ ((__noreturn__)) __COLD;
 rtld_hidden_proto (_dl_signal_error)
 
 /* Like _dl_signal_exception, but may return when called in the
@@ -856,7 +856,8 @@  rtld_hidden_proto (_dl_signal_error)
    _dl_signal_exception.  */
 #if IS_IN (rtld)
 extern void _dl_signal_cexception (int errcode, struct dl_exception *,
-				   const char *occasion) attribute_hidden;
+				   const char *occasion)
+  attribute_hidden __COLD;
 #else
 __attribute__ ((always_inline))
 static inline void
@@ -871,7 +872,7 @@  _dl_signal_cexception (int errcode, struct dl_exception *exception,
 #if IS_IN (rtld)
 extern void _dl_signal_cerror (int errcode, const char *object,
 			       const char *occasion, const char *errstring)
-     attribute_hidden;
+     attribute_hidden __COLD;
 #else
 __attribute__ ((always_inline))
 static inline void
@@ -1020,7 +1021,7 @@  extern void _dl_protect_relro (struct link_map *map) attribute_hidden;
    PLT is nonzero if this was a PLT reloc; it just affects the message.  */
 extern void _dl_reloc_bad_type (struct link_map *map,
 				unsigned int type, int plt)
-     attribute_hidden __attribute__ ((__noreturn__));
+     attribute_hidden __attribute__ ((__noreturn__)) __COLD;
 
 /* Check the version dependencies of all objects available through
    MAP.  If VERBOSE print some more diagnostics.  */
diff --git a/sysdeps/mach/hurd/dl-sysdep.c b/sysdeps/mach/hurd/dl-sysdep.c
index 79ebb0ce..01a16570 100644
--- a/sysdeps/mach/hurd/dl-sysdep.c
+++ b/sysdeps/mach/hurd/dl-sysdep.c
@@ -753,7 +753,7 @@  strong_alias (_exit, __GI__exit)
 #endif
 
 check_no_hidden(abort);
-void weak_function
+void weak_function __COLD
 abort (void)
 {
   /* Try to abort using the system specific command.  */