[1/2] Add private_function for private functions within glibc

Message ID 20170617130612.GA14641@gmail.com
State New, archived
Headers

Commit Message

H.J. Lu June 17, 2017, 1:06 p.m. UTC
  Shadow Stack in Intel Control-flow Enforcement Technology (CET) instructions:

https://software.intel.com/sites/default/files/managed/4d/2a/control-flow-enforcement-technology-preview.pdf

doesn't support _dl_runtime_resolve:

        movl (%esp), %ecx
        movl %eax, (%esp)       # Store the function address.
        movl 4(%esp), %eax
        ret $12                 # Jump to function address.

since the shadow stack doesn't match the return stack.  We need to use a
register indirect branch via %ecx.  That means only 2 parameters can be
passed in registers for external function calls with lazy binding.
However, internal_function, which should be used only with hidden
functions, is defined as

 # define internal_function __attribute__ ((regparm (3), stdcall))

and is also used for private function calls between different shared
objects of glibc.  We introduce private_function for that purpose:

 # define private_function __attribute__ ((regparm (2), stdcall))

so that %ecx can be used by _dl_runtime_resolve as a scratch register.

Any comments?

H.J.
---
	[BZ #21598]
	* config.h.in (USE_REGPARMS): Removed.
	(internal_function): Undef.
	(private_function): New.  Undef.
	* debug/fortify_fail.c (__fortify_fail): Replace internal_function
	with private_function.
	* elf/dl-addr.c (_dl_addr): Likewise.
	* elf/dl-error-skeleton.c (_dl_signal_error): Likewise.
	(_dl_catch_error): Likewise.
	* elf/dl-execstack.c (_dl_make_stack_executable): Likewise.
	* elf/dl-load.c (_dl_rtld_di_serinfo): Likewise.
	* elf/dl-open.c (_dl_find_dso_for_object): Likewise.
	* elf/dl-support.c (_dl_make_stack_executable_hook): Likewise.
	* elf/dl-sym.c (_dl_vsym): Likewise.
	(_dl_sym): Likewise.
	* elf/dl-tls.c (_dl_get_tls_static_info): Likewise.
	(_dl_allocate_tls_init): Likewise.
	(_dl_allocate_tls): Likewise.
	(_dl_deallocate_tls): Likewise.
	* grp/grp-merge.c (__copy_grp): Likewise.
	(__merge_grp): Likewise.
	* grp/grp-merge.h (__copy_grp): Likewise.
	(__merge_grp): Likewise.
	* include/dlfcn.h (_dl_addr): Likewise.
	(_dl_sym): Likewise.
	(_dl_vsym): Likewise.
	* include/rpc/pmap_clnt.h (__libc_rpc_getport): Likewise.
	* include/stdio.h (__fortify_fail): Likewise.
	* include/stdlib.h (__strtof_nan): Likewise.
	(__strtod_nan): Likewise.
	(__strtold_nan): Likewise.
	(__wcstof_nan): Likewise.
	(__wcstod_nan): Likewise.
	(__wcstold_nan): Likewise.
	* inet/inet6_scopeid_pton.c (__inet6_scopeid_pton): Likewise.
	* inet/net-internal.h (__inet6_scopeid_pton): Likewise.
	* nptl/allocatestack.c (__make_stacks_executable): Likewise.
	* nptl/libc_pthread_init.c (__libc_pthread_init): Likewise.
	* nptl/pthreadP.h (__make_stacks_executable): Likewise.
	(__libc_pthread_init): Likewise.
	* nss/XXX-lookup.c (DB_LOOKUP_FCT): Likewise.
	(DB_COMPAT_FCT): Likewise.
	* nss/getXXbyYY_r.c (DB_LOOKUP_FCT): Likewise.
	* nss/getXXent_r.c (DB_LOOKUP_FCT): Likewise.
	* nss/nsswitch.h (db_lookup_function): Likewise.
	* resolv/gai_misc.h (__gai_sigqueue): Likewise.
	* resolv/gai_sigqueue.c (__gai_sigqueue): Likewise.
	* stdlib/strtod_nan_main.c (STRTOD_NAN): Likewise.
	* sunrpc/pm_getport.c (__libc_rpc_getport): Likewise.
	* sysdeps/generic/ldsodefs.h (_dl_make_stack_executable_hook):
	Likewise.
	(_dl_make_stack_executable): Likewise.
	(_dl_signal_error): Likewise.
	(_dl_catch_error): Likewise.
	(_dl_rtld_di_serinfo): Likewise.
	(_dl_allocate_tls): Likewise.
	(_dl_get_tls_static_info): Likewise.
	(_dl_allocate_tls_init): Likewise.
	(_dl_deallocate_tls): Likewise.
	(_dl_find_dso_for_object): Likewise.
	* sysdeps/unix/sysv/linux/dl-execstack.c
	(_dl_make_stack_executable): Likewise.
	* sysdeps/unix/sysv/linux/gai_sigqueue.c (__gai_sigqueue):
	Likewise.
	* sysdeps/unix/sysv/linux/netlink_assert_response.c
	(__netlink_assert_response): Likewise.
	* sysdeps/unix/sysv/linux/netlinkaccess.h
	(__netlink_assert_response): Likewise.
	* include/libc-symbols.h (private_function): New.
	* sysdeps/i386/configure.ac (USE_REGPARMS): Removed.
	(internal_function): New AC_DEFINE.
	(private_function): Likewise.
	* sysdeps/i386/configure: Regenerated.
---
 config.h.in                                       | 11 ++++-------
 debug/fortify_fail.c                              |  2 +-
 elf/dl-addr.c                                     |  2 +-
 elf/dl-error-skeleton.c                           |  4 ++--
 elf/dl-execstack.c                                |  2 +-
 elf/dl-load.c                                     |  2 +-
 elf/dl-open.c                                     |  2 +-
 elf/dl-support.c                                  |  2 +-
 elf/dl-sym.c                                      |  4 ++--
 elf/dl-tls.c                                      |  8 ++++----
 grp/grp-merge.c                                   |  4 ++--
 grp/grp-merge.h                                   |  4 ++--
 include/dlfcn.h                                   |  6 +++---
 include/libc-symbols.h                            |  9 +++++++--
 include/rpc/pmap_clnt.h                           |  2 +-
 include/stdio.h                                   |  2 +-
 include/stdlib.h                                  | 14 ++++++++------
 inet/inet6_scopeid_pton.c                         |  2 +-
 inet/net-internal.h                               |  2 +-
 nptl/allocatestack.c                              |  2 +-
 nptl/libc_pthread_init.c                          |  2 +-
 nptl/pthreadP.h                                   |  6 +++---
 nss/XXX-lookup.c                                  |  6 +++---
 nss/getXXbyYY_r.c                                 |  2 +-
 nss/getXXent_r.c                                  |  2 +-
 nss/nsswitch.h                                    |  2 +-
 resolv/gai_misc.h                                 |  2 +-
 resolv/gai_sigqueue.c                             |  2 +-
 stdlib/strtod_nan_main.c                          |  2 +-
 sunrpc/pm_getport.c                               |  2 +-
 sysdeps/generic/ldsodefs.h                        | 20 ++++++++++----------
 sysdeps/i386/configure                            | 12 ++++++++++--
 sysdeps/i386/configure.ac                         |  9 ++++++++-
 sysdeps/unix/sysv/linux/dl-execstack.c            |  2 +-
 sysdeps/unix/sysv/linux/gai_sigqueue.c            |  2 +-
 sysdeps/unix/sysv/linux/netlink_assert_response.c |  2 +-
 sysdeps/unix/sysv/linux/netlinkaccess.h           |  2 +-
 37 files changed, 91 insertions(+), 72 deletions(-)
  

Comments

Florian Weimer June 17, 2017, 1:42 p.m. UTC | #1
On 06/17/2017 03:06 PM, H.J. Lu wrote:
> since shadow stack doesn't match return stack.  We need to use register
> indirect branch via %ecx.  That means only 2 parameters can be passed
> in registers for external function calls with lazy binding.  However,
> internal_function, which should be used only with hidden function, is
> defined as
> 
>  # define internal_function __attribute__ ((regparm (3), stdcall))
> 
> and used with private function calls between different shared objects of
> glibc.  We introduce private_function for such purpose:
> 
>  # define private_function __attribute__ ((regparm (2), stdcall))
> 
> so that %ecx can be used by _dl_runtime_resolve as scratch register.
> 
> Any comments?

I have previously suggested getting rid of internal_function.  For
example, applying it to static functions is generally unnecessary, but
we still do that a lot.  (I know the static linkage case isn't what you
are after here.)

I wonder what the change in code size due to a change from 3 to 2 to 0
register parameters is.

But even if we fix glibc's own use of register parameters, we risk
running into bugs like bug 21265.  I'm not sure if this is worth the
trouble, especially since i386 has to be considered a legacy
architecture at this point.

Thanks,
Florian
  
H.J. Lu June 17, 2017, 1:59 p.m. UTC | #2
On Sat, Jun 17, 2017 at 6:42 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 06/17/2017 03:06 PM, H.J. Lu wrote:
>> since shadow stack doesn't match return stack.  We need to use register
>> indirect branch via %ecx.  That means only 2 parameters can be passed
>> in registers for external function calls with lazy binding.  However,
>> internal_function, which should be used only with hidden function, is
>> defined as
>>
>>  # define internal_function __attribute__ ((regparm (3), stdcall))
>>
>> and used with private function calls between different shared objects of
>> glibc.  We introduce private_function for such purpose:
>>
>>  # define private_function __attribute__ ((regparm (2), stdcall))
>>
>> so that %ecx can be used by _dl_runtime_resolve as scratch register.
>>
>> Any comments?
>
> I have previously suggested to get rid of internal_function.  For
> example, applying it to static functions is generally unnecessary, but
> we still do that a lot.  (I know the static linkage case isn't what you
> are after here.)

I tried, and it didn't work, since some i386 assembly code calls
internal functions directly:

_dl_start_user:\n\
        # Save the user entry point address in %edi.\n\
        movl %eax, %edi\n\
        # Point %ebx at the GOT.\n\
        call 0b\n\
        addl $_GLOBAL_OFFSET_TABLE_, %ebx\n\
        # See if we were run as a command with the executable file\n\
        # name as an extra leading argument.\n\
        movl _dl_skip_args@GOTOFF(%ebx), %eax\n\
        # Pop the original argument count.\n\
        popl %edx\n\
        # Adjust the stack pointer to skip _dl_skip_args words.\n\
        leal (%esp,%eax,4), %esp\n\
        # Subtract _dl_skip_args from argc.\n\
        subl %eax, %edx\n\
        # Push argc back on the stack.\n\
        push %edx\n\
        # The special initializer gets called with the stack just\n\
        # as the application's entry point will see it; it can\n\
        # switch stacks if it moves these contents over.\n\
" RTLD_START_SPECIAL_INIT "\n\
        # Load the parameters again.\n\
        # (eax, edx, ecx, *--esp) = (_dl_loaded, argc, argv, envp)\n\
        movl _rtld_local@GOTOFF(%ebx), %eax\n\
        leal 8(%esp,%edx,4), %esi\n\
        leal 4(%esp), %ecx\n\
        movl %esp, %ebp\n\
        # Make sure _dl_init is run with 16 byte aligned stack.\n\
        andl $-16, %esp\n\
        pushl %eax\n\
        pushl %eax\n\
        pushl %ebp\n\
        pushl %esi\n\
        # Clear %ebp, so that even constructors have terminated backchain.\n\
        xorl %ebp, %ebp\n\
        # Call the function to run the initializers.\n\
        call _dl_init\n\
        # Pass our finalizer function to the user in %edx, as per ELF ABI.\n\
        leal _dl_fini@GOTOFF(%ebx), %edx\n\
        # Restore %esp _start expects.\n\
        movl (%esp), %esp\n\
        # Jump to the user's entry point.\n\
        jmp *%edi\n\
        .previous\n\

Here parameters are passed to _dl_init in registers.  I want to minimize
changes to avoid any potential issues.

> I wonder what the change in code size due to a change from 3 to 2 to 0
> register parameters is.
>
> But even if we fix glibc's own use of register parameters, we risk
> running into bugs like bug 21265.  I'm not sure if this is worth the
> trouble at this point, especially since i386 has to be considered a
> legacy architecture at this point.
>
> Thanks,
> Florian
  
Florian Weimer June 22, 2017, 1:40 p.m. UTC | #3
On 06/17/2017 03:59 PM, H.J. Lu wrote:
> Here parameters are passed to _dl_init in registers.  I want to minimize
> changes to avoid any potential issues.

Well, as a rule of thumb, if we do something that breaks our own code,
it is pretty much guaranteed to wreak havoc across the board (because
our test coverage is somewhat poor).

I see a lot of use of regparm (3).  For example:

$ echo '#include <Qt/qchar.h>' | g++ -m32 -E -x c++ - | grep regparm
    static Category __attribute__((regparm(3))) category(uint ucs4);
    static Category __attribute__((regparm(3))) category(ushort ucs2);
    static Direction __attribute__((regparm(3))) direction(uint ucs4);
    static Direction __attribute__((regparm(3))) direction(ushort ucs2);
    static Joining __attribute__((regparm(3))) joining(uint ucs4);
…

I think these calls actually cross DSO boundaries.

So I really think you should make the use of %ecx conditional on whether
shadow stacks are enabled.

Thanks,
Florian
  
H.J. Lu June 22, 2017, 2:38 p.m. UTC | #4
On Thu, Jun 22, 2017 at 6:40 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 06/17/2017 03:59 PM, H.J. Lu wrote:
>> Here parameters are passed to _dl_init in registers.  I want to minimize
>> changes to avoid any potential issues.
>
> Well, as a rule of thumb, if we do something that breaks our own code,
> it is pretty much guaranteed to wreak havoc across the board (because
> our test coverage is somewhat poor).
>
> I see a lot of use of regparm (3).  For example:
>
> $ echo '#include <Qt/qchar.h>' | g++ -m32 -E -x c++ - | grep regparm
>     static Category __attribute__((regparm(3))) category(uint ucs4);
>     static Category __attribute__((regparm(3))) category(ushort ucs2);
>     static Direction __attribute__((regparm(3))) direction(uint ucs4);
>     static Direction __attribute__((regparm(3))) direction(ushort ucs2);
>     static Joining __attribute__((regparm(3))) joining(uint ucs4);
> …
>
> I think these calls actually cross DSO boundaries.

I don't think so.  See "static ...".

> So I really think you should make the use of %ecx conditional on whether
> shadow stacks are enabled.
>
> Thanks,
> Florian
  
Florian Weimer June 22, 2017, 2:41 p.m. UTC | #5
On 06/22/2017 04:38 PM, H.J. Lu wrote:
> On Thu, Jun 22, 2017 at 6:40 AM, Florian Weimer <fweimer@redhat.com> wrote:
>> On 06/17/2017 03:59 PM, H.J. Lu wrote:
>>> Here parameters are passed to _dl_init in registers.  I want to minimize
>>> changes to avoid any potential issues.
>>
>> Well, as a rule of thumb, if we do something that breaks our own code,
>> it is pretty much guaranteed to wreak havoc across the board (because
>> our test coverage is somewhat poor).
>>
>> I see a lot of use of regparm (3).  For example:
>>
>> $ echo '#include <Qt/qchar.h>' | g++ -m32 -E -x c++ - | grep regparm
>>     static Category __attribute__((regparm(3))) category(uint ucs4);
>>     static Category __attribute__((regparm(3))) category(ushort ucs2);
>>     static Direction __attribute__((regparm(3))) direction(uint ucs4);
>>     static Direction __attribute__((regparm(3))) direction(ushort ucs2);
>>     static Joining __attribute__((regparm(3))) joining(uint ucs4);
>> …
>>
>> I think these calls actually cross DSO boundaries.
> 
> I don't think so.  See "static ...".

It's C++ code, so it just means there's no this pointer.

Florian
  
H.J. Lu June 22, 2017, 5:22 p.m. UTC | #6
On Thu, Jun 22, 2017 at 7:41 AM, Florian Weimer <fweimer@redhat.com> wrote:
> On 06/22/2017 04:38 PM, H.J. Lu wrote:
>> On Thu, Jun 22, 2017 at 6:40 AM, Florian Weimer <fweimer@redhat.com> wrote:
>>> On 06/17/2017 03:59 PM, H.J. Lu wrote:
>>>> Here parameters are passed to _dl_init in registers.  I want to minimize
>>>> changes to avoid any potential issues.
>>>
>>> Well, as a rule of thumb, if we do something that breaks our own code,
>>> it is pretty much guaranteed to wreak havoc across the board (because
>>> our test coverage is somewhat poor).
>>>
>>> I see a lot of use of regparm (3).  For example:
>>>
>>> $ echo '#include <Qt/qchar.h>' | g++ -m32 -E -x c++ - | grep regparm
>>>     static Category __attribute__((regparm(3))) category(uint ucs4);
>>>     static Category __attribute__((regparm(3))) category(ushort ucs2);
>>>     static Direction __attribute__((regparm(3))) direction(uint ucs4);
>>>     static Direction __attribute__((regparm(3))) direction(ushort ucs2);
>>>     static Joining __attribute__((regparm(3))) joining(uint ucs4);
>>> …
>>>
>>> I think these calls actually cross DSO boundaries.
>>
>> I don't think so.  See "static ...".
>
> It's C++ code, so it just means there's no this pointer.
>

The compiler should issue an error if SHSTK is enabled when compiling
this code.  Only regparm (2) or less is allowed on public functions
with SHSTK.
  

Patch

diff --git a/config.h.in b/config.h.in
index 2241857..989554a 100644
--- a/config.h.in
+++ b/config.h.in
@@ -47,10 +47,6 @@ 
 #undef	STACK_PROTECTOR_LEVEL
 #endif
 
-/* Define if the regparm attribute shall be used for local functions
-   (gcc on ix86 only).  */
-#undef	USE_REGPARMS
-
 /* Defined on SPARC if GCC emits GOTDATA relocations.  */
 #undef  HAVE_GCC_GOTDATA
 
@@ -101,9 +97,10 @@ 
 
 
 /* Defined to some form of __attribute__ ((...)) if the compiler supports
-   a different, more efficient calling convention.  */
-#if defined USE_REGPARMS && !defined PROF
-# define internal_function __attribute__ ((regparm (3), stdcall))
+   a different, more efficient calling convention (gcc on ix86 only).  */
+#ifndef PROF
+# undef internal_function
+# undef private_function
 #endif
 
 /* Linux specific: minimum supported kernel version.  */
diff --git a/debug/fortify_fail.c b/debug/fortify_fail.c
index a31977a..c97e962 100644
--- a/debug/fortify_fail.c
+++ b/debug/fortify_fail.c
@@ -22,7 +22,7 @@ 
 extern char **__libc_argv attribute_hidden;
 
 void
-__attribute__ ((noreturn)) internal_function
+__attribute__ ((noreturn)) private_function
 __fortify_fail (const char *msg)
 {
   /* The loop is added only to keep gcc happy.  */
diff --git a/elf/dl-addr.c b/elf/dl-addr.c
index 1fac63d..edd42be 100644
--- a/elf/dl-addr.c
+++ b/elf/dl-addr.c
@@ -121,7 +121,7 @@  determine_info (const ElfW(Addr) addr, struct link_map *match, Dl_info *info,
 
 
 int
-internal_function
+private_function
 _dl_addr (const void *address, Dl_info *info,
 	  struct link_map **mapp, const ElfW(Sym) **symbolp)
 {
diff --git a/elf/dl-error-skeleton.c b/elf/dl-error-skeleton.c
index 8e5888d..7d4ad30 100644
--- a/elf/dl-error-skeleton.c
+++ b/elf/dl-error-skeleton.c
@@ -77,7 +77,7 @@  static receiver_fct receiver;
 #endif /* DL_ERROR_BOOTSTRAP */
 
 void
-internal_function
+private_function
 _dl_signal_error (int errcode, const char *objname, const char *occation,
 		  const char *errstring)
 {
@@ -169,7 +169,7 @@  _dl_signal_cerror (int errcode, const char *objname, const char *occation,
 
 
 int
-internal_function
+private_function
 _dl_catch_error (const char **objname, const char **errstring,
 		 bool *mallocedp, void (*operate) (void *), void *args)
 {
diff --git a/elf/dl-execstack.c b/elf/dl-execstack.c
index 875338b..c83de34 100644
--- a/elf/dl-execstack.c
+++ b/elf/dl-execstack.c
@@ -23,7 +23,7 @@ 
    so as to mprotect it.  */
 
 int
-internal_function
+private_function
 _dl_make_stack_executable (void **stack_endp)
 {
   return ENOSYS;
diff --git a/elf/dl-load.c b/elf/dl-load.c
index c1b6d4b..14cf164 100644
--- a/elf/dl-load.c
+++ b/elf/dl-load.c
@@ -2241,7 +2241,7 @@  add_path (struct add_path_state *p, const struct r_search_path_struct *sps,
 }
 
 void
-internal_function
+private_function
 _dl_rtld_di_serinfo (struct link_map *loader, Dl_serinfo *si, bool counting)
 {
   if (counting)
diff --git a/elf/dl-open.c b/elf/dl-open.c
index cec54db..865a3ef 100644
--- a/elf/dl-open.c
+++ b/elf/dl-open.c
@@ -162,7 +162,7 @@  add_to_global (struct link_map *new)
    address ADDR.  Returns the pointer to the link map of the matching DSO, or
    NULL if a match is not found.  */
 struct link_map *
-internal_function
+private_function
 _dl_find_dso_for_object (const ElfW(Addr) addr)
 {
   struct link_map *l;
diff --git a/elf/dl-support.c b/elf/dl-support.c
index c22be85..b98cc4e 100644
--- a/elf/dl-support.c
+++ b/elf/dl-support.c
@@ -181,7 +181,7 @@  ElfW(Word) _dl_stack_flags = DEFAULT_STACK_PERMS;
 /* If loading a shared object requires that we make the stack executable
    when it was not, we do it by calling this function.
    It returns an errno code or zero on success.  */
-int (*_dl_make_stack_executable_hook) (void **) internal_function
+int (*_dl_make_stack_executable_hook) (void **) private_function
   = _dl_make_stack_executable;
 
 
diff --git a/elf/dl-sym.c b/elf/dl-sym.c
index 7cd6e97..0ad270d 100644
--- a/elf/dl-sym.c
+++ b/elf/dl-sym.c
@@ -250,7 +250,7 @@  RTLD_NEXT used in code not dynamically loaded"));
 
 
 void *
-internal_function
+private_function
 _dl_vsym (void *handle, const char *name, const char *version, void *who)
 {
   struct r_found_version vers;
@@ -267,7 +267,7 @@  _dl_vsym (void *handle, const char *name, const char *version, void *who)
 
 
 void *
-internal_function
+private_function
 _dl_sym (void *handle, const char *name, void *who)
 {
   return do_sym (handle, name, who, NULL, DL_LOOKUP_RETURN_NEWEST);
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 5aba33b..49af79d 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -308,7 +308,7 @@  allocate_dtv (void *result)
 
 /* Get size and alignment requirements of the static TLS block.  */
 void
-internal_function
+private_function
 _dl_get_tls_static_info (size_t *sizep, size_t *alignp)
 {
   *sizep = GL(dl_tls_static_size);
@@ -439,7 +439,7 @@  _dl_resize_dtv (dtv_t *dtv)
 
 
 void *
-internal_function
+private_function
 _dl_allocate_tls_init (void *result)
 {
   if (result == NULL)
@@ -532,7 +532,7 @@  _dl_allocate_tls_init (void *result)
 rtld_hidden_def (_dl_allocate_tls_init)
 
 void *
-internal_function
+private_function
 _dl_allocate_tls (void *mem)
 {
   return _dl_allocate_tls_init (mem == NULL
@@ -543,7 +543,7 @@  rtld_hidden_def (_dl_allocate_tls)
 
 
 void
-internal_function
+private_function
 _dl_deallocate_tls (void *tcb, bool dealloc_tcb)
 {
   dtv_t *dtv = GET_DTV (tcb);
diff --git a/grp/grp-merge.c b/grp/grp-merge.c
index 77c494d..9f1d14c 100644
--- a/grp/grp-merge.c
+++ b/grp/grp-merge.c
@@ -36,7 +36,7 @@ 
   })
 
 int
-internal_function
+private_function
 __copy_grp (const struct group srcgrp, const size_t buflen,
 	    struct group *destgrp, char *destbuf, char **endptr)
 {
@@ -109,7 +109,7 @@  libc_hidden_def (__copy_grp)
 /* Check that the name, GID and passwd fields match, then
    copy in the gr_mem array.  */
 int
-internal_function
+private_function
 __merge_grp (struct group *savedgrp, char *savedbuf, char *savedend,
 	     size_t buflen, struct group *mergegrp, char *mergebuf)
 {
diff --git a/grp/grp-merge.h b/grp/grp-merge.h
index 1ad9b9a..180beff 100644
--- a/grp/grp-merge.h
+++ b/grp/grp-merge.h
@@ -26,12 +26,12 @@ 
 int
 __copy_grp (const struct group srcgrp, const size_t buflen,
 	    struct group *destgrp, char *destbuf, char **endptr)
-	    internal_function;
+	    private_function;
 
 /* Merge the member lists of two grp structs together.  */
 int
 __merge_grp (struct group *savedgrp, char *savedbuf, char *savedend,
 	     size_t buflen, struct group *mergegrp, char *mergebuf)
-	     internal_function;
+	     private_function;
 
 #endif /* _GRP_MERGE_H */
diff --git a/include/dlfcn.h b/include/dlfcn.h
index 2524292..8fcd172 100644
--- a/include/dlfcn.h
+++ b/include/dlfcn.h
@@ -44,7 +44,7 @@  libc_hidden_proto (__libc_dlclose)
 #ifdef ElfW
 extern int _dl_addr (const void *address, Dl_info *info,
 		     struct link_map **mapp, const ElfW(Sym) **symbolp)
-     internal_function;
+     private_function;
 libc_hidden_proto (_dl_addr)
 #endif
 
@@ -61,7 +61,7 @@  extern void _dl_close_worker (struct link_map *map, bool force)
    RTLD_NEXT).  WHO is the calling function, for RTLD_NEXT.  Returns
    the symbol value, which may be NULL.  */
 extern void *_dl_sym (void *handle, const char *name, void *who)
-    internal_function;
+    private_function;
 
 /* Look up version VERSION of symbol NAME in shared object HANDLE
    (which may be RTLD_DEFAULT or RTLD_NEXT).  WHO is the calling
@@ -69,7 +69,7 @@  extern void *_dl_sym (void *handle, const char *name, void *who)
    NULL.  */
 extern void *_dl_vsym (void *handle, const char *name, const char *version,
 		       void *who)
-    internal_function;
+    private_function;
 
 /* Helper function for <dlfcn.h> functions.  Runs the OPERATE function via
    _dl_catch_error.  Returns zero for success, nonzero for failure; and
diff --git a/include/libc-symbols.h b/include/libc-symbols.h
index 3310e3a..961cbd6 100644
--- a/include/libc-symbols.h
+++ b/include/libc-symbols.h
@@ -188,12 +188,17 @@ 
 #endif /* __ASSEMBLER__ */
 
 /* On some platforms we can make internal function calls (i.e., calls of
-   functions not exported) a bit faster by using a different calling
-   convention.  */
+   functions within the same shared object) or private function calls
+   (i.e., calls of functions between different shared objects of glibc)
+   a bit faster by using a different calling convention.  */
 #ifndef internal_function
 # define internal_function	/* empty */
 #endif
 
+#ifndef private_function
+# define private_function	/* empty */
+#endif
+
 /* Determine the return address.  */
 #define RETURN_ADDRESS(nr) \
   __builtin_extract_return_addr (__builtin_return_address (nr))
diff --git a/include/rpc/pmap_clnt.h b/include/rpc/pmap_clnt.h
index ec907c2..6a60d44 100644
--- a/include/rpc/pmap_clnt.h
+++ b/include/rpc/pmap_clnt.h
@@ -13,7 +13,7 @@  extern int __get_socket (struct sockaddr_in *saddr)
 extern u_short __libc_rpc_getport (struct sockaddr_in *address, u_long program,
 				   u_long version, u_int protocol,
 				   time_t timeout_sec, time_t tottimeout_sec)
-     internal_function;
+     private_function;
 libc_hidden_proto (__libc_rpc_getport)
 
 libc_hidden_proto (clnt_broadcast)
diff --git a/include/stdio.h b/include/stdio.h
index f68f633..b7d1168 100644
--- a/include/stdio.h
+++ b/include/stdio.h
@@ -91,7 +91,7 @@  extern void __libc_fatal (const char *__message)
      __attribute__ ((__noreturn__));
 extern void __libc_message (int do_abort, const char *__fnt, ...);
 extern void __fortify_fail (const char *msg)
-     __attribute__ ((__noreturn__)) internal_function;
+     __attribute__ ((__noreturn__)) private_function;
 libc_hidden_proto (__fortify_fail)
 
 /* Acquire ownership of STREAM.  */
diff --git a/include/stdlib.h b/include/stdlib.h
index cae9f2c..a82c2ea 100644
--- a/include/stdlib.h
+++ b/include/stdlib.h
@@ -201,16 +201,18 @@  libc_hidden_proto (strtoll)
 libc_hidden_proto (strtoul)
 libc_hidden_proto (strtoull)
 
-extern float __strtof_nan (const char *, char **, char) internal_function;
-extern double __strtod_nan (const char *, char **, char) internal_function;
+extern float __strtof_nan (const char *, char **, char)
+     private_function;
+extern double __strtod_nan (const char *, char **, char)
+     private_function;
 extern long double __strtold_nan (const char *, char **, char)
-     internal_function;
+     private_function;
 extern float __wcstof_nan (const wchar_t *, wchar_t **, wchar_t)
-     internal_function;
+     private_function;
 extern double __wcstod_nan (const wchar_t *, wchar_t **, wchar_t)
-     internal_function;
+     private_function;
 extern long double __wcstold_nan (const wchar_t *, wchar_t **, wchar_t)
-     internal_function;
+     private_function;
 
 libc_hidden_proto (__strtof_nan)
 libc_hidden_proto (__strtod_nan)
diff --git a/inet/inet6_scopeid_pton.c b/inet/inet6_scopeid_pton.c
index f842ffc..8e6b20c 100644
--- a/inet/inet6_scopeid_pton.c
+++ b/inet/inet6_scopeid_pton.c
@@ -28,7 +28,7 @@ 
 
 /* Parse SOURCE as a scope ID for ADDRESS.  Return 0 on success and -1
    on error.  */
-internal_function int
+private_function int
 __inet6_scopeid_pton (const struct in6_addr *address, const char *scope,
                       uint32_t *result)
 {
diff --git a/inet/net-internal.h b/inet/net-internal.h
index 2b2632c..501c211 100644
--- a/inet/net-internal.h
+++ b/inet/net-internal.h
@@ -26,7 +26,7 @@ 
 
 int __inet6_scopeid_pton (const struct in6_addr *address,
                           const char *scope, uint32_t *result)
-  internal_function attribute_hidden;
+  private_function attribute_hidden;
 libc_hidden_proto (__inet6_scopeid_pton)
 
 
diff --git a/nptl/allocatestack.c b/nptl/allocatestack.c
index 8364406..7f7a291 100644
--- a/nptl/allocatestack.c
+++ b/nptl/allocatestack.c
@@ -778,7 +778,7 @@  __deallocate_stack (struct pthread *pd)
 
 
 int
-internal_function
+private_function
 __make_stacks_executable (void **stack_endp)
 {
   /* First the main thread's stack.  */
diff --git a/nptl/libc_pthread_init.c b/nptl/libc_pthread_init.c
index 0db7a10..5c00161 100644
--- a/nptl/libc_pthread_init.c
+++ b/nptl/libc_pthread_init.c
@@ -38,7 +38,7 @@  extern int __libc_multiple_threads attribute_hidden;
 
 int *
 #endif
-internal_function
+private_function
 __libc_pthread_init (unsigned long int *ptr, void (*reclaim) (void),
 		     const struct pthread_functions *functions)
 {
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index 7fc1e50..0130e15 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -357,7 +357,7 @@  extern void __reclaim_stacks (void) attribute_hidden;
 
 /* Make all threads's stacks executable.  */
 extern int __make_stacks_executable (void **stack_endp)
-     internal_function attribute_hidden;
+     private_function attribute_hidden;
 
 /* longjmp handling.  */
 extern void __pthread_cleanup_upto (__jmp_buf target, char *targetframe);
@@ -388,12 +388,12 @@  hidden_proto (__nptl_death_event)
 extern void __libc_pthread_init (unsigned long int *ptr,
 				 void (*reclaim) (void),
 				 const struct pthread_functions *functions)
-     internal_function;
+     private_function;
 #else
 extern int *__libc_pthread_init (unsigned long int *ptr,
 				 void (*reclaim) (void),
 				 const struct pthread_functions *functions)
-     internal_function;
+     private_function;
 
 /* Variable set to a nonzero value either if more than one thread runs or ran,
    or if a single-threaded process is trying to cancel itself.  See
diff --git a/nss/XXX-lookup.c b/nss/XXX-lookup.c
index 5a37fda..84c3f52 100644
--- a/nss/XXX-lookup.c
+++ b/nss/XXX-lookup.c
@@ -57,11 +57,11 @@  service_user *DATABASE_NAME_SYMBOL attribute_hidden;
 
 extern int DB_LOOKUP_FCT (service_user **ni, const char *fct_name,
 			  const char *fct2_name, void **fctp)
-  internal_function;
+  private_function;
 libc_hidden_proto (DB_LOOKUP_FCT)
 
 int
-internal_function
+private_function
 DB_LOOKUP_FCT (service_user **ni, const char *fct_name, const char *fct2_name,
 	       void **fctp)
 {
@@ -79,7 +79,7 @@  libc_hidden_def (DB_LOOKUP_FCT)
 
 #ifndef NO_COMPAT
 int
-internal_function attribute_compat_text_section
+private_function attribute_compat_text_section
 DB_COMPAT_FCT (service_user **ni, const char *fct_name, void **fctp)
 {
   return DB_LOOKUP_FCT (ni, fct_name, NULL, fctp);
diff --git a/nss/getXXbyYY_r.c b/nss/getXXbyYY_r.c
index 5962475..1f21b15 100644
--- a/nss/getXXbyYY_r.c
+++ b/nss/getXXbyYY_r.c
@@ -185,7 +185,7 @@  typedef enum nss_status (*lookup_function) (ADD_PARAMS, LOOKUP_TYPE *, char *,
 /* The lookup function for the first entry of this service.  */
 extern int DB_LOOKUP_FCT (service_user **nip, const char *name,
 			  const char *name2, void **fctp)
-     internal_function;
+     private_function;
 libc_hidden_proto (DB_LOOKUP_FCT)
 
 
diff --git a/nss/getXXent_r.c b/nss/getXXent_r.c
index 2710c1c..9066d47 100644
--- a/nss/getXXent_r.c
+++ b/nss/getXXent_r.c
@@ -114,7 +114,7 @@  __libc_lock_define_initialized (static, lock)
 /* The lookup function for the first entry of this service.  */
 extern int DB_LOOKUP_FCT (service_user **nip, const char *name,
 			  const char *name2, void **fctp)
-     internal_function;
+     private_function;
 libc_hidden_proto (DB_LOOKUP_FCT)
 
 void
diff --git a/nss/nsswitch.h b/nss/nsswitch.h
index f3e756b..3ff3596 100644
--- a/nss/nsswitch.h
+++ b/nss/nsswitch.h
@@ -169,7 +169,7 @@  extern void __nss_disable_nscd (void (*) (size_t, struct traced_file *));
 
 typedef int (*db_lookup_function) (service_user **, const char *, const char *,
 				   void **)
-     internal_function;
+     private_function;
 typedef enum nss_status (*setent_function) (int);
 typedef enum nss_status (*endent_function) (void);
 typedef enum nss_status (*getent_function) (void *, char *, size_t,
diff --git a/resolv/gai_misc.h b/resolv/gai_misc.h
index 6679d2b..6f686c0 100644
--- a/resolv/gai_misc.h
+++ b/resolv/gai_misc.h
@@ -96,6 +96,6 @@  extern int __gai_notify_only (struct sigevent *sigev, pid_t caller_pid)
 
 /* Send the signal.  */
 extern int __gai_sigqueue (int sig, const union sigval val, pid_t caller_pid)
-     internal_function;
+     private_function;
 
 #endif /* gai_misc.h */
diff --git a/resolv/gai_sigqueue.c b/resolv/gai_sigqueue.c
index ebc1e5e..ddec42c 100644
--- a/resolv/gai_sigqueue.c
+++ b/resolv/gai_sigqueue.c
@@ -22,7 +22,7 @@ 
 #include <gai_misc.h>
 
 int
-internal_function
+private_function
 __gai_sigqueue (int sig, const union sigval val, pid_t caller_pid)
 {
   __set_errno (ENOSYS);
diff --git a/stdlib/strtod_nan_main.c b/stdlib/strtod_nan_main.c
index 96b788c..f002abe 100644
--- a/stdlib/strtod_nan_main.c
+++ b/stdlib/strtod_nan_main.c
@@ -29,7 +29,7 @@ 
    return a default NAN.  If ENDPTR is not NULL, set *ENDPTR to point
    to the character after the initial n-char-sequence.  */
 
-internal_function
+private_function
 FLOAT
 STRTOD_NAN (const STRING_TYPE *str, STRING_TYPE **endptr, STRING_TYPE endc)
 {
diff --git a/sunrpc/pm_getport.c b/sunrpc/pm_getport.c
index 54d2e43..0883abc 100644
--- a/sunrpc/pm_getport.c
+++ b/sunrpc/pm_getport.c
@@ -82,7 +82,7 @@  __get_socket (struct sockaddr_in *saddr)
  * Returns 0 if no map exists.
  */
 u_short
-internal_function
+private_function
 __libc_rpc_getport (struct sockaddr_in *address, u_long program,
 		    u_long version, u_int protocol, time_t timeout_sec,
 		    time_t tottimeout_sec)
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index 4508365..3c7ebd1 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -377,7 +377,7 @@  struct rtld_global
   /* If loading a shared object requires that we make the stack executable
      when it was not, we do it by calling this function.
      It returns an errno code or zero on success.  */
-  EXTERN int (*_dl_make_stack_executable_hook) (void **) internal_function;
+  EXTERN int (*_dl_make_stack_executable_hook) (void **) private_function;
 
   /* Prevailing state of the stack, PF_X indicating it's executable.  */
   EXTERN ElfW(Word) _dl_stack_flags;
@@ -632,7 +632,7 @@  extern size_t _dl_phnum;
 
 /* This is the initial value of GL(dl_make_stack_executable_hook).
    A threads library can change it.  */
-extern int _dl_make_stack_executable (void **stack_endp) internal_function;
+extern int _dl_make_stack_executable (void **stack_endp) private_function;
 rtld_hidden_proto (_dl_make_stack_executable)
 
 /* Variable pointing to the end of the stack (or close to it).  This value
@@ -739,7 +739,7 @@  _dl_dprintf (int fd, const char *fmt, ...)
    problem.  */
 extern void _dl_signal_error (int errcode, const char *object,
 			      const char *occurred, const char *errstring)
-     internal_function __attribute__ ((__noreturn__));
+     private_function __attribute__ ((__noreturn__));
 libc_hidden_proto (_dl_signal_error)
 
 /* Like _dl_signal_error, but may return when called in the context of
@@ -779,7 +779,7 @@  extern void _dl_receive_error (receiver_fct fct, void (*operate) (void *),
 extern int _dl_catch_error (const char **objname, const char **errstring,
 			    bool *mallocedp, void (*operate) (void *),
 			    void *args)
-     internal_function;
+     private_function;
 libc_hidden_proto (_dl_catch_error)
 
 /* Open the shared object NAME and map in its segments.
@@ -813,7 +813,7 @@  extern void _dl_setup_hash (struct link_map *map)
    bytes to be used in filling in the result.  */
 extern void _dl_rtld_di_serinfo (struct link_map *loader,
 				 Dl_serinfo *si, bool counting)
-     internal_function;
+     private_function;
 
 
 /* Search loaded objects' symbol tables for a definition of the symbol
@@ -1012,12 +1012,12 @@  void __pthread_initialize_minimal (void) weak_function;
 #endif
 
 /* Allocate memory for static TLS block (unless MEM is nonzero) and dtv.  */
-extern void *_dl_allocate_tls (void *mem) internal_function;
+extern void *_dl_allocate_tls (void *mem) private_function;
 rtld_hidden_proto (_dl_allocate_tls)
 
 /* Get size and alignment requirements of the static TLS block.  */
 extern void _dl_get_tls_static_info (size_t *sizep, size_t *alignp)
-     internal_function;
+     private_function;
 
 extern void _dl_allocate_static_tls (struct link_map *map)
      internal_function attribute_hidden;
@@ -1026,11 +1026,11 @@  extern void _dl_allocate_static_tls (struct link_map *map)
    only used within rtld.c itself at startup time.  */
 extern void *_dl_allocate_tls_storage (void)
      internal_function attribute_hidden;
-extern void *_dl_allocate_tls_init (void *) internal_function;
+extern void *_dl_allocate_tls_init (void *) private_function;
 rtld_hidden_proto (_dl_allocate_tls_init)
 
 /* Deallocate memory allocated with _dl_allocate_tls.  */
-extern void _dl_deallocate_tls (void *tcb, bool dealloc_tcb) internal_function;
+extern void _dl_deallocate_tls (void *tcb, bool dealloc_tcb) private_function;
 rtld_hidden_proto (_dl_deallocate_tls)
 
 extern void _dl_nothread_init_static_tls (struct link_map *) attribute_hidden;
@@ -1081,7 +1081,7 @@  extern void _dl_show_scope (struct link_map *new, int from)
      attribute_hidden;
 
 extern struct link_map *_dl_find_dso_for_object (const ElfW(Addr) addr)
-     internal_function;
+     private_function;
 rtld_hidden_proto (_dl_find_dso_for_object)
 
 /* Initialization which is normally done by the dynamic linker.  */
diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index 5b55c5a..bd45aa6 100644
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -26,7 +26,7 @@  libc_compiler_builtin_inlined=no
 cat > conftest.c <<EOF
 int _start (void) { int a, b, c; __sync_val_compare_and_swap (&a, b, c); return 0; }
 EOF
-if ! { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS
+if ! { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS $no_ssp
 		     -O0 -nostdlib -nostartfiles
 		     -S conftest.c -o - | fgrep "__sync_val_compare_and_swap"
 		     1>&5'
@@ -77,7 +77,15 @@  if test $libc_cv_asm_mpx = yes; then
 
 fi
 
-$as_echo "#define USE_REGPARMS 1" >>confdefs.h
+# On i386, we can make internal function calls (i.e., calls of functions
+# within the same shared object) or private function calls (i.e., calls
+# of functions between different shared objects of glibc) a bit faster
+# by passing function parameters in registers.  We can only pass 2
+# parameters in registers for private function calls since one register
+# (%ecx) is used by _dl_runtime_resolve as a scratch register.
+$as_echo "#define internal_function __attribute__ ((regparm (3), stdcall))" >>confdefs.h
+
+$as_echo "#define private_function __attribute__ ((regparm (2), stdcall))" >>confdefs.h
 
 
 $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
diff --git a/sysdeps/i386/configure.ac b/sysdeps/i386/configure.ac
index 19ef33f..738935a 100644
--- a/sysdeps/i386/configure.ac
+++ b/sysdeps/i386/configure.ac
@@ -45,7 +45,14 @@  if test $libc_cv_asm_mpx = yes; then
   AC_DEFINE(HAVE_MPX_SUPPORT)
 fi
 
-AC_DEFINE(USE_REGPARMS)
+# On i386, we can make internal function calls (i.e., calls of functions
+# within the same shared object) or private function calls (i.e., calls
+# of functions between different shared objects of glibc) a bit faster
+# by passing function parameters in registers.  We can only pass 2
+# parameters in registers for private function calls since one register
+# (%ecx) is used by _dl_runtime_resolve as a scratch register.
+AC_DEFINE(internal_function, __attribute__ ((regparm (3), stdcall)))
+AC_DEFINE(private_function, __attribute__ ((regparm (2), stdcall)))
 
 dnl It is always possible to access static and hidden symbols in an
 dnl position independent way.
diff --git a/sysdeps/unix/sysv/linux/dl-execstack.c b/sysdeps/unix/sysv/linux/dl-execstack.c
index 3c4de1c..913af76 100644
--- a/sysdeps/unix/sysv/linux/dl-execstack.c
+++ b/sysdeps/unix/sysv/linux/dl-execstack.c
@@ -30,7 +30,7 @@  extern int __stack_prot attribute_relro attribute_hidden;
 
 
 int
-internal_function
+private_function
 _dl_make_stack_executable (void **stack_endp)
 {
   /* This gives us the highest/lowest page that needs to be changed.  */
diff --git a/sysdeps/unix/sysv/linux/gai_sigqueue.c b/sysdeps/unix/sysv/linux/gai_sigqueue.c
index 0f7b459..04c8d50 100644
--- a/sysdeps/unix/sysv/linux/gai_sigqueue.c
+++ b/sysdeps/unix/sysv/linux/gai_sigqueue.c
@@ -30,7 +30,7 @@ 
 
 /* Return any pending signal or wait for one for the given time.  */
 int
-internal_function
+private_function
 __gai_sigqueue (int sig, const union sigval val, pid_t caller_pid)
 {
   siginfo_t info;
diff --git a/sysdeps/unix/sysv/linux/netlink_assert_response.c b/sysdeps/unix/sysv/linux/netlink_assert_response.c
index d60eb15..963c879 100644
--- a/sysdeps/unix/sysv/linux/netlink_assert_response.c
+++ b/sysdeps/unix/sysv/linux/netlink_assert_response.c
@@ -39,7 +39,7 @@  get_address_family (int fd)
 }
 
 void
-internal_function
+private_function
 __netlink_assert_response (int fd, ssize_t result)
 {
   if (result < 0)
diff --git a/sysdeps/unix/sysv/linux/netlinkaccess.h b/sysdeps/unix/sysv/linux/netlinkaccess.h
index 6cffb65..bcbcc23 100644
--- a/sysdeps/unix/sysv/linux/netlinkaccess.h
+++ b/sysdeps/unix/sysv/linux/netlinkaccess.h
@@ -52,7 +52,7 @@  extern int __netlink_request (struct netlink_handle *h, int type);
 /* Terminate the process if RESULT is an invalid recvmsg result for
    the netlink socket FD.  */
 void __netlink_assert_response (int fd, ssize_t result)
-  internal_function;
+  private_function;
 libc_hidden_proto (__netlink_assert_response)
 
 #endif /* netlinkaccess.h */