[1/N,MPX,x86_64] Intel MPX support in glibc for x86_64

Message ID CAMe9rOp0sLF2JsPSLA-K8weJPygJ2aEpSNRoUP1fm7v8-zWydQ@mail.gmail.com
State Committed
Headers

Commit Message

H.J. Lu April 1, 2014, 6:07 p.m. UTC
  On Thu, Mar 27, 2014 at 11:11 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Mar 17, 2014 at 11:09 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Mon, Mar 17, 2014 at 11:03 AM, Zamyatin, Igor
>> <igor.zamyatin@intel.com> wrote:
>>> Hi All!
>>>
>>> Attached patch is the first of the set of patches that add support for Intel MPX technology (see e.g. http://software.intel.com/sites/default/files/319433-015.pdf, Chapter 9) in Glibc for x86_64. Namely, this particular patch introduces bounds storing/restoring in _dl_runtime_resolve.
>>>
>>> Is it ok to install?
>>>
>>>
>>> Thanks,
>>> Igor
>>>
>>> 2014-03-13  Igor Zamyatin  <igor.zamyatin@intel.com>
>>>
>>>       * config.h.in (HAVE_MPX_SUPPORT): New #undef.
>>>       * sysdeps/x86_64/configure.ac: Set HAVE_MPX_SUPPORT.
>>>       * sysdeps/x86_64/configure: Regenerated.
>>>       * sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve): Add storing
>>>       and restoring of Intel MPX bound registers before and after call
>>>       of _dl_fixup.
>>
>> The reasons we need to save and restore bound registers in
>> symbol lookup are
>>
>> 1.  Branches without BND prefix clear bound registers.
>> 2.  x86-64 passes bounds in bound registers as specified in MPX
>> psABI extension on hjl/mpx/master branch at
>>
>> https://github.com/hjl-tools/x86-64-psABI
>> https://groups.google.com/forum/#!topic/x86-64-abi/KFsB0XTgWYc
>>
>> Binutils has been updated to create an alternate PLT to
>> add BND prefix when branching to ld.so.
>>
>
> Are there any comments, feedback or objections?
>
> Thanks.

Hi,

This is the final patch to save and restore bound registers
in _dl_runtime_resolve.  Tested with MPX and non-MPX
binutils on x86-64 and x32.  I will push it onto master if
there is no objection in 24 hours.

Thanks.
  

Comments

Roland McGrath April 1, 2014, 6:29 p.m. UTC | #1
Can you use some macros for the stack offset constants?  There are too many
magic numbers and too much undescribed arithmetic in that code already.
That should be able to consolidate the #ifdef __ILP32__ into one spot.


Thanks,
Roland
  

Patch

From 4515c621c95e01bd9753b36dcd81cbc9e2144200 Mon Sep 17 00:00:00 2001
From: Igor Zamyatin <igor.zamyatin@intel.com>
Date: Tue, 1 Apr 2014 10:16:04 -0700
Subject: [PATCH] Save/restore bound registers in _dl_runtime_resolve

This patch saves and restores bound registers in symbol lookup for x86-64:

1. Branches without BND prefix clear bound registers.
2. x86-64 passes bounds in bound registers as specified in MPX psABI
extension on hjl/mpx/master branch at

https://github.com/hjl-tools/x86-64-psABI
https://groups.google.com/forum/#!topic/x86-64-abi/KFsB0XTgWYc

Binutils has been updated to create an alternate PLT to add BND prefix
when branching to ld.so.

	* config.h.in (HAVE_MPX_SUPPORT): New #undef.
	* sysdeps/x86_64/configure.ac: Set HAVE_MPX_SUPPORT.
	* sysdeps/x86_64/configure: Regenerated.
	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve): Save and
	restore Intel MPX bound registers when calling _dl_fixup.
---
 ChangeLog                      |  8 ++++++++
 config.h.in                    |  3 +++
 sysdeps/x86_64/configure       | 27 +++++++++++++++++++++++++++
 sysdeps/x86_64/configure.ac    | 15 +++++++++++++++
 sysdeps/x86_64/dl-trampoline.S | 39 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 92 insertions(+)

diff --git a/ChangeLog b/ChangeLog
index bfb3083..14efdb7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@ 
+2014-04-01  Igor Zamyatin  <igor.zamyatin@intel.com>
+
+	* config.h.in (HAVE_MPX_SUPPORT): New #undef.
+	* sysdeps/x86_64/configure.ac: Set HAVE_MPX_SUPPORT.
+	* sysdeps/x86_64/configure: Regenerated.
+	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_resolve): Save and
+	restore Intel MPX bound registers when calling _dl_fixup.
+
 2014-04-01  Will Newton  <will.newton@linaro.org>
 
 	* benchtests/Makefile (CFLAGS-bench-ffs.c): Add
diff --git a/config.h.in b/config.h.in
index 3fc34bd..b6e3623 100644
--- a/config.h.in
+++ b/config.h.in
@@ -104,6 +104,9 @@ 
 /* Define if assembler supports AVX512.  */
 #undef  HAVE_AVX512_ASM_SUPPORT
 
+/* Define if assembler supports Intel MPX.  */
+#undef  HAVE_MPX_SUPPORT
+
 /* Define if gcc supports FMA4.  */
 #undef	HAVE_FMA4_SUPPORT
 
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
index c1c88c8..45d868d 100644
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -222,6 +222,33 @@  $as_echo "$libc_cv_cc_novzeroupper" >&6; }
 config_vars="$config_vars
 config-cflags-novzeroupper = $libc_cv_cc_novzeroupper"
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Intel MPX support" >&5
+$as_echo_n "checking for Intel MPX support... " >&6; }
+if ${libc_cv_asm_mpx+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat > conftest.s <<\EOF
+        bndmov %bnd0,(%rsp)
+EOF
+if { ac_try='${CC-cc} -c $ASFLAGS conftest.s 1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_asm_mpx" >&5
+$as_echo "$libc_cv_asm_mpx" >&6; }
+if test $libc_cv_asm_mpx = yes; then
+  $as_echo "#define HAVE_MPX_SUPPORT 1" >>confdefs.h
+
+fi
+
 $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
 
 # work around problem with autoconf and empty lines at the end of files
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index d34f9a8..9138f63 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -75,6 +75,21 @@  LIBC_TRY_CC_OPTION([-mno-vzeroupper],
 ])
 LIBC_CONFIG_VAR([config-cflags-novzeroupper], [$libc_cv_cc_novzeroupper])
 
+dnl Check whether the assembler accepts Intel MPX instructions (probe: bndmov).
+AC_CACHE_CHECK(for Intel MPX support, libc_cv_asm_mpx, [dnl
+cat > conftest.s <<\EOF
+        bndmov %bnd0,(%rsp)
+EOF
+if AC_TRY_COMMAND(${CC-cc} -c $ASFLAGS conftest.s 1>&AS_MESSAGE_LOG_FD); then
+  libc_cv_asm_mpx=yes
+else
+  libc_cv_asm_mpx=no
+fi
+rm -f conftest*])
+if test $libc_cv_asm_mpx = yes; then
+  AC_DEFINE(HAVE_MPX_SUPPORT)
+fi
+
 dnl It is always possible to access static and hidden symbols in an
 dnl position independent way.
 AC_DEFINE(PI_STATIC_AND_HIDDEN)
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 77c4d0f..646fcaf 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -31,8 +31,13 @@ 
 	cfi_startproc
 _dl_runtime_resolve:
 	cfi_adjust_cfa_offset(16) # Incorporate PLT
+#ifdef __ILP32__
 	subq $56,%rsp
 	cfi_adjust_cfa_offset(56)
+#else
+	subq $120,%rsp
+	cfi_adjust_cfa_offset(120)
+#endif
 	movq %rax,(%rsp)	# Preserve registers otherwise clobbered.
 	movq %rcx, 8(%rsp)
 	movq %rdx, 16(%rsp)
@@ -40,10 +45,39 @@  _dl_runtime_resolve:
 	movq %rdi, 32(%rsp)
 	movq %r8, 40(%rsp)
 	movq %r9, 48(%rsp)
+#ifdef __ILP32__
 	movq 64(%rsp), %rsi	# Copy args pushed by PLT in register.
 	movq 56(%rsp), %rdi	# %rdi: link_map, %rsi: reloc_index
+#else
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, 56(%rsp)  # We also have to preserve bound registers.
+	bndmov %bnd1, 72(%rsp)  # These are nops if Intel MPX isn't available
+	bndmov %bnd2, 88(%rsp)  # or disabled.
+	bndmov %bnd3, 104(%rsp)
+# else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,0x38
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,0x48
+	.byte 0x66,0x0f,0x1b,0x54,0x24,0x58
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,0x68
+# endif
+	movq 128(%rsp), %rsi    # Copy args pushed by PLT in register.
+	movq 120(%rsp), %rdi    # %rdi: link_map, %rsi: reloc_index
+#endif
 	call _dl_fixup		# Call resolver.
 	movq %rax, %r11		# Save return value
+#ifndef __ILP32__
+# ifdef HAVE_MPX_SUPPORT
+	bndmov 104(%rsp), %bnd3  # Restore bound registers back.
+	bndmov 88(%rsp), %bnd2   # These are nops if Intel MPX isn't available
+	bndmov 72(%rsp), %bnd1   # or disabled.
+	bndmov 56(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,0x68
+	.byte 0x66,0x0f,0x1a,0x54,0x24,0x58
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,0x48
+	.byte 0x66,0x0f,0x1a,0x44,0x24,0x38
+# endif
+#endif
 	movq 48(%rsp), %r9	# Get register content back.
 	movq 40(%rsp), %r8
 	movq 32(%rsp), %rdi
@@ -51,8 +85,13 @@  _dl_runtime_resolve:
 	movq 16(%rsp), %rdx
 	movq 8(%rsp), %rcx
 	movq (%rsp), %rax
+#ifdef __ILP32__
 	addq $72, %rsp		# Adjust stack(PLT did 2 pushes)
 	cfi_adjust_cfa_offset(-72)
+#else
+	addq $136, %rsp         # Adjust stack(PLT did 2 pushes)
+	cfi_adjust_cfa_offset(-136)
+#endif
 	jmp *%r11		# Jump to function address.
 	cfi_endproc
 	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-- 
1.8.5.3