[3/5] x86: widen @got{,pcrel} support to PUSH and APX IMUL

Message ID 441a0de1-bbac-4235-a58e-f0393b218f1a@suse.com
State New
Headers
Series x86: further GOT{,PCREL} related adjustments |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_binutils_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_binutils_check--master-arm success Test passed

Commit Message

Jan Beulich Feb. 3, 2025, 11:41 a.m. UTC
  With us doing the transformation to an immediate operand for MOV and
various ALU insns, there's little reason to then not support the same
conversion for the other two insns which have respective immediate
operand forms. Unfortunately for IMUL (due to the 0F opcode prefix)
there's no suitable relocation, so the pre-APX forms cannot be marked
for relaxation in the assembler.
---
TBD: Is REX2 really permitted with PUSH <imm>?
  

Comments

H.J. Lu Feb. 3, 2025, 10:40 p.m. UTC | #1
On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> With us doing the transformation to an immediate operand for MOV and
> various ALU insns, there's little reason to then not support the same
> conversion for the other two insns which have respective immediate
> operand forms. Unfortunately for IMUL (due to the 0F opcode prefix)
> there's no suitable relocation, so the pre-APX forms cannot be marked
> for relaxation in the assembler.
> ---
> TBD: Is REX2 really permitted with PUSH <imm>?
>
> --- a/bfd/elf32-i386.c
> +++ b/bfd/elf32-i386.c
> @@ -1209,6 +1209,10 @@ elf_i386_tls_transition (struct bfd_link
>     to
>     test $foo, %reg1
>     and convert
> +   push foo@GOT[(%reg)]
> +   to
> +   push $foo
> +   and convert
>     binop foo@GOT[(%reg1)], %reg2
>     to
>     binop $foo, %reg2
> @@ -1233,7 +1237,7 @@ elf_i386_convert_load_reloc (bfd *abfd,
>    unsigned int addend;
>    unsigned int nop;
>    bfd_vma nop_offset;
> -  bool is_pic;
> +  bool is_pic, is_branch = false;
>    bool to_reloc_32;
>    bool abs_symbol;
>    unsigned int r_type;
> @@ -1301,6 +1305,23 @@ elf_i386_convert_load_reloc (bfd *abfd,
>
>    opcode = bfd_get_8 (abfd, contents + roff - 2);
>
> +  if (opcode == 0xff)
> +    {
> +      switch (modrm & 0x38)
> +       {
> +       case 0x10: /* CALL */
> +       case 0x20: /* JMP */
> +         is_branch = true;
> +         break;
> +
> +       case 0x30: /* PUSH */
> +         break;
> +
> +       default:
> +         return true;
> +       }
> +    }
> +
>    /* Convert to R_386_32 if PIC is false or there is no base
>       register.  */
>    to_reloc_32 = !is_pic || baseless;
> @@ -1311,7 +1332,7 @@ elf_i386_convert_load_reloc (bfd *abfd,
>       reloc.  */
>    if (h == NULL)
>      {
> -      if (opcode == 0x0ff)
> +      if (is_branch)
>         /* Convert "call/jmp *foo@GOT[(%reg)]".  */
>         goto convert_branch;
>        else
> @@ -1327,7 +1348,7 @@ elf_i386_convert_load_reloc (bfd *abfd,
>        && !eh->linker_def
>        && local_ref)
>      {
> -      if (opcode == 0xff)
> +      if (is_branch)
>         {
>           /* No direct branch to 0 for PIC.  */
>           if (is_pic)
> @@ -1343,7 +1364,7 @@ elf_i386_convert_load_reloc (bfd *abfd,
>         }
>      }
>
> -  if (opcode == 0xff)
> +  if (is_branch)
>      {
>        /* We have "call/jmp *foo@GOT[(%reg)]".  */
>        if ((h->root.type == bfd_link_hash_defined
> @@ -1399,7 +1420,8 @@ elf_i386_convert_load_reloc (bfd *abfd,
>    else
>      {
>        /* We have "mov foo@GOT[(%re1g)], %reg2",
> -        "test %reg1, foo@GOT(%reg2)" and
> +        "test %reg1, foo@GOT(%reg2)",
> +        "push foo@GOT[(%reg)]", or
>          "binop foo@GOT[(%reg1)], %reg2".
>
>          Avoid optimizing _DYNAMIC since ld.so may use its
> @@ -1460,6 +1482,13 @@ elf_i386_convert_load_reloc (bfd *abfd,
>                   modrm = 0xc0 | ((modrm & 0x38) >> 3) | (opcode & 0x38);
>                   opcode = 0x81;
>                 }
> +             else if (opcode == 0xff)
> +               {
> +                 /* Convert "push foo@GOT(%reg)" to
> +                    "push $foo".  */
> +                 modrm = 0x68; /* Really the opcode.  */
> +                 opcode = 0x26; /* Really a meaningless %es: prefix.  */
> +               }
>               else
>                 return true;
>
> --- a/bfd/elf64-x86-64.c
> +++ b/bfd/elf64-x86-64.c
> @@ -1739,13 +1739,16 @@ elf_x86_64_need_pic (struct bfd_link_inf
>  }
>
>  /* Move the R bits to the B bits in EVEX payload byte 1.  */
> -static unsigned int evex_move_r_to_b (unsigned int byte1)
> +static unsigned int evex_move_r_to_b (unsigned int byte1, bool copy)
>  {
>    byte1 = (byte1 & ~(1 << 5)) | ((byte1 & (1 << 7)) >> 2); /* R3 -> B3 */
>    byte1 = (byte1 & ~(1 << 3)) | ((~byte1 & (1 << 4)) >> 1); /* R4 -> B4 */
>
>    /* Set both R bits, as they're inverted.  */
> -  return byte1 | (1 << 4) | (1 << 7);
> +  if (!copy)
> +    byte1 |= (1 << 4) | (1 << 7);
> +
> +  return byte1;
>  }
>
>  /* With the local symbol, foo, we convert
> @@ -1762,10 +1765,14 @@ static unsigned int evex_move_r_to_b (un
>     to
>     test $foo, %reg
>     and convert
> +   push foo@GOTPCREL(%rip)
> +   to
> +   push $foo
> +   and convert
>     binop foo@GOTPCREL(%rip), %reg
>     to
>     binop $foo, %reg
> -   where binop is one of adc, add, and, cmp, or, sbb, sub, xor
> +   where binop is one of adc, add, and, cmp, imul, or, sbb, sub, xor
>     instructions.  */
>
>  static bool
> @@ -1781,6 +1788,7 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>    bool is_pic;
>    bool no_overflow;
>    bool relocx;
> +  bool is_branch = false;
>    bool to_reloc_pc32;
>    bool abs_symbol;
>    bool local_ref;
> @@ -1873,6 +1881,23 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>    r_symndx = htab->r_sym (irel->r_info);
>
>    opcode = bfd_get_8 (abfd, contents + roff - 2);
> +  modrm = bfd_get_8 (abfd, contents + roff - 1);
> +  if (opcode == 0xff)
> +    {
> +      switch (modrm & 0x38)
> +       {
> +       case 0x10: /* CALL */
> +       case 0x20: /* JMP */
> +         is_branch = true;
> +         break;
> +
> +       case 0x30: /* PUSH */
> +         break;
> +
> +       default:
> +         return true;
> +       }
> +    }
>
>    /* Convert mov to lea since it has been done for a while.  */
>    if (opcode != 0x8b)
> @@ -1890,7 +1915,7 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>       3. no_overflow is true.
>       4. PIC.
>       */
> -  to_reloc_pc32 = (opcode == 0xff
> +  to_reloc_pc32 = (is_branch
>                    || !relocx
>                    || no_overflow
>                    || is_pic);
> @@ -1942,7 +1967,7 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>               && !eh->linker_def
>               && local_ref))
>         {
> -         if (opcode == 0xff)
> +         if (is_branch)
>             {
>               /* Skip for branch instructions since R_X86_64_PC32
>                  may overflow.  */
> @@ -2009,7 +2034,7 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>      return true;
>
>   convert:
> -  if (opcode == 0xff)
> +  if (is_branch)
>      {
>        /* We have "call/jmp *foo@GOTPCREL(%rip)".  */
>        unsigned int nop;
> @@ -2018,7 +2043,6 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>
>        /* Convert R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX to
>          R_X86_64_PC32.  */
> -      modrm = bfd_get_8 (abfd, contents + roff - 1);
>        if (modrm == 0x25)
>         {
>           /* Convert to "jmp foo nop".  */
> @@ -2064,11 +2088,12 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>      }
>    else if (r_type == R_X86_64_CODE_6_GOTPCRELX && opcode != 0x8b)
>      {
> +      bool move_v_r = false;
> +
>        /* R_X86_64_PC32 isn't supported.  */
>        if (to_reloc_pc32)
>         return true;
>
> -      modrm = bfd_get_8 (abfd, contents + roff - 1);
>        if (opcode == 0x85)
>         {
>           /* Convert "ctest<cc> %reg, foo@GOTPCREL(%rip)" to
> @@ -2094,6 +2119,23 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>           modrm = 0xc0 | ((modrm & 0x38) >> 3) | (opcode & 0x38);
>           opcode = 0x81;
>         }
> +      else if (opcode == 0xaf)
> +       {
> +         if (!(evex[2] & 0x10))
> +           {
> +             /* Convert "imul foo@GOTPCREL(%rip), %reg" to
> +                "imul $foo, %reg, %reg".  */
> +             modrm = 0xc0 | ((modrm & 0x38) >> 3) | (modrm & 0x38);
> +           }
> +         else
> +           {
> +             /* Convert "imul foo@GOTPCREL(%rip), %reg1, %reg2" to
> +                "imul $foo, %reg1, %reg2".  */
> +             modrm = 0xc0 | ((modrm & 0x38) >> 3) | (~evex[1] & 0x38);
> +             move_v_r = true;
> +           }
> +         opcode = 0x69;
> +       }
>        else
>         return true;
>
> @@ -2119,7 +2161,23 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>        bfd_put_8 (abfd, opcode, contents + roff - 2);
>        bfd_put_8 (abfd, modrm, contents + roff - 1);
>
> -      evex[0] = evex_move_r_to_b (evex[0]);
> +      evex[0] = evex_move_r_to_b (evex[0], opcode == 0x69 && !move_v_r);
> +      if (move_v_r)
> +       {
> +         /* Move the top two V bits to the R bits in EVEX payload byte 1.
> +            Note that evex_move_r_to_b() set both R bits.  */
> +         if (!(evex[1] & (1 << 6)))
> +           evex[0] &= ~(1 << 7); /* V3 -> R3 */
> +         if (!(evex[2] & (1 << 3)))
> +           evex[0] &= ~(1 << 4); /* V4 -> R4 */
> +         /* Set all V bits, as they're inverted.  */
> +         evex[1] |= 0xf << 3;
> +         evex[2] |= 1 << 3;
> +         /* Clear the ND (ZU) bit (it ought to be ignored anyway).  */
> +         evex[2] &= ~(1 << 4);
> +         bfd_put_8 (abfd, evex[2], contents + roff - 3);
> +         bfd_put_8 (abfd, evex[1], contents + roff - 4);
> +       }
>        bfd_put_8 (abfd, evex[0], contents + roff - 5);
>
>        /* No addend for R_X86_64_32/R_X86_64_32S relocations.  */
> @@ -2162,7 +2220,10 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>         {
>           if (bfd_get_8 (abfd, contents + roff - 4) == 0xd5)
>             {
> -             rex2 = bfd_get_8 (abfd, contents + roff - 3);
> +             /* Make sure even an all-zero payload leaves a non-zero value
> +                in the variable.  */
> +             rex2 = bfd_get_8 (abfd, contents + roff - 3) | 0x100;
> +             rex2_mask |= 0x100;
>               rex_w = (rex2 & REX_W) != 0;
>             }
>           else if (bfd_get_8 (abfd, contents + roff - 4) == 0x0f)
> @@ -2195,7 +2256,6 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>               /* Convert "mov foo@GOTPCREL(%rip), %reg" to
>                  "mov $foo, %reg".  */
>               opcode = 0xc7;
> -             modrm = bfd_get_8 (abfd, contents + roff - 1);
>               modrm = 0xc0 | (modrm & 0x38) >> 3;
>               if (rex_w && ABI_64_P (link_info->output_bfd))
>                 {
> @@ -2222,7 +2282,6 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>           if (to_reloc_pc32)
>             return true;
>
> -         modrm = bfd_get_8 (abfd, contents + roff - 1);
>           if (opcode == 0x85)
>             {
>               /* Convert "test %reg, foo@GOTPCREL(%rip)" to
> @@ -2237,6 +2296,39 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>               modrm = 0xc0 | ((modrm & 0x38) >> 3) | (opcode & 0x38);
>               opcode = 0x81;
>             }
> +         else if (opcode == 0xaf && (rex2 & (REX2_M << 4)))
> +           {
> +             /* Convert "imul foo@GOTPCREL(%rip), %reg" to
> +                "imul $foo, %reg, %reg".  */
> +             modrm = 0xc0 | ((modrm & 0x38) >> 3) | (modrm & 0x38);
> +             rex_mask = 0;
> +             rex2_mask = REX2_M << 4;
> +             opcode = 0x69;
> +           }
> +         else if (opcode == 0xff && !(rex2 & (REX2_M << 4)))
> +           {
> +             /* Convert "push foo@GOTPCREL(%rip)" to
> +                "push $foo".  */
> +             bfd_put_8 (abfd, 0x68, contents + roff - 1);
> +             if (rex)
> +               {
> +                 bfd_put_8 (abfd, 0x26, contents + roff - 3);
> +                 bfd_put_8 (abfd, rex, contents + roff - 2);
> +               }
> +             else if (rex2)
> +               {
> +                 bfd_put_8 (abfd, 0x26, contents + roff - 4);
> +                 bfd_put_8 (abfd, 0xd5, contents + roff - 3);
> +                 bfd_put_8 (abfd, rex2, contents + roff - 2);
> +               }
> +             else
> +               bfd_put_8 (abfd, 0x26, contents + roff - 2);
> +
> +             r_type = R_X86_64_32S;
> +             /* No addend for R_X86_64_32S relocations.  */
> +             irel->r_addend = 0;
> +             goto finish;
> +           }
>           else
>             return true;
>
> @@ -2297,6 +2389,7 @@ elf_x86_64_convert_load_reloc (bfd *abfd
>         }
>      }
>
> + finish:
>    *r_type_p = r_type;
>    irel->r_info = htab->r_info (r_symndx,
>                                r_type | R_X86_64_converted_reloc_bit);
> @@ -4378,7 +4471,7 @@ elf_x86_64_relocate_section (bfd *output
>                       continue;
>                     }
>
> -                 byte1 = evex_move_r_to_b (byte1);
> +                 byte1 = evex_move_r_to_b (byte1, false);
>                   bfd_put_8 (output_bfd, byte1, contents + roff - 5);
>                   bfd_put_8 (output_bfd, 0x81, contents + roff - 2);
>                   bfd_put_8 (output_bfd, 0xc0 | reg, contents + roff - 1);
> --- a/gas/config/tc-i386.c
> +++ b/gas/config/tc-i386.c
> @@ -12928,9 +12928,9 @@ output_disp (fragS *insn_start_frag, off
>               else if (object_64bit)
>                 continue;
>
> -             /* Check for "call/jmp *mem", "mov mem, %reg", "movrs mem, %reg",
> -                "test %reg, mem" and "binop mem, %reg" where binop
> -                is one of adc, add, and, cmp, or, sbb, sub, xor
> +             /* Check for "call/jmp *mem", "push mem", "mov mem, %reg",
> +                "movrs mem, %reg", "test %reg, mem" and "binop mem, %reg" where
> +                binop is one of adc, add, and, cmp, or, sbb, sub, xor, or imul
>                  instructions without data prefix.  Always generate
>                  R_386_GOT32X for "sym*GOT" operand in 32-bit mode.  */
>               unsigned int space = dot_insn () ? i.insn_opcode_space
> @@ -12940,7 +12940,7 @@ output_disp (fragS *insn_start_frag, off
>                       || (i.rm.mode == 0 && i.rm.regmem == 5))
>                   && ((space == SPACE_BASE
>                        && i.tm.base_opcode == 0xff
> -                      && (i.rm.reg == 2 || i.rm.reg == 4))
> +                      && (i.rm.reg == 2 || i.rm.reg == 4 || i.rm.reg == 6))
>                       || ((space == SPACE_BASE
>                            || space == SPACE_0F38
>                            || space == SPACE_MAP4)
> @@ -12949,7 +12949,13 @@ output_disp (fragS *insn_start_frag, off
>                            || space == SPACE_MAP4)
>                           && (i.tm.base_opcode == 0x85
>                               || (i.tm.base_opcode
> -                                 | (i.operands > 2 ? 0x3a : 0x38)) == 0x3b))))
> +                                 | (i.operands > 2 ? 0x3a : 0x38)) == 0x3b))
> +                     || (((space == SPACE_0F
> +                           /* Because of the 0F prefix, no suitable relocation
> +                              exists for this unless it's REX2-encoded.  */
> +                           && is_apx_rex2_encoding ())
> +                          || space == SPACE_MAP4)
> +                         && i.tm.base_opcode == 0xaf)))
>                 {
>                   if (object_64bit)
>                     {
> --- a/ld/testsuite/ld-i386/i386.exp
> +++ b/ld/testsuite/ld-i386/i386.exp
> @@ -373,6 +373,8 @@ run_dump_test "load5a"
>  run_dump_test "load5b"
>  run_dump_test "load6"
>  run_dump_test "load7"
> +run_dump_test "load8a"
> +run_dump_test "load8b"
>  run_dump_test "pr19175"
>  run_dump_test "pr19615"
>  run_dump_test "pr19636-1a"
> --- /dev/null
> +++ b/ld/testsuite/ld-i386/load8.s
> @@ -0,0 +1,20 @@
> +       .data
> +       .type   bar, @object
> +bar:
> +       .byte   1
> +       .size   bar, .-bar
> +       .globl  foo
> +       .type   foo, @object
> +foo:
> +       .byte   1
> +       .size   foo, .-foo
> +       .text
> +       .globl  _start
> +       .type   _start, @function
> +_start:
> +       push    bar@GOT(%ecx)
> +       push    foo@GOT(%edx)
> +       .ifndef PIC
> +       push    foo@GOT
> +       .endif
> +       .size   _start, .-_start
> --- /dev/null
> +++ b/ld/testsuite/ld-i386/load8a.d
> @@ -0,0 +1,14 @@
> +#source: load8.s
> +#as: --32 -mrelax-relocations=yes
> +#ld: -melf_i386 -z noseparate-code
> +#objdump: -dw
> +
> +.*: +file format .*
> +
> +Disassembly of section .text:
> +
> +0+8048074 <_start>:
> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087

Please avoid adding the es prefix.  It may not be nop in the future.

> +#pass
> --- /dev/null
> +++ b/ld/testsuite/ld-i386/load8b.d
> @@ -0,0 +1,13 @@
> +#source: load8.s
> +#as: --32 -mshared -mrelax-relocations=yes --defsym PIC=1
> +#ld: -melf_i386 -shared -z noseparate-code
> +#objdump: -dw
> +
> +.*: +file format .*
> +
> +Disassembly of section .text:
> +
> +0+[0-9a-f]+ <_start>:
> +[      ]*[0-9a-f]+:    ff b1 f8 ff ff ff       push   -0x8\(%ecx\)
> +[      ]*[0-9a-f]+:    ff b2 fc ff ff ff       push   -0x4\(%edx\)
> +#pass
> --- a/ld/testsuite/ld-x86-64/apx-load1.s
> +++ b/ld/testsuite/ld-x86-64/apx-load1.s
> @@ -118,5 +118,11 @@ _start:
>         sub     %rbp, bar@GOTPCREL(%rip), %r21
>         xor     %rsi, bar@GOTPCREL(%rip), %r22
>
> +       imul    bar@GOTPCREL(%rip), %r17
> +       {nf} imul bar@GOTPCREL(%rip), %r17
> +       imul    bar@GOTPCREL(%rip), %r17, %rdx
> +       imul    bar@GOTPCREL(%rip), %rcx, %r18
> +       {rex2} pushq bar@GOTPCREL(%rip)
> +
>         .size   _start, .-_start
>         .p2align 12, 0x90
> --- a/ld/testsuite/ld-x86-64/apx-load1a.d
> +++ b/ld/testsuite/ld-x86-64/apx-load1a.d
> @@ -115,4 +115,9 @@ Disassembly of section .text:
>   +[a-f0-9]+:   62 f4 dc 10 19 25 74 0c 20 00   sbb    %rsp,0x200c74\(%rip\),%r20        # 602000 <.*>
>   +[a-f0-9]+:   62 f4 d4 10 29 2d 6a 0c 20 00   sub    %rbp,0x200c6a\(%rip\),%r21        # 602000 <.*>
>   +[a-f0-9]+:   62 f4 cc 10 81 f6 20 20 60 00   xor    \$0x602020,%rsi,%r22
> + +[a-f0-9]+:   d5 58 69 c9 20 20 60 00         imul   \$0x602020,%r17,%r17
> + +[a-f0-9]+:   62 ec fc 0c 69 c9 20 20 60 00   \{nf\} imul \$0x602020,%r17,%r17
> + +[a-f0-9]+:   62 fc fc 08 69 d1 20 20 60 00   imul   \$0x602020,%r17,%rdx
> + +[a-f0-9]+:   62 e4 fc 08 69 d1 20 20 60 00   imul   \$0x602020,%rcx,%r18
> + +[a-f0-9]+:   26 d5 00 68 20 20 60 00         es \{rex2 0x0\} push \$0x602020
>  #pass
> --- a/ld/testsuite/ld-x86-64/apx-load1c.d
> +++ b/ld/testsuite/ld-x86-64/apx-load1c.d
> @@ -108,4 +108,9 @@ Disassembly of section .text:
>   +[a-f0-9]+:   62 f4 dc 10 19 25 54 0d 20 00   sbb    %rsp,0x200d54\(%rip\),%r20        # 2020e0 <.*>
>   +[a-f0-9]+:   62 f4 d4 10 29 2d 4a 0d 20 00   sub    %rbp,0x200d4a\(%rip\),%r21        # 2020e0 <.*>
>   +[a-f0-9]+:   62 f4 cc 10 31 35 40 0d 20 00   xor    %rsi,0x200d40\(%rip\),%r22        # 2020e0 <.*>
> + +[a-f0-9]+:   d5 c8 af 0d 38 0d 20 00         imul   0x200d38\(%rip\),%r17        # 2020e0 <.*>
> + +[a-f0-9]+:   62 e4 fc 0c af 0d 2e 0d 20 00   \{nf\} imul 0x200d2e\(%rip\),%r17        # 2020e0 <.*>
> + +[a-f0-9]+:   62 e4 ec 18 af 0d 24 0d 20 00   imul   0x200d24\(%rip\),%r17,%rdx        # 2020e0 <.*>
> + +[a-f0-9]+:   62 f4 ec 10 af 0d 1a 0d 20 00   imul   0x200d1a\(%rip\),%rcx,%r18        # 2020e0 <.*>
> + +[a-f0-9]+:   d5 00 ff 35 12 0d 20 00         \{rex2 0x0\} push 0x200d12\(%rip\)        # 2020e0 <.*>
>  #pass
> --- a/ld/testsuite/ld-x86-64/apx-load1d.d
> +++ b/ld/testsuite/ld-x86-64/apx-load1d.d
> @@ -108,4 +108,9 @@ Disassembly of section .text:
>   +[a-f0-9]+:   62 f4 dc 10 19 25 e4 0c 20 00   sbb    %rsp,0x200ce4\(%rip\),%r20        # 202070 <.*>
>   +[a-f0-9]+:   62 f4 d4 10 29 2d da 0c 20 00   sub    %rbp,0x200cda\(%rip\),%r21        # 202070 <.*>
>   +[a-f0-9]+:   62 f4 cc 10 31 35 d0 0c 20 00   xor    %rsi,0x200cd0\(%rip\),%r22        # 202070 <.*>
> + +[a-f0-9]+:   d5 c8 af 0d c8 0c 20 00         imul   0x200cc8\(%rip\),%r17        # 202070 <.*>
> + +[a-f0-9]+:   62 e4 fc 0c af 0d be 0c 20 00   \{nf\} imul 0x200cbe\(%rip\),%r17        # 202070 <.*>
> + +[a-f0-9]+:   62 e4 ec 18 af 0d b4 0c 20 00   imul   0x200cb4\(%rip\),%r17,%rdx        # 202070 <.*>
> + +[a-f0-9]+:   62 f4 ec 10 af 0d aa 0c 20 00   imul   0x200caa\(%rip\),%rcx,%r18        # 202070 <.*>
> + +[a-f0-9]+:   d5 00 ff 35 a2 0c 20 00         \{rex2 0x0\} push 0x200ca2\(%rip\)        # 202070 <.*>
>  #pass
> --- /dev/null
> +++ b/ld/testsuite/ld-x86-64/load5.s
> @@ -0,0 +1,17 @@
> +       .data
> +       .type   bar, @object
> +bar:
> +       .byte   1
> +       .size   bar, .-bar
> +       .globl  foo
> +       .type   foo, @object
> +foo:
> +       .byte   1
> +       .size   foo, .-foo
> +       .text
> +       .globl  _start
> +       .type   _start, @function
> +_start:
> +             pushq     bar@GOTPCREL(%rip)
> +       {rex} pushq     foo@GOTPCREL(%rip)
> +       .size   _start, .-_start
> --- /dev/null
> +++ b/ld/testsuite/ld-x86-64/load5a.d
> @@ -0,0 +1,15 @@
> +#source: load5.s
> +#as: --64 -mrelax-relocations=yes
> +#ld: -melf_x86_64
> +#objdump: -dw
> +
> +.*: +file format .*
> +
> +
> +Disassembly of section .text:
> +
> +#...
> +[a-f0-9]+ <_start>:
> +[      ]*[a-f0-9]+:    26 68 ([0-9a-f]{2} ){4} *       es push \$0x[a-f0-9]+
> +[      ]*[a-f0-9]+:    26 40 68 ([0-9a-f]{2} ){4} *    es rex push \$0x[a-f0-9]+
> +#pass
> --- /dev/null
> +++ b/ld/testsuite/ld-x86-64/load5b.d
> @@ -0,0 +1,15 @@
> +#source: load5.s
> +#as: --64 -mrelax-relocations=yes
> +#ld: -shared -melf_x86_64
> +#objdump: -dw
> +
> +.*: +file format .*
> +
> +
> +Disassembly of section .text:
> +
> +#...
> +[a-f0-9]+ <_start>:
> +[      ]*[a-f0-9]+:    ff 35 ([0-9a-f]{2} ){4} *       push +0x[a-f0-9]+\(%rip\)        # [a-f0-9]+ <.*>
> +[      ]*[a-f0-9]+:    40 ff 35 ([0-9a-f]{2} ){4} *    rex push 0x[a-f0-9]+\(%rip\)        # [a-f0-9]+ <.*>
> +#pass
> --- a/ld/testsuite/ld-x86-64/x86-64.exp
> +++ b/ld/testsuite/ld-x86-64/x86-64.exp
> @@ -654,6 +654,8 @@ run_dump_test "load2"
>  run_dump_test "load3a"
>  run_dump_test "load3b"
>  run_dump_test "load4"
> +run_dump_test "load5a"
> +run_dump_test "load5b"
>  run_dump_test "call1a"
>  run_dump_test "call1b"
>  run_dump_test "call1c"
>
  
Jan Beulich Feb. 4, 2025, 10:14 a.m. UTC | #2
On 03.02.2025 23:40, H.J. Lu wrote:
> On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
>> --- /dev/null
>> +++ b/ld/testsuite/ld-i386/load8a.d
>> @@ -0,0 +1,14 @@
>> +#source: load8.s
>> +#as: --32 -mrelax-relocations=yes
>> +#ld: -melf_i386 -z noseparate-code
>> +#objdump: -dw
>> +
>> +.*: +file format .*
>> +
>> +Disassembly of section .text:
>> +
>> +0+8048074 <_start>:
>> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> 
> Please avoid adding the es prefix.  It may not be nop in the future.

Constructive comments please. What other prefix do you suggest we use?
Is another of the segment overrides okay? If not, all that's left is an
address size override, if I'm not mistaken. Which overall seems less
desirable to use.

Plus - is your concern only about 32-bit code, or also about 64-bit? For
32-bit code in particular I'm having difficulty seeing why an ES
prefix might gain new meaning going forward, when an increasing number
of ISA extensions are for 64-bit mode only anyway. If the concern
extends to 64-bit code (it shouldn't, as the pre-386 segment overrides
are documented as nop prefixes, if I'm not mistaken), earlier changes
would need adjusting then, too, I think.

Jan
  
H.J. Lu Feb. 4, 2025, 10:17 a.m. UTC | #3
On Tue, Feb 4, 2025 at 6:14 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 03.02.2025 23:40, H.J. Lu wrote:
> > On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
> >> --- /dev/null
> >> +++ b/ld/testsuite/ld-i386/load8a.d
> >> @@ -0,0 +1,14 @@
> >> +#source: load8.s
> >> +#as: --32 -mrelax-relocations=yes
> >> +#ld: -melf_i386 -z noseparate-code
> >> +#objdump: -dw
> >> +
> >> +.*: +file format .*
> >> +
> >> +Disassembly of section .text:
> >> +
> >> +0+8048074 <_start>:
> >> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
> >> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> >> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> >
> > Please avoid adding the es prefix.  It may not be nop in the future.
>
> Constructive comments please. What other prefix do you suggest we use?
> Is another of the segment overrides okay? If not, all that's left is an
> address size override, if I'm not mistaken. Which overall seems less
> desirable to use.

You can use 1-byte NOP.

> Plus - is your concern only about 32-bit code, or also about 64-bit? For
> 32-bit code in particular I'm having difficulty seeing why an ES
> prefix might gain new meaning going forward, when an increasing number
> of ISA extensions are for 64-bit mode only anyway. If the concern
> extends to 64-bit code (it shouldn't, as the pre-386 segment overrides
> are documented as nop prefixes, if I'm not mistaken), earlier changes
> would need adjusting then, too, I think.

I don't think adding instructions like PUSH is very useful.
  
Jan Beulich Feb. 4, 2025, 10:41 a.m. UTC | #4
On 04.02.2025 11:17, H.J. Lu wrote:
> On Tue, Feb 4, 2025 at 6:14 PM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 03.02.2025 23:40, H.J. Lu wrote:
>>> On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>> --- /dev/null
>>>> +++ b/ld/testsuite/ld-i386/load8a.d
>>>> @@ -0,0 +1,14 @@
>>>> +#source: load8.s
>>>> +#as: --32 -mrelax-relocations=yes
>>>> +#ld: -melf_i386 -z noseparate-code
>>>> +#objdump: -dw
>>>> +
>>>> +.*: +file format .*
>>>> +
>>>> +Disassembly of section .text:
>>>> +
>>>> +0+8048074 <_start>:
>>>> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
>>>
>>> Please avoid adding the es prefix.  It may not be nop in the future.
>>
>> Constructive comments please. What other prefix do you suggest we use?
>> Is another of the segment overrides okay? If not, all that's left is an
>> address size override, if I'm not mistaken. Which overall seems less
>> desirable to use.
> 
> You can use 1-byte NOP.

That'll have an undue effect on debugging, by splitting a single insn
into two.

>> Plus - is your concern only about 32-bit code, or also about 64-bit? For
>> 32-bit code in particular I'm having difficulty seeing why an ES
>> prefix might gain new meaning going forward, when an increasing number
>> of ISA extensions are for 64-bit mode only anyway. If the concern
>> extends to 64-bit code (it shouldn't, as the pre-386 segment overrides
>> are documented as nop prefixes, if I'm not mistaken), earlier changes
>> would need adjusting then, too, I think.
> 
> I don't think adding instructions like PUSH is very useful.

I was actively waiting for this kind of comment. Why was adding support
for e.g. ADC and SBB useful then? Imo it can only be one of two ways:
Either we support everything that can be supported, or we limit things
strictly to cases that are actively deemed useful.

Jan
  
H.J. Lu Feb. 4, 2025, 11:02 a.m. UTC | #5
On Tue, Feb 4, 2025 at 6:41 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 04.02.2025 11:17, H.J. Lu wrote:
> > On Tue, Feb 4, 2025 at 6:14 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>
> >> On 03.02.2025 23:40, H.J. Lu wrote:
> >>> On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>> --- /dev/null
> >>>> +++ b/ld/testsuite/ld-i386/load8a.d
> >>>> @@ -0,0 +1,14 @@
> >>>> +#source: load8.s
> >>>> +#as: --32 -mrelax-relocations=yes
> >>>> +#ld: -melf_i386 -z noseparate-code
> >>>> +#objdump: -dw
> >>>> +
> >>>> +.*: +file format .*
> >>>> +
> >>>> +Disassembly of section .text:
> >>>> +
> >>>> +0+8048074 <_start>:
> >>>> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
> >>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> >>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> >>>
> >>> Please avoid adding the es prefix.  It may not be nop in the future.
> >>
> >> Constructive comments please. What other prefix do you suggest we use?
> >> Is another of the segment overrides okay? If not, all that's left is an
> >> address size override, if I'm not mistaken. Which overall seems less
> >> desirable to use.
> >
> > You can use 1-byte NOP.
>
> That'll have an undue effect on debugging, by splitting a single insn
> into two.
>
> >> Plus - is your concern only about 32-bit code, or also about 64-bit? For
> >> 32-bit code in particular I'm having difficulty seeing why an ES
> >> prefix might gain new meaning going forward, when an increasing number
> >> of ISA extensions are for 64-bit mode only anyway. If the concern
> >> extends to 64-bit code (it shouldn't, as the pre-386 segment overrides
> >> are documented as nop prefixes, if I'm not mistaken), earlier changes
> >> would need adjusting then, too, I think.
> >
> > I don't think adding instructions like PUSH is very useful.
>
> I was actively waiting for this kind of comment. Why was adding support
> for e.g. ADC and SBB useful then? Imo it can only be one of two ways:

Good question.  They were included since they have the same operand
encoding as ADD and they can be transformed together with ADD.

> Either we support everything that can be supported, or we limit things
> strictly to cases that are actively deemed useful.
>
> Jan
  
Jan Beulich Feb. 4, 2025, 11:02 a.m. UTC | #6
On 04.02.2025 11:41, Jan Beulich wrote:
> On 04.02.2025 11:17, H.J. Lu wrote:
>> On Tue, Feb 4, 2025 at 6:14 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>
>>> On 03.02.2025 23:40, H.J. Lu wrote:
>>>> On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>>> --- /dev/null
>>>>> +++ b/ld/testsuite/ld-i386/load8a.d
>>>>> @@ -0,0 +1,14 @@
>>>>> +#source: load8.s
>>>>> +#as: --32 -mrelax-relocations=yes
>>>>> +#ld: -melf_i386 -z noseparate-code
>>>>> +#objdump: -dw
>>>>> +
>>>>> +.*: +file format .*
>>>>> +
>>>>> +Disassembly of section .text:
>>>>> +
>>>>> +0+8048074 <_start>:
>>>>> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
>>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
>>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
>>>>
>>>> Please avoid adding the es prefix.  It may not be nop in the future.
>>>
>>> Constructive comments please. What other prefix do you suggest we use?
>>> Is another of the segment overrides okay? If not, all that's left is an
>>> address size override, if I'm not mistaken. Which overall seems less
>>> desirable to use.
>>
>> You can use 1-byte NOP.
> 
> That'll have an undue effect on debugging, by splitting a single insn
> into two.
> 
>>> Plus - is your concern only about 32-bit code, or also about 64-bit? For
>>> 32-bit code in particular I'm having difficulty seeing why an ES
>>> prefix might gain new meaning going forward, when an increasing number
>>> of ISA extensions are for 64-bit mode only anyway. If the concern
>>> extends to 64-bit code (it shouldn't, as the pre-386 segment overrides
>>> are documented as nop prefixes, if I'm not mistaken), earlier changes
>>> would need adjusting then, too, I think.
>>
>> I don't think adding instructions like PUSH is very useful.

Further to my earlier reply: You didn't answer my question. Which is
necessary to determine whether earlier changes need adjustment.

> I was actively waiting for this kind of comment. Why was adding support
> for e.g. ADC and SBB useful then? Imo it can only be one of two ways:
> Either we support everything that can be supported, or we limit things
> strictly to cases that are actively deemed useful.

Thinking of it: With TEST being special-cased in the logic involved, I'm
also curious to learn of a code sequence where TEST would sensibly be
used (and where CMP can't be used instead).

Jan
  
H.J. Lu Feb. 4, 2025, 11:12 a.m. UTC | #7
On Tue, Feb 4, 2025 at 7:02 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 04.02.2025 11:41, Jan Beulich wrote:
> > On 04.02.2025 11:17, H.J. Lu wrote:
> >> On Tue, Feb 4, 2025 at 6:14 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>
> >>> On 03.02.2025 23:40, H.J. Lu wrote:
> >>>> On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>>> --- /dev/null
> >>>>> +++ b/ld/testsuite/ld-i386/load8a.d
> >>>>> @@ -0,0 +1,14 @@
> >>>>> +#source: load8.s
> >>>>> +#as: --32 -mrelax-relocations=yes
> >>>>> +#ld: -melf_i386 -z noseparate-code
> >>>>> +#objdump: -dw
> >>>>> +
> >>>>> +.*: +file format .*
> >>>>> +
> >>>>> +Disassembly of section .text:
> >>>>> +
> >>>>> +0+8048074 <_start>:
> >>>>> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
> >>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> >>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> >>>>
> >>>> Please avoid adding the es prefix.  It may not be nop in the future.
> >>>
> >>> Constructive comments please. What other prefix do you suggest we use?
> >>> Is another of the segment overrides okay? If not, all that's left is an
> >>> address size override, if I'm not mistaken. Which overall seems less
> >>> desirable to use.
> >>
> >> You can use 1-byte NOP.
> >
> > That'll have an undue effect on debugging, by splitting a single insn
> > into two.
> >
> >>> Plus - is your concern only about 32-bit code, or also about 64-bit? For
> >>> 32-bit code in particular I'm having difficulty seeing why an ES
> >>> prefix might gain new meaning going forward, when an increasing number
> >>> of ISA extensions are for 64-bit mode only anyway. If the concern
> >>> extends to 64-bit code (it shouldn't, as the pre-386 segment overrides
> >>> are documented as nop prefixes, if I'm not mistaken), earlier changes
> >>> would need adjusting then, too, I think.
> >>
> >> I don't think adding instructions like PUSH is very useful.
>
> Further to my earlier reply: You didn't answer my question. Which is

Which question?

> necessary to determine whether earlier changes need adjustment.
>
> > I was actively waiting for this kind of comment. Why was adding support
> > for e.g. ADC and SBB useful then? Imo it can only be one of two ways:
> > Either we support everything that can be supported, or we limit things
> > strictly to cases that are actively deemed useful.
>
> Thinking of it: With TEST being special-cased in the logic involved, I'm
> also curious to learn of a code sequence where TEST would sensibly be
> used (and where CMP can't be used instead).
>

Compiler may generate TEST with GOT.
  
Jan Beulich Feb. 4, 2025, 11:16 a.m. UTC | #8
On 04.02.2025 12:12, H.J. Lu wrote:
> On Tue, Feb 4, 2025 at 7:02 PM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 04.02.2025 11:41, Jan Beulich wrote:
>>> On 04.02.2025 11:17, H.J. Lu wrote:
>>>> On Tue, Feb 4, 2025 at 6:14 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>>>
>>>>> On 03.02.2025 23:40, H.J. Lu wrote:
>>>>>> On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>>>>> --- /dev/null
>>>>>>> +++ b/ld/testsuite/ld-i386/load8a.d
>>>>>>> @@ -0,0 +1,14 @@
>>>>>>> +#source: load8.s
>>>>>>> +#as: --32 -mrelax-relocations=yes
>>>>>>> +#ld: -melf_i386 -z noseparate-code
>>>>>>> +#objdump: -dw
>>>>>>> +
>>>>>>> +.*: +file format .*
>>>>>>> +
>>>>>>> +Disassembly of section .text:
>>>>>>> +
>>>>>>> +0+8048074 <_start>:
>>>>>>> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
>>>>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
>>>>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
>>>>>>
>>>>>> Please avoid adding the es prefix.  It may not be nop in the future.
>>>>>
>>>>> Constructive comments please. What other prefix do you suggest we use?
>>>>> Is another of the segment overrides okay? If not, all that's left is an
>>>>> address size override, if I'm not mistaken. Which overall seems less
>>>>> desirable to use.
>>>>
>>>> You can use 1-byte NOP.
>>>
>>> That'll have an undue effect on debugging, by splitting a single insn
>>> into two.
>>>
>>>>> Plus - is your concern only about 32-bit code, or also about 64-bit? For
>>>>> 32-bit code in particular I'm having difficulty seeing why an ES
>>>>> prefix might gain new meaning going forward, when an increasing number
>>>>> of ISA extensions are for 64-bit mode only anyway. If the concern
>>>>> extends to 64-bit code (it shouldn't, as the pre-386 segment overrides
>>>>> are documented as nop prefixes, if I'm not mistaken), earlier changes
>>>>> would need adjusting then, too, I think.
>>>>
>>>> I don't think adding instructions like PUSH is very useful.
>>
>> Further to my earlier reply: You didn't answer my question. Which is
> 
> Which question?

"Is your concern only about 32-bit code, or also about 64-bit?"

>> necessary to determine whether earlier changes need adjustment.
>>
>>> I was actively waiting for this kind of comment. Why was adding support
>>> for e.g. ADC and SBB useful then? Imo it can only be one of two ways:
>>> Either we support everything that can be supported, or we limit things
>>> strictly to cases that are actively deemed useful.
>>
>> Thinking of it: With TEST being special-cased in the logic involved, I'm
>> also curious to learn of a code sequence where TEST would sensibly be
>> used (and where CMP can't be used instead).
> 
> Compiler may generate TEST with GOT.

Can it? Why would it? (IOW: Again you didn't really address my request.)

Jan
  
H.J. Lu Feb. 4, 2025, 12:14 p.m. UTC | #9
On Tue, Feb 4, 2025 at 7:16 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 04.02.2025 12:12, H.J. Lu wrote:
> > On Tue, Feb 4, 2025 at 7:02 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>
> >> On 04.02.2025 11:41, Jan Beulich wrote:
> >>> On 04.02.2025 11:17, H.J. Lu wrote:
> >>>> On Tue, Feb 4, 2025 at 6:14 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>>>
> >>>>> On 03.02.2025 23:40, H.J. Lu wrote:
> >>>>>> On Mon, Feb 3, 2025 at 7:41 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>>>>> --- /dev/null
> >>>>>>> +++ b/ld/testsuite/ld-i386/load8a.d
> >>>>>>> @@ -0,0 +1,14 @@
> >>>>>>> +#source: load8.s
> >>>>>>> +#as: --32 -mrelax-relocations=yes
> >>>>>>> +#ld: -melf_i386 -z noseparate-code
> >>>>>>> +#objdump: -dw
> >>>>>>> +
> >>>>>>> +.*: +file format .*
> >>>>>>> +
> >>>>>>> +Disassembly of section .text:
> >>>>>>> +
> >>>>>>> +0+8048074 <_start>:
> >>>>>>> +[      ]*[a-f0-9]+:    26 68 86 90 04 08       es push \$0x8049086
> >>>>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> >>>>>>> +[      ]*[a-f0-9]+:    26 68 87 90 04 08       es push \$0x8049087
> >>>>>>
> >>>>>> Please avoid adding the es prefix.  It may not be nop in the future.
> >>>>>
> >>>>> Constructive comments please. What other prefix do you suggest we use?
> >>>>> Is another of the segment overrides okay? If not, all that's left is an
> >>>>> address size override, if I'm not mistaken. Which overall seems less
> >>>>> desirable to use.
> >>>>
> >>>> You can use 1-byte NOP.
> >>>
> >>> That'll have an undue effect on debugging, by splitting a single insn
> >>> into two.
> >>>
> >>>>> Plus - is your concern only about 32-bit code, or also about 64-bit? For
> >>>>> 32-bit code in particular I'm having difficulty seeing why an ES
> >>>>> prefix might gain new meaning going forward, when an increasing number
> >>>>> of ISA extensions are for 64-bit mode only anyway. If the concern
> >>>>> extends to 64-bit code (it shouldn't, as the pre-386 segment overrides
> >>>>> are documented as nop prefixes, if I'm not mistaken), earlier changes
> >>>>> would need adjusting then, too, I think.
> >>>>
> >>>> I don't think adding instructions like PUSH is very useful.
> >>
> >> Further to my earlier reply: You didn't answer my question. Which is
> >
> > Which question?
>
> "Is your concern only about 32-bit code, or also about 64-bit?"

Both.

> >> necessary to determine whether earlier changes need adjustment.
> >>
> >>> I was actively waiting for this kind of comment. Why was adding support
> >>> for e.g. ADC and SBB useful then? Imo it can only be one of two ways:
> >>> Either we support everything that can be supported, or we limit things
> >>> strictly to cases that are actively deemed useful.
> >>
> >> Thinking of it: With TEST being special-cased in the logic involved, I'm
> >> also curious to learn of a code sequence where TEST would sensibly be
> >> used (and where CMP can't be used instead).
> >
> > Compiler may generate TEST with GOT.
>
> Can it? Why would it? (IOW: Again you didn't really address my request.)
>

[hjl@gnu-tgl-3 tmp]$ cat x.c
extern int foo __attribute__ ((weak));

extern void bar (void);

__attribute__ ((regparm(3)))
void
_start (long int p)
{
  if (((unsigned long) &foo) & p)
    bar ();
}
[hjl@gnu-tgl-3 tmp]$ gcc -c -O2 -m32 x.c -mno-direct-extern-access

[hjl@gnu-tgl-3 tmp]$ objdump -dwr x.o

x.o:     file format elf32-i386


Disassembly of section .text:

00000000 <_start>:
   0: 85 05 00 00 00 00    test   %eax,0x0 2: R_386_GOT32X foo
   6: 75 08                jne    10 <_start+0x10>
   8: c3                    ret
   9: 8d b4 26 00 00 00 00 lea    0x0(%esi,%eiz,1),%esi
  10: e9 fc ff ff ff        jmp    11 <_start+0x11> 11: R_386_PC32 bar
[hjl@gnu-tgl-3 tmp]$

Which request?
  
Jan Beulich Feb. 5, 2025, 11:37 a.m. UTC | #10
On 04.02.2025 13:14, H.J. Lu wrote:
> On Tue, Feb 4, 2025 at 7:16 PM Jan Beulich <jbeulich@suse.com> wrote:
>> On 04.02.2025 12:12, H.J. Lu wrote:
>>> On Tue, Feb 4, 2025 at 7:02 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>> Thinking of it: With TEST being special-cased in the logic involved, I'm
>>>> also curious to learn of a code sequence where TEST would sensibly be
>>>> used (and where CMP can't be used instead).
>>>
>>> Compiler may generate TEST with GOT.
>>
>> Can it? Why would it? (IOW: Again you didn't really address my request.)
>>
> 
> [hjl@gnu-tgl-3 tmp]$ cat x.c
> extern int foo __attribute__ ((weak));
> 
> extern void bar (void);
> 
> __attribute__ ((regparm(3)))
> void
> _start (long int p)
> {
>   if (((unsigned long) &foo) & p)
>     bar ();
> }
> [hjl@gnu-tgl-3 tmp]$ gcc -c -O2 -m32 x.c -mno-direct-extern-access
> 
> [hjl@gnu-tgl-3 tmp]$ objdump -dwr x.o
> 
> x.o:     file format elf32-i386
> 
> 
> Disassembly of section .text:
> 
> 00000000 <_start>:
>    0: 85 05 00 00 00 00    test   %eax,0x0 2: R_386_GOT32X foo
>    6: 75 08                jne    10 <_start+0x10>
>    8: c3                    ret
>    9: 8d b4 26 00 00 00 00 lea    0x0(%esi,%eiz,1),%esi
>   10: e9 fc ff ff ff        jmp    11 <_start+0x11> 11: R_386_PC32 bar
> [hjl@gnu-tgl-3 tmp]$
> 
> Which request?

"... learn of a code sequence where TEST would sensibly be used" in my
earlier reply. So yes, you provided a contrived example now. I'm afraid
I can't assign meaning to "if (((unsigned long) &foo) & p)", though.
Instead, slightly adjusting your example, I can see PUSH being used by
the compiler in exactly the way we can (but so far didn't) optimize.
That's ordinary argument passing, which I'm inclined to call "sensible".

IOW the question remains as to why optimization of TEST was added,
beyond the "because we can" reasoning. As before, ADD, SUB, and perhaps
CMP make sense when operating on pointers. For AND, OR, XOR, ADC, SBB,
and TEST I'm having a hard time seeing good uses. Since we deal with
them, we also ought to deal with IMUL in the same manner (where
possible). Since beyond these arithmetic ops we also deal with MOV, we
similarly should handle PUSH.

Jan
  
H.J. Lu Feb. 6, 2025, 2:28 a.m. UTC | #11
On Wed, Feb 5, 2025 at 7:37 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 04.02.2025 13:14, H.J. Lu wrote:
> > On Tue, Feb 4, 2025 at 7:16 PM Jan Beulich <jbeulich@suse.com> wrote:
> >> On 04.02.2025 12:12, H.J. Lu wrote:
> >>> On Tue, Feb 4, 2025 at 7:02 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>> Thinking of it: With TEST being special-cased in the logic involved, I'm
> >>>> also curious to learn of a code sequence where TEST would sensibly be
> >>>> used (and where CMP can't be used instead).
> >>>
> >>> Compiler may generate TEST with GOT.
> >>
> >> Can it? Why would it? (IOW: Again you didn't really address my request.)
> >>
> >
> > [hjl@gnu-tgl-3 tmp]$ cat x.c
> > extern int foo __attribute__ ((weak));
> >
> > extern void bar (void);
> >
> > __attribute__ ((regparm(3)))
> > void
> > _start (long int p)
> > {
> >   if (((unsigned long) &foo) & p)
> >     bar ();
> > }
> > [hjl@gnu-tgl-3 tmp]$ gcc -c -O2 -m32 x.c -mno-direct-extern-access
> >
> > [hjl@gnu-tgl-3 tmp]$ objdump -dwr x.o
> >
> > x.o:     file format elf32-i386
> >
> >
> > Disassembly of section .text:
> >
> > 00000000 <_start>:
> >    0: 85 05 00 00 00 00    test   %eax,0x0 2: R_386_GOT32X foo
> >    6: 75 08                jne    10 <_start+0x10>
> >    8: c3                    ret
> >    9: 8d b4 26 00 00 00 00 lea    0x0(%esi,%eiz,1),%esi
> >   10: e9 fc ff ff ff        jmp    11 <_start+0x11> 11: R_386_PC32 bar
> > [hjl@gnu-tgl-3 tmp]$
> >
> > Which request?
>
> "... learn of a code sequence where TEST would sensibly be used" in my
> earlier reply. So yes, you provided a contrived example now. I'm afraid
> I can't assign meaning to "if (((unsigned long) &foo) & p)", though.

It can be used to check pointer alignment.

> Instead, slightly adjusting your example, I can see PUSH being used by
> the compiler in exactly the way we can (but so far didn't) optimize.
> That's ordinary argument passing, which I'm inclined to call "sensible".

Please don't add a segment register.

> IOW the question remains as to why optimization of TEST was added,
> beyond the "because we can" reasoning. As before, ADD, SUB, and perhaps
> CMP make sense when operating on pointers. For AND, OR, XOR, ADC, SBB,
> and TEST I'm having a hard time seeing good uses. Since we deal with
> them, we also ought to deal with IMUL in the same manner (where
> possible). Since beyond these arithmetic ops we also deal with MOV, we
> similarly should handle PUSH.
>
> Jan
  
Jan Beulich Feb. 6, 2025, 12:08 p.m. UTC | #12
On 06.02.2025 03:28, H.J. Lu wrote:
> On Wed, Feb 5, 2025 at 7:37 PM Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 04.02.2025 13:14, H.J. Lu wrote:
>>> On Tue, Feb 4, 2025 at 7:16 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>> On 04.02.2025 12:12, H.J. Lu wrote:
>>>>> On Tue, Feb 4, 2025 at 7:02 PM Jan Beulich <jbeulich@suse.com> wrote:
>>>>>> Thinking of it: With TEST being special-cased in the logic involved, I'm
>>>>>> also curious to learn of a code sequence where TEST would sensibly be
>>>>>> used (and where CMP can't be used instead).
>>>>>
>>>>> Compiler may generate TEST with GOT.
>>>>
>>>> Can it? Why would it? (IOW: Again you didn't really address my request.)
>>>>
>>>
>>> [hjl@gnu-tgl-3 tmp]$ cat x.c
>>> extern int foo __attribute__ ((weak));
>>>
>>> extern void bar (void);
>>>
>>> __attribute__ ((regparm(3)))
>>> void
>>> _start (long int p)
>>> {
>>>   if (((unsigned long) &foo) & p)
>>>     bar ();
>>> }
>>> [hjl@gnu-tgl-3 tmp]$ gcc -c -O2 -m32 x.c -mno-direct-extern-access
>>>
>>> [hjl@gnu-tgl-3 tmp]$ objdump -dwr x.o
>>>
>>> x.o:     file format elf32-i386
>>>
>>>
>>> Disassembly of section .text:
>>>
>>> 00000000 <_start>:
>>>    0: 85 05 00 00 00 00    test   %eax,0x0 2: R_386_GOT32X foo
>>>    6: 75 08                jne    10 <_start+0x10>
>>>    8: c3                    ret
>>>    9: 8d b4 26 00 00 00 00 lea    0x0(%esi,%eiz,1),%esi
>>>   10: e9 fc ff ff ff        jmp    11 <_start+0x11> 11: R_386_PC32 bar
>>> [hjl@gnu-tgl-3 tmp]$
>>>
>>> Which request?
>>
>> "... learn of a code sequence where TEST would sensibly be used" in my
>> earlier reply. So yes, you provided a contrived example now. I'm afraid
>> I can't assign meaning to "if (((unsigned long) &foo) & p)", though.
> 
> It can be used to check pointer alignment.

Hmm, yes, I didn't think of that. Yet then TESTB (and perhaps even TESTW)
would also want to be supported.

>> Instead, slightly adjusting your example, I can see PUSH being used by
>> the compiler in exactly the way we can (but so far didn't) optimize.
>> That's ordinary argument passing, which I'm inclined to call "sensible".
> 
> Please don't add a segment register.

Actually I have to re-raise the why question: We use CS: in some of the
multi-byte NOPs we emit, both for 32- and for 64-bit.

Plus, as said before, I view it as pretty undesirable to split a single
insn into two (NOP of appropriate size followed by shrunk insn).

Alternatively for 32-bit's PUSH we could use an address size override,
albeit this doesn't look very desirable either. For 64-bit we could use
dummy REX prefixes, unless we're dealing with a REX2 encoding. For REX2
forms of PUSH we could either choose to not use REX2, or again use an
address override. An address size override could also be used for
immediate-to-register MOV.

Also: If anything, these prefixes could only gain hint-like meaning
anyway (like two of them once did for conditional jumps), so there would
not be any functional issue. As for their use in multi-byte NOPs, we can
deal with eventual performance issues once we know how bad things are.

Jan
  
H.J. Lu Feb. 7, 2025, 5:27 a.m. UTC | #13
On Thu, Feb 6, 2025 at 8:08 PM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 06.02.2025 03:28, H.J. Lu wrote:
> > On Wed, Feb 5, 2025 at 7:37 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>
> >> On 04.02.2025 13:14, H.J. Lu wrote:
> >>> On Tue, Feb 4, 2025 at 7:16 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>> On 04.02.2025 12:12, H.J. Lu wrote:
> >>>>> On Tue, Feb 4, 2025 at 7:02 PM Jan Beulich <jbeulich@suse.com> wrote:
> >>>>>> Thinking of it: With TEST being special-cased in the logic involved, I'm
> >>>>>> also curious to learn of a code sequence where TEST would sensibly be
> >>>>>> used (and where CMP can't be used instead).
> >>>>>
> >>>>> Compiler may generate TEST with GOT.
> >>>>
> >>>> Can it? Why would it? (IOW: Again you didn't really address my request.)
> >>>>
> >>>
> >>> [hjl@gnu-tgl-3 tmp]$ cat x.c
> >>> extern int foo __attribute__ ((weak));
> >>>
> >>> extern void bar (void);
> >>>
> >>> __attribute__ ((regparm(3)))
> >>> void
> >>> _start (long int p)
> >>> {
> >>>   if (((unsigned long) &foo) & p)
> >>>     bar ();
> >>> }
> >>> [hjl@gnu-tgl-3 tmp]$ gcc -c -O2 -m32 x.c -mno-direct-extern-access
> >>>
> >>> [hjl@gnu-tgl-3 tmp]$ objdump -dwr x.o
> >>>
> >>> x.o:     file format elf32-i386
> >>>
> >>>
> >>> Disassembly of section .text:
> >>>
> >>> 00000000 <_start>:
> >>>    0: 85 05 00 00 00 00    test   %eax,0x0 2: R_386_GOT32X foo
> >>>    6: 75 08                jne    10 <_start+0x10>
> >>>    8: c3                    ret
> >>>    9: 8d b4 26 00 00 00 00 lea    0x0(%esi,%eiz,1),%esi
> >>>   10: e9 fc ff ff ff        jmp    11 <_start+0x11> 11: R_386_PC32 bar
> >>> [hjl@gnu-tgl-3 tmp]$
> >>>
> >>> Which request?
> >>
> >> "... learn of a code sequence where TEST would sensibly be used" in my
> >> earlier reply. So yes, you provided a contrived example now. I'm afraid
> >> I can't assign meaning to "if (((unsigned long) &foo) & p)", though.
> >
> > It can be used to check pointer alignment.
>
> Hmm, yes, I didn't think of that. Yet then TESTB (and perhaps even TESTW)
> would also want to be supported.
>
> >> Instead, slightly adjusting your example, I can see PUSH being used by
> >> the compiler in exactly the way we can (but so far didn't) optimize.
> >> That's ordinary argument passing, which I'm inclined to call "sensible".
> >
> > Please don't add a segment register.
>
> Actually I have to re-raise the why question: We use CS: in some of the
> multi-byte NOPs we emit, both for 32- and for 64-bit.

CS is better than ES.

> Plus, as said before, I view it as pretty undesirable to split a single
> insn into two (NOP of appropriate size followed by shrunk insn).
>
> Alternatively for 32-bit's PUSH we could use an address size override,
> albeit this doesn't look very desirable either. For 64-bit we could use
> dummy REX prefixes, unless we're dealing with a REX2 encoding. For REX2
> forms of PUSH we could either choose to not use REX2, or again use an
> address override. An address size override could also be used for
> immediate-to-register MOV.
>
> Also: If anything, these prefixes could only gain hint-like meaning
> anyway (like two of them once did for conditional jumps), so there would
> not be any functional issue. As for their use in multi-byte NOPs, we can
> deal with eventual performance issues once we know how bad things are.
>
> Jan
  

Patch

--- a/bfd/elf32-i386.c
+++ b/bfd/elf32-i386.c
@@ -1209,6 +1209,10 @@  elf_i386_tls_transition (struct bfd_link
    to
    test $foo, %reg1
    and convert
+   push foo@GOT[(%reg)]
+   to
+   push $foo
+   and convert
    binop foo@GOT[(%reg1)], %reg2
    to
    binop $foo, %reg2
@@ -1233,7 +1237,7 @@  elf_i386_convert_load_reloc (bfd *abfd,
   unsigned int addend;
   unsigned int nop;
   bfd_vma nop_offset;
-  bool is_pic;
+  bool is_pic, is_branch = false;
   bool to_reloc_32;
   bool abs_symbol;
   unsigned int r_type;
@@ -1301,6 +1305,23 @@  elf_i386_convert_load_reloc (bfd *abfd,
 
   opcode = bfd_get_8 (abfd, contents + roff - 2);
 
+  if (opcode == 0xff)
+    {
+      switch (modrm & 0x38)
+	{
+	case 0x10: /* CALL */
+	case 0x20: /* JMP */
+	  is_branch = true;
+	  break;
+
+	case 0x30: /* PUSH */
+	  break;
+
+	default:
+	  return true;
+	}
+    }
+
   /* Convert to R_386_32 if PIC is false or there is no base
      register.  */
   to_reloc_32 = !is_pic || baseless;
@@ -1311,7 +1332,7 @@  elf_i386_convert_load_reloc (bfd *abfd,
      reloc.  */
   if (h == NULL)
     {
-      if (opcode == 0x0ff)
+      if (is_branch)
 	/* Convert "call/jmp *foo@GOT[(%reg)]".  */
 	goto convert_branch;
       else
@@ -1327,7 +1348,7 @@  elf_i386_convert_load_reloc (bfd *abfd,
       && !eh->linker_def
       && local_ref)
     {
-      if (opcode == 0xff)
+      if (is_branch)
 	{
 	  /* No direct branch to 0 for PIC.  */
 	  if (is_pic)
@@ -1343,7 +1364,7 @@  elf_i386_convert_load_reloc (bfd *abfd,
 	}
     }
 
-  if (opcode == 0xff)
+  if (is_branch)
     {
       /* We have "call/jmp *foo@GOT[(%reg)]".  */
       if ((h->root.type == bfd_link_hash_defined
@@ -1399,7 +1420,8 @@  elf_i386_convert_load_reloc (bfd *abfd,
   else
     {
       /* We have "mov foo@GOT[(%re1g)], %reg2",
-	 "test %reg1, foo@GOT(%reg2)" and
+	 "test %reg1, foo@GOT(%reg2)",
+	 "push foo@GOT[(%reg)]", or
 	 "binop foo@GOT[(%reg1)], %reg2".
 
 	 Avoid optimizing _DYNAMIC since ld.so may use its
@@ -1460,6 +1482,13 @@  elf_i386_convert_load_reloc (bfd *abfd,
 		  modrm = 0xc0 | ((modrm & 0x38) >> 3) | (opcode & 0x38);
 		  opcode = 0x81;
 		}
+	      else if (opcode == 0xff)
+		{
+		  /* Convert "push foo@GOT(%reg)" to
+		     "push $foo".  */
+		  modrm = 0x68; /* Really the opcode.  */
+		  opcode = 0x26; /* Really a meaningless %es: prefix.  */
+		}
 	      else
 		return true;
 
--- a/bfd/elf64-x86-64.c
+++ b/bfd/elf64-x86-64.c
@@ -1739,13 +1739,16 @@  elf_x86_64_need_pic (struct bfd_link_inf
 }
 
 /* Move the R bits to the B bits in EVEX payload byte 1.  */
-static unsigned int evex_move_r_to_b (unsigned int byte1)
+static unsigned int evex_move_r_to_b (unsigned int byte1, bool copy)
 {
   byte1 = (byte1 & ~(1 << 5)) | ((byte1 & (1 << 7)) >> 2); /* R3 -> B3 */
   byte1 = (byte1 & ~(1 << 3)) | ((~byte1 & (1 << 4)) >> 1); /* R4 -> B4 */
 
   /* Set both R bits, as they're inverted.  */
-  return byte1 | (1 << 4) | (1 << 7);
+  if (!copy)
+    byte1 |= (1 << 4) | (1 << 7);
+
+  return byte1;
 }
 
 /* With the local symbol, foo, we convert
@@ -1762,10 +1765,14 @@  static unsigned int evex_move_r_to_b (un
    to
    test $foo, %reg
    and convert
+   push foo@GOTPCREL(%rip)
+   to
+   push $foo
+   and convert
    binop foo@GOTPCREL(%rip), %reg
    to
    binop $foo, %reg
-   where binop is one of adc, add, and, cmp, or, sbb, sub, xor
+   where binop is one of adc, add, and, cmp, imul, or, sbb, sub, xor
    instructions.  */
 
 static bool
@@ -1781,6 +1788,7 @@  elf_x86_64_convert_load_reloc (bfd *abfd
   bool is_pic;
   bool no_overflow;
   bool relocx;
+  bool is_branch = false;
   bool to_reloc_pc32;
   bool abs_symbol;
   bool local_ref;
@@ -1873,6 +1881,23 @@  elf_x86_64_convert_load_reloc (bfd *abfd
   r_symndx = htab->r_sym (irel->r_info);
 
   opcode = bfd_get_8 (abfd, contents + roff - 2);
+  modrm = bfd_get_8 (abfd, contents + roff - 1);
+  if (opcode == 0xff)
+    {
+      switch (modrm & 0x38)
+	{
+	case 0x10: /* CALL */
+	case 0x20: /* JMP */
+	  is_branch = true;
+	  break;
+
+	case 0x30: /* PUSH */
+	  break;
+
+	default:
+	  return true;
+	}
+    }
 
   /* Convert mov to lea since it has been done for a while.  */
   if (opcode != 0x8b)
@@ -1890,7 +1915,7 @@  elf_x86_64_convert_load_reloc (bfd *abfd
      3. no_overflow is true.
      4. PIC.
      */
-  to_reloc_pc32 = (opcode == 0xff
+  to_reloc_pc32 = (is_branch
 		   || !relocx
 		   || no_overflow
 		   || is_pic);
@@ -1942,7 +1967,7 @@  elf_x86_64_convert_load_reloc (bfd *abfd
 	      && !eh->linker_def
 	      && local_ref))
 	{
-	  if (opcode == 0xff)
+	  if (is_branch)
 	    {
 	      /* Skip for branch instructions since R_X86_64_PC32
 		 may overflow.  */
@@ -2009,7 +2034,7 @@  elf_x86_64_convert_load_reloc (bfd *abfd
     return true;
 
  convert:
-  if (opcode == 0xff)
+  if (is_branch)
     {
       /* We have "call/jmp *foo@GOTPCREL(%rip)".  */
       unsigned int nop;
@@ -2018,7 +2043,6 @@  elf_x86_64_convert_load_reloc (bfd *abfd
 
       /* Convert R_X86_64_GOTPCRELX and R_X86_64_REX_GOTPCRELX to
 	 R_X86_64_PC32.  */
-      modrm = bfd_get_8 (abfd, contents + roff - 1);
       if (modrm == 0x25)
 	{
 	  /* Convert to "jmp foo nop".  */
@@ -2064,11 +2088,12 @@  elf_x86_64_convert_load_reloc (bfd *abfd
     }
   else if (r_type == R_X86_64_CODE_6_GOTPCRELX && opcode != 0x8b)
     {
+      bool move_v_r = false;
+
       /* R_X86_64_PC32 isn't supported.  */
       if (to_reloc_pc32)
 	return true;
 
-      modrm = bfd_get_8 (abfd, contents + roff - 1);
       if (opcode == 0x85)
 	{
 	  /* Convert "ctest<cc> %reg, foo@GOTPCREL(%rip)" to
@@ -2094,6 +2119,23 @@  elf_x86_64_convert_load_reloc (bfd *abfd
 	  modrm = 0xc0 | ((modrm & 0x38) >> 3) | (opcode & 0x38);
 	  opcode = 0x81;
 	}
+      else if (opcode == 0xaf)
+	{
+	  if (!(evex[2] & 0x10))
+	    {
+	      /* Convert "imul foo@GOTPCREL(%rip), %reg" to
+	         "imul $foo, %reg, %reg".  */
+	      modrm = 0xc0 | ((modrm & 0x38) >> 3) | (modrm & 0x38);
+	    }
+	  else
+	    {
+	      /* Convert "imul foo@GOTPCREL(%rip), %reg1, %reg2" to
+	         "imul $foo, %reg1, %reg2".  */
+	      modrm = 0xc0 | ((modrm & 0x38) >> 3) | (~evex[1] & 0x38);
+	      move_v_r = true;
+	    }
+	  opcode = 0x69;
+	}
       else
 	return true;
 
@@ -2119,7 +2161,23 @@  elf_x86_64_convert_load_reloc (bfd *abfd
       bfd_put_8 (abfd, opcode, contents + roff - 2);
       bfd_put_8 (abfd, modrm, contents + roff - 1);
 
-      evex[0] = evex_move_r_to_b (evex[0]);
+      evex[0] = evex_move_r_to_b (evex[0], opcode == 0x69 && !move_v_r);
+      if (move_v_r)
+	{
+	  /* Move the top two V bits to the R bits in EVEX payload byte 1.
+	     Note that evex_move_r_to_b() set both R bits.  */
+	  if (!(evex[1] & (1 << 6)))
+	    evex[0] &= ~(1 << 7); /* V3 -> R3 */
+	  if (!(evex[2] & (1 << 3)))
+	    evex[0] &= ~(1 << 4); /* V4 -> R4 */
+	  /* Set all V bits, as they're inverted.  */
+	  evex[1] |= 0xf << 3;
+	  evex[2] |= 1 << 3;
+	  /* Clear the ND (ZU) bit (it ought to be ignored anyway).  */
+	  evex[2] &= ~(1 << 4);
+	  bfd_put_8 (abfd, evex[2], contents + roff - 3);
+	  bfd_put_8 (abfd, evex[1], contents + roff - 4);
+	}
       bfd_put_8 (abfd, evex[0], contents + roff - 5);
 
       /* No addend for R_X86_64_32/R_X86_64_32S relocations.  */
@@ -2162,7 +2220,10 @@  elf_x86_64_convert_load_reloc (bfd *abfd
 	{
 	  if (bfd_get_8 (abfd, contents + roff - 4) == 0xd5)
 	    {
-	      rex2 = bfd_get_8 (abfd, contents + roff - 3);
+	      /* Make sure even an all-zero payload leaves a non-zero value
+		 in the variable.  */
+	      rex2 = bfd_get_8 (abfd, contents + roff - 3) | 0x100;
+	      rex2_mask |= 0x100;
 	      rex_w = (rex2 & REX_W) != 0;
 	    }
 	  else if (bfd_get_8 (abfd, contents + roff - 4) == 0x0f)
@@ -2195,7 +2256,6 @@  elf_x86_64_convert_load_reloc (bfd *abfd
 	      /* Convert "mov foo@GOTPCREL(%rip), %reg" to
 		 "mov $foo, %reg".  */
 	      opcode = 0xc7;
-	      modrm = bfd_get_8 (abfd, contents + roff - 1);
 	      modrm = 0xc0 | (modrm & 0x38) >> 3;
 	      if (rex_w && ABI_64_P (link_info->output_bfd))
 		{
@@ -2222,7 +2282,6 @@  elf_x86_64_convert_load_reloc (bfd *abfd
 	  if (to_reloc_pc32)
 	    return true;
 
-	  modrm = bfd_get_8 (abfd, contents + roff - 1);
 	  if (opcode == 0x85)
 	    {
 	      /* Convert "test %reg, foo@GOTPCREL(%rip)" to
@@ -2237,6 +2296,39 @@  elf_x86_64_convert_load_reloc (bfd *abfd
 	      modrm = 0xc0 | ((modrm & 0x38) >> 3) | (opcode & 0x38);
 	      opcode = 0x81;
 	    }
+	  else if (opcode == 0xaf && (rex2 & (REX2_M << 4)))
+	    {
+	      /* Convert "imul foo@GOTPCREL(%rip), %reg" to
+		 "imul $foo, %reg, %reg".  */
+	      modrm = 0xc0 | ((modrm & 0x38) >> 3) | (modrm & 0x38);
+	      rex_mask = 0;
+	      rex2_mask = REX2_M << 4;
+	      opcode = 0x69;
+	    }
+	  else if (opcode == 0xff && !(rex2 & (REX2_M << 4)))
+	    {
+	      /* Convert "push foo@GOTPCREL(%rip)" to
+		 "push $foo".  */
+	      bfd_put_8 (abfd, 0x68, contents + roff - 1);
+	      if (rex)
+		{
+		  bfd_put_8 (abfd, 0x26, contents + roff - 3);
+		  bfd_put_8 (abfd, rex, contents + roff - 2);
+		}
+	      else if (rex2)
+		{
+		  bfd_put_8 (abfd, 0x26, contents + roff - 4);
+		  bfd_put_8 (abfd, 0xd5, contents + roff - 3);
+		  bfd_put_8 (abfd, rex2, contents + roff - 2);
+		}
+	      else
+		bfd_put_8 (abfd, 0x26, contents + roff - 2);
+
+	      r_type = R_X86_64_32S;
+	      /* No addend for R_X86_64_32S relocations.  */
+	      irel->r_addend = 0;
+	      goto finish;
+	    }
 	  else
 	    return true;
 
@@ -2297,6 +2389,7 @@  elf_x86_64_convert_load_reloc (bfd *abfd
 	}
     }
 
+ finish:
   *r_type_p = r_type;
   irel->r_info = htab->r_info (r_symndx,
 			       r_type | R_X86_64_converted_reloc_bit);
@@ -4378,7 +4471,7 @@  elf_x86_64_relocate_section (bfd *output
 		      continue;
 		    }
 
-		  byte1 = evex_move_r_to_b (byte1);
+		  byte1 = evex_move_r_to_b (byte1, false);
 		  bfd_put_8 (output_bfd, byte1, contents + roff - 5);
 		  bfd_put_8 (output_bfd, 0x81, contents + roff - 2);
 		  bfd_put_8 (output_bfd, 0xc0 | reg, contents + roff - 1);
--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -12928,9 +12928,9 @@  output_disp (fragS *insn_start_frag, off
 	      else if (object_64bit)
 		continue;
 
-	      /* Check for "call/jmp *mem", "mov mem, %reg", "movrs mem, %reg",
-		 "test %reg, mem" and "binop mem, %reg" where binop
-		 is one of adc, add, and, cmp, or, sbb, sub, xor
+	      /* Check for "call/jmp *mem", "push mem", "mov mem, %reg",
+		 "movrs mem, %reg", "test %reg, mem" and "binop mem, %reg" where
+		 binop is one of adc, add, and, cmp, or, sbb, sub, xor, or imul
 		 instructions without data prefix.  Always generate
 		 R_386_GOT32X for "sym*GOT" operand in 32-bit mode.  */
 	      unsigned int space = dot_insn () ? i.insn_opcode_space
@@ -12940,7 +12940,7 @@  output_disp (fragS *insn_start_frag, off
 		      || (i.rm.mode == 0 && i.rm.regmem == 5))
 		  && ((space == SPACE_BASE
 		       && i.tm.base_opcode == 0xff
-		       && (i.rm.reg == 2 || i.rm.reg == 4))
+		       && (i.rm.reg == 2 || i.rm.reg == 4 || i.rm.reg == 6))
 		      || ((space == SPACE_BASE
 			   || space == SPACE_0F38
 			   || space == SPACE_MAP4)
@@ -12949,7 +12949,13 @@  output_disp (fragS *insn_start_frag, off
 			   || space == SPACE_MAP4)
 			  && (i.tm.base_opcode == 0x85
 			      || (i.tm.base_opcode
-				  | (i.operands > 2 ? 0x3a : 0x38)) == 0x3b))))
+				  | (i.operands > 2 ? 0x3a : 0x38)) == 0x3b))
+		      || (((space == SPACE_0F
+			    /* Because of the 0F prefix, no suitable relocation
+			       exists for this unless it's REX2-encoded.  */
+			    && is_apx_rex2_encoding ())
+			   || space == SPACE_MAP4)
+			  && i.tm.base_opcode == 0xaf)))
 		{
 		  if (object_64bit)
 		    {
--- a/ld/testsuite/ld-i386/i386.exp
+++ b/ld/testsuite/ld-i386/i386.exp
@@ -373,6 +373,8 @@  run_dump_test "load5a"
 run_dump_test "load5b"
 run_dump_test "load6"
 run_dump_test "load7"
+run_dump_test "load8a"
+run_dump_test "load8b"
 run_dump_test "pr19175"
 run_dump_test "pr19615"
 run_dump_test "pr19636-1a"
--- /dev/null
+++ b/ld/testsuite/ld-i386/load8.s
@@ -0,0 +1,20 @@ 
+	.data
+	.type	bar, @object
+bar:
+	.byte	1
+	.size	bar, .-bar
+	.globl	foo
+	.type	foo, @object
+foo:
+	.byte	1
+	.size	foo, .-foo
+	.text
+	.globl	_start
+	.type	_start, @function
+_start:
+	push	bar@GOT(%ecx)
+	push	foo@GOT(%edx)
+	.ifndef PIC
+	push	foo@GOT
+	.endif
+	.size	_start, .-_start
--- /dev/null
+++ b/ld/testsuite/ld-i386/load8a.d
@@ -0,0 +1,14 @@ 
+#source: load8.s
+#as: --32 -mrelax-relocations=yes
+#ld: -melf_i386 -z noseparate-code
+#objdump: -dw
+
+.*: +file format .*
+
+Disassembly of section .text:
+
+0+8048074 <_start>:
+[ 	]*[a-f0-9]+:	26 68 86 90 04 08    	es push \$0x8049086
+[ 	]*[a-f0-9]+:	26 68 87 90 04 08    	es push \$0x8049087
+[ 	]*[a-f0-9]+:	26 68 87 90 04 08    	es push \$0x8049087
+#pass
--- /dev/null
+++ b/ld/testsuite/ld-i386/load8b.d
@@ -0,0 +1,13 @@ 
+#source: load8.s
+#as: --32 -mshared -mrelax-relocations=yes --defsym PIC=1
+#ld: -melf_i386 -shared -z noseparate-code
+#objdump: -dw
+
+.*: +file format .*
+
+Disassembly of section .text:
+
+0+[0-9a-f]+ <_start>:
+[ 	]*[0-9a-f]+:	ff b1 f8 ff ff ff    	push   -0x8\(%ecx\)
+[ 	]*[0-9a-f]+:	ff b2 fc ff ff ff    	push   -0x4\(%edx\)
+#pass
--- a/ld/testsuite/ld-x86-64/apx-load1.s
+++ b/ld/testsuite/ld-x86-64/apx-load1.s
@@ -118,5 +118,11 @@  _start:
 	sub	%rbp, bar@GOTPCREL(%rip), %r21
 	xor	%rsi, bar@GOTPCREL(%rip), %r22
 
+	imul	bar@GOTPCREL(%rip), %r17
+	{nf} imul bar@GOTPCREL(%rip), %r17
+	imul	bar@GOTPCREL(%rip), %r17, %rdx
+	imul	bar@GOTPCREL(%rip), %rcx, %r18
+	{rex2} pushq bar@GOTPCREL(%rip)
+
 	.size	_start, .-_start
 	.p2align 12, 0x90
--- a/ld/testsuite/ld-x86-64/apx-load1a.d
+++ b/ld/testsuite/ld-x86-64/apx-load1a.d
@@ -115,4 +115,9 @@  Disassembly of section .text:
  +[a-f0-9]+:	62 f4 dc 10 19 25 74 0c 20 00 	sbb    %rsp,0x200c74\(%rip\),%r20        # 602000 <.*>
  +[a-f0-9]+:	62 f4 d4 10 29 2d 6a 0c 20 00 	sub    %rbp,0x200c6a\(%rip\),%r21        # 602000 <.*>
  +[a-f0-9]+:	62 f4 cc 10 81 f6 20 20 60 00 	xor    \$0x602020,%rsi,%r22
+ +[a-f0-9]+:	d5 58 69 c9 20 20 60 00 	imul   \$0x602020,%r17,%r17
+ +[a-f0-9]+:	62 ec fc 0c 69 c9 20 20 60 00 	\{nf\} imul \$0x602020,%r17,%r17
+ +[a-f0-9]+:	62 fc fc 08 69 d1 20 20 60 00 	imul   \$0x602020,%r17,%rdx
+ +[a-f0-9]+:	62 e4 fc 08 69 d1 20 20 60 00 	imul   \$0x602020,%rcx,%r18
+ +[a-f0-9]+:	26 d5 00 68 20 20 60 00 	es \{rex2 0x0\} push \$0x602020
 #pass
--- a/ld/testsuite/ld-x86-64/apx-load1c.d
+++ b/ld/testsuite/ld-x86-64/apx-load1c.d
@@ -108,4 +108,9 @@  Disassembly of section .text:
  +[a-f0-9]+:	62 f4 dc 10 19 25 54 0d 20 00 	sbb    %rsp,0x200d54\(%rip\),%r20        # 2020e0 <.*>
  +[a-f0-9]+:	62 f4 d4 10 29 2d 4a 0d 20 00 	sub    %rbp,0x200d4a\(%rip\),%r21        # 2020e0 <.*>
  +[a-f0-9]+:	62 f4 cc 10 31 35 40 0d 20 00 	xor    %rsi,0x200d40\(%rip\),%r22        # 2020e0 <.*>
+ +[a-f0-9]+:	d5 c8 af 0d 38 0d 20 00 	imul   0x200d38\(%rip\),%r17        # 2020e0 <.*>
+ +[a-f0-9]+:	62 e4 fc 0c af 0d 2e 0d 20 00 	\{nf\} imul 0x200d2e\(%rip\),%r17        # 2020e0 <.*>
+ +[a-f0-9]+:	62 e4 ec 18 af 0d 24 0d 20 00 	imul   0x200d24\(%rip\),%r17,%rdx        # 2020e0 <.*>
+ +[a-f0-9]+:	62 f4 ec 10 af 0d 1a 0d 20 00 	imul   0x200d1a\(%rip\),%rcx,%r18        # 2020e0 <.*>
+ +[a-f0-9]+:	d5 00 ff 35 12 0d 20 00 	\{rex2 0x0\} push 0x200d12\(%rip\)        # 2020e0 <.*>
 #pass
--- a/ld/testsuite/ld-x86-64/apx-load1d.d
+++ b/ld/testsuite/ld-x86-64/apx-load1d.d
@@ -108,4 +108,9 @@  Disassembly of section .text:
  +[a-f0-9]+:	62 f4 dc 10 19 25 e4 0c 20 00 	sbb    %rsp,0x200ce4\(%rip\),%r20        # 202070 <.*>
  +[a-f0-9]+:	62 f4 d4 10 29 2d da 0c 20 00 	sub    %rbp,0x200cda\(%rip\),%r21        # 202070 <.*>
  +[a-f0-9]+:	62 f4 cc 10 31 35 d0 0c 20 00 	xor    %rsi,0x200cd0\(%rip\),%r22        # 202070 <.*>
+ +[a-f0-9]+:	d5 c8 af 0d c8 0c 20 00 	imul   0x200cc8\(%rip\),%r17        # 202070 <.*>
+ +[a-f0-9]+:	62 e4 fc 0c af 0d be 0c 20 00 	\{nf\} imul 0x200cbe\(%rip\),%r17        # 202070 <.*>
+ +[a-f0-9]+:	62 e4 ec 18 af 0d b4 0c 20 00 	imul   0x200cb4\(%rip\),%r17,%rdx        # 202070 <.*>
+ +[a-f0-9]+:	62 f4 ec 10 af 0d aa 0c 20 00 	imul   0x200caa\(%rip\),%rcx,%r18        # 202070 <.*>
+ +[a-f0-9]+:	d5 00 ff 35 a2 0c 20 00 	\{rex2 0x0\} push 0x200ca2\(%rip\)        # 202070 <.*>
 #pass
--- /dev/null
+++ b/ld/testsuite/ld-x86-64/load5.s
@@ -0,0 +1,17 @@ 
+	.data
+	.type	bar, @object
+bar:
+	.byte	1
+	.size	bar, .-bar
+	.globl	foo
+	.type	foo, @object
+foo:
+	.byte	1
+	.size	foo, .-foo
+	.text
+	.globl	_start
+	.type	_start, @function
+_start:
+	      pushq	bar@GOTPCREL(%rip)
+	{rex} pushq	foo@GOTPCREL(%rip)
+	.size	_start, .-_start
--- /dev/null
+++ b/ld/testsuite/ld-x86-64/load5a.d
@@ -0,0 +1,15 @@ 
+#source: load5.s
+#as: --64 -mrelax-relocations=yes
+#ld: -melf_x86_64
+#objdump: -dw
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+#...
+[a-f0-9]+ <_start>:
+[ 	]*[a-f0-9]+:	26 68 ([0-9a-f]{2} ){4} *	es push \$0x[a-f0-9]+
+[ 	]*[a-f0-9]+:	26 40 68 ([0-9a-f]{2} ){4} *	es rex push \$0x[a-f0-9]+
+#pass
--- /dev/null
+++ b/ld/testsuite/ld-x86-64/load5b.d
@@ -0,0 +1,15 @@ 
+#source: load5.s
+#as: --64 -mrelax-relocations=yes
+#ld: -shared -melf_x86_64
+#objdump: -dw
+
+.*: +file format .*
+
+
+Disassembly of section .text:
+
+#...
+[a-f0-9]+ <_start>:
+[ 	]*[a-f0-9]+:	ff 35 ([0-9a-f]{2} ){4} *	push +0x[a-f0-9]+\(%rip\)        # [a-f0-9]+ <.*>
+[ 	]*[a-f0-9]+:	40 ff 35 ([0-9a-f]{2} ){4} *	rex push 0x[a-f0-9]+\(%rip\)        # [a-f0-9]+ <.*>
+#pass
--- a/ld/testsuite/ld-x86-64/x86-64.exp
+++ b/ld/testsuite/ld-x86-64/x86-64.exp
@@ -654,6 +654,8 @@  run_dump_test "load2"
 run_dump_test "load3a"
 run_dump_test "load3b"
 run_dump_test "load4"
+run_dump_test "load5a"
+run_dump_test "load5b"
 run_dump_test "call1a"
 run_dump_test "call1b"
 run_dump_test "call1c"