x86/APX: make .insn extended-EVEX capable

Message ID 812ceb0e-24fb-4429-bac4-6ff9f91c45b5@suse.com
State New
Series x86/APX: make .insn extended-EVEX capable

Checks

Context Check Description
linaro-tcwg-bot/tcwg_binutils_build--master-arm success Build passed
linaro-tcwg-bot/tcwg_binutils_build--master-aarch64 success Build passed
linaro-tcwg-bot/tcwg_binutils_check--master-aarch64 success Test passed
linaro-tcwg-bot/tcwg_binutils_check--master-arm success Test passed

Commit Message

Jan Beulich Feb. 3, 2025, 11:34 a.m. UTC
  So far tricks had to be played to use .insn to encode extended-EVEX
insns; the X4 bit couldn't be controlled at all. Extend the syntax just
enough to cover all features, taking care to reject invalid feature
combinations (albeit aiming at being as lax there as possible, to offer
users as much flexibility as we can - we don't, after all, know what the
future will bring).
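
For illustration, a few of the forms the extended syntax makes possible,
taken verbatim from the testsuite additions at the end of this patch (they
assume a gas built with this patch applied, targeting x86-64):

	# APX ND form: the destination is written as an extra, last operand
	.insn EVEX.L0.NP.M4.ND 0x01, %eax, %eax, %eax

	# ZU form, with an opcode extension (/0) in place of a register operand
	.insn EVEX.L0.F2.M4.ZU 0x42/0, %eax

	# SCC/DFV pseudo-suffix, placed directly after the opcode
	.insn EVEX.L0.NP.M4 0x39 {scc=0b1011,dfv=of}, %eax, %eax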
  

Comments

Cui, Lili Feb. 5, 2025, 7:12 a.m. UTC | #1
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Monday, February 3, 2025 7:34 PM
> To: Binutils <binutils@sourceware.org>
> Cc: H.J. Lu <hjl.tools@gmail.com>; Cui, Lili <lili.cui@intel.com>; Jiang,
> Haochen <haochen.jiang@intel.com>
> Subject: [PATCH] x86/APX: make .insn extended-EVEX capable
> 
> So far tricks had to be played to use .insn to encode extended-EVEX insns; the
> X4 bit couldn't be controlled at all. Extend the syntax just enough to cover all
> features, taking care to reject invalid feature combinations (albeit aiming at
> being as lax there as possible, to offer users as much flexibility as we can - we
> don't, after all, know what future will bring).
> 

Nice to have this patch, but we still can't change X4 arbitrarily, especially for those special invalid test cases. Hopefully in the future .insn can support changing all bits of the EVEX prefix, so .byte can be completely replaced. Just a thought: maybe we need an additional function to parse the symbol and then forcibly overwrite the corresponding bit value at the last stage of encoding.

Thanks,
Lili.
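
A minimal sketch of the bit-override idea above, assuming the override would
be applied to the i.vex.bytes[] buffer that build_apx_evex_prefix() fills;
the structure and function names here are made up for illustration and are
not part of the patch:

	/* Hypothetical: force one bit of the assembled EVEX prefix to a
	   given value, after the regular encoding has been produced.  */
	struct evex_bit_override
	{
	  unsigned int byte;	/* index into i.vex.bytes[], 1..3 */
	  unsigned int bit;	/* bit position within that byte */
	  unsigned int value;	/* forced value, 0 or 1 */
	};

	static void
	apply_evex_bit_override (const struct evex_bit_override *ov)
	{
	  if (ov->value)
	    i.vex.bytes[ov->byte] |= 1u << ov->bit;
	  else
	    i.vex.bytes[ov->byte] &= ~(1u << ov->bit);
	}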

  
Jan Beulich Feb. 5, 2025, 9:41 a.m. UTC | #2
On 05.02.2025 08:12, Cui, Lili wrote:
>> -----Original Message-----
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Monday, February 3, 2025 7:34 PM
>>
>> So far tricks had to be played to use .insn to encode extended-EVEX insns; the
>> X4 bit couldn't be controlled at all. Extend the syntax just enough to cover all
>> features, taking care to reject invalid feature combinations (albeit aiming at
>> being as lax there as possible, to offer users as much flexibility as we can - we
>> don't, after all, know what future will bring).
> 
> Nice to have this patch, but we still can't change X4 arbitrarily, especially for those special invalid test cases. Hopefully in the future .insn can support changing all bits of the EVEX prefix, so .byte can be completely replaced.

Hmm, I'm confused: Doesn't the new testcase demonstrate that X4 can now be
controlled? Can you give an example where things still don't work for you?
Or do you maybe mean U rather than X4 (as X4 is only one of the meanings U
has)?

Jan
  
Jan Beulich Feb. 5, 2025, 2:42 p.m. UTC | #3
On 05.02.2025 10:41, Jan Beulich wrote:
> On 05.02.2025 08:12, Cui, Lili wrote:
>>> -----Original Message-----
>>> From: Jan Beulich <jbeulich@suse.com>
>>> Sent: Monday, February 3, 2025 7:34 PM
>>>
>>> So far tricks had to be played to use .insn to encode extended-EVEX insns; the
>>> X4 bit couldn't be controlled at all. Extend the syntax just enough to cover all
>>> features, taking care to reject invalid feature combinations (albeit aiming at
>>> being as lax there as possible, to offer users as much flexibility as we can - we
>>> don't, after all, know what future will bring).
>>
>> Nice to have this patch, but we still can't change X4 arbitrarily, especially for those special invalid test cases. Hopefully in the future .insn can support changing all bits of the EVEX prefix, so .byte can be completely replaced.
> 
> Hmm, I'm confused: Doesn't the new testcase demonstrate that X4 can now be
> controlled? Can you give an example where things still don't work for you?
> Or do you maybe mean U rather than X4 (as X4 is only one of the meanings U
> has)?

It looks to be the latter. In v2 I'm now converting three of the four .byte
to .insn in x86-64-apx-evex-promoted-bad. The one that cannot be converted
is where EVEX.U is intended to be zero, when - afaict - that's neither
expressible the APX way nor the AVX10/256 one. I also don't see how it could
sensibly be expressed.

Jan
  
Cui, Lili Feb. 6, 2025, 7:18 a.m. UTC | #4
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Wednesday, February 5, 2025 5:42 PM
> To: Cui, Lili <lili.cui@intel.com>
> Cc: H.J. Lu <hjl.tools@gmail.com>; Jiang, Haochen
> <haochen.jiang@intel.com>; Binutils <binutils@sourceware.org>
> Subject: Re: [PATCH] x86/APX: make .insn extended-EVEX capable
> 
> On 05.02.2025 08:12, Cui, Lili wrote:
> >> -----Original Message-----
> >> From: Jan Beulich <jbeulich@suse.com>
> >> Sent: Monday, February 3, 2025 7:34 PM
> >>
> >> So far tricks had to be played to use .insn to encode extended-EVEX
> >> insns; the
> >> X4 bit couldn't be controlled at all. Extend the syntax just enough
> >> to cover all features, taking care to reject invalid feature
> >> combinations (albeit aiming at being as lax there as possible, to
> >> offer users as much flexibility as we can - we don't, after all, know what
> future will bring).
> >
> > Nice to have this patch, but we still can't change X4 arbitrarily, especially for
> those special invalid test cases. Hopefully in the future .insn can support
> changing all bits of the EVEX prefix, so .byte can be completely replaced.
> 
> Hmm, I'm confused: Doesn't the new testcase demonstrate that X4 can now
> be controlled? Can you give an example where things still don't work for you?
> Or do you maybe mean U rather than X4 (as X4 is only one of the meanings U
> has)?
> 

Maybe I'm wrong, as I haven't tested it with your patch, but I think we cannot handle this case in x86-64-apx-evex-promoted-bad.s, where we want to set P[10] to 0.

        #VSIB vpgatherqq (%rbp,%zmm17,8),%zmm16{%k1} set EVEX.P[10] == 0
        .byte 0x62, 0xe2, 0xf9, 0x41, 0x91, 0x84, 0xcd
        .byte 0xff

Lili.

> Jan
  
Cui, Lili Feb. 6, 2025, 8:15 a.m. UTC | #5
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Wednesday, February 5, 2025 10:43 PM
> To: Cui, Lili <lili.cui@intel.com>
> Cc: H.J. Lu <hjl.tools@gmail.com>; Jiang, Haochen
> <haochen.jiang@intel.com>; Binutils <binutils@sourceware.org>
> Subject: Re: [PATCH] x86/APX: make .insn extended-EVEX capable
> 
> On 05.02.2025 10:41, Jan Beulich wrote:
> > On 05.02.2025 08:12, Cui, Lili wrote:
> >>> -----Original Message-----
> >>> From: Jan Beulich <jbeulich@suse.com>
> >>> Sent: Monday, February 3, 2025 7:34 PM
> >>>
> >>> So far tricks had to be played to use .insn to encode extended-EVEX
> >>> insns; the
> >>> X4 bit couldn't be controlled at all. Extend the syntax just enough
> >>> to cover all features, taking care to reject invalid feature
> >>> combinations (albeit aiming at being as lax there as possible, to
> >>> offer users as much flexibility as we can - we don't, after all, know what
> future will bring).
> >>
> >> Nice to have this patch, but we still can't change X4 arbitrarily, especially
> for those special invalid test cases. Hopefully in the future .insn can support
> changing all bits of the EVEX prefix, so .byte can be completely replaced.
> >
> > Hmm, I'm confused: Doesn't the new testcase demonstrate that X4 can
> > now be controlled? Can you give an example where things still don't work
> for you?
> > Or do you maybe mean U rather than X4 (as X4 is only one of the
> > meanings U has)?
> 
> It looks to be the latter. In v2 I'm now converting three of the four .byte
> to .insn in x86-64-apx-evex-promoted-bad. The one that cannot be converted
> is where EVEX.U is intended to be zero, when - afaict - that's neither
> expressible the APX way nor the AVX10/256 one. I also don't see how it could
> sensibly be expressed.
> 

Yes, I mean this one.  I also don't have any good ideas now. Let's deal with it later.

Lili.

> Jan
  
Jan Beulich Feb. 6, 2025, 11:27 a.m. UTC | #6
On 06.02.2025 08:18, Cui, Lili wrote:
> Maybe I'm wrong, I haven't tested it with your patch. I think we cannot handle this case in x86-64-apx-evex-promoted-bad.s. we want to set P[10] to 0.
> 
>         #VSIB vpgatherqq (%rbp,%zmm17,8),%zmm16{%k1} set EVEX.P[10] == 0
>         .byte 0x62, 0xe2, 0xf9, 0x41, 0x91, 0x84, 0xcd
>         .byte 0xff

	.insn EVEX.512.66.0f38.W1 0x91, -8(%rbp,%r17,8){:d8}, %zmm16{%k1}, %r16

is what I arrived at (which isn't exactly the original byte sequence, but that's
for an unrelated reason). Just fyi.

Jan
  

Patch

--- a/gas/config/tc-i386.c
+++ b/gas/config/tc-i386.c
@@ -2124,6 +2124,59 @@  check_Scc_OszcOperations (const char *l)
   /* Skip '{'.  */
   suffix_string++;
 
+  /* For .insn require 'scc=' as the first element.  */
+  if (dot_insn ())
+    {
+      char *copy;
+      valueT val;
+
+      while (is_whitespace (*suffix_string))
+	suffix_string++;
+
+      if (strncasecmp (suffix_string, "scc", 3) == 0)
+	suffix_string += 3;
+      else
+	{
+	  as_bad (_("unrecognized pseudo-suffix"));
+	  return -1;
+	}
+
+      while (is_whitespace (*suffix_string))
+	suffix_string++;
+
+      if (*suffix_string == '=')
+	suffix_string++;
+      else
+	{
+	  as_bad (_("unrecognized pseudo-suffix"));
+	  return -1;
+	}
+
+      copy = xstrdup (suffix_string);
+      /* No need to save/restore input_line_pointer; that's done in the
+	 caller already.  */
+      input_line_pointer = copy;
+      val = get_absolute_expression ();
+      suffix_string += input_line_pointer - copy;
+      free (copy);
+
+      if (val > 0xf)
+	{
+	  as_bad (_("scc= value must be between 0 and 15 (decimal)"));
+	  return -1;
+	}
+
+      i.scc = val;
+
+      /* Permit dfv= to be absent (implying all flag values being zero).  */
+      if (*suffix_string == '}')
+	return suffix_string + 1 - l;
+
+      if (*suffix_string != ',')
+	goto bad;
+      suffix_string++;
+    }
+
   /* Parse 'dfv='.  */
   while (is_whitespace (*suffix_string))
     suffix_string++;
@@ -2196,6 +2249,7 @@  check_Scc_OszcOperations (const char *l)
       suffix_string ++;
     }
 
+ bad:
   as_bad (_("missing `}' or `,' in pseudo-suffix"));
   return -1;
 }
@@ -4571,7 +4625,7 @@  build_rex2_prefix (void)
    | z| L'L | b | `v | aaa |
 */
 static bool
-build_apx_evex_prefix (void)
+build_apx_evex_prefix (bool force_nd)
 {
   /* To mimic behavior for legacy insns, transform use of DATA16 and REX64 into
      their embedded-prefix representations.  */
@@ -4618,7 +4672,8 @@  build_apx_evex_prefix (void)
   /* Encode the NDD bit of the instruction promoted from the legacy
      space. ZU shares the same bit with NDD.  */
   if ((i.vex.register_specifier && i.tm.opcode_space == SPACE_MAP4)
-      || i.tm.opcode_modifier.operandconstraint == ZERO_UPPER)
+      || i.tm.opcode_modifier.operandconstraint == ZERO_UPPER
+      || force_nd)
     i.vex.bytes[3] |= 0x10;
 
   /* Encode SCC and oszc flags bits.  */
@@ -7460,7 +7515,7 @@  i386_assemble (char *line)
 
       if (is_apx_evex_encoding ())
 	{
-	  if (!build_apx_evex_prefix ())
+	  if (!build_apx_evex_prefix (false))
 	    return;
 	}
       else if (i.tm.opcode_modifier.vex)
@@ -10970,7 +11025,8 @@  build_modrm_byte (void)
     if (i.tm.operand_types[op].bitfield.baseindex)
       break;
 
-  if (i.reg_operands + i.mem_operands + (i.tm.extension_opcode != None) == 4)
+  if (i.reg_operands + i.mem_operands + (i.tm.extension_opcode != None)
+      + (i.tm.opcode_modifier.operandconstraint == SCC) == 4)
     {
       expressionS *exp;
 
@@ -10982,10 +11038,12 @@  build_modrm_byte (void)
 	 2. 4 operands: 4 register operands or 3 register operands
 	 plus 1 memory operand, with VexXDS.
 	 3. Other equivalent combinations when coming from s_insn().  */
-      gas_assert (i.tm.opcode_modifier.vexvvvv
-		  && i.tm.opcode_modifier.vexw);
-      gas_assert (dot_insn ()
-		  || i.tm.operand_types[dest].bitfield.class == RegSIMD);
+      if (!dot_insn ())
+	{
+	  gas_assert (i.tm.opcode_modifier.vexvvvv
+		      && i.tm.opcode_modifier.vexw);
+	  gas_assert (i.tm.operand_types[dest].bitfield.class == RegSIMD);
+	}
 
       /* Of the first two non-immediate operands the one with the template
 	 not allowing for a memory one is encoded in the immediate operand.  */
@@ -13275,7 +13333,8 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
   const char *end;
   unsigned int j;
   valueT val;
-  bool vex = false, xop = false, evex = false;
+  bool vex = false, xop = false;
+  enum { evex_none, evex_basic, evex_nd } evex = evex_none;
   struct last_insn *last_insn;
 
   init_globals ();
@@ -13324,7 +13383,7 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
   else if (startswith (line, "EVEX")
 	   && (line[4] == '.' || is_whitespace (line[4])))
     {
-      evex = true;
+      evex = evex_basic;
       line += 4;
     }
 
@@ -13543,6 +13602,20 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
 	line += 3;
     }
 
+  if (line > end && evex && *line == '.')
+    {
+      if (line[1] == 'N' && line[2] == 'D')
+	{
+	  evex = evex_nd;
+	  line += 3;
+	}
+      else if (line[1] == 'Z' && line[2] == 'U')
+	{
+	  i.tm.opcode_modifier.operandconstraint = ZERO_UPPER;
+	  line += 3;
+	}
+    }
+
   if (line > end && *line && !is_whitespace (*line))
     {
       /* Improve diagnostic a little.  */
@@ -13612,6 +13685,25 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
 	}
     }
 
+  if (evex == evex_basic && *line == '{')
+    {
+      int length = check_Scc_OszcOperations (line);
+
+      if (length > 0)
+	{
+	  line += length;
+	  if (is_whitespace (*line))
+	    ++line;
+
+	  if (i.tm.opcode_modifier.operandconstraint)
+	    {
+	      as_bad (_("SCC/OSZC specifier cannot be used here"));
+	      goto bad;
+	    }
+	  i.tm.opcode_modifier.operandconstraint = SCC;
+	}
+    }
+
   /* Parse operands, if any, before evaluating encoding space.  */
   if (*line == ',')
     {
@@ -13713,7 +13805,8 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
 
       /* Enforce certain constraints on operands.  */
       switch (i.reg_operands + i.mem_operands
-	      + (i.tm.extension_opcode != None))
+	      + (i.tm.extension_opcode != None)
+	      + (i.tm.opcode_modifier.operandconstraint == SCC))
 	{
 	case 0:
 	  if (i.short_form)
@@ -13728,9 +13821,13 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
 	      as_bad (_("too few register/memory operands"));
 	      goto done;
 	    }
-	  break;
-
+	  /* Fall through.  */
 	case 2:
+	  if (evex == evex_nd)
+	    {
+	      as_bad (_("too few register/memory operands"));
+	      goto done;
+	    }
 	  break;
 
 	case 4:
@@ -13743,9 +13840,12 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
 	    }
 	  /* Fall through.  */
 	case 3:
+	  if (i.tm.opcode_modifier.operandconstraint == SCC)
+	    break;
 	  if (pp.encoding != encoding_default)
 	    {
-	      i.tm.opcode_modifier.vexvvvv = i.tm.extension_opcode == None
+	      i.tm.opcode_modifier.vexvvvv = (i.tm.extension_opcode == None
+					      && evex != evex_nd)
 					     ? VexVVVV_SRC1 : VexVVVV_DST;
 	      break;
 	    }
@@ -14043,6 +14143,13 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
 
   if (vex || xop)
     {
+      if (is_apx_evex_encoding ())
+	{
+	  as_bad (_("APX functionality cannot be used with %s encodings"),
+		  vex ? "VEX" : "XOP");
+	  goto done;
+	}
+
       if (!i.tm.opcode_modifier.vex)
 	i.tm.opcode_modifier.vex = VEXScalar; /* LIG */
 
@@ -14054,7 +14161,36 @@  s_insn (int dummy ATTRIBUTE_UNUSED)
       if (!i.tm.opcode_modifier.evex)
 	i.tm.opcode_modifier.evex = EVEXLIG;
 
-      build_evex_prefix ();
+      /* To keep earlier .insn uses working as far as possible, take the
+	 legacy path when opcode space is 4 bits wide (impossible to encode in
+	 extended EVEX), and when no "extended" syntax elements are used.  */
+      if ((!is_apx_evex_encoding () || i.insn_opcode_space > 7)
+	  && evex == evex_basic
+	  && !i.tm.opcode_modifier.operandconstraint)
+	build_evex_prefix ();
+      else if (i.insn_opcode_space > 7)
+	{
+	  as_bad (_("opcode space cannot be larger than 7"));
+	  goto done;
+	}
+      else if (evex == evex_nd && (i.broadcast.type || i.broadcast.bytes))
+	{
+	  as_bad (_("ND and broadcast cannot be used at the same time"));
+	  goto done;
+	}
+      else if (pp.has_nf && i.mask.reg)
+	{
+	  as_bad (_("{nf} and masking cannot be used at the same time"));
+	  goto done;
+	}
+      else if (i.tm.opcode_modifier.operandconstraint == SCC
+	       && (pp.has_nf || i.mask.reg))
+	{
+	  as_bad (_("SCC cannot be used at the same time {nf} / masking"));
+	  goto done;
+	}
+      else if (!build_apx_evex_prefix (evex == evex_nd))
+	goto done;
       i.rex &= REX_OPCODE;
     }
   else
--- a/gas/doc/c-i386.texi
+++ b/gas/doc/c-i386.texi
@@ -701,7 +701,7 @@  operand, as long as there is one.
 syntax tries to resemble that used in documentation:
 @itemize @bullet
 @item @code{VEX}[@code{.@var{len}}][@code{.@var{prefix}}][@code{.@var{space}}][@code{.@var{w}}]
-@item @code{EVEX}[@code{.@var{len}}][@code{.@var{prefix}}][@code{.@var{space}}][@code{.@var{w}}]
+@item @code{EVEX}[@code{.@var{len}}][@code{.@var{prefix}}][@code{.@var{space}}][@code{.@var{w}}][@code{.@var{opt}}]
 @item @code{XOP}@var{space}[@code{.@var{len}}][@code{.@var{prefix}}][@code{.@var{w}}]
 @end itemize
 
@@ -716,10 +716,11 @@  only) @code{512} as well as @code{L0} /
 @item @code{0f}, @code{0f38}, @code{0f3a}, or @code{M0}...@code{M31}
 for VEX
 @item @code{08}...@code{1f} for XOP
-@item @code{0f}, @code{0f38}, @code{0f3a}, or @code{M0}...@code{M15}
+@item @code{0f}, @code{0f38}, @code{0f3a}, or @code{M0}...@code{M7}
 for EVEX
 @end itemize
 @item @var{w} can be @code{WIG}, @code{W0}, or @code{W1}
+@item @var{opt} can be @code{ND} or @code{ZU}
 @end itemize
 
 Defaults:
@@ -807,6 +808,12 @@  be suffixed by @code{@{:d@var{n}@}} to s
 This can be combined with an embedded broadcast specifier:
 @samp{8(%eax)@{1to8:d8@}}.
 
+For SCC EVEX the @code{@{dfv=@}} specifier used by ordinary insns is
+extended and immediately follows the opcode specifier.  The extension
+is that the SCC value needs to be specified and goes first, as in
+@code{@{scc=@var{n},dfv=...@}}.  Unlike for ordinary insns @code{dfv=}
+may be omitted for brevity.
+
 @cindex @code{noopt} directive
 @item .noopt
 Disable instruction size optimization.
--- a/gas/testsuite/gas/i386/insn-64.d
+++ b/gas/testsuite/gas/i386/insn-64.d
@@ -63,4 +63,13 @@  Disassembly of section .text:
 [ 	]*[a-f0-9]+:	62 f5 fd 58 5a 40 01[ 	]+vcvtpd2ph (0x)?8\(%rax\)\{1to8\},%xmm0
 [ 	]*[a-f0-9]+:	62 f5 7c 48 5a 40 01[ 	]+vcvtph2pd 0x10\(%rax\),%zmm0
 [ 	]*[a-f0-9]+:	62 f5 7c 58 5a 40 01[ 	]+vcvtph2pd (0x)?2\(%rax\)\{1to8\},%zmm0
+[ 	]*[a-f0-9]+:	62 e4 7c 08 8b 00[ 	]+movrs  \(%rax\),%r16d
+[ 	]*[a-f0-9]+:	62 fc 7c 08 8b 00[ 	]+movrs  \(%r16\),%eax
+[ 	]*[a-f0-9]+:	62 f4 78 08 8b 04 00[ 	]+movrs  \(%rax,%r16(,1)?\),%eax
+[ 	]*[a-f0-9]+:	62 fc 7c 08 60 c0[ 	]+movbe  %r16d,%eax
+[ 	]*[a-f0-9]+:	62 f4 7c 0c 01 c0[ 	]+\{nf\} add %eax,%eax
+[ 	]*[a-f0-9]+:	62 f4 7c 18 01 c0[ 	]+add    %eax,%eax,%eax
+[ 	]*[a-f0-9]+:	62 f4 ec 18 ff f1[ 	]+push2p %rcx,%rdx
+[ 	]*[a-f0-9]+:	62 f4 7f 18 42 c0[ 	]+setzub %al
+[ 	]*[a-f0-9]+:	62 f4 44 0b 39 c0[ 	]+ccmpf \{dfv=of\} %eax,%eax
 #pass
--- a/gas/testsuite/gas/i386/insn-64.s
+++ b/gas/testsuite/gas/i386/insn-64.s
@@ -104,3 +104,24 @@  insn:
 	# vcvtph2pd
 	.insn EVEX.M5.W0 0x5a, 16(%rax){:d16}, %zmm0
 	.insn EVEX.M5.W0 0x5a, 2(%rax){1to8:d2}, %zmm0
+
+	# movrs (APX)
+	.insn EVEX.L0.NP.M4 0x8b, (%rax), %r16d
+	.insn EVEX.L0.NP.M4 0x8b, (%r16), %eax
+	.insn EVEX.L0.NP.M4 0x8b, (%rax,%r16), %eax
+
+	# movbe (APX)
+	.insn EVEX.L0.NP.M4 0x60, %r16d, %eax
+
+	# add (APX)
+	.insn {nf} EVEX.L0.NP.M4 0x01, %eax, %eax
+	.insn EVEX.L0.NP.M4.ND 0x01, %eax, %eax, %eax
+
+	# push2p
+	.insn EVEX.L0.NP.M4.W1.ND 0xff/6, %rcx, %rdx
+
+	# setzub
+	.insn EVEX.L0.F2.M4.ZU 0x42/0, %eax
+
+	# ccmpf
+	.insn EVEX.L0.NP.M4 0x39 {scc=0b1011,dfv=of}, %eax, %eax