[3/5] x86/APX: VMOVDQ{A,U} can also be expressed
Checks
Commit Message
Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in the
x86-64-apx-egpr-inval testcase, hence why I overlooked that these, too,
can be encoded, (once again) using suitable AVX512F counterparts.
Comments
> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in the x86-
> 64-apx-egpr-inval testcase, hence why I overlooked that these, too, can be
> encoded, (once again) using suitable AVX512F counterparts.
>
I found some of our previous discussions. It would be better if these optimizations were put behind an option rather than being enabled by default.
1. The compiler will do this during the backend instruction selection phase. Binutils should only do instruction translation, not instruction selection.
2. We can only convert some instructions, not all of them. When users use eGPRs illegally, some instructions will produce an error while others will not, which is very confusing.
3. Binutils needs to report errors for illegal instructions to ensure the correctness of the compiler.
4. I don't know if there are special cases where gcc doesn't want EVEX to be generated.
Thanks,
Lili.
> --- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted.d
> +++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted.d
> @@ -144,6 +144,14 @@ Disassembly of section \.text:
> [ ]*[a-f0-9]+:[ ]*62 fb 7d 28 39 18 01[ ]+vextracti32x4
> \$(0x)?1,%ymm3,\(%r16\)
> [ ]*[a-f0-9]+:[ ]*62 7b 65 28 18 00 01[ ]+vinsertf32x4
> \$(0x)?1,\(%r16\),%ymm3,%ymm8
> [ ]*[a-f0-9]+:[ ]*62 7b 65 28 38 00 01[ ]+vinserti32x4
> \$(0x)?1,\(%r16\),%ymm3,%ymm8
> +[ ]*[a-f0-9]+:[ ]*62 f9 7d 08 6f 08[ ]+vmovdqa32
> \(%r16\),%xmm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 7d 28 6f 08[ ]+vmovdqa32
> \(%r16\),%ymm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 7d 08 7f 08[
> ]+vmovdqa32 %xmm1,\(%r16\)
> +[ ]*[a-f0-9]+:[ ]*62 f9 7d 28 7f 08[
> ]+vmovdqa32 %ymm1,\(%r16\)
> +[ ]*[a-f0-9]+:[ ]*62 f9 7e 08 6f 08[ ]+vmovdqu32
> \(%r16\),%xmm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 7e 28 6f 08[ ]+vmovdqu32
> \(%r16\),%ymm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 7e 08 7f 08[
> ]+vmovdqu32 %xmm1,\(%r16\)
> +[ ]*[a-f0-9]+:[ ]*62 f9 7e 28 7f 08[
> ]+vmovdqu32 %ymm1,\(%r16\)
> [ ]*[a-f0-9]+:[ ]*62 db fd 08 09 30 01[ ]+vrndscalepd
> \$0x1,\(%r24\),%xmm6
> [ ]*[a-f0-9]+:[ ]*62 db 7d 08 08 30 02[ ]+vrndscaleps
> \$0x2,\(%r24\),%xmm6
> [ ]*[a-f0-9]+:[ ]*62 db cd 08 0b 18 03[ ]+vrndscalesd
> \$0x3,\(%r24\),%xmm6,%xmm3
> --- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted.s
> +++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted.s
> @@ -138,6 +138,14 @@ _start:
> vextracti128 $1,%ymm3,(%r16)
> vinsertf128 $1,(%r16),%ymm3,%ymm8
> vinserti128 $1,(%r16),%ymm3,%ymm8
> + vmovdqa (%r16),%xmm1
> + vmovdqa (%r16),%ymm1
> + vmovdqa %xmm1,(%r16)
> + vmovdqa %ymm1,(%r16)
> + vmovdqu (%r16),%xmm1
> + vmovdqu (%r16),%ymm1
> + vmovdqu %xmm1,(%r16)
> + vmovdqu %ymm1,(%r16)
> vroundpd $1,(%r24),%xmm6
> vroundps $2,(%r24),%xmm6
> vroundsd $3,(%r24),%xmm6,%xmm3
> --- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-intel.d
> +++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-intel.d
> @@ -144,6 +144,14 @@ Disassembly of section \.text:
> [ ]*[a-f0-9]+:[ ]*62 fb 7d 28 39 18 01[ ]+vextracti32x4 XMMWORD
> PTR \[r16\],ymm3,(0x)?1
> [ ]*[a-f0-9]+:[ ]*62 7b 65 28 18 00 01[ ]+vinsertf32x4
> ymm8,ymm3,XMMWORD PTR \[r16\],(0x)?1
> [ ]*[a-f0-9]+:[ ]*62 7b 65 28 38 00 01[ ]+vinserti32x4
> ymm8,ymm3,XMMWORD PTR \[r16\],(0x)?1
> +[ ]*[a-f0-9]+:[ ]*62 f9 7d 08 6f 08[ ]+vmovdqa32
> xmm1,XMMWORD PTR \[r16\]
> +[ ]*[a-f0-9]+:[ ]*62 f9 7d 28 6f 08[ ]+vmovdqa32
> ymm1,YMMWORD PTR \[r16\]
> +[ ]*[a-f0-9]+:[ ]*62 f9 7d 08 7f 08[ ]+vmovdqa32 XMMWORD
> PTR \[r16\],xmm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 7d 28 7f 08[ ]+vmovdqa32 YMMWORD
> PTR \[r16\],ymm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 7e 08 6f 08[ ]+vmovdqu32
> xmm1,XMMWORD PTR \[r16\]
> +[ ]*[a-f0-9]+:[ ]*62 f9 7e 28 6f 08[ ]+vmovdqu32
> ymm1,YMMWORD PTR \[r16\]
> +[ ]*[a-f0-9]+:[ ]*62 f9 7e 08 7f 08[ ]+vmovdqu32 XMMWORD
> PTR \[r16\],xmm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 7e 28 7f 08[ ]+vmovdqu32 YMMWORD
> PTR \[r16\],ymm1
> [ ]*[a-f0-9]+:[ ]*62 db fd 08 09 30 01[ ]+vrndscalepd
> xmm6,XMMWORD PTR \[r24\],(0x)?1
> [ ]*[a-f0-9]+:[ ]*62 db 7d 08 08 30 02[ ]+vrndscaleps
> xmm6,XMMWORD PTR \[r24\],(0x)?2
> [ ]*[a-f0-9]+:[ ]*62 db cd 08 0b 18 03[ ]+vrndscalesd
> xmm3,xmm6,QWORD PTR \[r24\],(0x)?3
> --- a/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-wig.d
> +++ b/gas/testsuite/gas/i386/x86-64-apx-evex-promoted-wig.d
> @@ -144,6 +144,14 @@ Disassembly of section \.text:
> [ ]*[a-f0-9]+:[ ]*62 fb 7d 28 39 18 01[ ]+vextracti32x4
> \$(0x)?1,%ymm3,\(%r16\)
> [ ]*[a-f0-9]+:[ ]*62 7b 65 28 18 00 01[ ]+vinsertf32x4
> \$(0x)?1,\(%r16\),%ymm3,%ymm8
> [ ]*[a-f0-9]+:[ ]*62 7b 65 28 38 00 01[ ]+vinserti32x4
> \$(0x)?1,\(%r16\),%ymm3,%ymm8
> +[ ]*[a-f0-9]+:[ ]*62 f9 fd 08 6f 08[ ]+vmovdqa64
> \(%r16\),%xmm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 fd 28 6f 08[ ]+vmovdqa64
> \(%r16\),%ymm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 fd 08 7f 08[
> ]+vmovdqa64 %xmm1,\(%r16\)
> +[ ]*[a-f0-9]+:[ ]*62 f9 fd 28 7f 08[
> ]+vmovdqa64 %ymm1,\(%r16\)
> +[ ]*[a-f0-9]+:[ ]*62 f9 fe 08 6f 08[ ]+vmovdqu64
> \(%r16\),%xmm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 fe 28 6f 08[ ]+vmovdqu64
> \(%r16\),%ymm1
> +[ ]*[a-f0-9]+:[ ]*62 f9 fe 08 7f 08[
> ]+vmovdqu64 %xmm1,\(%r16\)
> +[ ]*[a-f0-9]+:[ ]*62 f9 fe 28 7f 08[
> ]+vmovdqu64 %ymm1,\(%r16\)
> [ ]*[a-f0-9]+:[ ]*62 db fd 08 09 30 01[ ]+vrndscalepd
> \$0x1,\(%r24\),%xmm6
> [ ]*[a-f0-9]+:[ ]*62 db 7d 08 08 30 02[ ]+vrndscaleps
> \$0x2,\(%r24\),%xmm6
> [ ]*[a-f0-9]+:[ ]*62 db cd 08 0b 18 03[ ]+vrndscalesd
> \$0x3,\(%r24\),%xmm6,%xmm3
> --- a/opcodes/i386-opc.tbl
> +++ b/opcodes/i386-opc.tbl
> @@ -1696,6 +1696,9 @@ vmovddup, 0xf212, AVX, Modrm|Vex|Space0F
> vmovddup, 0xf212, AVX, Modrm|Vex=2|Space0F|VexWIG|NoSuf,
> { Unspecified|BaseIndex|RegYMM, RegYMM } vmovdqa, 0x666f, AVX,
> D|Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf,
> { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM } vmovdqu,
> 0xf36f, AVX, D|Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf,
> { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM }
> +// vmovdq{a,u}{32,64} in disguise (see vround{p,s}{s,d} comment)
> +vmovdqa, 0x666f, APX_F&AVX512F, D|Modrm|EVexDYN|Space0F|VexWIG|Disp8ShiftVL|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM }
> +vmovdqu, 0xf36f, APX_F&AVX512F, D|Modrm|EVexDYN|Space0F|VexWIG|Disp8ShiftVL|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM }
> vmovhlps, 0x12, AVX, Modrm|Vex|Space0F|Src1VVVV|VexWIG|NoSuf,
> { RegXMM, RegXMM, RegXMM } vmovhp<sd>, 0x<sd:ppfx>16, AVX,
> Modrm|Vex|Space0F|Src1VVVV|VexWIG|NoSuf,
> { Qword|Unspecified|BaseIndex, RegXMM, RegXMM } vmovhp<sd>,
> 0x<sd:ppfx>17, AVX, Modrm|Vex|Space0F|VexWIG|NoSuf, { RegXMM,
> Qword|Unspecified|BaseIndex }
On 11.09.2024 09:50, Cui, Lili wrote:
>> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
>> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in the x86-
>> 64-apx-egpr-inval testcase, hence why I overlooked that these, too, can be
>> encoded, (once again) using suitable AVX512F counterparts.
>>
>
> I found some of our previous discussions. It would be better if these optimizations could be put under the option instead of by default.
>
> 1. The compiler will do this during the backend instruction selection phase. Binutils should only do instruction translation, not instruction selection.
I'm afraid this again takes the compiler-only perspective. Please always
keep in mind that hand-written assembly is where any of this matters. If
we want ...
> 2. We can only convert some instructions, not all instructions. When users use eGPR-s illegally, some will report an error, while others will not, which is very confusing.
> 3. Binutils needs to report errors for illegal instructions to ensure the correctness of the compiler.
... a mode like this, we need an extra flag the compiler passes us.
That could be the existing -f, though that would first need to be made to work.
This would have the advantage of asm() constructs then being treatable
like hand-written assembly (once suitably made work), while other code
could be checked as you say.
Right now the assumption is for the compiler to emit sane code.
> 4. I don't know if there are special cases where gcc doesn't want EVEX to be generated.
If such a need existed, insns would need marking accordingly, e.g. by
pseudo prefixes ({vex} / {vex3}) for the particular insns this patch is
about.
Jan
> Subject: Re: [PATCH 3/5] x86/APX: VMOVDQ{A,U} can also be expressed
>
> On 11.09.2024 09:50, Cui, Lili wrote:
> >> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
> >> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in the
> >> x86- 64-apx-egpr-inval testcase, hence why I overlooked that these,
> >> too, can be encoded, (once again) using suitable AVX512F counterparts.
> >>
> >
> > I found some of our previous discussions. It would be better if these
> optimizations could be put under the option instead of by default.
> >
> > 1. The compiler will do this during the backend instruction selection phase.
> Binutils should only do instruction translation, not instruction selection.
>
> I'm afraid this again takes the compiler-only perspective. Please always keep in
> mind that hand-written assembly is where any of this matters. If we want ...
>
> > 2. We can only convert some instructions, not all instructions. When users use
> eGPR-s illegally, some will report an error, while others will not, which is very
> confusing.
> > 3. Binutils needs to report errors for illegal instructions to ensure the
> correctness of the compiler.
>
> ... a mode like this, we need an extra flag the compiler passes us.
> That could be the existing -f, requiring that to be made work first.
> This would have the advantage of asm() constructs then being treatable like
> hand-written assembly (once suitably made work), while other code could be
> checked as you say.
>
> Right now the assumption is for the compiler to emit sane code.
>
> > 4. I don't know if there are special cases where gcc doesn't want EVEX to be
> generated.
>
> If such a need existed, insns would need marking accordingly by e.g.
> pseudo prefixes ({vex} / {vex3} for the particular insns this patch is about.
>
Normally, binutils always downgrades EVEX code to VEX or shorter code by default, but upgrading an illegal VEX instruction to a different EVEX instruction by default will give people an unexpected first impression, since the APX spec indicates that this is illegal.
I am worried that this promotion will become more and more common, laying hidden dangers, and eventually it will get out of control.
Lili.
> Jan
On Thu, Sep 12, 2024, 3:47 PM Cui, Lili <lili.cui@intel.com> wrote:
> > Subject: Re: [PATCH 3/5] x86/APX: VMOVDQ{A,U} can also be expressed
> >
> > On 11.09.2024 09:50, Cui, Lili wrote:
> > >> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
> > >> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in the
> > >> x86- 64-apx-egpr-inval testcase, hence why I overlooked that these,
> > >> too, can be encoded, (once again) using suitable AVX512F counterparts.
> > >>
> > >
> > > I found some of our previous discussions. It would be better if these
> > optimizations could be put under the option instead of by default.
> > >
> > > 1. The compiler will do this during the backend instruction selection
> phase.
> > Binutils should only do instruction translation, not instruction
> selection.
> >
> > I'm afraid this again takes the compiler-only perspective. Please always
> keep in
> > mind that hand-written assembly is where any of this matters. If we want
> ...
> >
> > > 2. We can only convert some instructions, not all instructions. When
> users use
> > eGPR-s illegally, some will report an error, while others will not,
> which is very
> > confusing.
> > > 3. Binutils needs to report errors for illegal instructions to ensure
> the
> > correctness of the compiler.
> >
> > ... a mode like this, we need an extra flag the compiler passes us.
> > That could be the existing -f, requiring that to be made work first.
> > This would have the advantage of asm() constructs then being treatable
> like
> > hand-written assembly (once suitably made work), while other code could
> be
> > checked as you say.
> >
> > Right now the assumption is for the compiler to emit sane code.
> >
> > > 4. I don't know if there are special cases where gcc doesn't want EVEX
> to be
> > generated.
> >
> > If such a need existed, insns would need marking accordingly by e.g.
> > pseudo prefixes ({vex} / {vex3} for the particular insns this patch is
> about.
> >
>
> Normally, binutils always downgrade EVEX code to VEX or shorter code by
> default, but upgrading the wrong VEX to another EVEX instruction by default
> will give people an unexpected first impression, since the apx spec
> indicates that this is illegal.
>
I agree that it's a bad idea.
> I am worried that this promotion will become more and more common, laying
> hidden dangers, and eventually it will get out of control.
>
> Lili.
>
> > Jan
>
>
On 12.09.2024 10:26, H.J. Lu wrote:
> On Thu, Sep 12, 2024, 3:47 PM Cui, Lili <lili.cui@intel.com> wrote:
>
>>> Subject: Re: [PATCH 3/5] x86/APX: VMOVDQ{A,U} can also be expressed
>>>
>>> On 11.09.2024 09:50, Cui, Lili wrote:
>>>>> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
>>>>> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in the
>>>>> x86- 64-apx-egpr-inval testcase, hence why I overlooked that these,
>>>>> too, can be encoded, (once again) using suitable AVX512F counterparts.
>>>>>
>>>>
>>>> I found some of our previous discussions. It would be better if these
>>> optimizations could be put under the option instead of by default.
>>>>
>>>> 1. The compiler will do this during the backend instruction selection
>> phase.
>>> Binutils should only do instruction translation, not instruction
>> selection.
>>>
>>> I'm afraid this again takes the compiler-only perspective. Please always
>> keep in
>>> mind that hand-written assembly is where any of this matters. If we want
>> ...
>>>
>>>> 2. We can only convert some instructions, not all instructions. When
>> users use
>>> eGPR-s illegally, some will report an error, while others will not,
>> which is very
>>> confusing.
>>>> 3. Binutils needs to report errors for illegal instructions to ensure
>> the
>>> correctness of the compiler.
>>>
>>> ... a mode like this, we need an extra flag the compiler passes us.
>>> That could be the existing -f, requiring that to be made work first.
>>> This would have the advantage of asm() constructs then being treatable
>> like
>>> hand-written assembly (once suitably made work), while other code could
>> be
>>> checked as you say.
>>>
>>> Right now the assumption is for the compiler to emit sane code.
>>>
>>>> 4. I don't know if there are special cases where gcc doesn't want EVEX
>> to be
>>> generated.
>>>
>>> If such a need existed, insns would need marking accordingly by e.g.
>>> pseudo prefixes ({vex} / {vex3} for the particular insns this patch is
>> about.
>>>
>>
>> Normally, binutils always downgrade EVEX code to VEX or shorter code by
>> default, but upgrading the wrong VEX to another EVEX instruction by default
>> will give people an unexpected first impression, since the apx spec
>> indicates that this is illegal.
>>
>
> I agreed that it's a bad idea.
>
> I am worried that this promotion will become more and more common, laying
>> hidden dangers, and eventually it will get out of control.
Well, I certainly don't expect similar promotions to occur very often (i.e.
in the future). As to what is done for APX - imo we should either complete
it, or undo what was previously done (despite there not having been any
real objections back at the time). Leaving mixed state is the worst of all
options, imo.
Jan
On Thu, Sep 12, 2024, 5:30 PM Jan Beulich <jbeulich@suse.com> wrote:
> On 12.09.2024 10:26, H.J. Lu wrote:
> > On Thu, Sep 12, 2024, 3:47 PM Cui, Lili <lili.cui@intel.com> wrote:
> >
> >>> Subject: Re: [PATCH 3/5] x86/APX: VMOVDQ{A,U} can also be expressed
> >>>
> >>> On 11.09.2024 09:50, Cui, Lili wrote:
> >>>>> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
> >>>>> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in the
> >>>>> x86- 64-apx-egpr-inval testcase, hence why I overlooked that these,
> >>>>> too, can be encoded, (once again) using suitable AVX512F
> counterparts.
> >>>>>
> >>>>
> >>>> I found some of our previous discussions. It would be better if these
> >>> optimizations could be put under the option instead of by default.
> >>>>
> >>>> 1. The compiler will do this during the backend instruction selection
> >> phase.
> >>> Binutils should only do instruction translation, not instruction
> >> selection.
> >>>
> >>> I'm afraid this again takes the compiler-only perspective. Please
> always
> >> keep in
> >>> mind that hand-written assembly is where any of this matters. If we
> want
> >> ...
> >>>
> >>>> 2. We can only convert some instructions, not all instructions. When
> >> users use
> >>> eGPR-s illegally, some will report an error, while others will not,
> >> which is very
> >>> confusing.
> >>>> 3. Binutils needs to report errors for illegal instructions to ensure
> >> the
> >>> correctness of the compiler.
> >>>
> >>> ... a mode like this, we need an extra flag the compiler passes us.
> >>> That could be the existing -f, requiring that to be made work first.
> >>> This would have the advantage of asm() constructs then being treatable
> >> like
> >>> hand-written assembly (once suitably made work), while other code could
> >> be
> >>> checked as you say.
> >>>
> >>> Right now the assumption is for the compiler to emit sane code.
> >>>
> >>>> 4. I don't know if there are special cases where gcc doesn't want EVEX
> >> to be
> >>> generated.
> >>>
> >>> If such a need existed, insns would need marking accordingly by e.g.
> >>> pseudo prefixes ({vex} / {vex3} for the particular insns this patch is
> >> about.
> >>>
> >>
> >> Normally, binutils always downgrade EVEX code to VEX or shorter code by
> >> default, but upgrading the wrong VEX to another EVEX instruction by
> default
> >> will give people an unexpected first impression, since the apx spec
> >> indicates that this is illegal.
> >>
> >
> > I agreed that it's a bad idea.
> >
> > I am worried that this promotion will become more and more common, laying
> >> hidden dangers, and eventually it will get out of control.
>
> Well, I certainly don't expect similar promotions to occur very often (i.e.
> in the future). As to what is done for APX - imo we should either complete
> it, or undo what was previously done (despite there not having been any
> real objections back at the time).
We really shouldn't do that.
> Leaving mixed state is the worst of all
> options, imo.
>
> Jan
>
>
On 12.09.2024 12:05, H.J. Lu wrote:
> On Thu, Sep 12, 2024, 5:30 PM Jan Beulich <jbeulich@suse.com> wrote:
>
>> On 12.09.2024 10:26, H.J. Lu wrote:
>>> On Thu, Sep 12, 2024, 3:47 PM Cui, Lili <lili.cui@intel.com> wrote:
>>>
>>>>> Subject: Re: [PATCH 3/5] x86/APX: VMOVDQ{A,U} can also be expressed
>>>>>
>>>>> On 11.09.2024 09:50, Cui, Lili wrote:
>>>>>>> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
>>>>>>> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in the
>>>>>>> x86- 64-apx-egpr-inval testcase, hence why I overlooked that these,
>>>>>>> too, can be encoded, (once again) using suitable AVX512F
>> counterparts.
>>>>>>>
>>>>>>
>>>>>> I found some of our previous discussions. It would be better if these
>>>>> optimizations could be put under the option instead of by default.
>>>>>>
>>>>>> 1. The compiler will do this during the backend instruction selection
>>>> phase.
>>>>> Binutils should only do instruction translation, not instruction
>>>> selection.
>>>>>
>>>>> I'm afraid this again takes the compiler-only perspective. Please
>> always
>>>> keep in
>>>>> mind that hand-written assembly is where any of this matters. If we
>> want
>>>> ...
>>>>>
>>>>>> 2. We can only convert some instructions, not all instructions. When
>>>> users use
>>>>> eGPR-s illegally, some will report an error, while others will not,
>>>> which is very
>>>>> confusing.
>>>>>> 3. Binutils needs to report errors for illegal instructions to ensure
>>>> the
>>>>> correctness of the compiler.
>>>>>
>>>>> ... a mode like this, we need an extra flag the compiler passes us.
>>>>> That could be the existing -f, requiring that to be made work first.
>>>>> This would have the advantage of asm() constructs then being treatable
>>>> like
>>>>> hand-written assembly (once suitably made work), while other code could
>>>> be
>>>>> checked as you say.
>>>>>
>>>>> Right now the assumption is for the compiler to emit sane code.
>>>>>
>>>>>> 4. I don't know if there are special cases where gcc doesn't want EVEX
>>>> to be
>>>>> generated.
>>>>>
>>>>> If such a need existed, insns would need marking accordingly by e.g.
>>>>> pseudo prefixes ({vex} / {vex3} for the particular insns this patch is
>>>> about.
>>>>>
>>>>
>>>> Normally, binutils always downgrade EVEX code to VEX or shorter code by
>>>> default, but upgrading the wrong VEX to another EVEX instruction by
>> default
>>>> will give people an unexpected first impression, since the apx spec
>>>> indicates that this is illegal.
>>>>
>>>
>>> I agreed that it's a bad idea.
>>>
>>> I am worried that this promotion will become more and more common, laying
>>>> hidden dangers, and eventually it will get out of control.
>>
>> Well, I certainly don't expect similar promotions to occur very often (i.e.
>> in the future). As to what is done for APX - imo we should either complete
>> it, or undo what was previously done (despite there not having been any
>> real objections back at the time).
>
> We really shouldn't do that.
I'm sorry, but your reply is ambiguous: We really shouldn't do what? The
promotions? (In which case: Why do objections appear only now, when much
of this is already in a release?) The undo? Or ...
> Leaving mixed state is the worst of all
>> options, imo.
... leaving mixed state?
Jan
On Thu, Sep 12, 2024, 6:08 PM Jan Beulich <jbeulich@suse.com> wrote:
> On 12.09.2024 12:05, H.J. Lu wrote:
> > On Thu, Sep 12, 2024, 5:30 PM Jan Beulich <jbeulich@suse.com> wrote:
> >
> >> On 12.09.2024 10:26, H.J. Lu wrote:
> >>> On Thu, Sep 12, 2024, 3:47 PM Cui, Lili <lili.cui@intel.com> wrote:
> >>>
> >>>>> Subject: Re: [PATCH 3/5] x86/APX: VMOVDQ{A,U} can also be expressed
> >>>>>
> >>>>> On 11.09.2024 09:50, Cui, Lili wrote:
> >>>>>>> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
> >>>>>>> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in
> the
> >>>>>>> x86- 64-apx-egpr-inval testcase, hence why I overlooked that these,
> >>>>>>> too, can be encoded, (once again) using suitable AVX512F
> >> counterparts.
> >>>>>>>
> >>>>>>
> >>>>>> I found some of our previous discussions. It would be better if
> these
> >>>>> optimizations could be put under the option instead of by default.
> >>>>>>
> >>>>>> 1. The compiler will do this during the backend instruction
> selection
> >>>> phase.
> >>>>> Binutils should only do instruction translation, not instruction
> >>>> selection.
> >>>>>
> >>>>> I'm afraid this again takes the compiler-only perspective. Please
> >> always
> >>>> keep in
> >>>>> mind that hand-written assembly is where any of this matters. If we
> >> want
> >>>> ...
> >>>>>
> >>>>>> 2. We can only convert some instructions, not all instructions. When
> >>>> users use
> >>>>> eGPR-s illegally, some will report an error, while others will not,
> >>>> which is very
> >>>>> confusing.
> >>>>>> 3. Binutils needs to report errors for illegal instructions to
> ensure
> >>>> the
> >>>>> correctness of the compiler.
> >>>>>
> >>>>> ... a mode like this, we need an extra flag the compiler passes us.
> >>>>> That could be the existing -f, requiring that to be made work first.
> >>>>> This would have the advantage of asm() constructs then being
> treatable
> >>>> like
> >>>>> hand-written assembly (once suitably made work), while other code
> could
> >>>> be
> >>>>> checked as you say.
> >>>>>
> >>>>> Right now the assumption is for the compiler to emit sane code.
> >>>>>
> >>>>>> 4. I don't know if there are special cases where gcc doesn't want
> EVEX
> >>>> to be
> >>>>> generated.
> >>>>>
> >>>>> If such a need existed, insns would need marking accordingly by e.g.
> >>>>> pseudo prefixes ({vex} / {vex3} for the particular insns this patch
> is
> >>>> about.
> >>>>>
> >>>>
> >>>> Normally, binutils always downgrade EVEX code to VEX or shorter code
> by
> >>>> default, but upgrading the wrong VEX to another EVEX instruction by
> >> default
> >>>> will give people an unexpected first impression, since the apx spec
> >>>> indicates that this is illegal.
> >>>>
> >>>
> >>> I agreed that it's a bad idea.
> >>>
> >>> I am worried that this promotion will become more and more common,
> laying
> >>>> hidden dangers, and eventually it will get out of control.
> >>
> >> Well, I certainly don't expect similar promotions to occur very often
> (i.e.
> >> in the future). As to what is done for APX - imo we should either
> complete
> >> it, or undo what was previously done (despite there not having been any
> >> real objections back at the time).
> >
> > We really shouldn't do that.
>
> I'm sorry, but your reply is ambiguous: We really shouldn't do what? The
> promotions? (In which case: Why do objections appear only now, when much
> of this is already in a release?) The undo? Or ...
>
Lili, please open an assembler bug for
all APX instructions which are promoted
by the assembler but are not in the APX spec.
Thanks.
> > Leaving mixed state is the worst of all
> >> options, imo.
>
> ... leaving mixed state?
>
> Jan
>
>
On Fri, Sep 13, 2024, 3:39 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Sep 12, 2024, 6:08 PM Jan Beulich <jbeulich@suse.com> wrote:
>
>> On 12.09.2024 12:05, H.J. Lu wrote:
>> > On Thu, Sep 12, 2024, 5:30 PM Jan Beulich <jbeulich@suse.com> wrote:
>> >
>> >> On 12.09.2024 10:26, H.J. Lu wrote:
>> >>> On Thu, Sep 12, 2024, 3:47 PM Cui, Lili <lili.cui@intel.com> wrote:
>> >>>
>> >>>>> Subject: Re: [PATCH 3/5] x86/APX: VMOVDQ{A,U} can also be expressed
>> >>>>>
>> >>>>> On 11.09.2024 09:50, Cui, Lili wrote:
>> >>>>>>> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but like
>> >>>>>>> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in
>> the
>> >>>>>>> x86- 64-apx-egpr-inval testcase, hence why I overlooked that
>> these,
>> >>>>>>> too, can be encoded, (once again) using suitable AVX512F
>> >> counterparts.
>> >>>>>>>
>> >>>>>>
>> >>>>>> I found some of our previous discussions. It would be better if
>> these
>> >>>>> optimizations could be put under the option instead of by default.
>> >>>>>>
>> >>>>>> 1. The compiler will do this during the backend instruction
>> selection
>> >>>> phase.
>> >>>>> Binutils should only do instruction translation, not instruction
>> >>>> selection.
>> >>>>>
>> >>>>> I'm afraid this again takes the compiler-only perspective. Please
>> >> always
>> >>>> keep in
>> >>>>> mind that hand-written assembly is where any of this matters. If we
>> >> want
>> >>>> ...
>> >>>>>
>> >>>>>> 2. We can only convert some instructions, not all instructions.
>> When
>> >>>> users use
>> >>>>> eGPR-s illegally, some will report an error, while others will not,
>> >>>> which is very
>> >>>>> confusing.
>> >>>>>> 3. Binutils needs to report errors for illegal instructions to
>> ensure
>> >>>> the
>> >>>>> correctness of the compiler.
>> >>>>>
>> >>>>> ... a mode like this, we need an extra flag the compiler passes us.
>> >>>>> That could be the existing -f, requiring that to be made work first.
>> >>>>> This would have the advantage of asm() constructs then being
>> treatable
>> >>>> like
>> >>>>> hand-written assembly (once suitably made work), while other code
>> could
>> >>>> be
>> >>>>> checked as you say.
>> >>>>>
>> >>>>> Right now the assumption is for the compiler to emit sane code.
>> >>>>>
>> >>>>>> 4. I don't know if there are special cases where gcc doesn't want
>> EVEX
>> >>>> to be
>> >>>>> generated.
>> >>>>>
>> >>>>> If such a need existed, insns would need marking accordingly by e.g.
>> >>>>> pseudo prefixes ({vex} / {vex3} for the particular insns this patch
>> is
>> >>>> about.
>> >>>>>
>> >>>>
>> >>>> Normally, binutils always downgrade EVEX code to VEX or shorter code
>> by
>> >>>> default, but upgrading the wrong VEX to another EVEX instruction by
>> >> default
>> >>>> will give people an unexpected first impression, since the apx spec
>> >>>> indicates that this is illegal.
>> >>>>
>> >>>
>> >>> I agreed that it's a bad idea.
>> >>>
>> >>> I am worried that this promotion will become more and more common,
>> laying
>> >>>> hidden dangers, and eventually it will get out of control.
>> >>
>> >> Well, I certainly don't expect similar promotions to occur very often
>> (i.e.
>> >> in the future). As to what is done for APX - imo we should either
>> complete
>> >> it, or undo what was previously done (despite there not having been any
>> >> real objections back at the time).
>> >
>> > We really shouldn't do that.
>>
>> I'm sorry, but your reply is ambiguous: We really shouldn't do what? The
>> promotions? (In which case: Why do objections appear only now, when much
>> of this is already in a release?) The undo? Or ...
>>
>
> Lili, please open an assembler bug for
> all APX instructions which are promoted
> by assembler, but not in the APX spec.
>
Please run xed on binutils APX tests to
find out which instructions are rejected by xed.
> Thanks.
>
>
>> > Leaving mixed state is the worst of all
>> >> options, imo.
>>
>> ... leaving mixed state?
>>
>> Jan
>>
>>
>>
> >>>>> On 11.09.2024 09:50, Cui, Lili wrote:
> >>>>>>> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but
> >>>>>>> like
> >>>>>>> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in
> >>>>>>> the
> >>>>>>> x86- 64-apx-egpr-inval testcase, hence why I overlooked that
> >>>>>>> these, too, can be encoded, (once again) using suitable AVX512F
> >> counterparts.
> >>>>>>>
> >>>>>>
> >>>>>> I found some of our previous discussions. It would be better if
> >>>>>> these
> >>>>> optimizations could be put under the option instead of by default.
> >>>>>>
> >>>>>> 1. The compiler will do this during the backend instruction
> >>>>>> selection
> >>>> phase.
> >>>>> Binutils should only do instruction translation, not instruction
> >>>> selection.
> >>>>>
> >>>>> I'm afraid this again takes the compiler-only perspective. Please
> >> always
> >>>> keep in
> >>>>> mind that hand-written assembly is where any of this matters. If
> >>>>> we
> >> want
> >>>> ...
> >>>>>
> >>>>>> 2. We can only convert some instructions, not all instructions.
> >>>>>> When
> >>>> users use
> >>>>> eGPR-s illegally, some will report an error, while others will
> >>>>> not,
> >>>> which is very
> >>>>> confusing.
> >>>>>> 3. Binutils needs to report errors for illegal instructions to
> >>>>>> ensure
> >>>> the
> >>>>> correctness of the compiler.
> >>>>>
> >>>>> ... a mode like this, we need an extra flag the compiler passes us.
> >>>>> That could be the existing -f, requiring that to be made work first.
> >>>>> This would have the advantage of asm() constructs then being
> >>>>> treatable
> >>>> like
> >>>>> hand-written assembly (once suitably made work), while other code
> >>>>> could
> >>>> be
> >>>>> checked as you say.
> >>>>>
> >>>>> Right now the assumption is for the compiler to emit sane code.
> >>>>>
> >>>>>> 4. I don't know if there are special cases where gcc doesn't want
> >>>>>> EVEX
> >>>> to be
> >>>>> generated.
> >>>>>
> >>>>> If such a need existed, insns would need marking accordingly by e.g.
> >>>>> pseudo prefixes ({vex} / {vex3} for the particular insns this
> >>>>> patch is
> >>>> about.
> >>>>>
> >>>>
> >>>> Normally, binutils always downgrades EVEX code to VEX or shorter
> >>>> code by default, but upgrading the wrong VEX to another EVEX
> >>>> instruction by
> >> default
> >>>> will give people an unexpected first impression, since the apx spec
> >>>> indicates that this is illegal.
> >>>>
> >>>
> >>> I agreed that it's a bad idea.
> >>>
> >>> I am worried that this promotion will become more and more common,
> >>> laying
> >>>> hidden dangers, and eventually it will get out of control.
> >>
> >> Well, I certainly don't expect similar promotions to occur very often (i.e.
> >> in the future). As to what is done for APX - imo we should either
> >> complete it, or undo what was previously done (despite there not
> >> having been any real objections back at the time).
> >
> > We really shouldn't do that.
>
> I'm sorry, but your reply is ambiguous: We really shouldn't do what? The
> promotions? (In which case: Why do objections appear only now, when much
> of this is already in a release?) The undo? Or ...
>
> Lili, please open an assembler bug for
> all APX instructions which are promoted
> by assembler, but not in the APX spec.
>
> Please run xed on binutils APX tests to
> find out which instructions are rejected by xed.
>
Done. Here is the Bugzilla entry: https://sourceware.org/bugzilla/show_bug.cgi?id=32171
Thanks,
Lili.
>
> Thanks.
>
>
> > Leaving mixed state is the worst of all
> >> options, imo.
>
> ... leaving mixed state?
>
> Jan
On 13.09.2024 05:43, Cui, Lili wrote:
>>>>>>> On 11.09.2024 09:50, Cui, Lili wrote:
>>>>>>>>> Interestingly unlike VROUND{P,S}{S,D} and VPERM{F,I}128, but
>>>>>>>>> like
>>>>>>>>> V{BROADCAST,EXTRACT,INSERT}{F,I}128 they weren't even present in
>>>>>>>>> the
>>>>>>>>> x86-64-apx-egpr-inval testcase, hence why I overlooked that
>>>>>>>>> these, too, can be encoded, (once again) using suitable AVX512F
>>>> counterparts.
>>>>>>>>>
>>>>>>>>
>>>>>>>> I found some of our previous discussions. It would be better if
>>>>>>>> these
>>>>>>> optimizations could be put under the option instead of by default.
>>>>>>>>
>>>>>>>> 1. The compiler will do this during the backend instruction
>>>>>>>> selection
>>>>>> phase.
>>>>>>> Binutils should only do instruction translation, not instruction
>>>>>> selection.
>>>>>>>
>>>>>>> I'm afraid this again takes the compiler-only perspective. Please
>>>> always
>>>>>> keep in
>>>>>>> mind that hand-written assembly is where any of this matters. If
>>>>>>> we
>>>> want
>>>>>> ...
>>>>>>>
>>>>>>>> 2. We can only convert some instructions, not all instructions.
>>>>>>>> When
>>>>>> users use
>>>>>>> eGPR-s illegally, some will report an error, while others will
>>>>>>> not,
>>>>>> which is very
>>>>>>> confusing.
>>>>>>>> 3. Binutils needs to report errors for illegal instructions to
>>>>>>>> ensure
>>>>>> the
>>>>>>> correctness of the compiler.
>>>>>>>
>>>>>>> ... a mode like this, we need an extra flag the compiler passes us.
>>>>>>> That could be the existing -f, requiring that to be made to work first.
>>>>>>> This would have the advantage of asm() constructs then being
>>>>>>> treatable
>>>>>> like
>>>>>>> hand-written assembly (once suitably made work), while other code
>>>>>>> could
>>>>>> be
>>>>>>> checked as you say.
>>>>>>>
>>>>>>> Right now the assumption is for the compiler to emit sane code.
>>>>>>>
>>>>>>>> 4. I don't know if there are special cases where gcc doesn't want
>>>>>>>> EVEX
>>>>>> to be
>>>>>>> generated.
>>>>>>>
>>>>>>> If such a need existed, insns would need marking accordingly by e.g.
>>>>>>> pseudo prefixes ({vex} / {vex3} for the particular insns this
>>>>>>> patch is
>>>>>> about.
>>>>>>>
>>>>>>
>>>>>> Normally, binutils always downgrades EVEX code to VEX or shorter
>>>>>> code by default, but upgrading the wrong VEX to another EVEX
>>>>>> instruction by
>>>> default
>>>>>> will give people an unexpected first impression, since the apx spec
>>>>>> indicates that this is illegal.
>>>>>>
>>>>>
>>>>> I agreed that it's a bad idea.
>>>>>
>>>>> I am worried that this promotion will become more and more common,
>>>>> laying
>>>>>> hidden dangers, and eventually it will get out of control.
>>>>
>>>> Well, I certainly don't expect similar promotions to occur very often (i.e.
>>>> in the future). As to what is done for APX - imo we should either
>>>> complete it, or undo what was previously done (despite there not
>>>> having been any real objections back at the time).
>>>
>>> We really shouldn't do that.
>>
>> I'm sorry, but your reply is ambiguous: We really shouldn't do what? The
>> promotions? (In which case: Why do objections appear only now, when much
>> of this is already in a release?) The undo? Or ...
>>
>> Lili, please open an assembler bug for
>> all APX instructions which are promoted
>> by assembler, but not in the APX spec.
>>
>> Please run xed on binutils APX tests to
>> find out which instructions are rejected by xed.
>>
>
> Done. Here is the Bugzilla https://sourceware.org/bugzilla/show_bug.cgi?id=32171
As indicated - we may want to have a "compiler" and a "hand-written
assembly" mode. I would agree that for the former complaining (at
least warning) may be desirable. For the latter I think we ought to
aid programmers, though. If either of you is going to be at the
Cauldron in Prague, we could talk about this some there.
Jan
@@ -144,6 +144,14 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 fb 7d 28 39 18 01[ ]+vextracti32x4 \$(0x)?1,%ymm3,\(%r16\)
[ ]*[a-f0-9]+:[ ]*62 7b 65 28 18 00 01[ ]+vinsertf32x4 \$(0x)?1,\(%r16\),%ymm3,%ymm8
[ ]*[a-f0-9]+:[ ]*62 7b 65 28 38 00 01[ ]+vinserti32x4 \$(0x)?1,\(%r16\),%ymm3,%ymm8
+[ ]*[a-f0-9]+:[ ]*62 f9 7d 08 6f 08[ ]+vmovdqa32 \(%r16\),%xmm1
+[ ]*[a-f0-9]+:[ ]*62 f9 7d 28 6f 08[ ]+vmovdqa32 \(%r16\),%ymm1
+[ ]*[a-f0-9]+:[ ]*62 f9 7d 08 7f 08[ ]+vmovdqa32 %xmm1,\(%r16\)
+[ ]*[a-f0-9]+:[ ]*62 f9 7d 28 7f 08[ ]+vmovdqa32 %ymm1,\(%r16\)
+[ ]*[a-f0-9]+:[ ]*62 f9 7e 08 6f 08[ ]+vmovdqu32 \(%r16\),%xmm1
+[ ]*[a-f0-9]+:[ ]*62 f9 7e 28 6f 08[ ]+vmovdqu32 \(%r16\),%ymm1
+[ ]*[a-f0-9]+:[ ]*62 f9 7e 08 7f 08[ ]+vmovdqu32 %xmm1,\(%r16\)
+[ ]*[a-f0-9]+:[ ]*62 f9 7e 28 7f 08[ ]+vmovdqu32 %ymm1,\(%r16\)
[ ]*[a-f0-9]+:[ ]*62 db fd 08 09 30 01[ ]+vrndscalepd \$0x1,\(%r24\),%xmm6
[ ]*[a-f0-9]+:[ ]*62 db 7d 08 08 30 02[ ]+vrndscaleps \$0x2,\(%r24\),%xmm6
[ ]*[a-f0-9]+:[ ]*62 db cd 08 0b 18 03[ ]+vrndscalesd \$0x3,\(%r24\),%xmm6,%xmm3
@@ -138,6 +138,14 @@ _start:
vextracti128 $1,%ymm3,(%r16)
vinsertf128 $1,(%r16),%ymm3,%ymm8
vinserti128 $1,(%r16),%ymm3,%ymm8
+ vmovdqa (%r16),%xmm1
+ vmovdqa (%r16),%ymm1
+ vmovdqa %xmm1,(%r16)
+ vmovdqa %ymm1,(%r16)
+ vmovdqu (%r16),%xmm1
+ vmovdqu (%r16),%ymm1
+ vmovdqu %xmm1,(%r16)
+ vmovdqu %ymm1,(%r16)
vroundpd $1,(%r24),%xmm6
vroundps $2,(%r24),%xmm6
vroundsd $3,(%r24),%xmm6,%xmm3
@@ -144,6 +144,14 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 fb 7d 28 39 18 01[ ]+vextracti32x4 XMMWORD PTR \[r16\],ymm3,(0x)?1
[ ]*[a-f0-9]+:[ ]*62 7b 65 28 18 00 01[ ]+vinsertf32x4 ymm8,ymm3,XMMWORD PTR \[r16\],(0x)?1
[ ]*[a-f0-9]+:[ ]*62 7b 65 28 38 00 01[ ]+vinserti32x4 ymm8,ymm3,XMMWORD PTR \[r16\],(0x)?1
+[ ]*[a-f0-9]+:[ ]*62 f9 7d 08 6f 08[ ]+vmovdqa32 xmm1,XMMWORD PTR \[r16\]
+[ ]*[a-f0-9]+:[ ]*62 f9 7d 28 6f 08[ ]+vmovdqa32 ymm1,YMMWORD PTR \[r16\]
+[ ]*[a-f0-9]+:[ ]*62 f9 7d 08 7f 08[ ]+vmovdqa32 XMMWORD PTR \[r16\],xmm1
+[ ]*[a-f0-9]+:[ ]*62 f9 7d 28 7f 08[ ]+vmovdqa32 YMMWORD PTR \[r16\],ymm1
+[ ]*[a-f0-9]+:[ ]*62 f9 7e 08 6f 08[ ]+vmovdqu32 xmm1,XMMWORD PTR \[r16\]
+[ ]*[a-f0-9]+:[ ]*62 f9 7e 28 6f 08[ ]+vmovdqu32 ymm1,YMMWORD PTR \[r16\]
+[ ]*[a-f0-9]+:[ ]*62 f9 7e 08 7f 08[ ]+vmovdqu32 XMMWORD PTR \[r16\],xmm1
+[ ]*[a-f0-9]+:[ ]*62 f9 7e 28 7f 08[ ]+vmovdqu32 YMMWORD PTR \[r16\],ymm1
[ ]*[a-f0-9]+:[ ]*62 db fd 08 09 30 01[ ]+vrndscalepd xmm6,XMMWORD PTR \[r24\],(0x)?1
[ ]*[a-f0-9]+:[ ]*62 db 7d 08 08 30 02[ ]+vrndscaleps xmm6,XMMWORD PTR \[r24\],(0x)?2
[ ]*[a-f0-9]+:[ ]*62 db cd 08 0b 18 03[ ]+vrndscalesd xmm3,xmm6,QWORD PTR \[r24\],(0x)?3
@@ -144,6 +144,14 @@ Disassembly of section \.text:
[ ]*[a-f0-9]+:[ ]*62 fb 7d 28 39 18 01[ ]+vextracti32x4 \$(0x)?1,%ymm3,\(%r16\)
[ ]*[a-f0-9]+:[ ]*62 7b 65 28 18 00 01[ ]+vinsertf32x4 \$(0x)?1,\(%r16\),%ymm3,%ymm8
[ ]*[a-f0-9]+:[ ]*62 7b 65 28 38 00 01[ ]+vinserti32x4 \$(0x)?1,\(%r16\),%ymm3,%ymm8
+[ ]*[a-f0-9]+:[ ]*62 f9 fd 08 6f 08[ ]+vmovdqa64 \(%r16\),%xmm1
+[ ]*[a-f0-9]+:[ ]*62 f9 fd 28 6f 08[ ]+vmovdqa64 \(%r16\),%ymm1
+[ ]*[a-f0-9]+:[ ]*62 f9 fd 08 7f 08[ ]+vmovdqa64 %xmm1,\(%r16\)
+[ ]*[a-f0-9]+:[ ]*62 f9 fd 28 7f 08[ ]+vmovdqa64 %ymm1,\(%r16\)
+[ ]*[a-f0-9]+:[ ]*62 f9 fe 08 6f 08[ ]+vmovdqu64 \(%r16\),%xmm1
+[ ]*[a-f0-9]+:[ ]*62 f9 fe 28 6f 08[ ]+vmovdqu64 \(%r16\),%ymm1
+[ ]*[a-f0-9]+:[ ]*62 f9 fe 08 7f 08[ ]+vmovdqu64 %xmm1,\(%r16\)
+[ ]*[a-f0-9]+:[ ]*62 f9 fe 28 7f 08[ ]+vmovdqu64 %ymm1,\(%r16\)
[ ]*[a-f0-9]+:[ ]*62 db fd 08 09 30 01[ ]+vrndscalepd \$0x1,\(%r24\),%xmm6
[ ]*[a-f0-9]+:[ ]*62 db 7d 08 08 30 02[ ]+vrndscaleps \$0x2,\(%r24\),%xmm6
[ ]*[a-f0-9]+:[ ]*62 db cd 08 0b 18 03[ ]+vrndscalesd \$0x3,\(%r24\),%xmm6,%xmm3
@@ -1696,6 +1696,9 @@ vmovddup, 0xf212, AVX, Modrm|Vex|Space0F
vmovddup, 0xf212, AVX, Modrm|Vex=2|Space0F|VexWIG|NoSuf, { Unspecified|BaseIndex|RegYMM, RegYMM }
vmovdqa, 0x666f, AVX, D|Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM }
vmovdqu, 0xf36f, AVX, D|Modrm|Vex|Space0F|VexWIG|CheckOperandSize|NoSuf, { Unspecified|BaseIndex|RegXMM|RegYMM, RegXMM|RegYMM }
+// vmovdq{a,u}{32,64} in disguise (see vround{p,s}{s,d} comment)
+vmovdqa, 0x666f, APX_F&AVX512F, D|Modrm|EVexDYN|Space0F|VexWIG|Disp8ShiftVL|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM }
+vmovdqu, 0xf36f, APX_F&AVX512F, D|Modrm|EVexDYN|Space0F|VexWIG|Disp8ShiftVL|CheckOperandSize|NoSuf, { Xmmword|Ymmword|Unspecified|BaseIndex, RegXMM|RegYMM }
vmovhlps, 0x12, AVX, Modrm|Vex|Space0F|Src1VVVV|VexWIG|NoSuf, { RegXMM, RegXMM, RegXMM }
vmovhp<sd>, 0x<sd:ppfx>16, AVX, Modrm|Vex|Space0F|Src1VVVV|VexWIG|NoSuf, { Qword|Unspecified|BaseIndex, RegXMM, RegXMM }
vmovhp<sd>, 0x<sd:ppfx>17, AVX, Modrm|Vex|Space0F|VexWIG|NoSuf, { RegXMM, Qword|Unspecified|BaseIndex }