[v2] xtensa: Optimize boolean evaluation or branching when EQ/NE to zero in S[IF]mode

Message ID dbc8c369-79af-ca70-646d-4b156b4c2df1@yahoo.co.jp
State New
Headers
Series [v2] xtensa: Optimize boolean evaluation or branching when EQ/NE to zero in S[IF]mode |

Checks

Context Check Description
linaro-tcwg-bot/tcwg_gcc_build--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_build--master-arm success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-aarch64 success Testing passed
linaro-tcwg-bot/tcwg_gcc_check--master-arm success Testing passed

Commit Message

Takayuki 'January June' Suwa June 5, 2023, 7:30 a.m. UTC
  This patch optimizes the boolean evaluation of EQ/NE against zero
by adding two insn_and_split patterns similar to SImode conditional
store:

"eq_zero":
	op0 = (op1 == 0) ? 1 : 0;
	op0 = clz(op1) >> 5;  /* optimized (requires TARGET_NSA) */

"movsicc_ne0_reg_zero":
	op0 = (op1 != 0) ? op2 : 0;
	op0 = op2; if (op1 == 0) op0 = op1;  /* optimized */

    /* example #1 */
    int bool_eqSI(int x) {
      return x == 0;
    }
    int bool_neSI(int x) {
      return x != 0;
    }

    ;; after (TARGET_NSA)
    bool_eqSI:
	nsau	a2, a2
	srli	a2, a2, 5
	ret.n
    bool_neSI:
	mov.n	a9, a2
	movi.n	a2, 1
	moveqz	a2, a9, a9
	ret.n

These also work in SFmode by ignoring their sign bits, and further-
more, the branch if EQ/NE against zero in SFmode is also done in the
same manner.

The reasons for this optimization in SFmode are:

  - Only zero values (positive or negative) have every bit of both
    the exponent and the mantissa clear, so shifting out the sign
    bit leaves an all-zero word exactly for them.
  - EQ/NE comparisons involving NaNs produce no signal even if they
    are signaling.
  - Even if the use of IEEE 754 single-precision floating-point co-
    processor is configured (TARGET_HARD_FLOAT is true), the
    comparison still requires:
	1. Load zero value to FP register
	2. Possibly, additional FP move if the comparison target is
	   an address register
	3. FP equality check instruction
	4. Read the boolean register containing the result, or condi-
	   tional branch
    As listed above, a considerable number of instructions are still
    generated.

    /* example #2 */
    int bool_eqSF(float x) {
      return x == 0;
    }
    int bool_neSF(float x) {
      return x != 0;
    }
    int bool_ltSF(float x) {
      return x < 0;
    }
    extern void foo(void);
    void cb_eqSF(float x) {
      if(x != 0)
        foo();
    }
    void cb_neSF(float x) {
      if(x == 0)
        foo();
    }
    void cb_geSF(float x) {
      if(x < 0)
        foo();
    }

    ;; after
    ;; (TARGET_NSA, TARGET_BOOLEANS and TARGET_HARD_FLOAT)
    bool_eqSF:
	add.n	a2, a2, a2
	nsau	a2, a2
	srli	a2, a2, 5
	ret.n
    bool_neSF:
	add.n	a9, a2, a2
	movi.n	a2, 1
	moveqz	a2, a9, a9
	ret.n
    bool_ltSF:
	movi.n	a9, 0
	wfr	f0, a2
	wfr	f1, a9
	olt.s	b0, f0, f1
	movi.n	a9, 0
	movi.n	a2, 1
	movf	a2, a9, b0
	ret.n
    cb_eqSF:
	add.n	a2, a2, a2
	beqz.n	a2, .L6
	j.l	foo, a9
    .L6:
	ret.n
    cb_neSF:
	add.n	a2, a2, a2
	bnez.n	a2, .L8
	j.l	foo, a9
    .L8:
	ret.n
    cb_geSF:
	addi	sp, sp, -16
	movi.n	a3, 0
	s32i.n	a12, sp, 8
	s32i.n	a0, sp, 12
	mov.n	a12, a2
	call0	__unordsf2
	bnez.n	a2, .L10
	movi.n	a3, 0
	mov.n	a2, a12
	call0	__gesf2
	bnei	a2, -1, .L10
	l32i.n	a0, sp, 12
	l32i.n	a12, sp, 8
	addi	sp, sp, 16
	j.l	foo, a9
    .L10:
	l32i.n	a0, sp, 12
	l32i.n	a12, sp, 8
	addi	sp, sp, 16
	ret.n

gcc/ChangeLog:

	* config/xtensa/predicates.md (const_float_0_operand):
	Rename from obsolete "const_float_1_operand" and change the
	constant to compare.
	(cstoresf_cbranchsf_operand, cstoresf_cbranchsf_operator):
	New.
	* config/xtensa/xtensa.cc (xtensa_expand_conditional_branch):
	Add code for EQ/NE comparison with constant zero in SFmode.
	(xtensa_expand_scc): Add code to derive boolean evaluation
	of EQ/NE with constant zero for comparison in SFmode.
	(xtensa_rtx_costs): Change cost of CONST_DOUBLE with value
	zero inside "cbranchsf4" to 0.
	* config/xtensa/xtensa.md (cbranchsf4, cstoresf4):
	Change "match_operator" and the third "match_operand" to the
	ones mentioned above.
	(movsicc_ne0_reg_zero, eq_zero): New.
---
 gcc/config/xtensa/predicates.md | 17 +++++++++--
 gcc/config/xtensa/xtensa.cc     | 45 ++++++++++++++++++++++++++++
 gcc/config/xtensa/xtensa.md     | 53 +++++++++++++++++++++++++++++----
 3 files changed, 106 insertions(+), 9 deletions(-)
  

Comments

Max Filippov June 5, 2023, 3:15 p.m. UTC | #1
Hi Suwa-san,

On Mon, Jun 5, 2023 at 2:37 AM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> This patch optimizes the boolean evaluation of EQ/NE against zero
> by adding two insn_and_split patterns similar to SImode conditional
> store:
>
> "eq_zero":
>         op0 = (op1 == 0) ? 1 : 0;
>         op0 = clz(op1) >> 5;  /* optimized (requires TARGET_NSA) */
>
> "movsicc_ne0_reg_0":
>         op0 = (op1 != 0) ? op2 : 0;
>         op0 = op2; if (op1 == 0) ? op0 = op1;  /* optimized */
>
>     /* example #1 */
>     int bool_eqSI(int x) {
>       return x == 0;
>     }
>     int bool_neSI(int x) {
>       return x != 0;
>     }
>
>     ;; after (TARGET_NSA)
>     bool_eqSI:
>         nsau    a2, a2
>         srli    a2, a2, 5
>         ret.n
>     bool_neSI:
>         mov.n   a9, a2
>         movi.n  a2, 1
>         moveqz  a2, a9, a9
>         ret.n
>
> These also work in SFmode by ignoring their sign bits, and further-
> more, the branch if EQ/NE against zero in SFmode is also done in the
> same manner.
>
> The reasons for this optimization in SFmode are:
>
>   - Only zero values (negative or non-negative) contain no bits of 1
>     with both the exponent and the mantissa.
>   - EQ/NE comparisons involving NaNs produce no signal even if they
>     are signaling.
>   - Even if the use of IEEE 754 single-precision floating-point co-
>     processor is configured (TARGET_HARD_FLOAT is true):
>         1. Load zero value to FP register
>         2. Possibly, additional FP move if the comparison target is
>            an address register
>         3. FP equality check instruction
>         4. Read the boolean register containing the result, or condi-
>            tional branch
>     As noted above, a considerable number of instructions are still
>     generated.
>
>     /* example #2 */
>     int bool_eqSF(float x) {
>       return x == 0;
>     }
>     int bool_neSF(float x) {
>       return x != 0;
>     }
>     int bool_ltSF(float x) {
>       return x < 0;
>     }
>     extern void foo(void);
>     void cb_eqSF(float x) {
>       if(x != 0)
>         foo();
>     }
>     void cb_neSF(float x) {
>       if(x == 0)
>         foo();
>     }
>     void cb_geSF(float x) {
>       if(x < 0)
>         foo();
>     }
>
>     ;; after
>     ;; (TARGET_NSA, TARGET_BOOLEANS and TARGET_HARD_FLOAT)
>     bool_eqSF:
>         add.n   a2, a2, a2
>         nsau    a2, a2
>         srli    a2, a2, 5
>         ret.n
>     bool_neSF:
>         add.n   a9, a2, a2
>         movi.n  a2, 1
>         moveqz  a2, a9, a9
>         ret.n
>     bool_ltSF:
>         movi.n  a9, 0
>         wfr     f0, a2
>         wfr     f1, a9
>         olt.s   b0, f0, f1
>         movi.n  a9, 0
>         movi.n  a2, 1
>         movf    a2, a9, b0
>         ret.n
>     cb_eqSF:
>         add.n   a2, a2, a2
>         beqz.n  a2, .L6
>         j.l     foo, a9
>     .L6:
>         ret.n
>     cb_neSF:
>         add.n   a2, a2, a2
>         bnez.n  a2, .L8
>         j.l     foo, a9
>     .L8:
>         ret.n
>     cb_geSF:
>         addi    sp, sp, -16
>         movi.n  a3, 0
>         s32i.n  a12, sp, 8
>         s32i.n  a0, sp, 12
>         mov.n   a12, a2
>         call0   __unordsf2
>         bnez.n  a2, .L10
>         movi.n  a3, 0
>         mov.n   a2, a12
>         call0   __gesf2
>         bnei    a2, -1, .L10
>         l32i.n  a0, sp, 12
>         l32i.n  a12, sp, 8
>         addi    sp, sp, 16
>         j.l     foo, a9
>     .L10:
>         l32i.n  a0, sp, 12
>         l32i.n  a12, sp, 8
>         addi    sp, sp, 16
>         ret.n
>
> gcc/ChangeLog:
>
>         * config/xtensa/predicates.md (const_float_0_operand):
>         Rename from obsolete "const_float_1_operand" and change the
>         constant to compare.
>         (cstoresf_cbranchsf_operand, cstoresf_cbranchsf_operator):
>         New.
>         * config/xtensa/xtensa.cc (xtensa_expand_conditional_branch):
>         Add code for EQ/NE comparison with constant zero in SFmode.
>         (xtensa_expand_scc): Added code to derive boolean evaluation
>         of EQ/NE with constant zero for comparison in SFmode.
>         (xtensa_rtx_costs): Change cost of CONST_DOUBLE with value
>         zero inside "cbranchsf4" to 0.
>         * config/xtensa/xtensa.md (cbranchsf4, cstoresf4):
>         Change "match_operator" and the third "match_operand" to the
>         ones mentioned above.
>         (movsicc_ne0_reg_zero, eq_zero): New.
> ---
>  gcc/config/xtensa/predicates.md | 17 +++++++++--
>  gcc/config/xtensa/xtensa.cc     | 45 ++++++++++++++++++++++++++++
>  gcc/config/xtensa/xtensa.md     | 53 +++++++++++++++++++++++++++++----
>  3 files changed, 106 insertions(+), 9 deletions(-)

This version performs much better than v1, but there is still a new
testsuite failure in gcc.c-torture/execute/bitfld-3.c,
and the following change in the generated code
from:

       l32i.n  a11, a7, 8
       l8ui    a9, a7, 12
       movi    a10, 0xff
       add.n   a9, a9, a10
       addi.n  a7, a11, -1
       movi.n  a10, 1
       movi.n  a6, 0
       moveqz  a10, a6, a11

to:

       l32i.n  a10, a7, 8
       l8ui    a9, a7, 12
       movi    a11, 0xff
       add.n   a9, a9, a11
       addi.n  a7, a10, -1
       movi.n  a11, 1
       mov.n   a10, a11
       movnez  a10, a11, a11

suggests that the pattern movsicc_ne0_reg_zero does not work correctly
when its operands overlap.
  
Takayuki 'January June' Suwa June 5, 2023, 8:59 p.m. UTC | #2
On 2023/06/06 0:15, Max Filippov wrote:
> Hi Suwa-san,
Hi!  Thanks for your regtest every time.

> 
> On Mon, Jun 5, 2023 at 2:37 AM Takayuki 'January June' Suwa
> <jjsuwa_sys3175@yahoo.co.jp> wrote:
>>
>> This patch optimizes the boolean evaluation of EQ/NE against zero
>> by adding two insn_and_split patterns similar to SImode conditional
>> store:
>>
>> "eq_zero":
>>         op0 = (op1 == 0) ? 1 : 0;
>>         op0 = clz(op1) >> 5;  /* optimized (requires TARGET_NSA) */
>>
>> "movsicc_ne0_reg_0":
>>         op0 = (op1 != 0) ? op2 : 0;
>>         op0 = op2; if (op1 == 0) ? op0 = op1;  /* optimized */
>>
>>     /* example #1 */
>>     int bool_eqSI(int x) {
>>       return x == 0;
>>     }
>>     int bool_neSI(int x) {
>>       return x != 0;
>>     }
>>
>>     ;; after (TARGET_NSA)
>>     bool_eqSI:
>>         nsau    a2, a2
>>         srli    a2, a2, 5
>>         ret.n
>>     bool_neSI:
>>         mov.n   a9, a2
>>         movi.n  a2, 1
>>         moveqz  a2, a9, a9
>>         ret.n
>>
>> These also work in SFmode by ignoring their sign bits, and further-
>> more, the branch if EQ/NE against zero in SFmode is also done in the
>> same manner.
>>
>> The reasons for this optimization in SFmode are:
>>
>>   - Only zero values (negative or non-negative) contain no bits of 1
>>     with both the exponent and the mantissa.
>>   - EQ/NE comparisons involving NaNs produce no signal even if they
>>     are signaling.
>>   - Even if the use of IEEE 754 single-precision floating-point co-
>>     processor is configured (TARGET_HARD_FLOAT is true):
>>         1. Load zero value to FP register
>>         2. Possibly, additional FP move if the comparison target is
>>            an address register
>>         3. FP equality check instruction
>>         4. Read the boolean register containing the result, or condi-
>>            tional branch
>>     As noted above, a considerable number of instructions are still
>>     generated.
>>
>>     /* example #2 */
>>     int bool_eqSF(float x) {
>>       return x == 0;
>>     }
>>     int bool_neSF(float x) {
>>       return x != 0;
>>     }
>>     int bool_ltSF(float x) {
>>       return x < 0;
>>     }
>>     extern void foo(void);
>>     void cb_eqSF(float x) {
>>       if(x != 0)
>>         foo();
>>     }
>>     void cb_neSF(float x) {
>>       if(x == 0)
>>         foo();
>>     }
>>     void cb_geSF(float x) {
>>       if(x < 0)
>>         foo();
>>     }
>>
>>     ;; after
>>     ;; (TARGET_NSA, TARGET_BOOLEANS and TARGET_HARD_FLOAT)
>>     bool_eqSF:
>>         add.n   a2, a2, a2
>>         nsau    a2, a2
>>         srli    a2, a2, 5
>>         ret.n
>>     bool_neSF:
>>         add.n   a9, a2, a2
>>         movi.n  a2, 1
>>         moveqz  a2, a9, a9
>>         ret.n
>>     bool_ltSF:
>>         movi.n  a9, 0
>>         wfr     f0, a2
>>         wfr     f1, a9
>>         olt.s   b0, f0, f1
>>         movi.n  a9, 0
>>         movi.n  a2, 1
>>         movf    a2, a9, b0
>>         ret.n
>>     cb_eqSF:
>>         add.n   a2, a2, a2
>>         beqz.n  a2, .L6
>>         j.l     foo, a9
>>     .L6:
>>         ret.n
>>     cb_neSF:
>>         add.n   a2, a2, a2
>>         bnez.n  a2, .L8
>>         j.l     foo, a9
>>     .L8:
>>         ret.n
>>     cb_geSF:
>>         addi    sp, sp, -16
>>         movi.n  a3, 0
>>         s32i.n  a12, sp, 8
>>         s32i.n  a0, sp, 12
>>         mov.n   a12, a2
>>         call0   __unordsf2
>>         bnez.n  a2, .L10
>>         movi.n  a3, 0
>>         mov.n   a2, a12
>>         call0   __gesf2
>>         bnei    a2, -1, .L10
>>         l32i.n  a0, sp, 12
>>         l32i.n  a12, sp, 8
>>         addi    sp, sp, 16
>>         j.l     foo, a9
>>     .L10:
>>         l32i.n  a0, sp, 12
>>         l32i.n  a12, sp, 8
>>         addi    sp, sp, 16
>>         ret.n
>>
>> gcc/ChangeLog:
>>
>>         * config/xtensa/predicates.md (const_float_0_operand):
>>         Rename from obsolete "const_float_1_operand" and change the
>>         constant to compare.
>>         (cstoresf_cbranchsf_operand, cstoresf_cbranchsf_operator):
>>         New.
>>         * config/xtensa/xtensa.cc (xtensa_expand_conditional_branch):
>>         Add code for EQ/NE comparison with constant zero in SFmode.
>>         (xtensa_expand_scc): Added code to derive boolean evaluation
>>         of EQ/NE with constant zero for comparison in SFmode.
>>         (xtensa_rtx_costs): Change cost of CONST_DOUBLE with value
>>         zero inside "cbranchsf4" to 0.
>>         * config/xtensa/xtensa.md (cbranchsf4, cstoresf4):
>>         Change "match_operator" and the third "match_operand" to the
>>         ones mentioned above.
>>         (movsicc_ne0_reg_zero, eq_zero): New.
>> ---
>>  gcc/config/xtensa/predicates.md | 17 +++++++++--
>>  gcc/config/xtensa/xtensa.cc     | 45 ++++++++++++++++++++++++++++
>>  gcc/config/xtensa/xtensa.md     | 53 +++++++++++++++++++++++++++++----
>>  3 files changed, 106 insertions(+), 9 deletions(-)
> 
> This version performs much better than v1, but there's still new
> testsuite failure in the gcc.c-torture/execute/bitfld-3.c
> and the following change in the generated code
> from:
> 
>        l32i.n  a11, a7, 8
>        l8ui    a9, a7, 12
>        movi    a10, 0xff
>        add.n   a9, a9, a10
>        addi.n  a7, a11, -1
>        movi.n  a10, 1
>        movi.n  a6, 0
>        moveqz  a10, a6, a11
> 
> to:
> 
>        l32i.n  a10, a7, 8
>        l8ui    a9, a7, 12
>        movi    a11, 0xff
>        add.n   a9, a9, a11
>        addi.n  a7, a10, -1
>        movi.n  a11, 1
>        mov.n   a10, a11
>        movnez  a10, a11, a11
> 
> suggests that the pattern movsicc_ne0_reg_zero does not work correctly
> when its operands overlap.
> 
The reason I wanted to implement the movsicc_ne0_reg_zero pattern in the first place is the following output
that xtensa_expand_scc() emits in its generic procedure:

(set (reg:SI 10)
     (const_int 1))
(set (reg:SI 6)
     (const_int 0))
(set (reg:SI 10)
     (if_then_else:SI (ne (reg:SI 11)
                          (const_int 0))
            (reg:SI 10)
            (reg:SI 6)))

	movi.n	a10, 1
	movi.n	a6, 0
	moveqz	a10, a6, a11

I realized that A11 can be used instead of A6 (so that A6 can be removed later), since A11 is known to be
zero whenever the above moveqz actually performs the move.
Maybe it is better to tweak xtensa_expand_scc() a bit than to implement the pattern separately.
  
Max Filippov June 6, 2023, 12:28 a.m. UTC | #3
On Mon, Jun 5, 2023 at 8:15 AM Max Filippov <jcmvbkbc@gmail.com> wrote:
>
> Hi Suwa-san,
>
> On Mon, Jun 5, 2023 at 2:37 AM Takayuki 'January June' Suwa
> <jjsuwa_sys3175@yahoo.co.jp> wrote:
> >
> > This patch optimizes the boolean evaluation of EQ/NE against zero
> > by adding two insn_and_split patterns similar to SImode conditional
> > store:
> >
> > "eq_zero":
> >         op0 = (op1 == 0) ? 1 : 0;
> >         op0 = clz(op1) >> 5;  /* optimized (requires TARGET_NSA) */
> >
> > "movsicc_ne0_reg_0":
> >         op0 = (op1 != 0) ? op2 : 0;
> >         op0 = op2; if (op1 == 0) ? op0 = op1;  /* optimized */
> >
> >     /* example #1 */
> >     int bool_eqSI(int x) {
> >       return x == 0;
> >     }
> >     int bool_neSI(int x) {
> >       return x != 0;
> >     }
> >
> >     ;; after (TARGET_NSA)
> >     bool_eqSI:
> >         nsau    a2, a2
> >         srli    a2, a2, 5
> >         ret.n
> >     bool_neSI:
> >         mov.n   a9, a2
> >         movi.n  a2, 1
> >         moveqz  a2, a9, a9
> >         ret.n
> >
> > These also work in SFmode by ignoring their sign bits, and further-
> > more, the branch if EQ/NE against zero in SFmode is also done in the
> > same manner.
> >
> > The reasons for this optimization in SFmode are:
> >
> >   - Only zero values (negative or non-negative) contain no bits of 1
> >     with both the exponent and the mantissa.
> >   - EQ/NE comparisons involving NaNs produce no signal even if they
> >     are signaling.
> >   - Even if the use of IEEE 754 single-precision floating-point co-
> >     processor is configured (TARGET_HARD_FLOAT is true):
> >         1. Load zero value to FP register
> >         2. Possibly, additional FP move if the comparison target is
> >            an address register
> >         3. FP equality check instruction
> >         4. Read the boolean register containing the result, or condi-
> >            tional branch
> >     As noted above, a considerable number of instructions are still
> >     generated.
> >
> >     /* example #2 */
> >     int bool_eqSF(float x) {
> >       return x == 0;
> >     }
> >     int bool_neSF(float x) {
> >       return x != 0;
> >     }
> >     int bool_ltSF(float x) {
> >       return x < 0;
> >     }
> >     extern void foo(void);
> >     void cb_eqSF(float x) {
> >       if(x != 0)
> >         foo();
> >     }
> >     void cb_neSF(float x) {
> >       if(x == 0)
> >         foo();
> >     }
> >     void cb_geSF(float x) {
> >       if(x < 0)
> >         foo();
> >     }
> >
> >     ;; after
> >     ;; (TARGET_NSA, TARGET_BOOLEANS and TARGET_HARD_FLOAT)
> >     bool_eqSF:
> >         add.n   a2, a2, a2
> >         nsau    a2, a2
> >         srli    a2, a2, 5
> >         ret.n
> >     bool_neSF:
> >         add.n   a9, a2, a2
> >         movi.n  a2, 1
> >         moveqz  a2, a9, a9
> >         ret.n
> >     bool_ltSF:
> >         movi.n  a9, 0
> >         wfr     f0, a2
> >         wfr     f1, a9
> >         olt.s   b0, f0, f1
> >         movi.n  a9, 0
> >         movi.n  a2, 1
> >         movf    a2, a9, b0
> >         ret.n
> >     cb_eqSF:
> >         add.n   a2, a2, a2
> >         beqz.n  a2, .L6
> >         j.l     foo, a9
> >     .L6:
> >         ret.n
> >     cb_neSF:
> >         add.n   a2, a2, a2
> >         bnez.n  a2, .L8
> >         j.l     foo, a9
> >     .L8:
> >         ret.n
> >     cb_geSF:
> >         addi    sp, sp, -16
> >         movi.n  a3, 0
> >         s32i.n  a12, sp, 8
> >         s32i.n  a0, sp, 12
> >         mov.n   a12, a2
> >         call0   __unordsf2
> >         bnez.n  a2, .L10
> >         movi.n  a3, 0
> >         mov.n   a2, a12
> >         call0   __gesf2
> >         bnei    a2, -1, .L10
> >         l32i.n  a0, sp, 12
> >         l32i.n  a12, sp, 8
> >         addi    sp, sp, 16
> >         j.l     foo, a9
> >     .L10:
> >         l32i.n  a0, sp, 12
> >         l32i.n  a12, sp, 8
> >         addi    sp, sp, 16
> >         ret.n
> >
> > gcc/ChangeLog:
> >
> >         * config/xtensa/predicates.md (const_float_0_operand):
> >         Rename from obsolete "const_float_1_operand" and change the
> >         constant to compare.
> >         (cstoresf_cbranchsf_operand, cstoresf_cbranchsf_operator):
> >         New.
> >         * config/xtensa/xtensa.cc (xtensa_expand_conditional_branch):
> >         Add code for EQ/NE comparison with constant zero in SFmode.
> >         (xtensa_expand_scc): Added code to derive boolean evaluation
> >         of EQ/NE with constant zero for comparison in SFmode.
> >         (xtensa_rtx_costs): Change cost of CONST_DOUBLE with value
> >         zero inside "cbranchsf4" to 0.
> >         * config/xtensa/xtensa.md (cbranchsf4, cstoresf4):
> >         Change "match_operator" and the third "match_operand" to the
> >         ones mentioned above.
> >         (movsicc_ne0_reg_zero, eq_zero): New.
> > ---
> >  gcc/config/xtensa/predicates.md | 17 +++++++++--
> >  gcc/config/xtensa/xtensa.cc     | 45 ++++++++++++++++++++++++++++
> >  gcc/config/xtensa/xtensa.md     | 53 +++++++++++++++++++++++++++++----
> >  3 files changed, 106 insertions(+), 9 deletions(-)
>
> This version performs much better than v1, but there's still new
> testsuite failure in the gcc.c-torture/execute/bitfld-3.c

And on the config with an FPU there is one more new failure,
in g++.dg/opt/pr58864.C, with the following ICE:

gcc/testsuite/g++.dg/opt/pr58864.C:21:1: error: unrecognizable insn:
(insn 13 12 14 2 (set (reg:CC 18 b0)
       (eq:CC (reg/v:SF 43 [ c ])
           (const_double:SF 0.0 [0x0.0p+0]))) -1
    (nil))
during RTL pass: vregs
  

Patch

diff --git a/gcc/config/xtensa/predicates.md b/gcc/config/xtensa/predicates.md
index a3575a68892..cfac3ad4936 100644
--- a/gcc/config/xtensa/predicates.md
+++ b/gcc/config/xtensa/predicates.md
@@ -155,11 +155,11 @@ 
 			    && CONSTANT_P (op)
 			    && GET_MODE_SIZE (mode) % UNITS_PER_WORD == 0")))))
 
-;; Accept the floating point constant 1 in the appropriate mode.
-(define_predicate "const_float_1_operand"
+;; Accept the floating point constant 0 in the appropriate mode.
+(define_predicate "const_float_0_operand"
   (match_code "const_double")
 {
-  return real_equal (CONST_DOUBLE_REAL_VALUE (op), &dconst1);
+  return real_equal (CONST_DOUBLE_REAL_VALUE (op), &dconst0);
 })
 
 (define_predicate "fpmem_offset_operand"
@@ -179,6 +179,11 @@ 
   return false;
 })
 
+(define_predicate "cstoresf_cbranchsf_operand"	;; second comparison operand of cbranchsf4 / cstoresf4
+  (ior (and (match_test "TARGET_HARD_FLOAT")	;; a register is accepted only when TARGET_HARD_FLOAT holds
+	    (match_operand 0 "register_operand"))
+       (match_operand 0 "const_float_0_operand")))	;; the FP constant zero is accepted unconditionally
+
 (define_predicate "branch_operator"
   (match_code "eq,ne,lt,ge"))
 
@@ -197,6 +202,12 @@ 
 (define_predicate "xtensa_cstoresi_operator"
   (match_code "eq,ne,gt,ge,lt,le"))
 
+(define_predicate "cstoresf_cbranchsf_operator"	;; comparison operator of cbranchsf4 / cstoresf4
+  (ior (and (match_test "TARGET_HARD_FLOAT")	;; with TARGET_HARD_FLOAT: any comparison operator ...
+	    (and (match_operand 0 "comparison_operator")
+		 (match_test "register_operand (XEXP (op, 1), SFmode)")))	;; ... whose second operand is an SFmode register
+       (match_operand 0 "boolean_operator")))	;; otherwise whatever "boolean_operator" accepts (defined outside this hunk; presumably EQ/NE -- confirm)
+
 (define_predicate "xtensa_shift_per_byte_operator"
   (match_code "ashift,ashiftrt,lshiftrt"))
 
diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index 3b5d25b660a..f43f057344c 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -865,6 +865,16 @@  xtensa_expand_conditional_branch (rtx *operands, machine_mode mode)
   switch (mode)
     {
     case E_SFmode:
+      if ((test_code == EQ || test_code == NE)
+	  && const_float_0_operand (cmp1, SFmode))
+	{
+	  emit_move_insn (cmp1 = gen_reg_rtx (SImode),
+			  simplify_gen_subreg (SImode, cmp0, SFmode, 0));
+	  emit_insn (gen_addsi3 (cmp1, cmp1, cmp1));
+	  cmp = gen_int_relational (test_code, cmp1, const0_rtx);
+	  break;
+	}
+
       if (TARGET_HARD_FLOAT)
 	{
 	  cmp = gen_float_relational (test_code, cmp0, cmp1);
@@ -996,6 +1006,36 @@  xtensa_expand_scc (rtx operands[4], machine_mode cmp_mode)
   rtx one_tmp, zero_tmp;
   rtx (*gen_fn) (rtx, rtx, rtx, rtx, rtx);
 
+  if (cmp_mode == SFmode)
+    {
+      if (const_float_0_operand (operands[3], SFmode))
+	switch (GET_CODE (operands[1]))
+	  {
+	  case EQ:
+	    emit_move_insn (cmp = gen_reg_rtx (SImode),
+			    simplify_gen_subreg (SImode, operands[2],
+						 SFmode, 0));
+	    emit_insn (gen_addsi3 (cmp, cmp, cmp));
+	    emit_insn (gen_eq_zero (dest, cmp));
+	    return 1;
+
+	  case NE:
+	    emit_move_insn (cmp = gen_reg_rtx (SImode),
+			    simplify_gen_subreg (SImode, operands[2],
+						 SFmode, 0));
+	    emit_insn (gen_addsi3 (cmp, cmp, cmp));
+	    one_tmp = force_reg (SImode, const1_rtx);
+	    emit_insn (gen_movsicc_ne0_reg_zero (dest, cmp, one_tmp));
+	    return 1;
+
+	  default:
+	    return 0;
+	  }
+
+      if (! register_operand (operands[3], SFmode))
+	return 0;
+    }
+
   if (!(cmp = gen_conditional_move (GET_CODE (operands[1]), cmp_mode,
 				    operands[2], operands[3])))
     return 0;
@@ -4438,6 +4478,11 @@  xtensa_rtx_costs (rtx x, machine_mode mode, int outer_code,
       return true;
 
     case CONST_DOUBLE:
+      if (outer_code == COMPARE && const_float_0_operand (x, SFmode))
+	{
+	  *total = 0;
+	  return true;
+	}
       if (TARGET_CONST16)
 	*total = COSTS_N_INSNS (4);
       else
diff --git a/gcc/config/xtensa/xtensa.md b/gcc/config/xtensa/xtensa.md
index 4b4ab3f5f37..d4b91ef8fd2 100644
--- a/gcc/config/xtensa/xtensa.md
+++ b/gcc/config/xtensa/xtensa.md
@@ -1906,11 +1906,11 @@ 
 })
 
 (define_expand "cbranchsf4"
-  [(match_operator 0 "comparison_operator"
+  [(match_operator 0 "cstoresf_cbranchsf_operator"
     [(match_operand:SF 1 "register_operand")
-     (match_operand:SF 2 "register_operand")])
+     (match_operand:SF 2 "cstoresf_cbranchsf_operand")])
    (match_operand 3 "")]
-  "TARGET_HARD_FLOAT"
+  ""
 {
   xtensa_expand_conditional_branch (operands, SFmode);
   DONE;
@@ -2395,10 +2395,10 @@ 
 
 (define_expand "cstoresf4"
   [(match_operand:SI 0 "register_operand")
-   (match_operator:SI 1 "comparison_operator"
+   (match_operator:SI 1 "cstoresf_cbranchsf_operator"
     [(match_operand:SF 2 "register_operand")
-     (match_operand:SF 3 "register_operand")])]
-  "TARGET_HARD_FLOAT"
+     (match_operand:SF 3 "cstoresf_cbranchsf_operand")])]
+  ""
 {
   if (!xtensa_expand_scc (operands, SFmode))
     FAIL;
@@ -2463,6 +2463,30 @@ 
    (set_attr "mode"	"SI")
    (set_attr "length"	"3,3")])
 
+(define_insn_and_split "movsicc_ne0_reg_zero"	;; op0 = (op1 != 0) ? op2 : 0
+  [(set (match_operand:SI 0 "register_operand" "=&a")	;; earlyclobber: the split writes op0 before it reads op1
+	(if_then_else:SI (ne (match_operand:SI 1 "register_operand" "r")
+			     (const_int 0))
+			 (match_operand:SI 2 "register_operand" "r")
+			 (const_int 0)))]
+  ""
+  "#"
+  "reload_completed"	;; split only after "&" has kept op0 out of the input registers
+  [(set (match_dup 0)	;; op0 = op2
+	(match_dup 2))
+   (set (match_dup 0)	;; if (op1 == 0) op0 = op1, i.e. zero
+	(if_then_else:SI (ne (match_dup 1)
+			     (const_int 0))
+			 (match_dup 0)
+			 (match_dup 1)))]
+  ""
+  [(set_attr "type"	"move")
+   (set_attr "mode"	"SI")
+   (set (attr "length")
+	(if_then_else (match_test "TARGET_DENSITY")
+		      (const_int 5)
+		      (const_int 6)))])
+
 (define_insn "movsfcc_internal0"
   [(set (match_operand:SF 0 "register_operand" "=a,a,f,f")
 	(if_then_else:SF (match_operator 4 "branch_operator"
@@ -3222,6 +3246,23 @@ 
 				    (const_int 5)
 				    (const_int 6))))])
 
+(define_insn_and_split "eq_zero"	;; op0 = (op1 == 0) ? 1 : 0
+  [(set (match_operand:SI 0 "register_operand" "=a")
+	(eq:SI (match_operand:SI 1 "register_operand" "r")
+	       (const_int 0)))]
+  "TARGET_NSA"	;; pattern is enabled only when TARGET_NSA holds
+  "#"	;; no direct output template; the insn is split below
+  "&& 1"
+  [(set (match_dup 0)	;; op0 = clz (op1)
+	(clz:SI (match_dup 1)))
+   (set (match_dup 0)	;; op0 = op0 >> 5 (logical shift)
+	(lshiftrt:SI (match_dup 0)
+		     (const_int 5)))]
+  ""
+  [(set_attr "type"	"move")
+   (set_attr "mode"	"SI")
+   (set_attr "length"	"6")])
+
 (define_peephole2
   [(set (match_operand:SI 0 "register_operand")
 	(match_operand:SI 6 "reload_operand"))