@@ -10201,6 +10201,61 @@ arm_mem_costs (rtx x, const struct cpu_cost_table *extra_cost,
return true;
}
+/* Helper for arm_bfi_p.  Check one ordering of the IOR arms: OP0 is the arm that inserts the field, OP1 the arm that preserves the remaining bits.  */
+static bool
+arm_bfi_1_p (rtx op0, rtx op1, rtx *sub0, rtx *sub1)
+{
+ unsigned HOST_WIDE_INT const1;  /* Mask selecting the inserted field.  */
+ unsigned HOST_WIDE_INT const2 = 0;  /* LSB of the field (ASHIFT count); 0 when no shift is present.  */
+
+ if (!CONST_INT_P (XEXP (op0, 1)))
+ return false;
+
+ const1 = XUINT (XEXP (op0, 1), 0);
+ if (!CONST_INT_P (XEXP (op1, 1))  /* The other arm must mask with the complement.  */
+ || ~XUINT (XEXP (op1, 1), 0) != const1)
+ return false;
+
+ if (GET_CODE (XEXP (op0, 0)) == ASHIFT  /* The inserted value may be pre-shifted into position.  */
+ && CONST_INT_P (XEXP (XEXP (op0, 0), 1)))
+ {
+ const2 = XUINT (XEXP (XEXP (op0, 0), 1), 0);
+ *sub0 = XEXP (XEXP (op0, 0), 0);
+ }
+ else
+ *sub0 = XEXP (op0, 0);
+
+ if (const2 >= GET_MODE_BITSIZE (GET_MODE (op0)))  /* Reject out-of-range shift counts.  */
+ return false;
+
+ *sub1 = XEXP (op1, 0);
+ return exact_log2 (const1 + (HOST_WIDE_INT_1U << const2)) >= 0;  /* CONST1 must be one contiguous run of ones starting at bit CONST2.  */
+}
+
+/* Recognize a BFI idiom. Helper for arm_rtx_costs_internal. The
+ format looks something like:
+
+ (IOR (AND (reg1) (~const1))
+ (AND (ASHIFT (reg2) (const2))
+ (const1)))
+
+ where const1 is a consecutive sequence of 1-bits with the
+ least-significant non-zero bit starting at bit position const2. If
+ const2 is zero, then the shift will not appear at all, due to
+ canonicalization. The two arms of the IOR expression may be
+ flipped.  On success, *SUB0 is the inserted value, *SUB1 the preserved register.  */
+static bool
+arm_bfi_p (rtx x, rtx *sub0, rtx *sub1)
+{
+ if (GET_CODE (x) != IOR)
+ return false;
+ if (GET_CODE (XEXP (x, 0)) != AND
+ || GET_CODE (XEXP (x, 1)) != AND)
+ return false;
+ return (arm_bfi_1_p (XEXP (x, 0), XEXP (x, 1), sub0, sub1)
+ || arm_bfi_1_p (XEXP (x, 1), XEXP (x, 0), sub1, sub0));  /* Retry with the arms flipped.  */
+}
+
/* RTX costs. Make an estimate of the cost of executing the operation
X, which is contained within an operation with code OUTER_CODE.
SPEED_P indicates whether the cost desired is the performance cost,
@@ -10959,14 +11014,28 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code, enum rtx_code outer_code,
*cost = LIBCALL_COST (2);
return false;
case IOR:
- if (mode == SImode && arm_arch6 && aarch_rev16_p (x))
- {
- if (speed_p)
- *cost += extra_cost->alu.rev;
+ {
+ rtx sub0, sub1;
+ if (mode == SImode && arm_arch6 && aarch_rev16_p (x))
+ {
+ if (speed_p)
+ *cost += extra_cost->alu.rev;
- return true;
- }
- /* Fall through. */
+ return true;
+ }
+ else if (mode == SImode && arm_arch_thumb2
+ && arm_bfi_p (x, &sub0, &sub1))
+ {
+ *cost += rtx_cost (sub0, mode, ZERO_EXTRACT, 1, speed_p);  /* Cost of the inserted value.  */
+ *cost += rtx_cost (sub1, mode, ZERO_EXTRACT, 0, speed_p);  /* Cost of the value supplying the preserved bits.  */
+ if (speed_p)
+ *cost += extra_cost->alu.bfi;  /* A single BFI covers the AND/ASHIFT/IOR.  */
+
+ return true;
+ }
+ }
+
+ /* Fall through. */
case AND: case XOR:
if (mode == SImode)
{
@@ -23780,8 +23849,8 @@ arm_print_condition (FILE *stream)
/* Globally reserved letters: acln
Puncutation letters currently used: @_|?().!#
Lower case letters currently used: bcdefhimpqtvwxyz
- Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTU
- Letters previously used, but now deprecated/obsolete: sVWXYZ.
+ Upper case letters currently used: ABCDEFGHIJKLMNOPQRSTUV
+ Letters previously used, but now deprecated/obsolete: sWXYZ.
Note that the global reservation for 'c' is only for CONSTANT_ADDRESS_P.
@@ -23797,7 +23866,10 @@ arm_print_condition (FILE *stream)
If CODE is 'N' then X is a floating point operand that must be negated
before output.
If CODE is 'B' then output a bitwise inverted value of X (a const int).
- If X is a REG and CODE is `M', output a ldm/stm style multi-reg. */
+ If X is a REG and CODE is `M', output a ldm/stm style multi-reg.
+ If CODE is 'V', then the operand must be a CONST_INT representing
+ the bits to preserve in the modified register (Rd) of a BFI or BFC
+ instruction: print out both the width and lsb (shift) fields. */
static void
arm_print_operand (FILE *stream, rtx x, int code)
{
@@ -24106,8 +24178,27 @@ arm_print_operand (FILE *stream, rtx x, int code)
stream);
return;
- case 's':
case 'V':
+ {
+ /* Output the LSB (shift) and width for a bitmask instruction
+ based on a literal mask. The LSB is printed first,
+ followed by the width.
+
+ Eg. For 0b1...1110001, the result is #1, #3. */
+ if (!CONST_INT_P (x))
+ {
+ output_operand_lossage ("invalid operand for code '%c'", code);
+ return;
+ }
+
+ unsigned HOST_WIDE_INT val = ~XUINT (x, 0);  /* Mask of the bits the insn modifies (X holds the preserved bits).  */
+ int lsb = exact_log2 (val & -val);  /* Position of the lowest set bit of the field.  */
+ asm_fprintf (stream, "#%d, #%d", lsb,
+ (exact_log2 (val + (val & -val)) - lsb));  /* Field width; assumes VAL is one contiguous run.  */
+ }
+ return;
+
+ case 's':
case 'W':
case 'X':
case 'Y':
@@ -3002,30 +3002,36 @@ (define_expand "andsi3"
; ??? Check split length for Thumb-2
(define_insn_and_split "*arm_andsi3_insn"
- [(set (match_operand:SI 0 "s_register_operand" "=r,l,r,r,r")
- (and:SI (match_operand:SI 1 "s_register_operand" "%r,0,r,r,r")
- (match_operand:SI 2 "reg_or_int_operand" "I,l,K,r,?n")))]
+ [(set (match_operand:SI 0 "s_register_operand" "=r,l,r,r,r,r")
+ (and:SI (match_operand:SI 1 "s_register_operand" "%r,0,r,r,0,r")
+ (match_operand:SI 2 "reg_or_int_operand" "I,l,K,r,Dj,?n")))] ; Dj: constant whose complement is one contiguous run of ones (BFC-clearable); this alternative ties op 1 to op 0.
"TARGET_32BIT"
"@
and%?\\t%0, %1, %2
and%?\\t%0, %1, %2
bic%?\\t%0, %1, #%B2
and%?\\t%0, %1, %2
+ bfc%?\\t%0, %V2
#"
"TARGET_32BIT
&& CONST_INT_P (operands[2])
&& !(const_ok_for_arm (INTVAL (operands[2]))
- || const_ok_for_arm (~INTVAL (operands[2])))"
+ || const_ok_for_arm (~INTVAL (operands[2]))
+ || (arm_arch_thumb2
+ && satisfies_constraint_Dj (operands[2])
+ && (rtx_equal_p (operands[0], operands[1])
+ || !reload_completed)))"
[(clobber (const_int 0))]
"
- arm_split_constant (AND, SImode, curr_insn,
+ arm_split_constant (AND, SImode, curr_insn,
INTVAL (operands[2]), operands[0], operands[1], 0);
DONE;
"
- [(set_attr "length" "4,4,4,4,16")
+ [(set_attr "length" "4,4,4,4,4,16")
(set_attr "predicable" "yes")
- (set_attr "predicable_short_it" "no,yes,no,no,no")
- (set_attr "type" "logic_imm,logic_imm,logic_reg,logic_reg,logic_imm")]
+ (set_attr "predicable_short_it" "no,yes,no,no,no,no")
+ (set_attr "arch" "*,*,*,*,v6t2,*") ; The BFC alternative requires v6t2.
+ (set_attr "type" "logic_imm,logic_imm,logic_reg,logic_reg,bfm,logic_imm")]
(define_insn "*andsi3_compare0"
@@ -3471,13 +3477,25 @@ (define_expand "insv"
}"
)
-(define_insn "insv_zero"
+(define_insn_and_split "insv_zero"
[(set (zero_extract:SI (match_operand:SI 0 "s_register_operand" "+r")
(match_operand:SI 1 "const_int_M_operand" "M")
(match_operand:SI 2 "const_int_M_operand" "M"))
(const_int 0))]
"arm_arch_thumb2"
"bfc%?\t%0, %2, %1"
+ "" ; Empty split condition: split whenever the pattern matches.
+ [(set (match_dup 0) (and:SI (match_dup 0) (match_dup 1)))]
+ {
+ /* Convert back to a normal AND operation, so that we can take advantage
+ of BIC and AND when appropriate; we'll still emit BFC if that's the
+ right thing to do. */
+ unsigned HOST_WIDE_INT width = UINTVAL (operands[1]);
+ unsigned HOST_WIDE_INT lsb = UINTVAL (operands[2]);
+ unsigned HOST_WIDE_INT mask = (HOST_WIDE_INT_1U << width) - 1;
+
+ operands[1] = gen_int_mode (~(mask << lsb), SImode);  /* Mask of the bits to preserve.  */
+ }
[(set_attr "length" "4")
(set_attr "predicable" "yes")
(set_attr "type" "bfm")]
@@ -3495,6 +3513,76 @@ (define_insn "insv_t2"
(set_attr "type" "bfm")]
)
+(define_insn "*bfi" ; Insert op 3 into the bit-field of op 1 selected by ~op 2.
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (ior:SI (and:SI (match_operand:SI 1 "s_register_operand" "0")
+ (match_operand 2 "const_int_operand" "Dj"))
+ (and:SI (ashift:SI
+ (match_operand:SI 3 "s_register_operand" "r")
+ (match_operand 4 "const_int_operand" "i"))
+ (match_operand 5 "const_int_operand" "i"))))]
+ "arm_arch_thumb2
+ && UINTVAL (operands[4]) < 32
+ && UINTVAL (operands[2]) == ~UINTVAL (operands[5])
+ && (exact_log2 (UINTVAL (operands[5])
+ + (HOST_WIDE_INT_1U << UINTVAL (operands[4])))
+ >= 0)"
+ "bfi%?\t%0, %3, %V2" ; %V prints "#lsb, #width" derived from the preserved-bits mask.
+ [(set_attr "length" "4")
+ (set_attr "predicable" "yes")
+ (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt1" ; As "*bfi", with the IOR arms commuted.
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (ior:SI (and:SI (ashift:SI
+ (match_operand:SI 3 "s_register_operand" "r")
+ (match_operand 4 "const_int_operand" "i"))
+ (match_operand 5 "const_int_operand" "i"))
+ (and:SI (match_operand:SI 1 "s_register_operand" "0")
+ (match_operand 2 "const_int_operand" "Dj"))))]
+ "arm_arch_thumb2
+ && UINTVAL (operands[4]) < 32
+ && UINTVAL (operands[2]) == ~UINTVAL (operands[5])
+ && (exact_log2 (UINTVAL (operands[5])
+ + (HOST_WIDE_INT_1U << UINTVAL (operands[4])))
+ >= 0)"
+ "bfi%?\t%0, %3, %V2"
+ [(set_attr "length" "4")
+ (set_attr "predicable" "yes")
+ (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt2" ; Insertion at bit 0: canonicalization removes the shift.
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (ior:SI (and:SI (match_operand:SI 1 "s_register_operand" "0")
+ (match_operand 2 "const_int_operand" "i"))
+ (and:SI (match_operand:SI 3 "s_register_operand" "r")
+ (match_operand 4 "const_int_operand" "i"))))]
+ "arm_arch_thumb2
+ && UINTVAL (operands[2]) == ~UINTVAL (operands[4])
+ && exact_log2 (UINTVAL (operands[4]) + 1) >= 0" ; Op 4 must be a low-order contiguous mask.
+ "bfi%?\t%0, %3, %V2"
+ [(set_attr "length" "4")
+ (set_attr "predicable" "yes")
+ (set_attr "type" "bfm")]
+)
+
+(define_insn "*bfi_alt3" ; As "*bfi_alt2", with the IOR arms commuted.
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (ior:SI (and:SI (match_operand:SI 3 "s_register_operand" "r")
+ (match_operand 4 "const_int_operand" "i"))
+ (and:SI (match_operand:SI 1 "s_register_operand" "0")
+ (match_operand 2 "const_int_operand" "i"))))]
+ "arm_arch_thumb2
+ && UINTVAL (operands[2]) == ~UINTVAL (operands[4])
+ && exact_log2 (UINTVAL (operands[4]) + 1) >= 0"
+ "bfi%?\t%0, %3, %V2"
+ [(set_attr "length" "4")
+ (set_attr "predicable" "yes")
+ (set_attr "type" "bfm")]
+)
+
(define_insn "andsi_notsi_si"
[(set (match_operand:SI 0 "s_register_operand" "=r")
(and:SI (not:SI (match_operand:SI 2 "s_register_operand" "r"))
@@ -32,7 +32,7 @@
;; The following multi-letter normal constraints have been used:
;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, DN, Dm, Dl, DL, Do, Dv, Dy, Di,
-;; Ds, Dt, Dp, Dz, Tu, Te
+;; Dj, Ds, Dt, Dp, Dz, Tu, Te
;; in Thumb-1 state: Pa, Pb, Pc, Pd, Pe
;; in Thumb-2 state: Ha, Pj, PJ, Ps, Pt, Pu, Pv, Pw, Px, Py, Pz, Rd, Rf, Rb, Ra,
;; Rg, Ri
@@ -354,6 +354,14 @@ (define_constraint "Di"
(and (match_code "const_double,const_int")
(match_test "TARGET_32BIT && arm_const_double_by_immediates (op)")))
+(define_constraint "Dj"
+ "@internal
+ In cores with the v6t2 ISA, a constant with exactly one consecutive
+ string of zero bits."
+ (and (match_code "const_int")
+ (match_test "arm_arch_thumb2
+ && exact_log2 (~ival + (~ival & -~ival)) >= 0"))) ; i.e. ~op is one nonzero contiguous run of ones (a BFC-clearable field).
+
(define_constraint "Dm"
"@internal
In ARM/Thumb-2 state a const_vector which can be loaded with a Neon vmov