@@ -23,6 +23,9 @@ config/loongarch/loongarch-opts.h
HeaderInclude
config/loongarch/loongarch-str.h
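+; Bitmask recording which -mrecip= sub-options are enabled; the bit
+; values are the RECIP_MASK_* macros in loongarch.h.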
+TargetVariable
+unsigned int recip_mask = 0
+
; ISA related options
;; Base ISA
Enum
@@ -197,6 +200,14 @@ mexplicit-relocs
Target Var(la_opt_explicit_relocs_backward) Init(M_OPT_UNSET)
Use %reloc() assembly operators (for backward compatibility).
+mrecip
+Target RejectNegative Var(loongarch_recip)
+Generate approximate reciprocal divide and square root for better throughput.
+
+mrecip=
+Target RejectNegative Joined Var(loongarch_recip_name)
+Control generation of reciprocal estimates.
+
; The code model option names for -mcmodel.
Enum
Name(cmodel) Type(int)
@@ -1249,7 +1249,25 @@ (define_insn "mul<mode>3"
[(set_attr "type" "simd_fmul")
(set_attr "mode" "<MODE>")])
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:FLASX 0 "register_operand")
+	(div:FLASX (match_operand:FLASX 1 "reg_or_vector_1_operand")
+ (match_operand:FLASX 2 "register_operand")))]
+ "ISA_HAS_LASX"
+{
+ if (<MODE>mode == V8SFmode
+ && TARGET_RECIP_VEC_DIV
+ && optimize_insn_for_speed_p ()
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ loongarch_emit_swdivsf (operands[0], operands[1],
+ operands[2], V8SFmode);
+ DONE;
+ }
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:FLASX 0 "register_operand" "=f")
(div:FLASX (match_operand:FLASX 1 "register_operand" "f")
(match_operand:FLASX 2 "register_operand" "f")))]
@@ -1278,7 +1296,23 @@ (define_insn "fnma<mode>4"
[(set_attr "type" "simd_fmadd")
(set_attr "mode" "<MODE>")])
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:FLASX 0 "register_operand")
+ (sqrt:FLASX (match_operand:FLASX 1 "register_operand")))]
+ "ISA_HAS_LASX"
+{
+ if (<MODE>mode == V8SFmode
+ && TARGET_RECIP_VEC_SQRT
+ && flag_unsafe_math_optimizations
+ && optimize_insn_for_speed_p ()
+ && flag_finite_math_only && !flag_trapping_math)
+ {
+ loongarch_emit_swrsqrtsf (operands[0], operands[1], V8SFmode, 0);
+ DONE;
+ }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:FLASX 0 "register_operand" "=f")
(sqrt:FLASX (match_operand:FLASX 1 "register_operand" "f")))]
"ISA_HAS_LASX"
@@ -1710,7 +1744,20 @@ (define_insn "lasx_xvfrint_<flasxfmt>"
[(set_attr "type" "simd_fcvt")
(set_attr "mode" "<MODE>")])
-(define_insn "rsqrt<mode>2"
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:FLASX 0 "register_operand")
+	(unspec:FLASX [(match_operand:FLASX 1 "register_operand")]
+		      UNSPEC_LASX_XVFRSQRT))]
+  "ISA_HAS_LASX"
+{
+ if (<MODE>mode == V8SFmode && TARGET_RECIP_VEC_RSQRT)
+ {
+ loongarch_emit_swrsqrtsf (operands[0], operands[1], V8SFmode, 1);
+ DONE;
+ }
+})
+
+(define_insn "*rsqrt<mode>2"
[(set (match_operand:FLASX 0 "register_operand" "=f")
(unspec:FLASX [(match_operand:FLASX 1 "register_operand" "f")]
UNSPEC_LASX_XVFRSQRT))]
@@ -220,5 +220,7 @@ extern rtx loongarch_gen_const_int_vector_shuffle (machine_mode, int);
extern tree loongarch_build_builtin_va_list (void);
extern rtx loongarch_build_signbit_mask (machine_mode, bool, bool);
+extern void loongarch_emit_swrsqrtsf (rtx, rtx, machine_mode, bool);
+extern void loongarch_emit_swdivsf (rtx, rtx, rtx, machine_mode);
extern bool loongarch_explicit_relocs_p (enum loongarch_symbol_type);
#endif /* ! GCC_LOONGARCH_PROTOS_H */
@@ -7547,6 +7547,69 @@ loongarch_option_override_internal (struct gcc_options *opts,
/* Function to allocate machine-dependent function status. */
init_machine_status = &loongarch_init_machine_status;
+
+ /* -mrecip options. */
+ static struct
+ {
+    const char *string;          /* Option name.  */
+    unsigned int mask;           /* Mask bits to set.  */
+ }
+ const recip_options[] = {
+ { "all", RECIP_MASK_ALL },
+ { "none", RECIP_MASK_NONE },
+ { "div", RECIP_MASK_DIV },
+ { "sqrt", RECIP_MASK_SQRT },
+ { "rsqrt", RECIP_MASK_RSQRT },
+ { "vec-div", RECIP_MASK_VEC_DIV },
+ { "vec-sqrt", RECIP_MASK_VEC_SQRT },
+ { "vec-rsqrt", RECIP_MASK_VEC_RSQRT },
+ };
+
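+  /* Parse the comma-separated list given to -mrecip=.  For example,
+     -mrecip=all,!sqrt first sets every bit in recip_mask and then
+     clears RECIP_MASK_SQRT again.  */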
+ if (loongarch_recip_name)
+ {
+ char *p = ASTRDUP (loongarch_recip_name);
+ char *q;
+ unsigned int mask, i;
+ bool invert;
+
+ while ((q = strtok (p, ",")) != NULL)
+ {
+ p = NULL;
+ if (*q == '!')
+ {
+ invert = true;
+ q++;
+ }
+ else
+ invert = false;
+
+ if (!strcmp (q, "default"))
+ mask = RECIP_MASK_ALL;
+ else
+ {
+ for (i = 0; i < ARRAY_SIZE (recip_options); i++)
+ if (!strcmp (q, recip_options[i].string))
+ {
+ mask = recip_options[i].mask;
+ break;
+ }
+
+ if (i == ARRAY_SIZE (recip_options))
+ {
+ error ("unknown option for %<-mrecip=%s%>", q);
+ invert = false;
+ mask = RECIP_MASK_NONE;
+ }
+ }
+
+ if (invert)
+ recip_mask &= ~mask;
+ else
+ recip_mask |= mask;
+ }
+ }
+ if (loongarch_recip)
+ recip_mask |= RECIP_MASK_ALL;
}
@@ -11443,6 +11506,156 @@ loongarch_build_signbit_mask (machine_mode mode, bool vect, bool invert)
return force_reg (vec_mode, v);
}
+/* Use the rsqrte (reciprocal square root estimate) instruction and a
+   Newton-Raphson step to compute an approximation of a single precision
+   floating point [reciprocal] square root.  */
+
+void
+loongarch_emit_swrsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
+{
+ rtx x0, e0, e1, e2, mhalf, monehalf;
+ REAL_VALUE_TYPE r;
+ int unspec;
+
+ x0 = gen_reg_rtx (mode);
+ e0 = gen_reg_rtx (mode);
+ e1 = gen_reg_rtx (mode);
+ e2 = gen_reg_rtx (mode);
+
+ real_arithmetic (&r, ABS_EXPR, &dconsthalf, NULL);
+ mhalf = const_double_from_real_value (r, SFmode);
+
+ real_arithmetic (&r, PLUS_EXPR, &dconsthalf, &dconst1);
+ monehalf = const_double_from_real_value (r, SFmode);
+ unspec = UNSPEC_RSQRTE;
+
+ if (VECTOR_MODE_P (mode))
+ {
+ mhalf = loongarch_build_const_vector (mode, true, mhalf);
+ monehalf = loongarch_build_const_vector (mode, true, monehalf);
+ unspec = GET_MODE_SIZE (mode) == 32 ? UNSPEC_LASX_XVFRSQRTE
+ : UNSPEC_LSX_VFRSQRTE;
+ }
+
+ /* rsqrt(a) = rsqrte(a) * (1.5 - 0.5 * a * rsqrte(a) * rsqrte(a))
+ sqrt(a) = a * rsqrte(a) * (1.5 - 0.5 * a * rsqrte(a) * rsqrte(a)) */
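+  /* Both forms are one Newton-Raphson iteration for f(x) = 1/x^2 - a:
+     from the estimate x0 = rsqrte(a) the refined value is
+     x1 = x0 * (1.5 - 0.5 * a * x0 * x0); rsqrt(a) is x1 itself, and
+     multiplying x1 by a gives the sqrt(a) approximation.  */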
+
+ a = force_reg (mode, a);
+
+ /* x0 = rsqrt(a) estimate. */
+ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
+ unspec)));
+
+  /* If a == 0.0, filter out the infinite estimate so that sqrt (0.0)
+     yields 0.0 rather than NaN.  */
+ if (!recip)
+ {
+ rtx zero = force_reg (mode, CONST0_RTX (mode));
+
+ if (VECTOR_MODE_P (mode))
+ {
+ machine_mode imode = related_int_vector_mode (mode).require ();
+ rtx mask = gen_reg_rtx (imode);
+ emit_insn (gen_rtx_SET (mask, gen_rtx_NE (imode, a, zero)));
+ emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0,
+ gen_lowpart (mode, mask))));
+ }
+ else
+ {
+ rtx target = emit_conditional_move (x0, { GT, a, zero, mode },
+ x0, zero, mode, 0);
+ if (target != x0)
+ emit_move_insn (x0, target);
+ }
+ }
+
+ /* e0 = x0 * a */
+ emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
+ /* e1 = e0 * x0 */
+ emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
+
+ /* e2 = 1.5 - e1 * 0.5 */
+ mhalf = force_reg (mode, mhalf);
+ monehalf = force_reg (mode, monehalf);
+ emit_insn (gen_rtx_SET (e2, gen_rtx_FMA (mode,
+ gen_rtx_NEG (mode, e1),
+ mhalf, monehalf)));
+
+ if (recip)
+ /* res = e2 * x0 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, x0, e2)));
+ else
+ /* res = e2 * e0 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e0)));
+}
+
+/* Use the recipe (reciprocal estimate) instruction and a Newton-Raphson
+   step to compute an approximation of a single precision floating point
+   divide.  */
+
+void
+loongarch_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
+{
+ rtx x0, e0, mtwo;
+ REAL_VALUE_TYPE r;
+ x0 = gen_reg_rtx (mode);
+ e0 = gen_reg_rtx (mode);
+ int unspec = UNSPEC_RECIPE;
+
+ real_arithmetic (&r, ABS_EXPR, &dconst2, NULL);
+ mtwo = const_double_from_real_value (r, SFmode);
+
+ if (VECTOR_MODE_P (mode))
+ {
+ mtwo = loongarch_build_const_vector (mode, true, mtwo);
+ unspec = GET_MODE_SIZE (mode) == 32 ? UNSPEC_LASX_XVFRECIPE
+ : UNSPEC_LSX_VFRECIPE;
+ }
+
+ mtwo = force_reg (mode, mtwo);
+
+ /* a / b = a * recipe(b) * (2.0 - b * recipe(b)) */
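+  /* This is one Newton-Raphson iteration for f(x) = 1/x - b: from the
+     estimate x0 = recipe(b) the refined reciprocal is
+     x1 = x0 * (2.0 - b * x0), and a / b is approximated by a * x1.  */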
+
+ /* x0 = 1./b estimate. */
+ emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
+ unspec)));
+  /* e0 = 2.0 - b * x0 */
+ emit_insn (gen_rtx_SET (e0, gen_rtx_FMA (mode,
+ gen_rtx_NEG (mode, b), x0, mtwo)));
+
+ /* x0 = a * x0 */
+ if (a != CONST1_RTX (mode))
+ emit_insn (gen_rtx_SET (x0, gen_rtx_MULT (mode, a, x0)));
+
+ /* res = e0 * x0 */
+ emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e0, x0)));
+}
+
+/* Return true if it is safe to use the rsqrt optabs to optimize
+ 1.0/sqrt. */
+
+static bool
+use_rsqrt_p (machine_mode mode)
+{
+ if (TARGET_RECIP_RSQRT && GET_MODE_INNER (mode) == SFmode)
+ return (flag_finite_math_only
+ && !flag_trapping_math
+ && flag_unsafe_math_optimizations);
+ else
+ return true;
+}
+
+/* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
+
+static bool
+loongarch_optab_supported_p (int op, machine_mode mode1, machine_mode,
+ optimization_type opt_type)
+{
+ switch (op)
+ {
+ case rsqrt_optab:
+ return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
+
+ default:
+ return true;
+ }
+}
+
static bool
loongarch_builtin_support_vector_misalignment (machine_mode mode,
const_tree type,
@@ -11610,6 +11823,9 @@ loongarch_asm_code_end (void)
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
loongarch_autovectorize_vector_modes
+#undef TARGET_OPTAB_SUPPORTED_P
+#define TARGET_OPTAB_SUPPORTED_P loongarch_optab_supported_p
+
#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS loongarch_init_builtins
#undef TARGET_BUILTIN_DECL
@@ -700,6 +700,24 @@ enum reg_class
&& (GET_MODE_CLASS (MODE) == MODE_VECTOR_INT \
|| GET_MODE_CLASS (MODE) == MODE_VECTOR_FLOAT))
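+/* Bits of recip_mask set by -mrecip and -mrecip=; each TARGET_RECIP_* test
+   below also enables its sequence by default on LA664.  */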
+#define RECIP_MASK_NONE 0x00
+#define RECIP_MASK_DIV 0x01
+#define RECIP_MASK_SQRT 0x02
+#define RECIP_MASK_RSQRT 0x04
+#define RECIP_MASK_VEC_DIV 0x08
+#define RECIP_MASK_VEC_SQRT 0x10
+#define RECIP_MASK_VEC_RSQRT 0x20
+#define RECIP_MASK_ALL (RECIP_MASK_DIV | RECIP_MASK_SQRT \
+ | RECIP_MASK_RSQRT | RECIP_MASK_VEC_SQRT \
+ | RECIP_MASK_VEC_DIV | RECIP_MASK_VEC_RSQRT)
+
+#define TARGET_RECIP_DIV \
+  ((recip_mask & RECIP_MASK_DIV) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_SQRT \
+  ((recip_mask & RECIP_MASK_SQRT) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_RSQRT \
+  ((recip_mask & RECIP_MASK_RSQRT) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_DIV \
+  ((recip_mask & RECIP_MASK_VEC_DIV) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_SQRT \
+  ((recip_mask & RECIP_MASK_VEC_SQRT) != 0 || TARGET_uARCH_LA664)
+#define TARGET_RECIP_VEC_RSQRT \
+  ((recip_mask & RECIP_MASK_VEC_RSQRT) != 0 || TARGET_uARCH_LA664)
+
/* 1 if N is a possible register number for function argument passing.
We have no FP argument registers when soft-float. */
@@ -896,9 +896,21 @@ (define_peephole
;; Float division and modulus.
(define_expand "div<mode>3"
[(set (match_operand:ANYF 0 "register_operand")
- (div:ANYF (match_operand:ANYF 1 "reg_or_1_operand")
- (match_operand:ANYF 2 "register_operand")))]
- "")
+ (div:ANYF (match_operand:ANYF 1 "reg_or_1_operand")
+ (match_operand:ANYF 2 "register_operand")))]
+ ""
+{
+ if (<MODE>mode == SFmode
+ && TARGET_RECIP_DIV
+ && optimize_insn_for_speed_p ()
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ loongarch_emit_swdivsf (operands[0], operands[1],
+ operands[2], SFmode);
+ DONE;
+ }
+})
(define_insn "*div<mode>3"
[(set (match_operand:ANYF 0 "register_operand" "=f")
@@ -1129,7 +1141,23 @@ (define_insn "*fnma<mode>4"
;;
;; ....................
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:ANYF 0 "register_operand")
+ (sqrt:ANYF (match_operand:ANYF 1 "register_operand")))]
+ ""
+{
+ if (<MODE>mode == SFmode
+ && TARGET_RECIP_SQRT
+ && flag_unsafe_math_optimizations
+ && !optimize_insn_for_size_p ()
+ && flag_finite_math_only && !flag_trapping_math)
+ {
+ loongarch_emit_swrsqrtsf (operands[0], operands[1], SFmode, 0);
+ DONE;
+ }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:ANYF 0 "register_operand" "=f")
(sqrt:ANYF (match_operand:ANYF 1 "register_operand" "f")))]
""
@@ -1138,6 +1166,19 @@ (define_insn "sqrt<mode>2"
(set_attr "mode" "<UNITMODE>")
(set_attr "insn_count" "1")])
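+;; The rsqrt optab.  loongarch_optab_supported_p controls whether
+;; 1.0 / sqrt (x) may be expanded through this pattern.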
+(define_expand "rsqrt<mode>2"
+ [(set (match_operand:ANYF 0 "register_operand")
+ (unspec:ANYF [(match_operand:ANYF 1 "register_operand")]
+ UNSPEC_RSQRT))]
+ "TARGET_HARD_FLOAT"
+{
+ if (<MODE>mode == SFmode && TARGET_RECIP_RSQRT)
+ {
+ loongarch_emit_swrsqrtsf (operands[0], operands[1], SFmode, 1);
+ DONE;
+ }
+})
+
(define_insn "*rsqrt<mode>2"
[(set (match_operand:ANYF 0 "register_operand" "=f")
(unspec:ANYF [(match_operand:ANYF 1 "register_operand" "f")]
@@ -31,6 +31,9 @@ config/loongarch/loongarch-opts.h
HeaderInclude
config/loongarch/loongarch-str.h
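+; Bitmask recording which -mrecip= sub-options are enabled; the bit
+; values are the RECIP_MASK_* macros in loongarch.h.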
+TargetVariable
+unsigned int recip_mask = 0
+
; ISA related options
;; Base ISA
Enum
@@ -205,6 +208,14 @@ mexplicit-relocs
Target Var(la_opt_explicit_relocs_backward) Init(M_OPT_UNSET)
Use %reloc() assembly operators (for backward compatibility).
+mrecip
+Target RejectNegative Var(loongarch_recip)
+Generate approximate reciprocal divide and square root for better throughput.
+
+mrecip=
+Target RejectNegative Joined Var(loongarch_recip_name)
+Control generation of reciprocal estimates.
+
; The code model option names for -mcmodel.
Enum
Name(cmodel) Type(int)
@@ -1153,7 +1153,25 @@ (define_insn "mul<mode>3"
[(set_attr "type" "simd_fmul")
(set_attr "mode" "<MODE>")])
-(define_insn "div<mode>3"
+(define_expand "div<mode>3"
+ [(set (match_operand:FLSX 0 "register_operand")
+	(div:FLSX (match_operand:FLSX 1 "reg_or_vector_1_operand")
+ (match_operand:FLSX 2 "register_operand")))]
+ "ISA_HAS_LSX"
+{
+ if (<MODE>mode == V4SFmode
+ && TARGET_RECIP_VEC_DIV
+ && optimize_insn_for_speed_p ()
+ && flag_finite_math_only && !flag_trapping_math
+ && flag_unsafe_math_optimizations)
+ {
+ loongarch_emit_swdivsf (operands[0], operands[1],
+ operands[2], V4SFmode);
+ DONE;
+ }
+})
+
+(define_insn "*div<mode>3"
[(set (match_operand:FLSX 0 "register_operand" "=f")
(div:FLSX (match_operand:FLSX 1 "register_operand" "f")
(match_operand:FLSX 2 "register_operand" "f")))]
@@ -1182,7 +1200,23 @@ (define_insn "fnma<mode>4"
[(set_attr "type" "simd_fmadd")
(set_attr "mode" "<MODE>")])
-(define_insn "sqrt<mode>2"
+(define_expand "sqrt<mode>2"
+ [(set (match_operand:FLSX 0 "register_operand")
+ (sqrt:FLSX (match_operand:FLSX 1 "register_operand")))]
+ "ISA_HAS_LSX"
+{
+ if (<MODE>mode == V4SFmode
+ && TARGET_RECIP_VEC_SQRT
+ && flag_unsafe_math_optimizations
+ && optimize_insn_for_speed_p ()
+ && flag_finite_math_only && !flag_trapping_math)
+ {
+ loongarch_emit_swrsqrtsf (operands[0], operands[1], V4SFmode, 0);
+ DONE;
+ }
+})
+
+(define_insn "*sqrt<mode>2"
[(set (match_operand:FLSX 0 "register_operand" "=f")
(sqrt:FLSX (match_operand:FLSX 1 "register_operand" "f")))]
"ISA_HAS_LSX"
@@ -1638,7 +1672,20 @@ (define_insn "lsx_vfrint_<flsxfmt>"
[(set_attr "type" "simd_fcvt")
(set_attr "mode" "<MODE>")])
-(define_insn "rsqrt<mode>2"
+(define_expand "rsqrt<mode>2"
+  [(set (match_operand:FLSX 0 "register_operand")
+	(unspec:FLSX [(match_operand:FLSX 1 "register_operand")]
+		     UNSPEC_LSX_VFRSQRT))]
+ "ISA_HAS_LSX"
+{
+ if (<MODE>mode == V4SFmode && TARGET_RECIP_VEC_RSQRT)
+ {
+ loongarch_emit_swrsqrtsf (operands[0], operands[1], V4SFmode, 1);
+ DONE;
+ }
+})
+
+(define_insn "*rsqrt<mode>2"
[(set (match_operand:FLSX 0 "register_operand" "=f")
(unspec:FLSX [(match_operand:FLSX 1 "register_operand" "f")]
UNSPEC_LSX_VFRSQRT))]
@@ -235,6 +235,10 @@ (define_predicate "reg_or_1_operand"
(ior (match_operand 0 "const_1_operand")
(match_operand 0 "register_operand")))
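+;; Like reg_or_1_operand, but for vector modes: accept either a register
+;; or a const vector of 1s.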
+(define_predicate "reg_or_vector_1_operand"
+ (ior (match_operand 0 "const_vector_1_operand")
+ (match_operand 0 "register_operand")))
+
;; These are used in vec_merge, hence accept bitmask as const_int.
(define_predicate "const_exp_2_operand"
(and (match_code "const_int")
@@ -1209,6 +1209,7 @@ Objective-C and Objective-C++ Dialects}.
-msoft-float -mhard-float -mdouble-float -munordered-float
-mcmov -mror -mrori -msext -msfimm -mshftimm
--mcmodel=@var{code-model}}
+-mcmodel=@var{code-model}
+-mrecip -mrecip=@var{opt}}
@emph{PDP-11 Options}
@gccoptlist{-mfpu -msoft-float -mac0 -mno-ac0 -m40 -m45 -m10
@@ -26523,6 +26524,57 @@ detecting corresponding assembler support:
This option is mostly useful for debugging, or interoperation with
assemblers different from the build-time one.
+@opindex mrecip
+@item -mrecip
+This option enables use of the reciprocal estimate and reciprocal square
+root estimate instructions with additional Newton-Raphson steps to increase
+precision instead of doing a divide or square root and divide for
+floating-point arguments.
+These instructions are generated only when @option{-funsafe-math-optimizations}
+is enabled together with @option{-ffinite-math-only} and
+@option{-fno-trapping-math}.
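+All three options are implied by @option{-ffast-math}.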
+Note that while the throughput of the sequence is higher than the throughput of
+the non-reciprocal instruction, the precision of the sequence can be decreased
+by up to 2 ulp (i.e. the inverse of 1.0 equals 0.99999994).
+
+@opindex mrecip=opt
+@item -mrecip=@var{opt}
+This option controls which reciprocal estimate instructions
+may be used. @var{opt} is a comma-separated list of options, which may
+be preceded by a @samp{!} to invert the option:
+
+@table @samp
+@item all
+Enable all estimate instructions.
+
+@item default
+Enable the default instructions, equivalent to @option{-mrecip}.
+
+@item none
+Disable all estimate instructions.
+
+@item div
+Enable the approximation for scalar division.
+
+@item vec-div
+Enable the approximation for vectorized division.
+
+@item sqrt
+Enable the approximation for scalar square root.
+
+@item vec-sqrt
+Enable the approximation for vectorized square root.
+
+@item rsqrt
+Enable the approximation for scalar reciprocal square root.
+
+@item vec-rsqrt
+Enable the approximation for vectorized reciprocal square root.
+@end table
+
+So, for example, @option{-mrecip=all,!sqrt} enables
+all of the reciprocal approximations, except for scalar square root.
+
@item loongarch-vect-unroll-limit
The vectorizer will use available tuning information to determine whether it
would be beneficial to unroll the main vectorized loop and by how much. This
new file mode 100644
@@ -0,0 +1,9 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -march=la664" } */
+/* { dg-final { scan-assembler "frecipe.s" } } */
+
+float
+foo(float a, float b)
+{
+ return a / b;
+}
new file mode 100644
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -march=la664" } */
+/* { dg-final { scan-assembler-times "frsqrte.s" 3 } } */
+
+extern float sqrtf (float);
+
+float
+foo1 (float a, float b)
+{
+ return a/sqrtf(b);
+}
+
+float
+foo2 (float a, float b)
+{
+ return sqrtf(a/b);
+}
+
+float
+foo3 (float a)
+{
+ return sqrtf(a);
+}
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlasx -march=la664" } */
+/* { dg-final { scan-assembler "xvfrecipe.s" } } */
+
+float a[8],b[8],c[8];
+
+void
+foo ()
+{
+ for (int i = 0; i < 8; i++)
+ c[i] = a[i] / b[i];
+}
new file mode 100644
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlasx -march=la664" } */
+/* { dg-final { scan-assembler-times "xvfrsqrte.s" 3 } } */
+
+float a[8], b[8], c[8];
+
+extern float sqrtf (float);
+
+void
+foo1 (void)
+{
+ for (int i = 0; i < 8; i++)
+ c[i] = a[i] / sqrtf (b[i]);
+}
+
+void
+foo2 (void)
+{
+ for (int i = 0; i < 8; i++)
+ c[i] = sqrtf (a[i] / b[i]);
+}
+
+void
+foo3 (void)
+{
+ for (int i = 0; i < 8; i++)
+ c[i] = sqrtf (a[i]);
+}
new file mode 100644
@@ -0,0 +1,12 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlsx -march=la664" } */
+/* { dg-final { scan-assembler "vfrecipe.s" } } */
+
+float a[4],b[4],c[4];
+
+void
+foo ()
+{
+ for (int i = 0; i < 4; i++)
+ c[i] = a[i] / b[i];
+}
new file mode 100644
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ffast-math -mrecip -mlsx -march=la664" } */
+/* { dg-final { scan-assembler-times "vfrsqrte.s" 3 } } */
+
+float a[4], b[4], c[4];
+
+extern float sqrtf (float);
+
+void
+foo1 (void)
+{
+ for (int i = 0; i < 4; i++)
+ c[i] = a[i] / sqrtf (b[i]);
+}
+
+void
+foo2 (void)
+{
+ for (int i = 0; i < 4; i++)
+ c[i] = sqrtf (a[i] / b[i]);
+}
+
+void
+foo3 (void)
+{
+ for (int i = 0; i < 4; i++)
+ c[i] = sqrtf (a[i]);
+}