[01/11] Updates for microMIPS Release 6

Message ID 20250123134308.1785777-3-aleksandar.rakic@htecgroup.com (mailing list archive)
State New
Series Improve Mips target

Checks

Context                           Check    Description
redhat-pt-bot/TryBot-apply_patch  success  Patch applied to master at the time it was sent

Commit Message

Aleksandar Rakic Jan. 23, 2025, 1:42 p.m. UTC
* Remove noreorder (the reordering pattern is sketched below)
* Fix PC relative code label calculations for microMIPSR6
* Add special versions of code that would be de-optimised by removing
  noreorder
* Avoid use of un-aligned ADDIUPC instruction for address calculation.
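
As a minimal sketch of the delay-slot reordering pattern applied
throughout (illustrative label and registers, not code from the tree):
once .set noreorder is dropped, the instruction that used to sit in the
branch delay slot is hoisted above the branch, so no trailing nop is
needed and the same source assembles to the compact, delay-slot-free
branches of microMIPS R6.

	# Before: delay slot scheduled by hand under noreorder.
	.set	noreorder
	beq	$9, $0, L(skip)
	move	$2, $0		# executes in the branch delay slot
	.set	reorder

	# After: noreorder removed; the dependent move is placed before
	# the branch, so no delay-slot filling (or nop) is required and
	# R6 compact branches can be emitted unchanged.
	move	$2, $0
	beq	$9, $0, L(skip)

The un-aligned ADDIUPC point is handled by the auipc/addiu
%pcrel_hi/%pcrel_lo pairs visible in the memcpy.S and memset.S hunks,
since microMIPS R6 ADDIUPC is only valid from a 4-byte-aligned PC (see
the comment added in dl-machine.h).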

Cherry-picked 94a52199502361be4a5b1cc616661e287416cc8d
from https://github.com/MIPS/glibc

Signed-off-by: Matthew Fortune <matthew.fortune@imgtec.com>
Signed-off-by: Andrew Bennett <andrew.bennett@imgtec.com>
Signed-off-by: Faraz Shahbazker <fshahbazker@wavecomp.com>
Signed-off-by: Aleksandar Rakic <aleksandar.rakic@htecgroup.com>
---
 sysdeps/mips/add_n.S                         |  12 +-
 sysdeps/mips/addmul_1.S                      |  11 +-
 sysdeps/mips/dl-machine.h                    |  15 ++-
 sysdeps/mips/dl-trampoline.c                 |   4 -
 sysdeps/mips/lshift.S                        |  12 +-
 sysdeps/mips/machine-gmon.h                  |  82 +++++++++++++
 sysdeps/mips/memcpy.S                        | 120 +++++++++++--------
 sysdeps/mips/memset.S                        |  62 +++++-----
 sysdeps/mips/mips32/crtn.S                   |  12 +-
 sysdeps/mips/mips64/__longjmp.c              |   2 +-
 sysdeps/mips/mips64/add_n.S                  |  12 +-
 sysdeps/mips/mips64/addmul_1.S               |  11 +-
 sysdeps/mips/mips64/lshift.S                 |  12 +-
 sysdeps/mips/mips64/mul_1.S                  |  11 +-
 sysdeps/mips/mips64/n32/crtn.S               |  12 +-
 sysdeps/mips/mips64/n64/crtn.S               |  12 +-
 sysdeps/mips/mips64/rshift.S                 |  12 +-
 sysdeps/mips/mips64/sub_n.S                  |  12 +-
 sysdeps/mips/mips64/submul_1.S               |  11 +-
 sysdeps/mips/mul_1.S                         |  11 +-
 sysdeps/mips/rshift.S                        |  12 +-
 sysdeps/mips/sub_n.S                         |  12 +-
 sysdeps/mips/submul_1.S                      |  11 +-
 sysdeps/mips/sys/asm.h                       |  20 +---
 sysdeps/unix/mips/mips32/sysdep.h            |   4 -
 sysdeps/unix/mips/mips64/sysdep.h            |   4 -
 sysdeps/unix/mips/sysdep.h                   |   2 -
 sysdeps/unix/sysv/linux/mips/mips32/sysdep.h |  10 --
 sysdeps/unix/sysv/linux/mips/mips64/sysdep.h |  14 ---
 29 files changed, 260 insertions(+), 277 deletions(-)
  

Patch

diff --git a/sysdeps/mips/add_n.S b/sysdeps/mips/add_n.S
index 234e1e3c8d..f4d98fa38c 100644
--- a/sysdeps/mips/add_n.S
+++ b/sysdeps/mips/add_n.S
@@ -31,19 +31,16 @@  along with the GNU MP Library.  If not, see
 	.option pic2
 #endif
 ENTRY (__mpn_add_n)
-	.set	noreorder
 #ifdef __PIC__
 	.cpload t9
 #endif
-	.set	nomacro
-
 	lw	$10,0($5)
 	lw	$11,0($6)
 
 	addiu	$7,$7,-1
 	and	$9,$7,4-1	/* number of limbs in first loop */
-	beq	$9,$0,L(L0)	/* if multiple of 4 limbs, skip first loop */
 	move	$2,$0
+	beq	$9,$0,L(L0)	/* if multiple of 4 limbs, skip first loop */
 
 	subu	$7,$7,$9
 
@@ -61,11 +58,10 @@  L(Loop0):	addiu	$9,$9,-1
 	addiu	$6,$6,4
 	move	$10,$12
 	move	$11,$13
-	bne	$9,$0,L(Loop0)
 	addiu	$4,$4,4
+	bne	$9,$0,L(Loop0)
 
 L(L0):	beq	$7,$0,L(end)
-	nop
 
 L(Loop):	addiu	$7,$7,-4
 
@@ -108,14 +104,14 @@  L(Loop):	addiu	$7,$7,-4
 	addiu	$5,$5,16
 	addiu	$6,$6,16
 
-	bne	$7,$0,L(Loop)
 	addiu	$4,$4,16
+	bne	$7,$0,L(Loop)
 
 L(end):	addu	$11,$11,$2
 	sltu	$8,$11,$2
 	addu	$11,$10,$11
 	sltu	$2,$11,$10
 	sw	$11,0($4)
-	j	$31
 	or	$2,$2,$8
+	jr	$31
 END (__mpn_add_n)
diff --git a/sysdeps/mips/addmul_1.S b/sysdeps/mips/addmul_1.S
index 523478d7e8..eea26630fc 100644
--- a/sysdeps/mips/addmul_1.S
+++ b/sysdeps/mips/addmul_1.S
@@ -31,12 +31,9 @@  along with the GNU MP Library.  If not, see
 	.option pic2
 #endif
 ENTRY (__mpn_addmul_1)
-	.set    noreorder
 #ifdef __PIC__
 	.cpload t9
 #endif
-	.set    nomacro
-
 	/* warm up phase 0 */
 	lw	$8,0($5)
 
@@ -50,12 +47,12 @@  ENTRY (__mpn_addmul_1)
 #endif
 
 	addiu	$6,$6,-1
-	beq	$6,$0,L(LC0)
 	move	$2,$0		/* zero cy2 */
+	beq	$6,$0,L(LC0)
 
 	addiu	$6,$6,-1
-	beq	$6,$0,L(LC1)
 	lw	$8,0($5)	/* load new s1 limb as early as possible */
+	beq	$6,$0,L(LC1)
 
 L(Loop):	lw	$10,0($4)
 #if __mips_isa_rev < 6
@@ -81,8 +78,8 @@  L(Loop):	lw	$10,0($4)
 	addu	$2,$2,$10
 	sw	$3,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,L(Loop)	/* should be "bnel" */
 	addu	$2,$9,$2	/* add high product limb and carry from addition */
+	bne	$6,$0,L(Loop)	/* should be "bnel" */
 
 	/* cool down phase 1 */
 L(LC1):	lw	$10,0($4)
@@ -123,6 +120,6 @@  L(LC0):	lw	$10,0($4)
 	sltu	$10,$3,$10
 	addu	$2,$2,$10
 	sw	$3,0($4)
-	j	$31
 	addu	$2,$9,$2	/* add high product limb and carry from addition */
+	jr	$31
 	END (__mpn_addmul_1)
diff --git a/sysdeps/mips/dl-machine.h b/sysdeps/mips/dl-machine.h
index 10e30f1e90..a360dfcd63 100644
--- a/sysdeps/mips/dl-machine.h
+++ b/sysdeps/mips/dl-machine.h
@@ -127,16 +127,13 @@  elf_machine_load_address (void)
 {
   ElfW(Addr) addr;
 #ifndef __mips16
-  asm ("	.set noreorder\n"
-       "	" STRINGXP (PTR_LA) " %0, 0f\n"
+  asm ("	" STRINGXP (PTR_LA) " %0, 0f\n"
 # if !defined __mips_isa_rev || __mips_isa_rev < 6
        "	bltzal $0, 0f\n"
-       "	nop\n"
+#else
+       "	bal 0f\n"
+#endif
        "0:	" STRINGXP (PTR_SUBU) " %0, $31, %0\n"
-# else
-       "0:	addiupc $31, 0\n"
-       "	" STRINGXP (PTR_SUBU) " %0, $31, %0\n"
-# endif
        "	.set reorder\n"
        :	"=r" (addr)
        :	/* No inputs */
@@ -237,7 +234,9 @@  do {									\
       and not just plain _start.  */
 
 #ifndef __mips16
-# if !defined __mips_isa_rev || __mips_isa_rev < 6
+/* Although microMIPSr6 has an ADDIUPC instruction, it must be 4-byte aligned
+   for the address calculation to be valid.  */
+# if !defined __mips_isa_rev || __mips_isa_rev < 6 || defined __mips_micromips
 #  define LCOFF STRINGXP(.Lcof2)
 #  define LOAD_31 STRINGXP(bltzal $8) "," STRINGXP(.Lcof2)
 # else
diff --git a/sysdeps/mips/dl-trampoline.c b/sysdeps/mips/dl-trampoline.c
index 603ee2d2f8..915e1da6ad 100644
--- a/sysdeps/mips/dl-trampoline.c
+++ b/sysdeps/mips/dl-trampoline.c
@@ -301,7 +301,6 @@  asm ("\n\
 	.ent	_dl_runtime_resolve\n\
 _dl_runtime_resolve:\n\
 	.frame	$29, " STRINGXP(ELF_DL_FRAME_SIZE) ", $31\n\
-	.set noreorder\n\
 	# Save GP.\n\
 1:	move	$3, $28\n\
 	# Save arguments and sp value in stack.\n\
@@ -311,7 +310,6 @@  _dl_runtime_resolve:\n\
 	# Compute GP.\n\
 2:	" STRINGXP(SETUP_GP) "\n\
 	" STRINGXV(SETUP_GP64 (0, _dl_runtime_resolve)) "\n\
-	.set reorder\n\
 	# Save slot call pc.\n\
 	move	$2, $31\n\
 	" IFABIO32(STRINGXP(CPRESTORE(32))) "\n\
@@ -358,7 +356,6 @@  asm ("\n\
 	.ent	_dl_runtime_pltresolve\n\
 _dl_runtime_pltresolve:\n\
 	.frame	$29, " STRINGXP(ELF_DL_PLT_FRAME_SIZE) ", $31\n\
-	.set noreorder\n\
 	# Save arguments and sp value in stack.\n\
 1:	" STRINGXP(PTR_SUBIU) "	$29, " STRINGXP(ELF_DL_PLT_FRAME_SIZE) "\n\
 	" IFABIO32(STRINGXP(PTR_L) "	$13, " STRINGXP(PTRSIZE) "($28)") "\n\
@@ -368,7 +365,6 @@  _dl_runtime_pltresolve:\n\
 	# Compute GP.\n\
 2:	" STRINGXP(SETUP_GP) "\n\
 	" STRINGXV(SETUP_GP64 (0, _dl_runtime_pltresolve)) "\n\
-	.set reorder\n\
 	" IFABIO32(STRINGXP(CPRESTORE(32))) "\n\
 	" ELF_DL_PLT_SAVE_ARG_REGS "\
 	move	$4, $13\n\
diff --git a/sysdeps/mips/lshift.S b/sysdeps/mips/lshift.S
index 04caa76a84..c6c42aa1f5 100644
--- a/sysdeps/mips/lshift.S
+++ b/sysdeps/mips/lshift.S
@@ -30,12 +30,9 @@  along with the GNU MP Library.  If not, see
 	.option pic2
 #endif
 ENTRY (__mpn_lshift)
-	.set	noreorder
 #ifdef __PIC__
 	.cpload t9
 #endif
-	.set	nomacro
-
 	sll	$2,$6,2
 	addu	$5,$5,$2	/* make r5 point at end of src */
 	lw	$10,-4($5)	/* load first limb */
@@ -43,8 +40,8 @@  ENTRY (__mpn_lshift)
 	addu	$4,$4,$2	/* make r4 point at end of res */
 	addiu	$6,$6,-1
 	and	$9,$6,4-1	/* number of limbs in first loop */
-	beq	$9,$0,L(L0)	/* if multiple of 4 limbs, skip first loop */
 	srl	$2,$10,$13	/* compute function result */
+	beq	$9,$0,L(L0)	/* if multiple of 4 limbs, skip first loop */
 
 	subu	$6,$6,$9
 
@@ -56,11 +53,10 @@  L(Loop0):	lw	$3,-8($5)
 	srl	$12,$3,$13
 	move	$10,$3
 	or	$8,$11,$12
-	bne	$9,$0,L(Loop0)
 	sw	$8,0($4)
+	bne	$9,$0,L(Loop0)
 
 L(L0):	beq	$6,$0,L(Lend)
-	nop
 
 L(Loop):	lw	$3,-8($5)
 	addiu	$4,$4,-16
@@ -88,10 +84,10 @@  L(Loop):	lw	$3,-8($5)
 
 	addiu	$5,$5,-16
 	or	$8,$14,$9
-	bgtz	$6,L(Loop)
 	sw	$8,0($4)
+	bgtz	$6,L(Loop)
 
 L(Lend):	sll	$8,$10,$7
-	j	$31
 	sw	$8,-4($4)
+	jr	$31
 	END (__mpn_lshift)
diff --git a/sysdeps/mips/machine-gmon.h b/sysdeps/mips/machine-gmon.h
index e2e0756575..d890e5ec19 100644
--- a/sysdeps/mips/machine-gmon.h
+++ b/sysdeps/mips/machine-gmon.h
@@ -34,6 +34,42 @@  static void __attribute_used__ __mcount (u_long frompc, u_long selfpc)
 # define CPRESTORE
 #endif
 
+#if __mips_isa_rev > 5 && defined (__mips_micromips)
+#define MCOUNT asm(\
+	".globl _mcount;\n\t" \
+	".align 2;\n\t" \
+	".set push;\n\t" \
+	".set nomips16;\n\t" \
+	".type _mcount,@function;\n\t" \
+	".ent _mcount\n\t" \
+        "_mcount:\n\t" \
+        ".frame $sp,44,$31\n\t" \
+        ".set noat;\n\t" \
+        CPLOAD \
+	"subu $29,$29,48;\n\t" \
+	CPRESTORE \
+        "sw $4,24($29);\n\t" \
+        "sw $5,28($29);\n\t" \
+        "sw $6,32($29);\n\t" \
+        "sw $7,36($29);\n\t" \
+        "sw $2,40($29);\n\t" \
+        "sw $1,16($29);\n\t" \
+        "sw $31,20($29);\n\t" \
+        "move $5,$31;\n\t" \
+        "move $4,$1;\n\t" \
+        "balc __mcount;\n\t" \
+        "lw $4,24($29);\n\t" \
+        "lw $5,28($29);\n\t" \
+        "lw $6,32($29);\n\t" \
+        "lw $7,36($29);\n\t" \
+        "lw $2,40($29);\n\t" \
+        "lw $1,20($29);\n\t" \
+        "lw $31,16($29);\n\t" \
+        "addu $29,$29,56;\n\t" \
+        "jrc $1;\n\t" \
+	".end _mcount;\n\t" \
+	".set pop");
+#else
 #define MCOUNT asm(\
 	".globl _mcount;\n\t" \
 	".align 2;\n\t" \
@@ -71,6 +107,7 @@  static void __attribute_used__ __mcount (u_long frompc, u_long selfpc)
         "move $31,$1;\n\t" \
 	".end _mcount;\n\t" \
 	".set pop");
+#endif
 
 #else
 
@@ -97,6 +134,50 @@  static void __attribute_used__ __mcount (u_long frompc, u_long selfpc)
 # error "Unknown ABI"
 #endif
 
+#if __mips_isa_rev > 5 && defined (__mips_micromips)
+#define MCOUNT asm(\
+	".globl _mcount;\n\t" \
+	".align 3;\n\t" \
+	".set push;\n\t" \
+	".set nomips16;\n\t" \
+	".type _mcount,@function;\n\t" \
+	".ent _mcount\n\t" \
+        "_mcount:\n\t" \
+        ".frame $sp,88,$31\n\t" \
+        ".set noat;\n\t" \
+        PTR_SUBU_STRING " $29,$29,96;\n\t" \
+        CPSETUP \
+        "sd $4,24($29);\n\t" \
+        "sd $5,32($29);\n\t" \
+        "sd $6,40($29);\n\t" \
+        "sd $7,48($29);\n\t" \
+        "sd $8,56($29);\n\t" \
+        "sd $9,64($29);\n\t" \
+        "sd $10,72($29);\n\t" \
+        "sd $11,80($29);\n\t" \
+        "sd $2,16($29);\n\t" \
+        "sd $1,0($29);\n\t" \
+        "sd $31,8($29);\n\t" \
+        "move $5,$31;\n\t" \
+        "move $4,$1;\n\t" \
+        "balc __mcount;\n\t" \
+        "ld $4,24($29);\n\t" \
+        "ld $5,32($29);\n\t" \
+        "ld $6,40($29);\n\t" \
+        "ld $7,48($29);\n\t" \
+        "ld $8,56($29);\n\t" \
+        "ld $9,64($29);\n\t" \
+        "ld $10,72($29);\n\t" \
+        "ld $11,80($29);\n\t" \
+        "ld $2,16($29);\n\t" \
+        "ld $1,8($29);\n\t" \
+        "ld $31,0($29);\n\t" \
+        CPRETURN \
+        PTR_ADDU_STRING " $29,$29,96;\n\t" \
+        "jrc $1;\n\t" \
+	".end _mcount;\n\t" \
+	".set pop");
+#else
 #define MCOUNT asm(\
 	".globl _mcount;\n\t" \
 	".align 3;\n\t" \
@@ -142,5 +223,6 @@  static void __attribute_used__ __mcount (u_long frompc, u_long selfpc)
         "move $31,$1;\n\t" \
 	".end _mcount;\n\t" \
 	".set pop");
+#endif
 
 #endif
diff --git a/sysdeps/mips/memcpy.S b/sysdeps/mips/memcpy.S
index 5b277e07c5..96d1c92d89 100644
--- a/sysdeps/mips/memcpy.S
+++ b/sysdeps/mips/memcpy.S
@@ -86,6 +86,12 @@ 
 # endif
 #endif
 
+#if __mips_isa_rev > 5 && defined (__mips_micromips)
+# define PTR_BC	      bc16
+#else
+# define PTR_BC	      bc
+#endif
+
 /*
  * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
  * prefetches appear to offer a slight performance advantage.
@@ -272,7 +278,6 @@  LEAF(MEMCPY_NAME, 0)
 LEAF(MEMCPY_NAME)
 #endif
 	.set	nomips16
-	.set	noreorder
 /*
  * Below we handle the case where memcpy is called with overlapping src and dst.
  * Although memcpy is not required to handle this case, some parts of Android
@@ -284,10 +289,9 @@  LEAF(MEMCPY_NAME)
 	xor	t1,t0,t2
 	PTR_SUBU t0,t1,t2
 	sltu	t2,t0,a2
-	beq	t2,zero,L(memcpy)
 	la	t9,memmove
+	beq	t2,zero,L(memcpy)
 	jr	t9
-	 nop
 L(memcpy):
 #endif
 /*
@@ -295,12 +299,12 @@  L(memcpy):
  * size, copy dst pointer to v0 for the return value.
  */
 	slti	t2,a2,(2 * NSIZE)
-	bne	t2,zero,L(lasts)
 #if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
 	move	v0,zero
 #else
 	move	v0,a0
 #endif
+	bne	t2,zero,L(lasts)
 
 #ifndef R6_CODE
 
@@ -312,12 +316,12 @@  L(memcpy):
  */
 	xor	t8,a1,a0
 	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
-	bne	t8,zero,L(unaligned)
 	PTR_SUBU a3, zero, a0
+	bne	t8,zero,L(unaligned)
 
 	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
+	PTR_SUBU a2,a2,a3		/* a2 is the remining bytes count */
 	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
-	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */
 
 	C_LDHI	t8,0(a1)
 	PTR_ADDU a1,a1,a3
@@ -332,18 +336,24 @@  L(memcpy):
  * align instruction.
  */
 	andi	t8,a0,7
+#ifdef __mips_micromips
+	auipc	t9,%pcrel_hi(L(atable))
+	addiu	t9,t9,%pcrel_lo(L(atable)+4)
+	PTR_LSA	t9,t8,t9,1
+#else
 	lapc	t9,L(atable)
 	PTR_LSA	t9,t8,t9,2
+#endif
 	jrc	t9
 L(atable):
-	bc	L(lb0)
-	bc	L(lb7)
-	bc	L(lb6)
-	bc	L(lb5)
-	bc	L(lb4)
-	bc	L(lb3)
-	bc	L(lb2)
-	bc	L(lb1)
+	PTR_BC	L(lb0)
+	PTR_BC	L(lb7)
+	PTR_BC	L(lb6)
+	PTR_BC	L(lb5)
+	PTR_BC	L(lb4)
+	PTR_BC	L(lb3)
+	PTR_BC	L(lb2)
+	PTR_BC	L(lb1)
 L(lb7):
 	lb	a3, 6(a1)
 	sb	a3, 6(a0)
@@ -374,20 +384,26 @@  L(lb1):
 L(lb0):
 
 	andi	t8,a1,(NSIZE-1)
+#ifdef __mips_micromips
+	auipc	t9,%pcrel_hi(L(jtable))
+	addiu	t9,t9,%pcrel_lo(L(jtable)+4)
+	PTR_LSA	t9,t8,t9,1
+#else
 	lapc	t9,L(jtable)
 	PTR_LSA	t9,t8,t9,2
+#endif
 	jrc	t9
 L(jtable):
-        bc      L(aligned)
-        bc      L(r6_unaligned1)
-        bc      L(r6_unaligned2)
-        bc      L(r6_unaligned3)
-# ifdef USE_DOUBLE
-        bc      L(r6_unaligned4)
-        bc      L(r6_unaligned5)
-        bc      L(r6_unaligned6)
-        bc      L(r6_unaligned7)
-# endif
+	PTR_BC      L(aligned)
+	PTR_BC      L(r6_unaligned1)
+	PTR_BC      L(r6_unaligned2)
+	PTR_BC      L(r6_unaligned3)
+#ifdef USE_DOUBLE
+	PTR_BC      L(r6_unaligned4)
+	PTR_BC      L(r6_unaligned5)
+	PTR_BC      L(r6_unaligned6)
+	PTR_BC      L(r6_unaligned7)
+#endif
 #endif /* R6_CODE */
 
 L(aligned):
@@ -401,8 +417,8 @@  L(aligned):
  */
 
 	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
-	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
 	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
 	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
 
 /* When in the loop we may prefetch with the 'prepare to store' hint,
@@ -428,7 +444,6 @@  L(aligned):
 # if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE
 	sltu    v1,t9,a0
 	bgtz    v1,L(skip_set)
-	nop
 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
 L(skip_set):
 # else
@@ -444,11 +459,16 @@  L(skip_set):
 #endif
 L(loop16w):
 	C_LD	t0,UNIT(0)(a1)
+/* We need to separate out the C_LD instruction here so that it will work
+   both when it is used by itself and when it is used with the branch
+   instruction.  */
 #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
 	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
+	C_LD	t1,UNIT(1)(a1)
 	bgtz	v1,L(skip_pref)
-#endif
+#else
 	C_LD	t1,UNIT(1)(a1)
+#endif
 #ifdef R6_CODE
 	PREFETCH_FOR_STORE (2, a0)
 #else
@@ -502,8 +522,8 @@  L(skip_pref):
 	C_ST	REG6,UNIT(14)(a0)
 	C_ST	REG7,UNIT(15)(a0)
 	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
-	bne	a0,a3,L(loop16w)
 	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	bne	a0,a3,L(loop16w)
 	move	a2,t8
 
 /* Here we have src and dest word-aligned but less than 64-bytes or
@@ -517,7 +537,6 @@  L(chkw):
 	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
 				/* The t8 is the reminder count past 32-bytes */
 	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
-	nop
 	C_LD	t0,UNIT(0)(a1)
 	C_LD	t1,UNIT(1)(a1)
 	C_LD	REG2,UNIT(2)(a1)
@@ -546,8 +565,8 @@  L(chkw):
  */
 L(chk1w):
 	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
-	beq	a2,t8,L(lastw)
 	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	beq	a2,t8,L(lastw)
 	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
 
 /* copying in words (4-byte or 8-byte chunks) */
@@ -555,8 +574,8 @@  L(wordCopy_loop):
 	C_LD	REG3,UNIT(0)(a1)
 	PTR_ADDIU a0,a0,UNIT(1)
 	PTR_ADDIU a1,a1,UNIT(1)
-	bne	a0,a3,L(wordCopy_loop)
 	C_ST	REG3,UNIT(-1)(a0)
+	bne	a0,a3,L(wordCopy_loop)
 
 /* If we have been copying double words, see if we can copy a single word
    before doing byte copies.  We can have, at most, one word to copy.  */
@@ -574,17 +593,16 @@  L(lastw):
 
 /* Copy the last 8 (or 16) bytes */
 L(lastb):
-	blez	a2,L(leave)
 	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+	blez	a2,L(leave)
 L(lastbloop):
 	lb	v1,0(a1)
 	PTR_ADDIU a0,a0,1
 	PTR_ADDIU a1,a1,1
-	bne	a0,a3,L(lastbloop)
 	sb	v1,-1(a0)
+	bne	a0,a3,L(lastbloop)
 L(leave):
-	j	ra
-	nop
+	jr	ra
 
 /* We jump here with a memcpy of less than 8 or 16 bytes, depending on
    whether or not USE_DOUBLE is defined.  Instead of just doing byte
@@ -625,8 +643,8 @@  L(wcopy_loop):
 
 L(unaligned):
 	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
+	PTR_SUBU a2,a2,a3	/* a2 is the remining bytes count */
 	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
-	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */
 
 	C_LDHI	v1,UNIT(0)(a1)
 	C_LDLO	v1,UNITM1(1)(a1)
@@ -644,8 +662,8 @@  L(unaligned):
 
 L(ua_chk16w):
 	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
-	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
 	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
+	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
 	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
 
 # if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
@@ -664,7 +682,6 @@  L(ua_chk16w):
 #  if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
 	sltu    v1,t9,a0
 	bgtz    v1,L(ua_skip_set)
-	nop
 	PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4)
 L(ua_skip_set):
 #  else
@@ -676,11 +693,16 @@  L(ua_loop16w):
 	C_LDHI	t0,UNIT(0)(a1)
 	C_LDHI	t1,UNIT(1)(a1)
 	C_LDHI	REG2,UNIT(2)(a1)
+/* We need to separate out the C_LDHI instruction here so that it will work
+   both when it is used by itself and when it is used with the branch
+   instruction.  */
 # if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
 	sltu	v1,t9,a0
+	C_LDHI	REG3,UNIT(3)(a1)
 	bgtz	v1,L(ua_skip_pref)
-# endif
+# else
 	C_LDHI	REG3,UNIT(3)(a1)
+# endif
 	PREFETCH_FOR_STORE (4, a0)
 	PREFETCH_FOR_STORE (5, a0)
 L(ua_skip_pref):
@@ -731,8 +753,8 @@  L(ua_skip_pref):
 	C_ST	REG6,UNIT(14)(a0)
 	C_ST	REG7,UNIT(15)(a0)
 	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
-	bne	a0,a3,L(ua_loop16w)
 	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	bne	a0,a3,L(ua_loop16w)
 	move	a2,t8
 
 /* Here we have src and dest word-aligned but less than 64-bytes or
@@ -745,7 +767,6 @@  L(ua_chkw):
 	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
 				  /* t8 is the reminder count past 32-bytes */
 	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
-	nop
 	C_LDHI	t0,UNIT(0)(a1)
 	C_LDHI	t1,UNIT(1)(a1)
 	C_LDHI	REG2,UNIT(2)(a1)
@@ -778,8 +799,8 @@  L(ua_chkw):
  */
 L(ua_chk1w):
 	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
-	beq	a2,t8,L(ua_smallCopy)
 	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	beq	a2,t8,L(ua_smallCopy)
 	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
 
 /* copying in words (4-byte or 8-byte chunks) */
@@ -788,22 +809,21 @@  L(ua_wordCopy_loop):
 	C_LDLO	v1,UNITM1(1)(a1)
 	PTR_ADDIU a0,a0,UNIT(1)
 	PTR_ADDIU a1,a1,UNIT(1)
-	bne	a0,a3,L(ua_wordCopy_loop)
 	C_ST	v1,UNIT(-1)(a0)
+	bne	a0,a3,L(ua_wordCopy_loop)
 
 /* Copy the last 8 (or 16) bytes */
 L(ua_smallCopy):
-	beqz	a2,L(leave)
 	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+	beqz	a2,L(leave)
 L(ua_smallCopy_loop):
 	lb	v1,0(a1)
 	PTR_ADDIU a0,a0,1
 	PTR_ADDIU a1,a1,1
-	bne	a0,a3,L(ua_smallCopy_loop)
 	sb	v1,-1(a0)
+	bne	a0,a3,L(ua_smallCopy_loop)
 
-	j	ra
-	nop
+	jr	ra
 
 #else /* R6_CODE */
 
@@ -816,9 +836,9 @@  L(ua_smallCopy_loop):
 # endif
 # define R6_UNALIGNED_WORD_COPY(BYTEOFFSET) \
 	andi	REG7, a2, (NSIZE-1);/* REG7 is # of bytes to by bytes.     */ \
-	beq	REG7, a2, L(lastb); /* Check for bytes to copy by word	   */ \
 	PTR_SUBU a3, a2, REG7;	/* a3 is number of bytes to be copied in   */ \
 				/* (d)word chunks.			   */ \
+	beq	REG7, a2, L(lastb); /* Check for bytes to copy by word	   */ \
 	move	a2, REG7;	/* a2 is # of bytes to copy byte by byte   */ \
 				/* after word loop is finished.		   */ \
 	PTR_ADDU REG6, a0, a3;	/* REG6 is the dst address after loop.	   */ \
@@ -831,10 +851,9 @@  L(r6_ua_wordcopy##BYTEOFFSET):						      \
 	PTR_ADDIU a0, a0, UNIT(1);  /* Increment destination pointer.	   */ \
 	PTR_ADDIU REG2, REG2, UNIT(1); /* Increment aligned source pointer.*/ \
 	move	t0, t1;		/* Move second part of source to first.	   */ \
-	bne	a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET);			      \
 	C_ST	REG3, UNIT(-1)(a0);					      \
+	bne	a0, REG6,L(r6_ua_wordcopy##BYTEOFFSET);			      \
 	j	L(lastb);						      \
-	nop
 
 	/* We are generating R6 code, the destination is 4 byte aligned and
 	   the source is not 4 byte aligned. t8 is 1, 2, or 3 depending on the
@@ -859,7 +878,6 @@  L(r6_unaligned7):
 #endif /* R6_CODE */
 
 	.set	at
-	.set	reorder
 END(MEMCPY_NAME)
 #ifndef ANDROID_CHANGES
 # ifdef _LIBC
diff --git a/sysdeps/mips/memset.S b/sysdeps/mips/memset.S
index 466599b9f4..0c8375c9f5 100644
--- a/sysdeps/mips/memset.S
+++ b/sysdeps/mips/memset.S
@@ -82,6 +82,12 @@ 
 # endif
 #endif
 
+#if __mips_isa_rev > 5 && defined (__mips_micromips)
+# define PTR_BC	      bc16
+#else
+# define PTR_BC	      bc
+#endif
+
 /* Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
    or PREFETCH_STORE_STREAMED offers a large performance advantage
    but PREPAREFORSTORE has some special restrictions to consider.
@@ -205,17 +211,16 @@  LEAF(MEMSET_NAME)
 #endif
 
 	.set	nomips16
-	.set	noreorder
-/* If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
+/* If the size is less than 4*NSIZE (16 or 32), go to L(lastb).  Regardless of
    size, copy dst pointer to v0 for the return value.  */
-	slti	t2,a2,(2 * NSIZE)
-	bne	t2,zero,L(lastb)
+	slti	t2,a2,(4 * NSIZE)
 	move	v0,a0
+	bne	t2,zero,L(lastb)
 
 /* If memset value is not zero, we copy it to all the bytes in a 32 or 64
    bit word.  */
-	beq	a1,zero,L(set0)		/* If memset value is zero no smear  */
 	PTR_SUBU a3,zero,a0
+	beq	a1,zero,L(set0)		/* If memset value is zero no smear  */
 	nop
 
 	/* smear byte into 32 or 64 bit word */
@@ -251,26 +256,30 @@  LEAF(MEMSET_NAME)
 L(set0):
 #ifndef R6_CODE
 	andi	t2,a3,(NSIZE-1)		/* word-unaligned address?          */
-	beq	t2,zero,L(aligned)	/* t2 is the unalignment count      */
 	PTR_SUBU a2,a2,t2
+	beq	t2,zero,L(aligned)	/* t2 is the unalignment count      */
 	C_STHI	a1,0(a0)
 	PTR_ADDU a0,a0,t2
 #else /* R6_CODE */
-	andi	t2,a0,(NSIZE-1)
+	andi	t2,a0,7
+# ifdef __mips_micromips
+	auipc	t9,%pcrel_hi(L(atable))
+	addiu	t9,t9,%pcrel_lo(L(atable)+4)
+	PTR_LSA	t9,t2,t9,1
+# else
 	lapc	t9,L(atable)
 	PTR_LSA	t9,t2,t9,2
+# endif
 	jrc	t9
 L(atable):
-	bc	L(aligned)
-# ifdef USE_DOUBLE
-	bc	L(lb7)
-	bc	L(lb6)
-	bc	L(lb5)
-	bc	L(lb4)
-# endif
-	bc	L(lb3)
-	bc	L(lb2)
-	bc	L(lb1)
+	PTR_BC	L(aligned)
+	PTR_BC	L(lb7)
+	PTR_BC	L(lb6)
+	PTR_BC	L(lb5)
+	PTR_BC	L(lb4)
+	PTR_BC	L(lb3)
+	PTR_BC	L(lb2)
+	PTR_BC	L(lb1)
 L(lb7):
 	sb	a1,6(a0)
 L(lb6):
@@ -300,8 +309,8 @@  L(aligned):
    left to store or we would have jumped to L(lastb) earlier in the code.  */
 #ifdef DOUBLE_ALIGN
 	andi	t2,a3,4
-	beq	t2,zero,L(double_aligned)
 	PTR_SUBU a2,a2,t2
+	beq	t2,zero,L(double_aligned)
 	sw	a1,0(a0)
 	PTR_ADDU a0,a0,t2
 L(double_aligned):
@@ -313,8 +322,8 @@  L(double_aligned):
    chunks have been copied.  We will loop, incrementing a0 until it equals
    a3.  */
 	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
-	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
 	PTR_SUBU a3,a2,t8	 /* subtract from a2 the reminder */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
 	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
 
 /* When in the loop we may prefetch with the 'prepare to store' hint,
@@ -339,7 +348,6 @@  L(loop16w):
     && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
 	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
 	bgtz	v1,L(skip_pref)
-	nop
 #endif
 #ifdef R6_CODE
 	PREFETCH_FOR_STORE (2, a0)
@@ -366,7 +374,6 @@  L(skip_pref):
 	C_ST	a1,UNIT(15)(a0)
 	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
 	bne	a0,a3,L(loop16w)
-	nop
 	move	a2,t8
 
 /* Here we have dest word-aligned but less than 64-bytes or 128 bytes to go.
@@ -376,7 +383,6 @@  L(chkw):
 	andi	t8,a2,NSIZEMASK	/* is there a 32-byte/64-byte chunk.  */
 				/* the t8 is the reminder count past 32-bytes */
 	beq	a2,t8,L(chk1w)/* when a2==t8, no 32-byte chunk */
-	nop
 	C_ST	a1,UNIT(0)(a0)
 	C_ST	a1,UNIT(1)(a0)
 	C_ST	a1,UNIT(2)(a0)
@@ -394,30 +400,28 @@  L(chkw):
    been copied.  We will loop, incrementing a0 until a0 equals a3.  */
 L(chk1w):
 	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
-	beq	a2,t8,L(lastb)
 	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	beq	a2,t8,L(lastb)
 	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
 
 /* copying in words (4-byte or 8 byte chunks) */
 L(wordCopy_loop):
 	PTR_ADDIU a0,a0,UNIT(1)
-	bne	a0,a3,L(wordCopy_loop)
 	C_ST	a1,UNIT(-1)(a0)
+	bne	a0,a3,L(wordCopy_loop)
 
 /* Copy the last 8 (or 16) bytes */
 L(lastb):
-	blez	a2,L(leave)
 	PTR_ADDU a3,a0,a2       /* a3 is the last dst address */
+	blez	a2,L(leave)
 L(lastbloop):
 	PTR_ADDIU a0,a0,1
-	bne	a0,a3,L(lastbloop)
 	sb	a1,-1(a0)
+	bne	a0,a3,L(lastbloop)
 L(leave):
-	j	ra
-	nop
+	jr	ra
 
 	.set	at
-	.set	reorder
 END(MEMSET_NAME)
 #ifndef ANDROID_CHANGES
 # ifdef _LIBC
diff --git a/sysdeps/mips/mips32/crtn.S b/sysdeps/mips/mips32/crtn.S
index 89ecbd9882..568aabd86e 100644
--- a/sysdeps/mips/mips32/crtn.S
+++ b/sysdeps/mips/mips32/crtn.S
@@ -40,18 +40,10 @@ 
 
 	.section .init,"ax",@progbits
 	lw $31,28($sp)
-	.set noreorder
-	.set nomacro
-	j $31
 	addiu $sp,$sp,32
-	.set macro
-	.set reorder
+	jr $31
 
 	.section .fini,"ax",@progbits
 	lw $31,28($sp)
-	.set noreorder
-	.set nomacro
-	j $31
 	addiu $sp,$sp,32
-	.set macro
-	.set reorder
+	jr $31
diff --git a/sysdeps/mips/mips64/__longjmp.c b/sysdeps/mips/mips64/__longjmp.c
index 4a93e884c0..1a9bb7b23e 100644
--- a/sysdeps/mips/mips64/__longjmp.c
+++ b/sysdeps/mips/mips64/__longjmp.c
@@ -87,7 +87,7 @@  __longjmp (__jmp_buf env_arg, int val_arg)
   else
     asm volatile ("move $2, %0" : : "r" (val));
 
-  asm volatile ("j $31");
+  asm volatile ("jr $31");
 
   /* Avoid `volatile function does return' warnings.  */
   for (;;);
diff --git a/sysdeps/mips/mips64/add_n.S b/sysdeps/mips/mips64/add_n.S
index 345d62fbc5..bab523fd5a 100644
--- a/sysdeps/mips/mips64/add_n.S
+++ b/sysdeps/mips/mips64/add_n.S
@@ -37,16 +37,13 @@  ENTRY (__mpn_add_n)
 #ifdef __PIC__
 	SETUP_GP /* ??? unused */
 #endif
-	.set	noreorder
-	.set	nomacro
-
 	ld	$10,0($5)
 	ld	$11,0($6)
 
 	daddiu	$7,$7,-1
 	and	$9,$7,4-1	# number of limbs in first loop
-	beq	$9,$0,L(L0)	# if multiple of 4 limbs, skip first loop
 	move	$2,$0
+	beq	$9,$0,L(L0)	# if multiple of 4 limbs, skip first loop
 
 	dsubu	$7,$7,$9
 
@@ -64,11 +61,10 @@  L(Loop0):	daddiu	$9,$9,-1
 	daddiu	$6,$6,8
 	move	$10,$12
 	move	$11,$13
-	bne	$9,$0,L(Loop0)
 	daddiu	$4,$4,8
+	bne	$9,$0,L(Loop0)
 
 L(L0):	beq	$7,$0,L(Lend)
-	nop
 
 L(Loop):	daddiu	$7,$7,-4
 
@@ -111,15 +107,15 @@  L(Loop):	daddiu	$7,$7,-4
 	daddiu	$5,$5,32
 	daddiu	$6,$6,32
 
-	bne	$7,$0,L(Loop)
 	daddiu	$4,$4,32
+	bne	$7,$0,L(Loop)
 
 L(Lend):	daddu	$11,$11,$2
 	sltu	$8,$11,$2
 	daddu	$11,$10,$11
 	sltu	$2,$11,$10
 	sd	$11,0($4)
-	j	$31
 	or	$2,$2,$8
+	jr	$31
 
 END (__mpn_add_n)
diff --git a/sysdeps/mips/mips64/addmul_1.S b/sysdeps/mips/mips64/addmul_1.S
index d105938f00..d84edd76a0 100644
--- a/sysdeps/mips/mips64/addmul_1.S
+++ b/sysdeps/mips/mips64/addmul_1.S
@@ -36,9 +36,6 @@  ENTRY (__mpn_addmul_1)
 #ifdef PIC
 	SETUP_GP /* ??? unused */
 #endif
-	.set    noreorder
-	.set    nomacro
-
  # warm up phase 0
 	ld	$8,0($5)
 
@@ -52,12 +49,12 @@  ENTRY (__mpn_addmul_1)
 #endif
 
 	daddiu	$6,$6,-1
-	beq	$6,$0,L(LC0)
 	move	$2,$0		# zero cy2
+	beq	$6,$0,L(LC0)
 
 	daddiu	$6,$6,-1
-	beq	$6,$0,L(LC1)
 	ld	$8,0($5)	# load new s1 limb as early as possible
+	beq	$6,$0,L(LC1)
 
 L(Loop):	ld	$10,0($4)
 #if __mips_isa_rev < 6
@@ -83,8 +80,8 @@  L(Loop):	ld	$10,0($4)
 	daddu	$2,$2,$10
 	sd	$3,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,L(Loop)
 	daddu	$2,$9,$2	# add high product limb and carry from addition
+	bne	$6,$0,L(Loop)
 
  # cool down phase 1
 L(LC1):	ld	$10,0($4)
@@ -125,7 +122,7 @@  L(LC0):	ld	$10,0($4)
 	sltu	$10,$3,$10
 	daddu	$2,$2,$10
 	sd	$3,0($4)
-	j	$31
 	daddu	$2,$9,$2	# add high product limb and carry from addition
+	jr	$31
 
 END (__mpn_addmul_1)
diff --git a/sysdeps/mips/mips64/lshift.S b/sysdeps/mips/mips64/lshift.S
index 2ea2e58b85..ca84385998 100644
--- a/sysdeps/mips/mips64/lshift.S
+++ b/sysdeps/mips/mips64/lshift.S
@@ -36,9 +36,6 @@  ENTRY (__mpn_lshift)
 #ifdef __PIC__
 	SETUP_GP /* ??? unused */
 #endif
-	.set	noreorder
-	.set	nomacro
-
 	dsll	$2,$6,3
 	daddu	$5,$5,$2	# make r5 point at end of src
 	ld	$10,-8($5)	# load first limb
@@ -46,8 +43,8 @@  ENTRY (__mpn_lshift)
 	daddu	$4,$4,$2	# make r4 point at end of res
 	daddiu	$6,$6,-1
 	and	$9,$6,4-1	# number of limbs in first loop
-	beq	$9,$0,L(L0)	# if multiple of 4 limbs, skip first loop
 	dsrl	$2,$10,$13	# compute function result
+	beq	$9,$0,L(L0)	# if multiple of 4 limbs, skip first loop
 
 	dsubu	$6,$6,$9
 
@@ -59,11 +56,10 @@  L(Loop0):	ld	$3,-16($5)
 	dsrl	$12,$3,$13
 	move	$10,$3
 	or	$8,$11,$12
-	bne	$9,$0,L(Loop0)
 	sd	$8,0($4)
+	bne	$9,$0,L(Loop0)
 
 L(L0):	beq	$6,$0,L(Lend)
-	nop
 
 L(Loop):	ld	$3,-16($5)
 	daddiu	$4,$4,-32
@@ -91,10 +87,10 @@  L(Loop):	ld	$3,-16($5)
 
 	daddiu	$5,$5,-32
 	or	$8,$14,$9
-	bgtz	$6,L(Loop)
 	sd	$8,0($4)
+	bgtz	$6,L(Loop)
 
 L(Lend):	dsll	$8,$10,$7
-	j	$31
 	sd	$8,-8($4)
+	jr	$31
 END (__mpn_lshift)
diff --git a/sysdeps/mips/mips64/mul_1.S b/sysdeps/mips/mips64/mul_1.S
index 321789b345..7604bac3a2 100644
--- a/sysdeps/mips/mips64/mul_1.S
+++ b/sysdeps/mips/mips64/mul_1.S
@@ -37,9 +37,6 @@  ENTRY (__mpn_mul_1)
 #ifdef __PIC__
 	SETUP_GP /* ??? unused */
 #endif
-	.set    noreorder
-	.set    nomacro
-
  # warm up phase 0
 	ld	$8,0($5)
 
@@ -53,12 +50,12 @@  ENTRY (__mpn_mul_1)
 #endif
 
 	daddiu	$6,$6,-1
-	beq	$6,$0,L(LC0)
 	move	$2,$0		# zero cy2
+	beq	$6,$0,L(LC0)
 
 	daddiu	$6,$6,-1
-	beq	$6,$0,L(LC1)
 	ld	$8,0($5)	# load new s1 limb as early as possible
+	beq	$6,$0,L(LC1)
 
 #if __mips_isa_rev < 6
 L(Loop):	mflo	$10
@@ -80,8 +77,8 @@  L(Loop):	move	$10,$11
 	sltu	$2,$10,$2	# carry from previous addition -> $2
 	sd	$10,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,L(Loop)
 	daddu	$2,$9,$2	# add high product limb and carry from addition
+	bne	$6,$0,L(Loop)
 
  # cool down phase 1
 #if __mips_isa_rev < 6
@@ -114,7 +111,7 @@  L(LC0):	move	$10,$11
 	daddu	$10,$10,$2
 	sltu	$2,$10,$2
 	sd	$10,0($4)
-	j	$31
 	daddu	$2,$9,$2	# add high product limb and carry from addition
+	jr	$31
 
 END (__mpn_mul_1)
diff --git a/sysdeps/mips/mips64/n32/crtn.S b/sysdeps/mips/mips64/n32/crtn.S
index 633d79cfad..8d4c83381c 100644
--- a/sysdeps/mips/mips64/n32/crtn.S
+++ b/sysdeps/mips/mips64/n32/crtn.S
@@ -41,19 +41,11 @@ 
 	.section .init,"ax",@progbits
 	ld $31,8($sp)
 	ld $28,0($sp)
-	.set noreorder
-	.set nomacro
-	j $31
 	addiu $sp,$sp,16
-	.set macro
-	.set reorder
+	jr $31
 
 	.section .fini,"ax",@progbits
 	ld $31,8($sp)
 	ld $28,0($sp)
-	.set noreorder
-	.set nomacro
-	j $31
 	addiu $sp,$sp,16
-	.set macro
-	.set reorder
+	jr $31
diff --git a/sysdeps/mips/mips64/n64/crtn.S b/sysdeps/mips/mips64/n64/crtn.S
index 99ed1e3263..110040c9fc 100644
--- a/sysdeps/mips/mips64/n64/crtn.S
+++ b/sysdeps/mips/mips64/n64/crtn.S
@@ -41,19 +41,11 @@ 
 	.section .init,"ax",@progbits
 	ld $31,8($sp)
 	ld $28,0($sp)
-	.set noreorder
-	.set nomacro
-	j $31
 	daddiu $sp,$sp,16
-	.set macro
-	.set reorder
+	jr $31
 
 	.section .fini,"ax",@progbits
 	ld $31,8($sp)
 	ld $28,0($sp)
-	.set noreorder
-	.set nomacro
-	j $31
 	daddiu $sp,$sp,16
-	.set macro
-	.set reorder
+	jr $31
diff --git a/sysdeps/mips/mips64/rshift.S b/sysdeps/mips/mips64/rshift.S
index 1f6e3a2a12..153aacfd86 100644
--- a/sysdeps/mips/mips64/rshift.S
+++ b/sysdeps/mips/mips64/rshift.S
@@ -36,15 +36,12 @@  ENTRY (__mpn_rshift)
 #ifdef __PIC__
 	SETUP_GP /* ??? unused */
 #endif
-	.set	noreorder
-	.set	nomacro
-
 	ld	$10,0($5)	# load first limb
 	dsubu	$13,$0,$7
 	daddiu	$6,$6,-1
 	and	$9,$6,4-1	# number of limbs in first loop
-	beq	$9,$0,L(L0)	# if multiple of 4 limbs, skip first loop
 	dsll	$2,$10,$13	# compute function result
+	beq	$9,$0,L(L0)	# if multiple of 4 limbs, skip first loop
 
 	dsubu	$6,$6,$9
 
@@ -56,11 +53,10 @@  L(Loop0):	ld	$3,8($5)
 	dsll	$12,$3,$13
 	move	$10,$3
 	or	$8,$11,$12
-	bne	$9,$0,L(Loop0)
 	sd	$8,-8($4)
+	bne	$9,$0,L(Loop0)
 
 L(L0):	beq	$6,$0,L(Lend)
-	nop
 
 L(Loop):	ld	$3,8($5)
 	daddiu	$4,$4,32
@@ -88,10 +84,10 @@  L(Loop):	ld	$3,8($5)
 
 	daddiu	$5,$5,32
 	or	$8,$14,$9
-	bgtz	$6,L(Loop)
 	sd	$8,-8($4)
+	bgtz	$6,L(Loop)
 
 L(Lend):	dsrl	$8,$10,$7
-	j	$31
 	sd	$8,0($4)
+	jr	$31
 END (__mpn_rshift)
diff --git a/sysdeps/mips/mips64/sub_n.S b/sysdeps/mips/mips64/sub_n.S
index b83d5ccab6..5b7337472f 100644
--- a/sysdeps/mips/mips64/sub_n.S
+++ b/sysdeps/mips/mips64/sub_n.S
@@ -37,16 +37,13 @@  ENTRY (__mpn_sub_n)
 #ifdef __PIC__
 	SETUP_GP /* ??? unused */
 #endif
-	.set	noreorder
-	.set	nomacro
-
 	ld	$10,0($5)
 	ld	$11,0($6)
 
 	daddiu	$7,$7,-1
 	and	$9,$7,4-1	# number of limbs in first loop
-	beq	$9,$0,L(L0)	# if multiple of 4 limbs, skip first loop
 	move	$2,$0
+	beq	$9,$0,L(L0)	# if multiple of 4 limbs, skip first loop
 
 	dsubu	$7,$7,$9
 
@@ -64,11 +61,10 @@  L(Loop0):	daddiu	$9,$9,-1
 	daddiu	$6,$6,8
 	move	$10,$12
 	move	$11,$13
-	bne	$9,$0,L(Loop0)
 	daddiu	$4,$4,8
+	bne	$9,$0,L(Loop0)
 
 L(L0):	beq	$7,$0,L(Lend)
-	nop
 
 L(Loop):	daddiu	$7,$7,-4
 
@@ -111,15 +107,15 @@  L(Loop):	daddiu	$7,$7,-4
 	daddiu	$5,$5,32
 	daddiu	$6,$6,32
 
-	bne	$7,$0,L(Loop)
 	daddiu	$4,$4,32
+	bne	$7,$0,L(Loop)
 
 L(Lend):	daddu	$11,$11,$2
 	sltu	$8,$11,$2
 	dsubu	$11,$10,$11
 	sltu	$2,$10,$11
 	sd	$11,0($4)
-	j	$31
 	or	$2,$2,$8
+	jr	$31
 
 END (__mpn_sub_n)
diff --git a/sysdeps/mips/mips64/submul_1.S b/sysdeps/mips/mips64/submul_1.S
index 46f26e8dde..121433d232 100644
--- a/sysdeps/mips/mips64/submul_1.S
+++ b/sysdeps/mips/mips64/submul_1.S
@@ -37,9 +37,6 @@  ENTRY (__mpn_submul_1)
 #ifdef __PIC__
 	SETUP_GP /* ??? unused */
 #endif
-	.set    noreorder
-	.set    nomacro
-
  # warm up phase 0
 	ld	$8,0($5)
 
@@ -53,12 +50,12 @@  ENTRY (__mpn_submul_1)
 #endif
 
 	daddiu	$6,$6,-1
-	beq	$6,$0,L(LC0)
 	move	$2,$0		# zero cy2
+	beq	$6,$0,L(LC0)
 
 	daddiu	$6,$6,-1
-	beq	$6,$0,L(LC1)
 	ld	$8,0($5)	# load new s1 limb as early as possible
+	beq	$6,$0,L(LC1)
 
 L(Loop):	ld	$10,0($4)
 #if __mips_isa_rev < 6
@@ -84,8 +81,8 @@  L(Loop):	ld	$10,0($4)
 	daddu	$2,$2,$10
 	sd	$3,0($4)
 	daddiu	$4,$4,8
-	bne	$6,$0,L(Loop)
 	daddu	$2,$9,$2	# add high product limb and carry from addition
+	bne	$6,$0,L(Loop)
 
  # cool down phase 1
 L(LC1):	ld	$10,0($4)
@@ -126,7 +123,7 @@  L(LC0):	ld	$10,0($4)
 	sgtu	$10,$3,$10
 	daddu	$2,$2,$10
 	sd	$3,0($4)
-	j	$31
 	daddu	$2,$9,$2	# add high product limb and carry from addition
+	jr	$31
 
 END (__mpn_submul_1)
diff --git a/sysdeps/mips/mul_1.S b/sysdeps/mips/mul_1.S
index cfd4cc7cd5..ae65ebe79d 100644
--- a/sysdeps/mips/mul_1.S
+++ b/sysdeps/mips/mul_1.S
@@ -31,12 +31,9 @@  along with the GNU MP Library.  If not, see
 	.option pic2
 #endif
 ENTRY (__mpn_mul_1)
-	.set    noreorder
 #ifdef __PIC__
 	.cpload t9
 #endif
-	.set    nomacro
-
 	/* warm up phase 0 */
 	lw	$8,0($5)
 
@@ -50,12 +47,12 @@  ENTRY (__mpn_mul_1)
 #endif
 
 	addiu	$6,$6,-1
-	beq	$6,$0,L(LC0)
 	move	$2,$0		/* zero cy2 */
+	beq	$6,$0,L(LC0)
 
 	addiu	$6,$6,-1
-	beq	$6,$0,L(LC1)
 	lw	$8,0($5)	/* load new s1 limb as early as possible */
+	beq	$6,$0,L(LC1)
 
 
 #if  __mips_isa_rev < 6
@@ -78,8 +75,8 @@  L(Loop):	move	$10,$11
 	sltu	$2,$10,$2	/* carry from previous addition -> $2 */
 	sw	$10,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,L(Loop)	/* should be "bnel" */
 	addu	$2,$9,$2	/* add high product limb and carry from addition */
+	bne	$6,$0,L(Loop)	/* should be "bnel" */
 
 	/* cool down phase 1 */
 #if __mips_isa_rev < 6
@@ -112,6 +109,6 @@  L(LC0):	move	$10,$11
 	addu	$10,$10,$2
 	sltu	$2,$10,$2
 	sw	$10,0($4)
-	j	$31
 	addu	$2,$9,$2	/* add high product limb and carry from addition */
+	jr	$31
 	END (__mpn_mul_1)
diff --git a/sysdeps/mips/rshift.S b/sysdeps/mips/rshift.S
index e19fa41234..b453ca2ba7 100644
--- a/sysdeps/mips/rshift.S
+++ b/sysdeps/mips/rshift.S
@@ -30,18 +30,15 @@  along with the GNU MP Library.  If not, see
 	.option pic2
 #endif
 ENTRY (__mpn_rshift)
-	.set	noreorder
 #ifdef __PIC__
 	.cpload t9
 #endif
-	.set	nomacro
-
 	lw	$10,0($5)	/* load first limb */
 	subu	$13,$0,$7
 	addiu	$6,$6,-1
 	and	$9,$6,4-1	/* number of limbs in first loop */
+	sll	$2,$10,$13	/* compute function result */
 	beq	$9,$0,L(L0)	/* if multiple of 4 limbs, skip first loop*/
-	 sll	$2,$10,$13	/* compute function result */
 
 	subu	$6,$6,$9
 
@@ -53,11 +50,10 @@  L(Loop0):	lw	$3,4($5)
 	sll	$12,$3,$13
 	move	$10,$3
 	or	$8,$11,$12
+	sw	$8,-4($4)
 	bne	$9,$0,L(Loop0)
-	 sw	$8,-4($4)
 
 L(L0):	beq	$6,$0,L(Lend)
-	 nop
 
 L(Loop):	lw	$3,4($5)
 	addiu	$4,$4,16
@@ -85,10 +81,10 @@  L(Loop):	lw	$3,4($5)
 
 	addiu	$5,$5,16
 	or	$8,$14,$9
+	sw	$8,-4($4)
 	bgtz	$6,L(Loop)
-	 sw	$8,-4($4)
 
 L(Lend):	srl	$8,$10,$7
-	j	$31
 	sw	$8,0($4)
+	jr	$31
 	END (__mpn_rshift)
diff --git a/sysdeps/mips/sub_n.S b/sysdeps/mips/sub_n.S
index 3e988ecbb4..9f7cb5458d 100644
--- a/sysdeps/mips/sub_n.S
+++ b/sysdeps/mips/sub_n.S
@@ -31,19 +31,16 @@  along with the GNU MP Library.  If not, see
 	.option pic2
 #endif
 ENTRY (__mpn_sub_n)
-	.set	noreorder
 #ifdef __PIC__
 	.cpload t9
 #endif
-	.set	nomacro
-
 	lw	$10,0($5)
 	lw	$11,0($6)
 
 	addiu	$7,$7,-1
 	and	$9,$7,4-1	/* number of limbs in first loop */
-	beq	$9,$0,L(L0)	/* if multiple of 4 limbs, skip first loop */
 	move	$2,$0
+	beq	$9,$0,L(L0)	/* if multiple of 4 limbs, skip first loop */
 
 	subu	$7,$7,$9
 
@@ -61,11 +58,10 @@  L(Loop0):	addiu	$9,$9,-1
 	addiu	$6,$6,4
 	move	$10,$12
 	move	$11,$13
-	bne	$9,$0,L(Loop0)
 	addiu	$4,$4,4
+	bne	$9,$0,L(Loop0)
 
 L(L0):	beq	$7,$0,L(Lend)
-	nop
 
 L(Loop):	addiu	$7,$7,-4
 
@@ -108,14 +104,14 @@  L(Loop):	addiu	$7,$7,-4
 	addiu	$5,$5,16
 	addiu	$6,$6,16
 
-	bne	$7,$0,L(Loop)
 	addiu	$4,$4,16
+	bne	$7,$0,L(Loop)
 
 L(Lend):	addu	$11,$11,$2
 	sltu	$8,$11,$2
 	subu	$11,$10,$11
 	sltu	$2,$10,$11
 	sw	$11,0($4)
-	j	$31
 	or	$2,$2,$8
+	jr	$31
 	END (__mpn_sub_n)
diff --git a/sysdeps/mips/submul_1.S b/sysdeps/mips/submul_1.S
index be8e2844ef..8405801c57 100644
--- a/sysdeps/mips/submul_1.S
+++ b/sysdeps/mips/submul_1.S
@@ -31,12 +31,9 @@  along with the GNU MP Library.  If not, see
 	.option pic2
 #endif
 ENTRY (__mpn_submul_1)
-	.set    noreorder
 #ifdef __PIC__
 	.cpload t9
 #endif
-	.set    nomacro
-
 	/* warm up phase 0 */
 	lw	$8,0($5)
 
@@ -50,12 +47,12 @@  ENTRY (__mpn_submul_1)
 #endif
 
 	addiu	$6,$6,-1
-	beq	$6,$0,L(LC0)
 	move	$2,$0		/* zero cy2 */
+	beq	$6,$0,L(LC0)
 
 	addiu	$6,$6,-1
-	beq	$6,$0,L(LC1)
 	lw	$8,0($5)	/* load new s1 limb as early as possible */
+	beq	$6,$0,L(LC1)
 
 L(Loop):	lw	$10,0($4)
 #if __mips_isa_rev < 6
@@ -81,8 +78,8 @@  L(Loop):	lw	$10,0($4)
 	addu	$2,$2,$10
 	sw	$3,0($4)
 	addiu	$4,$4,4
-	bne	$6,$0,L(Loop)	/* should be "bnel" */
 	addu	$2,$9,$2	/* add high product limb and carry from addition */
+	bne	$6,$0,L(Loop)	/* should be "bnel" */
 
 	/* cool down phase 1 */
 L(LC1):	lw	$10,0($4)
@@ -123,6 +120,6 @@  L(LC0):	lw	$10,0($4)
 	sgtu	$10,$3,$10
 	addu	$2,$2,$10
 	sw	$3,0($4)
-	j	$31
 	addu	$2,$9,$2	/* add high product limb and carry from addition */
+	jr	$31
 	END (__mpn_submul_1)
diff --git a/sysdeps/mips/sys/asm.h b/sysdeps/mips/sys/asm.h
index e43eb39ca3..62f9e549c6 100644
--- a/sysdeps/mips/sys/asm.h
+++ b/sysdeps/mips/sys/asm.h
@@ -71,23 +71,21 @@ 
 		.set reorder
 /* Set gp when not at 1st instruction */
 # define SETUP_GPX(r)					\
-		.set noreorder;				\
 		move r, $31;	 /* Save old ra.  */	\
 		bal 10f; /* Find addr of cpload.  */	\
-		nop;					\
 10:							\
+		.set noreorder;				\
 		.cpload $31;				\
-		move $31, r;				\
-		.set reorder
+		.set reorder;				\
+		move $31, r;
 # define SETUP_GPX_L(r, l)				\
-		.set noreorder;				\
 		move r, $31;	 /* Save old ra.  */	\
 		bal l;   /* Find addr of cpload.  */	\
-		nop;					\
 l:							\
+		.set noreorder;				\
 		.cpload $31;				\
-		move $31, r;				\
-		.set reorder
+		.set reorder;				\
+		move $31, r;
 # define SAVE_GP(x) \
 		.cprestore x /* Save gp trigger t9/jalr conversion.	 */
 # define SETUP_GP64(a, b)
@@ -108,20 +106,14 @@  l:							\
 		.cpsetup $25, gpoffset, proc
 # define SETUP_GPX64(cp_reg, ra_save)			\
 		move ra_save, $31; /* Save old ra.  */	\
-		.set noreorder;				\
 		bal 10f; /* Find addr of .cpsetup.  */	\
-		nop;					\
 10:							\
-		.set reorder;				\
 		.cpsetup $31, cp_reg, 10b;		\
 		move $31, ra_save
 # define SETUP_GPX64_L(cp_reg, ra_save, l)  \
 		move ra_save, $31; /* Save old ra.  */	\
-		.set noreorder;				\
 		bal l;   /* Find addr of .cpsetup.  */	\
-		nop;					\
 l:							\
-		.set reorder;				\
 		.cpsetup $31, cp_reg, l;		\
 		move $31, ra_save
 # define RESTORE_GP64 \
diff --git a/sysdeps/unix/mips/mips32/sysdep.h b/sysdeps/unix/mips/mips32/sysdep.h
index c515b94540..df3f73a4eb 100644
--- a/sysdeps/unix/mips/mips32/sysdep.h
+++ b/sysdeps/unix/mips/mips32/sysdep.h
@@ -38,18 +38,14 @@ 
 L(syse1):
 #else
 #define PSEUDO(name, syscall_name, args) \
-  .set noreorder;							      \
   .set nomips16;							      \
   .align 2;								      \
   cfi_startproc;							      \
   99: j __syscall_error;						      \
-  nop;									      \
   cfi_endproc;								      \
   ENTRY(name)								      \
-  .set noreorder;							      \
   li v0, SYS_ify(syscall_name);						      \
   syscall;								      \
-  .set reorder;								      \
   bne a3, zero, 99b;							      \
 L(syse1):
 #endif
diff --git a/sysdeps/unix/mips/mips64/sysdep.h b/sysdeps/unix/mips/mips64/sysdep.h
index 6565b84e3a..c0772002e6 100644
--- a/sysdeps/unix/mips/mips64/sysdep.h
+++ b/sysdeps/unix/mips/mips64/sysdep.h
@@ -45,18 +45,14 @@ 
 L(syse1):
 #else
 #define PSEUDO(name, syscall_name, args) \
-  .set noreorder;							      \
   .align 2;								      \
   .set nomips16;							      \
   cfi_startproc;							      \
   99: j __syscall_error;						      \
-  nop;                                                                        \
   cfi_endproc;								      \
   ENTRY(name)								      \
-  .set noreorder;							      \
   li v0, SYS_ify(syscall_name);						      \
   syscall;								      \
-  .set reorder;								      \
   bne a3, zero, 99b;							      \
 L(syse1):
 #endif
diff --git a/sysdeps/unix/mips/sysdep.h b/sysdeps/unix/mips/sysdep.h
index d1e0460260..07cd5c4a06 100644
--- a/sysdeps/unix/mips/sysdep.h
+++ b/sysdeps/unix/mips/sysdep.h
@@ -48,7 +48,6 @@ 
   .align 2;						\
   ENTRY(name)						\
   .set nomips16;					\
-  .set noreorder;					\
   li v0, SYS_ify(syscall_name);				\
   syscall
 
@@ -61,7 +60,6 @@ 
   .align 2;						\
   ENTRY(name)						\
   .set nomips16;					\
-  .set noreorder;					\
   li v0, SYS_ify(syscall_name);				\
   syscall
 
diff --git a/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h b/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
index 47a1b97351..647a66ee1f 100644
--- a/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
+++ b/sysdeps/unix/sysv/linux/mips/mips32/sysdep.h
@@ -140,10 +140,8 @@  union __mips_syscall_return
 	register long int __v0 asm ("$2");				\
 	register long int __a3 asm ("$7");				\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set reorder"							\
 	: "=r" (__v0), "=r" (__a3)					\
 	: input								\
 	: __SYSCALL_CLOBBERS);						\
@@ -164,10 +162,8 @@  union __mips_syscall_return
 	register long int __a0 asm ("$4") = _arg1;			\
 	register long int __a3 asm ("$7");				\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set reorder"							\
 	: "=r" (__v0), "=r" (__a3)					\
 	: input, "r" (__a0)						\
 	: __SYSCALL_CLOBBERS);						\
@@ -190,10 +186,8 @@  union __mips_syscall_return
 	register long int __a1 asm ("$5") = _arg2;			\
 	register long int __a3 asm ("$7");				\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set\treorder"							\
 	: "=r" (__v0), "=r" (__a3)					\
 	: input, "r" (__a0), "r" (__a1)					\
 	: __SYSCALL_CLOBBERS);						\
@@ -219,10 +213,8 @@  union __mips_syscall_return
 	register long int __a2 asm ("$6") = _arg3;			\
 	register long int __a3 asm ("$7");				\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set\treorder"							\
 	: "=r" (__v0), "=r" (__a3)					\
 	: input, "r" (__a0), "r" (__a1), "r" (__a2)			\
 	: __SYSCALL_CLOBBERS);						\
@@ -249,10 +241,8 @@  union __mips_syscall_return
 	register long int __a2 asm ("$6") = _arg3;			\
 	register long int __a3 asm ("$7") = _arg4;			\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set\treorder"							\
 	: "=r" (__v0), "+r" (__a3)					\
 	: input, "r" (__a0), "r" (__a1), "r" (__a2)			\
 	: __SYSCALL_CLOBBERS);						\
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/sysdep.h b/sysdeps/unix/sysv/linux/mips/mips64/sysdep.h
index 0438bed23d..8f4787352a 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/sysdep.h
+++ b/sysdeps/unix/sysv/linux/mips/mips64/sysdep.h
@@ -95,10 +95,8 @@ 
 	register __syscall_arg_t __v0 asm ("$2");			\
 	register __syscall_arg_t __a3 asm ("$7");			\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set reorder"							\
 	: "=r" (__v0), "=r" (__a3)					\
 	: input								\
 	: __SYSCALL_CLOBBERS);						\
@@ -119,10 +117,8 @@ 
 	register __syscall_arg_t __a0 asm ("$4") = _arg1;		\
 	register __syscall_arg_t __a3 asm ("$7");			\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set reorder"							\
 	: "=r" (__v0), "=r" (__a3)					\
 	: input, "r" (__a0)						\
 	: __SYSCALL_CLOBBERS);						\
@@ -145,10 +141,8 @@ 
 	register __syscall_arg_t __a1 asm ("$5") = _arg2;		\
 	register __syscall_arg_t __a3 asm ("$7");			\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set\treorder"							\
 	: "=r" (__v0), "=r" (__a3)					\
 	: input, "r" (__a0), "r" (__a1)					\
 	: __SYSCALL_CLOBBERS);						\
@@ -173,10 +167,8 @@ 
 	register __syscall_arg_t __a2 asm ("$6") = _arg3;		\
 	register __syscall_arg_t __a3 asm ("$7");			\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set\treorder"							\
 	: "=r" (__v0), "=r" (__a3)					\
 	: input, "r" (__a0), "r" (__a1), "r" (__a2)			\
 	: __SYSCALL_CLOBBERS);						\
@@ -203,10 +195,8 @@ 
 	register __syscall_arg_t __a2 asm ("$6") = _arg3;		\
 	register __syscall_arg_t __a3 asm ("$7") = _arg4;		\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set\treorder"							\
 	: "=r" (__v0), "+r" (__a3)					\
 	: input, "r" (__a0), "r" (__a1), "r" (__a2)			\
 	: __SYSCALL_CLOBBERS);						\
@@ -235,10 +225,8 @@ 
 	register __syscall_arg_t __a3 asm ("$7") = _arg4;		\
 	register __syscall_arg_t __a4 asm ("$8") = _arg5;		\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set\treorder"							\
 	: "=r" (__v0), "+r" (__a3)					\
 	: input, "r" (__a0), "r" (__a1), "r" (__a2), "r" (__a4)		\
 	: __SYSCALL_CLOBBERS);						\
@@ -269,10 +257,8 @@ 
 	register __syscall_arg_t __a4 asm ("$8") = _arg5;		\
 	register __syscall_arg_t __a5 asm ("$9") = _arg6;		\
 	__asm__ volatile (						\
-	".set\tnoreorder\n\t"						\
 	v0_init								\
 	"syscall\n\t"							\
-	".set\treorder"							\
 	: "=r" (__v0), "+r" (__a3)					\
 	: input, "r" (__a0), "r" (__a1), "r" (__a2), "r" (__a4),	\
 	  "r" (__a5)							\