Commit: AArch64: Simulate scalar NEG and vector MUL

Message ID 87io0kt9lt.fsf@redhat.com
State New, archived
Headers

Commit Message

Nick Clifton March 18, 2016, 9:32 a.m. UTC
  Hi Guys,

  I am slowly going through the gcc testsuite, fixing bugs that it
  exposes in the AArch64 simulator.  The attached patch adds support for
  a couple of missing instructions - namely scalar NEG and vector MUL,
  and adds some improvements to the tracing functions to help track down
  problems.  I also found that the floating point load and store
  functions were not working because of aliasing problems when casting
  between real and integer values.  So the patch also makes use of the
  FRegister union to ensure that the correct values are read from and
  written to memory.

Cheers
  Nick

sim/aarch64/ChangeLog
2016-03-18  Nick Clifton  <nickc@redhat.com>

	* cpustate.c: Remove spurious spaces from TRACE strings.
	Print hex equivalents of floats and doubles.
	Check element number against array size when accessing vector
	registers.
	* memory.c: Trace memory reads when --trace-memory is enabled.
	Remove float and double load and store functions.
	* memory.h (aarch64_get_mem_float): Delete prototype.
	(aarch64_get_mem_double): Likewise.
	(aarch64_set_mem_float): Likewise.
	(aarch64_set_mem_double): Likewise.
	* simulator.c (IS_SET): Always return either 0 or 1.
	(IS_CLEAR): Likewise.
	(fldrs_pcrel): Load and store floats using 32-bit memory accesses
	and doubles using 64-bit memory accesses.
	(fldrd_pcrel, fldrs_wb, fldrs_abs, fldrs_scale_ext): Likewise.
	(fldrd_wb, fldrd_abs, fsturs, fsturd, fldurs, fldurd): Likewise.
	(fstrs_abs, fstrs_wb, fstrs_scale_ext, fstrd_abs): Likewise.
	(fstrd_wb, fstrd_scale_ext, store_pair_float): Likewise.
	(store_pair_double, load_pair_float, load_pair_double): Likewise.
	(do_vec_MUL_by_element): New function.
	(do_vec_op2): Call do_vec_MUL_by_element.
	(do_scalar_NEG): New function.
	(do_double_add): Call do_scalar_NEG.
  

Comments

Mike Frysinger March 18, 2016, 1:27 p.m. UTC | #1
On 18 Mar 2016 09:32, Nick Clifton wrote:
> +#define GET_VEC_ELEMENT(REG, ELEMENT, FIELD)	   \

is using all caps for arg names a normal style thing ?

> +  do						   \
> +    {						   \
> +      if (element > ARRAY_SIZE (cpu->fr[0].FIELD)) \

shouldn't this be ELEMENT ?  and be >= ?

> +#define SET_VEC_ELEMENT(REG, ELEMENT, VAL, FIELD, PRINTER)	\
> +  do						   		\
> +    {								\
> +      if (ELEMENT > ARRAY_SIZE (cpu->fr[0].FIELD))			\

>= ?
-mike
  
Nick Clifton March 18, 2016, 2:26 p.m. UTC | #2
Hi Mike,

>> +#define GET_VEC_ELEMENT(REG, ELEMENT, FIELD)	   \
> is using all caps for arg names a normal style thing ?

For macros, yes.

For example, in gdb/gdbarch.h:

#define GDBARCH_OBSTACK_CALLOC(GDBARCH, NR, TYPE) ((TYPE *) gdbarch_obstack_zalloc ((GDBARCH), (NR) * sizeof (TYPE)))


>> +      if (element > ARRAY_SIZE (cpu->fr[0].FIELD)) \
> shouldn't this be ELEMENT ?  and be >= ?

Yes.

>> +      if (ELEMENT > ARRAY_SIZE (cpu->fr[0].FIELD))			\

Yup.  Patch coming.

Cheers
  Nick
  

Patch

diff --git a/sim/aarch64/cpustate.c b/sim/aarch64/cpustate.c
index 35a60cc..86b1b15 100644
--- a/sim/aarch64/cpustate.c
+++ b/sim/aarch64/cpustate.c
@@ -34,13 +34,13 @@  aarch64_set_reg_u64 (sim_cpu *cpu, GReg reg, int r31_is_sp, uint64_t val)
 {
   if (reg == R31 && ! r31_is_sp)
     {
-      TRACE_REGISTER (cpu, "  GR[31] NOT CHANGED!");
+      TRACE_REGISTER (cpu, "GR[31] NOT CHANGED!");
       return;
     }
 
   if (val != cpu->gr[reg].u64)
     TRACE_REGISTER (cpu,
-		    "  GR[%2d] changes from %16" PRIx64 " to %16" PRIx64,
+		    "GR[%2d] changes from %16" PRIx64 " to %16" PRIx64,
 		    reg, cpu->gr[reg].u64, val);
 
   cpu->gr[reg].u64 = val;
@@ -51,13 +51,13 @@  aarch64_set_reg_s64 (sim_cpu *cpu, GReg reg, int r31_is_sp, int64_t val)
 {
   if (reg == R31 && ! r31_is_sp)
     {
-      TRACE_REGISTER (cpu, "  GR[31] NOT CHANGED!");
+      TRACE_REGISTER (cpu, "GR[31] NOT CHANGED!");
       return;
     }
 
   if (val != cpu->gr[reg].s64)
     TRACE_REGISTER (cpu,
-		    "  GR[%2d] changes from %16" PRIx64 " to %16" PRIx64,
+		    "GR[%2d] changes from %16" PRIx64 " to %16" PRIx64,
 		    reg, cpu->gr[reg].s64, val);
 
   cpu->gr[reg].s64 = val;
@@ -128,7 +128,7 @@  aarch64_set_next_PC (sim_cpu *cpu, uint64_t next)
 {
   if (next != cpu->nextpc + 4)
     TRACE_REGISTER (cpu,
-		    "  NextPC changes from %16" PRIx64 " to %16" PRIx64,
+		    "NextPC changes from %16" PRIx64 " to %16" PRIx64,
 		    cpu->nextpc, next);
 
   cpu->nextpc = next;
@@ -139,7 +139,7 @@  aarch64_set_next_PC_by_offset (sim_cpu *cpu, int64_t offset)
 {
   if (cpu->pc + offset != cpu->nextpc + 4)
     TRACE_REGISTER (cpu,
-		    "  NextPC changes from %16" PRIx64 " to %16" PRIx64,
+		    "NextPC changes from %16" PRIx64 " to %16" PRIx64,
 		    cpu->nextpc, cpu->pc + offset);
 
   cpu->nextpc = cpu->pc + offset;
@@ -163,7 +163,7 @@  aarch64_save_LR (sim_cpu *cpu)
 {
   if (cpu->gr[LR].u64 != cpu->nextpc)
     TRACE_REGISTER (cpu,
-		    "  LR    changes from %16" PRIx64 " to %16" PRIx64,
+		    "LR    changes from %16" PRIx64 " to %16" PRIx64,
 		    cpu->gr[LR].u64, cpu->nextpc);
 
   cpu->gr[LR].u64 = cpu->nextpc;
@@ -209,11 +209,11 @@  aarch64_set_CPSR (sim_cpu *cpu, uint32_t new_flags)
     {
       if (cpu->CPSR != new_flags)
 	TRACE_REGISTER (cpu,
-			"  CPSR changes from %s to %s",
+			"CPSR changes from %s to %s",
 			decode_cpsr (cpu->CPSR), decode_cpsr (new_flags));
       else
 	TRACE_REGISTER (cpu,
-			"  CPSR stays at %s", decode_cpsr (cpu->CPSR));
+			"CPSR stays at %s", decode_cpsr (cpu->CPSR));
     }
 
   cpu->CPSR = new_flags & CPSR_ALL_FLAGS;
@@ -238,7 +238,7 @@  aarch64_set_CPSR_bits (sim_cpu *cpu, uint32_t mask, uint32_t value)
 
   if (old_flags != cpu->CPSR)
     TRACE_REGISTER (cpu,
-		    "  CPSR changes from %s to %s",
+		    "CPSR changes from %s to %s",
 		    decode_cpsr (old_flags), decode_cpsr (cpu->CPSR));
 }
 
@@ -259,7 +259,7 @@  aarch64_set_CPSR_bit (sim_cpu *cpu, FlagMask bit)
 
   if (old_flags != cpu->CPSR)
     TRACE_REGISTER (cpu,
-		    "  CPSR changes from %s to %s",
+		    "CPSR changes from %s to %s",
 		    decode_cpsr (old_flags), decode_cpsr (cpu->CPSR));
 }
 
@@ -273,7 +273,7 @@  aarch64_clear_CPSR_bit (sim_cpu *cpu, FlagMask bit)
 
   if (old_flags != cpu->CPSR)
     TRACE_REGISTER (cpu,
-		    "  CPSR changes from %s to %s",
+		    "CPSR changes from %s to %s",
 		    decode_cpsr (old_flags), decode_cpsr (cpu->CPSR));
 }
 
@@ -300,9 +300,14 @@  void
 aarch64_set_FP_float (sim_cpu *cpu, VReg reg, float val)
 {
   if (val != cpu->fr[reg].s)
-    TRACE_REGISTER (cpu,
-		    "  FR[%d] changes from %f to %f",
-		    reg, cpu->fr[reg].s, val);
+    {
+      FRegister v;
+
+      v.s = val;
+      TRACE_REGISTER (cpu,
+		      "FR[%d].s changes from %f to %f [hex: %0lx]",
+		      reg, cpu->fr[reg].s, val, v.v[0]);
+    }
 
   cpu->fr[reg].s = val;
 }
@@ -311,10 +316,14 @@  void
 aarch64_set_FP_double (sim_cpu *cpu, VReg reg, double val)
 {
   if (val != cpu->fr[reg].d)
-    TRACE_REGISTER (cpu,
-		    "  FR[%d] changes from %f to %f",
-		    reg, cpu->fr[reg].d, val);
+    {
+      FRegister v;
 
+      v.d = val;
+      TRACE_REGISTER (cpu,
+		      "FR[%d].d changes from %f to %f [hex: %0lx]",
+		      reg, cpu->fr[reg].d, val, v.v[0]);
+    }
   cpu->fr[reg].d = val;
 }
 
@@ -324,7 +333,7 @@  aarch64_set_FP_long_double (sim_cpu *cpu, VReg reg, FRegister a)
   if (cpu->fr[reg].v[0] != a.v[0]
       || cpu->fr[reg].v[1] != a.v[1])
     TRACE_REGISTER (cpu,
-		    "  FR[%d] changes from [%0lx %0lx] to [%lx %lx] ",
+		    "FR[%d].q changes from [%0lx %0lx] to [%0lx %0lx] ",
 		    reg,
 		    cpu->fr[reg].v[0], cpu->fr[reg].v[1],
 		    a.v[0], a.v[1]);
@@ -333,225 +342,202 @@  aarch64_set_FP_long_double (sim_cpu *cpu, VReg reg, FRegister a)
   cpu->fr[reg].v[1] = a.v[1];
 }
 
+/* Return REG.FIELD[ELEMENT] from the caller; halt if ELEMENT is invalid.  */
+#define GET_VEC_ELEMENT(REG, ELEMENT, FIELD)				\
+  do									\
+    {									\
+      if ((ELEMENT) >= ARRAY_SIZE (cpu->fr[0].FIELD))			\
+	{								\
+	  TRACE_REGISTER (cpu, "Internal SIM error: "			\
+			  "invalid element number: %d ", ELEMENT);	\
+	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu), \
+			   sim_stopped, SIM_SIGBUS);			\
+	}								\
+      return cpu->fr[REG].FIELD [ELEMENT];				\
+    }									\
+  while (0)
+
 uint64_t
 aarch64_get_vec_u64 (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  return cpu->fr[reg].v[element];
+  GET_VEC_ELEMENT (reg, element, v);
 }
 
 uint32_t
-aarch64_get_vec_u32 (sim_cpu *cpu, VReg regno, unsigned element)
+aarch64_get_vec_u32 (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  return cpu->fr[regno].w[element];
+  GET_VEC_ELEMENT (reg, element, w);
 }
 
 uint16_t
-aarch64_get_vec_u16 (sim_cpu *cpu, VReg regno, unsigned element)
+aarch64_get_vec_u16 (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  return cpu->fr[regno].h[element];
+  GET_VEC_ELEMENT (reg, element, h);
 }
 
 uint8_t
-aarch64_get_vec_u8 (sim_cpu *cpu, VReg regno, unsigned element)
+aarch64_get_vec_u8 (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  return cpu->fr[regno].b[element];
+  GET_VEC_ELEMENT (reg, element, b);
 }
 
-void
-aarch64_set_vec_u64 (sim_cpu *  cpu,
-		     VReg       regno,
-		     unsigned   element,
-		     uint64_t   value)
+int64_t
+aarch64_get_vec_s64 (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  if (value != cpu->fr[regno].v[element])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<long>[%d] changes from %16" PRIx64
-		    " to %16" PRIx64,
-		    regno, element, cpu->fr[regno].v[element], value);
-
-  cpu->fr[regno].v[element] = value;
+  GET_VEC_ELEMENT (reg, element, V);
 }
 
-void
-aarch64_set_vec_u32 (sim_cpu *  cpu,
-		     VReg       regno,
-		     unsigned   element,
-		     uint32_t   value)
+int32_t
+aarch64_get_vec_s32 (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  if (value != cpu->fr[regno].w[element])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<word>[%d] changes from %8x to %8x",
-		    regno, element, cpu->fr[regno].w[element], value);
-
-  cpu->fr[regno].w[element] = value;
+  GET_VEC_ELEMENT (reg, element, W);
 }
 
-void
-aarch64_set_vec_u16 (sim_cpu *  cpu,
-		     VReg       regno,
-		     unsigned   element,
-		     uint16_t   value)
+int16_t
+aarch64_get_vec_s16 (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  if (value != cpu->fr[regno].h[element])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<half>[%d] changes from %4x to %4x",
-		    regno, element, cpu->fr[regno].h[element], value);
-
-  cpu->fr[regno].h[element] = value;
+  GET_VEC_ELEMENT (reg, element, H);
 }
 
-void
-aarch64_set_vec_u8 (sim_cpu *cpu, VReg regno, unsigned element, uint8_t value)
+int8_t
+aarch64_get_vec_s8 (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  if (value != cpu->fr[regno].b[element])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<byte>[%d] changes from %x to %x",
-		    regno, element, cpu->fr[regno].b[element], value);
-
-  cpu->fr[regno].b[element] = value;
+  GET_VEC_ELEMENT (reg, element, B);
 }
 
-void
-aarch64_set_FPSR (sim_cpu *cpu, uint32_t value)
+float
+aarch64_get_vec_float (sim_cpu *cpu, VReg reg, unsigned element)
 {
-  if (cpu->FPSR != value)
-    TRACE_REGISTER (cpu,
-		    "  FPSR changes from %x to %x", cpu->FPSR, value);
-
-  cpu->FPSR = value & FPSR_ALL_FPSRS;
+  GET_VEC_ELEMENT (reg, element, S);
 }
 
-uint32_t
-aarch64_get_FPSR (sim_cpu *cpu)
-{
-  return cpu->FPSR;
-}
+double
+aarch64_get_vec_double (sim_cpu *cpu, VReg reg, unsigned element)
+{
+  GET_VEC_ELEMENT (reg, element, D);
+}
+
+
+/* Set REG.FIELD[ELEMENT] to VAL (traced); halt if ELEMENT is invalid.  */
+#define SET_VEC_ELEMENT(REG, ELEMENT, VAL, FIELD, PRINTER)		\
+  do									\
+    {									\
+      if ((ELEMENT) >= ARRAY_SIZE (cpu->fr[0].FIELD))			\
+	{								\
+	  TRACE_REGISTER (cpu, "Internal SIM error: "			\
+			  "invalid element number: %d ", ELEMENT);	\
+	  sim_engine_halt (CPU_STATE (cpu), cpu, NULL, aarch64_get_PC (cpu), \
+			   sim_stopped, SIM_SIGBUS);			\
+	}								\
+      if (VAL != cpu->fr[REG].FIELD [ELEMENT])				\
+	TRACE_REGISTER (cpu,						\
+			"VR[%2d]." #FIELD " [%d] changes from " PRINTER \
+			" to " PRINTER , REG,				\
+			ELEMENT, cpu->fr[REG].FIELD [ELEMENT], VAL);	\
+									\
+      cpu->fr[REG].FIELD [ELEMENT] = VAL;				\
+    }									\
+  while (0)
 
 void
-aarch64_set_FPSR_bits (sim_cpu *cpu, uint32_t mask, uint32_t value)
-{
-  uint32_t old_FPSR = cpu->FPSR;
-
-  mask &= FPSR_ALL_FPSRS;
-  cpu->FPSR &= ~mask;
-  cpu->FPSR |= (value & mask);
-
-  if (cpu->FPSR != old_FPSR)
-    TRACE_REGISTER (cpu,
-		    "  FPSR changes from %x to %x", old_FPSR, cpu->FPSR);
-}
-
-uint32_t
-aarch64_get_FPSR_bits (sim_cpu *cpu, uint32_t mask)
+aarch64_set_vec_u64 (sim_cpu * cpu, VReg reg, unsigned element, uint64_t val)
 {
-  mask &= FPSR_ALL_FPSRS;
-  return cpu->FPSR & mask;
+  SET_VEC_ELEMENT (reg, element, val, v, "%16lx");
 }
 
-int
-aarch64_test_FPSR_bit (sim_cpu *cpu, FPSRMask flag)
+void
+aarch64_set_vec_u32 (sim_cpu * cpu, VReg reg, unsigned element, uint32_t val)
 {
-  return cpu->FPSR & flag;
+  SET_VEC_ELEMENT (reg, element, val, w, "%8x");
 }
 
-float
-aarch64_get_vec_float (sim_cpu *cpu, VReg v, unsigned e)
+void
+aarch64_set_vec_u16 (sim_cpu * cpu, VReg reg, unsigned element, uint16_t val)
 {
-  return cpu->fr[v].S[e];
+  SET_VEC_ELEMENT (reg, element, val, h, "%4x");
 }
 
-double
-aarch64_get_vec_double (sim_cpu *cpu, VReg v, unsigned e)
+void
+aarch64_set_vec_u8 (sim_cpu * cpu, VReg reg, unsigned element, uint8_t val)
 {
-  return cpu->fr[v].D[e];
+  SET_VEC_ELEMENT (reg, element, val, b, "%x");
 }
 
 void
-aarch64_set_vec_float (sim_cpu *cpu, VReg v, unsigned e, float f)
+aarch64_set_vec_s64 (sim_cpu *cpu, VReg reg, unsigned element, int64_t val)
 {
-  if (f != cpu->fr[v].S[e])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<float>[%d] changes from %f to %f",
-		    v, e, cpu->fr[v].S[e], f);
-
-  cpu->fr[v].S[e] = f;
+  SET_VEC_ELEMENT (reg, element, val, V, "%16lx");
 }
 
 void
-aarch64_set_vec_double (sim_cpu *cpu, VReg v, unsigned e, double d)
+aarch64_set_vec_s32 (sim_cpu *cpu, VReg reg, unsigned element, int32_t val)
 {
-  if (d != cpu->fr[v].D[e])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<double>[%d] changes from %f to %f",
-		    v, e, cpu->fr[v].D[e], d);
-
-  cpu->fr[v].D[e] = d;
+  SET_VEC_ELEMENT (reg, element, val, W, "%8x");
 }
 
-int64_t
-aarch64_get_vec_s64 (sim_cpu *cpu, VReg regno, unsigned element)
+void
+aarch64_set_vec_s16 (sim_cpu *cpu, VReg reg, unsigned element, int16_t val)
 {
-  return cpu->fr[regno].V[element];
+  SET_VEC_ELEMENT (reg, element, val, H, "%4x");
 }
 
-int32_t
-aarch64_get_vec_s32 (sim_cpu *cpu, VReg regno, unsigned element)
+void
+aarch64_set_vec_s8 (sim_cpu *cpu, VReg reg, unsigned element, int8_t val)
 {
-  return cpu->fr[regno].W[element];
+  SET_VEC_ELEMENT (reg, element, val, B, "%x");
 }
 
-int16_t
-aarch64_get_vec_s16 (sim_cpu *cpu, VReg regno, unsigned element)
+void
+aarch64_set_vec_float (sim_cpu *cpu, VReg reg, unsigned element, float val)
 {
-  return cpu->fr[regno].H[element];
+  SET_VEC_ELEMENT (reg, element, val, S, "%f");
 }
 
-int8_t
-aarch64_get_vec_s8 (sim_cpu *cpu, VReg regno, unsigned element)
+void
+aarch64_set_vec_double (sim_cpu *cpu, VReg reg, unsigned element, double val)
 {
-  return cpu->fr[regno].B[element];
+  SET_VEC_ELEMENT (reg, element, val, D, "%f");
 }
 
 void
-aarch64_set_vec_s64 (sim_cpu *cpu, VReg regno, unsigned element, int64_t value)
+aarch64_set_FPSR (sim_cpu *cpu, uint32_t value)
 {
-  if (value != cpu->fr[regno].V[element])
+  if (cpu->FPSR != value)
     TRACE_REGISTER (cpu,
-		    "  VR[%2d].<long>[%d] changes from %16" PRIx64 " to %16" PRIx64,
-		    regno, element, cpu->fr[regno].V[element], value);
+		    "FPSR changes from %x to %x", cpu->FPSR, value);
 
-  cpu->fr[regno].V[element] = value;
+  cpu->FPSR = value & FPSR_ALL_FPSRS;
 }
 
-void
-aarch64_set_vec_s32 (sim_cpu *cpu, VReg regno, unsigned element, int32_t value)
+uint32_t
+aarch64_get_FPSR (sim_cpu *cpu)
 {
-  if (value != cpu->fr[regno].W[element])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<word>[%d] changes from %8x to %8x",
-		    regno, element, cpu->fr[regno].W[element], value);
-
-  cpu->fr[regno].W[element] = value;
+  return cpu->FPSR;
 }
 
 void
-aarch64_set_vec_s16 (sim_cpu *cpu, VReg regno, unsigned element, int16_t value)
+aarch64_set_FPSR_bits (sim_cpu *cpu, uint32_t mask, uint32_t value)
 {
-  if (value != cpu->fr[regno].H[element])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<half>[%d] changes from %4x to %4x",
-		    regno, element, cpu->fr[regno].H[element], value);
+  uint32_t old_FPSR = cpu->FPSR;
+
+  mask &= FPSR_ALL_FPSRS;
+  cpu->FPSR &= ~mask;
+  cpu->FPSR |= (value & mask);
 
-  cpu->fr[regno].H[element] = value;
+  if (cpu->FPSR != old_FPSR)
+    TRACE_REGISTER (cpu,
+		    "FPSR changes from %x to %x", old_FPSR, cpu->FPSR);
 }
 
-void
-aarch64_set_vec_s8 (sim_cpu *cpu, VReg regno, unsigned element, int8_t value)
+uint32_t
+aarch64_get_FPSR_bits (sim_cpu *cpu, uint32_t mask)
 {
-  if (value != cpu->fr[regno].B[element])
-    TRACE_REGISTER (cpu,
-		    "  VR[%2d].<byte>[%d] changes from %x to %x",
-		    regno, element, cpu->fr[regno].B[element], value);
+  mask &= FPSR_ALL_FPSRS;
+  return cpu->FPSR & mask;
+}
 
-  cpu->fr[regno].B[element] = value;
+int
+aarch64_test_FPSR_bit (sim_cpu *cpu, FPSRMask flag)
+{
+  return cpu->FPSR & flag;
 }
diff --git a/sim/aarch64/memory.c b/sim/aarch64/memory.c
index 2d9a797..bb02dda 100644
--- a/sim/aarch64/memory.c
+++ b/sim/aarch64/memory.c
@@ -46,7 +46,12 @@  mem_error (sim_cpu *cpu, const char *message, uint64_t addr)
   RETURN_TYPE								\
   aarch64_get_mem_##NAME (sim_cpu *cpu, uint64_t address)		\
   {									\
-    return (RETURN_TYPE) sim_core_read_##N (cpu, 0, read_map, address); \
+    RETURN_TYPE val = (RETURN_TYPE) sim_core_read_##N (cpu, 0, read_map, address); \
+    TRACE_MEMORY (cpu,							\
+		  "read of %" PRIx64 " (%d bytes) from %" PRIx64,	\
+		  (uint64_t) val, N, address);				\
+									\
+    return val;								\
   }
 
 /* A variant of the FETCH_FUNC macro that uses unaligned reads.
@@ -56,7 +61,12 @@  mem_error (sim_cpu *cpu, const char *message, uint64_t addr)
   RETURN_TYPE								\
   aarch64_get_mem_##NAME (sim_cpu *cpu, uint64_t address)		\
   {									\
-    return (RETURN_TYPE) sim_core_read_unaligned_8 (cpu, 0, read_map, address); \
+    RETURN_TYPE val = (RETURN_TYPE) sim_core_read_unaligned_8 (cpu, 0, read_map, address); \
+    TRACE_MEMORY (cpu,							\
+		  "read of %" PRIx64 " (%d bytes) from %" PRIx64 " (unaligned double)",	\
+		  (uint64_t) val, N, address);				\
+									\
+    return val;								\
   }
 
 FETCH_FUNC_U (uint64_t, uint64_t, u64)
@@ -67,8 +77,6 @@  FETCH_FUNC (uint32_t,   uint16_t, u16, 2)
 FETCH_FUNC (int32_t,     int16_t, s16, 2)
 FETCH_FUNC (uint32_t,    uint8_t, u8, 1)
 FETCH_FUNC (int32_t,      int8_t, s8, 1)
-FETCH_FUNC (float,         float, float, 4)
-FETCH_FUNC_U (double,     double, double)
 
 void
 aarch64_get_mem_long_double (sim_cpu *cpu, uint64_t address, FRegister *a)
@@ -110,8 +118,6 @@  STORE_FUNC (uint16_t,   u16, 2)
 STORE_FUNC (int16_t,    s16, 2)
 STORE_FUNC (uint8_t,    u8, 1)
 STORE_FUNC (int8_t,     s8, 1)
-STORE_FUNC (float,      float, 4)
-STORE_FUNC_U (double,   double)
 
 void
 aarch64_set_mem_long_double (sim_cpu *cpu, uint64_t address, FRegister a)
diff --git a/sim/aarch64/memory.h b/sim/aarch64/memory.h
index 64326f8..3f63973 100644
--- a/sim/aarch64/memory.h
+++ b/sim/aarch64/memory.h
@@ -26,10 +26,7 @@ 
 #include "bfd.h"
 #include "simulator.h"
 
-extern float        aarch64_get_mem_float (sim_cpu *, uint64_t);
-extern double       aarch64_get_mem_double (sim_cpu *, uint64_t);
 extern void         aarch64_get_mem_long_double (sim_cpu *, uint64_t, FRegister *);
-
 extern uint64_t     aarch64_get_mem_u64 (sim_cpu *, uint64_t);
 extern int64_t      aarch64_get_mem_s64 (sim_cpu *, uint64_t);
 extern uint32_t     aarch64_get_mem_u32 (sim_cpu *, uint64_t);
@@ -41,10 +38,7 @@  extern int32_t      aarch64_get_mem_s8  (sim_cpu *, uint64_t);
 extern void         aarch64_get_mem_blk (sim_cpu *, uint64_t, char *, unsigned);
 extern const char * aarch64_get_mem_ptr (sim_cpu *, uint64_t);
 
-extern void         aarch64_set_mem_float (sim_cpu *, uint64_t, float);
-extern void         aarch64_set_mem_double (sim_cpu *, uint64_t, double);
 extern void         aarch64_set_mem_long_double (sim_cpu *, uint64_t, FRegister);
-
 extern void         aarch64_set_mem_u64 (sim_cpu *, uint64_t, uint64_t);
 extern void         aarch64_set_mem_s64 (sim_cpu *, uint64_t, int64_t);
 extern void         aarch64_set_mem_u32 (sim_cpu *, uint64_t, uint32_t);
diff --git a/sim/aarch64/simulator.c b/sim/aarch64/simulator.c
index f5486f4..ae05019 100644
--- a/sim/aarch64/simulator.c
+++ b/sim/aarch64/simulator.c
@@ -36,8 +36,8 @@ 
 #define SP_OK 1
 
 #define TST(_flag)   (aarch64_test_CPSR_bit (cpu, _flag))
-#define IS_SET(_X)   ( TST (( _X )))
-#define IS_CLEAR(_X) (!TST (( _X )))
+#define IS_SET(_X)   (TST (( _X )) ? 1 : 0)
+#define IS_CLEAR(_X) (TST (( _X )) ? 0 : 1)
 
 #define HALT_UNALLOC							\
   do									\
@@ -460,9 +460,9 @@  fldrs_pcrel (sim_cpu *cpu, int32_t offset)
 {
   unsigned int rd = uimm (aarch64_get_instr (cpu), 4, 0);
 
-  aarch64_set_FP_float (cpu, rd,
-			aarch64_get_mem_float
-			(cpu, aarch64_get_PC (cpu) + offset * 4));
+  aarch64_set_vec_u32 (cpu, rd, 0,
+		       aarch64_get_mem_u32
+		       (cpu, aarch64_get_PC (cpu) + offset * 4));
 }
 
 /* double pc-relative load  */
@@ -471,9 +471,9 @@  fldrd_pcrel (sim_cpu *cpu, int32_t offset)
 {
   unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0);
 
-  aarch64_set_FP_double (cpu, st,
-			 aarch64_get_mem_double
-			 (cpu, aarch64_get_PC (cpu) + offset * 4));
+  aarch64_set_vec_u64 (cpu, st, 0,
+		       aarch64_get_mem_u64
+		       (cpu, aarch64_get_PC (cpu) + offset * 4));
 }
 
 /* long double pc-relative load.  */
@@ -545,7 +545,7 @@  fldrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   if (wb != Post)
     address += offset;
 
-  aarch64_set_FP_float (cpu, st, aarch64_get_mem_float (cpu, address));
+  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32 (cpu, address));
   if (wb == Post)
     address += offset;
 
@@ -560,10 +560,9 @@  fldrs_abs (sim_cpu *cpu, uint32_t offset)
   unsigned st = uimm (aarch64_get_instr (cpu), 4, 0);
   unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
 
-  aarch64_set_FP_float (cpu, st,
-			aarch64_get_mem_float
-			(cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
-			 + SCALE (offset, 32)));
+  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
+		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK)
+			+ SCALE (offset, 32)));
 }
 
 /* Load 32 bit scaled or unscaled zero- or sign-extended
@@ -578,9 +577,8 @@  fldrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
   int64_t extended = extend (aarch64_get_reg_u32 (cpu, rm, NO_SP), extension);
   uint64_t displacement = OPT_SCALE (extended, 32, scaling);
 
-  aarch64_set_FP_float (cpu, st,
-			aarch64_get_mem_float
-			(cpu, address + displacement));
+  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
+		       (cpu, address + displacement));
 }
 
 /* Load 64 bit unscaled signed 9 bit with pre- or post-writeback.  */
@@ -594,7 +592,7 @@  fldrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   if (wb != Post)
     address += offset;
 
-  aarch64_set_FP_double (cpu, st, aarch64_get_mem_double (cpu, address));
+  aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));
 
   if (wb == Post)
     address += offset;
@@ -611,7 +609,7 @@  fldrd_abs (sim_cpu *cpu, uint32_t offset)
   unsigned st = uimm (aarch64_get_instr (cpu), 4, 0);
   uint64_t address = aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64);
 
-  aarch64_set_FP_double (cpu, st, aarch64_get_mem_double (cpu, address));
+  aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64 (cpu, address));
 }
 
 /* Load 64 bit scaled or unscaled zero- or sign-extended 32-bit register offset.  */
@@ -2322,7 +2320,7 @@  static void
 sbc32 (sim_cpu *cpu)
 {
   unsigned rm = uimm (aarch64_get_instr (cpu), 20, 16);
-  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
+  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5); /* ngc iff rn == 31.  */
   unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
 
   aarch64_set_reg_u64 (cpu, rd, NO_SP,
@@ -5676,6 +5674,83 @@  do_vec_SSHR_USHR (sim_cpu *cpu)
 }
 
 static void
+do_vec_MUL_by_element (sim_cpu *cpu)
+{
+  /* MUL (vector, by element): Vd[e] = Vn[e] * Vm[index] for each lane e.
+     Only 16 and 32 bit lanes are handled; other sizes HALT_UNALLOC.
+     instr[31]    = 0
+     instr[30]    = half/full
+     instr[29,24] = 00 1111
+     instr[23,22] = size
+     instr[21]    = L
+     instr[20]    = M
+     instr[19,16] = m
+     instr[15,12] = 1000
+     instr[11]    = H
+     instr[10]    = 0
+     instr[9,5]   = Vn
+     instr[4,0]   = Vd  */
+
+  unsigned full     = uimm (aarch64_get_instr (cpu), 30, 30);
+  unsigned L        = uimm (aarch64_get_instr (cpu), 21, 21);
+  unsigned H        = uimm (aarch64_get_instr (cpu), 11, 11);
+  unsigned vn       = uimm (aarch64_get_instr (cpu), 9, 5);
+  unsigned vd       = uimm (aarch64_get_instr (cpu), 4, 0);
+  unsigned size     = uimm (aarch64_get_instr (cpu), 23, 22);
+  unsigned index, vm, e;
+
+  NYI_assert (29, 24, 0x0F);
+  NYI_assert (15, 12, 0x8);
+  NYI_assert (10, 10, 0);
+
+  switch (size)
+    {
+    case 1:
+      {
+	/* 16 bit products.  Multiply as uint32_t: uint16_t operands are
+	   promoted to int, where 0xffff * 0xffff would overflow.  */
+	uint16_t product;
+	uint16_t element1, element2;
+
+	index = (H << 2) | (L << 1) | uimm (aarch64_get_instr (cpu), 20, 20);
+	vm = uimm (aarch64_get_instr (cpu), 19, 16);
+	element2 = aarch64_get_vec_u16 (cpu, vm, index);
+
+	for (e = 0; e < (full ? 8 : 4); e ++)
+	  {
+	    element1 = aarch64_get_vec_u16 (cpu, vn, e);
+	    product  = (uint32_t) element1 * element2;
+	    aarch64_set_vec_u16 (cpu, vd, e, product);
+	  }
+      }
+      break;
+
+    case 2:
+      {
+	/* 32 bit products.  */
+	uint32_t product;
+	uint32_t element1;
+	uint32_t element2;
+
+	index = (H << 1) | L;
+	vm = uimm (aarch64_get_instr (cpu), 20, 16);
+	element2 = aarch64_get_vec_u32 (cpu, vm, index);
+
+	for (e = 0; e < (full ? 4 : 2); e ++)
+	  {
+	    element1 = aarch64_get_vec_u32 (cpu, vn, e);
+	    product  = element1 * element2;
+	    aarch64_set_vec_u32 (cpu, vd, e, product);
+	  }
+      }
+      break;
+
+    default:
+      HALT_UNALLOC;
+    }
+}
+
+static void
 do_vec_op2 (sim_cpu *cpu)
 {
   /* instr[31]    = 0
@@ -5685,19 +5760,30 @@  do_vec_op2 (sim_cpu *cpu)
      instr[22,16] = element size & index
      instr[15,10] = sub-opcode
      instr[9,5]   = Vm
-     instr[4.0]   = Vd  */
+     instr[4,0]   = Vd  */
 
   NYI_assert (29, 24, 0x0F);
 
   if (uimm (aarch64_get_instr (cpu), 23, 23) != 0)
-    HALT_NYI;
-
-  switch (uimm (aarch64_get_instr (cpu), 15, 10))
     {
-    case 0x01: do_vec_SSHR_USHR (cpu); return;
-    case 0x15: do_vec_SHL (cpu); return;
-    case 0x29: do_vec_xtl (cpu); return;
-    default:   HALT_NYI;
+      switch (uimm (aarch64_get_instr (cpu), 15, 10))
+	{
+	case 0x20:
+	case 0x22: do_vec_MUL_by_element (cpu); return;
+	default:   HALT_NYI;
+	}
+    }
+  else
+    {
+      switch (uimm (aarch64_get_instr (cpu), 15, 10))
+	{
+	case 0x01: do_vec_SSHR_USHR (cpu); return;
+	case 0x15: do_vec_SHL (cpu); return;
+	case 0x20:
+	case 0x22: do_vec_MUL_by_element (cpu); return;
+	case 0x29: do_vec_xtl (cpu); return;
+	default:   HALT_NYI;
+	}
     }
 }
 
@@ -6831,8 +6917,8 @@  fsturs (sim_cpu *cpu, int32_t offset)
   unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5);
   unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0);
 
-  aarch64_set_mem_float (cpu, aarch64_get_reg_u64 (cpu, st, 1) + offset,
-			 aarch64_get_FP_float (cpu, rn));
+  aarch64_set_mem_u32 (cpu, aarch64_get_reg_u64 (cpu, st, 1) + offset,
+		       aarch64_get_vec_u32 (cpu, rn, 0));
 }
 
 /* Store 64 bit unscaled signed 9 bit.  */
@@ -6842,8 +6928,8 @@  fsturd (sim_cpu *cpu, int32_t offset)
   unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5);
   unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0);
 
-  aarch64_set_mem_double (cpu, aarch64_get_reg_u64 (cpu, st, 1) + offset,
-			  aarch64_get_FP_double (cpu, rn));
+  aarch64_set_mem_u64 (cpu, aarch64_get_reg_u64 (cpu, st, 1) + offset,
+		       aarch64_get_vec_u64 (cpu, rn, 0));
 }
 
 /* Store 128 bit unscaled signed 9 bit.  */
@@ -6985,8 +7071,8 @@  fldurs (sim_cpu *cpu, int32_t offset)
   unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5);
   unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0);
 
-  aarch64_set_FP_float (cpu, st, aarch64_get_mem_float
-			(cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
+  aarch64_set_vec_u32 (cpu, st, 0, aarch64_get_mem_u32
+		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
 }
 
 /* Load 64 bit unscaled signed 9 bit.  */
@@ -6996,8 +7082,8 @@  fldurd (sim_cpu *cpu, int32_t offset)
   unsigned int rn = uimm (aarch64_get_instr (cpu), 9, 5);
   unsigned int st = uimm (aarch64_get_instr (cpu), 4, 0);
 
-  aarch64_set_FP_double (cpu, st, aarch64_get_mem_double
-			 (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
+  aarch64_set_vec_u64 (cpu, st, 0, aarch64_get_mem_u64
+		       (cpu, aarch64_get_reg_u64 (cpu, rn, SP_OK) + offset));
 }
 
 /* Load 128 bit unscaled signed 9 bit.  */
@@ -8190,6 +8276,25 @@  do_scalar_MOV (sim_cpu *cpu)
 }
 
 static void
+do_scalar_NEG (sim_cpu *cpu)
+{
+  /* Scalar NEG: write the negated 64-bit element 0 of Rn to Rd.
+     instr [31,24] = 0111 1110
+     instr [23,22] = 11
+     instr [21,10] = 1000 0010 1110
+     instr [9, 5]  = Rn,  instr [4, 0] = Rd.  */
+
+  unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
+  unsigned rd = uimm (aarch64_get_instr (cpu), 4, 0);
+
+  NYI_assert (31, 24, 0x7E);
+  NYI_assert (21, 10, 0x82E);
+  NYI_assert (23, 22, 3);
+
+  aarch64_set_vec_u64 (cpu, rd, 0, - aarch64_get_vec_u64 (cpu, rn, 0));
+}
+
+static void
 do_double_add (sim_cpu *cpu)
 {
   /* instr [28,25] = 1111.  */
@@ -8228,6 +8333,7 @@  do_double_add (sim_cpu *cpu)
 	case 0x35: do_scalar_FABD (cpu); return;
 	case 0x39: do_scalar_FCM (cpu); return;
 	case 0x3B: do_scalar_FCM (cpu); return;
+	case 0x2E: do_scalar_NEG (cpu); return;
 	default:
 	  HALT_NYI;
 	}
@@ -9464,10 +9570,10 @@  fstrs_abs (sim_cpu *cpu, uint32_t offset)
   unsigned st = uimm (aarch64_get_instr (cpu), 4, 0);
   unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
 
-  aarch64_set_mem_float
+  aarch64_set_mem_u32
     (cpu,
      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 32),
-     aarch64_get_FP_float (cpu, st));
+     aarch64_get_vec_u32 (cpu, st, 0));
 }
 
 /* 32 bit store unscaled signed 9 bit with pre- or post-writeback.  */
@@ -9482,7 +9588,7 @@  fstrs_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   if (wb != Post)
     address += offset;
 
-  aarch64_set_mem_float (cpu, address, aarch64_get_FP_float (cpu, st));
+  aarch64_set_mem_u32 (cpu, address, aarch64_get_vec_u32 (cpu, st, 0));
 
   if (wb == Post)
     address += offset;
@@ -9505,8 +9611,8 @@  fstrs_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 			       extension);
   uint64_t  displacement = OPT_SCALE (extended, 32, scaling);
 
-  aarch64_set_mem_float
-    (cpu, address + displacement, aarch64_get_FP_float (cpu, st));
+  aarch64_set_mem_u32
+    (cpu, address + displacement, aarch64_get_vec_u32 (cpu, st, 0));
 }
 
 /* 64 bit store scaled unsigned 12 bit.  */
@@ -9516,10 +9622,10 @@  fstrd_abs (sim_cpu *cpu, uint32_t offset)
   unsigned st = uimm (aarch64_get_instr (cpu), 4, 0);
   unsigned rn = uimm (aarch64_get_instr (cpu), 9, 5);
 
-  aarch64_set_mem_double
+  aarch64_set_mem_u64
     (cpu,
      aarch64_get_reg_u64 (cpu, rn, SP_OK) + SCALE (offset, 64),
-     aarch64_get_FP_double (cpu, st));
+     aarch64_get_vec_u64 (cpu, st, 0));
 }
 
 /* 64 bit store unscaled signed 9 bit with pre- or post-writeback.  */
@@ -9534,7 +9640,7 @@  fstrd_wb (sim_cpu *cpu, int32_t offset, WriteBack wb)
   if (wb != Post)
     address += offset;
 
-  aarch64_set_mem_double (cpu, address, aarch64_get_FP_double (cpu, st));
+  aarch64_set_mem_u64 (cpu, address, aarch64_get_vec_u64 (cpu, st, 0));
 
   if (wb == Post)
     address += offset;
@@ -9557,8 +9663,8 @@  fstrd_scale_ext (sim_cpu *cpu, Scaling scaling, Extension extension)
 			       extension);
   uint64_t  displacement = OPT_SCALE (extended, 64, scaling);
 
-  aarch64_set_mem_double
-    (cpu, address + displacement, aarch64_get_FP_double (cpu, st));
+  aarch64_set_mem_u64
+    (cpu, address + displacement, aarch64_get_vec_u64 (cpu, st, 0));
 }
 
 /* 128 bit store scaled unsigned 12 bit.  */
@@ -10093,8 +10199,8 @@  store_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
   if (wb != Post)
     address += offset;
 
-  aarch64_set_mem_float (cpu, address,     aarch64_get_FP_float (cpu, rm));
-  aarch64_set_mem_float (cpu, address + 4, aarch64_get_FP_float (cpu, rn));
+  aarch64_set_mem_u32 (cpu, address,     aarch64_get_vec_u32 (cpu, rm, 0));
+  aarch64_set_mem_u32 (cpu, address + 4, aarch64_get_vec_u32 (cpu, rn, 0));
 
   if (wb == Post)
     address += offset;
@@ -10116,8 +10222,8 @@  store_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
   if (wb != Post)
     address += offset;
 
-  aarch64_set_mem_double (cpu, address,     aarch64_get_FP_double (cpu, rm));
-  aarch64_set_mem_double (cpu, address + 8, aarch64_get_FP_double (cpu, rn));
+  aarch64_set_mem_u64 (cpu, address,     aarch64_get_vec_u64 (cpu, rm, 0));
+  aarch64_set_mem_u64 (cpu, address + 8, aarch64_get_vec_u64 (cpu, rn, 0));
 
   if (wb == Post)
     address += offset;
@@ -10168,8 +10274,8 @@  load_pair_float (sim_cpu *cpu, int32_t offset, WriteBack wb)
   if (wb != Post)
     address += offset;
 
-  aarch64_set_FP_float (cpu, rm, aarch64_get_mem_float (cpu, address));
-  aarch64_set_FP_float (cpu, rn, aarch64_get_mem_float (cpu, address + 4));
+  aarch64_set_vec_u32 (cpu, rm, 0, aarch64_get_mem_u32 (cpu, address));
+  aarch64_set_vec_u32 (cpu, rn, 0, aarch64_get_mem_u32 (cpu, address + 4));
 
   if (wb == Post)
     address += offset;
@@ -10194,8 +10300,8 @@  load_pair_double (sim_cpu *cpu, int32_t offset, WriteBack wb)
   if (wb != Post)
     address += offset;
 
-  aarch64_set_FP_double (cpu, rm, aarch64_get_mem_double (cpu, address));
-  aarch64_set_FP_double (cpu, rn, aarch64_get_mem_double (cpu, address + 8));
+  aarch64_set_vec_u64 (cpu, rm, 0, aarch64_get_mem_u64 (cpu, address));
+  aarch64_set_vec_u64 (cpu, rn, 0, aarch64_get_mem_u64 (cpu, address + 8));
 
   if (wb == Post)
     address += offset;