From 2b67f2568375ad0f7bf065fc3bc69166ebed0717 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Sun, 20 Jun 2021 20:24:09 -0400
Subject: [PATCH] all rv64f instructions except convert, divide, square root,
 and FLD pass

---
 .../config/rv64icfd/wally-config.vh           |  2 +-
 wally-pipelined/src/fpu/FMA/tbgen/tb.sv       | 12 +--
 wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh |  2 +-
 wally-pipelined/src/fpu/fctrl.sv              |  5 +-
 wally-pipelined/src/fpu/fma1.sv               | 40 ++++-----
 wally-pipelined/src/fpu/fma2.sv               | 89 ++++++++++---------
 wally-pipelined/src/fpu/fpu.sv                | 34 ++++---
 wally-pipelined/src/fpu/fpuclassify.sv        |  6 +-
 wally-pipelined/src/fpu/fpucmp2.sv            |  9 +-
 .../testbench/testbench-imperas.sv            | 35 ++++----
 10 files changed, 125 insertions(+), 109 deletions(-)

diff --git a/wally-pipelined/config/rv64icfd/wally-config.vh b/wally-pipelined/config/rv64icfd/wally-config.vh
index 1a7df3c4..e5ccc0bf 100644
--- a/wally-pipelined/config/rv64icfd/wally-config.vh
+++ b/wally-pipelined/config/rv64icfd/wally-config.vh
@@ -34,7 +34,7 @@
 `define XLEN 64
 
 // MISA RISC-V configuration per specification
-`define MISA (32'h00000104 | 0 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
+`define MISA (32'h00000104 | 1 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
 `define ZCSR_SUPPORTED 1
 `define COUNTERS 32
 `define ZCOUNTERS_SUPPORTED 1
diff --git a/wally-pipelined/src/fpu/FMA/tbgen/tb.sv b/wally-pipelined/src/fpu/FMA/tbgen/tb.sv
index 9a91bfdd..48dc16da 100644
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.sv
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.sv
@@ -45,7 +45,7 @@ assign FOpCtrlE = 3'b0;
 // down - 010
 // up - 011
 // nearest max mag - 100  
-assign FrmE = 3'b000;
+assign FrmE = 3'b010;
 assign FmtE = 1'b1;
 
 
@@ -55,8 +55,8 @@ assign	ynan = FmtE ? &FInput2E[62:52] && |FInput2E[51:0] : &FInput2E[62:55] && |
 assign	znan = FmtE ? &FInput3E[62:52] && |FInput3E[51:0] : &FInput3E[62:55] && |FInput3E[54:32]; 
 assign	ansnan = FmtE ? &ans[62:52] && |ans[51:0] : &ans[62:55] && |ans[54:32]; 
  // instantiate device under test
-fma1 UUT1(.*);
-fma2 UUT2(.FInput1M(FInput1E), .FInput2M(FInput2E), .FInput3M(FInput3E), .FrmM(FrmE), .ProdManM(ProdManE),
+fma1 UUT1(.X(FInput1E), .Y(FInput2E), .Z(FInput3E), .*);
+fma2 UUT2(.X(FInput1E), .Y(FInput2E), .Z(FInput3E), .FrmM(FrmE), .ProdManM(ProdManE),
 			.AlignedAddendM(AlignedAddendE), .ProdExpM(ProdExpE), .AddendStickyM(AddendStickyE),.KillProdM(KillProdE), .FOpCtrlM(FOpCtrlE),
 			.XZeroM(XZeroE),.YZeroM(YZeroE),.ZZeroM(ZZeroE),.XInfM(XInfE),.YInfM(YInfE),.ZInfM(ZInfE),.XNaNM(XNaNE),.YNaNM(YNaNE),.ZNaNM(ZNaNE), .FmtM(FmtE), .*);
 
@@ -110,7 +110,7 @@ always @(posedge clk)
 		if(ans >= 64'h7FF8000000000000 && ans <= 64'h7FFfffffffffffff ) $display( "ans=qutNaN ");
 		if(ans >= 64'hFFF8000000000000 && ans <= 64'hFFFfffffffffffff ) $display( "ans=qutNaN ");
         errors = errors + 1;
-	  if (errors == 40)
+	 // if (errors == 40)
 		$stop;
     end
     if((FmtE==1'b0)&(FmaFlagsM != flags[4:0] || (!wnan && (FmaResultM != ans)) || (wnan && ansnan && ~(((xnan && (FmaResultM[62:0] == {FInput1E[62:55],1'b1,FInput1E[53:0]})) || (ynan && (FmaResultM[62:0] == {FInput2E[62:55],1'b1,FInput2E[53:0]}))  || (znan && (FmaResultM[62:0] == {FInput3E[62:55],1'b1,FInput3E[53:0]})) || (FmaResultM[62:0] == ans[62:0]))) ))) begin
@@ -131,7 +131,7 @@ always @(posedge clk)
 		if(&ans[62:55] && |ans[54:32] && ~ans[54] ) $display( "ans=sigNaN ");
 		if(&ans[62:55] && |ans[54:32] && ans[54]) $display( "ans=qutNaN ");
         errors = errors + 1;
-	  if (errors == 10)
+	  //if (errors == 10)
 		$stop;
     end
  vectornum = vectornum + 1;
@@ -140,4 +140,4 @@ always @(posedge clk)
  $stop;
  end
  end
-endmodule
\ No newline at end of file
+endmodule
diff --git a/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh b/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
index 0741e9d6..a8dd70b8 100755
--- a/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
@@ -1,3 +1,3 @@
-testfloat_gen f64_mulAdd -tininessafter -n 6133248 -rnear_even  -seed 113355 -level 1 > testFloat
+testfloat_gen f64_mulAdd -tininessbefore -n 6133248 -rmin  -seed 113355 -level 1 > testFloat
 tr -d ' ' < testFloat > testFloatNoSpace
 
diff --git a/wally-pipelined/src/fpu/fctrl.sv b/wally-pipelined/src/fpu/fctrl.sv
index 94143b87..5749d0db 100755
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@@ -19,8 +19,6 @@ module fctrl (
 
 
   logic IllegalFPUInstr1D, IllegalFPUInstr2D;
-  //precision is taken directly from instruction
-  assign FmtD = Funct7D[0];
   // *** fix rounding for dynamic rounding
   assign FrmD = &Funct3D ? FRM_REGW : Funct3D;
 
@@ -211,6 +209,9 @@ module fctrl (
     endcase
   end
 
+  //precision
+  assign FmtD = (~&FResultSelD & Funct7D[0]) | (&FResultSelD & FOpCtrlD[0]);
+
   assign IllegalFPUInstrD = IllegalFPUInstr1D | IllegalFPUInstr2D;
   //write to integer source if conv to int occurs
   //AND of Funct7 for int results 
diff --git a/wally-pipelined/src/fpu/fma1.sv b/wally-pipelined/src/fpu/fma1.sv
index dd2cc585..ab9d2bb1 100644
--- a/wally-pipelined/src/fpu/fma1.sv
+++ b/wally-pipelined/src/fpu/fma1.sv
@@ -1,8 +1,8 @@
 module fma1(
  
-	input logic 	[63:0]		FInput1E,	// X
-	input logic		[63:0]		FInput2E,	// Y
-	input logic 	[63:0]		FInput3E,	// Z
+	input logic 	[63:0]		X,	// X
+	input logic		[63:0]		Y,	// Y
+	input logic 	[63:0]		Z,	// Z
 	input logic 	[2:0]		FOpCtrlE,	// 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
 	input logic 				FmtE,		// precision 1 = double 0 = single
 	output logic 	[105:0]		ProdManE,	// 1.X frac * 1.Y frac
@@ -21,8 +21,8 @@ module fma1(
 	logic [12:0]	AlignCnt;			// how far to shift the addend to align with the product
 	logic [211:0] 	ZManShifted;				// output of the alignment shifter including sticky bit
 	logic [211:0] 	ZManPreShifted;		// input to the alignment shifter
-	logic			XDenormE, YDenormE, ZDenormE;	// inputs are denormal
-	logic [63:0]	FInput3E2;	// value to add (Z or zero)
+	logic			XDenorm, YDenorm, ZDenorm;	// inputs are denormal
+	logic [63:0]	Addend;	// value to add (Z or zero)
 	logic [12:0]	Bias;	// 1023 for double, 127 for single
 	logic 			XExpZero, YExpZero, ZExpZero; 	// input exponent zero
 	logic 			XFracZero, YFracZero, ZFracZero; // input fraction zero
@@ -34,19 +34,19 @@ module fma1(
 	///////////////////////////////////////////////////////////////////////////////
 
 	// Set addend to zero if FMUL instruction
-  	assign FInput3E2 = FOpCtrlE[2] ? 64'b0 : FInput3E;
+  	assign Addend = FOpCtrlE[2] ? 64'b0 : Z;
 
-	assign XSgn = FInput1E[63];
-	assign YSgn = FInput2E[63];
-	assign ZSgn = FInput3E2[63];
+	assign XSgn = X[63];
+	assign YSgn = Y[63];
+	assign ZSgn = Addend[63];
 
-	assign XExp = FmtE ? {2'b0, FInput1E[62:52]} : {5'b0, FInput1E[62:55]};
-	assign YExp = FmtE ? {2'b0, FInput2E[62:52]} : {5'b0, FInput2E[62:55]};
-	assign ZExp = FmtE ? {2'b0, FInput3E2[62:52]} : {5'b0, FInput3E2[62:55]};
+	assign XExp = FmtE ? {2'b0, X[62:52]} : {5'b0, X[62:55]};
+	assign YExp = FmtE ? {2'b0, Y[62:52]} : {5'b0, Y[62:55]};
+	assign ZExp = FmtE ? {2'b0, Addend[62:52]} : {5'b0, Addend[62:55]};
 
-	assign XFrac = FmtE ? FInput1E[51:0] : {FInput1E[54:32], 29'b0};
-	assign YFrac = FmtE ? FInput2E[51:0] : {FInput2E[54:32], 29'b0};
-	assign ZFrac = FmtE ? FInput3E2[51:0] : {FInput3E2[54:32], 29'b0};
+	assign XFrac = FmtE ? X[51:0] : {X[54:32], 29'b0};
+	assign YFrac = FmtE ? Y[51:0] : {Y[54:32], 29'b0};
+	assign ZFrac = FmtE ? Addend[51:0] : {Addend[54:32], 29'b0};
 	
 	assign XMan = {~XExpZero, XFrac};
 	assign YMan = {~YExpZero, YFrac};
@@ -76,9 +76,9 @@ module fma1(
 	assign YNaNE = YExpMax & ~YFracZero;
 	assign ZNaNE = ZExpMax & ~ZFracZero;
 
-	assign XDenormE = XExpZero & ~XFracZero; 
-	assign YDenormE = YExpZero & ~YFracZero; 
-	assign ZDenormE = ZExpZero & ~ZFracZero; 
+	assign XDenorm = XExpZero & ~XFracZero; 
+	assign YDenorm = YExpZero & ~YFracZero; 
+	assign ZDenorm = ZExpZero & ~ZFracZero; 
 
 	assign XInfE = XExpMax & XFracZero; 
 	assign YInfE = YExpMax & YFracZero; 
@@ -101,7 +101,7 @@ module fma1(
 	
 	// verilator lint_off WIDTH
 	assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 : 
-				 XExp + YExp - Bias + XDenormE + YDenormE;
+				 XExp + YExp - Bias + XDenorm + YDenorm;
 
 	// Calculate the product's mantissa
 	//		- Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
@@ -124,7 +124,7 @@ module fma1(
 	//		- positive means the product is larger, so shift Z right
 	//		- Denormal numbers have an an exponent value of 1, however they are 
 	//		  represented with an exponent of 0. add one to the exponent if it is a denormal number
-	assign AlignCnt = ProdExpE - ZExp - ZDenormE;
+	assign AlignCnt = ProdExpE - ZExp - ZDenorm;
 	// verilator lint_on WIDTH
 
 
diff --git a/wally-pipelined/src/fpu/fma2.sv b/wally-pipelined/src/fpu/fma2.sv
index 8d12431a..89a059dc 100644
--- a/wally-pipelined/src/fpu/fma2.sv
+++ b/wally-pipelined/src/fpu/fma2.sv
@@ -1,8 +1,8 @@
 module fma2(
  
-	input logic 	[63:0]		FInput1M,	// X
-	input logic		[63:0]		FInput2M,	// Y
-	input logic 	[63:0]		FInput3M,	// Z
+	input logic 	[63:0]		X,	// X
+	input logic		[63:0]		Y,	// Y
+	input logic 	[63:0]		Z,	// Z
 	input logic 	[2:0] 		FrmM,		// rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
 	input logic 	[2:0]		FOpCtrlM,	// 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
 	input logic 				FmtM,		// precision 1 = double 0 = single
@@ -32,7 +32,7 @@ module fma2(
 	logic [12:0]	SumExp;		// exponent of the normalized sum
 	logic [12:0]	SumExpTmp;	// exponent of the normalized sum not taking into account denormal or zero results
 	logic [12:0]	SumExpTmpMinus1;	// SumExpTmp-1
-	logic [12:0]	ResultExpTmp;		// ResultExp with bits to determine sign and overflow
+	logic [12:0]	FullResultExp;		// ResultExp with bits to determine sign and overflow
 	logic [53:0]	NormSum;	// normalized sum
 	logic [161:0]	SumShifted; // sum shifted for normalization
 	logic [8:0]		NormCnt;	// output of the leading zero detector
@@ -42,17 +42,18 @@ module fma2(
 	logic 			InvZ;		// invert Z if there is a subtraction (-product + Z or product - Z)
 	logic			ResultDenorm;	// is the result denormalized
 	logic			Sticky;		// Sticky bit
-	logic 			Plus1, Minus1, Plus1Tmp, Minus1Tmp;	// do you add or subtract one for rounding
+	logic 			Plus1, Minus1, CalcPlus1, CalcMinus1;	// do you add or subtract one for rounding
 	logic 			Invalid,Underflow,Overflow,Inexact;	// flags
 	logic [8:0]		DenormShift;	// right shift if the result is denormalized
 	logic 			SubBySmallNum;	// was there supposed to be a subtraction by a small number
-	logic [63:0]	FInput3M2;		// value to add (Z or zero)
+	logic [63:0]	Addend;		// value to add (Z or zero)
 	logic			ZeroSgn;		// the result's sign if the sum is zero
 	logic			ResultSgnTmp;	// the result's sign assuming the result is not zero
 	logic 			Guard, Round, LSBNormSum;	// bits needed to determine rounding
 	logic [12:0] 	MaxExp;		// maximum value of the exponent
 	logic [12:0] 	FracLen;	// length of the fraction
 	logic 			SigNaN;		// is an input a signaling NaN
+	logic 			UnderflowFlag; 	// Underflow signal used in FmaFlagsM (used to avoid a circular dependency)
 	logic [63:0] XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results
 
 	
@@ -62,15 +63,15 @@ module fma2(
 	///////////////////////////////////////////////////////////////////////////////
 
 	// Set addend to zero if FMUL instruction
-  	assign FInput3M2 = FOpCtrlM[2] ? 64'b0 : FInput3M;
+  	assign Addend = FOpCtrlM[2] ? 64'b0 : Z;
 
 	// split inputs into the sign bit, and exponent to handle single or double precision
 	// 		- single precision is in the top half of the inputs
-	assign XSgn = FInput1M[63];
-	assign YSgn = FInput2M[63];
-	assign ZSgn = FInput3M2[63]^FOpCtrlM[0]; //Negate Z if subtraction
+	assign XSgn = X[63];
+	assign YSgn = Y[63];
+	assign ZSgn = Addend[63]^FOpCtrlM[0]; //Negate Z if subtraction
 
-	assign ZExp = FmtM ? FInput3M2[62:52] : {3'b0, FInput3M2[62:55]};
+	assign ZExp = FmtM ? Addend[62:52] : {3'b0, Addend[62:55]};
 
 
 
@@ -207,28 +208,28 @@ module fma2(
 	always_comb begin
 		// Determine if you add 1
 		case (FrmM)
-			3'b000: Plus1Tmp = Guard & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky&LSBNormSum&~SubBySmallNum));//round to nearest even
-			3'b001: Plus1Tmp = 0;//round to zero
-			3'b010: Plus1Tmp = ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round down
-			3'b011: Plus1Tmp = ~ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round up
-			3'b100: Plus1Tmp = (Guard & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky&~SubBySmallNum)));//round to nearest max magnitude
-			default: Plus1Tmp = 1'bx;
+			3'b000: CalcPlus1 = Guard & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky&LSBNormSum&~SubBySmallNum));//round to nearest even
+			3'b001: CalcPlus1 = 0;//round to zero
+			3'b010: CalcPlus1 = ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round down
+			3'b011: CalcPlus1 = ~ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round up
+			3'b100: CalcPlus1 = (Guard & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky&~SubBySmallNum)));//round to nearest max magnitude
+			default: CalcPlus1 = 1'bx;
 		endcase
 		// Determine if you subtract 1
 		case (FrmM)
-			3'b000: Minus1Tmp = 0;//round to nearest even
-			3'b001: Minus1Tmp = SubBySmallNum & ~Guard & ~Round;//round to zero
-			3'b010: Minus1Tmp = ~ResultSgn & ~Guard & ~Round & SubBySmallNum;//round down
-			3'b011: Minus1Tmp = ResultSgn & ~Guard & ~Round & SubBySmallNum;//round up
-			3'b100: Minus1Tmp = 0;//round to nearest max magnitude
-			default: Minus1Tmp = 1'bx;
+			3'b000: CalcMinus1 = 0;//round to nearest even
+			3'b001: CalcMinus1 = SubBySmallNum & ~Guard & ~Round;//round to zero
+			3'b010: CalcMinus1 = ~ResultSgn & ~Guard & ~Round & SubBySmallNum;//round down
+			3'b011: CalcMinus1 = ResultSgn & ~Guard & ~Round & SubBySmallNum;//round up
+			3'b100: CalcMinus1 = 0;//round to nearest max magnitude
+			default: CalcMinus1 = 1'bx;
 		endcase
 	
 	end
 
 	// If an answer is exact don't round
-    assign Plus1 = Plus1Tmp & (Sticky | Guard | Round);
-    assign Minus1 = Minus1Tmp & (Sticky | Guard | Round);
+    assign Plus1 = CalcPlus1 & (Sticky | Guard | Round);
+    assign Minus1 = CalcMinus1 & (Sticky | Guard | Round);
 
 	// Compute rounded result 
 	logic [64:0] RoundAdd;
@@ -237,8 +238,8 @@ module fma2(
 							 Minus1 ? {{36{1'b1}}, 29'b0} :	{35'b0, Plus1, 29'b0};
 	assign NormSumTruncated = FmtM ? NormSum[53:2] : {NormSum[53:31], 29'b0};
 
-	assign {ResultExpTmp, ResultFrac} = {SumExp, NormSumTruncated} + RoundAdd;
-    assign ResultExp = ResultExpTmp[10:0];
+	assign {FullResultExp, ResultFrac} = {SumExp, NormSumTruncated} + RoundAdd;
+    assign ResultExp = FullResultExp[10:0];
 
 
 
@@ -277,27 +278,27 @@ module fma2(
 	//   2) 0 * Inf
 	//   3) any input is a signaling NaN
 	assign MaxExp = FmtM ? 13'd2047 : 13'd255;
-	assign SigNaN = FmtM ? (XNaNM&~FInput1M[51]) | (YNaNM&~FInput2M[51]) | (ZNaNM&~FInput3M2[51]) : 
-						   (XNaNM&~FInput1M[54]) | (YNaNM&~FInput2M[54]) | (ZNaNM&~FInput3M2[54]);
-	assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (XSgn ^ YSgn ^ ZSgn) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
+	assign SigNaN = FmtM ? (XNaNM&~X[51]) | (YNaNM&~Y[51]) | (ZNaNM&~Addend[51]) : 
+						   (XNaNM&~X[54]) | (YNaNM&~Y[54]) | (ZNaNM&~Addend[54]);
+	assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (PSgn ^ ZSgn) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
 	
 	// Set Overflow flag if the number is too big to be represented
 	//		- Don't set the overflow flag if an overflowed result isn't outputed
-	assign Overflow = ResultExpTmp >= MaxExp & ~ResultExpTmp[12]&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+	assign Overflow = FullResultExp >= MaxExp & ~FullResultExp[12]&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
 
 	// Set Underflow flag if the number is too small to be represented in normal numbers
-	logic ProdUf;
-	assign ProdUf = ProdExpM <= 1;
-	// assign Underflow = ResultExpTmp[12] | (KillProdM&AddendStickyM&ZZeroM) | (~(|ResultExpTmp)&ResultDenorm&(Round|Guard|Sticky)) | Plus1&ResultDenorm&(ResultExp == 1);
+	//		- Don't set the underflow flag if the result is exact 
 	assign Underflow = (SumExp[12] | ((SumExp == 0) & (Round|Guard|Sticky))    )&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+	assign UnderflowFlag = Underflow | (FullResultExp == 0)&Minus1; // before rounding option
+	// assign UnderflowFlag = (Underflow | (FullResultExp == 0)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM)&(Round|Guard|Sticky))  & ~(FullResultExp == 1); //after rounding option
 	// Set Inexact flag if the result is diffrent from what would be outputed given infinite precision
 	//		- Don't set the underflow flag if an underflowed result isn't outputed
 	assign Inexact = (Sticky|Overflow|Guard|Round|Underflow)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
 
 	// Combine flags 
 	//		- FMA can't set the Divide by zero flag
-	//		- Don't set the underflow flag if the result is exact 
-	assign FmaFlagsM = {Invalid, 1'b0, Overflow, Underflow & ~(ResultExpTmp == 1), Inexact};
+	//		- Don't set the underflow flag if the result was rounded up to a normal number
+	assign FmaFlagsM = {Invalid, 1'b0, Overflow, UnderflowFlag, Inexact};
 
 
 
@@ -308,23 +309,23 @@ module fma2(
 	///////////////////////////////////////////////////////////////////////////////
 	// Select the result
 	///////////////////////////////////////////////////////////////////////////////
-	assign XNaNResult = FmtM ? {XSgn, FInput1M[62:52], 1'b1,FInput1M[50:0]} : {XSgn, FInput1M[62:55], 1'b1,FInput1M[53:0]};
-	assign YNaNResult = FmtM ? {YSgn, FInput2M[62:52], 1'b1,FInput2M[50:0]} : {YSgn, FInput2M[62:55], 1'b1,FInput2M[53:0]};
-	assign ZNaNResult = FmtM ? {ZSgn, FInput3M2[62:52], 1'b1,FInput3M2[50:0]} : {ZSgn, FInput3M2[62:55], 1'b1,FInput3M2[53:0]};
+	assign XNaNResult = FmtM ? {XSgn, X[62:52], 1'b1,X[50:0]} : {XSgn, X[62:55], 1'b1,X[53:0]};
+	assign YNaNResult = FmtM ? {YSgn, Y[62:52], 1'b1,Y[50:0]} : {YSgn, Y[62:55], 1'b1,Y[53:0]};
+	assign ZNaNResult = FmtM ? {ZSgn, Addend[62:52], 1'b1,Addend[50:0]} : {ZSgn, Addend[62:55], 1'b1,Addend[53:0]};
 	assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 11'h7fe, {52{1'b1}}} : 
 																														  {ResultSgn, 11'h7ff, 52'b0} : 
 									((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 8'hfe, {23{1'b1}}, 32'b0} :
 																														  {ResultSgn, 8'hff, 55'b0};
 	assign InvalidResult = FmtM ? {ResultSgn, 11'h7ff, 1'b1, 51'b0} : {ResultSgn, 8'hff, 1'b1, 54'b0};
-	assign KillProdResult = FmtM ?{ResultSgn, FInput3M2[62:0] - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {ResultSgn, FInput3M2[62:32] - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}, 32'b0};
-	assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (Plus1Tmp&(AddendStickyM|FrmM[1]))} : {{ResultSgn, 31'b0} + {31'b0, (Plus1Tmp&(AddendStickyM|FrmM[1]))}, 32'b0};
+	assign KillProdResult = FmtM ?{ResultSgn, Addend[62:0] - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {ResultSgn, Addend[62:32] - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}, 32'b0};
+	assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))} : {{ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}, 32'b0};
 	assign FmaResultM = XNaNM ? XNaNResult : 
 						YNaNM ? YNaNResult : 
 						ZNaNM ? ZNaNResult :
 						Invalid ? InvalidResult : // has to be before inf
-						XInfM ? {PSgn, FInput1M[62:0]} :
-						YInfM ? {PSgn, FInput2M[62:0]} :
-						ZInfM ? {ZSgn, FInput3M2[62:0]} :
+						XInfM ? {PSgn, X[62:0]} :
+						YInfM ? {PSgn, Y[62:0]} :
+						ZInfM ? {ZSgn, Addend[62:0]} :
 						Overflow ? OverflowResult :	
 						KillProdM ? KillProdResult : // has to be after Underflow		
 						Underflow & ~ResultDenorm ? UnderflowResult :	
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index e85d4743..016f004a 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -61,6 +61,7 @@ module fpu (
    logic 		   FInput3UsedD;                                           // Is input 3 used
    logic [2:0] 		   FResultSelD, FResultSelE, FResultSelM, FResultSelW;     // Select FP result
    logic [3:0] 		   FOpCtrlD, FOpCtrlE, FOpCtrlM;                           // Select which opperation to do in each component
+   logic          SelLoadInputE, SelLoadInputM;
    
    // regfile signals //*** KEP lint warning -  changed `XLEN-1 to 63 
    logic [4:0] 		   RdE, RdM, RdW; // ***Can take from ieu
@@ -70,7 +71,7 @@ module fpu (
    logic [63:0] 	   FInput1E, FInput1M, FInput1tmpE;
    logic [63:0] 	   FInput2E, FInput2M;
    logic [63:0] 	   FInput3E, FInput3M;
-   logic [63:0] 	   FLoadStoreResultM, FLoadStoreResultW;                   // Result for load, store, and move to int-reg instructions
+   logic [63:0] 	   FLoadResultM, FLoadStoreResultM, FLoadStoreResultW;                   // Result for load, store, and move to int-reg instructions
    
    // div/sqrt signals
    logic 		   DivDenormE, DivDenormM, DivDenormW;
@@ -139,7 +140,7 @@ module fpu (
    logic [4:0] 		   SgnFlagsE, SgnFlagsM, SgnFlagsW;
    
    // instantiation of W stage regfile signals
-   logic [`XLEN-1:0] 	   SrcAW;
+   logic [63:0] 	   AlignedSrcAM, ForwardSrcAM, SrcAW;
    
    // classify signals
    logic [63:0] 	   ClassResultE, ClassResultM, ClassResultW;
@@ -207,16 +208,18 @@ module fpu (
    flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
    flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E);
    flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
+   flopenrc #(1) DEReg18(clk, reset, PipeClearDE, PipeEnableDE, InstrD[15], SelLoadInputE);
    
    //EXECUTION STAGE
    
-   // input muxs for forwarding
-   mux4  #(64)  FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, {SrcAM, {64-`XLEN{1'b0}}}, FForwardInput1E, FInput1tmpE);
+   // input muxs for forwarding   
+   mux2  #(64)  SrcAMuxForward({SrcAM[31:0], 32'b0}, {SrcAM, {64-`XLEN{1'b0}}}, FmtM, ForwardSrcAM);
+   mux4  #(64)  FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, ForwardSrcAM, FForwardInput1E, FInput1tmpE);
    mux3  #(64)  FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E);
    mux2  #(64)  FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E);
    mux2  #(64)  FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E);
    
-   fma1 fma1 (.FOpCtrlE(FOpCtrlE[2:0]),.*);
+   fma1 fma1 (.X(FInput1E), .Y(FInput2E), .Z(FInput3E), .FOpCtrlE(FOpCtrlE[2:0]),.*);
    
    // first and only instance of floating-point divider
    logic fpdivClk;
@@ -337,6 +340,7 @@ module fpu (
    flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM);
    flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
    flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
+   flopenrc #(1) EMReg9(clk, reset, PipeClearEM, PipeEnableEM, SelLoadInputE, SelLoadInputM);
    
    //*****************
    // fpuclassify E/M pipe registers
@@ -345,11 +349,13 @@ module fpu (
    
    //BEGIN MEMORY STAGE
    
-   assign FWriteDataM = FInput1M[63:64-`XLEN];
+   assign FWriteDataM = FmtM ? FInput1M[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FInput1M[63:32]};
+   //adjacent address values are sent to the FPU, select the correct one
+   //    -imm is 80000 most of the time vs the error one which is 00000
+   mux3  #(64)  FLoadResultMux({HRDATA[31:0], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA[`AHBW-1:`AHBW-32], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA, {64-`AHBW{1'b0}}}, {FmtM, SelLoadInputM}, FLoadResultM);
+   mux2  #(64)  FLoadStoreResultMux(FLoadResultM, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
    
-   mux2  #(64)  FLoadStoreResultMux({HRDATA, {64-`AHBW{1'b0}}}, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
-   
-   fma2 fma2(.FOpCtrlM(FOpCtrlM[2:0]), .*);
+   fma2 fma2(.X(FInput1M), .Y(FInput2M), .Z(FInput3M), .FOpCtrlM(FOpCtrlM[2:0]), .*);
    
    // second instance of two-stage floating-point add/cvt unit
    fpuaddcvt2 fpadd2 (.*);
@@ -357,7 +363,9 @@ module fpu (
    // second instance of two-stage floating-point comparator
    fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), 
 		   .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*);
-   
+
+   mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
+      
    //*****************
    // fma M/W pipe registers
    //*****************
@@ -397,7 +405,7 @@ module fpu (
    flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
    flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
    flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
-   flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
+   flopenrc #(64) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, AlignedSrcAM, SrcAW);
    flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
    flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
    
@@ -447,7 +455,7 @@ module fpu (
 	// classify
 	3'b101 : FPUResult64W = ClassResultW;
 	// output SrcAW
-	3'b110 : FPUResult64W = {SrcAW, {64-`XLEN{1'b0}}};
+	3'b110 : FPUResult64W = SrcAW;
 	// Load/Store/Move to FP-register
 	3'b111 : FPUResult64W = FLoadStoreResultW;
 	default : FPUResult64W = {64{1'bx}};
@@ -460,7 +468,7 @@ module fpu (
    // define offsets for LSB zero extension or truncation
    always_comb begin      
       // zero extension 
-      FPUResultW = FPUResult64W[63:64-`XLEN];
+      FPUResultW = FmtW ? FPUResult64W[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FPUResult64W[63:32]};
       SetFflagsM = FPUFlagsW;      
    end
   
diff --git a/wally-pipelined/src/fpu/fpuclassify.sv b/wally-pipelined/src/fpu/fpuclassify.sv
index ee03cb52..1000bdf4 100644
--- a/wally-pipelined/src/fpu/fpuclassify.sv
+++ b/wally-pipelined/src/fpu/fpuclassify.sv
@@ -43,8 +43,10 @@ module fpuclassify (
     //  bit 7 - +infinity
     //  bit 8 - signaling NaN
     //  bit 9 - quiet NaN
-    assign ClassResultE = {{`XLEN-10{1'b0}}, FirstBitMan&NaN, ~FirstBitMan&NaN, ~sign&infinity, ~sign&normal, 
-                                    ~sign&subnormal, ~sign&zero, sign&zero, sign&subnormal, sign&normal, sign&infinity, {64-`XLEN{1'b0}}};
+    assign ClassResultE = FmtE ? {{54{1'b0}}, FirstBitMan&NaN, ~FirstBitMan&NaN, ~sign&infinity, ~sign&normal, 
+                                    ~sign&subnormal, ~sign&zero, sign&zero, sign&subnormal, sign&normal, sign&infinity} : 
+				 {{22{1'b0}}, FirstBitMan&NaN, ~FirstBitMan&NaN, ~sign&infinity, ~sign&normal, 
+                                    ~sign&subnormal, ~sign&zero, sign&zero, sign&subnormal, sign&normal, sign&infinity, {32{1'b0}}};
 
 
 endmodule
diff --git a/wally-pipelined/src/fpu/fpucmp2.sv b/wally-pipelined/src/fpu/fpucmp2.sv
index e2820688..42a780ac 100755
--- a/wally-pipelined/src/fpu/fpucmp2.sv
+++ b/wally-pipelined/src/fpu/fpucmp2.sv
@@ -45,6 +45,7 @@ module fpucmp2 (
    input logic        ANaN, BNaN,
    input logic        Azero, Bzero,
    input logic [3:0]  FOpCtrlM,
+   input logic 	      FmtM,
    
    output logic       Invalid, 		 // Invalid Operation
    output logic [1:0] FCC,  		 // Condition Codes 
@@ -160,6 +161,7 @@ endmodule // magcompare64b
 module exception_cmp_2 (
    input logic [63:0] A,
    input logic [63:0] B,
+   input logic 	      FmtM,
    input logic 	      LT_mag,
    input logic 	      EQ_mag,
    input logic [1:0]  Sel,
@@ -230,11 +232,12 @@ module exception_cmp_2 (
       case (FOpCtrlM[2:0])
          3'b111: FCmpResultM = LT ? A : B;//min 
          3'b101: FCmpResultM = GT ? A : B;//max
-         3'b010: FCmpResultM = {63'b0, EQ};//equal
-         3'b001: FCmpResultM = {63'b0, LT};//less than
-         3'b011: FCmpResultM = {63'b0, LT | EQ};//less than or equal
+         3'b010: FCmpResultM = FmtM ? {63'b0, EQ} : {31'b0, EQ, 32'b0};//equal
+         3'b001: FCmpResultM = FmtM ? {63'b0, LT} : {31'b0, LT, 32'b0};//less than
+         3'b011: FCmpResultM = FmtM ? {63'b0, LT|EQ} : {31'b0, LT|EQ, 32'b0};//less than or equal
          default: FCmpResultM = 64'b0;
       endcase
    end 
 
+
 endmodule // exception_cmp
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index f87f369b..7fa1e695 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -90,17 +90,21 @@ string tests32f[] = '{
   };
 
   string tests64f[] = '{
-    "rv64f/I-FADD-S-01", "2000",
+    // "rv64f/I-FLW-01", "2110",
+    "rv64f/I-FMV-W-X-01", "2000",
+    "rv64f/I-FMV-X-W-01", "2000",
+    "rv64f/I-FSW-01", "2000",
     "rv64f/I-FCLASS-S-01", "2000",
-    "rv64f/I-FCVT-S-L-01", "2000",
-    "rv64f/I-FCVT-S-LU-01", "2000",
-    "rv64f/I-FCVT-S-W-01", "2000",
-    "rv64f/I-FCVT-S-WU-01", "2000",
-    "rv64f/I-FCVT-L-S-01", "2000",
-    "rv64f/I-FCVT-LU-S-01", "2000",
-    "rv64f/I-FCVT-W-S-01", "2000",
-    "rv64f/I-FCVT-WU-S-01", "2000",
-    "rv64f/I-FDIV-S-01", "2000",
+    "rv64f/I-FADD-S-01", "2000",
+    // "rv64f/I-FCVT-S-L-01", "2000",
+    // "rv64f/I-FCVT-S-LU-01", "2000",
+    // "rv64f/I-FCVT-S-W-01", "2000",
+    // "rv64f/I-FCVT-S-WU-01", "2000",
+    // "rv64f/I-FCVT-L-S-01", "2000",
+    // "rv64f/I-FCVT-LU-S-01", "2000",
+    // "rv64f/I-FCVT-W-S-01", "2000",
+    // "rv64f/I-FCVT-WU-S-01", "2000",
+    // "rv64f/I-FDIV-S-01", "2000",
     "rv64f/I-FEQ-S-01", "2000",
     "rv64f/I-FLE-S-01", "2000",
     "rv64f/I-FLT-S-01", "2000",
@@ -109,20 +113,19 @@ string tests32f[] = '{
     "rv64f/I-FMIN-S-01", "2000",
     "rv64f/I-FMSUB-S-01", "2000",
     "rv64f/I-FMUL-S-01", "2000",
-    "rv64f/I-FMV-W-X-01", "2000",
     "rv64f/I-FNMADD-S-01", "2000",
     "rv64f/I-FNMSUB-S-01", "2000",
     "rv64f/I-FSGNJ-S-01", "2000",
     "rv64f/I-FSGNJN-S-01", "2000",
     "rv64f/I-FSGNJX-S-01", "2000",
-    "rv64f/I-FSQRT-S-01", "2000",
-    "rv64f/I-FSW-01", "2000",
-    "rv64f/I-FLW-01", "2000",
+    // "rv64f/I-FSQRT-S-01", "2000",
     "rv64f/I-FSUB-S-01", "2000"
   };
 
   string tests64d[] = '{
     // "rv64d/I-FDIV-D-01", "2000",
+    "rv64d/I-FSD-01", "2000",
+    "rv64d/I-FLD-01", "2420",
     "rv64d/I-FNMADD-D-01", "2000",
     "rv64d/I-FNMSUB-D-01", "2000",
     "rv64d/I-FMSUB-D-01", "2000",
@@ -143,8 +146,6 @@ string tests32f[] = '{
     // "rv64d/I-FCVT-S-D-01", "2000",
     // "rv64d/I-FCVT-W-D-01", "2000",
     // "rv64d/I-FCVT-WU-D-01", "2000",
-    "rv64d/I-FSD-01", "2000",
-    "rv64d/I-FLD-01", "2420",
     "rv64d/I-FMADD-D-01", "2000",
     "rv64d/I-FMUL-D-01", "2000",
     "rv64d/I-FMV-D-X-01", "2000",
@@ -538,8 +539,8 @@ string tests32f[] = '{
         if (`M_SUPPORTED) tests = {tests, tests64m};
         if (`A_SUPPORTED) tests = {tests, tests64a};
         if (`MEM_VIRTMEM) tests = {tests, tests64mmu};
-        // if (`F_SUPPORTED) tests = {tests64f, tests};
         if (`D_SUPPORTED) tests = {tests64d, tests};
+        if (`F_SUPPORTED) tests = {tests64f, tests};
       end
       //tests = {tests64a, tests};
     end else begin // RV32