all fpu units use the unpacking unit

2021-07-28 23:49:21 -04:00 · 2021-07-28 23:49:21 -04:00 · d60e394ef9
commit d60e394ef9
parent 915d8136e5
13 changed files with 298 additions and 345 deletions
--- a/wally-pipelined/fpu-testfloat/FMA/tbgen/tb.sv
+++ b/wally-pipelined/fpu-testfloat/FMA/tbgen/tb.sv
@ -48,7 +48,7 @@ assign FOpCtrlE = 3'b0;
 // up - 011
 // nearest max mag - 100  
 assign FrmE = 3'b000;
-assign FmtE = 1'b0;
+assign FmtE = 1'b1;

    logic  [`FLEN-1:0] X, Y, Z;
    // logic         FmtE;
@ -76,9 +76,9 @@ assign FmtE = 1'b0;
    assign YSgnE = FmtE ? Y[`FLEN-1] : Y[31];
    assign ZSgnE = FmtE ? Addend[`FLEN-1] : Addend[31];

-    assign XExpE = FmtE ? X[62:52] : {3'b0, X[30:23]};//{X[30], {3{~X[30]&~XExpZero|XExpMaxE}}, X[29:23]}; 
-    assign YExpE = FmtE ? Y[62:52] : {3'b0, Y[30:23]};//{Y[30], {3{~Y[30]&~YExpZero|YExpMaxE}}, Y[29:23]}; 
-    assign ZExpE = FmtE ? Addend[62:52] : {3'b0, Addend[30:23]};//{Addend[30], {3{~Addend[30]&~ZExpZero|ZExpMaxE}}, Addend[29:23]}; 
+    assign XExpE = FmtE ? X[62:52] : {X[30], {3{~X[30]&~XExpZero|XExpMaxE}}, X[29:23]}; 
+    assign YExpE = FmtE ? Y[62:52] : {Y[30], {3{~Y[30]&~YExpZero|YExpMaxE}}, Y[29:23]}; 
+    assign ZExpE = FmtE ? Addend[62:52] : {Addend[30], {3{~Addend[30]&~ZExpZero|ZExpMaxE}}, Addend[29:23]}; 

    assign XFracE = FmtE ? X[`NF-1:0] : {X[22:0], 29'b0};
    assign YFracE = FmtE ? Y[`NF-1:0] : {Y[22:0], 29'b0};
@ -122,7 +122,7 @@ assign FmtE = 1'b0;
    assign YZeroE = YExpZero & YFracZero;
    assign ZZeroE = ZExpZero & ZFracZero;

-    assign BiasE = FmtE ? {1'b0, {`NE-1{1'b1}}} : 13'h7f;
+    assign BiasE = 13'h3ff;

 assign	wnan = FmtE ? &FMAResM[`FLEN-2:`NF] && |FMAResM[`NF-1:0] : &FMAResM[30:23] && |FMAResM[22:0]; 
 // assign	XNaNE = FmtE ? &X[62:52] && |X[51:0] : &X[62:55] && |X[54:32]; 
@ -203,7 +203,7 @@ always @(posedge clk)
 		if(&ans[30:23] && |ans[22:0] && ~ans[22] ) $display( "ans=sigNaN ");
 		if(&ans[30:23] && |ans[22:0] && ans[22]) $display( "ans=qutNaN ");
        errors = errors + 1;
-	  //if (errors == 10)
+	  if (errors == 10)
 		$stop;
    end
 vectornum = vectornum + 1;
--- a/wally-pipelined/fpu-testfloat/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/fpu-testfloat/FMA/tbgen/test_gen.sh
@ -1,3 +1,3 @@
-testfloat_gen f32_mulAdd -tininessafter -n 6133248 -rnear_even  -seed 113355 -level 1 > testFloat
+testfloat_gen f64_mulAdd -tininessafter -n 6133248 -rnear_even  -seed 113355 -level 1 > testFloat
 tr -d ' ' < testFloat > testFloatNoSpace

--- a/wally-pipelined/src/fpu/convert_inputs.sv
+++ b/wally-pipelined/src/fpu/convert_inputs.sv
@ -8,7 +8,7 @@
 module convert_inputs(
   input [63:0]  op1,      // 1st input operand (A)
   input [63:0]  op2,      // 2nd input operand (B)
-   input [3:0]   op_type,  // Function opcode
+   input [2:0]   op_type,  // Function opcode
   input 	     P,        // Result Precision (0 for double, 1 for single)

   output [63:0] Float1,	// Converted 1st input operand
@ -16,8 +16,6 @@ module convert_inputs(
 );

   wire 	 conv_SP;   // Convert from SP to DP
-   wire 	 negate;    // Operation is negation
-   wire 	 abs_val;   // Operation is absolute value
   wire 	 Zexp1;		// One if the exponent of op1 is zero
   wire 	 Zexp2;		// One if the exponent of op2 is zero
   wire 	 Oexp1;		// One if the exponent of op1 is all ones
@ -25,7 +23,7 @@ module convert_inputs(

   // Convert from single precision to double precision if (op_type is 11X
   // and P is 0) or (op_type is not 11X and P is one). 
-   assign conv_SP = (op_type[2]&op_type[1]) ^ P;
+   assign conv_SP = ~P;

   // Test if the input exponent is zero, because if it is then the
   // exponent of the converted number should be zero. 
@ -40,17 +38,14 @@ module convert_inputs(
   assign Float1[28:0] = op1[28:0] & {29{~conv_SP}};

   // Conditionally convert op2. Lower 29 bits are zero for single precision. 
-   assign Float2[62:29] = conv_SP ? {op2[30], 
-				     {3{(~op2[30]&~Zexp2)|Oexp2}}, op2[29:0]}
+   assign Float2[62:29] = conv_SP ? {op2[30], {3{(~op2[30]&~Zexp2)|Oexp2}}, op2[29:0]}
 			  : op2[62:29];
   assign Float2[28:0] = op2[28:0] & {29{~conv_SP}};

   // Set the sign of Float1 based on its original sign and if the operation
   // is negation (op_type = 101) or absolute value (op_type = 100)

-   assign negate  = op_type[2] & ~op_type[1] & op_type[0];
-   assign abs_val = op_type[2] & ~op_type[1] & ~op_type[0]; //*** remove abs_val
-   assign Float1[63]  = conv_SP ? (op1[31] ^ negate) & ~abs_val : (op1[63] ^ negate) & ~abs_val;
+   assign Float1[63]  = conv_SP ? op1[31] : op1[63];
   assign Float2[63]  = conv_SP ? op2[31] : op2[63];

 endmodule // convert_inputs
--- a/wally-pipelined/src/fpu/exception.sv
+++ b/wally-pipelined/src/fpu/exception.sv
@ -1,95 +1,58 @@
 // Exception logic for the floating point adder. Note: We may 
 // actually want to move to where the result is computed.

-module exception (Ztype, Invalid, Denorm, ANorm, BNorm, Sub, A, B, op_type);
+module exception (

-   input [63:0] A;		// 1st input operand (op1)
-   input [63:0] B;		// 2nd input operand (op2)
-   input [3:0] 	op_type;   	// Function opcode
-   output [3:0] Ztype;		// Indicates type of result (Z)
-   output 	Invalid;	// Invalid operation exception
-   output 	Denorm;		// Denormalized input
-   output       ANorm;          // A is not zero or Denorm
-   output       BNorm;          // B is not zero or Denorm
-   output       Sub;		// The effective operation is subtraction
-   wire		AzeroM;	 	// '1' if the mantissa of A is zero
-   wire		BzeroM;		// '1' if the mantissa of B is zero
-   wire		AzeroE;	 	// '1' if the exponent of A is zero
-   wire		BzeroE;		// '1' if the exponent of B is zero
-   wire		AonesE;	 	// '1' if the exponent of A is all ones
-   wire		BonesE;		// '1' if the exponent of B is all ones
-   wire		ADenorm; 	// '1' if A is a denomalized number
-   wire		BDenorm; 	// '1' if B is a denomalized number
-   wire		AInf;	 	// '1' if A is infinite
-   wire		BInf;	 	// '1' if B is infinite
-   wire		AZero;	 	// '1' if A is 0
-   wire		BZero;	 	// '1' if B is 0
-   wire		ANaN;	 	// '1' if A is a not-a-number
-   wire		BNaN; 		// '1' if B is a not-a-number
-   wire		ASNaN;	 	// '1' if A is a signalling not-a-number
-   wire		BSNaN;	 	// '1' if B is a signalling not-a-number
+   input logic [2:0] 	op_type,   	// Function opcode
+   input logic XSgnE, YSgnE,
+   // input logic [52:0] XManE, YManE,
+   input logic XDenormE, YDenormE,
+   input logic XNormE, YNormE,
+   input logic XZeroE, YZeroE,
+   input logic XInfE, YInfE,
+   input logic XNaNE, YNaNE,
+   input logic XSNaNE, YSNaNE,
+   output logic [3:0] Ztype,		// Indicates type of result (Z)
+   output logic 	Invalid,	// Invalid operation exception
+   output logic 	Denorm,		// Denormalized input
+   output logic       Sub		// The effective operation is subtraction
+);
   wire		ZQNaN;	 	// '1' if result Z is a quiet NaN
   wire		ZPInf;	 	// '1' if result Z positive infnity
   wire		ZNInf;	 	// '1' if result Z negative infnity
   wire         add_sub;	// '1' if operation is add or subtract
   wire 	converts;       // See if there are any converts   
   
-   parameter [51:0]  fifty_two_zeros = 52'h0000000000000; // Use parameter?


   // Is this instruction a convert
-   assign converts      = ~(~op_type[1] & ~op_type[2]);
+   assign converts      = op_type[1];
   
-   // Determine if mantissas are all zeros
-   assign AzeroM = (A[51:0] == fifty_two_zeros);
-   assign BzeroM = (B[51:0] == fifty_two_zeros);

-   // Determine if exponents are all ones or all zeros 
-   assign AonesE = A[62]&A[61]&A[60]&A[59]&A[58]&A[57]&A[56]&A[55]&A[54]&A[53]&A[52];
-   assign BonesE = B[62]&B[61]&B[60]&B[59]&B[58]&B[57]&B[56]&B[55]&B[54]&B[53]&B[52];
-   assign AzeroE = ~(A[62]|A[61]|A[60]|A[59]|A[58]|A[57]|A[56]|A[55]|A[54]|A[53]|A[52]);
-   assign BzeroE = ~(B[62]|B[61]|B[60]|B[59]|B[58]|B[57]|B[56]|B[55]|B[54]|B[53]|B[52]);
-
-   // Determine special cases. Note: Zero is not really a special case. 
-   assign ADenorm = AzeroE & ~AzeroM;
-   assign BDenorm = BzeroE & ~BzeroM;
-   assign AInf = AonesE & AzeroM;
-   assign BInf = BonesE & BzeroM;
-   assign ANaN = AonesE & ~AzeroM;
-   assign BNaN = BonesE & ~BzeroM;
-   assign ASNaN = ANaN & ~A[51];
-   assign BSNaN = BNaN & ~B[51];
-   assign AZero = AzeroE & AzeroM;
-   assign BZero = BzeroE & BzeroE;
-
-   // A and B are normalized if their exponents are not zero. 
-   assign ANorm = ~AzeroE;
-   assign BNorm = ~BzeroE;

   // An "Invalid Operation" exception occurs if (A or B is a signalling NaN)
   // or (A and B are both Infinite and the "effective operation" is 
   // subtraction). 
-   assign add_sub = ~op_type[2] & ~op_type[1];
-   assign Invalid = (ASNaN | BSNaN | 
-		     (add_sub & AInf & BInf & (A[63]^B[63]^op_type[0]))) & ~converts;
+   assign add_sub = ~op_type[1];
+   assign Invalid = (XSNaNE | YSNaNE | (add_sub & XInfE & YInfE & (XSgnE^YSgnE^op_type[0]))) & ~converts;

   // The Denorm flag is set if (A is denormlized and the operation is not integer 
   // conversion ) or (if B is normalized and the operation is addition or  subtraction). 
-   assign Denorm = ADenorm&(op_type[2]|~op_type[1]) | BDenorm & add_sub;
+   assign Denorm = XDenormE | YDenormE & add_sub;

   // The result is a quiet NaN if (an "Invalid Operation" exception occurs) 
   // or (A is a NaN) or (B is a NaN and the operation uses B).
-   assign ZQNaN = Invalid | ANaN | (BNaN & add_sub);
+   assign ZQNaN = Invalid | XNaNE | (YNaNE & add_sub);

   // The result is +Inf if ((A is +Inf) or (B is -Inf and the operation is
   // subtraction) or (B is +Inf and the operation is addition)) and (the
   // result is not a quiet NaN).  
-   assign ZPInf = (AInf&A[63] | add_sub&BInf&(~B[63]^op_type[0]))&~ZQNaN;
+   assign ZPInf = (XInfE&XSgnE | add_sub&YInfE&(~YSgnE^op_type[0]))&~ZQNaN;

   // The result is -Inf if ((A is -Inf) or (B is +Inf and the operation is
   // subtraction) or (B is -Inf and the operation is addition)) and the
   // result is not a quiet NaN.  
-   assign ZNInf = (AInf&~A[63] | add_sub&BInf&(B[63]^op_type[0]))&~ZQNaN;
+   assign ZNInf = (XInfE&~XSgnE | add_sub&YInfE&(YSgnE^op_type[0]))&~ZQNaN;

   // Set the type of the result as follows:
   // (needs optimization - got lazy or was late)
@ -102,19 +65,19 @@ module exception (Ztype, Invalid, Denorm, ANorm, BNorm, Sub, A, B, op_type);
   //  0101     +Bzero and -Azero (and vice-versa)
   //  1000     Convert SP to DP (and vice-versa)

-   assign Ztype[0] = ((ZQNaN | ZPInf) & ~(~op_type[2] & op_type[1])) | 
-		     ((AZero & BZero & (A[63]^B[63]^op_type[0])) 
+   assign Ztype[0] = (ZQNaN | ZPInf) | 
+		     ((XZeroE & YZeroE & (XSgnE^YSgnE^op_type[0])) 
 		      & ~converts);
-   assign Ztype[1] = ((ZNInf | ZPInf) & ~(~op_type[2] & op_type[1])) | 
-		     (((AZero & BZero & A[63] & B[63] & ~op_type[0]) |
-		       (AZero & BZero & A[63] & ~B[63] & op_type[0])) 
+   assign Ztype[1] = (ZNInf | ZPInf) | 
+		     (((XZeroE & YZeroE & XSgnE & YSgnE & ~op_type[0]) |
+		       (XZeroE & YZeroE & XSgnE & ~YSgnE & op_type[0])) 
 		      & ~converts);
-   assign Ztype[2] = ((AZero & BZero & ~op_type[1] & ~op_type[2]) 
+   assign Ztype[2] = ((XZeroE & YZeroE & ~op_type[1]) 
 		      & ~converts);
-   assign Ztype[3] = (op_type[1] & op_type[2] & ~op_type[0]);
+   assign Ztype[3] = (op_type[1] & ~op_type[0]);

   // Determine if the effective operation is subtraction
-   assign Sub = ~(op_type[3] & ~op_type[0]) & ( (op_type[3] & op_type[0]) | (add_sub & (A[63]^B[63]^op_type[0])) );
+   assign Sub = add_sub & (XSgnE^YSgnE^op_type[0]);
 
 endmodule // exception

--- a/wally-pipelined/src/fpu/exception_div.sv
+++ b/wally-pipelined/src/fpu/exception_div.sv
@ -27,7 +27,7 @@ module exception_div (
   logic 	      ZInf;	 	// '1' if result Z is an infnity
   logic 	      Zero;             // '1' if result is zero   
   
-
+   //***take this module out and add more registers or just recalculate it all
   // Determine if mantissas are all zeros
   assign AzeroM = (A[51:0] == 52'h0);
   assign BzeroM = (B[51:0] == 52'h0);
--- a/wally-pipelined/src/fpu/faddcvt.sv
+++ b/wally-pipelined/src/fpu/faddcvt.sv
@ -33,9 +33,22 @@ module faddcvt(
   input logic          StallM,     // stall the memory stage
   input logic  [63:0]  FSrcXE,		// 1st input operand (A)
   input logic  [63:0]  FSrcYE,		// 2nd input operand (B)
-   input logic  [3:0]   FOpCtrlE, FOpCtrlM,	// Function opcode
+   input logic  [2:0]   FOpCtrlE, FOpCtrlM,	// Function opcode
   input logic          FmtE, FmtM,   	// Result Precision (0 for double, 1 for single)
   input logic  [2:0] 	FrmM,		      // Rounding mode - specify values 
+   input logic XSgnE, YSgnE,
+   input logic [52:0] XManE, YManE,
+   input logic [10:0] XExpE, YExpE,
+   input logic XSgnM, YSgnM,
+   input logic [52:0] XManM, YManM,
+   input logic [10:0] XExpM, YExpM,
+   input logic XDenormE, YDenormE,
+   input logic XNormE, YNormE,
+   input logic XNormM, YNormM,
+   input logic XZeroE, YZeroE,
+   input logic XInfE, YInfE,
+   input logic XNaNE, YNaNE,
+   input logic XSNaNE, YSNaNE,
   output logic [63:0]  FAddResM,	   // Result of operation
   output logic [4:0]   FAddFlgM);   	// IEEE exception flags 
   
@ -44,53 +57,53 @@ module faddcvt(
   logic [3:0] 	AddSelInvE, AddSelInvM;
   logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
   logic 		   AddCorrSignE, AddCorrSignM;
-   logic          AddOp1NormE, AddOp1NormM;
-   logic          AddOp2NormE, AddOp2NormM;
   logic          AddOpANormE,  AddOpANormM;
   logic          AddOpBNormE, AddOpBNormM;
   logic          AddInvalidE, AddInvalidM;
   logic 		   AddDenormInE, AddDenormInM;
   logic          AddSwapE, AddSwapM;
   logic          AddSignAE, AddSignAM;
-   logic 		   AddConvertE, AddConvertM;
-   logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
   logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
   logic [10:0] 	AddExponentE, AddExponentM;


-   fpuaddcvt1 fpadd1 (.FSrcXE, .FSrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
+   fpuaddcvt1 fpadd1 (.FOpCtrlE, .FmtE, .AddExponentE, 
                     .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
-                     .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
-                     .AddDenormInE, .AddConvertE, .AddSwapE);
+   .XSgnE, .YSgnE,.XManE, .YManE, .XExpE, .YExpE,  .XDenormE, .YDenormE, .XNormE, .YNormE, .XZeroE, .YZeroE, .XInfE, .YInfE, .XNaNE, .YNaNE, .XSNaNE, .YSNaNE,
+                     .AddCorrSignE, .AddSignAE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
+                     .AddDenormInE, .AddSwapE);

   // E/M pipeline registers
   flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
   flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
   flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
-   flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
-   flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
   flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
   flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
   flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
-   flopenrc #(14) EMRegAdd9(clk, reset, FlushM, ~StallM, 
-                           {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddSignAE},
-                           {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM}); 
+   flopenrc #(11) EMRegAdd9(clk, reset, FlushM, ~StallM, 
+                           {AddSelInvE, AddCorrSignE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddSwapE, AddSignAE},
+                           {AddSelInvM, AddCorrSignM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddSwapM, AddSignAM}); 

                     
-   fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
-                     .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
-                     .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
-                     .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
+   fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM,  .XNormM, .YNormM, 
+                     .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, .XSgnM, .YSgnM, .XManM, .YManM, .XExpM, .YExpM,
+                     .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
+                     .AddSignAM, .AddCorrSignM, .AddSwapM, .FAddResM, .FAddFlgM);
 endmodule

 module fpuaddcvt1 (
-   input logic [63:0]   FSrcXE,		// 1st input operand (A)
-   input logic [63:0]   FSrcYE,		// 2nd input operand (B)
-   input logic [3:0]	   FOpCtrlE,	// Function opcode
+   input logic [2:0]	   FOpCtrlE,	// Function opcode
   input logic 	      FmtE,   		// Result Precision (1 for double, 0 for single)
+   input logic XSgnE, YSgnE,
+   input logic [10:0] XExpE, YExpE,
+   input logic [52:0] XManE, YManE,
+   input logic XDenormE, YDenormE,
+   input logic XNormE, YNormE,
+   input logic XZeroE, YZeroE,
+   input logic XInfE, YInfE,
+   input logic XNaNE, YNaNE,
+   input logic XSNaNE, YSNaNE,

-   output logic [63:0] 	AddFloat1E, 
-   output logic [63:0] 	AddFloat2E,
   output logic [10:0] 	AddExponentE,
   output logic [10:0]	AddExpPostSumE,
   output logic [11:0]  AddExp1DenormE, AddExp2DenormE,//KEP used to be [10:0]
@ -98,11 +111,9 @@ module fpuaddcvt1 (
   output logic [3:0]   AddSelInvE,
   output logic         AddCorrSignE,
   output logic 	      AddSignAE,
-   output logic	      AddOp1NormE, AddOp2NormE,
   output logic	      AddOpANormE, AddOpBNormE,
   output logic	      AddInvalidE,
   output logic 	      AddDenormInE,
-   output logic 	      AddConvertE,
   output logic         AddSwapE
   );

@ -112,7 +123,7 @@ module fpuaddcvt1 (
   wire		    ZV_mantissaB;

   wire          P;
-   assign P = ~FmtE;
+   assign P = ~(FmtE^FOpCtrlE[1]);

   wire [63:0] IntValue;
   wire [11:0] exp1, exp2;
@ -130,22 +141,15 @@ module fpuaddcvt1 (
   wire 	      zeroB;
   wire [5:0]	align_shift;

-   // Convert the input operands to their appropriate forms based on 
-   // the orignal operands, the FOpCtrlE , and their precision P. 
-   // Single precision inputs are converted to double precision 
-   // and the sign of the first operand is set appropratiately based on
-   // if the operation is absolute value or negation. 
-
-   convert_inputs conv1 (.Float1(AddFloat1E), .Float2(AddFloat2E), .op1(FSrcXE), .op2(FSrcYE), .op_type(FOpCtrlE), .P);
-
   // Test for exceptions and return the "Invalid Operation" and
   // "Denormalized" Input Flags. The "AddSelInvE" is used in
   // the third pipeline stage to select the result. Also, AddOp1NormE
   // and AddOp2NormE are one if FSrcXE and FSrcYE are not zero or denormalized.
   // sub is one if the effective operation is subtaction. 

-   exception exc1 (AddSelInvE, AddInvalidE, AddDenormInE, AddOp1NormE, AddOp2NormE, sub, 
-		   AddFloat1E, AddFloat2E, FOpCtrlE);
+   exception exc1 (.Ztype(AddSelInvE), .Invalid(AddInvalidE), .Denorm(AddDenormInE), .Sub(sub), 
+   .XSgnE, .YSgnE, .XDenormE, .YDenormE, .XNormE, .YNormE, .XZeroE, .YZeroE, .XInfE, .YInfE, .XNaNE, .YNaNE, .XSNaNE, .YSNaNE,
+	.op_type(FOpCtrlE));

   // Perform Exponent Subtraction (used for alignment). For performance
   // both exponent subtractions are performed in parallel. This was 
@ -153,25 +157,25 @@ module fpuaddcvt1 (
   // the two parallel additions. The input values are zero-extended to 12 
   // bits prior to performing the addition. 

-   assign exp1 = {1'b0, AddFloat1E[62:52]};
-   assign exp2 = {1'b0, AddFloat2E[62:52]};
+   assign exp1 = {1'b0, XExpE};
+   assign exp2 = {1'b0, YExpE};
   assign exp_diff1 = exp1 - exp2;
-   assign exp_diff2 = AddDenormInE ? ({AddFloat2E[63], exp2[10:0]} - {AddFloat1E[63], exp1[10:0]}): exp2 - exp1;
+   assign exp_diff2 = AddDenormInE ? ({YSgnE, YExpE} - {XSgnE, XExpE}): exp2 - exp1;

   // The second operand (B) should be set to zero, if FOpCtrlE does not
   // specify addition or subtraction
-   assign zeroB = FOpCtrlE[2] | FOpCtrlE[1];
+   assign zeroB = FOpCtrlE[1];

   // Swapped operands if zeroB is not one and exp1 < exp2. 
   // Swapping causes exp2 to be used for the result exponent. 
   // Only the exponent of the larger operand is used to determine
   // the final result. 
   assign AddSwapE = exp_diff1[11] & ~zeroB;
-   assign AddExponentE = AddSwapE ? exp2[10:0] : exp1[10:0];
-   assign AddExpPostSumE = AddSwapE ? exp2[10:0] : exp1[10:0];
-   assign mantissaA = AddSwapE ? AddFloat2E[51:0] : AddFloat1E[51:0];
-   assign mantissaB = AddSwapE ? AddFloat1E[51:0] : AddFloat2E[51:0];
-   assign AddSignAE     = AddSwapE ? AddFloat2E[63] : AddFloat1E[63];   
+   assign AddExponentE = AddSwapE ? YExpE : XExpE;
+   assign AddExpPostSumE = AddSwapE ? YExpE : XExpE;
+   assign mantissaA = AddSwapE ? YManE[51:0] : XManE[51:0];
+   assign mantissaB = AddSwapE ? XManE[51:0] : YManE[51:0];
+   assign AddSignAE     = AddSwapE ? YSgnE : XSgnE;   

   // Leading-Zero Detector. Determine the size of the shift needed for
   // normalization. If sum_corrected is all zeros, the exp_valid is 
@ -201,8 +205,8 @@ module fpuaddcvt1 (
   // and loss of sign information. The two bits to the right of the 
   // original mantissa form the "guard" and "round" bits that are used
   // to round the result. 
-   assign AddOpANormE = AddSwapE ? AddOp2NormE : AddOp1NormE;
-   assign AddOpBNormE = AddSwapE ? AddOp1NormE : AddOp2NormE;
+   assign AddOpANormE = AddSwapE ? YNormE : XNormE;
+   assign AddOpBNormE = AddSwapE ? XNormE : YNormE;
   assign mantissaA1 = {2'h0, AddOpANormE, mantissaA[51:0]&{52{AddOpANormE}}, 2'h0};
   assign mantissaB1 = {2'h0, AddOpBNormE, mantissaB[51:0]&{52{AddOpBNormE}}, 2'h0};

@ -223,19 +227,18 @@ module fpuaddcvt1 (
   // and the exponent value is left unchanged. 
   // Under denormalized cases, the exponent before the rounder is set to 1
   // if the normal shift value is 11.
-   assign AddConvertE       = ~FOpCtrlE[2] & FOpCtrlE[1];
-   assign mantissaA3    = (FOpCtrlE[3]) ? (FOpCtrlE[0] ? AddFloat1E : ~AddFloat1E) : (AddDenormInE ? ({12'h0, mantissaA}) : (AddConvertE ? IntValue : {mantissaA1, 7'h0}));
+   assign mantissaA3    = AddDenormInE ? ({12'h0, mantissaA}) : {mantissaA1, 7'h0};

   // Put zero in for mantissaB3, if zeroB is one. Otherwise, B is extended to 
   // 64-bits by setting the 7 LSBs to the Sticky_out bit followed by six  
   // zeros. 
-   assign mantissaB3[63:7] = (FOpCtrlE[3]) ? (57'h0) : (AddDenormInE ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
-   assign mantissaB3[6]    = (FOpCtrlE[3]) ? (1'b0) : (AddDenormInE ? mantissaB[6] : Sticky_out & ~zeroB);
-   assign mantissaB3[5:0]  = (FOpCtrlE[3]) ? (6'h01) : (AddDenormInE ? mantissaB[5:0] : 6'h0);
+   assign mantissaB3[63:7] = AddDenormInE ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}};
+   assign mantissaB3[6]    = AddDenormInE ? mantissaB[6] : Sticky_out & ~zeroB;
+   assign mantissaB3[5:0]  = AddDenormInE ? mantissaB[5:0] : 6'h0;

   // The sign of the result needs to be corrected if the true
   // operation is subtraction and the input operands were swapped. 
-   assign AddCorrSignE = ~FOpCtrlE[2]&~FOpCtrlE[1]&FOpCtrlE[0]&AddSwapE;
+   assign AddCorrSignE = ~FOpCtrlE[1]&FOpCtrlE[0]&AddSwapE;

   // 64-bit Mantissa Adder/Subtractor
   cla64 add1 (AddSumE, mantissaA3, mantissaB3, sub); //***adder
@ -281,31 +284,31 @@ endmodule // fpadd


 module fpuaddcvt2 (
-   input [2:0] 	FrmM,		// Rounding mode - specify values 
-   input [3:0]	FOpCtrlM,	// Function opcode
-   input 	FmtM,   		// Result Precision (0 for double, 1 for single)
-   input [63:0] AddSumM, AddSumTcM,
-   input [63:0] 	 AddFloat1M, 
-   input [63:0] 	 AddFloat2M,
-   input [11:0]	 AddExp1DenormM, AddExp2DenormM,
-   input [10:0] 	 AddExponentM, AddExpPostSumM,
-   input [3:0] 	 AddSelInvM,
-   input		 AddOp1NormM, AddOp2NormM,
-   input		 AddOpANormM, AddOpBNormM,
-   input		 AddInvalidM,
-   input 	 AddDenormInM, 
-   input 	 AddSignAM, 
-   input         AddCorrSignM,
-   input 	 AddConvertM,
-   input          AddSwapM,
+   input logic [2:0] 	FrmM,		// Rounding mode - specify values 
+   input logic [2:0]	FOpCtrlM,	// Function opcode
+   input logic 	FmtM,   		// Result Precision (1 for double, 0 for single)
+   input logic [63:0] AddSumM, AddSumTcM,
+   input logic [11:0]	 AddExp1DenormM, AddExp2DenormM,
+   input logic [10:0] 	 AddExponentM, AddExpPostSumM,
+   input logic [3:0] 	 AddSelInvM,
+   input logic XSgnM, YSgnM,
+   input logic [52:0] XManM, YManM,
+   input logic [10:0] XExpM, YExpM,
+   input logic XNormM, YNormM,
+   input logic		 AddOpANormM, AddOpBNormM,
+   input logic		 AddInvalidM,
+   input logic 	 AddDenormInM, 
+   input logic 	 AddSignAM, 
+   input logic         AddCorrSignM,
+   input logic          AddSwapM,

-   output [63:0] FAddResM,	// Result of operation
-   output [4:0]  FAddFlgM   	// IEEE exception flags 
+   output logic [63:0] FAddResM,	// Result of operation
+   output logic [4:0]  FAddFlgM   	// IEEE exception flags 
 );
   wire 	 AddDenormM;   	// AddDenormM on input or output   

   wire          P;
-   assign P = ~FmtM;
+   assign P = ~(FmtM^FOpCtrlM[1]);

   wire [10:0]   exp_pre;
   wire [63:0] 	 Result;   
@ -338,15 +341,15 @@ module fpuaddcvt2 (
   //cases/conversion cases
   assign exp_pre       = AddDenormInM ?
                          ((norm_shift == 6'b001011) ? 11'b00000000001 : (AddSwapM ? AddExp2DenormM[10:0] : AddExp1DenormM[10:0]))
-                          : (AddConvertM ? 11'b10000111100 : AddExponentM);
+                          : AddExponentM;


   // Finds normal underflow result to determine whether to round final AddExponentM down
   // Comparison between each float and the resulting AddSumM of the primary cla adder/subtractor and cla subtractor
-   assign Float1_sum_comp = (AddFloat1M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
-   assign Float2_sum_comp = (AddFloat2M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
-   assign Float1_sum_tc_comp = (AddFloat1M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
-   assign Float2_sum_tc_comp = (AddFloat2M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
+   assign Float1_sum_comp = ~(XManM[51:0] > AddSumM[51:0]);
+   assign Float2_sum_comp = ~(YManM[51:0] > AddSumM[51:0]);
+   assign Float1_sum_tc_comp = ~(XManM[51:0] > AddSumTcM[51:0]);
+   assign Float2_sum_tc_comp = ~(YManM[51:0] > AddSumTcM[51:0]);

   // Determines the correct Float value to compare based on AddSwapM result
   assign mantissa_comp_sum = AddSwapM ? Float2_sum_comp : Float1_sum_comp;
@ -357,16 +360,16 @@ module fpuaddcvt2 (

   // If the signs are different and both operands aren't denormalized
   // the normal underflow bit is needed and therefore updated.
-   assign normal_underflow = ((AddFloat1M[63] ~^ AddFloat2M[63]) & (AddOpANormM | AddOpBNormM)) ? mantissa_comp : 1'b0;
+   assign normal_underflow = ((XSgnM ^ YSgnM) & (AddOpANormM | AddOpBNormM)) ? mantissa_comp : 1'b0;

   // Determine the correct sign of the result
-   assign sign_corr = ((AddCorrSignM ^ AddSignAM) & ~AddConvertM) ^ AddSumM[63];   
+   assign sign_corr = (AddCorrSignM ^ AddSignAM) ^ AddSumM[63];   
   
   // If the AddSumM is negative, use its two complement instead. 
   // This value has to be 64-bits to correctly handle the 
   // case 10...00
-   assign sum_corr = (AddDenormInM & (AddOpANormM | AddOpBNormM) & ( ( (AddFloat1M[63] ~^ AddFloat2M[63]) & FOpCtrlM[0] ) | ((AddFloat1M[63] ^ AddFloat2M[63]) & ~FOpCtrlM[0]) ))
-			 ? (AddSumM[63] ? AddSumM : AddSumTcM) : ( (FOpCtrlM[3]) ? AddSumM : (AddSumM[63] ? AddSumTcM : AddSumM));
+   assign sum_corr = (AddDenormInM & (AddOpANormM | AddOpBNormM) & ( ( (XSgnM ~^ YSgnM) & FOpCtrlM[0] ) | ((XSgnM ^ YSgnM) & ~FOpCtrlM[0]) ))
+			 ? (AddSumM[63] ? AddSumM : AddSumTcM) : (AddSumM[63] ? AddSumTcM : AddSumM);

   // Finds normal underflow result to determine whether to round final AddExponentM down
   //KEP used to be (AddSumM == 16'h0) not sure what it is supposed to be
@ -384,7 +387,7 @@ module fpuaddcvt2 (
   // be right shifted. It outputs the normalized AddSumM. 
   barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift_denorm);
  
-   assign sum_norm_w_bypass = (FOpCtrlM[3]) ? (FOpCtrlM[0] ? ~sum_corr : sum_corr) : (sum_norm);
+   assign sum_norm_w_bypass = sum_norm;

   // Round the mantissa to a 52-bit value, with the leading one
   // removed. If the result is a single precision number, the actual 
@ -397,10 +400,10 @@ module fpuaddcvt2 (
   // help in processor reservation station detection of load/stores. In
   // other words, the processor would like to know ahead of time that
   // if the result is an exception then don't load or store.
-   rounder round1 (Result, DenormIO, FlagsIn, FrmM, P, AddOvEnM, AddUnEnM, exp_valid, 
-		   AddSelInvM, AddInvalidM, AddDenormInM, AddConvertM, sign_corr, exp_pre, norm_shift, sum_norm_w_bypass,
-		   AddExpPostSumM, AddOp1NormM, AddOp2NormM, AddFloat1M[63:52], AddFloat2M[63:52],
-		   AddNormOvflowM, normal_underflow, AddSwapM, FOpCtrlM, AddSumM);
+   rounder round1 (.Result, .DenormIO, .Flags(FlagsIn), .rm(FrmM), .P, .OvEn(AddOvEnM), .UnEn(AddUnEnM), .exp_valid, 
+		   .sel_inv(AddSelInvM), .Invalid(AddInvalidM), .DenormIn(AddDenormInM), .Asign(sign_corr), .Aexp(exp_pre), .norm_shift, .A(sum_norm_w_bypass),
+		   .exponent_postsum(AddExpPostSumM), .A_Norm(XNormM), .B_Norm(YNormM), .exp_A_unmodified({XSgnM, XExpM}), .exp_B_unmodified({YSgnM, YExpM}),
+		   .normal_overflow(AddNormOvflowM), .normal_underflow, .swap(AddSwapM), .op_type(FOpCtrlM), .sum(AddSumM));

   // Store the final result and the exception flags in registers.
   assign FAddResM = Result;
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@ -9,7 +9,7 @@ module fctrl (
  output logic       FRegWriteD,  // FP register write enable
  output logic       FDivStartD,  // Start division or squareroot
  output logic [2:0] FResultSelD, // select result to be written to fp register
-  output logic [3:0] FOpCtrlD,    // chooses which opperation to do - specifics shown at bottom of module and in each unit
+  output logic [2:0] FOpCtrlD,    // chooses which operation to do - specifics shown at bottom of module and in each unit
  output logic [1:0] FResSelD,    // select one of the results done in the memory stage
  output logic [1:0] FIntResSelD, // select the result that will be written to the integer register
  output logic       FmtD,        // precision - single-0 double-1
@ -24,82 +24,82 @@ module fctrl (
    case(OpD)
    // FRegWrite_FWriteInt_FResultSel_FOpCtrl_FResSel_FIntResSel_FDivStart_IllegalFPUInstr
      7'b0000111: case(Funct3D)
-                    3'b010:  ControlsD = `FCTRLW'b1_0_000_0000_00_00_0_0; // flw
-                    3'b011:  ControlsD = `FCTRLW'b1_0_000_0001_00_00_0_0; // fld
-                    default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                    3'b010:  ControlsD = `FCTRLW'b1_0_000_000_00_00_0_0; // flw
+                    3'b011:  ControlsD = `FCTRLW'b1_0_000_001_00_00_0_0; // fld
+                    default: ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                  endcase
      7'b0100111: case(Funct3D)
-                    3'b010:  ControlsD = `FCTRLW'b0_0_000_0010_00_00_0_0; // fsw
-                    3'b011:  ControlsD = `FCTRLW'b0_0_000_0011_00_00_0_0; // fsd
-                    default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                    3'b010:  ControlsD = `FCTRLW'b0_0_000_010_00_00_0_0; // fsw
+                    3'b011:  ControlsD = `FCTRLW'b0_0_000_011_00_00_0_0; // fsd
+                    default: ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                  endcase
-      7'b1000011:   ControlsD = `FCTRLW'b1_0_001_0000_00_00_0_0; // fmadd
-      7'b1000111:   ControlsD = `FCTRLW'b1_0_001_0001_00_00_0_0; // fmsub
-      7'b1001011:   ControlsD = `FCTRLW'b1_0_001_0010_00_00_0_0; // fnmsub
-      7'b1001111:   ControlsD = `FCTRLW'b1_0_001_0011_00_00_0_0; // fnmadd
+      7'b1000011:   ControlsD = `FCTRLW'b1_0_001_000_00_00_0_0; // fmadd
+      7'b1000111:   ControlsD = `FCTRLW'b1_0_001_001_00_00_0_0; // fmsub
+      7'b1001011:   ControlsD = `FCTRLW'b1_0_001_010_00_00_0_0; // fnmsub
+      7'b1001111:   ControlsD = `FCTRLW'b1_0_001_011_00_00_0_0; // fnmadd
      7'b1010011: casez(Funct7D)
-                    7'b00000??: ControlsD = `FCTRLW'b1_0_010_0000_00_00_0_0; // fadd
-                    7'b00001??: ControlsD = `FCTRLW'b1_0_010_0001_00_00_0_0; // fsub
-                    7'b00010??: ControlsD = `FCTRLW'b1_0_001_0100_00_00_0_0; // fmul
-                    7'b00011??: ControlsD = `FCTRLW'b1_0_011_0000_00_00_1_0; // fdiv
-                    7'b01011??: ControlsD = `FCTRLW'b1_0_011_0001_00_00_1_0; // fsqrt
+                    7'b00000??: ControlsD = `FCTRLW'b1_0_010_000_00_00_0_0; // fadd
+                    7'b00001??: ControlsD = `FCTRLW'b1_0_010_001_00_00_0_0; // fsub
+                    7'b00010??: ControlsD = `FCTRLW'b1_0_001_100_00_00_0_0; // fmul
+                    7'b00011??: ControlsD = `FCTRLW'b1_0_011_000_00_00_1_0; // fdiv
+                    7'b01011??: ControlsD = `FCTRLW'b1_0_011_001_00_00_1_0; // fsqrt
                    7'b00100??: case(Funct3D)
-                                  3'b000:  ControlsD = `FCTRLW'b1_0_100_0000_01_00_0_0; // fsgnj
-                                  3'b001:  ControlsD = `FCTRLW'b1_0_100_0001_01_00_0_0; // fsgnjn
-                                  3'b010:  ControlsD = `FCTRLW'b1_0_100_0010_01_00_0_0; // fsgnjx
-                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                  3'b000:  ControlsD = `FCTRLW'b1_0_100_000_01_00_0_0; // fsgnj
+                                  3'b001:  ControlsD = `FCTRLW'b1_0_100_001_01_00_0_0; // fsgnjn
+                                  3'b010:  ControlsD = `FCTRLW'b1_0_100_010_01_00_0_0; // fsgnjx
+                                  default: ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b00101??: case(Funct3D)
-                                  3'b000:  ControlsD = `FCTRLW'b1_0_100_0111_10_00_0_0; // fmin
-                                  3'b001:  ControlsD = `FCTRLW'b1_0_100_0101_10_00_0_0; // fmax
-                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                  3'b000:  ControlsD = `FCTRLW'b1_0_100_111_10_00_0_0; // fmin
+                                  3'b001:  ControlsD = `FCTRLW'b1_0_100_101_10_00_0_0; // fmax
+                                  default: ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b10100??: case(Funct3D)
-                                  3'b010:  ControlsD = `FCTRLW'b0_1_100_0010_00_00_0_0; // feq
-                                  3'b001:  ControlsD = `FCTRLW'b0_1_100_0001_00_00_0_0; // flt
-                                  3'b000:  ControlsD = `FCTRLW'b0_1_100_0011_00_00_0_0; // fle
-                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                  3'b010:  ControlsD = `FCTRLW'b0_1_100_010_00_00_0_0; // feq
+                                  3'b001:  ControlsD = `FCTRLW'b0_1_100_001_00_00_0_0; // flt
+                                  3'b000:  ControlsD = `FCTRLW'b0_1_100_011_00_00_0_0; // fle
+                                  default: ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b11100??: if (Funct3D == 3'b001)
-                                  ControlsD = `FCTRLW'b0_1_100_0000_00_10_0_0; // fclass
-                                else if (Funct3D[1:0] == 2'b00) ControlsD = `FCTRLW'b0_1_100_0100_00_01_0_0; // fmv.x.w
-                                else if (Funct3D[1:0] == 2'b01) ControlsD = `FCTRLW'b0_1_100_0101_00_01_0_0; // fmv.x.d
-                                else                            ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                  ControlsD = `FCTRLW'b0_1_100_000_00_10_0_0; // fclass
+                                else if (Funct3D[1:0] == 2'b00) ControlsD = `FCTRLW'b0_1_100_100_00_01_0_0; // fmv.x.w
+                                else if (Funct3D[1:0] == 2'b01) ControlsD = `FCTRLW'b0_1_100_101_00_01_0_0; // fmv.x.d
+                                else                            ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                    7'b1101000: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0001_11_00_0_0; // fcvt.s.w
-                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0101_11_00_0_0; // fcvt.s.wu
-                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1001_11_00_0_0; // fcvt.s.l
-                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1101_11_00_0_0; // fcvt.s.lu
+                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_000_11_00_0_0; // fcvt.s.w
+                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_010_11_00_0_0; // fcvt.s.wu
+                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_100_11_00_0_0; // fcvt.s.l
+                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_110_11_00_0_0; // fcvt.s.lu
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b1100000: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0010_11_11_0_0; // fcvt.w.s
-                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0110_11_11_0_0; // fcvt.wu.s
-                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1010_11_11_0_0; // fcvt.l.s
-                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1110_11_11_0_0; // fcvt.lu.s
-                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_001_11_11_0_0; // fcvt.w.s
+                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_011_11_11_0_0; // fcvt.wu.s
+                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_101_11_11_0_0; // fcvt.l.s
+                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_111_11_11_0_0; // fcvt.lu.s
+                                  default: ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                                endcase
-                    7'b1111000: ControlsD = `FCTRLW'b1_0_100_0000_00_00_0_0; // fmv.w.x
-                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_0111_00_00_0_0; // fcvt.s.d
+                    7'b1111000: ControlsD = `FCTRLW'b1_0_100_000_00_00_0_0; // fmv.w.x
+                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_111_00_00_0_0; // fcvt.s.d
                    7'b1101001: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0001_11_00_0_0; // fcvt.d.w
-                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0101_11_00_0_0; // fcvt.d.wu
-                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1001_11_00_0_0; // fcvt.d.l
-                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1101_11_00_0_0; // fcvt.d.lu
-                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_000_11_00_0_0; // fcvt.d.w
+                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_010_11_00_0_0; // fcvt.d.wu
+                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_100_11_00_0_0; // fcvt.d.l
+                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_110_11_00_0_0; // fcvt.d.lu
+                                  default: ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b1100001: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0010_11_11_0_0; // fcvt.w.d
-                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0110_11_11_0_0; // fcvt.wu.d
-                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1010_11_11_0_0; // fcvt.l.d
-                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1110_11_11_0_0; // fcvt.lu.d
-                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_001_11_11_0_0; // fcvt.w.d
+                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_011_11_11_0_0; // fcvt.wu.d
+                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_101_11_11_0_0; // fcvt.l.d
+                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_111_11_11_0_0; // fcvt.lu.d
+                                  default: ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                                endcase
-                    7'b1111001: ControlsD = `FCTRLW'b1_0_100_0001_00_00_0_0; // fmv.d.x
-                    7'b0100001: ControlsD = `FCTRLW'b1_0_010_0111_00_00_0_0; // fcvt.d.s
-                    default:    ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                    7'b1111001: ControlsD = `FCTRLW'b1_0_100_001_00_00_0_0; // fmv.d.x
+                    7'b0100001: ControlsD = `FCTRLW'b1_0_010_111_00_00_0_0; // fcvt.d.s
+                    default:    ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
                  endcase
-      default:      ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+      default:      ControlsD = `FCTRLW'b0_0_000_000_00_00_0_1; // non-implemented instruction
    endcase

  // unswizzle control bits
@ -117,7 +117,7 @@ module fctrl (
  // Precision
  //    0-single
  //    1-double
-  assign FmtD = FResultSelD == 3'b000 ? Funct3D[0] : OpD[6:1] == 6'b010000 ? ~Funct7D[0] : Funct7D[0];
+  assign FmtD = FResultSelD == 3'b000 ? Funct3D[0] : FResultSelD == 3'b010 ? Funct7D[0]^FOpCtrlD[1] : OpD[6:1] == 6'b010000 ? ~Funct7D[0] : Funct7D[0];

  // FResultSel:
  //    000 - ReadRes - load
--- a/wally-pipelined/src/fpu/fcvt.sv
+++ b/wally-pipelined/src/fpu/fcvt.sv
@ -11,7 +11,7 @@ module fcvt (
    input logic             XDenormE,   // is X denormalized
    input logic [10:0]      BiasE,      // bias - depends on precision (max exponent/2)
    input logic [`XLEN-1:0] SrcAE,      // integer input
-    input logic [3:0]       FOpCtrlE,   // chooses which instruction is done (full list below)
+    input logic [2:0]       FOpCtrlE,   // chooses which instruction is done (full list below)
    input logic [2:0]       FrmE,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
    input logic             FmtE,       // precision 1 = double 0 = single
    output logic [63:0]     CvtResE,    // convert final result
@ -43,27 +43,27 @@ module fcvt (
    logic               RoundSgn;           // sign of the rounded result

    // FOpCtrlE:
-      //  fcvt.w.s  = 0010
-      //  fcvt.wu.s = 0110
-      //  fcvt.s.w  = 0001
-      //  fcvt.s.wu = 0101
-      //  fcvt.l.s  = 1010
-      //  fcvt.lu.s = 1110
-      //  fcvt.s.l  = 1001
-      //  fcvt.s.lu = 1101
-      //  fcvt.w.d  = 0010 
-      //  fcvt.wu.d = 0110
-      //  fcvt.d.w  = 0001
-      //  fcvt.d.wu = 0101
-      //  fcvt.l.d  = 1010
-      //  fcvt.lu.d = 1110
-      //  fcvt.d.l  = 1001
-      //  fcvt.d.lu = 1101
+      //  fcvt.w.s  = 001
+      //  fcvt.wu.s = 011
+      //  fcvt.s.w  = 000
+      //  fcvt.s.wu = 010
+      //  fcvt.l.s  = 101
+      //  fcvt.lu.s = 111
+      //  fcvt.s.l  = 100
+      //  fcvt.s.lu = 110
+      //  fcvt.w.d  = 001 
+      //  fcvt.wu.d = 011
+      //  fcvt.d.w  = 000
+      //  fcvt.d.wu = 010
+      //  fcvt.l.d  = 101
+      //  fcvt.lu.d = 111
+      //  fcvt.d.l  = 100
+      //  fcvt.d.lu = 110
      //  {long, unsigned, to int}
   
    // calculate signals based off the input and output's size
-    assign Res64 = (FOpCtrlE[1]&FOpCtrlE[3]) | (FmtE&FOpCtrlE[0]);
-    assign In64 =  (FOpCtrlE[0]&FOpCtrlE[3]) | (FmtE&FOpCtrlE[1]);
+    assign Res64 = (FOpCtrlE[0]&FOpCtrlE[2]) | (FmtE&~FOpCtrlE[0]);
+    assign In64 =  (~FOpCtrlE[0]&FOpCtrlE[2]) | (FmtE&FOpCtrlE[0]);
    assign SubBits = In64 ? 8'd64 : 8'd32;
    assign Bits = Res64 ? 8'd64 : 8'd32;

@ -73,11 +73,11 @@ module fcvt (
 ////////////////////////////////////////////////////////

    // position the input in the most significant bits
-    assign IntIn = FOpCtrlE[3] ? {SrcAE, {64-`XLEN{1'b0}}} : {SrcAE[31:0], 32'b0};
+    assign IntIn = FOpCtrlE[2] ? {SrcAE, {64-`XLEN{1'b0}}} : {SrcAE[31:0], 32'b0};
    // make the integer positive
-    assign PosInt = IntIn[64-1]&~FOpCtrlE[2] ? -IntIn : IntIn;
+    assign PosInt = IntIn[64-1]&~FOpCtrlE[1] ? -IntIn : IntIn;
    // determine the integer's sign
-    assign ResSgn = ~FOpCtrlE[2] ? IntIn[64-1] : 1'b0;
+    assign ResSgn = ~FOpCtrlE[1] ? IntIn[64-1] : 1'b0;
    
 	// Leading one detector
 	logic [8:0]	i;
@ -97,8 +97,8 @@ module fcvt (


    // select the shift value and amount based on operation (to fp or int)
-    assign ShiftCnt = FOpCtrlE[1] ? ExpVal : LZResP;
-    assign ShiftVal = FOpCtrlE[1] ? {{64-2{1'b0}}, XManE} : {PosInt, 52'b0};
+    assign ShiftCnt = FOpCtrlE[0] ? ExpVal : LZResP;
+    assign ShiftVal = FOpCtrlE[0] ? {{64-2{1'b0}}, XManE} : {PosInt, 52'b0};

 	// if shift = -1 then shift one bit right for guard bit (right shifting twice never rounds)
 	// if the shift is negative add a bit for sticky bit calculation
@ -111,35 +111,35 @@ module fcvt (
    // calculate sticky bit 
    //  - take into account the possible right shift from before
    //  - the sticky bit calculation covers three diffrent sizes depending on the opperation
-    assign Sticky = |ShiftedManTmp[49:0] | &ShiftCnt&XManE[0] | (FOpCtrlE[0]&|ShiftedManTmp[62:50]) | (FOpCtrlE[0]&~FmtE&|ShiftedManTmp[91:63]);
+    assign Sticky = |ShiftedManTmp[49:0] | &ShiftCnt&XManE[0] | (~FOpCtrlE[0]&|ShiftedManTmp[62:50]) | (~FOpCtrlE[0]&~FmtE&|ShiftedManTmp[91:63]);

    
    // determine guard, round, and least significant bit of the result
-    assign Guard = FOpCtrlE[1] ? ShiftedMan[1] : FmtE ? ShiftedMan[13] : ShiftedMan[42];
-    assign Round = FOpCtrlE[1] ? ShiftedMan[0] : FmtE ? ShiftedMan[12] : ShiftedMan[41];
-    assign LSB = FOpCtrlE[1] ? ShiftedMan[2] : FmtE ? ShiftedMan[14] : ShiftedMan[43];
+    assign Guard = FOpCtrlE[0] ? ShiftedMan[1] : FmtE ? ShiftedMan[13] : ShiftedMan[42];
+    assign Round = FOpCtrlE[0] ? ShiftedMan[0] : FmtE ? ShiftedMan[12] : ShiftedMan[41];
+    assign LSB = FOpCtrlE[0] ? ShiftedMan[2] : FmtE ? ShiftedMan[14] : ShiftedMan[43];

    always_comb begin
        // Determine if you add 1
        case (FrmE)
            3'b000: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky&LSB));//round to nearest even
            3'b001: CalcPlus1 = 0;//round to zero
-            3'b010: CalcPlus1 = (XSgnE&FOpCtrlE[1]) | (ResSgn&FOpCtrlE[0]);//round down
-            3'b011: CalcPlus1 = (~XSgnE&FOpCtrlE[1]) | (~ResSgn&FOpCtrlE[0]);//round up
+            3'b010: CalcPlus1 = (XSgnE&FOpCtrlE[0]) | (ResSgn&~FOpCtrlE[0]);//round down
+            3'b011: CalcPlus1 = (~XSgnE&FOpCtrlE[0]) | (~ResSgn&~FOpCtrlE[0]);//round up
            3'b100: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky));//round to nearest max magnitude
            default: CalcPlus1 = 1'bx;
        endcase
    end

    // don't round if the result is exact
-    assign Plus1 = CalcPlus1 & (Guard|Round|Sticky)&~(XZeroE&FOpCtrlE[1]);
+    assign Plus1 = CalcPlus1 & (Guard|Round|Sticky)&~(XZeroE&FOpCtrlE[0]);

    // round the shifted mantissa
    assign RoundedTmp = ShiftedMan[64+1:2] + Plus1;
    assign {ResExp, ResFrac} = FmtE ? {TmpExp, ShiftedMan[64+1:14]} + Plus1 :  {{TmpExp, ShiftedMan[64+1:43]} + Plus1, 29'b0} ;

    // fit the rounded result into the appropriate size and take the 2's complement if needed
-     assign Rounded = Res64 ? XSgnE&FOpCtrlE[1] ? -RoundedTmp[63:0] : RoundedTmp[63:0] : 
+     assign Rounded = Res64 ? XSgnE&FOpCtrlE[0] ? -RoundedTmp[63:0] : RoundedTmp[63:0] : 
 			      XSgnE ? {{32{1'b1}}, -RoundedTmp[31:0]} : {32'b0, RoundedTmp[31:0]};

    // extract the MSB and Sign for later use (will be used to determine underflow and overflow)
@ -148,29 +148,29 @@ module fcvt (


    // check if the result overflows
-    assign Of = (~XSgnE&($signed(ShiftCnt) >= $signed(Bits))) | (~XSgnE&RoundSgn&~FOpCtrlE[2]) | (RoundMSB&(ShiftCnt==(Bits-1))) | (~XSgnE&XInfE) | XNaNE;
+    assign Of = (~XSgnE&($signed(ShiftCnt) >= $signed(Bits))) | (~XSgnE&RoundSgn&~FOpCtrlE[1]) | (RoundMSB&(ShiftCnt==(Bits-1))) | (~XSgnE&XInfE) | XNaNE;

    // check if the result underflows (this calculation changes if the result is signed or unsigned)
-    assign Uf = FOpCtrlE[2] ? XSgnE&~XZeroE | (XSgnE&XInfE) | (XSgnE&~XZeroE&(~ShiftCnt[12]|CalcPlus1)) | (ShiftCnt[12]&Plus1) : (XSgnE&XInfE) | (XSgnE&($signed(ShiftCnt) >= $signed(Bits))) | (XSgnE&~RoundSgn&~ShiftCnt[12]);    // assign CvtIntRes =  (XSgnE | ShiftCnt[12]) ? {64{1'b0}}  : (ShiftCnt >= 64) ? {64{1'b1}} : Rounded;
+    assign Uf = FOpCtrlE[1] ? XSgnE&~XZeroE | (XSgnE&XInfE) | (XSgnE&~XZeroE&(~ShiftCnt[12]|CalcPlus1)) | (ShiftCnt[12]&Plus1) : (XSgnE&XInfE) | (XSgnE&($signed(ShiftCnt) >= $signed(Bits))) | (XSgnE&~RoundSgn&~ShiftCnt[12]);    // assign CvtIntRes =  (XSgnE | ShiftCnt[12]) ? {64{1'b0}}  : (ShiftCnt >= 64) ? {64{1'b1}} : Rounded;
    
    // calculate the result's sign
-    assign SgnRes = ~FOpCtrlE[3] & FOpCtrlE[1];
+    assign SgnRes = ~FOpCtrlE[2] & FOpCtrlE[0];

    // select the integer result
-    assign CvtIntRes = Of ? FOpCtrlE[2] ? {64{1'b1}} : SgnRes ? {33'b0, {31{1'b1}}}: {1'b0, {63{1'b1}}} : 
-                    Uf ? FOpCtrlE[2] ? 64'b0 : SgnRes ? {32'b0, 1'b1, 31'b0} : {1'b1, 63'b0} :
+    assign CvtIntRes = Of ? FOpCtrlE[1] ? {64{1'b1}} : SgnRes ? {33'b0, {31{1'b1}}}: {1'b0, {63{1'b1}}} : 
+                    Uf ? FOpCtrlE[1] ? 64'b0 : SgnRes ? {32'b0, 1'b1, 31'b0} : {1'b1, 63'b0} :
 		            Rounded[64-1:0];

    // select the floating point result            
    assign CvtFPRes = FmtE ? {ResSgn, ResExp, ResFrac} : {{32{1'b1}}, ResSgn, ResExp[7:0], ResFrac[51:29]};

    // select the result
-    assign CvtResE = FOpCtrlE[0] ? CvtFPRes : CvtIntRes;
+    assign CvtResE = ~FOpCtrlE[0] ? CvtFPRes : CvtIntRes;

    // calculate the flags
    //      - to int only sets the invalid flag
    //      - from int only sets the inexact flag
-    assign CvtFlgE = {(Of | Uf)&FOpCtrlE[1], 3'b0, (Guard|Round|Sticky)&FOpCtrlE[0]};
+    assign CvtFlgE = {(Of | Uf)&FOpCtrlE[0], 3'b0, (Guard|Round|Sticky)&~FOpCtrlE[0]};



--- a/wally-pipelined/src/fpu/fma.sv
+++ b/wally-pipelined/src/fpu/fma.sv
@ -23,7 +23,7 @@
 ///////////////////////////////////////////

 `include "wally-config.vh"
-// `include "../../../config/rv64icfd/wally-config.vh"
+//  `include "../../../config/rv64icfd/wally-config.vh"

 module fma(
    input logic                 clk,
@ -106,6 +106,7 @@ module fma1(
    logic [`NE+1:0]     AlignCnt;           // how far to shift the addend to align with the product in Q(NE+2.0) format
    logic [4*`NF+5:0]   ZManShifted;        // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
    logic [4*`NF+5:0]   ZManPreShifted;     // input to the alignment shifter U(NF+5.3NF+1)
+    logic [`NE-2:0]     Denorm;             // exponent adjustment added for a denormalized input (1 if FmtE, else -126+1023)

    ///////////////////////////////////////////////////////////////////////////////
    // Calculate the product
@ -116,8 +117,9 @@ module fma1(
    ///////////////////////////////////////////////////////////////////////////////
   
    // verilator lint_off WIDTH
+    assign Denorm = FmtE ? 1 : -126+1023;
    assign ProdExpE = (XZeroE|YZeroE) ? 0 :
-                 XExpE + YExpE - BiasE + XDenormE + YDenormE;
+                 XExpE + YExpE - BiasE + ({`NE-1{XDenormE}}&Denorm) + ({`NE-1{YDenormE}}&Denorm);
    // verilator lint_on WIDTH

    // Calculate the product's mantissa
@ -133,7 +135,7 @@ module fma1(
    //      - positive means the product is larger, so shift Z right
    //      - Denormal numbers have an an exponent value of 1, however they are
    //        represented with an exponent of 0. add one to the exponent if it is a denormal number
-    assign AlignCnt = ProdExpE - ZExpE - ZDenormE;
+    assign AlignCnt = ProdExpE - (ZExpE + ({`NE-1{ZDenormE}}&Denorm));

    // Defualt Addition without shifting
    //          |   54'b0    |  106'b(product)  | 2'b0 |
@ -320,7 +322,9 @@ module fma2(
    //assign FracLen = `NF;

    // Determine if the result is denormal
-    assign SumExpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCnt} - (`NF+4));
+    logic [`NE+1:0] SumExpTmpTmp;
+    assign SumExpTmpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCnt} - (`NF+4));
+    assign SumExpTmp = FmtM ? SumExpTmpTmp : (SumExpTmpTmp-1023+127)&{`NE+2{|SumExpTmpTmp}};

    assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-FracLen)) & ~SumZero;

@ -511,7 +515,7 @@ module fma2(
                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {{32{1'b1}}, ResultSgn, 8'hfe, {23{1'b1}}} :
                                                                                                                          {{32{1'b1}}, ResultSgn, 8'hff, 23'b0};
    assign InvalidResult = FmtM ? {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}} : {{32{1'b1}}, ResultSgn, 8'hff, 1'b1, 22'b0};
-    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZManM[`NF-1:0]} - (Minus1&AddendStickyM) + (Plus1&AddendStickyM)} : {{32{1'b1}}, ResultSgn, {ZExpM[7:0], ZManM[51:29]} - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}};
+    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZManM[`NF-1:0]} - (Minus1&AddendStickyM) + (Plus1&AddendStickyM)} : {{32{1'b1}}, ResultSgn, {ZExpM[`NE-1],ZExpM[6:0], ZManM[51:29]} - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}};
    assign UnderflowResult = FmtM ? {ResultSgn, {`FLEN-1{1'b0}}} + (CalcPlus1&(AddendStickyM|FrmM[1])) : {{32{1'b1}}, {ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}};
    assign FMAResM = XNaNM ? XNaNResult :
                        YNaNM ? YNaNResult :
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@ -75,15 +75,8 @@ module fpdiv (
   // div/sqrt
         //  fdiv  = 0
         //  fsqrt = 1
-
-   // Convert the input operands to their appropriate forms based on 
-   // the orignal operands, the op_type , and their precision P. 
-   // Single precision inputs are converted to double precision 
-   // and the sign of the first operand is set appropratiately based on
-   // if the operation is absolute value or negation.   
-   convert_inputs_div conv1 (.op1, .op2, .op_type, .P, 
-                           // outputs:
-                           .Float1, .Float2b(Float2));
+   assign Float1 = op1;
+   assign Float2 = op_type ? op1 : op2;   

   // Test for exceptions and return the "Invalid Operation" and
   // "Denormalized" Input Flags. The "sel_inv" is used in
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -57,7 +57,7 @@ module fpu (
  //                single stored in a double: | 32 1s | single precision value |
  //    - sets the underflow after rounding
  
-  generate if (`F_SUPPORTED | `D_SUPPORTED) begin 
+  generate if (`F_SUPPORTED | `D_SUPPORTED) begin : fpu

  // control signals
 	logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
@ -67,7 +67,7 @@ module fpu (
 	logic 		  FWriteIntD;                         // Write to integer register
 	logic [1:0] FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
 	logic [2:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
-	logic [3:0] FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which opperation to do in each component
+	logic [2:0] FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which operation to do in each component
 	logic [1:0] FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
 	logic [1:0] FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer resister
 	logic [4:0] Adr1E, Adr2E, Adr3E;                    // adresses of each input
@ -97,7 +97,8 @@ module fpu (
 	logic 		   XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
 	logic 		   XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
 	logic 		   XExpMaxE;                      // is the exponent all ones (max value)
-	logic 		   XNormE;                        // is X normal
+	logic 		   XNormE,YNormE;                 // is the input normal - execute stage
+	logic 		   XNormM,YNormM;                 // is the input normal - memory stage
 	
 	
 	// result and flag signals
@ -171,7 +172,7 @@ module fpu (
 	flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
 	flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
                                                       {Adr1E,         Adr2E,         Adr3E});
-	flopenrc #(18) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+	flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
 				  {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
 				  {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
 	
@ -203,11 +204,11 @@ module fpu (
  // unpacking unit
  //    - splits FP inputs into their various parts
  //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity)
-	unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, 
+	unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FResultSelE, .FmtE, 
                      // outputs:
                      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
                      .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
-                      .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
+                      .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE, .YNormE);

  // FMA
  //    - two stage FMA
@ -222,7 +223,7 @@ module fpu (
 		 .XSgnM, .YSgnM, .ZSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
     .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
     .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
-		 .FOpCtrlE(FOpCtrlE[2:0]), .FOpCtrlM(FOpCtrlM[2:0]), 
+		 .FOpCtrlE, .FOpCtrlM, 
 		 .FmtE, .FmtM, .FrmM, 
     // outputs:
     .FMAFlgM, .FMAResM);
@ -240,10 +241,10 @@ module fpu (
  //    - if not captured any forwarded inputs will change durring computation
  //        - this problem is caused by stalling the execute stage
  //    - the other units don't have this problem, only div/sqrt stalls the execute stage
-	flopenrc #(64) reg_input1 (.d(FSrcXE), .q(DivInput1E),
+	flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
 				   .en(1'b1), .clear(FDivSqrtDoneE),
 				   .reset(reset),  .clk(FDivBusyE));
-	flopenrc #(64) reg_input2 (.d(FSrcYE), .q(DivInput2E),
+	flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
 				   .en(1'b1), .clear(FDivSqrtDoneE),
 				   .reset(reset),  .clk(FDivBusyE));
 	
@ -261,6 +262,8 @@ module fpu (
  //*** remove uneeded logic
  //*** change to use the unpacking unit if possible
 	faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM, .FSrcXE, .FSrcYE, .FOpCtrlE, 
+   .XSgnM, .YSgnM, .XManM, .YManM, .XExpM, .YExpM,
+   .XSgnE, .YSgnE, .XManE, .YManE, .XExpE, .YExpE, .XDenormE, .YDenormE, .XNormE, .YNormE, .XNormM, .YNormM,  .XZeroE, .YZeroE, .XInfE, .YInfE, .XNaNE, .YNaNE, .XSNaNE, .YSNaNE,
                  // outputs:
                  .FAddResM, .FAddFlgM);
 	
@ -269,7 +272,7 @@ module fpu (
  //    - writes to FP file durring min/max instructions
  //    - other comparisons write a 1 or 0 to the integer register
 	fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
-            .FSrcXE, .FSrcYE, .FOpCtrlE(FOpCtrlE[2:0]), 
+            .FSrcXE, .FSrcYE, .FOpCtrlE, 
            .FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
            // outputs:
 		        .Invalid(CmpNVE), .CmpResE);
@ -325,9 +328,9 @@ module fpu (
  
 	flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
 	
-	flopenrc #(17) EMCtrlReg(clk, reset, FlushM, ~StallM,
-				 {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
-				 {FRegWriteM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
+	flopenrc #(18) EMCtrlReg(clk, reset, FlushM, ~StallM,
+				 {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, XNormE, YNormE},
+				 {FRegWriteM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM, XNormM, YNormM});
 	
 	

--- a/wally-pipelined/src/fpu/rounder_denorm.sv
+++ b/wally-pipelined/src/fpu/rounder_denorm.sv
@ -1,4 +1,4 @@
-// The rounder takes as inputs a 64-bit value to be rounded, A, the 
+// The rounder takes as inputs a 64-bit value to be rounded, A, the 
 // exponent of the value to be rounded, the sign of the final result, Sign, 
 // the precision of the results, P, and the two-bit rounding mode, rm. 
 // It produces a rounded 52-bit result, Z, the exponent of the rounded 
@ -17,38 +17,34 @@
 // where , denotes the rounding boundary. S is the logical OR of all the
 // bits to the right of R. 
 
-module rounder (Result, DenormIO, Flags, rm, P, OvEn, 
-		UnEn, exp_valid, sel_inv, Invalid, DenormIn, convert, Asign, Aexp, 
-		norm_shift, A, exponent_postsum, A_Norm, B_Norm, exp_A_unmodified, exp_B_unmodified,
-		normal_overflow, normal_underflow, swap, op_type, sum);
-
-   input  [2:0]  rm;
-   input         P;
-   input         OvEn;
-   input         UnEn;
-   input         exp_valid;
-   input [3:0] 	 sel_inv;
-   input	 Invalid;
-   input	 DenormIn;
-   input         convert;
-   input         Asign;
-   input [10:0]  Aexp;
-   input [5:0] 	 norm_shift;
-   input [63:0]  A;
-   input [10:0]  exponent_postsum;
-   input 	 A_Norm;
-   input 	 B_Norm;
-   input [11:0]  exp_A_unmodified;
-   input [11:0]  exp_B_unmodified;
-   input 	 normal_overflow;
-   input 	 normal_underflow;
-   input 	 swap;
-   input [3:0]	 op_type;
-   input [63:0]  sum;
+module rounder (
+   input logic  [2:0]  rm,
+   input logic         P,
+   input logic         OvEn,
+   input logic         UnEn,
+   input logic         exp_valid,
+   input logic [3:0] 	 sel_inv,
+   input logic	 Invalid,
+   input logic	 DenormIn,
+   input logic         Asign,
+   input logic [10:0]  Aexp,
+   input logic [5:0] 	 norm_shift,
+   input logic [63:0]  A,
+   input logic [10:0]  exponent_postsum,
+   input logic 	 A_Norm,
+   input logic 	 B_Norm,
+   input logic [11:0]  exp_A_unmodified,
+   input logic [11:0]  exp_B_unmodified,
+   input logic 	 normal_overflow,
+   input logic 	 normal_underflow,
+   input logic 	 swap,
+   input logic [2:0]	 op_type,
+   input logic [63:0]  sum,
   
-   output [63:0] Result;
-   output 	 DenormIO;
-   output [4:0]  Flags;
+   output logic [63:0] Result,
+   output logic 	 DenormIO,
+   output logic [4:0]  Flags
+);
   
   wire          Rsign;
   wire 	 Sticky_out;
@ -87,7 +83,6 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   wire 	 Cout_overflow;
   wire		 Texp_l7z;
   wire		 Texp_l7o;
-   wire		 OvCon;

   // Determine the sticky bits for double and single precision
   assign S_DP= A[9]|A[8]|A[7]|A[6]|A[5]|A[4]|A[3]|A[2]|A[1]|A[0];
@ -152,7 +147,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   assign UnFlow_SP = (~Texp[10]&(~Texp[9]|~Texp[8]|~Texp[7]|Texp_l7z));
   
   // Set the overflow and underflow flags. They should not be set if
-   // the input was infinite or NaN or the output of the adder is zero.
+   // the input was infinite or NaN or the output of the adder is zero.
   // 00 = Valid
   // 10 = NaN
   assign Valid = (~sel_inv[2]&~sel_inv[1]&~sel_inv[0]);
@ -164,7 +159,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   assign OverFlow  = (P & OvFlow_SP | OvFlow_DP)&Valid&~UnderFlow&exp_valid;

   // The DenormIO is set if underflow has occurred or if there was a
-   // denormalized input. 
+   // denormalized input. 
   assign DenormIO = DenormIn | UnderFlow;

   // The final result is Inexact if any rounding occurred ((i.e., R or S 
@ -192,7 +187,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   // -0 + +0 = -0 (for RD) 
   assign Rzero = ~exp_valid | UnderFlow;
   assign Rsign = DenormIn ?
-		  ( ~(op_type[2] | op_type[1] | op_type[0]) ? 
+		  ( ~(op_type[1] | op_type[0]) ? 
 		  ( (sum[63] & (A_Norm | B_Norm) & (exp_A_unmodified[11] ^ exp_B_unmodified[11])) ?
 		  ~Asign : Asign) 
   		  : ( ((A_Norm ^ B_Norm) & (exp_A_unmodified[11] ~^ exp_B_unmodified[11])) ?
@ -202,7 +197,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
     	          (sel_inv[2]&~sel_inv[1]&sel_inv[0]&rm[1]&rm[0] |
 	          sel_inv[2]&sel_inv[1]&~sel_inv[0] |		  
 	          ~exp_valid&rm[1]&rm[0]&~sel_inv[2] | 
-	          UnderFlow&rm[1]&rm[0]) & ~convert) & ~sel_inv[3]) |
+	          UnderFlow&rm[1]&rm[0])) & ~sel_inv[3]) |
 		  (Asign & sel_inv[3]) );
   
   // The exponent of the final result is zero if the final result is 
@ -218,7 +213,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   assign VeryLarge = OverFlow & ~OvEn;
   assign Infinite   = (VeryLarge & ~Round_zero) | (~sel_inv[2] & sel_inv[1]);
   assign Largest = VeryLarge & Round_zero;
-   assign Adj_exp = OverFlow & OvEn & ~convert;
+   assign Adj_exp = OverFlow & OvEn;
   assign Rexp[10:1] = ({10{~Valid}} | 
 			{Texp[10]&~Adj_exp, Texp[9]&~Adj_exp, Texp[8], 
 			 (Texp[7]^P)&~(Adj_exp&P), Texp[6]&~(Adj_exp&P), Texp[5:1]} | 
@ -230,7 +225,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   // Depending on the operation and the signs of the original operands,
   // underflow may or may not be needed to round.
   assign Rexp_denorm = DenormIn ? 
-			((~op_type[2] & ~op_type[1] & op_type[0]) ? 
+			((~op_type[1] & op_type[0]) ? 
 				( ((A_Norm != B_Norm) & (exp_A_unmodified[11] == exp_B_unmodified[11])) ? 
 					( (normal_overflow == normal_underflow) ? Texp[10:0] : (normal_overflow ? Texp_addone[10:0] : Texp_subone[10:0]) ) 
 					: ( normal_overflow ? Texp_addone[10:0] : Texp[10:0] ) ) 
@ -238,7 +233,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
 					( (normal_overflow == normal_underflow) ? Texp[10:0] : (normal_overflow ? Texp_addone[10:0] : Texp_subone[10:0]) ) 
 					: ( normal_overflow ? Texp_addone[10:0] : Texp[10:0] ) ) 
 				) : 
-			(op_type[3]) ? exp_A_unmodified[10:0] : Rexp; //KEP used to be all of exp_A_unmodified
+			Rexp; //KEP used to be all of exp_A_unmodified

   // If the result is zero or infinity, the mantissa is all zeros. 
   // If the result is NaN, the mantissa is 10...0
@ -256,10 +251,9 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   // for the final result. A double precision result is returned if 
   // overflow has occurred, the overflow trap is enabled, and a conversion
   // is being performed. 
-   assign OvCon = OverFlow & OvEn & convert;

-   assign Result = (op_type[3]) ? {A[63:0]} : (DenormIn ? {Rsign, Rexp_denorm, ShiftMant} : ((P&~OvCon) ? {{32{1'b1}}, Rsign, Rexp[7:0], Rmant[51:29]}
-	           : {Rsign, Rexp, Rmant}));
+   assign Result = DenormIn ? {Rsign, Rexp_denorm, ShiftMant} : (P ? {{32{1'b1}}, Rsign, Rexp[7:0], Rmant[51:29]}
+	           : {Rsign, Rexp, Rmant});

 endmodule // rounder

--- a/wally-pipelined/src/fpu/unpacking.sv
+++ b/wally-pipelined/src/fpu/unpacking.sv
@ -1,11 +1,12 @@
 module unpacking ( 
    input logic  [63:0] X, Y, Z,
    input logic         FmtE,
+    input logic  [2:0]  FResultSelE,
    input logic  [2:0]  FOpCtrlE,
    output logic        XSgnE, YSgnE, ZSgnE,
    output logic [10:0] XExpE, YExpE, ZExpE,
    output logic [52:0] XManE, YManE, ZManE,
-    output logic XNormE,
+    output logic XNormE, YNormE,
    output logic XNaNE, YNaNE, ZNaNE,
    output logic XSNaNE, YSNaNE, ZSNaNE,
    output logic XDenormE, YDenormE, ZDenormE,
@ -25,12 +26,9 @@ module unpacking (
    assign YSgnE = FmtE ? Y[63] : Y[31];
    assign ZSgnE = FmtE ? Z[63] : Z[31];

-    assign XExpE = FmtE ? X[62:52] : {3'b0, X[30:23]};//{X[30], {3{~X[30]&~XExpZero|XExpMaxE}}, X[29:23]}; 
-    assign YExpE = FmtE ? Y[62:52] : {3'b0, Y[30:23]};//{Y[30], {3{~Y[30]&~YExpZero|YExpMaxE}}, Y[29:23]}; 
-    assign ZExpE = FmtE ? Z[62:52] : {3'b0, Z[30:23]};//{Z[30], {3{~Z[30]&~ZExpZero|ZExpMaxE}}, Z[29:23]}; 
-/*    assign XExpE = FmtE ? X[62:52] : {3'b0, X[30:23]}; // *** maybe convert to full number of bits here?
-    assign YExpE = FmtE ? Y[62:52] : {3'b0, Y[30:23]};
-    assign ZExpE = FmtE ? Z[62:52] : {3'b0, Z[30:23]};*/
+    assign XExpE = FmtE ? X[62:52] : {X[30], {3{~X[30]&~XExpZero|XExpMaxE}}, X[29:23]}; 
+    assign YExpE = FmtE ? Y[62:52] : {Y[30], {3{~Y[30]&~YExpZero|YExpMaxE}}, Y[29:23]}; 
+    assign ZExpE = FmtE ? Z[62:52] : {Z[30], {3{~Z[30]&~ZExpZero|ZExpMaxE}}, Z[29:23]}; 

    assign XFracE = FmtE ? X[51:0] : {X[22:0], 29'b0};
    assign YFracE = FmtE ? Y[51:0] : {Y[22:0], 29'b0};
@ -57,6 +55,7 @@ module unpacking (
    assign ZExpMaxE = FmtE ? &Z[62:52] : &Z[30:23];
  
    assign XNormE = ~(XExpMaxE|XExpZero);
+    assign YNormE = ~YExpZero; // only used in addcvt - checks inf and NaN separately
    
    assign XNaNE = XExpMaxE & ~XFracZero;
    assign YNaNE = YExpMaxE & ~YFracZero;
@ -78,7 +77,6 @@ module unpacking (
    assign YZeroE = YExpZero & YFracZero;
    assign ZZeroE = ZExpZero & ZFracZero;

-    assign BiasE = FmtE ? 13'h3ff : 13'h7f; // *** is it better to convert to full precision exponents so bias isn't needed?
-    // assign BiasE = 13'h3ff; // always use 1023 because exponents are unpacked to double precision
+    assign BiasE = 13'h3ff; // always use 1023 because exponents are unpacked to double precision

 endmodule