Almost all convert instructions pass Imperas tests

2021-07-11 18:06:33 -04:00 · 2021-07-11 18:06:33 -04:00 · 36f59f3c99
commit 36f59f3c99
parent 20f2a4e47c
32 changed files with 948 additions and 5726 deletions
--- a/wally-pipelined/config/rv64icfd/wally-config.vh
+++ b/wally-pipelined/config/rv64icfd/wally-config.vh
@ -46,7 +46,7 @@
 `define MEM_DCACHE 0
 `define MEM_DTIM 1
 `define MEM_ICACHE 0
-`define MEM_VIRTMEM 0\1
+`define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

 `define ITLB_ENTRIES 32
@ -56,10 +56,7 @@
 `define PMP_ENTRIES 16

 // Address space
-`define RESET_VECTOR 64'h0000000080000000
-
-// Bus Interface width
-`define AHBW 64
+`define RESET_VECTOR 64'h80000000

 // Peripheral Addresses
 // Peripheral memory space extends from BASE to BASE+RANGE
@ -84,6 +81,9 @@
 `define PLIC_BASE   56'h0C000000
 `define PLIC_RANGE  56'h03FFFFFF

+// Bus Interface width
+`define AHBW 64
+
 // Test modes

 // Tie GPIO outputs back to inputs
--- a/wally-pipelined/regression/wave-dos/default-waves.do
+++ b/wally-pipelined/regression/wave-dos/default-waves.do
@ -8,7 +8,7 @@ add wave /testbench/clk
 add wave /testbench/reset
 add wave -divider
 #add wave /testbench/dut/hart/ebu/IReadF
-add wave /testbench/dut/hart/DataStall
+#add wave /testbench/dut/hart/DataStall
 add wave /testbench/dut/hart/ICacheStallF
 add wave /testbench/dut/hart/StallF
 add wave /testbench/dut/hart/StallD
--- a/wally-pipelined/src/fpu/FMA/add.sv
+++ b/wally-pipelined/src/fpu/FMA/add.sv
@ -1,65 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// Block Name:	add.v
-// Author:		David Harris
-// Date:		11/12/1995
-//
-// Block Description:
-//       This block performs the addition of the product and addend.   It also
-//   contains logic necessary to adjust the signs for effective subtracts 
-//   and negative results. 
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-module add(rM, sM, tM, sum,
-		   negsum, invz, selsum1, negsum0, negsum1, killprodM);
-////////////////////////////////////////////////////////////////////////////////
-
-	input logic 		[105:0]		rM;     			// partial product 1
-	input logic 		[105:0]		sM;              // partial product 2
-	input logic 		[163:0]		tM;             	// aligned addend 
-	input logic					invz;       	// invert addend
-	input logic 					selsum1;    	// select +1 mode of compound adder 
-	input logic					killprodM;    	// z >> product
-	input logic					negsum;      	// Negate sum 
-	output logic		[163:0]		sum;         	// sum
-	output logic					negsum0;     	// sum was negative in +0 mode
-	output logic					negsum1;     	// sum was negative in +1 mode 
-
-	// Internal nodes
-
-	wire		[105:0]		r2;				// partial product possibly zeroed out
-	wire		[105:0]		s2;				// partial product possibly zeroed out
-	wire		[164:0]		t2;				// addend after inversion if necessary
-	wire		[164:0] 	sum0;			// sum of compound adder +0 mode
-	wire		[164:0] 	sum1;			// sum of compound adder +1 mode
-	wire		[163:0] 	prodshifted;			// shifted partial-product sum; unused (its assign below is commented out)
-	wire		[164:0] 	tmp;			// sign-extended partial-product sum; unused (its assign below is commented out)
-
-	// Invert addend if z's sign is different from the product's sign
-
-	assign t2 = invz ? ~{1'b0,tM} : {1'b0,tM};
-	
-	// Zero out product if Z >> product or product really should be zero
-
-	assign r2 = killprodM ? 106'b0 : rM;
-	assign s2 = killprodM ? 106'b0 : sM;
-
-	//***replace this with a more structural cpa that synthesizes better
-	// Compound adder
-	// Consists of 3:2 CSA followed by long compound CPA
-	//assign prodshifted = killprodM ? 0 : {56'b0, r2+s2, 2'b0};
-	//assign tmp = ({{57{r2[105]}},r2, 2'b0} + {{57{s2[105]}},s2, 2'b0});
-	assign sum0 = t2 + 164'b0 + {57'b0, r2+s2, 2'b0};
-	assign sum1 = t2 + 164'b1 + {57'b0, r2+s2, 2'b0}; // +1 from invert of z above
-	
-	// Check sign bits in +0/1 modes 
-	assign negsum0 = sum0[164];
-	assign negsum1 = sum1[164];
-
-	// Mux proper result (+0/1 mode and inversion) using 4:1 mux
- 	//assign sumzero = |sum;
-	assign sum = selsum1 ? (negsum ? -sum1[163:0] : sum1[163:0]) : (negsum ? -sum0[163:0] : sum0[163:0]);
-	
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/align.sv
+++ b/wally-pipelined/src/fpu/FMA/align.sv
@ -1,88 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	align.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block implements the alignment shifter.   It is responsible for
-//   adjusting the fraction portion of the addend relative to the fraction
-//   produced in the multiplier array.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module align(zman, aligncntE, xzeroE, yzeroE, zzeroE, zdenormE, tE, bsE, 
-             killprodE,  sumshiftE, sumshiftzeroE);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic 		[51:0]		zman;		// Fraction of addend z;
-	input logic 		[12:0]		aligncntE;	// amount to shift
-	input logic				xzeroE;		// Input X = 0
-	input logic                  		yzeroE;          // Input Y = 0 
-	input logic                  		zzeroE;          // Input Z = 0
-	input logic                  		zdenormE;        // Input Z is denormalized
-	output logic    	[163:0]    	tE;              // aligned addend (54 bits left of bpt)
-	output logic          		bsE;           	// sticky bit of addend
-	output logic          		killprodE;    	// Z >> product
-	output logic		[8:0]		sumshiftE;	
-	output logic				sumshiftzeroE;
-
-	// Internal nodes
- 
-	reg       	[215:0]   	shift;				// aligned addend from shifter
-	logic 		[12:0]		tmp;
-	
-
-
-	always_comb 
-		begin
-
-		// Default to clearing sticky bits 
-		bsE = 0;
-
-		// And to using product as primary operand in adder / exponent gen 
-		killprodE = xzeroE | yzeroE;
-		// d = aligncntE
-		// p = 53
-		//***try reducing this hardware to use one shifter
-		if ($signed(aligncntE) <= $signed(-(13'd105))) begin //d<=-2p+1
-			//product anchored case with saturated shift
-			sumshiftE = 163;	// 3p+4	
-			sumshiftzeroE = 0;
-			shift = {1'b1,zman,163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else if($signed(aligncntE) <= $signed(13'd2))  begin // -2p+1<d<=2
-			// product anchored or cancellation
-			tmp = 13'd57-aligncntE;
-			sumshiftE = tmp[8:0]; // p + 2 - d  
-			sumshiftzeroE = 0;
-			shift = {~zdenormE,zman,163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else if ($signed(aligncntE)<=$signed(13'd55))  begin // 2 < d <= p+2
-			// addend anchored case
-			// used to be 56 \/ somthing doesn't seem right too many typos
-			tmp = 13'd57-aligncntE;
-			sumshiftE = tmp[8:0]; 
-			sumshiftzeroE = 0;
-			shift = {~zdenormE,zman, 163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else begin                 	// d >= p+3
-			// addend anchored case with saturated shift
-			sumshiftE = 0;	
-			sumshiftzeroE = 1;		
-			shift = {~zdenormE,zman, 163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-			killprodE = 1;
-
-		end 
-	end
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/booth.sv
+++ b/wally-pipelined/src/fpu/FMA/booth.sv
@ -1,53 +0,0 @@
-module booth(xExt, choose, add1, e, pp); 
-/////////////////////////////////////////////////////////////////////////////
-    
-	input logic 		[53:0]		xExt;				// multiplicand	xExt
-	input logic		[2:0]		choose;				// bits needed to choose which encoding
-	output logic		[1:0]       	add1;				// do you add 1	
-    output logic                  e;
-	output logic		[54:0]		pp;				//	the resultant encoding
-    
-    logic [54:0] temp;
-    logic [53:0] negx;
-    //logic temp;
-
-    assign negx = ~xExt;
-
-    always_comb
-    case (choose)
-        3'b000 : pp = 55'b0;   //  0
-        3'b001 : pp = {1'b0, xExt};  //  1
-        3'b010 : pp = {1'b0, xExt};  //  1
-        3'b011 : pp = {xExt, 1'b0};  //  2
-        3'b100 : pp = {negx, 1'b0};  // -2
-        3'b101 : pp = {1'b1, negx};  // -1
-        3'b110 : pp = {1'b1, negx};  // -1
-        3'b111 : pp = '1;  //  -0
-    endcase
-
-    always_comb
-    case (choose)
-        3'b000 : e = 0;   //  0
-        3'b001 : e = 0;  //  1
-        3'b010 : e = 0;  //  1
-        3'b011 : e = 0;  //  2
-        3'b100 : e = 1;  // -2
-        3'b101 : e = 1;  // -1
-        3'b110 : e = 1;  // -1
-        3'b111 : e = 1;  //  -0
-    endcase
-    // assign add1 = (choose[2] == 1'b1) ? ((choose[1:0] == 2'b11) ? 1'b0 : 1'b1) : 1'b0;
-    // assign add1 = choose[2];
-    always_comb
-    case (choose)
-        3'b000 : add1 = 2'b0;   //  0
-        3'b001 : add1 = 2'b0;  //  1
-        3'b010 : add1 = 2'b0;  //  1
-        3'b011 : add1 = 2'b0;  //  2
-        3'b100 : add1 = 2'b10;  // -2
-        3'b101 : add1 = 2'b1;  // -1
-        3'b110 : add1 = 2'b1;  // -1
-        3'b111 : add1 = 2'b1;  //  -0
-    endcase
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/compressors.sv
+++ b/wally-pipelined/src/fpu/FMA/compressors.sv
@ -1,90 +0,0 @@
-module add3comp2(a, b, c, carry, sum); 
-/////////////////////////////////////////////////////////////////////////////
-//look into different implementations of the compressors?
-    
-    parameter BITS = 4;
-	input logic 		[BITS-1:0]		a;
-	input logic		[BITS-1:0]		b;
-	input logic		[BITS-1:0]    	c;
-    output logic      [BITS-1:0]      carry;
-	output logic		[BITS-1:0]		sum;
-    genvar i;
-
-    generate
-        for(i= 0; i<BITS; i=i+1) begin
-            sng3comp2 add0(a[i], b[i], c[i], carry[i], sum[i]);
-        end
-    endgenerate
-
-endmodule
-
-module add4comp2(a, b, c, d, carry, sum); 
-/////////////////////////////////////////////////////////////////////////////
-    
-    parameter BITS = 4;
-	input logic 		[BITS-1:0]		a;
-	input logic		[BITS-1:0]		b;
-	input logic		[BITS-1:0]    	c;
-	input logic		[BITS-1:0]    	d;
-    output logic      [BITS:0]      carry;
-	output logic		[BITS-1:0]		sum;
-
-    logic       [BITS-1:0]      cout;
-    logic                       carryTmp;
-    genvar i;
-
-
-    sng4comp2 add0(a[0], b[0], c[0], d[0], 1'b0, cout[0], carry[0], sum[0]);
-
-    generate
-        for(i= 1; i<BITS-1; i=i+1) begin
-            sng4comp2 add1(a[i], b[i], c[i], d[i], cout[i-1], cout[i], carry[i], sum[i]);
-        end
-    endgenerate
-
-
-    sng4comp2 add2(a[BITS-1], b[BITS-1], c[BITS-1], d[BITS-1], cout[BITS-2], cout[BITS-1], carryTmp, sum[BITS-1]);
-
-    assign carry[BITS-1] = carryTmp & cout[BITS-1];
-    assign carry[BITS] = carryTmp ^ cout[BITS-1];
-
-endmodule
-
-module sng3comp2(a, b, c, carry, sum); 
-/////////////////////////////////////////////////////////////////////////////
-//look into different implementations of the compressors?
-    
-	input logic 				a;
-	input logic				b;
-	input logic		       	c;
-    output logic              carry;
-	output logic				sum;
-    
-    logic               axorb;
-
-    assign axorb = a ^ b;
-    assign sum = axorb ^ c;
-
-    assign carry = axorb ? c : a;
-
-endmodule
-
-module sng4comp2(a, b, c, d, cin, cout, carry, sum); 
-/////////////////////////////////////////////////////////////////////////////
-//look into pass gate 4:2 counters?
-    
-	input logic 				a;
-	input logic				b;
-	input logic		       	c;
-    input logic               d;
-    input logic               cin;
-    output logic              cout;
-    output logic              carry;
-	output logic				sum;
-    
-    logic               TmpSum;
-
-    sng3comp2 add1(.carry(cout), .sum(TmpSum),.*);
-    sng3comp2 add2(.a(TmpSum), .b(d), .c(cin), .*);
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/expgen.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen.sv
@ -1,140 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	expgen.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-//   Block Description:
-//   This block implements the exponent path of the FMAC. It performs the
-//   following operations:
-//
-//   1) Compute exponent of multiply.  
-//   2) Compare multiply and add exponents to generate alignment shift count
-//   3) Adjust exponent based on normalization
-//   4)  Increment exponent based on postrounding renormalization
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module expgen(xexp, yexp, zexp,
-			   killprod,  sumzero, resultdenorm, normcnt, infinity, 
-			   FmaFlagsM, inf, xzero, yzero,expplus1,
-			   nan, de0, xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, specialsel, zexpsel,
-			   aligncnt, wexp,
-			   prodof, sumof, sumuf, denorm0, ae);
-/////////////////////////////////////////////////////////////////////////////
-  
-	input     	[62:52]    	xexp;           	// Exponent of multiplicand x
-	input     	[62:52]  	yexp;         		// Exponent of multiplicand y
-	input     	[62:52]  	zexp;           	// Exponent of addend z
-	input     			killprod;    	// Z >> product
-	input     			sumzero;     	// sum exactly equals zero 
-	input     			resultdenorm;  // postnormalize rounded result
-	input     	[8:0]  		normcnt;     	// normalization shift count 
-	input     			infinity;    	// generate infinity on overflow 
-	input     	[4:0]	FmaFlagsM;     	// FMA exception flags; bits [4] invalid, [2] overflow, [1] underflow are read here
-	input     			inf;			// Some input is infinity
-	input     			nan;			// Some input is NaN
-	input     	[12:0]		de0;			// Exponent after normalization adjustment (forced to 0 below when sumzero or resultdenorm)
-	input     			xnan;			// X is NaN
-	input     			ynan;			// Y is NaN
-	input     			znan;			// Z is NaN 
-	input     			xdenorm;		// X is denorm
-	input     			ydenorm;		// Y is denorm
-	input     			zdenorm;		// Z is denorm
-	input     			xzero;		// X is zero
-	input     			yzero;		// Y is zero
-	input				expplus1;
-	input     			proddenorm;		// product is denorm
-	input     			specialsel;  	// Select special result
-	input     			zexpsel;  	// Select zexp as the exponent; unused (the be mux below is commented out)
-	output		[12:0]   	aligncnt;       // shift count for alignment shifter
-	output		[62:52]    	wexp;           	// Exponent of result
-	output				prodof;         // X*Y exponent out of bounds 
-	output				sumof;          // X*Y+Z exponent out of bounds 
-	output				sumuf;         // X*Y+Z exponent underflows 
-	output				denorm0;     	// exponent = 0 for denorm 
-	output		[12:0]		ae;				//exponent of multiply
-
-	//   Internal nodes
-
-
-	wire 	[12:0]			aligncnt0;		// Shift count for alignment
-	wire 	[12:0]			aligncnt1;		// Shift count for alignment
-	wire 	[12:0]			be;				// Exponent of multiply
-	wire 	[12:0]			de1;			// Normalized exponent
-	wire 	[12:0]			de;				// Normalized exponent
-	wire 	[10:0]			infinityres;	// Infinity or max number
-	wire 	[10:0]			nanres;          //	Nan propagated or generated
-	wire 	[10:0]			specialres;  //	Exceptional case result
-
-	//   Compute exponent of multiply
-	// Note that the exponent does not have to be incremented on a postrounding
-	//   normalization of X because the mantissa was already increased.   Report
-	//   if exponent is out of bounds 
-
-
-	assign ae = xzero|yzero ? 0 : xexp + yexp -1023;
-
-	assign prodof = (ae > 2046 && ~ae[12]);
-
-	// Compute alignment shift count
-	// Adjust for postrounding normalization of Z.
-	// This should not increas the critical path because the time to
-	// check if a round overflows is shorter than the actual round and
-	// is masked by the bypass mux and two 10 bit adder delays.
-	assign aligncnt0 = - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
-	assign aligncnt1 = - 1 + {12'b0,~xdenorm} + {12'b0,~ydenorm} - {12'b0,~zdenorm};
-	assign aligncnt = zexp -ae - 1 + {12'b0,~xdenorm} + {12'b0,~ydenorm} - {12'b0,~zdenorm};
-	//assign aligncnt = zexp -ae - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
-	//assign aligncnt = zexp - ae;// KEP use all of ae
-
-	// Select exponent (usually from product except in case of huge addend)
-
-	//assign be = zexpsel ? zexp : ae;
-
-	// Adjust exponent based on normalization
-	// A compound adder takes care of the case of post-rounding normalization
-	// requiring an extra increment
-	 
-	//assign de0 = sumzero ? 13'b0 : be + normcnt + 2;
-	// assign de1 = sumzero ? 13'b0 : be + normcnt + 2;
-	 
-	// If the exponent becomes exactly zero (denormalized)
-	// signal such to adjust R bit before rounding
-
-	assign denorm0 = (de0 == 0);
-	
-	// check for exponent out of bounds after add 
-	
-	assign de = resultdenorm | sumzero ? 0 : de0;
-	assign sumof = ~de[12] && de > 2046;
-	assign sumuf = de == 0  && ~sumzero && ~resultdenorm;
-
-	// bypass occurs before rounding or taking early results 
-	
-	//assign wbypass = de0[10:0];
-	
-	// In a non-critical special mux, we combine the early result from other
-	// FPU blocks with the results of exceptional conditions.  Overflow
-	// produces either infinity or the largest finite number, depending on the
-	// rounding mode.  NaNs are propagated or generated.
-
-	assign specialres = FmaFlagsM[4] | nan ? nanres : // invalid
-					FmaFlagsM[2] ? infinityres : 	//overflow
-					inf ? 11'b11111111111 :
-					FmaFlagsM[1] ? 11'b0 : 11'bx; //underflow
-
-	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;
-
-	// IEEE 754-2008 section 6.2.3 states:
-	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
-	// identical to the payload of one of the input NaNs if representable in the destination
-	// format. This standard does not specify which of the input NaNs will provide the payload."
-	assign nanres = xnan ? xexp : (ynan ? yexp : (znan? zexp : 11'b11111111111));
-
-	// A mux selects the early result from other FPU blocks or the 
-	// normalized FMAC result.   Special cases are also detected. 
-	
-	assign wexp = specialsel ? specialres[10:0] : de[10:0] + expplus1; 
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/expgen1.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen1.sv
@ -1,90 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	expgen.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-//   Block Description:
-//   This block implements the exponent path of the FMAC. It performs the
-//   following operations:
-//
-//   1) Compute exponent of multiply.  
-//   2) Compare multiply and add exponents to generate alignment shift count
-//   3) Adjust exponent based on normalization
-//   4)  Increment exponent based on postrounding renormalization
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module expgen1(xexp, yexp, zexp, xzeroE, yzeroE,
-			   xdenormE, ydenormE, zdenormE, 
-			   aligncntE, prodof, aeE);
-/////////////////////////////////////////////////////////////////////////////
-  
-	input logic     	[62:52]    	xexp;           	// Exponent of multiplicand x
-	input logic     	[62:52]  	yexp;         		// Exponent of multiplicand y
-	input logic     	[62:52]  	zexp;           	// Exponent of addend z
-	input logic     			xdenormE;		// X is denorm
-	input logic     			ydenormE;		// Y is denorm
-	input logic     			zdenormE;		// Z is denorm
-	input logic     			xzeroE;		// X is zero
-	input logic     			yzeroE;		// Y is zero
-	output logic		[12:0]   	aligncntE;       // shift count for alignment shifter
-	output logic			prodof;         // X*Y exponent out of bounds 
-	output logic		[12:0]		aeE;				//exponent of multiply
-
-	//   Internal nodes
-
-
-	wire 	[12:0]			aligncnt0;		// Shift count for alignment
-	wire 	[12:0]			aligncnt1;		// Shift count for alignment
-	wire 	[12:0]			be;				// Exponent of multiply
-	wire 	[12:0]			de1;			// Normalized exponent
-	wire 	[12:0]			de;				// Normalized exponent
-	wire 	[10:0]			infinityres;	// Infinity or max number
-	wire 	[10:0]			nanres;          //	Nan propagated or generated
-	wire 	[10:0]			specialres;  //	Exceptional case result
-
-	//   Compute exponent of multiply
-	// Note that the exponent does not have to be incremented on a postrounding
-	//   normalization of X because the mantissa was already increased.   Report
-	//   if exponent is out of bounds 
-
-
-	assign aeE = xzeroE|yzeroE ? 0 : {2'b0,xexp} + {2'b0,yexp} - 13'd1023;
-
-	assign prodof = (aeE > 2046 && ~aeE[12]);
-
-	// Compute alignment shift count
-	// Adjust for postrounding normalization of Z.
-	// This should not increas the critical path because the time to
-	// check if a round overflows is shorter than the actual round and
-	// is masked by the bypass mux and two 10 bit adder delays.
-	// assign aligncnt0 = - 1 + ~xdenormE + ~ydenormE - ~zdenormE;
-	// assign aligncnt1 = - 1 + {12'b0,~xdenormE} + {12'b0,~ydenormE} - {12'b0,~zdenormE};
-	assign aligncntE = {2'b0,zexp} -aeE - 1 + {12'b0,~xdenormE} + {12'b0,~ydenormE} - {12'b0,~zdenormE};
-	//assign aligncntE = zexp -aeE - 1 + ~xdenormE + ~ydenormE - ~zdenormE;
-	//assign aligncntE = zexp - aeE;// KEP use all of aeE
-
-	// Select exponent (usually from product except in case of huge addend)
-
-	//assign be = zexpsel ? zexp : aeE;
-
-	// Adjust exponent based on normalization
-	// A compound adder takes care of the case of post-rounding normalization
-	// requiring an extra increment
-	 
-	//assign de0 = sumzero ? 13'b0 : be + normcnt + 2;
-	// assign de1 = sumzero ? 13'b0 : be + normcnt + 2;
-	 
-
-	// bypass occurs before rounding or taking early results 
-	
-	//assign wbypass = de0[10:0];
-	
-	// In a non-critical special mux, we combine the early result from other
-	// FPU blocks with the results of exceptional conditions.  Overflow
-	// produces either infinity or the largest finite number, depending on the
-	// rounding mode.  NaNs are propagated or generated.
-endmodule
-
-
--- a/wally-pipelined/src/fpu/FMA/expgen2.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen2.sv
@ -1,108 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	expgen.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-//   Block Description:
-//   This block implements the exponent path of the FMAC. It performs the
-//   following operations:
-//
-//   1) Compute exponent of multiply.  
-//   2) Compare multiply and add exponents to generate alignment shift count
-//   3) Adjust exponent based on normalization
-//   4)  Increment exponent based on postrounding renormalization
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module expgen2(xexp, yexp, zexp,
-			   sumzero, resultdenorm, infinity, 
-			   FmaFlagsM, inf, expplus1,
-			   nanM, de0, xnanM, ynanM, znanM,  specialsel,
-			    wexp,
-			   sumof, sumuf);
-/////////////////////////////////////////////////////////////////////////////
-  
-	input logic     	[62:52]    	xexp;           	// Exponent of multiplicand x
-	input logic     	[62:52]  	yexp;         		// Exponent of multiplicand y
-	input logic     	[62:52]  	zexp;           	// Exponent of addend z
-	input logic     			sumzero;     	// sum exactly equals zero 
-	input logic     			resultdenorm;  // postnormalize rounded result
-	input logic     			infinity;    	// generate infinity on overflow 
-	input logic     	[4:0]	FmaFlagsM;     	// FMA exception flags; bits [4] invalid, [2] overflow, [1] underflow are read here
-	input logic     			inf;			// Some input is infinity
-	input logic     			nanM;			// Some input is NaN
-	input logic     	[12:0]		de0;			// Exponent after normalization adjustment (forced to 0 below when sumzero or resultdenorm)
-	input logic     			xnanM;			// X is NaN
-	input logic    			ynanM;			// Y is NaN
-	input logic     			znanM;			// Z is NaN 
-	input logic				expplus1;
-	input logic     			specialsel;  	// Select special result
-	output logic		[62:52]    	wexp;           	// Exponent of result
-	output logic				sumof;          // X*Y+Z exponent out of bounds 
-	output logic				sumuf;         // X*Y+Z exponent underflows 
-
-	//   Internal nodes
-
-
-	wire 	[12:0]			aligncnt0;		// Shift count for alignment
-	wire 	[12:0]			aligncnt1;		// Shift count for alignment
-	wire 	[12:0]			be;				// Exponent of multiply
-	wire 	[12:0]			de1;			// Normalized exponent
-	wire 	[12:0]			de;				// Normalized exponent
-	wire 	[10:0]			infinityres;	// Infinity or max number
-	wire 	[10:0]			nanres;          //	Nan propagated or generated
-	wire 	[10:0]			specialres;  //	Exceptional case result
-
-	//   Compute exponent of multiply
-	// Note that the exponent does not have to be incremented on a postrounding
-	//   normalization of X because the mantissa was already increased.   Report
-	//   if exponent is out of bounds 
-
-	// Select exponent (usually from product except in case of huge addend)
-
-	//assign be = zexpsel ? zexp : ae;
-
-	// Adjust exponent based on normalization
-	// A compound adder takes care of the case of post-rounding normalization
-	// requiring an extra increment
-	 
-	//assign de0 = sumzero ? 13'b0 : be + normcnt + 2;
-	// assign de1 = sumzero ? 13'b0 : be + normcnt + 2;
-	 
-	
-	// check for exponent out of bounds after add 
-	
-	assign de = resultdenorm | sumzero ? 0 : de0;
-	assign sumof = ~de[12] && de > 2046;
-	assign sumuf = de == 0  && ~sumzero && ~resultdenorm;
-
-	// bypass occurs before rounding or taking early results 
-	
-	//assign wbypass = de0[10:0];
-	
-	// In a non-critical special mux, we combine the early result from other
-	// FPU blocks with the results of exceptional conditions.  Overflow
-	// produces either infinity or the largest finite number, depending on the
-	// rounding mode.  NaNs are propagated or generated.
-
-	assign specialres = FmaFlagsM[4] | nanM ? nanres : // invalid
-					FmaFlagsM[2] ? infinityres : 	//overflow
-					inf ? 11'b11111111111 :
-					FmaFlagsM[1] ? 11'b0 : 11'bx; //underflow
-
-	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;
-
-	// IEEE 754-2008 section 6.2.3 states:
-	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
-	// identical to the payload of one of the input NaNs if representable in the destination
-	// format. This standard does not specify which of the input NaNs will provide the payload."
-	assign nanres = xnanM ? xexp : (ynanM ? yexp : (znanM? zexp : 11'b11111111111));
-
-	// A mux selects the early result from other FPU blocks or the 
-	// normalized FMAC result.   Special cases are also detected. 
-	
-	assign wexp = specialsel ? specialres[10:0] : de[10:0] + {10'b0,expplus1}; 
-endmodule
-
-
--- a/wally-pipelined/src/fpu/FMA/flag.sv
+++ b/wally-pipelined/src/fpu/FMA/flag.sv
@ -1,88 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	flag.v
-// Author:		David Harris
-// Date:		12/6/1995
-//
-// Block Description:
-//       This block generates the flags: invalid, overflow, underflow, inexact. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
-			 psign,  zsign, xzero, yzero, zzero, vbits, killprod,
-			 inf, nan, FmaFlagsM,sticky);
-/////////////////////////////////////////////////////////////////////////////
-
-	input                  		xnan;        	// X is NaN 
-	input                  		ynan;        	// Y is NaN 
-	input                 		znan;       	// Z is NaN 
-	input                  		sticky;        	// Sticky bit; contributes to the inexact flag
-	input                  		xinf;        	// X is Inf
-	input                 		yinf;       	// Y is Inf 
-	input                  		zinf;        	// Z is Inf
-	input                  		prodof;         // X*Y overflows exponent
-	input                  		sumof;          // X*Y + z overflows exponent
-	input                  		sumuf;          // X*Y + z underflows exponent
-	input				psign; 		// Sign of product
-	input				zsign; 		// Sign of z
-	input				xzero;		// x = 0
-	input				yzero;		// y = 0
-	input				zzero;		// z = 0
-	input				killprod;
-	input     	[1:0]  		vbits;		// R and S bits of result
-	output				inf;		// Some	source is Inf
-	output				nan;		// Some	source is NaN
-	output		[4:0]	FmaFlagsM;
- 
-	//   Internal nodes
-
-	wire				prodinf;	// X*Y larger than max possible
-	wire				suminf;		// X*Y+Z larger than max possible
-
-	// If any input is NaN, propagate the NaN 
-
-	assign nan = xnan || ynan || znan;
-
-	// Same with infinity (inf - inf and 0 * inf don't propagate inf
-	//  but it's ok because illegal op takes higher precedence)
-
-	assign inf= xinf || yinf || zinf || suminf;//KEP added suminf 
-	//assign inf= xinf || yinf || zinf;//original
-
-	// Generate infinity checks
-
-	assign prodinf = prodof && ~xnan && ~ynan;
-	//KEP added if the product is infinity then sum is infinity
-	assign suminf = sumof && ~xnan && ~ynan && ~znan;
-
-	// Set invalid flag for following cases:
-	//   1) Inf - Inf
-	//   2) 0 * Inf
-	//   3) Output = NaN (this is not part of the IEEE spec,  only 486 proj)
-
-	assign FmaFlagsM[4] = (xinf || yinf || prodinf) && zinf && (psign ^ zsign) ||
-					   xzero && yinf || yzero && xinf;// KEP remove case 3) above
-
-	assign FmaFlagsM[3] = 0; // divide by zero flag
-
-
-	// Set the overflow flag for the following cases:
-	//   1) Rounded multiply result would be out of bounds
-	//   2) Rounded add result would be out of bounds
-
-	assign FmaFlagsM[2] = suminf && ~inf;
-
-	// Set the underflow  flag for the following cases:
-	//   1) Any input is denormalized
-	//   2)  Output would be denormalized or smaller
-
-	assign FmaFlagsM[1] = (sumuf && ~inf && ~prodinf && ~nan) || (killprod & zzero & ~(yzero | xzero));
-
-	// Set the inexact flag for the following cases:
-	//   1) Multiplication inexact
-	//   2) Addition  inexact
-	// One of these cases occurred if the R or S bit is set
-
-	assign FmaFlagsM[0] = (vbits[0] || vbits[1] ||sticky  || suminf) && ~(inf || nan);
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/flag1.sv
+++ b/wally-pipelined/src/fpu/FMA/flag1.sv
@ -1,34 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	flag.v
-// Author:		David Harris
-// Date:		12/6/1995
-//
-// Block Description:
-//       This block generates the flags: invalid, overflow, underflow, inexact. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module flag1(xnanE, ynanE, znanE, prodof, prodinfE, nanE);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic                  		xnanE;        	// X is NaN 
-	input logic                  		ynanE;        	// Y is NaN 
-	input logic                 		znanE;       	// Z is NaN
-	input logic                  		prodof;         // X*Y overflows exponent
-	output logic				nanE;		// Some	source is NaN
- 
-	//   Internal nodes
-
-	output logic				prodinfE;	// X*Y larger than max possible
-
-	// If any input logic is NaN, propagate the NaN 
-
-	assign nanE = xnanE || ynanE || znanE;
-
-
-	// Generate infinity checks
-
-	assign prodinfE = prodof && ~xnanE && ~ynanE;
-
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/flag2.sv
+++ b/wally-pipelined/src/fpu/FMA/flag2.sv
@ -1,80 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	flag.v
-// Author:		David Harris
-// Date:		12/6/1995
-//
-// Block Description:
-//       This block generates the flags: invalid, overflow, underflow, inexact. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module flag2(xsign,ysign,zsign, xnanM, ynanM, znanM, xinfM, yinfM, zinfM, sumof, sumuf,
-			 xzeroM, yzeroM, zzeroM, vbits, killprodM,
-			 inf, nanM, FmaFlagsM,sticky,prodinfM);
-/////////////////////////////////////////////////////////////////////////////
-	// Generates FmaFlagsM = {invalid, divide by zero, overflow, underflow, inexact} and the inf select
-	input logic                  		xnanM;        	// X is NaN 
-	input logic                  		ynanM;        	// Y is NaN 
-	input logic                 		znanM;       	// Z is NaN 
-	input logic				xsign; 		// Sign of x
-	input logic				ysign; 		// Sign of y
-	input logic				zsign; 		// Sign of z
-	input logic                  		sticky;        	// sticky bit, ORed into the inexact flag
-    input logic                       prodinfM;	// X*Y larger than max possible
-	input logic                  		xinfM;        	// X is Inf
-	input logic                 		yinfM;       	// Y is Inf 
-	input logic                  		zinfM;        	// Z is Inf
-	input logic                  		sumof;          // X*Y + z overflows exponent
-	input logic                  		sumuf;          // X*Y + z underflows exponent
-	input logic				xzeroM;		// x = 0
-	input logic				yzeroM;		// y = 0
-	input logic				zzeroM;		// z = 0
-	input logic				killprodM;	// product killed (addend >> product)
-	input logic     	[1:0]  		vbits;		// R and S bits of result
-	output logic				inf;		// Some	source is Inf, or the sum overflowed
-	input logic				nanM;		// Some	source is NaN
-	output logic		[4:0]	FmaFlagsM;	// [4] invalid, [3] divide by zero, [2] overflow, [1] underflow, [0] inexact
- 
-	//   Internal nodes
-
-logic suminf;		// sum overflowed and no source is NaN
-logic inputinf;		// some source operand is Inf
-
-	// Same with infinity (inf - inf and 0 * inf don't propagate inf
-	//  but it's ok because illegal op takes higher precedence)
-	// inputinf is kept apart from inf so the flags below can still see a fresh overflow
-	assign inputinf = xinfM || yinfM || zinfM;
-	assign inf= inputinf || suminf;//KEP added suminf 
-
-	assign suminf = sumof && ~xnanM && ~ynanM && ~znanM;
-
-
-	// Set the overflow flag for the following cases:
-	//   1) Rounded multiply result would be out of bounds
-	//   2) Rounded add result would be out of bounds
-	// Masked with inputinf, not inf: inf contains suminf, which made this flag constant 0
-	assign FmaFlagsM[2] = suminf && ~inputinf;
-
-	// Set the underflow  flag for the following cases:
-	//   1) Any input is denormalized
-	//   2) Output would be denormalized or smaller
-
-	assign FmaFlagsM[1] = (sumuf && ~inf && ~prodinfM && ~nanM) || (killprodM & zzeroM & ~(yzeroM | xzeroM));
-
-	// Set the inexact flag for the following cases:
-	//   1) Multiplication inexact
-	//   2) Addition  inexact
-	// One of these cases occurred if the R or S bit is set
-	// An overflow (suminf) is inexact too, so only infinite or NaN sources mask the flag
-	assign FmaFlagsM[0] = (vbits[0] || vbits[1] ||sticky  || suminf) && ~(inputinf || nanM);
-
-	// Set invalid flag for following cases:
-	//   1) Inf - Inf
-	//   2) 0 * Inf
-	//   (a third case, output = NaN, was part of the 486 project only and is not implemented)
-
-	assign FmaFlagsM[4] = (xinfM || yinfM || prodinfM) && zinfM && (xsign ^ ysign ^ zsign) ||
-					   xzeroM && yinfM || yzeroM && xinfM;// KEP remove case 3) above
-
-	assign FmaFlagsM[3] = 0; // divide by zero flag
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/fma.sv
+++ b/wally-pipelined/src/fpu/FMA/fma.sv
@ -1,132 +0,0 @@
- ////////////////////////////////////////////////////////////////////////////////
-// Block Name:	fmac.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This is the top level block of a floating-point  multiply/accumulate
-//   unit(FMAC).   It instantiates the following sub-blocks:
-//
-//    array     Booth encoding, partial product generation, product summation
-//    expgen    Exponent summation, compare, and adjust
-//    align     Alignment shifter
-//    add       Carry-save adder for accumulate, carry propagate adder
-//    lza       Leading zero anticipator to control normalization shifter
-//    normalize Normalization shifter
-//    round     Rounding of result
-//    exception Handles exceptional cases
-//    bypass    Handles bypass of result to ReadData1E or ReadData3E inputs
-//    sign      One bit sign handling block 
-//    special   Catch special cases (inputs = 0  / infinity /  etc.) 
-//
-//   The FMAC computes FmaResultM=ReadData1E*ReadData2E+ReadData3E, rounded with the mode specified by
-//   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the ReadData1E or ReadData3E inputs for use on the next cycle.  In addition,  four signals
-//   are produced: trap, overflow, underflow, and inexact.  Trap indicates
-//   an infinity, NaN, or denormalized number to be handled in software;
-//   the other three signals are IEEE flags.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module fma(ReadData1E, ReadData2E, ReadData3E, FrmE,
-			FmaResultM, FmaFlagsM, aligncnt);
-/////////////////////////////////////////////////////////////////////////////
-	// Structural top level: slices the operands into sign/exponent/fraction and wires the sub-blocks together
-	input 		[63:0]		ReadData1E;		// input 1
-	input		[63:0]		ReadData2E;     // input 2 
-	input 		[63:0]		ReadData3E;     // input 3
-	input 		[2:0]	 	FrmE;          	// Rounding mode
-	output 		[63:0]		FmaResultM;     // output FmaResultM=ReadData1E*ReadData2E+ReadData3E
-	output 		[4:0]		FmaFlagsM;    	// status flags
-	output 		[12:0]		aligncnt;    	// shift count for alignment (exposed as an output)
-
-// Internal nodes
- 
-	logic 		[105:0]		r; 				// one result of partial product sum
-	logic 		[105:0]		s; 				// other result of partial products
-	logic 		[163:0]		t;				// output of alignment shifter
-	logic 		[163:0]		sum;			// output of carry prop adder
-	logic 		[53:0]		v; 				// normalized sum, R, S bits
-//	logic 		[12:0]		aligncnt; 		// shift count for alignment (now declared as an output above)
-	logic 		[8:0]		normcnt; 		// shift count for normalizer
-	logic 		[12:0]		ae; 		// multiplier exponent
-	logic 					bs;				// sticky bit of addend
-	logic 					ps;				// sticky bit of product
-	logic 					killprod; 		// ReadData3E >> product
-	logic 					negsum; 		// negate sum
-	logic 					invz; 			// invert addend
-	logic 					selsum1; 		// select +1 mode of sum
-	logic 					negsum0; 		// sum +0 < 0
-	logic 					negsum1; 		// sum +1 < 0
-	logic 					sumzero; 		// sum = 0
-	logic 					infinity; 		// generate infinity on overflow
-	logic 					prodof; 		// ReadData1E*ReadData2E out of range
-	logic 					sumof;			// result out of range
-	logic					xzero;
-	logic					yzero;
-	logic					zzero;
-	logic					xdenorm;
-	logic					ydenorm;
-	logic					zdenorm;
-	logic					proddenorm;
-	logic					zexpsel;
-	logic					denorm0;
-	logic					resultdenorm;
-	logic					inf;
-	logic					xinf;
-	logic					yinf;
-	logic					zinf;
-	logic					xnan;
-	logic					ynan;
-	logic					znan;
-	logic					specialsel;
-	logic					expplus1;
-	logic					nan;
-	logic					sumuf;
-	logic					psign;
-	logic					sticky;
-	logic			[8:0]		sumshift;
-	logic					sumshiftzero;
-	logic			[12:0]		de0;
-	logic					isAdd;		// tied to 1 below
-
-	assign isAdd = 1;
-
-	// isAdd is not referenced by name anywhere else in this module;
-	// presumably a sub-block picks it up through .* - TODO confirm.
-
-	// Every instance below ends its port list with .* so any port not
-	// connected explicitly binds to the local signal of the same name.
-	// The names of the internal nodes above therefore have to match the
-	// sub-block port names exactly.
-
-	// NOTE(review): the multiply and normalize modules that sit next to
-	// this file use E/M-suffixed port names (rE, sE, xdenormE, aeM, ...)
-	// which do not match the unsuffixed locals declared here - confirm
-	// which versions of the sub-blocks this top level is meant to bind to.
-
-//   Instantiate fraction datapath
-
-	multiply		multiply(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]), .*);
-	align			align(.zman(ReadData3E[51:0]),.*);
-	add				add(.*);
-	lza				lza(.*);
-	normalize		normalize(.zexp(ReadData3E[62:52]),.*); 
-	round			round(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]),.zman(ReadData3E[51:0]), .wman(FmaResultM[51:0]),.wsign(FmaResultM[63]),.*);
-
-// Instantiate exponent datapath
-
-	expgen			expgen(.xexp(ReadData1E[62:52]),.yexp(ReadData2E[62:52]),.zexp(ReadData3E[62:52]),.wexp(FmaResultM[62:52]),.*);
-// Instantiate special case detection across datapath & exponent path 
-
-	special			special(.*);
-
-
-// Instantiate control logic (sign of the result and the status flags)
- 
-sign				sign(.xsign(ReadData1E[63]),.ysign(ReadData2E[63]),.zsign(ReadData3E[63]),.wsign(FmaResultM[63]),.*); 
-flag				flag(.zsign(ReadData3E[63]),.vbits(v[1:0]),.*); 
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/fma1.sv
+++ b/wally-pipelined/src/fpu/FMA/fma1.sv
@ -1,165 +0,0 @@
-module fma1(
- 
-	input logic 	[63:0]		FInput1E,	// X
-	input logic		[63:0]		FInput2E,	// Y
-	input logic 	[63:0]		FInput3E,	// Z
-	input logic 	[2:0]		FOpCtrlE,	// 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-	input logic 				FmtE,		// precision 1 = double 0 = single
-	output logic 	[105:0]		ProdManE,	// 1.X frac * 1.Y frac
-	output logic 	[161:0]		AlignedAddendE,	// Z aligned for addition
-	output logic 	[12:0]		ProdExpE,		// X exponent + Y exponent - bias
-	output logic 				AddendStickyE,	// sticky bit that is calculated during alignment
-	output logic 				KillProdE,		// set the product to zero before addition if the product is too small to matter
-	output logic				XZeroE, YZeroE, ZZeroE, // inputs are zero
-	output logic				XInfE, YInfE, ZInfE,	// inputs are infinity
-	output logic				XNaNE, YNaNE, ZNaNE);	// inputs are NaN
-	// First FMA stage: unpacks and classifies X, Y, Z, multiplies the mantissas and aligns the addend to the product
-	logic [51:0] 	XFrac,YFrac,ZFrac;	// input fraction
-	logic [52:0] 	XMan,YMan,ZMan;		// input mantissa (with leading one)
-	logic [12:0] 	XExp,YExp,ZExp;		// input exponents
-	logic 		 	XSgn,YSgn,ZSgn;		// input signs
-	logic [12:0]	AlignCnt;			// how far to shift the addend to align with the product
-	logic [211:0] 	Shift;				// output of the alignment shifter including sticky bit
-	logic			XDenormE, YDenormE, ZDenormE;	// inputs are denormal
-	logic [63:0]	FInput3E2;	// value to add (Z or zero)
-	logic [12:0]	Bias;	// 1023 for double, 127 for single
-	logic 			XExpZero, YExpZero, ZExpZero; 	// input exponent zero
-	logic 			XFracZero, YFracZero, ZFracZero; // input fraction zero
-	logic 			XExpMax, YExpMax, ZExpMax; 	// input exponent all 1s
-
-	// Set addend to zero if FMUL instruction
-  	assign FInput3E2 = FOpCtrlE[2] ? 64'b0 : FInput3E;
-
-	// split inputs into the sign bit, fraction, and exponent and handle single or double precision
-	// 		- single precision is in the top half of the inputs
-	assign XSgn = FInput1E[63];
-	assign YSgn = FInput2E[63];
-	assign ZSgn = FInput3E2[63];
-
-	assign XExp = FmtE ? {2'b0, FInput1E[62:52]} : {5'b0, FInput1E[62:55]};
-	assign YExp = FmtE ? {2'b0, FInput2E[62:52]} : {5'b0, FInput2E[62:55]};
-	assign ZExp = FmtE ? {2'b0, FInput3E2[62:52]} : {5'b0, FInput3E2[62:55]};
-
-	assign XFrac = FmtE ? FInput1E[51:0] : {FInput1E[54:32], 29'b0};
-	assign YFrac = FmtE ? FInput2E[51:0] : {FInput2E[54:32], 29'b0};
-	assign ZFrac = FmtE ? FInput3E2[51:0] : {FInput3E2[54:32], 29'b0};
-	
-	assign XMan = {~XExpZero, XFrac};
-	assign YMan = {~YExpZero, YFrac};
-	assign ZMan = {~ZExpZero, ZFrac};
-
-	assign Bias = FmtE ? 13'h3ff : 13'h7f;
-
-
-
-	// determine if an input is a special value
-	assign XExpZero = ~|XExp;
-	assign YExpZero = ~|YExp;
-	assign ZExpZero = ~|ZExp;
-	
-	assign XFracZero = ~|XFrac;
-	assign YFracZero = ~|YFrac;
-	assign ZFracZero = ~|ZFrac;
-
-	assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
-	assign YExpMax = FmtE ? &YExp[10:0] : &YExp[7:0];
-	assign ZExpMax = FmtE ? &ZExp[10:0] : &ZExp[7:0];
-	
-	assign XNaNE = XExpMax & ~XFracZero;
-	assign YNaNE = YExpMax & ~YFracZero;
-	assign ZNaNE = ZExpMax & ~ZFracZero;
-
-	assign XDenormE = XExpZero & ~XFracZero; 
-	assign YDenormE = YExpZero & ~YFracZero; 
-	assign ZDenormE = ZExpZero & ~ZFracZero; 
-
-	assign XInfE = XExpMax & XFracZero; 
-	assign YInfE = YExpMax & YFracZero; 
-	assign ZInfE = ZExpMax & ZFracZero; 
-
-	assign XZeroE = XExpZero & XFracZero;
-	assign YZeroE = YExpZero & YFracZero;
-	assign ZZeroE = ZExpZero & ZFracZero;
-
-
-
-
-	// Calculate the product's exponent
-	//		- When multiplying two fp numbers, add the exponents
-	// 		- Subtract the bias (XExp + YExp has two biases, one from each exponent)
-	//		- Denormal numbers have an exponent value of 1, however they are 
-	//		  represented with an exponent of 0. add one if there is a denormal number
-	assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 : 
-				 XExp + YExp - Bias + XDenormE + YDenormE;
-
-	// Calculate the product's mantissa
-	//		- Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
-	assign ProdManE =  XMan * YMan;
-
-
-
-	// determine the shift count for alignment
-	//		- negative means Z is larger, so shift Z left
-	//		- positive means the product is larger, so shift Z right
-	//		- Denormal numbers have an exponent value of 1, however they are 
-	//		  represented with an exponent of 0. add one to the exponent if it is a denormal number
-	assign AlignCnt = ProdExpE - ZExp - ZDenormE;
-
-	// Alignment shifter
-
-	// Default Addition without shifting
-	// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-	//						 |1'b0| addend |
-
-	// the 1'b0 before the addend is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
-	
-	always_comb 
-		begin
-			
-		// Set default values
-		AddendStickyE = 0;
-		KillProdE = 0;
-		Shift = 212'b0;	// only two branches below drive the shifter; the default keeps always_comb latch-free
-		// If the product is too small to affect the sum, kill the product
-
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//	| addend |
-		if ($signed(AlignCnt) <= $signed(-13'd56)) begin
-			KillProdE = 1;
-			AlignedAddendE = {107'b0, ZMan,2'b0};
-			AddendStickyE = ~(XZeroE|YZeroE);
-
-		// If the Addend is shifted left (negative AlignCnt)
-
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//					| addend |
-		end else if($signed(AlignCnt) <= $signed(13'd0))  begin
-			Shift = {55'b0, ZMan, 104'b0} << -AlignCnt;
-			AlignedAddendE = Shift[211:50];
-			AddendStickyE = |(Shift[49:0]);
-
-		// If the Addend is shifted right (positive AlignCnt)
-
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//									| addend |
-		end else if ($signed(AlignCnt)<=$signed(13'd105))  begin
-			Shift = {55'b0, ZMan, 104'b0} >> AlignCnt;
-			AlignedAddendE = Shift[211:50];
-			AddendStickyE = |(Shift[49:0]);
-
-		// If the addend is too small to affect the addition		
-		//		- The addend has to shift two past the end of the product to be considered too small
-		//		- The 2 extra bits are needed for rounding
-
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//														| addend |
-		end else begin
-			AlignedAddendE = 162'b0;
-			AddendStickyE = ~ZZeroE;
-
-
-		end 
-	end
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/fma2.sv
+++ b/wally-pipelined/src/fpu/FMA/fma2.sv
@ -1,282 +0,0 @@
-module fma2(
- 
-	input logic 	[63:0]		FInput1M,
-	input logic		[63:0]		FInput2M,
-	input logic 	[63:0]		FInput3M,
-	input logic 	[2:0] 		FrmM,
-	input logic 	[105:0]		ProdManM,
-	input logic 	[161:0]		AlignedAddendM,	
-	input logic 	[12:0]		ProdExpM,
-	input logic 				FmtM,
-	input logic 				AddendStickyM,
-	input logic 				KillProdM,
-	input logic 	[2:0]		FOpCtrlM,
-	input logic					XZeroM, YZeroM, ZZeroM,
-	input logic					XInfM, YInfM, ZInfM,
-	input logic					XNaNM, YNaNM, ZNaNM,
-	output logic	[63:0]		FmaResultM,
-	output logic 	[4:0]		FmaFlagsM);
-	// Second FMA stage: adds the aligned addend to the product, normalizes and
-	// rounds the sum, then selects the final result and the status flags.
-	// FmtM selects double (1) or single (0); single values sit in the top half of the words.
-	logic [51:0] 	XMan, YMan, ZMan, WMan;
-	logic [10:0] 	XExp, YExp, ZExp, WExp;
-	logic 		 	XSgn, YSgn, ZSgn, WSgn, PSgn;
-	logic [105:0]	ProdMan2;
-	logic [162:0]	AlignedAddend2;
- 	logic [161:0]	Sum;
-	logic [162:0]	SumTmp;
-	logic [12:0]	SumExp;
-	logic [12:0]	SumExpMinus1;
-	logic [12:0]	SumExpTmp, SumExpTmpMinus1, WExpTmp;
-	logic [53:0]	NormSum;
-	logic [161:0]	NormSumTmp;
-	logic [8:0]		NormCnt;
-	logic 			NormSumSticky;
-	logic 			SumZero;
-	logic 			NegSum;
-	logic 			InvZ;
-	logic			ResultDenorm;
-	logic			Sticky;
-	logic 			Plus1, Minus1, Plus1Tmp, Minus1Tmp;
-	logic 			Invalid,Underflow,Overflow,Inexact;
-	logic [8:0]		DenormShift;
-	logic 			ProdInf, ProdOf, ProdUf;
-	logic [63:0]	FmaResultTmp;
-	logic 			SubBySmallNum;
-	logic [63:0]	FInput3M2;
-	logic			ZeroSgn, ResultSgn;
-
-	// Set addend to zero if FMUL instruction
-  	assign FInput3M2 = FOpCtrlM[2] ? 64'b0 : FInput3M;
-
-	// split inputs into the sign bit, mantissa, and exponent for readability
-	
-	assign XSgn = FInput1M[63];
-	assign YSgn = FInput2M[63];
-	assign ZSgn = FInput3M2[63]^FOpCtrlM[0]; //Negate Z if subtraction
-
-	assign XExp = FmtM ? FInput1M[62:52] : {3'b0, FInput1M[62:55]};
-	assign YExp = FmtM ? FInput2M[62:52] : {3'b0, FInput2M[62:55]};
-	assign ZExp = FmtM ? FInput3M2[62:52] : {3'b0, FInput3M2[62:55]};
-
-	assign XMan = FmtM ? FInput1M[51:0] : {FInput1M[54:32], 29'b0};
-	assign YMan = FmtM ? FInput2M[51:0] : {FInput2M[54:32], 29'b0};
-	assign ZMan = FmtM ? FInput3M2[51:0] : {FInput3M2[54:32], 29'b0};
-
-
-
-	// Calculate the product's sign
-	//		Negate product's sign if FNMADD or FNMSUB
-	assign PSgn = XSgn ^ YSgn ^ FOpCtrlM[1];
-
-
-
-
-	// Addition
-	
-	// Negate Z  when doing one of the following operations:
-	//		-prod +  Z
-	//		 prod -  Z 
-	assign InvZ = ZSgn ^ PSgn;
-
-	// Choose an inverted or non-inverted addend - the one is added later
-	assign AlignedAddend2 = InvZ ? ~{1'b0,AlignedAddendM} : {1'b0,AlignedAddendM};
-	// Kill the product if the product is too small to affect the addition (determined in fma1.sv)
-	assign ProdMan2 = KillProdM ? 106'b0 : ProdManM;
-
-	// Do the addition
-	// 		- add one to negate if the addend was inverted
-	//		- the 2 extra bits at the beginning and end are needed for rounding
-	assign SumTmp = AlignedAddend2 + {55'b0, ProdMan2,2'b0} + {162'b0, InvZ};
-	 
-	// Is the sum negative
-	assign NegSum = SumTmp[162];
-	// If the sum is negative, negate the sum.
-	assign Sum = NegSum ? -SumTmp[161:0] : SumTmp[161:0];
-
-
-
-
-
-
-	// Leading one detector
-	logic [8:0]	i;
-	always_comb begin
-			i = 0;
-			while (~Sum[161-i] && $unsigned(i) <= $unsigned(9'd161)) i = i+1;  // search for leading one 
-			NormCnt = i+1;    // compute shift count
-	end
-
-
-
-
-
-
-
-
-
-
-
-	// Normalization
-
-
-	// Determine if the sum is zero
-	assign SumZero = ~(|Sum);
-
-	logic [12:0] ManLen;
-	assign ManLen = FmtM ? 13'd52 : 13'd23;
-	// Determine if the result is denormal
-	assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-ManLen));
-
-	// Determine the shift needed for denormal results
-	assign SumExpTmpMinus1 = SumExpTmp-1;
-	assign DenormShift = ResultDenorm ? SumExpTmpMinus1[8:0] : 9'b0;
-
-	// Normalize the sum
-	assign NormSumTmp = SumZero ? 162'b0 : Sum << NormCnt+DenormShift; 
-	assign NormSum = NormSumTmp[161:108];
-	// Calculate the sticky bit
-	assign NormSumSticky = FmtM ? (|NormSumTmp[107:0]) : (|NormSumTmp[136:0]);
-	assign Sticky = AddendStickyM | NormSumSticky;
-
-	// Determine sum's exponent
-	assign SumExpTmp = KillProdM ? {2'b0, ZExp} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
-	assign SumExp = SumZero ? 13'b0 : 
-				 ResultDenorm ? 13'b0 :
-				 SumExpTmp; 
-
-
-
-
-
-	// Rounding
-
-	// round to nearest even
-	//		{Gaurd, Round, Sticky}
-	//		0xx - do nothing
-	//		100 - tie - Plus1 if NormSum[2] = 1
-	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
-	//		101/110/111 - Plus1
-
-	// 	round to zero - do nothing
-	//			- subtract 1 if a small number was supposed to be subtracted from the positive result
-
-	// 	round to -infinity - Plus1 if negative
-	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
-	//			- subtract 1 if a small number was supposed to be subtracted from the positive result
-
-	// 	round to infinity - Plus1 if positive
-
-	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
-	//			- subtract 1 if a small number was supposed to be subtracted from the negative result
-
-	//  round to nearest max magnitude
-	//		{Gaurd, Round, Sticky}
-	//		0xx - do nothing
-	//		100 - tie - Plus1
-	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
-	//		101/110/111 - Plus1
-
-	// Determine if the result was supposed to be subtracted by a small number
-	logic Gaurd, Round;
-	assign Gaurd = FmtM ? NormSum[1] : NormSum[30];
-	assign Round = FmtM ? NormSum[0] : NormSum[29];
-	assign SubBySmallNum = AddendStickyM&InvZ&~NormSumSticky;
-
-	always_comb begin
-		// Determine if you add 1
-		case (FrmM)
-			3'b000: Plus1Tmp = Gaurd & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky&NormSum[2]));//round to nearest even
-			3'b001: Plus1Tmp = 0;//round to zero
-			3'b010: Plus1Tmp = WSgn & ~(SubBySmallNum);//round down
-			3'b011: Plus1Tmp = ~WSgn & ~(SubBySmallNum);//round up
-			3'b100: Plus1Tmp = (Gaurd & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky)));//round to nearest max magnitude
-			default: Plus1Tmp = 1'bx;
-		endcase
-		// Determine if you subtract 1
-		case (FrmM)
-			3'b000: Minus1Tmp = 0;//round to nearest even
-			3'b001: Minus1Tmp = SubBySmallNum;//round to zero
-			3'b010: Minus1Tmp = ~WSgn & SubBySmallNum;//round down
-			3'b011: Minus1Tmp = WSgn & SubBySmallNum;//round up
-			3'b100: Minus1Tmp = 0;//round to nearest max magnitude
-			default: Minus1Tmp = 1'bx;
-		endcase
-	
-	end
-
-	// If an answer is exact don't round
-    assign Plus1 = Sticky | (Gaurd|Round) ? Plus1Tmp : 1'b0;
-    assign Minus1 = Sticky | (Gaurd|Round) ? Minus1Tmp : 1'b0;
-	// Compute rounded result 
-    assign {WExpTmp, WMan} = FmtM ? {SumExp, NormSum[53:2]} - {64'b0, Minus1} + {64'b0, Plus1} : {{SumExp, NormSum[53:31]} - {35'b0, Minus1} + {35'b0, Plus1}, 28'b0};
-    assign WExp = WExpTmp[10:0];
-
-
-
-
-
-
-
-	// Sign calculation
-
-
-	// Determine the sign if the sum is zero
-	//	if product underflows then use psign
-	//	otherwise
-	//		if cancellation then 0 unless round to -inf
-	//		otherwise psign
-	assign ZeroSgn = Underflow & ~ResultDenorm ? PSgn :
-				  (PSgn^ZSgn ? FrmM == 3'b010 : PSgn);
-
-	// is the result negative
-	// 	if p - z is the Sum negative
-	// 	if -p + z is the Sum positive
-	// 	if -p - z then the Sum is negative
-	assign ResultSgn = InvZ&(ZSgn)&NegSum | InvZ&PSgn&~NegSum | ((ZSgn)&PSgn);
-	assign WSgn = SumZero ? ZeroSgn : ResultSgn;
- 
-	// Select the result (single-precision specials use an all-ones 8-bit exponent, 8'hff)
-	assign FmaResultM = XNaNM ? (FmtM ? {XSgn, FInput1M[62:52], 1'b1,FInput1M[50:0]} : {XSgn, FInput1M[62:55], 1'b1,FInput1M[53:0]}) : 
-						YNaNM ? (FmtM ? {YSgn, FInput2M[62:52], 1'b1,FInput2M[50:0]} : {YSgn, FInput2M[62:55], 1'b1,FInput2M[53:0]}) : 
-						ZNaNM ? (FmtM ? {ZSgn, FInput3M2[62:52], 1'b1,FInput3M2[50:0]} : {ZSgn, FInput3M2[62:55], 1'b1,FInput3M2[53:0]}) :
-						Invalid ? (FmtM ? {WSgn, 11'h7ff, 1'b1, 51'b0} : {WSgn, 8'hff, 1'b1, 54'b0}) : // has to be before inf
-						XInfM ? {PSgn, FInput1M[62:0]} :
-						YInfM ? {PSgn, FInput2M[62:0]} :
-						ZInfM ? {ZSgn, FInput3M2[62:0]} :
-						Overflow ? (FmtM ? {WSgn, 11'h7ff, 52'b0} : {WSgn, 8'hff, 55'b0}) :
-						Underflow & ~ResultDenorm ? (FmtM ? {WSgn, 63'b0} - {63'b0, (Minus1&AddendStickyM)} + {63'b0, (Plus1&AddendStickyM)} : {{WSgn, 31'b0} - {31'b0, (Minus1&AddendStickyM)} + {31'b0, (Plus1&AddendStickyM)}, 32'b0}) : //***do you need minus1?
-						KillProdM ? (FmtM ? FInput3M2 - {63'b0, (Minus1&AddendStickyM)} + {63'b0, (Plus1&AddendStickyM)} : {FInput3M2[63:32] - {31'b0, (Minus1&AddendStickyM)} + {31'b0, (Plus1&AddendStickyM)}, 32'b0}) : // has to be after Underflow
-						FmtM ? {WSgn,WExp,WMan} : {WSgn,WExp[6:0],WMan,4'b0};
-	logic 			SigNaN;	// some source is a signaling NaN (quiet bit clear)
-	// declared explicitly so the assign below does not rely on an implicit net
-
-	// Set Invalid flag for following cases:
-	//   1) Inf - Inf
-	//   2) 0 * Inf
-	//   3) any input is a signaling NaN
-	logic [12:0] MaxExp;
-	assign MaxExp = FmtM ? 13'd2047 : 13'd255;
-	assign ProdOf = (ProdExpM >= MaxExp && ~ProdExpM[12]);
-	assign ProdInf = ProdOf && ~XNaNM && ~YNaNM;
-	assign SigNaN = FmtM ? (XNaNM&~FInput1M[51]) | (YNaNM&~FInput2M[51]) | (ZNaNM&~FInput3M2[51]) : (XNaNM&~FInput1M[54]) | (YNaNM&~FInput2M[54]) | (ZNaNM&~FInput3M2[54]);
-	assign Invalid = SigNaN | ((XInfM || YInfM || ProdInf) & ZInfM & (XSgn ^ YSgn ^ ZSgn)) | (XZeroM & YInfM) | (YZeroM & XInfM);  
-	// NOTE(review): in the single path the 64-bit rounded value is zero-extended into the 65-bit {WExpTmp, WMan}, so WExpTmp looks shifted right by one - confirm the compare below
-	// Set Overflow flag if the number is too big to be represented
-	assign Overflow = WExpTmp >= MaxExp & ~WExpTmp[12];
-
-	// Set Underflow flag if the number is too small to be represented in normal numbers
-	assign ProdUf = KillProdM & ZZeroM;
-	assign Underflow = SumExp[12] | ProdUf;
-
-	// Set Inexact flag if the result is different from what would be output given infinite precision
-	assign Inexact = (Sticky|Overflow| (Gaurd|Round))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
-
-	// Combine flags 
-	//		- FMA can't set the Divide by zero flag
-	//		- Don't set the underflow flag if the result is exact 
-	assign FmaFlagsM = {Invalid, 1'b0, Overflow, Underflow & Inexact, Inexact};
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/lza.sv
+++ b/wally-pipelined/src/fpu/FMA/lza.sv
@ -1,40 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	lop.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block implements a Leading One Predictor used to determine 
-//   the normalization shift count. 
-///////////////////////////////////////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////////////////////// 
-module lza(
-	input  logic	[163:0]	sum,		// sum
-	output logic	[8:0]	normcnt,	// normalization shift count
-	output logic		sumzero);	// sum = 0
-/////////////////////////////////////////////////////////////////////////////
-
-	// Scan position: number of bits skipped so far, starting at sum[163]
-
-	logic		[8:0]	pos;
-
-	// This is a behavioral model kept simple on purpose.  A real leading
-	// one predictor is a parallel prefix structure: it uses a fast carry
-	// chain to find only the first 0, and it works on the sources of the
-	// adder rather than on the finished result, which is what this model
-	// looks at.
-
-	always_comb begin
-		// Step down from sum[163] for as long as the examined bit is 0.
-		// The number of bits skipped is the shift count; it reaches 164
-		// when no bit of sum is set.
-		for (pos = 0; ~sum[163-pos] && pos <= 163; pos = pos+1) begin end
-		normcnt = pos;
-	end
-
-	// The sum is zero when none of its bits is set
-	assign sumzero = ~|sum;
-
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/multiply.sv
+++ b/wally-pipelined/src/fpu/FMA/multiply.sv
@ -1,136 +0,0 @@
-
-module multiply(xman, yman, xdenormE, ydenormE, xzeroE, yzeroE, rE, sE); 
-/////////////////////////////////////////////////////////////////////////////
-	// Mantissa multiplier: 27 booth rows reduced by an adder tree to two vectors, rE and sE
-	input logic 		[51:0]		xman;				// Fraction of multiplicand	x
-	input logic		[51:0]		yman;				// Fraction of multiplicand y	
-	input logic					xdenormE;		// is x denormalized	
-	input logic					ydenormE;		// is y denormalized	
-	input logic     			xzeroE;		// is x zero
-	input logic     			yzeroE;		// is y zero
-	output logic		[105:0]		rE;				//	partial product 1	
-	output logic		[105:0]		sE;				//	partial product 2	
-    // Internal nodes
-     wire        [54:0]      yExt; //y with leading 0, assumed 1 and appended 0
-     wire        [53:0]      xExt; //x with leading 0 and assumed 1
-     wire [26:0][1:0] add1;
-     wire [26:0][54:0] pp; 
-     wire [26:0] e;
-     logic [106:0] tmpsE;
-     logic [17:0][106:0] lv1add;
-     logic [11:0][106:0] lv2add;
-     logic [7:0][106:0] lv3add;
-     logic [3:0][106:0] lv4add;
-     logic [21:0][107:0] carryTmp;
-     wire [26:0][106:0] acc; 
-     // wire [105:0] acc
-    genvar i;	
-	// The assumed one is cleared when the input is denormalized or zero
-	assign xExt = {1'b0,~(xdenormE|xzeroE),xman};
-	assign yExt = {1'b0,~(ydenormE|yzeroE),yman, 1'b0};
-    // One booth instance per 3-bit window of yExt; consecutive windows overlap by one bit
-     generate
-        for(i=0; i<27; i=i+1) begin
-            booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
-        end
-     endgenerate
-	// Row i holds pp[i] with add1[i-1] appended below it and two more zero bits of padding than row i-1
-    assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
-    assign acc[1] = {49'b01,~e[1],pp[1],add1[0]}; 
-    assign acc[2] = {47'b01,~e[2],pp[2],add1[1], 2'b0};
-    assign acc[3] = {45'b01,~e[3],pp[3],add1[2], 4'b0};
-    assign acc[4] = {43'b01,~e[4],pp[4],add1[3], 6'b0};
-    assign acc[5] = {41'b01,~e[5],pp[5],add1[4], 8'b0};
-    assign acc[6] = {39'b01,~e[6],pp[6],add1[5], 10'b0};
-    assign acc[7] = {37'b01,~e[7],pp[7],add1[6], 12'b0};
-    assign acc[8] = {35'b01,~e[8],pp[8],add1[7], 14'b0};
-    assign acc[9] = {33'b01,~e[9],pp[9],add1[8], 16'b0};
-    assign acc[10] = {31'b01,~e[10],pp[10],add1[9], 18'b0};
-    assign acc[11] = {29'b01,~e[11],pp[11],add1[10], 20'b0};
-    assign acc[12] = {27'b01,~e[12],pp[12],add1[11], 22'b0};
-    assign acc[13] = {25'b01,~e[13],pp[13],add1[12], 24'b0};
-    assign acc[14] = {23'b01,~e[14],pp[14],add1[13], 26'b0};
-    assign acc[15] = {21'b01,~e[15],pp[15],add1[14], 28'b0};
-    assign acc[16] = {19'b01,~e[16],pp[16],add1[15], 30'b0};
-    assign acc[17] = {17'b01,~e[17],pp[17],add1[16], 32'b0};
-    assign acc[18] = {15'b01,~e[18],pp[18],add1[17], 34'b0};
-    assign acc[19] = {13'b01,~e[19],pp[19],add1[18], 36'b0};
-    assign acc[20] = {11'b01,~e[20],pp[20],add1[19], 38'b0};
-    assign acc[21] = {9'b01,~e[21],pp[21],add1[20], 40'b0};
-    assign acc[22] = {7'b01,~e[22],pp[22],add1[21], 42'b0};
-    assign acc[23] = {5'b01,~e[23],pp[23],add1[22], 44'b0};
-    assign acc[24] = {3'b01,~e[24],pp[24],add1[23], 46'b0};
-    assign acc[25] = {1'b0, ~e[25],pp[25],add1[24], 48'b0};
-    assign acc[26] = {pp[26],add1[25], 50'b0};
-	// Level 1: nine add3comp2 instances take the 27 rows three at a time and leave 18 (carry shifted left by one, plus sum)
-    //*** resize adders
-     generate
-        for(i=0; i<9; i=i+1) begin
-            add3comp2 #(.BITS(107)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
-                                           .carry(carryTmp[i][106:0]), .sum(lv1add[i*2+1]));
-            assign lv1add[i*2] = {carryTmp[i][105:0], 1'b0};
-        end
-     endgenerate
-	// Level 2: six add3comp2 instances, 18 rows in and 12 out
-     generate
-        for(i=0; i<6; i=i+1) begin
-            add3comp2 #(.BITS(107)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
-                                           .carry(carryTmp[i+9][106:0]), .sum(lv2add[i*2+1]));
-            assign lv2add[i*2] = {carryTmp[i+9][105:0], 1'b0};
-        end
-     endgenerate
-	// Level 3: four add3comp2 instances, 12 rows in and 8 out
-    generate
-        for(i=0; i<4; i=i+1) begin
-            add3comp2 #(.BITS(107)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
-                                            .carry(carryTmp[i+15][106:0]), .sum(lv3add[i*2+1]));
-            assign lv3add[i*2] = {carryTmp[i+15][105:0], 1'b0};
-        end
-    endgenerate
-
-	// Level 4: two add4comp2 instances, 8 rows in and 4 out (these use the full 108-bit carryTmp entry)
-    generate
-        for(i=0; i<2; i=i+1) begin
-            add4comp2 #(.BITS(107)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
-                                            .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
-            assign lv4add[i*2] = {carryTmp[i+19][105:0], 1'b0};
-        end
-    endgenerate
-	// Final level: one add4comp2 leaves a sum (sE) and a carry that is shifted left by one (rE)
-    add4comp2 #(.BITS(107)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
-                                    .carry(carryTmp[21]), .sum(tmpsE));
-    assign sE = tmpsE[105:0];
-    assign rE = {carryTmp[21][104:0], 1'b0};
-		// assign rE = 0;
-		// assign sE = acc[0] +
-		// 		   acc[1] +
-		// 		   acc[2] +
-		// 		   acc[3] +
-		// 		   acc[4] +
-		// 		   acc[5] +
-		// 		   acc[6] +
-		// 		   acc[7] +
-		// 		   acc[8] +
-		// 		   acc[9] +
-		// 		   acc[10] +
-		// 		   acc[11] +
-		// 		   acc[12] +
-		// 		   acc[13] +
-		// 		   acc[14] +
-		// 		   acc[15] +
-		// 		   acc[16] +
-		// 		   acc[17] +
-		// 		   acc[18] +
-		// 		   acc[19] +
-		// 		   acc[20] +
-		// 		   acc[21] +
-		// 		   acc[22] +
-		// 		   acc[23] +
-		// 		   acc[24] +
-		// 		   acc[25] +
-		// 		   acc[26];
-	// Disabled alternative kept for reference: a direct multiply of the two extended mantissas
-			// assign sE = {53'b0,~(xdenormE|xzeroE),xman}  *  {53'b0,~(ydenormE|yzeroE),yman};
-			// assign rE = 0;
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/normalize.sv
+++ b/wally-pipelined/src/fpu/FMA/normalize.sv
@ -1,147 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	normalize.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block performs the normalization shift.  It also
-//   generates the R and S bits for rounding.  Finally, it
-//   handles the special case of a zero sum.
-//
-//   v[53:2]  is the fraction component of the prerounded result.
-//   It can be bypassed back to the X or Z inputs of the FMAC
-//   for back-to-back operations. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module normalize(sum, zexp, normcnt, aeM, aligncntM, sumshiftM, sumshiftzeroM, sumzero, 
-				xzeroM, zzeroM, yzeroM, bsM, xdenormM, ydenormM, zdenormM, sticky, de0, resultdenorm, v); 
-/////////////////////////////////////////////////////////////////////////////
-	input logic     	[163:0]  	sum;            // sum
-	input logic     	[62:52]  	zexp;            // exponent of Z (indexed 62:52); base value for de0 below
-	input logic		[8:0] 		normcnt;     	// normalization shift count (l in the notes below)
-	input logic		[12:0] 		aeM;     	// ea + eb (see notes in the always_comb below)
-	input logic		[12:0] 		aligncntM;     	// alignment count (d in the notes below)
-	input logic		[8:0] 		sumshiftM;     	// left-shift amount applied to sum
-	input logic				sumshiftzeroM;	// when set (d > 2 path), v is taken from the unshifted sum
-	input logic				sumzero;	// sum is zero
-	input logic				bsM;		// sticky bit for addend
-	input logic                  		xdenormM;        // Input X is denormalized
-	input logic                  		ydenormM;        // Input Y is denormalized
-	input logic                  		zdenormM;        // Input Z is denormalized
-	input logic				xzeroM;		// presumably X is zero; with yzeroM selects zexp as de0
-	input logic				yzeroM;		// presumably Y is zero
-	input logic				zzeroM;		// presumably Z is zero; subtracted from the denormal shift amount
-	output logic				sticky;		//sticky bit
-	output logic		[12:0]		de0;		// exponent chosen together with the normalized sum
-	output logic                  	resultdenorm;        // Result is denormalized (de0 is zero or has bit 12 set)
-	output logic		[53:0]		v;		// normalized sum, R, S bits
-
-	// Internal nodes
-
-logic       	[163:0]  	sumshifted;     // shifted sum
-	logic		[9:0]		sumshifttmp;	// sumshiftM - 2, one bit wider; bit 9 set means the subtraction went negative
-	logic       	[163:0]  	sumshiftedtmp;     // shifted sum (declared but not used in this module)
-	logic				isShiftLeft1;
-logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5; // tmp is unused; tmp1..tmp5 are written below but never read. NOTE(review): they and sumshifttmp are assigned only in the else branch of the always_comb
-
-	// When the sum is zero,  normalization does not apply and only the
-	// sticky bit must be computed.  Otherwise,  the sum is left-shifted
-	// and the R and S bits (v[1]  and v[0],  respectively) are assigned.
-
-	// The R bit is also set on denormalized numbers where the exponent
-	// was computed to be exactly -1023 and the L bit was set.  This
-	// is required for correct rounding up of multiplication results.
-
-	// The sticky bit calculation is actually built into the shifter and
-	// does not require a true subtraction shown in the model.
- 
-	assign isShiftLeft1 = (aligncntM == 13'b1 ||aligncntM == 13'b0 || $signed(aligncntM) == $signed(-(13'b1)))&& zexp == 11'h2;
-	// assign tmp = ($signed(aeM-normcnt+2) >= $signed(-1022));
-	always_comb
-		begin
-		// d = aligncntM
-		// l = normcnt
-		// p = 53
-		// ea + eb = aeM
-			// set d<=2 to d<=0
-			if ($signed(aligncntM)<=$signed(13'd2))  begin //d<=2 
-				// product anchored or cancellation
-				if ($signed(aeM-{{4{normcnt[8]}},normcnt}+13'd2) >= $signed(-(13'd1022))) begin //ea+eb-l+2 >= emin
-					//normal result
-					de0 = xzeroM|yzeroM ? {2'b0,zexp} : aeM-{{4{normcnt[8]}},normcnt}+{12'b0,xdenormM}+{12'b0,ydenormM}+13'd57;
-					resultdenorm = |sum & ~|de0 | de0[12];
-					// if z is zero then there was a 56 bit shift of the product
-					sumshifted = resultdenorm ? sum << sumshiftM-{8'b0,zzeroM}+{8'b0,isShiftLeft1} : sum << normcnt; // p+2+l
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					//de0 = aeM-normcnt+2-1023;
-				end else begin
-					sumshifted = sum << (13'd1080+aeM);
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					resultdenorm = 1;
-					de0 = 0;
-				end
-
-			end else begin                 // extract normalized bits
-				sumshifttmp = {1'b0,sumshiftM} - 2;
-				sumshifted = sumshifttmp[9] ? sum : sum << sumshifttmp;
-				tmp1 = (sumshifted[163] & ~sumshifttmp[9]);
-				tmp2 = ((sumshifttmp[9] & sumshiftM[0]) || sumshifted[162]);
-				tmp3 = (sumshifted[161] || (sumshifttmp[9] & sumshiftM[1]));
-				tmp4 = sumshifted[160];
-				tmp5 = sumshifted[159];
-				// for some reason use exp = zexp + {0,1,2}
-				// the book says exp = zexp + {-1,0,1}
-				if(sumshiftzeroM) begin
-					v = sum[162:109];
-					sticky = (|sum[108:0]) | bsM;
-					de0 = {2'b0,zexp};
-				end else if(sumshifted[163] & ~sumshifttmp[9])begin
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					de0 = {2'b0,zexp} +13'd2;
-				end else if ((sumshifttmp[9] & sumshiftM[0]) || sumshifted[162]) begin
-					v = sumshifted[161:108];
-					sticky = (|sumshifted[107:0]) | bsM;
-					de0 = {2'b0,zexp}+13'd1;
-				end else if (sumshifted[161] || (sumshifttmp[9] & sumshiftM[1])) begin
-					v = sumshifted[160:107];
-					sticky = (|sumshifted[106:0]) | bsM;
-					//de0 = zexp-1;
-					de0 = {2'b0,zexp}+{12'b0,zdenormM};
-				end else if(sumshifted[160]& ~zdenormM) begin
-					de0 = {2'b0,zexp}-13'b1;
-					v = ~|de0&~sumzero ? sumshifted[160:107] : sumshifted[159:106];
-					sticky = (|sumshifted[105:0]) | bsM;
-					//de0 = zexp-1;
-				end else if(sumshifted[159]& ~zdenormM) begin
-					//v = sumshifted[158:105];
-					de0 = {2'b0,zexp}-13'd2;
-					v = (~|de0 | de0[12])&~sumzero ? sumshifted[161:108] : sumshifted[158:105];
-					sticky = (|sumshifted[104:0]) | bsM;
-					//de0 = zexp-1;
-				end else if(zdenormM) begin					
-					v = sumshifted[160:107];
-					sticky = (|sumshifted[106:0]) | bsM;
-					//de0 = zexp-1;
-					de0 = {{2{zexp[62]}},zexp};
-				end else begin
-					de0 = 0;
-					sumshifted = sum << sumshiftM-1; // p+2+l
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-				end
-
-				resultdenorm = (~|de0 | de0[12]);
-		end 
-	end
-
-
-	// shift sum left by normcnt,  filling the right with zeros 
-	//assign sumshifted = sum << normcnt;
-	
-endmodule
-
-
--- a/wally-pipelined/src/fpu/FMA/round.sv
+++ b/wally-pipelined/src/fpu/FMA/round.sv
@ -1,124 +0,0 @@
-///////////////////////////////////////////////////////////////////////////// 
-// Block Name:	round.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description: 
-//   This block is responsible for rounding the normalized result of
-//   the FMAC.   Because prenormalized results may be bypassed back to
-//   the FMAC X and Z inputs, rounding does not appear in the critical
-//   path of most floating point code.   This is good because rounding
-//   requires an entire 52 bit carry-propagate half-adder delay.
-//
-//   The results from other FPU blocks (e.g. FCVT,  FDIV,  etc)  are also 
-//   muxed in to form the actual result for register file writeback.  This
-//   saves a mux from the writeback path.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module round(v, sticky, FrmM, wsign,
-			  FmaFlagsM, inf, nanM, xnanM, ynanM, znanM, 
-			  xman, yman, zman,
-			  wman, infinity, specialsel,expplus1);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic		[53:0]		v;		// normalized sum, R, S bits
-	input logic				sticky;		//sticky bit
-	input logic		[2:0]	FrmM;		// rounding mode, decoded by the case statement below
-	input logic				wsign;		// Sign of result
-	input logic 		[4:0]	FmaFlagsM;	// flags as used below: [4] invalid, [2] overflow, [1] underflow
-	input logic				inf;		// Some input is infinity
-	input logic				nanM;		// Some input is NaN
-	input logic				xnanM;		// X is NaN
-	input logic				ynanM;		// Y is NaN
-	input logic				znanM;		// Z is NaN
-	input logic		[51:0]		xman;		// mantissa of input X
-	input logic		[51:0]		yman;		// mantissa of input Y
-	input logic		[51:0]		zman;		// mantissa of input Z
-	output logic		[51:0]		wman; 		// rounded result of FMAC
-	output logic				infinity;    	// Generate infinity on overflow
-	output logic				specialsel;  	// Select special result
-	output logic				expplus1;	// set when the +1 round carries into v1[52] and no special result is selected
-
-	// Internal nodes
-
-	logic				plus1;		// Round by adding one 
-	wire		[52:0]		v1;		// Result + 1 (for rounding)
-	wire		[51:0]		specialres;	// Result of exceptional case 
-	wire		[51:0]		infinityres;	// Infinity or largest real number
-	wire		[51:0]		nanres;		// Propagated or generated NaN 
-
-	// Compute if round should occur.  This equation is derived from
-	// the rounding tables.
-
-	// round to infinity - plus1 if positive
-	// round to -infinity - plus1 if negative
-	// round to zero - do nothing
-	// round to nearest even
-	//	{v[1], v[0], sticky}
-	//	0xx - do nothing
-	//	100 - tie - plus1 if v[2] = 1
-	//	101/110/111 - plus1
-	always_comb begin
-		case (FrmM)
-			3'b000: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2])));//round to nearest even
-			3'b001: plus1 = 0;//round to zero
-			3'b010: plus1 = wsign;//round down
-			3'b011: plus1 = ~wsign;//round up
-			3'b100: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&~wsign)));//round to nearest max magnitude
-			default: plus1 = 1'bx;
-		endcase
-	end
-	// assign plus1 = (rn & v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2]))) |
-	// 	       (rp & ~wsign) |
-	// 	       (rm & wsign);
-	//assign plus1 = rn && ((v[1] && v[0]) || (v[2] && (v[1]))) ||
-	//				 rp && ~wsign && (v[1] || v[0]) ||
-	//				 rm && wsign && (v[1] || v[0]);
-
-	// Compute rounded result 
-    assign v1 = v[53:2] + 1;
-	// Determine if postnormalization is necessary
-	// Predicted by all bits =1 before round +1
-
-	//assign postnormalize = &(v[53:2]) && plus1;
-
-	// Determine special result in the event of selection of a result from
-	// another FPU functional unit,  infinity, NAN,  or underflow
-	// The special result mux is a 4:1 mux that should not appear in the
-	// critical path of the machine.   It is not priority encoded,  despite
-	// the code below suggesting otherwise.  Also,  several of the identical data
-	// inputs to the wide muxes can be combined at the expense of more
-	// complicated non-critical control in the circuit implementation.
-
-	assign specialsel =  FmaFlagsM[2] ||  FmaFlagsM[1] ||  FmaFlagsM[4] || //overflow underflow invalid
-							nanM || inf;
-	assign specialres = FmaFlagsM[4] | nanM ? nanres : //invalid
-						 FmaFlagsM[2] ? infinityres : //overflow
-						 inf ? 52'b0 :
-						 FmaFlagsM[1] ? 52'b0 : 52'bx;  // underflow
-
-	// Overflow is handled differently for different rounding modes
-	// Round is to either infinity or to maximum finite number
-
-	assign infinity =  |FrmM;//rn || (rp && ~wsign) || (rm && wsign);//***look into this
-	assign infinityres = infinity ? 52'b0 : {52{1'b1}};
-
-	// Invalid operations produce a quiet NaN. The result should
-	// propagate an input if the input is NaN. The propagated payload
-	// has its top mantissa bit forced to 1 in nanres below, so it is always quiet.
-
-	// assign nanres = xnanM ? x: (ynanM ? y : (znanM ? z : {1'b1, 51'b0})); // original
-
-	// IEEE 754-2008 section 6.2.3 states:
-	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
-	// identical to the payload of one of the input NaNs if representable in the destination
-	// format. This standard does not specify which of the input NaNs will provide the payload."
-	assign nanres = xnanM ? {1'b1, xman[50:0]}: (ynanM ? {1'b1, yman[50:0]} : (znanM ? {1'b1, zman[50:0]} : {1'b1, 51'b0}));// KEP 210112 add the 1 to make NaNs quiet
-
-	// Select result with 4:1 mux
-	// If the sum is zero and we round up,  there is a special case in
-	// which we produce a massive loss of significance and trap to software.
-	// It is handled in the exception unit. 
-	assign expplus1 = v1[52] & ~specialsel & plus1;
-	assign wman = specialsel ? specialres : (plus1 ? v1[51:0] : v[53:2]);
-	
-endmodule
-
--- a/wally-pipelined/src/fpu/FMA/sign.sv
+++ b/wally-pipelined/src/fpu/FMA/sign.sv
@ -1,111 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	sign.v
-// Author:		David Harris
-// Date:		12/1/1995
-//
-// Block Description:
-//   This block manages the signs of the numbers.
-//   1 =  negative
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module sign(xsign, ysign, zsign, negsum0, negsum1, bsM, FrmM, FmaFlagsM, 
-			 sumzero, zinfM, inf, wsign, invz, negsum, selsum1, isAdd);
-////////////////////////////////////////////////////////////////////////////
- 
-	input logic					xsign;			// Sign of X 
-	input logic					ysign;			// Sign of Y 
-	input logic					zsign;			// Sign of Z
-	input logic					isAdd;			// selects the addition branch of the zerosign equation below
-	input logic					negsum0;		// Sum in +0 mode is negative 
-	input logic					negsum1;		// Sum in +1 mode is negative 
-	input logic					bsM;				// sticky bit from addend
-	input logic		[2:0]		FrmM;				// Rounding mode; 3'b010 is tested below for the sign of an exact zero
-	input logic		[4:0]		FmaFlagsM;				// Flags as used below: [4] forces wsign to 0, [1] selects psign as the zero sign
-	input logic					sumzero;		// Sum = 0
-	input logic					zinfM;			// Z = Inf
-	input logic					inf;			// Some input = Inf
-	output logic					wsign;			// Sign of W 
-	output logic					invz;			// Invert addend into adder
-	output logic					negsum;			// Negate result of adder
-	output logic					selsum1;		// Select +1 mode from compound adder
- 
-	// Internal nodes
-
-	wire					zerosign;    	// sign if result= 0 
-	wire					sumneg;    	// sign used when the result is neither invalid, infinite, nor zero
-	wire					infsign;     	// sign if result= Inf 
-logic tmp; // assigned below but not read in this module
-
-	// Compute sign of product 
-
-	assign psign = xsign ^ ysign; // NOTE(review): psign has no explicit declaration in this module (implicit net)
-
-	// Invert addend if sign of Z is different from sign of product
-
-	//do you invert z
-	assign invz = (zsign ^ psign);
-
-	assign selsum1 = invz;
-	//negate sum if it is negative
-	assign negsum = (selsum1&negsum1) | (~selsum1&negsum0);
-	// is the sum negative
-	// 	if p - z is the sum negative
-	// 	if -p + z is the sum positive
-	// 	if -p - z then the sum is negative
-	assign sumneg = invz&zsign&negsum1 | invz&psign&~negsum1 | (zsign&psign);
-	//always @(invz or negsum0 or negsum1 or bsM or ps)
-	//	begin
-	//		if (~invz) begin               // both inputs have same sign  
-	//			negsum = 0;
-	//			selsum1 = 0;
-	//		end else if (bsM) begin        // sticky bit set on addend
-	//			selsum1 = 0;
-	//			negsum = negsum0; 
-	//		end else if (ps) begin 		// sticky bit set on product
-	//			selsum1 = 1;
-	//			negsum =  negsum1;
-	//		end else begin 				// both sticky bits clear
-	//			//selsum1 = negsum1; 	// KEP 210113-10:44 Selsum1 was adding 1 to values that were multiplied by 0
-	//			 selsum1 = ~negsum1; //original
-	//			negsum = negsum1;
-	//	end 
-	//end
-
-	// Compute sign of result
-	// This involves a special case when the sum is zero:
-	//   x+x retains the same sign as x even when x = +/- 0.
-	//   otherwise,  x-x = +0 unless in the RM mode when x-x = -0
-	// There is also a special case for NaNs and invalid results;
-	// the sign of the NaN produced is forced to be 0.
-	// Sign calculation is not in the critical path so the cases
-	// can be tolerated. 
-	// IEEE 754-2008 section 6.3 states 
-	// 		"When either an input or result is NaN, this standard does not interpret the sign of a NaN."
-	// 		also pertaining to negZero it states:
-	//			"When the sum/difference of two operands with opposite signs is exactly zero, the sign of that sum/difference
-	//			 shall be +0 in all rounding attributes EXCEPT roundTowardNegative. Under that attribute, the sign of an exact zero 
-	//			 sum/difference shall be -0.  However, x+x = x-(-X) retains the same sign as x even when x is zero."
- 
-	//assign zerosign = (~invz && killprodM) ? zsign : rm;//***look into
-//	assign zerosign = (~invz && killprodM) ? zsign : 0;
-	// zero sign
-	//	if product underflows then use psign
-	//	otherwise
-	//		addition
-	//			if cancelation then 0 unless round to -inf
-	//			otherwise psign
-	//		subtraction
-	//			if cancelation then 0 unless round to -inf
-	//			otherwise psign
-
-	assign zerosign = FmaFlagsM[1] ? psign :
-			  (isAdd ? (psign^zsign ? FrmM == 3'b010 : psign) :
-				  (psign^zsign ? psign : FrmM == 3'b010));
-	assign infsign = zinfM ? zsign : psign; //KEP 210112 keep the correct sign when result is infinity
-	//assign infsign = xinfM ? (yinfM ? psign : xsign) : yinfM ? ysign : zsign;//original
-	assign tmp = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum));
-	assign wsign = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : sumneg));
-
-endmodule
--- a/wally-pipelined/src/fpu/FMA/special.sv
+++ b/wally-pipelined/src/fpu/FMA/special.sv
@ -1,67 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	special.v
-// Author:		David Harris
-// Date:		12/2/1995
-//
-// Block Description:
-//   This block implements special case handling for unusual operands (e.g. 
-//   0, NaN,  denormalized,  infinity).   The block consists of zero/one detectors.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module special(ReadData1E, ReadData2E, ReadData3E, xzeroE, yzeroE, zzeroE,
-				xnanE, ynanE, znanE, xdenormE, ydenormE, zdenormE, xinfE, yinfE, zinfE);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic   	[63:0]     	ReadData1E;              // Input ReadData1E
-	input logic     	[63:0]     	ReadData2E;           	// Input ReadData2E
-	input logic      	[63:0]    	ReadData3E;            	// Input ReadData3E 
-	output logic				xzeroE;		// Input ReadData1E = 0
-	output logic				yzeroE;		// Input ReadData2E = 0
-	output logic				zzeroE;		// Input ReadData3E = 0
-	output logic				xnanE;		// ReadData1E is NaN
-	output logic				ynanE;		// ReadData2E is NaN
-	output logic				znanE;		// ReadData3E is NaN
-	output logic				xdenormE;	// ReadData1E is denormalized
-	output logic				ydenormE;	// ReadData2E is denormalized
-	output logic				zdenormE;	// ReadData3E is denormalized
-	output logic				xinfE;		// ReadData1E is infinity
-	output logic				yinfE;		// ReadData2E is infinity
-	output logic				zinfE;		// ReadData3E is infinity
-
-	// In the actual circuit design, the gates looking at bits
-	// 51:0 and at bits 62:52 should be shared among the various detectors.
-
-	// Check if input is NaN (bits 62:52 all ones and bits 51:0 nonzero)
-
-	assign xnanE = &ReadData1E[62:52] && |ReadData1E[51:0]; 
-	assign ynanE = &ReadData2E[62:52] && |ReadData2E[51:0]; 
-	assign znanE = &ReadData3E[62:52] && |ReadData3E[51:0];
-
-	// Check if input is denormalized (bits 62:52 all zero and bits 51:0 nonzero)
-
-	assign xdenormE = ~(|ReadData1E[62:52]) && |ReadData1E[51:0]; 
-	assign ydenormE = ~(|ReadData2E[62:52]) && |ReadData2E[51:0]; 
-	assign zdenormE = ~(|ReadData3E[62:52]) && |ReadData3E[51:0];
-
-	// Check if input is infinity (bits 62:52 all ones and bits 51:0 zero)
-
-	assign xinfE = &ReadData1E[62:52] && ~(|ReadData1E[51:0]); 
-	assign yinfE = &ReadData2E[62:52] && ~(|ReadData2E[51:0]); 
-	assign zinfE = &ReadData3E[62:52] && ~(|ReadData3E[51:0]);
-
-	// Check if inputs are zero (bits 62:0 all zero; the sign bit is ignored).
-	// The earlier version also forced denormalized inputs to zero;
-	// that form is kept commented out below and is no longer active,
-	// so a denormalized input is not reported as zero.
-	
-	// KATHERINE - commented following (21/01/11)
-	// assign xzeroE = ~(|ReadData1E[62:0]) || xdenormE;
-	// assign yzeroE = ~(|ReadData2E[62:0]) || ydenormE;
-	// assign zzeroE = ~(|ReadData3E[62:0]) || zdenormE;
-	// KATHERINE - removed denorm to prevent outputting zero when computing with a denormalized number
-	assign xzeroE = ~(|ReadData1E[62:0]);
-	assign yzeroE = ~(|ReadData2E[62:0]);
-	assign zzeroE = ~(|ReadData3E[62:0]);
- endmodule
--- a/wally-pipelined/src/fpu/FMA/tbgen/StineVectors
+++ b/wally-pipelined/src/fpu/FMA/tbgen/StineVectors
--- a/wally-pipelined/src/fpu/FMA/tbgen/testMini
+++ b/wally-pipelined/src/fpu/FMA/tbgen/testMini
--- a/wally-pipelined/src/fpu/faddcvt.sv
+++ b/wally-pipelined/src/fpu/faddcvt.sv
@ -0,0 +1,417 @@
+//
+// File name : fpadd
+// Title     : Floating-Point Adder/Subtractor
+// project   : FPU
+// Library   : fpadd
+// Author(s) : James E. Stine, Jr., Brett Mathis
+// Purpose   : definition of main unit to floating-point add/sub
+// notes :   
+//
+// Copyright Oklahoma State University
+// Copyright AFRL
+//
+// Basic and Denormalized Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
+//         or if (exp1 = exp2 AND mnt1 < mnt2)
+// Step 4: Shift the mantissa corresponding to the smaller exponent, 
+//          and extend precision by three bits to the right.
+// Step 5: Add or subtract the mantissas.
+// Step 6: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 7: Round the result.
+// Step 8: Put sum onto output.
+//
+
+module faddcvt(
+   input logic          clk,
+   input logic          reset,
+   input logic          FlushM,
+   input logic          StallM,
+   input logic  [63:0]  SrcXE,		// 1st input operand (A)
+   input logic  [63:0]  SrcYE,		// 2nd input operand (B)
+   input logic  [3:0]   FOpCtrlE, FOpCtrlM,	// Function opcode
+   input logic          FmtE, FmtM,   		// Result Precision (0 for double, 1 for single) -- NOTE(review): fpuaddcvt1 documents FmtE as 1 for double, 0 for single; confirm which is right
+   input logic  [2:0] 	FrmM,		// Rounding mode - specify values 
+   output logic [63:0]  FAddResM,	// Result of operation
+   output logic [4:0]   FAddFlgM);   	// IEEE exception flags 
+   
+   logic [63:0] 	AddSumE, AddSumM;
+   logic [63:0]   AddSumTcE, AddSumTcM;
+   logic [3:0] 	AddSelInvE, AddSelInvM;
+   logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
+   logic 		   AddCorrSignE, AddCorrSignM;
+   logic          AddOp1NormE, AddOp1NormM;
+   logic          AddOp2NormE, AddOp2NormM;
+   logic          AddOpANormE,  AddOpANormM;
+   logic          AddOpBNormE, AddOpBNormM;
+   logic          AddInvalidE, AddInvalidM;
+   logic 		   AddDenormInE, AddDenormInM;
+   logic          AddSwapE, AddSwapM;
+   logic          AddNormOvflowE, AddNormOvflowM; //***this isn't used in addcvt2 (it is registered below but not connected to fpadd2)
+   logic          AddSignAE, AddSignAM;
+   logic 		   AddConvertE, AddConvertM;
+   logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
+   logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
+   logic [10:0] 	AddExponentE, AddExponentM;
+
+
+   fpuaddcvt1 fpadd1 (.SrcXE, .SrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
+                     .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
+                     .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
+                     .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
+   // Registers carrying the *E outputs of fpadd1 to the *M inputs of fpadd2. flopenrc is defined elsewhere; its arguments here are (clk, reset, FlushM, ~StallM, d, q) -- presumably clear on FlushM and enable on ~StallM, confirm against flopenrc.
+   flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
+   flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
+   flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
+   flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
+   flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
+   flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
+   flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
+   flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
+   flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, // 15 = 4-bit AddSelInv + 11 single-bit control signals
+                           {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
+                           {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
+
+                     
+   fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
+                     .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
+                     .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
+                     .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
+endmodule
+
+module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, SrcXE, SrcYE, FOpCtrlE, FmtE);
+
+   input logic [63:0] SrcXE;		// 1st input operand (A)
+   input logic [63:0] SrcYE;		// 2nd input operand (B)
+   input logic [3:0]	FOpCtrlE;	// Function opcode
+   input logic 	FmtE;   		// Result Precision (1 for double, 0 for single)
+
+   wire          P;
+   assign P = ~FmtE; // inverted format bit; this is what convert_inputs receives as its precision argument
+
+   wire [63:0] 	 IntValue;
+   wire [11:0] 	 exp1, exp2;
+   wire [11:0] 	 exp_diff1, exp_diff2;
+   wire [11:0] 	 exp_shift;
+   wire [51:0] 	 mantissaA;
+   wire [56:0] 	 mantissaA1;
+   wire [63:0] 	 mantissaA3;
+   wire [51:0] 	 mantissaB; 
+   wire [56:0] 	 mantissaB1, mantissaB2;
+   wire [63:0] 	 mantissaB3;
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire          sub;
+   wire 	 zeroB;
+   wire [5:0]	 align_shift; 
+
+   output logic [63:0] 	 AddFloat1E; 
+   output logic [63:0] 	 AddFloat2E;
+   output logic [10:0] 	 AddExponentE;
+   output logic [10:0]	 AddExpPostSumE;
+   output logic [11:0]	 AddExp1DenormE, AddExp2DenormE;//KEP used to be [10:0]
+   output logic [63:0] AddSumE, AddSumTcE;
+   output logic [3:0]  AddSelInvE;
+   output logic        AddCorrSignE;
+   output logic 	 AddSignAE;
+   output logic	 AddOp1NormE, AddOp2NormE;
+   output logic	 AddOpANormE, AddOpBNormE;
+   output logic	 AddInvalidE;
+   output logic 	 AddDenormInE;
+//   output logic 	 exp_valid;
+   output logic 	 AddConvertE;
+   output logic        AddSwapE;
+   output logic 	 AddNormOvflowE;
+   wire [5:0]	 ZP_mantissaA;
+   wire [5:0]	 ZP_mantissaB;
+   wire		 ZV_mantissaA;
+   wire		 ZV_mantissaB;
+
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the FOpCtrlE , and their precision P. 
+   // Single precision inputs are converted to double precision 
+   // and the sign of the first operand is set appropriately based on
+   // if the operation is absolute value or negation. 
+
+   convert_inputs conv1 (AddFloat1E, AddFloat2E, SrcXE, SrcYE, FOpCtrlE, P);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" Input Flags. The "AddSelInvE" is used in
+   // the third pipeline stage to select the result. Also, AddOp1NormE
+   // and AddOp2NormE are one if SrcXE and SrcYE are not zero or denormalized.
+   // sub is one if the effective operation is subtraction. 
+
+   exception exc1 (AddSelInvE, AddInvalidE, AddDenormInE, AddOp1NormE, AddOp2NormE, sub, 
+		   AddFloat1E, AddFloat2E, FOpCtrlE);
+
+   // Perform Exponent Subtraction (used for alignment). For performance
+   // both exponent subtractions are performed in parallel. This was 
+   // changed to a behavior level to allow the tools to  try to optimize
+   // the two parallel additions. The input values are zero-extended to 12 
+   // bits prior to performing the addition. 
+
+   assign exp1 = {1'b0, AddFloat1E[62:52]};
+   assign exp2 = {1'b0, AddFloat2E[62:52]};
+   assign exp_diff1 = exp1 - exp2;
+   assign exp_diff2 = AddDenormInE ? ({AddFloat2E[63], exp2[10:0]} - {AddFloat1E[63], exp1[10:0]}): exp2 - exp1; // with a denormal input the operand sign bits replace the zero extension
+
+   // The second operand (B) should be set to zero, if FOpCtrlE does not
+   // specify addition or subtraction
+   assign zeroB = FOpCtrlE[2] | FOpCtrlE[1];
+
+   // Swapped operands if zeroB is not one and exp1 < exp2. 
+   // Swapping causes exp2 to be used for the result exponent. 
+   // Only the exponent of the larger operand is used to determine
+   // the final result. 
+   assign AddSwapE = exp_diff1[11] & ~zeroB;
+   assign AddExponentE = AddSwapE ? exp2[10:0] : exp1[10:0];
+   assign AddExpPostSumE = AddSwapE ? exp2[10:0] : exp1[10:0]; // same expression as AddExponentE
+   assign mantissaA = AddSwapE ? AddFloat2E[51:0] : AddFloat1E[51:0];
+   assign mantissaB = AddSwapE ? AddFloat1E[51:0] : AddFloat2E[51:0];
+   assign AddSignAE     = AddSwapE ? AddFloat2E[63] : AddFloat1E[63];   
+
+   // Leading-Zero Detectors on the two (possibly swapped) operand mantissas.
+   // The ZP_* outputs are used below as leading-zero counts; the ZV_* outputs
+   // (presumably valid flags -- confirm against lz52) are not used in this module.
+   // modified to 52 bits to detect leading zeroes on denormalized mantissas
+   lz52 lz_norm_1 (ZP_mantissaA, ZV_mantissaA, mantissaA);
+   lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);
+
+   // Denormalized exponents created by subtracting the leading zeroes from the original exponents
+   assign AddExp1DenormE = AddSwapE ? (exp1 - {6'b0, ZP_mantissaB}) : (exp1 - {6'b0, ZP_mantissaA}); //KEP extended ZP_mantissa 
+   assign AddExp2DenormE = AddSwapE ? (exp2 - {6'b0, ZP_mantissaA}) : (exp2 - {6'b0, ZP_mantissaB});
+
+   // Determine the alignment shift and limit it to 63. If any bit from 
+   // exp_shift[6] to exp_shift[11] is one, then shift is set to all ones. 
+   assign exp_shift = AddSwapE ? exp_diff2 : exp_diff1;
+   assign exp_gt63 = exp_shift[11] | exp_shift[10] | exp_shift[9] 
+     | exp_shift[8] | exp_shift[7] | exp_shift[6];
+   assign align_shift = exp_shift[5:0] | {6{exp_gt63}}; //KEP used to be all of exp_shift
+
+   // Unpack the 52-bit mantissas to 57-bit numbers of the form.
+   //    001.M[51]M[50] ... M[1]M[0]00
+   // Unless the number has an exponent of zero, in which case it
+   // is unpacked as
+   //    000.00 ... 00
+   // This effectively flushes denormalized values to zero. 
+   // The three bits of to the left of the binary point prevent overflow
+   // and loss of sign information. The two bits to the right of the 
+   // original mantissa form the "guard" and "round" bits that are used
+   // to round the result. 
+   assign AddOpANormE = AddSwapE ? AddOp2NormE : AddOp1NormE;
+   assign AddOpBNormE = AddSwapE ? AddOp1NormE : AddOp2NormE;
+   assign mantissaA1 = {2'h0, AddOpANormE, mantissaA[51:0]&{52{AddOpANormE}}, 2'h0};
+   assign mantissaB1 = {2'h0, AddOpBNormE, mantissaB[51:0]&{52{AddOpBNormE}}, 2'h0};
+
+   // Perform mantissa alignment using a 57-bit barrel shifter 
+   // If any of the bits shifted out are one, Sticky_out is set. 
+   // The size of the barrel shifter could be reduced by two bits
+   // by not adding the leading two zeros until after the shift. 
+   barrel_shifter_r57 bs1 (mantissaB2, Sticky_out, mantissaB1, align_shift);
+
+   // Place either the sign-extended 32-bit value or the original 64-bit value 
+   // into IntValue (to be used for integer to floating point conversion)
+   assign IntValue [31:0] = SrcXE[31:0];
+   assign IntValue [63:32] = FOpCtrlE[0] ? {32{SrcXE[31]}} : SrcXE[63:32];
+
+   // If doing an integer to floating point conversion, mantissaA3 is set to 
+   // IntValue (the prenormalized exponent of 1084 is not set in this module). Otherwise, 
+   // mantissaA3 is simply extended to 64-bits by setting the 7 LSBs to zero, 
+   // and the exponent value is left unchanged. 
+   // Under denormalized cases, the exponent before the rounder is set to 1
+   // if the normal shift value is 11.
+   assign AddConvertE       = ~FOpCtrlE[2] & FOpCtrlE[1];
+   assign mantissaA3    = (FOpCtrlE[3]) ? (FOpCtrlE[0] ? AddFloat1E : ~AddFloat1E) : (AddDenormInE ? ({12'h0, mantissaA}) : (AddConvertE ? IntValue : {mantissaA1, 7'h0})); // FOpCtrlE[3] overrides all other cases: AddFloat1E or its bitwise inverse
+
+   // Put zero in for mantissaB3, if zeroB is one. Otherwise, B is extended to 
+   // 64-bits by setting the 7 LSBs to the Sticky_out bit followed by six  
+   // zeros. When FOpCtrlE[3] is set, mantissaB3 is the constant 64'h1 instead.
+   assign mantissaB3[63:7] = (FOpCtrlE[3]) ? (57'h0) : (AddDenormInE ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
+   assign mantissaB3[6]    = (FOpCtrlE[3]) ? (1'b0) : (AddDenormInE ? mantissaB[6] : Sticky_out & ~zeroB);
+   assign mantissaB3[5:0]  = (FOpCtrlE[3]) ? (6'h01) : (AddDenormInE ? mantissaB[5:0] : 6'h0);
+
+   // The sign of the result needs to be corrected if the true
+   // operation is subtraction and the input operands were swapped. 
+   assign AddCorrSignE = ~FOpCtrlE[2]&~FOpCtrlE[1]&FOpCtrlE[0]&AddSwapE;
+
+   // 64-bit Mantissa Adder/Subtractor
+   cla64 add1 (AddSumE, mantissaA3, mantissaB3, sub); //***adder
+
+   // 64-bit Mantissa Subtractor - to get the two's complement of the 
+   // result when the sign from the adder/subtractor is negative. 
+   cla_sub64 sub1 (AddSumTcE, mantissaB3, mantissaA3); //***adder
+ 
+   // Finds normal underflow result to determine whether to round final exponent down
+   //***KEP used to be (AddSumE == 16'h0) I am unsure what it's supposed to be
+   assign AddNormOvflowE = (AddDenormInE & (AddSumE == 64'h0) & (AddOpANormE | AddOpBNormE) & ~FOpCtrlE[0]) ? 1'b1 : (AddSumE[63] ? AddSumTcE[52] : AddSumE[52]);
+
+endmodule // fpuaddcvt1
+
+
+//
+// File name : fpadd
+// Title     : Floating-Point Adder/Subtractor
+// project   : FPU
+// Library   : fpadd
+// Author(s) : James E. Stine, Jr., Brett Mathis
+// Purpose   : definition of main unit to floating-point add/sub
+// notes :   
+//
+// Copyright Oklahoma State University
+// Copyright AFRL
+//
+// Basic and Denormalized Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
+//         or if (exp1 = exp2 AND mnt1 < mnt2)
+// Step 4: Shift the mantissa corresponding to the smaller exponent, 
+//          and extend precision by three bits to the right.
+// Step 5: Add or subtract the mantissas.
+// Step 6: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 7: Round the result.
+// Step 8: Put the sum onto the output.
+//
+
+
+module fpuaddcvt2 (FAddResM, FAddFlgM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, FrmM, FOpCtrlM, FmtM);
+   // Selects AddSumM or AddSumTcM, corrects the sign, normalizes the sum and passes it to the rounder, which produces FAddResM and FAddFlgM.
+   input [2:0] 	FrmM;		// Rounding mode - specify values 
+   input [3:0]	FOpCtrlM;	// Function opcode
+   input 	FmtM;   		// Precision select, inverted below to form P. NOTE(review): fctrl/fma in this commit document Fmt as 0-single, 1-double, so the earlier "(0 for double, 1 for single)" wording appears to describe P rather than FmtM - confirm
+   // input 	AddOvEnM;		// Overflow trap enabled
+   // input 	AddUnEnM;   	// Underflow trap enabled
+   input [63:0] AddSumM, AddSumTcM;	// adder result and the result of the second (reversed-operand) subtractor
+   input [63:0] 	 AddFloat1M; 
+   input [63:0] 	 AddFloat2M;
+   input [11:0]	 AddExp1DenormM, AddExp2DenormM;
+   input [10:0] 	 AddExponentM, AddExpPostSumM; //exp_pre;
+   //input		 exp_valid;
+   input [3:0] 	 AddSelInvM;
+   input		 AddOp1NormM, AddOp2NormM;
+   input		 AddOpANormM, AddOpBNormM;
+   input		 AddInvalidM;
+   input 	 AddDenormInM; 
+   input 	 AddSignAM; 
+   input         AddCorrSignM;
+   input 	 AddConvertM;
+   input          AddSwapM;
+   // input 	 AddNormOvflowM;
+
+   output [63:0] FAddResM;	// Result of operation
+   output [4:0]  FAddFlgM;   	// IEEE exception flags 
+   wire 	 AddDenormM;   	// Receives DenormIO from the rounder; it is not a port, so it is only visible inside this module
+
+   wire          P;	// complement of FmtM; this is what the rounder receives
+   assign P = ~FmtM;
+
+   wire [10:0]   exp_pre;
+   wire [63:0] 	 Result;   
+   wire [63:0] 	 sum_norm, sum_norm_w_bypass;
+   wire [5:0] 	 norm_shift, norm_shift_denorm;
+   wire          exp_valid;
+   wire		 DenormIO;
+   wire [4:0] 	 FlagsIn;	
+   wire 	 Sticky_out;	// declared but not referenced anywhere else in this module
+   wire 	 sign_corr;
+   wire 	 zeroB;         // declared but not referenced anywhere else in this module
+   wire [10:0]	 AddExpPostSumM;	// net declaration for the AddExpPostSumM input port declared above
+   wire 	 mantissa_comp;
+   wire 	 mantissa_comp_sum;
+   wire 	 mantissa_comp_sum_tc;
+   wire 	 Float1_sum_comp;
+   wire 	 Float2_sum_comp;
+   wire 	 Float1_sum_tc_comp;
+   wire 	 Float2_sum_tc_comp;
+   wire 	 normal_underflow;
+   wire [63:0]   sum_corr;
+   logic AddNormOvflowM;
+ 
+   // The trap enables used to be inputs (see the commented-out ports above); they are now tied to 1 locally.
+   logic 	AddOvEnM;		// Overflow trap enabled
+   logic 	AddUnEnM;   	// Underflow trap enabled
+
+   assign AddOvEnM = 1'b1;
+   assign AddUnEnM = 1'b1;
+   //AddExponentM value pre-rounding with considerations for denormalized
+   //cases/conversion cases
+   assign exp_pre       = AddDenormInM ?
+                          ((norm_shift == 6'b001011) ? 11'b00000000001 : (AddSwapM ? AddExp2DenormM[10:0] : AddExp1DenormM[10:0])) // norm_shift of 11 forces the exponent to 1
+                          : (AddConvertM ? 11'b10000111100 : AddExponentM); // 11'b10000111100 = 1084, used for conversions
+
+
+   // Finds normal underflow result to determine whether to round final AddExponentM down
+   // Comparison between each float and the resulting AddSumM of the primary cla adder/subtractor and cla subtractor
+   assign Float1_sum_comp = (AddFloat1M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
+   assign Float2_sum_comp = (AddFloat2M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
+   assign Float1_sum_tc_comp = (AddFloat1M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
+   assign Float2_sum_tc_comp = (AddFloat2M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
+
+   // Determines the correct Float value to compare based on AddSwapM result
+   assign mantissa_comp_sum = AddSwapM ? Float2_sum_comp : Float1_sum_comp;
+   assign mantissa_comp_sum_tc = AddSwapM ? Float2_sum_tc_comp : Float1_sum_tc_comp;
+
+   // Determines the correct comparison result based on operation and sign of resulting AddSumM
+   assign mantissa_comp = (FOpCtrlM[0] ^ AddSumM[63]) ? mantissa_comp_sum_tc : mantissa_comp_sum;
+
+   // normal_underflow takes mantissa_comp when the two sign bits are EQUAL (~^ is XNOR) and AddOpANormM or AddOpBNormM is set; otherwise it is 0.
+   // NOTE(review): the earlier comment said "if the signs are different" - confirm whether XNOR or XOR is intended here.
+   assign normal_underflow = ((AddFloat1M[63] ~^ AddFloat2M[63]) & (AddOpANormM | AddOpBNormM)) ? mantissa_comp : 1'b0;
+
+   // Determine the correct sign of the result
+   assign sign_corr = ((AddCorrSignM ^ AddSignAM) & ~AddConvertM) ^ AddSumM[63];   
+   
+   // If the AddSumM is negative, use its two's complement instead. 
+   // This value has to be 64-bits to correctly handle the 
+   // case 10...00. The first branch (denormal input, effective subtract) makes the opposite selection; FOpCtrlM[3] bypasses the selection and uses AddSumM.
+   assign sum_corr = (AddDenormInM & (AddOpANormM | AddOpBNormM) & ( ( (AddFloat1M[63] ~^ AddFloat2M[63]) & FOpCtrlM[0] ) | ((AddFloat1M[63] ^ AddFloat2M[63]) & ~FOpCtrlM[0]) ))
+			 ? (AddSumM[63] ? AddSumM : AddSumTcM) : ( (FOpCtrlM[3]) ? AddSumM : (AddSumM[63] ? AddSumTcM : AddSumM));
+
+   // Finds normal underflow result to determine whether to round final AddExponentM down
+   //KEP used to be (AddSumM == 16'h0) not sure what it is supposed to be
+   assign AddNormOvflowM = (AddDenormInM & (AddSumM == 64'h0) & (AddOpANormM | AddOpBNormM) & ~FOpCtrlM[0]) ? 1'b1 : (AddSumM[63] ? AddSumTcM[52] : AddSumM[52]);
+
+   // Leading-Zero Detector. Determine the size of the shift needed for
+   // normalization. If sum_corr is all zeros, the exp_valid is 
+   // zero; otherwise, it is one. 
+   lz64 lzd1 (norm_shift, exp_valid, sum_corr);
+
+   assign norm_shift_denorm = (AddDenormInM & ( (~AddOpANormM & ~AddOpBNormM) | normal_underflow)) ? (6'h00) : (norm_shift); // no normalization shift in these denormal cases
+
+   // Barrel shifter used for normalization. It takes as inputs the 
+   // corrected sum (sum_corr) and the shift amount, and outputs the normalized sum. 
+   // NOTE(review): the earlier comment said "right shifted", but the instance is barrel_shifter_l64 - presumably a left shift; confirm.
+   barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift_denorm);
+  
+   assign sum_norm_w_bypass = (FOpCtrlM[3]) ? (FOpCtrlM[0] ? ~sum_corr : sum_corr) : (sum_norm); // FOpCtrlM[3] bypasses the normalizer
+
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. If the result is a single precision number, the actual 
+   // mantissa is in the upper 23 bits and the lower 29 bits are zero. 
+   // At this point, normalization has already been performed, so we know 
+   // exactly where the rounding point is. The rounding unit also
+   // handles special cases and sets the exception flags.
+
+   // Changed DenormIO -> AddDenormM and FlagsIn -> FAddFlgM in order to
+   // help in processor reservation station detection of load/stores. In
+   // other words, the processor would like to know ahead of time that
+   // if the result is an exception then don't load or store.
+   rounder round1 (Result, DenormIO, FlagsIn, FrmM, P, AddOvEnM, AddUnEnM, exp_valid, 
+		   AddSelInvM, AddInvalidM, AddDenormInM, AddConvertM, sign_corr, exp_pre, norm_shift, sum_norm_w_bypass,
+		   AddExpPostSumM, AddOp1NormM, AddOp2NormM, AddFloat1M[63:52], AddFloat2M[63:52],
+		   AddNormOvflowM, normal_underflow, AddSwapM, FOpCtrlM, AddSumM);
+
+   // Drive the final result and the exception flags onto the outputs (continuous assigns; nothing is registered in this module).
+   assign FAddResM = Result;
+   assign {AddDenormM, FAddFlgM} = {DenormIO, FlagsIn};
+   
+endmodule // fpuaddcvt2
+
+
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@ -64,38 +64,38 @@ module fctrl (
                                else if (Funct3D[1:0] == 2'b00) ControlsD = `FCTRLW'b0_1_100_0100_00_01_0_0; // fmv.x.w
                                else if (Funct3D[1:0] == 2'b01) ControlsD = `FCTRLW'b0_1_100_0101_00_01_0_0; // fmv.x.d
                                else                            ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
-                    7'b1100000: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0001_00_00_0_0; // fcvt.s.w
-                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0101_00_00_0_0; // fcvt.s.wu
-                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1001_00_00_0_0; // fcvt.s.l
-                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1101_00_00_0_0; // fcvt.s.lu
+                    7'b1101000: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0001_11_00_0_0; // fcvt.s.w
+                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0101_11_00_0_0; // fcvt.s.wu
+                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1001_11_00_0_0; // fcvt.s.l
+                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1101_11_00_0_0; // fcvt.s.lu
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
-                    7'b1101000: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b1_1_100_0010_00_00_0_0; // fcvt.w.s
-                                  2'b01:    ControlsD = `FCTRLW'b1_1_100_0110_00_00_0_0; // fcvt.wu.s
-                                  2'b10:    ControlsD = `FCTRLW'b1_1_100_1010_00_00_0_0; // fcvt.l.s
-                                  2'b11:    ControlsD = `FCTRLW'b1_1_100_1110_00_00_0_0; // fcvt.lu.s
+                    7'b1100000: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0010_11_11_0_0; // fcvt.w.s
+                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0110_11_11_0_0; // fcvt.wu.s
+                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1010_11_11_0_0; // fcvt.l.s
+                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1110_11_11_0_0; // fcvt.lu.s
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b1111000: ControlsD = `FCTRLW'b1_0_100_0000_00_00_0_0; // fmv.w.x
-                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_0000_00_00_0_0; // fcvt.s.d
-                    7'b1100001: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0001_00_00_0_0; // fcvt.d.w
-                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0101_00_00_0_0; // fcvt.d.wu
-                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1001_00_00_0_0; // fcvt.d.l
-                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1101_00_00_0_0; // fcvt.d.lu
+                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_0111_00_00_0_0; // fcvt.s.d
+                    7'b1101001: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0001_11_00_0_0; // fcvt.d.w
+                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0101_11_00_0_0; // fcvt.d.wu
+                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1001_11_00_0_0; // fcvt.d.l
+                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1101_11_00_0_0; // fcvt.d.lu
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
-                    7'b1101001: case(Rs2D[1:0])
-                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0010_00_00_0_0; // fcvt.w.d
-                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0110_00_00_0_0; // fcvt.wu.d
-                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1010_00_00_0_0; // fcvt.l.d
-                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1110_00_00_0_0; // fcvt.lu.d
+                    7'b1100001: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0010_11_11_0_0; // fcvt.w.d
+                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0110_11_11_0_0; // fcvt.wu.d
+                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1010_11_11_0_0; // fcvt.l.d
+                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1110_11_11_0_0; // fcvt.lu.d
                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                endcase
                    7'b1111001: ControlsD = `FCTRLW'b1_0_100_0001_00_00_0_0; // fmv.d.x
-                    7'b0100001: ControlsD = `FCTRLW'b1_0_100_0000_00_00_0_0; // fcvt.d.s
+                    7'b0100001: ControlsD = `FCTRLW'b1_0_010_0111_00_00_0_0; // fcvt.d.s
                    default:    ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                  endcase
      default:      ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
@ -109,7 +109,7 @@ module fctrl (
  // Precision
  //  0-single
  //  1-double
-  assign FmtD = FResultSelD == 3'b000 ? Funct3D[0] : Funct7D[0];
+  assign FmtD = FResultSelD == 3'b000 ? Funct3D[0] : OpD[6:1] == 6'b010000 ? ~Funct7D[0] : Funct7D[0];
  // div/sqrt
      //  fdiv  = ???0
      //  fsqrt = ???1
--- a/wally-pipelined/src/fpu/fcvt.sv
+++ b/wally-pipelined/src/fpu/fcvt.sv
@ -0,0 +1,163 @@
+
+// `include "wally-config.vh"
+module fcvt (
+    input logic [63:0] X,           // floating-point source operand (sign/exponent/fraction are sliced from it below)
+    input logic [64-1:0] SrcAE,     // integer source operand
+    input logic [3:0] FOpCtrlE,     // {long, unsigned, to int, from int}
+    input logic [2:0] FrmE,         // rounding mode, decoded in the CalcPlus1 case statement
+    input logic FmtE,               // 1 selects the 11-bit exponent / 1023 bias layout, 0 the 8-bit exponent / 127 bias layout
+    output logic [63:0] CvtResE,    // CvtRes (int -> fp) when FOpCtrlE[0], otherwise CvtIntRes (fp -> int)
+    output logic [4:0] CvtFlgE);    // {(Of|Uf)&FOpCtrlE[1], 3'b0, (Guard|Round|Sticky)&FOpCtrlE[0]}
+    // Integer <-> floating-point conversion unit. Both directions are computed; FOpCtrlE[0] selects which result is returned.
+    logic [10:0] XExp;
+    logic [51:0] XFrac;
+    logic XSgn;
+    logic [10:0] ResExp,TmpExp;
+    logic [51:0] ResFrac;
+    logic ResSgn;
+    logic [10:0] NormCnt;
+    logic [11:0]    Bias;   // 1023 for double, 127 for single
+    logic [7:0]    Bits, SubBits;
+    logic [64+51:0]    ShiftedManTmp;
+    logic [64+51:0]    ShiftVal;
+    logic [64+1:0]    ShiftedMan;
+    logic [64:0]	RoundedTmp;
+    logic [63:0]	Rounded;
+    logic [12:0]    ExpVal, ShiftCnt;
+    logic [64-1:0] PosInt;
+    
+    logic [64-1:0] CvtIntRes;
+    logic [63:0] CvtRes;
+    logic XFracZero, Of,Uf;
+    logic XExpMax;
+    logic XNaN, XDenorm, XInf, XZero;
+    logic Plus1,CalcPlus1, Guard, Round, LSB, Sticky;
+    logic SgnRes, In64;
+    logic Res64;
+    logic RoundMSB;
+    logic RoundSgn;
+    logic XExpZero;
+
+      //  fcvt.w.s  = 0010 -
+      //  fcvt.wu.s = 0110 -
+      //  fcvt.s.w  = 0001 
+      //  fcvt.s.wu = 0101 
+      //  fcvt.l.s  = 1010 -
+      //  fcvt.lu.s = 1110 -
+      //  fcvt.s.l  = 1001 
+      //  fcvt.s.lu = 1101 
+      //  fcvt.w.d  = 0010 - 
+      //  fcvt.wu.d = 0110 -
+      //  fcvt.d.w  = 0001 
+      //  fcvt.d.wu = 0101 
+      //  fcvt.l.d  = 1010 -
+      //  fcvt.lu.d = 1110 -
+      //  fcvt.d.l  = 1001 --
+      //  fcvt.d.lu = 1101 --
+      //  {long, unsigned, to int, from int} Fmt controls the output for fp -> fp
+    assign XSgn = X[63];
+    assign XExp = FmtE ? X[62:52] : {3'b0, X[62:55]};
+    assign XFrac = FmtE ? X[51:0] : {X[54:32], 29'b0};
+    assign XExpZero = ~|XExp;
+   
+    assign XFracZero = ~|XFrac;
+    assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
+    assign XNaN = XExpMax & ~XFracZero;
+    assign XDenorm = XExpZero & ~XFracZero;
+    assign XInf = XExpMax & XFracZero;
+    assign XZero = XExpZero & XFracZero;
+
+    // Res64 / In64: whether the result / the source is treated as 64 bits wide for this opcode and format
+    assign Bias = FmtE ? 12'h3ff : 12'h7f;
+    assign Res64 = ((FOpCtrlE==4'b1010 || FOpCtrlE==4'b1110) | (FmtE&(FOpCtrlE==4'b0001 | FOpCtrlE==4'b0101 | FOpCtrlE==4'b0000 | FOpCtrlE==4'b1001 | FOpCtrlE==4'b1101)));
+    assign In64 = ((FOpCtrlE==4'b1001 || FOpCtrlE==4'b1101) | (FmtE&(FOpCtrlE==4'b0010 | FOpCtrlE==4'b0110 | FOpCtrlE==4'b1010 | FOpCtrlE==4'b1110) | (FOpCtrlE==4'b1101 & ~FmtE)));
+    assign SubBits = In64 ? 8'd64 : 8'd32;
+    assign Bits = Res64 ? 8'd64 : 8'd32;
+    assign ExpVal = XExp - Bias + XDenorm;   // unbiased exponent; +1 for denormals, whose stored exponent is 0
+
+////////////////////////////////////////////////////////
+    // int -> fp path: place the integer in the top bits, take its magnitude, and find the leading one
+	logic [64-1:0] IntIn;
+    assign IntIn = FOpCtrlE[3] ? SrcAE : {SrcAE[31:0], 32'b0};   // 32-bit sources are moved into the upper half
+    assign PosInt = IntIn[64-1]&~FOpCtrlE[2] ? -IntIn : IntIn;   // negate only signed, negative inputs
+    assign ResSgn = ~FOpCtrlE[2] ? IntIn[64-1] : 1'b0;
+    
+	// Leading one detector
+	logic [8:0]	i;
+	always_comb begin
+			i = 0;
+			while ((i < 64) && ~PosInt[64-1-i]) i = i+1;  // search for leading one; the bound is tested first so PosInt is never indexed at -1 when it is all zeros
+			NormCnt = i+1;    // compute shift count
+	end
+    assign TmpExp = i==64 ? 0 : Bias + SubBits - NormCnt;   // i==64 means no one was found (PosInt is zero)
+
+
+
+
+////////////////////////////////////////////
+
+
+    // Shared shifter: fp -> int shifts the mantissa by the unbiased exponent, int -> fp shifts the integer by NormCnt
+    assign ShiftCnt = FOpCtrlE[1] ? ExpVal : NormCnt;
+    assign ShiftVal = FOpCtrlE[1] ? {{64-2{1'b0}}, ~(XDenorm|XZero), XFrac} : {PosInt, 52'b0};
+	//if shift = -1 then shift one bit right for round to nearest (shift over 2 never rounds)
+	// if the shift is negative add bit for sticky bit
+	// otherwise shift left
+    assign ShiftedManTmp = &ShiftCnt ? {{64-1{1'b0}}, ~(XDenorm|XZero), XFrac[51:1]} : ShiftCnt[12] ? {115'b0, ~XZero} : ShiftVal << ShiftCnt;
+
+    assign ShiftedMan = ShiftedManTmp[64+51:50];
+    assign Sticky = |ShiftedManTmp[49:0] | &ShiftCnt&XFrac[0] | (FOpCtrlE[0]&|ShiftedManTmp[62:50]) | (FOpCtrlE[0]&~FmtE&|ShiftedManTmp[91:63]);
+
+    
+    // determine guard, round, and least significant bit of the result
+    assign Guard = FOpCtrlE[1] ? ShiftedMan[1] : FmtE ? ShiftedMan[13] : ShiftedMan[42];
+    assign Round = FOpCtrlE[1] ? ShiftedMan[0] : FmtE ? ShiftedMan[12] : ShiftedMan[41];
+    assign LSB = FOpCtrlE[1] ? ShiftedMan[2] : FmtE ? ShiftedMan[14] : ShiftedMan[43];
+
+    always_comb begin
+        // Determine if you add 1
+        case (FrmE)
+            3'b000: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky&LSB));//round to nearest even
+            3'b001: CalcPlus1 = 0;//round to zero
+            3'b010: CalcPlus1 = (XSgn&FOpCtrlE[1]) | (ResSgn&FOpCtrlE[0]);//round down
+            3'b011: CalcPlus1 = (~XSgn&FOpCtrlE[1]) | (~ResSgn&FOpCtrlE[0]);//round up
+            3'b100: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky));//round to nearest max magnitude
+            default: CalcPlus1 = 1'bx;
+        endcase
+    end
+
+    assign Plus1 = CalcPlus1 & (Guard|Round|Sticky)&~(XZero&FOpCtrlE[1]);   // only round up when bits were actually discarded
+
+    assign RoundedTmp = ShiftedMan[64+1:2] + Plus1;
+    assign {ResExp, ResFrac} = FmtE ? {TmpExp, ShiftedMan[64+1:14]} + Plus1 :  {{TmpExp, ShiftedMan[64+1:43]} + Plus1, 29'b0} ;
+
+     assign Rounded = Res64 ? XSgn&FOpCtrlE[1] ? -RoundedTmp[63:0] : RoundedTmp[63:0] : 
+			      XSgn ? {{32{1'b1}}, -RoundedTmp[31:0]} : {32'b0, RoundedTmp[31:0]};
+     assign RoundMSB = Res64 ? RoundedTmp[64] : RoundedTmp[32];
+     assign RoundSgn = Res64 ? Rounded[63] : Rounded[31];
+
+
+
+   // Choose result
+   //    double to unsigned long
+   //         >2^64-1 or +inf or NaN - all 1's
+   //         <0 or -inf - zero
+   //         otherwise rounded result
+    //assign Of = (~XSgn&($signed(ShiftCnt) >= $signed(Bits))) | (RoundMSB&(ShiftCnt==(Bits-1))) | (~XSgn&XInf) | XNaN;
+    assign Of = (~XSgn&($signed(ShiftCnt) >= $signed(Bits))) | (~XSgn&RoundSgn&~FOpCtrlE[2]) | (RoundMSB&(ShiftCnt==(Bits-1))) | (~XSgn&XInf) | XNaN;
+    assign Uf = FOpCtrlE[2] ? XSgn&~XZero | (XSgn&XInf) | (XSgn&~XZero&(~ShiftCnt[12]|CalcPlus1)) | (ShiftCnt[12]&Plus1) : (XSgn&XInf) | (XSgn&($signed(ShiftCnt) >= $signed(Bits))) | (XSgn&~RoundSgn&~ShiftCnt[12]);    // assign CvtIntRes =  (XSgn | ShiftCnt[12]) ? {64{1'b0}}  : (ShiftCnt >= 64) ? {64{1'b1}} : Rounded;
+    assign SgnRes = ~FOpCtrlE[3] & FOpCtrlE[1];   // set for the 32-bit to-int opcodes; selects the 32-bit saturation constants below
+    assign CvtIntRes = Of ? FOpCtrlE[2] ? SgnRes ? {32'b0, {32{1'b1}}}: {64{1'b1}} : SgnRes ? {33'b0, {31{1'b1}}}: {1'b0, {63{1'b1}}} : 
+                    Uf ? FOpCtrlE[2] ? 64'b0 : SgnRes ? {32'b0, 1'b1, 31'b0} : {1'b1, 63'b0} :
+		            Rounded[64-1:0];
+                    
+    assign CvtRes = FmtE ? {ResSgn, ResExp, ResFrac} : {ResSgn, ResExp[7:0], ResFrac, 3'b0};
+    assign CvtResE = FOpCtrlE[0] ? CvtRes : CvtIntRes;
+    assign CvtFlgE = {(Of | Uf)&FOpCtrlE[1], 3'b0, (Guard|Round|Sticky)&FOpCtrlE[0]};   // NOTE(review): bit 0 is only raised when FOpCtrlE[0] (int -> fp); confirm fp -> int rounding should not set it too
+
+
+
+
+endmodule // fcvt
+
+
--- a/wally-pipelined/src/fpu/fma2.sv
+++ b/wally-pipelined/src/fpu/fma2.sv
@ -1,3 +1,231 @@
+module fma(
+    input logic             clk,
+    input logic             reset,
+    input logic             FlushM,
+    input logic             StallM,
+    input logic  [63:0]     SrcXE, SrcXM,  // X
+    input logic  [63:0]     SrcYE, SrcYM,  // Y
+    input logic  [63:0]     SrcZE, SrcZM,  // Z
+    input logic             FmtE, FmtM,       // precision 1 = double 0 = single
+    input logic  [2:0]      FOpCtrlM, FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
+    input logic  [2:0]      FrmM,       // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+	output logic [63:0]		FMAResM,
+	output logic [4:0]		FMAFlgM);
+	// Two-stage FMA wrapper: fma1 works on the E-suffixed operands, its outputs pass through the flopenrc registers below, and fma2 produces FMAResM/FMAFlgM from the M-suffixed copies.
+
+    logic [105:0]	ProdManE, ProdManM; 
+    logic [161:0]	AlignedAddendE, AlignedAddendM;                       
+    logic [12:0]	ProdExpE, ProdExpM;
+    logic 			AddendStickyE, AddendStickyM;
+    logic 			KillProdE, KillProdM;
+    logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
+    logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
+    logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
+    
+    fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE, .FmtE, .ProdManE, .AlignedAddendE,
+                .ProdExpE, .AddendStickyE, .KillProdE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
+                .XNaNE, .YNaNE, .ZNaNE ); 
+                // E -> M registers; flopenrc is defined elsewhere - presumably a flop with enable (~StallM), reset and clear (FlushM); confirm port order
+    flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
+    flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
+    flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
+    flopenrc #(11) EMRegFma4(clk, reset, FlushM, ~StallM, 
+                            {AddendStickyE, KillProdE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE},
+                            {AddendStickyM, KillProdM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM}); // the 11 single-bit flags share one register
+
+    fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM, .FrmM, .FmtM, 
+            .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
+            .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
+            .FMAResM, .FMAFlgM);
+
+endmodule
+      
+
+
+module fma1(
+ 
+    input logic     [63:0]      X,  // X
+    input logic     [63:0]      Y,  // Y
+    input logic     [63:0]      Z,  // Z
+    input logic     [2:0]       FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
+    input logic                 FmtE,       // precision 1 = double 0 = single
+    output logic    [105:0]     ProdManE,   // 1.X frac * 1.Y frac
+    output logic    [161:0]     AlignedAddendE, // Z aligned for addition
+    output logic    [12:0]      ProdExpE,       // X exponent + Y exponent - bias
+    output logic                AddendStickyE,  // sticky bit that is calculated during alignment
+    output logic                KillProdE,      // set the product to zero before addition if the product is too small to matter
+    output logic                XZeroE, YZeroE, ZZeroE, // inputs are zero
+    output logic                XInfE, YInfE, ZInfE,    // inputs are infinity
+    output logic                XNaNE, YNaNE, ZNaNE);   // inputs are NaN
+    // First FMA stage (purely combinational): unpacks X, Y and Z, classifies them, forms the product mantissa and exponent, and aligns the addend to the product.
+    logic [51:0]    XFrac,YFrac,ZFrac;  // input fraction
+    logic [52:0]    XMan,YMan,ZMan;     // input mantissa (with leading one)
+    logic [12:0]    XExp,YExp,ZExp;     // input exponents
+    logic           XSgn,YSgn,ZSgn;     // input signs
+    logic [12:0]    AlignCnt;           // how far to shift the addend to align with the product
+    logic [213:0]   ZManShifted;                // output of the alignment shifter including sticky bit
+    logic [213:0]   ZManPreShifted;     // input to the alignment shifter
+    logic           XDenorm, YDenorm, ZDenorm;  // inputs are denormal
+    logic [63:0]    Addend; // value to add (Z or zero)
+    logic [12:0]    Bias;   // 1023 for double, 127 for single
+    logic           XExpZero, YExpZero, ZExpZero;   // input exponent zero
+    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
+    logic           XExpMax, YExpMax, ZExpMax;  // input exponent all 1s
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // split inputs into the sign bit, fraction, and exponent to handle single or double precision
+    //      - single precision is in the top half of the inputs
+    ///////////////////////////////////////////////////////////////////////////////
+
+    // Set addend to zero if FMUL instruction
+    assign Addend = FOpCtrlE[2] ? 64'b0 : Z;
+
+    assign XSgn = X[63];
+    assign YSgn = Y[63];
+    assign ZSgn = Addend[63];
+
+    assign XExp = FmtE ? {2'b0, X[62:52]} : {5'b0, X[62:55]};
+    assign YExp = FmtE ? {2'b0, Y[62:52]} : {5'b0, Y[62:55]};
+    assign ZExp = FmtE ? {2'b0, Addend[62:52]} : {5'b0, Addend[62:55]};
+
+    assign XFrac = FmtE ? X[51:0] : {X[54:32], 29'b0};
+    assign YFrac = FmtE ? Y[51:0] : {Y[54:32], 29'b0};
+    assign ZFrac = FmtE ? Addend[51:0] : {Addend[54:32], 29'b0};
+   
+    assign XMan = {~XExpZero, XFrac};
+    assign YMan = {~YExpZero, YFrac};
+    assign ZMan = {~ZExpZero, ZFrac};
+
+    assign Bias = FmtE ? 13'h3ff : 13'h7f;
+
+
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // determine if an input is a special value
+    ///////////////////////////////////////////////////////////////////////////////
+
+    assign XExpZero = ~|XExp;
+    assign YExpZero = ~|YExp;
+    assign ZExpZero = ~|ZExp;
+   
+    assign XFracZero = ~|XFrac;
+    assign YFracZero = ~|YFrac;
+    assign ZFracZero = ~|ZFrac;
+
+    assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
+    assign YExpMax = FmtE ? &YExp[10:0] : &YExp[7:0];
+    assign ZExpMax = FmtE ? &ZExp[10:0] : &ZExp[7:0];
+   
+    assign XNaNE = XExpMax & ~XFracZero;
+    assign YNaNE = YExpMax & ~YFracZero;
+    assign ZNaNE = ZExpMax & ~ZFracZero;
+
+    assign XDenorm = XExpZero & ~XFracZero;
+    assign YDenorm = YExpZero & ~YFracZero;
+    assign ZDenorm = ZExpZero & ~ZFracZero;
+
+    assign XInfE = XExpMax & XFracZero;
+    assign YInfE = YExpMax & YFracZero;
+    assign ZInfE = ZExpMax & ZFracZero;
+
+    assign XZeroE = XExpZero & XFracZero;
+    assign YZeroE = YExpZero & YFracZero;
+    assign ZZeroE = ZExpZero & ZFracZero;
+
+
+
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Calculate the product
+    //      - When multiplying two fp numbers, add the exponents
+    //      - Subtract the bias (XExp + YExp has two biases, one from each exponent)
+    //      - Denormal numbers have an exponent value of 1, however they are
+    //        represented with an exponent of 0. add one if there is a denormal number
+    ///////////////////////////////////////////////////////////////////////////////
+   
+    // verilator lint_off WIDTH
+    assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 :
+                 XExp + YExp - Bias + XDenorm + YDenorm;
+
+    // Calculate the product's mantissa
+    //      - Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
+    assign ProdManE =  XMan * YMan;
+
+
+
+
+
+
+
+
+   
+    ///////////////////////////////////////////////////////////////////////////////
+    // Alignment shifter
+    ///////////////////////////////////////////////////////////////////////////////
+
+    // determine the shift count for alignment
+    //      - negative means Z is larger, so shift Z left
+    //      - positive means the product is larger, so shift Z right
+    //      - Denormal numbers have an exponent value of 1, however they are
+    //        represented with an exponent of 0. add one to the exponent if it is a denormal number
+    assign AlignCnt = ProdExpE - ZExp - ZDenorm;
+    // verilator lint_on WIDTH
+
+
+    // Default Addition without shifting
+    //          |   55'b0    |  106'b(product)  | 2'b0 |
+    //                       |1'b0| addend  |
+
+    // the 1'b0 before the addend is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
+    assign ZManPreShifted = {55'b0, ZMan, 106'b0};
+    always_comb
+        begin
+           
+        // If the product is too small to affect the sum, kill the product
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //  | addend  |
+        if ($signed(AlignCnt) <= $signed(-13'd56)) begin
+            KillProdE = 1;
+            ZManShifted = ZManPreShifted;//{107'b0, ZMan, 54'b0};
+            AddendStickyE = ~(XZeroE|YZeroE); // sticky is set whenever the killed product was nonzero
+
+        // If the Addend is shifted left (negative AlignCnt)
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                  | addend  |
+        end else if($signed(AlignCnt) <= $signed(13'd0))  begin
+            KillProdE = 0;
+            ZManShifted = ZManPreShifted << -AlignCnt;
+            AddendStickyE = |(ZManShifted[51:0]); // bits [51:0] fall below AlignedAddendE and are folded into the sticky bit
+
+        // If the Addend is shifted right (positive AlignCnt)
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                                  | addend  |
+        end else if ($signed(AlignCnt)<=$signed(13'd106))  begin
+            KillProdE = 0;
+            ZManShifted = ZManPreShifted >> AlignCnt;
+            AddendStickyE = |(ZManShifted[51:0]);
+
+        // If the addend is too small to affect the addition        
+        //      - The addend has to shift two past the end of the addend to be considered too small
+        //      - The 2 extra bits are needed for rounding
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                                                      | addend  |
+        end else begin
+            KillProdE = 0;
+            ZManShifted = 0;
+            AddendStickyE = ~ZZeroE; // the whole addend is shifted out, so any nonzero Z is sticky
+
+        end
+    end
+
+   
+    assign AlignedAddendE = ZManShifted[213:52]; // upper 162 bits of the 214-bit shifter output
+
+endmodule


 module fma2(
--- a/wally-pipelined/src/fpu/fma1.sv
+++ b/wally-pipelined/src/fpu/fma1.sv
@ -1,184 +0,0 @@
-module fma1(
- 
-    input logic     [63:0]      X,  // X
-    input logic     [63:0]      Y,  // Y
-    input logic     [63:0]      Z,  // Z
-    input logic     [2:0]       FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-    input logic                 FmtE,       // precision 1 = double 0 = single
-    output logic    [105:0]     ProdManE,   // 1.X frac * 1.Y frac
-    output logic    [161:0]     AlignedAddendE, // Z aligned for addition
-    output logic    [12:0]      ProdExpE,       // X exponent + Y exponent - bias
-    output logic                AddendStickyE,  // sticky bit that is calculated during alignment
-    output logic                KillProdE,      // set the product to zero before addition if the product is too small to matter
-    output logic                XZeroE, YZeroE, ZZeroE, // inputs are zero
-    output logic                XInfE, YInfE, ZInfE,    // inputs are infinity
-    output logic                XNaNE, YNaNE, ZNaNE);   // inputs are NaN
-
-    logic [51:0]    XFrac,YFrac,ZFrac;  // input fraction
-    logic [52:0]    XMan,YMan,ZMan;     // input mantissa (with leading one)
-    logic [12:0]    XExp,YExp,ZExp;     // input exponents
-    logic           XSgn,YSgn,ZSgn;     // input signs
-    logic [12:0]    AlignCnt;           // how far to shift the addend to align with the product
-    logic [213:0]   ZManShifted;                // output of the alignment shifter including sticky bit
-    logic [213:0]   ZManPreShifted;     // input to the alignment shifter
-    logic           XDenorm, YDenorm, ZDenorm;  // inputs are denormal
-    logic [63:0]    Addend; // value to add (Z or zero)
-    logic [12:0]    Bias;   // 1023 for double, 127 for single
-    logic           XExpZero, YExpZero, ZExpZero;   // input exponent zero
-    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
-    logic           XExpMax, YExpMax, ZExpMax;  // input exponent all 1s
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // split inputs into the sign bit, fraction, and exponent to handle single or double precision
-    //      - single precision is in the top half of the inputs
-    ///////////////////////////////////////////////////////////////////////////////
-
-    // Set addend to zero if FMUL instruction
-    assign Addend = FOpCtrlE[2] ? 64'b0 : Z;
-
-    assign XSgn = X[63];
-    assign YSgn = Y[63];
-    assign ZSgn = Addend[63];
-
-    assign XExp = FmtE ? {2'b0, X[62:52]} : {5'b0, X[62:55]};
-    assign YExp = FmtE ? {2'b0, Y[62:52]} : {5'b0, Y[62:55]};
-    assign ZExp = FmtE ? {2'b0, Addend[62:52]} : {5'b0, Addend[62:55]};
-
-    assign XFrac = FmtE ? X[51:0] : {X[54:32], 29'b0};
-    assign YFrac = FmtE ? Y[51:0] : {Y[54:32], 29'b0};
-    assign ZFrac = FmtE ? Addend[51:0] : {Addend[54:32], 29'b0};
-   
-    assign XMan = {~XExpZero, XFrac};
-    assign YMan = {~YExpZero, YFrac};
-    assign ZMan = {~ZExpZero, ZFrac};
-
-    assign Bias = FmtE ? 13'h3ff : 13'h7f;
-
-
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // determine if an input is a special value
-    ///////////////////////////////////////////////////////////////////////////////
-
-    assign XExpZero = ~|XExp;
-    assign YExpZero = ~|YExp;
-    assign ZExpZero = ~|ZExp;
-   
-    assign XFracZero = ~|XFrac;
-    assign YFracZero = ~|YFrac;
-    assign ZFracZero = ~|ZFrac;
-
-    assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
-    assign YExpMax = FmtE ? &YExp[10:0] : &YExp[7:0];
-    assign ZExpMax = FmtE ? &ZExp[10:0] : &ZExp[7:0];
-   
-    assign XNaNE = XExpMax & ~XFracZero;
-    assign YNaNE = YExpMax & ~YFracZero;
-    assign ZNaNE = ZExpMax & ~ZFracZero;
-
-    assign XDenorm = XExpZero & ~XFracZero;
-    assign YDenorm = YExpZero & ~YFracZero;
-    assign ZDenorm = ZExpZero & ~ZFracZero;
-
-    assign XInfE = XExpMax & XFracZero;
-    assign YInfE = YExpMax & YFracZero;
-    assign ZInfE = ZExpMax & ZFracZero;
-
-    assign XZeroE = XExpZero & XFracZero;
-    assign YZeroE = YExpZero & YFracZero;
-    assign ZZeroE = ZExpZero & ZFracZero;
-
-
-
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Calculate the product
-    //      - When multipliying two fp numbers, add the exponents
-    //      - Subtract the bias (XExp + YExp has two biases, one from each exponent)
-    //      - Denormal numbers have an an exponent value of 1, however they are
-    //        represented with an exponent of 0. add one if there is a denormal number
-    ///////////////////////////////////////////////////////////////////////////////
-   
-    // verilator lint_off WIDTH
-    assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 :
-                 XExp + YExp - Bias + XDenorm + YDenorm;
-
-    // Calculate the product's mantissa
-    //      - Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
-    assign ProdManE =  XMan * YMan;
-
-
-
-
-
-
-
-
-   
-    ///////////////////////////////////////////////////////////////////////////////
-    // Alignment shifter
-    ///////////////////////////////////////////////////////////////////////////////
-
-    // determine the shift count for alignment
-    //      - negitive means Z is larger, so shift Z left
-    //      - positive means the product is larger, so shift Z right
-    //      - Denormal numbers have an an exponent value of 1, however they are
-    //        represented with an exponent of 0. add one to the exponent if it is a denormal number
-    assign AlignCnt = ProdExpE - ZExp - ZDenorm;
-    // verilator lint_on WIDTH
-
-
-    // Defualt Addition without shifting
-    //          |   55'b0    |  106'b(product)  | 2'b0 |
-    //                       |1'b0| addnend |
-
-    // the 1'b0 before the added is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
-    assign ZManPreShifted = {55'b0, ZMan, 106'b0};
-    always_comb
-        begin
-           
-        // If the product is too small to effect the sum, kill the product
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //  | addnend |
-        if ($signed(AlignCnt) <= $signed(-13'd56)) begin
-            KillProdE = 1;
-            ZManShifted = ZManPreShifted;//{107'b0, ZMan, 54'b0};
-            AddendStickyE = ~(XZeroE|YZeroE);
-
-        // If the Addend is shifted left (negitive AlignCnt)
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                  | addnend |
-        end else if($signed(AlignCnt) <= $signed(13'd0))  begin
-            KillProdE = 0;
-            ZManShifted = ZManPreShifted << -AlignCnt;
-            AddendStickyE = |(ZManShifted[51:0]);
-
-        // If the Addend is shifted right (positive AlignCnt)
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                                  | addnend |
-        end else if ($signed(AlignCnt)<=$signed(13'd106))  begin
-            KillProdE = 0;
-            ZManShifted = ZManPreShifted >> AlignCnt;
-            AddendStickyE = |(ZManShifted[51:0]);
-
-        // If the addend is too small to effect the addition        
-        //      - The addend has to shift two past the end of the addend to be considered too small
-        //      - The 2 extra bits are needed for rounding
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                                                      | addnend |
-        end else begin
-            KillProdE = 0;
-            ZManShifted = 0;
-            AddendStickyE = ~ZZeroE;
-
-        end
-    end
-
-   
-    assign AlignedAddendE = ZManShifted[213:52];
-
-endmodule
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -45,7 +45,7 @@ module fpu (
 // *** change FMA to do 16 - 32 - 64 - 128 FEXPBITS 

  generate
-     if (`F_SUPPORTED) begin 
+     if (`F_SUPPORTED | `D_SUPPORTED) begin 
      // control logic signal instantiation
      logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;              // FP register write enable
      logic [2:0] 	FrmD, FrmE, FrmM;                                  // FP rounding mode
@ -75,39 +75,15 @@ module fpu (
      logic [63:0] 	DivInput1E, DivInput2E;
      logic          HoldInputs;                                              // keep forwarded inputs arround durring division
      
-      // FMA signals
-      logic [105:0]	ProdManE, ProdManM; ///*** put pipline stages in units
-      logic [161:0]	AlignedAddendE, AlignedAddendM;                       
-      logic [12:0]	ProdExpE, ProdExpM;
-      logic 			AddendStickyE, AddendStickyM;
-      logic 			KillProdE, KillProdM;
-      logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
-      logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
-      logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
+      // FMA result and flag signals
      logic [63:0]   FMAResM, FMAResW;
      logic [4:0]    FMAFlgM, FMAFlgW;

      // add/cvt signals
-      logic [63:0] 	AddSumE, AddSumM;
-      logic [63:0]   AddSumTcE, AddSumTcM;
-      logic [3:0] 	AddSelInvE, AddSelInvM;
-      logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
-      logic 		   AddCorrSignE, AddCorrSignM;
-      logic          AddOp1NormE, AddOp1NormM;
-      logic          AddOp2NormE, AddOp2NormM;
-      logic          AddOpANormE,  AddOpANormM;
-      logic          AddOpBNormE, AddOpBNormM;
-      logic          AddInvalidE, AddInvalidM;
-      logic 		   AddDenormInE, AddDenormInM;
-      logic          AddSwapE, AddSwapM;
-      logic          AddNormOvflowE, AddNormOvflowM; //***this isn't used in addcvt2
-      logic          AddSignAE, AddSignAM;
-      logic 		   AddConvertE, AddConvertM;
-      logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
-      logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
-      logic [10:0] 	AddExponentE, AddExponentM;
      logic [63:0] 	FAddResM, FAddResW;
      logic [4:0] 	FAddFlgM, FAddFlgW;  
+      logic [63:0] 	CvtResE, CvtResM;
+      logic [4:0] 	CvtFlgE, CvtFlgM;  
      
      // cmp signals 
      logic 		   CmpNVE, CmpNVM, CmpNVW;
@ -117,7 +93,7 @@ module fpu (
      logic [63:0] 	SgnResE, SgnResM;
      logic        	SgnNVE, SgnNVM, SgnNVW;
      logic [63:0]   FResM, FResW;
-      logic          FFlgM, FFlgW;
+      logic [4:0]         FFlgM, FFlgW;
      
      // instantiation of W stage regfile signals
      logic [63:0] 	AlignedSrcAM;
@ -198,9 +174,10 @@ module fpu (

      
      // first of two-stage instance of floating-point fused multiply-add unit
-      fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .ProdManE, .AlignedAddendE,
-                  .ProdExpE, .AddendStickyE, .KillProdE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
-                  .XNaNE, .YNaNE, .ZNaNE );
+      fma fma (.clk, .reset, .FlushM, .StallM, 
+               .SrcXE, .SrcYE, .SrcZE, .SrcXM, .SrcYM, .SrcZM, 
+               .FOpCtrlE(FOpCtrlE[2:0]), .FOpCtrlM(FOpCtrlM[2:0]), 
+               .FmtE, .FmtM, .FrmM, .FMAFlgM, .FMAResM);
      
      // first and only instance of floating-point divider
      logic fpdivClk;
@ -225,10 +202,8 @@ module fpu (


      // first of two-stage instance of floating-point add/cvt unit
-      fpuaddcvt1 fpadd1 (.SrcXE, .SrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
-                        .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
-                        .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
-                        .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
+      faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM,
+                        .SrcXE, .SrcYE, .FOpCtrlE, .FAddResM, .FAddFlgM);
      
      // first and only instance of floating-point comparator
      fcmp fcmp (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpNVE, CmpResE);
@ -239,6 +214,9 @@ module fpu (
      // first and only instance of floating-point classify unit
      fclassify fclassify (.SrcXE, .FmtE, .ClassResE);

+
+      fcvt fcvt (.X(SrcXE), .SrcAE, .FOpCtrlE, .FmtE, .FrmE, .CvtResE, .CvtFlgE);
+
      // output for store instructions
      assign FWriteDataE = FmtE ? SrcYE[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcYE[63:32]};
      //***swap to mux
@ -259,31 +237,16 @@ module fpu (
      flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, SrcYE, SrcYM);
      flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, SrcZE, SrcZM);
      
-      flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
-      flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
-      flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
-      flopenrc #(11) EMRegFma4(clk, reset, FlushM, ~StallM, 
-                                 {AddendStickyE, KillProdE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE},
-                                 {AddendStickyM, KillProdM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM});
-
-      flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
-      flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
-      flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
-      flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
-      flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
-      flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
-      flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
-      flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
-      flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, 
-                              {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
-                              {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
-
+     
      flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
      flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
      
      flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
      flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
      
+      flopenrc #(64) EMRegCvt1(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
+      flopenrc #(5) EMRegCvt2(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
+      
      flopenrc #(22) EMCtrlReg(clk, reset, FlushM, ~StallM,
                           {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE, FOpCtrlE, FWriteIntE},
                           {FWriteEnM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});
@ -299,29 +262,27 @@ module fpu (

      //BEGIN MEMORY STAGE
      
-      mux3  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, FResSelM, FResM);
-      mux3  #(1)  FFlgMux(1'b0, SgnNVM, CmpNVM, FResSelM, FFlgM);
+      mux4  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, CvtResM, FResSelM, FResM);
+      mux4  #(5)  FFlgMux(5'b0, {4'b0, SgnNVM}, {4'b0, CmpNVM}, CvtFlgM, FResSelM, FFlgM);

      //***change to mux
      assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
-      mux3  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXMAligned, ClassResM[`XLEN-1:0], FIntResSelM, FIntResM);
+      mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXMAligned, ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);

-      // second instance of two-stage FMA unit
-      fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .FrmM, .FmtM, 
-               .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
-               .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
-               .FMAResM, .FMAFlgM);
-      
-      // second instance of two-stage floating-point add/cvt unit
-      fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
-                        .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
-                        .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
-                        .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
      
      // Align SrcA to MSB when single precicion
      mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
         
-
+      always_comb begin
+         case (FResultSelM)
+      3'b000 : SetFflagsM = 5'b0;
+      3'b001 : SetFflagsM = FMAFlgM;
+      3'b010 : SetFflagsM = FAddFlgM;
+      3'b011 : SetFflagsM = FDivSqrtFlgM;
+      3'b100 : SetFflagsM = FFlgM;
+      default : SetFflagsM = 5'bxxxxx;
+         endcase
+      end



@ -334,19 +295,14 @@ module fpu (
      // M/W pipe registers
      //*****************
      flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-      flopenrc #(5) MWRegFma2(clk, reset, FlushW, ~StallW, FMAFlgM, FMAFlgW); 
      
      flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
-      flopenrc #(5) MWRegDiv2(clk, reset, FlushW, ~StallW, FDivSqrtFlgM, FDivSqrtFlgW);
      
      flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
-      flopenrc #(5) MWRegAdd2(clk, reset, FlushW, ~StallW, FAddFlgM, FAddFlgW); 
      
-      flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpNVM, CmpNVW); 
      flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);

      flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
-      flopenrc #(1) MWRegClass1(clk, reset, FlushW, ~StallW, FFlgM, FFlgW);
      
      flopenrc #(11) MWCtrlReg(clk, reset, FlushW, ~StallW,
                           {FWriteEnM, FResultSelM, RdM, FmtM, FWriteIntM},
@ -363,20 +319,6 @@ module fpu (



-
-
-   //***turn into muxs
-      always_comb begin
-         case (FResultSelW)
-      3'b000 : FPUFlagsW = 5'b0;
-      3'b001 : FPUFlagsW = FMAFlgW;
-      3'b010 : FPUFlagsW = FAddFlgW;
-      3'b011 : FPUFlagsW = FDivSqrtFlgW;
-      3'b100 : FPUFlagsW = {4'b0,FFlgW};
-      default : FPUFlagsW = 5'bxxxxx;
-         endcase
-      end
-
      always_comb begin
         case (FResultSelW)
      3'b000 : FPUResult64W = FmtW ? {ReadDataW, {64-`XLEN{1'b0}}} : {ReadDataW[31:0], 32'b0};
@ -393,13 +335,11 @@ module fpu (
      // floating-point results
      //
      // define offsets for LSB zero extension or truncation
-      always_comb begin      
-         // zero extension 
-   //***turn into mux
-         FPUResultW = FmtW ? FPUResult64W[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FPUResult64W[63:32]};
-         //*** put into mem stage
-         SetFflagsM = FPUFlagsW;      
-      end
+   always_comb begin      
+      // zero extension 
+//***turn into mux
+      FPUResultW = FmtW ? FPUResult64W[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FPUResult64W[63:32]};    
+   end
   end else begin // no F_SUPPORTED; tie outputs low
     assign FStallD = 0;
     assign FWriteIntE = 0; 
--- a/wally-pipelined/src/fpu/fpuaddcvt1.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt1.sv
@ -1,198 +0,0 @@
-//
-// File name : fpadd
-// Title     : Floating-Point Adder/Subtractor
-// project   : FPU
-// Library   : fpadd
-// Author(s) : James E. Stine, Jr., Brett Mathis
-// Purpose   : definition of main unit to floating-point add/sub
-// notes :   
-//
-// Copyright Oklahoma State University
-// Copyright AFRL
-//
-// Basic and Denormalized Operations
-//
-// Step 1: Load operands, set flags, and convert SP to DP
-// Step 2: Check for special inputs ( +/- Infinity,  NaN)
-// Step 3: Compare exponents.  Swap the operands of exp1 < exp2
-//         or of (exp1 = exp2 AND mnt1 < mnt2)
-// Step 4: Shift the mantissa corresponding to the smaller exponent, 
-//          and extend precision by three bits to the right.
-// Step 5: Add or subtract the mantissas.
-// Step 6: Normalize the result.//
-//   Shift left until normalized.  Normalized when the value to the 
-//   left of the binrary point is 1.
-// Step 7: Round the result.// 
-// Step 8: Put sum onto output.
-//
-
-
-module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, SrcXE, SrcYE, FOpCtrlE, FmtE);
-
-   input logic [63:0] SrcXE;		// 1st input operand (A)
-   input logic [63:0] SrcYE;		// 2nd input operand (B)
-   input logic [3:0]	FOpCtrlE;	// Function opcode
-   input logic 	FmtE;   		// Result Precision (1 for double, 0 for single)
-
-   wire          P;
-   assign P = ~FmtE | FOpCtrlE[2];
-
-   wire [63:0] 	 IntValue;
-   wire [11:0] 	 exp1, exp2;
-   wire [11:0] 	 exp_diff1, exp_diff2;
-   wire [11:0] 	 exp_shift;
-   wire [51:0] 	 mantissaA;
-   wire [56:0] 	 mantissaA1;
-   wire [63:0] 	 mantissaA3;
-   wire [51:0] 	 mantissaB; 
-   wire [56:0] 	 mantissaB1, mantissaB2;
-   wire [63:0] 	 mantissaB3;
-   wire 	 exp_gt63;
-   wire 	 Sticky_out;
-   wire          sub;
-   wire 	 zeroB;
-   wire [5:0]	 align_shift; 
-
-   output logic [63:0] 	 AddFloat1E; 
-   output logic [63:0] 	 AddFloat2E;
-   output logic [10:0] 	 AddExponentE;
-   output logic [10:0]	 AddExpPostSumE;
-   output logic [11:0]	 AddExp1DenormE, AddExp2DenormE;//KEP used to be [10:0]
-   output logic [63:0] AddSumE, AddSumTcE;
-   output logic [3:0]  AddSelInvE;
-   output logic        AddCorrSignE;
-   output logic 	 AddSignAE;
-   output logic	 AddOp1NormE, AddOp2NormE;
-   output logic	 AddOpANormE, AddOpBNormE;
-   output logic	 AddInvalidE;
-   output logic 	 AddDenormInE;
-//   output logic 	 exp_valid;
-   output logic 	 AddConvertE;
-   output logic        AddSwapE;
-   output logic 	 AddNormOvflowE;
-   wire [5:0]	 ZP_mantissaA;
-   wire [5:0]	 ZP_mantissaB;
-   wire		 ZV_mantissaA;
-   wire		 ZV_mantissaB;
-
-   // Convert the input operands to their appropriate forms based on 
-   // the orignal operands, the FOpCtrlE , and their precision P. 
-   // Single precision inputs are converted to double precision 
-   // and the sign of the first operand is set appropratiately based on
-   // if the operation is absolute value or negation. 
-
-   convert_inputs conv1 (AddFloat1E, AddFloat2E, SrcXE, SrcYE, FOpCtrlE, P);
-
-   // Test for exceptions and return the "Invalid Operation" and
-   // "Denormalized" Input Flags. The "AddSelInvE" is used in
-   // the third pipeline stage to select the result. Also, AddOp1NormE
-   // and AddOp2NormE are one if SrcXE and SrcYE are not zero or denormalized.
-   // sub is one if the effective operation is subtaction. 
-
-   exception exc1 (AddSelInvE, AddInvalidE, AddDenormInE, AddOp1NormE, AddOp2NormE, sub, 
-		   AddFloat1E, AddFloat2E, FOpCtrlE);
-
-   // Perform Exponent Subtraction (used for alignment). For performance
-   // both exponent subtractions are performed in parallel. This was 
-   // changed to a behavior level to allow the tools to  try to optimize
-   // the two parallel additions. The input values are zero-extended to 12 
-   // bits prior to performing the addition. 
-
-   assign exp1 = {1'b0, AddFloat1E[62:52]};
-   assign exp2 = {1'b0, AddFloat2E[62:52]};
-   assign exp_diff1 = exp1 - exp2;
-   assign exp_diff2 = AddDenormInE ? ({AddFloat2E[63], exp2[10:0]} - {AddFloat1E[63], exp1[10:0]}): exp2 - exp1;
-
-   // The second operand (B) should be set to zero, if FOpCtrlE does not
-   // specify addition or subtraction
-   assign zeroB = FOpCtrlE[2] | FOpCtrlE[1];
-
-   // Swapped operands if zeroB is not one and exp1 < exp2. 
-   // Swapping causes exp2 to be used for the result exponent. 
-   // Only the exponent of the larger operand is used to determine
-   // the final result. 
-   assign AddSwapE = exp_diff1[11] & ~zeroB;
-   assign AddExponentE = AddSwapE ? exp2[10:0] : exp1[10:0];
-   assign AddExpPostSumE = AddSwapE ? exp2[10:0] : exp1[10:0];
-   assign mantissaA = AddSwapE ? AddFloat2E[51:0] : AddFloat1E[51:0];
-   assign mantissaB = AddSwapE ? AddFloat1E[51:0] : AddFloat2E[51:0];
-   assign AddSignAE     = AddSwapE ? AddFloat2E[63] : AddFloat1E[63];   
-
-   // Leading-Zero Detector. Determine the size of the shift needed for
-   // normalization. If sum_corrected is all zeros, the exp_valid is 
-   // zero; otherwise, it is one. 
-   // modified to 52 bits to detect leading zeroes on denormalized mantissas
-   lz52 lz_norm_1 (ZP_mantissaA, ZV_mantissaA, mantissaA);
-   lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);
-
-   // Denormalized exponents created by subtracting the leading zeroes from the original exponents
-   assign AddExp1DenormE = AddSwapE ? (exp1 - {6'b0, ZP_mantissaB}) : (exp1 - {6'b0, ZP_mantissaA}); //KEP extended ZP_mantissa 
-   assign AddExp2DenormE = AddSwapE ? (exp2 - {6'b0, ZP_mantissaA}) : (exp2 - {6'b0, ZP_mantissaB});
-
-   // Determine the alignment shift and limit it to 63. If any bit from 
-   // exp_shift[6] to exp_shift[11] is one, then shift is set to all ones. 
-   assign exp_shift = AddSwapE ? exp_diff2 : exp_diff1;
-   assign exp_gt63 = exp_shift[11] | exp_shift[10] | exp_shift[9] 
-     | exp_shift[8] | exp_shift[7] | exp_shift[6];
-   assign align_shift = exp_shift[5:0] | {6{exp_gt63}}; //KEP used to be all of exp_shift
-
-   // Unpack the 52-bit mantissas to 57-bit numbers of the form.
-   //    001.M[51]M[50] ... M[1]M[0]00
-   // Unless the number has an exponent of zero, in which case it
-   // is unpacked as
-   //    000.00 ... 00
-   // This effectively flushes denormalized values to zero. 
-   // The three bits of to the left of the binary point prevent overflow
-   // and loss of sign information. The two bits to the right of the 
-   // original mantissa form the "guard" and "round" bits that are used
-   // to round the result. 
-   assign AddOpANormE = AddSwapE ? AddOp2NormE : AddOp1NormE;
-   assign AddOpBNormE = AddSwapE ? AddOp1NormE : AddOp2NormE;
-   assign mantissaA1 = {2'h0, AddOpANormE, mantissaA[51:0]&{52{AddOpANormE}}, 2'h0};
-   assign mantissaB1 = {2'h0, AddOpBNormE, mantissaB[51:0]&{52{AddOpBNormE}}, 2'h0};
-
-   // Perform mantissa alignment using a 57-bit barrel shifter 
-   // If any of the bits shifted out are one, Sticky_out is set. 
-   // The size of the barrel shifter could be reduced by two bits
-   // by not adding the leading two zeros until after the shift. 
-   barrel_shifter_r57 bs1 (mantissaB2, Sticky_out, mantissaB1, align_shift);
-
-   // Place either the sign-extened 32-bit value or the original 64-bit value 
-   // into IntValue (to be used for integer to floating point conversion)
-   assign IntValue [31:0] = SrcXE[31:0];
-   assign IntValue [63:32] = FOpCtrlE[0] ? {32{SrcXE[31]}} : SrcXE[63:32];
-
-   // If doing an integer to floating point conversion, mantissaA3 is set to 
-   // IntVal and the prenomalized exponent is set to 1084. Otherwise, 
-   // mantissaA3 is simply extended to 64-bits by setting the 7 LSBs to zero, 
-   // and the exponent value is left unchanged. 
-   // Under denormalized cases, the exponent before the rounder is set to 1
-   // if the normal shift value is 11.
-   assign AddConvertE       = ~FOpCtrlE[2] & FOpCtrlE[1];
-   assign mantissaA3    = (FOpCtrlE[3]) ? (FOpCtrlE[0] ? AddFloat1E : ~AddFloat1E) : (AddDenormInE ? ({12'h0, mantissaA}) : (AddConvertE ? IntValue : {mantissaA1, 7'h0}));
-
-   // Put zero in for mantissaB3, if zeroB is one. Otherwise, B is extended to 
-   // 64-bits by setting the 7 LSBs to the Sticky_out bit followed by six  
-   // zeros. 
-   assign mantissaB3[63:7] = (FOpCtrlE[3]) ? (57'h0) : (AddDenormInE ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
-   assign mantissaB3[6]    = (FOpCtrlE[3]) ? (1'b0) : (AddDenormInE ? mantissaB[6] : Sticky_out & ~zeroB);
-   assign mantissaB3[5:0]  = (FOpCtrlE[3]) ? (6'h01) : (AddDenormInE ? mantissaB[5:0] : 6'h0);
-
-   // The sign of the result needs to be corrected if the true
-   // operation is subtraction and the input operands were swapped. 
-   assign AddCorrSignE = ~FOpCtrlE[2]&~FOpCtrlE[1]&FOpCtrlE[0]&AddSwapE;
-
-   // 64-bit Mantissa Adder/Subtractor
-   cla64 add1 (AddSumE, mantissaA3, mantissaB3, sub); //***adder
-
-   // 64-bit Mantissa Subtractor - to get the two's complement of the 
-   // result when the sign from the adder/subtractor is negative. 
-   cla_sub64 sub1 (AddSumTcE, mantissaB3, mantissaA3); //***adder
- 
-   // Finds normal underflow result to determine whether to round final exponent down
-   //***KEP used to be (AddSumE == 16'h0) I am unsure what it's supposed to be
-   assign AddNormOvflowE = (AddDenormInE & (AddSumE == 64'h0) & (AddOpANormE | AddOpBNormE) & ~FOpCtrlE[0]) ? 1'b1 : (AddSumE[63] ? AddSumTcE[52] : AddSumE[52]);
-
-endmodule // fpadd
-
-
--- a/wally-pipelined/src/fpu/fpuaddcvt2.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt2.sv
@ -1,163 +0,0 @@
-//
-// File name : fpadd
-// Title     : Floating-Point Adder/Subtractor
-// project   : FPU
-// Library   : fpadd
-// Author(s) : James E. Stine, Jr., Brett Mathis
-// Purpose   : definition of main unit to floating-point add/sub
-// notes :   
-//
-// Copyright Oklahoma State University
-// Copyright AFRL
-//
-// Basic and Denormalized Operations
-//
-// Step 1: Load operands, set flags, and convert SP to DP
-// Step 2: Check for special inputs ( +/- Infinity,  NaN)
-// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
-//         or if (exp1 = exp2 AND mnt1 < mnt2)
-// Step 4: Shift the mantissa corresponding to the smaller exponent, 
-//          and extend precision by three bits to the right.
-// Step 5: Add or subtract the mantissas.
-// Step 6: Normalize the result.
-//   Shift left until normalized.  Normalized when the value to the 
-//   left of the binary point is 1.
-// Step 7: Round the result.
-// Step 8: Put the sum onto the output.
-//
-
-
-module fpuaddcvt2 (FAddResM, FAddFlgM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, FrmM, FOpCtrlM, FmtM);
-   // Post-adder stage: selects between AddSumM and AddSumTcM, normalizes (lz64 + barrel shifter), then rounds to FAddResM/FAddFlgM.
-   input [2:0] 	FrmM;		// Rounding mode, forwarded unchanged to the rounder
-   input [3:0]	FOpCtrlM;	// Function opcode
-   input 	FmtM;   		// Result Precision (0 for double, 1 for single)
-   // input 	AddOvEnM;		// Overflow trap enabled
-   // input 	AddUnEnM;   	// Underflow trap enabled
-   input [63:0] AddSumM, AddSumTcM; // sum and the alternative used when AddSumM[63] is set (see sum_corr)
-   input [63:0] 	 AddFloat1M; 
-   input [63:0] 	 AddFloat2M;
-   input [11:0]	 AddExp1DenormM, AddExp2DenormM;
-   input [10:0] 	 AddExponentM, AddExpPostSumM; //exp_pre;
-   //input		 exp_valid;
-   input [3:0] 	 AddSelInvM;
-   input		 AddOp1NormM, AddOp2NormM;
-   input		 AddOpANormM, AddOpBNormM;
-   input		 AddInvalidM;
-   input 	 AddDenormInM; 
-   input 	 AddSignAM; 
-   input         AddCorrSignM;
-   input 	 AddConvertM;
-   input          AddSwapM;
-   // input 	 AddNormOvflowM;
-
-   output [63:0] FAddResM;	// Result of operation
-   output [4:0]  FAddFlgM;   	// IEEE exception flags 
-   wire 	 AddDenormM;   	// Driven from the rounder's DenormIO below; not a port of this module
-
-   wire          P;
-   assign P = ~FmtM | FOpCtrlM[2]; // passed to the rounder; 1 when FmtM is 0 or FOpCtrlM[2] is set
-
-   wire [10:0]   exp_pre;
-   wire [63:0] 	 Result;   
-   wire [63:0] 	 sum_norm, sum_norm_w_bypass;
-   wire [5:0] 	 norm_shift, norm_shift_denorm;
-   wire          exp_valid;
-   wire		 DenormIO;
-   wire [4:0] 	 FlagsIn;	
-   wire 	 Sticky_out; // declared but neither driven nor read in this module
-   wire 	 sign_corr;
-   wire 	 zeroB;         // declared but neither driven nor read in this module
-   wire [10:0]	 AddExpPostSumM; // net declaration for the input port of the same name declared above
-   wire 	 mantissa_comp;
-   wire 	 mantissa_comp_sum;
-   wire 	 mantissa_comp_sum_tc;
-   wire 	 Float1_sum_comp;
-   wire 	 Float2_sum_comp;
-   wire 	 Float1_sum_tc_comp;
-   wire 	 Float2_sum_tc_comp;
-   wire 	 normal_underflow;
-   wire [63:0]   sum_corr;
-   logic AddNormOvflowM;
- 
- 
-   logic 	AddOvEnM;		// Overflow trap enabled
-   logic 	AddUnEnM;   	// Underflow trap enabled
-
-   assign AddOvEnM = 1'b1; // tied to constant 1 here (the corresponding input is commented out above)
-   assign AddUnEnM = 1'b1; // tied to constant 1 here (the corresponding input is commented out above)
-   // Exponent handed to the rounder (exp_pre). With AddDenormInM set it is 1 when
-   // norm_shift == 11, else the low 11 bits of the AddSwapM-selected AddExp*DenormM.
-   assign exp_pre       = AddDenormInM ?
-                          ((norm_shift == 6'b001011) ? 11'b00000000001 : (AddSwapM ? AddExp2DenormM[10:0] : AddExp1DenormM[10:0]))
-                          : (AddConvertM ? 11'b10000111100 : AddExponentM); // 11'b10000111100 = 1084, used when AddConvertM is set
-
-
-   // Mantissa comparisons feeding normal_underflow: each *_comp is 0 when the operand's
-   // bits [51:0] are greater than bits [51:0] of AddSumM / AddSumTcM, and 1 otherwise.
-   assign Float1_sum_comp = (AddFloat1M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
-   assign Float2_sum_comp = (AddFloat2M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
-   assign Float1_sum_tc_comp = (AddFloat1M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
-   assign Float2_sum_tc_comp = (AddFloat2M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
-
-   // AddSwapM selects the AddFloat2M comparisons; otherwise the AddFloat1M comparisons are used
-   assign mantissa_comp_sum = AddSwapM ? Float2_sum_comp : Float1_sum_comp;
-   assign mantissa_comp_sum_tc = AddSwapM ? Float2_sum_tc_comp : Float1_sum_tc_comp;
-
-   // Use the AddSumTcM-based comparison when FOpCtrlM[0] and AddSumM[63] differ, else the AddSumM-based one
-   assign mantissa_comp = (FOpCtrlM[0] ^ AddSumM[63]) ? mantissa_comp_sum_tc : mantissa_comp_sum;
-
-   // normal_underflow = mantissa_comp when the operand sign bits are EQUAL (XNOR) and AddOpANormM|AddOpBNormM
-   // is set, else 0. NOTE(review): the earlier comment said "signs are different" - confirm intended polarity.
-   assign normal_underflow = ((AddFloat1M[63] ~^ AddFloat2M[63]) & (AddOpANormM | AddOpBNormM)) ? mantissa_comp : 1'b0;
-
-   // Result sign: (AddCorrSignM ^ AddSignAM), forced to 0 when AddConvertM is set, then XORed with AddSumM[63]
-   assign sign_corr = ((AddCorrSignM ^ AddSignAM) & ~AddConvertM) ^ AddSumM[63];   
-   
-   // Select the 64-bit value to normalize. Normally AddSumTcM replaces AddSumM when AddSumM[63]
-   // is set (FOpCtrlM[3] operations always use AddSumM). In the first branch (AddDenormInM with
-   // AddOpANormM|AddOpBNormM and the sign/FOpCtrlM[0] condition below) that choice is reversed.
-   assign sum_corr = (AddDenormInM & (AddOpANormM | AddOpBNormM) & ( ( (AddFloat1M[63] ~^ AddFloat2M[63]) & FOpCtrlM[0] ) | ((AddFloat1M[63] ^ AddFloat2M[63]) & ~FOpCtrlM[0]) ))
-			 ? (AddSumM[63] ? AddSumM : AddSumTcM) : ( (FOpCtrlM[3]) ? AddSumM : (AddSumM[63] ? AddSumTcM : AddSumM));
-
-   // AddNormOvflowM is bit 52 of AddSumTcM (if AddSumM[63]) or of AddSumM, forced to 1 when AddDenormInM,
-   // AddSumM == 0, AddOpANormM|AddOpBNormM and ~FOpCtrlM[0] all hold. KEP: comparison used to be (AddSumM == 16'h0).
-   assign AddNormOvflowM = (AddDenormInM & (AddSumM == 64'h0) & (AddOpANormM | AddOpBNormM) & ~FOpCtrlM[0]) ? 1'b1 : (AddSumM[63] ? AddSumTcM[52] : AddSumM[52]);
-
-   // Leading-Zero Detector. Determine the size of the shift needed for
-   // normalization. If sum_corr is all zeros, exp_valid is 
-   // zero; otherwise, it is one. 
-   lz64 lzd1 (norm_shift, exp_valid, sum_corr);
-   // Shift amount is forced to 0 for AddDenormInM when both AddOp*NormM flags are clear or normal_underflow is set
-   assign norm_shift_denorm = (AddDenormInM & ( (~AddOpANormM & ~AddOpBNormM) | normal_underflow)) ? (6'h00) : (norm_shift);
-
-   // Barrel shifter used for normalization. It takes as inputs sum_corr and the shift amount
-   // norm_shift_denorm, and outputs sum_norm. NOTE(review): the earlier comment said "right shifted",
-   // but the instance is barrel_shifter_l64 - presumably a left shift; confirm against that module.
-   barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift_denorm);
-   // FOpCtrlM[3] operations bypass the shifter: sum_corr (bitwise inverted when FOpCtrlM[0]) goes to the rounder
-   assign sum_norm_w_bypass = (FOpCtrlM[3]) ? (FOpCtrlM[0] ? ~sum_corr : sum_corr) : (sum_norm);
-
-   // Round the mantissa to a 52-bit value, with the leading one
-   // removed. If the result is a single precision number, the actual 
-   // mantissa is in the upper 23 bits and the lower 29 bits are zero. 
-   // At this point, normalization has already been performed, so we know 
-   // exactly where the rounding point is. The rounding unit also
-   // handles special cases and sets the exception flags.
-
-   // Changed DenormIO -> AddDenormM and FlagsIn -> FAddFlgM in order to
-   // help in processor reservation station detection of load/stores. In
-   // other words, the processor would like to know ahead of time that
-   // if the result is an exception then don't load or store.
-   rounder round1 (Result, DenormIO, FlagsIn, FrmM, P, AddOvEnM, AddUnEnM, exp_valid, 
-		   AddSelInvM, AddInvalidM, AddDenormInM, AddConvertM, sign_corr, exp_pre, norm_shift, sum_norm_w_bypass,
-		   AddExpPostSumM, AddOp1NormM, AddOp2NormM, AddFloat1M[63:52], AddFloat2M[63:52],
-		   AddNormOvflowM, normal_underflow, AddSwapM, FOpCtrlM, AddSumM);
-
-   // Drive the final result and the exception flags with continuous assigns (no registers in this module).
-   assign FAddResM = Result;
-   assign {AddDenormM, FAddFlgM} = {DenormIO, FlagsIn};
-   
-endmodule // fpuaddcvt2
-
-
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -94,14 +94,14 @@ string tests32f[] = '{
    "rv64f/I-FSW-01", "2000",
    "rv64f/I-FCLASS-S-01", "2000",
    "rv64f/I-FADD-S-01", "2000",
-    // "rv64f/I-FCVT-S-L-01", "2000",
-    // "rv64f/I-FCVT-S-LU-01", "2000",
-    // "rv64f/I-FCVT-S-W-01", "2000",
-    // "rv64f/I-FCVT-S-WU-01", "2000",
-    // "rv64f/I-FCVT-L-S-01", "2000",
-    // "rv64f/I-FCVT-LU-S-01", "2000",
-    // "rv64f/I-FCVT-W-S-01", "2000",
-    // "rv64f/I-FCVT-WU-S-01", "2000",
+    "rv64f/I-FCVT-S-L-01", "2000",
+    "rv64f/I-FCVT-S-LU-01", "2000",
+    "rv64f/I-FCVT-S-W-01", "2000",
+    "rv64f/I-FCVT-S-WU-01", "2000",
+    "rv64f/I-FCVT-L-S-01", "2000",
+    "rv64f/I-FCVT-LU-S-01", "2000",
+    "rv64f/I-FCVT-W-S-01", "2000",
+    "rv64f/I-FCVT-WU-S-01", "2000",
    // "rv64f/I-FDIV-S-01", "2000",
    "rv64f/I-FEQ-S-01", "2000",
    "rv64f/I-FLE-S-01", "2000",
@ -122,6 +122,16 @@ string tests32f[] = '{

  string tests64d[] = '{
    // "rv64d/I-FDIV-D-01", "2000",
+    "rv64d/I-FCVT-D-L-01", "2000",
+    "rv64d/I-FCVT-D-LU-01", "2000",
+    // "rv64d/I-FCVT-D-S-01", "2000", // the number to be converted is in the lower 32 bits; the test needs to be changed
+    "rv64d/I-FCVT-D-W-01", "2000",
+    "rv64d/I-FCVT-D-WU-01", "2000",
+    "rv64d/I-FCVT-L-D-01", "2000",
+    "rv64d/I-FCVT-LU-D-01", "2000",
+    // "rv64d/I-FCVT-S-D-01", "2000", // the result is in the lower 32 bits; this needs to be changed in the Imperas test
+    "rv64d/I-FCVT-W-D-01", "2000",
+    // "rv64d/I-FCVT-WU-D-01", "2000", // this test needs to be fixed: it expects 2^64-1 rather than 2^32-1 (the value specified in the spec)
    "rv64d/I-FSD-01", "2000",
    "rv64d/I-FLD-01", "2420",
    "rv64d/I-FNMADD-D-01", "2000",
@ -134,16 +144,6 @@ string tests32f[] = '{
    "rv64d/I-FEQ-D-01", "2000",
    "rv64d/I-FADD-D-01", "2000",
    "rv64d/I-FCLASS-D-01", "2000",
-    // "rv64d/I-FCVT-D-L-01", "2000",
-    // "rv64d/I-FCVT-D-LU-01", "2000",
-    // "rv64d/I-FCVT-D-S-01", "2000",
-    // "rv64d/I-FCVT-D-W-01", "2000",
-    // "rv64d/I-FCVT-D-WU-01", "2000",
-    // "rv64d/I-FCVT-L-D-01", "2000",
-    // "rv64d/I-FCVT-LU-D-01", "2000",
-    // "rv64d/I-FCVT-S-D-01", "2000",
-    // "rv64d/I-FCVT-W-D-01", "2000",
-    // "rv64d/I-FCVT-WU-D-01", "2000",
    "rv64d/I-FMADD-D-01", "2000",
    "rv64d/I-FMUL-D-01", "2000",
    "rv64d/I-FMV-D-X-01", "2000",
@ -898,8 +898,22 @@ module instrNameDecTB(
                       else if (funct7[6:2] == 5'b01011) name = "FSQRT";
                       else if (funct7 == 7'b1100000 && rs2 == 5'b00000) name = "FCVT.W.S";
                       else if (funct7 == 7'b1100000 && rs2 == 5'b00001) name = "FCVT.WU.S";
+                       else if (funct7 == 7'b1100000 && rs2 == 5'b00010) name = "FCVT.L.S";
+                       else if (funct7 == 7'b1100000 && rs2 == 5'b00011) name = "FCVT.LU.S";
                       else if (funct7 == 7'b1101000 && rs2 == 5'b00000) name = "FCVT.S.W";
                       else if (funct7 == 7'b1101000 && rs2 == 5'b00001) name = "FCVT.S.WU";
+                       else if (funct7 == 7'b1101000 && rs2 == 5'b00010) name = "FCVT.S.L";
+                       else if (funct7 == 7'b1101000 && rs2 == 5'b00011) name = "FCVT.S.LU";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00000) name = "FCVT.W.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00001) name = "FCVT.WU.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00010) name = "FCVT.L.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00011) name = "FCVT.LU.D";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00000) name = "FCVT.D.W";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00001) name = "FCVT.D.WU";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00010) name = "FCVT.D.L";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00011) name = "FCVT.D.LU";
+                       else if (funct7 == 7'b0100000 && rs2 == 5'b00001) name = "FCVT.S.D";
+                       else if (funct7 == 7'b0100001 && rs2 == 5'b00000) name = "FCVT.D.S";
                       else if (funct7 == 7'b1110000 && rs2 == 5'b00000) name = "FMV.X.W";
                       else if (funct7 == 7'b1111000 && rs2 == 5'b00000) name = "FMV.W.X";
                       else if (funct7 == 7'b1110001 && rs2 == 5'b00000) name = "FMV.X.D"; // DOUBLE
@ -915,22 +929,50 @@ module instrNameDecTB(
                       else if (funct7[6:2] == 5'b01011) name = "FSQRT";
                       else if (funct7 == 7'b1100000 && rs2 == 5'b00000) name = "FCVT.W.S";
                       else if (funct7 == 7'b1100000 && rs2 == 5'b00001) name = "FCVT.WU.S";
+                       else if (funct7 == 7'b1100000 && rs2 == 5'b00010) name = "FCVT.L.S";
+                       else if (funct7 == 7'b1100000 && rs2 == 5'b00011) name = "FCVT.LU.S";
                       else if (funct7 == 7'b1101000 && rs2 == 5'b00000) name = "FCVT.S.W";
                       else if (funct7 == 7'b1101000 && rs2 == 5'b00001) name = "FCVT.S.WU";
+                       else if (funct7 == 7'b1101000 && rs2 == 5'b00010) name = "FCVT.S.L";
+                       else if (funct7 == 7'b1101000 && rs2 == 5'b00011) name = "FCVT.S.LU";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00000) name = "FCVT.W.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00001) name = "FCVT.WU.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00010) name = "FCVT.L.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00011) name = "FCVT.LU.D";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00000) name = "FCVT.D.W";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00001) name = "FCVT.D.WU";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00010) name = "FCVT.D.L";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00011) name = "FCVT.D.LU";
+                       else if (funct7 == 7'b0100000 && rs2 == 5'b00001) name = "FCVT.S.D";
+                       else if (funct7 == 7'b0100001 && rs2 == 5'b00000) name = "FCVT.D.S";
                       else if (funct7[6:2] == 5'b00100) name = "FSGNJN";
                       else if (funct7[6:2] == 5'b00101) name = "FMAX";
                       else if (funct7[6:2] == 5'b10100) name = "FLT";
                       else if (funct7[6:2] == 5'b11100) name = "FCLASS";
                       else                              name = "ILLEGAL";
-      10'b0101111_010: if      (funct7[6:2] == 5'b00000) name = "FADD";
+      10'b1010011_010: if      (funct7[6:2] == 5'b00000) name = "FADD";
                       else if (funct7[6:2] == 5'b00001) name = "FSUB";
                       else if (funct7[6:2] == 5'b00010) name = "FMUL";
                       else if (funct7[6:2] == 5'b00011) name = "FDIV";
                       else if (funct7[6:2] == 5'b01011) name = "FSQRT";
                       else if (funct7 == 7'b1100000 && rs2 == 5'b00000) name = "FCVT.W.S";
                       else if (funct7 == 7'b1100000 && rs2 == 5'b00001) name = "FCVT.WU.S";
+                       else if (funct7 == 7'b1100000 && rs2 == 5'b00010) name = "FCVT.L.S";
+                       else if (funct7 == 7'b1100000 && rs2 == 5'b00011) name = "FCVT.LU.S";
                       else if (funct7 == 7'b1101000 && rs2 == 5'b00000) name = "FCVT.S.W";
                       else if (funct7 == 7'b1101000 && rs2 == 5'b00001) name = "FCVT.S.WU";
+                       else if (funct7 == 7'b1101000 && rs2 == 5'b00010) name = "FCVT.S.L";
+                       else if (funct7 == 7'b1101000 && rs2 == 5'b00011) name = "FCVT.S.LU";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00000) name = "FCVT.W.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00001) name = "FCVT.WU.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00010) name = "FCVT.L.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00011) name = "FCVT.LU.D";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00000) name = "FCVT.D.W";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00001) name = "FCVT.D.WU";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00010) name = "FCVT.D.L";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00011) name = "FCVT.D.LU";
+                       else if (funct7 == 7'b0100000 && rs2 == 5'b00001) name = "FCVT.S.D";
+                       else if (funct7 == 7'b0100001 && rs2 == 5'b00000) name = "FCVT.D.S";
                       else if (funct7[6:2] == 5'b00100) name = "FSGNJX";
                       else if (funct7[6:2] == 5'b10100) name = "FEQ";
                       else                              name = "ILLEGAL";
@ -941,8 +983,22 @@ module instrNameDecTB(
                       else if (funct7[6:2] == 5'b01011) name = "FSQRT";
                       else if (funct7 == 7'b1100000 && rs2 == 5'b00000) name = "FCVT.W.S";
                       else if (funct7 == 7'b1100000 && rs2 == 5'b00001) name = "FCVT.WU.S";
+                       else if (funct7 == 7'b1100000 && rs2 == 5'b00010) name = "FCVT.L.S";
+                       else if (funct7 == 7'b1100000 && rs2 == 5'b00011) name = "FCVT.LU.S";
                       else if (funct7 == 7'b1101000 && rs2 == 5'b00000) name = "FCVT.S.W";
                       else if (funct7 == 7'b1101000 && rs2 == 5'b00001) name = "FCVT.S.WU";
+                       else if (funct7 == 7'b1101000 && rs2 == 5'b00010) name = "FCVT.S.L";
+                       else if (funct7 == 7'b1101000 && rs2 == 5'b00011) name = "FCVT.S.LU";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00000) name = "FCVT.W.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00001) name = "FCVT.WU.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00010) name = "FCVT.L.D";
+                       else if (funct7 == 7'b1100001 && rs2 == 5'b00011) name = "FCVT.LU.D";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00000) name = "FCVT.D.W";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00001) name = "FCVT.D.WU";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00010) name = "FCVT.D.L";
+                       else if (funct7 == 7'b1101001 && rs2 == 5'b00011) name = "FCVT.D.LU";
+                       else if (funct7 == 7'b0100000 && rs2 == 5'b00001) name = "FCVT.S.D";
+                       else if (funct7 == 7'b0100001 && rs2 == 5'b00000) name = "FCVT.D.S";
                       else                              name = "ILLEGAL";
      10'b0000111_010: name = "FLW";
      10'b0100111_010: name = "FSW";