Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

2021-06-04 15:16:39 -05:00 · 2021-06-04 15:16:39 -05:00 · 41a1e6112a
commit 41a1e6112a
parent 7406e33b61 fc65aedbd6
28 changed files with 220102 additions and 124314 deletions
--- a/wally-pipelined/regression/sim-wally-rv64icfd
+++ b/wally-pipelined/regression/sim-wally-rv64icfd
@ -0,0 +1 @@
+vsim -do wally-pipelined-rv64icfd.do
--- a/wally-pipelined/regression/wally-pipelined-rv64icfd.do
+++ b/wally-pipelined/regression/wally-pipelined-rv64icfd.do
@ -0,0 +1,50 @@
+# wally-pipelined-rv64icfd.do
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Takes 1:10 to run RV64IC tests using gui
+
+# Use this wally-pipelined-rv64icfd.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do wally-pipelined-rv64icfd.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do wally-pipelined-rv64icfd.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# create library
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# compile source files
+# suppress spurious warnings about 
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+# NOTE(review): only the 1-argument case compiles ../testbench/function_radix.sv - confirm the default case does not need it
+# default to config/rv64icfd, but allow this to be overridden at the command line.  For example:
+# do wally-pipelined-rv64icfd.do ../config/rv32ic
+switch $argc {
+    0 {vlog +incdir+../config/rv64icfd +incdir+../config/shared ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+    1 {vlog +incdir+$1  +incdir+../config/shared ../testbench/testbench-imperas.sv ../testbench/function_radix.sv ../src/*/*.sv -suppress 2583}
+}
+# start and run simulation
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+vopt +acc work.testbench -o workopt 
+vsim workopt
+
+view wave
+# display input and output signals as hexadecimal values
+do ./wave-dos/default-waves.do
+
+# Run the simulation (a "--" prefix is not a Tcl comment and would be executed as a command)
+#run 5000 
+run -all
+#quit
+noview ../testbench/testbench-imperas.sv
+view wave
--- a/wally-pipelined/src/fpu/FMA/fma1.sv
+++ b/wally-pipelined/src/fpu/FMA/fma1.sv
@ -1,103 +1,137 @@
- ////////////////////////////////////////////////////////////////////////////////
-// Block Name:	fmac.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This is the top level block of a floating-point  multiply/accumulate
-//   unit(FMAC).   It instantiates the following sub-blocks:
-//
-//    array     Booth encoding, partial product generation, product summation
-//    expgen    Exponent summation, compare, and adjust
-//    align     Alignment shifter
-//    add       Carry-save adder for accumulate, carry propagate adder
-//    lza       Leading zero anticipator to control normalization shifter
-//    normalize Normalization shifter
-//    round     Rounding of result
-//    exception Handles exceptional cases
-//    bypass    Handles bypass of result to ReadData1E or ReadData3E inputs
-//    sign      One bit sign handling block 
-//    special   Catch special cases (inputs = 0  / infinity /  etc.) 
-//
-//   The FMAC computes FmaResultM=ReadData1E*ReadData2E+ReadData3E, rounded with the mode specified by
-//   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the ReadData1E or ReadData3E inputs for use on the next cycle.  In addition,  four signals
-//   are produced: trap, overflow, underflow, and inexact.  Trap indicates
-//   an infinity, NaN, or denormalized number to be handled in software;
-//   the other three signals are IEEE flags.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module fma1(ReadData1E, ReadData2E, ReadData3E, FrmE,  
-			rE, sE, tE, bsE, killprodE, sumshiftE, sumshiftzeroE,  aligncntE, aeE
-			, xzeroE, yzeroE, zzeroE, xnanE,ynanE, znanE, xdenormE, ydenormE, zdenormE,
-			xinfE, yinfE, zinfE, nanE, prodinfE);
-/////////////////////////////////////////////////////////////////////////////
+module fma1(
 
-	input logic 		[63:0]		ReadData1E;		// input 1
-	input logic		[63:0]		ReadData2E;     // input 2 
-	input logic 		[63:0]		ReadData3E;     // input 3
-	input logic 		[2:0]	 	FrmE;          	// Rounding mode
-	output logic 		[12:0]		aligncntE;    	// status flags
-	output logic 		[105:0]		rE; 				// one result of partial product sum
-	output logic 		[105:0]		sE; 				// other result of partial products
-	output logic 		[163:0]		tE;				// output logic of alignment shifter	
-	output logic 		[12:0]		aeE; 		// multiplier expoent
-	output logic 					bsE;				// sticky bit of addend
-	output logic 					killprodE; 		// ReadData3E >> product
-	output logic					xzeroE;
-	output logic					yzeroE;
-	output logic					zzeroE;
-	output logic					xdenormE;
-	output logic					ydenormE;
-	output logic					zdenormE;
-	output logic					xinfE;
-	output logic					yinfE;
-	output logic					zinfE;
-	output logic					xnanE;
-	output logic					ynanE;
-	output logic					znanE;
-	output logic					nanE;
-	output logic					prodinfE;
-	output logic			[8:0]		sumshiftE;
-	output logic					sumshiftzeroE;
+	input logic 	[63:0]		ReadData1E,		// X operand: sign [63], exponent [62:52], mantissa [51:0]
+	input logic		[63:0]		ReadData2E,		// Y operand (multiplied with X), same field layout
+	input logic 	[63:0]		ReadData3E,		// Z operand (the addend), same field layout
+	output logic 	[105:0]		ProdManE,		// product of the X and Y mantissas, implicit leading ones included
+	output logic 	[161:0]		AlignedAddendE,		// Z mantissa after the alignment shift
+	output logic 	[12:0]		ProdExpE,		// exponent of the product; forced to 0 when X or Y is zero
+	output logic 				AddendStickyE,	// sticky bit generated by the alignment step
+	output logic 				KillProdE,		// set when the alignment count is <= -56 (product too small to affect the sum)
+	output logic				XZeroE, YZeroE, ZZeroE,	// operand is zero (sign bit ignored)
+	output logic				XInfE, YInfE, ZInfE,	// operand is infinity (exponent all ones, mantissa zero)
+	output logic				XNaNE, YNaNE, ZNaNE);	// operand is a NaN (exponent all ones, mantissa non-zero)

-// Internal nodes
- 
-//	output logic 		[12:0]		aligncntE; 		// shift count for alignment
+	logic [51:0] 	XMan,YMan,ZMan;
+	logic [10:0] 	XExp,YExp,ZExp;
+	logic 		 	XSgn,YSgn,ZSgn;
+	logic [12:0]	AlignCnt;
+	logic [211:0] 	Shift;
+	logic			XDenormE, YDenormE, ZDenormE;


-	logic 					prodof; 		// ReadData1E*ReadData2E out of range
+	// split inputs into the sign bit, mantissa, and exponent for readability
+	assign XSgn = ReadData1E[63];
+	assign YSgn = ReadData2E[63];
+	assign ZSgn = ReadData3E[63];
+
+	assign XExp = ReadData1E[62:52];
+	assign YExp = ReadData2E[62:52];
+	assign ZExp = ReadData3E[62:52];
+
+	assign XMan = ReadData1E[51:0];
+	assign YMan = ReadData2E[51:0];
+	assign ZMan = ReadData3E[51:0];
+
+
+
+	// determine if an input is a special value
+	assign XNaNE = &ReadData1E[62:52] && |ReadData1E[51:0]; 
+	assign YNaNE = &ReadData2E[62:52] && |ReadData2E[51:0]; 
+	assign ZNaNE = &ReadData3E[62:52] && |ReadData3E[51:0];
+
+	assign XDenormE = ~(|ReadData1E[62:52]) && |ReadData1E[51:0]; 
+	assign YDenormE = ~(|ReadData2E[62:52]) && |ReadData2E[51:0]; 
+	assign ZDenormE = ~(|ReadData3E[62:52]) && |ReadData3E[51:0];
+
+	assign XInfE = &ReadData1E[62:52] && ~(|ReadData1E[51:0]); 
+	assign YInfE = &ReadData2E[62:52] && ~(|ReadData2E[51:0]); 
+	assign ZInfE = &ReadData3E[62:52] && ~(|ReadData3E[51:0]);
+
+	assign XZeroE = ~(|ReadData1E[62:0]);
+	assign YZeroE = ~(|ReadData2E[62:0]);
+	assign ZZeroE = ~(|ReadData3E[62:0]);




+	// Calculate the product's exponent
+	//		- When multiplying two fp numbers, add the exponents
+	// 		- Subtract 3ff to remove one of the biases (XExp + YExp has two biases, one from each exponent)
+	//		- Denormal numbers have an exponent value of 1, however they are 
+	//		  represented with an exponent of 0. add one if there is a denormal number
+	assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 : 
+				 {2'b0, XExp} + {2'b0, YExp} - 13'h3ff + XDenormE + YDenormE;
+
+	// Calculate the product's mantissa
+	//		- Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
+	assign ProdManE = {53'b0,~(XDenormE|XZeroE),XMan}  *  {53'b0,~(YDenormE|YZeroE),YMan};




+	// determine the shift count for alignment
+	//		- negative means Z is larger, so shift Z left
+	//		- positive means the product is larger, so shift Z right
+	//		- Denormal numbers have an exponent value of 1, however they are 
+	//		  represented with an exponent of 0. add one to the exponent if it is a denormal number
+	assign AlignCnt = ProdExpE - ZExp - ZDenormE;
+
+	// Alignment shifter
+
+	// Default addition without shifting
+	// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+	//						 |1'b0| addend  |
+
+	// the 1'b0 before the addend is there because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
+	
+	always_comb 
+		begin
+		// Set default values
+		//		- Shift gets a default so that the branches which never use it cannot infer a latch
+		AddendStickyE = 0;
+		KillProdE = 0;
+		Shift = 212'b0;
+		// If the product is too small to affect the sum, kill the product
+
+		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+		//	| addend  |
+		if ($signed(AlignCnt) <= $signed(-56)) begin
+			KillProdE = 1;
+			AlignedAddendE = {55'b0, ~(ZZeroE|ZDenormE),ZMan,2'b0};
+			AddendStickyE = ~(XZeroE|YZeroE);
+
+		// If the Addend is shifted left (negative AlignCnt)
+
+		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+		//					| addend  |
+		end else if($signed(AlignCnt) <= $signed(0))  begin
+			Shift = {55'b0, ~(ZZeroE|ZDenormE),ZMan, 104'b0} << -AlignCnt;
+			AlignedAddendE = Shift[211:50];
+			AddendStickyE = |(Shift[49:0]);
+
+		// If the Addend is shifted right (positive AlignCnt)
+
+		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+		//									| addend  |
+		end else if ($signed(AlignCnt)<=$signed(105))  begin
+			Shift = {55'b0, ~(ZZeroE|ZDenormE),ZMan, 104'b0} >> AlignCnt;
+			AlignedAddendE = Shift[211:50];
+			AddendStickyE = |(Shift[49:0]);
+
+		// If the addend is too small to affect the addition		
+		//		- The addend has to shift two past the end of the addend to be considered too small
+		//		- The 2 extra bits are needed for rounding
+
+		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+		//														| addend  |
+		end else begin
+			AlignedAddendE = 162'b0;
+			AddendStickyE = ~ZZeroE;


-
-
-
-//   Instantiate fraction datapath
-
-	multiply		multiply(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]), .*);
-	align			align(.zman(ReadData3E[51:0]),.*);
-
-// Instantiate exponent datapath
-
-	expgen1			expgen1(.xexp(ReadData1E[62:52]),.yexp(ReadData2E[62:52]),.zexp(ReadData3E[62:52]),.*);
-// Instantiate special case detection across datapath & exponent path 
-
-	special			special(.*);
-
-
-// Instantiate control output logic
- 
-flag1				flag1(.*); 
+		end 
+	end

 endmodule

--- a/wally-pipelined/src/fpu/FMA/fma2.sv
+++ b/wally-pipelined/src/fpu/FMA/fma2.sv
@ -1,104 +1,107 @@
- ////////////////////////////////////////////////////////////////////////////////
-// Block Name:	fmac.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This is the top level block of a floating-point  multiply/accumulate
-//   unit(FMAC).   It instantiates the following sub-blocks:
-//
-//    array     Booth encoding, partial product generation, product summation
-//    expgen    Mxponent summation, compare, and adjust
-//    align     Alignment shifter
-//    add       Carry-save adder for accumulate, carry propagate adder
-//    lza       Leading zero anticipator to control normalization shifter
-//    normalize Normalization shifter
-//    round     Rounding of result
-//    exception Handles exceptional cases
-//    bypass    Handles bypass of result to ReadData1M or ReadData3M input logics
-//    sign      One bit sign handling block 
-//    special   Catch special cases (input logics = 0  / infinity /  etc.) 
-//
-//   The FMAC computes FmaResultM=ReadData1M*ReadData2M+ReadData3M, rounded with the mode specified by
-//   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the ReadData1M or ReadData3M input logics for use on the next cycle.  In addition,  four signals
-//   are produced: trap, overflow, underflow, and inexact.  Trap indicates
-//   an infinity, NaN, or denormalized number to be handled in software;
-//   the other three signals are IMMM flags.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module fma2(ReadData1M, ReadData2M, ReadData3M, FrmM,
-			FmaResultM, FmaFlagsM, aligncntM, rM, sM,
-			tM,	normcntM, aeM, bsM,killprodM,
-			xzeroM,	yzeroM,zzeroM,xdenormM,ydenormM,
-			zdenormM,xinfM,yinfM,zinfM,xnanM,ynanM,znanM,
-			nanM,sumshiftM,sumshiftzeroM,prodinfM
-
-);
-/////////////////////////////////////////////////////////////////////////////
+module fma2(
 
-	input logic 		[63:0]		ReadData1M;		// input logic 1
-	input logic		[63:0]		ReadData2M;     // input logic 2 
-	input logic 		[63:0]		ReadData3M;     // input logic 3
-	input logic 		[2:0]	 	FrmM;          	// Rounding mode
-	input logic 		[12:0]		aligncntM;    	// status flags
-	input logic 		[105:0]		rM; 				// one result of partial product sum
-	input logic 		[105:0]		sM; 				// other result of partial products
-	input logic 		[163:0]		tM;				// output of alignment shifter	
-	input logic 		[8:0]		normcntM; 		// shift count for normalizer
-	input logic 		[12:0]		aeM; 		// multiplier expoent
-	input logic 					bsM;				// sticky bit of addend
-	input logic 					killprodM; 		// ReadData3M >> product
-	input logic					prodinfM;
-	input logic					xzeroM;
-	input logic					yzeroM;
-	input logic					zzeroM;
-	input logic					xdenormM;
-	input logic					ydenormM;
-	input logic					zdenormM;
-	input logic					xinfM;
-	input logic					yinfM;
-	input logic					zinfM;
-	input logic					xnanM;
-	input logic					ynanM;
-	input logic					znanM;
-	input logic					nanM;
-	input logic			[8:0]		sumshiftM;
-	input logic					sumshiftzeroM;
-
-
-	output logic 		[63:0]		FmaResultM;     // output FmaResultM=ReadData1M*ReadData2M+ReadData3M
-	output logic 		[4:0]		FmaFlagsM;    	// status flags
+	input logic 	[63:0]		ReadData1M,		// X operand: sign [63], exponent [62:52], mantissa [51:0]
+	input logic		[63:0]		ReadData2M,		// Y operand, same field layout
+	input logic 	[63:0]		ReadData3M,		// Z operand (the addend), same field layout
+	input logic 	[2:0] 		FrmM,			// rounding mode: 000 nearest even, 001 to zero, 010 down, 011 up, 100 nearest max magnitude
+	input logic 	[105:0]		ProdManM,		// product mantissa - presumably fma1's ProdManE after the pipeline register (confirm at the instantiation)
+	input logic 	[161:0]		AlignedAddendM,		// aligned addend - presumably fma1's AlignedAddendE (confirm at the instantiation)
+	input logic 	[12:0]		ProdExpM,		// product exponent
+	input logic 				AddendStickyM,	// sticky bit from the addend alignment
+	input logic 				KillProdM,		// when set the product is replaced by zero before the addition
+	input logic 	[3:0]		FOpCtrlM,		// bit 0 selects a subtraction (IsSub); bit 1 negates the sign of the result
+	input logic					XZeroM, YZeroM, ZZeroM,	// operand-is-zero flags
+	input logic					XInfM, YInfM, ZInfM,	// operand-is-infinity flags
+	input logic					XNaNM, YNaNM, ZNaNM,	// operand-is-NaN flags
+	output logic	[63:0]		FmaResultM,		// selected result; sign bit is flipped when FOpCtrlM[1] is set
+	output logic 	[4:0]		FmaFlagsM);		// {Invalid, 1'b0 (divide by zero, never set), Overflow, Underflow, Inexact}
 	

-// Internal nodes
- 	logic 		[163:0]		sum;			// output of carry prop adder
-	logic 		[53:0]		v; 				// normalized sum, R, S bits
-//	logic 		[12:0]		aligncnt; 		// shift count for alignment
-	logic 		[8:0]		normcnt; 		// shift count for normalizer
-	logic 					negsum; 		// negate sum
-	logic 					invz; 			// invert addend
-	logic 					selsum1; 		// select +1 mode of sum
-	logic 					negsum0; 		// sum +0 < 0
-	logic 					negsum1; 		// sum +1 < 0
-	logic 					sumzero; 		// sum = 0
-	logic 					infinity; 		// generate infinity on overflow
-	logic 					sumof;			// result out of range
-	logic					zexpsel;
-	logic					denorm0;
-	logic					resultdenorm;
-	logic					inf;
-	logic					specialsel;
-	logic					expplus1;
-	logic					sumuf;
-	logic					psign;
-	logic					sticky;
-	logic			[12:0]		de0;
-	logic					isAdd;

-	assign isAdd = 1;
+	logic [51:0] 	XMan, YMan, ZMan, WMan;
+	logic [10:0] 	XExp, YExp, ZExp, WExp;
+	logic 		 	XSgn, YSgn, ZSgn, WSgn, PSgn;
+	logic 			IsSub;
+	logic [105:0]	ProdMan2;
+	logic [162:0]	AlignedAddend2;
+ 	logic [161:0]	Sum;
+	logic [162:0]	SumTmp;
+	logic [12:0]	SumExp;
+	logic [12:0]	SumExpMinus1;
+	logic [12:0]	SumExpTmp, WExpTmp;
+	logic [53:0]	NormSum;
+	logic [161:0]	NormSumTmp;
+	logic [8:0]		NormCnt;
+	logic 			NormSumSticky;
+	logic 			SumZero;
+	logic 			NegSum;
+	logic 			InvZ;
+	logic			ResultDenorm;
+	logic			Sticky;
+	logic 			Plus1, Minus1, Plus1Tmp, Minus1Tmp;
+	logic 			Invalid,Underflow,Overflow,Inexact;
+	logic [8:0]		DenormShift;
+	logic 			ProdInf, ProdOf, ProdUf;
+	logic [63:0]	FmaResultTmp;
+	logic 			SubBySmallNum;
+
+
+	// split inputs into the sign bit, mantissa, and exponent for readability
+	assign XSgn = ReadData1M[63];
+	assign YSgn = ReadData2M[63];
+	assign ZSgn = ReadData3M[63];
+
+	assign XExp = ReadData1M[62:52];
+	assign YExp = ReadData2M[62:52];
+	assign ZExp = ReadData3M[62:52];
+
+	assign XMan = ReadData1M[51:0];
+	assign YMan = ReadData2M[51:0];
+	assign ZMan = ReadData3M[51:0];
+
+
+
+	// is it an FMSUB or FNMSUB instruction
+	assign IsSub = FOpCtrlM[0];
+
+
+
+
+
+	// Addition
+	
+	// Negate Z  when doing one of the following operations:
+	//		-prod +  Z
+	//		 prod -  Z 
+	assign InvZ = IsSub ? ~(ZSgn ^ PSgn) : (ZSgn ^ PSgn);
+
+	// Choose an inverted or non-inverted addend - the one is added later
+	//		- a single leading zero makes the concatenation 163 bits wide, matching AlignedAddend2
+	assign AlignedAddend2 = InvZ ? ~{1'b0,AlignedAddendM} : {1'b0,AlignedAddendM};
+	// Kill the product if the product is too small to affect the addition (determined in fma1.sv)
+	assign ProdMan2 = KillProdM ? 106'b0 : ProdManM;
+	// Do the addition
+	// 		- add one to negate if the addend was inverted
+	//		- the 2 extra bits at the beginning and end are needed for rounding
+	assign SumTmp = AlignedAddend2 + {55'b0, ProdMan2,2'b0} + InvZ;
+	 
+	// Is the sum negative
+	assign NegSum = SumTmp[162];
+	// If the sum is negative, negate the sum.
+	assign Sum = NegSum ? -SumTmp[161:0] : SumTmp[161:0];
+
+
+
+
+
+
+	// Leading one detector
+	logic [8:0]	i;
+	always_comb begin
+			i = 0;
+			// search for leading one - the bound is tested first so Sum is never indexed out of range when it is all zeros
+			while ($unsigned(i) <= $unsigned(9'd161) && ~Sum[161-i]) i = i+1;
+	 		NormCnt = i+1;    // compute shift count (leading zeros + 1)
+	end



@ -110,25 +113,163 @@ module fma2(ReadData1M, ReadData2M, ReadData3M, FrmM,



+	// Normalization
+
+
+	// Determine if the sum is zero
+	assign SumZero = ~(|Sum);
+
+	// Determine if the result is denormal
+	//		- the exponent is at or below zero, but no more than 52 below it
+	assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp+13'd52)>=0);
+
+	// Determine the shift needed for denormal results (literal sized to the 9-bit DenormShift)
+	assign DenormShift = ResultDenorm ? SumExpTmp-1 : 9'b0;
+
+	// Normalize the sum
+	assign NormSumTmp = SumZero ? 162'b0 : Sum << NormCnt+DenormShift; 
+	assign NormSum = NormSumTmp[161:108];
+	// Calculate the sticky bit
+	assign NormSumSticky = (|NormSumTmp[107:0]);
+	assign Sticky = AddendStickyM | NormSumSticky;
+
+	// Determine sum's exponent (literals sized to the 13-bit SumExp)
+	assign SumExpTmp = KillProdM ? ZExp : ProdExpM + -({5'b0, NormCnt} - 13'd56);
+	assign SumExp = SumZero ? 13'b0 : 
+				 ResultDenorm ? 13'b0 :
+				 SumExpTmp; 



-//   Instantiate fraction datapath
-
-	add				add(.*);
-	lza				lza(.*);
-	normalize		normalize(.zexp(ReadData3M[62:52]),.*); 
-	round			round(.xman(ReadData1M[51:0]), .yman(ReadData2M[51:0]),.zman(ReadData3M[51:0]), .wman(FmaResultM[51:0]),.wsign(FmaResultM[63]),.*);
-
-// Instantiate exponent datapath
-
-	expgen2			expgen2(.xexp(ReadData1M[62:52]),.yexp(ReadData2M[62:52]),.zexp(ReadData3M[62:52]),.wexp(FmaResultM[62:52]),.*);


-// Instantiate control logic
+
+
+
+
+	// Rounding
+
+	// round to nearest even
+	//		{NormSum[1], NormSum[0], Sticky}
+	//		0xx - do nothing
+	//		100 - tie - Plus1 if NormSum[2] = 1
+	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
+	//		101/110/111 - Plus1
+
+	// 	round to zero - do nothing
+	//			- subtract 1 if a small number was supposed to be subtracted from the positive result
+
+	// 	round to -infinity - Plus1 if negative
+	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
+	//			- subtract 1 if a small number was supposed to be subtracted from the positive result
+
+	// 	round to infinity - Plus1 if positive
+
+	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
+	//			- subtract 1 if a small number was supposed to be subtracted from the negative result
+
+	//  round to nearest max magnitude
+	//		{NormSum[1], NormSum[0], Sticky}
+	//		0xx - do nothing
+	//		100 - tie - Plus1
+	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
+	//		101/110/111 - Plus1
+
+	// Determine if a small number was supposed to be subtracted from the result
+	assign SubBySmallNum = AddendStickyM&InvZ&~NormSumSticky;
+
+	always_comb begin
+		// For the selected rounding mode decide whether the truncated result
+		// is incremented (Plus1Tmp) or decremented (Minus1Tmp)
+		case (FrmM)
+			3'b000: begin	// round to nearest even
+				Plus1Tmp = NormSum[1] & (NormSum[0] | (Sticky&~(~NormSum[0]&SubBySmallNum)) | (~NormSum[0]&~Sticky&NormSum[2]));
+				Minus1Tmp = 0;
+			end
+			3'b001: begin	// round to zero
+				Plus1Tmp = 0;
+				Minus1Tmp = SubBySmallNum;
+			end
+			3'b010: begin	// round down
+				Plus1Tmp = WSgn & ~(SubBySmallNum);
+				Minus1Tmp = ~WSgn & SubBySmallNum;
+			end
+			3'b011: begin	// round up
+				Plus1Tmp = ~WSgn & ~(SubBySmallNum);
+				Minus1Tmp = WSgn & SubBySmallNum;
+			end
+			3'b100: begin	// round to nearest max magnitude
+				Plus1Tmp = (NormSum[1] & (NormSum[0] | (Sticky&~(~NormSum[0]&SubBySmallNum)) | (~NormSum[0]&~Sticky)));
+				Minus1Tmp = 0;
+			end
+			default: begin	// any other FrmM encoding drives x
+				Plus1Tmp = 1'bx;
+				Minus1Tmp = 1'bx;
+			end
+		endcase
+	end
+
+	// If an answer is exact don't round
+    assign Plus1 = Sticky | (|NormSum[1:0]) ? Plus1Tmp : 0;
+    assign Minus1 = Sticky | (|NormSum[1:0]) ? Minus1Tmp : 0;
+	// Compute rounded result 
+    assign {WExpTmp, WMan} = {SumExp, NormSum[53:2]} + Plus1 - Minus1;
+    assign WExp = WExpTmp[10:0];
+
+
+
+
+
+
+
+	// Sign calculation
+	logic 			zerosign, resultsgn;	// declared explicitly - these were implicit nets before
+	// Calculate the product's sign
+	assign PSgn = XSgn ^ YSgn;
+
+	// Determine the sign if the sum is zero
+	//	if product underflows then use psign
+	//	otherwise
+	//		if cancellation then 0 unless round to -inf
+	//		otherwise psign
+	assign zerosign = Underflow ? PSgn :
+			  (IsSub ? (PSgn^ZSgn ? PSgn : FrmM == 3'b010) :
+				  (PSgn^ZSgn ? FrmM == 3'b010 : PSgn));
+
+	// is the result negative
+	// 	if p - z is the Sum negative
+	// 	if -p + z is the Sum positive
+	// 	if -p - z then the Sum is negative
+	assign resultsgn = InvZ&ZSgn&NegSum | InvZ&PSgn&~NegSum | (ZSgn&PSgn);
+	assign WSgn = SumZero ? zerosign : resultsgn;
 
-sign				sign(.xsign(ReadData1M[63]),.ysign(ReadData2M[63]),.zsign(ReadData3M[63]),.wsign(FmaResultM[63]),.*); 
-flag2				flag2(.xsign(ReadData1M[63]),.ysign(ReadData2M[63]),.zsign(ReadData3M[63]),.vbits(v[1:0]),.*); 
+	// Select the result
+	assign FmaResultTmp = XNaNM ? {XSgn, XExp, 1'b1,XMan[50:0]} : 
+						YNaNM ? {YSgn, YExp, 1'b1,YMan[50:0]} :
+						ZNaNM ? {ZSgn, ZExp, 1'b1,ZMan[50:0]} :
+						Invalid ? {WSgn, 11'h7ff, 1'b1, 51'b0} : // has to be before inf
+						XInfM ? {PSgn, XExp, XMan} :
+						YInfM ? {PSgn, YExp, YMan} :
+						ZInfM ? {ZSgn^IsSub, ZExp, ZMan} :
+						Overflow ? {WSgn, 11'h7ff, 52'b0} :
+						Underflow ? {WSgn, 63'b0} :
+						KillProdM ? ReadData3M - (Minus1&AddendStickyM) + (Plus1&AddendStickyM): // has to be after Underflow
+						{WSgn,WExp,WMan};
+	
+	// Negate the result if FNMADD or FNMSUB instruction
+	assign FmaResultM[63] = FOpCtrlM[1] ? ~FmaResultTmp[63] : FmaResultTmp[63];
+	assign FmaResultM[62:0] = FmaResultTmp[62:0];
+
+	// Set Invalid flag for following cases:
+	//   1) Inf - Inf
+	//   2) 0 * Inf
+	//   3) any input is a signaling NaN
+	assign ProdOf = (ProdExpM >= 2047 && ~ProdExpM[12]);
+	assign ProdInf = ProdOf && ~XNaNM && ~YNaNM;
+	assign Invalid = (XNaNM&~XMan[51]) | (YNaNM&~YMan[51]) | (ZNaNM&~ZMan[51]) | ((XInfM || YInfM || ProdInf) & ZInfM & (XSgn ^ YSgn ^ ZSgn)) | (XZeroM & YInfM) | (YZeroM & XInfM);  
+	
+	// Set Overflow flag if the number is too big to be represented
+	assign Overflow = WExpTmp >= 2047 & ~WExpTmp[12];
+
+	// Set Underflow flag if the number is too small to be represented and isn't denormalized
+	assign ProdUf = KillProdM & ZZeroM;
+	assign Underflow = (WExpTmp[12] & ~ResultDenorm) | ProdUf;
+
+	// Set Inexact flag if the result is different from what would be output given infinite precision
+	assign Inexact = Sticky|Overflow|Underflow | (|NormSum[1:0]);
+
+	// Combine flags - FMA can't set the Divide by zero flag 
+	assign FmaFlagsM = {Invalid, 1'b0, Overflow, Underflow, Inexact};

 endmodule

--- a/wally-pipelined/src/fpu/FMA/tbgen/results.dat
+++ b/wally-pipelined/src/fpu/FMA/tbgen/results.dat
@ -1 +1,170 @@
-c3f000200003fffe 0000000000000001 001ffffffffffffe 80cffc400007fffd 80cffc400007fffc  Wrong FmaResultM=  -64 ydenorm 1119653
+cce008007fffffff 7fe6e0fac3dc6e26 401ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 28027
+c03fffffffffc800 7fdfffffffffe000 37f07ffffffffffc fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 44043
+c7f000ffffffffef 7fefffffffffde00 4e1ffffffffffe7f fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 107106
+c7f00000dffffffe 7fe0000000000000 8000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 238237
+ffdf0000001fffff 7feffffffffffffe 7fe0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 310309
+c79ff80003fffffe 7feffc0000003ffe 2bd0020000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 426425
+ffeffffeffc00000 3fffffffffffffff 8000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 436435
+d16ff800007fffff 7fe0000000000000 c000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 517516
+d10ffffffff3fffe 7feffffffffffffe b9d07f0000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 519518
+442ff9fffffffffe ffefffffffffffff 3ff0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 553552
+c34f24b48d2af3e7 7fef7fe000000000 800ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 577576
+7fdfffffff8000ff c3f0100000000002 39300dfffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 593592
+ffe00007fffffdfe 4340000000000001 ffd34131592163f6 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 654653
+4b98eba3e512fb7b ffe84639040d967a 42c00000010001fe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 683682
+ffed83a6b2e656b1 7fe0000000000001 0010000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 796795
+7fd5220b51609cf6 c030000000001020 7fdfbfffffffffdf fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 903902
+c3d6eb6dede43198 7feffffffffffffe 3a6008000000000f fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1078076
+c1f02000001fffff 7fe0000000000001 e8f000000040000f fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1285283
+c1cdfffbffffffff 7fe0000000000001 bca0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1355353
+43447336acaf7bd8 ffeffffffffffffe 0010000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1391389
+4010000000fff7ff ffe0000000000000 7fdfffc000003ffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1528526
+ffe0000002000003 47fffc00000007ff 93b0040000002000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1597595
+4060000200000400 ffe0000000000000 7fe0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1598596
+fe7007fffdffffff 7fdffffffffff03e 001ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1631629
+4000000000000000 ffe0000000000001 3fdffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1738736
+4000000000000000 ffeffffffffffffe 4263dd4adb450db9 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1740738
+40200001ffc00000 ffe0000000000000 3fdfcfffffffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1807805
+400ffffffffffffe ffd00013fffffffe 40200000100001ff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1941939
+400ffffffffffffe ffe0000000000001 c00fffe003ffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 1947945
+7fe00000080000fe bfffffffffffffff 3fd002000000003f fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2006003
+4010000000000000 ffe0000000000001 7feffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2018015
+4010000000000000 ffeffffffffffffe bf7ffffffff80001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2020017
+43ffffd000000000 ffe0000000000000 613ffffffffffe1e fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2087084
+c1fb6efe117a3ae3 7fefffffffffffff 43c0000001effffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2123120
+ffdfffffc0000000 7fe0000002002000 3fffffffffbfff80 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2147144
+401ffffffffffffe ffe0000000000001 7c300040000000ff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2227224
+4340000000000000 ffe0000000000001 bfeffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2297294
+c0f0000000203fff 7fefffffffffffff c921fffffffffefe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2402399
+7fedffffffdfffff c7f0400000000008 401ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2406403
+434fffffffffffff ffd0000008fffffe c03fffffffffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2419416
+41dfffffffe00003 ffe0000000000001 3ff0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2436433
+c1f0000000037fff 7fdffffffff7ffc0 3fdffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2451448
+ffebfffffffffbff 4010000000000001 bf20001fffffffe0 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2465462
+ffe000020001ffff 7fdfdffff7ffffff 41d000083fffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2471468
+434ffffffffffffe ffe0000000000001 bf1fffffc00003ff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2506503
+7fe0000000000000 c1c0000001ffffbf 0000000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 2538535
+7fe0000000000000 c1d264933e9e988c 3ca0000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2565562
+7fe0000000000000 c00fffffffffffff bcaffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2567564
+7fe0000000000000 c010000000000001 403400003fffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2569566
+7fe0000000000001 c3d0bfffffffffff a9817e19c25e6ffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2590587
+7fe0000000000001 c1c01feffffffffe 3fe0000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2592589
+7fe0000000000001 f860000ffbfffffe 4000000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2619616
+7fe0000000000001 c1e29f751d0db106 41dff88000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2626623
+7fe0000000000001 c010000000000001 800ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 2639636
+7fe0000000000001 c340000000000000 41e9bfbd1705ab74 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2641638
+7fe0000000000001 c1ffffc0007fffff c0e00000003f8000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2644641
+7fefffffffffffff c3cfff000003ffff c01fffffefbfffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2653650
+c00000ffc0000000 7fefffffffff81ff 00199d0888644678 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2660657
+7fefffffffffffff c01fffe00000003e 3cdedfffffffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2671668
+7fefffffffffffff c7e00800ffffffff c010000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2682679
+7fefffffffffffff c3f50270323fdbca 3fe0000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2691688
+7fefffffffffffff c06f000000000006 8010000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2700697
+7fefffffffffffff bff0000000000001 001ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2702699
+7fefffffffffffff bffffffffffffffe 47edd848c981ea6a fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2704701
+7fefffffffffffff d6f0007fbfffffff 380ff8000000001f fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2707704
+7fefffffffffffff c167c6ca402625fe ffe0000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2709706
+7fefffffffffffff c340000000000000 7feffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2711708
+7fefffffffffffff c34fffffffffffff c1a3cdb48240da83 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2713710
+7feffffffffffffe c01580f1a3e9c31d 3d258f8ba280bed4 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2725722
+7feffffffffffffe ffd800001fffffff bfd0000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2727724
+7feffffffffffffe c27a98a4d75fad64 0000000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 2736733
+c01ffffffe03ffff 7fd00000000c0000 c00ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2739736
+7feffffffffffffe c3f01ffffff00000 4340000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2745742
+7feffffffffffffe c0550d69ccececd4 403ffffff83fffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2761758
+7feffffffffffffe c00fffffffffffff b81080ffffffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2776773
+7feffffffffffffe c0020ec4bd7f8123 403894684b0415af fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2779776
+7feffffffffffffe c34ffffffffffffe 401ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2783780
+7feffffffffffffe ffe0000000000001 43c0000000000bfe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2785782
+7feffffffffffffe c1f000000003ff7f 40017ffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2788785
+bf9ffffffd800000 7fefffffffffffff ffefffffffbfffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2960957
+e8d01e2c59865900 7fe05fffffffffff c34ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 2964961
+ffd917679344f70e 401fffffffffffff c000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3094090
+4470000023ffffff ffe0000000000001 b802000001ffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3204200
+43627f4abb7a5c8e ffefffffffffffff 0010000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3274270
+c1c0000820000000 7feffffffff8001f 402000100000007f fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3332328
+c1cd41643238b450 7feffffffffffffe 3f4012189596a55a fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3519515
+c80ea7921c438451 7fe008000000007e 424153696dc450d3 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3552548
+4f000fffffffffff ffefffffffffffff 4010000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3553549
+7fe1868cfb076bc1 c34000000000037f b7effffc003ffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3719715
+c3fff9fffffffffe 7fe0000000000000 3d6000008000000e fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3726722
+43f007ffbfffffff ffefffffffffffff 43dffffeffffffbf fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3762758
+7fdfffdfffffffbe c01fffffffffffff 3fd0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 3895891
+ffeefffffffffff7 43e0003ffffeffff b7f000001fdfffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4125120
+4800002000000007 ffe0000000000000 3ff0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4319314
+43f856a5096bfc0d ffeffffffffffffe 3fd0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4391386
+c009c2b9147e606c 7fe0000002007fff bfa004001ffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4440435
+4030008000003fff ffe0000000000000 b810eaddea941d3f fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4528523
+67affffff8000006 f3016e70e2a6bd2f c1edddf29e459b21 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4548543
+ffe07ffbffffffff 5026589203bb88d1 401ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4586581
+43dffffc00000003 ffe0000000000000 8000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4598593
+ffdfffffff800003 4010000000000001 c290000080000002 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4627622
+ffd001fffffffbff 4010000000000001 8000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4697692
+bffffffffffffffe 7fefffffffffffff 3d30040000200000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4704699
+c000000000000000 7fefffffffffffff bfeffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4774769
+c000000000000000 7fe9d625d7f2ee96 380ffeffffffc000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4797792
+41efffffbfffdfff ffe0000000000000 bbf0000003f80000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4807802
+fcf00000000003e0 7fdfffffffc02000 bfeffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4892887
+c00ffffffffffffe 7fe0000000000000 001ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4981976
+c00ffffffffffffe 7fefffffffffffff 4020e8f734a930e7 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 4983978
+ffeffffc01fffffe 43d0000000000000 3806864c983757ae fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5030024
+41b0000000010007 ffe0000000000001 0010000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5157151
+c3e413dc0ee29162 7fefffffffffffff 8000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5193187
+c01ffffffffffffe 7fe0000000000000 401ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5261255
+c01ffffffffffffe 7fefffffffffffff c1c177d35a8a07ad fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5263257
+c340000000000000 7feffffffffffffe 3ffffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5333327
+c34ff0000003fffe 7fefffffffffffff c0101442690e84e3 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5402396
+c340000000000001 7fe41774eee28bfa 37efffff000000ff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5437431
+c34fffffffffffff 7fe0000000000000 4010008001fffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5470464
+c34ffffffffffffe 7fe0000000000000 bcaffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5540534
+c34ffffffffffffe 7feffffffffffffe c7e6b68e99fe64db fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5542536
+ffe0000000000000 41effffff7fffffe 2a7000207fffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5590584
+ffe0000000000000 40b00000000008ff 4013ac1788ee2681 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5599593
+ffe0000000000000 4010000000000000 3fdffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5603597
+ffe0000000000000 401fffffffffffff 0012000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5605599
+ffe0000000000000 45e00007fff7ffff 9c80852a49e348a6 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5608602
+ffe0000000000000 41e6d2bd893fa49f 0000000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 5610604
+ffe0000000000000 7feffffffffffffe 800ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 5612606
+ffe0000000000000 4804ecddd4dee74f 9700000101fffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5617611
+ffe0000000000000 47e0400000000100 4340000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5619613
+ffe0000000000000 41d0000000001fff 800007ffffffdfff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 5626620
+ffe0000000000001 4c7ffffffff87fff 3fbfdffffffffff7 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5662656
+ffe0000000000001 401ffffffffffffe 001ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5675669
+ffe0000000000001 4340000000000001 48700003fffefffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5677671
+ffe0000000000001 4000f2f5230ef1a6 382efffffeffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5689683
+ffe0000000000001 407b2a20706ca02f bcc8eea3de85c218 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5707701
+41efdffffffbfffe ffe0000000000001 bca0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5715709
+ffe0000000000001 43e000000000ffff 4340000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5718712
+ffedffffffff7fff 7f500000001fffff 469cefa7e05db8e7 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5728722
+ffefffffffffffff 3fffffffffffffff bcaffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5738732
+ffefffffffffffff 4000000000000001 800ffffffdffe000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 5740734
+ffefffffffffffff 7fe0000000000000 3fdffffffffffe1f fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5749743
+ffd44208deea7d5b 7fdffffcffffffff caf0000000007fff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5764758
+ffefffffffffffff 43cffff6ffffffff 47ffba85ed27c05e fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5779773
+ffeffffffffffffe 40b0000fffffffc0 bfd0000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5799793
+ffeffffffffffffe 43ea49f9e3cf97b4 0000000000000001 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 5808802
+ffeffffffffffffe 4000000000000001 800ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z zdenorm ovrflw FmaResultM=-inf 5810804
+ffeffffffffffffe 4010000000000000 bc800001ffffffe0 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5812806
+ffeffffffffffffe 7fe0000000000000 c34ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5819813
+ffeffffffffffffe 7feffffffffffffe c1efff801fffffff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5821815
+ffdfffffc0007ffe 4340000000000001 8000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5886880
+c4a000001ffeffff 7fe0000000000000 b80fc03ffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5888882
+ffdfffff00000040 48f00001bfffffff c00ffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5910904
+c37ffffffffffbf0 7fd1800000000000 bfa7e7cad560a3d0 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 5912906
+c1700000000007f7 7feffffffffffffe 3f6ff7ffffffefff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6240233
+c3fffffffdfe0000 7fe0000000000000 c34fff6000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6447440
+400ffffdfffff7fe ffefffffffffffff 41de000000007ffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6483476
+4030000000004020 ffe88b9c477c3a97 ffe007ffff000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6575568
+7fe00807ffffffff c1e0000000007fe0 bfeffffffffffffe fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6676669
+ffdfc00000000800 7fe0000000000000 bcffffffffffefef fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6726719
+7feffffeffffbfff c34ffffffffffffe c000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6760753
+42bff00000000010 ffefffffffffffff c3003a94038a1ec3 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6762755
+c3c00ffffffffeff 7feddda224891f86 43d0aa9335103e61 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6782775
+c08ff80000000400 7fe0000000000001 3ff0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6796789
+c07fffdfffffffbe 7feffffffffffffe 474ffffffdffff80 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6798791
+c01fffffeffff7ff 7fd0080080000000 bff26df7cf61cdd5 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6827820
+c7effff000000004 7fe0000008000fff 4770000007ffbfff fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 6863856
+7fe85e6f4033d7dd c000000000000000 bfe0000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 7031023
+c1f732bc454b0563 7fe0000000000001 8000000000000000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 7076068
+ffe000000fffffbe 401ffffffffffffe b80d2116944eef72 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 7141133
+ffd0002000001fff 40e00003ffffefff c03fffffffe80000 fff0000000000000 ffefffffffffffff  Wrong FmaResultM=    z ovrflw FmaResultM=-inf 7242234
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.c
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.c
@ -26,13 +26,13 @@ void main() {
 		char ans[81];
 		char flags[3];
 		int FrmE;
-		long stop = 1119653;
-		int debug = 1;
+		long stop = 5587581;
+		int debug = 0;
 		//my_string = (char *) malloc (nbytes + 1);
 		//bytes_read = getline (&my_string, &nbytes, stdin);
 	

-		for(n=0; n < 305; n++) {//613 for 10000
+		for(n=0; n < 1000; n++) {//613 for 10000
 			if(getline(&ln,&nbytes,fp) < 0 || feof(fp)) break;
 			if(k == stop && debug == 1) break;
 			k++;
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
@ -11,26 +11,25 @@ module tb;
 wire 	[4:0]	 	FmaFlagsM;

 	wire 		[12:0]		aligncntE;    	// status flags
-	wire 		[105:0]		rE; 				// one result of partial product sum
-	wire 		[105:0]		sE; 				// other result of partial products
-	wire 		[163:0]		tE;				// wire of alignment shifter	
+	wire 		[105:0]		ProdManE; 				// other result of partial products
+	wire 		[161:0]		AlignedAddendE;				// wire of alignment shifter	
 	wire 		[8:0]		normcntE; 		// shift count for normalizer
-	wire 		[12:0]		aeE; 		// multiplier expoent
-	wire 					bsE;				// sticky bit of addend
-	wire 					killprodE; 		// ReadData3E >> product
+	wire 		[12:0]		ProdExpE; 		// multiplier exponent
+	wire 					AddendStickyE;				// sticky bit of addend
+	wire 					KillProdE; 		// ReadData3E >> product
 	wire 					prodofE; 		// ReadData1E*ReadData2E out of range
-	wire					xzeroE;
+	wire					XZeroE;
 	wire					yzeroE;
 	wire					zzeroE;
-	wire					xdenormE;
-	wire					ydenormE;
-	wire					zdenormE;
-	wire					xinfE;
-	wire					yinfE;
-	wire					zinfE;
-	wire					xnanE;
-	wire					ynanE;
-	wire					znanE;
+	wire					XDenormE;
+	wire					YDenormE;
+	wire					ZDenormE;
+	wire					XInfE;
+	wire					YInfE;
+	wire					ZInfE;
+	wire					XNaNE;
+	wire					YNaNE;
+	wire					ZNaNE;
 	wire					nanE;
 	wire			[8:0]		sumshiftE;
 	wire					sumshiftzeroE;
@ -45,16 +44,16 @@ reg ansnan;
 reg		[105:0]		s;				//	partial product 2	
 reg		[51:0] 		xnorm;
 reg 		[51:0] 		ynorm;
+wire 	[3:0]		FOpCtrlM;
+
+assign FOpCtrlM = 4'b0;


 localparam period = 20;  
 fma1 UUT1(.*);
-fma2 UUT2(.ReadData1M(ReadData1E), .ReadData2M(ReadData2E), .ReadData3M(ReadData3E), .FrmM(FrmE),
-			 .aligncntM(aligncntE), .rM(rE), .sM(sE),
-			.tM(tE),	.normcntM(normcntE), .aeM(aeE), .bsM(bsE),.killprodM(killprodE),
-			.xzeroM(xzeroE),	.yzeroM(yzeroE),.zzeroM(zzeroE),.xdenormM(xdenormE),.ydenormM(ydenormE),
-			.zdenormM(zdenormE),.xinfM(xinfE),.yinfM(yinfE),.zinfM(zinfE),.xnanM(xnanE),.ynanM(ynanE),.znanM(znanE),
-			.nanM(nanE),.sumshiftM(sumshiftE),.sumshiftzeroM(sumshiftzeroE), .prodinfM(prodinfE), .*);
+fma2 UUT2(.ReadData1M(ReadData1E), .ReadData2M(ReadData2E), .ReadData3M(ReadData3E), .FrmM(FrmE), .ProdManM(ProdManE),
+			.AlignedAddendM(AlignedAddendE), .ProdExpM(ProdExpE), .AddendStickyM(AddendStickyE),.KillProdM(KillProdE),
+			.XZeroM(XZeroE),.YZeroM(YZeroE),.ZZeroM(ZZeroE),.XInfM(XInfE),.YInfM(YInfE),.ZInfM(ZInfE),.XNaNM(XNaNE),.YNaNM(YNaNE),.ZNaNM(ZNaNE), .*);


 initial 
--- a/wally-pipelined/src/fpu/add.sv
+++ b/wally-pipelined/src/fpu/add.sv
@ -1,65 +0,0 @@
-////////////////////////////////////////////////////////////////////////////////
-//
-// Block Name:	add.v
-// Author:		David Harris
-// Date:		11/12/1995
-//
-// Block Description:
-//       This block performs the addition of the product and addend.   It also
-//   contains logic necessary to adjust the signs for effective subtracts 
-//   and negative results. 
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-module add(rM, sM, tM, sum,
-		   negsum, invz, selsum1, negsum0, negsum1, killprodM);
-////////////////////////////////////////////////////////////////////////////////
-
-	input logic 		[105:0]		rM;     			// partial product 1
-	input logic 		[105:0]		sM;              // partial product 2
-	input logic 		[163:0]		tM;             	// aligned addend 
-	input logic					invz;       	// invert addend
-	input logic 					selsum1;    	// select +1 mode of compound adder 
-	input logic					killprodM;    	// z >> product
-	input logic					negsum;      	// Negate sum 
-	output logic		[163:0]		sum;         	// sum
-	output logic					negsum0;     	// sum was negative in +0 mode
-	output logic					negsum1;     	// sum was negative in +1 mode 
-
-	// Internal nodes
-
-	wire		[105:0]		r2;				// partial product possibly zeroed out
-	wire		[105:0]		s2;				// partial product possibly zeroed out
-	wire		[164:0]		t2;				// addend after inversion if necessary
-	wire		[164:0] 	sum0;			// sum of compound adder +0 mode
-	wire		[164:0] 	sum1;			// sum of compound adder +1 mode
-	wire		[163:0] 	prodshifted;			// sum of compound adder +1 mode
-	wire		[164:0] 	tmp;			// sum of compound adder +1 mode
-
-	// Invert addend if z'sM sign is diffrent from the product'sM sign
-
-	assign t2 = invz ? ~{1'b0,tM} : {1'b0,tM};
-	
-	// Zero out product if Z >> product or product really should be 	
-
-	assign r2 = killprodM ? 106'b0 : rM;
-	assign s2 = killprodM ? 106'b0 : sM;
-
-	//***replace this with a more structural cpa that synthisises better
-	// Compound adder
-	// Consists of 3:2 CSA followed by long compound CPA
-	//assign prodshifted = killprodM ? 0 : {56'b0, r2+s2, 2'b0};
-	//assign tmp = ({{57{r2[105]}},r2, 2'b0} + {{57{s2[105]}},s2, 2'b0});
-	assign sum0 = t2 + 164'b0 + {57'b0, r2+s2, 2'b0};
-	assign sum1 = t2 + 164'b1 + {57'b0, r2+s2, 2'b0}; // +1 from invert of z above
-	
-	// Check sign bits in +0/1 modes 
-	assign negsum0 = sum0[164];
-	assign negsum1 = sum1[164];
-
-	// Mux proper result (+Oil mode and inversion) using 4:1 mux
- 	//assign sumzero = |sum;
-	assign sum = selsum1 ? (negsum ? -sum1[163:0] : sum1[163:0]) : (negsum ? -sum0[163:0] : sum0[163:0]);
-	
-endmodule
-
--- a/wally-pipelined/src/fpu/align.sv
+++ b/wally-pipelined/src/fpu/align.sv
@ -1,88 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	align.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block implements the alignment shifter.   It is responsible for
-//   adjusting the fraction portion of the addend relative to the fraction
-//   produced in the multiplier array.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module align(zman, aligncntE, xzeroE, yzeroE, zzeroE, zdenormE, tE, bsE, 
-             killprodE,  sumshiftE, sumshiftzeroE);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic 		[51:0]		zman;		// Fraction of addend z
-	input logic 		[12:0]		aligncntE;	// signed alignment count d (compared with $signed below)
-	input logic				xzeroE;		// Input X = 0
-	input logic                  		yzeroE;          // Input Y = 0 
-	input logic                  		zzeroE;          // Input Z = 0
-	input logic                  		zdenormE;        // Input Z is denormalized
-	output logic    	[163:0]    	tE;              // aligned addend (54 bits left of bpt)
-	output logic          		bsE;           	// sticky bit of addend: OR of the 52 bits shifted out below tE
-	output logic          		killprodE;    	// Z >> product, or X or Y is zero
-	output logic		[8:0]		sumshiftE;	// right-shift amount applied to the addend in each case below
-	output logic				sumshiftzeroE;	// 1 only in the saturated addend-anchored case (sumshiftE = 0)
-
-	// Internal nodes
- 
-	reg       	[215:0]   	shift;				// aligned addend from shifter
-	logic 		[12:0]		tmp;	// 57 - aligncntE; NOTE(review): only assigned in the two middle cases of the always_comb
-	
-
-
-	always_comb 
-		begin
-
-		// Default to clearing sticky bits 
-		bsE = 0;
-
-		// By default the product is killed only when X or Y is zero; the last case below forces it to 1
-		killprodE = xzeroE | yzeroE;
-		// d = aligncntE
-		// p = 53
-		//***try reducing this hardware to use one shifter
-		if ($signed(aligncntE) <= $signed(-(13'd105))) begin //d<=-2p+1
-			//product anchored case with saturated shift
-			sumshiftE = 163;	// 3p+4	
-			sumshiftzeroE = 0;
-			shift = {1'b1,zman,163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else if($signed(aligncntE) <= $signed(13'd2))  begin // -2p+1<d<=2
-			// product anchored or cancellation
-			tmp = 13'd57-aligncntE;
-			sumshiftE = tmp[8:0]; // low 9 bits of 57 - d (= p + 4 - d)
-			sumshiftzeroE = 0;
-			shift = {~zdenormE,zman,163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else if ($signed(aligncntE)<=$signed(13'd55))  begin // 2 < d <= p+2
-			// addend anchored case (same computation as the previous case)
-			// NOTE(review): the constant used to be 56; the original author flagged this as suspicious
-			tmp = 13'd57-aligncntE;
-			sumshiftE = tmp[8:0]; 
-			sumshiftzeroE = 0;
-			shift = {~zdenormE,zman, 163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-
-		end else begin                 	// d >= p+3
-			// addend anchored case with saturated shift
-			sumshiftE = 0;	
-			sumshiftzeroE = 1;		
-			shift = {~zdenormE,zman, 163'b0} >> sumshiftE;
-			tE = zzeroE ? 0 : {shift[215:52]};
-			bsE = |(shift[51:0]);
-			killprodE = 1;
-
-		end 
-	end
-
-endmodule
-
--- a/wally-pipelined/src/fpu/booth.sv
+++ b/wally-pipelined/src/fpu/booth.sv
@ -1,53 +0,0 @@
-module booth(xExt, choose, add1, e, pp); 
-// Selects the partial product pp as 0, +/-1x or +/-2x of xExt according to choose.
-    
-	input logic 		[53:0]		xExt;				// multiplicand	xExt
-	input logic		[2:0]		choose;				// bits needed to choose which encoding
-	output logic		[1:0]       	add1;				// value to add back because negative encodings use ~xExt (0, 1 or 2)
-    output logic                  e;	// 1 for every encoding with choose[2] set
-	output logic		[54:0]		pp;				//	the resultant encoding
-    
-    logic [54:0] temp;	// declared but never used in this module
-    logic [53:0] negx;	// bitwise inverse of xExt
-    //logic temp;
-
-    assign negx = ~xExt;
-
-    always_comb
-    case (choose)
-        3'b000 : pp = 55'b0;   //  0
-        3'b001 : pp = {1'b0, xExt};  //  1
-        3'b010 : pp = {1'b0, xExt};  //  1
-        3'b011 : pp = {xExt, 1'b0};  //  2
-        3'b100 : pp = {negx, 1'b0};  // -2 (inverted and shifted; add1 = 2 completes the negation)
-        3'b101 : pp = {1'b1, negx};  // -1 (inverted; add1 = 1 completes the negation)
-        3'b110 : pp = {1'b1, negx};  // -1 (inverted; add1 = 1 completes the negation)
-        3'b111 : pp = '1;  //  -0 (all ones; add1 = 1 brings it back to zero)
-    endcase
-
-    always_comb
-    case (choose)
-        3'b000 : e = 0;   //  0
-        3'b001 : e = 0;  //  1
-        3'b010 : e = 0;  //  1
-        3'b011 : e = 0;  //  2
-        3'b100 : e = 1;  // -2
-        3'b101 : e = 1;  // -1
-        3'b110 : e = 1;  // -1
-        3'b111 : e = 1;  //  -0
-    endcase
-    // assign add1 = (choose[2] == 1'b1) ? ((choose[1:0] == 2'b11) ? 1'b0 : 1'b1) : 1'b0;
-    // assign add1 = choose[2];
-    always_comb
-    case (choose)
-        3'b000 : add1 = 2'b0;   //  0
-        3'b001 : add1 = 2'b0;  //  1
-        3'b010 : add1 = 2'b0;  //  1
-        3'b011 : add1 = 2'b0;  //  2
-        3'b100 : add1 = 2'b10;  // -2
-        3'b101 : add1 = 2'b1;  // -1
-        3'b110 : add1 = 2'b1;  // -1
-        3'b111 : add1 = 2'b1;  //  -0
-    endcase
-
-endmodule
--- a/wally-pipelined/src/fpu/compressors.sv
+++ b/wally-pipelined/src/fpu/compressors.sv
@ -1,93 +0,0 @@
-// //***breaks lint with warnings like: %Warning-UNOPTFLAT:      Example path: src/fpu/compressors.sv:37:  ASSIGNW
-// //%Warning-UNOPTFLAT:      Example path: src/fpu/compressors.sv:32:  wallypipelinedsoc.hart.fpu.fma1.multiply.genblk5[0].add4.cout
-
-// module add3comp2(a, b, c, carry, sum); 
-// /////////////////////////////////////////////////////////////////////////////
-// //look into diffrent implementations of the compressors?
-    
-//     parameter BITS = 4;
-// 	input logic 		[BITS-1:0]		a;
-// 	input logic		[BITS-1:0]		b;
-// 	input logic		[BITS-1:0]    	c;
-//     output logic      [BITS-1:0]      carry;
-// 	output logic		[BITS-1:0]		sum;
-//     genvar i;
-
-//     generate
-//         for(i= 0; i<BITS; i=i+1) begin
-//             sng3comp2 add0(a[i], b[i], c[i], carry[i], sum[i]);
-//         end
-//     endgenerate
-
-// endmodule
-
-// module add4comp2(a, b, c, d, carry, sum); 
-// /////////////////////////////////////////////////////////////////////////////
-    
-//     parameter BITS = 4;
-// 	input logic 		[BITS-1:0]		a;
-// 	input logic		[BITS-1:0]		b;
-// 	input logic		[BITS-1:0]    	c;
-// 	input logic		[BITS-1:0]    	d;
-//     output logic      [BITS:0]      carry;
-// 	output logic		[BITS-1:0]		sum;
-
-//     logic       [BITS-1:0]      cout;
-//     logic                       carryTmp;
-//     genvar i;
-
-
-//     sng4comp2 add0(a[0], b[0], c[0], d[0], 1'b0, cout[0], carry[0], sum[0]);
-
-//     generate
-//         for(i= 1; i<BITS-1; i=i+1) begin
-//             sng4comp2 add1(a[i], b[i], c[i], d[i], cout[i-1], cout[i], carry[i], sum[i]);
-//         end
-//     endgenerate
-
-
-//     sng4comp2 add2(a[BITS-1], b[BITS-1], c[BITS-1], d[BITS-1], cout[BITS-2], cout[BITS-1], carryTmp, sum[BITS-1]);
-
-//     assign carry[BITS-1] = carryTmp & cout[BITS-1];
-//     assign carry[BITS] = carryTmp ^ cout[BITS-1];
-
-// endmodule
-
-// module sng3comp2(a, b, c, carry, sum); 
-// /////////////////////////////////////////////////////////////////////////////
-// //look into diffrent implementations of the compressors?
-    
-// 	input logic 				a;
-// 	input logic				b;
-// 	input logic		       	c;
-//     output logic              carry;
-// 	output logic				sum;
-    
-//     logic               axorb;
-
-//     assign axorb = a ^ b;
-//     assign sum = axorb ^ c;
-
-//     assign carry = axorb ? c : a;
-
-// endmodule
-
-// module sng4comp2(a, b, c, d, cin, cout, carry, sum); 
-// /////////////////////////////////////////////////////////////////////////////
-// //look into pass gate 4:2 counters?
-    
-// 	input logic 				a;
-// 	input logic				b;
-// 	input logic		       	c;
-//     input logic               d;
-//     input logic               cin;
-//     output logic              cout;
-//     output logic              carry;
-// 	output logic				sum;
-    
-//     logic               TmpSum;
-
-//     sng3comp2 add1(.carry(cout), .sum(TmpSum),.*);
-//     sng3comp2 add2(.a(TmpSum), .b(d), .c(cin), .*);
-
-// endmodule
--- a/wally-pipelined/src/fpu/expgen1.sv
+++ b/wally-pipelined/src/fpu/expgen1.sv
@ -1,90 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	expgen.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-//   Block Description:
-//   This block implements the exponent path of the FMAC. It performs the
-//   following operations:
-//
-//   1) Compute exponent of multiply.  
-//   2) Compare multiply and add exponents to generate alignment shift count
-//   3) Adjust exponent based on normalization
-//   4)  Increment exponent based on postrounding renormalization
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module expgen1(xexp, yexp, zexp, xzeroE, yzeroE,
-			   xdenormE, ydenormE, zdenormE, 
-			   aligncntE, prodof, aeE);
-// Computes the product exponent aeE, its overflow indication prodof and the alignment count aligncntE.
-  
-	input logic     	[62:52]    	xexp;           	// Exponent of multiplicand x
-	input logic     	[62:52]  	yexp;         		// Exponent of multiplicand y
-	input logic     	[62:52]  	zexp;           	// Exponent of addend z
-	input logic     			xdenormE;		// X is denorm
-	input logic     			ydenormE;		// Y is denorm
-	input logic     			zdenormE;		// Z is denorm
-	input logic     			xzeroE;		// X is zero
-	input logic     			yzeroE;		// Y is zero
-	output logic		[12:0]   	aligncntE;       // shift count for alignment shifter
-	output logic			prodof;         // X*Y exponent out of bounds 
-	output logic		[12:0]		aeE;				//exponent of multiply
-
-	//   Internal nodes (none of the wires below is driven or read in this module)
-
-
-	wire 	[12:0]			aligncnt0;		// Shift count for alignment
-	wire 	[12:0]			aligncnt1;		// Shift count for alignment
-	wire 	[12:0]			be;				// Exponent of multiply
-	wire 	[12:0]			de1;			// Normalized exponent
-	wire 	[12:0]			de;				// Normalized exponent
-	wire 	[10:0]			infinityres;	// Infinity or max number
-	wire 	[10:0]			nanres;          //	Nan propagated or generated
-	wire 	[10:0]			specialres;  //	Exceptional case result
-
-	//   Compute exponent of multiply
-	// Note that the exponent does not have to be incremented on a postrounding
-	//   normalization of X because the mantissa was already increased.   Report
-	//   if exponent is out of bounds 
-
-
-	// xexp + yexp - 1023 (one bias removed), forced to 0 when X or Y is zero
-	assign aeE = xzeroE|yzeroE ? 0 : {2'b0,xexp} + {2'b0,yexp} - 13'd1023;
-
-	// out of bounds when aeE exceeds 2046 and bit 12 (treated as the sign) is clear
-	assign prodof = (aeE > 2046 && ~aeE[12]);
-
-	// Compute alignment shift count
-	// Adjust for postrounding normalization of Z.
-	// This should not increase the critical path because the time to
-	// check if a round overflows is shorter than the actual round and
-	// is masked by the bypass mux and two 10 bit adder delays.
-	// assign aligncnt0 = - 1 + ~xdenormE + ~ydenormE - ~zdenormE;
-	// assign aligncnt1 = - 1 + {12'b0,~xdenormE} + {12'b0,~ydenormE} - {12'b0,~zdenormE};
-	assign aligncntE = {2'b0,zexp} -aeE - 1 + {12'b0,~xdenormE} + {12'b0,~ydenormE} - {12'b0,~zdenormE};
-	//assign aligncntE = zexp -aeE - 1 + ~xdenormE + ~ydenormE - ~zdenormE;
-	//assign aligncntE = zexp - aeE;// KEP use all of aeE
-
-	// Select exponent (usually from product except in case of huge addend)
-
-	//assign be = zexpsel ? zexp : aeE;
-
-	// Adjust exponent based on normalization
-	// A compound adder takes care of the case of post-rounding normalization
-	// requiring an extra increment
-	 
-	//assign de0 = sumzero ? 13'b0 : be + normcnt + 2;
-	// assign de1 = sumzero ? 13'b0 : be + normcnt + 2;
-	 
-
-	// bypass occurs before rounding or taking early results 
-	
-	//assign wbypass = de0[10:0];
-	
-	// In a non-critical special mux, we combine the early result from other
-	// FPU blocks with the results of exceptional conditions.  Overflow
-	// produces either infinity or the largest finite number, depending on the
-	// rounding mode.  NaNs are propagated or generated.
-endmodule
-
-
--- a/wally-pipelined/src/fpu/expgen2.sv
+++ b/wally-pipelined/src/fpu/expgen2.sv
@ -1,108 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	expgen.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-//   Block Description:
-//   This block implements the exponent path of the FMAC. It performs the
-//   following operations:
-//
-//   1) Compute exponent of multiply.  
-//   2) Compare multiply and add exponents to generate alignment shift count
-//   3) Adjust exponent based on normalization
-//   4)  Increment exponent based on postrounding renormalization
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module expgen2(xexp, yexp, zexp,
-			   sumzero, resultdenorm, infinity, 
-			   FmaFlagsM, inf, expplus1,
-			   nanM, de0, xnanM, ynanM, znanM,  specialsel,
-			    wexp,
-			   sumof, sumuf);
-// Produces the result exponent wexp and the exponent overflow/underflow indications sumof and sumuf.
-  
-	input logic     	[62:52]    	xexp;           	// Exponent of multiplicand x
-	input logic     	[62:52]  	yexp;         		// Exponent of multiplicand y
-	input logic     	[62:52]  	zexp;           	// Exponent of addend z
-	input logic     			sumzero;     	// sum exactly equals zero 
-	input logic     			resultdenorm;  // forces the exponent de to 0 (see de below)
-	input logic     			infinity;    	// generate infinity on overflow 
-	input logic     	[4:0]	FmaFlagsM;     	// FMA flags; bits [4] invalid, [2] overflow and [1] underflow are read here
-	input logic     			inf;			// Some input is infinity
-	input logic     			nanM;			// Some input is NaN
-	input logic     	[12:0]		de0;			// exponent before the denorm/zero override that produces de
-	input logic     			xnanM;			// X is NaN
-	input logic    			ynanM;			// Y is NaN
-	input logic     			znanM;			// Z is NaN 
-	input logic				expplus1;	// added to de[10:0] when forming wexp
-	input logic     			specialsel;  	// Select special result
-	output logic		[62:52]    	wexp;           	// Exponent of result
-	output logic				sumof;          // X*Y+Z exponent out of bounds 
-	output logic				sumuf;         // X*Y+Z exponent underflows 
-
-	//   Internal nodes (aligncnt0, aligncnt1, be and de1 are never driven or read in this module)
-
-
-	wire 	[12:0]			aligncnt0;		// Shift count for alignment
-	wire 	[12:0]			aligncnt1;		// Shift count for alignment
-	wire 	[12:0]			be;				// Exponent of multiply
-	wire 	[12:0]			de1;			// Normalized exponent
-	wire 	[12:0]			de;				// Normalized exponent
-	wire 	[10:0]			infinityres;	// Infinity or max number
-	wire 	[10:0]			nanres;          //	Nan propagated or generated
-	wire 	[10:0]			specialres;  //	Exceptional case result
-
-	//   Compute exponent of multiply
-	// Note that the exponent does not have to be incremented on a postrounding
-	//   normalization of X because the mantissa was already increased.   Report
-	//   if exponent is out of bounds 
-
-	// Select exponent (usually from product except in case of huge addend)
-
-	//assign be = zexpsel ? zexp : ae;
-
-	// Adjust exponent based on normalization
-	// A compound adder takes care of the case of post-rounding normalization
-	// requiring an extra increment
-	 
-	//assign de0 = sumzero ? 13'b0 : be + normcnt + 2;
-	// assign de1 = sumzero ? 13'b0 : be + normcnt + 2;
-	 
-	
-	// check for exponent out of bounds after add 
-	
-	assign de = resultdenorm | sumzero ? 0 : de0;
-	assign sumof = ~de[12] && de > 2046;	// bit 12 clear and above 2046
-	assign sumuf = de == 0  && ~sumzero && ~resultdenorm;	// i.e. de0 itself is 0 without the override
-
-	// bypass occurs before rounding or taking early results 
-	
-	//assign wbypass = de0[10:0];
-	
-	// In a non-critical special mux, we combine the early result from other
-	// FPU blocks with the results of exceptional conditions.  Overflow
-	// produces either infinity or the largest finite number, depending on the
-	// rounding mode.  NaNs are propagated or generated.
-
-	assign specialres = FmaFlagsM[4] | nanM ? nanres : // invalid
-					FmaFlagsM[2] ? infinityres : 	//overflow
-					inf ? 11'b11111111111 :
-					FmaFlagsM[1] ? 11'b0 : 11'bx; //underflow, otherwise don't care
-
-	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;
-
-	// IEEE 754-2008 section 6.2.3 states:
-	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
-	// identical to the payload of one of the input NaNs if representable in the destination
-	// format. This standard does not specify which of the input NaNs will provide the payload."
-	assign nanres = xnanM ? xexp : (ynanM ? yexp : (znanM? zexp : 11'b11111111111));
-
-	// A mux selects the early result from other FPU blocks or the 
-	// normalized FMAC result.   Special cases are also detected. 
-	
-	assign wexp = specialsel ? specialres[10:0] : de[10:0] + {10'b0,expplus1}; 
-endmodule
-
-
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@ -168,8 +168,8 @@ module fctrl (
      //fma/mult	
      //  fmadd  = ?000
      //  fmsub  = ?001
-      //  fnmadd = ?010
-      //  fnmsub = ?011
+      //  fnmsub = ?010	-(a*b)+c
+      //  fnmadd = ?011 -(a*b)-c
      //  fmul   = ?100
      //		  {?, is mul, is negitive, is sub}
      3'b010 : begin FOpCtrlD = {1'b0, OpD[4:2]}; FInput2UsedD = 1'b1; FInput3UsedD = ~OpD[4]; end
--- a/wally-pipelined/src/fpu/flag1.sv
+++ b/wally-pipelined/src/fpu/flag1.sv
@ -1,34 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	flag.v
-// Author:		David Harris
-// Date:		12/6/1995
-//
-// Block Description:
-//       This block generates the flags: invalid, overflow, underflow, inexact. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module flag1(xnanE, ynanE, znanE, prodof, prodinfE, nanE);
-// Derives nanE (any input is NaN) and prodinfE (product exponent overflow with neither X nor Y NaN).
-
-	input logic                  		xnanE;        	// X is NaN 
-	input logic                  		ynanE;        	// Y is NaN 
-	input logic                 		znanE;       	// Z is NaN
-	input logic                  		prodof;         // X*Y overflows exponent
-	output logic				nanE;		// Some	source is NaN
- 
-	//   Remaining output (declared after the other ports)
-
-	output logic				prodinfE;	// X*Y larger than max possible
-
-	// If any input is NaN, propagate the NaN 
-
-	assign nanE = xnanE || ynanE || znanE;
-
-
-	// Generate infinity checks
-
-	assign prodinfE = prodof && ~xnanE && ~ynanE;
-
-
-endmodule
--- a/wally-pipelined/src/fpu/flag2.sv
+++ b/wally-pipelined/src/fpu/flag2.sv
@ -1,80 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	flag.v
-// Author:		David Harris
-// Date:		12/6/1995
-//
-// Block Description:
-//       This block generates the flags: invalid, overflow, underflow, inexact. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module flag2(xsign,ysign,zsign, xnanM, ynanM, znanM, xinfM, yinfM, zinfM, sumof, sumuf,
-			 xzeroM, yzeroM, zzeroM, vbits, killprodM,
-			 inf, nanM, FmaFlagsM,sticky,prodinfM);
-// Generates FmaFlagsM ([4] invalid, [3] divide by zero, [2] overflow, [1] underflow, [0] inexact) and inf.
-
-	input logic                  		xnanM;        	// X is NaN 
-	input logic                  		ynanM;        	// Y is NaN 
-	input logic                 		znanM;       	// Z is NaN 
-	input logic				xsign; 		// Sign of x
-	input logic				ysign; 		// Sign of y
-	input logic				zsign; 		// Sign of z
-	input logic                  		sticky;        	// sticky bit; contributes to the inexact flag
-    input logic                       prodinfM;	// product overflow indication; used in the underflow and invalid terms
-	input logic                  		xinfM;        	// X is Inf
-	input logic                 		yinfM;       	// Y is Inf 
-	input logic                  		zinfM;        	// Z is Inf
-	input logic                  		sumof;          // X*Y + z overflows exponent
-	input logic                  		sumuf;          // X*Y + z underflows exponent
-	input logic				xzeroM;		// x = 0
-	input logic				yzeroM;		// y = 0
-	input logic				zzeroM;		// z = 0
-	input logic				killprodM;	// used only in the second underflow term below
-	input logic     	[1:0]  		vbits;		// R and S bits of result
-	output logic				inf;		// Some	source is Inf, or the sum overflowed (suminf)
-	input logic				nanM;		// Some	source is NaN
-	output logic		[4:0]	FmaFlagsM;
- 
-	//   Internal nodes
-
-logic suminf;	// sum exponent overflow with no NaN input
-
-	// Same with infinity (inf - inf and 0 * inf don't propagate inf
-	//  but it's ok because illegal op takes higher precedence)
-
-	assign inf= xinfM || yinfM || zinfM || suminf;//KEP added suminf 
-	//assign inf= xinfM || yinfM || zinfM;//original
-
-	assign suminf = sumof && ~xnanM && ~ynanM && ~znanM;
-
-
-	// Set the overflow flag for the following cases:
-	//   1) Rounded multiply result would be out of bounds
-	//   2) Rounded add result would be out of bounds
-
-	assign FmaFlagsM[2] = suminf && ~inf;	// NOTE(review): inf already ORs in suminf, so this expression is always 0
-
-	// Set the underflow  flag for the following cases:
-	//   1) Any input is denormalized
-	//   2) Output would be denormalized or smaller
-
-	assign FmaFlagsM[1] = (sumuf && ~inf && ~prodinfM && ~nanM) || (killprodM & zzeroM & ~(yzeroM | xzeroM));
-
-	// Set the inexact flag for the following cases:
-	//   1) Multiplication inexact
-	//   2) Addition  inexact
-	// One of these cases occurred if the R or S bit is set
-
-	assign FmaFlagsM[0] = (vbits[0] || vbits[1] ||sticky  || suminf) && ~(inf || nanM);	// NOTE(review): the suminf term is masked by ~inf
-
-	// Set invalid flag for following cases:
-	//   1) Inf - Inf
-	//   2) 0 * Inf
-	//   3) output = NaN (this is not part of the IEEE spec,  only 486 proj)
-
-	assign FmaFlagsM[4] = (xinfM || yinfM || prodinfM) && zinfM && (xsign ^ ysign ^ zsign) ||
-					   xzeroM && yinfM || yzeroM && xinfM;// KEP remove case 3) above
-
-	assign FmaFlagsM[3] = 0; // divide by zero flag
-
-endmodule
--- a/wally-pipelined/src/fpu/fma1.sv
+++ b/wally-pipelined/src/fpu/fma1.sv
@ -1,103 +1,141 @@
- ////////////////////////////////////////////////////////////////////////////////
-// Block Name:	fmac.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This is the top level block of a floating-point  multiply/accumulate
-//   unit(FMAC).   It instantiates the following sub-blocks:
-//
-//    array     Booth encoding, partial product generation, product summation
-//    expgen    Exponent summation, compare, and adjust
-//    align     Alignment shifter
-//    add       Carry-save adder for accumulate, carry propagate adder
-//    lza       Leading zero anticipator to control normalization shifter
-//    normalize Normalization shifter
-//    round     Rounding of result
-//    exception Handles exceptional cases
-//    bypass    Handles bypass of result to FInput1E or FInput3E inputs
-//    sign      One bit sign handling block 
-//    special   Catch special cases (inputs = 0  / infinity /  etc.) 
-//
-//   The FMAC computes FmaResultM=FInput1E*FInput2E+FInput3E, rounded with the mode specified by
-//   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the FInput1E or FInput3E inputs for use on the next cycle.  In addition,  four signals
-//   are produced: trap, overflow, underflow, and inexact.  Trap indicates
-//   an infinity, NaN, or denormalized number to be handled in software;
-//   the other three signals are IEEE flags.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module fma1(FInput1E, FInput2E, FInput3E, FrmE,  
-			rE, sE, tE, bsE, killprodE, sumshiftE, sumshiftzeroE,  aligncntE, aeE
-			, xzeroE, yzeroE, zzeroE, xnanE,ynanE, znanE, xdenormE, ydenormE, zdenormE,
-			xinfE, yinfE, zinfE, nanE, prodinfE);
-/////////////////////////////////////////////////////////////////////////////
+module fma1(	// FMA first stage: product exponent/mantissa, special-value flags and addend alignment
 
-	input logic 		[63:0]		FInput1E;		// input 1
-	input logic		[63:0]		FInput2E;     // input 2 
-	input logic 		[63:0]		FInput3E;     // input 3
-	input logic 		[2:0]	 	FrmE;          	// Rounding mode
-	output logic 		[12:0]		aligncntE;    	// status flags
-	output logic 		[105:0]		rE; 				// one result of partial product sum
-	output logic 		[105:0]		sE; 				// other result of partial products
-	output logic 		[163:0]		tE;				// output logic of alignment shifter	
-	output logic 		[12:0]		aeE; 		// multiplier expoent
-	output logic 					bsE;				// sticky bit of addend
-	output logic 					killprodE; 		// FInput3E >> product
-	output logic					xzeroE;
-	output logic					yzeroE;
-	output logic					zzeroE;
-	output logic					xdenormE;
-	output logic					ydenormE;
-	output logic					zdenormE;
-	output logic					xinfE;
-	output logic					yinfE;
-	output logic					zinfE;
-	output logic					xnanE;
-	output logic					ynanE;
-	output logic					znanE;
-	output logic					nanE;
-	output logic					prodinfE;
-	output logic			[8:0]		sumshiftE;
-	output logic					sumshiftzeroE;
+	input logic 	[63:0]		FInput1E,	// X: first multiplicand
+	input logic		[63:0]		FInput2E,	// Y: second multiplicand
+	input logic 	[63:0]		FInput3E,	// Z: addend
+	input logic 	[3:0]		FOpCtrlE,	// bit 2 set forces the addend to zero (FMUL)
+	output logic 	[105:0]		ProdManE,	// product of the two 53-bit significands
+	output logic 	[161:0]		AlignedAddendE,	// addend significand aligned to the product
+	output logic 	[12:0]		ProdExpE,	// exponent of the product (0 if X or Y is zero)
+	output logic 				AddendStickyE,	// OR of the addend bits shifted out (or of the killed operand)
+	output logic 				KillProdE,	// product is too small to affect the sum
+	output logic				XZeroE, YZeroE, ZZeroE,
+	output logic				XInfE, YInfE, ZInfE,
+	output logic				XNaNE, YNaNE, ZNaNE);

-// Internal nodes
- 
-//	output logic 		[12:0]		aligncntE; 		// shift count for alignment
+	logic [51:0] 	XMan,YMan,ZMan;
+	logic [10:0] 	XExp,YExp,ZExp;
+	logic 		 	XSgn,YSgn,ZSgn;	// split out for readability; not used further in this module
+	logic [12:0]	AlignCnt;
+	logic [211:0] 	Shift;
+	logic			XDenormE, YDenormE, ZDenormE;
+	logic [63:0]	FInput3E2;
+
+	// Set addend to zero if FMUL instruction
+  	assign FInput3E2 = FOpCtrlE[2] ? 64'b0 : FInput3E;
+
+	// split inputs into the sign bit, mantissa, and exponent for readability
+	assign XSgn = FInput1E[63];
+	assign YSgn = FInput2E[63];
+	assign ZSgn = FInput3E2[63];
+
+	assign XExp = FInput1E[62:52];
+	assign YExp = FInput2E[62:52];
+	assign ZExp = FInput3E2[62:52];
+
+	assign XMan = FInput1E[51:0];
+	assign YMan = FInput2E[51:0];
+	assign ZMan = FInput3E2[51:0];


-	logic 					prodof; 		// FInput1E*FInput2E out of range
+
+	// determine if an input is a special value
+	assign XNaNE = &FInput1E[62:52] && |FInput1E[51:0]; 
+	assign YNaNE = &FInput2E[62:52] && |FInput2E[51:0]; 
+	assign ZNaNE = &FInput3E2[62:52] && |FInput3E2[51:0];
+
+	assign XDenormE = ~(|FInput1E[62:52]) && |FInput1E[51:0]; 
+	assign YDenormE = ~(|FInput2E[62:52]) && |FInput2E[51:0]; 
+	assign ZDenormE = ~(|FInput3E2[62:52]) && |FInput3E2[51:0];
+
+	assign XInfE = &FInput1E[62:52] && ~(|FInput1E[51:0]); 
+	assign YInfE = &FInput2E[62:52] && ~(|FInput2E[51:0]); 
+	assign ZInfE = &FInput3E2[62:52] && ~(|FInput3E2[51:0]);
+
+	assign XZeroE = ~(|FInput1E[62:0]);
+	assign YZeroE = ~(|FInput2E[62:0]);
+	assign ZZeroE = ~(|FInput3E2[62:0]);




+	// Calculate the product's exponent
+	//		- When multiplying two fp numbers, add the exponents
+	// 		- Subtract 3ff to remove one of the biases (XExp + YExp has two biases, one from each exponent)
+	//		- Denormal numbers have an exponent value of 1, however they are 
+	//		  represented with an exponent of 0. add one if there is a denormal number
+	assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 : 
+				 {2'b0, XExp} + {2'b0, YExp} - 13'h3ff + {12'b0, XDenormE} + {12'b0, YDenormE};
+
+	// Calculate the product's mantissa
+	//		- Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
+	assign ProdManE = {53'b0,~(XDenormE|XZeroE),XMan}  *  {53'b0,~(YDenormE|YZeroE),YMan};




+	// determine the shift count for alignment
+	//		- negative means Z is larger, so shift Z left
+	//		- positive means the product is larger, so shift Z right
+	//		- Denormal numbers have an exponent value of 1, however they are 
+	//		  represented with an exponent of 0. add one to the exponent if it is a denormal number
+	assign AlignCnt = ProdExpE - {2'b0, ZExp} - {12'b0, ZDenormE};
+
+	// Alignment shifter
+
+	// Default Addition without shifting
+	// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+	//						 |1'b0| addend |
+
+	// the 1'b0 before the addend is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
+	
+	always_comb 
+		begin
+		// Set default values. Every variable written in this block gets one so that no
+		// path leaves it unassigned (an unassigned path in always_comb infers a latch).
+		AddendStickyE = 0;
+		KillProdE = 0;
+		Shift = 212'b0;	// only the two shifting cases below overwrite this
+		// If the product is too small to affect the sum, kill the product
+
+		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+		//	| addend |
+		if ($signed(AlignCnt) <= $signed(-13'd56)) begin
+			KillProdE = 1;
+			AlignedAddendE = {107'b0, ~(ZZeroE|ZDenormE),ZMan,2'b0};
+			AddendStickyE = ~(XZeroE|YZeroE);	// sticky stands in for the killed (non-zero) product
+
+		// If the Addend is shifted left (negative AlignCnt)
+
+		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+		//					| addend |
+		end else if($signed(AlignCnt) <= $signed(13'd0))  begin
+			Shift = {55'b0, ~(ZZeroE|ZDenormE),ZMan, 104'b0} << -AlignCnt;
+			AlignedAddendE = Shift[211:50];
+			AddendStickyE = |(Shift[49:0]);
+
+		// If the Addend is shifted right (positive AlignCnt)
+
+		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+		//									| addend |
+		end else if ($signed(AlignCnt)<=$signed(13'd105))  begin
+			Shift = {55'b0, ~(ZZeroE|ZDenormE),ZMan, 104'b0} >> AlignCnt;
+			AlignedAddendE = Shift[211:50];
+			AddendStickyE = |(Shift[49:0]);
+
+		// If the addend is too small to affect the addition		
+		//		- The addend has to shift two past the end of the product to be considered too small
+		//		- The 2 extra bits are needed for rounding
+
+		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
+		//														| addend |
+		end else begin
+			AlignedAddendE = 162'b0;
+			AddendStickyE = ~ZZeroE;	// a non-zero addend survives only as sticky


-
-
-
-//   Instantiate fraction datapath
-
-	multiply		multiply(.xman(FInput1E[51:0]), .yman(FInput2E[51:0]), .*);
-	align			align(.zman(FInput3E[51:0]),.*);
-
-// Instantiate exponent datapath
-
-	expgen1			expgen1(.xexp(FInput1E[62:52]),.yexp(FInput2E[62:52]),.zexp(FInput3E[62:52]),.*);
-// Instantiate special case detection across datapath & exponent path 
-
-	special			special(.*);
-
-
-// Instantiate control output logic
- 
-flag1				flag1(.*); 
+		end 
+	end

 endmodule

--- a/wally-pipelined/src/fpu/fma2.sv
+++ b/wally-pipelined/src/fpu/fma2.sv
@ -1,107 +1,110 @@
- ////////////////////////////////////////////////////////////////////////////////
-// Block Name:	fmac.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This is the top level block of a floating-point  multiply/accumulate
-//   unit(FMAC).   It instantiates the following sub-blocks:
-//
-//    array     Booth encoding, partial product generation, product summation
-//    expgen    Mxponent summation, compare, and adjust
-//    align     Alignment shifter
-//    add       Carry-save adder for accumulate, carry propagate adder
-//    lza       Leading zero anticipator to control normalization shifter
-//    normalize Normalization shifter
-//    round     Rounding of result
-//    exception Handles exceptional cases
-//    bypass    Handles bypass of result to FInput1M or FInput3M input logics
-//    sign      One bit sign handling block 
-//    special   Catch special cases (input logics = 0  / infinity /  etc.) 
-//
-//   The FMAC computes FmaResultM=FInput1M*FInput2M+FInput3M, rounded with the mode specified by
-//   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the FInput1M or FInput3M input logics for use on the next cycle.  In addition,  four signals
-//   are produced: trap, overflow, underflow, and inexact.  Trap indicates
-//   an infinity, NaN, or denormalized number to be handled in software;
-//   the other three signals are IMMM flags.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module fma2(FInput1M, FInput2M, FInput3M, FrmM,
-			FmaResultM, FmaFlagsM, aligncntM, rM, sM,
-			tM,	normcntM, aeM, bsM,killprodM,
-			xzeroM,	yzeroM,zzeroM,xdenormM,ydenormM,
-			zdenormM,xinfM,yinfM,zinfM,xnanM,ynanM,znanM,
-			nanM,sumshiftM,sumshiftzeroM,prodinfM
-
-);
-/////////////////////////////////////////////////////////////////////////////
+module fma2(
 
-	input logic 		[63:0]		FInput1M;		// input logic 1
-	input logic		[63:0]		FInput2M;     // input logic 2 
-	input logic 		[63:0]		FInput3M;     // input logic 3
-	input logic 		[2:0]	 	FrmM;          	// Rounding mode
-	input logic 		[12:0]		aligncntM;    	// status flags
-	input logic 		[105:0]		rM; 				// one result of partial product sum
-	input logic 		[105:0]		sM; 				// other result of partial products
-	input logic 		[163:0]		tM;				// output of alignment shifter	
-	input logic 		[8:0]		normcntM; 		// shift count for normalizer
-	input logic 		[12:0]		aeM; 		// multiplier expoent
-	input logic 					bsM;				// sticky bit of addend
-	input logic 					killprodM; 		// FInput3M >> product
-	input logic					prodinfM;
-	input logic					xzeroM;
-	input logic					yzeroM;
-	input logic					zzeroM;
-	input logic					xdenormM;
-	input logic					ydenormM;
-	input logic					zdenormM;
-	input logic					xinfM;
-	input logic					yinfM;
-	input logic					zinfM;
-	input logic					xnanM;
-	input logic					ynanM;
-	input logic					znanM;
-	input logic					nanM;
-	input logic			[8:0]		sumshiftM;
-	input logic					sumshiftzeroM;
-
-
-	output logic 		[63:0]		FmaResultM;     // output FmaResultM=FInput1M*FInput2M+FInput3M
-	output logic 		[4:0]		FmaFlagsM;    	// status flags
+	input logic 	[63:0]		FInput1M,
+	input logic		[63:0]		FInput2M,
+	input logic 	[63:0]		FInput3M,
+	input logic 	[2:0] 		FrmM,
+	input logic 	[105:0]		ProdManM,
+	input logic 	[161:0]		AlignedAddendM,	
+	input logic 	[12:0]		ProdExpM,
+	input logic 				AddendStickyM,
+	input logic 				KillProdM,
+	input logic 	[3:0]		FOpCtrlM,
+	input logic					XZeroM, YZeroM, ZZeroM,
+	input logic					XInfM, YInfM, ZInfM,
+	input logic					XNaNM, YNaNM, ZNaNM,
+	output logic	[63:0]		FmaResultM,
+	output logic 	[4:0]		FmaFlagsM);
 	

-// Internal nodes
- 	logic 		[163:0]		sum;			// output of carry prop adder
-	logic 		[53:0]		v; 				// normalized sum, R, S bits
-//	logic 		[12:0]		aligncnt; 		// shift count for alignment
-	logic 		[8:0]		normcnt; 		// shift count for normalizer
-	logic 					negsum; 		// negate sum
-	logic 					invz; 			// invert addend
-	logic 					selsum1; 		// select +1 mode of sum
-	logic 					negsum0; 		// sum +0 < 0
-	logic 					negsum1; 		// sum +1 < 0
-	logic 					sumzero; 		// sum = 0
-	logic 					infinity; 		// generate infinity on overflow
-	logic 					sumof;			// result out of range
-	logic					zexpsel;
-	logic					denorm0;
-	logic					resultdenorm;
-	logic					inf;
-	logic					specialsel;
-	logic					expplus1;
-	logic					sumuf;
-	logic					psign;
-	logic					sticky;
-	logic			[12:0]		de0;
-	logic					isAdd;
-	logic					wsign;
-	logic 			[51:0]		wman;
-	logic 			[10:0]		wexp;

-	assign isAdd = 1;
+	logic [51:0] 	XMan, YMan, ZMan, WMan;
+	logic [10:0] 	XExp, YExp, ZExp, WExp;
+	logic 		 	XSgn, YSgn, ZSgn, WSgn, PSgn;
+	logic [105:0]	ProdMan2;
+	logic [162:0]	AlignedAddend2;
+ 	logic [161:0]	Sum;
+	logic [162:0]	SumTmp;
+	logic [12:0]	SumExp;
+	logic [12:0]	SumExpMinus1;
+	logic [12:0]	SumExpTmp, SumExpTmpMinus1, WExpTmp;
+	logic [53:0]	NormSum;
+	logic [161:0]	NormSumTmp;
+	logic [8:0]		NormCnt;
+	logic 			NormSumSticky;
+	logic 			SumZero;
+	logic 			NegSum;
+	logic 			InvZ;
+	logic			ResultDenorm;
+	logic			Sticky;
+	logic 			Plus1, Minus1, Plus1Tmp, Minus1Tmp;
+	logic 			Invalid,Underflow,Overflow,Inexact;
+	logic [8:0]		DenormShift;
+	logic 			ProdInf, ProdOf, ProdUf;
+	logic [63:0]	FmaResultTmp;
+	logic 			SubBySmallNum;
+	logic [63:0]	FInput3M2;
+	logic			ZeroSgn, ResultSgn;
+
+	// An FMUL (FOpCtrlM[2] set) has no addend: substitute zero for FInput3M
+	assign FInput3M2 = ~FOpCtrlM[2] ? FInput3M : 64'b0;
+
+	// Unpack X and Y as {sign[63], exponent[62:52], mantissa[51:0]}
+	assign {XSgn, XExp, XMan} = FInput1M;
+	assign {YSgn, YExp, YMan} = FInput2M;
+
+	// Unpack the (possibly zeroed) addend
+	//		- FOpCtrlM[0] flips Z's sign for a subtraction
+	assign {ZExp, ZMan} = FInput3M2[62:0];
+	assign ZSgn = FOpCtrlM[0] ^ FInput3M2[63];
+
+	// Sign of the product
+	//		- FOpCtrlM[1] flips it for FNMADD or FNMSUB
+	assign PSgn = FOpCtrlM[1] ^ (XSgn ^ YSgn);
+
+
+
+
+	// Addition
+	//		- adds the aligned addend to the product and produces the magnitude (Sum)
+	//		  and a flag saying whether the raw sum was negative (NegSum)
+	
+	// Negate Z when doing one of the following operations (product and addend signs differ):
+	//		-prod +  Z
+	//		 prod -  Z 
+	assign InvZ = ZSgn ^ PSgn;
+
+	// Choose an inverted or non-inverted addend - the +1 that completes the negation is added later
+	//		- the addend is extended by one bit so bit 162 of the sum can act as a sign bit
+	assign AlignedAddend2 = InvZ ? ~{1'b0,AlignedAddendM} : {1'b0,AlignedAddendM};
+	// Kill the product if the product is too small to affect the addition (determined in fma1.sv)
+	assign ProdMan2 = KillProdM ? 106'b0 : ProdManM;
+
+	// Do the addition
+	// 		- add one to negate if the addend was inverted
+	//		- the 2 extra bits at the beginning and end are needed for rounding
+	//		- all three operands are 163 bits wide
+	assign SumTmp = AlignedAddend2 + {55'b0, ProdMan2,2'b0} + {162'b0, InvZ};
+	 
+	// Is the sum negative (top bit of the 163-bit sum)
+	assign NegSum = SumTmp[162];
+	// If the sum is negative, negate the sum so Sum always holds a magnitude.
+	assign Sum = NegSum ? -SumTmp[161:0] : SumTmp[161:0];
+
+
+
+
+
+
+	// Leading one detector (behavioral)
+	//		- i counts the zero bits above the leading one of Sum (i = 162 when Sum is all zeros)
+	//		- NormCnt is one more than that count
+	logic [8:0]	i;
+	always_comb begin
+			i = 0;
+			// The bound test comes first so that && short-circuits before Sum[161-i] is read;
+			// the original order indexed below bit 0 (an out-of-range select) when Sum was zero.
+			while ($unsigned(i) <= $unsigned(9'd161) && ~Sum[161-i]) i = i+1;  // search for leading one 
+			NormCnt = i+1;    // compute shift count
+	end



@ -113,27 +116,160 @@ module fma2(FInput1M, FInput2M, FInput3M, FrmM,



+	// Normalization
+	//		- shifts Sum left so the leading one is shifted out, keeps the top 54 bits
+	//		  as NormSum and folds the rest into the sticky bit
+
+
+	// Determine if the sum is zero
+	assign SumZero = ~(|Sum);
+
+	// Determine if the result is denormal
+	//		- true when the signed pre-rounding exponent lies in [-52, 0]
+	assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp+13'd52)>=0);
+
+	// Determine the shift needed for denormal results
+	//		- low 9 bits of (SumExpTmp - 1); for a denormal result this is a negative
+	//		  two's-complement value, so adding it below reduces the left shift
+	assign SumExpTmpMinus1 = SumExpTmp-1;
+	assign DenormShift = ResultDenorm ? SumExpTmpMinus1[8:0] : 9'b0;
+
+	// Normalize the sum
+	//		- + binds tighter than <<, so the shift amount is NormCnt+DenormShift
+	assign NormSumTmp = SumZero ? 162'b0 : Sum << NormCnt+DenormShift; 
+	assign NormSum = NormSumTmp[161:108];
+	// Calculate the sticky bit
+	//		- any bit shifted below NormSum, or any addend bit lost during alignment
+	assign NormSumSticky = (|NormSumTmp[107:0]);
+	assign Sticky = AddendStickyM | NormSumSticky;
+
+	// Determine sum's exponent
+	//		- if the product was killed the result takes Z's exponent
+	//		- otherwise ProdExpM + 56 - NormCnt
+	//		- the exponent is forced to zero for a zero or denormal result
+	assign SumExpTmp = KillProdM ? {2'b0, ZExp} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
+	assign SumExp = SumZero ? 13'b0 : 
+				 ResultDenorm ? 13'b0 :
+				 SumExpTmp; 



-//   Instantiate fraction datapath
-
-	add				add(.*);
-	lza				lza(.*);
-	normalize		normalize(.zexp(FInput3M[62:52]),.*); 
-	round			round(.xman(FInput1M[51:0]), .yman(FInput2M[51:0]),.zman(FInput3M[51:0]),.*);
-
-// Instantiate exponent datapath
-
-	expgen2			expgen2(.xexp(FInput1M[62:52]),.yexp(FInput2M[62:52]),.zexp(FInput3M[62:52]),.*);


-// Instantiate control logic
+
+
+
+
+	// Rounding
+	//		- NormSum[53:2] is the pre-rounded mantissa, NormSum[1:0] and Sticky decide the rounding
+	//		- FrmM selects the mode: 000 nearest even, 001 zero, 010 down, 011 up, 100 nearest max magnitude
+
+	// round to nearest even
+	//		{NormSum[1], NormSum[0], Sticky}
+	//		0xx - do nothing
+	//		100 - tie - Plus1 if NormSum[2] = 1
+	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
+	//		101/110/111 - Plus1
+
+	// 	round to zero - do nothing
+	//			- subtract 1 if a small number was supposed to be subtracted from the result
+
+	// 	round to -infinity - Plus1 if negative
+	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
+	//			- subtract 1 if a small number was supposed to be subtracted from the positive result
+
+	// 	round to infinity - Plus1 if positive
+
+	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
+	//			- subtract 1 if a small number was supposed to be subtracted from the negative result
+
+	//  round to nearest max magnitude
+	//		{NormSum[1], NormSum[0], Sticky}
+	//		0xx - do nothing
+	//		100 - tie - Plus1
+	//			- don't add 1 if there was supposed to be a subtraction by a small number that didn't happen
+	//		101/110/111 - Plus1
+
+	// Determine if the result was supposed to be subtracted by a small number
+	//		- addend bits were lost in alignment, the addend was inverted, and nothing
+	//		  was shifted below NormSum during normalization
+	assign SubBySmallNum = AddendStickyM&InvZ&~NormSumSticky;
+
+	always_comb begin
+		// Determine if you add 1
+		case (FrmM)
+			3'b000: Plus1Tmp = NormSum[1] & (NormSum[0] | (Sticky&~(~NormSum[0]&SubBySmallNum)) | (~NormSum[0]&~Sticky&NormSum[2]));//round to nearest even
+			3'b001: Plus1Tmp = 0;//round to zero
+			3'b010: Plus1Tmp = WSgn & ~(SubBySmallNum);//round down
+			3'b011: Plus1Tmp = ~WSgn & ~(SubBySmallNum);//round up
+			3'b100: Plus1Tmp = (NormSum[1] & (NormSum[0] | (Sticky&~(~NormSum[0]&SubBySmallNum)) | (~NormSum[0]&~Sticky)));//round to nearest max magnitude
+			default: Plus1Tmp = 1'bx;
+		endcase
+		// Determine if you subtract 1
+		case (FrmM)
+			3'b000: Minus1Tmp = 0;//round to nearest even
+			3'b001: Minus1Tmp = SubBySmallNum;//round to zero
+			3'b010: Minus1Tmp = ~WSgn & SubBySmallNum;//round down
+			3'b011: Minus1Tmp = WSgn & SubBySmallNum;//round up
+			3'b100: Minus1Tmp = 0;//round to nearest max magnitude
+			default: Minus1Tmp = 1'bx;
+		endcase
+	
+	end
+
+	// If an answer is exact don't round
+	//		- exact means Sticky and both NormSum[1:0] bits are zero
+    assign Plus1 = Sticky | (|NormSum[1:0]) ? Plus1Tmp : 1'b0;
+    assign Minus1 = Sticky | (|NormSum[1:0]) ? Minus1Tmp : 1'b0;
+	// Compute rounded result 
+	//		- exponent and mantissa are rounded as one 65-bit value so a mantissa carry
+	//		  or borrow propagates into the exponent
+    assign {WExpTmp, WMan} = {SumExp, NormSum[53:2]} - {64'b0, Minus1} + {64'b0, Plus1};
+	// The result exponent is the low 11 bits of the rounded exponent
+    assign WExp = WExpTmp[10:0];
+
+
+
+
+
+
+
+	// Sign calculation
+	//		- WSgn is the sign of the result: ZeroSgn when the sum is zero, ResultSgn otherwise
+
+
+	// Determine the sign if the sum is zero
+	//	if product underflows (and the result is not denormal) then use psign
+	//	otherwise
+	//		if cancellation (product and addend signs differ) then 0 unless round to -inf (FrmM = 010)
+	//		otherwise psign
+	assign ZeroSgn = Underflow & ~ResultDenorm ? PSgn :
+				  (PSgn^ZSgn ? FrmM == 3'b010 : PSgn);
+
+	// is the result negative
+	// 	if p - z is the Sum negative
+	// 	if -p + z is the Sum positive
+	// 	if -p - z then the Sum is negative
+	assign ResultSgn = InvZ&(ZSgn)&NegSum | InvZ&PSgn&~NegSum | ((ZSgn)&PSgn);
+	assign WSgn = SumZero ? ZeroSgn : ResultSgn;
 
-sign				sign(.xsign(FInput1M[63]),.ysign(FInput2M[63]),.zsign(FInput3M[63]),.*); 
-flag2				flag2(.xsign(FInput1M[63]),.ysign(FInput2M[63]),.zsign(FInput3M[63]),.vbits(v[1:0]),.*); 
+	// Select the result
+	//		- priority mux: the first matching condition wins
+	//		- a NaN input is passed through with the top mantissa bit forced to 1 (X, then Y, then Z)
+	//		- Invalid produces a NaN with only the top mantissa bit set
+	//		- an infinite X or Y takes the product sign, an infinite Z keeps Z's (effective) sign
+	//		- Overflow produces infinity with the result sign
+	//		- Underflow (non-denormal) and KillProd start from zero / Z and apply the rounding
+	//		  increment or decrement only when addend bits were lost (AddendStickyM)
+	assign FmaResultM = XNaNM ? {XSgn, XExp, 1'b1,XMan[50:0]} : 
+						YNaNM ? {YSgn, YExp, 1'b1,YMan[50:0]} :
+						ZNaNM ? {ZSgn, ZExp, 1'b1,ZMan[50:0]} :
+						Invalid ? {WSgn, 11'h7ff, 1'b1, 51'b0} : // has to be before inf
+						XInfM ? {PSgn, XExp, XMan} :
+						YInfM ? {PSgn, YExp, YMan} :
+						ZInfM ? {ZSgn, ZExp, ZMan} :
+						Overflow ? {WSgn, 11'h7ff, 52'b0} :
+						Underflow & ~ResultDenorm ? {WSgn, 63'b0} - {63'b0, (Minus1&AddendStickyM)} + {63'b0, (Plus1&AddendStickyM)} :
+						KillProdM ? {ZSgn, ZExp, ZMan} - {63'b0, (Minus1&AddendStickyM)} + {63'b0, (Plus1&AddendStickyM)}: // has to be after Underflow
+						{WSgn,WExp,WMan};
+	

-assign FmaResultM = {wsign,wexp,wman};
+	// Set Invalid flag for following cases:
+	//   1) Inf - Inf
+	//        - X or Y is infinite, Z is infinite, and the product and addend signs differ
+	//        - the signs compared are PSgn and ZSgn, so the FOpCtrlM[1] product negation
+	//          (FNMADD/FNMSUB) and the FOpCtrlM[0] addend negation are both accounted for
+	//        - not raised when the other multiplicand is a NaN (a quiet NaN must not signal)
+	//        - a finite product that overflows is not an infinity in a fused operation,
+	//          so ProdInf does not take part in this term
+	//   2) 0 * Inf
+	//   3) any input is a signaling NaN
+	// ProdOf/ProdInf are still driven because they are declared outside this section
+	assign ProdOf = (ProdExpM >= 2047 && ~ProdExpM[12]);
+	assign ProdInf = ProdOf && ~XNaNM && ~YNaNM;
+	assign Invalid = (XNaNM&~XMan[51]) | (YNaNM&~YMan[51]) | (ZNaNM&~ZMan[51]) | ((XInfM || YInfM) & ~XNaNM & ~YNaNM & ZInfM & (PSgn ^ ZSgn)) | (XZeroM & YInfM) | (YZeroM & XInfM);  
+	
+	// Set Overflow flag if the number is too big to be represented
+	//		- the rounded exponent is at least 2047 and not negative (bit 12 clear)
+	assign Overflow = WExpTmp >= 2047 & ~WExpTmp[12];
+
+	// Set Underflow flag if the number is too small to be represented in normal numbers
+	//		- the exponent is negative, or the product was killed while Z is zero
+	assign ProdUf = KillProdM & ZZeroM;
+	assign Underflow = SumExp[12] | ProdUf;
+
+	// Set Inexact flag if the result is different from what would be output given infinite precision
+	//		- any sticky or round/guard bit set, or an overflow
+	assign Inexact = Sticky|Overflow| (|NormSum[1:0]);
+
+	// Combine flags 
+	//		- order is {Invalid, Divide by zero, Overflow, Underflow, Inexact}
+	//		- FMA can't set the Divide by zero flag
+	//		- Don't set the underflow flag if the result is exact 
+	assign FmaFlagsM = {Invalid, 1'b0, Overflow, Underflow & Inexact, Inexact};

 endmodule

--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -80,34 +80,17 @@ module fpu (
   logic [4:0] 		   FDivFlagsM, FDivFlagsW;
   
   // FMA signals
-   logic [12:0] 	   aligncntE, aligncntM; 
-   logic [105:0] 	   rE, rM; 
-   logic [105:0] 	   sE, sM; 
-   logic [163:0] 	   tE, tM;	
-   logic [8:0] 		   normcntE, normcntM; 
-   logic [12:0] 	   aeE, aeM; 
-   logic 		   bsE, bsM;
-   logic 		   killprodE, killprodM; 
-   logic 		   prodofE, prodofM; 
-   logic 		   xzeroE, xzeroM;
-   logic 		   yzeroE, yzeroM;
-   logic 		   zzeroE, zzeroM;
-   logic 		   xdenormE, xdenormM;
-   logic 		   ydenormE, ydenormM;
-   logic 		   zdenormE, zdenormM;
-   logic 		   xinfE, xinfM;
-   logic 		   yinfE, yinfM;
-   logic 		   zinfE, zinfM;
-   logic 		   xnanE, xnanM;
-   logic 		   ynanE, ynanM;
-   logic 		   znanE, znanM;
-   logic 		   nanE, nanM;
-   logic [8:0] 		   sumshiftE, sumshiftM;
-   logic 		   sumshiftzeroE, sumshiftzeroM;
-   logic 		   prodinfE, prodinfM;
-   logic [63:0] 	   FmaResultM, FmaResultW;
-   logic [4:0] 		   FmaFlagsM, FmaFlagsW;
-   
+	logic 	[105:0]		ProdManE, ProdManM;
+	logic 	[161:0]		AlignedAddendE,	AlignedAddendM;
+	logic 	[12:0]		ProdExpE, ProdExpM;
+	logic 				    AddendStickyE, AddendStickyM;
+	logic 				    KillProdE, KillProdM;
+	logic				      XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
+	logic				      XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
+	logic				      XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
+  logic [63:0]      FmaResultM, FmaResultW;
+  logic [4:0]       FmaFlagsM, FmaFlagsW;
+
   // add/cvt signals
   logic [63:0] 	   AddSumE, AddSumTcE;
   logic [3:0] 		   AddSelInvE;
@ -241,7 +224,7 @@ module fpu (
 			.CLK(clk),
 			.ECLK(fpdivClk));
   
-   fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk));
+   fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .*);
   
   // first of two-stage instance of floating-point add/cvt unit
   fpuaddcvt1 fpadd1 (.*);
@ -265,31 +248,20 @@ module fpu (
   //*****************
   // fma E/M pipe registers
   //*****************  
-   flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
-   flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
-   flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
-   flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
-   flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
-   flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
-   flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
-   flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
-   flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
-   flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
-   flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
-   flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
-   flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
-   flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
-   flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
-   flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
-   flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
-   flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
-   flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
-   flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
-   flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
-   flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
-   flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
-   flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
-   flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
+  flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, ProdManE, ProdManM); 
+  flopenrc #(162) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, AlignedAddendE, AlignedAddendM); 
+  flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, ProdExpE, ProdExpM);  
+  flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, AddendStickyE, AddendStickyM); 
+  flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, KillProdE, KillProdM); 
+  flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, XZeroE, XZeroM); 
+  flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, YZeroE, YZeroM); 
+  flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, ZZeroE, ZZeroM); 
+  flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, XInfE, XInfM); 
+  flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, YInfE, YInfM); 
+  flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, ZInfE, ZInfM); 
+  flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, XNaNE, XNaNM); 
+  flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, YNaNE, YNaNM); 
+  flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, ZNaNE, ZNaNM);  
   
   //*****************
   // fpadd E/M pipe registers
--- a/wally-pipelined/src/fpu/lza.sv
+++ b/wally-pipelined/src/fpu/lza.sv
@ -1,40 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	lop.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block implements a Leading One Predictor used to determine 
-//   the normalization shift count. 
-///////////////////////////////////////////////////////////////////////////////
-
-///////////////////////////////////////////////////////////////////////////// 
-module lza(sum, normcnt, sumzero); 
-/////////////////////////////////////////////////////////////////////////////
- 
-	input logic     	[163:0]  	sum;            // sum
-	output logic     	[8:0]		normcnt;		// normalization shift count
-	output logic     		  		sumzero;		// sum = 0
-
-	// Internal nodes
-
-	reg			[8:0] 		i;				// loop index
- 
-	// A real LOP uses a fast carry chain to find only the first 0.
-	// It is an example of a parallel prefix algorithm.  For the sake
-	// of simplicity,  this model is behavioral instead.
-	// A real LOP would also operate on the sources of the adder, not
-	// the result!
-
-	always_comb
-		begin
-			i =   0;
-			while (~sum[163-i] && i <= 163) i = i+1;  // search for leading one 
-			normcnt = i;    // compute shift count
-	end
-
-	// Also check if sum is zero 
-	assign sumzero = ~(|sum);
-	
-endmodule
-
--- a/wally-pipelined/src/fpu/multiply.sv
+++ b/wally-pipelined/src/fpu/multiply.sv
@ -1,138 +0,0 @@
-
-module multiply(xman, yman, xdenormE, ydenormE, xzeroE, yzeroE, rE, sE); 
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic 		[51:0]		xman;				// Fraction of multiplicand	x
-	input logic		[51:0]		yman;				// Fraction of multiplicand y	
-	input logic					xdenormE;		// is x denormalized	
-	input logic					ydenormE;		// is y denormalized	
-	input logic     			xzeroE;		// Z is denorm
-	input logic     			yzeroE;		// Z is denorm
-	output logic		[105:0]		rE;				//	partial product 1	
-	output logic		[105:0]		sE;				//	partial product 2	
-    
-     wire        [54:0]      yExt; //y with appended 0 and assumed 1
-     wire        [53:0]      xExt; //y with assumed 1
-     wire [26:0][1:0] add1;
-     wire [26:0][54:0] pp; 
-     wire [26:0] e;
-     logic [106:0] tmpsE;
-     logic [17:0][106:0] lv1add;
-     logic [11:0][106:0] lv2add;
-     logic [7:0][106:0] lv3add;
-     logic [3:0][106:0] lv4add;
-     logic [21:0][107:0] carryTmp;
-     wire [26:0][106:0] acc; 
-     // wire [105:0] acc
-    genvar i;	
-
-	// assign xExt = {1'b0,~(xdenormE|xzeroE),xman};
-	// assign yExt = {1'b0,~(ydenormE|yzeroE),yman, 1'b0};
-    
-    //  generate
-    //     for(i=0; i<27; i=i+1) begin
-    //         booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
-    //     end
-    //  endgenerate
-
-    // assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
-    // assign acc[1] = {49'b01,~e[1],pp[1],add1[0]}; 
-    // assign acc[2] = {47'b01,~e[2],pp[2],add1[1], 2'b0};
-    // assign acc[3] = {45'b01,~e[3],pp[3],add1[2], 4'b0};
-    // assign acc[4] = {43'b01,~e[4],pp[4],add1[3], 6'b0};
-    // assign acc[5] = {41'b01,~e[5],pp[5],add1[4], 8'b0};
-    // assign acc[6] = {39'b01,~e[6],pp[6],add1[5], 10'b0};
-    // assign acc[7] = {37'b01,~e[7],pp[7],add1[6], 12'b0};
-    // assign acc[8] = {35'b01,~e[8],pp[8],add1[7], 14'b0};
-    // assign acc[9] = {33'b01,~e[9],pp[9],add1[8], 16'b0};
-    // assign acc[10] = {31'b01,~e[10],pp[10],add1[9], 18'b0};
-    // assign acc[11] = {29'b01,~e[11],pp[11],add1[10], 20'b0};
-    // assign acc[12] = {27'b01,~e[12],pp[12],add1[11], 22'b0};
-    // assign acc[13] = {25'b01,~e[13],pp[13],add1[12], 24'b0};
-    // assign acc[14] = {23'b01,~e[14],pp[14],add1[13], 26'b0};
-    // assign acc[15] = {21'b01,~e[15],pp[15],add1[14], 28'b0};
-    // assign acc[16] = {19'b01,~e[16],pp[16],add1[15], 30'b0};
-    // assign acc[17] = {17'b01,~e[17],pp[17],add1[16], 32'b0};
-    // assign acc[18] = {15'b01,~e[18],pp[18],add1[17], 34'b0};
-    // assign acc[19] = {13'b01,~e[19],pp[19],add1[18], 36'b0};
-    // assign acc[20] = {11'b01,~e[20],pp[20],add1[19], 38'b0};
-    // assign acc[21] = {9'b01,~e[21],pp[21],add1[20], 40'b0};
-    // assign acc[22] = {7'b01,~e[22],pp[22],add1[21], 42'b0};
-    // assign acc[23] = {5'b01,~e[23],pp[23],add1[22], 44'b0};
-    // assign acc[24] = {3'b01,~e[24],pp[24],add1[23], 46'b0};
-    // assign acc[25] = {1'b0, ~e[25],pp[25],add1[24], 48'b0};
-    // assign acc[26] = {pp[26],add1[25], 50'b0};
-
-//***breaks lint with warnings like: %Warning-UNOPTFLAT:      Example path: src/fpu/multiply.sv:86:  ASSIGNW
-// %Warning-UNOPTFLAT:      Example path: src/fpu/multiply.sv:22:  wallypipelinedsoc.hart.fpu.fma1.multiply.lv3add
-    //*** resize adders
-    //  generate
-    //     for(i=0; i<9; i=i+1) begin
-    //         add3comp2 #(.BITS(107)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
-    //                                        .carry(carryTmp[i][106:0]), .sum(lv1add[i*2+1]));
-    //         assign lv1add[i*2] = {carryTmp[i][105:0], 1'b0};
-    //     end
-    //  endgenerate
-
-    //  generate
-    //     for(i=0; i<6; i=i+1) begin
-    //         add3comp2 #(.BITS(107)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
-    //                                        .carry(carryTmp[i+9][106:0]), .sum(lv2add[i*2+1]));
-    //         assign lv2add[i*2] = {carryTmp[i+9][105:0], 1'b0};
-    //     end
-    //  endgenerate
-
-    // generate
-    //     for(i=0; i<4; i=i+1) begin
-    //         add3comp2 #(.BITS(107)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
-    //                                         .carry(carryTmp[i+15][106:0]), .sum(lv3add[i*2+1]));
-    //         assign lv3add[i*2] = {carryTmp[i+15][105:0], 1'b0};
-    //     end
-    // endgenerate
-
-
-    // generate
-    //     for(i=0; i<2; i=i+1) begin
-    //         add4comp2 #(.BITS(107)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
-    //                                         .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
-    //         assign lv4add[i*2] = {carryTmp[i+19][105:0], 1'b0};
-    //     end
-    // endgenerate
-
-    // add4comp2 #(.BITS(107)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
-    //                                 .carry(carryTmp[21]), .sum(tmpsE));
-    // assign sE = tmpsE[105:0];
-    // assign rE = {carryTmp[21][104:0], 1'b0};
-		// assign rE = 0;
-		// assign sE = acc[0] +
-		// 		   acc[1] +
-		// 		   acc[2] +
-		// 		   acc[3] +
-		// 		   acc[4] +
-		// 		   acc[5] +
-		// 		   acc[6] +
-		// 		   acc[7] +
-		// 		   acc[8] +
-		// 		   acc[9] +
-		// 		   acc[10] +
-		// 		   acc[11] +
-		// 		   acc[12] +
-		// 		   acc[13] +
-		// 		   acc[14] +
-		// 		   acc[15] +
-		// 		   acc[16] +
-		// 		   acc[17] +
-		// 		   acc[18] +
-		// 		   acc[19] +
-		// 		   acc[20] +
-		// 		   acc[21] +
-		// 		   acc[22] +
-		// 		   acc[23] +
-		// 		   acc[24] +
-		// 		   acc[25] +
-		// 		   acc[26];
-
-			assign sE = {53'b0,~(xdenormE|xzeroE),xman}  *  {53'b0,~(ydenormE|yzeroE),yman};
-			assign rE = 0;
-endmodule
-
--- a/wally-pipelined/src/fpu/normalize.sv
+++ b/wally-pipelined/src/fpu/normalize.sv
@ -1,147 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	normalize.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description:
-//   This block performs the normalization shift.  It also
-//   generates the Rands bits for rounding.  Finally, it
-//   handles the special case of a zero sum.
-//
-//   v[53:2]  is the fraction component of the prerounded result.
-//   It can be bypassed back to the X or Z inputs of the FMAC
-//   for back-to-back operations. 
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module normalize(sum, zexp, normcnt, aeM, aligncntM, sumshiftM, sumshiftzeroM, sumzero, 
-				xzeroM, zzeroM, yzeroM, bsM, xdenormM, ydenormM, zdenormM, sticky, de0, resultdenorm, v); 
-/////////////////////////////////////////////////////////////////////////////
-	input logic     	[163:0]  	sum;            // sum
-	input logic     	[62:52]  	zexp;            // sum
-	input logic		[8:0] 		normcnt;     	// normalization shift count
-	input logic		[12:0] 		aeM;     	// normalization shift count
-	input logic		[12:0] 		aligncntM;     	// normalization shift count
-	input logic		[8:0] 		sumshiftM;     	// normalization shift count
-	input logic				sumshiftzeroM;
-	input logic				sumzero;	// sum is zero
-	input logic				bsM;		// sticky bit for addend
-	input logic                  		xdenormM;        // Input Z is denormalized
-	input logic                  		ydenormM;        // Input Z is denormalized
-	input logic                  		zdenormM;        // Input Z is denormalized
-	input logic				xzeroM;
-	input logic				yzeroM;
-	input logic				zzeroM;
-	output logic				sticky;		//sticky bit
-	output logic		[12:0]		de0;
-	output logic                  	resultdenorm;        // Input Z is denormalized
-	output logic		[53:0]		v;		// normalized sum, R, S bits
-
-	// Internal nodes
-
-logic       	[163:0]  	sumshifted;     // shifted sum
-	logic		[9:0]		sumshifttmp;
-	logic       	[163:0]  	sumshiftedtmp;     // shifted sum
-	logic				isShiftLeft1;
-logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;
-
-	// When the sum is zero,  normalization does not apply and only the
-	// sticky bit must be computed.  Otherwise,  the sum is right-shifted
-	// and the Rand S bits (v[1]  and v[O],  respectively) are assigned.
-
-	// The R bit is also set on denormalized numbers where the exponent
-	// was computed to be exactly -1023 and the L bit was set.  This
-	// is required for correct rounding up of multiplication results.
-
-	// The sticky bit calculation is actually built into the shifter and
-	// does not require a true subtraction shown in the model.
- 
-	assign isShiftLeft1 = (aligncntM == 13'b1 ||aligncntM == 13'b0 || $signed(aligncntM) == $signed(-(13'b1)))&& zexp == 11'h2;
-	// assign tmp = ($signed(aeM-normcnt+2) >= $signed(-1022));
-	always_comb
-		begin
-		// d = aligncntM
-		// l = normcnt
-		// p = 53
-		// ea + eb = aeM
-			// set d<=2 to d<=0
-			if ($signed(aligncntM)<=$signed(13'd2))  begin //d<=2 
-				// product anchored or cancellation
-				if ($signed(aeM-{{4{normcnt[8]}},normcnt}+13'd2) >= $signed(-(13'd1022))) begin //ea+eb-l+2 >= emin
-					//normal result
-					de0 = xzeroM|yzeroM ? {2'b0,zexp} : aeM-{{4{normcnt[8]}},normcnt}+{12'b0,xdenormM}+{12'b0,ydenormM}+13'd57;
-					resultdenorm = |sum & ~|de0 | de0[12];
-					// if z is zero then there was a 56 bit shift of the product
-					sumshifted = resultdenorm ? sum << sumshiftM-{8'b0,zzeroM}+{8'b0,isShiftLeft1} : sum << normcnt; // p+2+l
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					//de0 = aeM-normcnt+2-1023;
-				end else begin
-					sumshifted = sum << (13'd1080+aeM);
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					resultdenorm = 1;
-					de0 = 0;
-				end
-
-			end else begin                 // extract normalized bits
-				sumshifttmp = {1'b0,sumshiftM} - 2;
-				sumshifted = sumshifttmp[9] ? sum : sum << sumshifttmp;
-				tmp1 = (sumshifted[163] & ~sumshifttmp[9]);
-				tmp2 = ((sumshifttmp[9] & sumshiftM[0]) || sumshifted[162]);
-				tmp3 = (sumshifted[161] || (sumshifttmp[9] & sumshiftM[1]));
-				tmp4 = sumshifted[160];
-				tmp5 = sumshifted[159];
-				// for some reason use exp = zexp + {0,1,2}
-				// the book says exp = zexp + {-1,0,1}
-				if(sumshiftzeroM) begin
-					v = sum[162:109];
-					sticky = (|sum[108:0]) | bsM;
-					de0 = {2'b0,zexp};
-				end else if(sumshifted[163] & ~sumshifttmp[9])begin
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-					de0 = {2'b0,zexp} +13'd2;
-				end else if ((sumshifttmp[9] & sumshiftM[0]) || sumshifted[162]) begin
-					v = sumshifted[161:108];
-					sticky = (|sumshifted[107:0]) | bsM;
-					de0 = {2'b0,zexp}+13'd1;
-				end else if (sumshifted[161] || (sumshifttmp[9] & sumshiftM[1])) begin
-					v = sumshifted[160:107];
-					sticky = (|sumshifted[106:0]) | bsM;
-					//de0 = zexp-1;
-					de0 = {2'b0,zexp}+{12'b0,zdenormM};
-				end else if(sumshifted[160]& ~zdenormM) begin
-					de0 = {2'b0,zexp}-13'b1;
-					v = ~|de0&~sumzero ? sumshifted[160:107] : sumshifted[159:106];
-					sticky = (|sumshifted[105:0]) | bsM;
-					//de0 = zexp-1;
-				end else if(sumshifted[159]& ~zdenormM) begin
-					//v = sumshifted[158:105];
-					de0 = {2'b0,zexp}-13'd2;
-					v = (~|de0 | de0[12])&~sumzero ? sumshifted[161:108] : sumshifted[158:105];
-					sticky = (|sumshifted[104:0]) | bsM;
-					//de0 = zexp-1;
-				end else if(zdenormM) begin					
-					v = sumshifted[160:107];
-					sticky = (|sumshifted[106:0]) | bsM;
-					//de0 = zexp-1;
-					de0 = {{2{zexp[62]}},zexp};
-				end else begin
-					de0 = 0;
-					sumshifted = sum << sumshiftM-1; // p+2+l
-					v = sumshifted[162:109];
-					sticky = (|sumshifted[108:0]) | bsM;
-				end
-
-				resultdenorm = (~|de0 | de0[12]);
-		end 
-	end
-
-
-	// shift sum left by normcnt,  filling the right with zeros 
-	//assign sumshifted = sum << normcnt;
-	
-endmodule
-
-
--- a/wally-pipelined/src/fpu/round.sv
+++ b/wally-pipelined/src/fpu/round.sv
@ -1,122 +0,0 @@
-///////////////////////////////////////////////////////////////////////////// 
-// Block Name:	round.v
-// Author:		David Harris
-// Date:		11/2/1995
-//
-// Block Description: 
-//   This block is responsible for rounding the normalized result of
-//   the FMAC.   Because prenormalized results may be bypassed back to
-//   the FMAC X and Z inputs, rounding does not appear in the critical
-//   path of most floating point code.   This is good because rounding
-//   requires an entire 52 bit carry-propagate half-adder delay.
-//
-//   The results from other FPU blocks (e.g. FCVT,  FDIV,  etc)  are also 
-//   muxed in to form the actual result for register file writeback.  This
-//   saves a mux from the writeback path.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module round(v, sticky, FrmM, wsign,
-			  FmaFlagsM, inf, nanM, xnanM, ynanM, znanM, 
-			  xman, yman, zman,
-			  wman, infinity, specialsel,expplus1);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic		[53:0]		v;		// normalized sum: v[53:2] is the unrounded mantissa, v[1:0] are the two bits below it used for rounding
-	input logic				sticky;		// sticky bit, combined with v[1:0] in the rounding decision
-	input logic		[2:0]	FrmM;		// rounding mode select (decoded in the case statement below)
-	input logic				wsign;		// Sign of result
-	input logic 		[4:0]	FmaFlagsM;	// flags; [4], [2] and [1] are used below as invalid, overflow and underflow
-	input logic				inf;		// Some input is infinity
-	input logic				nanM;		// Some input is NaN
-	input logic				xnanM;		// X is NaN
-	input logic				ynanM;		// Y is NaN
-	input logic				znanM;		// Z is NaN
-	input logic		[51:0]		xman;		// mantissa bits of X (low 51 bits supply the NaN payload)
-	input logic		[51:0]		yman;		// mantissa bits of Y (low 51 bits supply the NaN payload)
-	input logic		[51:0]		zman;		// mantissa bits of Z (low 51 bits supply the NaN payload)
-	output logic		[51:0]		wman; 		// rounded result of FMAC, or the special result when specialsel is set
-	output logic				infinity;    	// Generate infinity on overflow
-	output logic				specialsel;  	// Select special result
-	output logic				expplus1;	// set when v[53:2]+1 carries out of bit 51, plus1 is taken and no special result is selected
-
-	// Internal nodes
-
-	logic				plus1;		// Round by adding one 
-	wire		[52:0]		v1;		// Result + 1 (for rounding)
-	wire		[51:0]		specialres;	// Result of exceptional case 
-	wire		[51:0]		infinityres;	// Infinity or largest real number
-	wire		[51:0]		nanres;		// Propagated or generated NaN 
-
-	// Compute if round should occur.  This equation is derived from
-	// the rounding tables.
-
-	// round to infinity - plus1 if positive
-	// round to -infinity - plus1 if negative
-	// round to zero - do nothing
-	// round to nearest even
-	//	{v[1], v[0], sticky}
-	//	0xx - do nothing
-	//	100 - tie - plus1 if v[2] = 1
-	//	101/110/111 - plus1
-
-	//***causes lint warning: %Warning-UNOPTFLAT:      Example path: src/fpu/round.sv:59:  ALWAYS
-// %Warning-UNOPTFLAT:      Example path: src/fpu/round.sv:42:  wallypipelinedsoc.hart.fpu.fma2.round.plus1
-
-	always_comb begin
-		case (FrmM)
-			3'b000: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2])));// round to nearest even: a tie (v[1]=1, v[0]=0, sticky=0) rounds up only when v[2] is 1
-			3'b001: plus1 = 0;// round to zero: never add one
-			3'b010: plus1 = wsign;// round down: add one whenever the result is negative. NOTE(review): v[1:0]/sticky are not checked, so an exact negative result is also incremented - confirm
-			3'b011: plus1 = ~wsign;// round up: add one whenever the result is positive. NOTE(review): v[1:0]/sticky are not checked here either - confirm
-			3'b100: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&~wsign)));// round to nearest max magnitude. NOTE(review): a tie adds one only when wsign is 0 - confirm this is the intended tie rule
-			default: plus1 = 1'bx;	// remaining FrmM encodings are left undefined
-		endcase
-	end
-
-	// Compute rounded result 
-    assign v1 = v[53:2] + 1;	// v1 is one bit wider than the mantissa so v1[52] holds the carry out
-	// Determine if postnormalization is necessary
-	// Predicted by all bits =1 before round +1
-
-	//assign postnormalize = &(v[53:2]) && plus1;
-
-	// Determine special result in the event of selection of a result from
-	// another FPU functional unit,  infinity, NAN,  or underflow
-	// The special result mux is a 4:1 mux that should not appear in the
-	// critical path of the machine.   It is not priority encoded,  despite
-	// the code below suggesting otherwise.  Also,  several of the identical data
-	// inputs to the wide muxes can be combined at the expense of more
-	// complicated non-critical control in the circuit implementation.
-
-	assign specialsel =  FmaFlagsM[2] ||  FmaFlagsM[1] ||  FmaFlagsM[4] || //overflow underflow invalid
-							nanM || inf;
-	assign specialres = FmaFlagsM[4] | nanM ? nanres : //invalid or NaN input: NaN mantissa
-						 FmaFlagsM[2] ? infinityres : //overflow
-						 inf ? 52'b0 :	// infinite input: zero mantissa
-						 FmaFlagsM[1] ? 52'b0 : 52'bx;  // underflow: zero mantissa; x when no special case applies
-
-	// Overflow is handled differently for different rounding modes
-	// Round is to either infinity or to maximum finite number
-
-	assign infinity =  |FrmM;//rn || (rp && ~wsign) || (rm && wsign);//***look into this. NOTE(review): as written infinity is 0 only for FrmM = 3'b000, which differs from the commented-out equation - confirm
-	assign infinityres = infinity ? 52'b0 : {52{1'b1}};	// all-zero mantissa when infinity is set, all-ones mantissa otherwise
-
-	// Invalid operations produce a quiet NaN. The result should
-	// propagate an input if the input is NaN. The top mantissa bit is
-	// forced to 1 below, so the propagated NaN is always quiet.
-
-	// assign nanres = xnanM ? x: (ynanM ? y : (znanM ? z : {1'b1, 51'b0})); // original
-
-	// IEEE 754-2008 section 6.2.3 states:
-	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
-	// identical to the payload of one of the input NaNs if representable in the destination
-	// format. This standard does not specify which of the input NaNs will provide the payload."
-	assign nanres = xnanM ? {1'b1, xman[50:0]}: (ynanM ? {1'b1, yman[50:0]} : (znanM ? {1'b1, zman[50:0]} : {1'b1, 51'b0}));// KEP 210112 add the 1 to make NaNs quiet; priority is X, then Y, then Z, then a default NaN
-
-	// Select result with 4:1 mux
-	// If the sum is zero and we round up,  there is a special case in
-	// which we produce a massive loss of significance and trap to software.
-	// It is handled in the exception unit. 
-	assign expplus1 = v1[52] & ~specialsel & plus1;	// rounding overflowed the mantissa and the rounded value is actually used
-	assign wman = specialsel ? specialres : (plus1 ? v1[51:0] : v[53:2]);	// special result, else incremented mantissa, else truncated mantissa
-	
-endmodule
-
--- a/wally-pipelined/src/fpu/sign.sv
+++ b/wally-pipelined/src/fpu/sign.sv
@ -1,112 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	sign.v
-// Author:		David Harris
-// Date:		12/1/1995
-//
-// Block Description:
-//   This block manages the signs of the numbers.
-//   1 =  negative
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module sign(xsign, ysign, zsign, negsum0, negsum1, bsM, FrmM, FmaFlagsM, 
-			 sumzero, zinfM, inf, wsign, invz, negsum, selsum1, isAdd);
-////////////////////////////////////////////////////////////////////////////
- 
-	input logic					xsign;			// Sign of X 
-	input logic					ysign;			// Sign of Y 
-	input logic					zsign;			// Sign of Z
-	input logic					isAdd;			// selects which of the two zero-sign rules below is applied
-	input logic					negsum0;		// Sum in +0 mode is negative 
-	input logic					negsum1;		// Sum in +1 mode is negative 
-	input logic					bsM;				// sticky bit from addend (only referenced in the commented-out logic below)
-	input logic		[2:0]		FrmM;				// Rounding mode; 3'b010 is tested when choosing the sign of a zero sum
-	input logic		[4:0]		FmaFlagsM;				// Flags; [4] forces wsign to 0 and [1] selects psign as the zero sign
-	input logic					sumzero;		// Sum = 0
-	input logic					zinfM;			// Z = Inf
-	input logic					inf;			// Some input = Inf
-	output logic					wsign;			// Sign of W 
-	output logic					invz;			// Invert addend into adder
-	output logic					negsum;			// Negate result of adder
-	output logic					selsum1;		// Select +1 mode from compound adder
- 
-	// Internal nodes
-
-	wire					zerosign;    	// sign if result= 0 
-	wire					sumneg;    	// sign used when the result is neither invalid, infinite nor zero
-	wire					infsign;     	// sign if result= Inf 
-logic tmp;	// alternative wsign built from psign ^ negsum; not read anywhere in this module
-	logic psign;	// sign of the product X*Y
-
-	// Compute sign of product 
-
-	assign psign = xsign ^ ysign;
-
-	// Invert addend if sign of Z is different from sign of product
-
-	//do you invert z
-	assign invz = (zsign ^ psign);
-
-	assign selsum1 = invz;
-	//negate sum if it is negative
-	assign negsum = (selsum1&negsum1) | (~selsum1&negsum0);
-	// is the sum negative
-	// 	p - z (invz, zsign set): negative when negsum1 is set
-	// 	-p + z (invz, psign set): negative when negsum1 is clear
-	// 	-p - z (both signs set): the sum is always negative
-	assign sumneg = invz&zsign&negsum1 | invz&psign&~negsum1 | (zsign&psign);
-	//always @(invz or negsum0 or negsum1 or bsM or ps)
-	//	begin
-	//		if (~invz) begin               // both inputs have same sign  
-	//			negsum = 0;
-	//			selsum1 = 0;
-	//		end else if (bsM) begin        // sticky bit set on addend
-	//			selsum1 = 0;
-	//			negsum = negsum0; 
-	//		end else if (ps) begin 		// sticky bit set on product
-	//			selsum1 = 1;
-	//			negsum =  negsum1;
-	//		end else begin 				// both sticky bits clear
-	//			//selsum1 = negsum1; 	// KEP 210113-10:44 Selsum1 was adding 1 to values that were multiplied by 0
-	//			 selsum1 = ~negsum1; //original
-	//			negsum = negsum1;
-	//	end 
-	//end
-
-	// Compute sign of result
-	// This involves a special case when the sum is zero:
-	//   x+x retains the same sign as x even when x = +/- 0.
-	//   otherwise,  x-x = +0 unless in the RM mode when x-x = -0
-	// There is also a special case for NaNs and invalid results;
-	// the sign of the NaN produced is forced to be 0.
-	// Sign calculation is not in the critical path so the cases
-	// can be tolerated. 
-	// IEEE 754-2008 section 6.3 states 
-	// 		"When either an input or result is NaN, this standard does not interpret the sign of a NaN."
-	// 		also pertaining to negZero it states:
-	//			"When the sum/difference of two operands with opposite signs is exactly zero, the sign of that sum/difference
-	//			 shall be +0 in all rounding attributes EXCEPT roundTowardNegative. Under that attribute, the sign of an exact zero 
-	//			 sum/difference shall be -0.  However, x+x = x-(-X) retains the same sign as x even when x is zero."
- 
-	//assign zerosign = (~invz && killprodM) ? zsign : rm;//***look into
-//	assign zerosign = (~invz && killprodM) ? zsign : 0;
-	// zero sign
-	//	if product underflows then use psign
-	//	otherwise
-	//		addition
-	//			if cancellation then 0 unless round to -inf
-	//			otherwise psign
-	//		subtraction
-	//			if cancellation then 0 unless round to -inf
-	//			otherwise psign
-
-	assign zerosign = FmaFlagsM[1] ? psign :
-			  (isAdd ? (psign^zsign ? FrmM == 3'b010 : psign) :
-				  (psign^zsign ? psign : FrmM == 3'b010));	// for the non-isAdd case the psign^zsign test selects the opposite branches
-	assign infsign = zinfM ? zsign : psign; //KEP 210112 keep the correct sign when result is infinity
-	//assign infsign = xinfM ? (yinfM ? psign : xsign) : yinfM ? ysign : zsign;//original
-	assign tmp = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum));	// same priority chain as wsign, ending in psign ^ negsum instead of sumneg
-	assign wsign = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : sumneg));	// priority: FmaFlagsM[4] -> 0, infinity -> infsign, zero sum -> zerosign, else sumneg
-
-endmodule
--- a/wally-pipelined/src/fpu/special.sv
+++ b/wally-pipelined/src/fpu/special.sv
@ -1,67 +0,0 @@
-/////////////////////////////////////////////////////////////////////////////// 
-// Block Name:	special.v
-// Author:		David Harris
-// Date:		12/2/1995
-//
-// Block Description:
-//   This block implements special case handling for unusual operands (e.g. 
-//   0, NaN,  denormalized,  infinity).   The block consists of zero/one detectors.
-//
-/////////////////////////////////////////////////////////////////////////////
-
-/////////////////////////////////////////////////////////////////////////////
-module special(FInput1E, FInput2E, FInput3E, xzeroE, yzeroE, zzeroE,
-				xnanE, ynanE, znanE, xdenormE, ydenormE, zdenormE, xinfE, yinfE, zinfE);
-/////////////////////////////////////////////////////////////////////////////
-
-	input logic   	[63:0]     	FInput1E;              // Input FInput1E (drives the x* outputs)
-	input logic     	[63:0]     	FInput2E;           	// Input FInput2E (drives the y* outputs)
-	input logic      	[63:0]    	FInput3E;            	// Input FInput3E (drives the z* outputs)
-	output logic				xzeroE;		// Input FInput1E = 0
-	output logic				yzeroE;		// Input FInput2E = 0
-	output logic				zzeroE;		// Input FInput3E = 0
-	output logic				xnanE;		// FInput1E is NaN
-	output logic				ynanE;		// FInput2E is NaN
-	output logic				znanE;		// FInput3E is NaN
-	output logic				xdenormE;	// FInput1E is denormalized
-	output logic				ydenormE;	// FInput2E is denormalized
-	output logic				zdenormE;	// FInput3E is denormalized
-	output logic				xinfE;		// FInput1E is infinity
-	output logic				yinfE;		// FInput2E is infinity
-	output logic				zinfE;		// FInput3E is infinity
-
-	// In the actual circuit design, the gates looking at bits
-	// 51:0 and at bits 62:52 should be shared among the various detectors.
-
-	// Check if input is NaN: bits 62:52 all ones and bits 51:0 not all zero
-
-	assign xnanE = &FInput1E[62:52] && |FInput1E[51:0]; 
-	assign ynanE = &FInput2E[62:52] && |FInput2E[51:0]; 
-	assign znanE = &FInput3E[62:52] && |FInput3E[51:0];
-
-	// Check if input is denormalized: bits 62:52 all zero and bits 51:0 not all zero
-
-	assign xdenormE = ~(|FInput1E[62:52]) && |FInput1E[51:0]; 
-	assign ydenormE = ~(|FInput2E[62:52]) && |FInput2E[51:0]; 
-	assign zdenormE = ~(|FInput3E[62:52]) && |FInput3E[51:0];
-
-	// Check if input is infinity: bits 62:52 all ones and bits 51:0 all zero
-
-	assign xinfE = &FInput1E[62:52] && ~(|FInput1E[51:0]); 
-	assign yinfE = &FInput2E[62:52] && ~(|FInput2E[51:0]); 
-	assign zinfE = &FInput3E[62:52] && ~(|FInput3E[51:0]);
-
-	// Check if inputs are all zero (bit 63 is ignored, so both +0 and -0 match).
-	// Denormalized inputs are no longer forced to zero; see the note below.
-	//   In the circuit implementation,  the earlier denorm-forcing version could be
-	// optimized to just check if the exponent is zero.
-	
-	// KATHERINE - commented following (21/01/11)
-	// assign xzeroE = ~(|FInput1E[62:0]) || xdenormE;
-	// assign yzeroE = ~(|FInput2E[62:0]) || ydenormE;
-	// assign zzeroE = ~(|FInput3E[62:0]) || zdenormE;
-	// KATHERINE - removed denorm to prevent outputting zero when computing with a denormalized number
-	assign xzeroE = ~(|FInput1E[62:0]);
-	assign yzeroE = ~(|FInput2E[62:0]);
-	assign zzeroE = ~(|FInput3E[62:0]);
- endmodule
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -122,6 +122,9 @@ string tests32f[] = '{
  };

  string tests64d[] = '{
+    "rv64d/I-FNMADD-D-01", "2000",
+    "rv64d/I-FNMSUB-D-01", "2000",
+    "rv64d/I-FMSUB-D-01", "2000",
    "rv64d/I-FMAX-D-01", "2000",
    "rv64d/I-FMIN-D-01", "2000",
    "rv64d/I-FLE-D-01", "2000",
@ -143,12 +146,9 @@ string tests32f[] = '{
    "rv64d/I-FSD-01", "2000",
    "rv64d/I-FLD-01", "2420",
    "rv64d/I-FMADD-D-01", "2000",
-    // "rv64d/I-FMSUB-D-01", "2000",
-    // "rv64d/I-FMUL-D-01", "2000",
-    "rv64d/I-FMV-D-X-01", "2000",
-    "rv64d/I-FMV-X-D-01", "2000",
-    // "rv64d/I-FNMADD-D-01", "2000",
-    // "rv64d/I-FNMSUB-D-01", "2000",
+    "rv64d/I-FMUL-D-01", "2000",
+    // "rv64d/I-FMV-D-X-01", "2000",
+    // "rv64d/I-FMV-X-D-01", "2000",
    "rv64d/I-FSGNJ-D-01", "2000",
    "rv64d/I-FSGNJN-D-01", "2000",
    "rv64d/I-FSGNJX-D-01", "2000",