Various bugs fixed in FMA

2021-04-13 18:27:13 +00:00 · 2021-04-13 18:27:13 +00:00 · ef011496a7
commit ef011496a7
parent 03bb37a849
20 changed files with 136386 additions and 307226 deletions
--- a/wally-pipelined/src/fpu/FMA/add.sv
+++ b/wally-pipelined/src/fpu/FMA/add.sv
@ -48,7 +48,7 @@ module add(r, s, t, sum,

 	// Compound adder
 	// Consists of 3:2 CSA followed by long compound CPA
-	assign prodshifted = killprod ? 0 : {56'b0, r2, 2'b0} + {56'b0, s2, 2'b0};
+	assign prodshifted = killprod ? 0 : {56'b0, r2+s2, 2'b0};
 	assign sum0 = {1'b0,prodshifted} + t2 + 158'b0;
 	assign sum1 = {1'b0,prodshifted} + t2 + 158'b1; // +1 from invert of z above
 	
--- a/wally-pipelined/src/fpu/FMA/align.sv
+++ b/wally-pipelined/src/fpu/FMA/align.sv
@ -56,7 +56,7 @@ module align(zman, ae, aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t, bs
 	// addend on right shifts.  Handle special cases of shifting
 	// by too much.

-	always @(aligncnt or zman or zdenorm)
+	always @(aligncnt or xzero or yzero or zman or zdenorm or zzero)
 		begin

 		// Default to clearing sticky bits 
@ -67,26 +67,23 @@ module align(zman, ae, aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t, bs
 		killprod = xzero | yzero;
 		// d = aligncnt
 		// p = 53
-		if ($signed(aligncnt) <= $signed(-103)) begin //d<=-2p+1
+		if ($signed(aligncnt) <= $signed(-105)) begin //d<=-2p+1
 			//product ancored case with saturated shift
 			sumshift = 163;	// 3p+4	
 			sumshiftzero = 0;
-			shift = {~zdenorm,zman,163'b0} >> sumshift;
+			shift = {1'b1,zman,163'b0} >> sumshift;
 			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 0;
-		end else if($signed(aligncnt) <= $signed(1))  begin // -2p+1<d<=2
-			// set d<=2 to d<=0
+		end else if($signed(aligncnt) <= $signed(2))  begin // -2p+1<d<=2
 			// product ancored or cancellation
-			// warning: set to 55 rather then 56. was there a typo in the book?
-			sumshift = 57-aligncnt; // p + 3 - d  
+			sumshift = 57-aligncnt; // p + 2 - d  
 			sumshiftzero = 0;
 			shift = {~zdenorm,zman,163'b0} >> sumshift;
 			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 0;
 		end else if ($signed(aligncnt)<=$signed(55))  begin // 2 < d <= p+2
-			// another typo in book? above was 55 changed to 52
 			// addend ancored case
 			// used to be 56 \/ somthing doesn't seem right too many typos
 			sumshift = 57-aligncnt;
--- a/wally-pipelined/src/fpu/FMA/booth.sv
+++ b/wally-pipelined/src/fpu/FMA/booth.sv
@ -0,0 +1,55 @@
+module booth(xExt, choose, add1, e, pp); 
+/////////////////////////////////////////////////////////////////////////////
+// Radix-4 Booth selector for a single partial product.
+//
+// choose picks one multiple of xExt out of {0, +1, +2, -2, -1, -0}.
+// Negative multiples are emitted with their bits inverted (one's complement);
+// add1 carries the value that completes the two's-complement negation
+// (2 for the -2 case, 1 for -1 and -0, 0 otherwise).  e equals choose[2],
+// i.e. it is set for every negative selection.  How add1 and e are folded
+// into the partial-product array is decided by the instantiating module.
+    
+	input 		[53:0]		xExt;				// multiplicand
+	input		[2:0]		choose;				// three-bit window selecting the multiple
+	output		[1:0]       	add1;				// correction that accompanies a negative multiple
+    output                  e;					// 1 for every negative selection
+	output		[54:0]		pp;				// selected multiple (bit-inverted when negative)
+    
+    logic [54:0] pp;
+    logic e;
+    logic [1:0] add1;
+    logic [53:0] negx;						// bitwise inverse of the multiplicand
+
+    assign negx = ~xExt;
+
+    // One table drives all three outputs so the encodings cannot drift apart.
+    // always_comb derives the sensitivity list itself and evaluates at time 0.
+    always_comb
+        case (choose)
+            3'b000 : begin pp = 55'b0;         e = 1'b0; add1 = 2'b00; end  //  0
+            3'b001,
+            3'b010 : begin pp = {1'b0, xExt};  e = 1'b0; add1 = 2'b00; end  //  1
+            3'b011 : begin pp = {xExt, 1'b0};  e = 1'b0; add1 = 2'b00; end  //  2
+            3'b100 : begin pp = {negx, 1'b0};  e = 1'b1; add1 = 2'b10; end  // -2
+            3'b101,
+            3'b110 : begin pp = {1'b1, negx};  e = 1'b1; add1 = 2'b01; end  // -1
+            // all ones, written at the exact vector width (no literal truncation)
+            3'b111 : begin pp = {55{1'b1}};    e = 1'b1; add1 = 2'b01; end  // -0
+        endcase
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/compressors.sv
+++ b/wally-pipelined/src/fpu/FMA/compressors.sv
@ -0,0 +1,90 @@
+module add3comp2 #(parameter BITS = 4) (
+    input  [BITS-1:0] a,        // first operand
+    input  [BITS-1:0] b,        // second operand
+    input  [BITS-1:0] c,        // third operand
+    output [BITS-1:0] carry,    // per-bit carry outputs, not shifted here
+    output [BITS-1:0] sum       // per-bit sum outputs
+);
+/////////////////////////////////////////////////////////////////////////////
+// BITS-wide 3:2 compressor: one independent sng3comp2 cell per bit position.
+// carry[k] and sum[k] depend only on a[k], b[k] and c[k]; there is no
+// connection between neighbouring bit positions inside this module.
+//look into different implementations of the compressors?
+
+    genvar bitIdx;
+
+    generate
+        for (bitIdx = 0; bitIdx < BITS; bitIdx = bitIdx + 1) begin
+            sng3comp2 add0(.a(a[bitIdx]),
+                           .b(b[bitIdx]),
+                           .c(c[bitIdx]),
+                           .carry(carry[bitIdx]),
+                           .sum(sum[bitIdx]));
+        end
+    endgenerate
+
+endmodule
+
+module add4comp2(a, b, c, d, carry, sum); 
+/////////////////////////////////////////////////////////////////////////////
+// BITS-wide 4:2 compressor built from a chain of sng4comp2 cells.
+//
+// The intermediate carry (cout) of bit k feeds the cin of bit k+1; bit 0
+// gets cin = 0.  The outputs satisfy
+//     a + b + c + d == sum + 2*carry
+// so the caller is expected to shift carry left by one before adding it.
+// carry is one bit wider than the operands because the top cell produces
+// two bits of equal weight (its carry and its cout) that must be summed.
+//
+// Requires BITS >= 2: bit 0 and bit BITS-1 are instantiated separately.
+    
+    parameter BITS = 4;
+	input 		[BITS-1:0]		a;
+	input		[BITS-1:0]		b;
+	input		[BITS-1:0]    	c;
+	input		[BITS-1:0]    	d;
+    output      [BITS:0]      carry;		// unshifted carry vector, one bit wider than the operands
+	output		[BITS-1:0]		sum;
+
+    logic       [BITS-1:0]      cout;		// intermediate carry chained into the next bit's cin
+    logic                       carryTmp;	// carry output of the top cell before merging with cout
+    genvar i;
+
+
+    // lowest bit: nothing to chain in
+    sng4comp2 add0(a[0], b[0], c[0], d[0], 1'b0, cout[0], carry[0], sum[0]);
+
+    generate
+        for(i= 1; i<BITS-1; i=i+1) begin
+            sng4comp2 add1(a[i], b[i], c[i], d[i], cout[i-1], cout[i], carry[i], sum[i]);
+        end
+    endgenerate
+
+
+    // highest bit: its carry and cout have the same weight and are merged below
+    sng4comp2 add2(a[BITS-1], b[BITS-1], c[BITS-1], d[BITS-1], cout[BITS-2], cout[BITS-1], carryTmp, sum[BITS-1]);
+
+    // Half-add the two equal-weight bits: the XOR stays at this position and
+    // the AND moves one position up.  (These two were previously swapped,
+    // which broke the sum + 2*carry identity whenever exactly one of them
+    // was set, or both.)
+    assign carry[BITS-1] = carryTmp ^ cout[BITS-1];
+    assign carry[BITS] = carryTmp & cout[BITS-1];
+
+endmodule
+
+module sng3comp2(
+    input  a,
+    input  b,
+    input  c,
+    output carry,
+    output sum
+);
+/////////////////////////////////////////////////////////////////////////////
+// Single-bit 3:2 compressor.
+//   sum   = a ^ b ^ c
+//   carry = c when a and b differ, otherwise a
+//look into different implementations of the compressors?
+
+    logic differ;   // 1 when a and b are not equal
+
+    assign differ = a ^ b;
+
+    // carry is a 2:1 mux steered by the a/b comparison
+    assign carry = differ ? c : a;
+    assign sum   = differ ^ c;
+
+endmodule
+
+module sng4comp2(
+    input  a,
+    input  b,
+    input  c,
+    input  d,
+    input  cin,
+    output cout,
+    output carry,
+    output sum
+);
+/////////////////////////////////////////////////////////////////////////////
+// Single-bit 4:2 compressor made of two cascaded sng3comp2 cells.
+// The first cell reduces a, b, c; its carry leaves the module as cout.
+// The second cell reduces that cell's sum together with d and cin and
+// produces the module's carry and sum.
+//look into pass gate 4:2 counters?
+
+    logic firstSum;   // sum output of the first cell
+
+    sng3comp2 add1(.a(a),        .b(b), .c(c),   .carry(cout),  .sum(firstSum));
+    sng3comp2 add2(.a(firstSum), .b(d), .c(cin), .carry(carry), .sum(sum));
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/expgen.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen.sv
@ -17,7 +17,7 @@
 /////////////////////////////////////////////////////////////////////////////
 module expgen(xexp, yexp, zexp,
 			   killprod,  sumzero, resultdenorm, normcnt, infinity, 
-			   invalid, overflow, underflow, inf, xzero, yzero,expplus1,
+			   FmaFlagsM, inf, xzero, yzero,expplus1,
 			   nan, de0, xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, specialsel, zexpsel,
 			   aligncnt, wexp,
 			   prodof, sumof, sumuf, denorm0, ae);
@ -31,9 +31,7 @@ module expgen(xexp, yexp, zexp,
 	input     			resultdenorm;  // postnormalize rounded result
 	input     	[8:0]  		normcnt;     	// normalization shift count 
 	input     			infinity;    	// generate infinity on overflow 
-	input     			invalid;     	// Result invalid
-	input     			overflow;    	// Result overflowed
-	input     			underflow;   	// Result underflowed 
+	input     	[4:0]	FmaFlagsM;     	// Result invalid
 	input     			inf;			// Some input is infinity
 	input     			nan;			// Some input is NaN
 	input     	[12:0]		de0;			// X is NaN NaN
@ -121,10 +119,10 @@ module expgen(xexp, yexp, zexp,
 	// produces either infinity or the largest finite number, depending on the
 	// rounding mode.  NaNs are propagated or generated.

-	assign specialres = invalid | nan ? nanres : // KEP added nan
-					overflow ? infinityres : 
+	assign specialres = FmaFlagsM[4] | nan ? nanres : // invalid
+					FmaFlagsM[2] ? infinityres : 	//overflow
 					inf ? 11'b11111111111 :
-					underflow ? 11'b0 : 11'bx;
+					FmaFlagsM[1] ? 11'b0 : 11'bx; //underflow

 	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;

--- a/wally-pipelined/src/fpu/FMA/flag.sv
+++ b/wally-pipelined/src/fpu/FMA/flag.sv
@ -10,12 +10,13 @@
 /////////////////////////////////////////////////////////////////////////////
 module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 			 psign,  zsign, xzero, yzero, zzero, vbits, killprod,
-			 inf, nan, invalid, overflow, underflow, inexact);
+			 inf, nan, FmaFlagsM,sticky);
 /////////////////////////////////////////////////////////////////////////////

 	input                  		xnan;        	// X is NaN 
 	input                  		ynan;        	// Y is NaN 
 	input                 		znan;       	// Z is NaN 
+	input                  		sticky;        	// Sticky bit
 	input                  		xinf;        	// X is Inf
 	input                 		yinf;       	// Y is Inf 
 	input                  		zinf;        	// Z is Inf
@ -31,10 +32,7 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	input     	[1:0]  		vbits;		// R and S bits of result
 	output				inf;		// Some	source is Inf
 	output				nan;		// Some	source is NaN
-	output				invalid;	// Result is invalid	
-	output				overflow;	// Result overflowed	
-	output				underflow;	// Result underflowed	
-	output				inexact;	// Result is not an exact number
+	output		[4:0]	FmaFlagsM;
 
 	//   Internal nodes

@ -55,33 +53,36 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,

 	assign prodinf = prodof && ~xnan && ~ynan;
 	//KEP added if the product is infinity then sum is infinity
-	assign suminf = prodinf | sumof && ~xnan && ~ynan && ~znan;
+	assign suminf = sumof && ~xnan && ~ynan && ~znan;

 	// Set invalid flag for following cases:
 	//   1) Inf - Inf
 	//   2) 0 * Inf
 	//   3) Output = NaN (this is not part of the IEEE spec,  only 486 proj)

-	assign invalid = (xinf || yinf || prodinf) && zinf && (psign ^ zsign) ||
+	assign FmaFlagsM[4] = (xinf || yinf || prodinf) && zinf && (psign ^ zsign) ||
 					   xzero && yinf || yzero && xinf;// KEP remove case 3) above

+	assign FmaFlagsM[3] = 0; // divide by zero flag
+
+
 	// Set the overflow flag for the following cases:
 	//   1) Rounded multiply result would be out of bounds
 	//   2) Rounded add result would be out of bounds

-	assign overflow = suminf && ~inf;
+	assign FmaFlagsM[2] = suminf && ~inf;

 	// Set the underflow  flag for the following cases:
 	//   1) Any input is denormalized
 	//   2)  Output would be denormalized or smaller

-	assign underflow = (sumuf && ~inf && ~prodinf && ~nan) || (killprod & zzero & ~(yzero | xzero));
+	assign FmaFlagsM[1] = (sumuf && ~inf && ~prodinf && ~nan) || (killprod & zzero & ~(yzero | xzero));

 	// Set the inexact flag for the following cases:
 	//   1) Multiplication inexact
 	//   2) Addition  inexact
 	// One of these cases occurred if the R or S bit is set

-	assign inexact = (vbits[0] || vbits[1]  || suminf) && ~(inf || nan);
+	assign FmaFlagsM[0] = (vbits[0] || vbits[1] ||sticky  || suminf) && ~(inf || nan);

 endmodule
--- a/wally-pipelined/src/fpu/FMA/fmac.sv
+++ b/wally-pipelined/src/fpu/FMA/fmac.sv
@ -15,13 +15,13 @@
 //    normalize Normalization shifter
 //    round     Rounding of result
 //    exception Handles exceptional cases
-//    bypass    Handles bypass of result to X or Z inputs
+//    bypass    Handles bypass of result to ReadData1E or ReadData3E inputs
 //    sign      One bit sign handling block 
 //    special   Catch special cases (inputs = 0  / infinity /  etc.) 
 //
-//   The FMAC computes W=X*Y+Z, rounded with the mode specified by
+//   The FMAC computes FmaResultM=ReadData1E*ReadData2E+ReadData3E, rounded with the mode specified by
 //   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the X or Z inputs for use on the next cycle.  In addition,  four signals
+//   the ReadData1E or ReadData3E inputs for use on the next cycle.  In addition,  four signals
 //   are produced: trap, overflow, underflow, and inexact.  Trap indicates
 //   an infinity, NaN, or denormalized number to be handled in software;
 //   the other three signals are IEEE flags.
@ -29,29 +29,17 @@
 /////////////////////////////////////////////////////////////////////////////

 /////////////////////////////////////////////////////////////////////////////
-module fmac(x, y, z, rn, rz, rp, rm,
-			earlyres, earlyressel, bypsel, bypplus1, byppostnorm, 
-			w, wbypass, invalid, overflow, underflow, inexact);
+module fma(ReadData1E, ReadData2E, ReadData3E, FrmE,
+			FmaResultM, FmaFlagsM, aligncnt);
 /////////////////////////////////////////////////////////////////////////////
 
-	input 		[63:0]		x;			// input X from reg file
-	input		[63:0]		y;				// input Y  
-	input 		[63:0]		z;          	// input Z from reg file 
-	input 			 		rn;          	// Round to Nearest
-	input 					rz;           	// Round toward zero
-	input 					rm;          	// Round toward minus infinity
-	input 					rp;          	// Round toward plus infinity
-	input 		[63:0]		earlyres;    	// Early result from other FP logic
-	input 					earlyressel;	// Select early result, not W 
-	input 		[1:0]		bypsel;     	// Select W bypass to X, or z 
-	input 					bypplus1;    	// Add one in bypass
-	input 					byppostnorm;	// postnormalize in bypass
-	output 		[63:0]		w;           	// output W=X*Y+Z
-	output 		[63:0]		wbypass;     	// prerounded output W=X*Y+Z for bypass
-	output 					invalid;    	// Result is invalid 
-	output					overflow;		// Result overflowed 
-	output					underflow;   	// Result underflowed
-	output 					inexact;     	// Result is not an exact number 
+	input 		[63:0]		ReadData1E;		// input 1
+	input		[63:0]		ReadData2E;     // input 2 
+	input 		[63:0]		ReadData3E;     // input 3
+	input 		[2:0]	 	FrmE;          	// Rounding mode
+	output 		[63:0]		FmaResultM;     // output FmaResultM=ReadData1E*ReadData2E+ReadData3E
+	output 		[4:0]		FmaFlagsM;    	// status flags
+	output 		[12:0]		aligncnt;    	// shift count for alignment

 // Internal nodes
 
@ -60,12 +48,12 @@ module fmac(x, y, z, rn, rz, rp, rm,
 	logic 		[163:0]		t;				// output of alignment shifter
 	logic 		[163:0]		sum;			// output of carry prop adder
 	logic 		[53:0]		v; 				// normalized sum, R, S bits
-	logic 		[12:0]		aligncnt; 		// shift count for alignment
+//	logic 		[12:0]		aligncnt; 		// shift count for alignment
 	logic 		[8:0]		normcnt; 		// shift count for normalizer
 	logic 		[12:0]		ae; 		// multiplier expoent
 	logic 					bs;				// sticky bit of addend
 	logic 					ps;				// sticky bit of product
-	logic 					killprod; 		// Z >> product
+	logic 					killprod; 		// ReadData3E >> product
 	logic 					negsum; 		// negate sum
 	logic 					invz; 			// invert addend
 	logic 					selsum1; 		// select +1 mode of sum
@ -73,7 +61,7 @@ module fmac(x, y, z, rn, rz, rp, rm,
 	logic 					negsum1; 		// sum +1 < 0
 	logic 					sumzero; 		// sum = 0
 	logic 					infinity; 		// generate infinity on overflow
-	logic 					prodof; 		// X*Y out of range
+	logic 					prodof; 		// ReadData1E*ReadData2E out of range
 	logic 					sumof;			// result out of range
 	logic					xzero;
 	logic					yzero;
@ -101,6 +89,9 @@ module fmac(x, y, z, rn, rz, rp, rm,
 	logic			[8:0]		sumshift;
 	logic					sumshiftzero;
 	logic			[12:0]		de0;
+	logic					isAdd;
+
+	assign isAdd = 1;



@ -117,16 +108,16 @@ module fmac(x, y, z, rn, rz, rp, rm,

 //   Instantiate fraction datapath

-	multiply		multiply(.xman(x[51:0]), .yman(y[51:0]), .*);
-	align			align(.zman(z[51:0]),.*);
+	multiply		multiply(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]), .*);
+	align			align(.zman(ReadData3E[51:0]),.*);
 	add				add(.*);
 	lza				lza(.*);
-	normalize		normalize(.zexp(z[62:52]),.*); 
-	round			round(.xman(x[51:0]), .yman(y[51:0]),.zman(z[51:0]), .wman(w[51:0]),.wsign(w[63]),.*);
+	normalize		normalize(.xexp(ReadData1E[62:52]),.yexp(ReadData2E[62:52]),.zexp(ReadData3E[62:52]),.*); 
+	round			round(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]),.zman(ReadData3E[51:0]), .wman(FmaResultM[51:0]),.wsign(FmaResultM[63]),.*);

 // Instantiate exponent datapath

-	expgen			expgen(.xexp(x[62:52]),.yexp(y[62:52]),.zexp(z[62:52]),.wexp(w[62:52]),.*);
+	expgen			expgen(.xexp(ReadData1E[62:52]),.yexp(ReadData2E[62:52]),.zexp(ReadData3E[62:52]),.wexp(FmaResultM[62:52]),.*);
 // Instantiate special case detection across datapath & exponent path 

 	special			special(.*);
@ -134,8 +125,8 @@ module fmac(x, y, z, rn, rz, rp, rm,

 // Instantiate control logic
 
-sign				sign(.xsign(x[63]),.ysign(y[63]),.zsign(z[63]),.wsign(w[63]),.*); 
-flag				flag(.zsign(z[63]),.vbits(v[1:0]),.*); 
+sign				sign(.xsign(ReadData1E[63]),.ysign(ReadData2E[63]),.zsign(ReadData3E[63]),.wsign(FmaResultM[63]),.*); 
+flag				flag(.zsign(ReadData3E[63]),.vbits(v[1:0]),.*); 

 endmodule

--- a/wally-pipelined/src/fpu/FMA/lza.sv
+++ b/wally-pipelined/src/fpu/FMA/lza.sv
@ -30,7 +30,7 @@ module lza(sum, normcnt, sumzero);
 	always @ ( sum)
 		begin
 			i =   0;
-			while (~sum[108-i] && i < 108) i = i+1;  // search for leading one 
+			while (~sum[163-i] && i <= 163) i = i+1;  // search for leading one 
 			normcnt = i;    // compute shift count
 	end

--- a/wally-pipelined/src/fpu/FMA/multiply.sv
+++ b/wally-pipelined/src/fpu/FMA/multiply.sv
@ -10,8 +10,124 @@ module multiply(xman, yman, xdenorm, ydenorm, xzero, yzero, r, s);
 	input     			yzero;		// Z is denorm
 	output		[105:0]		r;				//	partial product 1	
 	output		[105:0]		s;				//	partial product 2	
+    
+     wire        [54:0]      yExt; //y with appended 0 and assumed 1
+     wire        [53:0]      xExt; //x with assumed 1
+     wire [26:0][1:0] add1;
+     wire [26:0][54:0] pp; 
+     wire [26:0] e;
+     logic [17:0][105:0] lv1add;
+     logic [11:0][105:0] lv2add;
+     logic [7:0][105:0] lv3add;
+     logic [3:0][105:0] lv4add;
+     logic [21:0][106:0] carryTmp;
+     wire [26:0][105:0] acc; 
+     // wire [105:0] acc
+    genvar i;	

-	assign r = 106'b0;
-	assign s = {53'b0,~(xdenorm|xzero),xman}  *  {53'b0,~(ydenorm|yzero),yman};
+	assign xExt = {2'b0,~(xdenorm|xzero),xman};
+	assign yExt = {2'b0,~(ydenorm|yzero),yman, 1'b0};
+    
+     generate
+        for(i=0; i<27; i=i+1) begin
+            booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
+        end
+     endgenerate

+    assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
+    assign acc[1] = {50'b01,~e[1],pp[1],add1[0]}; 
+    assign acc[2] = {48'b01,~e[2],pp[2],add1[1], 2'b0};
+    assign acc[3] = {46'b01,~e[3],pp[3],add1[2], 4'b0};
+    assign acc[4] = {44'b01,~e[4],pp[4],add1[3], 6'b0};
+    assign acc[5] = {42'b01,~e[5],pp[5],add1[4], 8'b0};
+    assign acc[6] = {40'b01,~e[6],pp[6],add1[5], 10'b0};
+    assign acc[7] = {38'b01,~e[7],pp[7],add1[6], 12'b0};
+    assign acc[8] = {36'b01,~e[8],pp[8],add1[7], 14'b0};
+    assign acc[9] = {34'b01,~e[9],pp[9],add1[8], 16'b0};
+    assign acc[10] = {32'b01,~e[10],pp[10],add1[9], 18'b0};
+    assign acc[11] = {30'b01,~e[11],pp[11],add1[10], 20'b0};
+    assign acc[12] = {28'b01,~e[12],pp[12],add1[11], 22'b0};
+    assign acc[13] = {26'b01,~e[13],pp[13],add1[12], 24'b0};
+    assign acc[14] = {24'b01,~e[14],pp[14],add1[13], 26'b0};
+    assign acc[15] = {22'b01,~e[15],pp[15],add1[14], 28'b0};
+    assign acc[16] = {20'b01,~e[16],pp[16],add1[15], 30'b0};
+    assign acc[17] = {18'b01,~e[17],pp[17],add1[16], 32'b0};
+    assign acc[18] = {16'b01,~e[18],pp[18],add1[17], 34'b0};
+    assign acc[19] = {14'b01,~e[19],pp[19],add1[18], 36'b0};
+    assign acc[20] = {12'b01,~e[20],pp[20],add1[19], 38'b0};
+    assign acc[21] = {10'b01,~e[21],pp[21],add1[20], 40'b0};
+    assign acc[22] = {8'b01,~e[22],pp[22],add1[21], 42'b0};
+    assign acc[23] = {6'b01,~e[23],pp[23],add1[22], 44'b0};
+    assign acc[24] = {4'b01,~e[24],pp[24],add1[23], 46'b0};
+    assign acc[25] = {~e[25],pp[25],add1[24], 48'b0};
+    assign acc[26] = {pp[26],add1[25], 50'b0};
+
+    //*** resize adders
+     generate
+        for(i=0; i<9; i=i+1) begin
+            add3comp2 #(.BITS(106)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
+                                           .carry(carryTmp[i][105:0]), .sum(lv1add[i*2+1]));
+            assign lv1add[i*2] = {carryTmp[i][104:0], 1'b0};
+        end
+     endgenerate
+
+     generate
+        for(i=0; i<6; i=i+1) begin
+            add3comp2 #(.BITS(106)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
+                                           .carry(carryTmp[i+9][105:0]), .sum(lv2add[i*2+1]));
+            assign lv2add[i*2] = {carryTmp[i+9][104:0], 1'b0};
+        end
+     endgenerate
+
+    generate
+        for(i=0; i<4; i=i+1) begin
+            add3comp2 #(.BITS(106)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
+                                            .carry(carryTmp[i+15][105:0]), .sum(lv3add[i*2+1]));
+            assign lv3add[i*2] = {carryTmp[i+15][104:0], 1'b0};
+        end
+    endgenerate
+
+
+    generate
+        for(i=0; i<2; i=i+1) begin
+            add4comp2 #(.BITS(106)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
+                                            .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
+            assign lv4add[i*2] = {carryTmp[i+19][104:0], 1'b0};
+        end
+    endgenerate
+
+    add4comp2 #(.BITS(106)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
+                                    .carry(carryTmp[21]), .sum(s));
+    assign r = {carryTmp[21][104:0], 1'b0};
+		// assign r = 0;
+		// assign s = acc[0] +
+		// 		   acc[1] +
+		// 		   acc[2] +
+		// 		   acc[3] +
+		// 		   acc[4] +
+		// 		   acc[5] +
+		// 		   acc[6] +
+		// 		   acc[7] +
+		// 		   acc[8] +
+		// 		   acc[9] +
+		// 		   acc[10] +
+		// 		   acc[11] +
+		// 		   acc[12] +
+		// 		   acc[13] +
+		// 		   acc[14] +
+		// 		   acc[15] +
+		// 		   acc[16] +
+		// 		   acc[17] +
+		// 		   acc[18] +
+		// 		   acc[19] +
+		// 		   acc[20] +
+		// 		   acc[21] +
+		// 		   acc[22] +
+		// 		   acc[23] +
+		// 		   acc[24] +
+		// 		   acc[25] +
+		// 		   acc[26];
+
+			// assign s = {53'b0,~(xdenorm|xzero),xman}  *  {53'b0,~(ydenorm|yzero),yman};
+			// assign r = 0;
 endmodule
--- a/wally-pipelined/src/fpu/FMA/normalize.sv
+++ b/wally-pipelined/src/fpu/FMA/normalize.sv
@ -14,9 +14,11 @@
 /////////////////////////////////////////////////////////////////////////////

 /////////////////////////////////////////////////////////////////////////////
-module normalize(sum, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero, sumzero, xzero, yzero, bs, ps, denorm0, xdenorm, ydenorm, zdenorm, sticky, de0, resultdenorm, v); 
+module normalize(sum, xexp, yexp, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero, sumzero, xzero, zzero, yzero, bs, ps, denorm0, xdenorm, ydenorm, zdenorm, sticky, de0, resultdenorm, v); 
 /////////////////////////////////////////////////////////////////////////////
 	input     	[163:0]  	sum;            // sum
+	input     	[62:52]  	xexp;            // sum
+	input     	[62:52]  	yexp;            // sum
 	input     	[62:52]  	zexp;            // sum
 	input		[8:0] 		normcnt;     	// normalization shift count
 	input		[12:0] 		ae;     	// normalization shift count
@ -33,6 +35,7 @@ module normalize(sum, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero,
 	input                  		zdenorm;        // Input Z is denormalized
 	input				xzero;
 	input				yzero;
+	input				zzero;
 	output				sticky;		//sticky bit
 	output		[12:0]		de0;
 	output                  	resultdenorm;        // Input Z is denormalized
@ -47,6 +50,7 @@ module normalize(sum, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero,
 	logic		[9:0]		sumshifttmp;
 	logic       	[163:0]  	sumshiftedtmp;     // shifted sum
 	logic 				sticky;
+	logic				isShiftLeft1;
 logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;

 	// When the sum is zero,  normalization does not apply and only the
@ -60,21 +64,23 @@ logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;
 	// The sticky bit calculation is actually built into the shifter and
 	// does not require a true subtraction shown in the model.
 
+	assign isShiftLeft1 = (aligncnt == 1 ||aligncnt == 0 || $signed(aligncnt) == $signed(-1))&& zexp == 11'h2;//((xexp == 11'h3ff && yexp == 11'h1) || (yexp == 11'h3ff && xexp == 11'h1)) && zexp == 11'h2;
 	assign tmp = ($signed(ae-normcnt+2) >= $signed(-1022));
-	always @(sum or sumshift or ae or aligncnt or normcnt or bs or zexp or zdenorm)
+	always @(sum or sumshift or ae or aligncnt or normcnt or bs or isShiftLeft1 or zexp or zdenorm)
 		begin
 		// d = aligncnt
 		// l = normcnt
 		// p = 53
 		// ea + eb = ae
 			// set d<=2 to d<=0
-			if ($signed(aligncnt)<=$signed(1))  begin //d<=2 
+			if ($signed(aligncnt)<=$signed(2))  begin //d<=2 
 				// product anchored or cancellation
 				if ($signed(ae-normcnt+2) >= $signed(-1022)) begin //ea+eb-l+2 >= emin
 					//normal result
-					de0 = xzero|yzero ? zexp : ae-normcnt+2+xdenorm+ydenorm;
-					resultdenorm = |sum & ~|de0;
-					sumshifted = resultdenorm ? sum << sumshift : sum << (55+normcnt); // p+2+l
+					de0 = xzero|yzero ? zexp : ae-normcnt+xdenorm+ydenorm+57;
+					resultdenorm = |sum & ~|de0 | de0[12];
+					// if z is zero then there was a 56 bit shift of the product
+					sumshifted = resultdenorm ? sum << sumshift-zzero+isShiftLeft1 : sum << normcnt; // p+2+l
 					v = sumshifted[162:109];
 					sticky = (|sumshifted[108:0]) | bs;
 					//de0 = ae-normcnt+2-1023;
@ -90,8 +96,8 @@ logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;
 				sumshifttmp = {1'b0,sumshift} - 2;
 				sumshifted = sumshifttmp[9] ? sum : sum << sumshifttmp;
 				tmp1 = (sumshifted[163] & ~sumshifttmp[9]);
-				tmp2 = (sumshifttmp[9] || sumshifted[162]);
-				tmp3 = sumshifted[161];
+				tmp2 = ((sumshifttmp[9] & sumshift[0]) || sumshifted[162]);
+				tmp3 = (sumshifted[161] || (sumshifttmp[9] & sumshift[1]));
 				tmp4 = sumshifted[160];
 				tmp5 = sumshifted[159];
 				// for some reason use exp = zexp + {0,1,2}
@ -112,25 +118,31 @@ logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;
 					v = sumshifted[160:107];
 					sticky = (|sumshifted[106:0]) | bs;
 					//de0 = zexp-1;
-					de0 = zexp;
-				end else if(sumshifted[160]) begin
-					v = sumshifted[159:106];
+					de0 = zexp+zdenorm;
+				end else if(sumshifted[160]& ~zdenorm) begin
+					de0 = zexp-1;
+					v = ~|de0&~sumzero ? sumshifted[160:107] : sumshifted[159:106];
 					sticky = (|sumshifted[105:0]) | bs;
 					//de0 = zexp-1;
-					de0 = zexp-1;
-				end else if(sumshifted[159]) begin
-					v = sumshifted[158:105];
+				end else if(sumshifted[159]& ~zdenorm) begin
+					//v = sumshifted[158:105];
+					de0 = zexp-2;
+					v = (~|de0 | de0[12])&~sumzero ? sumshifted[161:108] : sumshifted[158:105];
 					sticky = (|sumshifted[104:0]) | bs;
 					//de0 = zexp-1;
-					de0 = zexp-2;
-				end else begin					
+				end else if(zdenorm) begin					
 					v = sumshifted[160:107];
 					sticky = (|sumshifted[106:0]) | bs;
 					//de0 = zexp-1;
 					de0 = zexp;
+				end else begin
+					de0 = 0;
+					sumshifted = sum << sumshift-1; // p+2+l
+					v = sumshifted[162:109];
+					sticky = (|sumshifted[108:0]) | bs;
 				end

-				resultdenorm = ~(|de0);
+				resultdenorm = (~|de0 | de0[12]);
 		end 
 	end

--- a/wally-pipelined/src/fpu/FMA/round.sv
+++ b/wally-pipelined/src/fpu/FMA/round.sv
@ -13,22 +13,17 @@
 /////////////////////////////////////////////////////////////////////////////

 /////////////////////////////////////////////////////////////////////////////
-module round(v, sticky, rz, rn, rp, rm, wsign,
-			  invalid, overflow, underflow, inf, nan, xnan, ynan, znan, 
+module round(v, sticky, FrmE, wsign,
+			  FmaFlagsM, inf, nan, xnan, ynan, znan, 
 			  xman, yman, zman,
 			  wman, infinity, specialsel,expplus1);
 /////////////////////////////////////////////////////////////////////////////

 	input		[53:0]		v;		// normalized sum, R, S bits
 	input				sticky;		//sticky bit
-	input				rz;		// Round toward zero
-	input				rn;		// Round toward	nearest
-	input				rp;		// Round toward	plus infinity
-	input				rm;		// Round toward	minus infinity
+	input		[2:0]	FrmE;
 	input				wsign;		// Sign of result
-	input 				invalid;	// Trap on infinity, NaN, denorm
-	input				overflow;	// Result overflowed
-	input				underflow;	// Result underflowed
+	input 		[4:0]	FmaFlagsM;
 	input				inf;		// Some input is infinity
 	input				nan;		// Some input is NaN
 	input				xnan;		// X is NaN
@ -45,7 +40,7 @@ module round(v, sticky, rz, rn, rp, rm, wsign,

 	// Internal nodes

-	wire				plus1;		// Round by adding one 
+	logic				plus1;		// Round by adding one 
 	wire		[52:0]		v1;		// Result + 1 (for rounding)
 	wire		[51:0]		specialres;	// Result of exceptional case 
 	wire		[51:0]		infinityres;	// Infinity or largest real number
@ -62,9 +57,19 @@ module round(v, sticky, rz, rn, rp, rm, wsign,
 	//	0xx - do nothing
 	//	100 - tie - plus1 if v[2] = 1
 	//	101/110/111 - plus1
-	assign plus1 = (rn & v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2]))) |
-		       (rp & ~wsign) |
-		       (rm & wsign);
+	always @ (FrmE, v, wsign, sticky) begin
+		case (FrmE)
+			3'b000: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2])));//round to nearest even
+			3'b001: plus1 = 0;//round to zero
+			3'b010: plus1 = wsign;//round down
+			3'b011: plus1 = ~wsign;//round up
+			3'b100: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&~wsign)));//round to nearest max magnitude
+			default: plus1 = 1'bx;
+		endcase
+	end
+	// assign plus1 = (rn & v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2]))) |
+	// 	       (rp & ~wsign) |
+	// 	       (rm & wsign);
 	//assign plus1 = rn && ((v[1] && v[0]) || (v[2] && (v[1]))) ||
 	//				 rp && ~wsign && (v[1] || v[0]) ||
 	//				 rm && wsign && (v[1] || v[0]);
@ -84,17 +89,17 @@ module round(v, sticky, rz, rn, rp, rm, wsign,
 	// inputs to the wide muxes can be combined at the expense of more
 	// complicated non-critical control in the circuit implementation.

-	assign specialsel =  overflow || underflow || invalid ||
+	assign specialsel =  FmaFlagsM[2] ||  FmaFlagsM[1] ||  FmaFlagsM[4] || //overflow underflow invalid
 							nan || inf;
-	assign specialres = invalid | nan ? nanres : //KEP added nan
-						 overflow ? infinityres : 
+	assign specialres = FmaFlagsM[4] | nan ? nanres : //invalid
+						 FmaFlagsM[2] ? infinityres : //overflow
 						 inf ? 52'b0 :
-						underflow ? 52'b0 : 52'bx;  // default to undefined 
+						 FmaFlagsM[1] ? 52'b0 : 52'bx;  // underflow

 	// Overflow is handled differently for different rounding modes
 	// Round is to either infinity or to maximum finite number

-	assign infinity = rn || (rp && ~wsign) || (rm && wsign);
+	assign infinity =  |FrmE;//rn || (rp && ~wsign) || (rm && wsign);//***look into this
 	assign infinityres = infinity ? 52'b0 : {52{1'b1}};

 	// Invalid operations produce a quiet NaN. The result should
--- a/wally-pipelined/src/fpu/FMA/sign.sv
+++ b/wally-pipelined/src/fpu/FMA/sign.sv
@ -10,23 +10,24 @@
 /////////////////////////////////////////////////////////////////////////////

 /////////////////////////////////////////////////////////////////////////////
-module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm, overflow,
-			 sumzero, nan, invalid, xinf, yinf, zinf, inf, wsign, invz, negsum, selsum1, psign);
+module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, FrmE, FmaFlagsM, zzero,
+			 sumzero, nan, xinf, yinf, zinf, inf, wsign, invz, negsum, selsum1, psign, isAdd);
 ////////////////////////////////////////////////////////////////////////////I
 
 	input					xsign;			// Sign of X 
 	input					ysign;			// Sign of Y 
 	input					zsign;			// Sign of Z
+	input					zzero;
+	input					isAdd;
 	input					negsum0;		// Sum in +O mode is negative 
 	input					negsum1;		// Sum in +1 mode is negative 
 	input					bs;				// sticky bit from addend
 	input					ps;				// sticky bit from product
 	input					killprod;		// Product forced to zero
-	input					rm;				// Round toward minus infinity
-	input					overflow;				// Round toward minus infinity
+	input		[2:0]		FrmE;				// Rounding mode (3'b010 = round toward minus infinity)
+	input		[4:0]		FmaFlagsM;				// Exception flags: [4] invalid, [2] overflow, [1] underflow
 	input					sumzero;		// Sum = O
 	input					nan;			// Some input is NaN
-	input					invalid;		// Result invalid
 	input					xinf;			// X = Inf
 	input					yinf;			// Y = Inf
 	input					zinf;			// Y = Inf
@ -96,10 +97,24 @@ logic tmp;
 	//			 shall be +0 in all rounding attributes EXCEPT roundTowardNegative. Under that attribute, the sign of an exact zero 
 	//			 sum/difference shall be -0.  However, x+x = x-(-X) retains the same sign as x even when x is zero."
 
-	assign zerosign = (~invz && killprod) ? zsign : rm;
+	//assign zerosign = (~invz && killprod) ? zsign : rm;//***look into
+//	assign zerosign = (~invz && killprod) ? zsign : 0;
+	// zero sign
+	//	if product underflows then use psign
+	//	otherwise
+	//		addition
+	//			if cancelation then 0 unless round to -inf
+	//			otherwise psign
+	//		subtraction
+	//			if cancelation then 0 unless round to -inf
+	//			otherwise psign
+
+	assign zerosign = FmaFlagsM[1] ? psign :
+			  (isAdd ? (psign^zsign ? FrmE == 3'b010 : psign) :
+				  (psign^zsign ? psign : FrmE == 3'b010));
 	assign infsign = zinf ? zsign : psign; //KEP 210112 keep the correct sign when result is infinity
 	//assign infsign = xinf ? (yinf ? psign : xsign) : yinf ? ysign : zsign;//original
-	assign tmp = invalid ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum));
-	assign wsign = invalid ? 0 : (inf ? infsign :(sumzero ? zerosign : sumneg));
+	assign tmp = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum));
+	assign wsign = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : sumneg));

 endmodule
--- a/wally-pipelined/src/fpu/FMA/special.sv
+++ b/wally-pipelined/src/fpu/FMA/special.sv
@ -10,49 +10,49 @@
 /////////////////////////////////////////////////////////////////////////////

 /////////////////////////////////////////////////////////////////////////////
-module special(x, y, z, ae, xzero, yzero, zzero,
+module special(ReadData1E, ReadData2E, ReadData3E, ae, xzero, yzero, zzero,
 				xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, xinf, yinf, zinf);
 /////////////////////////////////////////////////////////////////////////////

-	input   	[63:0]     	x;              // Input x
-	input     	[63:0]     	y;           	// Input Y
-	input      	[63:0]    	z;            	// Input z 
+	input   	[63:0]     	ReadData1E;              // Input ReadData1E
+	input     	[63:0]     	ReadData2E;           	// Input ReadData2E
+	input      	[63:0]    	ReadData3E;            	// Input ReadData3E 
 	input		[12:0]		ae;		// exponent of product
-	output				xzero;		// Input x = 0
-	output				yzero;		// Input y = 0
-	output				zzero;		// Input z = 0
-	output				xnan;		// x is NaN
-	output				ynan;		// y is NaN
-	output				znan;		// z is NaN
-	output				xdenorm;	// x is denormalized
-	output				ydenorm;	// y is denormalized
-	output				zdenorm;	// z is denormalized
+	output				xzero;		// Input ReadData1E = 0
+	output				yzero;		// Input ReadData2E = 0
+	output				zzero;		// Input ReadData3E = 0
+	output				xnan;		// ReadData1E is NaN
+	output				ynan;		// ReadData2E is NaN
+	output				znan;		// ReadData3E is NaN
+	output				xdenorm;	// ReadData1E is denormalized
+	output				ydenorm;	// ReadData2E is denormalized
+	output				zdenorm;	// ReadData3E is denormalized
 	output				proddenorm;	// product is denormalized
-	output				xinf;		// x is infinity
-	output				yinf;		// y is infinity
-	output				zinf;		// z is infinity
+	output				xinf;		// ReadData1E is infinity
+	output				yinf;		// ReadData2E is infinity
+	output				zinf;		// ReadData3E is infinity

 	// In the actual circuit design, the gates looking at bits
 	// 51:0 and at bits 62:52 should be shared among the various detectors.

 	// Check if input is NaN

-	assign xnan = &x[62:52] && |x[51:0]; 
-	assign ynan = &y[62:52] && |y[51:0]; 
-	assign znan = &z[62:52] && |z[51:0];
+	assign xnan = &ReadData1E[62:52] && |ReadData1E[51:0]; 
+	assign ynan = &ReadData2E[62:52] && |ReadData2E[51:0]; 
+	assign znan = &ReadData3E[62:52] && |ReadData3E[51:0];

 	// Check if input is denormalized

-	assign xdenorm = ~(|x[62:52]) && |x[51:0]; 
-	assign ydenorm = ~(|y[62:52]) && |y[51:0]; 
-	assign zdenorm = ~(|z[62:52]) && |z[51:0];
+	assign xdenorm = ~(|ReadData1E[62:52]) && |ReadData1E[51:0]; 
+	assign ydenorm = ~(|ReadData2E[62:52]) && |ReadData2E[51:0]; 
+	assign zdenorm = ~(|ReadData3E[62:52]) && |ReadData3E[51:0];
 	assign proddenorm = &ae & ~xzero & ~yzero; //KEP is the product denormalized

 	// Check if input is infinity

-	assign xinf = &x[62:52] && ~(|x[51:0]); 
-	assign yinf = &y[62:52] && ~(|y[51:0]); 
-	assign zinf = &z[62:52] && ~(|z[51:0]);
+	assign xinf = &ReadData1E[62:52] && ~(|ReadData1E[51:0]); 
+	assign yinf = &ReadData2E[62:52] && ~(|ReadData2E[51:0]); 
+	assign zinf = &ReadData3E[62:52] && ~(|ReadData3E[51:0]);

 	// Check if inputs are all zero
 	// Also forces denormalized inputs to zero.
@ -60,11 +60,11 @@ module special(x, y, z, ae, xzero, yzero, zzero,
 	// to just check if the exponent is zero.
 	
 	// KATHERINE - commented following (21/01/11)
-	// assign xzero = ~(|x[62:0]) || xdenorm;
-	// assign yzero = ~(|y[62:0]) || ydenorm;
-	// assign zzero = ~(|z[62:0]) || zdenorm;
+	// assign xzero = ~(|ReadData1E[62:0]) || xdenorm;
+	// assign yzero = ~(|ReadData2E[62:0]) || ydenorm;
+	// assign zzero = ~(|ReadData3E[62:0]) || zdenorm;
 	// KATHERINE - removed denorm to prevent outputing zero when computing with a denormalized number
-	assign xzero = ~(|x[62:0]);
-	assign yzero = ~(|y[62:0]);
-	assign zzero = ~(|z[62:0]);
+	assign xzero = ~(|ReadData1E[62:0]);
+	assign yzero = ~(|ReadData2E[62:0]);
+	assign zzero = ~(|ReadData3E[62:0]);
 endmodule
--- a/wally-pipelined/src/fpu/FMA/tbgen/results.dat
+++ b/wally-pipelined/src/fpu/FMA/tbgen/results.dat
@ -1,16 +1 @@
-0010000000000000 bf4fdffffff7fffe 800ffffffffffffe 800003fbfffffefe 801003fbfffffefe  Wrong zdenorm 308227
-0010000000000000 be6fffffbffffff7 8000000000000000 800000001fffffc0 800000000fffffe0  Wrong 313753
-001ffffffffffffe 3fddfbffffffffff 000ffffffffffffe 000efdfffffffffd 001efdfffffffffd  Wrong zdenorm 551371
-3befe000ffffffff 800ffffffffffffe 0000000000000000 0000000000000000 8000000000000000  Wrong ydenorm unflw 665575
-000007fffffffffe 3f6ffffffe01fffe 000ffffffffffffe 00000007ffffff7e 00100007ffffff7e  Wrong xdenorm zdenorm 768727
-3fdffffffffffffe 000ffffffffffffe 8000000000000001 7feffffffffffff6 0007fffffffffffe  Wrong ydenorm zdenorm 1049939
-7fe0000000000001 4000000000000000 ffefffffffffffff 7ff0000000000000 7cb8000000000000  Wrong w=+inf 2602745
-000fff000000000f 3ff00800001fffff 8010000000000000 7f7bfe007ff8381e 000006ff801ffe0e  Wrong xdenorm 3117277
-8000000000000001 40211275ffe5ee3c 0000000000000001 fcfe24ebffcbdc78 8000000000000008  Wrong xdenorm zdenorm 3148591
-801fffffffffffff bfdffffffffffffe 0000000000021fff 0000000000021ffe 0010000000021ffe  Wrong zdenorm 3537867
-801ffffffffffffe 0010000000000001 0000000000000000 0000000000000000 8000000000000000  Wrong unflw 3564269
-bca0000000000001 000fffffc000001e 8000000000000000 8000000000000001 8000000000000000  Wrong ydenorm 3717769
-bcafffffffffffff 800ffffffffffffe 8000000000000000 0000000000000002 0000000000000001  Wrong ydenorm 3807413
-7fec5fed92358a74 400000001bffffff ffefc0003ffffffe 7ff0000000000000 7fe8ffdb47bad466  Wrong w=+inf 3889689
-bfdfffffffffffff 3fdf1f3616aa73e1 3fd0000000000001 3fd07064f4aac611 3f7c193d2ab1843f  Wrong 4099063
-3fd07dfffffffffe 8010000000000001 0000000000000001 ffe07dfffffffffb 80041f7fffffffff  Wrong zdenorm 4716133
+c3f000200003fffe 0000000000000001 001ffffffffffffe 80cffc400007fffd 80cffc400007fffc  Wrong FmaResultM=  -64 ydenorm 1119653
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.c
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.c
@ -20,19 +20,19 @@ void main() {
 		// b68ffff8000000ff_3f9080000007ffff_b6307ffbe0080080_00001
                char ch;
 		int i,j,n;
-		char x[17];
-		char y[17];
-		char z[17];
+		char ReadData1E[17];
+		char ReadData2E[17];
+		char ReadData3E[17];
 		char ans[81];
 		char flags[3];
-		int rn,rz,rm,rp;
-		long stop = 4099063;
+		int FrmE;
+		long stop = 1119653;
 		int debug = 1;
 		//my_string = (char *) malloc (nbytes + 1);
 		//bytes_read = getline (&my_string, &nbytes, stdin);
 	

-		for(n=0; n < 613; n++) {//613 for 10000
+		for(n=0; n < 305; n++) {//613 for 10000
 			if(getline(&ln,&nbytes,fp) < 0 || feof(fp)) break;
 			if(k == stop && debug == 1) break;
 			k++;
@ -41,71 +41,59 @@ void main() {

 		if(!feof(fp)) {

-			strncpy(x,   ln,     16); x[16]=0;
-			strncpy(y,    &ln[17], 16); y[16]=0;
-			strncpy(z,  &ln[34], 16); z[16]=0;
-			// fprintf(stdout,"[%s]\n[%s]\n", ln,z);
+			strncpy(ReadData1E,   ln,     16); ReadData1E[16]=0;
+			strncpy(ReadData2E,    &ln[17], 16); ReadData2E[16]=0;
+			strncpy(ReadData3E,  &ln[34], 16); ReadData3E[16]=0;
+			// fprintf(stdout,"[%s]\n[%s]\n", ln,ReadData3E);
 			strncpy(ans,  &ln[51], 16); ans[16]=0;
 			strncpy(flags,&ln[68],2);   flags[2]=0;
 		
-			// fprintf(stdout,"[%s]\n[%s]\n", ln,z);
-			fprintf(fq,"    x = 64'h%s;\n",x); 
-			fprintf(fq,"    y = 64'h%s;\n",y); 
-			fprintf(fq,"    z = 64'h%s;\n",z);
+			// fprintf(stdout,"[%s]\n[%s]\n", ln,ReadData3E);
+			fprintf(fq,"    ReadData1E = 64'h%s;\n",ReadData1E); 
+			fprintf(fq,"    ReadData2E = 64'h%s;\n",ReadData2E); 
+			fprintf(fq,"    ReadData3E = 64'h%s;\n",ReadData3E);
 			fprintf(fq,"    ans = 64'h%s;\n", ans);
 			// fprintf(fq,"    flags = 5'h%s;\n", flags);
 		

 			{
 				//rn=1; rz=0; rm=0; rp=0;
-				fprintf(fq,"    rn = %d;\n",1);
-				fprintf(fq,"    rz = %d;\n", 0);
-				fprintf(fq,"    rm = %d;\n", 0);
-				fprintf(fq,"    rp = %d;\n", 0);
-			}
-			{
-				fprintf(fq,"    earlyres = 64'b0;\n");
-				fprintf(fq,"    earlyressel = 0;\n");
-			}		
-			{
-
-				fprintf(fq,"    bypsel= 2'b0;\n"); //, bysel);
-				fprintf(fq,"    bypplus1 = 0;\n"); //, byp1);
-				fprintf(fq,"    byppostnorm = 0;\n"); //, bypnorm);
+				fprintf(fq,"    FrmE = 3'b000;\n");
 			}
 			fprintf(fq,"#10\n");
 			// IEEE 754-2008 section 6.3 states "When ether an input or result is NaN, this standard does not interpret the sign of a NaN."
-			//fprintf(fq,"	$fwrite(fp, \"%%h %%h %%h %%h \",x,y,w, ans);\n");	
+			//fprintf(fq,"	$fwrite(fp, \"%%h %%h %%h %%h \",ReadData1E,ReadData2E,FmaResultM, ans);\n");	
 			fprintf(fq,"    // IEEE 754-2008 section 6.3 states: \"When ether an input or result is NaN, this\n");
 			fprintf(fq,"    //                                     standard does not interpret the sign of a NaN.\"\n");
-			fprintf(fq,"	wnan = &w[62:52] && |w[51:0]; \n");
-			fprintf(fq,"	xnan = &x[62:52] && |x[51:0]; \n");
-			fprintf(fq,"	ynan = &y[62:52] && |y[51:0]; \n");
-			fprintf(fq,"	znan = &z[62:52] && |z[51:0]; \n");
+			fprintf(fq,"	wnan = &FmaResultM[62:52] && |FmaResultM[51:0]; \n");
+			fprintf(fq,"	xnan = &ReadData1E[62:52] && |ReadData1E[51:0]; \n");
+			fprintf(fq,"	ynan = &ReadData2E[62:52] && |ReadData2E[51:0]; \n");
+			fprintf(fq,"	znan = &ReadData3E[62:52] && |ReadData3E[51:0]; \n");
 			fprintf(fq,"	ansnan = &ans[62:52] && |ans[51:0]; \n");
-			fprintf(fq,"	xnorm = ~(|x[62:52]) && |x[51:0] ? {x[50:0], 1'b0} : x; \n");
-			fprintf(fq,"	ynorm = ~(|y[62:52]) && |y[51:0] ? {y[50:0], 1'b0} : y;\n");
-			fprintf(fq,"	s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm}; \n");
-			// fprintf(fq,"    if(!(~(|x[62:52]) && |x[51:0] || ~(|y[62:52]) && |y[51:0])) begin\n"); 
+			fprintf(fq,"	xnorm = ~(|ReadData1E[62:52]) && |ReadData1E[51:0] ? {ReadData1E[50:0], 1'b0} : ReadData1E; \n");
+			fprintf(fq,"	ynorm = ~(|ReadData2E[62:52]) && |ReadData2E[51:0] ? {ReadData2E[50:0], 1'b0} : ReadData2E;\n");
+			// fprintf(fq,"	s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm}; \n");
+			// fprintf(fq,"    if(!(~(|ReadData1E[62:52]) && |ReadData1E[51:0] || ~(|ReadData2E[62:52]) && |ReadData2E[51:0])) begin\n"); 
 																							// not looknig at negative zero results right now
-			//fprintf(fq,"	  if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) && !(w == 64'h8000000000000000 && ans == 64'b0)) begin\n"); 
-			// fprintf(fq,"	if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) ) begin\n"); 
-			fprintf(fq,"	if((!wnan && (w != ans)) || (wnan && ansnan && ~(((xnan && (w[62:0] == {x[62:52],1'b1,x[50:0]})) || (ynan && (w[62:0] == {y[62:52],1'b1,y[50:0]}))  || (znan && (w[62:0] == {z[62:52],1'b1,z[50:0]})) || (w[62:0] == ans[62:0])) ))) begin\n"); 
-			fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",x,y, z, w, ans);\n");
+			//fprintf(fq,"	  if( (nan && (FmaResultM[62:0] != ans[62:0])) || (!nan && (FmaResultM != ans)) && !(FmaResultM == 64'h8000000000000000 && ans == 64'b0)) begin\n"); 
+			// fprintf(fq,"	if( (nan && (FmaResultM[62:0] != ans[62:0])) || (!nan && (FmaResultM != ans)) ) begin\n"); 
+			fprintf(fq,"	if((!wnan && (FmaResultM != ans)) || (wnan && ansnan && ~(((xnan && (FmaResultM[62:0] == {ReadData1E[62:52],1'b1,ReadData1E[50:0]})) || (ynan && (FmaResultM[62:0] == {ReadData2E[62:52],1'b1,ReadData2E[50:0]}))  || (znan && (FmaResultM[62:0] == {ReadData3E[62:52],1'b1,ReadData3E[50:0]})) || (FmaResultM[62:0] == ans[62:0])) ))) begin\n"); 
+			fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",ReadData1E,ReadData2E, ReadData3E, FmaResultM, ans);\n");
 			//fprintf(fq,"		$fwrite(fp, \"%%h \",s);\n");
-			fprintf(fq,"		if(w == 64'h8000000000000000) $fwrite(fp, \"w=-zero \");\n");
-			fprintf(fq,"		if(~(|x[62:52]) && |x[51:0]) $fwrite(fp, \"xdenorm \");\n");
-			fprintf(fq,"		if(~(|y[62:52]) && |y[51:0]) $fwrite(fp, \"ydenorm \");\n");
-			fprintf(fq,"		if(~(|z[62:52]) && |z[51:0]) $fwrite(fp, \"zdenorm \");\n");
-			fprintf(fq,"		if(invalid != 0) $fwrite(fp, \"invld \");\n");
-			fprintf(fq,"		if(overflow != 0) $fwrite(fp, \"ovrflw \");\n");
-			fprintf(fq,"		if(underflow != 0) $fwrite(fp, \"unflw \");\n");
-			fprintf(fq,"		if(w == 64'hFFF0000000000000) $fwrite(fp, \"w=-inf \");\n");
-			fprintf(fq,"		if(w == 64'h7FF0000000000000) $fwrite(fp, \"w=+inf \");\n");
-			fprintf(fq,"		if(w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
-			fprintf(fq,"		if(w >  64'hFFF8000000000000 && w <  64'hFFF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
-			fprintf(fq,"		if(w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
-			fprintf(fq,"		if(w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+			fprintf(fq,"		$fwrite(fp, \"aligncnt=%%d \",$signed(aligncnt));\n"); // label matches the value printed (signed aligncnt, not the FMA result)
+			fprintf(fq,"		if(FmaResultM == 64'h8000000000000000) $fwrite(fp, \"FmaResultM=-zero \");\n");
+			fprintf(fq,"		if(~(|ReadData1E[62:52]) && |ReadData1E[51:0]) $fwrite(fp, \"xdenorm \");\n");
+			fprintf(fq,"		if(~(|ReadData2E[62:52]) && |ReadData2E[51:0]) $fwrite(fp, \"ydenorm \");\n");
+			fprintf(fq,"		if(~(|ReadData3E[62:52]) && |ReadData3E[51:0]) $fwrite(fp, \"zdenorm \");\n");
+			fprintf(fq,"		if(FmaFlagsM[4] != 0) $fwrite(fp, \"invld \");\n");
+			fprintf(fq,"		if(FmaFlagsM[2] != 0) $fwrite(fp, \"ovrflw \");\n");
+			fprintf(fq,"		if(FmaFlagsM[1] != 0) $fwrite(fp, \"unflw \");\n");
+			fprintf(fq,"		if(FmaResultM == 64'hFFF0000000000000) $fwrite(fp, \"FmaResultM=-inf \");\n");
+			fprintf(fq,"		if(FmaResultM == 64'h7FF0000000000000) $fwrite(fp, \"FmaResultM=+inf \");\n");
+			fprintf(fq,"		if(FmaResultM >  64'h7FF0000000000000 && FmaResultM <  64'h7FF8000000000000 ) $fwrite(fp, \"FmaResultM=sigNaN \");\n");
+			fprintf(fq,"		if(FmaResultM >  64'hFFF0000000000000 && FmaResultM <  64'hFFF8000000000000 ) $fwrite(fp, \"FmaResultM=sigNaN \");\n"); // negative signalling NaN: lower bound is -inf (FFF0...), mirroring the positive check above
+			fprintf(fq,"		if(FmaResultM >= 64'h7FF8000000000000 && FmaResultM <= 64'h7FFfffffffffffff ) $fwrite(fp, \"FmaResultM=qutNaN \");\n");
+			fprintf(fq,"		if(FmaResultM >= 64'hFFF8000000000000 && FmaResultM <= 64'hFFFfffffffffffff ) $fwrite(fp, \"FmaResultM=qutNaN \");\n");

 			fprintf(fq,"		if(ans == 64'hFFF0000000000000) $fwrite(fp, \"ans=-inf \");\n");
 			fprintf(fq,"		if(ans == 64'h7FF0000000000000) $fwrite(fp, \"ans=+inf \");\n");
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
@ -2,38 +2,27 @@
 module tb;


- reg 		[63:0]		x;
- reg 		[63:0]		y;
- reg 		[63:0]		z;
- reg 		[63:0]		ans;
- reg 						rn;
- reg 						rz;
- reg 						rm;
- reg 						rp;
- reg 		[63:0]		earlyres;
- reg 						earlyressel;
- reg 		[1:0]			bypsel;
- reg 						bypplus1;
- reg 						byppostnorm;
- wire 	[63:0]		w;
- wire 	[63:0]		wbypass;
- wire 		 			invalid;
- wire 					overflow;
- wire 					underflow;
- wire 					inexact;
+ reg 	[63:0]		ReadData1E;
+ reg 	[63:0]		ReadData2E;
+ reg 	[63:0]		ReadData3E;
+ reg 	[63:0]		ans;
+ reg 	[2:0]		FrmE;
+ wire 	[63:0]		FmaResultM;
+ wire 	[4:0]	 	FmaFlagsM;

 integer fp;
 reg wnan;
 reg xnan;
 reg ynan;
 reg znan;
+wire [12:0] aligncnt;
 reg ansnan;
 reg		[105:0]		s;				//	partial product 2	
 reg		[51:0] 		xnorm;
 reg 		[51:0] 		ynorm;

 localparam period = 20;  
-fmac UUT(.*);
+fma UUT(.*);


 initial 
--- a/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
@ -1 +1 @@
-testfloat_gen f64_mulAdd -n 6133248 -rnear_even -seed 113355 -level 1 >> testFloat
+testfloat_gen f64_mulAdd -n 6133248 -rminMag -seed 113355 -level 1 >> testFloat
--- a/wally-pipelined/src/fpu/csa.sv
+++ b/wally-pipelined/src/fpu/csa.sv
@ -50,7 +50,7 @@ module FA_array (S, C, A, B, Ci) ;
   genvar 	  i;
   generate
      for (i = 0; i < n; i = i + 1) begin : index
-	 fa FA1(.S(S[i]), .C(C[i]), .A(A[i]), .B(B[i]), .Ci(Ci[i]));
+	 fa FA1(.sum(S[i]), .carry(C[i]), .a(A[i]), .b(B[i]), .c(Ci[i]));
      end
   endgenerate