Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

2025-02-11 06:05:49 +00:00 · 2021-04-13 17:15:10 -04:00 · 2021-04-13 17:15:10 -04:00 · a545dcb9ae
commit a545dcb9ae
parent ae888b5705 e075dc2d13
20 changed files with 136386 additions and 307226 deletions
--- a/wally-pipelined/src/fpu/FMA/add.sv
+++ b/wally-pipelined/src/fpu/FMA/add.sv
@ -48,7 +48,7 @@ module add(r, s, t, sum,
 	// Compound adder
 	// Consists of 3:2 CSA followed by long compound CPA
-	assign prodshifted = killprod ? 0 : {56'b0, r2, 2'b0} + {56'b0, s2, 2'b0};
+	assign prodshifted = killprod ? 0 : {56'b0, r2+s2, 2'b0};
 	assign sum0 = {1'b0,prodshifted} + t2 + 158'b0;
 	assign sum1 = {1'b0,prodshifted} + t2 + 158'b1; // +1 from invert of z above
--- a/wally-pipelined/src/fpu/FMA/align.sv
+++ b/wally-pipelined/src/fpu/FMA/align.sv
@ -56,7 +56,7 @@ module align(zman, ae, aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t, bs
 	// addend on right shifts.  Handle special cases of shifting
 	// by too much.
-	always @(aligncnt or zman or zdenorm)
+	always @(aligncnt or xzero or yzero or zman or zdenorm or zzero)
 		begin
 		// Default to clearing sticky bits 
@ -67,26 +67,23 @@ module align(zman, ae, aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t, bs
 		killprod = xzero | yzero;
 		// d = aligncnt
 		// p = 53
-		if ($signed(aligncnt) <= $signed(-103)) begin //d<=-2p+1
+		if ($signed(aligncnt) <= $signed(-105)) begin //d<=-2p+1
 			//product ancored case with saturated shift
 			sumshift = 163;	// 3p+4	
 			sumshiftzero = 0;
-			shift = {~zdenorm,zman,163'b0} >> sumshift;
+			shift = {1'b1,zman,163'b0} >> sumshift;
 			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 0;
-		end else if($signed(aligncnt) <= $signed(1))  begin // -2p+1<d<=2
+		end else if($signed(aligncnt) <= $signed(2))  begin // -2p+1<d<=2
 			// set d<=2 to d<=0
 			// product ancored or cancellation
-			// warning: set to 55 rather then 56. was there a typo in the book?
+			sumshift = 57-aligncnt; // p + 2 - d  
 			sumshift = 57-aligncnt; // p + 3 - d  
 			sumshiftzero = 0;
 			shift = {~zdenorm,zman,163'b0} >> sumshift;
 			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 0;
 		end else if ($signed(aligncnt)<=$signed(55))  begin // 2 < d <= p+2
 			// another typo in book? above was 55 changed to 52
 			// addend ancored case
 			// used to be 56 \/ somthing doesn't seem right too many typos
 			sumshift = 57-aligncnt;
--- a/wally-pipelined/src/fpu/FMA/booth.sv
+++ b/wally-pipelined/src/fpu/FMA/booth.sv
@ -0,0 +1,55 @@
 module booth(xExt, choose, add1, e, pp); 
 /////////////////////////////////////////////////////////////////////////////
 // Radix-4 Booth selector for a single partial product.
 //
 //   xExt   : multiplicand
 //   choose : 3-bit Booth window that selects the multiple of xExt
 //   pp     : selected multiple; negative multiples are returned bit-inverted
 //   add1   : correction that completes the two's complement, so that
 //            pp + add1 == selected multiple (mod 2^55)
 //   e      : 1 for the negative selections (100..111, including -0)
 //
 // All three outputs come from one table so the encodings cannot drift apart.
 /////////////////////////////////////////////////////////////////////////////
 	input 		[53:0]		xExt;				// multiplicand	xExt
 	input		[2:0]		choose;				// bits needed to choose which encoding
 	output		[1:0]       	add1;				// do you add 1	
 	output                  	e;				// selected multiple is negative
 	output		[54:0]		pp;				// the resultant encoding

 	logic		[54:0]		pp;
 	logic				e;
 	logic		[1:0]		add1;
 	logic		[53:0]		negx;				// one's complement of the multiplicand

 	assign negx = ~xExt;

 	always_comb begin
 		case (choose)
 			3'b000 : begin pp = 55'b0;        add1 = 2'b00; e = 1'b0; end  //  0
 			3'b001 : begin pp = {1'b0, xExt}; add1 = 2'b00; e = 1'b0; end  //  1
 			3'b010 : begin pp = {1'b0, xExt}; add1 = 2'b00; e = 1'b0; end  //  1
 			3'b011 : begin pp = {xExt, 1'b0}; add1 = 2'b00; e = 1'b0; end  //  2
 			// ~(2x) is {negx,1'b1}; emitting {negx,1'b0} with add1 = 2 gives the same -2x
 			3'b100 : begin pp = {negx, 1'b0}; add1 = 2'b10; e = 1'b1; end  // -2
 			3'b101 : begin pp = {1'b1, negx}; add1 = 2'b01; e = 1'b1; end  // -1
 			3'b110 : begin pp = {1'b1, negx}; add1 = 2'b01; e = 1'b1; end  // -1
 			// all ones plus add1 = 1 wraps to zero; the replication is sized to
 			// exactly 55 bits instead of relying on truncation of a wider literal
 			3'b111 : begin pp = {55{1'b1}};   add1 = 2'b01; e = 1'b1; end  // -0
 			// only reachable when choose contains X/Z: propagate the unknown
 			default: begin pp = 55'bx;        add1 = 2'bx;  e = 1'bx; end
 		endcase
 	end
 endmodule
--- a/wally-pipelined/src/fpu/FMA/compressors.sv
+++ b/wally-pipelined/src/fpu/FMA/compressors.sv
@ -0,0 +1,90 @@
 module add3comp2 #(parameter BITS = 4) (
 	input		[BITS-1:0]		a, b, c,		// three addend vectors
 	output		[BITS-1:0]		carry,			// per-bit carry outputs (unshifted)
 	output		[BITS-1:0]		sum			// per-bit sum outputs
 );
 /////////////////////////////////////////////////////////////////////////////
 // BITS-wide 3:2 carry-save compressor built from independent sng3comp2
 // cells, one per bit position.  No carry propagates between positions:
 // a[i] + b[i] + c[i] == sum[i] + 2*carry[i] for every i.
 // look into different implementations of the compressors?
 /////////////////////////////////////////////////////////////////////////////
 	for (genvar k = 0; k < BITS; k = k + 1) begin
 		sng3comp2 add0(.a(a[k]), .b(b[k]), .c(c[k]), .carry(carry[k]), .sum(sum[k]));
 	end
 endmodule
 module add4comp2(a, b, c, d, carry, sum); 
 /////////////////////////////////////////////////////////////////////////////
 // BITS-wide 4:2 carry-save compressor:  a + b + c + d == sum + 2*carry.
 //
 // One sng4comp2 cell is used per bit position.  Each cell's cout feeds the
 // cin of the next position to the left; cell 0 gets cin = 0.
 // BITS must be at least 2 because the top cell reads cout[BITS-2].
 //
 //   a, b, c, d : the four addend vectors
 //   sum        : per-bit sum outputs
 //   carry      : carry vector, one bit wider than the inputs; bit i carries
 //                weight 2^(i+1) in the overall sum
 /////////////////////////////////////////////////////////////////////////////
 	parameter BITS = 4;
 	input 		[BITS-1:0]		a;
 	input		[BITS-1:0]		b;
 	input		[BITS-1:0]    	c;
 	input		[BITS-1:0]    	d;
 	output		[BITS:0]		carry;
 	output		[BITS-1:0]		sum;

 	logic		[BITS-1:0]		cout;		// inter-cell carry chain
 	logic					carryTmp;	// carry output of the top cell

 	genvar i;

 	// bottom cell: nothing to carry in
 	sng4comp2 add0(a[0], b[0], c[0], d[0], 1'b0, cout[0], carry[0], sum[0]);

 	// middle cells
 	generate
 		for(i= 1; i<BITS-1; i=i+1) begin
 			sng4comp2 add1(a[i], b[i], c[i], d[i], cout[i-1], cout[i], carry[i], sum[i]);
 		end
 	endgenerate

 	// top cell
 	sng4comp2 add2(a[BITS-1], b[BITS-1], c[BITS-1], d[BITS-1], cout[BITS-2], cout[BITS-1], carryTmp, sum[BITS-1]);

 	// carryTmp and cout[BITS-1] both have weight 2^BITS, so they are added
 	// with a half adder: the XOR keeps weight 2^BITS (carry[BITS-1]) and the
 	// AND is promoted to weight 2^(BITS+1) (carry[BITS]).
 	assign carry[BITS-1] = carryTmp ^ cout[BITS-1];
 	assign carry[BITS]   = carryTmp & cout[BITS-1];
 endmodule
 module sng3comp2(
 	input				a, b, c,	// three equally weighted input bits
 	output				carry,		// 1 when at least two inputs are 1
 	output				sum		// parity of the three inputs
 );
 /////////////////////////////////////////////////////////////////////////////
 // Single-bit 3:2 compressor (full adder):  a + b + c == 2*carry + sum.
 // look into different implementations of the compressors?
 /////////////////////////////////////////////////////////////////////////////
 	assign sum   = (a ^ b) ^ c;
 	// if a and b differ the carry follows c, otherwise a (== b) decides it
 	assign carry = (a ^ b) ? c : a;
 endmodule
 module sng4comp2(
 	input				a, b, c, d,	// four equally weighted input bits
 	input				cin,		// carry in from the neighbouring cell
 	output				cout,		// carry of a+b+c only; does not depend on cin
 	output				carry,		// carry of the second stage
 	output				sum		// sum bit of this position
 );
 /////////////////////////////////////////////////////////////////////////////
 // Single-bit 4:2 compressor made of two chained 3:2 compressors:
 //   a + b + c + d + cin == sum + 2*(carry + cout)
 // look into pass gate 4:2 counters?
 /////////////////////////////////////////////////////////////////////////////
 	logic				firstSum;	// sum bit of the first stage

 	// stage 1 compresses a, b, c
 	sng3comp2 add1(.a(a), .b(b), .c(c), .carry(cout), .sum(firstSum));
 	// stage 2 folds in d and the incoming carry
 	sng3comp2 add2(.a(firstSum), .b(d), .c(cin), .carry(carry), .sum(sum));
 endmodule
--- a/wally-pipelined/src/fpu/FMA/expgen.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen.sv
@ -17,7 +17,7 @@
 /////////////////////////////////////////////////////////////////////////////
 module expgen(xexp, yexp, zexp,
 			   killprod,  sumzero, resultdenorm, normcnt, infinity, 
-			   invalid, overflow, underflow, inf, xzero, yzero,expplus1,
+			   FmaFlagsM, inf, xzero, yzero,expplus1,
 			   nan, de0, xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, specialsel, zexpsel,
 			   aligncnt, wexp,
 			   prodof, sumof, sumuf, denorm0, ae);
@ -31,9 +31,7 @@ module expgen(xexp, yexp, zexp,
 	input     			resultdenorm;  // postnormalize rounded result
 	input     	[8:0]  		normcnt;     	// normalization shift count 
 	input     			infinity;    	// generate infinity on overflow 
-	input     			invalid;     	// Result invalid
+	input     	[4:0]	FmaFlagsM;     	// Result invalid
 	input     			overflow;    	// Result overflowed
 	input     			underflow;   	// Result underflowed 
 	input     			inf;			// Some input is infinity
 	input     			nan;			// Some input is NaN
 	input     	[12:0]		de0;			// X is NaN NaN
@ -121,10 +119,10 @@ module expgen(xexp, yexp, zexp,
 	// produces either infinity or the largest finite number, depending on the
 	// rounding mode.  NaNs are propagated or generated.
-	assign specialres = invalid | nan ? nanres : // KEP added nan
+	assign specialres = FmaFlagsM[4] | nan ? nanres : // invalid
-					overflow ? infinityres : 
+					FmaFlagsM[2] ? infinityres : 	//overflow
 					inf ? 11'b11111111111 :
-					underflow ? 11'b0 : 11'bx;
+					FmaFlagsM[1] ? 11'b0 : 11'bx; //underflow
 	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;
--- a/wally-pipelined/src/fpu/FMA/flag.sv
+++ b/wally-pipelined/src/fpu/FMA/flag.sv
@ -10,12 +10,13 @@
 /////////////////////////////////////////////////////////////////////////////
 module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 			 psign,  zsign, xzero, yzero, zzero, vbits, killprod,
-			 inf, nan, invalid, overflow, underflow, inexact);
+			 inf, nan, FmaFlagsM,sticky);
 /////////////////////////////////////////////////////////////////////////////
 	input                  		xnan;        	// X is NaN 
 	input                  		ynan;        	// Y is NaN 
 	input                 		znan;       	// Z is NaN 
 	input                  		sticky;        	// X is Inf
 	input                  		xinf;        	// X is Inf
 	input                 		yinf;       	// Y is Inf 
 	input                  		zinf;        	// Z is Inf
@ -31,10 +32,7 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	input     	[1:0]  		vbits;		// R and S bits of result
 	output				inf;		// Some	source is Inf
 	output				nan;		// Some	source is NaN
-	output				invalid;	// Result is invalid	
+	output		[4:0]	FmaFlagsM;
 	output				overflow;	// Result overflowed	
 	output				underflow;	// Result underflowed	
 	output				inexact;	// Result is not an exact number
 	//   Internal nodes
@ -55,33 +53,36 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	assign prodinf = prodof && ~xnan && ~ynan;
 	//KEP added if the product is infinity then sum is infinity
-	assign suminf = prodinf | sumof && ~xnan && ~ynan && ~znan;
+	assign suminf = sumof && ~xnan && ~ynan && ~znan;
 	// Set invalid flag for following cases:
 	//   1) Inf - Inf
 	//   2) 0 * Inf
 	//   3) Output = NaN (this is not part of the IEEE spec,  only 486 proj)
-	assign invalid = (xinf || yinf || prodinf) && zinf && (psign ^ zsign) ||
+	assign FmaFlagsM[4] = (xinf || yinf || prodinf) && zinf && (psign ^ zsign) ||
 					   xzero && yinf || yzero && xinf;// KEP remove case 3) above
 	assign FmaFlagsM[3] = 0; // divide by zero flag
 	// Set the overflow flag for the following cases:
 	//   1) Rounded multiply result would be out of bounds
 	//   2) Rounded add result would be out of bounds
-	assign overflow = suminf && ~inf;
+	assign FmaFlagsM[2] = suminf && ~inf;
 	// Set the underflow  flag for the following cases:
 	//   1) Any input is denormalized
 	//   2)  Output would be denormalized or smaller
-	assign underflow = (sumuf && ~inf && ~prodinf && ~nan) || (killprod & zzero & ~(yzero | xzero));
+	assign FmaFlagsM[1] = (sumuf && ~inf && ~prodinf && ~nan) || (killprod & zzero & ~(yzero | xzero));
 	// Set the inexact flag for the following cases:
 	//   1) Multiplication inexact
 	//   2) Addition  inexact
 	// One of these cases occurred if the R or S bit is set
-	assign inexact = (vbits[0] || vbits[1]  || suminf) && ~(inf || nan);
+	assign FmaFlagsM[0] = (vbits[0] || vbits[1] ||sticky  || suminf) && ~(inf || nan);
 endmodule
--- a/wally-pipelined/src/fpu/FMA/fmac.sv
+++ b/wally-pipelined/src/fpu/FMA/fmac.sv
@ -15,13 +15,13 @@
 //    normalize Normalization shifter
 //    round     Rounding of result
 //    exception Handles exceptional cases
-//    bypass    Handles bypass of result to X or Z inputs
+//    bypass    Handles bypass of result to ReadData1E or ReadData3E inputs
 //    sign      One bit sign handling block 
 //    special   Catch special cases (inputs = 0  / infinity /  etc.) 
 //
-//   The FMAC computes W=X*Y+Z, rounded with the mode specified by
+//   The FMAC computes FmaResultM=ReadData1E*ReadData2E+ReadData3E, rounded with the mode specified by
 //   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the X or Z inputs for use on the next cycle.  In addition,  four signals
+//   the ReadData1E or ReadData3E inputs for use on the next cycle.  In addition,  four signals
 //   are produced: trap, overflow, underflow, and inexact.  Trap indicates
 //   an infinity, NaN, or denormalized number to be handled in software;
 //   the other three signals are IEEE flags.
@ -29,29 +29,17 @@
 /////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////
-module fmac(x, y, z, rn, rz, rp, rm,
+module fma(ReadData1E, ReadData2E, ReadData3E, FrmE,
-			earlyres, earlyressel, bypsel, bypplus1, byppostnorm, 
+			FmaResultM, FmaFlagsM, aligncnt);
 			w, wbypass, invalid, overflow, underflow, inexact);
 /////////////////////////////////////////////////////////////////////////////
-	input 		[63:0]		x;			// input X from reg file
+	input 		[63:0]		ReadData1E;		// input 1
-	input		[63:0]		y;				// input Y  
+	input		[63:0]		ReadData2E;     // input 2 
-	input 		[63:0]		z;          	// input Z from reg file 
+	input 		[63:0]		ReadData3E;     // input 3
-	input 			 		rn;          	// Round to Nearest
+	input 		[2:0]	 	FrmE;          	// Rounding mode
-	input 					rz;           	// Round toward zero
+	output 		[63:0]		FmaResultM;     // output FmaResultM=ReadData1E*ReadData2E+ReadData3E
-	input 					rm;          	// Round toward minus infinity
+	output 		[4:0]		FmaFlagsM;    	// status flags
-	input 					rp;          	// Round toward plus infinity
+	output 		[12:0]		aligncnt;    	// status flags
 	input 		[63:0]		earlyres;    	// Early result from other FP logic
 	input 					earlyressel;	// Select early result, not W 
 	input 		[1:0]		bypsel;     	// Select W bypass to X, or z 
 	input 					bypplus1;    	// Add one in bypass
 	input 					byppostnorm;	// postnormalize in bypass
 	output 		[63:0]		w;           	// output W=X*Y+Z
 	output 		[63:0]		wbypass;     	// prerounded output W=X*Y+Z for bypass
 	output 					invalid;    	// Result is invalid 
 	output					overflow;		// Result overflowed 
 	output					underflow;   	// Result underflowed
 	output 					inexact;     	// Result is not an exact number 
 // Internal nodes
@ -60,12 +48,12 @@ module fmac(x, y, z, rn, rz, rp, rm,
 	logic 		[163:0]		t;				// output of alignment shifter
 	logic 		[163:0]		sum;			// output of carry prop adder
 	logic 		[53:0]		v; 				// normalized sum, R, S bits
-	logic 		[12:0]		aligncnt; 		// shift count for alignment
+//	logic 		[12:0]		aligncnt; 		// shift count for alignment
 	logic 		[8:0]		normcnt; 		// shift count for normalizer
 	logic 		[12:0]		ae; 		// multiplier expoent
 	logic 					bs;				// sticky bit of addend
 	logic 					ps;				// sticky bit of product
-	logic 					killprod; 		// Z >> product
+	logic 					killprod; 		// ReadData3E >> product
 	logic 					negsum; 		// negate sum
 	logic 					invz; 			// invert addend
 	logic 					selsum1; 		// select +1 mode of sum
@ -73,7 +61,7 @@ module fmac(x, y, z, rn, rz, rp, rm,
 	logic 					negsum1; 		// sum +1 < 0
 	logic 					sumzero; 		// sum = 0
 	logic 					infinity; 		// generate infinity on overflow
-	logic 					prodof; 		// X*Y out of range
+	logic 					prodof; 		// ReadData1E*ReadData2E out of range
 	logic 					sumof;			// result out of range
 	logic					xzero;
 	logic					yzero;
@ -101,6 +89,9 @@ module fmac(x, y, z, rn, rz, rp, rm,
 	logic			[8:0]		sumshift;
 	logic					sumshiftzero;
 	logic			[12:0]		de0;
 	logic					isAdd;
 	assign isAdd = 1;
@ -117,16 +108,16 @@ module fmac(x, y, z, rn, rz, rp, rm,
 //   Instantiate fraction datapath
-	multiply		multiply(.xman(x[51:0]), .yman(y[51:0]), .*);
+	multiply		multiply(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]), .*);
-	align			align(.zman(z[51:0]),.*);
+	align			align(.zman(ReadData3E[51:0]),.*);
 	add				add(.*);
 	lza				lza(.*);
-	normalize		normalize(.zexp(z[62:52]),.*); 
+	normalize		normalize(.xexp(ReadData1E[62:52]),.yexp(ReadData2E[62:52]),.zexp(ReadData3E[62:52]),.*); 
-	round			round(.xman(x[51:0]), .yman(y[51:0]),.zman(z[51:0]), .wman(w[51:0]),.wsign(w[63]),.*);
+	round			round(.xman(ReadData1E[51:0]), .yman(ReadData2E[51:0]),.zman(ReadData3E[51:0]), .wman(FmaResultM[51:0]),.wsign(FmaResultM[63]),.*);
 // Instantiate exponent datapath
-	expgen			expgen(.xexp(x[62:52]),.yexp(y[62:52]),.zexp(z[62:52]),.wexp(w[62:52]),.*);
+	expgen			expgen(.xexp(ReadData1E[62:52]),.yexp(ReadData2E[62:52]),.zexp(ReadData3E[62:52]),.wexp(FmaResultM[62:52]),.*);
 // Instantiate special case detection across datapath & exponent path 
 	special			special(.*);
@ -134,8 +125,8 @@ module fmac(x, y, z, rn, rz, rp, rm,
 // Instantiate control logic
-sign				sign(.xsign(x[63]),.ysign(y[63]),.zsign(z[63]),.wsign(w[63]),.*); 
+sign				sign(.xsign(ReadData1E[63]),.ysign(ReadData2E[63]),.zsign(ReadData3E[63]),.wsign(FmaResultM[63]),.*); 
-flag				flag(.zsign(z[63]),.vbits(v[1:0]),.*); 
+flag				flag(.zsign(ReadData3E[63]),.vbits(v[1:0]),.*); 
 endmodule
--- a/wally-pipelined/src/fpu/FMA/lza.sv
+++ b/wally-pipelined/src/fpu/FMA/lza.sv
@ -30,7 +30,7 @@ module lza(sum, normcnt, sumzero);
 	always @ ( sum)
 		begin
 			i =   0;
-			while (~sum[108-i] && i < 108) i = i+1;  // search for leading one 
+			while (~sum[163-i] && i <= 163) i = i+1;  // search for leading one 
 			normcnt = i;    // compute shift count
 	end
--- a/wally-pipelined/src/fpu/FMA/multiply.sv
+++ b/wally-pipelined/src/fpu/FMA/multiply.sv
@ -11,7 +11,123 @@ module multiply(xman, yman, xdenorm, ydenorm, xzero, yzero, r, s);
 	output		[105:0]		r;				//	partial product 1	
 	output		[105:0]		s;				//	partial product 2	
-	assign r = 106'b0;
+     wire        [54:0]      yExt; //y with appended 0 and assumed 1
-	assign s = {53'b0,~(xdenorm|xzero),xman}  *  {53'b0,~(ydenorm|yzero),yman};
+     wire        [53:0]      xExt; //y with assumed 1
     wire [26:0][1:0] add1;
     wire [26:0][54:0] pp; 
     wire [26:0] e;
     logic [17:0][105:0] lv1add;
     logic [11:0][105:0] lv2add;
     logic [7:0][105:0] lv3add;
     logic [3:0][105:0] lv4add;
     logic [21:0][106:0] carryTmp;
     wire [26:0][105:0] acc; 
     // wire [105:0] acc
    genvar i;	
 	assign xExt = {2'b0,~(xdenorm|xzero),xman};
 	assign yExt = {2'b0,~(ydenorm|yzero),yman, 1'b0};
     generate
        for(i=0; i<27; i=i+1) begin
            booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
        end
     endgenerate
    assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
    assign acc[1] = {50'b01,~e[1],pp[1],add1[0]}; 
    assign acc[2] = {48'b01,~e[2],pp[2],add1[1], 2'b0};
    assign acc[3] = {46'b01,~e[3],pp[3],add1[2], 4'b0};
    assign acc[4] = {44'b01,~e[4],pp[4],add1[3], 6'b0};
    assign acc[5] = {42'b01,~e[5],pp[5],add1[4], 8'b0};
    assign acc[6] = {40'b01,~e[6],pp[6],add1[5], 10'b0};
    assign acc[7] = {38'b01,~e[7],pp[7],add1[6], 12'b0};
    assign acc[8] = {36'b01,~e[8],pp[8],add1[7], 14'b0};
    assign acc[9] = {34'b01,~e[9],pp[9],add1[8], 16'b0};
    assign acc[10] = {32'b01,~e[10],pp[10],add1[9], 18'b0};
    assign acc[11] = {30'b01,~e[11],pp[11],add1[10], 20'b0};
    assign acc[12] = {28'b01,~e[12],pp[12],add1[11], 22'b0};
    assign acc[13] = {26'b01,~e[13],pp[13],add1[12], 24'b0};
    assign acc[14] = {24'b01,~e[14],pp[14],add1[13], 26'b0};
    assign acc[15] = {22'b01,~e[15],pp[15],add1[14], 28'b0};
    assign acc[16] = {20'b01,~e[16],pp[16],add1[15], 30'b0};
    assign acc[17] = {18'b01,~e[17],pp[17],add1[16], 32'b0};
    assign acc[18] = {16'b01,~e[18],pp[18],add1[17], 34'b0};
    assign acc[19] = {14'b01,~e[19],pp[19],add1[18], 36'b0};
    assign acc[20] = {12'b01,~e[20],pp[20],add1[19], 38'b0};
    assign acc[21] = {10'b01,~e[21],pp[21],add1[20], 40'b0};
    assign acc[22] = {8'b01,~e[22],pp[22],add1[21], 42'b0};
    assign acc[23] = {6'b01,~e[23],pp[23],add1[22], 44'b0};
    assign acc[24] = {4'b01,~e[24],pp[24],add1[23], 46'b0};
    assign acc[25] = {~e[25],pp[25],add1[24], 48'b0};
    assign acc[26] = {pp[26],add1[25], 50'b0};
    //*** resize adders
     generate
        for(i=0; i<9; i=i+1) begin
            add3comp2 #(.BITS(106)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
                                           .carry(carryTmp[i][105:0]), .sum(lv1add[i*2+1]));
            assign lv1add[i*2] = {carryTmp[i][104:0], 1'b0};
        end
     endgenerate
     generate
        for(i=0; i<6; i=i+1) begin
            add3comp2 #(.BITS(106)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
                                           .carry(carryTmp[i+9][105:0]), .sum(lv2add[i*2+1]));
            assign lv2add[i*2] = {carryTmp[i+9][104:0], 1'b0};
        end
     endgenerate
    generate
        for(i=0; i<4; i=i+1) begin
            add3comp2 #(.BITS(106)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
                                            .carry(carryTmp[i+15][105:0]), .sum(lv3add[i*2+1]));
            assign lv3add[i*2] = {carryTmp[i+15][104:0], 1'b0};
        end
    endgenerate
    generate
        for(i=0; i<2; i=i+1) begin
            add4comp2 #(.BITS(106)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
                                            .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
            assign lv4add[i*2] = {carryTmp[i+19][104:0], 1'b0};
        end
    endgenerate
    add4comp2 #(.BITS(106)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
                                    .carry(carryTmp[21]), .sum(s));
    assign r = {carryTmp[21][104:0], 1'b0};
 		// assign r = 0;
 		// assign s = acc[0] +
 		// 		   acc[1] +
 		// 		   acc[2] +
 		// 		   acc[3] +
 		// 		   acc[4] +
 		// 		   acc[5] +
 		// 		   acc[6] +
 		// 		   acc[7] +
 		// 		   acc[8] +
 		// 		   acc[9] +
 		// 		   acc[10] +
 		// 		   acc[11] +
 		// 		   acc[12] +
 		// 		   acc[13] +
 		// 		   acc[14] +
 		// 		   acc[15] +
 		// 		   acc[16] +
 		// 		   acc[17] +
 		// 		   acc[18] +
 		// 		   acc[19] +
 		// 		   acc[20] +
 		// 		   acc[21] +
 		// 		   acc[22] +
 		// 		   acc[23] +
 		// 		   acc[24] +
 		// 		   acc[25] +
 		// 		   acc[26];
 			// assign s = {53'b0,~(xdenorm|xzero),xman}  *  {53'b0,~(ydenorm|yzero),yman};
 			// assign r = 0;
 endmodule
--- a/wally-pipelined/src/fpu/FMA/normalize.sv
+++ b/wally-pipelined/src/fpu/FMA/normalize.sv
@ -14,9 +14,11 @@
 /////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////
-module normalize(sum, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero, sumzero, xzero, yzero, bs, ps, denorm0, xdenorm, ydenorm, zdenorm, sticky, de0, resultdenorm, v); 
+module normalize(sum, xexp, yexp, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero, sumzero, xzero, zzero, yzero, bs, ps, denorm0, xdenorm, ydenorm, zdenorm, sticky, de0, resultdenorm, v); 
 /////////////////////////////////////////////////////////////////////////////
 	input     	[163:0]  	sum;            // sum
 	input     	[62:52]  	xexp;            // sum
 	input     	[62:52]  	yexp;            // sum
 	input     	[62:52]  	zexp;            // sum
 	input		[8:0] 		normcnt;     	// normalization shift count
 	input		[12:0] 		ae;     	// normalization shift count
@ -33,6 +35,7 @@ module normalize(sum, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero,
 	input                  		zdenorm;        // Input Z is denormalized
 	input				xzero;
 	input				yzero;
 	input				zzero;
 	output				sticky;		//sticky bit
 	output		[12:0]		de0;
 	output                  	resultdenorm;        // Input Z is denormalized
@ -47,6 +50,7 @@ module normalize(sum, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero,
 	logic		[9:0]		sumshifttmp;
 	logic       	[163:0]  	sumshiftedtmp;     // shifted sum
 	logic 				sticky;
 	logic				isShiftLeft1;
 logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;
 	// When the sum is zero,  normalization does not apply and only the
@ -60,21 +64,23 @@ logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;
 	// The sticky bit calculation is actually built into the shifter and
 	// does not require a true subtraction shown in the model.
 	assign isShiftLeft1 = (aligncnt == 1 ||aligncnt == 0 || $signed(aligncnt) == $signed(-1))&& zexp == 11'h2;//((xexp == 11'h3ff && yexp == 11'h1) || (yexp == 11'h3ff && xexp == 11'h1)) && zexp == 11'h2;
 	assign tmp = ($signed(ae-normcnt+2) >= $signed(-1022));
-	always @(sum or sumshift or ae or aligncnt or normcnt or bs or zexp or zdenorm)
+	always @(sum or sumshift or ae or aligncnt or normcnt or bs or isShiftLeft1 or zexp or zdenorm)
 		begin
 		// d = aligncnt
 		// l = normcnt
 		// p = 53
 		// ea + eb = ae
 			// set d<=2 to d<=0
-			if ($signed(aligncnt)<=$signed(1))  begin //d<=2 
+			if ($signed(aligncnt)<=$signed(2))  begin //d<=2 
 				// product anchored or cancellation
 				if ($signed(ae-normcnt+2) >= $signed(-1022)) begin //ea+eb-l+2 >= emin
 					//normal result
-					de0 = xzero|yzero ? zexp : ae-normcnt+2+xdenorm+ydenorm;
+					de0 = xzero|yzero ? zexp : ae-normcnt+xdenorm+ydenorm+57;
-					resultdenorm = |sum & ~|de0;
+					resultdenorm = |sum & ~|de0 | de0[12];
-					sumshifted = resultdenorm ? sum << sumshift : sum << (55+normcnt); // p+2+l
+					// if z is zero then there was a 56 bit shift of the product
 					sumshifted = resultdenorm ? sum << sumshift-zzero+isShiftLeft1 : sum << normcnt; // p+2+l
 					v = sumshifted[162:109];
 					sticky = (|sumshifted[108:0]) | bs;
 					//de0 = ae-normcnt+2-1023;
@ -90,8 +96,8 @@ logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;
 				sumshifttmp = {1'b0,sumshift} - 2;
 				sumshifted = sumshifttmp[9] ? sum : sum << sumshifttmp;
 				tmp1 = (sumshifted[163] & ~sumshifttmp[9]);
-				tmp2 = (sumshifttmp[9] || sumshifted[162]);
+				tmp2 = ((sumshifttmp[9] & sumshift[0]) || sumshifted[162]);
-				tmp3 = sumshifted[161];
+				tmp3 = (sumshifted[161] || (sumshifttmp[9] & sumshift[1]));
 				tmp4 = sumshifted[160];
 				tmp5 = sumshifted[159];
 				// for some reason use exp = zexp + {0,1,2}
@ -112,25 +118,31 @@ logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;
 					v = sumshifted[160:107];
 					sticky = (|sumshifted[106:0]) | bs;
 					//de0 = zexp-1;
-					de0 = zexp;
+					de0 = zexp+zdenorm;
-				end else if(sumshifted[160]) begin
+				end else if(sumshifted[160]& ~zdenorm) begin
-					v = sumshifted[159:106];
+					de0 = zexp-1;
 					v = ~|de0&~sumzero ? sumshifted[160:107] : sumshifted[159:106];
 					sticky = (|sumshifted[105:0]) | bs;
 					//de0 = zexp-1;
-					de0 = zexp-1;
+				end else if(sumshifted[159]& ~zdenorm) begin
-				end else if(sumshifted[159]) begin
+					//v = sumshifted[158:105];
-					v = sumshifted[158:105];
+					de0 = zexp-2;
 					v = (~|de0 | de0[12])&~sumzero ? sumshifted[161:108] : sumshifted[158:105];
 					sticky = (|sumshifted[104:0]) | bs;
 					//de0 = zexp-1;
-					de0 = zexp-2;
+				end else if(zdenorm) begin					
 				end else begin					
 					v = sumshifted[160:107];
 					sticky = (|sumshifted[106:0]) | bs;
 					//de0 = zexp-1;
 					de0 = zexp;
 				end else begin
 					de0 = 0;
 					sumshifted = sum << sumshift-1; // p+2+l
 					v = sumshifted[162:109];
 					sticky = (|sumshifted[108:0]) | bs;
 				end
-				resultdenorm = ~(|de0);
+				resultdenorm = (~|de0 | de0[12]);
 		end 
 	end
--- a/wally-pipelined/src/fpu/FMA/round.sv
+++ b/wally-pipelined/src/fpu/FMA/round.sv
@ -13,22 +13,17 @@
 /////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////
-module round(v, sticky, rz, rn, rp, rm, wsign,
+module round(v, sticky, FrmE, wsign,
-			  invalid, overflow, underflow, inf, nan, xnan, ynan, znan, 
+			  FmaFlagsM, inf, nan, xnan, ynan, znan, 
 			  xman, yman, zman,
 			  wman, infinity, specialsel,expplus1);
 /////////////////////////////////////////////////////////////////////////////
 	input		[53:0]		v;		// normalized sum, R, S bits
 	input				sticky;		//sticky bit
-	input				rz;		// Round toward zero
+	input		[2:0]	FrmE;
 	input				rn;		// Round toward	nearest
 	input				rp;		// Round toward	plus infinity
 	input				rm;		// Round toward	minus infinity
 	input				wsign;		// Sign of result
-	input 				invalid;	// Trap on infinity, NaN, denorm
+	input 		[4:0]	FmaFlagsM;
 	input				overflow;	// Result overflowed
 	input				underflow;	// Result underflowed
 	input				inf;		// Some input is infinity
 	input				nan;		// Some input is NaN
 	input				xnan;		// X is NaN
@ -45,7 +40,7 @@ module round(v, sticky, rz, rn, rp, rm, wsign,
 	// Internal nodes
-	wire				plus1;		// Round by adding one 
+	logic				plus1;		// Round by adding one 
 	wire		[52:0]		v1;		// Result + 1 (for rounding)
 	wire		[51:0]		specialres;	// Result of exceptional case 
 	wire		[51:0]		infinityres;	// Infinity or largest real number
@ -62,9 +57,19 @@ module round(v, sticky, rz, rn, rp, rm, wsign,
 	//	0xx - do nothing
 	//	100 - tie - plus1 if v[2] = 1
 	//	101/110/111 - plus1
-	assign plus1 = (rn & v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2]))) |
+	always @ (FrmE, v, wsign, sticky) begin
-		       (rp & ~wsign) |
+		case (FrmE)
-		       (rm & wsign);
+			3'b000: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2])));//round to nearest even
 			3'b001: plus1 = 0;//round to zero
 			3'b010: plus1 = wsign;//round down
 			3'b011: plus1 = ~wsign;//round up
 			3'b100: plus1 = (v[1] & (v[0] | sticky | (~v[0]&~sticky&~wsign)));//round to nearest max magnitude
 			default: plus1 = 1'bx;
 		endcase
 	end
 	// assign plus1 = (rn & v[1] & (v[0] | sticky | (~v[0]&~sticky&v[2]))) |
 	// 	       (rp & ~wsign) |
 	// 	       (rm & wsign);
 	//assign plus1 = rn && ((v[1] && v[0]) || (v[2] && (v[1]))) ||
 	//				 rp && ~wsign && (v[1] || v[0]) ||
 	//				 rm && wsign && (v[1] || v[0]);
@ -84,17 +89,17 @@ module round(v, sticky, rz, rn, rp, rm, wsign,
 	// inputs to the wide muxes can be combined at the expense of more
 	// complicated non-critical control in the circuit implementation.
-	assign specialsel =  overflow || underflow || invalid ||
+	assign specialsel =  FmaFlagsM[2] ||  FmaFlagsM[1] ||  FmaFlagsM[4] || //overflow underflow invalid
 							nan || inf;
-	assign specialres = invalid | nan ? nanres : //KEP added nan
+	assign specialres = FmaFlagsM[4] | nan ? nanres : //invalid
-						 overflow ? infinityres : 
+						 FmaFlagsM[2] ? infinityres : //overflow
 						 inf ? 52'b0 :
-						underflow ? 52'b0 : 52'bx;  // default to undefined 
+						 FmaFlagsM[1] ? 52'b0 : 52'bx;  // underflow
 	// Overflow is handled differently for different rounding modes
 	// Round is to either infinity or to maximum finite number
-	assign infinity = rn || (rp && ~wsign) || (rm && wsign);
+	assign infinity =  |FrmE;//rn || (rp && ~wsign) || (rm && wsign);//***look into this
 	assign infinityres = infinity ? 52'b0 : {52{1'b1}};
 	// Invalid operations produce a quiet NaN. The result should
--- a/wally-pipelined/src/fpu/FMA/sign.sv
+++ b/wally-pipelined/src/fpu/FMA/sign.sv
@ -10,23 +10,24 @@
 /////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////
-module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm, overflow,
+module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, FrmE, FmaFlagsM, zzero,
-			 sumzero, nan, invalid, xinf, yinf, zinf, inf, wsign, invz, negsum, selsum1, psign);
+			 sumzero, nan, xinf, yinf, zinf, inf, wsign, invz, negsum, selsum1, psign, isAdd);
 ////////////////////////////////////////////////////////////////////////////I
 	input					xsign;			// Sign of X 
 	input					ysign;			// Sign of Y 
 	input					zsign;			// Sign of Z
 	input					zzero;
 	input					isAdd;
 	input					negsum0;		// Sum in +O mode is negative 
 	input					negsum1;		// Sum in +1 mode is negative 
 	input					bs;				// sticky bit from addend
 	input					ps;				// sticky bit from product
 	input					killprod;		// Product forced to zero
-	input					rm;				// Round toward minus infinity
+	input		[2:0]		FrmE;				// Round toward minus infinity
-	input					overflow;				// Round toward minus infinity
+	input		[4:0]		FmaFlagsM;				// Round toward minus infinity
 	input					sumzero;		// Sum = O
 	input					nan;			// Some input is NaN
 	input					invalid;		// Result invalid
 	input					xinf;			// X = Inf
 	input					yinf;			// Y = Inf
 	input					zinf;			// Y = Inf
@ -96,10 +97,24 @@ logic tmp;
 	//			 shall be +0 in all rounding attributes EXCEPT roundTowardNegative. Under that attribute, the sign of an exact zero 
 	//			 sum/difference shall be -0.  However, x+x = x-(-X) retains the same sign as x even when x is zero."
-	assign zerosign = (~invz && killprod) ? zsign : rm;
+	//assign zerosign = (~invz && killprod) ? zsign : rm;//***look into
 //	assign zerosign = (~invz && killprod) ? zsign : 0;
 	// zero sign
 	//	if product underflows then use psign
 	//	otherwise
 	//		addition
 	//			if cancelation then 0 unless round to -inf
 	//			otherwise psign
 	//		subtraction
 	//			if cancelation then 0 unless round to -inf
 	//			otherwise psign
 	assign zerosign = FmaFlagsM[1] ? psign :
 			  (isAdd ? (psign^zsign ? FrmE == 3'b010 : psign) :
 				  (psign^zsign ? psign : FrmE == 3'b010));
 	assign infsign = zinf ? zsign : psign; //KEP 210112 keep the correct sign when result is infinity
 	//assign infsign = xinf ? (yinf ? psign : xsign) : yinf ? ysign : zsign;//original
-	assign tmp = invalid ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum));
+	assign tmp = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum));
-	assign wsign = invalid ? 0 : (inf ? infsign :(sumzero ? zerosign : sumneg));
+	assign wsign = FmaFlagsM[4] ? 0 : (inf ? infsign :(sumzero ? zerosign : sumneg));
 endmodule
--- a/wally-pipelined/src/fpu/FMA/special.sv
+++ b/wally-pipelined/src/fpu/FMA/special.sv
@ -10,49 +10,49 @@
 /////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////
-module special(x, y, z, ae, xzero, yzero, zzero,
+module special(ReadData1E, ReadData2E, ReadData3E, ae, xzero, yzero, zzero,
 				xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, xinf, yinf, zinf);
 /////////////////////////////////////////////////////////////////////////////
-	input   	[63:0]     	x;              // Input x
+	input   	[63:0]     	ReadData1E;              // Input ReadData1E
-	input     	[63:0]     	y;           	// Input Y
+	input     	[63:0]     	ReadData2E;           	// Input ReadData2E
-	input      	[63:0]    	z;            	// Input z 
+	input      	[63:0]    	ReadData3E;            	// Input ReadData3E 
 	input		[12:0]		ae;		// exponent of product
-	output				xzero;		// Input x = 0
+	output				xzero;		// Input ReadData1E = 0
-	output				yzero;		// Input y = 0
+	output				yzero;		// Input ReadData2E = 0
-	output				zzero;		// Input z = 0
+	output				zzero;		// Input ReadData3E = 0
-	output				xnan;		// x is NaN
+	output				xnan;		// ReadData1E is NaN
-	output				ynan;		// y is NaN
+	output				ynan;		// ReadData2E is NaN
-	output				znan;		// z is NaN
+	output				znan;		// ReadData3E is NaN
-	output				xdenorm;	// x is denormalized
+	output				xdenorm;	// ReadData1E is denormalized
-	output				ydenorm;	// y is denormalized
+	output				ydenorm;	// ReadData2E is denormalized
-	output				zdenorm;	// z is denormalized
+	output				zdenorm;	// ReadData3E is denormalized
 	output				proddenorm;	// product is denormalized
-	output				xinf;		// x is infinity
+	output				xinf;		// ReadData1E is infinity
-	output				yinf;		// y is infinity
+	output				yinf;		// ReadData2E is infinity
-	output				zinf;		// z is infinity
+	output				zinf;		// ReadData3E is infinity
 	// In the actual circuit design, the gates looking at bits
 	// 51:0 and at bits 62:52 should be shared among the various detectors.
 	// Check if input is NaN
-	assign xnan = &x[62:52] && |x[51:0]; 
+	assign xnan = &ReadData1E[62:52] && |ReadData1E[51:0]; 
-	assign ynan = &y[62:52] && |y[51:0]; 
+	assign ynan = &ReadData2E[62:52] && |ReadData2E[51:0]; 
-	assign znan = &z[62:52] && |z[51:0];
+	assign znan = &ReadData3E[62:52] && |ReadData3E[51:0];
 	// Check if input is denormalized
-	assign xdenorm = ~(|x[62:52]) && |x[51:0]; 
+	assign xdenorm = ~(|ReadData1E[62:52]) && |ReadData1E[51:0]; 
-	assign ydenorm = ~(|y[62:52]) && |y[51:0]; 
+	assign ydenorm = ~(|ReadData2E[62:52]) && |ReadData2E[51:0]; 
-	assign zdenorm = ~(|z[62:52]) && |z[51:0];
+	assign zdenorm = ~(|ReadData3E[62:52]) && |ReadData3E[51:0];
 	assign proddenorm = &ae & ~xzero & ~yzero; //KEP is the product denormalized
 	// Check if input is infinity
-	assign xinf = &x[62:52] && ~(|x[51:0]); 
+	assign xinf = &ReadData1E[62:52] && ~(|ReadData1E[51:0]); 
-	assign yinf = &y[62:52] && ~(|y[51:0]); 
+	assign yinf = &ReadData2E[62:52] && ~(|ReadData2E[51:0]); 
-	assign zinf = &z[62:52] && ~(|z[51:0]);
+	assign zinf = &ReadData3E[62:52] && ~(|ReadData3E[51:0]);
 	// Check if inputs are all zero
 	// Also forces denormalized inputs to zero.
@ -60,11 +60,11 @@ module special(x, y, z, ae, xzero, yzero, zzero,
 	// to just check if the exponent is zero.
 	// KATHERINE - commented following (21/01/11)
-	// assign xzero = ~(|x[62:0]) || xdenorm;
+	// assign xzero = ~(|ReadData1E[62:0]) || xdenorm;
-	// assign yzero = ~(|y[62:0]) || ydenorm;
+	// assign yzero = ~(|ReadData2E[62:0]) || ydenorm;
-	// assign zzero = ~(|z[62:0]) || zdenorm;
+	// assign zzero = ~(|ReadData3E[62:0]) || zdenorm;
 	// KATHERINE - removed denorm to prevent outputing zero when computing with a denormalized number
-	assign xzero = ~(|x[62:0]);
+	assign xzero = ~(|ReadData1E[62:0]);
-	assign yzero = ~(|y[62:0]);
+	assign yzero = ~(|ReadData2E[62:0]);
-	assign zzero = ~(|z[62:0]);
+	assign zzero = ~(|ReadData3E[62:0]);
 endmodule
--- a/wally-pipelined/src/fpu/FMA/tbgen/results.dat
+++ b/wally-pipelined/src/fpu/FMA/tbgen/results.dat
@ -1,16 +1 @@
-0010000000000000 bf4fdffffff7fffe 800ffffffffffffe 800003fbfffffefe 801003fbfffffefe  Wrong zdenorm 308227
+c3f000200003fffe 0000000000000001 001ffffffffffffe 80cffc400007fffd 80cffc400007fffc  Wrong FmaResultM=  -64 ydenorm 1119653
 0010000000000000 be6fffffbffffff7 8000000000000000 800000001fffffc0 800000000fffffe0  Wrong 313753
 001ffffffffffffe 3fddfbffffffffff 000ffffffffffffe 000efdfffffffffd 001efdfffffffffd  Wrong zdenorm 551371
 3befe000ffffffff 800ffffffffffffe 0000000000000000 0000000000000000 8000000000000000  Wrong ydenorm unflw 665575
 000007fffffffffe 3f6ffffffe01fffe 000ffffffffffffe 00000007ffffff7e 00100007ffffff7e  Wrong xdenorm zdenorm 768727
 3fdffffffffffffe 000ffffffffffffe 8000000000000001 7feffffffffffff6 0007fffffffffffe  Wrong ydenorm zdenorm 1049939
 7fe0000000000001 4000000000000000 ffefffffffffffff 7ff0000000000000 7cb8000000000000  Wrong w=+inf 2602745
 000fff000000000f 3ff00800001fffff 8010000000000000 7f7bfe007ff8381e 000006ff801ffe0e  Wrong xdenorm 3117277
 8000000000000001 40211275ffe5ee3c 0000000000000001 fcfe24ebffcbdc78 8000000000000008  Wrong xdenorm zdenorm 3148591
 801fffffffffffff bfdffffffffffffe 0000000000021fff 0000000000021ffe 0010000000021ffe  Wrong zdenorm 3537867
 801ffffffffffffe 0010000000000001 0000000000000000 0000000000000000 8000000000000000  Wrong unflw 3564269
 bca0000000000001 000fffffc000001e 8000000000000000 8000000000000001 8000000000000000  Wrong ydenorm 3717769
 bcafffffffffffff 800ffffffffffffe 8000000000000000 0000000000000002 0000000000000001  Wrong ydenorm 3807413
 7fec5fed92358a74 400000001bffffff ffefc0003ffffffe 7ff0000000000000 7fe8ffdb47bad466  Wrong w=+inf 3889689
 bfdfffffffffffff 3fdf1f3616aa73e1 3fd0000000000001 3fd07064f4aac611 3f7c193d2ab1843f  Wrong 4099063
 3fd07dfffffffffe 8010000000000001 0000000000000001 ffe07dfffffffffb 80041f7fffffffff  Wrong zdenorm 4716133
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.c
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.c
@ -20,19 +20,19 @@ void main() {
 		// b68ffff8000000ff_3f9080000007ffff_b6307ffbe0080080_00001
                char ch;
 		int i,j,n;
-		char x[17];
+		char ReadData1E[17];
-		char y[17];
+		char ReadData2E[17];
-		char z[17];
+		char ReadData3E[17];
 		char ans[81];
 		char flags[3];
-		int rn,rz,rm,rp;
+		int FrmE;
-		long stop = 4099063;
+		long stop = 1119653;
 		int debug = 1;
 		//my_string = (char *) malloc (nbytes + 1);
 		//bytes_read = getline (&my_string, &nbytes, stdin);
-		for(n=0; n < 613; n++) {//613 for 10000
+		for(n=0; n < 305; n++) {//613 for 10000
 			if(getline(&ln,&nbytes,fp) < 0 || feof(fp)) break;
 			if(k == stop && debug == 1) break;
 			k++;
@ -41,71 +41,59 @@ void main() {
 		if(!feof(fp)) {
-			strncpy(x,   ln,     16); x[16]=0;
+			strncpy(ReadData1E,   ln,     16); ReadData1E[16]=0;
-			strncpy(y,    &ln[17], 16); y[16]=0;
+			strncpy(ReadData2E,    &ln[17], 16); ReadData2E[16]=0;
-			strncpy(z,  &ln[34], 16); z[16]=0;
+			strncpy(ReadData3E,  &ln[34], 16); ReadData3E[16]=0;
-			// fprintf(stdout,"[%s]\n[%s]\n", ln,z);
+			// fprintf(stdout,"[%s]\n[%s]\n", ln,ReadData3E);
 			strncpy(ans,  &ln[51], 16); ans[16]=0;
 			strncpy(flags,&ln[68],2);   flags[2]=0;
-			// fprintf(stdout,"[%s]\n[%s]\n", ln,z);
+			// fprintf(stdout,"[%s]\n[%s]\n", ln,ReadData3E);
-			fprintf(fq,"    x = 64'h%s;\n",x); 
+			fprintf(fq,"    ReadData1E = 64'h%s;\n",ReadData1E); 
-			fprintf(fq,"    y = 64'h%s;\n",y); 
+			fprintf(fq,"    ReadData2E = 64'h%s;\n",ReadData2E); 
-			fprintf(fq,"    z = 64'h%s;\n",z);
+			fprintf(fq,"    ReadData3E = 64'h%s;\n",ReadData3E);
 			fprintf(fq,"    ans = 64'h%s;\n", ans);
 			// fprintf(fq,"    flags = 5'h%s;\n", flags);
 			{
 				//rn=1; rz=0; rm=0; rp=0;
-				fprintf(fq,"    rn = %d;\n",1);
+				fprintf(fq,"    FrmE = 3'b000;\n");
 				fprintf(fq,"    rz = %d;\n", 0);
 				fprintf(fq,"    rm = %d;\n", 0);
 				fprintf(fq,"    rp = %d;\n", 0);
 			}
 			{
 				fprintf(fq,"    earlyres = 64'b0;\n");
 				fprintf(fq,"    earlyressel = 0;\n");
 			}		
 			{
 				fprintf(fq,"    bypsel= 2'b0;\n"); //, bysel);
 				fprintf(fq,"    bypplus1 = 0;\n"); //, byp1);
 				fprintf(fq,"    byppostnorm = 0;\n"); //, bypnorm);
 			}
 			fprintf(fq,"#10\n");
 			// IEEE 754-2008 section 6.3 states "When either an input or result is NaN, this standard does not interpret the sign of a NaN."
-			//fprintf(fq,"	$fwrite(fp, \"%%h %%h %%h %%h \",x,y,w, ans);\n");	
+			//fprintf(fq,"	$fwrite(fp, \"%%h %%h %%h %%h \",ReadData1E,ReadData2E,FmaResultM, ans);\n");	
 			fprintf(fq,"    // IEEE 754-2008 section 6.3 states: \"When ether an input or result is NaN, this\n");
 			fprintf(fq,"    //                                     standard does not interpret the sign of a NaN.\"\n");
-			fprintf(fq,"	wnan = &w[62:52] && |w[51:0]; \n");
+			fprintf(fq,"	wnan = &FmaResultM[62:52] && |FmaResultM[51:0]; \n");
-			fprintf(fq,"	xnan = &x[62:52] && |x[51:0]; \n");
+			fprintf(fq,"	xnan = &ReadData1E[62:52] && |ReadData1E[51:0]; \n");
-			fprintf(fq,"	ynan = &y[62:52] && |y[51:0]; \n");
+			fprintf(fq,"	ynan = &ReadData2E[62:52] && |ReadData2E[51:0]; \n");
-			fprintf(fq,"	znan = &z[62:52] && |z[51:0]; \n");
+			fprintf(fq,"	znan = &ReadData3E[62:52] && |ReadData3E[51:0]; \n");
 			fprintf(fq,"	ansnan = &ans[62:52] && |ans[51:0]; \n");
-			fprintf(fq,"	xnorm = ~(|x[62:52]) && |x[51:0] ? {x[50:0], 1'b0} : x; \n");
+			fprintf(fq,"	xnorm = ~(|ReadData1E[62:52]) && |ReadData1E[51:0] ? {ReadData1E[50:0], 1'b0} : ReadData1E; \n");
-			fprintf(fq,"	ynorm = ~(|y[62:52]) && |y[51:0] ? {y[50:0], 1'b0} : y;\n");
+			fprintf(fq,"	ynorm = ~(|ReadData2E[62:52]) && |ReadData2E[51:0] ? {ReadData2E[50:0], 1'b0} : ReadData2E;\n");
-			fprintf(fq,"	s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm}; \n");
+			// fprintf(fq,"	s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm}; \n");
-			// fprintf(fq,"    if(!(~(|x[62:52]) && |x[51:0] || ~(|y[62:52]) && |y[51:0])) begin\n"); 
+			// fprintf(fq,"    if(!(~(|ReadData1E[62:52]) && |ReadData1E[51:0] || ~(|ReadData2E[62:52]) && |ReadData2E[51:0])) begin\n"); 
 																							// not looking at negative zero results right now
-			//fprintf(fq,"	  if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) && !(w == 64'h8000000000000000 && ans == 64'b0)) begin\n"); 
+			//fprintf(fq,"	  if( (nan && (FmaResultM[62:0] != ans[62:0])) || (!nan && (FmaResultM != ans)) && !(FmaResultM == 64'h8000000000000000 && ans == 64'b0)) begin\n"); 
-			// fprintf(fq,"	if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) ) begin\n"); 
+			// fprintf(fq,"	if( (nan && (FmaResultM[62:0] != ans[62:0])) || (!nan && (FmaResultM != ans)) ) begin\n"); 
-			fprintf(fq,"	if((!wnan && (w != ans)) || (wnan && ansnan && ~(((xnan && (w[62:0] == {x[62:52],1'b1,x[50:0]})) || (ynan && (w[62:0] == {y[62:52],1'b1,y[50:0]}))  || (znan && (w[62:0] == {z[62:52],1'b1,z[50:0]})) || (w[62:0] == ans[62:0])) ))) begin\n"); 
+			fprintf(fq,"	if((!wnan && (FmaResultM != ans)) || (wnan && ansnan && ~(((xnan && (FmaResultM[62:0] == {ReadData1E[62:52],1'b1,ReadData1E[50:0]})) || (ynan && (FmaResultM[62:0] == {ReadData2E[62:52],1'b1,ReadData2E[50:0]}))  || (znan && (FmaResultM[62:0] == {ReadData3E[62:52],1'b1,ReadData3E[50:0]})) || (FmaResultM[62:0] == ans[62:0])) ))) begin\n"); 
-			fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",x,y, z, w, ans);\n");
+			fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",ReadData1E,ReadData2E, ReadData3E, FmaResultM, ans);\n");
 			//fprintf(fq,"		$fwrite(fp, \"%%h \",s);\n");
-			fprintf(fq,"		if(w == 64'h8000000000000000) $fwrite(fp, \"w=-zero \");\n");
+			fprintf(fq,"		$fwrite(fp, \"FmaResultM=%%d \",$signed(aligncnt));\n");
-			fprintf(fq,"		if(~(|x[62:52]) && |x[51:0]) $fwrite(fp, \"xdenorm \");\n");
+			fprintf(fq,"		if(FmaResultM == 64'h8000000000000000) $fwrite(fp, \"FmaResultM=-zero \");\n");
-			fprintf(fq,"		if(~(|y[62:52]) && |y[51:0]) $fwrite(fp, \"ydenorm \");\n");
+			fprintf(fq,"		if(~(|ReadData1E[62:52]) && |ReadData1E[51:0]) $fwrite(fp, \"xdenorm \");\n");
-			fprintf(fq,"		if(~(|z[62:52]) && |z[51:0]) $fwrite(fp, \"zdenorm \");\n");
+			fprintf(fq,"		if(~(|ReadData2E[62:52]) && |ReadData2E[51:0]) $fwrite(fp, \"ydenorm \");\n");
-			fprintf(fq,"		if(invalid != 0) $fwrite(fp, \"invld \");\n");
+			fprintf(fq,"		if(~(|ReadData3E[62:52]) && |ReadData3E[51:0]) $fwrite(fp, \"zdenorm \");\n");
-			fprintf(fq,"		if(overflow != 0) $fwrite(fp, \"ovrflw \");\n");
+			fprintf(fq,"		if(FmaFlagsM[4] != 0) $fwrite(fp, \"invld \");\n");
-			fprintf(fq,"		if(underflow != 0) $fwrite(fp, \"unflw \");\n");
+			fprintf(fq,"		if(FmaFlagsM[2] != 0) $fwrite(fp, \"ovrflw \");\n");
-			fprintf(fq,"		if(w == 64'hFFF0000000000000) $fwrite(fp, \"w=-inf \");\n");
+			fprintf(fq,"		if(FmaFlagsM[1] != 0) $fwrite(fp, \"unflw \");\n");
-			fprintf(fq,"		if(w == 64'h7FF0000000000000) $fwrite(fp, \"w=+inf \");\n");
+			fprintf(fq,"		if(FmaResultM == 64'hFFF0000000000000) $fwrite(fp, \"FmaResultM=-inf \");\n");
-			fprintf(fq,"		if(w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
+			fprintf(fq,"		if(FmaResultM == 64'h7FF0000000000000) $fwrite(fp, \"FmaResultM=+inf \");\n");
-			fprintf(fq,"		if(w >  64'hFFF8000000000000 && w <  64'hFFF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
+			fprintf(fq,"		if(FmaResultM >  64'h7FF0000000000000 && FmaResultM <  64'h7FF8000000000000 ) $fwrite(fp, \"FmaResultM=sigNaN \");\n");
-			fprintf(fq,"		if(w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+			fprintf(fq,"		if(FmaResultM >  64'hFFF8000000000000 && FmaResultM <  64'hFFF8000000000000 ) $fwrite(fp, \"FmaResultM=sigNaN \");\n");
-			fprintf(fq,"		if(w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+			fprintf(fq,"		if(FmaResultM >= 64'h7FF8000000000000 && FmaResultM <= 64'h7FFfffffffffffff ) $fwrite(fp, \"FmaResultM=qutNaN \");\n");
 			fprintf(fq,"		if(FmaResultM >= 64'hFFF8000000000000 && FmaResultM <= 64'hFFFfffffffffffff ) $fwrite(fp, \"FmaResultM=qutNaN \");\n");
 			fprintf(fq,"		if(ans == 64'hFFF0000000000000) $fwrite(fp, \"ans=-inf \");\n");
 			fprintf(fq,"		if(ans == 64'h7FF0000000000000) $fwrite(fp, \"ans=+inf \");\n");
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
@ -2,38 +2,27 @@
 module tb;
- reg 		[63:0]		x;
+ reg 	[63:0]		ReadData1E;
- reg 		[63:0]		y;
+ reg 	[63:0]		ReadData2E;
- reg 		[63:0]		z;
+ reg 	[63:0]		ReadData3E;
 reg 	[63:0]		ans;
- reg 						rn;
+ reg 	[2:0]		FrmE;
- reg 						rz;
+ wire 	[63:0]		FmaResultM;
- reg 						rm;
+ wire 	[4:0]	 	FmaFlagsM;
 reg 						rp;
 reg 		[63:0]		earlyres;
 reg 						earlyressel;
 reg 		[1:0]			bypsel;
 reg 						bypplus1;
 reg 						byppostnorm;
 wire 	[63:0]		w;
 wire 	[63:0]		wbypass;
 wire 		 			invalid;
 wire 					overflow;
 wire 					underflow;
 wire 					inexact;
 integer fp;
 reg wnan;
 reg xnan;
 reg ynan;
 reg znan;
 wire [12:0] aligncnt;
 reg ansnan;
 reg		[105:0]		s;				//	partial product 2	
 reg		[51:0] 		xnorm;
 reg 		[51:0] 		ynorm;
 localparam period = 20;  
-fmac UUT(.*);
+fma UUT(.*);
 initial 
--- a/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
@ -1 +1 @@
-testfloat_gen f64_mulAdd -n 6133248 -rnear_even -seed 113355 -level 1 >> testFloat
+testfloat_gen f64_mulAdd -n 6133248 -rminMag -seed 113355 -level 1 >> testFloat
--- a/wally-pipelined/src/fpu/csa.sv
+++ b/wally-pipelined/src/fpu/csa.sv
@ -50,7 +50,7 @@ module FA_array (S, C, A, B, Ci) ;
   genvar 	  i;
   generate
      for (i = 0; i < n; i = i + 1) begin : index
-	 fa FA1(.S(S[i]), .C(C[i]), .A(A[i]), .B(B[i]), .Ci(Ci[i]));
+	 fa FA1(.sum(S[i]), .carry(C[i]), .a(A[i]), .b(B[i]), .c(Ci[i]));
      end
   endgenerate
`@ -1 +1 @@`
	`testfloat_gen f64_mulAdd -n 6133248 -rnear_even -seed 113355 -level 1 >> testFloat`	`testfloat_gen f64_mulAdd -n 6133248 -rminMag -seed 113355 -level 1 >> testFloat`