inital FMA push

2021-02-23 20:19:12 +00:00 · 2021-02-23 20:19:12 +00:00 · 7b103423e1
commit 7b103423e1
parent 64536dbc34
27 changed files with 11498 additions and 0 deletions
--- a/wally-pipelined/src/fpu/FMA/add.v
+++ b/wally-pipelined/src/fpu/FMA/add.v
@ -0,0 +1,61 @@
+////////////////////////////////////////////////////////////////////////////////
+//
+// Block Name:	add.v
+// Author:		David Harris
+// Date:		11/12/1995
+//
+// Block Description:
+//       This block performs the addition of the product and addend.   It also
+//   contains logic necessary to adjust the signs for effective subtracts 
+//   and negative results. 
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+module add(
+	input  [105:0] r,           // partial product 1
+	input  [105:0] s,           // partial product 2
+	input  [157:0] t,           // aligned addend
+	output [157:0] sum,         // selected sum, complemented when negsum is set
+	input          negsum,      // complement the selected sum
+	input          invz,        // negate the addend before it is added
+	input          selsum1,     // select the +1 result of the compound adder
+	input          killprod,    // Z >> product
+	output         negsum0,     // MSB of the +0 result
+	output         negsum1,     // MSB of the +1 result
+	input          proddenorm   // when set, the product is kept even if killprod is set
+);
+////////////////////////////////////////////////////////////////////////////////
+// Adds the two partial products to the (optionally negated) addend and
+// produces both the +0 and +1 results, their sign bits, and the selected sum.
+////////////////////////////////////////////////////////////////////////////////
+
+	// The product is discarded only when killprod is set and proddenorm is clear.
+	wire           dropprod = ~proddenorm & killprod;
+	wire [105:0]   rkept    = dropprod ? 106'b0 : r;
+	wire [105:0]   skept    = dropprod ? 106'b0 : s;
+
+	// NOTE(review): the port is named "invz" but the code applies a full
+	// two's-complement negation (-t), not a bitwise inversion.
+	wire [157:0]   addend   = invz ? -t : t;
+
+	// Compound adder: the product terms are zero-extended to 158 bits, and the
+	// +1 result is the +0 result plus one (all arithmetic is modulo 2^158).
+	wire [157:0]   plus0    = {52'b0, rkept} + {52'b0, skept} + addend;
+	wire [157:0]   plus1    = plus0 + 158'b1;
+
+	// Sign bits of both candidate results
+	assign negsum0 = plus0[157];
+	assign negsum1 = plus1[157];
+
+	// Pick the +0/+1 result, then complement it bitwise if negsum is set
+	wire [157:0]   picked   = selsum1 ? plus1 : plus0;
+	assign sum = negsum ? ~picked : picked;
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/align.v
+++ b/wally-pipelined/src/fpu/FMA/align.v
@ -0,0 +1,99 @@
+/////////////////////////////////////////////////////////////////////////////// 
+// Block Name:	align.v
+// Author:		David Harris
+// Date:		11/2/1995
+//
+// Block Description:
+//   This block implements the alignment shifter.   It is responsible for
+//   adjusting the fraction portion of the addend relative to the fraction
+//   produced in the multiplier array.
+//
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t[157:0], bs, ps, 
+             killprod,  bypsel[1], bypplus1, byppostnorm);
+/////////////////////////////////////////////////////////////////////////////
+// Alignment shifter for the addend.
+//
+// The addend fraction (with its leading one) is placed at bits [104:52] of a
+// 158-bit vector and shifted left or right by the two's-complement count
+// aligncnt.  Outputs the shifted addend t, the addend sticky bit bs, the
+// product sticky bit ps, and killprod.
+/////////////////////////////////////////////////////////////////////////////
+
+	input 		[51:0]		z;				// Fraction of addend z;
+	input 		[12:0]		ae;				// exponent of the multiply (only bit 12 is examined here)
+	input 		[11:0]		aligncnt;		// amount to shift (two's complement; negative = right shift)
+	input					xzero;			// Input X = 0
+	input                  	yzero;          // Input Y = 0 
+	input                  	zzero;          // Input Z = 0
+	input                  	zdenorm;        // Input Z = denorm (not referenced by the logic below)
+	input					proddenorm;		// (not referenced by the logic below)
+	input     	[1:1] 		bypsel;         // Select bypass to X or Z
+	input					bypplus1;		// Add one to bypassed result
+	input                  	byppostnorm;    // Postnormalize bypassed result 
+	output    	[157:0]    	t;              // aligned addend (54 bits left of bpt)
+	output          		bs;           	// sticky bit of addend
+	output          		ps;           	// sticky bit of product
+	output          		killprod;    	// Z >> product
+
+	// Internal nodes
+ 
+	reg       	[157:0]   	t;				// aligned addend from shifter
+	reg             		killprod;		// Z >> product 
+	reg             		bs;				// sticky bit of addend
+	reg             		ps;				// sticky bit of product
+	integer					i;				// index used while collecting the sticky bit
+	wire		[52:0]		z1;				// Z plus 1
+	wire		[51:0]		z2;				// Z selected after handling rounds
+	wire		[11:0]		align104;		// alignment count + 104
+	wire		[11:0]		rshift;			// magnitude of a negative alignment count
+	wire		[157:0]		zfrac;			// unshifted addend: leading one, fraction, 52 guard zeros
+
+	// Increment fraction of Z by  one if necessary for prerounded bypass
+	// This incrementor delay is masked by the alignment count computation
+
+	assign z1 =  z + 1;
+	assign z2 = bypsel[1] && bypplus1 ? (byppostnorm ? z1[52:1] : z1[51:0]): z;
+
+	// Compute sign of aligncnt + 104 to check for shifting too far right 
+
+	assign align104 = aligncnt+104;
+
+	// Right-shift distance.  The negation is done in a 12-bit context so the
+	// result is the true magnitude of a negative aligncnt.
+
+	assign rshift = -aligncnt;
+
+	assign zfrac = {53'b0, ~zzero, z2, 52'b0};
+	
+	// Shift addend by alignment count.  Generate sticky bits from
+	// addend on right shifts.  Handle special cases of shifting
+	// by too much.
+	//
+	// The block is sensitive to every signal it reads (including ae, which
+	// the original explicit sensitivity list omitted).
+
+	always @(*)
+		begin
+
+		// Default to clearing sticky bits 
+		bs = 0;
+		ps = 0;
+
+		// And to using product as primary operand in adder / exponent gen 
+		killprod = 0;
+
+		if(zzero) begin 
+			t = 158'b0;
+			if (xzero || yzero) killprod = 1;
+		end else if ((aligncnt > 53 && ~aligncnt[11]) || xzero || yzero) begin
+									// Left shift by huge amount
+									// or product = 0
+			t = zfrac; 
+			killprod = 1;
+			ps = ~xzero && ~yzero; 
+		end else if ((ae[12] && align104[11])) begin //***fix the if statement
+			// KEP if the multiplier's exponent overflows
+			t = zfrac; 
+			killprod = 1;
+			ps = ~xzero && ~yzero; 
+		end else if(align104[11])  begin 	// Right shift by huge amount
+			bs = ~zzero;
+			t = 0;
+		end else if (~aligncnt[11])  begin 	// Left shift by reasonable amount
+			t = zfrac << aligncnt;
+		end else begin                 		// Otherwise right shift (by 1..104)
+			t = zfrac >> rshift;
+
+			// Sticky bit: OR of every fraction bit shifted off the bottom.
+			// z2[i] sits at bit 52+i of zfrac, so it is lost when 52+i < rshift.
+			// (The original guarded this with "aligncnt < 0", which is always
+			// false for an unsigned vector, so bs was never set here.)
+			for (i = 0; i < 52; i = i + 1)
+				if (i + 52 < rshift)
+					bs = bs | z2[i];
+		end 
+	end
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/array.sv
+++ b/wally-pipelined/src/fpu/FMA/array.sv
@ -0,0 +1,114 @@
+
+// Multiplier array: recodes the y fraction three bits at a time through 27
+// booth instances, places each partial product at its 2-bit-per-row offset,
+// and reduces the 27 rows through four levels of add3comp2/add4comp2
+// compressors plus a final add4comp2 to two 106-bit vectors r and s.
+// NOTE(review): bypsel and bypplus1 are not referenced by the active logic in
+// this module; they appear only in the commented-out reference model at the end.
+module array(x, y, xdenorm, ydenorm, r, s, bypsel, bypplus1); 
+/////////////////////////////////////////////////////////////////////////////
+
+	input 		[51:0]		x;				// Fraction of multiplicand	x
+	input		[51:0]		y;				// Fraction of multiplicand y	
+	input					xdenorm;		// is x denormalized	
+	input					ydenorm;		// is y denormalized	
+	input					bypsel;			// Bypass X	
+	input				bypplus1;		// Add 1 to X to handle rounding
+	output		[105:0]		r;				//	partial product 1	
+	output		[105:0]		s;				//	partial product 2	
+
+	wire 		[51:0] 		xnorm;			// x fraction, shifted left one bit when xdenorm
+	wire 		[51:0] 		ynorm;			// y fraction, shifted left one bit when ydenorm
+    
+     wire        [54:0]      yExt; //y with appended 0 and assumed 1
+     wire        [53:0]      xExt; //x with assumed 1
+     wire [26:0][1:0] add1;    // per-row 2-bit correction term from each booth instance
+     wire [26:0][54:0] pp;     // per-row partial product from each booth instance
+     wire [26:0] e;            // per-row e output from each booth instance
+     logic [17:0][105:0] lv1add;   // outputs of reduction level 1 (9 x 3:2)
+     logic [11:0][105:0] lv2add;   // outputs of reduction level 2 (6 x 3:2)
+     logic [7:0][105:0] lv3add;    // outputs of reduction level 3 (4 x 3:2)
+     logic [3:0][105:0] lv4add;    // outputs of reduction level 4 (2 x 4:2)
+     logic [21:0][106:0] carryTmp; // raw carry vector of every compressor, before the left shift
+     wire [26:0][105:0] acc;       // positioned partial-product rows fed to level 1
+     // wire [105:0] acc
+    genvar i;
+
+    assign xnorm = xdenorm ? {x[50:0], 1'b0} : x; // normalization of denormalized numbers
+	assign ynorm = ydenorm ? {y[50:0], 1'b0} : y;
+     assign yExt = {2'b01,ynorm,1'b0}; // y extended and added assumed 1
+     assign xExt = {2'b01,xnorm}; // x with added assumed 1
+
+
+     //booth encoding
+     // Instance i examines the overlapping 3-bit window yExt[2i+2:2i].
+
+     generate
+        for(i=0; i<27; i=i+1) begin
+            booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
+        end
+     endgenerate
+
+    // Row i carries pp[i] shifted left by 2*i bits, the add1 term of the
+    // previous row just below it, and ~e[i] plus constant padding above it.
+    // NOTE(review): most of these concatenations are 107 or 108 bits wide, so
+    // the uppermost bits are truncated when assigned to the 106-bit rows.
+    assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
+    assign acc[1] = {50'b01,~e[1],pp[1],add1[0]}; 
+    assign acc[2] = {48'b01,~e[2],pp[2],add1[1], 2'b0};
+    assign acc[3] = {46'b01,~e[3],pp[3],add1[2], 4'b0};
+    assign acc[4] = {44'b01,~e[4],pp[4],add1[3], 6'b0};
+    assign acc[5] = {42'b01,~e[5],pp[5],add1[4], 8'b0};
+    assign acc[6] = {40'b01,~e[6],pp[6],add1[5], 10'b0};
+    assign acc[7] = {38'b01,~e[7],pp[7],add1[6], 12'b0};
+    assign acc[8] = {36'b01,~e[8],pp[8],add1[7], 14'b0};
+    assign acc[9] = {34'b01,~e[9],pp[9],add1[8], 16'b0};
+    assign acc[10] = {32'b01,~e[10],pp[10],add1[9], 18'b0};
+    assign acc[11] = {30'b01,~e[11],pp[11],add1[10], 20'b0};
+    assign acc[12] = {28'b01,~e[12],pp[12],add1[11], 22'b0};
+    assign acc[13] = {26'b01,~e[13],pp[13],add1[12], 24'b0};
+    assign acc[14] = {24'b01,~e[14],pp[14],add1[13], 26'b0};
+    assign acc[15] = {22'b01,~e[15],pp[15],add1[14], 28'b0};
+    assign acc[16] = {20'b01,~e[16],pp[16],add1[15], 30'b0};
+    assign acc[17] = {18'b01,~e[17],pp[17],add1[16], 32'b0};
+    assign acc[18] = {16'b01,~e[18],pp[18],add1[17], 34'b0};
+    assign acc[19] = {14'b01,~e[19],pp[19],add1[18], 36'b0};
+    assign acc[20] = {12'b01,~e[20],pp[20],add1[19], 38'b0};
+    assign acc[21] = {10'b01,~e[21],pp[21],add1[20], 40'b0};
+    assign acc[22] = {8'b01,~e[22],pp[22],add1[21], 42'b0};
+    assign acc[23] = {6'b01,~e[23],pp[23],add1[22], 44'b0};
+    assign acc[24] = {4'b01,~e[24],pp[24],add1[23], 46'b0};
+    assign acc[25] = {~e[25],pp[25],add1[24], 48'b0};
+    assign acc[26] = {pp[26],add1[25], 50'b0};
+
+    //*** resize adders
+    // Level 1: 27 rows -> 18.  Each compressor's sum goes to the odd slot and
+    // its carry, shifted left one bit, goes to the even slot.
+    // Note: the instance name add1 below lives inside the generate scope and
+    // shadows the add1 wire declared above within that scope only.
+     generate
+        for(i=0; i<9; i=i+1) begin
+            add3comp2 #(.BITS(106)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
+                                           .carry(carryTmp[i][105:0]), .sum(lv1add[i*2+1]));
+            assign lv1add[i*2] = {carryTmp[i][104:0], 1'b0};
+        end
+     endgenerate
+
+     // Level 2: 18 rows -> 12
+     generate
+        for(i=0; i<6; i=i+1) begin
+            add3comp2 #(.BITS(106)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
+                                           .carry(carryTmp[i+9][105:0]), .sum(lv2add[i*2+1]));
+            assign lv2add[i*2] = {carryTmp[i+9][104:0], 1'b0};
+        end
+     endgenerate
+
+    // Level 3: 12 rows -> 8
+    generate
+        for(i=0; i<4; i=i+1) begin
+            add3comp2 #(.BITS(106)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
+                                            .carry(carryTmp[i+15][105:0]), .sum(lv3add[i*2+1]));
+            assign lv3add[i*2] = {carryTmp[i+15][104:0], 1'b0};
+        end
+    endgenerate
+
+
+    // Level 4: 8 rows -> 4, using 4:2 compressors.  Only carry bits [104:0]
+    // are kept; the two upper carry bits of add4comp2 are discarded.
+    generate
+        for(i=0; i<2; i=i+1) begin
+            add4comp2 #(.BITS(106)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
+                                            .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
+            assign lv4add[i*2] = {carryTmp[i+19][104:0], 1'b0};
+        end
+    endgenerate
+
+    // Final 4:2 stage: the sum vector becomes s, the shifted carry becomes r.
+    add4comp2 #(.BITS(106)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
+                                    .carry(carryTmp[21]), .sum(s));
+    assign r = {carryTmp[21][104:0], 1'b0};
+
+	// Behavioral reference kept for comparison (inactive):
+	// assign r = 106'b0;
+	// assign s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm};
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/booth.sv
+++ b/wally-pipelined/src/fpu/FMA/booth.sv
@ -0,0 +1,55 @@
+module booth(xExt, choose, add1, e, pp); 
+/////////////////////////////////////////////////////////////////////////////
+// Booth recoder for one partial-product row.
+//
+// The 3-bit window "choose" selects one of 0, +x, +2x, -x, -2x.  Negative
+// multiples are produced as the bitwise inverse of x (shifted for -2x) in pp,
+// and add1 carries the value that must be added at the row's LSB to complete
+// the two's-complement negation (1 for -x and for the all-ones "-0" row,
+// 2 for -2x).  e is the top bit of the operand used for the row.
+/////////////////////////////////////////////////////////////////////////////
+    
+	input 		[53:0]		xExt;				// multiplicand	xExt
+	input		[2:0]		choose;				// bits needed to choose which encoding
+	output		[1:0]       	add1;				// value to add at the row LSB
+    output                  e;					// top bit of the selected operand
+	output		[54:0]		pp;				//	the resultant encoding
+    
+    logic [54:0] pp;
+    logic e;
+    logic [1:0] add1;
+    logic [53:0] negx;							// bitwise inverse of xExt
+
+    assign negx = ~xExt;
+
+    // One decode for all three outputs.  Every 2-state selector value is
+    // listed; the default only fires for X/Z selectors and propagates X
+    // instead of silently holding the previous value.
+    always_comb
+    case (choose)
+        3'b000 : begin pp = 55'b0;               e = 1'b0;      add1 = 2'b00; end  //  0
+        3'b001,
+        3'b010 : begin pp = {xExt[53], xExt};    e = xExt[53];  add1 = 2'b00; end  //  1
+        3'b011 : begin pp = {xExt, 1'b0};        e = xExt[53];  add1 = 2'b00; end  //  2
+        3'b100 : begin pp = {negx, 1'b0};        e = negx[53];  add1 = 2'b10; end  // -2
+        3'b101,
+        3'b110 : begin pp = {negx[53], negx};    e = negx[53];  add1 = 2'b01; end  // -1
+        3'b111 : begin pp = {55{1'b1}};          e = 1'b1;      add1 = 2'b01; end  // -0
+        default: begin pp = 'x;                  e = 1'bx;      add1 = 2'bxx; end
+    endcase
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/bypass.v
+++ b/wally-pipelined/src/fpu/FMA/bypass.v
@ -0,0 +1,30 @@
+/////////////////////////////////////////////////////////////////////////////
+//  
+// Block Name:	bypass.v
+// Author:		David Harris
+// Date:		11/2/1995
+//
+// Block Description:
+//   This block contains the bypass muxes which allow fast prerounded
+//   bypass to the X and Z inputs of the FMAC
+//
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+module bypass(
+	input  [63:0] xrf,        // X operand read from the register file
+	input  [63:0] zrf,        // Z operand read from the register file
+	input  [63:0] wbypass,    // prerounded result offered for bypass
+	input  [1:0]  bypsel,     // bit 0 bypasses into X, bit 1 bypasses into Z
+	output [63:0] x,          // X operand after the bypass mux
+	output [63:0] z           // Z operand after the bypass mux
+);
+/////////////////////////////////////////////////////////////////////////////
+// Two independent 2:1 muxes: each operand takes the bypassed result when its
+// select bit is set and the register-file value otherwise.
+/////////////////////////////////////////////////////////////////////////////
+
+	assign z = bypsel[1] ? wbypass : zrf;
+	assign x = bypsel[0] ? wbypass : xrf;
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/compressors.sv
+++ b/wally-pipelined/src/fpu/FMA/compressors.sv
@ -0,0 +1,90 @@
+module add3comp2 #(parameter BITS = 4) (
+	input  [BITS-1:0] a,        // first operand row
+	input  [BITS-1:0] b,        // second operand row
+	input  [BITS-1:0] c,        // third operand row
+	output [BITS-1:0] carry,    // per-bit carry outputs (not shifted here)
+	output [BITS-1:0] sum       // per-bit sum outputs
+);
+/////////////////////////////////////////////////////////////////////////////
+// BITS-wide 3:2 compressor: one independent sng3comp2 cell per bit position.
+// No carry ripples between cells; the caller is expected to shift the carry
+// vector itself.
+/////////////////////////////////////////////////////////////////////////////
+
+    genvar k;
+
+    generate
+        for (k = 0; k < BITS; k = k + 1) begin : slice
+            sng3comp2 cell3(.a(a[k]), .b(b[k]), .c(c[k]),
+                            .carry(carry[k]), .sum(sum[k]));
+        end
+    endgenerate
+
+endmodule
+
+module add4comp2(a, b, c, d, carry, sum); 
+/////////////////////////////////////////////////////////////////////////////
+// BITS-wide 4:2 compressor built from a chain of sng4comp2 cells.
+//
+// Each cell's cout feeds the cin of the next-higher cell.  carry[i] has the
+// weight of bit i+1, so with the full carry vector
+//     a + b + c + d == sum + 2*carry
+// The two top-cell carries (carryTmp and cout[BITS-1]) both have weight
+// 2^BITS; they are combined with a half adder into carry[BITS-1] (their XOR,
+// weight 2^BITS) and carry[BITS] (their AND, weight 2^(BITS+1)).
+//
+// Requires BITS >= 2 (cout[BITS-2] is referenced).
+/////////////////////////////////////////////////////////////////////////////
+    
+    parameter BITS = 4;
+	input 		[BITS-1:0]		a;
+	input		[BITS-1:0]		b;
+	input		[BITS-1:0]    	c;
+	input		[BITS-1:0]    	d;
+    output      [BITS:0]      carry;
+	output		[BITS-1:0]		sum;
+
+    logic       [BITS-1:0]      cout;       // inter-cell carry chain
+    logic                       carryTmp;   // carry output of the top cell
+    genvar i;
+
+
+    // Bottom cell: no carry comes in
+    sng4comp2 add0(a[0], b[0], c[0], d[0], 1'b0, cout[0], carry[0], sum[0]);
+
+    // Middle cells
+    generate
+        for(i= 1; i<BITS-1; i=i+1) begin
+            sng4comp2 add1(a[i], b[i], c[i], d[i], cout[i-1], cout[i], carry[i], sum[i]);
+        end
+    endgenerate
+
+
+    // Top cell: its carry is merged with the final chain carry below
+    sng4comp2 add2(a[BITS-1], b[BITS-1], c[BITS-1], d[BITS-1], cout[BITS-2], cout[BITS-1], carryTmp, sum[BITS-1]);
+
+    // Half adder on the two weight-2^BITS carries.  (These two assignments
+    // were previously swapped, which put the AND in the lower position.)
+    assign carry[BITS-1] = carryTmp ^ cout[BITS-1];
+    assign carry[BITS] = carryTmp & cout[BITS-1];
+
+endmodule
+
+module sng3comp2(
+	input  a,        // first input bit
+	input  b,        // second input bit
+	input  c,        // third input bit
+	output carry,    // majority of a, b, c
+	output sum       // parity of a, b, c
+);
+/////////////////////////////////////////////////////////////////////////////
+// Single-bit 3:2 compressor (full adder) in mux form: when a and b differ
+// the carry follows c, otherwise it follows a.
+/////////////////////////////////////////////////////////////////////////////
+
+    wire differ = a ^ b;
+
+    assign carry = differ ? c : a;
+    assign sum   = differ ^ c;
+
+endmodule
+
+module sng4comp2(
+	input  a,        // first input bit
+	input  b,        // second input bit
+	input  c,        // third input bit
+	input  d,        // fourth input bit
+	input  cin,      // carry in from the neighbouring lower cell
+	output cout,     // carry out to the neighbouring higher cell
+	output carry,    // carry output of the second stage
+	output sum       // sum output of the second stage
+);
+/////////////////////////////////////////////////////////////////////////////
+// Single-bit 4:2 compressor made of two cascaded sng3comp2 cells: the first
+// compresses a, b, c (its carry becomes cout), the second compresses the
+// intermediate sum with d and cin.
+/////////////////////////////////////////////////////////////////////////////
+
+    logic stage1sum;
+
+    sng3comp2 stage1(.a(a),         .b(b), .c(c),   .carry(cout),  .sum(stage1sum));
+    sng3comp2 stage2(.a(stage1sum), .b(d), .c(cin), .carry(carry), .sum(sum));
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/expgen.v
+++ b/wally-pipelined/src/fpu/FMA/expgen.v
@ -0,0 +1,135 @@
+/////////////////////////////////////////////////////////////////////////////// 
+// Block Name:	expgen.v
+// Author:		David Harris
+// Date:		11/2/1995
+//
+//   Block Description:
+//   This block implements the exponent path of the FMAC. It performs the
+//   following operations:
+//
+//   1) Compute exponent of multiply.  
+//   2) Compare multiply and add exponents to generate alignment shift count
+//   3) Adjust exponent based on normalization
+//   4)  Increment exponent based on postrounding renormalization
+//
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+// Exponent datapath.  Computes the product exponent ae, the alignment shift
+// count for the addend, the normalized result exponent (in +0 and +1 forms),
+// the out-of-range indications, and the final exponent field w, which is
+// replaced by a special value when specialsel is asserted.
+module expgen(x[62:52], y[62:52], z[62:52],
+			   earlyres[62:52], earlyressel, bypsel[1], byppostnorm, 
+			   killprod,  sumzero, postnormalize, normcnt, infinity, 
+			   invalid, overflow, underflow, inf, 
+			   nan, xnan, ynan, znan, zdenorm, specialsel, 
+			   aligncnt, w[62:52], wbypass[62:52],
+			   prodof, sumof, sumuf, denorm0, ae[12:0]);
+/////////////////////////////////////////////////////////////////////////////
+  
+	input     	[62:52]    	x;           	// Exponent of multiplicand x
+	input     	[62:52]  	y;         		// Exponent of multiplicand y
+	input     	[62:52]  	z;           	// Exponent of addend z
+	input     	[62:52]	 	earlyres;  		// Result from other FPU block
+	input     				earlyressel;    // Select result from other block
+	input     	[1:1] 		bypsel;         // Bypass X or Z
+	input     				byppostnorm;    // Postnormalize bypassed result
+	input     				killprod;    	// Z >> product
+	input     				sumzero;     	// sum exactly equals zero 
+	input     				postnormalize;  // postnormalize rounded result
+	input     	[8:0]  		normcnt;     	// normalization shift count 
+	input     				infinity;    	// generate infinity on overflow 
+	input     				invalid;     	// Result invalid
+	input     				overflow;    	// Result overflowed
+	input     				underflow;   	// Result underflowed 
+	input     				inf;			// Some input is infinity
+	input     				nan;			// Some input is NaN
+	input     				xnan;			// X is NaN
+	input     				ynan;			// Y is NaN
+	input     				znan;			// Z is NaN 
+	input     				zdenorm;		// Z is denorm
+	input     				specialsel;  	// Select special result
+	output		[11:0]   	aligncnt;       // shift count for alignment shifter
+	output		[62:52]     w;           	// Exponent of result
+	output		[62:52]     wbypass;     	// Prerounded exponent for bypass 
+	output					prodof;         // X*Y exponent out of bounds 
+	output					sumof;          // X*Y+Z exponent out of bounds 
+	output					sumuf;         // X*Y+Z exponent underflows 
+	output					denorm0;     	// exponent = 0 for denorm 
+	output		[12:0]		ae;				//exponent of multiply
+
+	//   Internal nodes
+
+	wire 	[12:0]			aetmp;				// Exponent of Multiply (declared but not used below)
+	wire 	[12:0]			aligncnt0;		// Shift count for alignment
+	wire 	[12:0]			aligncnt1;		// Shift count for alignment, plus one
+	wire 	[12:0]			be;				// Exponent selected for the sum: z when killprod, else ae
+	wire 	[12:0]			de0;			// Normalized exponent
+	wire 	[12:0]			de1;			// Normalized exponent, plus one
+	wire 	[12:0]			de;				// Normalized exponent selected by postnormalize
+	wire 	[10:0]			infinityres;	// Infinity or max number
+	wire 	[10:0]			nanres;          //	Nan propagated or generated
+	wire 	[10:0]			specialres;  //	Exceptional case result
+
+	//   Compute exponent of multiply
+	// Note that the exponent does not have to be incremented on a postrounding
+	//   normalization of X because the mantissa was already increased.   Report
+	//   if exponent is out of bounds 
+	// 1023 is subtracted once from the sum of the two exponent fields; the
+	// 13-bit result treats bit 12 as the sign.
+	assign ae = x + y  - 1023; 
+
+	// Product exponent above 2046 (and not negative), unless the product is killed
+	assign prodof = (ae > 2046 && ~ae[12] && ~killprod);
+
+	// Compute alignment shift count
+	// Adjust for postrounding normalization of Z.
+	// This should not increase the critical path because the time to
+	// check if a round overflows is shorter than the actual round and
+	// is masked by the bypass mux and two 10 bit adder delays.
+	// Note: only the low 11 bits of ae take part, and the 13-bit counts are
+	// truncated to the 12-bit aligncnt output.
+
+	assign aligncnt0 = z - ae[10:0] + 13'b0;
+	assign aligncnt1 = z - ae[10:0] + 13'b1;
+	assign aligncnt = bypsel[1] && byppostnorm ? aligncnt1 : aligncnt0;
+
+	// Select exponent (usually from product except in case of huge addend)
+
+	assign be = killprod ? z : ae;
+
+	// Adjust exponent based on normalization
+	// A compound adder takes care of the case of post-rounding normalization
+	// requiring an extra increment
+	 
+	assign de0 = sumzero ? 13'b0 : be + 53 - normcnt;
+	assign de1 = sumzero ? 13'b0 : be + 53 - normcnt + 13'b1;
+	 
+	// If the exponent becomes exactly zero (denormalized)
+	// signal such to adjust R bit before rounding
+
+	assign denorm0 = (de0 == 0);
+	
+	// check for exponent out of bounds after add 
+	
+	assign de = postnormalize ? de1 : de0;
+	assign sumof = de > 2046 && ~de[12];
+	assign sumuf = (de == 0 || de[12])  && ~sumzero && ~zdenorm;//KEP ~zdenorm to prevent underflow flag
+
+	// bypass occurs before rounding or taking early results 
+	
+	assign wbypass = de0[10:0];
+	
+	// In a non-critical special mux, we combine the early result from other
+	// FPU blocks with the results of exceptional conditions.  Overflow
+	// produces either infinity or the largest finite number, depending on the
+	// rounding mode.  NaNs are propagated or generated.
+	// Priority, highest first: early result, invalid, overflow, inf, underflow.
+	// If none applies the value is X.
+
+	assign specialres = earlyressel ? earlyres :
+					invalid ? nanres :
+					overflow ? infinityres : 
+					inf ? 11'b11111111111 :
+					underflow ? 11'b0 : 11'bx;
+
+	// All-ones exponent for infinity, otherwise the largest finite exponent
+	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;
+
+	// Take the exponent of the first NaN operand (x, then y, then z);
+	// all ones when no operand is a NaN.
+	assign nanres = xnan ? x : (ynan ? y : (znan? z : 11'b11111111111));
+
+	// A mux selects the early result from other FPU blocks or the 
+	// normalized FMAC result.   Special cases are also detected. 
+	// Note: de is 13 bits wide; only its low 11 bits fit in w.
+	
+	assign w = specialsel ? specialres[10:0] : de; 
+endmodule
+
--- a/wally-pipelined/src/fpu/FMA/flag.v
+++ b/wally-pipelined/src/fpu/FMA/flag.v
@ -0,0 +1,85 @@
+/////////////////////////////////////////////////////////////////////////////// 
+// Block Name:	flag.v
+// Author:		David Harris
+// Date:		12/6/1995
+//
+// Block Description:
+//       This block generates the flags: invalid, overflow, underflow, inexact. 
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+module flag(
+	input        xnan,         // X is NaN
+	input        ynan,         // Y is NaN
+	input        znan,         // Z is NaN
+	input        xinf,         // X is Inf
+	input        yinf,         // Y is Inf
+	input        zinf,         // Z is Inf
+	input        prodof,       // X*Y overflows exponent
+	input        sumof,        // X*Y + Z overflows exponent
+	input        sumuf,        // X*Y + Z underflows exponent
+	input        psign,        // sign of product
+	input        zsign,        // sign of Z
+	input        xzero,        // X = 0
+	input        yzero,        // Y = 0
+	input  [1:0] v,            // R and S bits of result
+	output       inf,          // some source is Inf
+	output       nan,          // some source is NaN
+	output       invalid,      // result is invalid
+	output       overflow,     // result overflowed
+	output       underflow,    // result underflowed
+	output       inexact       // result is not an exact number
+);
+/////////////////////////////////////////////////////////////////////////////
+// Purely combinational flag generation from the operand classifications,
+// the exponent range checks, and the rounding bits.
+/////////////////////////////////////////////////////////////////////////////
+
+	// Operand summaries: any NaN / any infinity among the three sources.
+	// (Inf - Inf and 0 * Inf also set inf, but invalid takes precedence.)
+	assign nan = xnan | ynan | znan;
+	assign inf = xinf | yinf | zinf;
+
+	// Range overflows only count when the contributing operands are not NaN
+	wire prodinf = prodof & ~xnan & ~ynan;             // X*Y larger than max possible
+	wire suminf  = sumof  & ~xnan & ~ynan & ~znan;     // X*Y+Z larger than max possible
+
+	// Invalid is raised for:
+	//   1) Inf - Inf : infinite (or overflowed) product and infinite Z of opposite sign
+	//   2) 0 * Inf
+	//   3) any NaN source (not part of the IEEE spec, only 486 proj)
+	wire infminusinf  = (xinf | yinf | prodinf) & zinf & (psign ^ zsign);
+	wire zerotimesinf = (xzero & yinf) | (yzero & xinf);
+	assign invalid = infminusinf | zerotimesinf | nan;
+
+	// Overflow: the sum exponent is out of range and no source is infinite
+	assign overflow = suminf & ~inf;
+
+	// Underflow: the sum exponent underflowed and there is no infinity,
+	// product overflow, or NaN involved
+	assign underflow = sumuf & ~inf & ~prodinf & ~nan;
+
+	// Inexact: R or S is set, or the sum overflowed, with no Inf/NaN source
+	assign inexact = ((|v) | suminf) & ~(inf | nan);
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/fmac.v
+++ b/wally-pipelined/src/fpu/FMA/fmac.v
@ -0,0 +1,130 @@
+ ////////////////////////////////////////////////////////////////////////////////
+// Block Name:	fmac.v
+// Author:		David Harris
+// Date:		11/2/1995
+//
+// Block Description:
+//   This is the top level block of a floating-point  multiply/accumulate
+//   unit(FMAC).   It instantiates the following sub-blocks:
+//
+//    array     Booth encoding, partial product generation, product summation
+//    expgen    Exponent summation, compare, and adjust
+//    align     Alignment shifter
+//    add       Carry-save adder for accumulate, carry propagate adder
+//    lop       Leading one predictor to control normalization shifter
+//    normalize Normalization shifter
+//    round     Rounding of result
+//    exception Handles exceptional cases
+//    bypass    Handles bypass of result to X or Z inputs
+//    sign      One bit sign handling block 
+//    special   Catch special cases (inputs = 0  / infinity /  etc.) 
+//
+//   The FMAC computes W=X*Y+Z, rounded with the mode specified by
+//   RN, RZ, RM, or RP.  The result is optionally bypassed back to
+//   the X or Z inputs for use on the next cycle.  In addition, four flags
+//   are produced: invalid, overflow, underflow, and inexact.  Invalid is
+//   raised for Inf - Inf, 0 * Inf, or any NaN source (see flag.v);
+//   the other three signals are IEEE flags.
+//
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+module fmac(xrf, y, zrf, rn, rz, rp, rm,
+			earlyres, earlyressel, bypsel, bypplus1, byppostnorm, 
+			w, wbypass, invalid, overflow, underflow, inexact);
+/////////////////////////////////////////////////////////////////////////////
+// Top level of the FMAC: wires the fraction datapath (array, align, add,
+// lop, normalize, round, bypass), the exponent datapath (expgen), special
+// case detection (special) and the control blocks (sign, flag) together.
+//
+// Every interconnect signal is declared explicitly below so that a
+// misspelled name is reported by the tools instead of silently creating an
+// unconnected 1-bit implicit net.
+/////////////////////////////////////////////////////////////////////////////
+ 
+	input 		[63:0]		xrf;			// input X from reg file
+	input		[63:0]		y;				// input Y  
+	input 		[63:0]		zrf;          	// input Z from reg file 
+	input 			 		rn;          	// Round to Nearest
+	input 					rz;           	// Round toward zero
+	input 					rm;          	// Round toward minus infinity
+	input 					rp;          	// Round toward plus infinity
+	input 		[63:0]		earlyres;    	// Early result from other FP logic
+	input 					earlyressel;	// Select early result, not W 
+	input 		[1:0]		bypsel;     	// Select W bypass to X, or z 
+	input 					bypplus1;    	// Add one in bypass
+	input 					byppostnorm;	// postnormalize in bypass
+	output 		[63:0]		w;           	// output W=X*Y+Z
+	output 		[63:0]		wbypass;     	// prerounded output W=X*Y+Z for bypass
+	output 					invalid;    	// Result is invalid 
+	output					overflow;		// Result overflowed 
+	output					underflow;   	// Result underflowed
+	output 					inexact;     	// Result is not an exact number 
+
+// Internal nodes
+ 
+	wire 		[63:0]		x;				// input X after bypass mux
+	wire 		[63:0]		z; 				// input Z after bypass mux
+	wire 		[105:0]		r; 				// one result of partial product sum
+	wire 		[105:0]		s; 				// other result of partial products
+	wire 		[157:0]		t;				// output of alignment shifter
+	wire 		[157:0]		sum;			// output of carry prop adder
+	wire 		[53:0]		v; 				// normalized sum, R, S bits
+	wire 		[11:0]		aligncnt; 		// shift count for alignment
+	wire 		[8:0]		normcnt; 		// shift count for normalizer
+	wire 		[12:0]		ae; 			// multiplier exponent
+	wire 					bs;				// sticky bit of addend
+	wire 					ps;				// sticky bit of product
+	wire 					killprod; 		// Z >> product
+	wire 					negsum; 		// negate sum
+	wire 					invz; 			// invert addend
+	wire 					selsum1; 		// select +1 mode of sum
+	wire 					negsum0; 		// sum +0 < 0
+	wire 					negsum1; 		// sum +1 < 0
+	wire 					sumzero; 		// sum = 0
+	wire 					infinity; 		// generate infinity on overflow
+	wire 					prodof; 		// X*Y out of range
+	wire 					sumof;			// result out of range
+
+	// Single-bit interconnect that was previously left to implicit nets
+	wire 					sumuf;			// X*Y+Z exponent underflows
+	wire 					denorm0;		// exponent = 0 for denorm
+	wire 					postnormalize;	// postnormalize rounded result
+	wire 					specialsel;		// select special result
+	wire 					psign;			// sign of product
+	wire 					inf;			// some source is Inf
+	wire 					nan;			// some source is NaN
+	wire 					xnan;			// X is NaN
+	wire 					ynan;			// Y is NaN
+	wire 					znan;			// Z is NaN
+	wire 					xinf;			// X is Inf
+	wire 					yinf;			// Y is Inf
+	wire 					zinf;			// Z is Inf
+	wire 					xzero;			// X = 0
+	wire 					yzero;			// Y = 0
+	wire 					zzero;			// Z = 0
+	wire 					xdenorm;		// X is denormalized
+	wire 					ydenorm;		// Y is denormalized
+	wire 					zdenorm;		// Z is denormalized
+	wire 					proddenorm;		// driven by special; consumed by align and add
+
+//   Instantiate fraction datapath
+
+	array			array(x[51:0], y[51:0], xdenorm, ydenorm, r[105:0], s[105:0],
+						  bypsel[0], bypplus1);
+	align			align(z[51:0], ae, aligncnt, xzero, yzero,  zzero, zdenorm, proddenorm,
+					      t[157:0], bs, ps, killprod, 
+						  bypsel[1], bypplus1, byppostnorm);
+	add				add(r[105:0], s[105:0], t[157:0], sum[157:0],
+					    negsum, invz, selsum1, killprod, negsum0, negsum1, proddenorm);
+	lop				lop(sum, normcnt, sumzero);
+	normalize		normalize(sum[157:0], normcnt, sumzero, bs, ps, denorm0, zdenorm,
+							  v[53:0]); 
+	round			round(v[53:0], earlyres[51:0], earlyressel, rz, rn, rp, rm, w[63],
+						  invalid, overflow,  underflow, inf, nan, xnan, ynan, znan,
+						  x[51:0], y[51:0],  z[51:0],
+						  w[51:0], postnormalize, infinity, specialsel);
+	bypass			bypass(xrf[63:0], zrf[63:0], wbypass[63:0], bypsel[1:0],
+						   x[63:0], z[63:0]); 
+
+// Instantiate exponent datapath
+
+	expgen			expgen(x[62:52], y[62:52], z[62:52],
+						   earlyres[62:52], earlyressel, bypsel[1], byppostnorm,
+						   killprod, sumzero, postnormalize, normcnt, 
+						   infinity, invalid, overflow, underflow, 
+						   inf, nan, xnan, ynan, znan, zdenorm, specialsel,
+						   aligncnt, w[62:52], wbypass[62:52],
+						   prodof, sumof, sumuf, denorm0, ae);
+// Instantiate special case detection across datapath & exponent path 
+
+	special			special(x[63:0], y[63:0], z[63:0], ae, xzero, yzero, zzero,
+							xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm,
+							xinf, yinf, zinf);
+
+// Produce W for bypass
+// (the exponent field wbypass[62:52] is driven by expgen above)
+
+assign wbypass[51:0] = v[53:2];
+assign wbypass[63] = w[63];
+
+// Instantiate control logic
+ 
+sign				sign(x[63], y[63], z[63], negsum0, negsum1, bs, ps, 
+					     killprod, rm, sumzero, nan, invalid, xinf, yinf, inf, 
+						 w[63], invz, negsum, selsum1, psign); 
+flag				flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
+						 psign, z[63], xzero, yzero, v[1:0],
+						 inf, nan, invalid, overflow, underflow, inexact); 
+
+endmodule
+
--- a/wally-pipelined/src/fpu/FMA/lop.v
+++ b/wally-pipelined/src/fpu/FMA/lop.v
@ -0,0 +1,41 @@
+/////////////////////////////////////////////////////////////////////////////// 
+// Block Name:	lop.v
+// Author:		David Harris
+// Date:		11/2/1995
+//
+// Block Description:
+//   This block implements a Leading One Predictor used to determine 
+//   the normalization shift count. 
+///////////////////////////////////////////////////////////////////////////////
+
+///////////////////////////////////////////////////////////////////////////// 
+module lop(sum, normcnt, sumzero); 
+/////////////////////////////////////////////////////////////////////////////
+ 
+	input     	[157:0]  	sum;            // adder result to be normalized
+	output     	[8:0]		normcnt;		// count of zero bits above the leading one
+	output     		  		sumzero;		// high when no bit of sum is set
+
+	// Internal nodes
+
+	reg			[8:0] 		normcnt;		// written by the search below
+	reg			[8:0] 		lead;			// zeros counted so far, scanning down from bit 157
+
+	// Behavioral stand-in for a leading-one predictor.  Hardware would
+	// use a parallel-prefix carry chain working on the adder operands;
+	// this model simply walks the finished sum from the MSB downward.
+	//
+	// The count saturates at 157, so an all-zero sum and a sum whose only
+	// set bit is bit 0 both report 157; sumzero tells the two apart.
+
+	always @(sum)
+		begin
+			lead = 9'd0;
+			while ((lead < 157) && !sum[157-lead])
+				lead = lead + 1;
+			normcnt = lead;
+		end
+
+	// Zero detect: reduction NOR over the whole sum
+	assign sumzero = ~|sum;
+	
+endmodule
+
--- a/wally-pipelined/src/fpu/FMA/normalize.v
+++ b/wally-pipelined/src/fpu/FMA/normalize.v
@ -0,0 +1,63 @@
+/////////////////////////////////////////////////////////////////////////////// 
+// Block Name:	normalize.v
+// Author:		David Harris
+// Date:		11/2/1995
+//
+// Block Description:
+//   This block performs the normalization shift.  It also
+//   generates the R and S bits for rounding.  Finally, it
+//   handles the special case of a zero sum.
+//
+//   v[53:2]  is the fraction component of the prerounded result.
+//   It can be bypassed back to the X or Z inputs of the FMAC
+//   for back-to-back operations. 
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+module normalize(sum[157:0], normcnt, sumzero, bs, ps, denorm0, zdenorm, v[53:0]); 
+/////////////////////////////////////////////////////////////////////////////
+	input     	[157:0]  	sum;            // sum
+	input		[8:0] 		normcnt;     	// normalization shift count
+	input					sumzero;		// sum is zero
+	input					bs;				// sticky bit for addend
+	input					ps;				// sticky bit for product
+	input					denorm0;		// exponent = -1023
+	input                  	zdenorm;        // Input Z is denormalized
+	output		[53:0]		v;				// normalized sum, R, S bits
+
+	// Internal nodes
+
+	reg       	[53:0]     	v;           	// normalized sum, R, S bits 
+	wire       	[157:0]  	sumshifted;     // sum after the left normalization shift
+
+	// When the sum is zero, normalization does not apply and only the
+	// sticky bit must be computed.  Otherwise, the sum is left-shifted
+	// by normcnt and the R and S bits (v[1] and v[0], respectively) are
+	// assigned from the bits just below the 52-bit fraction v[53:2].
+
+	// The R bit is also set on denormalized numbers where the exponent
+	// was computed to be exactly -1023 and the L bit was set.  This
+	// is required for correct rounding up of multiplication results.
+
+	// The sticky bit calculation is actually built into the shifter and
+	// does not require a true subtraction shown in the model.
+
+	// zdenorm is read in the block, so it must appear in the sensitivity
+	// list; without it v[2:1] would hold stale values in simulation when
+	// zdenorm changes while every other input stays the same.
+ 
+	always @(sum or normcnt or sumzero or bs or ps or sumshifted or denorm0 or zdenorm)
+		begin
+			if (sumzero)  begin            // special case
+				v[53:1] = 0;
+				v[0] =  ps ||  bs ;
+			end else begin                 // extract normalized bits
+				v[53:3] = sumshifted[156:106];
+				// KEP prevent plus1 in round.v when z is denormalized.
+				v[2] = sumshifted[105] || (sumshifted[106] && denorm0 && ~zdenorm); 
+				v[1] = sumshifted[104] || (sumshifted[105] && denorm0 && ~zdenorm);
+				v[0] = |(sumshifted[103:0]) || ps || bs;
+		end 
+	end
+
+
+	// shift sum left by normcnt,  filling the right with zeros 
+	assign sumshifted = sum << normcnt;
+	
+endmodule
+
--- a/wally-pipelined/src/fpu/FMA/round.v
+++ b/wally-pipelined/src/fpu/FMA/round.v
@ -0,0 +1,106 @@
+///////////////////////////////////////////////////////////////////////////// 
+// Block Name:	round.v
+// Author:		David Harris
+// Date:		11/2/1995
+//
+// Block Description: 
+//   This block is responsible for rounding the normalized result of
+//   the FMAC.   Because prenormalized results may be bypassed back to
+//   the FMAC X and Z inputs, rounding does not appear in the critical
+//   path of most floating point code.   This is good because rounding
+//   requires an entire 52 bit carry-propagate half-adder delay.
+//
+//   The results from other FPU blocks (e.g. FCVT,  FDIV,  etc)  are also 
+//   muxed in to form the actual result for register file writeback.  This
+//   saves a mux from the writeback path.
+//
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+module round(v[53:0], earlyres[51:0], earlyressel, rz, rn, rp, rm, wsign,
+			  invalid, overflow, underflow, inf, nan, xnan, ynan, znan, 
+			  x[51:0], y[51:0], z[51:0],
+			  w[51:0], postnormalize, infinity, specialsel);
+/////////////////////////////////////////////////////////////////////////////
+
+	input		[53:0]		v;				// {fraction[51:0], R, S} from normalize
+	input		[51:0]		earlyres;		// fraction supplied by another FPU block
+	input 					earlyressel; 	// pick earlyres instead of the FMA result
+	input					rz;				// mode: round toward zero (not read here)
+	input					rn;				// mode: round to nearest
+	input					rp;				// mode: round toward plus infinity
+	input					rm;				// mode: round toward minus infinity
+	input					wsign;			// sign of the result (1 = negative)
+	input 					invalid;		// invalid operation, result is a NaN
+	input					overflow;		// result overflowed
+	input					underflow;		// result underflowed
+	input					inf;			// some input is infinity
+	input					nan;			// some input is NaN
+	input					xnan;			// X is NaN
+	input					ynan;			// Y is NaN
+	input					znan;			// Z is NaN
+	input		[51:0]		x;				// fraction field of X
+	input		[51:0]		y;				// fraction field of Y
+	input		[51:0]		z;				// fraction field of Z
+	output		[51:0]		w; 				// final fraction
+	output					postnormalize; 	// rounding carried out of the fraction
+	output					infinity;    	// overflow should produce infinity
+	output					specialsel;  	// a special result overrides rounding
+
+	// Internal nodes
+
+	wire					guardbits;		// R or S set: result is inexact
+	wire					dirup;			// directed mode that grows the magnitude
+	wire					roundup;		// increment the fraction
+	wire		[52:0]		fracplus1;		// fraction + 1, including carry out
+	wire		[51:0]		roundedfrac;	// fraction after the rounding decision
+	wire		[51:0]		nanres;			// quiet NaN fraction
+	wire		[51:0]		infinityres;	// overflow fraction
+	wire		[51:0]		pick_uf;		// special mux, underflow stage
+	wire		[51:0]		pick_inf;		// special mux, infinity stage
+	wire		[51:0]		pick_of;		// special mux, overflow stage
+	wire		[51:0]		pick_inv;		// special mux, invalid stage
+	wire		[51:0]		specialres;		// selected special result
+
+	// Rounding decision.  v[2] is the fraction LSB (L), v[1] is R and
+	// v[0] is S.
+	//   nearest:        increment when R is set and either S or L is set
+	//   toward +/- inf: increment when inexact and the sign matches the
+	//                   direction (positive for rp, negative for rm)
+
+	assign guardbits = v[1] || v[0];
+	assign dirup     = (rp && ~wsign) || (rm && wsign);
+	assign roundup   = (rn && v[1] && (v[0] || v[2])) || (dirup && guardbits);
+
+	// Incremented fraction; the 53-bit target keeps the carry out
+
+	assign fracplus1   = v[53:2] + 1;
+	assign roundedfrac = roundup ? fracplus1[51:0] : v[53:2];
+
+	// An all-ones fraction that is rounded up needs a post-normalization
+
+	assign postnormalize = roundup && (&v[53:2]);
+
+	// On overflow the result is infinity (zero fraction) in the modes
+	// that round away in the direction of the sign, otherwise it is the
+	// largest finite number (all-ones fraction).
+
+	assign infinity    = rn || dirup;
+	assign infinityres = infinity ? 52'b0 : {52{1'b1}};
+
+	// NaN result: take the payload of the first NaN among X, Y, Z, or a
+	// zero payload when none is NaN, and force the top fraction bit to 1
+	// so the result is quiet (KEP 210112).
+
+	assign nanres = {1'b1, xnan ? x[50:0] :
+						   (ynan ? y[50:0] :
+						   (znan ? z[50:0] : 51'b0))};
+
+	// Special result selection, written as a cascade of 2:1 stages with
+	// earlyres taking precedence, then invalid, overflow, inf, underflow.
+	// The original author notes that the circuit implementation is a
+	// non-critical 4:1 mux that need not be priority encoded.  When no
+	// stage selects, the value is left undefined (x).
+
+	assign specialsel = earlyressel || invalid || overflow || underflow ||
+							nan || inf;
+	assign pick_uf    = underflow ? 52'b0 : 52'bx;
+	assign pick_inf   = inf ? 52'b0 : pick_uf;
+	assign pick_of    = overflow ? infinityres : pick_inf;
+	assign pick_inv   = invalid ? nanres : pick_of;
+	assign specialres = earlyressel ? earlyres : pick_inv;
+
+	// Final result: the special value wins over the rounded fraction
+
+	assign w = specialsel ? specialres : roundedfrac;
+	
+endmodule
+
--- a/wally-pipelined/src/fpu/FMA/sign.v
+++ b/wally-pipelined/src/fpu/FMA/sign.v
@ -0,0 +1,93 @@
+/////////////////////////////////////////////////////////////////////////////// 
+// Block Name:	sign.v
+// Author:		David Harris
+// Date:		12/1/1995
+//
+// Block Description:
+//   This block manages the signs of the numbers.
+//   1 =  negative
+//
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
+			 sumzero, nan, invalid, xinf, yinf, inf, wsign, invz, negsum, selsum1, psign);
+/////////////////////////////////////////////////////////////////////////////
+ 
+	input					xsign;			// sign of X (1 = negative)
+	input					ysign;			// sign of Y
+	input					zsign;			// sign of Z
+	input					negsum0;		// adder +0 result is negative
+	input					negsum1;		// adder +1 result is negative
+	input					bs;				// sticky bit from the addend
+	input					ps;				// sticky bit from the product
+	input					killprod;		// product forced to zero
+	input					rm;				// round toward minus infinity
+	input					sumzero;		// sum is exactly 0
+	input					nan;			// some input is NaN (not read here)
+	input					invalid;		// result is invalid
+	input					xinf;			// X is infinity (not read here)
+	input					yinf;			// Y is infinity (not read here)
+	input					inf;			// some input is infinity
+	output					wsign;			// sign of the result W
+	output					invz;			// invert the addend going into the adder
+	output					negsum;			// negate the adder result
+	output					selsum1;		// use the +1 output of the compound adder
+	output					psign;			// sign of the product X * Y
+ 
+	// Internal nodes
+
+	reg						negsum;         // driven by the case analysis below
+	reg						selsum1;     	// driven by the case analysis below
+	wire					zerosign;    	// sign to use when the sum is 0
+	wire					sumsign;     	// sign to use for an ordinary sum
+
+	// Product sign, and effective subtraction when Z disagrees with it
+
+	assign psign = xsign ^ ysign;
+	assign invz  = zsign ^ psign;
+
+	// Choose the compound adder mode and whether the sum is negated.
+	// Cases are ordered by which sticky bit is set:
+	//   effective add       : +0 mode, never negated
+	//   addend sticky set   : +0 mode, negate per negsum0
+	//   product sticky set  : +1 mode, negate per negsum1
+	//   neither sticky set  : both outputs follow negsum1
+	//     (KEP 210113: selsum1 was originally ~negsum1, which added 1 to
+	//      values that were multiplied by 0)
+
+	always @(invz or negsum0 or negsum1 or bs or ps)
+		begin
+			if (!invz)
+				{selsum1, negsum} = 2'b00;
+			else if (bs)
+				{selsum1, negsum} = {1'b0, negsum0};
+			else if (ps)
+				{selsum1, negsum} = {1'b1, negsum1};
+			else
+				{selsum1, negsum} = {negsum1, negsum1};
+		end
+
+	// Result sign, in priority order:
+	//   invalid : forced to 0 (the sign of a generated NaN)
+	//   inf     : sign of the product (KEP 210112; the original chose among
+	//             xsign, ysign, zsign and psign using xinf and yinf)
+	//   sumzero : Z's sign for an effective add with the product killed,
+	//             otherwise 1 only in round-toward-minus mode
+	//   else    : product sign, flipped when the adder result was negated
+	//
+	// IEEE 754-2008 section 6.3 states
+	//		"When either an input or result is NaN, this standard does not interpret the sign of a NaN."
+	// and, regarding negative zero:
+	//		"When the sum/difference of two operands with opposite signs is exactly zero, the sign of that sum/difference
+	//		 shall be +0 in all rounding attributes EXCEPT roundTowardNegative. Under that attribute, the sign of an exact zero
+	//		 sum/difference shall be -0.  However, x+x = x-(-x) retains the same sign as x even when x is zero."
+	// Sign calculation is not in the critical path.
+ 
+	assign zerosign = (~invz && killprod) ? zsign : rm;
+	assign sumsign  = psign ^ negsum;
+	assign wsign    = invalid ? 1'b0 :
+					  (inf ? psign :
+					  (sumzero ? zerosign : sumsign));
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/special.v
+++ b/wally-pipelined/src/fpu/FMA/special.v
@ -0,0 +1,70 @@
+/////////////////////////////////////////////////////////////////////////////// 
+// Block Name:	special.v
+// Author:		David Harris
+// Date:		12/2/1995
+//
+// Block Description:
+//   This block implements special case handling for unusual operands (e.g. 
+//   0, NaN,  denormalized,  infinity).   The block consists of zero/one detectors.
+//
+/////////////////////////////////////////////////////////////////////////////
+
+/////////////////////////////////////////////////////////////////////////////
+module special(x[63:0], y[63:0], z[63:0], ae, xzero, yzero, zzero,
+				xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, xinf, yinf, zinf);
+/////////////////////////////////////////////////////////////////////////////
+
+	input   		[63:0]     	x;             // operand X
+	input     	[63:0]     	y;           	// operand Y
+	input      	[63:0]    	z;            	// operand Z
+	input		[12:0]			ae;			// exponent of product
+	output						xzero;			// X exponent and fraction are all 0
+	output						yzero;			// Y exponent and fraction are all 0
+	output						zzero;			// Z exponent and fraction are all 0
+	output						xnan;			// X is NaN
+	output						ynan;			// Y is NaN
+	output						znan;			// Z is NaN
+	output						xdenorm;		// X is denormalized
+	output						ydenorm;		// Y is denormalized
+	output						zdenorm;		// Z is denormalized
+	output						proddenorm;		// product flagged as denormalized
+	output						xinf;			// X is infinity
+	output						yinf;			// Y is infinity
+	output						zinf;			// Z is infinity
+
+	// Shared per-operand detectors on the exponent field [62:52] and the
+	// fraction field [51:0].  The sign bit [63] is never examined.
+
+	wire	xexpones = &x[62:52];		// exponent all ones
+	wire	yexpones = &y[62:52];
+	wire	zexpones = &z[62:52];
+	wire	xexpzero = ~(|x[62:52]);	// exponent all zeros
+	wire	yexpzero = ~(|y[62:52]);
+	wire	zexpzero = ~(|z[62:52]);
+	wire	xfracnz  = |x[51:0];		// fraction nonzero
+	wire	yfracnz  = |y[51:0];
+	wire	zfracnz  = |z[51:0];
+
+	// NaN: exponent all ones, fraction nonzero
+
+	assign xnan = xexpones && xfracnz;
+	assign ynan = yexpones && yfracnz;
+	assign znan = zexpones && zfracnz;
+
+	// Infinity: exponent all ones, fraction zero
+
+	assign xinf = xexpones && ~xfracnz;
+	assign yinf = yexpones && ~yfracnz;
+	assign zinf = zexpones && ~zfracnz;
+
+	// Denormalized: exponent zero, fraction nonzero
+
+	assign xdenorm = xexpzero && xfracnz;
+	assign ydenorm = yexpzero && yfracnz;
+	assign zdenorm = zexpzero && zfracnz;
+
+	// Zero: exponent and fraction both zero.  Denormalized inputs are
+	// NOT reported as zero (KATHERINE 21/01/11 removed the "|| denorm"
+	// term so that computing with a denormalized number does not
+	// output zero).
+
+	assign xzero = xexpzero && ~xfracnz;
+	assign yzero = yexpzero && ~yfracnz;
+	assign zzero = zexpzero && ~zfracnz;
+
+	// KEP: product is flagged denormalized when every bit of ae is 1
+	// and neither multiplicand is zero.
+
+	assign proddenorm = (&ae) && ~xzero && ~yzero;
+
+endmodule
--- a/wally-pipelined/src/fpu/FMA/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tb.v
--- a/wally-pipelined/src/fpu/FMA/tbgen/StineVectors
+++ b/wally-pipelined/src/fpu/FMA/tbgen/StineVectors
--- a/wally-pipelined/src/fpu/FMA/tbgen/ans
+++ b/wally-pipelined/src/fpu/FMA/tbgen/ans
@ -0,0 +1,199 @@
+c22000007fffffff 24700000ffffffef a6a00001800007ed
+bfc00000000011fe 3fdfffffffffff03 bfb000000000117f
+a83100000007fffe 41e0000effffffff aa21000ff0080004
+0000000000000000 001ffffffffffffe 0000000000000000
+400327ca64d70ec7 3ca0000000000001 3cb327ca64d70ec9
+0000000000000000 43e207ffffffffff 0000000000000000
+0000000000000000 3fd0000000000000 0000000000000000
+0000000000000000 3fdfffffffffffff 0000000000000000
+0000000000000000 3fe0000000000000 0000000000000000
+c870200000010000 3fefffffffffffff c87020000000ffff
+c00aaa4fd557ef13 c3b8917384eb32d0 43d478efdc9216d8
+0000000000000000 7ffc000000000000 7ff8000000000000
+0000000000000000 c18aca47203438e2 0000000000000000
+0000000000000000 4000000000000001 0000000000000000
+47efff0008000000 b1dcb0523546117f b9dcaf6cb9e07bdb
+43f000ffffff7fff 22300000001fffdf 26300100001f81de
+402ff000001fffff 40759558e27de226 40b58a8e3622388e
+0000000000000000 40efdeffffffffff 0000000000000000
+0000000000000000 434fffffffffffff 0000000000000000
+7ffc000000000000 7fe0000000000000 7ff8000000000000
+b35e061abc769f3a c078000003fffffe 33e684941119bac2
+403a793cfb1e2471 bff0000100007fff c03a793ea2b2c7eb
+3d1ffffbfe000000 216898822a24af3f 1e98987f158ae1d8
+bfb00000001bffff 7ffc000000000000 7ff8000000000000
+37f0000000efffff c3d00007fffffeff bbd0000800efff75
+0000000000000000 ffefff8000080000 0000000000000000
+3fb00200000000ff c0000000011fffff bfc00200012024fd
+41c0000007ffffff 49103fffefffffff 4ae03ffff81ffff6
+407effbfffffffff 3e00000040001fff 3e8effc07bff3dfd
+c1f00013fffffffe 7ffc000000000000 7ff8000000000000
+c3f00004000001ff c3d00bfffffffffe 47d00c04030001ff
+403b5ab30b28be12 bfdfffffffffffff c02b5ab30b28be11
+0000000000000000 c1cfffffff87ffff 0000000000000000
+0000000000000000 bfe0000000000001 0000000000000000
+801ffc000007ffff bfeffffffffffffe 001ffc000007fffe
+0000000000000000 ffe0000005fffffe 0000000000000000
+0000000000000000 bfffffffffffffff 0000000000000000
+0000000000000000 c000000000000000 0000000000000000
+c3d09308769f3f51 c00fffffffffffff 43f09308769f3f51
+0000000000000000 402ffffdfefffffe 0000000000000000
+0000000000000000 c010000000000001 0000000000000000
+c01fffffffc00fff c01ffffffffffffe 404fffffffc00ffe
+c025e14360f49046 412fff0000000003 c165e09456d988a3
+0000000000000000 43ee59a2f1155c8b 0000000000000000
+3fe0000000008fff 802ffffff7fffff6 801ffffff8011ff3
+0000000000000000 ffefffffffffffff 0000000000000000
+40401007fffffffe fff0000000000000 80401007fffffffe
+0000000000000000 c0045abb4860cbf3 0000000000000000
+0000000000000000 7ffc000000000000 7ff8000000000000
+bffffffec0000000 c000000000003eff 400ffffec0007dfe
+48000000004001ff 41f331de979ac49e 4a0331de97e78e7e
+3d0fffffbff7ffff 7ffc000000000000 7ff8000000000000
+43d3ffffff000000 3caffffffffffffe 4093fffffeffffff
+7ffc000000000000 43dfff8004000000 7ff8000000000000
+bcaffe0000000008 3fd00008000000ff bc8ffe0fff000205
+404ffbfffffffffc c34ffff8003fffff c3affbf8013ff7fb
+43e0000000000082 3db000003ffffeff 41a000003fffff82
+c1d004000ffffffe 4000000000000000 c1e004000ffffffe
+c00fffffc000007e c02ffffdfffffbff 404ffffdc000007e
+409dfffbffffffff 4010000000000001 40bdfffc00000001
+c120000003ffffe0 c06000fffbffffff 4190010000003fde
+3fd1f7ffffffffff c01000001dffffff bff1f80021b0fffd
+2e0fefdfffffffff 4030000020000040 2e4fefe03fdfc07f
+43c0000803ffffff 3fcfffffffffffff 43a0000803ffffff
+c0afffffbffffdfe 3fc07ffdffffffff c0807ffddf0002f5
+c0fffffffeffffee 55139bb9349e058c d6239bb9340127b7
+41ffdbaf18ce06bd 8010000000000000 821fdbaf18ce06bd
+c0e1000000080000 801ffffffffffffe 011100000007ffff
+3fbffffff0000007 c807dfffffffffff c7d7dffff4100004
+c357b53537b96da5 bfd0000000000000 4337b53537b96da5
+401fffffffffffff ffebff8000000000 801bff7fffffffff
+c7eff77bf2b59c3c bfe0000000000001 47dff77bf2b59c3e
+380c3f72cc3dec98 c3fffffffbffffff bc1c3f72c8b5fe3d
+b8e0000003fbffff c503f4d44f4bf888 3df3f4d454443066
+3f3ffffc001fffff c000000000000001 bf4ffffc00200000
+c340002000004000 c0db3367e0423019 442b339e47125d6b
+4f60000801ffffff 41c07fe000000000 51307fe841fffbff
+c1ffffffbfefffff c340000000000001 454fffffbff00001
+404fff7fffffff7f 48ab7e2aad4ec686 490b7dbcb4a410dd
+7ffc000000000000 ffefffffffffffff 7ff8000000000000
+41e189ea1a6fff97 7ffc000000000000 7ff8000000000000
+3ff0ee9046c9330f 8479e1e79766e02b 847b63d14ff91acb
+d2f805130a8c11df 43effffdfdfffffe d6f8051188ba9004
+4f1fffbfffe00000 bcd02000000007ff cc001fdfbfefe7fe
+be70000077ffffff c1efffffffffffff 4070000077ffffff
+41e1ffffbffffffe 3caffffffffffffe 3ea1ffffbffffffd
+3bbd976272fb1d2a c06ffff80007fffe bc3d975b0d29e641
+434fff01ffffffff 403dfeffffffffff 439dfe11e7efffff
+be6fff7fffffffff 3feffffffffffffe be6fff7ffffffffd
+41d007ff80000000 41f0fffffffc0000 43d1087f77fbfe01
+ffeef7a206029708 bdcfa4109a3a5b22 7dce9eaa2542875b
+3b6ffffffeffffc0 3c7ffffe003ffffe 37fffffdff3fffce
+c1d1ffffffbfffff bfcffffefffff800 41b1ffff6fbffb82
+2030000000000090 c05e2e90015c47a1 a09e2e90015c48b0
+bbf000000007efff 001fe0000007fffe fc1fe0000017d01c
+41cae866712069f4 c02fffffffffffff c20ae866712069f3
+bfce1e32ccf56348 3ca1f66d4c8eeef3 bc80e7fa025544da
+ffedfffff0000000 ffeffff000000800 3fedfff0f0000f80
+37effffc3ffffffe bca0fffffffffffd b4a0fffe01fffffb
+bc950a021bf9dee1 3db0001fffdffffe ba550a2c2fd402cd
+fd4fffffdfffffef 41cffffdffffffef ff2ffffde00001de
+bfc00000004007ff bcafffffffffffff 3c800000004007ff
+c009130b80fe8274 b811571307061a38 382b2cb1993b60f3
+c0600000ffffffdf 7feda1b8c591f9c6 805da1ba9fad85e2
+c1e4af3f8d45e031 3ca0020002000000 be94b1d577cd70de
+3800008100000000 b810000020000080 b020008120010280
+372ff00000003fff 7fe000fdfffffffe 771ff1fb02003fff
+47d00021fffffffe c00fffffffffffff c7f00021fffffffd
+bfbc9ea0c2b4884b 43f4a552574073d5 c3c277000b21a4e7
+bf1fe0000000ffff c01ffffffffffffe 3f4fe0000000fffe
+41ffffffff7ffffb 0027ffffffffeffe 0237ffffff9feffb
+c7e040000fffffff ffe0000000000000 07d040000fffffff
+7ffc000000000000 3fe0000ffffff7ff 7ff8000000000000
+c1effc1fffffffff 7ffc000000000000 7ff8000000000000
+c0d000000001ffbf c03ba46e644e4e9c 411ba46e6451c2ba
+c4500000005fffff c03a20ab4de47fc9 449a20ab4e8143cc
+400e00000000007e 001fffffffffffff 003e00000000007e
+45a01fffff7fffff c3c0020200000000 c9702206037fefee
+3e8ff800000000ff 3caffffffffffffe 3b4ff800000000fe
+be004000000007fe 3fdffff7ff7fffff bdf03ffbefbf07fd
+b11000007ffffe00 3fe0000000000000 b10000007ffffe00
+b80cef50bd17db40 c05fffc00000000e 387cef16de76611d
+3d4000ffffffffff 3d47f68d8eb6b9a4 3a97f80cf78fa50f
+ffe3fffffffffffb c03dc3321aaa5380 003299ff50aa742c
+3ca3fffffffffeff bf02ffafb4e9241d bbb7bf9ba2236bf3
+53598c812c3c39dd 3f20000100fffffe 52898c82c69d14b1
+c3dffffff8000001 3fe0020000003ffe c3d001fffbffbffe
+7ba00800003fffff 3ff9a9a129c791b3 7ba9b675fac31bff
+c3d0000fffffffef 7fe0000000000001 83c0000ffffffff0
+c34f80001fffffff b7fffffe0007ffff 3b5f7ffe2807ddff
+0010000000001ff8 4800020000010000 0820020000011ffc
+2c4c0000003fffff 230ffffc00400000 0f6bfffc8077fff8
+381fffffffbff7fe 8010000000000000 f83fffffffbff7fe
+802d3018ea8c241d c007fdffffffffff 0045e23fae5a7253
+43e047fffffffffe 4000003ffdfffffe 43f048411df6fffc
+c000005fffffffff 403ffffffff00002 c050005ffff7ffd0
+3fc8b60e46a80f6d bfdffffffffffffe bfb8b60e46a80f6b
+bd5fdffdffffffff 5644b72ace1bbb6b d3b4a27257daf2cd
+b80010001fffffff 40e01ffffff7fffe b8f030202037f7fc
+407000003ffbfffe 38042862fe8e3368 388428634f2ab547
+bf8ffbfff7ffffff c00fffffffffffff 3faffbfff7ffffff
+bcafc000003fffff c010000000000001 3ccfc00000400001
+47eddf042473ef08 b7e00000fe000000 bfdddf05fea850ca
+3fbfffff7fffffef c340ffffffffffbf c310ffffbbffffb5
+c02f8000000007ff ffe0000000000001 001f800000000801
+002f37ebf6c8eaec c08be464f4c81c69 80cb36000706e168
+c00e800000000000 7ffc000000000000 7ff8000000000000
+0010000000000000 0000000000000000 0000000000000000
+bfffc00000000003 391001ffffffffff b91fc3f800000001
+c1db54446247aa52 bfcc001fffffffff 41b7e9d72a43174f
+0010000000000000 c0392c59c8e48f37 80592c59c8e48f37
+0010000000000000 c0000800000001ff 80200800000001ff
+0010000000000000 c1d0000004000fff 81f0000004000fff
+4030040000200000 0017055f48beeff5 00570b20a0bf2a70
+bc7000000000ffee c1e0001100000000 3e6000110000fff0
+c040000000007fff c3b2a6c91c557f56 4402a6c91c56148c
+41ffffffff003fff c3b0000007ffffee c5c0000007801fed
+21900001dfffffff bf20000017fffffe a0c00001f80002cc
+0029954d0f0df5b3 41e00000000003ff 0219954d0f0dfc17
+b810000020000001 47ffdfffffffff80 c01fe0003fbfff81
+0010000000000000 ffeffff800007fff c00ffff800007fff
+0010000000000000 4010000000000000 0030000000000000
+bf700000000100ff 401fffffffffffff bfa00000000100fe
+37feffffffffffff 47ef8000000fffff 3ffe8400000f7fff
+b80f800001fffffe 44e00000ffff7fff bcff8001f9ff041c
+0010000000000000 434ffffffffffffe 036ffffffffffffe
+41ffffdfffff8000 7fe0000000000001 01efffdfffff8002
+b80a16ad02c87cd3 380fffffffffe7fe b02a16ad02c86940
+47f0fffffffffffb 7ffc000000000000 7ff8000000000000
+0010000000000000 41ffffffffbfff7f 021fffffffbfff7f
+0010000000000000 8000000000000000 0000000000000000
+c3d00001000001ff b7f60cb3edb38762 3bd60cb54e7ec8fe
+0010000000000000 8010000000000001 c030000000000001
+43c0007fffdfffff 801ffffffffffffe 83f0007fffdffffd
+c7efffffdffffbff bca0000000000001 449fffffdffffc01
+0010000000000000 c11ff00000000003 813ff00000000003
+0010000000000000 bfd0000000000000 ffefffffffffffff
+c0ffffffffeffffe bfdfffffffffffff 40efffffffeffffe
+6f7000000001fdff 1510010000000fff 4490010000020e1e
+37f002000000000f b1effcfffffffffe a9f0007fd000000d
+cc3050bc013d7cd7 bff0000000000000 4c3050bc013d7cd7
+0010000000000000 87fff0000000fffe c81ff0000000fffe
+0010000000000000 bffffffffffffffe 801ffffffffffffe
+43effbfffffff7ff 7fefffffff801ffe 03effbffff8027fa
+c015834380f2b995 3f9fff0000000400 bfc5829766d6b4af
+0010000000000000 41dfffffc0001000 01ffffffc0001000
+0010000000000000 c01fffffffffffff 803fffffffffffff
+41e010000000001f c5b04000000fffff c7a050400010101e
+3b40018000000000 3ea0400000000100 39f0418600000101
+0010000000000000 4cdffeffff7fffff 0cfffeffff7fffff
+16dff0001ffffffe 3fb500ae0796659d 16a4f62dc5934871
+b7e003ffffffff7f deafffffeffffffd 56a003fff7fdff7e
+406000001fffbfff 3f20020000080000 3f900200200bbff8
+0010000000000000 7ffc000000000000 7ff8000000000000
+439fbffffffbffff bf8454fd38ef0ba0 c3342c533e7aa2e8
+c1c000000200007e bf000001ffffffbf 40d000020200007e
+480000000008fffe 001637e790e69de2 082637e790f31d52
+bffffffc000003fe 3ca0000000000001 bcaffffc000003ff
+6b4848a9a8c0dcd5 480ffffffffbdfff 736848a9a8bdbb77
--- a/wally-pipelined/src/fpu/FMA/tbgen/output
+++ b/wally-pipelined/src/fpu/FMA/tbgen/output
@ -0,0 +1,199 @@
+c22000007fffffff 24700000ffffffef a6a00001800007ee
+bfc00000000011fe 3fdfffffffffff03 bfb000000000117f
+a83100000007fffe 41e0000effffffff aa21000ff0080004
+0000000000000000 001ffffffffffffe 0000000000000000
+400327ca64d70ec7 3ca0000000000001 3cb327ca64d70ec8
+0000000000000000 43e207ffffffffff 0000000000000000
+0000000000000000 3fd0000000000000 0000000000000000
+0000000000000000 3fdfffffffffffff 0000000000000000
+0000000000000000 3fe0000000000000 0000000000000000
+c870200000010000 3fefffffffffffff c87020000000ffff
+c00aaa4fd557ef13 c3b8917384eb32d0 43d478efdc9216d7
+0000000000000000 7ffc000000000000 7ffc000000000000
+0000000000000000 c18aca47203438e2 8000000000000000
+0000000000000000 4000000000000001 0000000000000000
+47efff0008000000 b1dcb0523546117f b9dcaf6cb9e07bdc
+43f000ffffff7fff 22300000001fffdf 26300100001f81de
+402ff000001fffff 40759558e27de226 40b58a8e3622388d
+0000000000000000 40efdeffffffffff 0000000000000000
+0000000000000000 434fffffffffffff 0000000000000000
+7ffc000000000000 7fe0000000000000 7ffc000000000000
+b35e061abc769f3a c078000003fffffe 33e684941119bac1
+403a793cfb1e2471 bff0000100007fff c03a793ea2b2c7eb
+3d1ffffbfe000000 216898822a24af3f 1e98987f158ae1d8
+bfb00000001bffff 7ffc000000000000 7ffc000000000000
+37f0000000efffff c3d00007fffffeff bbd0000800efff76
+0000000000000000 ffefff8000080000 8000000000000000
+3fb00200000000ff c0000000011fffff bfc00200012024fe
+41c0000007ffffff 49103fffefffffff 4ae03ffff81ffff6
+407effbfffffffff 3e00000040001fff 3e8effc07bff3dfd
+c1f00013fffffffe 7ffc000000000000 7ffc000000000000
+c3f00004000001ff c3d00bfffffffffe 47d00c04030001fe
+403b5ab30b28be12 bfdfffffffffffff c02b5ab30b28be11
+0000000000000000 c1cfffffff87ffff 8000000000000000
+0000000000000000 bfe0000000000001 8000000000000000
+801ffc000007ffff bfeffffffffffffe 001ffc000007fffd
+0000000000000000 ffe0000005fffffe 8000000000000000
+0000000000000000 bfffffffffffffff 8000000000000000
+0000000000000000 c000000000000000 8000000000000000
+c3d09308769f3f51 c00fffffffffffff 43f09308769f3f50
+0000000000000000 402ffffdfefffffe 0000000000000000
+0000000000000000 c010000000000001 8000000000000000
+c01fffffffc00fff c01ffffffffffffe 404fffffffc00ffd
+c025e14360f49046 412fff0000000003 c165e09456d988a4
+0000000000000000 43ee59a2f1155c8b 0000000000000000
+3fe0000000008fff 802ffffff7fffff6 801ffffff8011ff4
+0000000000000000 ffefffffffffffff 8000000000000000
+40401007fffffffe fff0000000000000 fff0000000000000
+0000000000000000 c0045abb4860cbf3 8000000000000000
+0000000000000000 7ffc000000000000 7ffc000000000000
+bffffffec0000000 c000000000003eff 400ffffec0007dfe
+48000000004001ff 41f331de979ac49e 4a0331de97e78e7d
+3d0fffffbff7ffff 7ffc000000000000 7ffc000000000000
+43d3ffffff000000 3caffffffffffffe 4093fffffeffffff
+7ffc000000000000 43dfff8004000000 7ffc000000000000
+bcaffe0000000008 3fd00008000000ff bc8ffe0fff000206
+404ffbfffffffffc c34ffff8003fffff c3affbf8013ff7fb
+43e0000000000082 3db000003ffffeff 41a000003fffff81
+c1d004000ffffffe 4000000000000000 c1e004000ffffffe
+c00fffffc000007e c02ffffdfffffbff 404ffffdc000007d
+409dfffbffffffff 4010000000000001 40bdfffc00000001
+c120000003ffffe0 c06000fffbffffff 4190010000003fde
+3fd1f7ffffffffff c01000001dffffff bff1f80021b0fffe
+2e0fefdfffffffff 4030000020000040 2e4fefe03fdfc07f
+43c0000803ffffff 3fcfffffffffffff 43a0000803fffffe
+c0afffffbffffdfe 3fc07ffdffffffff c0807ffddf0002f6
+c0fffffffeffffee 55139bb9349e058c d6239bb9340127b7
+41ffdbaf18ce06bd 8010000000000000 821fdbaf18ce06bd
+c0e1000000080000 801ffffffffffffe 011100000007ffff
+3fbffffff0000007 c807dfffffffffff c7d7dffff4100004
+c357b53537b96da5 bfd0000000000000 4337b53537b96da5
+401fffffffffffff ffebff8000000000 fff0000000000000
+c7eff77bf2b59c3c bfe0000000000001 47dff77bf2b59c3e
+380c3f72cc3dec98 c3fffffffbffffff bc1c3f72c8b5fe3e
+b8e0000003fbffff c503f4d44f4bf888 3df3f4d454443065
+3f3ffffc001fffff c000000000000001 bf4ffffc00200001
+c340002000004000 c0db3367e0423019 442b339e47125d6b
+4f60000801ffffff 41c07fe000000000 51307fe841fffbff
+c1ffffffbfefffff c340000000000001 454fffffbff00001
+404fff7fffffff7f 48ab7e2aad4ec686 490b7dbcb4a410dc
+7ffc000000000000 ffefffffffffffff 7ffc000000000000
+41e189ea1a6fff97 7ffc000000000000 7ffc000000000000
+3ff0ee9046c9330f 8479e1e79766e02b 847b63d14ff91acb
+d2f805130a8c11df 43effffdfdfffffe d6f8051188ba9004
+4f1fffbfffe00000 bcd02000000007ff cc001fdfbfefe7ff
+be70000077ffffff c1efffffffffffff 4070000077fffffe
+41e1ffffbffffffe 3caffffffffffffe 3ea1ffffbffffffd
+3bbd976272fb1d2a c06ffff80007fffe bc3d975b0d29e642
+434fff01ffffffff 403dfeffffffffff 439dfe11e7effffe
+be6fff7fffffffff 3feffffffffffffe be6fff7ffffffffd
+41d007ff80000000 41f0fffffffc0000 43d1087f77fbfe00
+ffeef7a206029708 bdcfa4109a3a5b22 7dce9eaa2542875b
+3b6ffffffeffffc0 3c7ffffe003ffffe 37fffffdff3fffce
+c1d1ffffffbfffff bfcffffefffff800 41b1ffff6fbffb81
+2030000000000090 c05e2e90015c47a1 a09e2e90015c48b1
+bbf000000007efff 001fe0000007fffe 8000000000000000
+41cae866712069f4 c02fffffffffffff c20ae866712069f3
+bfce1e32ccf56348 3ca1f66d4c8eeef3 bc80e7fa025544db
+ffedfffff0000000 ffeffff000000800 7ff0000000000000
+37effffc3ffffffe bca0fffffffffffd b4a0fffe01fffffc
+bc950a021bf9dee1 3db0001fffdffffe ba550a2c2fd402ce
+fd4fffffdfffffef 41cffffdffffffef ff2ffffde00001de
+bfc00000004007ff bcafffffffffffff 3c800000004007fe
+c009130b80fe8274 b811571307061a38 382b2cb1993b60f2
+c0600000ffffffdf 7feda1b8c591f9c6 fff0000000000000
+c1e4af3f8d45e031 3ca0020002000000 be94b1d577cd70df
+3800008100000000 b810000020000080 b020008120010280
+372ff00000003fff 7fe000fdfffffffe 771ff1fb02003fff
+47d00021fffffffe c00fffffffffffff c7f00021fffffffd
+bfbc9ea0c2b4884b 43f4a552574073d5 c3c277000b21a4e8
+bf1fe0000000ffff c01ffffffffffffe 3f4fe0000000fffd
+41ffffffff7ffffb 0027ffffffffeffe 0237ffffff9feffa
+c7e040000fffffff ffe0000000000000 7ff0000000000000
+7ffc000000000000 3fe0000ffffff7ff 7ffc000000000000
+c1effc1fffffffff 7ffc000000000000 7ffc000000000000
+c0d000000001ffbf c03ba46e644e4e9c 411ba46e6451c2ba
+c4500000005fffff c03a20ab4de47fc9 449a20ab4e8143cb
+400e00000000007e 001fffffffffffff 003e00000000007d
+45a01fffff7fffff c3c0020200000000 c9702206037fefef
+3e8ff800000000ff 3caffffffffffffe 3b4ff800000000fd
+be004000000007fe 3fdffff7ff7fffff bdf03ffbefbf07fd
+b11000007ffffe00 3fe0000000000000 b10000007ffffe00
+b80cef50bd17db40 c05fffc00000000e 387cef16de76611d
+3d4000ffffffffff 3d47f68d8eb6b9a4 3a97f80cf78fa50e
+ffe3fffffffffffb c03dc3321aaa5380 7ff0000000000000
+3ca3fffffffffeff bf02ffafb4e9241d bbb7bf9ba2236bf3
+53598c812c3c39dd 3f20000100fffffe 52898c82c69d14b0
+c3dffffff8000001 3fe0020000003ffe c3d001fffbffbfff
+7ba00800003fffff 3ff9a9a129c791b3 7ba9b675fac31bff
+c3d0000fffffffef 7fe0000000000001 fff0000000000000
+c34f80001fffffff b7fffffe0007ffff 3b5f7ffe2807ddfe
+0010000000001ff8 4800020000010000 0820020000011ffc
+2c4c0000003fffff 230ffffc00400000 0f6bfffc8077fff7
+381fffffffbff7fe 8010000000000000 8000000000000000
+802d3018ea8c241d c007fdffffffffff 0045e23fae5a7253
+43e047fffffffffe 4000003ffdfffffe 43f048411df6fffc
+c000005fffffffff 403ffffffff00002 c050005ffff7ffd0
+3fc8b60e46a80f6d bfdffffffffffffe bfb8b60e46a80f6b
+bd5fdffdffffffff 5644b72ace1bbb6b d3b4a27257daf2cd
+b80010001fffffff 40e01ffffff7fffe b8f030202037f7fd
+407000003ffbfffe 38042862fe8e3368 388428634f2ab547
+bf8ffbfff7ffffff c00fffffffffffff 3faffbfff7fffffe
+bcafc000003fffff c010000000000001 3ccfc00000400001
+47eddf042473ef08 b7e00000fe000000 bfdddf05fea850cb
+3fbfffff7fffffef c340ffffffffffbf c310ffffbbffffb6
+c02f8000000007ff ffe0000000000001 7ff0000000000000
+002f37ebf6c8eaec c08be464f4c81c69 80cb36000706e169
+c00e800000000000 7ffc000000000000 7ffc000000000000
+0010000000000000 0000000000000000 0000000000000000
+bfffc00000000003 391001ffffffffff b91fc3f800000001
+c1db54446247aa52 bfcc001fffffffff 41b7e9d72a43174f
+0010000000000000 c0392c59c8e48f37 80592c59c8e48f37
+0010000000000000 c0000800000001ff 80200800000001ff
+0010000000000000 c1d0000004000fff 81f0000004000fff
+4030040000200000 0017055f48beeff5 00570b20a0bf2a70
+bc7000000000ffee c1e0001100000000 3e6000110000ffef
+c040000000007fff c3b2a6c91c557f56 4402a6c91c56148b
+41ffffffff003fff c3b0000007ffffee c5c0000007801fed
+21900001dfffffff bf20000017fffffe a0c00001f80002cd
+0029954d0f0df5b3 41e00000000003ff 0219954d0f0dfc17
+b810000020000001 47ffdfffffffff80 c01fe0003fbfff82
+0010000000000000 ffeffff800007fff c00ffff800007fff
+0010000000000000 4010000000000000 0030000000000000
+bf700000000100ff 401fffffffffffff bfa00000000100fe
+37feffffffffffff 47ef8000000fffff 3ffe8400000f7ffe
+b80f800001fffffe 44e00000ffff7fff bcff8001f9ff041c
+0010000000000000 434ffffffffffffe 036ffffffffffffe
+41ffffdfffff8000 7fe0000000000001 7ff0000000000000
+b80a16ad02c87cd3 380fffffffffe7fe b02a16ad02c86940
+47f0fffffffffffb 7ffc000000000000 7ffc000000000000
+0010000000000000 41ffffffffbfff7f 021fffffffbfff7f
+0010000000000000 8000000000000000 8000000000000000
+c3d00001000001ff b7f60cb3edb38762 3bd60cb54e7ec8fd
+0010000000000000 8010000000000001 8000000000000000
+43c0007fffdfffff 801ffffffffffffe 83f0007fffdffffe
+c7efffffdffffbff bca0000000000001 449fffffdffffc01
+0010000000000000 c11ff00000000003 813ff00000000003
+0010000000000000 bfd0000000000000 8000000000000000
+c0ffffffffeffffe bfdfffffffffffff 40efffffffeffffd
+6f7000000001fdff 1510010000000fff 4490010000020e1e
+37f002000000000f b1effcfffffffffe a9f0007fd000000e
+cc3050bc013d7cd7 bff0000000000000 4c3050bc013d7cd7
+0010000000000000 87fff0000000fffe 8000000000000000
+0010000000000000 bffffffffffffffe 801ffffffffffffe
+43effbfffffff7ff 7fefffffff801ffe 7ff0000000000000
+c015834380f2b995 3f9fff0000000400 bfc5829766d6b4b0
+0010000000000000 41dfffffc0001000 01ffffffc0001000
+0010000000000000 c01fffffffffffff 803fffffffffffff
+41e010000000001f c5b04000000fffff c7a050400010101e
+3b40018000000000 3ea0400000000100 39f0418600000100
+0010000000000000 4cdffeffff7fffff 0cfffeffff7fffff
+16dff0001ffffffe 3fb500ae0796659d 16a4f62dc5934870
+b7e003ffffffff7f deafffffeffffffd 56a003fff7fdff7d
+406000001fffbfff 3f20020000080000 3f900200200bbff7
+0010000000000000 7ffc000000000000 7ffc000000000000
+439fbffffffbffff bf8454fd38ef0ba0 c3342c533e7aa2e8
+c1c000000200007e bf000001ffffffbf 40d000020200007d
+480000000008fffe 001637e790e69de2 082637e790f31d51
+bffffffc000003fe 3ca0000000000001 bcaffffc00000400
+6b4848a9a8c0dcd5 480ffffffffbdfff 736848a9a8bdbb76
--- a/wally-pipelined/src/fpu/FMA/tbgen/results.dat
+++ b/wally-pipelined/src/fpu/FMA/tbgen/results.dat
@ -0,0 +1 @@
+0020000803ffffff bfcb4181a9468e24 000fffffffffffff 7fe2f9c2bca0f33c 00092f9c2bca0f33  Wrong zdenorm 18
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.c
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.c
@ -0,0 +1,116 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Testbench generator for the FMA unit.
+ *
+ * Copies tbhead.v to tb.v, then appends one stimulus/check stanza to tb.v for
+ * each test vector read from the file "testFloat".  Every input line is
+ * expected to hold four 16-hex-digit fields separated by one character:
+ *
+ *     column  0: x operand      (emitted as xrf)
+ *     column 17: y operand      (emitted as y)
+ *     column 34: z operand      (emitted as zrf)
+ *     column 51: expected value (emitted as ans)
+ *
+ * Anything after the fourth field (e.g. exception flags) is ignored.
+ * At most MAX_VECTORS lines are consumed.  The generated file is terminated
+ * with "$stop; end endmodule", closing the initial block and module that
+ * tbhead.v leaves open.
+ *
+ * Returns 0 on success, EXIT_FAILURE if an input/output file cannot be
+ * opened or the header copy fails.
+ */
+int main(void) {
+	enum {
+		HEX_DIGITS  = 16,                    /* hex digits per 64-bit field      */
+		Y_COL       = 17,                    /* start column of the y field      */
+		Z_COL       = 34,                    /* start column of the z field      */
+		ANS_COL     = 51,                    /* start column of the answer field */
+		MIN_LINE    = ANS_COL + HEX_DIGITS,  /* shortest line that can be parsed */
+		MAX_VECTORS = 91,                    /* upper bound on lines consumed    */
+		STOP_AT     = 358                    /* vector index that gets an extra $stop */
+	};
+
+	FILE *fp, *fq;
+	char *ln = NULL;      /* getline() allocates and grows this buffer */
+	size_t nbytes = 0;
+	int cnt = 0;          /* number of vectors emitted so far */
+	int k;
+
+	// fp = fopen("tb.dat","r");
+	fp = fopen("testFloat","r");
+	if (fp == NULL) {
+		perror("testFloat");
+		return EXIT_FAILURE;
+	}
+
+	/* Start tb.v from the fixed header before opening it for append. */
+	if (system("cp tbhead.v tb.v") != 0) {
+		fprintf(stderr, "tbgen: could not copy tbhead.v to tb.v\n");
+		fclose(fp);
+		return EXIT_FAILURE;
+	}
+
+	fq = fopen("tb.v","a");
+	if (fq == NULL) {
+		perror("tb.v");
+		fclose(fp);
+		return EXIT_FAILURE;
+	}
+
+	for (k = 0; k < MAX_VECTORS; k++) {
+		//3FDBFFFFFFFFFF7F DE608000000001FF 43CFED83C17EDBD0 DE4CE000000002F9 01
+		// b68ffff8000000ff_3f9080000007ffff_b6307ffbe0080080_00001
+		char xrf[HEX_DIGITS + 1];
+		char y[HEX_DIGITS + 1];
+		char zrf[HEX_DIGITS + 1];
+		char ans[HEX_DIGITS + 1];
+		long len;
+
+		len = (long)getline(&ln, &nbytes, fp);
+		if (len < 0) break;                /* EOF or read error */
+
+		/* Never slice past the end of a short or blank line. */
+		if (len < MIN_LINE) {
+			fprintf(stderr, "tbgen: skipping short input line %d\n", k + 1);
+			continue;
+		}
+
+		memcpy(xrf, ln,            HEX_DIGITS); xrf[HEX_DIGITS] = 0;
+		memcpy(y,   &ln[Y_COL],    HEX_DIGITS); y[HEX_DIGITS]   = 0;
+		memcpy(zrf, &ln[Z_COL],    HEX_DIGITS); zrf[HEX_DIGITS] = 0;
+		memcpy(ans, &ln[ANS_COL],  HEX_DIGITS); ans[HEX_DIGITS] = 0;
+
+		/* Operands and expected result. */
+		fprintf(fq,"    xrf = 64'h%s;\n",xrf);
+		fprintf(fq,"    y = 64'h%s;\n",y);
+		fprintf(fq,"    zrf = 64'h%s;\n",zrf);
+		fprintf(fq,"    ans = 64'h%s;\n", ans);
+
+		/* Rounding-mode selects: only rn is asserted. */
+		fprintf(fq,"    rn = %d;\n",1);
+		fprintf(fq,"    rz = %d;\n", 0);
+		fprintf(fq,"    rm = %d;\n", 0);
+		fprintf(fq,"    rp = %d;\n", 0);
+
+		/* Early-result and bypass inputs are held inactive. */
+		fprintf(fq,"    earlyres = 64'b0;\n");
+		fprintf(fq,"    earlyressel = 0;\n");
+		fprintf(fq,"    bypsel= 2'b0;\n");
+		fprintf(fq,"    bypplus1 = 0;\n");
+		fprintf(fq,"    byppostnorm = 0;\n");
+
+		fprintf(fq,"#10\n");
+
+		/*
+		 * NaN classification of the UUT result w.  The sign of a NaN is not
+		 * compared, so for NaN results only bits [62:0] are checked.
+		 * Signalling NaNs lie strictly between the infinity encoding
+		 * (x7FF0...0) and the first quiet NaN (x7FF8...0); the negative range
+		 * therefore starts at FFF0..., not FFF8... (the latter made the
+		 * comparison always false).
+		 */
+		fprintf(fq,"    // IEEE 754-2008 section 6.3 states: \"When ether an input or result is NaN, this\n");
+		fprintf(fq,"    //                                     standard does not interpret the sign of a NaN.\"\n");
+		fprintf(fq,"	nan = (w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000)  ||\n");
+		fprintf(fq,"	      (w >  64'hFFF0000000000000 && w <  64'hFFF8000000000000 ) ||\n");
+		fprintf(fq,"	      (w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) ||\n");
+		fprintf(fq,"	      (w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff );\n");
+
+		/* On mismatch, log the vector followed by tags describing it. */
+		fprintf(fq,"	if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) ) begin\n");
+		fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",xrf,y, zrf, w, ans);\n");
+		fprintf(fq,"		if(w == 64'h8000000000000000) $fwrite(fp, \"w=-zero \");\n");
+		fprintf(fq,"		if(~(|xrf[62:52]) && |xrf[51:0]) $fwrite(fp, \"xdenorm \");\n");
+		fprintf(fq,"		if(~(|y[62:52]) && |y[51:0]) $fwrite(fp, \"ydenorm \");\n");
+		fprintf(fq,"		if(~(|zrf[62:52]) && |zrf[51:0]) $fwrite(fp, \"zdenorm \");\n");
+		fprintf(fq,"		if(invalid != 0) $fwrite(fp, \"invld \");\n");
+		fprintf(fq,"		if(overflow != 0) $fwrite(fp, \"ovrflw \");\n");
+		fprintf(fq,"		if(underflow != 0) $fwrite(fp, \"unflw \");\n");
+		fprintf(fq,"		if(w == 64'hFFF0000000000000) $fwrite(fp, \"w=-inf \");\n");
+		fprintf(fq,"		if(w == 64'h7FF0000000000000) $fwrite(fp, \"w=+inf \");\n");
+		fprintf(fq,"		if(w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
+		fprintf(fq,"		if(w >  64'hFFF0000000000000 && w <  64'hFFF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
+		fprintf(fq,"		if(w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+		fprintf(fq,"		if(w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+
+		fprintf(fq,"		if(ans == 64'hFFF0000000000000) $fwrite(fp, \"ans=-inf \");\n");
+		fprintf(fq,"		if(ans == 64'h7FF0000000000000) $fwrite(fp, \"ans=+inf \");\n");
+		fprintf(fq,"		if(ans >  64'h7FF0000000000000 && ans <  64'h7FF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
+		fprintf(fq,"		if(ans >  64'hFFF0000000000000 && ans <  64'hFFF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
+		fprintf(fq,"		if(ans >= 64'h7FF8000000000000 && ans <= 64'h7FFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
+		fprintf(fq,"		if(ans >= 64'hFFF8000000000000 && ans <= 64'hFFFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
+
+		/* The vector index is baked into the log line as a literal. */
+		fprintf(fq,"    	$fwrite(fp,\"%d\\n\");\n",cnt);
+		/* Debug breakpoint; cannot trigger while MAX_VECTORS <= STOP_AT. */
+		if(cnt == STOP_AT)fprintf(fq,"    	$stop;\n");
+		fprintf(fq,"    end\n");
+		cnt++;
+
+		fflush(fq);
+	}
+
+	/* Close the initial block and module opened by tbhead.v. */
+	fprintf(fq, "\t$stop;\n\tend\nendmodule");
+
+	free(ln);
+	fclose(fq);
+	fclose(fp);
+	return 0;
+}
+
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/FMA/tbgen/tbgen
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tbgen
--- a/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
@ -0,0 +1,36 @@
+// Fixed preamble of the generated FMA testbench.  tb.c copies this file to
+// tb.v and appends the per-vector stimulus and the closing
+// "$stop; end endmodule", so the initial block and the module are
+// intentionally left open at the end of this file.
+`timescale 1 ns/10 ps
+module tb;
+
+
+ // Stimulus driven into the fmac instance; assigned by the generated code.
+ reg 		[63:0]		xrf;
+ reg 		[63:0]		y;
+ reg 		[63:0]		zrf;
+ reg 		[63:0]		ans;	// expected result, compared against w by the generated checks
+ // Rounding-mode selects (presumably nearest / zero / minus / plus -- confirm against fmac).
+ reg 						rn;
+ reg 						rz;
+ reg 						rm;
+ reg 						rp;
+ // Early-result and bypass controls; the generated stimulus ties these to zero.
+ reg 		[63:0]		earlyres;
+ reg 						earlyressel;
+ reg 		[1:0]			bypsel;
+ reg 						bypplus1;
+ reg 						byppostnorm;
+ // Outputs of the unit under test.
+ wire 	[63:0]		w;
+ wire 	[63:0]		wbypass;
+ wire 		 			invalid;
+ wire 					overflow;
+ wire 					underflow;
+ wire 					inexact;
+
+integer fp;	// handle returned by $fopen for the mismatch log
+reg nan;	// set by the generated checks when w lies in a NaN encoding range
+
+localparam period = 20;  // not referenced anywhere in this header
+fmac UUT(.xrf(xrf), .y(y), .zrf(zrf), .rn(rn), .rz(rz), .rp(rp), .rm(rm),
+		.earlyres(earlyres), .earlyressel(earlyressel), .bypsel(bypsel), .bypplus1(bypplus1), .byppostnorm(byppostnorm), 
+		.w(w), .wbypass(wbypass), .invalid(invalid), .overflow(overflow), .underflow(underflow), .inexact(inexact));
+
+
+initial 
+    begin
+    // NOTE(review): absolute, user-specific path -- must be edited before
+    // simulating on another machine or checkout location.
+    fp = $fopen("/home/kparry/code/FMAC/tbgen/results.dat","w");
--- a/wally-pipelined/src/fpu/FMA/tbgen/testMini
+++ b/wally-pipelined/src/fpu/FMA/tbgen/testMini
--- a/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
@ -0,0 +1 @@
+testfloat_gen f64_mulAdd -n 6133248 -rnear_even -seed 113355 -level 1 >> testFloat
--- a/wally-pipelined/src/fpu/FMA/tbgen/tstFlMult.awk
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tstFlMult.awk
@ -0,0 +1 @@
+awk 'BEGIN {FS = " "; OFS = "_"} {if ($3 == "0000000000000000") print $1, $2, $4;}' testFloat | head -n 1000 > testMini
				`@ -0,0 +1 @@`
				`0020000803ffffff bfcb4181a9468e24 000fffffffffffff 7fe2f9c2bca0f33c 00092f9c2bca0f33 Wrong zdenorm 18`
				`@ -0,0 +1 @@`
				`testfloat_gen f64_mulAdd -n 6133248 -rnear_even -seed 113355 -level 1 >> testFloat`
				`@ -0,0 +1 @@`
				`awk 'BEGIN {FS = " "; OFS = "_"} {if ($3 == "0000000000000000") print $1, $2, $4;}' testFloat \| head -n 1000 > testMini`