Integrated FPU

2021-04-03 20:52:26 +00:00 · 2021-04-03 20:52:26 +00:00 · d7b1379ab8
commit d7b1379ab8
parent d21006d048
11 changed files with 284063 additions and 104315 deletions
--- a/wally-pipelined/src/fpu/FMA/align.sv
+++ b/wally-pipelined/src/fpu/FMA/align.sv
@ -64,35 +64,35 @@ module align(zman, ae, aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t, bs
 		ps = 0;

 		// And to using product as primary operand in adder I exponent gen 
-		killprod = 0;
+		killprod = xzero | yzero;
 		// d = aligncnt
 		// p = 53
-		if ($signed(aligncnt) <= $signed(-105)) begin //d<=-2p+1
+		if ($signed(aligncnt) <= $signed(-103)) begin //d<=-2p+1
 			//product ancored case with saturated shift
 			sumshift = 163;	// 3p+4	
 			sumshiftzero = 0;
 			shift = {~zdenorm,zman,163'b0} >> sumshift;
-			t = {shift[215:52]};
+			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 0;
-		end else if($signed(aligncnt) <= $signed(0))  begin // -2p+1<d<=2
+		end else if($signed(aligncnt) <= $signed(1))  begin // -2p+1<d<=2
 			// set d<=2 to d<=0
 			// product ancored or cancellation
 			// warning: set to 55 rather then 56. was there a typo in the book?
-			sumshift = 55-aligncnt; // p + 3 - d  
+			sumshift = 57-aligncnt; // p + 3 - d  
 			sumshiftzero = 0;
 			shift = {~zdenorm,zman,163'b0} >> sumshift;
-			t = {shift[215:52]};
+			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 0;
-		end else if ($signed(aligncnt)<=$signed(52))  begin // 2 < d <= p+2
+		end else if ($signed(aligncnt)<=$signed(55))  begin // 2 < d <= p+2
 			// another typo in book? above was 55 changed to 52
 			// addend ancored case
 			// used to be 56 \/ somthing doesn't seem right too many typos
-			sumshift = 55-aligncnt;
+			sumshift = 57-aligncnt;
 			sumshiftzero = 0;
 			shift = {~zdenorm,zman, 163'b0} >> sumshift;
-			t = {shift[215:52]};
+			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 1;
 		end else begin                 	// d >= p+3
@ -100,7 +100,7 @@ module align(zman, ae, aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t, bs
 			sumshift = 0;	
 			sumshiftzero = 1;		
 			shift = {~zdenorm,zman, 163'b0} >> sumshift;
-			t = {shift[215:52]};
+			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			killprod = 1;
 			//ps = 1;
--- a/wally-pipelined/src/fpu/FMA/expgen.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen.sv
@ -84,8 +84,10 @@ module expgen(xexp, yexp, zexp,
 	// This should not increas the critical path because the time to
 	// check if a round overflows is shorter than the actual round and
 	// is masked by the bypass mux and two 10 bit adder delays.
-
-	assign aligncnt = zexp -ae - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
+	assign aligncnt0 = - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
+	assign aligncnt1 = - 1 + {12'b0,~xdenorm} + {12'b0,~ydenorm} - {12'b0,~zdenorm};
+	assign aligncnt = zexp -ae - 1 + {12'b0,~xdenorm} + {12'b0,~ydenorm} - {12'b0,~zdenorm};
+	//assign aligncnt = zexp -ae - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
 	//assign aligncnt = zexp - ae;// KEP use all of ae

 	// Select exponent (usually from product except in case of huge addend)
@ -107,7 +109,7 @@ module expgen(xexp, yexp, zexp,
 	// check for exponent out of bounds after add 
 	
 	assign de = resultdenorm | sumzero ? 0 : de0;
-	assign sumof = de[12];
+	assign sumof = ~de[12] && de > 2046;
 	assign sumuf = de == 0  && ~sumzero && ~resultdenorm;

 	// bypass occurs before rounding or taking early results 
--- a/wally-pipelined/src/fpu/FMA/flag.sv
+++ b/wally-pipelined/src/fpu/FMA/flag.sv
@ -9,7 +9,7 @@

 /////////////////////////////////////////////////////////////////////////////
 module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
-			 psign,  zsign, xzero, yzero, vbits,
+			 psign,  zsign, xzero, yzero, zzero, vbits, killprod,
 			 inf, nan, invalid, overflow, underflow, inexact);
 /////////////////////////////////////////////////////////////////////////////

@ -26,6 +26,8 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	input				zsign; 		// Sign of z
 	input				xzero;		// x = 0
 	input				yzero;		// y = 0
+	input				zzero;		// y = 0
+	input				killprod;
 	input     	[1:0]  		vbits;		// R and S bits of result
 	output				inf;		// Some	source is Inf
 	output				nan;		// Some	source is NaN
@ -73,8 +75,7 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	//   1) Any input is denormalized
 	//   2)  Output would be denormalized or smaller

-	assign underflow = (sumuf && ~inf && ~prodinf && ~nan);
-
+	assign underflow = (sumuf && ~inf && ~prodinf && ~nan) || (killprod & zzero & ~(yzero | xzero));

 	// Set the inexact flag for the following cases:
 	//   1) Multiplication inexact
--- a/wally-pipelined/src/fpu/FMA/normalize.sv
+++ b/wally-pipelined/src/fpu/FMA/normalize.sv
@ -47,7 +47,7 @@ module normalize(sum, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero,
 	logic		[9:0]		sumshifttmp;
 	logic       	[163:0]  	sumshiftedtmp;     // shifted sum
 	logic 				sticky;
-logic tmp,tmp1,tmp2,tmp3;
+logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;

 	// When the sum is zero,  normalization does not apply and only the
 	// sticky bit must be computed.  Otherwise,  the sum is right-shifted
@ -68,16 +68,16 @@ logic tmp,tmp1,tmp2,tmp3;
 		// p = 53
 		// ea + eb = ae
 			// set d<=2 to d<=0
-			if ($signed(aligncnt)<=$signed(0))  begin //d<=2 
+			if ($signed(aligncnt)<=$signed(1))  begin //d<=2 
 				// product anchored or cancellation
 				if ($signed(ae-normcnt+2) >= $signed(-1022)) begin //ea+eb-l+2 >= emin
 					//normal result
-					sumshifted = sum << (55+normcnt); // p+2+l
+					de0 = xzero|yzero ? zexp : ae-normcnt+2+xdenorm+ydenorm;
+					resultdenorm = |sum & ~|de0;
+					sumshifted = resultdenorm ? sum << sumshift : sum << (55+normcnt); // p+2+l
 					v = sumshifted[162:109];
 					sticky = (|sumshifted[108:0]) | bs;
-					resultdenorm = 0;
 					//de0 = ae-normcnt+2-1023;
-					de0 = xzero|yzero ? zexp : ae-normcnt+2+xdenorm+ydenorm;
 				end else begin
 					sumshifted = sum << (1080+ae);
 					v = sumshifted[162:109];
@ -87,38 +87,50 @@ logic tmp,tmp1,tmp2,tmp3;
 				end

 			end else begin                 // extract normalized bits
-				sumshifttmp = sumshift - 2;
+				sumshifttmp = {1'b0,sumshift} - 2;
 				sumshifted = sumshifttmp[9] ? sum : sum << sumshifttmp;
-				tmp1 = (sumshifted[163] & ~zdenorm & ~sumshifttmp[9]);
-				tmp2 = (zdenorm | sumshifttmp[9] || sumshifted[162]);
+				tmp1 = (sumshifted[163] & ~sumshifttmp[9]);
+				tmp2 = (sumshifttmp[9] || sumshifted[162]);
 				tmp3 = sumshifted[161];
+				tmp4 = sumshifted[160];
+				tmp5 = sumshifted[159];
 				// for some reason use exp = zexp + {0,1,2}
 				// the book says exp = zexp + {-1,0,1}
 				if(sumshiftzero) begin
 					v = sum[162:109];
 					sticky = sum[108:0] | bs;
 					de0 = zexp;
-				end else if(sumshifted[163] & ~zdenorm & ~sumshifttmp[9])begin
+				end else if(sumshifted[163] & ~sumshifttmp[9])begin
 					v = sumshifted[162:109];
 					sticky = (|sumshifted[108:0]) | bs;
 					de0 = zexp +2;
-				end else if (zdenorm | sumshifttmp[9] || sumshifted[162]) begin
+				end else if ((sumshifttmp[9] & sumshift[0]) || sumshifted[162]) begin
 					v = sumshifted[161:108];
 					sticky = (|sumshifted[107:0]) | bs;
 					de0 = zexp+1;
-				end else if (sumshifted[161]) begin
+				end else if (sumshifted[161] || (sumshifttmp[9] & sumshift[1])) begin
 					v = sumshifted[160:107];
 					sticky = (|sumshifted[106:0]) | bs;
 					//de0 = zexp-1;
 					de0 = zexp;
-				end else begin
+				end else if(sumshifted[160]) begin
 					v = sumshifted[159:106];
 					sticky = (|sumshifted[105:0]) | bs;
 					//de0 = zexp-1;
 					de0 = zexp-1;
+				end else if(sumshifted[159]) begin
+					v = sumshifted[158:105];
+					sticky = (|sumshifted[104:0]) | bs;
+					//de0 = zexp-1;
+					de0 = zexp-2;
+				end else begin					
+					v = sumshifted[160:107];
+					sticky = (|sumshifted[106:0]) | bs;
+					//de0 = zexp-1;
+					de0 = zexp;
 				end

-				resultdenorm = 0;
+				resultdenorm = ~(|de0);
 		end 
 	end

--- a/wally-pipelined/src/fpu/FMA/tbgen/results.dat
+++ b/wally-pipelined/src/fpu/FMA/tbgen/results.dat
@ -1,11 +1,16 @@
-8020007ffdffffff 9beffff7fff7fffe 000ffffffff7fffe 0000000000000000 000ffffffff7fffe  Wrong zdenorm unflw 475303
-3cafffffffffffff 3fd0000000000000 3cafffffffffffff 3c8ffffffffffffb 3cb3ffffffffffff  Wrong 706913
-bfbfffff007fffff 000fffffffffffff 000bffffffc00000 0015000007dc0000 000a00000fb80000  Wrong ydenorm zdenorm 1675647
-00114508bde544e1 3caffffffffffffe 800010000003fffe 801008000001fffe 800010000003fffd  Wrong zdenorm 2310057
-800ffffffdffffff bfcffe00003ffffe 800ffff01ffffffe 80160018103bfbff 800c00302077f7ff  Wrong xdenorm zdenorm 2475205
-bcafffffffffffff 3fd0000000000001 bcafffffffffffff bc8ffffffffffffd bcb4000000000000  Wrong 3776249
-bfc0000000800008 43d0001000000002 c3cffffbffff8000 c3a00000007e008a c3d20000000fc011  Wrong 3804445
-bfefffffffffffff 3fefffffffffffff bff0000000000001 b950000000000000 c000000000000000  Wrong 4338155
-37ea3353806450ba bffffffffffffffe b803fffffffff7ff b7c19a9c032205b3 b8108cd4e019102e  Wrong 5143755
-8010000000803fff 3ff0000000000001 000fffe07fffffff fff0000000000000 8000001f80804001  Wrong zdenorm w=-inf 5246469
-b7fffff80000001f 001ffffffffffffe 800fffffffff07ff 8000000000000000 800fffffffff07ff  Wrong w=-zero zdenorm unflw 5723787
+0010000000000000 bf4fdffffff7fffe 800ffffffffffffe 800003fbfffffefe 801003fbfffffefe  Wrong zdenorm 308227
+0010000000000000 be6fffffbffffff7 8000000000000000 800000001fffffc0 800000000fffffe0  Wrong 313753
+001ffffffffffffe 3fddfbffffffffff 000ffffffffffffe 000efdfffffffffd 001efdfffffffffd  Wrong zdenorm 551371
+3befe000ffffffff 800ffffffffffffe 0000000000000000 0000000000000000 8000000000000000  Wrong ydenorm unflw 665575
+000007fffffffffe 3f6ffffffe01fffe 000ffffffffffffe 00000007ffffff7e 00100007ffffff7e  Wrong xdenorm zdenorm 768727
+3fdffffffffffffe 000ffffffffffffe 8000000000000001 7feffffffffffff6 0007fffffffffffe  Wrong ydenorm zdenorm 1049939
+7fe0000000000001 4000000000000000 ffefffffffffffff 7ff0000000000000 7cb8000000000000  Wrong w=+inf 2602745
+000fff000000000f 3ff00800001fffff 8010000000000000 7f7bfe007ff8381e 000006ff801ffe0e  Wrong xdenorm 3117277
+8000000000000001 40211275ffe5ee3c 0000000000000001 fcfe24ebffcbdc78 8000000000000008  Wrong xdenorm zdenorm 3148591
+801fffffffffffff bfdffffffffffffe 0000000000021fff 0000000000021ffe 0010000000021ffe  Wrong zdenorm 3537867
+801ffffffffffffe 0010000000000001 0000000000000000 0000000000000000 8000000000000000  Wrong unflw 3564269
+bca0000000000001 000fffffc000001e 8000000000000000 8000000000000001 8000000000000000  Wrong ydenorm 3717769
+bcafffffffffffff 800ffffffffffffe 8000000000000000 0000000000000002 0000000000000001  Wrong ydenorm 3807413
+7fec5fed92358a74 400000001bffffff ffefc0003ffffffe 7ff0000000000000 7fe8ffdb47bad466  Wrong w=+inf 3889689
+bfdfffffffffffff 3fdf1f3616aa73e1 3fd0000000000001 3fd07064f4aac611 3f7c193d2ab1843f  Wrong 4099063
+3fd07dfffffffffe 8010000000000001 0000000000000001 ffe07dfffffffffb 80041f7fffffffff  Wrong zdenorm 4716133
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.c
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.c
@ -26,13 +26,13 @@ void main() {
 		char ans[81];
 		char flags[3];
 		int rn,rz,rm,rp;
-		long stop = 5723787;
+		long stop = 4099063;
 		int debug = 1;
 		//my_string = (char *) malloc (nbytes + 1);
 		//bytes_read = getline (&my_string, &nbytes, stdin);
 	

-		for(n=0; n < 2013; n++) {//613 for 10000
+		for(n=0; n < 613; n++) {//613 for 10000
 			if(getline(&ln,&nbytes,fp) < 0 || feof(fp)) break;
 			if(k == stop && debug == 1) break;
 			k++;
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/dev/fputop.sv
+++ b/wally-pipelined/src/fpu/dev/fputop.sv
@ -1,476 +0,0 @@
-`include "../../../config/rv64icfd/wally-config.vh"
-
-module fputop (
-  input  logic [2:0]       FrmD,
-  input  logic             reset,
-  input  logic             clear,
-  input  logic             clk,
-  input  logic [31:0]      InstrD,
-  input  logic [`XLEN-1:0] SrcAE,
-  input  logic [`XLEN-1:0] SrcAW,
-  output logic [31:0]      FSROutW,
-  output logic             DivSqrtDoneE,
-  output logic             FInvalInstrD,
-  output logic [`XLEN-1:0] FPUResultW);
-
-  //NOTE:
-  //For readability and ease of modification, logic signals will be
-  //instantiated as they occur within the pipeline. This will keep local
-  //signals, modules, and combinational logic closely defined.
-
-  //used for OSU DP-size hardware to wally XLEN interfacing
-  integer XLENDIFF;
-  assign XLENDIFF = `XLEN - 64;
-  integer XLENDIFFN;
-  assign XLENDIFFN = 63 - `XLEN;
-
-  //#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
-  //BEGIN PIPELINE CONTROL LOGIC
-  //
-   
-  logic	                   PipeEnableDE;
-  logic	                   PipeEnableEM;
-  logic	                   PipeEnableMW;
-  logic                    PipeClearDE;
-  logic                    PipeClearEM;
-  logic                    PipeClearMW;
-
-  //temporarily assign pipe clear and enable signals
-  //to never flush & always be running
-  assign PipeClear = 1'b0;
-  assign PipeEnable = 1'b1;
-  always_comb begin
-
-	  PipeEnableDE = PipeEnable;
-	  PipeEnableEM = PipeEnable;
-	  PipeEnableMW = PipeEnable;
-	  PipeClearDE = PipeClear;
-	  PipeClearEM = PipeClear;
-	  PipeClearMW = PipeClear;
-
-  end
-
-  //
-  //END PIPELINE CONTROL LOGIC
-  //#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
-
-  //#########################################
-  //BEGIN DECODE STAGE
-  //
- 
-  //wally-spec D stage control logic signal instantiation
-  logic                    IllegalFPUInstrFaultD;
-  logic                    FRegWriteD;
-  logic [2:0]              FResultSelD;
-  //logic [2:0]              FrmD;
-  logic                    PD;
-  logic                    DivSqrtStartD;
-  logic [3:0]              OpCtrlD;
-  logic                    WriteIntD;
-  
-  //top-level controller for FPU
-  fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Rs1D(InstrD[19:15]), .FrmW(InstrD[14:12]), .WriteEnD(FRegWriteD), .DivSqrtStartD(DivSqrtStartD), .WriteSelD(FResultSelD), .OpCtrlD(OpCtrlD), .FmtD(PD), .WriteIntD(WriteIntD));
-
-  //instantiation of D stage regfile signals (includes some W stage signals
-  //for easy reference)
-  logic [2:0]              FrmW;
-  logic                    WriteEnW;
-  logic [4:0]              RdW, Rs1D, Rs2D, Rs3D;
-  logic [`XLEN-1:0]        WriteDataW;
-  logic [`XLEN-1:0]        ReadData1D, ReadData2D, ReadData3D; 
-
-  //regfile instantiation
-  freg3adr fpregfile (FrmW, reset, PipeClear, clk, RdW, WriteEnW, Rs1D, Rs2D, Rs3D, WriteDataW, ReadData1D, ReadData2D, ReadData3D);
-
-  always_comb begin
-     FrmW = InstrD[14:12];
-  end
-
-  //
-  //END DECODE STAGE
-  //#########################################
-
-  //*****************************************
-  //BEGIN D/E PIPE
-  //
-
-  //wally-spec E stage control logic signal instantiation
-  logic                    FRegWriteE;
-  logic [2:0]              FResultSelE;
-  logic [2:0]              FrmE;
-  logic                    PE;
-  logic                    DivSqrtStartE;
-  logic [3:0]              OpCtrlE;
-
-  //instantiation of E stage regfile signals
-  logic [4:0]              RdE;
-  logic [`XLEN-1:0]        ReadData1E, ReadData2E, ReadData3E;
-
-  //instantiation of E/M stage div/sqrt signals
-  logic                    DivSqrtDone, DivDenormM;
-  logic [63:0]             DivResultM;
-  logic [4:0]              DivFlagsM;
-  logic [63:0]             DivOp1, DivOp2;
-  logic [2:0]              DivFrm;
-  logic                    DivOpType;
-  logic                    DivP;
-  logic                    DivOvEn, DivUnEn;
-  logic                    DivStart;
-
-  //instantiate E stage FMA signals here
-
-  //instantiation of E stage add/cvt signals
-  logic [63:0]             AddSumE, AddSumTcE;
-  logic [3:0]              AddSelInvE;
-  logic [10:0]             AddExpPostSumE;
-  logic                    AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
-  logic                    AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
-  logic [63:0]             AddFloat1E, AddFloat2E;
-  logic [10:0]             AddExp1DenormE, AddExp2DenormE, AddExponentE;
-  logic [63:0]             AddOp1E, AddOp2E;
-  logic [2:0]              AddRmE;
-  logic [3:0]              AddOpTypeE;
-  logic                    AddPE, AddOvEnE, AddUnEnE;  
-
-  //instantiation of E stage cmp signals 
-  logic [7:0]              WE, XE;
-  logic                    ANaNE, BNaNE, AzeroE, BzeroE;
-  logic [63:0]             CmpOp1E, CmpOp2E;
-  logic [1:0]              CmpSelE;
-
-  //instantiation of E/M stage fsgn signals (due to bypass logic)
-  logic [63:0]             SgnOp1E, SgnOp2E;
-  logic [1:0]              SgnOpCodeE, SgnOpCodeM;
-  logic [63:0]             SgnResultE, SgnResultM;
-  logic [4:0]              SgnFlagsE, SgnFlagsM;
-
-  //*****************
-  //fpregfile D/E pipe registers
-  //*****************
-  flopenrc #(64) (clk, reset, PipeClearDE, PipeEnableDE, ReadData1D, ReadData1E);
-  flopenrc #(64) (clk, reset, PipeClearDE, PipeEnableDE, ReadData2D, ReadData2E);
-  flopenrc #(64) (clk, reset, PipeClearDE, PipeEnableDE, ReadData3D, ReadData3E);
-
-  //*****************
-  //other  D/E pipe registers
-  //*****************
-  flopenrc #(1) (clk, reset, PipeClearDE, PipeEnableDE, FRegWriteD, FRegWriteE);
-  flopenrc #(3) (clk, reset, PipeClearDE, PipeEnableDE, FResultsSelD, FResultsSelE);
-  flopenrc #(3) (clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
-  flopenrc #(1) (clk, reset, PipeClearDE, PipeEnableDE, PD, PE);
-  flopenrc #(4) (clk, reset, PipeClearDE, PipeEnableDE, OpCtrlD, OpCtrlE);
-  flopenrc #(1) (clk, reset, PipeClearDE, PipeEnableDE, DivSqrtStartD, DivSqrtStartE);
-
-  //
-  //END D/E PIPE
-  //*****************************************
-
-  //#########################################
-  //BEGIN EXECUTION STAGE
-  //
-
-  //fma1 ();
-
-  //first and only instance of floating-point divider
-  fpdivsqrt (DivSqrtDone, DivResultM, DivFlagsM, DivDenormM, DivOp1, DivOp2, DivFrm, DivOpType, DivP, DivOvEn, DivUnEn, DivStart, reset, clk);
-
-  //first of two-stage instance of floating-point add/cvt unit
-  fpaddcvt1 fpadd1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, AddOp1E, AddOp2E, AddRmE, AddOpTypeE, AddPE, AddOvEnE, AddUnEnE);
-
-  //first of two-stage instance of floating-point comparator
-  fpcmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, CmpOp1E, CmpOp2E, CmpSelE);
-
-  //first and only instance of floating-point sign converter
-  fpusgn fpsgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SgnOp1, SgnOp2);
-
-  //interface between XLEN size datapath and double-precision sized
-  //floating-point results
-  //
-  //define offsets for LSB zero extension or truncation
-  always_comb begin
-
-  //truncate to 64 bits
-  //(causes warning during compilation - case never reached) 
-  if(`XLEN > 64) begin
-        DivOp1 <= ReadData1E[`XLEN:`XLEN-64];
-	DivOp2 <= ReadData2E[`XLEN:`XLEN-64];
-        AddOp1E <= ReadData1E[`XLEN:`XLEN-64];
-	AddOp2E <= ReadData2E[`XLEN:`XLEN-64];
-        CmpOp1E <= ReadData1E[`XLEN:`XLEN-64];
-	CmpOp2E <= ReadData2E[`XLEN:`XLEN-64];
-        SgnOp1E <= ReadData1E[`XLEN:`XLEN-64];
-	SgnOp2E <= ReadData2E[`XLEN:`XLEN-64];
-  end
-  //zero extend to 64 bits
-  else begin
-        DivOp1 <= {ReadData1E,{64-`XLEN{1'b0}}};
-	DivOp2 <= {ReadData2E,{64-`XLEN{1'b0}}};
-        AddOp1E <= {ReadData1E,{64-`XLEN{1'b0}}};
-	AddOp2E <= {ReadData2E,{64-`XLEN{1'b0}}};
-        CmpOp1E <= {ReadData1E,{64-`XLEN{1'b0}}};
-	CmpOp2E <= {ReadData2E,{64-`XLEN{1'b0}}};
-        SgnOp1E <= {ReadData1E,{64-`XLEN{1'b0}}};
-	SgnOp2E <= {ReadData2E,{64-`XLEN{1'b0}}};
-  end
-
-  //assign op codes
-  AddOpTypeE[3:0] <= OpCtrlE[3:0];
-  CmpSelE[1:0] <= OpCtrlE[1:0];
-  DivOpType <= OpCtrlE[0];
-  SgnOpCodeE[1:0] <= OpCtrlE[1:0];
-
-  end 
-
-  //E stage control signal interfacing between wally spec and OSU fp hardware
-  //op codes
-
-  //
-  //END EXECUTION STAGE
-  //#########################################
-
-  //*****************************************
-  //BEGIN E/M PIPE
-  //
-
-  //wally-spec M stage control logic signal instantiation
-  logic                    FRegWriteM;
-  logic [2:0]              FResultSelM;
-  logic [2:0]              FrmM;
-  logic                    PM;
-  logic [3:0]              OpCtrlM;
-
-  //instantiate M stage FMA signals here
-
-  //instantiation of M stage regfile signals
-  logic [4:0]              RdM;
-  logic [`XLEN-1:0]        ReadData1M, ReadData2M, ReadData3M;
-
-  //instantiation of M stage add/cvt signals
-  logic [63:0]             AddResultM;
-  logic [4:0]              AddFlagsM;
-  logic                    AddDenormM;
-  logic [63:0]             AddSumM, AddSumTcM;
-  logic [3:0]              AddSelInvM;
-  logic [10:0]             AddExpPostSumM;
-  logic                    AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
-  logic                    AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
-  logic [63:0]             AddFloat1M, AddFloat2M;
-  logic [10:0]             AddExp1DenormM, AddExp2DenormM, AddExponentM;
-  logic [63:0]             AddOp1M, AddOp2M;
-  logic [2:0]              AddRmM;
-  logic [3:0]              AddOpTypeM;
-  logic                    AddPM, AddOvEnM, AddUnEnM;  
-
-  //instantiation of M stage cmp signals
-  logic                    CmpInvalidM;
-  logic [1:0]              CmpFCCM; 
-  logic [7:0]              WM, XM;
-  logic                    ANaNM, BNaNM, AzeroM, BzeroM;
-  logic [63:0]             CmpOp1M, CmpOp2M;
-  logic [1:0]              CmpSelM;
-
-  //*****************
-  //fma E/M pipe registers
-  //*****************  
-
-  //*****************
-  //fpadd E/M pipe registers
-  //*****************
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
-  flopenrc #(4) (clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
-  flopenrc #(11) (clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignM); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
-  flopenrc #(11) (clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
-  flopenrc #(11) (clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
-  flopenrc #(11) (clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddOp1E, AddOp1M); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddOp2E, AddOp2M); 
-  flopenrc #(3) (clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
-  flopenrc #(4) (clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
-
-  //*****************
-  //fpcmp E/M pipe registers
-  //*****************
-  flopenrc #(8) (clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
-  flopenrc #(8) (clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, CmpOp1E, CmpOp1M); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, CmpOp2E, CmpOp2M); 
-  flopenrc #(2) (clk, reset, PipeClearEM, PipeEnableEM, CmpSelE, CmpSelM);
-
-  //put this in for the event we want to delay fsgn - will otherwise bypass
-  //*****************
-  //fpsgn E/M pipe registers
-  //***************** 
-  flopenrc #(2) (clk, reset, PipeClearEM, PipeEnableEM, SgnOpCodeE, SgnOpCodeM);
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
-  flopenrc #(5) (clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
-
-  //*****************
-  //other E/M pipe registers
-  //*****************
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, FRegWriteE, FRegWriteM);
-  flopenrc #(3) (clk, reset, PipeClearEM, PipeEnableEM, FResultsSelE, FResultsSelM);
-  flopenrc #(3) (clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, PE, PM);
-  flopenrc #(4) (clk, reset, PipeClearEM, PipeEnableEM, OpCtrlE, OpCtrlM);
-
-  //
-  //END E/M PIPE
-  //*****************************************
-
-  //#########################################
-  //BEGIN MEMORY STAGE
-  //
-
-  //fma2 ();
-
-  //second instance of two-stage floating-point add/cvt unit
-  fpaddcvt2 fpadd2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, AddOp1M, AddOp2M, AddRmM, AddOpTypeM, AddPM, AddOvEnM, AddUnEnM);
-
-  //second instance of two-stage floating-point comparator
-  fpcmp2 fpcmp2 (CmpInvalidM, CmpFCCM, ANaNM, BNaNM, AzeroM, BzeroM, WM, XM, CmpSelM, CmpOp1M, CmpOp2M);
-
-  //
-  //END MEMORY STAGE
-  //#########################################
-
-
-  //*****************************************
-  //BEGIN M/W PIPE
-  //
-  
-  //wally-spec W stage control logic signal instantiation
-  logic                    FRegWriteW;
-  logic [2:0]              FResultSelW;
-  logic                    PW;
-
-  //instantiate W stage fma signals here
-
-  //instantiation of W stage div/sqrt signals
-  logic                    DivDenormW;
-  logic [63:0]             DivResultW;
-  logic [4:0]              DivFlagsW;
-
-  //instantiation of W stage regfile signals
-  logic [`XLEN-1:0]        ReadData1W, ReadData2W, ReadData3W;
-
-  //instantiation of W stage add/cvt signals
-  logic [63:0]             AddResultW;
-  logic [4:0]              AddFlagsW;
-  logic                    AddDenormW;
-
-  //instantiation of W stage cmp signals
-  logic                    CmpInvalidW;
-  logic [1:0]              CmpFCCW; 
-
-  //*****************
-  //fma M/W pipe registers
-  //*****************
-  
-  //*****************
-  //fpdiv M/W pipe registers
-  //*****************
-  flopenrc #(64) (clk, reset, PipeClearMW, PipeEnableMW, DivResultM, DivResultW); 
-  flopenrc #(5) (clk, reset, PipeClearMW, PipeEnableMW, DivFlagsM, DivFlagsW);
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
-
-  //*****************
-  //fpadd M/W pipe registers
-  //*****************
-  flopenrc #(64) (clk, reset, PipeClearMW, PipeEnableMW, AddResultM, AddResultW); 
-  flopenrc #(5) (clk, reset, PipeClearMW, PipeEnableMW, AddFlagsM, AddFlagsW); 
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, AddDenormM, AddDenormW); 
-
-  //*****************
-  //fpcmp M/W pipe registers
-  //*****************
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
-  flopenrc #(2) (clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
-
-  //*****************
-  //fpsgn M/W pipe registers
-  //***************** 
-  flopenrc #(64) (clk, reset, PipeClearMW, PipeEnableMw, SgnResultM, SgnResultW);
-  flopenrc #(5) (clk, reset, PipeClearMw, PipeEnableMw, SgnFlagsM, SgnFlagsW);
-
-  //*****************
-  //other M/W pipe registers
-  //*****************
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, FRegWriteM, FRegWriteW);
-  flopenrc #(3) (clk, reset, PipeClearMW, PipeEnableMW, FResultsSelM, FResultsSelW);
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, PM, PW);
-
-  ////END M/W PIPE
-  //*****************************************
-
-
-  //#########################################
-  //BEGIN WRITEBACK STAGE
-  //
-
-  //flag signal mux via in-line ternaries
-  logic [4:0] FPUFlagsW;
-  //if bit 2 is active set to sign flags - otherwise:
-  //iff bit one is high - if bit zero is active set to fma flags - otherwise
-  //set to cmp flags
-  //iff bit one is low - if bit zero is active set to add/cvt flags - otherwise
-  //set to div/sqrt flags
-  assign FPUFlagsW = (FResultSelW[2]) ? (SgnFlagsW) : (
-	             (FResultSelW[1]) ? 
-		     ( (FResultSelW[0]) ? (5'b00000) : ({CmpInvalidW,4'b0000}) ) 
-		     : ( (FResultSelW[0]) ? (AddFlagsW) : (DivFlagsW) ) 
-                     );
-
-  //result mux via in-line ternaries
-  logic [63:0] FPUResultDirW; 
-  //the uses the same logic as for flag signals
-  assign FPUResultDirW = (FResultSelW[2]) ? (SgnResultW) : (
-	             (FResultSelW[1]) ? 
-		     ( (FResultSelW[0]) ? (64'b0) : ({62'b0,CmpFCCW}) ) 
-		     : ( (FResultSelW[0]) ? (AddResultW) : (DivResultW) ) 
-                     );
-
-  //interface between XLEN size datapath and double-precision sized
-  //floating-point results
-  //
-  //define offsets for LSB zero extension or truncation
-  always_comb begin
-           
-  //zero extension  
-  if(`XLEN > 64) begin
-      FPUResultW <= {FPUResultDirW,{XLENDIFF{1'b0}}};
-  end
-  //truncate
-  else begin
-      FPUResultW <= FPUResultDirW[63:64-`XLEN];
-  end
-
-  end  
-
-  //
-  //END WRITEBACK STAGE
-  //#########################################
-
-
-
-endmodule
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -88,8 +88,13 @@ module wallypipelinedhart (
   logic       DivBusyE;   
  logic [4:0] SetFflagsM;
  logic [2:0] FRM_REGW;
+  logic       DivDoneW;  
  logic       FloatRegWriteW;
  logic       SquashSCW;
+  logic [31:0]      FSROutW;
+  logic             DivSqrtDoneE;
+  logic             FInvalInstrD;
+  logic [`XLEN-1:0] FPUResultW;

  // memory management unit signals
  logic             ITLBWriteF, DTLBWriteM;
@ -144,16 +149,17 @@ module wallypipelinedhart (

 
  muldiv mdu(.*); // multiply and divide unit
- /*  fpu fpu(.*); // floating point unit
-  */
+  
  hazard     hzu(.*);	// global stall and flush control

  // Priveleged block operates in M and W stages, handling CSRs and exceptions
  privileged priv(.*);
+  

+  fpu fpu(.*); // floating point unit
  // add FPU here, with SetFflagsM, FRM_REGW
  // presently stub out SetFlagsM and FloatRegWriteW
-  assign SetFflagsM = 0;
-  assign FloatRegWriteW = 0;
+  //assign SetFflagsM = 0;
+  //assign FloatRegWriteW = 0;
             
 endmodule
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -360,7 +360,7 @@ string tests32i[] = {
        if (`A_SUPPORTED) tests = {tests, tests64a};
      end
 //     tests = {tests64a, tests};
-      tests = {tests, tests64p};
+      // tests = {tests, tests64p};
    end else begin // RV32
      // *** add the 32 bit bp tests
      tests = {tests32i};