LZA added to FMA and attempting a merged FMA and adder in synthesis

2021-08-10 13:57:16 -04:00 · 2021-08-10 13:57:16 -04:00 · e00f181bcf
commit e00f181bcf
parent cce0571925
7 changed files with 846 additions and 196 deletions
--- a/wally-pipelined/config/rv64icfd/wally-config.vh
+++ b/wally-pipelined/config/rv64icfd/wally-config.vh
@ -26,7 +26,7 @@

 // include shared configuration
 `include "wally-shared.vh"
-// `include "../../../config/shared/wally-shared.vh"
+  // `include "../shared/wally-shared.vh"

 `define QEMU 0
 `define BUILDROOT 0
--- a/wally-pipelined/fpu-testfloat/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/fpu-testfloat/FMA/tbgen/test_gen.sh
@ -1,3 +1,3 @@
-testfloat_gen f64_mulAdd -tininessafter -n 6133248 -rnear_even  -seed 113355 -level 1 > testFloat
+testfloat_gen f32_add -tininessafter -n 6133248 -rnear_even  -seed 113355 -level 1 > testFloat
 tr -d ' ' < testFloat > testFloatNoSpace

--- a/wally-pipelined/src/fpu/cvtfp.sv
+++ b/wally-pipelined/src/fpu/cvtfp.sv
@ -0,0 +1,120 @@
+
+// `include "wally-config.vh"
+// cvtfp: purely combinational floating-point to floating-point conversion of the
+// (already unpacked) X operand.
+//   FmtE = 1: CvtFpResE = {32 ones, DRes}, where DRes is a 32-bit result built from a
+//             rebiased (1023 -> 127) exponent and a rounded 23-bit fraction.
+//             CvtFpFlgE = {XSNaNE, 1'b0, Overflow, Underflow, Inexact}.
+//   FmtE = 0: CvtFpResE = {sign, 11-bit exponent, 52-bit fraction} built from the
+//             mantissa normalized by its leading-zero count.
+//             CvtFpFlgE = {XSNaNE, 4'b0}.
+// NOTE(review): port meanings below are inferred from the signal names; presumably they
+// are driven by the FPU unpacking unit with XManE[52] as the implicit leading one -- confirm.
+module cvtfp (
+    input logic [10:0] XExpE,       // exponent of X
+    input logic [52:0] XManE,       // 53-bit mantissa of X
+    input logic XSgnE,              // sign of X
+    input logic XZeroE,             // X is zero
+    input logic XDenormE,           // X is denormalized
+    input logic XInfE,              // X is infinity
+    input logic XNaNE,              // X is NaN
+    input logic XSNaNE,             // X is a signaling NaN (drives flag bit 4)
+    input logic [2:0] FrmE,         // rounding mode (see case statements below)
+    input logic FmtE,               // result format select (see header)
+    output logic [63:0] CvtFpResE,  // conversion result
+    output logic [4:0] CvtFpFlgE);  // exception flags
+
+    logic Denorm;
+
+    // Leading-zero count of the 53-bit mantissa; ends at 53 when the mantissa is all zero.
+    logic [8:0] i, NormCnt;
+    always_comb begin
+        i = 0;
+        // The bound is tested first so XManE is never indexed out of range (i == 53).
+        while (i <= 52 && ~XManE[52-i]) i = i+1;  // search for leading one
+        NormCnt = i;
+    end
+
+    // Exponent rebiased from 1023 to 127, forced to zero when X is zero.
+    logic [12:0] DExpCalc;
+    assign DExpCalc = (XExpE-1023+127)&{13{~XZeroE}};
+    // Rebiased exponent lies in (-23, 0].
+    assign Denorm = $signed(DExpCalc) <= 0 & $signed(DExpCalc) > $signed(-23);
+
+    logic [51:0] SFrac;
+    logic [25:0] DFrac;
+    logic [77:0] DFracTmp;
+    logic Shift;
+    // FmtE = 0 path: shift the leading one out of the 52-bit fraction field.
+    assign SFrac = XManE[51:0] << NormCnt;
+    // FmtE = 1 path: right-shift the mantissa by (1 - DExpCalc) when the rebiased
+    // exponent is in the Denorm range or is negative but greater than -25.
+    assign Shift = Denorm|(($signed(DExpCalc) > $signed(-25)) & DExpCalc[12]);
+    assign DFracTmp = {XManE, 25'b0} >> ((-DExpCalc+1)&{13{Shift}});
+    assign DFrac = DFracTmp[76:51];
+
+    logic Sticky, UfSticky, Guard, Round, LSBFrac, UfGuard, UfRound, UfLSBFrac;
+    logic CalcPlus1, UfCalcPlus1;
+    logic Plus1, UfPlus1;
+    // used to determine underflow flag
+    // (the Uf* bits sit one position below the bits used to round the result)
+    assign UfSticky = |DFracTmp[50:0];
+    assign UfGuard = DFrac[1];
+    assign UfRound = DFrac[0];
+    assign UfLSBFrac = DFrac[2];
+
+    // bits used to round the result; DFrac[25:3] is the 23-bit fraction that is kept
+    assign Sticky = UfSticky | UfRound;
+    assign Guard = DFrac[2];
+    assign Round = DFrac[1];
+    assign LSBFrac = DFrac[3];
+
+    always_comb begin
+        // Determine if you add 1
+        case (FrmE)
+            3'b000: CalcPlus1 = Guard & (Round | (Sticky) | (~Round&~Sticky&LSBFrac));//round to nearest even
+            3'b001: CalcPlus1 = 0;//round to zero
+            3'b010: CalcPlus1 = XSgnE;//round down
+            3'b011: CalcPlus1 = ~XSgnE;//round up
+            3'b100: CalcPlus1 = (Guard & (Round | (Sticky) | (~Round&~Sticky)));//round to nearest max magnitude
+            default: CalcPlus1 = 1'bx;
+        endcase
+        // Determine if you add 1 (for underflow flag)
+        case (FrmE)
+            3'b000: UfCalcPlus1 = UfGuard & (UfRound | UfSticky | (~UfRound&~UfSticky&UfLSBFrac));//round to nearest even
+            3'b001: UfCalcPlus1 = 0;//round to zero
+            3'b010: UfCalcPlus1 = XSgnE;//round down
+            3'b011: UfCalcPlus1 = ~XSgnE;//round up
+            3'b100: UfCalcPlus1 = (UfGuard & (UfRound | UfSticky | (~UfRound&~UfSticky)));//round to nearest max magnitude
+            default: UfCalcPlus1 = 1'bx;
+        endcase
+    end
+
+    // If an answer is exact don't round
+    assign Plus1 = CalcPlus1 & (Sticky | UfGuard | Guard | Round);
+    assign UfPlus1 = UfCalcPlus1 & (Sticky | UfGuard);
+
+    // Round by adding Plus1 to {exponent, fraction}; a fraction carry-out increments the
+    // exponent.  The exponent is zeroed first when the result is in the Denorm range.
+    logic [12:0] DExpFull;
+    logic [22:0] DResFrac;
+    logic [7:0] DResExp;
+    assign {DExpFull, DResFrac} = {DExpCalc&{13{~Denorm}}, DFrac[25:3]} + Plus1;
+    assign DResExp = DExpFull[7:0];
+
+    // FmtE = 0 path exponent: subtract the leading-zero count (masked off for zero X)
+    // and add 1024-127 when X is denormalized.
+    logic [10:0] SExp;
+    assign SExp = XExpE-(NormCnt&{8{~XZeroE}})+({11{XDenormE}}&1024-127);
+
+    // Flags; overflow and underflow are suppressed for NaN and infinity inputs,
+    // inexact is suppressed for NaN inputs.
+    logic Overflow, Underflow, Inexact;
+    assign Overflow = $signed(DExpFull) >= $signed({1'b0, {8{1'b1}}}) & ~(XNaNE|XInfE);
+    assign Underflow = (($signed(DExpFull) <= 0) & ((Sticky|Guard|Round) | (XManE[52]&~|DFrac) | (|DFrac&~Denorm)) | ((DExpFull == 1) & Denorm & ~(UfPlus1&UfLSBFrac))) & ~(XNaNE|XInfE);
+    assign Inexact = (Sticky|Guard|Round|Underflow|Overflow) &~(XNaNE);
+
+    // 32-bit result selection, in priority order:
+    //   NaN       -> sign, all-ones exponent, quiet bit set, upper 22 payload bits.
+    //                (The exponent is written as an explicit 8-bit constant: concatenating
+    //                the 11-bit XExpE made the expression 35 bits wide, so the assignment
+    //                to the 32-bit DRes dropped XSgnE and put XExpE[8] in the sign position.)
+    //   underflow outside the Denorm range -> signed zero, or the smallest magnitude when rounding adds one
+    //   overflow / infinity -> largest finite value or infinity depending on rounding mode and sign
+    //   otherwise -> rounded exponent and fraction
+    logic [31:0] DRes;
+    assign DRes = XNaNE ? {XSgnE, {8{1'b1}}, 1'b1, XManE[50:29]} :
+                  Underflow & ~Denorm ? {XSgnE, 30'b0, CalcPlus1&(|FrmE[1:0]|Shift)} :
+                  Overflow | XInfE ? ((FrmE[1:0]==2'b01) | (FrmE[1:0]==2'b10&~XSgnE) | (FrmE[1:0]==2'b11&XSgnE)) & ~XInfE ? {XSgnE, 8'hfe, {23{1'b1}}} :
+                                                                                                                          {XSgnE, 8'hff, 23'b0} :
+                  {XSgnE, DResExp, DResFrac};
+
+    // FmtE = 1: 32-bit result with the upper word filled with ones; FmtE = 0: 64-bit result
+    // (the top fraction bit is forced to one for NaN inputs).
+    assign CvtFpResE = FmtE ? {{32{1'b1}},DRes} : {XSgnE, SExp, SFrac[51]|XNaNE, SFrac[50:0]};
+    assign CvtFpFlgE = FmtE ? {XSNaNE, 1'b0, Overflow, Underflow, Inexact} : {XSNaNE, 4'b0};
+
+endmodule // cvtfp
+
+
--- a/wally-pipelined/src/fpu/faddcvt.sv
+++ b/wally-pipelined/src/fpu/faddcvt.sv
@ -117,8 +117,8 @@ module fpuaddcvt1 (
   output logic         AddSwapE
   );

-   wire [5:0]	 ZP_mantissaA;
-   wire [5:0]	 ZP_mantissaB;
+   logic [5:0]	 ZP_mantissaA;
+   logic [5:0]	 ZP_mantissaB;
   wire		    ZV_mantissaA;
   wire		    ZV_mantissaB;

@ -181,8 +181,20 @@ module fpuaddcvt1 (
   // normalization. If sum_corrected is all zeros, the exp_valid is 
   // zero; otherwise, it is one. 
   // modified to 52 bits to detect leading zeroes on denormalized mantissas
-   lz52 lz_norm_1 (ZP_mantissaA, ZV_mantissaA, mantissaA);
-   lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);
+   // lz52 lz_norm_1 (ZP_mantissaA, ZV_mantissaA, mantissaA);
+   // lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);    
+   logic [8:0] i;
+   logic [8:0] j;
+    always_comb begin
+            i = 0;
+            while (~mantissaA[52-i] && $unsigned(i) <= $unsigned(52)) i = i+1;  // search for leading one
+            ZP_mantissaA = i;
+    end
+    always_comb begin
+            j = 0;
+            while (~mantissaB[52-j] && $unsigned(j) <= $unsigned(52)) j = j+1;  // search for leading one
+            ZP_mantissaB = j;
+    end

   // Denormalized exponents created by subtracting the leading zeroes from the original exponents
   assign AddExp1DenormE = AddSwapE ? (exp1 - {6'b0, ZP_mantissaB}) : (exp1 - {6'b0, ZP_mantissaA}); //KEP extended ZP_mantissa 
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@ -38,8 +38,8 @@ module fctrl (
      7'b1001011:   ControlsD = `FCTRLW'b1_0_001_010_00_00_0_0; // fnmsub
      7'b1001111:   ControlsD = `FCTRLW'b1_0_001_011_00_00_0_0; // fnmadd
      7'b1010011: casez(Funct7D)
-                    7'b00000??: ControlsD = `FCTRLW'b1_0_010_000_00_00_0_0; // fadd
-                    7'b00001??: ControlsD = `FCTRLW'b1_0_010_001_00_00_0_0; // fsub
+                    7'b00000??: ControlsD = `FCTRLW'b1_0_001_110_00_00_0_0; // fadd
+                    7'b00001??: ControlsD = `FCTRLW'b1_0_001_111_00_00_0_0; // fsub
                    7'b00010??: ControlsD = `FCTRLW'b1_0_001_100_00_00_0_0; // fmul
                    7'b00011??: ControlsD = `FCTRLW'b1_0_011_000_00_00_1_0; // fdiv
                    7'b01011??: ControlsD = `FCTRLW'b1_0_011_001_00_00_1_0; // fsqrt
--- a/wally-pipelined/src/fpu/fma.sv
+++ b/wally-pipelined/src/fpu/fma.sv
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -76,7 +76,7 @@ module fpu (
 	logic [63:0] 	    FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
 	logic [63:0] 	    FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
 	logic [63:0] 	    FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
-	logic [63:0] 	    FSrcYE;               // Input 2 to the various units (after forwarding)
+	logic [63:0] 	    FPreSrcYE, FSrcYE;               // Input 2 to the various units (after forwarding)
 	logic [63:0] 	    FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
 	
 	// unpacking signals
@ -110,8 +110,8 @@ module fpu (
 	
 	logic [63:0] 	ReadResW;           // read result (load instruction)

-	logic [63:0] 	FAddResM, FAddResW; // add/FP -> FP convert result
-	logic [4:0] 	FAddFlgM, FAddFlgW; // add/FP -> FP convert flags
+	logic [63:0] 	CvtFpResE, CvtFpResM, CvtFpResW; // add/FP -> FP convert result
+	logic [4:0] 	CvtFpFlgE, CvtFpFlgM, CvtFpFlgW; // add/FP -> FP convert flags

 	logic [63:0] 	CvtResE, CvtResM;   // FP <-> int convert result
 	logic [4:0] 	CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this
@ -196,9 +196,10 @@ module fpu (

 	// forwarding muxs
 	mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
-	mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FSrcYE);
+	mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
 	mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
-	mux2  #(64)  fzmulmux(FPreSrcZE, 64'b0, FOpCtrlE[2], FSrcZE); // Force Z to be 0 for multiply instructions
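+	mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, {2'b0, {10{1'b1}}, 52'b0}, {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b001), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b001)}, FSrcYE); // Substitute the constant 1.0 (NaN-boxed single or double, chosen by FmtE) for Y when FOpCtrlE[2:1]==2'b11 and FResultSelE==3'b001 (presumably fadd/fsub routed through the FMA as X*1 + Z -- confirm against fctrl)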
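+	mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE); // Force Z to be 0 for multiply instructions; for FOpCtrlE[2:1]==2'b11 the forwarded Y operand is offered as Z instead (presumably the add/sub addend -- confirm against the mux3 select encoding)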
 	
   
  // unpacking unit
@ -261,11 +262,14 @@ module fpu (
  //    - contains some E/M pipleine registers
  //*** remove uneeded logic
  //*** change to use the unpacking unit if possible
-	faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM, .FSrcXE, .FSrcYE, .FOpCtrlE, 
-   .XSgnM, .YSgnM, .XManM, .YManM, .XExpM, .YExpM,
-   .XSgnE, .YSgnE, .XManE, .YManE, .XExpE, .YExpE, .XDenormE, .YDenormE, .XNormE, .YNormE, .XNormM, .YNormM,  .XZeroE, .YZeroE, .XInfE, .YInfE, .XNaNE, .YNaNE, .XSNaNE, .YSNaNE,
-                  // outputs:
-                  .FAddResM, .FAddFlgM);
+// 	faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM, .FSrcXE, .FSrcYE, .FOpCtrlE, 
+//    .XSgnM, .YSgnM, .XManM, .YManM, .XExpM, .YExpM,
+//    .XSgnE, .YSgnE, .XManE, .YManE, .XExpE, .YExpE, .XDenormE, .YDenormE, .XNormE, .YNormE, .XNormM, .YNormM,  .XZeroE, .YZeroE, .XInfE, .YInfE, .XNaNE, .YNaNE, .XSNaNE, .YSNaNE,
+//                   // outputs:
+//                   .CvtFpResM, .CvtFpFlgM);
+
+
+	cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
 	
 	// compare unit
  //    - computation is done in one stage
@ -323,6 +327,9 @@ module fpu (
 	flopenrc #(64) EMRegSgnRes(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
 	flopenrc #(1) EMRegSgnFlg(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);

+	flopenrc #(64) EMRegCvtFpRes(clk, reset, FlushM, ~StallM, CvtFpResE, CvtFpResM);
+	flopenrc #(5) EMRegCvtFpFlg(clk, reset, FlushM, ~StallM, CvtFpFlgE, CvtFpFlgM);
+	
 	flopenrc #(64) EMRegCvtRes(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
 	flopenrc #(5) EMRegCvtFlg(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
  
@ -352,7 +359,7 @@ module fpu (
 	mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], FSrcXM[`XLEN-1:0], ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);
 	
  // FPU flag selection - to privileged
-	mux5  #(5)  FPUFlgMux(5'b0, FMAFlgM, FAddFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
+	mux5  #(5)  FPUFlgMux(5'b0, FMAFlgM, CvtFpFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
 	


@ -363,7 +370,7 @@ module fpu (
 	////////////////////////////////////////////////////////////////////////////////////////
 	flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
 	flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
-	flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
+	flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, CvtFpResM, CvtFpResW); 
 	flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
 	flopenrc #(6)  MWCtrlReg(clk, reset, FlushW, ~StallW,
 				{FRegWriteM, FResultSelM, FmtM, FWriteIntM},
@ -382,7 +389,7 @@ module fpu (
 	mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);

  // select the result to be written to the FP register
-	mux5  #(64)  FPUResultMux(ReadResW, FMAResW, FAddResW, FDivResW, FResW, FResultSelW, FPUResultW);
+	mux5  #(64)  FPUResultMux(ReadResW, FMAResW, CvtFpResW, FDivResW, FResW, FResultSelW, FPUResultW);
 	
 	
  end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low