Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally

2025-02-11 06:05:49 +00:00 · 2022-07-13 17:36:56 +00:00 · 2022-07-13 17:36:56 +00:00 · e9ce71ca20
commit e9ce71ca20
parent 8d5081e8e9 b45b3baec2
13 changed files with 51 additions and 49 deletions
--- a/pipelined/config/rv64fp/wally-config.vh
+++ b/pipelined/config/rv64fp/wally-config.vh
@ -32,7 +32,7 @@
 `define DESIGN_COMPILER 0

 // RV32 or RV64: XLEN = 32 or 64
-`define XLEN 32
+`define XLEN 64

 // IEEE 754 compliance
 `define IEEE754 0
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@ -101,14 +101,15 @@
 `define CORRSHIFTSZ ((`DIVLEN+`NF+3) > (3*`NF+8) ? (`DIVLEN+`NF+3) : (3*`NF+6))

 // division constants
-`define RADIX 4
-`define DIVCOPIES 4
+`define RADIX 32'h4
+`define DIVCOPIES 32'h4
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF))
 `define DIVRESLEN ((`NF>`XLEN) ? `DIVLEN+2 : `DIVLEN)
-`define LOGR ((`RADIX==2) ? 1 : 2)
-`define FPDUR $ceil($itor(`DIVRESLEN)/$itor(`LOGR*`DIVCOPIES))
-`define DURLEN ($clog2($rtoi(`FPDUR)+1))
-`define QLEN ($rtoi(`FPDUR)*`LOGR*`DIVCOPIES)
+`define LOGR ((`RADIX==2) ? 32'h1 : 32'h2)
+// FPDUR = ceil(DIVRESLEN/(LOGR*DIVCOPIES))
+`define FPDUR ((`DIVRESLEN+(`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES))
+`define DURLEN ($clog2(`FPDUR+1))
+`define QLEN (`FPDUR*`LOGR*`DIVCOPIES)


 `define USE_SRAM 0
--- a/pipelined/regression/wave-fpu.do
+++ b/pipelined/regression/wave-fpu.do
@ -24,9 +24,6 @@ add wave -group {Divide} -noupdate /testbenchfp/srtradix4/*
 add wave -group {Divide} -group inter0 -noupdate /testbenchfp/srtradix4/genblk1[0]/divinteration/*
 add wave -group {Divide} -group inter0 -noupdate /testbenchfp/srtradix4/genblk1[0]/divinteration/qsel4/*
 add wave -group {Divide} -group inter0 -noupdate /testbenchfp/srtradix4/genblk1[0]/divinteration/otfc4/*
-add wave -group {Divide} -group inter1 -noupdate /testbenchfp/srtradix4/genblk1[1]/divinteration/*
-add wave -group {Divide} -group inter2 -noupdate /testbenchfp/srtradix4/genblk1[2]/divinteration/*
-add wave -group {Divide} -group inter3 -noupdate /testbenchfp/srtradix4/genblk1[3]/divinteration/*
 add wave -group {Divide} -noupdate /testbenchfp/srtpreproc/*
 add wave -group {Divide} -noupdate /testbenchfp/srtradix4/expcalc/*
 add wave -group {Divide} -noupdate /testbenchfp/srtfsm/*
--- a/pipelined/src/fpu/divsqrt.sv
+++ b/pipelined/src/fpu/divsqrt.sv
@ -52,7 +52,7 @@ module divsqrt(
 //   output logic [`XLEN-1:0] RemM,
 );

-  logic [`DIVLEN+3:0]  WSN, WCN;
+  logic [`DIVLEN+3:0]  NextWSN, NextWCN;
  logic [`DIVLEN+3:0]  WS, WC;
  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
  logic [`DIVLEN-1:0] X;
@ -61,8 +61,8 @@ module divsqrt(

  srtpreproc srtpreproc(.XManE, .Dur, .YManE,.X,.Dpreproc, .XZeroCnt, .YZeroCnt);

-  srtfsm srtfsm(.reset, .WSN, .WCN, .WS, .WC, .Dur, .DivBusy, .clk, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, .DivStickyE(DivStickyM), .XNaNE, .YNaNE,
+  srtfsm srtfsm(.reset, .NextWSN, .NextWCN, .WS, .WC, .Dur, .DivBusy, .clk, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, .DivStickyE(DivStickyM), .XNaNE, .YNaNE,
                .XInfE, .YInfE, .DivNegStickyE(DivNegStickyM), .EarlyTermShiftE(EarlyTermShiftM));
-  srtradix4 srtradix4(.clk, .FmtE, .X,.Dpreproc, .XZeroCnt, .YZeroCnt, .FirstWS(WS), .FirstWC(WC), .WSN, .WCN, .DivStart(DivStartE), .XExpE, .YExpE, .XZeroE, .YZeroE,
+  srtradix4 srtradix4(.clk, .FmtE, .X,.Dpreproc, .XZeroCnt, .YZeroCnt, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart(DivStartE), .XExpE, .YExpE, .XZeroE, .YZeroE,
                .DivBusy, .Quot(QuotM), .Rem(), .DivCalcExpM);
 endmodule
--- a/pipelined/src/fpu/fcvt.sv
+++ b/pipelined/src/fpu/fcvt.sv
@ -68,7 +68,8 @@ module fcvt (
    logic                   Signed;     // is the opperation with a signed integer?
    logic                   Int64;      // is the integer 64 bits?
    logic                   IntToFp;       // is the opperation an int->fp conversion?
-    logic [`LOGCVTLEN-1:0] LeadingZeros; // output from the LZC
+    logic [`CVTLEN:0]       LzcInFull;      // input to the Leading Zero Counter (priority encoder)
+    logic [`LOGCVTLEN-1:0]  LeadingZeros; // output from the LZC


    // seperate OpCtrl for code readability
@ -102,10 +103,11 @@ module fcvt (
    // choose the input to the leading zero counter i.e. priority encoder
    //             int -> fp : | positive integer | 00000... (if needed) | 
    //             fp  -> fp : | fraction         | 00000... (if needed) | 
-    assign LzcIn = IntToFp ? {TrimInt, {`CVTLEN-`XLEN{1'b0}}} :
-                             {Xm[`NF-1:0], {`CVTLEN-`NF{1'b0}}};
+    assign LzcInFull = IntToFp ? {1'b0, TrimInt, {`CVTLEN-`XLEN{1'b0}}} :
+                             {Xm, {`CVTLEN-`NF{1'b0}}};
+    assign LzcIn = LzcInFull[`CVTLEN-1:0];
    
-    lzc #(`CVTLEN) lzc (.num(LzcIn), .ZeroCnt(LeadingZeros));
+    lzc #(`CVTLEN+1) lzc (.num(LzcInFull), .ZeroCnt(LeadingZeros));

    ///////////////////////////////////////////////////////////////////////////
    // shifter
@ -119,13 +121,13 @@ module fcvt (
    //      denormalized/undeflowed result fp -> fp:
    //          - shift left by NF-1+CalcExp - to shift till the biased expoenent is 0
    //      ??? -> fp: 
-    //          - shift left by LeadingZeros+1 - to shift till the result is normalized
+    //          - shift left by LeadingZeros - to shift till the result is normalized
    //              - only shift fp -> fp if the intital value is denormalized
    //                  - this is a problem because the input to the lzc was the fraction rather than the mantissa
    //                  - rather have a few and-gates than an extra bit in the priority encoder??? *** is this true?
    assign ShiftAmt = ToInt ? Ce[`LOGCVTLEN-1:0]&{`LOGCVTLEN{~Ce[`NE]}} :
                    ResDenormUf&~IntToFp ? (`LOGCVTLEN)'(`NF-1)+Ce[`LOGCVTLEN-1:0] : 
-                              (LeadingZeros+1)&{`LOGCVTLEN{XDenorm|IntToFp}};
+                              (LeadingZeros)&{`LOGCVTLEN{XDenorm|IntToFp}};
    
    ///////////////////////////////////////////////////////////////////////////
    // exp calculations
@ -197,14 +199,14 @@ module fcvt (
    //                  |  0's |     Mantissa      |      0's if nessisary     |
    //                  |     keep        |
    //
-    //              - if the input is denormalized then we dont shift... so the  "- (LeadingZeros+1)" is just leftovers from other options
-    //      int -> fp : largest bias +  XLEN - Largest bias + new bias - 1 - LeadingZeros = XLEN + NewBias - 1 - LeadingZeros
+    //              - if the input is denormalized then we don't shift... so the "- LeadingZeros" is just left over from the other options
+    //      int -> fp : largest bias +  XLEN - Largest bias + new bias - LeadingZeros = XLEN + NewBias - LeadingZeros
    //              Process:
    //                  - shifted right by XLEN (XLEN)
-    //                  - shift left to normilize (-1-LeadingZeros)
+    //                  - shift left to normalize (-LeadingZeros)
    //                  - newBias to make the biased exponent
-    //          oldexp - biasold +newbias - (LeadingZeros+1)&(XDenorm|IntToFp)
-    assign Ce = {1'b0, OldExp} - (`NE+1)'(`BIAS) + {2'b0, NewBias} - {{`NE{1'b0}}, XDenorm|IntToFp} - {{`NE-`LOGCVTLEN+1{1'b0}}, (LeadingZeros&{`LOGCVTLEN{XDenorm|IntToFp}})};
+    //          oldexp - biasold +newbias - LeadingZeros&(XDenorm|IntToFp)
+    assign Ce = {1'b0, OldExp} - (`NE+1)'(`BIAS) + {2'b0, NewBias} - {{`NE-`LOGCVTLEN+1{1'b0}}, (LeadingZeros&{`LOGCVTLEN{XDenorm|IntToFp}})};
    // find if the result is dnormal or underflows
    //      - if Calculated expoenent is 0 or negitive (and the input/result is not exactaly 0)
    //      - can't underflow an integer to Fp conversion
--- a/pipelined/src/fpu/fmashiftcalc.sv
+++ b/pipelined/src/fpu/fmashiftcalc.sv
@ -53,7 +53,8 @@ module fmashiftcalc(
    assign FmaSZero = ~(|FmaSm);

    // calculate the sum's exponent
-    assign NormSumExp = FmaKillProd ? {2'b0, Ze[`NE-1:1], Ze[0]&~ZDenorm} : FmaPe + -{{`NE+2-$unsigned($clog2(3*`NF+7)){1'b0}}, FmaNCnt} - 1 + (`NE+2)'(`NF+4);
+    //                                                                      ProdExp - NormCnt - 1 + NF+4 = ProdExp + ~NormCnt + 1 - 1 + NF+4 = ProdExp + ~NormCnt + NF+4
+    assign NormSumExp = FmaKillProd ? {2'b0, Ze[`NE-1:1], Ze[0]&~ZDenorm} : FmaPe + {{`NE+2-$unsigned($clog2(3*`NF+7)){1'b1}}, ~FmaNCnt} + (`NE+2)'(`NF+4);

    //convert the sum's exponent into the proper percision
    if (`FPSIZES == 1) begin
--- a/pipelined/src/fpu/postprocess.sv
+++ b/pipelined/src/fpu/postprocess.sv
@ -29,7 +29,7 @@

 `include "wally-config.vh"

-module postprocess(
+module postprocess (
    // general signals
    input logic                             Xs, Ys,  // input signs
    input logic  [`NE-1:0]                  Ze, // input exponents
--- a/pipelined/src/fpu/srt-radix4.sv
+++ b/pipelined/src/fpu/srt-radix4.sv
@ -41,7 +41,7 @@ module srtradix4(
  input logic [`DIVLEN-1:0] Dpreproc,
  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
  output logic [`QLEN-1:0] Quot,
-  output logic [`DIVLEN+3:0]  WSN, WCN,
+  output logic [`DIVLEN+3:0]  NextWSN, NextWCN,
  output logic [`DIVLEN+3:0]  FirstWS, FirstWC,
  output logic  [`NE+1:0] DivCalcExpM,
  output logic [`XLEN-1:0] Rem
@ -58,11 +58,12 @@ module srtradix4(
  logic [`QLEN-1:0] QNext[`DIVCOPIES-1:0];
  logic [`QLEN-1:0] QMNext[`DIVCOPIES-1:0];
 /* verilator lint_on UNOPTFLAT */
+  logic [`DIVLEN+3:0]  WSN, WCN;
  logic [`DIVLEN+3:0]  D, DBar, D2, DBar2;
  logic [`NE+1:0] DivCalcExp;
  logic [$clog2(`XLEN+1)-1:0] intExp;
  logic           intSign;
-  logic [`QLEN-1:0] QMux, QMMux;
+  logic [`QLEN-1:0] QMMux;

  // Top Muxes and Registers
  // When start is asserted, the inputs are loaded into the divider.
@ -72,9 +73,11 @@ module srtradix4(
  //  - otherwise load WSA into the flipflop
  //  - the assumed one is added to D since it's always normalized (and X/0 is a special case handeled by result selection)
  //  - XZeroE is used as the assumed one to avoid creating a sticky bit - all other numbers are normalized
-  mux2   #(`DIVLEN+4) wsmux({WSA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0}, {3'b000, ~XZeroE, X}, DivStart, WSN);
+  assign NextWSN = {WSA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0};
+  assign NextWCN = {WCA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0};
+  mux2   #(`DIVLEN+4) wsmux(NextWSN, {3'b000, ~XZeroE, X}, DivStart, WSN);
  flop   #(`DIVLEN+4) wsflop(clk, WSN, WS[0]);
-  mux2   #(`DIVLEN+4) wcmux({WCA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0}, {`DIVLEN+4{1'b0}}, DivStart, WCN);
+  mux2   #(`DIVLEN+4) wcmux(NextWCN, {`DIVLEN+4{1'b0}}, DivStart, WCN);
  flop   #(`DIVLEN+4) wcflop(clk, WCN, WC[0]);
  flopen #(`DIVLEN+4) dflop(clk, DivStart, {4'b0001, Dpreproc}, D);
  flopen #(`NE+2) expflop(clk, DivStart, DivCalcExp, DivCalcExpM);
@ -88,10 +91,10 @@ module srtradix4(

  genvar i;
  generate
-    for(i=0; i<`DIVCOPIES; i++) begin
+    for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin
      divinteration divinteration(.clk, .DivStart, .DivBusy, .D, .DBar, .D2, .DBar2, 
      .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), .Q(Q[i]), .QM(QM[i]), .QNext(QNext[i]), .QMNext(QMNext[i]));
-      if(i<3) begin 
+      if(i<(`DIVCOPIES-1)) begin 
        assign WS[i+1] = {WSA[i][`DIVLEN+1:0], 2'b0};
        assign WC[i+1] = {WCA[i][`DIVLEN+1:0], 2'b0};
        assign Q[i+1] = QNext[i];
@ -101,9 +104,8 @@ module srtradix4(
  endgenerate

  // if starting a new divison set Q to 0 and QM to -1
-  mux2 #(`QLEN) Qmux(QNext[`DIVCOPIES-1], {`QLEN{1'b0}}, DivStart, QMux);
  mux2 #(`QLEN) QMmux(QMNext[`DIVCOPIES-1], {`QLEN{1'b1}}, DivStart, QMMux);
-  flopen #(`QLEN) Qreg(clk, DivBusy|DivStart, QMux, Q[0]); // *** have to connect Quot directly to M stage
+  flopenr #(`QLEN) Qreg(clk, DivStart, DivBusy, QNext[`DIVCOPIES-1], Q[0]);
  flop #(`QLEN) QMreg(clk, QMMux, QM[0]);

  assign Quot = Q[0];
@ -181,7 +183,7 @@ module qsel4 (

 	logic [3:0] QSel4[1023:0];

-  initial begin 
+  always_comb begin 
    integer d, w, i, w2;
    for(d=0; d<8; d++)
      for(w=0; w<128; w++)begin
@ -270,9 +272,9 @@ module otfc4 (
 		// else if 	q = -2	Q = {QM, 10} 	QM = {QM, 01}
    // *** how does the 0 concatination numbers work?

+  assign QR  = Q[`QLEN-3:0];
+  assign QMR = QM[`QLEN-3:0];     // Shifted Q and QM
  always_comb begin
-    QR  = Q[`QLEN-3:0];
-    QMR = QM[`QLEN-3:0];     // Shift Q and QM
    if (q[3]) begin // +2
      QNext  = {QR,  2'b10};
      QMNext = {QR,  2'b01};
@ -352,5 +354,5 @@ module expcalc(
            endcase
    end
    // correct exponent for denormalized input's normalization shifts
-    assign DivCalcExp = ({2'b0, XExpE} - {{`NE+1-$clog2(`NF+2){1'b0}}, XZeroCnt} - {2'b0, YExpE} + {{`NE+1-$clog2(`NF+2){1'b0}}, YZeroCnt} + {3'b0, Bias})&{`NE+2{~XZeroE}};
+    assign DivCalcExp = ({2'b0, XExpE} - {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, XZeroCnt} - {2'b0, YExpE} + {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, YZeroCnt} + {3'b0, Bias})&{`NE+2{~XZeroE}};
    endmodule
--- a/pipelined/src/fpu/srtfsm.sv
+++ b/pipelined/src/fpu/srtfsm.sv
@ -33,7 +33,7 @@
 module srtfsm(
  input  logic clk, 
  input  logic reset, 
-  input logic [`DIVLEN+3:0] WSN, WCN, WS, WC,
+  input logic [`DIVLEN+3:0] NextWSN, NextWCN, WS, WC,
  input  logic XInfE, YInfE, 
  input  logic XZeroE, YZeroE, 
  input  logic XNaNE, YNaNE, 
@ -58,8 +58,8 @@ module srtfsm(

  //flopen #($clog2(`DIVLEN/2+3)) durflop(clk, DivStart, CalcDur, Dur);
  assign DivBusy = (state == BUSY);
-  assign WZero = ((WSN^WCN)=={WSN[`DIVLEN+2:0]|WCN[`DIVLEN+2:0], 1'b0});
-  assign DivStickyE = ~WZero;
+  assign WZero = ((NextWSN^NextWCN)=={NextWSN[`DIVLEN+2:0]|NextWCN[`DIVLEN+2:0], 1'b0});
+  assign DivStickyE = |W;
  assign DivDone = (state == DONE);
  assign W = WC+WS;
  assign DivNegStickyE = W[`DIVLEN+3]; //*** is there a better way to do this???
--- a/pipelined/src/fpu/srtpreproc.sv
+++ b/pipelined/src/fpu/srtpreproc.sv
@ -63,8 +63,7 @@ module srtpreproc (
  
  assign X = PreprocX;
  assign Dpreproc = PreprocY;
-  
-  assign Dur = (`DURLEN)'($rtoi(`FPDUR));
+  assign Dur = (`DURLEN)'(`FPDUR);
  // assign intExp = zeroCntB - zeroCntA + 1;
  // assign intSign = Signed & (SrcA[`XLEN - 1] ^ SrcB[`XLEN - 1]);

--- a/pipelined/src/generic/lzc.sv
+++ b/pipelined/src/generic/lzc.sv
@ -34,7 +34,7 @@ module lzc #(parameter WIDTH = 1) (
 /* verilator lint_off CMPCONST */
 /* verilator lint_off WIDTH */
    
-    int i;
+    logic [31:0] i;
    always_comb begin
        i = 0;
        while (~num[WIDTH-1-i] & (i < WIDTH)) i = i+1;  // search for leading one
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@ -87,8 +87,8 @@ module testbenchfp;
  logic reset = 1'b0;
  logic [`DIVLEN-1:0]    DivX;
  logic [`DIVLEN-1:0]  Dpreproc;
-  logic [`DIVLEN+3:0]  WSN, WS;
-  logic [`DIVLEN+3:0]  WCN, WC;
+  logic [`DIVLEN+3:0]  NextWSN, WS;
+  logic [`DIVLEN+3:0]  NextWCN, WC;
  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
  logic [`DURLEN-1:0] Dur;

@ -696,9 +696,9 @@ module testbenchfp;
              .XManE(XMan), .YManE(YMan), .XZeroE(XZero), .YZeroE(YZero), .CmpIntResE(CmpRes),
              .XNaNE(XNaN), .YNaNE(YNaN), .XSNaNE(XSNaN), .YSNaNE(YSNaN), .FSrcXE(X), .FSrcYE(Y), .CmpNVE(CmpFlg[4]), .CmpFpResE(FpCmpRes));
  srtpreproc srtpreproc(.XManE(XMan), .Dur, .YManE(YMan),.X(DivX),.Dpreproc, .XZeroCnt, .YZeroCnt);
-  srtfsm srtfsm(.reset, .WSN, .WCN, .WS, .WC, .Dur, .DivBusy, .DivDone, .clk, .DivStart, .StallM(1'b0), .StallE(1'b0), .XZeroE(XZero), .YZeroE(YZero), .DivStickyE(DivSticky), .XNaNE(XNaN), .YNaNE(YNaN),
+  srtfsm srtfsm(.reset, .NextWSN, .NextWCN, .WS, .WC, .Dur, .DivBusy, .DivDone, .clk, .DivStart, .StallM(1'b0), .StallE(1'b0), .XZeroE(XZero), .YZeroE(YZero), .DivStickyE(DivSticky), .XNaNE(XNaN), .YNaNE(YNaN),
                .XInfE(XInf), .YInfE(YInf), .DivNegStickyE(DivNegSticky), .EarlyTermShiftE(EarlyTermShift));
-  srtradix4 srtradix4(.clk, .FmtE(ModFmt), .X(DivX),.Dpreproc, .DivBusy, .XZeroCnt, .YZeroCnt, .FirstWS(WS), .FirstWC(WC), .WSN, .WCN, .DivStart, .XExpE(XExp), .YExpE(YExp), .XZeroE(XZero), .YZeroE(YZero),
+  srtradix4 srtradix4(.clk, .FmtE(ModFmt), .X(DivX),.Dpreproc, .DivBusy, .XZeroCnt, .YZeroCnt, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart, .XExpE(XExp), .YExpE(YExp), .XZeroE(XZero), .YZeroE(YZero),
                .Quot, .Rem(), .DivCalcExpM(DivCalcExp));

  assign CmpFlg[3:0] = 0;
--- a/synthDC/scripts/synth.tcl
+++ b/synthDC/scripts/synth.tcl
@ -347,7 +347,7 @@ redirect -append $filename { report_timing -capacitance -transition_time -nets -
 redirect -append $filename { echo "\n\n\n//// Critical paths through fma2 ////\n\n\n" }
 redirect -append $filename { report_timing -capacitance -transition_time -nets -through {postprocess/*} -nworst 1 }
 redirect -append $filename { echo "\n\n\n//// Critical paths through fpdiv ////\n\n\n" }
-redirect -append $filename { report_timing -capacitance -transition_time -nets -through {fdivsqrt/*} -nworst 1 }
+redirect -append $filename { report_timing -capacitance -transition_time -nets -through {divsqrt/*} -nworst 1 }
 redirect -append $filename { echo "\n\n\n//// Critical paths through fcvt ////\n\n\n" }
 redirect -append $filename { report_timing -capacitance -transition_time -nets -through {fcvt/*} -nworst 1 }