Handle special case Int Div/Rem of |A| < |B| in a single cycle

2025-02-11 06:05:49 +00:00 · 2023-01-01 13:54:01 -08:00 · 2023-01-01 13:54:01 -08:00 · 499b52a7f0
commit 499b52a7f0
parent c653f1b30f
6 changed files with 26 additions and 20 deletions
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -24,7 +24,7 @@
 // division constants
 `define RADIX       32'h2
-`define DIVCOPIES   32'h2
+`define DIVCOPIES   32'h1
 // Memory synthesis configuration
 `define USE_SRAM 0
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -68,20 +68,20 @@ module fdivsqrt(
  logic DivStartE;                    // Enable signal for flops during stall
  // Integer div/rem signals
-  logic AZeroE, BZeroE;               // Numerator/Denominator is zero (Execute) 
+  logic BZeroE, BZeroM;               // Denominator is zero
-  logic AZeroM, BZeroM;               // Numerator/Denominator is zero (Memory) 
  logic MDUM;                         // Integer operation
  logic [`DIVBLEN:0] nE, nM, mM;      // Shift amounts
  logic NegQuotM, ALTBM, AsM, W64M;   // Special handling for postprocessor
  logic [`XLEN-1:0] AM;               // Original Numerator for postprocessor
+  logic ISpecialCaseE;                // Integer div/remainder special cases
  fdivsqrtpreproc fdivsqrtpreproc(                        // Preprocessor
    .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE), 
    .Fmt(FmtE), .Sqrt(SqrtE), .XZeroE, .Funct3E, 
    .QeM, .X, .DPreproc, 
    // Int-specific 
-    .ForwardedSrcAE, .ForwardedSrcBE, .MDUE, .W64E, 
+    .ForwardedSrcAE, .ForwardedSrcBE, .MDUE, .W64E, .ISpecialCaseE,
-    .AZeroE, .BZeroE, .nE, .AZeroM, .BZeroM, .nM, .mM, .AM, 
+    .BZeroE, .nE, .BZeroM, .nM, .mM, .AM, 
    .MDUM, .W64M, .NegQuotM, .ALTBM, .AsM);
  fdivsqrtfsm fdivsqrtfsm(                                // FSM
@@ -89,7 +89,7 @@ module fdivsqrt(
    .FDivStartE, .XsE, .SqrtE, .WZeroE, .FlushE, .StallM, 
    .FDivBusyE, .IFDivStartE, .FDivDoneE, .SpecialCaseM, 
    // Int-specific 
-    .IDivStartE, .AZeroE, .BZeroE, .nE, .MDUE);
+    .IDivStartE, .BZeroE, .ISpecialCaseE, .nE, .MDUE);
  fdivsqrtiter fdivsqrtiter(                              // CSA Iterator
    .clk, .IFDivStartE, .FDivBusyE, .SqrtE, .X, .DPreproc, 
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -36,7 +36,7 @@ module fdivsqrtfsm(
  input  logic [`FMTBITS-1:0] FmtE,
  input  logic XInfE, YInfE, 
  input  logic XZeroE, YZeroE, 
-  input  logic AZeroE, BZeroE,
+  input  logic BZeroE,
  input  logic XNaNE, YNaNE, 
  input  logic FDivStartE, IDivStartE,
  input  logic XsE,
@@ -46,6 +46,7 @@ module fdivsqrtfsm(
  input  logic WZeroE,
  input  logic MDUE,
  input  logic [`DIVBLEN:0] nE,
+  input  logic ISpecialCaseE,
  output logic IFDivStartE,
  output logic FDivBusyE, FDivDoneE,
  output logic SpecialCaseM
@@ -65,7 +66,7 @@ module fdivsqrtfsm(
  // terminate immediately on special cases
  assign FSpecialCaseE = XZeroE | (YZeroE&~SqrtE) | XInfE | YInfE | XNaNE | YNaNE | (XsE&SqrtE);
-  if (`IDIV_ON_FPU) assign SpecialCaseE = MDUE ? BZeroE : FSpecialCaseE;
+  if (`IDIV_ON_FPU) assign SpecialCaseE = MDUE ? ISpecialCaseE : FSpecialCaseE;
  else              assign SpecialCaseE = FSpecialCaseE;
  flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -99,7 +99,7 @@ module fdivsqrtpostproc(
  mux2 #(`DIVb+1) preqmmux(FirstU, FirstUM, NegStickyM, PreQmM); // Select U or U-1 depending on negative sticky bit
  mux2 #(`DIVb+1)    qmmux(PreQmM, (PreQmM << 1), SqrtM, QmM);
-  if (`IDIV_ON_FPU) begin // Int supported
+  if (`IDIV_ON_FPU) begin:intpostproc // Int supported
    logic [`DIVBLEN:0] NormShiftM;
    logic [`DIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM;
@@ -121,18 +121,19 @@ module fdivsqrtpostproc(
        NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR)));
        PreResultM = NormQuotM;
      end
-      PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
+      PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);  // *** rename to PreIntResultM?
    end
    // special case logic
    // terminates immediately when B is Zero (div 0) or |A| has more leading 0s than |B|
    always_comb
      if (BZeroM) begin         // Divide by zero
-        if (RemOpM) SpecialFPIntDivResultM = AM;
+        if (RemOpM) SpecialFPIntDivResultM = AM;  // *** rename to IntDivResult?
        else        SpecialFPIntDivResultM = {(`XLEN){1'b1}};
-      end else if (ALTBM) begin // Numerator is zero
+     end else if (ALTBM) begin // Numerator is zero
        if (RemOpM) SpecialFPIntDivResultM = AM;
        else        SpecialFPIntDivResultM = '0;
-      end else      SpecialFPIntDivResultM = PreFPIntDivResultM[`XLEN-1:0];
+     end else      SpecialFPIntDivResultM = PreFPIntDivResultM[`XLEN-1:0];
    // sign extend result for W64
    if (`XLEN==64) begin
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -45,9 +45,10 @@ module fdivsqrtpreproc (
  // Int-specific
  input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
 	input  logic MDUE, W64E,
+  output logic ISpecialCaseE,
  output logic [`DIVBLEN:0] nE, nM, mM,
  output logic NegQuotM, ALTBM, MDUM, W64M,
-  output logic AsM, AZeroM, BZeroM, AZeroE, BZeroE,
+  output logic AsM, BZeroM, BZeroE,
  output logic [`XLEN-1:0] AM
 );
@@ -58,8 +59,9 @@ module fdivsqrtpreproc (
  logic  [`DIVb-1:0] IFNormLenX, IFNormLenD;  // Correctly-sized inputs for iterator
  logic  [`DIVBLEN:0] mE, ell;                // Leading zeros of inputs
  logic  NumerZeroE;                          // Numerator is zero (X or A)
+  logic  AZeroE;                              // A is Zero for integer division
-  if (`IDIV_ON_FPU) begin // Int Supported
+  if (`IDIV_ON_FPU) begin:intpreproc // Int Supported
    logic signedDiv, NegQuotE;
    logic AsBit, BsBit, AsE, BsE, ALTBE;
    logic [`XLEN-1:0] AE, BE, PosA, PosB;
@@ -98,8 +100,11 @@ module fdivsqrtpreproc (
    // calculate number of fractional bits p
    assign ZeroDiff = mE - ell;         // Difference in number of leading zeros
-    assign ALTBE = ZeroDiff[`DIVBLEN];  // A less than B?
+    assign ALTBE = ZeroDiff[`DIVBLEN];  // A less than B (A has more leading zeros)
-    mux2 #(`DIVBLEN+1) pmux(ZeroDiff, 0, ALTBE, p);                         
+    mux2 #(`DIVBLEN+1) pmux(ZeroDiff, {(`DIVBLEN+1){1'b0}}, ALTBE, p);            // *** is there a more graceful way to write these constants    
+    // Integer special cases (terminate immediately)
+    assign ISpecialCaseE = BZeroE | ALTBE;
  /* verilator lint_off WIDTH */
    // calculate number of fractional digits nE and right shift amount RightShiftX to complete in discrete number of steps
@@ -113,7 +118,7 @@ module fdivsqrtpreproc (
      assign IntSteps = (TotalIntBits >> `LOGRK) + |IntTrunc;     // Number of steps for int div
      assign nE = (IntSteps * `DIVCOPIES) - 1;                    // Fractional digits
      assign RightShiftX = `RK - 1 - ((TotalIntBits - 1) % `RK);  // Right shift amount
-      assign DivXShifted = DivX >> RightShiftX;                   // shift X to complete in nE steps
+      assign DivXShifted = DivX >> RightShiftX;                   // shift X by up to R*K-1 to complete in nE steps
    end else begin // radix 2 1 copy doesn't require shifting
      assign nE = p; 
      assign DivXShifted = DivX;
@@ -129,7 +134,6 @@ module fdivsqrtpreproc (
    flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
    flopen #(1)       altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
    flopen #(1)    negquotreg(clk, IFDivStartE, NegQuotE, NegQuotM);
-    flopen #(1)      azeroreg(clk, IFDivStartE, AZeroE,   AZeroM);
    flopen #(1)      bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
    flopen #(1)      asignreg(clk, IFDivStartE, AsE,      AsM);
    flopen #(`DIVBLEN+1) nreg(clk, IFDivStartE, nE,       nM);
--- a/pipelined/src/ieu/forward.sv
+++ b/pipelined/src/ieu/forward.sv
@@ -60,6 +60,6 @@ module forward(
  assign MatchDE = ((Rs1D == RdE) | (Rs2D == RdE)) & (RdE != 5'b0); // Decode-stage instruction source depends on result from execute stage instruction
  assign FCvtIntStallD = FCvtIntE & MatchDE; // FPU to Integer transfers have single-cycle latency except fcvt
  assign LoadStallD = (MemReadE|SCE) & MatchDE;  
-  assign MDUStallD = MDUE & MatchDE; 
+  assign MDUStallD = MDUE & MatchDE; // Int mult/div is at least two cycle latency, even when coming from the FDIV
  assign CSRRdStallD = CSRReadE & MatchDE;
 endmodule