From f8af51e07be4c2ae7c7457693957dbe199e1e5f9 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Sun, 1 Jan 2023 13:54:01 -0800
Subject: [PATCH] Handle special case Int Div/Rem of |A| < |B| in a single
 cycle

---
 pipelined/config/shared/wally-shared.vh        |  2 +-
 pipelined/src/fpu/fdivsqrt/fdivsqrt.sv         | 10 +++++-----
 pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv      |  5 +++--
 pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 11 ++++++-----
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 16 ++++++++++------
 pipelined/src/ieu/forward.sv                   |  2 +-
 6 files changed, 26 insertions(+), 20 deletions(-)

diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index e79e5561..7d54f50b 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -24,7 +24,7 @@
 
 // division constants
 `define RADIX       32'h2
-`define DIVCOPIES   32'h2
+`define DIVCOPIES   32'h1
 
 // Memory synthesis configuration
 `define USE_SRAM 0
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
index de51eeab..dc07078f 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -68,20 +68,20 @@ module fdivsqrt(
   logic DivStartE;                    // Enable signal for flops during stall
 
   // Integer div/rem signals
-  logic AZeroE, BZeroE;               // Numerator/Denominator is zero (Execute) 
-  logic AZeroM, BZeroM;               // Numerator/Denominator is zero (Memory) 
+  logic BZeroE, BZeroM;               // Denominator is zero
   logic MDUM;                         // Integer operation
   logic [`DIVBLEN:0] nE, nM, mM;      // Shift amounts
   logic NegQuotM, ALTBM, AsM, W64M;   // Special handling for postprocessor
   logic [`XLEN-1:0] AM;               // Original Numerator for postprocessor
+  logic ISpecialCaseE;                // Integer div/remainder special cases
 
   fdivsqrtpreproc fdivsqrtpreproc(                        // Preprocessor
     .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE), 
     .Fmt(FmtE), .Sqrt(SqrtE), .XZeroE, .Funct3E, 
     .QeM, .X, .DPreproc, 
     // Int-specific 
-    .ForwardedSrcAE, .ForwardedSrcBE, .MDUE, .W64E, 
-    .AZeroE, .BZeroE, .nE, .AZeroM, .BZeroM, .nM, .mM, .AM, 
+    .ForwardedSrcAE, .ForwardedSrcBE, .MDUE, .W64E, .ISpecialCaseE,
+    .BZeroE, .nE, .BZeroM, .nM, .mM, .AM, 
     .MDUM, .W64M, .NegQuotM, .ALTBM, .AsM);
 
   fdivsqrtfsm fdivsqrtfsm(                                // FSM
@@ -89,7 +89,7 @@ module fdivsqrt(
     .FDivStartE, .XsE, .SqrtE, .WZeroE, .FlushE, .StallM, 
     .FDivBusyE, .IFDivStartE, .FDivDoneE, .SpecialCaseM, 
     // Int-specific 
-    .IDivStartE, .AZeroE, .BZeroE, .nE, .MDUE);
+    .IDivStartE, .BZeroE, .ISpecialCaseE, .nE, .MDUE);
 
   fdivsqrtiter fdivsqrtiter(                              // CSA Iterator
     .clk, .IFDivStartE, .FDivBusyE, .SqrtE, .X, .DPreproc, 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index 9e42cadb..6c1348d6 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -36,7 +36,7 @@ module fdivsqrtfsm(
   input  logic [`FMTBITS-1:0] FmtE,
   input  logic XInfE, YInfE, 
   input  logic XZeroE, YZeroE, 
-  input  logic AZeroE, BZeroE,
+  input  logic BZeroE,
   input  logic XNaNE, YNaNE, 
   input  logic FDivStartE, IDivStartE,
   input  logic XsE,
@@ -46,6 +46,7 @@ module fdivsqrtfsm(
   input  logic WZeroE,
   input  logic MDUE,
   input  logic [`DIVBLEN:0] nE,
+  input  logic ISpecialCaseE,
   output logic IFDivStartE,
   output logic FDivBusyE, FDivDoneE,
   output logic SpecialCaseM
@@ -65,7 +66,7 @@ module fdivsqrtfsm(
 
   // terminate immediately on special cases
   assign FSpecialCaseE = XZeroE | (YZeroE&~SqrtE) | XInfE | YInfE | XNaNE | YNaNE | (XsE&SqrtE);
-  if (`IDIV_ON_FPU) assign SpecialCaseE = MDUE ? BZeroE : FSpecialCaseE;
+  if (`IDIV_ON_FPU) assign SpecialCaseE = MDUE ? ISpecialCaseE : FSpecialCaseE;
   else              assign SpecialCaseE = FSpecialCaseE;
   flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 75d6a323..e1012120 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -99,7 +99,7 @@ module fdivsqrtpostproc(
   mux2 #(`DIVb+1) preqmmux(FirstU, FirstUM, NegStickyM, PreQmM); // Select U or U-1 depending on negative sticky bit
   mux2 #(`DIVb+1)    qmmux(PreQmM, (PreQmM << 1), SqrtM, QmM);
 
-  if (`IDIV_ON_FPU) begin // Int supported
+  if (`IDIV_ON_FPU) begin:intpostproc // Int supported
     logic [`DIVBLEN:0] NormShiftM;
     logic [`DIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM;
 
@@ -121,18 +121,19 @@ module fdivsqrtpostproc(
         NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR)));
         PreResultM = NormQuotM;
       end
-      PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
+      PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);  // *** rename to PreIntResultM?
     end
 
     // special case logic
+    // Terminate immediately when B is zero (divide by zero) or |A| has more leading zeros than |B|
     always_comb
       if (BZeroM) begin         // Divide by zero
-        if (RemOpM) SpecialFPIntDivResultM = AM;
+        if (RemOpM) SpecialFPIntDivResultM = AM;  // *** rename to IntDivResult?
         else        SpecialFPIntDivResultM = {(`XLEN){1'b1}};
-      end else if (ALTBM) begin // Numerator is zero
+     end else if (ALTBM) begin // A has more leading zeros than B (|A| < |B|): quotient is 0, remainder is A
         if (RemOpM) SpecialFPIntDivResultM = AM;
         else        SpecialFPIntDivResultM = '0;
-      end else      SpecialFPIntDivResultM = PreFPIntDivResultM[`XLEN-1:0];
+     end else      SpecialFPIntDivResultM = PreFPIntDivResultM[`XLEN-1:0];
 
     // sign extend result for W64
     if (`XLEN==64) begin
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 0b513211..968d7cbc 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -45,9 +45,10 @@ module fdivsqrtpreproc (
   // Int-specific
   input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
 	input  logic MDUE, W64E,
+  output logic ISpecialCaseE,
   output logic [`DIVBLEN:0] nE, nM, mM,
   output logic NegQuotM, ALTBM, MDUM, W64M,
-  output logic AsM, AZeroM, BZeroM, AZeroE, BZeroE,
+  output logic AsM, BZeroM, BZeroE,
   output logic [`XLEN-1:0] AM
 );
 
@@ -58,8 +59,9 @@ module fdivsqrtpreproc (
   logic  [`DIVb-1:0] IFNormLenX, IFNormLenD;  // Correctly-sized inputs for iterator
   logic  [`DIVBLEN:0] mE, ell;                // Leading zeros of inputs
   logic  NumerZeroE;                          // Numerator is zero (X or A)
+  logic  AZeroE;                              // A is Zero for integer division
 
-  if (`IDIV_ON_FPU) begin // Int Supported
+  if (`IDIV_ON_FPU) begin:intpreproc // Int Supported
     logic signedDiv, NegQuotE;
     logic AsBit, BsBit, AsE, BsE, ALTBE;
     logic [`XLEN-1:0] AE, BE, PosA, PosB;
@@ -98,8 +100,11 @@ module fdivsqrtpreproc (
 
     // calculate number of fractional bits p
     assign ZeroDiff = mE - ell;         // Difference in number of leading zeros
-    assign ALTBE = ZeroDiff[`DIVBLEN];  // A less than B?
-    mux2 #(`DIVBLEN+1) pmux(ZeroDiff, 0, ALTBE, p);                         
+    assign ALTBE = ZeroDiff[`DIVBLEN];  // A less than B (A has more leading zeros)
+    mux2 #(`DIVBLEN+1) pmux(ZeroDiff, {(`DIVBLEN+1){1'b0}}, ALTBE, p);            // *** is there a more graceful way to write these constants    
+
+    // Integer special cases (terminate immediately)
+    assign ISpecialCaseE = BZeroE | ALTBE;
 
   /* verilator lint_off WIDTH */
     // calculate number of fractional digits nE and right shift amount RightShiftX to complete in discrete number of steps
@@ -113,7 +118,7 @@ module fdivsqrtpreproc (
       assign IntSteps = (TotalIntBits >> `LOGRK) + |IntTrunc;     // Number of steps for int div
       assign nE = (IntSteps * `DIVCOPIES) - 1;                    // Fractional digits
       assign RightShiftX = `RK - 1 - ((TotalIntBits - 1) % `RK);  // Right shift amount
-      assign DivXShifted = DivX >> RightShiftX;                   // shift X to complete in nE steps
+      assign DivXShifted = DivX >> RightShiftX;                   // shift X by up to R*K-1 to complete in nE steps
     end else begin // radix 2 1 copy doesn't require shifting
       assign nE = p; 
       assign DivXShifted = DivX;
@@ -129,7 +134,6 @@ module fdivsqrtpreproc (
     flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
     flopen #(1)       altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
     flopen #(1)    negquotreg(clk, IFDivStartE, NegQuotE, NegQuotM);
-    flopen #(1)      azeroreg(clk, IFDivStartE, AZeroE,   AZeroM);
     flopen #(1)      bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
     flopen #(1)      asignreg(clk, IFDivStartE, AsE,      AsM);
     flopen #(`DIVBLEN+1) nreg(clk, IFDivStartE, nE,       nM);
diff --git a/pipelined/src/ieu/forward.sv b/pipelined/src/ieu/forward.sv
index 701a7d43..ff576cf3 100644
--- a/pipelined/src/ieu/forward.sv
+++ b/pipelined/src/ieu/forward.sv
@@ -60,6 +60,6 @@ module forward(
   assign MatchDE = ((Rs1D == RdE) | (Rs2D == RdE)) & (RdE != 5'b0); // Decode-stage instruction source depends on result from execute stage instruction
   assign FCvtIntStallD = FCvtIntE & MatchDE; // FPU to Integer transfers have single-cycle latency except fcvt
   assign LoadStallD = (MemReadE|SCE) & MatchDE;  
-  assign MDUStallD = MDUE & MatchDE; 
+  assign MDUStallD = MDUE & MatchDE; // Int mult/div has at least two-cycle latency, even when it comes from the FDIV
   assign CSRRdStallD = CSRReadE & MatchDE;
 endmodule