From dceb6f9034854fc3ee26a48bf367f7ed111573e2 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Sun, 9 Oct 2022 04:45:45 -0700
Subject: [PATCH] Moved shift into divsqrt stage and cleaned up comments

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv   | 20 ++++------
 pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv | 14 +++++--
 pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv | 40 +++++++++++---------
 3 files changed, 42 insertions(+), 32 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 5e22be3e..5c067796 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -55,8 +55,8 @@ module fdivsqrtiter(
 // U/UM should be 1.b so b+1 bits or b:0
 // C needs to be the lenght of the final fraction 0.b so b or b-1:0
  /* verilator lint_off UNOPTFLAT */
-  logic [`DIVb+3:0]  WSA[`DIVCOPIES-1:0]; // Q4.b
-  logic [`DIVb+3:0]  WCA[`DIVCOPIES-1:0]; // Q4.b
+  logic [`DIVb+3:0]  WSNext[`DIVCOPIES-1:0]; // Q4.b
+  logic [`DIVb+3:0]  WCNext[`DIVCOPIES-1:0]; // Q4.b
   logic [`DIVb+3:0]  WS[`DIVCOPIES:0]; // Q4.b
   logic [`DIVb+3:0]  WC[`DIVCOPIES:0]; // Q4.b
   logic [`DIVb:0] U[`DIVCOPIES:0]; // U1.b
@@ -78,12 +78,8 @@ module fdivsqrtiter(
 
   // Top Muxes and Registers
   // When start is asserted, the inputs are loaded into the divider.
-  // Otherwise, the divisor is retained and the partial remainder
-  // is fed back for the next iteration.
-  //  - when the start signal is asserted X and 0 are loaded into WS and WC
-  //  - otherwise load WSA into the flipflop
-  //  - the assumed one is added to D since it's always normalized (and X/0 is a special case handeled by result selection)
-  //  - XZeroE is used as the assumed one to avoid creating a sticky bit - all other numbers are normalized
+  // Otherwise, the divisor is retained and the residual and result
+  // are fed back for the next iteration.
  
   // Residual WS/SC registers/initializaiton mux
   mux2   #(`DIVb+4) wsmux(WS[`DIVCOPIES], X, DivStartE, WSN);
@@ -126,17 +122,17 @@ module fdivsqrtiter(
     for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : interations
       if (`RADIX == 2) begin: stage
         fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtM,
-        .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), 
+        .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
         .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end else begin: stage
         logic j1;
         assign j1 = (i == 0 & ~C[0][`DIVb-1]);
         fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtM, .j1,
-        .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), 
+        .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
         .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end
-      assign WS[i+1] = WSA[i] << `LOGR;
-      assign WC[i+1] = WCA[i] << `LOGR;
+      assign WS[i+1] = WSNext[i];
+      assign WC[i+1] = WCNext[i];
       assign U[i+1]  = UNext[i];
       assign UM[i+1] = UMNext[i];
     end
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
index 987f2357..8ed1664a 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@@ -41,7 +41,7 @@ module fdivsqrtstage2 (
   output logic un,
   output logic [`DIVb+1:0] CNext,
   output logic [`DIVb:0] UNext, UMNext, 
-  output logic [`DIVb+3:0]  WSA, WCA
+  output logic [`DIVb+3:0]  WSNext, WCNext
 );
  /* verilator lint_on UNOPTFLAT */
 
@@ -49,8 +49,7 @@ module fdivsqrtstage2 (
   logic up, uz;
   logic [`DIVb+3:0] F;
   logic [`DIVb+3:0] AddIn;
-
-  assign CNext = {1'b1, C[`DIVb+1:1]};
+  logic [`DIVb+3:0]  WSA, WCA;
 
   // Qmient Selection logic
   // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un)
@@ -61,8 +60,11 @@ module fdivsqrtstage2 (
 	// 0010 = -1
 	// 0001 = -2
   fdivsqrtqsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], up, uz, un);
+
+  // Sqrt F generation
   fdivsqrtfgen2 fgen2(.up, .uz, .C(CNext), .U, .UM, .F);
 
+  // Divisor multiple
   always_comb
     if      (up) Dsel = DBar;
     else if (uz) Dsel = '0; // qz
@@ -72,7 +74,13 @@ module fdivsqrtstage2 (
   //  WSA, WCA = WS + WC - qD
   assign AddIn = SqrtM ? F : Dsel;
   csa #(`DIVb+4) csa(WS, WC, AddIn, up&~SqrtM, WSA, WCA);
+  assign WSNext = WSA << 1;
+  assign WCNext = WCA << 1;
 
+  // Shift thermometer code C
+  assign CNext = {1'b1, C[`DIVb+1:1]};
+
+  // Unified On-The-Fly Converter to accumulate result
   fdivsqrtuotfc2 uotfc2(.up, .uz, .C(CNext), .U, .UM, .UNext, .UMNext);
 endmodule
 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index e463762a..e4931d4d 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -30,7 +30,6 @@
 
 `include "wally-config.vh"
 
-/* verilator lint_off UNOPTFLAT */
 module fdivsqrtstage4 (
   input logic [`DIVN-2:0] D,
   input logic [`DIVb+3:0]  DBar, D2, DBar2,
@@ -41,9 +40,8 @@ module fdivsqrtstage4 (
   input logic SqrtM, j1,
   output logic un,
   output logic [`DIVb:0] UNext, UMNext, 
-  output logic [`DIVb+3:0]  WSA, WCA
+  output logic [`DIVb+3:0]  WSNext, WCNext
 );
- /* verilator lint_on UNOPTFLAT */
 
   logic [`DIVb+3:0]  Dsel;
   logic [3:0]     udigit;
@@ -51,7 +49,7 @@ module fdivsqrtstage4 (
   logic [`DIVb+3:0] AddIn;
   logic [4:0] Smsbs;
   logic CarryIn;
-  assign CNext = {2'b11, C[`DIVb+1:2]};
+  logic [`DIVb+3:0]  WSA, WCA;
 
   // Digit Selection logic
   // u encoding:
@@ -62,27 +60,35 @@ module fdivsqrtstage4 (
 	// 0001 = -2
   assign Smsbs = U[`DIVb:`DIVb-4];
   fdivsqrtqsel4 qsel4(.D, .Smsbs, .WS, .WC, .Sqrt(SqrtM), .j1, .udigit);
+  assign un = 0; // unused for radix 4
+
+  // F generation logic
   fdivsqrtfgen4 fgen4(.udigit, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F);
 
+  // Divisor multiple logic
   always_comb
-  case (udigit)
-    4'b1000: Dsel = DBar2;
-    4'b0100: Dsel = DBar;
-    4'b0000: Dsel = '0;
-    4'b0010: Dsel = {3'b0, 1'b1, D, {`DIVb-`DIVN+1{1'b0}}};
-    4'b0001: Dsel = D2;
-    default: Dsel = 'x;
-  endcase
+    case (udigit)
+      4'b1000: Dsel = DBar2;
+      4'b0100: Dsel = DBar;
+      4'b0000: Dsel = '0;
+      4'b0010: Dsel = {3'b0, 1'b1, D, {`DIVb-`DIVN+1{1'b0}}};
+      4'b0001: Dsel = D2;
+      default: Dsel = 'x;
+    endcase
 
-  // Partial Product Generation
-  //  WSA, WCA = WS + WC - qD
+  // Residual Update
+  //  {WS, WC}Next = (WS + WC + (SqrtM ? F : -qD)) << 2
   assign AddIn = SqrtM ? F : Dsel;
   assign CarryIn = ~SqrtM & (udigit[3] | udigit[2]); // +1 for 2's complement of -D and -2D 
   csa #(`DIVb+4) csa(WS, WC, AddIn, CarryIn, WSA, WCA);
- 
-  fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
+  assign WSNext = WSA << 2;
+  assign WCNext = WCA << 2;
 
-  assign un = 0; // unused for radix 4
+  // Shift thermometer code C
+  assign CNext = {2'b11, C[`DIVb+1:2]};
+ 
+  // On-the-fly converter to accumulate result
+  fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
 endmodule