From 1a0097f6e76bf4f862da355586422cb2e825a5bf Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Mon, 4 Mar 2024 16:40:49 -0800
Subject: [PATCH] Further fdivsqrt simplification after starting Sqrt at
 iteration 0

---
 src/fpu/fdivsqrt/fdivsqrtiter.sv     | 17 ++++++-----------
 src/fpu/fdivsqrt/fdivsqrtstage4.sv   |  6 +++---
 src/fpu/fdivsqrt/fdivsqrtuslc4.sv    | 13 +++++++------
 src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv |  9 ++++-----
 4 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 29b6d4fe6..4bfcebcd1 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -44,7 +44,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
   logic [P.DIVb+3:0]      WCNext[P.DIVCOPIES-1:0]; // Q4.DIVb
   logic [P.DIVb+3:0]      WS[P.DIVCOPIES:0];       // Q4.DIVb
   logic [P.DIVb+3:0]      WC[P.DIVCOPIES:0];       // Q4.DIVb
-  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb
+  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb // *** probably Q not U.  See Table 16.26 notes
   logic [P.DIVb:0]        UM[P.DIVCOPIES:0];       // U1.DIVb
   logic [P.DIVb:0]        UNext[P.DIVCOPIES-1:0];  // U1.DIVb
   logic [P.DIVb:0]        UMNext[P.DIVCOPIES-1:0]; // U1.DIVb
@@ -71,7 +71,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
   flopen #(P.DIVb+4) wcreg(clk, FDivBusyE, WCN, WC[0]);
 
   // UOTFC Result U and UM registers/initialization mux
-  // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 otherwise
+  // Initialize U to 0 = 0.0000... and UM to -1 = 1.00000... (in Q1.DIVb)
   assign initU  ={(P.DIVb+1){1'b0}};
   assign initUM = {{1'b1}, {(P.DIVb){1'b0}}};
   mux2   #(P.DIVb+1)  Umux(UNext[P.DIVCOPIES-1],  initU,  IFDivStartE, UMux);
@@ -79,15 +79,10 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
   flopen #(P.DIVb+1)  UReg(clk, FDivBusyE, UMux,  U[0]);
   flopen #(P.DIVb+1) UMReg(clk, FDivBusyE, UMMux, UM[0]);
 
-  // C register/initialization mux
-  logic [1:0] initCUpper;
-  if(P.RADIX == 4) begin
-    assign initCUpper = 2'b00;
-  end else begin
-    assign initCUpper = 2'b10;
-  end
-  
-  assign initC = {initCUpper, {P.DIVb{1'b0}}};
+  // C register/initialization mux: C = -R:
+  // C = -4 = 00.000000... (in Q2.DIVb, where -4 wraps to 0) for radix 4; C = -2 = 10.000000... for radix 2
+  if(P.RADIX == 4) assign initC = '0;
+  else             assign initC = {2'b10, {{P.DIVb{1'b0}}}};
   mux2   #(P.DIVb+2) cmux(C[P.DIVCOPIES], initC, IFDivStartE, NextC); 
   flopen #(P.DIVb+2) creg(clk, FDivBusyE, NextC, C[0]);
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index 4323ee35c..856273a5e 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -48,16 +48,16 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
   logic [7:0]               WCmsbs, WSmsbs;     // U4.4
   logic                     CarryIn;
   logic [P.DIVb+3:0]        WSA, WCA;           // Q4.DIVb
-  logic j0,j1;
+  logic j0, j1;                                 // step j = 0 or step j = 1
 
   // Digit Selection logic
   assign j0     = ~C[P.DIVb+1];             // first step of R digit selection: C = 00...0
-  assign j1     = C[P.DIVb] ^ C[P.DIVb-1];  // second step of R digit selection: C = 1100...0
+  assign j1     = C[P.DIVb] & ~C[P.DIVb-1]; // second step of R digit selection: C = 1100...0; *** could simplify to ~C[P.DIVb-1] because j=0 case takes priority
   assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
   assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
   assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
   assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
-  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .j0, .udigit);
+  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j0, .j1, .udigit);
   assign un = 1'b0; // unused for radix 4
 
   // F generation logic
diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
index 63ea5aae2..610b79395 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
@@ -31,7 +31,7 @@ module fdivsqrtuslc4 (
   input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
   input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
   input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 redundant residual most significant bits
-  input  logic       Sqrt, j1,
+  input  logic       Sqrt, j0, j1,
   output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
   logic [7:0] PreWmsbs;                 // Q4.4 nonredundant residual msbs
@@ -102,11 +102,12 @@ module fdivsqrtuslc4 (
   // Select A
   always_comb
     if (Sqrt) begin 
-      if (j1) A = 3'b101;                       // on first sqrt iteration        A = .101
-      else if (Smsbs == 5'b10000) A = 3'b111;   // if S = 1.0, use                A = .111
-      else A = Smsbs[2:0];                      // otherwise use                  A = 2S (in U0.3 format)
-    end else A = Dmsbs;                         // division Unless                A = D (IN U0.3 format, dropping leading 1)
+      if (j1)                 A = 3'b101;     // on first sqrt iteration        A = .101
+      else if (Smsbs[4] == 1) A = 3'b111;     // if S = 1.0000, use             A = .111
+      else                    A = Smsbs[2:0]; // otherwise use                  A = 2S (in U0.3 format)
+    end else                  A = Dmsbs;      // division                       A = D (IN U0.3 format, dropping leading 1)
 
   // Select quotient digit from lookup table based on A and W
-  assign udigit = USel4[{A,Wmsbs}];
+  // On step j = 0 for square root, always select u_0 = 1
+  assign udigit = (Sqrt & j0) ? 4'b0100 : USel4[{A,Wmsbs}];
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
index 7812248a9..fef26668c 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@@ -32,7 +32,7 @@ module fdivsqrtuslc4cmp (
   input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
   input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 residual most significant bits
   input  logic       SqrtE, 
-  input  logic       j0,j1,             // are we on first (j0) or second step (j1) of digit selection
+  input  logic       j0, j1,            // are we on first (j0) or second step (j1) of digit selection
   output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
   logic [6:0] Wmsbs;
@@ -71,23 +71,22 @@ module fdivsqrtuslc4cmp (
   
   // handles special case when j = 0 or j = 1 for sqrt
   assign mkj2 = 20; // when j = 1 use mk2[101] when j = 0 use anything bigger than 7.
-  assign mkj1 = j1 ? 8 : 0; // when j = 1 use mk1[101] = 8 and when j = 0 use 0 so we choose u_0 = 1
+  assign mkj1 = j0 ? 0 : 8; // when j = 1 use mk1[101] = 8 and when j = 0 use 0 so we choose u_0 = 1
   assign sqrtspecial = SqrtE & (j1 | j0);
 
   // Choose A for current operation 
  always_comb
     if (SqrtE) begin 
-      if (Smsbs[4]) A = 3'b111; // *** can we get rid of SMSBs case?
+      if (Smsbs[4]) A = 3'b111; // for S = 1.0000  *** can we optimize away this case?
       else A = Smsbs[2:0];
     end else A = Dmsbs;
-
     
   // Choose selection constants based on a
   
   assign mk2 = sqrtspecial ? mkj2 : mks2[A];
   assign mk1 = sqrtspecial ? mkj1 : mks1[A];
   assign mk0 = -mk1;
-  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table *** can we hide?
+  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table *** can we hide from critical path
  
   // Compare residual W to selection constants to choose digit
   always_comb