From 65c5ec6e9d2b88e85dc9b0a683193f510f655d98 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 06:15:47 -0800
Subject: [PATCH 1/4] fdivsqrt comment improvements

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv  |  2 +-
 src/fpu/fdivsqrt/fdivsqrtexpcalc.sv |  8 +++++---
 src/fpu/fdivsqrt/fdivsqrtfgen2.sv   |  8 ++++----
 src/fpu/fdivsqrt/fdivsqrtfgen4.sv   | 12 ++++++------
 src/fpu/fdivsqrt/fdivsqrtfsm.sv     |  2 +-
 src/fpu/fdivsqrt/fdivsqrtiter.sv    |  8 ++++----
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 11 ++++++-----
 src/fpu/fdivsqrt/fdivsqrtqsel2.sv   | 25 ++++++++-----------------
 src/fpu/fdivsqrt/fdivsqrtstage2.sv  | 16 +++++-----------
 9 files changed, 40 insertions(+), 52 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 20fb16f62..6043ebb4a 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -30,7 +30,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 IntDivE,
-  input  logic [P.DIVBLEN-1:0] IntResultBitsE,
+  input  logic [P.DIVBLEN-1:0] IntResultBitsE,    
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
index a1dd82e35..cf243a84b 100644
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@@ -28,17 +28,19 @@
 
 module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] Fmt,
-  input  logic [P.NE-1:0]      Xe, Ye,
+  input  logic [P.NE-1:0]      Xe, Ye,    // input exponents
   input  logic                 Sqrt,
   input  logic                 XZero, 
-  input  logic [P.DIVBLEN-1:0] ell, m,
-  output logic [P.NE+1:0]      Ue
+  input  logic [P.DIVBLEN-1:0] ell, m,    // number of leading 0s in significands of X and Y
+  output logic [P.NE+1:0]      Ue         // result exponent
   );
   
   logic [P.NE-2:0] Bias;
   logic [P.NE+1:0] SXExp;
   logic [P.NE+1:0] SExp;
   logic [P.NE+1:0] DExp;
+
+  // Determine exponent bias according to the format
   
   if (P.FPSIZES == 1) begin
     assign Bias = (P.NE-1)'(P.BIAS); 
diff --git a/src/fpu/fdivsqrt/fdivsqrtfgen2.sv b/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
index 990e3f19f..cf398f570 100644
--- a/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
@@ -28,12 +28,12 @@
 
 module fdivsqrtfgen2 import cvw::*;  #(parameter cvw_t P) (
   input  logic              up, uz,
-  input  logic [P.DIVb+3:0] C, U, UM,
-  output logic [P.DIVb+3:0] F
+  input  logic [P.DIVb+3:0] C, U, UM,   // Q4.DIVb (extended from shorter forms)
+  output logic [P.DIVb+3:0] F           // Q4.DIVb
 );
-  logic [P.DIVb+3:0]        FP, FN, FZ;
+  logic [P.DIVb+3:0]        FP, FN, FZ;  // Q4.DIVb
 
-  // Generate for both positive and negative bits
+  // Generate for both positive and negative quotient digits
   assign FP = ~(U << 1) & C;
   assign FN = (UM << 1) | (C & ~(C << 2));
   assign FZ = '0;
diff --git a/src/fpu/fdivsqrt/fdivsqrtfgen4.sv b/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
index fc648f5bd..e2cec1ab4 100644
--- a/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
@@ -27,14 +27,14 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fdivsqrtfgen4 import cvw::*;  #(parameter cvw_t P) (
-  input  logic [3:0]        udigit,
-  input  logic [P.DIVb+3:0] C, U, UM,
-  output logic [P.DIVb+3:0] F
+  input  logic [3:0]        udigit,           // {2, 1, -1, -2}; all cold for zero
+  input  logic [P.DIVb+3:0] C, U, UM,         // Q4.DIVb (extended from shorter forms)
+  output logic [P.DIVb+3:0] F                 // Q4.DIVb
 );
-  logic [P.DIVb+3:0]        F2, F1, F0, FN1, FN2;
+  logic [P.DIVb+3:0]        F2, F1, F0, FN1, FN2; // Q4.DIVb
   
-  // Generate for both positive and negative bits
-  assign F2  = (~U << 2) & (C << 2);
+  // Generate for both positive and negative digits
+  assign F2  = (~U << 2) & (C << 2);              // 
   assign F1  = ~(U << 1) & C;
   assign F0  = '0;
   assign FN1 = (UM << 1) | (C & ~(C << 3));
diff --git a/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index 0e2cba90e..862d53b25 100644
--- a/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -57,7 +57,7 @@ module fdivsqrtfsm import cvw::*;  #(parameter cvw_t P) (
   // terminate immediately on special cases
   assign FSpecialCaseE = XZeroE | XInfE  | XNaNE |  (XsE&SqrtE) | (YZeroE | YInfE | YNaNE)&~SqrtE;
   if (P.IDIV_ON_FPU) assign SpecialCaseE = IntDivE ? ISpecialCaseE : FSpecialCaseE;
-  else              assign SpecialCaseE = FSpecialCaseE;
+  else               assign SpecialCaseE = FSpecialCaseE;
   flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
 
   always_ff @(posedge clk) begin
diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 0f66982ab..863d94837 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -104,14 +104,14 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
     for(i=0; $unsigned(i)<P.DIVCOPIES; i++) begin : iterations
       if (P.RADIX == 2) begin: stage
         fdivsqrtstage2 #(P) fdivsqrtstage(.D, .DBar, .SqrtE,
-        .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
-        .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
+          .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
+          .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end else begin: stage
         logic j1;
         assign j1 = (i == 0 & ~C[0][P.DIVb-1]);
         fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1,
-        .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
-        .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
+          .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
+          .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end
       assign WS[i+1] = WSNext[i];
       assign WC[i+1] = WCNext[i];
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 8d6e565b1..c65f26fd8 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -29,17 +29,18 @@
 module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 clk,
   input  logic                 IFDivStartE, 
-  input  logic [P.NF:0]        Xm, Ym,
-  input  logic [P.NE-1:0]      Xe, Ye,
+  input  logic [P.NF:0]        Xm, Ym,      // Floating-point significands
+  input  logic [P.NE-1:0]      Xe, Ye,      // Floating-point exponents
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 XZeroE,
   input  logic [2:0]           Funct3E,
-  output logic [P.NE+1:0]      UeM,
-  output logic [P.DIVb+3:0]    X, D,
+  output logic [P.NE+1:0]      UeM,         // biased exponent of result
+  output logic [P.DIVb+3:0]    X, D,        // Q4.DIVb
   // Int-specific
-  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // U(XLEN.0) inputs from IEU 
   input  logic                 IntDivE, W64E,
+  // Outputs
   output logic                 ISpecialCaseE,
   output logic [P.DURLEN-1:0]  CyclesE,
   output logic [P.DIVBLEN-1:0] IntNormShiftM,
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel2.sv b/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
index fe32924e1..de64bafc9 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
@@ -18,7 +18,7 @@
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
-// https://solderpad.org/licenses/SHL-2.1/
+// https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
@@ -27,27 +27,18 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fdivsqrtqsel2 ( 
-  input  logic [3:0] ps, pc, 
+  input  logic [3:0] WS, WC, 
   output logic       up, uz, un
 );
  
-  logic [3:0]  p, g;
   logic        magnitude, sign;
  
-  // The quotient selection logic is presented for simplicity, not
-  // for efficiency.  You can probably optimize your logic to
-  // select the proper divisor with less delay.
-
-  // Quotient equations from EE371 lecture notes 13-20
-  assign p = ps ^ pc;
-  assign g = ps & pc;
-
-  assign magnitude = ~((ps[2]^pc[2]) & (ps[1]^pc[1]) & 
-        (ps[0]^pc[0]));
-  assign sign = (ps[3]^pc[3])^
-      (ps[2] & pc[2] | ((ps[2]^pc[2]) &
-          (ps[1]&pc[1] | ((ps[1]^pc[1]) &
-            (ps[0]&pc[0])))));
+  assign magnitude = ~((WS[2]^WC[2]) & (WS[1]^WC[1]) & 
+        (WS[0]^WC[0]));
+  assign sign = (WS[3]^WC[3])^
+      (WS[2] & WC[2] | ((WS[2]^WC[2]) &
+          (WS[1]&WC[1] | ((WS[1]^WC[1]) &
+            (WS[0]&WC[0])))));
 
   // Produce digit = +1, 0, or -1
   assign up = magnitude & ~sign;
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
index 5e319a7c1..ad0c828e9 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@@ -33,8 +33,8 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.DIVb:0]   U, UM,          // U1.DIVb
   input  logic [P.DIVb+3:0] WS, WC,         // Q4.DIVb
   input  logic [P.DIVb+1:0] C,              // Q2.DIVb
-  input  logic             SqrtE,
-  output logic             un,
+  input  logic              SqrtE,
+  output logic              un,
   output logic [P.DIVb+1:0] CNext,          // Q2.DIVb
   output logic [P.DIVb:0]   UNext, UMNext,  // U1.DIVb
   output logic [P.DIVb+3:0] WSNext, WCNext  // Q4.DIVb
@@ -42,19 +42,13 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
  /* verilator lint_on UNOPTFLAT */
 
   logic [P.DIVb+3:0]        Dsel;     // Q4.DIVb
-  logic                    up, uz;
+  logic                     up, uz;
   logic [P.DIVb+3:0]        F;        // Q4.DIVb
   logic [P.DIVb+3:0]        AddIn;    // Q4.DIVb
   logic [P.DIVb+3:0]        WSA, WCA; // Q4.DIVb
 
-  // Qmient Selection logic
+  // Quotient Selection logic
   // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un)
-  // q encoding:
-  // 1000 = +2
-  // 0100 = +1
-  // 0000 =  0
-  // 0010 = -1
-  // 0001 = -2
   fdivsqrtqsel2 qsel2(WS[P.DIVb+3:P.DIVb], WC[P.DIVb+3:P.DIVb], up, uz, un);
 
   // Sqrt F generation.  Extend C, U, UM to Q4.k
@@ -66,7 +60,7 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
     else if (uz) Dsel = '0;
     else         Dsel = D; // un
 
-  // Partial Product Generation
+  // Residual Generation
   //  WSA, WCA = WS + WC - qD
   mux2 #(P.DIVb+4) addinmux(Dsel, F, SqrtE, AddIn);
   csa #(P.DIVb+4) csa(WS, WC, AddIn, up&~SqrtE, WSA, WCA);

From fdda3d6cde61b2e1baaa05f1ca7a3a041bd6d57d Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 06:36:57 -0800
Subject: [PATCH 2/4] Renamed qsel to uslc and simplified radix2 uslc

---
 src/fpu/fdivsqrt/fdivsqrtstage2.sv            |  4 +--
 src/fpu/fdivsqrt/fdivsqrtstage4.sv            | 23 +++++--------
 .../{fdivsqrtqsel2.sv => fdivsqrtuslc2.sv}    | 24 +++++++------
 .../{fdivsqrtqsel4.sv => fdivsqrtuslc4.sv}    | 34 +++++++++----------
 ...divsqrtqsel4cmp.sv => fdivsqrtuslc4cmp.sv} | 10 +++---
 5 files changed, 46 insertions(+), 49 deletions(-)
 rename src/fpu/fdivsqrt/{fdivsqrtqsel2.sv => fdivsqrtuslc2.sv} (69%)
 rename src/fpu/fdivsqrt/{fdivsqrtqsel4.sv => fdivsqrtuslc4.sv} (72%)
 rename src/fpu/fdivsqrt/{fdivsqrtqsel4cmp.sv => fdivsqrtuslc4cmp.sv} (90%)

diff --git a/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
index ad0c828e9..40a2a5a01 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@@ -49,7 +49,7 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
 
   // Quotient Selection logic
   // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un)
-  fdivsqrtqsel2 qsel2(WS[P.DIVb+3:P.DIVb], WC[P.DIVb+3:P.DIVb], up, uz, un);
+  fdivsqrtuslc2 uslc2(.WS(WS[P.DIVb+3:P.DIVb]), .WC(WC[P.DIVb+3:P.DIVb]), .up, .uz, .un);
 
   // Sqrt F generation.  Extend C, U, UM to Q4.k
   fdivsqrtfgen2 #(P) fgen2(.up, .uz, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F);
@@ -60,7 +60,7 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
     else if (uz) Dsel = '0;
     else         Dsel = D; // un
 
-  // Residual Generation
+  // Residual Update
   //  WSA, WCA = WS + WC - qD
   mux2 #(P.DIVb+4) addinmux(Dsel, F, SqrtE, AddIn);
   csa #(P.DIVb+4) csa(WS, WC, AddIn, up&~SqrtE, WSA, WCA);
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index fea2851b6..a24c1155f 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -31,36 +31,29 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.DIVb:0]   U,UM,               // U1.DIVb
   input  logic [P.DIVb+3:0] WS, WC,             // Q4.DIVb
   input  logic [P.DIVb+1:0] C,                  // Q2.DIVb
-  input  logic             SqrtE, j1,
+  input  logic              SqrtE, j1,
   output logic [P.DIVb+1:0] CNext,              // Q2.DIVb
-  output logic             un,
+  output logic              un,
   output logic [P.DIVb:0]   UNext, UMNext,      // U1.DIVb
   output logic [P.DIVb+3:0] WSNext, WCNext      // Q4.DIVb
 );
 
   logic [P.DIVb+3:0]        Dsel;               // Q4.DIVb
-  logic [3:0]              udigit;
+  logic [3:0]               udigit;             // {+2, +1, -1, -2} or 0000 for 0
   logic [P.DIVb+3:0]        F;                  // Q4.DIVb
   logic [P.DIVb+3:0]        AddIn;              // Q4.DIVb
-  logic [4:0]              Smsbs;
-  logic [2:0]              Dmsbs;
-  logic [7:0]              WCmsbs, WSmsbs;
-  logic                    CarryIn;
+  logic [4:0]               Smsbs;              // U1.4
+  logic [2:0]               Dmsbs;              // U0.3   drop leading 1 from D
+  logic [7:0]               WCmsbs, WSmsbs;     // Q4.4
+  logic                     CarryIn;
   logic [P.DIVb+3:0]        WSA, WCA;           // Q4.DIVb
 
   // Digit Selection logic
-  // u encoding:
-  // 1000 = +2
-  // 0100 = +1
-  // 0000 =  0
-  // 0010 = -1
-  // 0001 = -2
   assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
   assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
   assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
   assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
-
-  fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
+  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
   assign un = 1'b0; // unused for radix 4
 
   // F generation logic
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel2.sv b/src/fpu/fdivsqrt/fdivsqrtuslc2.sv
similarity index 69%
rename from src/fpu/fdivsqrt/fdivsqrtqsel2.sv
rename to src/fpu/fdivsqrt/fdivsqrtuslc2.sv
index de64bafc9..e4fcfeadf 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc2.sv
@@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel2.sv
+// fdivsqrtuslc2.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Radix 2 Quotient Digit Selection
+// Purpose: Radix 2 Unified Quotient/Square Root Digit Selection
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@@ -26,22 +26,26 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module fdivsqrtqsel2 ( 
-  input  logic [3:0] WS, WC, 
-  output logic       up, uz, un
+module fdivsqrtuslc2 ( 
+  input  logic [3:0] WS, WC,      // Q4.0 most significant bits of redundant residual
+  output logic       up, uz, un   // {+1, 0, -1}
 );
  
-  logic        magnitude, sign;
+  logic        sign;
+
+  // Carry chain logic determines if W = WS + WC = -1, < -1, > -1 to choose 0, -1, 1 respectively
  
-  assign magnitude = ~((WS[2]^WC[2]) & (WS[1]^WC[1]) & 
+  // If p2 & p1 & p0 (where p_i = WS[i] ^ WC[i]), W = -1 and choose digit of 0
+  assign uz = ((WS[2]^WC[2]) & (WS[1]^WC[1]) & 
         (WS[0]^WC[0]));
+
+  // Otherwise determine sign using carry chain: sign = p3 ^ g_2:0
   assign sign = (WS[3]^WC[3])^
       (WS[2] & WC[2] | ((WS[2]^WC[2]) &
           (WS[1]&WC[1] | ((WS[1]^WC[1]) &
             (WS[0]&WC[0])))));
 
   // Produce digit = +1, 0, or -1
-  assign up = magnitude & ~sign;
-  assign uz = ~magnitude;
-  assign un = magnitude & sign;
+  assign up = ~uz & ~sign;
+  assign un = ~uz & sign;
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel4.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
similarity index 72%
rename from src/fpu/fdivsqrt/fdivsqrtqsel4.sv
rename to src/fpu/fdivsqrt/fdivsqrtuslc4.sv
index de520bef2..268ca9ea2 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
@@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel4.sv
+// fdivsqrtuslc4.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Radix 4 Quotient Digit Selection
+// Purpose: Table-based Radix 4 Unified Quotient/Square Root Digit Selection
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@@ -26,25 +26,25 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module fdivsqrtqsel4 (
-  input  logic [2:0] Dmsbs,
-  input  logic [4:0] Smsbs,
-  input  logic [7:0] WSmsbs, WCmsbs,
+module fdivsqrtuslc4 (
+  input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
+  input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
+  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 redundant residual most significant bits
   input  logic       Sqrt, j1,
-  output logic [3:0] udigit
+  output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
-  logic [6:0] Wmsbs;
-  logic [7:0] PreWmsbs;
-  logic [2:0] A;
+  logic [7:0] PreWmsbs;                 // Q4.4 nonredundant residual msbs
+  logic [6:0] Wmsbs;                    // Q4.3 truncated nonredundant residual
+  logic [2:0] A;                        // U0.3 upper bits of D or Smsbs, discarding integer bit
 
-  assign PreWmsbs = WCmsbs + WSmsbs;
-  assign Wmsbs = PreWmsbs[7:1];
+  assign PreWmsbs = WCmsbs + WSmsbs;    // add redundant residual to find msbs
+  assign Wmsbs = PreWmsbs[7:1];         // truncate least significant bit to Q4.3 to index table
   // D = 0001.xxx...
   // Dmsbs = |   |
   // W =      xxxx.xxx...
   // Wmsbs = |        |
 
-  logic [3:0] USel4[1023:0];
+  logic [3:0] USel4[1023:0];            // 1024-entry table of 4-bit digits, indexed with 3 bits of A and 7 bits of Wmsbs
 
   // Prepopulate selection table; this is constant at compile time
   always_comb begin 
@@ -101,10 +101,10 @@ module fdivsqrtqsel4 (
   // Select A
   always_comb
     if (Sqrt) begin 
-      if (j1) A = 3'b101;
-      else if (Smsbs == 5'b10000) A = 3'b111;
-      else A = Smsbs[2:0];
-    end else A = Dmsbs;
+      if (j1) A = 3'b101;                       // on first sqrt iteration        A = .101
+      else if (Smsbs == 5'b10000) A = 3'b111;   // if S = 1.0, use                A = .111
+      else A = Smsbs[2:0];                      // otherwise use                  A = S (in U0.3 format)
+    end else A = Dmsbs;                         // division Unless                A = D (IN U0.3 format, dropping leading 1)
 
   // Select quotient digit from lookup table based on A and W
   assign udigit = USel4[{A,Wmsbs}];
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
similarity index 90%
rename from src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv
rename to src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
index fe436413e..ccb5e618a 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel4cmp.sv
+// fdivsqrtuslc4cmp.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Comparator-based Radix 4 Quotient Digit Selection
+// Purpose: Comparator-based Radix 4 Unified Quotient/Square Root Digit Selection 
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@@ -26,12 +26,12 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module fdivsqrtqsel4cmp (
+module fdivsqrtuslc4cmp (
   input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
   input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
-  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4
+  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 residual most significant bits
   input  logic       SqrtE, j1,
-  output logic [3:0] udigit
+  output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
   logic [6:0] Wmsbs;
   logic [7:0] PreWmsbs;

From b49330c5566397238886c73e78e5498ffb0eb6b7 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 10:05:54 -0800
Subject: [PATCH 3/4] Explained sqrt preshifting

---
 config/shared/config-shared.vh      |  1 +
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 41 ++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 14de5187e..9635d706b 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -99,6 +99,7 @@ localparam RK          = LOGR*DIVCOPIES;                            // r*k bits
 
 // intermediate division parameters not directly used in fdivsqrt hardware
 localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit because square root could be shifted right *** explain better
+//localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right
 localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
 localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index c65f26fd8..ecdf10f8b 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -50,7 +50,6 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 );
 
   logic [P.DIVb:0]             Xnorm, Dnorm;
-  logic [P.DIVb:0]             PreSqrtX;
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
@@ -61,7 +60,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic                        SignedDivE;                          // signed division
   logic                        AsE, BsE;                            // Signs of integer inputs
   logic [P.XLEN-1:0]           AE;                                  // input A after W64 adjustment
-  logic  ALTBE;
+  logic                        ALTBE;
+  logic                        EvenExp;
 
   //////////////////////////////////////////////////////
   // Integer Preprocessing
@@ -153,9 +153,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   // shift square root to be in range [1/4, 1)
   // Normalized numbers are shifted right by 1 if the exponent is odd
   // Subnormal numbers have Xe = 0 and an unbiased exponent of 1-BIAS.  They are shifted right if the number of leading zeros is odd.
-  // NOTE: there might be a discrepancy that X is never right shifted by 2.  However
-  //  it comes out in the wash and gives the right answer.  Investigate later if possible. ***
-  //////////////////////////////////////////////////////
+   //////////////////////////////////////////////////////
 
   assign DivX = {3'b000, Xnorm}; // Zero-extend numerator for division
 
@@ -165,13 +163,32 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   // Next X is shifted right by 1 or 2 bits to range [1/4, 1) and exponent will be adjusted accordingly to be even
   // Now (X-1) is negative.  Formed by placing all 1s in all four integer bits (in Q4.b) form, keeping X in fraciton bits
   // Then multiply by R is left shift by r (1 or 2 for radix 2 or 4)
-  // For Radix 2, this gives 3 leading 1s, followed by the fraction bits
-  // For Radix 4, this gives 2 leading 1s, followed by the fraction bits (and a zero in the lsb)
-  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
-  if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};
-  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
-  mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);
-  
+  // This is optimized in hardware by first right shifting by 0 or 1 bit (instead of 1 or 2), then left shifting by (r-1), then subtracting 2 or 4
+  // Subtracting 2 is equivalent to adding 1110.  Subtracting 4 is equivalent to adding 1100.  Prepend leading 1s to do a free subtraction.
+  // This also means only one extra fractional bit is needed becaue we never shift right by more than 1.
+  // Radix      Exponent odd          Exponent Even
+  // 2          x-2 = 2(x/2 - 1)      x/2 - 2 = 2(x/4 - 1)
+  // 4          2x-4 = 4(x/2 - 1))    x-4 = 4(x/4 - 1)
+  // Summary: PreSqrtX = r(x/2or4 - 1)
+
+  assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
+/*  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
+  if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
+  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) */
+
+  if (P.RADIX == 2) begin
+    logic [P.DIVb:0] PreSqrtX;    // U1.DIVb
+    mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
+    assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
+  end else begin
+    logic [P.DIVb+1:0] PreSqrtX;  // U2.DIVb
+    mux2 #(P.DIVb+2) sqrtxmux({Xnorm, 1'b0}, {1'b0, Xnorm}, EvenExp, PreSqrtX); // 2X if exponent odd, X if exponent even
+    assign SqrtX = {2'b11, PreSqrtX};                     // PreSqrtX - 4 = 4(PreSqrtX/4 - 1)
+  end
+
+  // Initialize X for division or square root
+  mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);                    
+
   //////////////////////////////////////////////////////
   // Selet integer or floating-point operands
   //////////////////////////////////////////////////////

From 75216f8b2ad5b022d4a7eb753eb14cf02ee30f49 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 19:41:12 -0800
Subject: [PATCH 4/4] Divider cleanup

---
 config/shared/config-shared.vh      |  4 ++--
 src/fpu/fdivsqrt/fdivsqrtcycles.sv  |  6 +++---
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 13 ++++++++++---
 src/fpu/fdivsqrt/fdivsqrtuslc4.sv   |  2 +-
 4 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 9635d706b..55bca569f 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -98,8 +98,8 @@ localparam LOGR        = $clog2(RADIX);                             // r = log(R
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
 
 // intermediate division parameters not directly used in fdivsqrt hardware
-localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit because square root could be shifted right *** explain better
-//localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right
+localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit to allow sqrt being shifted right
+//localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right, if radix4 doesn't use a right shift.  This version saves one cycle on double-precision with R=4,k=4.  However, it doesn't work yet because C is too short, so k is incorrectly calculated as a 1 in the lsb after the last step.
 localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
 localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 6043ebb4a..e8a430a91 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -66,12 +66,12 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   // P.DIVCOPIES = k. P.LOGR = log(R) = r.  P.RK = rk.  
   // Integer division needs p fractional + r integer result bits
   // FP Division needs at least Nf fractional bits + 2 guard/round bits and one integer digit (LOG R integer bits) = Nf + 2 + r bits
-  // FP Sqrt needs at least Nf fractional bits, 2 guard/round bits, and *** shift bits
+  // FP Sqrt needs at least Nf fractional bits and 2 guard/round bits.  The integer bit is always initialized to 1 and does not need a cycle.
   // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
 
   always_comb begin 
-    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard; integer bit implicit
-    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
+    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
+    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits 
 
     if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
     else               ResultBitsE = FPResultBitsE;
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index ecdf10f8b..145bf9a68 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -168,14 +168,20 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   // This also means only one extra fractional bit is needed becaue we never shift right by more than 1.
   // Radix      Exponent odd          Exponent Even
   // 2          x-2 = 2(x/2 - 1)      x/2 - 2 = 2(x/4 - 1)
-  // 4          2x-4 = 4(x/2 - 1))    x-4 = 4(x/4 - 1)
+  // 4          2(x)-4 = 4(x/2 - 1)   2(x/2)-4 = 4(x/4 - 1)
   // Summary: PreSqrtX = r(x/2or4 - 1)
 
+  logic [P.DIVb:0] PreSqrtX;
   assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
-/*  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
+  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
   if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
-  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) */
+  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) 
 
+/*  
+  // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift
+  // This saves one bit in DIVb because there is no initial right shift.
+  // However, C needs to be extended further, lest it create a k with a 1 in the lsb when C is all 1s.
+  // That is an optimization for another day.
   if (P.RADIX == 2) begin
     logic [P.DIVb:0] PreSqrtX;    // U1.DIVb
     mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
@@ -185,6 +191,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVb+2) sqrtxmux({Xnorm, 1'b0}, {1'b0, Xnorm}, EvenExp, PreSqrtX); // 2X if exponent odd, X if exponent even
     assign SqrtX = {2'b11, PreSqrtX};                     // PreSqrtX - 4 = 4(PreSqrtX/4 - 1)
   end
+*/
 
   // Initialize X for division or square root
   mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);                    
diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
index 268ca9ea2..b44b34a35 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
@@ -103,7 +103,7 @@ module fdivsqrtuslc4 (
     if (Sqrt) begin 
       if (j1) A = 3'b101;                       // on first sqrt iteration        A = .101
       else if (Smsbs == 5'b10000) A = 3'b111;   // if S = 1.0, use                A = .111
-      else A = Smsbs[2:0];                      // otherwise use                  A = S (in U0.3 format)
+      else A = Smsbs[2:0];                      // otherwise use                  A = 2S (in U0.3 format)
     end else A = Dmsbs;                         // division Unless                A = D (IN U0.3 format, dropping leading 1)
 
   // Select quotient digit from lookup table based on A and W