From fedf9c8a5ab2bdb708e01a6d1bece387d0cf8572 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 08:46:55 -0800
Subject: [PATCH 01/11] Started cleaning up shifting leading 1 in fdivsqrt

---
 config/shared/config-shared.vh      | 18 +++++++++---------
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 18 +++++++++---------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 48f02b848..acc7996cb 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -94,15 +94,15 @@ localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
 // division constants
-localparam DIVN        = (((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2); // standard length of input
-localparam LOGR        = ($clog2(RADIX));           // r = log(R)
-localparam RK          = (LOGR*DIVCOPIES);         // r*k used for intdiv preproc
-localparam LOGRK       = ($clog2(RK));               // log2(r*k)
-localparam FPDUR       = ((DIVN+1+(LOGR*DIVCOPIES))/(LOGR*DIVCOPIES)+(RADIX/4));
-localparam DURLEN      = ($clog2(FPDUR+1));
-localparam DIVb        = (FPDUR*LOGR*DIVCOPIES-1); // canonical fdiv size (b)
-localparam DIVBLEN     = ($clog2(DIVb+1)-1);
-localparam DIVa        = (DIVb+1-XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
+localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input
+localparam LOGR        = $clog2(RADIX);           // r = log(R)
+localparam RK          = LOGR*DIVCOPIES;         // r*k used for intdiv preproc
+localparam LOGRK       = $clog2(RK);               // log2(r*k)
+localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);
+localparam DURLEN      = $clog2(FPDUR+1);
+localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
+localparam DIVBLEN     = $clog2(DIVb+1)-1;
+localparam DIVa        = DIVb+1-XLEN; // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
 
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 6c397576a..8f3c477c4 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -52,7 +52,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.DIVb:0]             PreSqrtX;
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             QeE;                                 // Quotient Exponent (FP only)
-  logic [P.DIVb-1:0]           IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
+  logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
   logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
@@ -89,12 +89,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.XLEN) posbmux(BE, -BE, BsE, PosB);
 
     // Select integer or floating point inputs
-    mux2 #(P.DIVb) ifxmux({Xm, {(P.DIVb-P.NF-1){1'b0}}}, {PosA, {(P.DIVb-P.XLEN){1'b0}}}, IntDivE, IFX);
-    mux2 #(P.DIVb) ifdmux({Ym, {(P.DIVb-P.NF-1){1'b0}}}, {PosB, {(P.DIVb-P.XLEN){1'b0}}}, IntDivE, IFD);
+    mux2 #(P.DIVb+1) ifxmux({Xm, {(P.DIVb-P.NF){1'b0}}}, {PosA, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFX);
+    mux2 #(P.DIVb+1) ifdmux({Ym, {(P.DIVb-P.NF){1'b0}}}, {PosB, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFD);
     mux2 #(1)    numzmux(XZeroE, AZeroE, IntDivE, NumerZeroE);
   end else begin // Int not supported
-    assign IFX = {Xm, {(P.DIVb-P.NF-1){1'b0}}};
-    assign IFD = {Ym, {(P.DIVb-P.NF-1){1'b0}}};
+    assign IFX = {Xm, {(P.DIVb-P.NF){1'b0}}};
+    assign IFD = {Ym, {(P.DIVb-P.NF){1'b0}}};
     assign NumerZeroE = XZeroE;
   end
 
@@ -103,12 +103,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////////////////////////////////
 
   // count leading zeros for Subnorm FP and to normalize integer inputs
-  lzc #(P.DIVb) lzcX (IFX, ell);
-  lzc #(P.DIVb) lzcY (IFD, mE);
+  lzc #(P.DIVb) lzcX (IFX[P.DIVb:1], ell);
+  lzc #(P.DIVb) lzcY (IFD[P.DIVb:1], mE);
 
   // Normalization shift: shift off leading one
-  assign Xfract = (IFX << ell) << 1;
-  assign Dfract = (IFD << mE)  << 1; 
+  assign Xfract = (IFX[P.DIVb:1] << ell) << 1;
+  assign Dfract = (IFD[P.DIVb:1] << mE)  << 1; 
 
   //////////////////////////////////////////////////////
   // Integer Right Shift to digit boundary

From a1f94c9b0cac927da505489907bfb5baf2bc2696 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 09:11:15 -0800
Subject: [PATCH 02/11] fdivsqrt parameter cleanup

---
 config/shared/config-shared.vh       | 13 ++++++-------
 config/shared/parameter-defs.vh      |  3 +--
 src/cvw.sv                           |  1 -
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv |  3 ++-
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 18 +++++++++---------
 5 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index acc7996cb..17b1ede83 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -94,15 +94,14 @@ localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
 // division constants
-localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input
-localparam LOGR        = $clog2(RADIX);           // r = log(R)
-localparam RK          = LOGR*DIVCOPIES;         // r*k used for intdiv preproc
-localparam LOGRK       = $clog2(RK);               // log2(r*k)
-localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);
+localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input: max(XLEN, NF+2) if IDIV_ON_FPU, else NF+2
+localparam LOGR        = $clog2(RADIX);                             // r = log(R)
+localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
+localparam LOGRK       = $clog2(RK);                                // log2(r*k)
+localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // 
 localparam DURLEN      = $clog2(FPDUR+1);
 localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
-localparam DIVBLEN     = $clog2(DIVb+1)-1;
-localparam DIVa        = DIVb+1-XLEN; // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
+localparam DIVBLEN     = $clog2(DIVb+2)-1;
 
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh
index d04b35e56..0c377c02d 100644
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@@ -184,6 +184,5 @@ localparam cvw_t P = '{
   FPDUR       : FPDUR,
   DURLEN      : DURLEN,
   DIVb        : DIVb,
-  DIVBLEN     : DIVBLEN,
-  DIVa        : DIVa
+  DIVBLEN     : DIVBLEN
 };
diff --git a/src/cvw.sv b/src/cvw.sv
index 4cbf67b28..02105823e 100644
--- a/src/cvw.sv
+++ b/src/cvw.sv
@@ -277,7 +277,6 @@ typedef struct packed {
   int DURLEN     ;
   int DIVb       ;
   int DIVBLEN    ;
-  int DIVa       ;
 
 } cvw_t;
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index c3c558902..9f887d4ab 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -110,7 +110,8 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVb+4) quotresmux(UnsignedQuotM, -UnsignedQuotM, NegQuotM, NormQuotM);
 
     // Select quotient or remainder and do normalization shift
-    mux2 #(P.DIVBLEN+1) normshiftmux(((P.DIVBLEN+1)'(P.DIVb) - (nM * (P.DIVBLEN+1)'(P.LOGR))), (mM + (P.DIVBLEN+1)'(P.DIVa)), RemOpM, NormShiftM);
+    localparam DIVa        = (P.DIVb+1-P.XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
+    mux2 #(P.DIVBLEN+1) normshiftmux(((P.DIVBLEN+1)'(P.DIVb) - (nM * (P.DIVBLEN+1)'(P.LOGR))), (mM + (P.DIVBLEN+1)'(DIVa)), RemOpM, NormShiftM);
     mux2 #(P.DIVb+4)    presresultmux(NormQuotM, NormRemM, RemOpM, PreResultM);
     assign PreIntResultM = $signed(PreResultM >>> NormShiftM); 
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 8f3c477c4..0e716ac20 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -48,7 +48,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   output logic [P.XLEN-1:0]    AM
 );
 
-  logic [P.DIVb-1:0]           Xfract, Dfract;
+  logic [P.DIVb:0]             Xfract, Dfract;
   logic [P.DIVb:0]             PreSqrtX;
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             QeE;                                 // Quotient Exponent (FP only)
@@ -103,12 +103,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////////////////////////////////
 
   // count leading zeros for Subnorm FP and to normalize integer inputs
-  lzc #(P.DIVb) lzcX (IFX[P.DIVb:1], ell);
-  lzc #(P.DIVb) lzcY (IFD[P.DIVb:1], mE);
+  lzc #(P.DIVb+1) lzcX (IFX, ell);
+  lzc #(P.DIVb+1) lzcY (IFD, mE);
 
   // Normalization shift: shift off leading one
-  assign Xfract = (IFX[P.DIVb:1] << ell) << 1;
-  assign Dfract = (IFD[P.DIVb:1] << mE)  << 1; 
+  assign Xfract = (IFX << ell);
+  assign Dfract = (IFD << mE); 
 
   //////////////////////////////////////////////////////
   // Integer Right Shift to digit boundary
@@ -158,10 +158,10 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   //  it comes out in the wash and gives the right answer.  Investigate later if possible.
   //////////////////////////////////////////////////////
 
-  assign DivX = {3'b000, ~NumerZeroE, Xfract};
+  assign DivX = {3'b000, Xfract};
 
   // Sqrt is initialized on step one as R(X-1), so depends on Radix
-  mux2 #(P.DIVb+1) sqrtxmux({~XZeroE, Xfract}, {1'b0, ~XZeroE, Xfract[P.DIVb-1:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
+  mux2 #(P.DIVb+1) sqrtxmux(Xfract, {1'b0, Xfract[P.DIVb:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
   if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};
   else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
   mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);
@@ -176,8 +176,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     assign X = PreShiftX;
   end
 
-   // Divisior register
-  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {4'b0001, Dfract}, D);
+  // Divisior register
+  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dfract}, D);
  
   // Floating-point exponent
   fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .XZero(XZeroE), .ell, .m(mE), .Qe(QeE));

From 4d77f28a1947e6c295ca9900fe5768aff3c0f47a Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 11:21:02 -0800
Subject: [PATCH 03/11] Divsqrt cleanup: change Q to U, commenting code

---
 src/fpu/fdivsqrt/fdivsqrt.sv         |  8 +++----
 src/fpu/fdivsqrt/fdivsqrtcycles.sv   |  2 +-
 src/fpu/fdivsqrt/fdivsqrtexpcalc.sv  | 11 ++++++---
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 12 +++++-----
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 36 +++++++++++++++++-----------
 src/fpu/fpu.sv                       | 12 +++++-----
 src/fpu/postproc/divshiftcalc.sv     | 28 +++++++++++-----------
 src/fpu/postproc/postprocess.sv      | 12 +++++-----
 src/fpu/postproc/round.sv            |  6 ++---
 src/fpu/postproc/shiftcorrection.sv  |  8 +++----
 10 files changed, 74 insertions(+), 61 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrt.sv b/src/fpu/fdivsqrt/fdivsqrt.sv
index 5c5fa0f57..60e42f457 100644
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -45,8 +45,8 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   input  logic                 IntDivE, W64E,
   output logic                 DivStickyM,
   output logic                 FDivBusyE, IFDivStartE, FDivDoneE,
-  output logic [P.NE+1:0]      QeM,
-  output logic [P.DIVb:0]      QmM,
+  output logic [P.NE+1:0]      UeM,                         // Exponent result 
+  output logic [P.DIVb:0]      UmM,                         // Significand result
   output logic [P.XLEN-1:0]    FIntDivResultM
 );
 
@@ -74,7 +74,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
 
   fdivsqrtpreproc #(P) fdivsqrtpreproc(                          // Preprocessor
     .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE),
-    .FmtE, .SqrtE, .XZeroE, .Funct3E, .QeM, .X, .D, .CyclesE,
+    .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
     // Int-specific 
     .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
     .BZeroM, .nM, .mM, .AM, 
@@ -94,7 +94,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   fdivsqrtpostproc #(P) fdivsqrtpostproc(                        // Postprocessor
     .clk, .reset, .StallM, .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, 
     .SqrtE, .Firstun, .SqrtM, .SpecialCaseM, 
-    .QmM, .WZeroE, .DivStickyM, 
+    .UmM, .WZeroE, .DivStickyM, 
     // Int-specific 
     .nM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
     .FIntDivResultM);
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index ed28c9355..2122317fe 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -68,7 +68,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
     if (SqrtE) fbits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1; is it related to DIVCOPIES logic below?
     // if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
     else       fbits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
-    if (P.IDIV_ON_FPU) CyclesE =  IntDivE ? ((nE + 1)/P.DIVCOPIES) : (fbits + (P.LOGR*P.DIVCOPIES)-1)/(P.LOGR*P.DIVCOPIES);
+    if (P.IDIV_ON_FPU) CyclesE =  IntDivE ? ((nE + 1)/P.DIVCOPIES) : (fbits -1)/(P.RK) + 1;
     else              CyclesE = (fbits + (P.LOGR*P.DIVCOPIES)-1)/(P.LOGR*P.DIVCOPIES);
   end 
   /* verilator lint_on WIDTH */
diff --git a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
index 5531276df..113f2b2dd 100644
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@@ -32,8 +32,9 @@ module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 Sqrt,
   input  logic                 XZero, 
   input  logic [P.DIVBLEN:0]   ell, m,
-  output logic [P.NE+1:0]      Qe
+  output logic [P.NE+1:0]      Ue
   );
+  
   logic [P.NE-2:0] Bias;
   logic [P.NE+1:0] SXExp;
   logic [P.NE+1:0] SExp;
@@ -63,10 +64,14 @@ module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
       2'h2: Bias =  (P.NE-1)'(P.H_BIAS);
     endcase
   end
+
+  // Square root exponent = (Xe - l - bias) / 2 + bias; l accounts for subnorms
   assign SXExp = {2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - (P.NE+2)'(P.BIAS);
   assign SExp  = {SXExp[P.NE+1], SXExp[P.NE+1:1]} + {2'b0, Bias};
   
-  // correct exponent for subnormal input's normalization shifts
+  // division exponent = (Xe-l) - (Ye-m) + bias; l and m account for subnorms
   assign DExp  = ({2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - {2'b0, Ye} + {{(P.NE+1-P.DIVBLEN){1'b0}}, m} + {3'b0, Bias}); 
-  assign Qe = Sqrt ? SExp : DExp;
+
+  // Select square root or division exponent
+  assign Ue = Sqrt ? SExp : DExp;
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 9f887d4ab..2b9be54a7 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -38,14 +38,14 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.XLEN-1:0]  AM,
   input  logic               RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
   input  logic [P.DIVBLEN:0] nM, mM,
-  output logic [P.DIVb:0]    QmM, 
+  output logic [P.DIVb:0]    UmM,               // result significand
   output logic               WZeroE,
   output logic               DivStickyM,
   output logic [P.XLEN-1:0]  FIntDivResultM
 );
   
   logic [P.DIVb+3:0]         W, Sum;
-  logic [P.DIVb:0]           PreQmM;
+  logic [P.DIVb:0]           PreUmM;
   logic                      NegStickyM;
   logic                      weq0E, WZeroM;
   logic [P.XLEN-1:0]         IntDivResultM;
@@ -91,17 +91,17 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   // Determine if sticky bit is negative  // *** look for ways to optimize this.  Shift shouldn't be needed.
   assign Sum = WC + WS;
   assign NegStickyM = Sum[P.DIVb+3];
-  mux2 #(P.DIVb+1) preqmmux(FirstU, FirstUM, NegStickyM, PreQmM); // Select U or U-1 depending on negative sticky bit
-  mux2 #(P.DIVb+1)    qmmux(PreQmM, (PreQmM << 1), SqrtM, QmM);
+  mux2 #(P.DIVb+1) preummux(FirstU, FirstUM, NegStickyM, PreUmM); // Select U or U-1 depending on negative sticky bit
+  mux2 #(P.DIVb+1)    ummux(PreUmM, (PreUmM << 1), SqrtM, UmM);
 
-  // Integer quotient or remainder correctoin, normalization, and special cases
+  // Integer quotient or remainder correction, normalization, and special cases
   if (P.IDIV_ON_FPU) begin:intpostproc // Int supported
     logic [P.DIVBLEN:0] NormShiftM;
     logic [P.DIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM;
     logic signed [P.DIVb+3:0] PreResultM, PreIntResultM;
 
     assign W = $signed(Sum) >>> P.LOGR;
-    assign UnsignedQuotM = {3'b000, PreQmM};
+    assign UnsignedQuotM = {3'b000, PreUmM};
 
     // Integer remainder: sticky and sign correction muxes
     assign NegQuotM = AsM ^ BsM; // Integer Quotient is negative
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 0e716ac20..2255aafb1 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -35,7 +35,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 SqrtE,
   input  logic                 XZeroE,
   input  logic [2:0]           Funct3E,
-  output logic [P.NE+1:0]      QeM,
+  output logic [P.NE+1:0]      UeM,
   output logic [P.DIVb+3:0]    X, D,
   // Int-specific
   input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
@@ -48,10 +48,10 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   output logic [P.XLEN-1:0]    AM
 );
 
-  logic [P.DIVb:0]             Xfract, Dfract;
+  logic [P.DIVb:0]             Xnorm, Dnorm;
   logic [P.DIVb:0]             PreSqrtX;
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
-  logic [P.NE+1:0]             QeE;                                 // Quotient Exponent (FP only)
+  logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
   logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
@@ -106,9 +106,9 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   lzc #(P.DIVb+1) lzcX (IFX, ell);
   lzc #(P.DIVb+1) lzcY (IFD, mE);
 
-  // Normalization shift: shift off leading one
-  assign Xfract = (IFX << ell);
-  assign Dfract = (IFD << mE); 
+  // Normalization shift: shift leading one into most significant bit
+  assign Xnorm = (IFX << ell);
+  assign Dnorm = (IFD << mE); 
 
   //////////////////////////////////////////////////////
   // Integer Right Shift to digit boundary
@@ -133,10 +133,11 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
       logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
       logic [P.DIVBLEN:0] TotalIntBits, IntSteps;
       /* verilator lint_off WIDTH */
+      // n = k*ceil((r+p)/rk) - 1
       assign TotalIntBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
       assign IntTrunc = TotalIntBits % P.RK;                       // Truncation check for ceiling operator
       assign IntSteps = (TotalIntBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
-      assign nE = (IntSteps * P.DIVCOPIES) - 1;                    // Fractional digits
+      assign nE = (IntSteps * P.DIVCOPIES) - 1;                    // Fractional digits = total digits - 1 integer digit
       assign RightShiftX = P.RK - 1 - ((TotalIntBits - 1) % P.RK); // Right shift amount
       assign DivXShifted = DivX >> RightShiftX;                    // shift X by up to R*K-1 to complete in nE steps
       /* verilator lint_on WIDTH */
@@ -150,18 +151,25 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
   //////////////////////////////////////////////////////
   // Floating-Point Preprocessing
-  // append leading 1 (for nonzero inputs)
+  // Extend to Q4.b format
   // shift square root to be in range [1/4, 1)
   // Normalized numbers are shifted right by 1 if the exponent is odd
   // Subnormal numbers have Xe = 0 and an unbiased exponent of 1-BIAS.  They are shifted right if the number of leading zeros is odd.
   // NOTE: there might be a discrepancy that X is never right shifted by 2.  However
-  //  it comes out in the wash and gives the right answer.  Investigate later if possible.
+  //  it comes out in the wash and gives the right answer.  Investigate later if possible. ***
   //////////////////////////////////////////////////////
 
-  assign DivX = {3'b000, Xfract};
+  assign DivX = {3'b000, Xnorm}; // Zero-extend numerator for division
 
   // Sqrt is initialized on step one as R(X-1), so depends on Radix
-  mux2 #(P.DIVb+1) sqrtxmux(Xfract, {1'b0, Xfract[P.DIVb:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
+  // If X = 0, then special case logic sets sqrt = 0 so this portion doesn't matter
+  // Otherwise, X has a leading 1 after possible normalization shift and is now in range [1, 2)
+  // Next X is shifted right by 1 or 2 bits to range [1/4, 1) and exponent will be adjusted accordingly to be even
+  // Now (X-1) is negative.  Formed by placing all 1s in all four integer bits (in Q4.b form), keeping X in fraction bits
+  // Then multiplying by R is a left shift by r (1 or 2 for radix 2 or 4)
+  // For Radix 2, this gives 3 leading 1s, followed by the fraction bits
+  // For Radix 4, this gives 2 leading 1s, followed by the fraction bits (and a zero in the lsb)
+  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
   if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};
   else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
   mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);
@@ -177,11 +185,11 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   end
 
   // Divisior register
-  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dfract}, D);
+  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dnorm}, D);
  
   // Floating-point exponent
-  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .XZero(XZeroE), .ell, .m(mE), .Qe(QeE));
-  flopen #(P.NE+2) expreg(clk, IFDivStartE, QeE, QeM);
+  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .XZero(XZeroE), .ell, .m(mE), .Ue(UeE));
+  flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
 
   // Number of FSM cycles (to FSM)
   fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .nE, .CyclesE);
diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index f71999471..ffd9cf49a 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -133,8 +133,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic [P.XLEN-1:0]           FCvtIntResM;                        // fcvt integer result (for IEU)
   
   // divide signals
-  logic [P.DIVb:0]             QmM;                                // fdivsqrt signifcand
-  logic [P.NE+1:0]             QeM;                                // fdivsqrt exponent
+  logic [P.DIVb:0]             UmM;                                // fdivsqrt signifcand
+  logic [P.NE+1:0]             UeM;                                // fdivsqrt exponent
   logic                        DivStickyM;                         // fdivsqrt sticky bit
   logic                        FDivDoneE, IFDivStartE;             // fdivsqrt control signals
   logic [P.XLEN-1:0]           FIntDivResultM;                     // fdivsqrt integer division result (for IEU)
@@ -242,8 +242,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   fdivsqrt #(P) fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
     .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
     .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .IntDivE, .W64E,
-    .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .QeM, 
-    .QmM, .FIntDivResultM);
+    .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .UeM, 
+    .UmM, .FIntDivResultM);
 
   // compare: fmin/fmax, flt/fle/feq
   fcmp #(P) fcmp (.Fmt(FmtE), .OpCtrl(OpCtrlE), .Xs(XsE), .Ys(YsE), .Xe(XeE), .Ye(YeE), 
@@ -326,9 +326,9 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////////////////////////////////////////////////////////////////////
 
   postprocess #(P) postprocess(.Xs(XsM), .Ys(YsM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), 
-    .FmaASticky(FmaAStickyM), .XZero(XZeroM), .YZero(YZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
+    .FmaASticky(FmaAStickyM), .XZero(XZeroM), .YZero(YZeroM), .XInf(XInfM), .YInf(YInfM), .DivUm(UmM), .FmaSs(SsM),
     .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), 
-    .FmaSm(SmM), .DivQe(QeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
+    .FmaSm(SmM), .DivUe(UeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
     .CvtCe(CeM), .CvtResSubnormUf(CvtResSubnormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), 
     .ToInt(FWriteIntM), .DivSticky(DivStickyM), .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), 
     .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));
diff --git a/src/fpu/postproc/divshiftcalc.sv b/src/fpu/postproc/divshiftcalc.sv
index d560714db..380f8f5e6 100644
--- a/src/fpu/postproc/divshiftcalc.sv
+++ b/src/fpu/postproc/divshiftcalc.sv
@@ -27,8 +27,8 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.DIVb:0]              DivQm,              // divsqrt significand
-  input  logic [P.NE+1:0]              DivQe,              // divsqrt exponent
+  input  logic [P.DIVb:0]              DivUm,              // divsqrt significand
+  input  logic [P.NE+1:0]              DivUe,              // divsqrt exponent
   output logic [P.LOGNORMSHIFTSZ-1:0]  DivShiftAmt,        // divsqrt shift amount
   output logic [P.NORMSHIFTSZ-1:0]     DivShiftIn,         // divsqrt shift input
   output logic                         DivResSubnorm,      // is the divsqrt result subnormal
@@ -41,23 +41,23 @@ module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
 
   // is the result subnormal
   // if the exponent is 1 then the result needs to be normalized then the result is Subnormalizes
-  assign DivResSubnorm = DivQe[P.NE+1]|(~|DivQe[P.NE+1:0]);
+  assign DivResSubnorm = DivUe[P.NE+1]|(~|DivUe[P.NE+1:0]);
 
   // if the result is subnormal
-  //  00000000x.xxxxxx...                     Exp = DivQe
-  //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
-  //  .00xxxxxxxxxxxxx... << DivQe+NF+1       Exp = +1
+  //  00000000x.xxxxxx...                     Exp = DivUe
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
+  //  .00xxxxxxxxxxxxx... << DivUe+NF+1       Exp = +1
   //  .0000xxxxxxxxxxx... >> 1                Exp = 1
-  // Left shift amount      = DivQe+NF+1-1
-  assign DivSubnormShift    = (P.NE+2)'(P.NF)+DivQe;
+  // Left shift amount      = DivUe+NF+1-1
+  assign DivSubnormShift    = (P.NE+2)'(P.NF)+DivUe;
   assign DivSubnormShiftPos = ~DivSubnormShift[P.NE+1];
 
   // if the result is normalized
-  //  00000000x.xxxxxx...                     Exp = DivQe
-  //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
-  //  00000000.xxxxxxx... << NF               Exp = DivQe+1
-  //  00000000x.xxxxxx... << NF               Exp = DivQe (extra shift done afterwards)
-  //  00000000xx.xxxxx... << 1?               Exp = DivQe-1 (determined after)
+  //  00000000x.xxxxxx...                     Exp = DivUe
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
+  //  00000000.xxxxxxx... << NF               Exp = DivUe+1
+  //  00000000x.xxxxxx... << NF               Exp = DivUe (extra shift done afterwards)
+  //  00000000xx.xxxxx... << 1?               Exp = DivUe-1 (determined after)
   // inital Left shift amount  = NF
   // shift one more if the it's a minimally redundent radix 4 - one entire cycle needed for integer bit
   assign NormShift = (P.LOGNORMSHIFTSZ)'(P.NF);
@@ -68,5 +68,5 @@ module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
   assign DivShiftAmt        = DivResSubnorm ? DivSubnormShiftAmt : NormShift;
 
   // pre-shift the divider result for normalization
-  assign DivShiftIn = {{P.NF{1'b0}}, DivQm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
+  assign DivShiftIn = {{P.NF{1'b0}}, DivUm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
 endmodule
diff --git a/src/fpu/postproc/postprocess.sv b/src/fpu/postproc/postprocess.sv
index ee96b34d2..05db352cd 100644
--- a/src/fpu/postproc/postprocess.sv
+++ b/src/fpu/postproc/postprocess.sv
@@ -48,8 +48,8 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   input logic  [$clog2(3*P.NF+5)-1:0]      FmaSCnt,             // the normalization shift count
   //divide signals
   input logic                              DivSticky,           // divider sticky bit
-  input logic  [P.NE+1:0]                  DivQe,               // divsqrt exponent
-  input logic  [P.DIVb:0]                  DivQm,               // divsqrt significand
+  input logic  [P.NE+1:0]                  DivUe,               // divsqrt exponent
+  input logic  [P.DIVb:0]                  DivUm,               // divsqrt significand
   // conversion signals
   input logic                              CvtCs,               // the result's sign
   input logic  [P.NE:0]                    CvtCe,               // the calculated expoent
@@ -91,7 +91,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   // division singals
   logic [P.LOGNORMSHIFTSZ-1:0] DivShiftAmt;          // divsqrt shif amount
   logic [P.NORMSHIFTSZ-1:0]    DivShiftIn;           // divsqrt shift input
-  logic [P.NE+1:0]             Qe;                   // divsqrt corrected exponent after corretion shift
+  logic [P.NE+1:0]             Ue;                   // divsqrt corrected exponent after corretion shift
   logic                        DivByZero;            // divide by zero flag
   logic                        DivResSubnorm;        // is the divsqrt result subnormal
   logic                        DivSubnormShiftPos;   // is the divsqrt subnorm shift amout positive (not underflowed)
@@ -146,7 +146,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   fmashiftcalc #(P) fmashiftcalc(.FmaSm, .FmaSCnt, .Fmt, .NormSumExp, .FmaSe,
       .FmaSZero, .FmaPreResultSubnorm, .FmaShiftAmt, .FmaShiftIn);
 
-  divshiftcalc #(P) divshiftcalc(.DivQe, .DivQm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
+  divshiftcalc #(P) divshiftcalc(.DivUe, .DivUm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
 
   // select which unit's output to shift
   always_comb
@@ -174,7 +174,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
 
   // correct for LZA/divsqrt error
   shiftcorrection #(P) shiftcorrection(.FmaOp, .FmaPreResultSubnorm, .NormSumExp,
-      .DivResSubnorm, .DivSubnormShiftPos, .DivOp, .DivQe, .Qe, .FmaSZero, .Shifted, .FmaMe, .Mf);
+      .DivResSubnorm, .DivSubnormShiftPos, .DivOp, .DivUe, .Ue, .FmaSZero, .Shifted, .FmaMe, .Mf);
 
   ///////////////////////////////////////////////////////////////////////////////
   // Rounding
@@ -189,7 +189,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   // calulate result sign used in rounding unit
   roundsign roundsign(.FmaOp, .DivOp, .CvtOp, .Sqrt, .FmaSs, .Xs, .Ys, .CvtCs, .Ms);
 
-  round #(P) round(.OutFmt, .Frm, .FmaASticky, .Plus1, .PostProcSel, .CvtCe, .Qe,
+  round #(P) round(.OutFmt, .Frm, .FmaASticky, .Plus1, .PostProcSel, .CvtCe, .Ue,
       .Ms, .FmaMe, .FmaOp, .CvtOp, .CvtResSubnormUf, .Mf, .ToInt,  .CvtResUf,
       .DivSticky, .DivOp, .UfPlus1, .FullRe, .Rf, .Re, .Sticky, .Round, .Guard, .Me);
 
diff --git a/src/fpu/postproc/round.sv b/src/fpu/postproc/round.sv
index 0a5d9ecc5..e01ff376b 100644
--- a/src/fpu/postproc/round.sv
+++ b/src/fpu/postproc/round.sv
@@ -39,7 +39,7 @@ module round import cvw::*;  #(parameter cvw_t P) (
   // divsqrt
   input  logic                     DivOp,              // is a division opperation being done
   input  logic                     DivSticky,          // divsqrt sticky bit
-  input  logic [P.NE+1:0]          Qe,                 // the divsqrt calculated expoent
+  input  logic [P.NE+1:0]          Ue,                 // the divsqrt calculated expoent
   // cvt
   input  logic                     CvtOp,              // is a convert opperation being done
   input  logic                     ToInt,              // is the cvt op a cvt to integer
@@ -300,8 +300,8 @@ module round import cvw::*;  #(parameter cvw_t P) (
       case(PostProcSel)
           2'b10:    Me = FmaMe; // fma
           2'b00:    Me = {CvtCe[P.NE], CvtCe}&{P.NE+2{~CvtResSubnormUf|CvtResUf}}; // cvt
-          // 2'b01: Me = DivDone ? Qe : '0; // divide
-          2'b01:    Me = Qe; // divide
+          // 2'b01: Me = DivDone ? Ue : '0; // divide
+          2'b01:    Me = Ue; // divide
           default:  Me = '0; 
       endcase
 
diff --git a/src/fpu/postproc/shiftcorrection.sv b/src/fpu/postproc/shiftcorrection.sv
index 9e0473667..f5860b42d 100644
--- a/src/fpu/postproc/shiftcorrection.sv
+++ b/src/fpu/postproc/shiftcorrection.sv
@@ -31,7 +31,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
   // divsqrt
   input logic                      DivOp,                  // is it a divsqrt opperation
   input logic                      DivResSubnorm,          // is the divsqrt result subnormal
-  input logic  [P.NE+1:0]          DivQe,                  // the divsqrt result's exponent
+  input logic  [P.NE+1:0]          DivUe,                  // the divsqrt result's exponent
   input logic                      DivSubnormShiftPos,     // is the subnorm divider shift amount positive (ie not underflowed)
   //fma
   input logic                      FmaOp,                  // is it an fma opperation
@@ -41,7 +41,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
   // output
   output logic [P.NE+1:0]          FmaMe,                  // exponent of the normalized sum
   output logic [P.CORRSHIFTSZ-1:0] Mf,                     // the shifted sum before LZA correction
-  output logic [P.NE+1:0]          Qe                      // corrected exponent for divider
+  output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
 );
 
   logic [3*P.NF+3:0]               CorrSumShifted;         // the shifted sum after LZA correction
@@ -61,7 +61,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
 
   // correct the shifting of the divsqrt caused by producing a result in (2, .5] range
   // condition: if the msb is 1 or the exponent was one, but the shifted quotent was < 1 (Subnorm)
-  assign LeftShiftQm = (LZAPlus1|(DivQe==1&~LZAPlus1));
+  assign LeftShiftQm = (LZAPlus1|(DivUe==1&~LZAPlus1));
   assign CorrQm0     = Shifted[P.NORMSHIFTSZ-3:P.NORMSHIFTSZ-P.CORRSHIFTSZ-2];
   assign CorrQm1     = Shifted[P.NORMSHIFTSZ-2:P.NORMSHIFTSZ-P.CORRSHIFTSZ-1];
   mux2 #(P.CORRSHIFTSZ) divcorrmux(CorrQm0, CorrQm1, LeftShiftQm, CorrQmShifted);
@@ -87,5 +87,5 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
 
   // the quotent is in the range [.5,2) if there is no early termination
   // if the quotent < 1 and not Subnormal then subtract 1 to account for the normalization shift
-  assign Qe = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivQe - {(P.NE+1)'(0), ~LZAPlus1};
+  assign Ue = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivUe - {(P.NE+1)'(0), ~LZAPlus1};
 endmodule

From 083ed09f1e759b09720b0bc207d112c19b1be543 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 11:25:54 -0800
Subject: [PATCH 04/11] Reduced duplicated logic in fdivsqrtcycles

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 2122317fe..e9fbc6042 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -33,7 +33,10 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.DIVBLEN:0]   nE,
   output logic [P.DURLEN-1:0]  CyclesE
 );
+
   logic [P.DURLEN+1:0] Nf, fbits; // number of fractional bits
+  logic [P.DURLEN-1:0] fpcycles;  // number of cycles for floating-point operation
+
   // DIVN = P.NF+3
   // NS = NF + 1
   // N = NS or NS+2 for div/sqrt.
@@ -68,8 +71,10 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
     if (SqrtE) fbits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1; is it related to DIVCOPIES logic below?
     // if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
     else       fbits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
-    if (P.IDIV_ON_FPU) CyclesE =  IntDivE ? ((nE + 1)/P.DIVCOPIES) : (fbits -1)/(P.RK) + 1;
-    else              CyclesE = (fbits + (P.LOGR*P.DIVCOPIES)-1)/(P.LOGR*P.DIVCOPIES);
+    assign     fpcycles = (fbits-1)/(P.RK) + 1;
+
+    if (P.IDIV_ON_FPU) CyclesE = IntDivE ? ((nE + 1)/P.DIVCOPIES) : fpcycles;
+    else               CyclesE = fpcycles;
   end 
   /* verilator lint_on WIDTH */
 

From b8bdb1c7d148ed6609594ce7530bbe68e53bedaa Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 14:00:27 -0800
Subject: [PATCH 05/11] Simplified cycle count logic

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv  | 18 +++++++++---------
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 22 +++++++++++++---------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index e9fbc6042..df581701b 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -30,12 +30,12 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 IntDivE,
-  input  logic [P.DIVBLEN:0]   nE,
+  input  logic [P.DIVBLEN:0]   IntResultBits,
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
-  logic [P.DURLEN+1:0] Nf, fbits; // number of fractional bits
-  logic [P.DURLEN-1:0] fpcycles;  // number of cycles for floating-point operation
+  logic [P.DURLEN+1:0] Nf, FPResultBits; // number of fractional bits
+  logic [P.DIVBLEN:0]  ResultBits; // number of result bits;
 
   // DIVN = P.NF+3
   // NS = NF + 1
@@ -68,13 +68,13 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
       endcase 
 
   always_comb begin 
-    if (SqrtE) fbits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1; is it related to DIVCOPIES logic below?
-    // if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
-    else       fbits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
-    assign     fpcycles = (fbits-1)/(P.RK) + 1;
+    if (SqrtE) FPResultBits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 rather than +2; is it related to DIVCOPIES logic below?
+    else       FPResultBits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
 
-    if (P.IDIV_ON_FPU) CyclesE = IntDivE ? ((nE + 1)/P.DIVCOPIES) : fpcycles;
-    else               CyclesE = fpcycles;
+    if (P.IDIV_ON_FPU) ResultBits = IntDivE ? IntResultBits : FPResultBits;
+    else               ResultBits = FPResultBits;
+
+    assign CyclesE = (ResultBits-1)/(P.RK) + 1;
   end 
   /* verilator lint_on WIDTH */
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 2255aafb1..ab0941aca 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -54,6 +54,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
   logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
+  logic [P.DIVBLEN:0]          IntResultBits;                       // bits in integer result
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
   logic                        SignedDivE;                          // signed division
@@ -122,7 +123,11 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     // calculate number of fractional bits p
     assign ZeroDiff = mE - ell;         // Difference in number of leading zeros
     assign ALTBE = ZeroDiff[P.DIVBLEN];  // A less than B (A has more leading zeros)
-    mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);              
+    mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);          
+
+    /* verilator lint_off WIDTH */
+    assign IntResultBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
+    /* verilator lint_on WIDTH */
 
     // Integer special cases (terminate immediately)
     assign ISpecialCaseE = BZeroE | ALTBE;
@@ -131,15 +136,14 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
     if (P.LOGRK > 0) begin // more than 1 bit per cycle
       logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
-      logic [P.DIVBLEN:0] TotalIntBits, IntSteps;
+      logic [P.DIVBLEN:0] IntSteps;
       /* verilator lint_off WIDTH */
       // n = k*ceil((r+p)/rk) - 1
-      assign TotalIntBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
-      assign IntTrunc = TotalIntBits % P.RK;                       // Truncation check for ceiling operator
-      assign IntSteps = (TotalIntBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
-      assign nE = (IntSteps * P.DIVCOPIES) - 1;                    // Fractional digits = total digits - 1 integer digit
-      assign RightShiftX = P.RK - 1 - ((TotalIntBits - 1) % P.RK); // Right shift amount
-      assign DivXShifted = DivX >> RightShiftX;                    // shift X by up to R*K-1 to complete in nE steps
+      assign IntTrunc = IntResultBits % P.RK;                       // Truncation check for ceiling operator
+      assign IntSteps = (IntResultBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
+      assign nE = (IntSteps * P.DIVCOPIES) - 1;                     // Fractional digits = total digits - 1 integer digit
+      assign RightShiftX = P.RK - 1 - ((IntResultBits - 1) % P.RK); // Right shift amount
+      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in nE steps
       /* verilator lint_on WIDTH */
     end else begin // radix 2 1 copy doesn't require shifting
       assign nE = p; 
@@ -192,7 +196,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
 
   // Number of FSM cycles (to FSM)
-  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .nE, .CyclesE);
+  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBits, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
     // pipeline registers

From 72ad1d361c939c36cb99627d941a74e03433697b Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 14:28:57 -0800
Subject: [PATCH 06/11] Simplified IntDivNormShift

---
 src/fpu/fdivsqrt/fdivsqrt.sv         |  6 ++--
 src/fpu/fdivsqrt/fdivsqrtcycles.sv   |  9 +++++-
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv |  4 +--
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 41 ++++++++++++++--------------
 4 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrt.sv b/src/fpu/fdivsqrt/fdivsqrt.sv
index 60e42f457..751486f86 100644
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -67,7 +67,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   // Integer div/rem signals                                
   logic                        BZeroM;                       // Denominator is zero
   logic                        IntDivM;                      // Integer operation
-  logic [P.DIVBLEN:0]          nM, mM;                       // Shift amounts
+  logic [P.DIVBLEN:0]          mM, IntDivNormShiftM;         // Shift amounts
   logic                        ALTBM, AsM, BsM, W64M;        // Special handling for postprocessor
   logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
   logic                        ISpecialCaseE;                // Integer div/remainder special cases
@@ -77,7 +77,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
     .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
     // Int-specific 
     .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
-    .BZeroM, .nM, .mM, .AM, 
+    .BZeroM, .IntDivNormShiftM, .mM, .AM, 
     .IntDivM, .W64M, .ALTBM, .AsM, .BsM);
 
   fdivsqrtfsm #(P) fdivsqrtfsm(                                  // FSM
@@ -96,6 +96,6 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
     .SqrtE, .Firstun, .SqrtM, .SpecialCaseM, 
     .UmM, .WZeroE, .DivStickyM, 
     // Int-specific 
-    .nM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
+    .IntDivNormShiftM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
     .FIntDivResultM);
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index df581701b..bba6e8005 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -67,6 +67,13 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
         P.Q_FMT: Nf = P.Q_NF;
       endcase 
 
+  // Cycle logic
+  // P.DIVCOPIES = k. P.LOGR = log(R) = r.  P.RK = rk.  
+  // Integer division needs p fractional + r integer result bits
+  // FP Division needs at least Nf fractional bits + 2 guard/round bits and one integer digit (LOG R integer bits) = Nf + 2 + r bits
+  // FP Sqrt needs at least Nf fractional bits, 2 guard/round bits, and *** shift bits
+  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBits / rk)
+
   always_comb begin 
     if (SqrtE) FPResultBits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 rather than +2; is it related to DIVCOPIES logic below?
     else       FPResultBits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
@@ -74,7 +81,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
     if (P.IDIV_ON_FPU) ResultBits = IntDivE ? IntResultBits : FPResultBits;
     else               ResultBits = FPResultBits;
 
-    assign CyclesE = (ResultBits-1)/(P.RK) + 1;
+    assign CyclesE = (ResultBits-1)/(P.RK) + 1; // ceil (ResultBits/rk)
   end 
   /* verilator lint_on WIDTH */
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 2b9be54a7..58649e3a8 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -37,7 +37,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   input  logic               Firstun, SqrtM, SpecialCaseM, 
   input  logic [P.XLEN-1:0]  AM,
   input  logic               RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
-  input  logic [P.DIVBLEN:0] nM, mM,
+  input  logic [P.DIVBLEN:0] mM, IntDivNormShiftM,
   output logic [P.DIVb:0]    UmM,               // result significand
   output logic               WZeroE,
   output logic               DivStickyM,
@@ -111,7 +111,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
 
     // Select quotient or remainder and do normalization shift
     localparam DIVa        = (P.DIVb+1-P.XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
-    mux2 #(P.DIVBLEN+1) normshiftmux(((P.DIVBLEN+1)'(P.DIVb) - (nM * (P.DIVBLEN+1)'(P.LOGR))), (mM + (P.DIVBLEN+1)'(DIVa)), RemOpM, NormShiftM);
+    mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftM, (mM + (P.DIVBLEN+1)'(DIVa)), RemOpM, NormShiftM);
     mux2 #(P.DIVb+4)    presresultmux(NormQuotM, NormRemM, RemOpM, PreResultM);
     assign PreIntResultM = $signed(PreResultM >>> NormShiftM); 
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index ab0941aca..35757e480 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -42,7 +42,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 IntDivE, W64E,
   output logic                 ISpecialCaseE,
   output logic [P.DURLEN-1:0]  CyclesE,
-  output logic [P.DIVBLEN:0]   nM, mM,
+  output logic [P.DIVBLEN:0]   mM, IntDivNormShiftM,
   output logic                 ALTBM, IntDivM, W64M,
   output logic                 AsM, BsM, BZeroM,
   output logic [P.XLEN-1:0]    AM
@@ -53,7 +53,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
-  logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
+  logic [P.DIVBLEN:0]          mE, ell;                             // Leading zeros of inputs
   logic [P.DIVBLEN:0]          IntResultBits;                       // bits in integer result
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
@@ -126,27 +126,21 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);          
 
     /* verilator lint_off WIDTH */
-    assign IntResultBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
+    assign IntResultBits = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
     /* verilator lint_on WIDTH */
 
     // Integer special cases (terminate immediately)
     assign ISpecialCaseE = BZeroE | ALTBE;
 
-    // calculate number of fractional digits nE and right shift amount RightShiftX to complete in discrete number of steps
-
+    // calculate right shift amount RightShiftX to complete in discrete number of steps
     if (P.LOGRK > 0) begin // more than 1 bit per cycle
       logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
       logic [P.DIVBLEN:0] IntSteps;
-      /* verilator lint_off WIDTH */
-      // n = k*ceil((r+p)/rk) - 1
-      assign IntTrunc = IntResultBits % P.RK;                       // Truncation check for ceiling operator
-      assign IntSteps = (IntResultBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
-      assign nE = (IntSteps * P.DIVCOPIES) - 1;                     // Fractional digits = total digits - 1 integer digit
+      /* verilator lint_offf WIDTH */
       assign RightShiftX = P.RK - 1 - ((IntResultBits - 1) % P.RK); // Right shift amount
-      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in nE steps
+      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps
       /* verilator lint_on WIDTH */
     end else begin // radix 2 1 copy doesn't require shifting
-      assign nE = p; 
       assign DivXShifted = DivX;
     end
   end else begin
@@ -199,17 +193,22 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBits, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
+    logic [P.DIVBLEN:0] IntDivNormShiftE;
+    /* verilator lint_off WIDTH */
+    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    /* verilator lint_on WIDTH */
+
     // pipeline registers
-    flopen #(1)        mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
-    flopen #(1)       altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
-    flopen #(1)      bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
-    flopen #(1)      asignreg(clk, IFDivStartE, AsE,      AsM);
-    flopen #(1)      bsignreg(clk, IFDivStartE, BsE,      BsM);
-    flopen #(P.DIVBLEN+1) nreg(clk, IFDivStartE, nE,       nM); 
-    flopen #(P.DIVBLEN+1) mreg(clk, IFDivStartE, mE,       mM);
-    flopen #(P.XLEN)   srcareg(clk, IFDivStartE, AE,       AM);
+    flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
+    flopen #(1)         altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
+    flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
+    flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
+    flopen #(1)        bsignreg(clk, IFDivStartE, BsE,      BsM);
+    flopen #(P.DIVBLEN+1) nsreg(clk, IFDivStartE, IntDivNormShiftE, IntDivNormShiftM); 
+    flopen #(P.DIVBLEN+1)  mreg(clk, IFDivStartE, mE,       mM);
+    flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
     if (P.XLEN==64) 
-      flopen #(1)      w64reg(clk, IFDivStartE, W64E,     W64M);
+      flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
   end
 
 endmodule

From f539f6171b6f63c1e12eaa0567fc70303812b7a8 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 14:55:36 -0800
Subject: [PATCH 07/11] Simplified integer postnormalization shift

---
 src/fpu/fdivsqrt/fdivsqrt.sv         |  6 +++---
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv |  7 ++-----
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 12 ++++++++----
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrt.sv b/src/fpu/fdivsqrt/fdivsqrt.sv
index 751486f86..ac5c2c338 100644
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -67,7 +67,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   // Integer div/rem signals                                
   logic                        BZeroM;                       // Denominator is zero
   logic                        IntDivM;                      // Integer operation
-  logic [P.DIVBLEN:0]          mM, IntDivNormShiftM;         // Shift amounts
+  logic [P.DIVBLEN:0]          IntNormShiftM;                // Integer normalizatoin shift amount
   logic                        ALTBM, AsM, BsM, W64M;        // Special handling for postprocessor
   logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
   logic                        ISpecialCaseE;                // Integer div/remainder special cases
@@ -77,7 +77,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
     .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
     // Int-specific 
     .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
-    .BZeroM, .IntDivNormShiftM, .mM, .AM, 
+    .BZeroM, .IntNormShiftM, .AM, 
     .IntDivM, .W64M, .ALTBM, .AsM, .BsM);
 
   fdivsqrtfsm #(P) fdivsqrtfsm(                                  // FSM
@@ -96,6 +96,6 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
     .SqrtE, .Firstun, .SqrtM, .SpecialCaseM, 
     .UmM, .WZeroE, .DivStickyM, 
     // Int-specific 
-    .IntDivNormShiftM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
+    .IntNormShiftM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
     .FIntDivResultM);
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 58649e3a8..3b6115201 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -37,7 +37,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   input  logic               Firstun, SqrtM, SpecialCaseM, 
   input  logic [P.XLEN-1:0]  AM,
   input  logic               RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
-  input  logic [P.DIVBLEN:0] mM, IntDivNormShiftM,
+  input  logic [P.DIVBLEN:0] IntNormShiftM,
   output logic [P.DIVb:0]    UmM,               // result significand
   output logic               WZeroE,
   output logic               DivStickyM,
@@ -96,7 +96,6 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
 
   // Integer quotient or remainder correction, normalization, and special cases
   if (P.IDIV_ON_FPU) begin:intpostproc // Int supported
-    logic [P.DIVBLEN:0] NormShiftM;
     logic [P.DIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM;
     logic signed [P.DIVb+3:0] PreResultM, PreIntResultM;
 
@@ -110,10 +109,8 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVb+4) quotresmux(UnsignedQuotM, -UnsignedQuotM, NegQuotM, NormQuotM);
 
     // Select quotient or remainder and do normalization shift
-    localparam DIVa        = (P.DIVb+1-P.XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
-    mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftM, (mM + (P.DIVBLEN+1)'(DIVa)), RemOpM, NormShiftM);
     mux2 #(P.DIVb+4)    presresultmux(NormQuotM, NormRemM, RemOpM, PreResultM);
-    assign PreIntResultM = $signed(PreResultM >>> NormShiftM); 
+    assign PreIntResultM = $signed(PreResultM >>> IntNormShiftM); 
 
     // special case logic
     // terminates immediately when B is Zero (div 0) or |A| has more leading 0s than |B|
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 35757e480..137f54d99 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -42,7 +42,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 IntDivE, W64E,
   output logic                 ISpecialCaseE,
   output logic [P.DURLEN-1:0]  CyclesE,
-  output logic [P.DIVBLEN:0]   mM, IntDivNormShiftM,
+  output logic [P.DIVBLEN:0]   IntNormShiftM,
   output logic                 ALTBM, IntDivM, W64M,
   output logic                 AsM, BsM, BZeroM,
   output logic [P.XLEN-1:0]    AM
@@ -193,10 +193,15 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBits, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
-    logic [P.DIVBLEN:0] IntDivNormShiftE;
+    logic [P.DIVBLEN:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
+    logic               RemOpE;
+
     /* verilator lint_off WIDTH */
     assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    assign IntRemNormShiftE = mE + (P.DIVb+1-P.XLEN);             // m + b - (N-1) for remainder normalization shift
     /* verilator lint_on WIDTH */
+    assign RemOpE = Funct3E[1];
+    mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);
 
     // pipeline registers
     flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
@@ -204,8 +209,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
     flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
     flopen #(1)        bsignreg(clk, IFDivStartE, BsE,      BsM);
-    flopen #(P.DIVBLEN+1) nsreg(clk, IFDivStartE, IntDivNormShiftE, IntDivNormShiftM); 
-    flopen #(P.DIVBLEN+1)  mreg(clk, IFDivStartE, mE,       mM);
+    flopen #(P.DIVBLEN+1) nsreg(clk, IFDivStartE, IntNormShiftE, IntNormShiftM); 
     flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
     if (P.XLEN==64) 
       flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);

From d92f3e0216a398a33d56523aad4a255b97f19b85 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 16:42:32 -0800
Subject: [PATCH 08/11] fdivsqrt cleanup

---
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 2 +-
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 3b6115201..e9fd2fd2c 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -118,7 +118,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
       if (BZeroM) begin         // Divide by zero
         if (RemOpM) IntDivResultM = AM;  
         else        IntDivResultM = {(P.XLEN){1'b1}};
-     end else if (ALTBM) begin // Numerator is zero
+     end else if (ALTBM) begin // Numerator is small
         if (RemOpM) IntDivResultM = AM;
         else        IntDivResultM = '0;
      end else       IntDivResultM = PreIntResultM[P.XLEN-1:0];
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 137f54d99..66ba957e8 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -198,7 +198,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
     /* verilator lint_off WIDTH */
     assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
-    assign IntRemNormShiftE = mE + (P.DIVb+1-P.XLEN);             // m + b - (N-1) for remainder normalization shift
+    assign IntRemNormShiftE = mE + (P.DIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
     /* verilator lint_on WIDTH */
     assign RemOpE = Funct3E[1];
     mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);

From 1302a89baf7aa4a902d95faf48682fb97f1dcd46 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 18:01:13 -0800
Subject: [PATCH 09/11] divider cleanup

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv  | 18 +++++++++---------
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv |  8 ++++----
 src/fpu/unpackinput.sv              |  6 ------
 3 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index bba6e8005..d5c571940 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -30,12 +30,12 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 IntDivE,
-  input  logic [P.DIVBLEN:0]   IntResultBits,
+  input  logic [P.DIVBLEN:0]   IntResultBitsE,
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
-  logic [P.DURLEN+1:0] Nf, FPResultBits; // number of fractional bits
-  logic [P.DIVBLEN:0]  ResultBits; // number of result bits;
+  logic [P.DURLEN+1:0] Nf, FPResultBitsE; // number of fractional bits, number of floating-point result bits
+  logic [P.DIVBLEN:0]  ResultBitsE; // number of result bits
 
   // DIVN = P.NF+3
   // NS = NF + 1
@@ -72,16 +72,16 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   // Integer division needs p fractional + r integer result bits
   // FP Division needs at least Nf fractional bits + 2 guard/round bits and one integer digit (LOG R integer bits) = Nf + 2 + r bits
   // FP Sqrt needs at least Nf fractional bits, 2 guard/round bits, and *** shift bits
-  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBits / rk)
+  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
 
   always_comb begin 
-    if (SqrtE) FPResultBits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 rather than +2; is it related to DIVCOPIES logic below?
-    else       FPResultBits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
+    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 and +0 rather than +2; is it related to DIVCOPIES logic below?
+    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
 
-    if (P.IDIV_ON_FPU) ResultBits = IntDivE ? IntResultBits : FPResultBits;
-    else               ResultBits = FPResultBits;
+    if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
+    else               ResultBitsE = FPResultBitsE;
 
-    assign CyclesE = (ResultBits-1)/(P.RK) + 1; // ceil (ResultBits/rk)
+    assign CyclesE = (ResultBitsE-1)/(P.RK) + 1; // ceil (ResultBitsE/rk)
   end 
   /* verilator lint_on WIDTH */
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 66ba957e8..e950a40bd 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -54,7 +54,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
   logic [P.DIVBLEN:0]          mE, ell;                             // Leading zeros of inputs
-  logic [P.DIVBLEN:0]          IntResultBits;                       // bits in integer result
+  logic [P.DIVBLEN:0]          IntResultBitsE;                      // bits in integer result
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
   logic                        SignedDivE;                          // signed division
@@ -126,7 +126,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);          
 
     /* verilator lint_off WIDTH */
-    assign IntResultBits = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
+    assign IntResultBitsE = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
     /* verilator lint_on WIDTH */
 
     // Integer special cases (terminate immediately)
@@ -137,7 +137,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
       logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
       logic [P.DIVBLEN:0] IntSteps;
       /* verilator lint_offf WIDTH */
-      assign RightShiftX = P.RK - 1 - ((IntResultBits - 1) % P.RK); // Right shift amount
+      assign RightShiftX = P.RK - 1 - ((IntResultBitsE - 1) % P.RK); // Right shift amount
       assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps
       /* verilator lint_on WIDTH */
     end else begin // radix 2 1 copy doesn't require shifting
@@ -190,7 +190,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
 
   // Number of FSM cycles (to FSM)
-  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBits, .CyclesE);
+  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
     logic [P.DIVBLEN:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
diff --git a/src/fpu/unpackinput.sv b/src/fpu/unpackinput.sv
index c551e8173..b3d7f901e 100644
--- a/src/fpu/unpackinput.sv
+++ b/src/fpu/unpackinput.sv
@@ -83,7 +83,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
       assign BadNaNBox = ~(Fmt|(&In[P.FLEN-1:P.LEN1])); // Check NaN boxing
       always_comb
         if (BadNaNBox) begin
-//          PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, In[P.LEN1-P.NE1-3:0]};
           PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, {(P.LEN1-P.NE1-2){1'b0}}};
         end else 
           PostBox = In;
@@ -143,8 +142,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
         if (BadNaNBox) begin
           case (Fmt)
             P.FMT: PostBox = In;
-//            P.FMT1: PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, In[P.LEN1-P.NE1-3:0]};
-//            P.FMT2: PostBox = {{(P.FLEN-P.LEN2){1'b1}}, 1'b1, {(P.NE2+1){1'b1}}, In[P.LEN2-P.NE2-3:0]};
             P.FMT1: PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, {(P.LEN1-P.NE1-2){1'b0}}};
             P.FMT2: PostBox = {{(P.FLEN-P.LEN2){1'b1}}, 1'b1, {(P.NE2+1){1'b1}}, {(P.LEN2-P.NE2-2){1'b0}}};
             default: PostBox = 'x;
@@ -230,9 +227,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
         if (BadNaNBox) begin
           case (Fmt)
             2'b11: PostBox = In;
-//            2'b01: PostBox = {{(P.Q_LEN-P.D_LEN){1'b1}}, 1'b1, {(P.D_NE+1){1'b1}}, In[P.D_LEN-P.D_NE-3:0]};
-//            2'b00: PostBox = {{(P.Q_LEN-P.S_LEN){1'b1}}, 1'b1, {(P.S_NE+1){1'b1}}, In[P.S_LEN-P.S_NE-3:0]};
-//            2'b10: PostBox = {{(P.Q_LEN-P.H_LEN){1'b1}}, 1'b1, {(P.H_NE+1){1'b1}}, In[P.H_LEN-P.H_NE-3:0]};
             2'b01: PostBox = {{(P.Q_LEN-P.D_LEN){1'b1}}, 1'b1, {(P.D_NE+1){1'b1}}, {(P.D_LEN-P.D_NE-2){1'b0}}};
             2'b00: PostBox = {{(P.Q_LEN-P.S_LEN){1'b1}}, 1'b1, {(P.S_NE+1){1'b1}}, {(P.S_LEN-P.S_NE-2){1'b0}}};
             2'b10: PostBox = {{(P.Q_LEN-P.H_LEN){1'b1}}, 1'b1, {(P.H_NE+1){1'b1}}, {(P.H_LEN-P.H_NE-2){1'b0}}};

From 6ed5ba4a85c88529775df12641fad004c56b9fc3 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 18:19:41 -0800
Subject: [PATCH 10/11] Simplified out LOGRK parameter

---
 config/shared/config-shared.vh      | 5 ++---
 config/shared/parameter-defs.vh     | 1 -
 src/cvw.sv                          | 1 -
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 5 ++---
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 17b1ede83..10b56f24e 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -97,11 +97,10 @@ localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input: max(XLEN, NF+2)
 localparam LOGR        = $clog2(RADIX);                             // r = log(R)
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
-localparam LOGRK       = $clog2(RK);                                // log2(r*k)
-localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // 
+localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // *** relate to algorithm for rest of these
 localparam DURLEN      = $clog2(FPDUR+1);
 localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
-localparam DIVBLEN     = $clog2(DIVb+2)-1;
+localparam DIVBLEN     = $clog2(DIVb+2)-1;                          // *** where is 2 coming from?
 
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh
index 0c377c02d..85c9d1c19 100644
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@@ -180,7 +180,6 @@ localparam cvw_t P = '{
   DIVN        : DIVN,
   LOGR        : LOGR,
   RK          : RK,
-  LOGRK       : LOGRK,
   FPDUR       : FPDUR,
   DURLEN      : DURLEN,
   DIVb        : DIVb,
diff --git a/src/cvw.sv b/src/cvw.sv
index 02105823e..3c32982bd 100644
--- a/src/cvw.sv
+++ b/src/cvw.sv
@@ -272,7 +272,6 @@ typedef struct packed {
   int DIVN       ;
   int LOGR       ;
   int RK         ;
-  int LOGRK      ;
   int FPDUR      ;
   int DURLEN     ;
   int DIVb       ;
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index e950a40bd..97ceeb085 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -133,9 +133,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     assign ISpecialCaseE = BZeroE | ALTBE;
 
     // calculate right shift amount RightShiftX to complete in discrete number of steps
-    if (P.LOGRK > 0) begin // more than 1 bit per cycle
-      logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
-      logic [P.DIVBLEN:0] IntSteps;
+    if (P.RK > 1) begin // more than 1 bit per cycle
+      logic [$clog2(P.RK)-1:0] RightShiftX;
       /* verilator lint_offf WIDTH */
       assign RightShiftX = P.RK - 1 - ((IntResultBitsE - 1) % P.RK); // Right shift amount
       assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps

From 35efb7082c8dcdc6ca4794297d6dc4e8009619de Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 18:33:08 -0800
Subject: [PATCH 11/11] fdivsqrt parameter cleanup

---
 config/shared/config-shared.vh  | 10 ++++++++--
 config/shared/parameter-defs.vh |  1 -
 src/cvw.sv                      |  1 -
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 10b56f24e..12967764f 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -93,11 +93,17 @@ localparam NF2   = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_NF   : H_NF);
 localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
+// intermediate division parameters not directly used in Divider
+localparam FPDIVN      = NF+3; // length of floating-point inputs: Ns + 2 = Nf + 3 for 1 integer bit, Nf fractional bits, 2 extra bits to shift sqrt into [1/4, 1)
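+localparam DIVN        = ((FPDIVN<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVN+3; // standard length of input: XLEN when IDIV_ON_FPU and FPDIVN < XLEN, otherwise FPDIVN+3 *** earlier note said max(XLEN, NF+2); confirm the intended +3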
+
 // division constants
-localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input: max(XLEN, NF+2)
+
+// *** define NF+2, justify, use in DIVN
 localparam LOGR        = $clog2(RADIX);                             // r = log(R)
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
-localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // *** relate to algorithm for rest of these
+//localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // *** relate to algorithm for rest of these
+localparam FPDUR       = (DIVN+LOGR-1)/RK + 1 ;               // ceiling((DIVN+LOGR)/RK)
 localparam DURLEN      = $clog2(FPDUR+1);
 localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
 localparam DIVBLEN     = $clog2(DIVb+2)-1;                          // *** where is 2 coming from?
diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh
index 85c9d1c19..57d61fc00 100644
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@@ -177,7 +177,6 @@ localparam cvw_t P = '{
   NORMSHIFTSZ : NORMSHIFTSZ,
   LOGNORMSHIFTSZ : LOGNORMSHIFTSZ,
   CORRSHIFTSZ : CORRSHIFTSZ,
-  DIVN        : DIVN,
   LOGR        : LOGR,
   RK          : RK,
   FPDUR       : FPDUR,
diff --git a/src/cvw.sv b/src/cvw.sv
index 3c32982bd..cc968b803 100644
--- a/src/cvw.sv
+++ b/src/cvw.sv
@@ -269,7 +269,6 @@ typedef struct packed {
   int CORRSHIFTSZ;
 
 // division constants
-  int DIVN       ;
   int LOGR       ;
   int RK         ;
   int FPDUR      ;