From 65c5ec6e9d2b88e85dc9b0a683193f510f655d98 Mon Sep 17 00:00:00 2001 From: David Harris Date: Sun, 12 Nov 2023 06:15:47 -0800 Subject: [PATCH 1/4] fdivsqrt comment improvements --- src/fpu/fdivsqrt/fdivsqrtcycles.sv | 2 +- src/fpu/fdivsqrt/fdivsqrtexpcalc.sv | 8 +++++--- src/fpu/fdivsqrt/fdivsqrtfgen2.sv | 8 ++++---- src/fpu/fdivsqrt/fdivsqrtfgen4.sv | 12 ++++++------ src/fpu/fdivsqrt/fdivsqrtfsm.sv | 2 +- src/fpu/fdivsqrt/fdivsqrtiter.sv | 8 ++++---- src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 11 ++++++----- src/fpu/fdivsqrt/fdivsqrtqsel2.sv | 25 ++++++++----------------- src/fpu/fdivsqrt/fdivsqrtstage2.sv | 16 +++++----------- 9 files changed, 40 insertions(+), 52 deletions(-) diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv index 20fb16f62..6043ebb4a 100644 --- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv +++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv @@ -30,7 +30,7 @@ module fdivsqrtcycles import cvw::*; #(parameter cvw_t P) ( input logic [P.FMTBITS-1:0] FmtE, input logic SqrtE, input logic IntDivE, - input logic [P.DIVBLEN-1:0] IntResultBitsE, + input logic [P.DIVBLEN-1:0] IntResultBitsE, output logic [P.DURLEN-1:0] CyclesE ); diff --git a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv index a1dd82e35..cf243a84b 100644 --- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv +++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv @@ -28,17 +28,19 @@ module fdivsqrtexpcalc import cvw::*; #(parameter cvw_t P) ( input logic [P.FMTBITS-1:0] Fmt, - input logic [P.NE-1:0] Xe, Ye, + input logic [P.NE-1:0] Xe, Ye, // input exponents input logic Sqrt, input logic XZero, - input logic [P.DIVBLEN-1:0] ell, m, - output logic [P.NE+1:0] Ue + input logic [P.DIVBLEN-1:0] ell, m, // number of leading 0s in Xe and Ye + output logic [P.NE+1:0] Ue // result exponent ); logic [P.NE-2:0] Bias; logic [P.NE+1:0] SXExp; logic [P.NE+1:0] SExp; logic [P.NE+1:0] DExp; + + // Determine exponent bias according to the format if (P.FPSIZES == 1) begin assign Bias = 
(P.NE-1)'(P.BIAS); diff --git a/src/fpu/fdivsqrt/fdivsqrtfgen2.sv b/src/fpu/fdivsqrt/fdivsqrtfgen2.sv index 990e3f19f..cf398f570 100644 --- a/src/fpu/fdivsqrt/fdivsqrtfgen2.sv +++ b/src/fpu/fdivsqrt/fdivsqrtfgen2.sv @@ -28,12 +28,12 @@ module fdivsqrtfgen2 import cvw::*; #(parameter cvw_t P) ( input logic up, uz, - input logic [P.DIVb+3:0] C, U, UM, - output logic [P.DIVb+3:0] F + input logic [P.DIVb+3:0] C, U, UM, // Q4.DIVb (extended from shorter forms) + output logic [P.DIVb+3:0] F // Q4.DIVb ); - logic [P.DIVb+3:0] FP, FN, FZ; + logic [P.DIVb+3:0] FP, FN, FZ; // Q4.DIVb - // Generate for both positive and negative bits + // Generate for both positive and negative quotient digits assign FP = ~(U << 1) & C; assign FN = (UM << 1) | (C & ~(C << 2)); assign FZ = '0; diff --git a/src/fpu/fdivsqrt/fdivsqrtfgen4.sv b/src/fpu/fdivsqrt/fdivsqrtfgen4.sv index fc648f5bd..e2cec1ab4 100644 --- a/src/fpu/fdivsqrt/fdivsqrtfgen4.sv +++ b/src/fpu/fdivsqrt/fdivsqrtfgen4.sv @@ -27,14 +27,14 @@ //////////////////////////////////////////////////////////////////////////////////////////////// module fdivsqrtfgen4 import cvw::*; #(parameter cvw_t P) ( - input logic [3:0] udigit, - input logic [P.DIVb+3:0] C, U, UM, - output logic [P.DIVb+3:0] F + input logic [3:0] udigit, // {2, 1, -1, -2}; all cold for zero + input logic [P.DIVb+3:0] C, U, UM, // Q4.DIVb (extended from shorter forms) + output logic [P.DIVb+3:0] F // Q4.DIVb ); - logic [P.DIVb+3:0] F2, F1, F0, FN1, FN2; + logic [P.DIVb+3:0] F2, F1, F0, FN1, FN2; // Q4.DIVb - // Generate for both positive and negative bits - assign F2 = (~U << 2) & (C << 2); + // Generate for both positive and negative digits + assign F2 = (~U << 2) & (C << 2); // assign F1 = ~(U << 1) & C; assign F0 = '0; assign FN1 = (UM << 1) | (C & ~(C << 3)); diff --git a/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/src/fpu/fdivsqrt/fdivsqrtfsm.sv index 0e2cba90e..862d53b25 100644 --- a/src/fpu/fdivsqrt/fdivsqrtfsm.sv +++ b/src/fpu/fdivsqrt/fdivsqrtfsm.sv @@ -57,7 +57,7 @@ 
module fdivsqrtfsm import cvw::*; #(parameter cvw_t P) ( // terminate immediately on special cases assign FSpecialCaseE = XZeroE | XInfE | XNaNE | (XsE&SqrtE) | (YZeroE | YInfE | YNaNE)&~SqrtE; if (P.IDIV_ON_FPU) assign SpecialCaseE = IntDivE ? ISpecialCaseE : FSpecialCaseE; - else assign SpecialCaseE = FSpecialCaseE; + else assign SpecialCaseE = FSpecialCaseE; flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc always_ff @(posedge clk) begin diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv index 0f66982ab..863d94837 100644 --- a/src/fpu/fdivsqrt/fdivsqrtiter.sv +++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv @@ -104,14 +104,14 @@ module fdivsqrtiter import cvw::*; #(parameter cvw_t P) ( for(i=0; $unsigned(i) -1 to choose 0, -1, 1 respectively - assign magnitude = ~((WS[2]^WC[2]) & (WS[1]^WC[1]) & + //if p2 * p1 * p0, W = -1 and choose digit of 0 + assign uz = ((WS[2]^WC[2]) & (WS[1]^WC[1]) & (WS[0]^WC[0])); + + // Otherwise determine sign using carry chain: sign = p3 ^ g_2:0 assign sign = (WS[3]^WC[3])^ (WS[2] & WC[2] | ((WS[2]^WC[2]) & (WS[1]&WC[1] | ((WS[1]^WC[1]) & (WS[0]&WC[0]))))); // Produce digit = +1, 0, or -1 - assign up = magnitude & ~sign; - assign uz = ~magnitude; - assign un = magnitude & sign; + assign up = ~uz & ~sign; + assign un = ~uz & sign; endmodule diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel4.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv similarity index 72% rename from src/fpu/fdivsqrt/fdivsqrtqsel4.sv rename to src/fpu/fdivsqrt/fdivsqrtuslc4.sv index de520bef2..268ca9ea2 100644 --- a/src/fpu/fdivsqrt/fdivsqrtqsel4.sv +++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv @@ -1,10 +1,10 @@ /////////////////////////////////////////// -// fdivsqrtqsel4.sv +// fdivsqrtuslc4.sv // // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu // Modified:13 January 2022 // -// Purpose: Radix 4 Quotient Digit Selection +// Purpose: Table-based Radix 4 
Unified Quotient/Square Root Digit Selection // // Documentation: RISC-V System on Chip Design Chapter 13 // @@ -26,25 +26,25 @@ // and limitations under the License. //////////////////////////////////////////////////////////////////////////////////////////////// -module fdivsqrtqsel4 ( - input logic [2:0] Dmsbs, - input logic [4:0] Smsbs, - input logic [7:0] WSmsbs, WCmsbs, +module fdivsqrtuslc4 ( + input logic [2:0] Dmsbs, // U0.3 fractional bits after implicit leading 1 + input logic [4:0] Smsbs, // U1.4 leading bits of square root approximation + input logic [7:0] WSmsbs, WCmsbs, // Q4.4 redundant residual most significant bits input logic Sqrt, j1, - output logic [3:0] udigit + output logic [3:0] udigit // {2, 1, -1, -2} digit is 0 if none are hot ); - logic [6:0] Wmsbs; - logic [7:0] PreWmsbs; - logic [2:0] A; + logic [7:0] PreWmsbs; // Q4.4 nonredundant residual msbs + logic [6:0] Wmsbs; // Q4.3 truncated nonredundant residual + logic [2:0] A; // U0.3 upper bits of D or Smsbs, discarding integer bit - assign PreWmsbs = WCmsbs + WSmsbs; - assign Wmsbs = PreWmsbs[7:1]; + assign PreWmsbs = WCmsbs + WSmsbs; // add redundant residual to find msbs + assign Wmsbs = PreWmsbs[7:1]; // truncate least significant bit to Q4.3 to index table // D = 0001.xxx... // Dmsbs = | | // W = xxxx.xxx... 
// Wmsbs = | | - logic [3:0] USel4[1023:0]; + logic [3:0] USel4[1023:0]; // 1024-entry table of 4-bit digit selects, indexed with 3 bits of A and 7 bits of Wmsbs // Prepopulate selection table; this is constant at compile time always_comb begin @@ -101,10 +101,10 @@ module fdivsqrtqsel4 ( // Select A always_comb if (Sqrt) begin - if (j1) A = 3'b101; - else if (Smsbs == 5'b10000) A = 3'b111; - else A = Smsbs[2:0]; - end else A = Dmsbs; + if (j1) A = 3'b101; // on first sqrt iteration A = .101 + else if (Smsbs == 5'b10000) A = 3'b111; // if S = 1.0, use A = .111 + else A = Smsbs[2:0]; // otherwise use A = S (in U0.3 format) + end else A = Dmsbs; // for division, A = D (in U0.3 format, dropping leading 1) // Select quotient digit from lookup table based on A and W assign udigit = USel4[{A,Wmsbs}]; diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv similarity index 90% rename from src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv rename to src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv index fe436413e..ccb5e618a 100644 --- a/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv +++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv @@ -1,10 +1,10 @@ /////////////////////////////////////////// -// fdivsqrtqsel4cmp.sv +// fdivsqrtuslc4cmp.sv // // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu // Modified:13 January 2022 // -// Purpose: Comparator-based Radix 4 Quotient Digit Selection +// Purpose: Comparator-based Radix 4 Unified Quotient/Square Root Digit Selection // // Documentation: RISC-V System on Chip Design Chapter 13 // @@ -26,12 +26,12 @@ // and limitations under the License. 
//////////////////////////////////////////////////////////////////////////////////////////////// -module fdivsqrtqsel4cmp ( +module fdivsqrtuslc4cmp ( input logic [2:0] Dmsbs, // U0.3 fractional bits after implicit leading 1 input logic [4:0] Smsbs, // U1.4 leading bits of square root approximation - input logic [7:0] WSmsbs, WCmsbs, // Q4.4 + input logic [7:0] WSmsbs, WCmsbs, // Q4.4 residual most significant bits input logic SqrtE, j1, - output logic [3:0] udigit + output logic [3:0] udigit // {2, 1, -1, -2} digit is 0 if none are hot ); logic [6:0] Wmsbs; logic [7:0] PreWmsbs; From b49330c5566397238886c73e78e5498ffb0eb6b7 Mon Sep 17 00:00:00 2001 From: David Harris Date: Sun, 12 Nov 2023 10:05:54 -0800 Subject: [PATCH 3/4] Explained sqrt preshifting --- config/shared/config-shared.vh | 1 + src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 41 ++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh index 14de5187e..9635d706b 100644 --- a/config/shared/config-shared.vh +++ b/config/shared/config-shared.vh @@ -99,6 +99,7 @@ localparam RK = LOGR*DIVCOPIES; // r*k bits // intermediate division parameters not directly used in fdivsqrt hardware localparam FPDIVMINb = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit because square root could be shifted right *** explain better +//localparam FPDIVMINb = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right localparam DIVMINb = ((FPDIVMINb Date: Sun, 12 Nov 2023 19:41:12 -0800 Subject: [PATCH 4/4] Divider cleanup --- config/shared/config-shared.vh | 4 ++-- src/fpu/fdivsqrt/fdivsqrtcycles.sv | 6 +++--- src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 13 ++++++++++--- src/fpu/fdivsqrt/fdivsqrtuslc4.sv | 2 +- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/config/shared/config-shared.vh 
b/config/shared/config-shared.vh index 9635d706b..55bca569f 100644 --- a/config/shared/config-shared.vh +++ b/config/shared/config-shared.vh @@ -98,8 +98,8 @@ localparam LOGR = $clog2(RADIX); // r = log(R localparam RK = LOGR*DIVCOPIES; // r*k bits per cycle generated // intermediate division parameters not directly used in fdivsqrt hardware -localparam FPDIVMINb = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit because square root could be shifted right *** explain better -//localparam FPDIVMINb = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right +localparam FPDIVMINb = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit to allow sqrt being shifted right +//localparam FPDIVMINb = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right, if radix4 doesn't use a right shift. This version saves one cycle on double-precision with R=4,k=4. However, it doesn't work yet because C is too short, so k is incorrectly calculated as a 1 in the lsb after the last step. localparam DIVMINb = ((FPDIVMINb