Merge pull request #646 from kevindkim723/sqrtbugfix_USLC

Square root R=4 K=2 bug fix
2025-02-11 06:05:49 +00:00 · 2024-03-04 16:04:14 -08:00 · 2024-03-04 16:04:14 -08:00 · 2e31bf021c
commit 2e31bf021c
parent 6ed2376582 10ab07975f
7 changed files with 34 additions and 27 deletions
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@ -94,7 +94,7 @@ localparam LOGR        = $clog2(RADIX);                             // r = log(R
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated

 // intermediate division parameters not directly used in fdivsqrt hardware
-localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit to allow sqrt being shifted right
+localparam FPDIVMINb   = NF + 2; // minimum length of fractional part: Nf result bits + guard and round bits
 //localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right, if radix4 doesn't use a right shift.  This version saves one cycle on double-precision with R=4,k=4.  However, it doesn't work yet because C is too short, so k is incorrectly calculated as a 1 in the lsb after the last step.
 localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
 localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@ -71,8 +71,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)

  always_comb begin 
-    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
-    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits 
+    FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits (same count for division and square root)

    if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
    else               ResultBitsE = FPResultBitsE;
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@ -72,20 +72,19 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (

  // UOTFC Result U and UM registers/initialization mux
  // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 otherwise
-  assign initU  = {SqrtE, {(P.DIVb){1'b0}}};
-  assign initUM = {~SqrtE, {(P.DIVb){1'b0}}};
+  assign initU  ={(P.DIVb+1){1'b0}};
+  assign initUM = {{1'b1}, {(P.DIVb){1'b0}}};
  mux2   #(P.DIVb+1)  Umux(UNext[P.DIVCOPIES-1],  initU,  IFDivStartE, UMux);
  mux2   #(P.DIVb+1) UMmux(UMNext[P.DIVCOPIES-1], initUM, IFDivStartE, UMMux);
  flopen #(P.DIVb+1)  UReg(clk, FDivBusyE, UMux,  U[0]);
  flopen #(P.DIVb+1) UMReg(clk, FDivBusyE, UMMux, UM[0]);

  // C register/initialization mux
-  // Initialize C to -1 for sqrt and -R for division
  logic [1:0] initCUpper;
  if(P.RADIX == 4) begin
-    mux2 #(2) cuppermux4(2'b00, 2'b11, SqrtE, initCUpper);
+    assign initCUpper = 2'b00;
  end else begin
-    mux2 #(2) cuppermux2(2'b10, 2'b11, SqrtE, initCUpper);
+    assign initCUpper = 2'b10;
  end
  
  assign initC = {initCUpper, {P.DIVb{1'b0}}};
@ -108,9 +107,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
          .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
          .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
      end else begin: stage
-        logic j1;
-        assign j1 = (i == 0 & ~C[0][P.DIVb-1]);
-        fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1,
+        fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, 
          .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
          .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
      end
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@ -174,9 +174,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (

  logic [P.DIVb:0] PreSqrtX;
  assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
-  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
-  if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
-  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) 
+  mux2 #(P.DIVb+4) sqrtxmux({4'b0,Xnorm[P.DIVb:1]}, {5'b00, Xnorm[P.DIVb:2]}, EvenExp, SqrtX); // X/2 if exponent odd, X/4 if exponent even

 /*  
  // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@ -32,7 +32,7 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.DIVb:0]   U,UM,               // U1.DIVb
  input  logic [P.DIVb+3:0] WS, WC,             // Q4.DIVb
  input  logic [P.DIVb+1:0] C,                  // Q2.DIVb
-  input  logic              SqrtE, j1,
+  input  logic              SqrtE, 
  output logic [P.DIVb+1:0] CNext,              // Q2.DIVb
  output logic              un,
  output logic [P.DIVb:0]   UNext, UMNext,      // U1.DIVb
@ -48,13 +48,16 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
  logic [7:0]               WCmsbs, WSmsbs;     // U4.4
  logic                     CarryIn;
  logic [P.DIVb+3:0]        WSA, WCA;           // Q4.DIVb
+  logic j0,j1;

  // Digit Selection logic
+  assign j0     = ~C[P.DIVb+1];             // first step of R digit selection: C = 00...0
+  assign j1     = C[P.DIVb] ^ C[P.DIVb-1];  // second step of R digit selection: C = 1100...0
  assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
  assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
  assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
  assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
-  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
+  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .j0, .udigit);
  assign un = 1'b0; // unused for radix 4

  // F generation logic
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@ -31,7 +31,8 @@ module fdivsqrtuslc4cmp (
  input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
  input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 residual most significant bits
-  input  logic       SqrtE, j1,
+  input  logic       SqrtE, 
+  input  logic       j0,j1,             // are we on first (j0) or second step (j1) of digit selection
  output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
  logic [6:0] Wmsbs;
@ -46,7 +47,9 @@ module fdivsqrtuslc4cmp (
  // Wmsbs = |        |

  logic [6:0] mk2, mk1, mk0, mkm1;
+  logic [6:0] mkj2, mkj1, mkj0, mkjm1;
  logic [6:0] mks2[7:0], mks1[7:0]; 
+  logic sqrtspecial;

  // Prepopulate table of mks0
  assign mks2[0] = 12;
@ -65,20 +68,26 @@ module fdivsqrtuslc4cmp (
  assign mks1[5] = 8; // is the logic any cheaper if this is a 6?
  assign mks1[6] = 8;
  assign mks1[7] = 8;
+  
+  // handles special case when j = 0 or j = 1 for sqrt
+  assign mkj2 = 20; // when j = 1, use mks2[101]; when j = 0, use anything bigger than 7.
+  assign mkj1 = j1 ? 8 : 0; // when j = 1, use mks1[101] = 8; when j = 0, use 0 so we choose u_0 = 1
+  assign sqrtspecial = SqrtE & (j1 | j0);

-  // Choose A for current operation
+  // Choose A for current operation 
 always_comb
    if (SqrtE) begin 
-      if (j1) A = 3'b101;
-      else if (Smsbs == 5'b10000) A = 3'b111;
+      if (Smsbs[4]) A = 3'b111; // *** can we get rid of SMSBs case?
      else A = Smsbs[2:0];
    end else A = Dmsbs;

+    
  // Choose selection constants based on a
-  assign mk2 = mks2[A];
-  assign mk1 = mks1[A];
-  assign mk0 = -mks1[A];
-  assign mkm1 = (A == 3'b000) ? -13 : -mks2[A]; // asymmetry in table
+  
+  assign mk2 = sqrtspecial ? mkj2 : mks2[A];
+  assign mk1 = sqrtspecial ? mkj1 : mks1[A];
+  assign mk0 = -mk1;
+  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table *** can we hide?
 
  // Compare residual W to selection constants to choose digit
  always_comb 
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@ -51,7 +51,8 @@ configs: $(CONFIG)
 $(CONFIG):
 	@echo $(CONFIG)
 	cp -r $(OLDCONFIGDIR)/shared/*.vh $(CONFIGDIR)
-	cp -r $(OLDCONFIGDIR)/$(CONFIG)/* $(CONFIGDIR)
+#   cp -r $(OLDCONFIGDIR)/$(CONFIG)/* $(CONFIGDIR)
+	cp -r $(OLDCONFIGDIR)/deriv/$(CONFIG)/* $(CONFIGDIR)

 # adjust DTIM and IROM to reasonable values depending on config	
 ifneq ($(filter $(CONFIG), $(DIRS32)),)
@ -61,8 +62,8 @@ else ifneq ($(filter $(CONFIG), $(DIRS64)),)
 	sed -i "s/DTIM_RANGE.*/DTIM_RANGE	= 56\'h01FF;/g" $(CONFIGDIR)/config.vh
 	sed -i "s/IROM_RANGE.*/IROM_RANGE	= 56\'h01FF;/g" $(CONFIGDIR)/config.vh
 else 
-    $(info $(CONFIG) does not exist in $(DIRS32) or $(DIRS64))
-    @echo "Config not in list, RAM_RANGE will be unmodified"
+	$(info $(CONFIG) does not exist in $(DIRS32) or $(DIRS64))
+	@echo "Config not in list, RAM_RANGE will be unmodified"
 endif

 # if USESRAM = 1, set that in the config file, otherwise reduce sizes