From 86ebdd05f0753693a1d86e27ea48f7e897e4fe90 Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Thu, 21 Jul 2022 17:59:10 +0000
Subject: [PATCH 1/2] Division working too

---
 pipelined/srt/srt.sv       | 2 +-
 pipelined/srt/testbench.sv | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipelined/srt/srt.sv b/pipelined/srt/srt.sv
index 157be2e7f..a7216b9ff 100644
--- a/pipelined/srt/srt.sv
+++ b/pipelined/srt/srt.sv
@@ -2,7 +2,7 @@
 // srt.sv
 //
 // Written: David_Harris@hmc.edu 13 January 2022
-// Modified: cturek@hmc.edu June 2022
+// Modified: cturek@hmc.edu July 2022
 //
 // Purpose: Combined Divide and Square Root Floating Point and Integer Unit
 // 
diff --git a/pipelined/srt/testbench.sv b/pipelined/srt/testbench.sv
index 39696af44..7a4e1897b 100644
--- a/pipelined/srt/testbench.sv
+++ b/pipelined/srt/testbench.sv
@@ -72,7 +72,7 @@ module testbench;
 
   // Equip Int test or Sqrt test
   assign Int = 1'b0;
-  assign Sqrt = 1'b1;
+  assign Sqrt = 1'b0;
 
   // Divider
   srt srt(.clk, .Start(req), 
@@ -101,7 +101,7 @@ module testbench;
     begin
       testnum = 0; 
       errors = 0;
-      $readmemh ("sqrttestvectors", Tests);
+      $readmemh ("testvectors", Tests);
       Vec = Tests[testnum];
       a = Vec[`mema];
       {asign, aExp, afrac} = a;

From fbe8bb2298413d731f70306e100e8cc3881222ec Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Thu, 21 Jul 2022 19:38:06 +0000
Subject: [PATCH 2/2] radix-4 division integrated into srt - not tested

---
 addins/embench-iot                      |   2 +-
 pipelined/config/shared/wally-shared.vh |   2 +-
 pipelined/regression/wave-fpu.do        |   2 +-
 pipelined/src/fpu/divsqrt.sv            |  13 +--
 pipelined/src/fpu/fctrl.sv              |   4 +-
 pipelined/src/fpu/fpu.sv                |   2 +-
 pipelined/src/fpu/otfc.sv               |  70 ++++++++++++++++
 pipelined/src/fpu/qsel.sv               |  89 +++++++++++++++++----
 pipelined/src/fpu/srt.sv                | 102 +++++++++++-------------
 pipelined/src/fpu/srtpreproc.sv         |  78 +++++++++++++++---
 pipelined/testbench/testbench-fp.sv     |   2 +-
 11 files changed, 271 insertions(+), 95 deletions(-)

diff --git a/addins/embench-iot b/addins/embench-iot
index 58ffa0c68..261a65e0a 160000
--- a/addins/embench-iot
+++ b/addins/embench-iot
@@ -1 +1 @@
-Subproject commit 58ffa0c68c52f291d12c5902fc787d2bca94ddf9
+Subproject commit 261a65e0a2d3e8d62d81b1d8fe7e309a096bc6a9
diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index 015ef2611..b2abdff7b 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -101,7 +101,7 @@
 `define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+8) ? (`DIVRESLEN+`NF) : (3*`NF+6))
 
 // division constants
-`define RADIX 32'h2
+`define RADIX 32'h4
 `define DIVCOPIES 32'h1
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF + 3))
 `define EXTRAFRACBITS ((`NF<(`XLEN)) ? (`XLEN - `NF) : 3)
diff --git a/pipelined/regression/wave-fpu.do b/pipelined/regression/wave-fpu.do
index 98c72f170..b71207e09 100644
--- a/pipelined/regression/wave-fpu.do
+++ b/pipelined/regression/wave-fpu.do
@@ -33,7 +33,7 @@ add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/intera
 # add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/otfc/otfc2/*
 # add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/qsel/qsel2/*
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtpreproc/*
-add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/expcalc/*
+# add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/expcalc/*
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtfsm/*
 add wave -group {Testbench} -noupdate /testbenchfp/*
 add wave -group {Testbench} -noupdate /testbenchfp/readvectors/*
diff --git a/pipelined/src/fpu/divsqrt.sv b/pipelined/src/fpu/divsqrt.sv
index a2f0ba8e3..7ba44a953 100644
--- a/pipelined/src/fpu/divsqrt.sv
+++ b/pipelined/src/fpu/divsqrt.sv
@@ -41,7 +41,8 @@ module divsqrt(
   input  logic XNaNE, YNaNE, 
   input  logic DivStartE, 
   input  logic StallM,
-  input logic StallE,
+  input  logic StallE,
+  input  logic SqrtE, SqrtM,
   output logic DivSM,
   output logic DivBusy,
   output logic DivDone,
@@ -55,15 +56,15 @@ module divsqrt(
   logic [`DIVLEN+3:0]  WS, WC;
   logic [`DIVLEN+3:0] StickyWSA;
   logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
-  logic [`DIVLEN-1:0] X;
-  logic [`DIVLEN-1:0] Dpreproc;
+  logic [`DIVLEN+3:0] X;
+  logic [`DIVLEN+3:0] Dpreproc;
   logic [`DURLEN-1:0] Dur;
   logic NegSticky;
 
-  srtpreproc srtpreproc(.Xm(XmE), .Dur, .Ym(YmE), .X,.Dpreproc, .XZeroCnt, .YZeroCnt);
+  srtpreproc srtpreproc(.clk, .DivStart(DivStartE), .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), .Sqrt(SqrtE), .Dur, .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, .XZeroCnt, .YZeroCnt);
 
   srtfsm srtfsm(.reset, .NextWSN, .NextWCN, .WS, .WC, .Dur, .DivBusy, .clk, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, .DivSE(DivSM), .XNaNE, .YNaNE,
                .StickyWSA, .XInfE, .YInfE, .NegSticky(NegSticky), .EarlyTermShiftE(EarlyTermShiftM));
-  srt srt(.clk, .FmtE, .X,.Dpreproc, .NegSticky, .XZeroCnt, .YZeroCnt, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
-                .StickyWSA, .DivBusy, .Qm(QmM), .Rem(), .QeM);
+  srt srt(.clk, .Sqrt(SqrtM), .X,.Dpreproc, .NegSticky, .XZeroCnt, .YZeroCnt, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
+                .StickyWSA, .DivBusy, .Qm(QmM), .Rem());
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/fctrl.sv b/pipelined/src/fpu/fctrl.sv
index 5b6b22ef0..20e4a0099 100755
--- a/pipelined/src/fpu/fctrl.sv
+++ b/pipelined/src/fpu/fctrl.sv
@@ -219,8 +219,8 @@ module fctrl (
 //        110 - add
 //        111 - sub
 //    Div: 
-//        0 - ???
-//        1 - ???
+//        0 - div
+//        1 - sqrt
 //    Cvt Int: {Int to Fp?, 64 bit int?, signed int?}
 //    Cvt Fp: output format
 //        10 - to half
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index cfa46b657..3e214b0f1 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -259,7 +259,7 @@ module fpu (
    //    - fdiv
    //    - fsqrt
    // *** add other opperations
-   divsqrt divsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, 
+   divsqrt divsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
                   .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .DivStartE(DivStartE), 
                   .StallE, .StallM, .DivSM, .DivBusy(FDivBusyE), .QeM, //***change divbusyE to M signal
                   .EarlyTermShiftM, .QmM, .DivDone(DivDoneM));
diff --git a/pipelined/src/fpu/otfc.sv b/pipelined/src/fpu/otfc.sv
index 66af5b3c5..7ecb823e6 100644
--- a/pipelined/src/fpu/otfc.sv
+++ b/pipelined/src/fpu/otfc.sv
@@ -58,6 +58,41 @@ module otfc2 (
 
 endmodule
 
+///////////////////////////////
+// Square Root OTFC, Radix 2 //
+///////////////////////////////
+module sotfc2(
+  input  logic         clk,
+  input  logic         Start,  // mux select between SNext and the initial S value; also wired to SMreg
+  input  logic         sp, sn, // digit selects; sp is tested first, neither set = zero digit
+  input  logic         Sqrt,   // becomes bit `DIVLEN of the initial S value
+  input  logic [`DIVLEN+3:0] C, // mask that locates the bit position being written
+  output logic [`DIVLEN-2:0] Sq, // S narrowed to `DIVLEN-1 bits
+  output logic [`DIVLEN+3:0] S, SM // registered result (S) and its companion value (SM)
+);
+  //  On-the-fly conversion for square root: S and SM are rebuilt every
+  //  cycle from the selected digit, then registered.
+  //  NOTE(review): SM presumably holds S minus one unit in the last place - confirm.
+  logic [`DIVLEN+3:0] SNext, SMNext, SMux;
+  // flopr/mux2/flop are defined elsewhere; Start is presumably flopr's reset - confirm.
+  flopr #(`DIVLEN+4) SMreg(clk, Start, SMNext, SM);
+  mux2 #(`DIVLEN+4) Smux(SNext, {3'b000, Sqrt, {(`DIVLEN){1'b0}}}, Start, SMux);
+  flop #(`DIVLEN+4) Sreg(clk, SMux, S);
+  // C & ~(C << 1) keeps each set bit of C whose lower neighbour is clear.
+  always_comb begin
+    if (sp) begin
+      SNext  = S | (C & ~(C << 1));
+      SMNext = S;
+    end else if (sn) begin
+      SNext  = SM | (C & ~(C << 1));
+      SMNext = SM;
+    end else begin        // neither sp nor sn: zero digit
+      SNext  = S;
+      SMNext = SM | (C & ~(C << 1));
+    end 
+  end
+  assign Sq = S[`DIVLEN] ? S[`DIVLEN-1:1] : S[`DIVLEN-2:0]; // drop bit 0 when S[`DIVLEN] is set
+endmodule
 
 module otfc4 (
   input  logic [3:0]   q,
@@ -110,3 +145,38 @@ module otfc4 (
   // Final Qmeint is in the range [.5, 2)
 
 endmodule
+
+///////////////////////////////
+// Square Root OTFC, Radix 4 //
+///////////////////////////////
+module sotfc4(
+  input  logic [3:0]   s,
+  input  logic         Sqrt,
+  input  logic [`DIVLEN+3:0] S, SM,
+  input  logic [`DIVLEN+3:0] C,
+  output logic [`DIVLEN+3:0] SNext, SMNext
+);
+  // Combinational on-the-fly update of the result pair (S, SM) from the
+  // digit select s.  Priority is s[3] > s[2] > s[1] > s[0]; when no bit
+  // of s is set the zero-digit update is applied.  Sqrt is not read here.
+  // NOTE(review): s[3..0] presumably encode +2, +1, -1, -2 - confirm in qsel4.
+  logic [`DIVLEN+3:0] Lo, Hi, Both;
+
+  // Bit patterns carved out of the mask C and OR-ed into S or SM.
+  // NOTE(review): assumes C is a solid run of ones from the MSB down - confirm.
+  assign Lo   = C & ~(C << 1);
+  assign Hi   = (C << 1) & ~(C << 2);
+  assign Both = C & ~(C << 2);
+
+  // Default is the zero digit; later assignments win, so the tests run
+  // from lowest to highest priority.
+  always_comb begin
+    SNext  = S;
+    SMNext = SM | Both;
+    if (s[0]) begin SNext = SM | Hi;   SMNext = SM | Lo; end
+    if (s[1]) begin SNext = SM | Both; SMNext = SM | Hi; end
+    if (s[2]) begin SNext = S  | Lo;   SMNext = S;       end
+    if (s[3]) begin SNext = S  | Hi;   SMNext = S  | Lo; end
+  end
+
+endmodule
diff --git a/pipelined/src/fpu/qsel.sv b/pipelined/src/fpu/qsel.sv
index 202b3ee81..87c6a4b25 100644
--- a/pipelined/src/fpu/qsel.sv
+++ b/pipelined/src/fpu/qsel.sv
@@ -62,9 +62,36 @@ module qsel2 ( // *** eventually just change to 4 bits
 //   assign #1 qn = magnitude & sign;
 endmodule
 
+////////////////////////////////////
+// Adder Input Generation, Radix 2 //
+////////////////////////////////////
+module fgen2 (
+  input  logic sp, sn,                 // digit selects; sp has priority over sn
+  input  logic [`DIVLEN+3:0] C, S, SM, // mask C and the result pair S / SM
+  output logic [`DIVLEN+3:0] F         // value handed to the adder
+);
+  logic [`DIVLEN+3:0] FP, FN, FZ;
+
+  // Candidate adder inputs for the positive, negative and zero digit.
+  assign FP = ~(S << 1) & C;
+  // Shift C before inverting, exactly as FN1 does in fgen4.  The earlier
+  // (~C << 2) cleared the two low mask bits, dropping C[1:0] from FN.
+  assign FN = (SM << 1) | (C & ~(C << 2));
+  assign FZ = '0;
+
+  // Priority select of the adder input: sp, then sn, otherwise zero.
+  always_comb
+    if (sp)       F = FP;
+    else if (sn)  F = FN;
+    else          F = FZ;
+
+  // Ternary form of the same select: assign F = sp ? FP : (sn ? FN : FZ);
+endmodule
+
 module qsel4 (
 	input logic [`DIVLEN+3:0] D,
 	input logic [`DIVLEN+3:0] WS, WC,
+  input logic Sqrt,
 	output logic [3:0] q
 );
 	logic [6:0] Wmsbs;
@@ -91,45 +118,77 @@ module qsel4 (
             else if(w2>=4)   QSel4[i] = 4'b0100; 
             else if(w2>=-4)  QSel4[i] = 4'b0000; 
             else if(w2>=-13) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
+            else             QSel4[i] = 4'b0001; 
           1: if(w2>=14)      QSel4[i] = 4'b1000;
             else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-15) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
+            else if(w2>=-5)  QSel4[i] = 4'b0000; // was -6
+            else if(~Sqrt&(w2>=-15)) QSel4[i] = 4'b0010; // divide case
+            else if( Sqrt&(w2>=-14)) QSel4[i] = 4'b0010; // sqrt case
+            else             QSel4[i] = 4'b0001; 
           2: if(w2>=15)      QSel4[i] = 4'b1000;
             else if(w2>=4)   QSel4[i] = 4'b0100; 
             else if(w2>=-6)  QSel4[i] = 4'b0000; 
             else if(w2>=-16) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
+            else             QSel4[i] = 4'b0001; 
           3: if(w2>=16)      QSel4[i] = 4'b1000;
             else if(w2>=4)   QSel4[i] = 4'b0100; 
             else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-18) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
+            else if(w2>=-17) QSel4[i] = 4'b0010; // was -18
+            else             QSel4[i] = 4'b0001; 
           4: if(w2>=18)      QSel4[i] = 4'b1000;
             else if(w2>=6)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-20) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
+            else if(w2>=-6)  QSel4[i] = 4'b0000; // was -8
+            else if(~Sqrt&(w2>=-20)) QSel4[i] = 4'b0010; // divide case
+            else if( Sqrt&(w2>=-18)) QSel4[i] = 4'b0010; // sqrt case
+            else             QSel4[i] = 4'b0001; 
           5: if(w2>=20)      QSel4[i] = 4'b1000;
             else if(w2>=6)   QSel4[i] = 4'b0100; 
             else if(w2>=-8)  QSel4[i] = 4'b0000; 
             else if(w2>=-20) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
+            else             QSel4[i] = 4'b0001; 
           6: if(w2>=20)      QSel4[i] = 4'b1000;
             else if(w2>=8)   QSel4[i] = 4'b0100; 
             else if(w2>=-8)  QSel4[i] = 4'b0000; 
             else if(w2>=-22) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-          7: if(w2>=24)      QSel4[i] = 4'b1000;
+            else             QSel4[i] = 4'b0001; 
+          7: if(w2>=22)      QSel4[i] = 4'b1000; // was 24
             else if(w2>=8)   QSel4[i] = 4'b0100; 
             else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-24) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
+            else if(w2>=-23) QSel4[i] = 4'b0010; // was -24
+            else             QSel4[i] = 4'b0001; 
         endcase
       end
   end
 	assign q = QSel4[{Dmsbs,Wmsbs}];
 	
 endmodule
+
+////////////////////////////////////
+// Adder Input Generation, Radix 4 //
+////////////////////////////////////
+module fgen4 (
+  input  logic [3:0] s,
+  input  logic [`DIVLEN+3:0] C, S, SM,
+  output logic [`DIVLEN+3:0] F
+);
+  // Builds one candidate adder input per digit select bit from the mask C
+  // and the result pair S / SM, then forwards the selected one on F.
+  // NOTE(review): s[3..0] presumably encode +2, +1, -1, -2 - confirm in qsel4.
+  logic [`DIVLEN+3:0] Pos2, Pos1, Neg1, Neg2;
+
+  assign Pos2 = (~S << 2) & (C << 2);                 // used when s[3]
+  assign Pos1 = ~(S << 1) & C;                        // used when s[2]
+  assign Neg1 = (SM << 1) | (C & ~(C << 2));          // used when s[1]
+  assign Neg2 = (SM << 2) | ((C << 2) & ~(C << 4));   // used when s[0]
+
+  // Start from the zero-digit value, then let each select bit override
+  // it.  The last assignment wins, so s[3] has the highest priority.
+  always_comb begin
+    F = '0;
+    if (s[0]) F = Neg2;
+    if (s[1]) F = Neg1;
+    if (s[2]) F = Pos1;
+    if (s[3]) F = Pos2;
+  end
+
+endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/srt.sv b/pipelined/src/fpu/srt.sv
index 7e9f9922a..633ac1787 100644
--- a/pipelined/src/fpu/srt.sv
+++ b/pipelined/src/fpu/srt.sv
@@ -34,18 +34,17 @@ module srt(
   input  logic clk,
   input  logic DivStart, 
   input  logic DivBusy, 
-  input  logic [`FMTBITS-1:0] FmtE,
   input  logic [`NE-1:0] Xe, Ye,
   input  logic XZeroE, YZeroE, 
-  input logic [`DIVLEN-1:0] X,
-  input logic [`DIVLEN-1:0] Dpreproc,
-  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
-  input logic NegSticky,
+  input  logic Sqrt,
+  input  logic [`DIVLEN+3:0] X,
+  input  logic [`DIVLEN+3:0] Dpreproc,
+  input  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
+  input  logic NegSticky,
   output logic [`QLEN-1-(`RADIX/4):0] Qm,
   output logic [`DIVLEN+3:0]  NextWSN, NextWCN,
   output logic [`DIVLEN+3:0]  StickyWSA,
   output logic [`DIVLEN+3:0]  FirstWS, FirstWC,
-  output logic  [`NE+1:0] QeM,
   output logic [`XLEN-1:0] Rem
 );
 
@@ -59,13 +58,19 @@ module srt(
   logic [`QLEN-1:0] QM[`DIVCOPIES-1:0];
   logic [`QLEN-1:0] QNext[`DIVCOPIES-1:0];
   logic [`QLEN-1:0] QMNext[`DIVCOPIES-1:0];
+  logic [`DIVLEN+3:0] S[`DIVCOPIES-1:0]; //***change to QLEN???
+  logic [`DIVLEN+3:0] SM[`DIVCOPIES-1:0];
+  logic [`DIVLEN+3:0] SNext[`DIVCOPIES-1:0];
+  logic [`DIVLEN+3:0] SMNext[`DIVCOPIES-1:0];
+  logic [`DIVLEN+3:0] C[`DIVCOPIES-1:0];
  /* verilator lint_on UNOPTFLAT */
   logic [`DIVLEN+3:0]  WSN, WCN;
   logic [`DIVLEN+3:0]  D, DBar, D2, DBar2;
-  logic [`NE+1:0] Qe;
   logic [$clog2(`XLEN+1)-1:0] intExp;
   logic           intSign;
   logic [`QLEN-1:0] QMMux;
+  logic [`DIVLEN+3:0] CMux;
+  logic [`DIVLEN+3:0] SMux;
 
   // Top Muxes and Registers
   // When start is asserted, the inputs are loaded into the divider.
@@ -83,13 +88,13 @@ module srt(
     assign NextWCN = {WCA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0};
   end
 
-  mux2   #(`DIVLEN+4) wsmux(NextWSN, {3'b000, ~XZeroE, X}, DivStart, WSN);
+  mux2   #(`DIVLEN+4) wsmux(NextWSN, X, DivStart, WSN);
   flopen   #(`DIVLEN+4) wsflop(clk, DivStart|DivBusy, WSN, WS[0]);
   mux2   #(`DIVLEN+4) wcmux(NextWCN, {`DIVLEN+4{1'b0}}, DivStart, WCN);
   flopen   #(`DIVLEN+4) wcflop(clk, DivStart|DivBusy, WCN, WC[0]);
-  flopen #(`DIVLEN+4) dflop(clk, DivStart, {4'b0001, Dpreproc}, D);
-  flopen #(`NE+2) expflop(clk, DivStart, Qe, QeM);
-
+  flopen #(`DIVLEN+4) dflop(clk, DivStart, Dpreproc, D);
+  mux2 #(`DIVLEN+4) Cmux({2'b11, C[`DIVCOPIES-1][`DIVLEN+3:2]}, {5'b11111, Sqrt, {(`DIVLEN-2){1'b0}}}, DivStart, CMux);
+  flop #(`DIVLEN+4) cflop(clk, CMux, C[0]);
 
   // Divisor Selections
   // - choose the negitive version of what's being selected
@@ -102,8 +107,9 @@ module srt(
   genvar i;
   generate
     for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : interations
-      divinteration divinteration(.D, .DBar, .D2, .DBar2, 
-      .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), .Q(Q[i]), .QM(QM[i]), .QNext(QNext[i]), .QMNext(QMNext[i]));
+      divinteration divinteration(.D, .DBar, .D2, .DBar2, .Sqrt,
+      .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), .Q(Q[i]), .QM(QM[i]), .QNext(QNext[i]), .QMNext(QMNext[i]),
+      .C(C[i]), .S(S[i]), .SM(SM[i]), .SNext(SNext[i]), .SMNext(SMNext[i]));
       if(i<(`DIVCOPIES-1)) begin 
         if (`RADIX==2)begin 
           assign WS[i+1] = {WSA[i][`DIVLEN+1:0], 1'b0};
@@ -111,9 +117,12 @@ module srt(
         end else begin
           assign WS[i+1] = {WSA[i][`DIVLEN+1:0], 2'b0};
           assign WC[i+1] = {WCA[i][`DIVLEN+1:0], 2'b0};
+          assign  C[i+1] = {2'b11, C[i][`DIVLEN+3:2]};
         end
         assign Q[i+1] = QNext[i];
         assign QM[i+1] = QMNext[i];
+        assign S[i+1] = SNext[i];
+        assign SM[i+1] = SMNext[i];
       end
     end
   endgenerate
@@ -123,16 +132,27 @@ module srt(
   flopenr #(`QLEN) Qreg(clk, DivStart, DivBusy, QNext[`DIVCOPIES-1], Q[0]);
   flopen #(`QLEN) QMreg(clk, DivBusy, QMMux, QM[0]);
 
-  assign Qm = NegSticky ? QM[0][`QLEN-1-(`RADIX/4):0] : Q[0][`QLEN-1-(`RADIX/4):0];
+  flopr #(`DIVLEN+4) SMreg(clk, DivStart, SMNext[`DIVCOPIES-1], SM[0]);
+  mux2 #(`DIVLEN+4) Smux(SNext[`DIVCOPIES-1], {3'b000, Sqrt, {(`DIVLEN){1'b0}}}, DivStart, SMux);
+  flop #(`DIVLEN+4) Sreg(clk, SMux, S[0]);
+
+  always_comb
+    if(Sqrt)
+      if(NegSticky) Qm = SM[0][`QLEN-1-(`RADIX/4):0];
+      else          Qm = S[0][`QLEN-1-(`RADIX/4):0];
+    else  
+      if(NegSticky) Qm = QM[0][`QLEN-1-(`RADIX/4):0];
+      else          Qm = Q[0][`QLEN-1-(`RADIX/4):0];
+
   assign FirstWS = WS[0];
   assign FirstWC = WC[0];
+
   if(`RADIX==2)
     if (`DIVCOPIES == 1)
       assign StickyWSA = {WSA[0][`DIVLEN+2:0], 1'b0};
     else
       assign StickyWSA = {WSA[1][`DIVLEN+2:0], 1'b0};
 
-  expcalc expcalc(.FmtE, .Xe, .Ye, .XZeroE, .XZeroCnt, .YZeroCnt, .Qe);
 
 endmodule
 
@@ -145,8 +165,12 @@ module divinteration (
   input logic [`DIVLEN+3:0] D,
   input logic [`DIVLEN+3:0]  DBar, D2, DBar2,
   input logic [`QLEN-1:0] Q, QM,
+  input logic [`DIVLEN+3:0] S, SM,
   input logic [`DIVLEN+3:0]  WS, WC,
+  input logic [`DIVLEN+3:0] C,
+  input logic Sqrt,
   output logic [`QLEN-1:0] QNext, QMNext, 
+  output logic [`DIVLEN+3:0] SNext, SMNext, 
   output logic [`DIVLEN+3:0]  WSA, WCA
 );
  /* verilator lint_on UNOPTFLAT */
@@ -154,6 +178,8 @@ module divinteration (
   logic [`DIVLEN+3:0]  Dsel;
   logic [3:0]     q;
   logic qp, qz;//, qn;
+  logic [`DIVLEN+3:0] F;
+  logic [`DIVLEN+3:0] AddIn;
 
   // Qmient Selection logic
   // Given partial remainder, select quotient of +1, 0, or -1 (qp, qz, pm)
@@ -166,7 +192,8 @@ module divinteration (
   if(`RADIX == 2) begin : qsel
     qsel2 qsel2(WS[`DIVLEN+3:`DIVLEN], WC[`DIVLEN+3:`DIVLEN], qp, qz);//, qn);
   end else begin
-    qsel4 qsel4(.D, .WS, .WC, .q);
+    qsel4 qsel4(.D, .WS, .WC, .Sqrt, .q);
+    fgen4 fgen4(.s(q), .C, .S, .SM, .F);
   end
 
   if(`RADIX == 2) begin : dsel
@@ -184,16 +211,18 @@ module divinteration (
   end
   // Partial Product Generation
   //  WSA, WCA = WS + WC - qD
+  assign AddIn = Sqrt ? F : Dsel;
   if (`RADIX == 2) begin : csa
-    csa #(`DIVLEN+4) csa(WS, WC, Dsel, qp, WSA, WCA);
+    csa #(`DIVLEN+4) csa(WS, WC, AddIn, qp, WSA, WCA);
   end else begin
-    csa #(`DIVLEN+4) csa(WS, WC, Dsel, |q[3:2], WSA, WCA);
+    csa #(`DIVLEN+4) csa(WS, WC, AddIn, |q[3:2], WSA, WCA);
   end
 
   if (`RADIX == 2) begin : otfc
     otfc2 otfc2(.qp, .qz, .Q, .QM, .QNext, .QMNext);
   end else begin
     otfc4 otfc4(.q, .Q, .QM, .QNext, .QMNext);
+    sotfc4 sotfc4(.s(q), .Sqrt, .C, .S, .SM, .SNext, .SMNext);
   end
 
 endmodule
@@ -220,40 +249,3 @@ module csa #(parameter N=69) (
   assign out2 = {in1[N-2:0] & (in2[N-2:0] | in3[N-2:0]) | 
 		    (in2[N-2:0] & in3[N-2:0]), cin};
 endmodule
-
-module expcalc(
-  input logic  [`FMTBITS-1:0] FmtE,
-  input  logic [`NE-1:0] Xe, Ye,
-  input logic XZeroE, 
-  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
-  output logic  [`NE+1:0] Qe
-  );
-    logic [`NE-2:0] Bias;
-    
-    if (`FPSIZES == 1) begin
-        assign Bias = (`NE-1)'(`BIAS); 
-
-    end else if (`FPSIZES == 2) begin
-        assign Bias = FmtE ? (`NE-1)'(`BIAS) : (`NE-1)'(`BIAS1); 
-
-    end else if (`FPSIZES == 3) begin
-        always_comb
-            case (FmtE)
-                `FMT: Bias  =  (`NE-1)'(`BIAS);
-                `FMT1: Bias = (`NE-1)'(`BIAS1);
-                `FMT2: Bias = (`NE-1)'(`BIAS2);
-                default: Bias = 'x;
-            endcase
-
-    end else if (`FPSIZES == 4) begin        
-        always_comb
-            case (FmtE)
-                2'h3: Bias =  (`NE-1)'(`Q_BIAS);
-                2'h1: Bias =  (`NE-1)'(`D_BIAS);
-                2'h0: Bias =  (`NE-1)'(`S_BIAS);
-                2'h2: Bias =  (`NE-1)'(`H_BIAS);
-            endcase
-    end
-    // correct exponent for denormalized input's normalization shifts
-    assign Qe = ({2'b0, Xe} - {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, XZeroCnt} - {2'b0, Ye} + {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, YZeroCnt} + {3'b0, Bias})&{`NE+2{~XZeroE}};
-    endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/srtpreproc.sv b/pipelined/src/fpu/srtpreproc.sv
index b9fb8bb82..4d2609179 100644
--- a/pipelined/src/fpu/srtpreproc.sv
+++ b/pipelined/src/fpu/srtpreproc.sv
@@ -31,16 +31,25 @@
 `include "wally-config.vh"
 
 module srtpreproc (
+  input  logic clk,
+  input  logic DivStart, 
   input  logic [`NF:0] Xm, Ym,
-  output logic [`DIVLEN-1:0] X,
-  output logic [`DIVLEN-1:0] Dpreproc,
+  input  logic [`NE-1:0] Xe, Ye,
+  input  logic [`FMTBITS-1:0] Fmt,
+  input  logic Sqrt,
+  input logic XZero,
+  output logic  [`NE+1:0] QeM,
+  output logic [`DIVLEN+3:0] X,
+  output logic [`DIVLEN+3:0] Dpreproc,
   output logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
   output logic [`DURLEN-1:0] Dur
 );
   // logic  [`XLEN-1:0] PosA, PosB;
   // logic  [`DIVLEN-1:0] ExtraA, ExtraB, PreprocA, PreprocB, PreprocX, PreprocY;
-  logic  [`DIVLEN-1:0] PreprocA, PreprocX;
-  logic  [`DIVLEN-1:0] PreprocB, PreprocY;
+  logic  [`NF-1:0] PreprocA, PreprocX;
+  logic  [`NF-1:0] PreprocB, PreprocY;
+  logic  [`NF+3:0] SqrtX;
+  logic [`NE+1:0] Qe;
 
   // assign PosA = (Signed & SrcA[`XLEN - 1]) ? -SrcA : SrcA;
   // assign PosB = (Signed & SrcB[`XLEN - 1]) ? -SrcB : SrcB;
@@ -49,23 +58,22 @@ module srtpreproc (
 
   // ***can probably merge X LZC with conversion
   // cout the number of leading zeros
-  lzc #(`NF+1) lzcA (Xm, XZeroCnt);
-  lzc #(`NF+1) lzcB (Ym, YZeroCnt);
+  lzc #(`NF+1) lzcX (Xm, XZeroCnt);
+  lzc #(`NF+1) lzcY (Ym, YZeroCnt);
 
   // assign ExtraA = {PosA, {`DIVLEN-`XLEN{1'b0}}};
   // assign ExtraB = {PosB, {`DIVLEN-`XLEN{1'b0}}};
 
   // assign PreprocA = ExtraA << zeroCntA;
   // assign PreprocB = ExtraB << (zeroCntB + 1);
-  assign PreprocX = {Xm[`NF-1:0]<<XZeroCnt, {`DIVLEN-`NF{1'b0}}};
-  assign PreprocY = {Ym[`NF-1:0]<<YZeroCnt, {`DIVLEN-`NF{1'b0}}};
+  assign PreprocX = Xm[`NF-1:0]<<XZeroCnt;
+  assign PreprocY = Ym[`NF-1:0]<<YZeroCnt;
 
   
-  assign X = PreprocX;
-  assign Dpreproc = PreprocY;
+  assign SqrtX = Xe[0] ? {3'b110, ~XZero, PreprocX} : {2'b11, ~XZero, PreprocX, 1'b0};
+  assign X = Sqrt ? {SqrtX, {`DIVLEN-`NF{1'b0}}} : {3'b000, ~XZero, PreprocX, {`DIVLEN-`NF{1'b0}}};
+  assign Dpreproc = {4'b0001, /*Int ? PreprocB : */PreprocY, {`DIVLEN-`NF{1'b0}}};
   assign Dur = (`DURLEN)'(`FPDUR);
-  // assign intExp = zeroCntB - zeroCntA + 1;
-  // assign intSign = Signed & (SrcA[`XLEN - 1] ^ SrcB[`XLEN - 1]);
 
   //           radix 2     radix 4
   // 1 copies  DIVLEN+2    DIVLEN+2/2
@@ -76,6 +84,52 @@ module srtpreproc (
   // DIVRESLEN = DIVLEN or DIVLEN+2
   // r = 1 or 2
   // DIVRESLEN/(r*`DIVCOPIES)
+  flopen #(`NE+2) expflop(clk, DivStart, Qe, QeM);
+  expcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZero, .XZeroCnt, .YZeroCnt, .Qe);
 
 
+endmodule
+
+module expcalc(
+  input logic  [`FMTBITS-1:0] Fmt,   // format select for Bias
+  input  logic [`NE-1:0] Xe, Ye,     // exponents of X and Y
+  input logic Sqrt,                  // 1: square-root exponent, 0: divide exponent
+  input logic XZero,                 // zeroes the divide exponent
+  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt, // normalization shift counts
+  output logic  [`NE+1:0] Qe         // selected result exponent
+  );
+  logic [`NE-2:0] Bias;
+  logic [`NE-1:0] SExp, SXExp;
+  logic [`NE+1:0] DExp;
+  // Pick the bias of the active format; the choice depends on `FPSIZES.
+  if (`FPSIZES == 1) begin
+      assign Bias = (`NE-1)'(`BIAS);
+  end else if (`FPSIZES == 2) begin
+      assign Bias = Fmt ? (`NE-1)'(`BIAS) : (`NE-1)'(`BIAS1);
+  end else if (`FPSIZES == 3) begin
+      always_comb
+          case (Fmt)
+              `FMT: Bias  =  (`NE-1)'(`BIAS);
+              `FMT1: Bias = (`NE-1)'(`BIAS1);
+              `FMT2: Bias = (`NE-1)'(`BIAS2);
+              default: Bias = 'x;
+          endcase
+  end else if (`FPSIZES == 4) begin
+    always_comb
+        case (Fmt)
+            2'h3: Bias =  (`NE-1)'(`Q_BIAS);
+            2'h1: Bias =  (`NE-1)'(`D_BIAS);
+            2'h0: Bias =  (`NE-1)'(`S_BIAS);
+            2'h2: Bias =  (`NE-1)'(`H_BIAS);
+        endcase
+  end
+
+  // Square root: SExp = floor((Xe - `BIAS) / 2) + Bias.  SXExp is a signed
+  // value (negative when Xe < `BIAS), so its top bit is replicated when
+  // halving; filling with 0 turned negative exponents into large positive ones.
+  assign SXExp = Xe - (`NE)'(`BIAS);
+  assign SExp  = {SXExp[`NE-1], SXExp[`NE-1:1]} + {1'b0, Bias};
+  // Divide: Xe - XZeroCnt - Ye + YZeroCnt + Bias (the counts undo the normalization shifts of denormalized inputs); 0 when XZero.
+  assign DExp = ({2'b0, Xe} - {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, XZeroCnt} - {2'b0, Ye} + {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, YZeroCnt} + {3'b0, Bias})&{`NE+2{~XZero}};
+  assign Qe = Sqrt ? {2'b0, SExp} : DExp;
+endmodule
\ No newline at end of file
diff --git a/pipelined/testbench/testbench-fp.sv b/pipelined/testbench/testbench-fp.sv
index 19b637478..9be68f507 100644
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@@ -697,7 +697,7 @@ module testbenchfp;
   fcmp fcmp   (.Fmt(ModFmt), .OpCtrl(OpCtrlVal), .Xs, .Ys, .Xe, .Ye, 
               .Xm, .Ym, .XZero, .YZero, .CmpIntRes(CmpRes),
               .XNaN, .YNaN, .XSNaN, .YSNaN, .X, .Y, .CmpNV(CmpFlg[4]), .CmpFpRes(FpCmpRes));
-  divsqrt divsqrt(.clk, .reset, .FmtE(ModFmt), .XmE(Xm), .YmE(Ym), .XeE(Xe), .YeE(Ye), 
+  divsqrt divsqrt(.clk, .reset, .FmtE(ModFmt), .XmE(Xm), .YmE(Ym), .XeE(Xe), .YeE(Ye), .SqrtE(1'b0), .SqrtM(1'b0),
                   .XInfE(XInf), .YInfE(YInf), .XZeroE(XZero), .YZeroE(YZero), .XNaNE(XNaN), .YNaNE(YNaN), .DivStartE(DivStart), 
                   .StallE(1'b0), .StallM(1'b0), .DivSM(DivSticky), .DivBusy, .QeM(DivCalcExp),
                   .EarlyTermShiftM(EarlyTermShift), .QmM(Quot), .DivDone);