diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index e5adea1f9..1f05a4f13 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -101,17 +101,18 @@
 `define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+8) ? (`DIVRESLEN+`NF) : (3*`NF+6))
 
 // division constants
-`define RADIX 32'h4
-`define DIVCOPIES 32'h4
+`define RADIX 32'h2
+`define DIVCOPIES 32'h1
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF + 3))
-`define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input
+// `define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input
+`define DIVN (`NF < `XLEN ? `XLEN : `NF+3) // length of input
 `define EXTRAFRACBITS ((`NF<(`XLEN)) ? (`XLEN - `NF) : 3)
 `define EXTRAINTBITS ((`NF<(`XLEN)) ? 0 : (`NF - `XLEN + 3))
 `define DIVRESLEN ((`NF>`XLEN) ? `NF+4 : `XLEN)
 `define LOGR ((`RADIX==2) ? 32'h1 : 32'h2)
 // FPDUR = ceil(DIVRESLEN/(LOGR*DIVCOPIES))
 // one interation is required for the integer bit for minimally redundent radix-4
-`define FPDUR ((`DIVLEN+(`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES)+(`RADIX/4))
+`define FPDUR ((`DIVN+2+(`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES)+(`RADIX/4))
 `define DURLEN ($clog2(`FPDUR+1))
 `define QLEN (`FPDUR*`LOGR*`DIVCOPIES)
 `define DIVb (`FPDUR*`LOGR*`DIVCOPIES)-1
diff --git a/pipelined/regression/wave-fpu.do b/pipelined/regression/wave-fpu.do
index e16d7b0b5..f06fb6d63 100644
--- a/pipelined/regression/wave-fpu.do
+++ b/pipelined/regression/wave-fpu.do
@@ -30,9 +30,9 @@ add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/QNext
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/QMNext
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/*
 add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/*
-# add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/otfc/otfc2/*
-# add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/qsel/qsel2/*
-add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/genblk1/qsel4/*
+add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/otfc/otfc2/*
+add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/qsel/qsel2/*
+# add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/genblk1/qsel4/*
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtpreproc/*
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtpreproc/expcalc/*
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtfsm/*
diff --git a/pipelined/src/fpu/divsqrt.sv b/pipelined/src/fpu/divsqrt.sv
index 70610bcd3..a1b19394a 100644
--- a/pipelined/src/fpu/divsqrt.sv
+++ b/pipelined/src/fpu/divsqrt.sv
@@ -65,6 +65,6 @@ module divsqrt(
 
   srtfsm srtfsm(.reset, .XsE, .SqrtE, .NextWSN, .NextWCN, .WS, .WC, .Dur, .DivBusy, .clk, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, .DivSE(DivSM), .XNaNE, .YNaNE,
                .StickyWSA, .XInfE, .YInfE, .NegSticky(NegSticky), .EarlyTermShiftE(EarlyTermShiftM));
-  srt srt(.clk, .Sqrt(SqrtM), .X,.Dpreproc, .NegSticky, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
+  srt srt(.clk, .SqrtE, .SqrtM, .X,.Dpreproc, .NegSticky, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
                 .StickyWSA, .DivBusy, .Qm(QmM));
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/flags.sv b/pipelined/src/fpu/flags.sv
index 403b65fe4..67fdb4935 100644
--- a/pipelined/src/fpu/flags.sv
+++ b/pipelined/src/fpu/flags.sv
@@ -157,7 +157,7 @@ module flags(
     //                                                                                                     or when the positive res rounds up out of range
     assign SigNaN = (XSNaN&~(IntToFp&CvtOp)) | (YSNaN&~CvtOp) | (ZSNaN&FmaOp);
     assign FmaInvalid = ((XInf | YInf) & ZInf & (FmaPs ^ FmaAs) & ~NaNIn) | (XZero & YInf) | (YZero & XInf);
-    assign DivInvalid = ((XInf & YInf) | (XZero & YZero))&~Sqrt | (Xs&Sqrt);
+    assign DivInvalid = ((XInf & YInf) | (XZero & YZero))&~Sqrt | (Xs&Sqrt&~NaNIn&~XZero);
 
     assign Invalid = SigNaN | (FmaInvalid&FmaOp) | (DivInvalid&DivOp);
 
diff --git a/pipelined/src/fpu/otfc.sv b/pipelined/src/fpu/otfc.sv
index 71320fedf..b2d1310ff 100644
--- a/pipelined/src/fpu/otfc.sv
+++ b/pipelined/src/fpu/otfc.sv
@@ -147,9 +147,9 @@ endmodule
 module sotfc4(
   input  logic [3:0]   s,
   input  logic         Sqrt,
-  input  logic [`DIVLEN+3:0] S, SM,
-  input  logic [`DIVLEN+3:0] C,
-  output logic [`DIVLEN+3:0] SNext, SMNext
+  input  logic [`DIVb+3:0] S, SM,
+  input  logic [`DIVb+3:0] C,
+  output logic [`DIVb+3:0] SNext, SMNext
 );
   //  The on-the-fly converter transfers the square root 
   //  bits to the quotient as they come.
diff --git a/pipelined/src/fpu/qsel.sv b/pipelined/src/fpu/qsel.sv
index cb8d3202b..e9350da26 100644
--- a/pipelined/src/fpu/qsel.sv
+++ b/pipelined/src/fpu/qsel.sv
@@ -31,11 +31,11 @@
 `include "wally-config.vh"
 
 module qsel2 ( // *** eventually just change to 4 bits
-  input  logic [`DIVLEN+3:`DIVLEN] ps, pc, 
+  input  logic [3:0] ps, pc, 
   output logic         qp, qz//, qn
 );
  
-  logic [`DIVLEN+3:`DIVLEN]  p, g;
+  logic [3:0]  p, g;
   logic          magnitude, sign, cout;
 
   // The quotient selection logic is presented for simplicity, not
@@ -46,9 +46,9 @@ module qsel2 ( // *** eventually just change to 4 bits
   assign p = ps ^ pc;
   assign g = ps & pc;
 
-  assign magnitude = ~(&p[`DIVLEN+2:`DIVLEN]);
-  assign cout = g[`DIVLEN+2] | (p[`DIVLEN+2] & (g[`DIVLEN+1] | p[`DIVLEN+1] & g[`DIVLEN]));
-  assign sign = p[`DIVLEN+3] ^ cout;
+  assign magnitude = ~(&p[2:0]);
+  assign cout = g[2] | (p[2] & (g[1] | p[1] & g[0]));
+  assign sign = p[3] ^ cout;
 /*  assign #1 magnitude = ~((ps[54]^pc[54]) & (ps[53]^pc[53]) & 
 			  (ps[52]^pc[52]));
   assign #1 sign = (ps[55]^pc[55])^
@@ -80,7 +80,7 @@ module fgen2 (
 
   // Generate for both positive and negative bits
   assign FP = ~(SExt << 1) & CExt;
-  assign FN = (SMExt << 1) | (CExt & (~CExt << 2));
+  assign FN = (SMExt << 1) | (CExt & ~(CExt << 2));
   assign FZ = '0;
 
   // Choose which adder input will be used
@@ -172,10 +172,10 @@ endmodule
 ////////////////////////////////////
 module fgen4 (
   input  logic [3:0] s,
-  input  logic [`DIVLEN+3:0] C, S, SM,
-  output logic [`DIVLEN+3:0] F
+  input  logic [`DIVb+3:0] C, S, SM,
+  output logic [`DIVb+3:0] F
 );
-  logic [`DIVLEN+3:0] F2, F1, F0, FN1, FN2;
+  logic [`DIVb+3:0] F2, F1, F0, FN1, FN2;
   
   // Generate for both positive and negative bits
   assign F2  = (~S << 2) & (C << 2);
diff --git a/pipelined/src/fpu/srt.sv b/pipelined/src/fpu/srt.sv
index db2abf25a..0e244a229 100644
--- a/pipelined/src/fpu/srt.sv
+++ b/pipelined/src/fpu/srt.sv
@@ -36,7 +36,8 @@ module srt(
   input  logic DivBusy, 
   input  logic [`NE-1:0] Xe, Ye,
   input  logic XZeroE, YZeroE, 
-  input  logic Sqrt,
+  input  logic SqrtE,
+  input  logic SqrtM,
   input  logic [`DIVb:0] X,
   input  logic [`DIVN-2:0] Dpreproc,
   input  logic NegSticky,
@@ -95,21 +96,14 @@ module srt(
   end
 
 
-//   mux2   #(`DIVb+4) wsmux(NextWSN, {{3{Sqrt}}, X}, DivStart, WSN); //*** modified for sqrt which doesnt work
-//   flopen   #(`DIVb+4) wsflop(clk, DivStart|DivBusy, WSN, WS[0]);
-//   mux2   #(`DIVb+4) wcmux(NextWCN, '0, DivStart, WCN);
-//   flopen   #(`DIVb+4) wcflop(clk, DivStart|DivBusy, WCN, WC[0]);
-//   flopen #(`DIVN-1) dflop(clk, DivStart, Dpreproc, D);
-//   mux2 #(`DIVb) Cmux(NextC, {Sqrt, {(`DIVb-1){1'b0}}}, DivStart, CMux);
-//   flop #(`DIVb) cflop(clk, CMux, C[0]);
-
-  mux2   #(`DIVb+4) wsmux(NextWSN, {3'b0, X}, DivStart, WSN);
+  // mux2   #(`DIVb+4) wsmux(NextWSN, {3'b0, X}, DivStart, WSN);
+  mux2   #(`DIVb+4) wsmux(NextWSN, {{3{SqrtE&~XZeroE}}, X}, DivStart, WSN);
   flopen   #(`DIVb+4) wsflop(clk, DivStart|DivBusy, WSN, WS[0]);
   mux2   #(`DIVb+4) wcmux(NextWCN, '0, DivStart, WCN);
   flopen   #(`DIVb+4) wcflop(clk, DivStart|DivBusy, WCN, WC[0]);
   flopen #(`DIVN-1) dflop(clk, DivStart, Dpreproc, D);
-  mux2 #(`DIVb) Cmux({2'b11, C[`DIVCOPIES-1][`DIVb-1:2]}, {Sqrt, {(`DIVb-1){1'b0}}}, DivStart, CMux);
-  flop #(`DIVb) cflop(clk, CMux, C[0]);
+  mux2 #(`DIVb) Cmux(NextC, {1'b1, {(`DIVb-1){1'b0}}}, DivStart, CMux);
+  flopen #(`DIVb) cflop(clk, DivStart|DivBusy, CMux, C[0]);
 
   // Divisor Selections
   //  - choose the negitive version of what's being selected
@@ -123,7 +117,7 @@ module srt(
   genvar i;
   generate
     for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : interations
-      divinteration divinteration(.D, .DBar, .D2, .DBar2, .Sqrt,
+      divinteration divinteration(.D, .DBar, .D2, .DBar2, .SqrtM,
       .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), .Q(Q[i]), .QM(QM[i]), .QNext(QNext[i]), .QMNext(QMNext[i]),
       .C(C[i]), .S(S[i]), .SM(SM[i]), .SNext(SNext[i]), .SMNext(SMNext[i]));
       if(i<(`DIVCOPIES-1)) begin 
@@ -151,11 +145,11 @@ module srt(
   flopen #(`DIVb+1) QMreg(clk, DivBusy, QMMux, QM[0]);
 
   flopr #(`DIVb+1) SMreg(clk, DivStart, SMNext[`DIVCOPIES-1], SM[0]);
-  mux2 #(`DIVb+1) Smux(SNext[`DIVCOPIES-1], {Sqrt, {(`DIVb){1'b0}}}, DivStart, SMux);
+  mux2 #(`DIVb+1) Smux(SNext[`DIVCOPIES-1], {SqrtM, {(`DIVb){1'b0}}}, DivStart, SMux);
   flop #(`DIVb+1) Sreg(clk, SMux, S[0]);
  // division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
   always_comb
-    if(Sqrt) // sqrt ouputs in the range (1, .5]
+    if(SqrtM) // sqrt outputs in the range (1, .5]
       if(NegSticky) Qm = {SM[0][`DIVb-1-(`RADIX/4):0], 1'b0};
       else          Qm = {S[0][`DIVb-1-(`RADIX/4):0], 1'b0};
     else  
@@ -186,7 +180,7 @@ module divinteration (
   input logic [`DIVb:0] S, SM,
   input logic [`DIVb+3:0]  WS, WC,
   input logic [`DIVb-1:0] C,
-  input logic Sqrt,
+  input logic SqrtM,
   output logic [`DIVb:0] QNext, QMNext, 
   output logic [`DIVb:0] SNext, SMNext, 
   output logic [`DIVb+3:0]  WSA, WCA
@@ -211,7 +205,7 @@ module divinteration (
     qsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], qp, qz);
     fgen2 fgen2(.sp(qp), .sz(qz), .C, .S, .SM, .F);
   end else begin
-    qsel4 qsel4(.D, .WS, .WC, .Sqrt, .q);
+    qsel4 qsel4(.D, .WS, .WC, .Sqrt(SqrtM), .q);
     // fgen4 fgen4(.s(q), .C, .S, .SM, .F);
   end
 
@@ -230,11 +224,11 @@ module divinteration (
   end
   // Partial Product Generation
   //  WSA, WCA = WS + WC - qD
-  assign AddIn = Sqrt ? F : Dsel;
+  assign AddIn = SqrtM ? F : Dsel;
   if (`RADIX == 2) begin : csa
-    csa #(`DIVb+4) csa(WS, WC, AddIn, qp&~Sqrt, WSA, WCA);
+    csa #(`DIVb+4) csa(WS, WC, AddIn, qp&~SqrtM, WSA, WCA);
   end else begin
-    csa #(`DIVb+4) csa(WS, WC, AddIn, |q[3:2]&~Sqrt, WSA, WCA);
+    csa #(`DIVb+4) csa(WS, WC, AddIn, |q[3:2]&~SqrtM, WSA, WCA);
   end
 
   if (`RADIX == 2) begin : otfc
@@ -242,7 +236,7 @@ module divinteration (
     sotfc2 sotfc2(.sp(qp), .sz(qz), .C, .S, .SM, .SNext, .SMNext);
   end else begin
     otfc4 otfc4(.q, .Q, .QM, .QNext, .QMNext);
-    // sotfc4 sotfc4(.s(q), .Sqrt, .C, .S, .SM, .SNext, .SMNext);
+    // sotfc4 sotfc4(.s(q), .Sqrt(SqrtM), .C, .S, .SM, .SNext, .SMNext);
   end
 
 endmodule
diff --git a/pipelined/testbench/testbench-fp.sv b/pipelined/testbench/testbench-fp.sv
index 17383d1f7..88190aad2 100644
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@@ -85,10 +85,6 @@ module testbenchfp;
   logic [`DURLEN-1:0] EarlyTermShift;
   logic DivStart, DivBusy;
   logic reset = 1'b0;
-  logic [`DIVLEN-1:0]    DivX;
-  logic [`DIVLEN-1:0]  Dpreproc;
-  logic [`DIVLEN+3:0]  NextWSN, WS;
-  logic [`DIVLEN+3:0]  NextWCN, WC;
   logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
   logic [`DURLEN-1:0] Dur;