From ee7932c804d05398d87785b15bfee62c6c273520 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Fri, 22 Jul 2022 22:02:04 +0000
Subject: [PATCH] divider sizes reworked to match book

---
 addins/riscv-arch-test                  |   2 +-
 pipelined/config/rv64fp/wally-config.vh |   2 +-
 pipelined/config/shared/wally-shared.vh |   4 +-
 pipelined/regression/wave-fpu.do        |   3 +-
 pipelined/src/fpu/divshiftcalc.sv       |   7 +-
 pipelined/src/fpu/divsqrt.sv            |  22 +--
 pipelined/src/fpu/fctrl.sv              |   6 +-
 pipelined/src/fpu/flags.sv              |   6 +-
 pipelined/src/fpu/fpu.sv                |   4 +-
 pipelined/src/fpu/otfc.sv               |  20 +--
 pipelined/src/fpu/postprocess.sv        |   6 +-
 pipelined/src/fpu/qsel.sv               |  12 +-
 pipelined/src/fpu/roundsign.sv          |   3 +-
 pipelined/src/fpu/srt.sv                | 171 +++++++++++-----------
 pipelined/src/fpu/srtfsm.sv             |  14 +-
 pipelined/src/fpu/srtpreproc.sv         |  23 +--
 pipelined/srt/inttestgen.c              |   2 +-
 pipelined/testbench/testbench-fp.sv     | 186 ++++++++++++++----------
 18 files changed, 269 insertions(+), 224 deletions(-)

diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
index 307c77b2..be67c99b 160000
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@@ -1 +1 @@
-Subproject commit 307c77b26e070ae85ffea665ad9b642b40e33c86
+Subproject commit be67c99bd461742aa1c100bcc0732657faae2230
diff --git a/pipelined/config/rv64fp/wally-config.vh b/pipelined/config/rv64fp/wally-config.vh
index 8f13b2e3..cc8d1b2b 100644
--- a/pipelined/config/rv64fp/wally-config.vh
+++ b/pipelined/config/rv64fp/wally-config.vh
@@ -32,7 +32,7 @@
 `define DESIGN_COMPILER 0
 
 // RV32 or RV64: XLEN = 32 or 64
-`define XLEN 64
+`define XLEN 32
 
 // IEEE 754 compliance
 `define IEEE754 0
diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index b2abdff7..ea39ca35 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -102,8 +102,9 @@
 
 // division constants
 `define RADIX 32'h4
-`define DIVCOPIES 32'h1
+`define DIVCOPIES 32'h4
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF + 3))
+`define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input
 `define EXTRAFRACBITS ((`NF<(`XLEN)) ? (`XLEN - `NF) : 3)
 `define EXTRAINTBITS ((`NF<(`XLEN)) ? 0 : (`NF - `XLEN + 3))
 `define DIVRESLEN ((`NF>`XLEN) ? `NF+4 : `XLEN)
@@ -113,6 +114,7 @@
 `define FPDUR ((`DIVLEN+(`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES)+(`RADIX/4))
 `define DURLEN ($clog2(`FPDUR+1))
 `define QLEN (`FPDUR*`LOGR*`DIVCOPIES)
+`define DIVb (`FPDUR*`LOGR*`DIVCOPIES)-1
 
 
 `define USE_SRAM 0
diff --git a/pipelined/regression/wave-fpu.do b/pipelined/regression/wave-fpu.do
index b71207e0..e16d7b0b 100644
--- a/pipelined/regression/wave-fpu.do
+++ b/pipelined/regression/wave-fpu.do
@@ -32,8 +32,9 @@ add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/*
 add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/*
 # add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/otfc/otfc2/*
 # add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/qsel/qsel2/*
+add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/genblk1/qsel4/*
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtpreproc/*
-# add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/expcalc/*
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtpreproc/expcalc/*
 add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtfsm/*
 add wave -group {Testbench} -noupdate /testbenchfp/*
 add wave -group {Testbench} -noupdate /testbenchfp/readvectors/*
diff --git a/pipelined/src/fpu/divshiftcalc.sv b/pipelined/src/fpu/divshiftcalc.sv
index 3fbc9419..8095b517 100644
--- a/pipelined/src/fpu/divshiftcalc.sv
+++ b/pipelined/src/fpu/divshiftcalc.sv
@@ -1,8 +1,9 @@
 `include "wally-config.vh"
 
 module divshiftcalc(
-    input logic  [`QLEN-1-(`RADIX/4):0] DivQm,
+    input logic  [`DIVb-(`RADIX/4):0] DivQm,
     input logic  [`FMTBITS-1:0] Fmt,
+    input logic Sqrt,
     input logic [`DURLEN-1:0] DivEarlyTermShift,
     input logic [`NE+1:0] DivQe,
     output logic [$clog2(`NORMSHIFTSZ)-1:0] DivShiftAmt,
@@ -34,8 +35,8 @@ module divshiftcalc(
     assign NormShift = (`NE+2)'(`NF);
     // if the shift amount is negitive then dont shift (keep sticky bit)
     // need to multiply the early termination shift by LOGR*DIVCOPIES =  left shift of log2(LOGR*DIVCOPIES)
-    assign DivShiftAmt = (DivResDenorm ?  DivDenormShift[$clog2(`NORMSHIFTSZ)-1:0]&{$clog2(`NORMSHIFTSZ){~DivDenormShift[`NE+1]}} : NormShift[$clog2(`NORMSHIFTSZ)-1:0])+{{$clog2(`NORMSHIFTSZ)-`DURLEN-$clog2(`LOGR*`DIVCOPIES){1'b0}}, DivEarlyTermShift&{`DURLEN{~DivDenormShift[`NE+1]}}, {$clog2(`LOGR*`DIVCOPIES){1'b0}}};
+    assign DivShiftAmt = (DivResDenorm ?  DivDenormShift[$clog2(`NORMSHIFTSZ)-1:0]&{$clog2(`NORMSHIFTSZ){~DivDenormShift[`NE+1]}} : NormShift[$clog2(`NORMSHIFTSZ)-1:0])+{{$clog2(`NORMSHIFTSZ)-`DURLEN-$clog2(`LOGR*`DIVCOPIES){1'b0}}, DivEarlyTermShift&{`DURLEN{~(DivDenormShift[`NE+1]|Sqrt)}}, {$clog2(`LOGR*`DIVCOPIES){1'b0}}};
 
-    assign DivShiftIn = {{`NF{1'b0}}, DivQm, {`NORMSHIFTSZ-`QLEN+(`RADIX/4)-`NF{1'b0}}};
+    assign DivShiftIn = {{`NF{1'b0}}, DivQm, {`NORMSHIFTSZ-`DIVb+1+(`RADIX/4)-`NF{1'b0}}};
 
 endmodule
diff --git a/pipelined/src/fpu/divsqrt.sv b/pipelined/src/fpu/divsqrt.sv
index 7ba44a95..70610bcd 100644
--- a/pipelined/src/fpu/divsqrt.sv
+++ b/pipelined/src/fpu/divsqrt.sv
@@ -34,6 +34,7 @@ module divsqrt(
   input  logic clk, 
   input  logic reset, 
   input  logic [`FMTBITS-1:0] FmtE,
+  input  logic XsE,
   input  logic [`NF:0] XmE, YmE,
   input  logic [`NE-1:0] XeE, YeE,
   input  logic XInfE, YInfE, 
@@ -48,23 +49,22 @@ module divsqrt(
   output logic DivDone,
   output logic [`NE+1:0] QeM,
   output logic [`DURLEN-1:0] EarlyTermShiftM,
-  output logic [`QLEN-1-(`RADIX/4):0] QmM
+  output logic [`DIVb-(`RADIX/4):0] QmM
 //   output logic [`XLEN-1:0] RemM,
 );
 
-  logic [`DIVLEN+3:0]  NextWSN, NextWCN;
-  logic [`DIVLEN+3:0]  WS, WC;
-  logic [`DIVLEN+3:0] StickyWSA;
-  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
-  logic [`DIVLEN+3:0] X;
-  logic [`DIVLEN+3:0] Dpreproc;
+  logic [`DIVb+3:0]  NextWSN, NextWCN;
+  logic [`DIVb+3:0]  WS, WC;
+  logic [`DIVb+3:0] StickyWSA;
+  logic [`DIVb:0] X;
+  logic [`DIVN-2:0] Dpreproc;
   logic [`DURLEN-1:0] Dur;
   logic NegSticky;
 
-  srtpreproc srtpreproc(.clk, .DivStart(DivStartE), .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), .Sqrt(SqrtE), .Dur, .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, .XZeroCnt, .YZeroCnt);
+  srtpreproc srtpreproc(.clk, .DivStart(DivStartE), .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), .Sqrt(SqrtE), .Dur, .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc);
 
-  srtfsm srtfsm(.reset, .NextWSN, .NextWCN, .WS, .WC, .Dur, .DivBusy, .clk, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, .DivSE(DivSM), .XNaNE, .YNaNE,
+  srtfsm srtfsm(.reset, .XsE, .SqrtE, .NextWSN, .NextWCN, .WS, .WC, .Dur, .DivBusy, .clk, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, .DivSE(DivSM), .XNaNE, .YNaNE,
                .StickyWSA, .XInfE, .YInfE, .NegSticky(NegSticky), .EarlyTermShiftE(EarlyTermShiftM));
-  srt srt(.clk, .Sqrt(SqrtM), .X,.Dpreproc, .NegSticky, .XZeroCnt, .YZeroCnt, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
-                .StickyWSA, .DivBusy, .Qm(QmM), .Rem());
+  srt srt(.clk, .Sqrt(SqrtM), .X,.Dpreproc, .NegSticky, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
+                .StickyWSA, .DivBusy, .Qm(QmM));
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/fctrl.sv b/pipelined/src/fpu/fctrl.sv
index 20e4a009..934aba2c 100755
--- a/pipelined/src/fpu/fctrl.sv
+++ b/pipelined/src/fpu/fctrl.sv
@@ -178,14 +178,14 @@ module fctrl (
 
 // enables:
 //    X - all except int->fp, store, load, mv int->fp
-//    Y - all except cvt, mv, load, class
+//    Y - all except cvt, mv, load, class, sqrt
 //    Z - fma ops only
 //                  load/store                        mv int->fp                      cvt int->fp
     assign XEnE = ~(((FResSelE==2'b10)&~FWriteIntE)|((FResSelE==2'b11)&FRegWriteE)|((FResSelE==2'b01)&(PostProcSelE==2'b00)&OpCtrlE[2]));
 //                  load/class                                    mv               cvt
-    assign YEnE = ~(((FResSelE==2'b10)&(FWriteIntE|FRegWriteE))|(FResSelE==2'b11)|((FResSelE==2'b01)&(PostProcSelE==2'b00)));    
+    assign YEnE = ~(((FResSelE==2'b10)&(FWriteIntE|FRegWriteE))|(FResSelE==2'b11)|((FResSelE==2'b01)&((PostProcSelE==2'b00)|((PostProcSelE==2'b01)&OpCtrlE[0]))));    
     assign ZEnE = (PostProcSelE==2'b10)&(FResSelE==2'b01)&(~OpCtrlE[2]|OpCtrlE[1]);
-    assign YEnForwardE = ~(((FResSelE==2'b10)&(FWriteIntE|FRegWriteE))|(FResSelE==2'b11)|((FResSelE==2'b01)&(PostProcSelE==2'b00)));    
+    assign YEnForwardE = ~(((FResSelE==2'b10)&(FWriteIntE|FRegWriteE))|(FResSelE==2'b11)|((FResSelE==2'b01)&((PostProcSelE==2'b00)|((PostProcSelE==2'b01)&OpCtrlE[0]))));    
     assign ZEnForwardE = (PostProcSelE==2'b10)&(FResSelE==2'b01)&~OpCtrlE[2];
 
 //  Final Res Sel:
diff --git a/pipelined/src/fpu/flags.sv b/pipelined/src/fpu/flags.sv
index c169ab2f..403b65fe 100644
--- a/pipelined/src/fpu/flags.sv
+++ b/pipelined/src/fpu/flags.sv
@@ -126,11 +126,11 @@ module flags(
     //                  |                    |                    |                                      |                     and if the result is not exact
     //                  |                    |                    |                                      |                     |               and if the input isnt infinity or NaN
     //                  |                    |                    |                                      |                     |               |
-    assign Underflow = ((FullRe[`NE+1] | (FullRe == 0) | ((FullRe == 1) & (Me == 0) & ~(UfPlus1&UfL)))&(R|S))&~(InfIn|NaNIn|DivByZero);
+    assign Underflow = ((FullRe[`NE+1] | (FullRe == 0) | ((FullRe == 1) & (Me == 0) & ~(UfPlus1&UfL)))&(R|S))&~(InfIn|NaNIn|DivByZero|Invalid);
 
     // Set Inexact flag if the res is diffrent from what would be outputed given infinite precision
     //      - Don't set the underflow flag if an underflowed res isn't outputed
-    assign FpInexact = (S|Overflow|R)&~(InfIn|NaNIn|DivByZero);
+    assign FpInexact = (S|Overflow|R)&~(InfIn|NaNIn|DivByZero|Invalid);
 
     //                  if the res is too small to be represented and not 0
     //                  |                                     and if the res is not invalid (outside the integer bounds)
@@ -163,7 +163,7 @@ module flags(
 
     // if dividing by zero and not 0/0
     //  - don't set flag if an input is NaN or Inf(IEEE says has to be a finite numerator)
-    assign DivByZero = YZero&DivOp&~(XZero|NaNIn|InfIn);  
+    assign DivByZero = YZero&DivOp&~Sqrt&~(XZero|NaNIn|InfIn);  
 
     // Combine flags
     //      - to integer results do not set the underflow or overflow flags
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index 3e214b0f..d98079b2 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -125,7 +125,7 @@ module fpu (
    logic [`CVTLEN-1:0]     CvtLzcInE, CvtLzcInM;      // input to the Leading Zero Counter (priority encoder)
    
    //divide signals
-   logic [`QLEN-1-(`RADIX/4):0] QmM;
+   logic [`DIVb-(`RADIX/4):0] QmM;
    logic [`NE+1:0]      QeE, QeM; 
    logic                DivSE, DivSM;
    logic                DivDoneM;
@@ -260,7 +260,7 @@ module fpu (
    //    - fsqrt
    // *** add other opperations
    divsqrt divsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
-                  .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .DivStartE(DivStartE), 
+                  .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .DivStartE(DivStartE), .XsE,
                   .StallE, .StallM, .DivSM, .DivBusy(FDivBusyE), .QeM, //***change divbusyE to M signal
                   .EarlyTermShiftM, .QmM, .DivDone(DivDoneM));
    // compare
diff --git a/pipelined/src/fpu/otfc.sv b/pipelined/src/fpu/otfc.sv
index 7ecb823e..1e794391 100644
--- a/pipelined/src/fpu/otfc.sv
+++ b/pipelined/src/fpu/otfc.sv
@@ -32,16 +32,16 @@
 
 module otfc2 (
   input  logic         qp, qz,
-  input  logic [`QLEN-1:0] Q, QM,
-  output logic [`QLEN-1:0] QNext, QMNext
+  input  logic [`DIVb:0] Q, QM,
+  output logic [`DIVb:0] QNext, QMNext
 );
   //  The on-the-fly converter transfers the quotient 
   //  bits to the quotient as they come.
   //  Use this otfc for division only.
-  logic [`QLEN-2:0] QR, QMR;
+  logic [`DIVb-1:0] QR, QMR;
 
-  assign QR  = Q[`QLEN-2:0];
-  assign QMR = QM[`QLEN-2:0];     // Shifted Q and QM
+  assign QR  = Q[`DIVb-1:0];
+  assign QMR = QM[`DIVb-1:0];     // Shifted Q and QM
 
   always_comb begin
     if (qp) begin
@@ -96,8 +96,8 @@ endmodule
 
 module otfc4 (
   input  logic [3:0]   q,
-  input  logic [`QLEN-1:0] Q, QM,
-  output logic [`QLEN-1:0] QNext, QMNext
+  input  logic [`DIVb:0] Q, QM,
+  output logic [`DIVb:0] QNext, QMNext
 );
 
   //  The on-the-fly converter transfers the quotient 
@@ -113,7 +113,7 @@ module otfc4 (
   //  QR and QMR are the shifted versions of Q and QM.
   //  They are treated as [N-1:r] size signals, and 
   //  discard the r most significant bits of Q and QM. 
-  logic [`QLEN-3:0] QR, QMR;
+  logic [`DIVb-2:0] QR, QMR;
 
   // shift Q (quotent) and QM (quotent-1)
 		// if 	q = 2  	    Q = {Q, 10} 	QM = {Q, 01}		
@@ -122,8 +122,8 @@ module otfc4 (
 		// else if 	q = -1	Q = {QM, 11} 	QM = {QM, 10}
 		// else if 	q = -2	Q = {QM, 10} 	QM = {QM, 01}
 
-  assign QR  = Q[`QLEN-3:0];
-  assign QMR = QM[`QLEN-3:0];     // Shifted Q and QM
+  assign QR  = Q[`DIVb-2:0];
+  assign QMR = QM[`DIVb-2:0];     // Shifted Q and QM
   always_comb begin
     if (q[3]) begin // +2
       QNext  = {QR,  2'b10};
diff --git a/pipelined/src/fpu/postprocess.sv b/pipelined/src/fpu/postprocess.sv
index 4d9dc310..003c23d7 100644
--- a/pipelined/src/fpu/postprocess.sv
+++ b/pipelined/src/fpu/postprocess.sv
@@ -60,7 +60,7 @@ module postprocess (
     input logic                             DivS,
     input logic                             DivDone,
     input logic  [`NE+1:0]                  DivQe,
-    input logic  [`QLEN-1-(`RADIX/4):0]                DivQm,
+    input logic  [`DIVb-(`RADIX/4):0]                DivQm,
     // conversion signals
     input logic                             CvtCs,     // the result's sign
     input logic  [`NE:0]                    CvtCe,    // the calculated expoent
@@ -154,7 +154,7 @@ module postprocess (
                               .XZero, .IntToFp, .OutFmt, .CvtResUf, .CvtShiftIn);
     fmashiftcalc fmashiftcalc(.FmaSm, .Ze, .FmaPe, .FmaSCnt, .Fmt, .FmaKillProd, .NormSumExp, .FmaSe,
                           .FmaSZero, .FmaPreResultDenorm, .FmaShiftAmt, .FmaShiftIn);
-    divshiftcalc divshiftcalc(.Fmt, .DivQe, .DivQm, .DivEarlyTermShift, .DivResDenorm, .DivDenormShift, .DivShiftAmt, .DivShiftIn);
+    divshiftcalc divshiftcalc(.Fmt, .Sqrt, .DivQe, .DivQm, .DivEarlyTermShift, .DivResDenorm, .DivDenormShift, .DivShiftAmt, .DivShiftIn);
 
     always_comb
         case(PostProcSel)
@@ -199,7 +199,7 @@ module postprocess (
 
                           
     roundsign roundsign(.FmaPs, .FmaAs, .FmaInvA, .FmaOp, .DivOp, .CvtOp, .FmaNegSum, 
-                        .FmaSs, .Xs, .Ys, .CvtCs, .Ms);
+                        .Sqrt, .FmaSs, .Xs, .Ys, .CvtCs, .Ms);
 
     round round(.OutFmt, .Frm, .S, .FmaZmS, .Plus1, .PostProcSel, .CvtCe, .Qe,
                 .Ms, .FmaMe, .FmaOp, .CvtOp, .CvtResDenormUf, .Mf, .ToInt,  .CvtResUf,
diff --git a/pipelined/src/fpu/qsel.sv b/pipelined/src/fpu/qsel.sv
index 87c6a4b2..afb5b1d4 100644
--- a/pipelined/src/fpu/qsel.sv
+++ b/pipelined/src/fpu/qsel.sv
@@ -89,17 +89,17 @@ module fgen2 (
 endmodule
 
 module qsel4 (
-	input logic [`DIVLEN+3:0] D,
-	input logic [`DIVLEN+3:0] WS, WC,
+	input logic [`DIVN-2:0] D,
+	input logic [`DIVb+3:0] WS, WC,
   input logic Sqrt,
 	output logic [3:0] q
 );
 	logic [6:0] Wmsbs;
 	logic [7:0] PreWmsbs;
 	logic [2:0] Dmsbs;
-	assign PreWmsbs = WC[`DIVLEN+3:`DIVLEN-4] + WS[`DIVLEN+3:`DIVLEN-4];
+	assign PreWmsbs = WC[`DIVb+3:`DIVb-4] + WS[`DIVb+3:`DIVb-4];
 	assign Wmsbs = PreWmsbs[7:1];
-	assign Dmsbs = D[`DIVLEN-1:`DIVLEN-3];
+	assign Dmsbs = D[`DIVN-2:`DIVN-4];//|{3{D[`DIVN-2]&Sqrt}};
 	// D = 0001.xxx...
 	// Dmsbs = |   |
   // W =      xxxx.xxx...
@@ -177,8 +177,8 @@ module fgen4 (
   assign F2  = (~S << 2) & (C << 2);
   assign F1  = ~(S << 1) & C;
   assign F0  = '0;
-  assign FN1 = (SM << 1) | (C & ~(C << 2));
-  assign FN2 = (SM << 2) | ((C << 2)&~(C <<4));
+  assign FN1 = (SM << 1) | (C & ~(C << 3));
+  assign FN2 = (SM << 2) | ((C << 2)&~(C << 4));
 
   // Choose which adder input will be used
 
diff --git a/pipelined/src/fpu/roundsign.sv b/pipelined/src/fpu/roundsign.sv
index acecb594..62e882e6 100644
--- a/pipelined/src/fpu/roundsign.sv
+++ b/pipelined/src/fpu/roundsign.sv
@@ -34,6 +34,7 @@ module roundsign(
     input logic         Xs,
     input logic         Ys,
     input logic         FmaNegSum,
+    input logic         Sqrt,
     input logic         FmaOp,
     input logic         DivOp,
     input logic         CvtOp,
@@ -44,7 +45,7 @@ module roundsign(
 
     logic Qs;
 
-    assign Qs = Xs^Ys;
+    assign Qs = Xs^(Ys&~Sqrt);
 
     // Sign for rounding calulation
     assign Ms = (FmaSs&FmaOp) | (CvtCs&CvtOp) | (Qs&DivOp);
diff --git a/pipelined/src/fpu/srt.sv b/pipelined/src/fpu/srt.sv
index 633ac178..55cde36d 100644
--- a/pipelined/src/fpu/srt.sv
+++ b/pipelined/src/fpu/srt.sv
@@ -37,40 +37,43 @@ module srt(
   input  logic [`NE-1:0] Xe, Ye,
   input  logic XZeroE, YZeroE, 
   input  logic Sqrt,
-  input  logic [`DIVLEN+3:0] X,
-  input  logic [`DIVLEN+3:0] Dpreproc,
-  input  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
+  input  logic [`DIVb:0] X,
+  input  logic [`DIVN-2:0] Dpreproc,
   input  logic NegSticky,
-  output logic [`QLEN-1-(`RADIX/4):0] Qm,
-  output logic [`DIVLEN+3:0]  NextWSN, NextWCN,
-  output logic [`DIVLEN+3:0]  StickyWSA,
-  output logic [`DIVLEN+3:0]  FirstWS, FirstWC,
-  output logic [`XLEN-1:0] Rem
+  output logic [`DIVb-(`RADIX/4):0] Qm,
+  output logic [`DIVb+3:0]  NextWSN, NextWCN,
+  output logic [`DIVb+3:0]  StickyWSA,
+  output logic [`DIVb+3:0]  FirstWS, FirstWC
 );
 
-
+//QLEN = 1.(number of bits created for division)
+// N is NF+1 or XLEN
+// WC/WS is dependent on D so 4.N-1 ie N+3 bits or N+2:0 + one more bit in fraction for possible sqrt right shift
+// D is 1.N-1, but the msb is always 1 so 0.N-1 or N-1 bits or N-1:0
+// Dsel should match WC/WS so 4.N-1 ie N+3 bits or N+2:0
+// Q/QM/S/SM should be 1.b so b+1 bits or b:0
+// C needs to be the length of the final fraction 0.b so b or b-1:0
  /* verilator lint_off UNOPTFLAT */
-  logic [`DIVLEN+3:0]  WSA[`DIVCOPIES-1:0];
-  logic [`DIVLEN+3:0]  WCA[`DIVCOPIES-1:0];
-  logic [`DIVLEN+3:0]  WS[`DIVCOPIES-1:0];
-  logic [`DIVLEN+3:0]  WC[`DIVCOPIES-1:0];
-  logic [`QLEN-1:0] Q[`DIVCOPIES-1:0];
-  logic [`QLEN-1:0] QM[`DIVCOPIES-1:0];
-  logic [`QLEN-1:0] QNext[`DIVCOPIES-1:0];
-  logic [`QLEN-1:0] QMNext[`DIVCOPIES-1:0];
-  logic [`DIVLEN+3:0] S[`DIVCOPIES-1:0]; //***change to QLEN???
-  logic [`DIVLEN+3:0] SM[`DIVCOPIES-1:0];
-  logic [`DIVLEN+3:0] SNext[`DIVCOPIES-1:0];
-  logic [`DIVLEN+3:0] SMNext[`DIVCOPIES-1:0];
-  logic [`DIVLEN+3:0] C[`DIVCOPIES-1:0];
+  logic [`DIVb+3:0]  WSA[`DIVCOPIES-1:0]; // Q4.b
+  logic [`DIVb+3:0]  WCA[`DIVCOPIES-1:0]; // Q4.b
+  logic [`DIVb+3:0]  WS[`DIVCOPIES-1:0]; // Q4.b
+  logic [`DIVb+3:0]  WC[`DIVCOPIES-1:0]; // Q4.b
+  logic [`DIVb:0] Q[`DIVCOPIES-1:0]; // U1.b
+  logic [`DIVb:0] QM[`DIVCOPIES-1:0];// 1.b
+  logic [`DIVb:0] QNext[`DIVCOPIES-1:0];// U1.b
+  logic [`DIVb:0] QMNext[`DIVCOPIES-1:0];// U1.b
+  logic [`DIVb:0] S[`DIVCOPIES-1:0];// U1.b
+  logic [`DIVb:0] SM[`DIVCOPIES-1:0];// U1.b
+  logic [`DIVb:0] SNext[`DIVCOPIES-1:0];// U1.b
+  logic [`DIVb:0] SMNext[`DIVCOPIES-1:0];// U1.b
+  logic [`DIVb-1:0] C[`DIVCOPIES-1:0]; // 0.b
  /* verilator lint_on UNOPTFLAT */
-  logic [`DIVLEN+3:0]  WSN, WCN;
-  logic [`DIVLEN+3:0]  D, DBar, D2, DBar2;
-  logic [$clog2(`XLEN+1)-1:0] intExp;
-  logic           intSign;
-  logic [`QLEN-1:0] QMMux;
-  logic [`DIVLEN+3:0] CMux;
-  logic [`DIVLEN+3:0] SMux;
+  logic [`DIVb+3:0]  WSN, WCN; // Q4.N-1
+  logic [`DIVN-2:0]  D; // U0.N-1
+  logic [`DIVb+3:0]  DBar, D2, DBar2; // Q4.N-1
+  logic [`DIVb:0] QMMux;
+  logic [`DIVb-1:0] CMux;
+  logic [`DIVb:0] SMux;
 
   // Top Muxes and Registers
   // When start is asserted, the inputs are loaded into the divider.
@@ -81,27 +84,28 @@ module srt(
   //  - the assumed one is added to D since it's always normalized (and X/0 is a special case handeled by result selection)
   //  - XZeroE is used as the assumed one to avoid creating a sticky bit - all other numbers are normalized
   if (`RADIX == 2) begin : nextw
-    assign NextWSN = {WSA[`DIVCOPIES-1][`DIVLEN+2:0], 1'b0};
-    assign NextWCN = {WCA[`DIVCOPIES-1][`DIVLEN+2:0], 1'b0};
+    assign NextWSN = {WSA[`DIVCOPIES-1][`DIVb+2:0], 1'b0};
+    assign NextWCN = {WCA[`DIVCOPIES-1][`DIVb+2:0], 1'b0};
   end else begin
-    assign NextWSN = {WSA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0};
-    assign NextWCN = {WCA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0};
+    assign NextWSN = {WSA[`DIVCOPIES-1][`DIVb+1:0], 2'b0};
+    assign NextWCN = {WCA[`DIVCOPIES-1][`DIVb+1:0], 2'b0};
   end
 
-  mux2   #(`DIVLEN+4) wsmux(NextWSN, X, DivStart, WSN);
-  flopen   #(`DIVLEN+4) wsflop(clk, DivStart|DivBusy, WSN, WS[0]);
-  mux2   #(`DIVLEN+4) wcmux(NextWCN, {`DIVLEN+4{1'b0}}, DivStart, WCN);
-  flopen   #(`DIVLEN+4) wcflop(clk, DivStart|DivBusy, WCN, WC[0]);
-  flopen #(`DIVLEN+4) dflop(clk, DivStart, Dpreproc, D);
-  mux2 #(`DIVLEN+4) Cmux({2'b11, C[`DIVCOPIES-1][`DIVLEN+3:2]}, {5'b11111, Sqrt, {(`DIVLEN-2){1'b0}}}, DivStart, CMux);
-  flop #(`DIVLEN+4) cflop(clk, CMux, C[0]);
+  mux2   #(`DIVb+4) wsmux(NextWSN, {3'b0, X}, DivStart, WSN);
+  flopen   #(`DIVb+4) wsflop(clk, DivStart|DivBusy, WSN, WS[0]);
+  mux2   #(`DIVb+4) wcmux(NextWCN, '0, DivStart, WCN);
+  flopen   #(`DIVb+4) wcflop(clk, DivStart|DivBusy, WCN, WC[0]);
+  flopen #(`DIVN-1) dflop(clk, DivStart, Dpreproc, D);
+  mux2 #(`DIVb) Cmux({2'b11, C[`DIVCOPIES-1][`DIVb-1:2]}, {Sqrt, {(`DIVb-1){1'b0}}}, DivStart, CMux);
+  flop #(`DIVb) cflop(clk, CMux, C[0]);
 
   // Divisor Selections
-  // - choose the negitive version of what's being selected
-  assign DBar = ~D;
+  //  - choose the negative version of what's being selected
+  //  - D is only the fraction
+  assign DBar = {3'b111, 1'b0, ~D, {`DIVb-`DIVN+1{1'b1}}};
   if(`RADIX == 4) begin : d2
-    assign DBar2 = {~D[`DIVLEN+2:0], 1'b1};
-    assign D2 = {D[`DIVLEN+2:0], 1'b0};
+    assign DBar2 = {2'b11, 1'b0, ~D, {`DIVb+2-`DIVN{1'b1}}};
+    assign D2 = {2'b0, 1'b1, D, {`DIVb+2-`DIVN{1'b0}}};
   end
 
   genvar i;
@@ -112,12 +116,13 @@ module srt(
       .C(C[i]), .S(S[i]), .SM(SM[i]), .SNext(SNext[i]), .SMNext(SMNext[i]));
       if(i<(`DIVCOPIES-1)) begin 
         if (`RADIX==2)begin 
-          assign WS[i+1] = {WSA[i][`DIVLEN+1:0], 1'b0};
-          assign WC[i+1] = {WCA[i][`DIVLEN+1:0], 1'b0};
+          assign WS[i+1] = {WSA[i][`DIVb+2:0], 1'b0};
+          assign WC[i+1] = {WCA[i][`DIVb+2:0], 1'b0};
+          assign  C[i+1] = {1'b1, C[i][`DIVb-1:1]};
         end else begin
-          assign WS[i+1] = {WSA[i][`DIVLEN+1:0], 2'b0};
-          assign WC[i+1] = {WCA[i][`DIVLEN+1:0], 2'b0};
-          assign  C[i+1] = {2'b11, C[i][`DIVLEN+3:2]};
+          assign WS[i+1] = {WSA[i][`DIVb+1:0], 2'b0};
+          assign WC[i+1] = {WCA[i][`DIVb+1:0], 2'b0};
+          assign  C[i+1] = {2'b11, C[i][`DIVb-1:2]};
         end
         assign Q[i+1] = QNext[i];
         assign QM[i+1] = QMNext[i];
@@ -128,30 +133,30 @@ module srt(
   endgenerate
 
   // if starting a new divison set Q to 0 and QM to -1
-  mux2 #(`QLEN) QMmux(QMNext[`DIVCOPIES-1], {`QLEN{1'b1}}, DivStart, QMMux);
-  flopenr #(`QLEN) Qreg(clk, DivStart, DivBusy, QNext[`DIVCOPIES-1], Q[0]);
-  flopen #(`QLEN) QMreg(clk, DivBusy, QMMux, QM[0]);
-
-  flopr #(`DIVLEN+4) SMreg(clk, DivStart, SMNext[`DIVCOPIES-1], SM[0]);
-  mux2 #(`DIVLEN+4) Smux(SNext[`DIVCOPIES-1], {3'b000, Sqrt, {(`DIVLEN){1'b0}}}, DivStart, SMux);
-  flop #(`DIVLEN+4) Sreg(clk, SMux, S[0]);
+  mux2 #(`DIVb+1) QMmux(QMNext[`DIVCOPIES-1], '1, DivStart, QMMux);
+  flopenr #(`DIVb+1) Qreg(clk, DivStart, DivBusy, QNext[`DIVCOPIES-1], Q[0]);
+  flopen #(`DIVb+1) QMreg(clk, DivBusy, QMMux, QM[0]);
 
+  flopr #(`DIVb+1) SMreg(clk, DivStart, SMNext[`DIVCOPIES-1], SM[0]);
+  mux2 #(`DIVb+1) Smux(SNext[`DIVCOPIES-1], {Sqrt, {(`DIVb){1'b0}}}, DivStart, SMux);
+  flop #(`DIVb+1) Sreg(clk, SMux, S[0]);
+ // division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
   always_comb
-    if(Sqrt)
-      if(NegSticky) Qm = SM[0][`QLEN-1-(`RADIX/4):0];
-      else          Qm = S[0][`QLEN-1-(`RADIX/4):0];
+    if(Sqrt) // sqrt outputs in the range (1, .5]
+      if(NegSticky) Qm = {SM[0][`DIVb-1-(`RADIX/4):0], 1'b0};
+      else          Qm = {S[0][`DIVb-1-(`RADIX/4):0], 1'b0};
     else  
-      if(NegSticky) Qm = QM[0][`QLEN-1-(`RADIX/4):0];
-      else          Qm = Q[0][`QLEN-1-(`RADIX/4):0];
+      if(NegSticky) Qm = QM[0][`DIVb-(`RADIX/4):0];
+      else          Qm = Q[0][`DIVb-(`RADIX/4):0];
 
   assign FirstWS = WS[0];
   assign FirstWC = WC[0];
 
   if(`RADIX==2)
     if (`DIVCOPIES == 1)
-      assign StickyWSA = {WSA[0][`DIVLEN+2:0], 1'b0};
+      assign StickyWSA = {WSA[0][`DIVb+2:0], 1'b0};
     else
-      assign StickyWSA = {WSA[1][`DIVLEN+2:0], 1'b0};
+      assign StickyWSA = {WSA[1][`DIVb+2:0], 1'b0};
 
 
 endmodule
@@ -162,24 +167,24 @@ endmodule
 
  /* verilator lint_off UNOPTFLAT */
 module divinteration (
-  input logic [`DIVLEN+3:0] D,
-  input logic [`DIVLEN+3:0]  DBar, D2, DBar2,
-  input logic [`QLEN-1:0] Q, QM,
-  input logic [`DIVLEN+3:0] S, SM,
-  input logic [`DIVLEN+3:0]  WS, WC,
-  input logic [`DIVLEN+3:0] C,
+  input logic [`DIVN-2:0] D,
+  input logic [`DIVb+3:0]  DBar, D2, DBar2,
+  input logic [`DIVb:0] Q, QM,
+  input logic [`DIVb:0] S, SM,
+  input logic [`DIVb+3:0]  WS, WC,
+  input logic [`DIVb-1:0] C,
   input logic Sqrt,
-  output logic [`QLEN-1:0] QNext, QMNext, 
-  output logic [`DIVLEN+3:0] SNext, SMNext, 
-  output logic [`DIVLEN+3:0]  WSA, WCA
+  output logic [`DIVb:0] QNext, QMNext, 
+  output logic [`DIVb:0] SNext, SMNext, 
+  output logic [`DIVb+3:0]  WSA, WCA
 );
  /* verilator lint_on UNOPTFLAT */
 
-  logic [`DIVLEN+3:0]  Dsel;
+  logic [`DIVb+3:0]  Dsel;
   logic [3:0]     q;
-  logic qp, qz;//, qn;
-  logic [`DIVLEN+3:0] F;
-  logic [`DIVLEN+3:0] AddIn;
+  logic qp, qz;
+  logic [`DIVb+3:0] F;
+  logic [`DIVb+3:0] AddIn;
 
   // Qmient Selection logic
   // Given partial remainder, select quotient of +1, 0, or -1 (qp, qz, pm)
@@ -190,21 +195,21 @@ module divinteration (
 	// 0010 = -1
 	// 0001 = -2
   if(`RADIX == 2) begin : qsel
-    qsel2 qsel2(WS[`DIVLEN+3:`DIVLEN], WC[`DIVLEN+3:`DIVLEN], qp, qz);//, qn);
+    qsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], qp, qz);
   end else begin
     qsel4 qsel4(.D, .WS, .WC, .Sqrt, .q);
-    fgen4 fgen4(.s(q), .C, .S, .SM, .F);
+    // fgen4 fgen4(.s(q), .C, .S, .SM, .F);
   end
 
   if(`RADIX == 2) begin : dsel
-    assign Dsel = {`DIVLEN+4{~qz}}&(qp ? DBar : D);
+    assign Dsel = {`DIVb+4{~qz}}&(qp ? DBar : {3'b0, 1'b1, D, {`DIVb-`DIVN+1{1'b0}}});
   end else begin
     always_comb
       case (q)
         4'b1000: Dsel = DBar2;
         4'b0100: Dsel = DBar;
         4'b0000: Dsel = '0;
-        4'b0010: Dsel = D;
+        4'b0010: Dsel = {3'b0, 1'b1, D, {`DIVb-`DIVN+1{1'b0}}};
         4'b0001: Dsel = D2;
         default: Dsel = 'x;
       endcase
@@ -213,16 +218,16 @@ module divinteration (
   //  WSA, WCA = WS + WC - qD
   assign AddIn = Sqrt ? F : Dsel;
   if (`RADIX == 2) begin : csa
-    csa #(`DIVLEN+4) csa(WS, WC, AddIn, qp, WSA, WCA);
+    csa #(`DIVb+4) csa(WS, WC, AddIn, qp, WSA, WCA);
   end else begin
-    csa #(`DIVLEN+4) csa(WS, WC, AddIn, |q[3:2], WSA, WCA);
+    csa #(`DIVb+4) csa(WS, WC, AddIn, |q[3:2]&~Sqrt, WSA, WCA);
   end
 
   if (`RADIX == 2) begin : otfc
     otfc2 otfc2(.qp, .qz, .Q, .QM, .QNext, .QMNext);
   end else begin
     otfc4 otfc4(.q, .Q, .QM, .QNext, .QMNext);
-    sotfc4 sotfc4(.s(q), .Sqrt, .C, .S, .SM, .SNext, .SMNext);
+    // sotfc4 sotfc4(.s(q), .Sqrt, .C, .S, .SM, .SNext, .SMNext);
   end
 
 endmodule
diff --git a/pipelined/src/fpu/srtfsm.sv b/pipelined/src/fpu/srtfsm.sv
index 597f96cd..7fe6b6b7 100644
--- a/pipelined/src/fpu/srtfsm.sv
+++ b/pipelined/src/fpu/srtfsm.sv
@@ -33,14 +33,16 @@
 module srtfsm(
   input  logic clk, 
   input  logic reset, 
-  input logic [`DIVLEN+3:0] NextWSN, NextWCN, WS, WC,
+  input logic [`DIVb+3:0] NextWSN, NextWCN, WS, WC,
   input  logic XInfE, YInfE, 
   input  logic XZeroE, YZeroE, 
   input  logic XNaNE, YNaNE, 
   input  logic DivStart, 
+  input  logic XsE,
+  input  logic SqrtE,
   input  logic StallE,
   input  logic StallM,
-  input  logic [`DIVLEN+3:0] StickyWSA,
+  input  logic [`DIVb+3:0] StickyWSA,
   input  logic [`DURLEN-1:0] Dur,
   output logic [`DURLEN-1:0] EarlyTermShiftE,
   output logic DivSE,
@@ -55,11 +57,11 @@ module srtfsm(
   logic [`DURLEN-1:0] step;
   logic WZero;
   //logic [$clog2(`DIVLEN/2+3)-1:0] Dur;
-  logic [`DIVLEN+3:0] W;
+  logic [`DIVb+3:0] W;
 
   //flopen #($clog2(`DIVLEN/2+3)) durflop(clk, DivStart, CalcDur, Dur);
   assign DivBusy = (state == BUSY);
-  assign WZero = ((NextWSN^NextWCN)=={NextWSN[`DIVLEN+2:0]|NextWCN[`DIVLEN+2:0], 1'b0});
+  assign WZero = ((NextWSN^NextWCN)=={NextWSN[`DIVb+2:0]|NextWCN[`DIVb+2:0], 1'b0});
   // calculate sticky bit
   //    - there is a chance that a value is subtracted infinitly, resulting in an exact QM result
   //      this is only a problem on radix 2 (and pssibly maximally redundant 4) since minimally redundant
@@ -70,7 +72,7 @@ module srtfsm(
     assign DivSE = |W;
   assign DivDone = (state == DONE);
   assign W = WC+WS;
-  assign NegSticky = W[`DIVLEN+3]; //*** is there a better way to do this???
+  assign NegSticky = W[`DIVb+3];
   assign EarlyTermShiftE = step;
 
   always_ff @(posedge clk) begin
@@ -78,7 +80,7 @@ module srtfsm(
           state <= #1 IDLE; 
       end else if (DivStart&~StallE) begin 
           step <= Dur;
-          if (XZeroE|YZeroE|XInfE|YInfE|XNaNE|YNaNE) state <= #1 DONE;
+          if (XZeroE|YZeroE|XInfE|YInfE|XNaNE|YNaNE|(XsE&SqrtE)) state <= #1 DONE;
           else         state <= #1 BUSY;
       end else if (state == BUSY) begin
           if ((~|step[`DURLEN-1:1]&step[0])|WZero) begin
diff --git a/pipelined/src/fpu/srtpreproc.sv b/pipelined/src/fpu/srtpreproc.sv
index 4d260917..63b2b977 100644
--- a/pipelined/src/fpu/srtpreproc.sv
+++ b/pipelined/src/fpu/srtpreproc.sv
@@ -39,16 +39,16 @@ module srtpreproc (
   input  logic Sqrt,
   input logic XZero,
   output logic  [`NE+1:0] QeM,
-  output logic [`DIVLEN+3:0] X,
-  output logic [`DIVLEN+3:0] Dpreproc,
-  output logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
+  output logic [`DIVb:0] X,
+  output logic [`DIVN-2:0] Dpreproc,
   output logic [`DURLEN-1:0] Dur
 );
   // logic  [`XLEN-1:0] PosA, PosB;
   // logic  [`DIVLEN-1:0] ExtraA, ExtraB, PreprocA, PreprocB, PreprocX, PreprocY;
   logic  [`NF-1:0] PreprocA, PreprocX;
   logic  [`NF-1:0] PreprocB, PreprocY;
-  logic  [`NF+3:0] SqrtX;
+  logic  [`NF+1:0] SqrtX;
+  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
   logic [`NE+1:0] Qe;
 
   // assign PosA = (Signed & SrcA[`XLEN - 1]) ? -SrcA : SrcA;
@@ -70,9 +70,9 @@ module srtpreproc (
   assign PreprocY = Ym[`NF-1:0]<<YZeroCnt;
 
   
-  assign SqrtX = Xe[0] ? {3'b110, ~XZero, PreprocX} : {2'b11, ~XZero, PreprocX, 1'b0};
-  assign X = Sqrt ? {SqrtX, {`DIVLEN-`NF{1'b0}}} : {3'b000, ~XZero, PreprocX, {`DIVLEN-`NF{1'b0}}};
-  assign Dpreproc = {4'b0001, /*Int ? PreprocB : */PreprocY, {`DIVLEN-`NF{1'b0}}};
+  assign SqrtX = Xe[0]^XZeroCnt[0] ? {1'b0, ~XZero, PreprocX} : {~XZero, PreprocX, 1'b0};
+  assign X = Sqrt ? {SqrtX, {`DIVb-1-`NF{1'b0}}} : {~XZero, PreprocX, {`DIVb-`NF{1'b0}}};
+  assign Dpreproc = {PreprocY, {`DIVN-1-`NF{1'b0}}};
   assign Dur = (`DURLEN)'(`FPDUR);
 
   //           radix 2     radix 4
@@ -99,7 +99,8 @@ module expcalc(
   output logic  [`NE+1:0] Qe
   );
   logic [`NE-2:0] Bias;
-  logic [`NE-1:0] SExp, SXExp;
+  logic [`NE+1:0] SXExp;
+  logic [`NE+1:0] SExp;
   logic [`NE+1:0] DExp;
   
   if (`FPSIZES == 1) begin
@@ -126,10 +127,10 @@ module expcalc(
             2'h2: Bias =  (`NE-1)'(`H_BIAS);
         endcase
   end
-  assign SXExp = Xe - (`NE)'(`BIAS);
-  assign SExp  = {1'b0, SXExp[`NE-1:1]} + Bias;
+  assign SXExp = {2'b0, Xe} - {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, XZeroCnt} - (`NE+1)'(`BIAS); // zero-extended Xe, minus the zero-padded XZeroCnt, minus BIAS
+  assign SExp  = {SXExp[`NE+1], SXExp[`NE+1:1]} + {2'b0, Bias}; // SXExp shifted right by one bit with its top bit replicated, then Bias added
   // correct exponent for denormalized input's normalization shifts
   assign DExp = ({2'b0, Xe} - {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, XZeroCnt} - {2'b0, Ye} + {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, YZeroCnt} + {3'b0, Bias})&{`NE+2{~XZero}};
   
-  assign Qe = Sqrt ? {2'b0, SExp} : DExp;
+  assign Qe = Sqrt ? SExp : DExp;
 endmodule
\ No newline at end of file
diff --git a/pipelined/srt/inttestgen.c b/pipelined/srt/inttestgen.c
index 8c83d796..911ce9a5 100644
--- a/pipelined/srt/inttestgen.c
+++ b/pipelined/srt/inttestgen.c
@@ -8,7 +8,7 @@
 /* #includes */
 
 #include <stdio.h>
-#include <stdlib.h>
+#include <stdlib.h> 
 #include <math.h>
 
 /* Constants */
diff --git a/pipelined/testbench/testbench-fp.sv b/pipelined/testbench/testbench-fp.sv
index 9be68f50..17383d1f 100644
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@@ -80,7 +80,7 @@ module testbenchfp;
   logic CvtResSgnE;
   logic [`NE:0]           CvtCalcExpE;    // the calculated expoent
 	logic [`LOGCVTLEN-1:0] CvtShiftAmtE;  // how much to shift by
-	logic [`QLEN-1-(`RADIX/4):0] Quot;
+	logic [`DIVb-(`RADIX/4):0] Quot;
   logic CvtResDenormUfE;
   logic [`DURLEN-1:0] EarlyTermShift;
   logic DivStart, DivBusy;
@@ -256,16 +256,16 @@ module testbenchfp;
             Fmt = {Fmt, 2'b11};
           end
       end
-      // if (TEST === "sqrt"  | TEST === "all") begin // if square-root is being tested
-      //   // add the square-root tests/op-ctrls/unit/fmt
-      //   Tests = {Tests, f128sqrt};
-      //   OpCtrl = {OpCtrl, `SQRT_OPCTRL};
-      //   WriteInt = {WriteInt, 1'b0};
-      //     for(int i = 0; i<5; i++) begin
-      //       Unit = {Unit, `DIVUNIT};
-      //       Fmt = {Fmt, 2'b11};
-      //     end
-      // end
+      if (TEST === "sqrt"  | TEST === "all") begin // if square-root is being tested
+        // add the square-root tests/op-ctrls/unit/fmt
+        Tests = {Tests, f128sqrt};
+        OpCtrl = {OpCtrl, `SQRT_OPCTRL};
+        WriteInt = {WriteInt, 1'b0};
+          for(int i = 0; i<5; i++) begin
+            Unit = {Unit, `DIVUNIT};
+            Fmt = {Fmt, 2'b11};
+          end
+      end
       if (TEST === "fma"   | TEST === "all") begin  // if fused-mutliply-add is being tested
         Tests = {Tests, f128fma};
         OpCtrl = {OpCtrl, `FMA_OPCTRL};
@@ -383,16 +383,16 @@ module testbenchfp;
           Fmt = {Fmt, 2'b01};
         end
       end
-      // if (TEST === "sqrt"  | TEST === "all") begin // if square-root is being tessted
-      //   // add the correct tests/op-ctrls/unit/fmt to their lists
-      //   Tests = {Tests, f64sqrt};
-      //   OpCtrl = {OpCtrl, `SQRT_OPCTRL};
-      //   WriteInt = {WriteInt, 1'b0};
-      //   for(int i = 0; i<5; i++) begin
-      //     Unit = {Unit, `DIVUNIT};
-      //     Fmt = {Fmt, 2'b01};
-      //   end
-      // end
+      if (TEST === "sqrt"  | TEST === "all") begin // if square-root is being tested
+        // add the correct tests/op-ctrls/unit/fmt to their lists
+        Tests = {Tests, f64sqrt};
+        OpCtrl = {OpCtrl, `SQRT_OPCTRL};
+        WriteInt = {WriteInt, 1'b0};
+        for(int i = 0; i<5; i++) begin
+          Unit = {Unit, `DIVUNIT};
+          Fmt = {Fmt, 2'b01};
+        end
+      end
       if (TEST === "fma"   | TEST === "all") begin // if the fused multiply add is being tested
         Tests = {Tests, f64fma};
         OpCtrl = {OpCtrl, `FMA_OPCTRL};
@@ -494,16 +494,16 @@ module testbenchfp;
           Fmt = {Fmt, 2'b00};
         end
       end
-      // if (TEST === "sqrt"  | TEST === "all") begin // if sqrt is being tested
-      //   // add the correct tests/op-ctrls/unit/fmt to their lists
-      //   Tests = {Tests, f32sqrt};
-      //   OpCtrl = {OpCtrl, `SQRT_OPCTRL};
-      //   WriteInt = {WriteInt, 1'b0};
-      //   for(int i = 0; i<5; i++) begin
-      //     Unit = {Unit, `DIVUNIT};
-      //     Fmt = {Fmt, 2'b00};
-      //   end
-      // end
+      if (TEST === "sqrt"  | TEST === "all") begin // if sqrt is being tested
+        // add the correct tests/op-ctrls/unit/fmt to their lists
+        Tests = {Tests, f32sqrt};
+        OpCtrl = {OpCtrl, `SQRT_OPCTRL};
+        WriteInt = {WriteInt, 1'b0};
+        for(int i = 0; i<5; i++) begin
+          Unit = {Unit, `DIVUNIT};
+          Fmt = {Fmt, 2'b00};
+        end
+      end
       if (TEST === "fma"   | TEST === "all")  begin // if fma is being tested
         Tests = {Tests, f32fma};
         OpCtrl = {OpCtrl, `FMA_OPCTRL};
@@ -587,16 +587,16 @@ module testbenchfp;
           Fmt = {Fmt, 2'b10};
         end
       end
-      // if (TEST === "sqrt"  | TEST === "all") begin // if sqrt is being tested
-      //   // add the correct tests/op-ctrls/unit/fmt to their lists
-      //   Tests = {Tests, f16sqrt};
-      //   OpCtrl = {OpCtrl, `SQRT_OPCTRL};
-      //   WriteInt = {WriteInt, 1'b0};
-      //   for(int i = 0; i<5; i++) begin
-      //     Unit = {Unit, `DIVUNIT};
-      //     Fmt = {Fmt, 2'b10};
-      //   end
-      // end
+      if (TEST === "sqrt"  | TEST === "all") begin // if sqrt is being tested
+        // add the correct tests/op-ctrls/unit/fmt to their lists
+        Tests = {Tests, f16sqrt};
+        OpCtrl = {OpCtrl, `SQRT_OPCTRL};
+        WriteInt = {WriteInt, 1'b0};
+        for(int i = 0; i<5; i++) begin
+          Unit = {Unit, `DIVUNIT};
+          Fmt = {Fmt, 2'b10};
+        end
+      end
       if (TEST === "fma"   | TEST === "all") begin // if fma is being tested
         Tests = {Tests, f16fma};
         OpCtrl = {OpCtrl, `FMA_OPCTRL};
@@ -697,7 +697,7 @@ module testbenchfp;
   fcmp fcmp   (.Fmt(ModFmt), .OpCtrl(OpCtrlVal), .Xs, .Ys, .Xe, .Ye, 
               .Xm, .Ym, .XZero, .YZero, .CmpIntRes(CmpRes),
               .XNaN, .YNaN, .XSNaN, .YSNaN, .X, .Y, .CmpNV(CmpFlg[4]), .CmpFpRes(FpCmpRes));
-  divsqrt divsqrt(.clk, .reset, .FmtE(ModFmt), .XmE(Xm), .YmE(Ym), .XeE(Xe), .YeE(Ye), .SqrtE(1'b0), .SqrtM(1'b0),
+  divsqrt divsqrt(.clk, .reset, .XsE(Xs), .FmtE(ModFmt), .XmE(Xm), .YmE(Ym), .XeE(Xe), .YeE(Ye), .SqrtE(OpCtrlVal[0]), .SqrtM(OpCtrlVal[0]),
                   .XInfE(XInf), .YInfE(YInf), .XZeroE(XZero), .YZeroE(YZero), .XNaNE(XNaN), .YNaNE(YNaN), .DivStartE(DivStart), 
                   .StallE(1'b0), .StallM(1'b0), .DivSM(DivSticky), .DivBusy, .QeM(DivCalcExp),
                   .EarlyTermShiftM(EarlyTermShift), .QmM(Quot), .DivDone);
@@ -1007,40 +1007,72 @@ module readvectors (
           end
         endcase
       `DIVUNIT:
-        case (Fmt)
-          2'b11: begin       // quad
-            X = TestVector[8+3*(`Q_LEN)-1:8+2*(`Q_LEN)];
-            Y = TestVector[8+2*(`Q_LEN)-1:8+(`Q_LEN)];
-            Ans = TestVector[8+(`Q_LEN-1):8];
-            if (~clk) #5;
-            DivStart = 1'b1; #10 // one clk cycle
-            DivStart = 1'b0;
-          end
-          2'b01:	if (`D_SUPPORTED)begin	  // double
-            X = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+3*(`D_LEN)-1:8+2*(`D_LEN)]};
-            Y = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+2*(`D_LEN)-1:8+(`D_LEN)]};
-            Ans = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+(`D_LEN-1):8]};
-            if (~clk) #5;
-            DivStart = 1'b1; #10
-            DivStart = 1'b0;
-          end
-          2'b00:	if (`S_SUPPORTED)begin	  // single
-            X = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+3*(`S_LEN)-1:8+2*(`S_LEN)]};
-            Y = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+2*(`S_LEN)-1:8+1*(`S_LEN)]};
-            Ans = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+(`S_LEN-1):8]};
-            if (~clk) #5;
-            DivStart = 1'b1; #10
-            DivStart = 1'b0;
-          end
-          2'b10:	begin	  // half
-            X = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+3*(`H_LEN)-1:8+2*(`H_LEN)]};
-            Y = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+2*(`H_LEN)-1:8+(`H_LEN)]};
-            Ans = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+(`H_LEN-1):8]};
-            if (~clk) #5;
-            DivStart = 1'b1; #10
-            DivStart = 1'b0;
-          end
-        endcase
+        if(OpCtrl[0]) // OpCtrl[0] set: one-operand vectors, only X and Ans are extracted (presumably sqrt - confirm against the op encoding)
+          case (Fmt)
+            2'b11: begin       // quad
+              X = TestVector[8+2*(`Q_LEN)-1:8+(`Q_LEN)]; // single operand: the Q_LEN-bit field directly above Ans
+              Ans = TestVector[8+(`Q_LEN-1):8]; // expected result: Q_LEN bits starting at bit 8 (bits [7:0] are not read here)
+              if (~clk) #5; // if clk is low, wait 5 time units before raising DivStart
+              DivStart = 1'b1; #10 // one clk cycle
+              DivStart = 1'b0;
+            end
+            2'b01:	if (`D_SUPPORTED)begin	  // double
+              X = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+2*(`D_LEN)-1:8+(`D_LEN)]}; // upper FLEN-D_LEN bits are filled with ones
+              Ans = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+(`D_LEN-1):8]};
+              if (~clk) #5;
+              DivStart = 1'b1; #10
+              DivStart = 1'b0;
+            end
+            2'b00:	if (`S_SUPPORTED)begin	  // single
+              X = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+2*(`S_LEN)-1:8+1*(`S_LEN)]};
+              Ans = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+(`S_LEN-1):8]};
+              if (~clk) #5;
+              DivStart = 1'b1; #10
+              DivStart = 1'b0;
+            end
+            2'b10:	begin	  // half
+              X = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+2*(`H_LEN)-1:8+(`H_LEN)]};
+              Ans = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+(`H_LEN-1):8]};
+              if (~clk) #5;
+              DivStart = 1'b1; #10
+              DivStart = 1'b0;
+            end
+          endcase
+        else // OpCtrl[0] clear: two-operand vectors, X, Y and Ans are extracted
+          case (Fmt)
+            2'b11: begin       // quad
+              X = TestVector[8+3*(`Q_LEN)-1:8+2*(`Q_LEN)]; // first operand: topmost of the three Q_LEN-bit fields
+              Y = TestVector[8+2*(`Q_LEN)-1:8+(`Q_LEN)]; // second operand: middle field
+              Ans = TestVector[8+(`Q_LEN-1):8]; // expected result: lowest field, starting at bit 8
+              if (~clk) #5; // if clk is low, wait 5 time units before raising DivStart
+              DivStart = 1'b1; #10 // one clk cycle
+              DivStart = 1'b0;
+            end
+            2'b01:	if (`D_SUPPORTED)begin	  // double
+              X = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+3*(`D_LEN)-1:8+2*(`D_LEN)]}; // upper FLEN-D_LEN bits are filled with ones
+              Y = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+2*(`D_LEN)-1:8+(`D_LEN)]};
+              Ans = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+(`D_LEN-1):8]};
+              if (~clk) #5;
+              DivStart = 1'b1; #10
+              DivStart = 1'b0;
+            end
+            2'b00:	if (`S_SUPPORTED)begin	  // single
+              X = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+3*(`S_LEN)-1:8+2*(`S_LEN)]};
+              Y = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+2*(`S_LEN)-1:8+1*(`S_LEN)]};
+              Ans = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+(`S_LEN-1):8]};
+              if (~clk) #5;
+              DivStart = 1'b1; #10
+              DivStart = 1'b0;
+            end
+            2'b10:	begin	  // half
+              X = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+3*(`H_LEN)-1:8+2*(`H_LEN)]};
+              Y = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+2*(`H_LEN)-1:8+(`H_LEN)]};
+              Ans = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+(`H_LEN-1):8]};
+              if (~clk) #5;
+              DivStart = 1'b1; #10
+              DivStart = 1'b0;
+            end
+          endcase
       `CMPUNIT:
         case (Fmt)        
           2'b11: begin       // quad
@@ -1259,7 +1291,7 @@ module readvectors (
   end
   
   assign XEn = ~((Unit == `CVTINTUNIT)&OpCtrl[2]);
-  assign YEn = ~((Unit == `CVTINTUNIT)|(Unit == `CVTFPUNIT));
+  assign YEn = ~((Unit == `CVTINTUNIT)|(Unit == `CVTFPUNIT)|((Unit == `DIVUNIT)&OpCtrl[0]));
   assign ZEn = (Unit == `FMAUNIT);
   
   unpack unpack(.X, .Y, .Z, .Fmt(ModFmt), .Xs, .Ys, .Zs, .Xe, .Ye, .Ze,