diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index 14924fcc..cd5bb05e 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -102,7 +102,7 @@
 `define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+8) ? (`DIVRESLEN+`NF) : (3*`NF+6))
 
 // division constants
-`define RADIX 32'h2
+`define RADIX 32'h4
 `define DIVCOPIES 32'h3
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF + 3))
 // `define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
index 5b740f5a..43f7687c 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -61,6 +61,7 @@ module fdivsqrt(
   logic [`DIVb+1:0] FirstC;
   logic Firstun;
   logic WZero;
+  logic SpecialCaseM;
 
   fdivsqrtpreproc fdivsqrtpreproc(
     .clk, .DivStart(DivStartE), .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
@@ -69,11 +70,11 @@ module fdivsqrt(
     .clk, .reset, .FmtE, .XsE, .SqrtE, 
     .DivBusy, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, 
     .XNaNE, .YNaNE,
-    .XInfE, .YInfE, .WZero);
+    .XInfE, .YInfE, .WZero, .SpecialCaseM);
   fdivsqrtiter fdivsqrtiter(
     .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, .SqrtM, 
     .X,.Dpreproc, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, 
     .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
     .DivBusy);
-  fdivsqrtpostproc fdivsqrtpostproc(.WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .Firstun, .SqrtM, .QmM, .WZero, .DivSM);
+  fdivsqrtpostproc fdivsqrtpostproc(.WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .Firstun, .SqrtM, .SpecialCaseM, .QmM, .WZero, .DivSM);
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfgen4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
index 08b2dfab..9b0427aa 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
@@ -31,7 +31,7 @@
 `include "wally-config.vh"
 
 module fdivsqrtfgen4 (
-  input  logic [3:0] u,
+  input  logic [3:0] udigit,
   input  logic [`DIVb+3:0] C, U, UM,
   output logic [`DIVb+3:0] F
 );
@@ -47,9 +47,9 @@ module fdivsqrtfgen4 (
   // Choose which adder input will be used
 
   always_comb
-    if (u[3])       F = F2;
-    else if (u[2])  F = F1;
-    else if (U[1])  F = FN1;
-    else if (u[0])  F = FN2;
+    if (udigit[3])       F = F2;
+    else if (udigit[2])  F = F1;
+    else if (udigit[1])  F = FN1;
+    else if (udigit[0])  F = FN2;
     else            F = F0;
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index cc1294f2..db11dcef 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -44,18 +44,20 @@ module fdivsqrtfsm(
   input  logic StallM,
   input logic WZero,
   output logic DivDone,
-  output logic DivBusy
+  output logic DivBusy,
+  output logic SpecialCaseM
 );
   
   typedef enum logic [1:0] {IDLE, BUSY, DONE} statetype;
   statetype state;
 
   logic [`DURLEN-1:0] step;
-  logic SpecialCase;
   logic [`DURLEN-1:0] cycles;
+  logic SpecialCaseE;
 
   // terminate immediately on special cases
-  assign SpecialCase = XZeroE | (YZeroE&~SqrtE) | XInfE | YInfE | XNaNE | YNaNE | (XsE&SqrtE);
+  assign SpecialCaseE = XZeroE | (YZeroE&~SqrtE) | XInfE | YInfE | XNaNE | YNaNE | (XsE&SqrtE);
+  flopenr #(1) SpecialCaseReg(clk, reset, ~StallM, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
 
 // DIVN = `NF+3
 // NS = NF + 1
@@ -103,7 +105,7 @@ module fdivsqrtfsm(
           step <= cycles; // *** this should be adjusted to depend on the precision; sqrt should use one fewer step becasue firststep=1
 //          $display("Setting Nf = %d fbits %d cycles = %d FmtE %d FPSIZES = %d Q_NF = %d num = %d denom = %d\n", Nf, fbits, cycles, FmtE, `FPSIZES, `Q_NF,
 //          (fbits +(`LOGR*`DIVCOPIES)-1), (`LOGR*`DIVCOPIES));
-          if (SpecialCase) state <= #1 DONE;
+          if (SpecialCaseE) state <= #1 DONE;
           else             state <= #1 BUSY;
       end else if (DivDone) begin
         if (StallM) state <= #1 DONE;
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 795879cb..e0acd0ed 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -37,6 +37,7 @@ module fdivsqrtpostproc(
   input logic [`DIVb+1:0] FirstC,
   input logic  Firstun,
   input logic SqrtM,
+  input logic SpecialCaseM,
   output logic [`DIVb:0] QmM, 
   output logic WZero,
   output logic DivSM
@@ -64,7 +65,7 @@ module fdivsqrtpostproc(
   end else begin
     assign WZero = weq0;
   end 
-  assign DivSM = ~WZero;
+  assign DivSM = ~WZero & ~(SpecialCaseM & SqrtM); // *** unclear why SpecialCaseM must be gated by SqrtM, but without the gate the divide regression fails
 
   // Determine if sticky bit is negative
   assign W = WC+WS;
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 9b357862..2a6f6a9e 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -61,6 +61,8 @@ module fdivsqrtpreproc (
 
   assign SqrtX = Xe[0]^XZeroCnt[0] ? {1'b0, ~XZero, PreprocX} : {~XZero, PreprocX, 1'b0};
   assign DivX = {3'b000, ~XZero, PreprocX, {`DIVb-`NF{1'b0}}};
+
+  // *** explain why the sqrt X operand is shifted left by one bit for radix 4 relative to radix 2 (DivX is the same for both)
   if (`RADIX == 2)  assign X = Sqrt ? {3'b111, SqrtX, {`DIVb-1-`NF{1'b0}}} : DivX;
   else              assign X = Sqrt ? {2'b11, SqrtX, {`DIVb-1-`NF{1'b0}}, 1'b0} : DivX;
   assign Dpreproc = {PreprocY, {`DIVN-1-`NF{1'b0}}};
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv
index f0a6cae0..4379724f 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv
@@ -35,7 +35,7 @@ module fdivsqrtqsel4 (
   input logic [4:0] Smsbs,
   input logic [`DIVb+3:0] WS, WC,
   input logic Sqrt, j1,
-  output logic [3:0] u
+  output logic [3:0] udigit
 );
 	logic [6:0] Wmsbs;
 	logic [7:0] PreWmsbs;
@@ -107,6 +107,6 @@ module fdivsqrtqsel4 (
       else if (Smsbs == 5'b10000) A = 3'b111;
       else A = Smsbs[2:0];
     end else A = Dmsbs;
-	assign u = USel4[{A,Wmsbs}];
+	assign udigit = USel4[{A,Wmsbs}];
 	
 endmodule
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index 9fa655c3..e463762a 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -46,7 +46,7 @@ module fdivsqrtstage4 (
  /* verilator lint_on UNOPTFLAT */
 
   logic [`DIVb+3:0]  Dsel;
-  logic [3:0]     u;
+  logic [3:0]     udigit;
   logic [`DIVb+3:0] F;
   logic [`DIVb+3:0] AddIn;
   logic [4:0] Smsbs;
@@ -61,11 +61,11 @@ module fdivsqrtstage4 (
 	// 0010 = -1
 	// 0001 = -2
   assign Smsbs = U[`DIVb:`DIVb-4];
-  fdivsqrtqsel4 qsel4(.D, .Smsbs, .WS, .WC, .Sqrt(SqrtM), .j1, .u);
-  fdivsqrtfgen4 fgen4(.u, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F);
+  fdivsqrtqsel4 qsel4(.D, .Smsbs, .WS, .WC, .Sqrt(SqrtM), .j1, .udigit);
+  fdivsqrtfgen4 fgen4(.udigit, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F);
 
   always_comb
-  case (u)
+  case (udigit)
     4'b1000: Dsel = DBar2;
     4'b0100: Dsel = DBar;
     4'b0000: Dsel = '0;
@@ -77,10 +77,10 @@ module fdivsqrtstage4 (
   // Partial Product Generation
   //  WSA, WCA = WS + WC - qD
   assign AddIn = SqrtM ? F : Dsel;
-  assign CarryIn = ~SqrtM & (u[3] | u[2]); // +1 for 2's complement of -D and -2D 
+  assign CarryIn = ~SqrtM & (udigit[3] | udigit[2]); // +1 for 2's complement of -D and -2D 
   csa #(`DIVb+4) csa(WS, WC, AddIn, CarryIn, WSA, WCA);
  
-  fdivsqrtuotfc4 fdivsqrtuotfc4(.u, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
+  fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
 
   assign un = 0; // unused for radix 4
 endmodule
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
index c3c64bbb..d0524ac8 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
@@ -31,7 +31,7 @@
 `include "wally-config.vh"
 
 module fdivsqrtuotfc4(
-  input  logic [3:0]   u,
+  input  logic [3:0]   udigit,
   input  logic         Sqrt,
   input  logic [`DIVb:0] U, UM,
   input  logic [`DIVb:0] C,
@@ -47,19 +47,19 @@ module fdivsqrtuotfc4(
   assign K3 = (C & ~(C << 2));      // 3K
 
   always_comb begin
-    if (u[3]) begin
+    if (udigit[3]) begin
       UNext  = U | K2;
       UMNext = U | K1;
-    end else if (u[2]) begin
+    end else if (udigit[2]) begin
       UNext  = U | K1;
       UMNext = U;
-    end else if (u[1]) begin
+    end else if (udigit[1]) begin
       UNext  = UM | K3;
       UMNext = UM | K2;
-    end else if (u[0]) begin
+    end else if (udigit[0]) begin
       UNext  = UM | K2;
       UMNext = UM | K1;
-    end else begin        // digit = 0
+    end else begin        // udigit = 0
       UNext  = U;
       UMNext = UM | K3;
     end 
diff --git a/pipelined/src/fpu/postproc/divshiftcalc.sv b/pipelined/src/fpu/postproc/divshiftcalc.sv
index 2b1128ea..cb671a80 100644
--- a/pipelined/src/fpu/postproc/divshiftcalc.sv
+++ b/pipelined/src/fpu/postproc/divshiftcalc.sv
@@ -73,8 +73,10 @@ module divshiftcalc(
     assign DivDenormShiftAmt = DivDenormShiftPos ? DivDenormShift[`LOGNORMSHIFTSZ-1:0] : '0;
     assign DivShiftAmt = DivResDenorm ? DivDenormShiftAmt : NormShift;
 
+    // *** explain why radix 4 division needs a left shift by 1
+    // *** can this shift be moved into the shiftcorrection logic?
     if (`RADIX == 4)
-        assign DivShiftIn = {{`NF{1'b0}}, DivQm[`DIVb-1:0], {`NORMSHIFTSZ-`DIVb+2-`NF{1'b0}}};
+        assign DivShiftIn = Sqrt ? {{`NF{1'b0}}, DivQm, {`NORMSHIFTSZ-`DIVb+1-`NF{1'b0}}} : {{`NF{1'b0}}, DivQm[`DIVb-1:0], {`NORMSHIFTSZ-`DIVb+2-`NF{1'b0}}};
     else
         assign DivShiftIn = {{`NF{1'b0}}, DivQm, {`NORMSHIFTSZ-`DIVb+1-`NF{1'b0}}};
 endmodule
diff --git a/pipelined/src/fpu/flags.sv b/pipelined/src/fpu/postproc/flags.sv
similarity index 97%
rename from pipelined/src/fpu/flags.sv
rename to pipelined/src/fpu/postproc/flags.sv
index 952e0c02..c56bc651 100644
--- a/pipelined/src/fpu/flags.sv
+++ b/pipelined/src/fpu/postproc/flags.sv
@@ -128,10 +128,12 @@ module flags(
     //                  |                    |                    |                                      |                     |               and if the input isnt infinity or NaN
     //                  |                    |                    |                                      |                     |               |
     assign Underflow = ((FullRe[`NE+1] | (FullRe == 0) | ((FullRe == 1) & (Me == 0) & ~(UfPlus1&G)))&(R|S|G))&~(InfIn|NaNIn|DivByZero|Invalid);
+   //assign Underflow = ((FullRe[`NE+1] | (FullRe == 0) | ((FullRe == 1) & (Me == 0) & ~(UfPlus1&G)))&(R|S|G))&~(InfIn|NaNIn|DivByZero|Invalid|XZero);
 
     // Set Inexact flag if the res is diffrent from what would be outputed given infinite precision
     //      - Don't set the underflow flag if an underflowed res isn't outputed
     assign FpInexact = (S|G|Overflow|R)&~(InfIn|NaNIn|DivByZero|Invalid);
+    //assign FpInexact = (S|G|Overflow|R)&~(InfIn|NaNIn|DivByZero|Invalid|XZero);
 
     //                  if the res is too small to be represented and not 0
     //                  |                                     and if the res is not invalid (outside the integer bounds)