diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
index 4d7eb3120..4f3dcf6ff 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -43,13 +43,14 @@ module fdivsqrt(
   input  logic FDivStartE, IDivStartE,
   input  logic StallM,
   input  logic StallE,
+  input  logic TrapM,
   input  logic SqrtE, SqrtM,
 	input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
 	input  logic [2:0] 	Funct3E, Funct3M,
 	input  logic MDUE, W64E,
   output logic DivSM,
-  output logic FDivBusyE,
-  output logic DivDone,
+  output logic FDivBusyE, DivStartE, FDivDoneE,
+//  output logic DivDone,
   output logic [`NE+1:0] QeM,
   output logic [`DIVb:0] QmM
 //   output logic [`XLEN-1:0] RemM,
@@ -66,7 +67,6 @@ module fdivsqrt(
   logic SpecialCaseM;
   logic [`DIVBLEN:0] n, m;
   logic OTFCSwap, ALTB, BZero, As;
-  logic DivStartE;
 
   fdivsqrtpreproc fdivsqrtpreproc(
     .clk, .DivStartE, .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
@@ -75,11 +75,11 @@ module fdivsqrt(
     .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E);
   fdivsqrtfsm fdivsqrtfsm(
     .clk, .reset, .FmtE, .XsE, .SqrtE, 
-    .FDivBusyE, .FDivStartE, .IDivStartE, .DivStartE, .StallE, .StallM, .DivDone, .XZeroE, .YZeroE, 
+    .FDivBusyE, .FDivStartE, .IDivStartE, .DivStartE, .FDivDoneE, .StallE, .StallM, .TrapM, /*.DivDone, */ .XZeroE, .YZeroE, 
     .XNaNE, .YNaNE, .MDUE, .n,
     .XInfE, .YInfE, .WZero, .SpecialCaseM);
   fdivsqrtiter fdivsqrtiter(
-    .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, .SqrtM, 
+    .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, // .SqrtM, 
     .X,.Dpreproc, .FirstWS(WS), .FirstWC(WC),
     .DivStartE, .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE, .OTFCSwap,
     .FDivBusyE);
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index e33688500..aa13b7da9 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -42,12 +42,13 @@ module fdivsqrtfsm(
   input  logic SqrtE,
   input  logic StallE,
   input  logic StallM,
+  input  logic TrapM,
   input  logic WZero,
   input  logic MDUE,
   input  logic [`DIVBLEN:0] n,
   output logic DivStartE,
-  output logic DivDone,
-  output logic FDivBusyE,
+//  output logic DivDone,
+  output logic FDivBusyE, FDivDoneE,
   output logic SpecialCaseM
 );
   
@@ -61,8 +62,10 @@ module fdivsqrtfsm(
   // *** start logic is presently in fctl.  Make it look more like integer division start logic
   // DivStartE comes from fctrl, reflecitng the start of floating-point and possibly integer division
   assign DivStartE = (FDivStartE | IDivStartE) & (state == IDLE) & ~StallM;
-  assign DivDone = (state == DONE) | (WZero & (state == BUSY)); // *** used in postprocess.sv and round.sv.  This doesn't seem proper.  They break when removed.
-  assign FDivBusyE = (state == BUSY & ~DivDone); // *** want to add | DivStartE but it creates comb loop
+  assign FDivDoneE = (state == DONE);
+ // assign DivDone = (state == DONE) | (WZero & (state == BUSY)); // *** used in postprocess.sv and round.sv.  This doesn't seem proper.  They break when removed.
+  //assign FDivBusyE = (state == BUSY & ~DivDone); // *** want to add | DivStartE but it creates comb loop
+  assign FDivBusyE = (state == BUSY) | DivStartE; 
 
     // Divider control signals from MDU
   //assign DivBusyE = (state == BUSY) | DivStartE;
@@ -110,6 +113,23 @@ module fdivsqrtfsm(
 
   /* verilator lint_on WIDTH */
 
+  always_ff @(posedge clk) begin // divide/sqrt sequencing FSM: IDLE -> BUSY -> DONE -> IDLE
+      if (reset | TrapM) begin // reset or TrapM forces the FSM back to IDLE
+          state <= #1 IDLE; 
+      end else if (DivStartE) begin 
+          step <= cycles; // load the iteration down-counter
+          if (SpecialCaseE) state <= #1 DONE; // special cases go straight to DONE without iterating
+          else             state <= #1 BUSY;
+      end else if (state == BUSY) begin
+          if (step == 1)  state <= #1 DONE; // final iteration
+          step <= step - 1;
+      end else if (state == DONE) begin // former "| (WZero & (state == BUSY))" term removed: BUSY is always consumed by the branch above, so it could never fire
+        if (StallM) state <= #1 DONE; // remain in DONE while StallM is asserted
+        else        state <= #1 IDLE;
+      end 
+  end
+
+/*
   always_ff @(posedge clk) begin
       if (reset) begin
           state <= #1 IDLE; 
@@ -129,6 +149,6 @@ module fdivsqrtfsm(
           step <= step - 1;
       end 
   end
-
+*/
 
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
index a6c6c8bce..df8dd2c7f 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -37,7 +37,7 @@ module fdivsqrtiter(
   input  logic [`NE-1:0] Xe, Ye,
   input  logic XZeroE, YZeroE, 
   input  logic SqrtE,
-  input  logic SqrtM,
+//  input  logic SqrtM,
   input  logic OTFCSwap,
   input  logic [`DIVb+3:0] X,
   input  logic [`DIVN-2:0] Dpreproc,
@@ -85,8 +85,8 @@ module fdivsqrtiter(
   // Residual WS/SC registers/initializaiton mux
   mux2   #(`DIVb+4) wsmux(WS[`DIVCOPIES], X, DivStartE, WSN);
   mux2   #(`DIVb+4) wcmux(WC[`DIVCOPIES], '0, DivStartE, WCN);
-  flopen   #(`DIVb+4) wsflop(clk, DivStartE|FDivBusyE, WSN, WS[0]);
-  flopen   #(`DIVb+4) wcflop(clk, DivStartE|FDivBusyE, WCN, WC[0]);
+  flopen   #(`DIVb+4) wsflop(clk, FDivBusyE, WSN, WS[0]);
+  flopen   #(`DIVb+4) wcflop(clk, FDivBusyE, WCN, WC[0]);
 
   // UOTFC Result U and UM registers/initialization mux
   // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 for division
@@ -122,13 +122,13 @@ module fdivsqrtiter(
   generate
     for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : iterations
       if (`RADIX == 2) begin: stage
-        fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtM, .OTFCSwap,
+        fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtE, .OTFCSwap,
         .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
         .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end else begin: stage
         logic j1;
         assign j1 = (i == 0 & ~C[0][`DIVb-1]);
-        fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtM, .j1, .OTFCSwap,
+        fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1, .OTFCSwap,
         .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
         .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index a7da01454..9eec65dae 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -134,6 +134,7 @@ module fdivsqrtpostproc(
 
    // division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
   
+  // *** Result is unused right now
   assign Result = ($signed(PreResult) >>> NormShift) + {{(`DIVb+3){1'b0}}, (PostInc & ~RemOp)};
 
   assign PreQmM = NegSticky ? FirstUM : FirstU; // Select U or U-1 depending on negative sticky bit
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
index 8d78ccd5b..b4c2527d3 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@@ -37,7 +37,7 @@ module fdivsqrtstage2 (
   input  logic [`DIVb:0] U, UM,
   input  logic [`DIVb+3:0]  WS, WC,
   input  logic [`DIVb+1:0] C,
-  input  logic SqrtM,
+  input  logic SqrtE,
   input  logic OTFCSwap,
   output logic un,
   output logic [`DIVb+1:0] CNext,
@@ -73,8 +73,8 @@ module fdivsqrtstage2 (
 
   // Partial Product Generation
   //  WSA, WCA = WS + WC - qD
-  assign AddIn = SqrtM ? F : Dsel;
-  csa #(`DIVb+4) csa(WS, WC, AddIn, up&~SqrtM, WSA, WCA);
+  assign AddIn = SqrtE ? F : Dsel;
+  csa #(`DIVb+4) csa(WS, WC, AddIn, up&~SqrtE, WSA, WCA);
   assign WSNext = WSA << 1;
   assign WCNext = WCA << 1;
 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index 92e8f55d4..fb203fd72 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -36,7 +36,7 @@ module fdivsqrtstage4 (
   input  logic [`DIVb:0] U, UM,
   input  logic [`DIVb+3:0]  WS, WC,
   input  logic [`DIVb+1:0] C,
-  input  logic SqrtM, j1, OTFCSwap,
+  input  logic SqrtE, j1, OTFCSwap,
   output logic [`DIVb+1:0] CNext,
   output logic un,
   output logic [`DIVb:0] UNext, UMNext, 
@@ -65,7 +65,7 @@ module fdivsqrtstage4 (
   assign WCmsbs = WC[`DIVb+3:`DIVb-4];
   assign WSmsbs = WS[`DIVb+3:`DIVb-4];
 
-  fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .Sqrt(SqrtM), .j1, .udigit, .OTFCSwap);
+  fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .Sqrt(SqrtE), .j1, .udigit, .OTFCSwap);
   assign un = 1'b0; // unused for radix 4
 
   // F generation logic
@@ -84,8 +84,8 @@ module fdivsqrtstage4 (
 
   // Residual Update
   //  {WS, WC}}Next = (WS + WC - qD or F) << 2
-  assign AddIn = SqrtM ? F : Dsel;
-  assign CarryIn = ~SqrtM & (udigit[3] | udigit[2]); // +1 for 2's complement of -D and -2D 
+  assign AddIn = SqrtE ? F : Dsel;
+  assign CarryIn = ~SqrtE & (udigit[3] | udigit[2]); // +1 for 2's complement of -D and -2D 
   csa #(`DIVb+4) csa(WS, WC, AddIn, CarryIn, WSA, WCA);
   assign WSNext = WSA << 2;
   assign WCNext = WCA << 2;
@@ -94,7 +94,7 @@ module fdivsqrtstage4 (
   assign CNext = {2'b11, C[`DIVb+1:2]};
  
   // On-the-fly converter to accumulate result
-  fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
+  fdivsqrtuotfc4 fdivsqrtuotfc4(.udigit, .Sqrt(SqrtE), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
 endmodule
 
 
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index d0b4aceef..0aa549991 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -38,6 +38,7 @@ module fpu (
    input  logic  [`FLEN-1:0] ReadDataW,  // Read data (from LSU)
    input  logic  [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // Integer input (from IEU)
    input  logic 		        StallE, StallM, StallW, // stall signals (from HZU)
+   input  logic              TrapM,
    input  logic 		        FlushE, FlushM, FlushW, // flush signals (from HZU)
    input  logic  [4:0] 	     RdM, RdW,   // which FP register to write to (from IEU)
    input  logic  [1:0]       STATUS_FS,  // Is floating-point enabled? (From privileged unit)
@@ -129,7 +130,8 @@ module fpu (
    logic [`DIVb:0]      QmM;
    logic [`NE+1:0]      QeE, QeM; 
    logic                DivSE, DivSM;
-   logic                DivDoneM;
+//   logic                DivDoneM;
+   logic                FDivDoneE, DivStartE;
 
    // result and flag signals
    logic [`XLEN-1:0] ClassResE;               // classify result
@@ -149,6 +151,7 @@ module fpu (
    logic [`FLEN-1:0] 	 AlignedSrcAE;                       // align SrcA to the floating point format
    logic [`FLEN-1:0]     BoxedZeroE;                         // Zero value for Z for multiplication, with NaN boxing if needed
    logic [`FLEN-1:0]     BoxedOneE;                         // Zero value for Z for multiplication, with NaN boxing if needed
+   logic             EMRegEn;
 
    // DECODE STAGE
 
@@ -176,7 +179,7 @@ module fpu (
       .a4(RdW), .wd4(FPUResultW),
       .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
 
-   // D/E pipeline registers
+   // D/E pipeline registers  
    flopenrc #(`FLEN) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
    flopenrc #(`FLEN) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
    flopenrc #(`FLEN) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
@@ -263,8 +266,8 @@ module fpu (
    fdivsqrt fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
                   .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
                   .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E,
-                  .StallE, .StallM, .DivSM, .FDivBusyE, .QeM, 
-                  .QmM, .DivDone(DivDoneM));
+                  .StallE, .StallM, .TrapM, .DivSM, .FDivBusyE, .DivStartE, .FDivDoneE, .QeM, 
+                  .QmM /*, .DivDone(DivDoneM) */);
 
                   //
    // compare
@@ -337,15 +340,20 @@ module fpu (
 
    // E/M pipe registers
 
-   // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, XE, FSrcXM);
-   flopenrc #(`NF+2) EMFpReg2 (clk, reset, FlushM, ~StallM, {XsE,XmE}, {XsM,XmM});
-   flopenrc #(`NF+2) EMFpReg3 (clk, reset, FlushM, ~StallM, {YsE,YmE}, {YsM,YmM});
+   assign EMRegEn = ~StallM & (~FDivBusyE & ~FDivDoneE | DivStartE);
+
+   // flopenrc #(64) EMFpReg1(clk, reset, FlushM, EMRegEn, XE, FSrcXM);
+   flopenrc #(`NF+1) EMFpReg2 (clk, reset, FlushM, ~StallM, XmE, XmM);
+   flopenrc #(`NF+1) EMFpReg3 (clk, reset, FlushM, ~StallM, YmE, YmM);
    flopenrc #(`FLEN) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZeE,ZmE}, {ZeM,ZmM});
    flopenrc #(`XLEN) EMFpReg6 (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
    flopenrc #(`FLEN) EMFpReg7 (clk, reset, FlushM, ~StallM, PreFpResE, PreFpResM);
-   flopenrc #(13) EMFpReg5 (clk, reset, FlushM, ~StallM, 
+   flopenr #(15) EMFpReg5 (clk, reset, EMRegEn, 
+            {XsE, YsE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE, ZDenormE},
+            {XsM, YsM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM, ZDenormM});     
+   /* flopenrc #(13) EMFpReg5 (clk, reset, FlushM, ~StallM, 
             {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE, ZDenormE},
-            {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM, ZDenormM});     
+            {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM, ZDenormM});   */   
    flopenrc #(1)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, PreNVE, PreNVM);      
    flopenrc #(3*`NF+6) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM); 
    flopenrc #(`NE+2) EMRegFma3(clk, reset, FlushM, ~StallM, PeE, PeM);  
@@ -372,7 +380,7 @@ module fpu (
 
    postprocess postprocess(.Xs(XsM), .Ys(YsM), .Ze(ZeM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), .FmaPe(PeM), 
                            .FmaZmS(ZmStickyM), .FmaKillProd(KillProdM), .XZero(XZeroM), .YZero(YZeroM), .ZZero(ZZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
-                           .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SmM), .DivQe(QeM), .DivDone(DivDoneM),
+                           .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SmM), .DivQe(QeM), /*.DivDone(DivDoneM), */
                            .FmaNegSum(NegSumM), .FmaInvA(InvAM), .ZDenorm(ZDenormM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
                            .CvtCe(CeM), .CvtResDenormUf(CvtResDenormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), .ToInt(FWriteIntM), .DivS(DivSM),
                            .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));
diff --git a/pipelined/src/fpu/postproc/postprocess.sv b/pipelined/src/fpu/postproc/postprocess.sv
index 24365cf96..bcac6de59 100644
--- a/pipelined/src/fpu/postproc/postprocess.sv
+++ b/pipelined/src/fpu/postproc/postprocess.sv
@@ -58,7 +58,7 @@ module postprocess (
     input logic  [$clog2(3*`NF+7)-1:0]      FmaSCnt,   // the normalization shift count
     //divide signals
     input logic                             DivS,
-    input logic                             DivDone,
+//    input logic                             DivDone,
     input logic  [`NE+1:0]                  DivQe,
     input logic  [`DIVb:0]                  DivQm,
     // conversion signals
@@ -129,7 +129,7 @@ module postprocess (
     assign Mult = OpCtrl[2]&~OpCtrl[1]&~OpCtrl[0];
     assign CvtOp = (PostProcSel == 2'b00);
     assign FmaOp = (PostProcSel == 2'b10);
-    assign DivOp = (PostProcSel == 2'b01) & DivDone;
+    assign DivOp = (PostProcSel == 2'b01); // & DivDone;
     assign Sqrt =  OpCtrl[0];
 
     // is there an input of infinity or NaN being used
@@ -165,13 +165,13 @@ module postprocess (
                 ShiftIn =  {CvtShiftIn, {`NORMSHIFTSZ-`CVTLEN-`NF-1{1'b0}}};
             end
             2'b01: begin //div
-                if(DivDone) begin
+               /* if(DivDone) begin */
                     ShiftAmt = DivShiftAmt;
                     ShiftIn =  DivShiftIn;
-                end else begin
+              /*  end else begin
                     ShiftAmt = '0;
                     ShiftIn =  '0;
-                end
+                end */
             end
             default: begin 
                 ShiftAmt = {`LOGNORMSHIFTSZ{1'bx}}; 
@@ -201,7 +201,7 @@ module postprocess (
 
     round round(.OutFmt, .Frm, .FmaZmS, .Plus1, .PostProcSel, .CvtCe, .Qe,
                 .Ms, .FmaMe, .FmaOp, .CvtOp, .CvtResDenormUf, .Mf, .ToInt,  .CvtResUf,
-                .DivS, .DivDone,
+                .DivS, //.DivDone,
                 .DivOp, .UfPlus1, .FullRe, .Rf, .Re, .S, .R, .G, .Me);
 
     ///////////////////////////////////////////////////////////////////////////////
diff --git a/pipelined/src/fpu/postproc/round.sv b/pipelined/src/fpu/postproc/round.sv
index e4450325e..c9e2b94e4 100644
--- a/pipelined/src/fpu/postproc/round.sv
+++ b/pipelined/src/fpu/postproc/round.sv
@@ -43,7 +43,7 @@ module round(
     input logic                     DivOp,
     input logic                     CvtOp,
     input logic                     ToInt,
-    input logic                     DivDone,
+//    input logic                     DivDone,
     input logic  [1:0]              PostProcSel,
     input logic                     CvtResDenormUf,
     input logic                     CvtResUf,
@@ -295,7 +295,8 @@ module round(
         case(PostProcSel)
             2'b10: Me = FmaMe; // fma
             2'b00: Me = {CvtCe[`NE], CvtCe}&{`NE+2{~CvtResDenormUf|CvtResUf}}; // cvt
-            2'b01: Me = DivDone ? Qe : '0; // divide
+            // 2'b01: Me = DivDone ? Qe : '0; // divide
+            2'b01: Me = Qe; // divide
             default: Me = '0; 
         endcase
 
diff --git a/pipelined/src/hazard/hazard.sv b/pipelined/src/hazard/hazard.sv
index b95b7a375..7d381f7c3 100644
--- a/pipelined/src/hazard/hazard.sv
+++ b/pipelined/src/hazard/hazard.sv
@@ -65,10 +65,10 @@ module hazard(
   assign StallFCause = CSRWriteFencePendingDEM & ~(TrapM | RetM | BPPredWrongE);
   // stall in decode if instruction is a load/mul/csr dependent on previous
   assign StallDCause = (LoadStallD | StoreStallD | MDUStallD | CSRRdStallD | FPUStallD | FStallD) & ~(TrapM | RetM | BPPredWrongE);    
-  assign StallECause = (DivBusyE) & ~(TrapM);  // *** can we move to decode stage (KP?)
+  assign StallECause = (DivBusyE | FDivBusyE) & ~(TrapM);  // *** can we move to decode stage (KP?)
   // WFI terminates if any enabled interrupt is pending, even if global interrupts are disabled.  It could also terminate with TW trap
   assign StallMCause = ((wfiM) & (~TrapM & ~IntPendingM)); 
-  assign StallWCause = ((IFUStallF | LSUStallM) & ~TrapM) | (FDivBusyE & ~TrapM & ~IntPendingM);
+  assign StallWCause = ((IFUStallF | LSUStallM) & ~TrapM); // | (FDivBusyE & ~TrapM & ~IntPendingM);
   // head version
   // assign StallWCause = LSUStallM | IFUStallF  | (FDivBusyE & ~TrapM & ~IntPendingM); // *** FDivBusyE should look like DivBusyE  
 //  assign StallMCause = (wfiM & (~TrapM & ~IntPendingM)); // | FDivBusyE;  
diff --git a/pipelined/src/wally/wallypipelinedcore.sv b/pipelined/src/wally/wallypipelinedcore.sv
index 9092d179d..ba9753950 100644
--- a/pipelined/src/wally/wallypipelinedcore.sv
+++ b/pipelined/src/wally/wallypipelinedcore.sv
@@ -389,6 +389,7 @@ module wallypipelinedcore (
          .ReadDataW(ReadDataW[`FLEN-1:0]),// Read data from memory
          .ForwardedSrcAE, // Integer input being processed (from IEU)
          .StallE, .StallM, .StallW, // stall signals from HZU
+         .TrapM,
          .FlushE, .FlushM, .FlushW, // flush signals from HZU
          .RdM, .RdW, // which FP register to write to (from IEU)
          .STATUS_FS, // is floating-point enabled?
diff --git a/pipelined/testbench/tests.vh b/pipelined/testbench/tests.vh
index d8f77ed61..faf9dd8f8 100644
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
@@ -136,7 +136,6 @@ string tvpaths[] = '{
   
 string imperas32f[] = '{
     `IMPERASTEST,
-    "rv32i_m/F/FDIV-S-DYN-RDN-01",
     "rv32i_m/F/FADD-S-DYN-RDN-01",
     "rv32i_m/F/FADD-S-DYN-RMM-01",
     "rv32i_m/F/FADD-S-DYN-RNE-01",