diff --git a/pipelined/src/fpu/fctrl.sv b/pipelined/src/fpu/fctrl.sv
index 8f10611af..dd291ecea 100755
--- a/pipelined/src/fpu/fctrl.sv
+++ b/pipelined/src/fpu/fctrl.sv
@@ -38,7 +38,8 @@ module fctrl (
   input  logic [6:0] Funct7D,   // bits 31:25 of instruction - may contain percision
   input  logic [6:0] OpD,       // bits 6:0 of instruction
   input  logic [4:0] Rs2D,      // bits 24:20 of instruction
-  input  logic [2:0] Funct3D,   // bits 14:12 of instruction - may contain rounding mode
+  input  logic [2:0] Funct3D, Funct3E,   // bits 14:12 of instruction - may contain rounding mode
+  input  logic       MDUE,
   input  logic [2:0] FRM_REGW,  // rounding mode from CSR
   input  logic [1:0] STATUS_FS, // is FPU enabled?
   input  logic       FDivBusyE,  // is the divider busy
@@ -61,7 +62,7 @@ module fctrl (
   logic [`FCTRLW-1:0] ControlsD;
   logic       IllegalFPUInstrD, IllegalFPUInstrE;
   logic 		  FRegWriteD; // FP register write enable
-  logic 		  DivStartD; // integer register write enable
+  logic 		  FDivStartD, FDivStartE, IDivStartE; // start strobes: FP divide/sqrt (D and E stages), integer divide (E stage)
   logic 		  FWriteIntD; // integer register write enable
   logic 		         FRegWriteE; // FP register write enable
   logic [2:0] 	      OpCtrlD;       // Select which opperation to do in each component
@@ -169,7 +170,7 @@ module fctrl (
     endcase
 
   // unswizzle control bits
-  assign {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, DivStartD, IllegalFPUInstrD} = ControlsD;
+  assign {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD} = ControlsD;
   
   // rounding modes:
   //    000 - round to nearest, ties to even
@@ -264,7 +265,12 @@ module fctrl (
               {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, IllegalFPUInstrE});
   flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
                            {Adr1E, Adr2E, Adr3E});
-  flopenrc #(1) DEDivStartReg(clk, reset, FlushE, ~StallE|FDivBusyE, DivStartD, DivStartE);
+  flopenrc #(1) DEFDivStartReg(clk, reset, FlushE, ~StallE|FDivBusyE, FDivStartD, FDivStartE);
+  if (`M_SUPPORTED) begin
+    assign IDivStartE = MDUE & Funct3E[2];
+    assign DivStartE = FDivStartE | IDivStartE; // integer or floating-point division
+  end else assign DivStartE = FDivStartE;
+
   assign FCvtIntE = (FResSelE == 2'b01);
 
   // E/M pipleine register
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
index 3f6426a2a..19679aa55 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -67,17 +67,18 @@ module fdivsqrt(
   logic SpecialCaseM;
 
   fdivsqrtpreproc fdivsqrtpreproc(
-    .clk, .DivStart(DivStartE), .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
-    .Sqrt(SqrtE), .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc);
+    .clk, .DivStartE, .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
+    .Sqrt(SqrtE), .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, 
+    .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E);
   fdivsqrtfsm fdivsqrtfsm(
     .clk, .reset, .FmtE, .XsE, .SqrtE, 
-    .DivBusy, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, 
+    .DivBusy, .DivStartE,.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, 
     .XNaNE, .YNaNE,
     .XInfE, .YInfE, .WZero, .SpecialCaseM);
   fdivsqrtiter fdivsqrtiter(
     .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, .SqrtM, 
     .X,.Dpreproc, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, 
-    .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
+    .DivStartE, .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
     .DivBusy);
   fdivsqrtpostproc fdivsqrtpostproc(.WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .Firstun, .SqrtM, .SpecialCaseM, .QmM, .WZero, .DivSM);
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index db11dcefd..8dc188c6b 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -37,7 +37,7 @@ module fdivsqrtfsm(
   input  logic XInfE, YInfE, 
   input  logic XZeroE, YZeroE, 
   input  logic XNaNE, YNaNE, 
-  input  logic DivStart, 
+  input  logic DivStartE, 
   input  logic XsE,
   input  logic SqrtE,
   input  logic StallE,
@@ -101,8 +101,8 @@ module fdivsqrtfsm(
   always_ff @(posedge clk) begin
       if (reset) begin
           state <= #1 IDLE; 
-      end else if (DivStart&~StallE) begin 
-          step <= cycles; // *** this should be adjusted to depend on the precision; sqrt should use one fewer step becasue firststep=1
+      end else if (DivStartE&~StallE) begin 
+          step <= cycles; 
 //          $display("Setting Nf = %d fbits %d cycles = %d FmtE %d FPSIZES = %d Q_NF = %d num = %d denom = %d\n", Nf, fbits, cycles, FmtE, `FPSIZES, `Q_NF,
 //          (fbits +(`LOGR*`DIVCOPIES)-1), (`LOGR*`DIVCOPIES));
           if (SpecialCaseE) state <= #1 DONE;
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
index b0beae6d9..d13d706f4 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -32,7 +32,7 @@
 
 module fdivsqrtiter(
   input  logic clk,
-  input  logic DivStart, 
+  input  logic DivStartE, 
   input  logic DivBusy, 
   input  logic [`NE-1:0] Xe, Ye,
   input  logic XZeroE, YZeroE, 
@@ -90,19 +90,19 @@ module fdivsqrtiter(
 
   // Initialize C to -1 for sqrt and -R for division
   logic [1:0] initCSqrt, initCDiv2, initCDiv4, initCUpper;
-  assign initCSqrt = 2'b11;
-  assign initCDiv2 = 2'b10;
-  assign initCDiv4 = 2'b00; // *** not sure why this works; seems like it should be 00 for initializing to -4
+  assign initCSqrt = 2'b11; // -1
+  assign initCDiv2 = 2'b10; // -2
+  assign initCDiv4 = 2'b00; // -4 (not representable in 2 bits; truncates to 00 - TODO confirm)
   assign initCUpper = SqrtE ? initCSqrt : (`RADIX == 4) ? initCDiv4 : initCDiv2;
   assign initC = {initCUpper, {`DIVb{1'b0}}};
 
-  mux2   #(`DIVb+4) wsmux(NextWSN, X, DivStart, WSN);
-  flopen   #(`DIVb+4) wsflop(clk, DivStart|DivBusy, WSN, WS[0]);
-  mux2   #(`DIVb+4) wcmux(NextWCN, '0, DivStart, WCN);
-  flopen   #(`DIVb+4) wcflop(clk, DivStart|DivBusy, WCN, WC[0]);
-  flopen #(`DIVN-1) dflop(clk, DivStart, Dpreproc, D);
-  mux2 #(`DIVb+2) Cmux(C[`DIVCOPIES], initC, DivStart, CMux); 
-  flopen #(`DIVb+2) cflop(clk, DivStart|DivBusy, CMux, C[0]);
+  mux2   #(`DIVb+4) wsmux(NextWSN, X, DivStartE, WSN);
+  flopen   #(`DIVb+4) wsflop(clk, DivStartE|DivBusy, WSN, WS[0]);
+  mux2   #(`DIVb+4) wcmux(NextWCN, '0, DivStartE, WCN);
+  flopen   #(`DIVb+4) wcflop(clk, DivStartE|DivBusy, WCN, WC[0]);
+  flopen #(`DIVN-1) dflop(clk, DivStartE, Dpreproc, D);
+  mux2 #(`DIVb+2) Cmux(C[`DIVCOPIES], initC, DivStartE, CMux); 
+  flopen #(`DIVb+2) cflop(clk, DivStartE|DivBusy, CMux, C[0]);
 
   // Divisor Selections
   //  - choose the negitive version of what's being selected
@@ -139,10 +139,10 @@ module fdivsqrtiter(
   // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 for division
   assign initU = SqrtE ? {1'b1, {(`DIVb){1'b0}}} : 0;
   assign initUM = SqrtE ? 0 : {1'b1, {(`DIVb){1'b0}}}; 
-  mux2 #(`DIVb+1) Umux(UNext[`DIVCOPIES-1], initU, DivStart, UMux);
-  mux2 #(`DIVb+1) UMmux(UMNext[`DIVCOPIES-1], initUM, DivStart, UMMux);
-  flopen #(`DIVb+1) UReg(clk, DivStart|DivBusy, UMux, U[0]);
-  flopen #(`DIVb+1) UMReg(clk, DivStart|DivBusy, UMMux, UM[0]);
+  mux2 #(`DIVb+1) Umux(UNext[`DIVCOPIES-1], initU, DivStartE, UMux);
+  mux2 #(`DIVb+1) UMmux(UMNext[`DIVCOPIES-1], initUM, DivStartE, UMMux);
+  flopen #(`DIVb+1) UReg(clk, DivStartE|DivBusy, UMux, U[0]);
+  flopen #(`DIVb+1) UMReg(clk, DivStartE|DivBusy, UMMux, UM[0]);
   
   assign FirstWS = WS[0];
   assign FirstWC = WC[0];
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 2a6f6a9e2..d1f9b93ba 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -32,12 +32,15 @@
 
 module fdivsqrtpreproc (
   input  logic clk,
-  input  logic DivStart, 
+  input  logic DivStartE, 
   input  logic [`NF:0] Xm, Ym,
   input  logic [`NE-1:0] Xe, Ye,
   input  logic [`FMTBITS-1:0] Fmt,
   input  logic Sqrt,
   input logic XZero,
+  input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+	input  logic [2:0] 	Funct3E, Funct3M,
+	input  logic MDUE, W64E,
   output logic  [`NE+1:0] QeM,
   output logic [`DIVb+3:0] X,
   output logic [`DIVN-2:0] Dpreproc
@@ -76,7 +79,7 @@ module fdivsqrtpreproc (
   // DIVRESLEN = DIVLEN or DIVLEN+2
   // r = 1 or 2
   // DIVRESLEN/(r*`DIVCOPIES)
-  flopen #(`NE+2) expflop(clk, DivStart, Qe, QeM);
+  flopen #(`NE+2) expflop(clk, DivStartE, Qe, QeM);
   expcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZero, .XZeroCnt, .YZeroCnt, .Qe);
 
 endmodule
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index 459b891d0..dcc0db6d5 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -163,7 +163,8 @@ module fpu (
    //////////////////////////////////////////////////////////////////////////////////////////
 
    // calculate FP control signals
-   fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .InstrD,
+   fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
+               .Funct3E, .MDUE, .InstrD,
                .StallE, .StallM, .StallW, .FlushE, .FlushM, .FlushW, .FRM_REGW, .STATUS_FS, .FDivBusyE,
                .reset, .clk, .FRegWriteM, .FRegWriteW, .FrmM, .FmtE, .FmtM, .YEnForwardE, .ZEnForwardE,
                .DivStartE, .FWriteIntE, .FCvtIntE, .FWriteIntM, .OpCtrlE, .OpCtrlM, .IllegalFPUInstrM, .XEnE, .YEnE, .ZEnE,