diff --git a/wally-pipelined/regression/wave-dos/peripheral-waves.do b/wally-pipelined/regression/wave-dos/peripheral-waves.do
index 0203836e..594d572a 100644
--- a/wally-pipelined/regression/wave-dos/peripheral-waves.do
+++ b/wally-pipelined/regression/wave-dos/peripheral-waves.do
@@ -35,7 +35,7 @@ add wave -hex /testbench/dut/hart/ieu/dp/SrcAE
 add wave -hex /testbench/dut/hart/ieu/dp/SrcBE
 add wave -hex /testbench/dut/hart/ieu/dp/ALUResultE
 #add wave /testbench/dut/hart/ieu/dp/PCSrcE
-add wave /testbench/dut/hart/mdu/genblk1/div/StartDivideE
+add wave /testbench/dut/hart/mdu/genblk1/div/DivStartE
 add wave /testbench/dut/hart/mdu/DivBusyE
 add wave -hex /testbench/dut/hart/mdu/genblk1/div/RemM
 add wave -hex /testbench/dut/hart/mdu/genblk1/div/QuotM
diff --git a/wally-pipelined/src/muldiv/intdivrestoring.sv b/wally-pipelined/src/muldiv/intdivrestoring.sv
index 64a65b0a..4ca12f4b 100644
--- a/wally-pipelined/src/muldiv/intdivrestoring.sv
+++ b/wally-pipelined/src/muldiv/intdivrestoring.sv
@@ -40,26 +40,28 @@ module intdivrestoring (
 
   logic [`XLEN-1:0] WE[`DIV_BITSPERCYCLE:0];
   logic [`XLEN-1:0] XQE[`DIV_BITSPERCYCLE:0];
-  logic [`XLEN-1:0] DSavedE, XSavedE, XSavedM, DinE, XinE, DnE, DAbsBE, XnE, XInitE, WM, XQM, WnM, XQnM;
+  logic [`XLEN-1:0] DSavedE, XSavedE, XSavedM, DinE, XinE, DnE, DAbsBE, DAbsBM, XnE, XInitE, WM, XQM, WnM, XQnM;
   localparam STEPBITS = $clog2(`XLEN/`DIV_BITSPERCYCLE);
   logic [STEPBITS:0] step;
   logic Div0E, Div0M;
   logic DivInitE, SignXE, SignXM, SignDE, SignDM, NegWM, NegQM;
+
+  logic [`XLEN-1:0] WNextE, XQNextE;
  
   // save inputs on the negative edge of the execute clock.  
   // This is unusual practice, but the inputs are not guaranteed to be stable due to some hazard and forwarding logic.
   // Saving the inputs is the most hardware-efficient way to fix the issue.
-  flopen #(`XLEN) xsavereg(~clk, DivStartE, SrcAE, XSavedE);
-  flopen #(`XLEN) dsavereg(~clk, DivStartE, SrcBE, DSavedE); 
+  //flopen #(`XLEN) xsavereg(~clk, DivStartE, SrcAE, XSavedE);
+ // flopen #(`XLEN) dsavereg(~clk, DivStartE, SrcBE, DSavedE); 
 
   // Handle sign extension for W-type instructions
   generate
     if (`XLEN == 64) begin // RV64 has W-type instructions
-      mux2 #(`XLEN) xinmux(XSavedE, {XSavedE[31:0], 32'b0}, W64E, XinE);
-      mux2 #(`XLEN) dinmux(DSavedE, {{32{DSavedE[31]&DivSignedE}}, DSavedE[31:0]}, W64E, DinE);
+      mux2 #(`XLEN) xinmux(SrcAE, {SrcAE[31:0], 32'b0}, W64E, XinE);
+      mux2 #(`XLEN) dinmux(SrcBE, {{32{SrcBE[31]&DivSignedE}}, SrcBE[31:0]}, W64E, DinE);
 	end else begin // RV32 has no W-type instructions
-      assign XinE = XSavedE;
-      assign DinE = DSavedE;	    
+      assign XinE = SrcAE;
+      assign DinE = SrcBE;	    
     end   
   endgenerate 
 
@@ -69,10 +71,9 @@ module intdivrestoring (
   assign Div0E = (DinE == 0);
 
   // pipeline registers
-  flopenrc #(1) Div0eMReg(clk, reset, FlushM, ~StallM, Div0E, Div0M);
-  flopenrc #(1) SignDMReg(clk, reset, FlushM, ~StallM, SignDE, SignDM);
-  flopenrc #(1) SignXMReg(clk, reset, FlushM, ~StallM, SignXE, SignXM);
-  flopenrc #(`XLEN) XSavedMReg(clk, reset, FlushM, ~StallM, XSavedE, XSavedM); // is this truly necessary?
+  flopen #(1) Div0eMReg(clk, DivStartE, Div0E, Div0M);
+  flopen #(1) SignDMReg(clk, DivStartE, SignDE, SignDM);
+  flopen #(1) SignXMReg(clk, DivStartE, SignXE, SignXM);
 
   // Take absolute value for signed operations, and negate D to handle subtraction in divider stages
   neg #(`XLEN) negd(DinE, DnE);
@@ -81,19 +82,25 @@ module intdivrestoring (
   mux2 #(`XLEN) xabsmux(XinE, XnE, SignXE, XInitE);  // need original X as remainder if doing divide by 0
 
   // initialization multiplexers on first cycle of operation (one cycle after start is asserted)
-  mux2 #(`XLEN) wmux(WM, {`XLEN{1'b0}}, DivInitE, WE[0]);
-  mux2 #(`XLEN) xmux(XQM, XInitE, DivInitE, XQE[0]);
+  mux2 #(`XLEN) wmux(WE[`DIV_BITSPERCYCLE], {`XLEN{1'b0}}, DivStartE, WNextE);
+  mux2 #(`XLEN) xmux(XQE[`DIV_BITSPERCYCLE], XInitE, DivStartE, XQNextE);
 
+  // registers before division steps
+  // *** maybe change this stuff to M stage
+  flopen #(`XLEN) dabsreg(clk, DivStartE, DAbsBE, DAbsBM);
+  flopen #(`XLEN) wreg(clk, BusyE | DivStartE, WNextE, WE[0]); // *** merge Busy and start without combinational loop
+  flopen #(`XLEN) xreg(clk, BusyE | DivStartE, XQNextE, XQE[0]);
+  flopen #(`XLEN) XSavedMReg(clk, DivStartE, SrcAE, XSavedM); 
+  
   // one copy of divstep for each bit produced per cycle
   generate
       genvar i;
       for (i=0; i<`DIV_BITSPERCYCLE; i = i+1)
-        intdivrestoringstep divstep(WE[i], XQE[i], DAbsBE, WE[i+1], XQE[i+1]);
+        intdivrestoringstep divstep(WE[i], XQE[i], DAbsBM, WE[i+1], XQE[i+1]);
   endgenerate
 
-  // registers after division steps
-  flopen #(`XLEN) wreg(clk, BusyE, WE[`DIV_BITSPERCYCLE], WM); 
-  flopen #(`XLEN) xreg(clk, BusyE, XQE[`DIV_BITSPERCYCLE], XQM);
+  assign WM = WE[0];
+  assign XQM = XQE[0];
 
   // Output selection logic in Memory Stage
   // On final setp of signed operations, negate outputs as needed
@@ -112,7 +119,7 @@ module intdivrestoring (
     end else if (DivStartE & ~StallM) begin 
         if (Div0E) DivDoneM = 1;
         else begin
-            BusyE = 1; step = 0; DivInitE = 1;
+            BusyE = 1; step = 0; DivInitE = 1; // *** can drop DivInit
         end
     end else if (BusyE & ~DivDoneM) begin // pause one cycle at beginning of signed operations for absolute value
         DivInitE = 0;
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index c38a6ce0..09f1547f 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -58,9 +58,10 @@ module muldiv (
 
 	 // Divide
 	 // Start a divide when a new division instruction is received and the divider isn't already busy or finishing
+	 assign DivE = MulDivE & Funct3E[2];
 	 assign DivStartE = MulDivE & Funct3E[2] & ~BusyE & ~DivDoneM; 
-	 assign DivBusyE = DivStartE | BusyE;
 	 assign DivSignedE = ~Funct3E[0];
+	 assign DivBusyE = BusyE | DivStartE;
 	 intdivrestoring div(.clk, .reset, .StallM, .FlushM, 
 	   .DivSignedE, .W64E, .DivStartE, .SrcAE, .SrcBE, .BusyE, .DivDoneM, .QuotM, .RemM);