diff --git a/wally-pipelined/regression/sim-fp64 b/wally-pipelined/regression/sim-fp64
new file mode 100755
index 00000000..b6b8ba5c
--- /dev/null
+++ b/wally-pipelined/regression/sim-fp64
@@ -0,0 +1 @@
+vsim -do wally-fp64.do
diff --git a/wally-pipelined/regression/sim-fp64-batch b/wally-pipelined/regression/sim-fp64-batch
new file mode 100755
index 00000000..693bfeb2
--- /dev/null
+++ b/wally-pipelined/regression/sim-fp64-batch
@@ -0,0 +1,3 @@
+vsim -c <<!
+do wally-fp64-batch.do rv64g imperas64d
+!
diff --git a/wally-pipelined/regression/wally-fp64-batch.do b/wally-pipelined/regression/wally-fp64-batch.do
new file mode 100644
index 00000000..33398dc6
--- /dev/null
+++ b/wally-pipelined/regression/wally-fp64-batch.do
@@ -0,0 +1,50 @@
+# wally-fp64-batch.do
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Takes 1:10 to run RV64IC tests using gui
+
+# Usage: do wally-fp64-batch.do <config> <testcases>
+# Example: do wally-fp64-batch.do rv64g imperas64d
+
+# Use this wally-fp64-batch.do file to run the fp64 divide/sqrt testbench (testbench-f64.sv).
+# Both arguments are required.  Either bring up ModelSim and type at the "ModelSim>" prompt:
+#     do wally-fp64-batch.do rv64g imperas64d
+# or, to run from a shell, use the sim-fp64-batch script, which feeds the same "do" command
+# to "vsim -c" on standard input
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# start from an empty per-config/per-test library: delete any existing one, then recreate it
+if {[file exists work_${1}_${2}]} {
+    vdel -lib work_${1}_${2} -all
+}
+vlib work_${1}_${2}
+
+# compile source files
+# suppress spurious warnings about
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# the config directory (../config/<config>) and the TEST name are both taken from the macro arguments.  For example:
+# do wally-fp64-batch.do rv64g imperas64d
+vlog -work work_${1}_${2} +incdir+../config/$1 +incdir+../config/shared ../testbench/testbench-f64.sv ../testbench/common/*.sv   ../src/*/*.sv -suppress 2583
+
+# start and run simulation
+# +acc is not passed to vopt here, for faster regressions; add it if internal signals need to be accessed
+vopt work_${1}_${2}.testbench -work work_${1}_${2} -G TEST=$2 -o testbenchopt
+vsim -lib work_${1}_${2} testbenchopt 
+# Adding coverage increases runtime from 2:00 to 4:29.  Can't run it all the time
+#vopt work_$2.testbench -work work_$2 -o workopt_$2 +cover=sbectf
+#vsim -coverage -lib work_$2 workopt_$2
+
+run -all
+#coverage report -file wally-pipelined-coverage.txt
+# These aren't doing anything helpful
+#coverage report -memory 
+#profile report -calltree -file wally-pipelined-calltree.rpt -cutoff 2
+quit
diff --git a/wally-pipelined/regression/wally-fp64.do b/wally-pipelined/regression/wally-fp64.do
new file mode 100644
index 00000000..c131ff16
--- /dev/null
+++ b/wally-pipelined/regression/wally-fp64.do
@@ -0,0 +1,54 @@
+# wally-fp64.do
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Takes 1:10 to run RV64IC tests using gui
+
+# run with: vsim -do wally-fp64.do   (this is what the sim-fp64 script does)
+
+# Use this wally-fp64.do file to run the fp64 divide/sqrt testbench (testbench-f64.sv).
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do wally-fp64.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do wally-fp64.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# start from an empty "work" library: delete any existing one, then recreate it
+if {[file exists work]} {
+    vdel -all
+}
+vlib work
+
+# compile source files
+# suppress spurious warnings about
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# the config is hard-coded to ../config/rv64g in the vlog command below; the commented-out switch
+# shows how it could instead be taken from the command line, e.g. do wally-fp64.do ../config/rv32ic
+#switch $argc {
+#    0 {vlog +incdir+../config/rv64ic +incdir+../config/shared ../testbench/testbench.sv ../testbench/common/*.sv ../src/*/*.sv -suppress 2583}
+#    1 {vlog +incdir+$1  +incdir+../config/shared ../testbench/testbench.sv ../testbench/common/*.sv ../src/*/*.sv -suppress 2583}
+#}
+# start and run simulation
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+vlog +incdir+../config/rv64g +incdir+../config/shared ../testbench/testbench-f64.sv ../testbench/common/*.sv   ../src/*/*.sv -suppress 2583
+vopt +acc work.testbench -G TEST=imperas64d -o workopt
+vsim workopt
+
+view wave
+# display input and output signals as hexadecimal values
+do ./wave-dos/generic.do
+
+# Run the simulation to completion
+#run 3600
+run -all
+#quit
+# close the source window of the testbench compiled above, then bring the waveform back
+noview ../testbench/testbench-f64.sv
+view wave
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index 7ca34f50..8258b9c6 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -191,33 +191,20 @@ module fpu (
 	      .FmtE, .FmtM, .FrmM, 
 	      .FMAFlgM, .FMAResM);
      
-     // clock gater
-     //    - creates a clock that only runs durring divide/sqrt instructions
-     //    - using the seperate clock gives the divide/sqrt unit some to get set up
-     // *** the module says not to use in synthisis
-     clockgater fpdivclkg(.E(FDivStartE),
-			  .SE(1'b0),
-			  .CLK(clk),
-			  .ECLK(FDivClk));
-     
      // capture the inputs for divide/sqrt
-     //    - if not captured any forwarded inputs will change durring computation
-     //        - this problem is caused by stalling the execute stage
-     //    - the other units don't have this problem, only div/sqrt stalls the execute stage
      floprc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
-				.clear(FDivSqrtDoneE),
-				.reset(reset),  .clk(FDivBusyE));
+			      .clear(FDivSqrtDoneE),
+			      .reset(reset),  .clk(FDivBusyE));
      floprc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
-				.clear(FDivSqrtDoneE),
-				.reset(reset),  .clk(FDivBusyE));
+			      .clear(FDivSqrtDoneE),
+			      .reset(reset),  .clk(FDivBusyE));
      floprc #(6) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE}), 
-				.q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ}),
-			   .clear(FDivSqrtDoneE),
-				.reset(reset),  .clk(FDivBusyE));
-            
-      // fpdivsqrt using Goldschmidt's iteration
-      fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
-		      .reset, .clk(FDivClk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
+			     .q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ}),
+			     .clear(FDivSqrtDoneE),
+			     .reset(reset),  .clk(FDivBusyE));            
+     // fpdivsqrt using Goldschmidt's iteration
+     fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
+		      .reset, .clk(clk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
 		      .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ,
 		      .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
 
diff --git a/wally-pipelined/src/fpu/fsm.sv b/wally-pipelined/src/fpu/fsm.sv
index a0e874bc..9b0e18a7 100755
--- a/wally-pipelined/src/fpu/fsm.sv
+++ b/wally-pipelined/src/fpu/fsm.sv
@@ -47,7 +47,7 @@ module fsm (
    
    statetype current_state, next_state;
    
-   always @(negedge clk)
+   always @(posedge clk)
      begin
 	if (reset == 1'b1)
 	  current_state = S0;
@@ -269,8 +269,23 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
+	       next_state = S11;
+	    end // case: S10
+	  S11:  // extra cycle after S10 with done and divBusy low, then back to S0
+	    begin
+	       done = 1'b0;
+	       divBusy = 1'b0;
+	       load_rega = 1'b0;
+	       load_regb = 1'b0;
+	       load_regc = 1'b0;
+	       load_regd = 1'b0;	       
+	       load_regr = 1'b0;
+	       load_regs = 1'b0;		    	       
+	       sel_muxa = 3'b000;
+	       sel_muxb = 3'b000;
+	       sel_muxr = 1'b0;
 	       next_state = S0;
-	    end 
+	    end 	  
 	  S13:  // start of sqrt path
 	    begin
 	       done = 1'b0;
@@ -479,8 +494,23 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
+	       next_state = S27;
+	    end // case: S26
+	  S27:  // extra cycle after S26 with done and divBusy low, then back to S0
+	    begin
+	       done = 1'b0;
+	       divBusy = 1'b0;
+	       load_rega = 1'b0;
+	       load_regb = 1'b0;
+	       load_regc = 1'b0;
+	       load_regd = 1'b0;	       
+	       load_regr = 1'b0;
+	       load_regs = 1'b0;		    	       
+	       sel_muxa = 3'b000;
+	       sel_muxb = 3'b000;
+	       sel_muxr = 1'b0;
 	       next_state = S0;
-	    end 
+	    end 	  
 	  default: 
 	    begin
 	       done = 1'b0;
diff --git a/wally-pipelined/testbench/testbench-f64.sv b/wally-pipelined/testbench/testbench-f64.sv
index a9dd9ad2..5ae96f83 100755
--- a/wally-pipelined/testbench/testbench-f64.sv
+++ b/wally-pipelined/testbench/testbench-f64.sv
@@ -30,7 +30,7 @@ module testbench ();
    logic 	XExpMaxE;  
    logic 	XNormE;
    logic 	FDivBusyE;   
-   
+    
    logic 	start;
    logic 	reset;
 
@@ -57,16 +57,13 @@ module testbench ();
 		       .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
 		       .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
    fpdiv fdivsqrt (.op1, .op2, .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]),
-		   .reset, .clk, .start, .P(FmtE), .OvEn(1'b1), .UnEn(1'b1),
+		   .reset, .clk, .start, .P(~FmtE), .OvEn(1'b0), .UnEn(1'b0),
 		   .XNaNQ(XNaNE), .YNaNQ(YNaNE), .XInfQ(XInfE), .YInfQ(YInfE), .XZeroQ(XZeroE), .YZeroQ(YZeroE),
 		   .FDivBusyE, .done(done), .AS_Result(AS_Result), .Flags(Flags));
 
+
    // current fpdivsqrt does not operation on denorms yet
-   assign XZeroM = (op1[51:0] == 52'h0);
-   assign YZeroM = (op2[51:0] == 52'h0);   
-   assign XDenorm = XZeroE & ~XZeroM;
-   assign YDenorm = YZeroE & ~YZeroM;
-   assign Denorm = XDenorm | YDenorm;   
+   assign Denorm = XDenormE | YDenormE | Flags[3];   
 
   // generate clock to sequence tests
   always
@@ -77,7 +74,7 @@ module testbench ();
    initial
      begin
 	handle3 = $fopen("f64_div_rne.out");
-	$readmemh("../testbench/fp/f64_div_rne.tv", testvectors);
+	$readmemh("../testbench/fp/vectors/f64_div_rne.tv", testvectors);
 	vectornum = 0; errors = 0;
 	start = 1'b0;
 	// reset
@@ -90,7 +87,7 @@ module testbench ();
 	// Operation (if applicable)
 	#0  op_type = 1'b0;
 	// Precision (32-bit or 64-bit)
-	#0  FmtE = 1'b0;
+	#0  FmtE = 1'b1;
 	// From fctrl logic to dictate operation
 	#0  FOpCtrlE = 3'b000;
 	// Rounding Mode
@@ -114,7 +111,7 @@ module testbench ();
 	       @(posedge clk);
 	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
 	     vectornum = vectornum + 1;
-	     if (vectornum == 1)
+	     if (vectornum == 40)
 	       $finish;	     
 	     if (testvectors[vectornum] === 200'bx) begin
 		$display("%d tests completed", vectornum);