diff --git a/wally-pipelined/src/fpu/divconv.sv b/wally-pipelined/src/fpu/divconv.sv
index 7fbafeb1..88596451 100755
--- a/wally-pipelined/src/fpu/divconv.sv
+++ b/wally-pipelined/src/fpu/divconv.sv
@@ -1,18 +1,41 @@
-module divconv (
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 9/28/2021
+//
+// Purpose: Main convergence routine for floating point divider/square root unit (Goldschmidt)
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
-   input logic [52:0]   d, n,
+module divconv (
+   input logic [52:0] 	d, n,
    input logic [2:0] 	sel_muxa, sel_muxb,
-   input logic 	      sel_muxr,   
-   input logic 	      load_rega, load_regb, load_regc, load_regd,
-   input logic 		   load_regr, load_regs,
-   input logic 		   P,
-   input logic 		   op_type,
-   input logic 		   exp_odd,   
-   input logic 	      reset,
-   input logic 	      clk,   
-   
+   input logic 		sel_muxr, 
+   input logic 		load_rega, load_regb, load_regc, load_regd,
+   input logic 		load_regr, load_regs,
+   input logic 		P,
+   input logic 		op_type,
+   input logic 		exp_odd, 
+   input logic 		reset,
+   input logic 		clk, 
+		
    output logic [63:0] 	q1, qp1, qm1,
-   output logic [63:0] 	q0, qp0, qm0,   
+   output logic [63:0] 	q0, qp0, qm0, 
    output logic [63:0] 	rega_out, regb_out, regc_out, regd_out,
    output logic [127:0] regr_out
 );
@@ -26,14 +49,11 @@ module divconv (
    logic [63:0] 	mcand, mplier, mcand_q;   
    logic [63:0] 	twocmp_out;
    logic [64:0] 	three;   
-   logic [127:0] 	Carry, Carry2;
-   logic [127:0] 	Sum, Sum2;
    logic [127:0] 	constant, constant2;
    logic [63:0] 	q_const, qp_const, qm_const;
    logic [63:0] 	d2, n2;   
-   logic [11:0] 	d3;   
-   logic          muxr_out;
-   logic          cout1, cout2, cout3, cout4, cout5, cout6, cout7;
+   logic 		muxr_out;
+   logic 		cout1, cout2, cout3, cout4, cout5, cout6, cout7;
 
    // Check if exponent is odd for sqrt
    // If exp_odd=1 and sqrt, then M/2 and use ia_addr=0 as IA
@@ -99,4 +119,3 @@ module divconv (
    flopenr #(64) regk (clk, reset, load_regs, {qp_out0[63:39], (qp_out0[38:10] & {29{~P}}), 10'h0}, qp0);
    
 endmodule // divconv
-
diff --git a/wally-pipelined/src/fpu/fpdiv.sv b/wally-pipelined/src/fpu/fpdiv.sv
index f0f81bd2..1f1788f9 100755
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@@ -60,8 +60,6 @@ module fpdiv (
    logic 	      Invalid;
    logic [4:0] 	      FlagsIn;   	
    logic 	      signResult;      
-   logic 	      convert;
-   logic 	      sub;
    
    logic [63:0]       q1, qm1, qp1, q0, qm0, qp0;
    logic [63:0]       rega_out, regb_out, regc_out, regd_out;
@@ -105,10 +103,10 @@ module fpdiv (
 		  .load_regr, .load_regs, .P, .op_type, .exp_odd);
    
    // FSM : control divider   
-   fsm control (.clk, .reset, .start, .op_type,
-		.done, .load_rega, .load_regb, .load_regc, .load_regd, 
-		.load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
-		.divBusy(FDivBusyE));
+   fsm_fpdiv control (.clk, .reset, .start, .op_type,
+		      .done, .load_rega, .load_regb, .load_regc, .load_regd, 
+		      .load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
+		      .divBusy(FDivBusyE));
    
    // Round the mantissa to a 52-bit value, with the leading one
    // removed. The rounding units also handles special cases and 
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index 8258b9c6..fd91b1b2 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -129,16 +129,16 @@ module fpu (
      logic [63:0] 	  AlignedSrcAE;           // align SrcA to the floating point format
 
      // DECODE STAGE
+     
      // calculate FP control signals
      fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
 		  .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
 		  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
 	
      // FP register file
-     //    - can read 3 registers and write 1 register every cycle
      fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
-			.a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
-			.wd4(FPUResultW),
+			.a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), 
+			.a4(RdW), .wd4(FPUResultW),
 			.rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
 
      // D/E pipeline registers
@@ -158,23 +158,23 @@ module fpu (
                      .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
      
      // forwarding muxs
-     mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
-     mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
-     mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
-     mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
-			   {2'b0, {10{1'b1}}, 52'b0}, 
-			   {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, 
-			   FSrcYE); // Force Z to be 0 for multiply instructions
+     mux3  #(64)  fxemux (FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
+     mux3  #(64)  fyemux (FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
+     mux3  #(64)  fzemux (FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
+     mux3  #(64)  fyaddmux (FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
+			    {2'b0, {10{1'b1}}, 52'b0}, 
+			    {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, 
+			    FSrcYE); // Force Y to 1.0 (single with upper 32 bits set to 1s, or double, per FmtE) when selected; presumably for add/sub in the FMA - TODO confirm
      // Force Z to be 0 for multiply instructions     
-     mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
+     mux3  #(64)  fzmulmux (FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
        
      // unpacking unit
      //    - splits FP inputs into their various parts
      //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity)
-     unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
-			 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
-			 .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
-			 .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
+     unpacking unpacking (.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
+			  .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+			  .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
+			  .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
      
      // FMA
      //   - two stage FMA
@@ -191,7 +191,7 @@ module fpu (
 	      .FmtE, .FmtM, .FrmM, 
 	      .FMAFlgM, .FMAResM);
      
-     // capture the inputs for divide/sqrt
+     // fpdivsqrt using Goldschmidt's iteration; the registers below capture its inputs (clocked by FDivBusyE, cleared by FDivSqrtDoneE)
      floprc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
 			      .clear(FDivSqrtDoneE),
 			      .reset(reset),  .clk(FDivBusyE));
@@ -201,12 +201,11 @@ module fpu (
      floprc #(6) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE}), 
 			     .q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ}),
 			     .clear(FDivSqrtDoneE),
-			     .reset(reset),  .clk(FDivBusyE));            
-     // fpdivsqrt using Goldschmidt's iteration
+			     .reset(reset),  .clk(FDivBusyE));     
      fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
-		      .reset, .clk(clk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
-		      .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ,
-		      .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
+		     .reset, .clk(clk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
+		     .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ,
+		     .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
 
      // convert from signle to double and vice versa
      cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
@@ -221,17 +220,14 @@ module fpu (
 		.Invalid(CmpNVE), .CmpResE);
      
      // sign injection unit
-     //    - computation is done in one stage
      fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
 		.SgnNVE, .SgnResE);
      
      // classify
-     //    - computation is done in one stage
-     //    - most of the work is done in the unpacking unit
-     //    - result is written to the integer register
      fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
 			  .XSNaNE, .ClassResE);
-     
+
+     // Convert
      fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
 		.CvtResE, .CvtFlgE);
      
@@ -254,22 +250,23 @@ module fpu (
      // E/M pipe registers
 
      // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
-     flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
-     flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
-     flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
-     flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
-			     {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
-			     {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
-     flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); 
-     flopenrc #(5)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
-     flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
-     flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM,
-			      {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
-			      {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
+     flopenrc #(65) EMFpReg2 (clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
+     flopenrc #(65) EMFpReg3 (clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
+     flopenrc #(64) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
+     flopenrc #(12) EMFpReg5 (clk, reset, FlushM, ~StallM, 
+			      {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
+			      {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
+     flopenrc #(64) EMRegCmpRes (clk, reset, FlushM, ~StallM, FResE, FResM); 
+     flopenrc #(5)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
+     flopenrc #(`XLEN) EMRegSgnRes (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
+     flopenrc #(11) EMCtrlReg (clk, reset, FlushM, ~StallM,
+			       {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
+			       {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
      
      // BEGIN MEMORY STAGE
+     
      // FPU flag selection - to privileged
-     mux4  #(5)  FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
+     mux4  #(5)  FPUFlgMux (5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
   
      // M/W pipe registers
      flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
@@ -285,10 +282,10 @@ module fpu (
      // put ReadData into NaN-blocking format
      //    - if there are any unsused bits the most significant bits are filled with 1s
      //    - for load instruction
-     mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
+     mux2  #(64)  ReadResMux ({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
      
      // select the result to be written to the FP register
-     mux4  #(64)  FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
+     mux4  #(64)  FPUResultMux (ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
 
   end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low
      assign FStallD = 0;
diff --git a/wally-pipelined/src/fpu/fsm.sv b/wally-pipelined/src/fpu/fsm_fpdiv.sv
similarity index 99%
rename from wally-pipelined/src/fpu/fsm.sv
rename to wally-pipelined/src/fpu/fsm_fpdiv.sv
index 9b0e18a7..14358758 100755
--- a/wally-pipelined/src/fpu/fsm.sv
+++ b/wally-pipelined/src/fpu/fsm_fpdiv.sv
@@ -22,7 +22,7 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-module fsm (
+module fsm_fpdiv (
    input logic 	      clk,
    input logic 	      reset,
    input logic 	      start,
diff --git a/wally-pipelined/src/fpu/rounder_div.sv b/wally-pipelined/src/fpu/rounder_div.sv
index 1eb50644..03dcff7a 100755
--- a/wally-pipelined/src/fpu/rounder_div.sv
+++ b/wally-pipelined/src/fpu/rounder_div.sv
@@ -52,13 +52,13 @@ module rounder_div (
     output logic [4:0] 	Flags
     );
       
-   logic 	       Rsign;
-   logic [10:0]    Rexp;
-   logic [12:0]    Texp;
-   logic [51:0]    Rmant;
-   logic [63:0]    Tmant;
-   logic [51:0]    Smant;   
-   logic 	       Rzero;
+   logic 		Rsign;
+   logic [10:0] 	Rexp;
+   logic [12:0] 	Texp;
+   logic [51:0] 	Rmant;
+   logic [63:0] 	Tmant;
+   logic [51:0] 	Smant;   
+   logic 		Rzero;
    logic 	       Gdp, Gsp, G;
    logic 	       UnFlow_SP, UnFlow_DP, UnderFlow; 
    logic 	       OvFlow_SP, OvFlow_DP, OverFlow;		
@@ -187,9 +187,9 @@ module rounder_div (
    assign NaN_Sign_out = ~XNaNQ&YNaNQ ? Float2[63] : Float1[63];
    assign Sign_out = (XZeroQ&YZeroQ | XInfQ&YInfQ)&~op_type | Rsign&~XNaNQ&~YNaNQ | 
    		     NaN_Sign_out&(XNaNQ|YNaNQ);
-
    // FIXME (jes) - Imperas gives sNaN a Sign=0 where x86 gives Sign=1
-   // | Float1[63]&op_type;
+   // | Float1[63]&op_type;  (term intended to fix this; left out of Sign_out for now)
+   
    assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero);
    assign Rmant[50:0] = ({51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}}) |
 			(NaN_out&{51{NaN}}))&({51{~(op_type&Float1[63]&~XZeroQ)}});
diff --git a/wally-pipelined/testbench/testbench-f64.sv b/wally-pipelined/testbench/testbench-f64.sv
index 5ae96f83..e3cdc84d 100755
--- a/wally-pipelined/testbench/testbench-f64.sv
+++ b/wally-pipelined/testbench/testbench-f64.sv
@@ -49,7 +49,6 @@ module testbench ();
 
    integer 	handle3;
    integer 	desc3;  
-   integer 	desc4; 
    
    // instantiate device under test
    unpacking unpacking(.X(op1), .Y(op2), .Z(64'h0), .FOpCtrlE, .FmtE, 
@@ -111,8 +110,6 @@ module testbench ();
 	       @(posedge clk);
 	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
 	     vectornum = vectornum + 1;
-	     if (vectornum == 40)
-	       $finish;	     
 	     if (testvectors[vectornum] === 200'bx) begin
 		$display("%d tests completed", vectornum);
 		$finish;