Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

2025-02-11 06:05:49 +00:00 · 2022-09-20 09:47:16 -05:00 · 2022-09-20 09:47:16 -05:00 · ea6b687f7c
commit ea6b687f7c
parent 6a1b909a3f 811f498f63
52 changed files with 887 additions and 777 deletions
--- a/fpga/constraints/debug2.xdc
+++ b/fpga/constraints/debug2.xdc
--- a/fpga/src/fpgaTop.v
+++ b/fpga/src/fpgaTop.v
@ -70,21 +70,21 @@ module fpgaTop
  wire 		peripheral_aresetn;
  wire 		mb_reset;
  
-  wire [`AHBW-1:0] HRDATAEXT;
-  wire 		   HREADYEXT;
-  wire 		   HRESPEXT;
-  wire 		   HSELEXT;
  wire 		   HCLKOpen;
  wire 		   HRESETnOpen;
-  wire [31:0] 	   HADDR;
-  wire [`AHBW-1:0] HWDATA;
-  wire 		   HWRITE;
-  wire [2:0] 	   HSIZE;
-  wire [2:0] 	   HBURST;
+(* mark_debug = "true" *)  wire [`AHBW-1:0] HRDATAEXT;
+(* mark_debug = "true" *)  wire 		   HREADYEXT;
+(* mark_debug = "true" *)  wire 		   HRESPEXT;
+(* mark_debug = "true" *)  wire 		   HSELEXT;
+(* mark_debug = "true" *)  wire [31:0] 	   HADDR;
+(* mark_debug = "true" *)  wire [`AHBW-1:0] HWDATA;
+(* mark_debug = "true" *)  wire 		   HWRITE;
+(* mark_debug = "true" *)  wire [2:0] 	   HSIZE;
+(* mark_debug = "true" *)  wire [2:0] 	   HBURST;
+(* mark_debug = "true" *)  wire [1:0] 	   HTRANS;
+(* mark_debug = "true" *)  wire 		   HREADY;
  wire [3:0] 	   HPROT;
-  wire [1:0] 	   HTRANS;
  wire 		   HMASTLOCK;
-  wire 		   HREADY;
  
  

@ -94,41 +94,41 @@ module fpgaTop
  wire 		   SDCCmdOE;
  wire 		   SDCCmdOut;

-  wire [3:0] 	   m_axi_awid;
-  wire [7:0] 	   m_axi_awlen;
-  wire [2:0] 	   m_axi_awsize;
-  wire [1:0] 	   m_axi_awburst;
-  wire [3:0] 	   m_axi_awcache;
-  wire [31:0] 	   m_axi_awaddr;
+(* mark_debug = "true" *)  wire [3:0] 	   m_axi_awid;
+(* mark_debug = "true" *)  wire [7:0] 	   m_axi_awlen;
+(* mark_debug = "true" *)  wire [2:0] 	   m_axi_awsize;
+(* mark_debug = "true" *)  wire [1:0] 	   m_axi_awburst;
+(* mark_debug = "true" *)  wire [3:0] 	   m_axi_awcache;
+(* mark_debug = "true" *)  wire [31:0] 	   m_axi_awaddr;
  wire [2:0] 	   m_axi_awprot;
-  wire 		   m_axi_awvalid;
-  wire 		   m_axi_awready;
-  wire 		   m_axi_awlock;
-  wire [63:0] 	   m_axi_wdata;
-  wire [7:0] 	   m_axi_wstrb;
-  wire 		   m_axi_wlast;
-  wire 		   m_axi_wvalid;
-  wire 		   m_axi_wready;
-  wire [3:0] 	   m_axi_bid;
-  wire [1:0] 	   m_axi_bresp;
-  wire 		   m_axi_bvalid;
-  wire 		   m_axi_bready;
-  wire [3:0] 	   m_axi_arid;
-  wire [7:0] 	   m_axi_arlen;
-  wire [2:0] 	   m_axi_arsize;
-  wire [1:0] 	   m_axi_arburst;
+(* mark_debug = "true" *)  wire 		   m_axi_awvalid;
+(* mark_debug = "true" *)  wire 		   m_axi_awready;
+(* mark_debug = "true" *)  wire 		   m_axi_awlock;
+(* mark_debug = "true" *)  wire [63:0] 	   m_axi_wdata;
+(* mark_debug = "true" *)  wire [7:0] 	   m_axi_wstrb;
+(* mark_debug = "true" *)  wire 		   m_axi_wlast;
+(* mark_debug = "true" *)  wire 		   m_axi_wvalid;
+(* mark_debug = "true" *)  wire 		   m_axi_wready;
+(* mark_debug = "true" *)  wire [3:0] 	   m_axi_bid;
+(* mark_debug = "true" *)  wire [1:0] 	   m_axi_bresp;
+(* mark_debug = "true" *)  wire 		   m_axi_bvalid;
+(* mark_debug = "true" *)  wire 		   m_axi_bready;
+(* mark_debug = "true" *)  wire [3:0] 	   m_axi_arid;
+(* mark_debug = "true" *)  wire [7:0] 	   m_axi_arlen;
+(* mark_debug = "true" *)  wire [2:0] 	   m_axi_arsize;
+(* mark_debug = "true" *)  wire [1:0] 	   m_axi_arburst;
  wire [2:0] 	   m_axi_arprot;
-  wire [3:0] 	   m_axi_arcache;
-  wire 		   m_axi_arvalid;
-  wire [31:0] 	   m_axi_araddr;
+(* mark_debug = "true" *)  wire [3:0] 	   m_axi_arcache;
+(* mark_debug = "true" *)  wire 		   m_axi_arvalid;
+(* mark_debug = "true" *)  wire [31:0] 	   m_axi_araddr;
  wire 		   m_axi_arlock;
-  wire 		   m_axi_arready;
-  wire [3:0] 	   m_axi_rid;
-  wire [63:0] 	   m_axi_rdata;
-  wire [1:0] 	   m_axi_rresp;
-  wire 		   m_axi_rvalid;
-  wire 		   m_axi_rlast;
-  wire 		   m_axi_rready;
+(* mark_debug = "true" *)  wire 		   m_axi_arready;
+(* mark_debug = "true" *)  wire [3:0] 	   m_axi_rid;
+(* mark_debug = "true" *)  wire [63:0] 	   m_axi_rdata;
+(* mark_debug = "true" *)  wire [1:0] 	   m_axi_rresp;
+(* mark_debug = "true" *)  wire 		   m_axi_rvalid;
+(* mark_debug = "true" *)  wire 		   m_axi_rlast;
+(* mark_debug = "true" *)  wire 		   m_axi_rready;

  wire [3:0] 	   BUS_axi_arregion;
  wire [3:0] 	   BUS_axi_arqos;
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@ -102,7 +102,7 @@

 // division constants
 `define RADIX 32'h2
-`define DIVCOPIES 32'h4
+`define DIVCOPIES 32'h5
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF + 3))
 // `define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input
 `define DIVN (`NF < `XLEN ? `XLEN : `NF+3) // length of input
--- a/pipelined/regression/testfloat.do
+++ b/pipelined/regression/testfloat.do
@ -32,7 +32,7 @@ vlib work
 # start and run simulation
 # remove +acc flag for faster sim during regressions if there is no need to access internal signals
 # $num = the added words after the call
-vlog +incdir+../config/$1 +incdir+../config/shared ../testbench/testbench-fp.sv ../src/fpu/*.sv ../src/generic/*.sv  ../src/generic/flop/*.sv -suppress 2583,7063,8607,2697 
+vlog +incdir+../config/$1 +incdir+../config/shared ../testbench/testbench-fp.sv ../src/fpu/*.sv ../src/fpu/*/*.sv ../src/generic/*.sv  ../src/generic/flop/*.sv -suppress 2583,7063,8607,2697 

 vsim -voptargs=+acc work.testbenchfp -G TEST=$2

--- a/pipelined/regression/wave-fpu.do
+++ b/pipelined/regression/wave-fpu.do
@ -24,10 +24,10 @@ add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/W
 add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/WS
 add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/WCA
 add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/WSA
-add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/Q
-add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/QM
-add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/QNext
-add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/QMNext
+add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/U
+add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/UM
+add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/UNext
+add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/UMNext
 add wave -group {Divide} -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/*
 # add wave -group {Divide} -group inter0 -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/interations[0]/stage/fdivsqrtstage/*
 # add wave -group {Divide} -group inter0 -noupdate /testbenchfp/fdivsqrt/fdivsqrt/fdivsqrtiter/interations[0]/stage/fdivsqrtstage/otfc/otfc2/*
--- a/pipelined/src/fpu/divshiftcalc.sv
+++ b/pipelined/src/fpu/divshiftcalc.sv
@ -1,42 +0,0 @@
-`include "wally-config.vh"
-
-module divshiftcalc(
-    input logic  [`DIVb-(`RADIX/4):0] DivQm,
-    input logic  [`FMTBITS-1:0] Fmt,
-    input logic Sqrt,
-    input logic [`DURLEN-1:0] DivEarlyTermShift,
-    input logic [`NE+1:0] DivQe,
-    output logic [$clog2(`NORMSHIFTSZ)-1:0] DivShiftAmt,
-    output logic [`NORMSHIFTSZ-1:0] DivShiftIn,
-    output logic DivResDenorm,
-    output logic [`NE+1:0] DivDenormShift
-);
-    logic [`NE+1:0] NormShift;
-
-    // is the result denromalized
-    // if the exponent is 1 then the result needs to be normalized then the result is denormalizes
-    assign DivResDenorm = DivQe[`NE+1]|(~|DivQe[`NE+1:0]);
-
-    // if the result is denormalized
-    //  00000000x.xxxxxx...                     Exp = DivQe
-    //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
-    //  .00xxxxxxxxxxxxx... << DivQe+NF+1  Exp = +1
-    //  .0000xxxxxxxxxxx... >> 1                Exp = 1
-    // Left shift amount  = DivQe+NF+1-1
-    assign DivDenormShift = (`NE+2)'(`NF)+DivQe;
-    // if the result is normalized
-    //  00000000x.xxxxxx...                     Exp = DivQe
-    //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
-    //  00000000.xxxxxxx... << NF               Exp = DivQe+1
-    //  00000000x.xxxxxx... << NF               Exp = DivQe (extra shift done afterwards)
-    //  00000000xx.xxxxx... << 1?               Exp = DivQe-1 (determined after)
-    // inital Left shift amount  = NF
-    // shift one more if the it's a minimally redundent radix 4 - one entire cycle needed for integer bit
-    assign NormShift = (`NE+2)'(`NF);
-    // if the shift amount is negitive then dont shift (keep sticky bit)
-    // need to multiply the early termination shift by LOGR*DIVCOPIES =  left shift of log2(LOGR*DIVCOPIES)
-    assign DivShiftAmt = (DivResDenorm ?  DivDenormShift[$clog2(`NORMSHIFTSZ)-1:0]&{$clog2(`NORMSHIFTSZ){~DivDenormShift[`NE+1]}} : NormShift[$clog2(`NORMSHIFTSZ)-1:0])+{{$clog2(`NORMSHIFTSZ)-`DURLEN-$clog2(`LOGR*`DIVCOPIES){1'b0}}, DivEarlyTermShift&{`DURLEN{~(DivDenormShift[`NE+1]|Sqrt)}}, {$clog2(`LOGR*`DIVCOPIES){1'b0}}};
-
-    assign DivShiftIn = {{`NF{1'b0}}, DivQm, {`NORMSHIFTSZ-`DIVb+1+(`RADIX/4)-`NF{1'b0}}};
-
-endmodule
--- a/pipelined/src/fpu/fclassify.sv
+++ b/pipelined/src/fpu/fclassify.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fclassify.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/fcmp.sv
+++ b/pipelined/src/fpu/fcmp.sv
@ -1,5 +1,6 @@

 ///////////////////////////////////////////
+// fcmp.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/fctrl.sv
+++ b/pipelined/src/fpu/fctrl.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fctrl.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/fcvt.sv
+++ b/pipelined/src/fpu/fcvt.sv
@ -1,5 +1,6 @@

 ///////////////////////////////////////////
+// fcvt.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // fdivsqrt.sv
 //
-// Written: David_Harris@hmc.edu, me@KatherineParry.com, Cedar Turek
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
 // Modified:13 January 2022
 //
 // Purpose: Combined Divide and Square Root Floating Point and Integer Unit
@ -48,8 +48,7 @@ module fdivsqrt(
  output logic DivBusy,
  output logic DivDone,
  output logic [`NE+1:0] QeM,
-  output logic [`DURLEN-1:0] EarlyTermShiftM,
-  output logic [`DIVb-(`RADIX/4):0] QmM
+  output logic [`DIVb:0] QmM
 //   output logic [`XLEN-1:0] RemM,
 );

@ -58,9 +57,9 @@ module fdivsqrt(
  logic [`DIVb+3:0] X;
  logic [`DIVN-2:0]  D; // U0.N-1
  logic [`DIVN-2:0] Dpreproc;
-  logic [`DIVb:0] FirstS, FirstSM, FirstQ, FirstQM;
-  logic [`DIVb-1:0] FirstC;
-  logic Firstqn;
+  logic [`DIVb:0] FirstU, FirstUM;
+  logic [`DIVb+1:0] FirstC;
+  logic Firstun;
  logic WZero;

  fdivsqrtpreproc fdivsqrtpreproc(
@ -70,11 +69,11 @@ module fdivsqrt(
    .clk, .reset, .FmtE, .XsE, .SqrtE, 
    .DivBusy, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, 
    .XNaNE, .YNaNE,
-    .XInfE, .YInfE, .EarlyTermShiftE(EarlyTermShiftM), .WZero);
+    .XInfE, .YInfE, .WZero);
  fdivsqrtiter fdivsqrtiter(
-    .clk, .Firstqn, .D, .FirstS, .FirstSM, .FirstQ, .FirstQM, .FirstC, .SqrtE, .SqrtM, 
+    .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, .SqrtM, 
    .X,.Dpreproc, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, 
    .DivStart(DivStartE), .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
    .DivBusy);
-  fdivsqrtpostproc fdivsqrtpostproc(.WS, .WC, .D, .FirstS, .FirstSM, .FirstQ, .FirstQM, .FirstC, .Firstqn, .SqrtM, .QmM, .WZero, .DivSM);
+  fdivsqrtpostproc fdivsqrtpostproc(.WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .Firstun, .SqrtM, .QmM, .WZero, .DivSM);
 endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
@ -0,0 +1,58 @@
+///////////////////////////////////////////
+// fdivsqrtfgen2.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
+// Modified:13 January 2022
+//
+// Purpose: Radix 2 F Addend Generator
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fdivsqrtfgen2 (
+  input  logic             up, uz,   // select FPos (up) or all zeros (uz); otherwise FNeg
+  input  logic [`DIVb+1:0] C,
+  input  logic [`DIVb:0]   U, UM,
+  output logic [`DIVb+3:0] F
+);
+  // Operands widened to the DIVb+4-bit width of F
+  logic [`DIVb+3:0] UWide, UMWide, CWide;
+  logic [`DIVb+3:0] FPos, FNeg;
+
+  assign CWide  = {2'b11, C};  // extend C from Q2.k to Q4.k
+  assign UWide  = {3'b0, U};   // zero-extend U
+  assign UMWide = {3'b0, UM};  // zero-extend UM
+
+  // Candidate addends for the up case (FPos) and the fall-through case (FNeg)
+  assign FPos = CWide & ~(UWide << 1);
+  assign FNeg = (CWide & ~(CWide << 2)) | (UMWide << 1);
+
+  // Priority select: up first, then uz (all zeros), else FNeg
+  always_comb begin
+    if (up)      F = FPos;
+    else if (uz) F = '0;
+    else         F = FNeg;
+  end
+
+endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
@ -0,0 +1,55 @@
+///////////////////////////////////////////
+// fdivsqrtfgen4.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
+// Modified:13 January 2022
+//
+// Purpose: Radix 4 F Addend Generator
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fdivsqrtfgen4 (
+  input  logic [3:0] u,             // addend select; bits are tested in priority order 3, 2, 1, 0
+  input  logic [`DIVb+3:0] C, U, UM,
+  output logic [`DIVb+3:0] F
+);
+  logic [`DIVb+3:0] F2, F1, F0, FN1, FN2;
+  // One candidate addend per select bit of u, plus all zeros (F0) when no bit is set
+  // Generate for both positive and negative bits
+  assign F2  = (~U << 2) & (C << 2);
+  assign F1  = ~(U << 1) & C;
+  assign F0  = '0;
+  assign FN1 = (UM << 1) | (C & ~(C << 3));
+  assign FN2 = (UM << 2) | ((C << 2)&~(C << 4));
+
+  // Choose which adder input will be used.
+  // The third test must read the select bit u[1], not U[1] (a data bit of the U operand).
+  always_comb
+    if (u[3])       F = F2;
+    else if (u[2])  F = F1;
+    else if (u[1])  F = FN1;
+    else if (u[0])  F = FN2;
+    else            F = F0;
+endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // fdivsqrtfsm.sv
 //
-// Written: David_Harris@hmc.edu, me@KatherineParry.com, Cedar Turek
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
 // Modified:13 January 2022
 //
 // Purpose: Combined Divide and Square Root Floating Point and Integer Unit
@ -43,7 +43,6 @@ module fdivsqrtfsm(
  input  logic StallE,
  input  logic StallM,
  input logic WZero,
-  output logic [`DURLEN-1:0] EarlyTermShiftE,
  output logic DivDone,
  output logic DivBusy
 );
@ -55,8 +54,6 @@ module fdivsqrtfsm(
  logic SpecialCase;
  logic [`DURLEN-1:0] cycles;

-  assign EarlyTermShiftE = step;
-
  // terminate immediately on special cases
  assign SpecialCase = XZeroE | (YZeroE&~SqrtE) | XInfE | YInfE | XNaNE | YNaNE | (XsE&SqrtE);

@ -94,8 +91,7 @@ module fdivsqrtfsm(
  always_comb begin 
    if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
    else       fbits = Nf + 2 + `LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
-    if (SqrtE) cycles =  (fbits + (`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES);  // ceiling(fbits / r*k)
-    else       cycles = `FPDUR; // *** line above should work once otfc is used to put results in upper bits
+    cycles =  (fbits + (`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES);
  end 

  /* verilator lint_on WIDTH */
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
@ -42,10 +42,9 @@ module fdivsqrtiter(
  input  logic [`DIVN-2:0] Dpreproc,
  output logic [`DIVN-2:0]  D, // U0.N-1
  output logic [`DIVb+3:0]  NextWSN, NextWCN,
-  output logic [`DIVb:0] FirstS, FirstSM,
-  output logic [`DIVb:0] FirstQ, FirstQM,
-  output logic [`DIVb-1:0] FirstC,
-  output logic             Firstqn,
+  output logic [`DIVb:0] FirstU, FirstUM,
+  output logic [`DIVb+1:0] FirstC,
+  output logic             Firstun,
  output logic [`DIVb+3:0]  FirstWS, FirstWC
 );

@ -54,33 +53,29 @@ module fdivsqrtiter(
 // WC/WS is dependent on D so 4.N-1 ie N+3 bits or N+2:0 + one more bit in fraction for possible sqrt right shift
 // D is 1.N-1, but the msb is always 1 so 0.N-1 or N-1 bits or N-1:0
 // Dsel should match WC/WS so 4.N-1 ie N+3 bits or N+2:0
-// Q/QM/S/SM should be 1.b so b+1 bits or b:0
+// U/UM should be 1.b so b+1 bits or b:0
 // C needs to be the lenght of the final fraction 0.b so b or b-1:0
 /* verilator lint_off UNOPTFLAT */
  logic [`DIVb+3:0]  WSA[`DIVCOPIES-1:0]; // Q4.b
  logic [`DIVb+3:0]  WCA[`DIVCOPIES-1:0]; // Q4.b
  logic [`DIVb+3:0]  WS[`DIVCOPIES-1:0]; // Q4.b
  logic [`DIVb+3:0]  WC[`DIVCOPIES-1:0]; // Q4.b
-  logic [`DIVb:0] Q[`DIVCOPIES-1:0]; // U1.b
-  logic [`DIVb:0] QM[`DIVCOPIES-1:0];// 1.b
-  logic [`DIVb:0] QNext[`DIVCOPIES-1:0];// U1.b
-  logic [`DIVb:0] QMNext[`DIVCOPIES-1:0];// U1.b
-  logic [`DIVb:0] S[`DIVCOPIES-1:0];// U1.b
-  logic [`DIVb:0] SM[`DIVCOPIES-1:0];// U1.b
-  logic [`DIVb:0] SNext[`DIVCOPIES-1:0];// U1.b
-  logic [`DIVb:0] SMNext[`DIVCOPIES-1:0];// U1.b
-  logic [`DIVb-1:0] C[`DIVCOPIES:0]; // 0.b
-  logic [`DIVb-1:0] initC; // 0.b
-  logic [`DIVCOPIES-1:0] qn; 
-
+  logic [`DIVb:0] U[`DIVCOPIES-1:0]; // U1.b
+  logic [`DIVb:0] UM[`DIVCOPIES-1:0];// 1.b
+  logic [`DIVb:0] UNext[`DIVCOPIES-1:0];// U1.b
+  logic [`DIVb:0] UMNext[`DIVCOPIES-1:0];// U1.b
+  logic [`DIVb+1:0] C[`DIVCOPIES:0]; // Q2.b
+  logic [`DIVb+1:0] initC; // Q2.b
+  logic [`DIVCOPIES-1:0] un; 

 /* verilator lint_on UNOPTFLAT */
  logic [`DIVb+3:0]  WSN, WCN; // Q4.N-1
  logic [`DIVb+3:0]  DBar, D2, DBar2; // Q4.N-1
-  logic [`DIVb:0] QMMux;
-  logic [`DIVb-1:0] NextC;
-  logic [`DIVb-1:0] CMux;
-  logic [`DIVb:0] SMux;
+  logic [`DIVb+1:0] NextC;
+  logic [`DIVb+1:0] CMux;
+  logic [`DIVb:0] UMux, UMMux;
+  logic [`DIVb:0] initU, initUM;
+

  // Top Muxes and Registers
  // When start is asserted, the inputs are loaded into the divider.
@ -90,22 +85,24 @@ module fdivsqrtiter(
  //  - otherwise load WSA into the flipflop
  //  - the assumed one is added to D since it's always normalized (and X/0 is a special case handeled by result selection)
  //  - XZeroE is used as the assumed one to avoid creating a sticky bit - all other numbers are normalized
-  if (`RADIX == 2) begin : nextw
-    assign NextWSN = {WSA[`DIVCOPIES-1][`DIVb+2:0], 1'b0};
-    assign NextWCN = {WCA[`DIVCOPIES-1][`DIVb+2:0], 1'b0};
-  end else begin : nextw
-    assign NextWSN = {WSA[`DIVCOPIES-1][`DIVb+1:0], 2'b0};
-    assign NextWCN = {WCA[`DIVCOPIES-1][`DIVb+1:0], 2'b0};
-  end
-  assign initC = 0;
+  assign NextWSN = WSA[`DIVCOPIES-1] << `LOGR;
+  assign NextWCN = WCA[`DIVCOPIES-1] << `LOGR;
+
+  // Initialize C to -1 for sqrt and -R for division
+  logic [1:0] initCSqrt, initCDiv2, initCDiv4, initCUpper;
+  assign initCSqrt = 2'b11;
+  assign initCDiv2 = 2'b10;
+  assign initCDiv4 = 2'b10; // *** not sure why this works; seems like it should be 00 for initializing to -4
+  assign initCUpper = SqrtE ? initCSqrt : (`RADIX == 4) ? initCDiv4 : initCDiv2;
+  assign initC = {initCUpper, {`DIVb{1'b0}}};

  mux2   #(`DIVb+4) wsmux(NextWSN, X, DivStart, WSN);
  flopen   #(`DIVb+4) wsflop(clk, DivStart|DivBusy, WSN, WS[0]);
  mux2   #(`DIVb+4) wcmux(NextWCN, '0, DivStart, WCN);
  flopen   #(`DIVb+4) wcflop(clk, DivStart|DivBusy, WCN, WC[0]);
  flopen #(`DIVN-1) dflop(clk, DivStart, Dpreproc, D);
-  mux2 #(`DIVb) Cmux(C[`DIVCOPIES], initC, DivStart, CMux); 
-  flopen #(`DIVb) cflop(clk, DivStart|DivBusy, CMux, C[0]);
+  mux2 #(`DIVb+2) Cmux(C[`DIVCOPIES], initC, DivStart, CMux); 
+  flopen #(`DIVb+2) cflop(clk, DivStart|DivBusy, CMux, C[0]);

  // Divisor Selections
  //  - choose the negitive version of what's being selected
@ -120,54 +117,38 @@ module fdivsqrtiter(
  generate
    for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : interations
      if (`RADIX == 2) begin: stage
-        fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtM,
-        .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), .Q(Q[i]), .QM(QM[i]), .QNext(QNext[i]), .QMNext(QMNext[i]),
-        .C(C[i]), .S(S[i]), .SM(SM[i]), .CNext(C[i+1]), .SNext(SNext[i]), .SMNext(SMNext[i]), .qn(qn[i]));
+        fdivsqrtstage2 fdivsqrtstage(.D, .DBar, .SqrtM,
+        .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), 
+        .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
      end else begin: stage
        logic j1;
        assign j1 = (i == 0 & ~C[0][`DIVb-1]);
-//        assign j1 = (i == 0 &  C[0][`DIVb-2] & ~C[0][`DIVb-3]);
        fdivsqrtstage4 fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtM, .j1,
-        .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), .Q(Q[i]), .QM(QM[i]), .QNext(QNext[i]), .QMNext(QMNext[i]),
-        .C(C[i]), .S(S[i]), .SM(SM[i]), .CNext(C[i+1]), .SNext(SNext[i]), .SMNext(SMNext[i]), .qn(qn[i]));
+        .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), 
+        .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
      end
      if(i<(`DIVCOPIES-1)) begin 
-        if (`RADIX==2)begin 
-          assign WS[i+1] = {WSA[i][`DIVb+2:0], 1'b0};
-          assign WC[i+1] = {WCA[i][`DIVb+2:0], 1'b0};
-//          assign  C[i+1] = {1'b1, C[i][`DIVb-1:1]};
-        end else begin
-          assign WS[i+1] = {WSA[i][`DIVb+1:0], 2'b0};
-          assign WC[i+1] = {WCA[i][`DIVb+1:0], 2'b0};
-//          assign  C[i+1] = {2'b11, C[i][`DIVb-1:2]};
-        end
-        assign Q[i+1] = QNext[i];
-        assign QM[i+1] = QMNext[i];
-        assign S[i+1] = SNext[i];
-        assign SM[i+1] = SMNext[i];
+        assign WS[i+1] = WSA[i] << `LOGR;
+        assign WC[i+1] = WCA[i] << `LOGR;
+        assign U[i+1] = UNext[i];
+        assign UM[i+1] = UMNext[i];
      end
    end
  endgenerate

-
-  // if starting a new divison set Q to 0 and QM to -1
-  flopenr #(`DIVb+1) Qreg(clk, DivStart, DivBusy, QNext[`DIVCOPIES-1], Q[0]);
-  mux2 #(`DIVb+1) QMmux(QMNext[`DIVCOPIES-1], '1, DivStart, QMMux);
-  flopen #(`DIVb+1) QMreg(clk, DivStart|DivBusy, QMMux, QM[0]);
-
-  // if starting new square root, set S to 1 and SM to 0
-  flopenr #(`DIVb+1) SMreg(clk, DivStart, DivBusy, SMNext[`DIVCOPIES-1], SM[0]);
-  mux2 #(`DIVb+1) Smux(SNext[`DIVCOPIES-1], {1'b1, {(`DIVb){1'b0}}}, DivStart, SMux);
-  flopen #(`DIVb+1) Sreg(clk, DivStart|DivBusy, SMux, S[0]);
-
+  // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 for division
+  assign initU = SqrtE ? {1'b1, {(`DIVb){1'b0}}} : 0;
+  assign initUM = SqrtE ? 0 : {1'b1, {(`DIVb){1'b0}}}; 
+  mux2 #(`DIVb+1) Umux(UNext[`DIVCOPIES-1], initU, DivStart, UMux);
+  mux2 #(`DIVb+1) UMmux(UMNext[`DIVCOPIES-1], initUM, DivStart, UMMux);
+  flopen #(`DIVb+1) UReg(clk, DivStart|DivBusy, UMux, U[0]);
+  flopen #(`DIVb+1) UMReg(clk, DivStart|DivBusy, UMMux, UM[0]);
+  
  assign FirstWS = WS[0];
  assign FirstWC = WC[0];
-
-  assign FirstS = S[0];
-  assign FirstSM = SM[0];
-  assign FirstQ = Q[0];
-  assign FirstQM = QM[0];
+  assign FirstU = U[0];
+  assign FirstUM = UM[0];
  assign FirstC = C[0];
-  assign Firstqn = qn[0];
+  assign Firstun = un[0];
 endmodule

--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // fdivsqrtpostproc.sv
 //
-// Written: David_Harris@hmc.edu, me@KatherineParry.com, Cedar Turek
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
 // Modified:13 January 2022
 //
 // Purpose: Combined Divide and Square Root Floating Point and Integer Unit
@ -33,20 +33,21 @@
 module fdivsqrtpostproc(
  input logic [`DIVb+3:0] WS, WC,
  input logic [`DIVN-2:0]  D, // U0.N-1
-  input logic [`DIVb:0] FirstS, FirstSM, FirstQ, FirstQM,
-  input logic [`DIVb-1:0] FirstC,
-  input logic  Firstqn,
+  input logic [`DIVb:0] FirstU, FirstUM, 
+  input logic [`DIVb+1:0] FirstC,
+  input logic  Firstun,
  input logic SqrtM,
-  output logic [`DIVb-(`RADIX/4):0] QmM,
+  output logic [`DIVb:0] QmM, 
  output logic WZero,
  output logic DivSM
 );
  
  logic [`DIVb+3:0] W;
+  logic [`DIVb:0] PreQmM;
  logic NegSticky;
+  logic weq0;

  // check for early termination on an exact result.  If the result is not exact, the sticky should be set
-  logic weq0;
  aplusbeq0 #(`DIVb+4) wspluswceq0(WS, WC, weq0);

  if (`RADIX == 2) begin
@ -55,11 +56,11 @@ module fdivsqrtpostproc(
    logic wfeq0;
    logic [`DIVb+3:0] WCF, WSF;

-    assign FirstK = ({3'b111, FirstC} & ~({3'b111, FirstC} << 1));
-    assign FZero = SqrtM ? {FirstSM[`DIVb], FirstSM, 2'b0} | {FirstK,1'b0} : {3'b1,D,{`DIVb-`DIVN+2{1'b0}}};
+    assign FirstK = ({1'b1, FirstC} & ~({1'b1, FirstC} << 1));
+    assign FZero = SqrtM ? {FirstUM[`DIVb], FirstUM, 2'b0} | {FirstK,1'b0} : {3'b1,D,{`DIVb-`DIVN+2{1'b0}}};
    csa #(`DIVb+4) fadd(WS, WC, FZero, 1'b0, WSF, WCF); // compute {WCF, WSF} = {WS + WC + FZero};
    aplusbeq0 #(`DIVb+4) wcfpluswsfeq0(WCF, WSF, wfeq0);
-    assign WZero = weq0|(wfeq0 & Firstqn);
+    assign WZero = weq0|(wfeq0 & Firstun);
  end else begin
    assign WZero = weq0;
  end 
@ -70,12 +71,7 @@ module fdivsqrtpostproc(
  assign NegSticky = W[`DIVb+3];

   // division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
-  always_comb
-    if(SqrtM) // sqrt ouputs in the range (1, .5]
-      if(NegSticky) QmM = {FirstSM[`DIVb-1-(`RADIX/4):0], 1'b0};
-      else          QmM = {FirstS[`DIVb-1-(`RADIX/4):0], 1'b0};
-    else  
-      if(NegSticky) QmM = FirstQM[`DIVb-(`RADIX/4):0];
-      else          QmM = FirstQ[`DIVb-(`RADIX/4):0];

+  assign PreQmM = NegSticky ? FirstUM : FirstU; // Select U or U-1 depending on negative sticky bit
+  assign QmM = SqrtM ? (PreQmM << 1) : PreQmM;
 endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // fdivsqrtpreproc.sv
 //
-// Written: David_Harris@hmc.edu, me@KatherineParry.com, Cedar Turek
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
 // Modified:13 January 2022
 //
 // Purpose: Combined Divide and Square Root Floating Point and Integer Unit
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
@ -0,0 +1,63 @@
+///////////////////////////////////////////
+// fdivsqrtqsel2.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
+// Modified:13 January 2022
+//
+// Purpose: Radix 2 Quotient Digit Selection
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fdivsqrtqsel2 ( 
+  input  logic [3:0] ps, pc, 
+  output logic         up, uz, un
+);
+
+  logic [3:0] prop, gen;        // bitwise XOR and AND of ps and pc
+  logic       carry2;           // carry out of bit positions [2:0] of ps + pc
+  logic       nonzero, negative;
+
+  // Digit selection from the 4-bit pair ps, pc.
+  // Written for clarity rather than minimum delay.
+  // Quotient equations from EE371 lecture notes 13-20
+
+  assign prop = ps ^ pc;
+  assign gen  = ps & pc;
+
+  // Carry out of the low three bit positions, in lookahead form
+  assign carry2 = gen[2] | (prop[2] & (gen[1] | (prop[1] & gen[0])));
+
+  // nonzero is low only when all three low propagate bits are set
+  assign nonzero = ~(&prop[2:0]);
+
+  // negative is bit 3 of ps + pc: the top propagate bit XOR the carry into it
+  assign negative = prop[3] ^ carry2;
+
+  // Produce digit = +1, 0, or -1
+  assign up = nonzero & ~negative;
+  assign uz = ~nonzero;
+  assign un = nonzero & negative;
+
+endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtqsel4.sv
@ -0,0 +1,112 @@
+///////////////////////////////////////////
+// fdivsqrtqsel4.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
+// Modified:13 January 2022
+//
+// Purpose: Radix 4 Quotient Digit Selection
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fdivsqrtqsel4 (
+  input logic [`DIVN-2:0] D, // only the top three bits are used (Dmsbs)
+  input logic [4:0] Smsbs, // driven from U[`DIVb:`DIVb-4] in fdivsqrtstage4
+  input logic [`DIVb+3:0] WS, WC, // two words whose top eight bits are added below
+  input logic Sqrt, j1, // Sqrt selects how the table row A is formed; j1 forces row 3'b101 (presumably the first sqrt step - confirm)
+  output logic [3:0] u // digit code read from USel4: 1000=+2, 0100=+1, 0000=0, 0010=-1, 0001=-2
+);
+	logic [6:0] Wmsbs;
+	logic [7:0] PreWmsbs;
+	logic [2:0] Dmsbs, A;
+
+	assign PreWmsbs = WC[`DIVb+3:`DIVb-4] + WS[`DIVb+3:`DIVb-4]; // 8-bit add of the top eight bits; the carry out is dropped
+	assign Wmsbs = PreWmsbs[7:1]; // keep the seven most significant sum bits
+	assign Dmsbs = D[`DIVN-2:`DIVN-4];//|{3{D[`DIVN-2]&Sqrt}};
+	// D = 0001.xxx...
+	// Dmsbs = |   |
+  // W =      xxxx.xxx...
+	// Wmsbs = |        |
+
+	logic [3:0] USel4[1023:0]; // digit selection table, indexed by {A, Wmsbs}
+
+  always_comb begin // fills all 1024 table entries from the constant thresholds below
+    integer a, w, i, w2;
+    for(a=0; a<8; a++) // a: table row, matches the 3-bit A
+      for(w=0; w<128; w++)begin // w: raw 7-bit Wmsbs pattern
+        i = a*128+w; // same value as the index {a, w}
+        w2 = w-128*(w>=64); // convert to two's complement: 64..127 become -64..-1
+        case(a)
+          0: if($signed(w2)>=$signed(12))      USel4[i] = 4'b1000; // +2
+            else if(w2>=4)   USel4[i] = 4'b0100; // +1
+            else if(w2>=-4)  USel4[i] = 4'b0000; //  0
+            else if(w2>=-13) USel4[i] = 4'b0010; // -1
+            else             USel4[i] = 4'b0001; // -2
+          1: if(w2>=14)      USel4[i] = 4'b1000;
+            else if(w2>=4)   USel4[i] = 4'b0100;  
+            else if(w2>=-4)  USel4[i] = 4'b0000; 
+            else if(w2>=-14) USel4[i] = 4'b0010;  
+            else             USel4[i] = 4'b0001; 
+          2: if(w2>=16)      USel4[i] = 4'b1000;
+            else if(w2>=4)   USel4[i] = 4'b0100; 
+            else if(w2>=-6)  USel4[i] = 4'b0000; 
+            else if(w2>=-16) USel4[i] = 4'b0010; 
+            else             USel4[i] = 4'b0001; 
+          3: if(w2>=16)      USel4[i] = 4'b1000;
+            else if(w2>=4)   USel4[i] = 4'b0100; 
+            else if(w2>=-6)  USel4[i] = 4'b0000; 
+            else if(w2>=-17) USel4[i] = 4'b0010; 
+            else             USel4[i] = 4'b0001; 
+          4: if(w2>=18)      USel4[i] = 4'b1000;
+            else if(w2>=6)   USel4[i] = 4'b0100; 
+            else if(w2>=-6)  USel4[i] = 4'b0000; 
+            else if(w2>=-18) USel4[i] = 4'b0010; 
+            else             USel4[i] = 4'b0001; 
+          5: if(w2>=20)      USel4[i] = 4'b1000;
+            else if(w2>=6)   USel4[i] = 4'b0100; 
+            else if(w2>=-8)  USel4[i] = 4'b0000; 
+            else if(w2>=-20) USel4[i] = 4'b0010; 
+            else             USel4[i] = 4'b0001; 
+          6: if(w2>=20)      USel4[i] = 4'b1000;
+            else if(w2>=8)   USel4[i] = 4'b0100; 
+            else if(w2>=-8)  USel4[i] = 4'b0000; 
+            else if(w2>=-22) USel4[i] = 4'b0010; 
+            else             USel4[i] = 4'b0001; 
+          7: if(w2>=24)      USel4[i] = 4'b1000; 
+            else if(w2>=8)   USel4[i] = 4'b0100; 
+            else if(w2>=-8)  USel4[i] = 4'b0000; 
+            else if(w2>=-22) USel4[i] = 4'b0010; 
+            else             USel4[i] = 4'b0001; 
+        endcase
+      end
+  end
+  always_comb // choose the table row A
+    if (Sqrt) begin 
+      if (j1) A = 3'b101; // j1 overrides Smsbs with a fixed row
+      else if (Smsbs == 5'b10000) A = 3'b111; // this exact Smsbs pattern maps to the last row
+      else A = Smsbs[2:0]; // otherwise the low three bits of Smsbs pick the row
+    end else A = Dmsbs; // division: the row comes from the top bits of D
+	assign u = USel4[{A,Wmsbs}]; // table lookup produces the digit code
+	
+endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@ -33,47 +33,47 @@
 /* verilator lint_off UNOPTFLAT */
 module fdivsqrtstage2 (
  input logic [`DIVN-2:0] D,
-  input logic [`DIVb+3:0]  DBar, D2, DBar2,
-  input logic [`DIVb:0] Q, QM,
-  input logic [`DIVb:0] S, SM,
+  input logic [`DIVb+3:0]  DBar, // selected as Dsel when the digit is +1 (presumably the inverted divisor - confirm)
+  input logic [`DIVb:0] U, UM, // passed to fgen2 and uotfc2
  input logic [`DIVb+3:0]  WS, WC,
-  input logic [`DIVb-1:0] C,
+  input logic [`DIVb+1:0] C, // mask word; CNext is C shifted right with a 1 inserted at the top
  input logic SqrtM,
-  output logic [`DIVb:0] QNext, QMNext, 
-  output logic qn,
-  output logic [`DIVb-1:0] CNext,
-  output logic [`DIVb:0] SNext, SMNext, 
+  output logic un, // digit -1 flag from qsel2
+  output logic [`DIVb+1:0] CNext,
+  output logic [`DIVb:0] UNext, UMNext, // updated U/UM from uotfc2
  output logic [`DIVb+3:0]  WSA, WCA
 );
 /* verilator lint_on UNOPTFLAT */

  logic [`DIVb+3:0]  Dsel;
-  logic qp, qz;
+  logic up, uz; // digit +1 and digit 0 flags from qsel2
  logic [`DIVb+3:0] F;
  logic [`DIVb+3:0] AddIn;

-  assign CNext = {1'b1, C[`DIVb-1:1]};
+  assign CNext = {1'b1, C[`DIVb+1:1]}; // shift C right by one and insert a 1 at the MSB

  // Quotient Selection logic
-  // Given partial remainder, select quotient of +1, 0, or -1 (qp, qz, pm)
+  // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un)
  // q encoding:
 	// 1000 = +2
 	// 0100 = +1
 	// 0000 =  0
 	// 0010 = -1
 	// 0001 = -2
-  qsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], qp, qz, qn);
-  fgen2 fgen2(.sp(qp), .sz(qz), .C(CNext), .S, .SM, .F);
+  fdivsqrtqsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], up, uz, un); // select on the top four bits of WS and WC
+  fdivsqrtfgen2 fgen2(.up, .uz, .C(CNext), .U, .UM, .F); // produces F, the addend used when SqrtM is set
+
+  always_comb // pick the divisor term that matches the selected digit
+    if      (up) Dsel = DBar; // up
+    else if (uz) Dsel = '0; // uz
+    else         Dsel = {3'b0, 1'b1, D, {`DIVb-`DIVN+1{1'b0}}}; // un
 
-  assign Dsel = {`DIVb+4{~qz}}&(qp ? DBar : {3'b0, 1'b1, D, {`DIVb-`DIVN+1{1'b0}}});
  // Partial Product Generation
  //  WSA, WCA = WS + WC - qD
  assign AddIn = SqrtM ? F : Dsel;
-  csa #(`DIVb+4) csa(WS, WC, AddIn, qp&~SqrtM, WSA, WCA);
+  csa #(`DIVb+4) csa(WS, WC, AddIn, up&~SqrtM, WSA, WCA); // carry-in is 1 only for digit +1 when SqrtM is 0
 
-  // *** dh 8/29/22: will need to trim down to just sotfc
-  otfc2 otfc2(.qp, .qz, .Q, .QM, .QNext, .QMNext);
-  sotfc2 sotfc2(.sp(qp), .sz(qz), .C(CNext), .S, .SM, .SNext, .SMNext);
+  fdivsqrtuotfc2 uotfc2(.up, .uz, .C(CNext), .U, .UM, .UNext, .UMNext); // on-the-fly update of U and UM
 endmodule


--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@ -34,42 +34,38 @@
 module fdivsqrtstage4 (
  input logic [`DIVN-2:0] D,
  input logic [`DIVb+3:0]  DBar, D2, DBar2,
-  input logic [`DIVb:0] Q, QM,
-  input logic [`DIVb:0] S, SM,
+  input logic [`DIVb:0] U, UM,
  input logic [`DIVb+3:0]  WS, WC,
-  input logic [`DIVb-1:0] C,
-  output logic [`DIVb-1:0] CNext,
+  input logic [`DIVb+1:0] C,
+  output logic [`DIVb+1:0] CNext,
  input logic SqrtM, j1,
-  output logic [`DIVb:0] QNext, QMNext, 
-  output logic qn,
-  output logic [`DIVb:0] SNext, SMNext, 
+  output logic un,
+  output logic [`DIVb:0] UNext, UMNext, 
  output logic [`DIVb+3:0]  WSA, WCA
 );
 /* verilator lint_on UNOPTFLAT */

  logic [`DIVb+3:0]  Dsel;
-  logic [3:0]     q;
+  logic [3:0]     u;
  logic [`DIVb+3:0] F;
  logic [`DIVb+3:0] AddIn;
  logic [4:0] Smsbs;
  logic CarryIn;
+  assign CNext = {2'b11, C[`DIVb+1:2]};

-  assign CNext = {2'b11, C[`DIVb-1:2]};
-
-  // Qmient Selection logic
-  // Given partial remainder, select quotient of +1, 0, or -1 (qp, qz, pm)
-  // q encoding:
+  // Digit Selection logic
+  // u encoding:
 	// 1000 = +2
 	// 0100 = +1
 	// 0000 =  0
 	// 0010 = -1
 	// 0001 = -2
-  assign Smsbs = S[`DIVb:`DIVb-4];
-  qsel4 qsel4(.D, .Smsbs, .WS, .WC, .Sqrt(SqrtM), .j1, .q);
-  fgen4 fgen4(.s(q), .C({4'b1111, CNext}), .S({3'b000, S}), .SM({3'b000, SM}), .F);
+  assign Smsbs = U[`DIVb:`DIVb-4];
+  fdivsqrtqsel4 qsel4(.D, .Smsbs, .WS, .WC, .Sqrt(SqrtM), .j1, .u);
+  fdivsqrtfgen4 fgen4(.u, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F);

  always_comb
-  case (q)
+  case (u)
    4'b1000: Dsel = DBar2;
    4'b0100: Dsel = DBar;
    4'b0000: Dsel = '0;
@ -81,11 +77,12 @@ module fdivsqrtstage4 (
  // Partial Product Generation
  //  WSA, WCA = WS + WC - qD
  assign AddIn = SqrtM ? F : Dsel;
-  assign CarryIn = ~SqrtM & (q[3] | q[2]); // +1 for 2's complement of -D and -2D 
+  assign CarryIn = ~SqrtM & (u[3] | u[2]); // +1 for 2's complement of -D and -2D 
  csa #(`DIVb+4) csa(WS, WC, AddIn, CarryIn, WSA, WCA);
 
-  otfc4 otfc4(.q, .Q, .QM, .QNext, .QMNext);
-  sotfc4 sotfc4(.s(q), .Sqrt(SqrtM), .C({1'b1, CNext}), .S, .SM, .SNext, .SMNext);
+  fdivsqrtuotfc4 fdivsqrtuotfc4(.u, .Sqrt(SqrtM), .C(CNext[`DIVb:0]), .U, .UM, .UNext, .UMNext);
+
+  assign un = 0; // unused for radix 4
 endmodule


--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
@ -0,0 +1,61 @@
+///////////////////////////////////////////
+// fdivsqrtuotfc2.sv
+//
+// Written: me@KatherineParry.com, cturek@hmc.edu 
+// Modified:7/14/2022
+//
+// Purpose: Radix 2 unified on-the-fly converter
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+///////////////////////////////
+// Unified OTFC, Radix 2 //
+///////////////////////////////
+module fdivsqrtuotfc2(
+  input  logic             up, uz,
+  input  logic [`DIVb+1:0] C,
+  input  logic [`DIVb:0]   U, UM,
+  output logic [`DIVb:0]   UNext, UMNext
+);
+  // Unified radix-2 on-the-fly conversion: merges the digit chosen in
+  // this step (up = +1, uz = 0, otherwise -1) into U and UM using only
+  // bitwise OR with a one-position mask derived from C.
+  logic [`DIVb:0] CLow;       // C without its top bit
+  logic [`DIVb:0] StepBit;    // bits of CLow whose next-lower bit is 0
+  logic [`DIVb:0] UWithBit;   // U  with StepBit set
+  logic [`DIVb:0] UMWithBit;  // UM with StepBit set
+
+  assign CLow      = C[`DIVb:0];
+  assign StepBit   = CLow & ~(CLow << 1);
+  assign UWithBit  = U  | StepBit;
+  assign UMWithBit = UM | StepBit;
+
+  always_comb
+    if (up) begin               // digit +1
+      UNext  = UWithBit;
+      UMNext = U;
+    end else if (uz) begin      // digit 0
+      UNext  = U;
+      UMNext = UMWithBit;
+    end else begin              // neither up nor uz, so the digit is -1
+      UNext  = UMWithBit;
+      UMNext = UM;
+    end
+
+endmodule
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
@ -0,0 +1,68 @@
+///////////////////////////////////////////
+// fdivsqrtuotfc4.sv
+//
+// Written: me@KatherineParry.com, cturek@hmc.edu 
+// Modified:7/14/2022
+//
+// Purpose: Radix 4 unified on-the-fly converter
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fdivsqrtuotfc4(
+  input  logic [3:0]     u,
+  input  logic           Sqrt,
+  input  logic [`DIVb:0] U, UM,
+  input  logic [`DIVb:0] C,
+  output logic [`DIVb:0] UNext, UMNext
+);
+  // Unified radix-4 on-the-fly conversion, shared by division and
+  // square root. The digit selected in this step is merged into U and
+  // UM with bitwise ORs of masks derived from C.
+  // u is tested in priority order u[3], u[2], u[1], u[0]; none set means digit 0.
+  // Sqrt is not referenced by this logic.
+
+  logic [`DIVb:0] CShift1, CShift2;       // C moved left by one and by two positions
+  logic [`DIVb:0] KOne, KTwo, KThree;
+
+  assign CShift1 = C << 1;
+  assign CShift2 = C << 2;
+  assign KOne    = C       & ~CShift1;    // K
+  assign KTwo    = CShift1 & ~CShift2;    // 2K
+  assign KThree  = C       & ~CShift2;    // 3K
+
+  always_comb
+    if      (u[3]) begin  UNext = U  | KTwo;    UMNext = U  | KOne;    end
+    else if (u[2]) begin  UNext = U  | KOne;    UMNext = U;            end
+    else if (u[1]) begin  UNext = UM | KThree;  UMNext = UM | KTwo;    end
+    else if (u[0]) begin  UNext = UM | KTwo;    UMNext = UM | KOne;    end
+    else           begin  UNext = U;            UMNext = UM | KThree;  end  // digit = 0
+
+endmodule
--- a/pipelined/src/fpu/fhazard.sv
+++ b/pipelined/src/fpu/fhazard.sv
@ -1,5 +1,5 @@
 ///////////////////////////////////////////
-// fpuhazard.sv
+// fhazard.sv
 //
 // Written: me@KatherineParry.com 19 May 2021
 // Modified: 
--- a/pipelined/src/fpu/flags.sv
+++ b/pipelined/src/fpu/flags.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// flags.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/fma/fma.sv
+++ b/pipelined/src/fpu/fma/fma.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fma.sv
 //
 // Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
 // Modified: 
--- a/pipelined/src/fpu/fma/fmaadd.sv
+++ b/pipelined/src/fpu/fma/fmaadd.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fmaadd.sv
 //
 // Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
 // Modified: 
--- a/pipelined/src/fpu/fma/fmaalign.sv
+++ b/pipelined/src/fpu/fma/fmaalign.sv
@ -1,5 +1,6 @@

 ///////////////////////////////////////////
+// fmaalign.sv
 //
 // Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
 // Modified: 
--- a/pipelined/src/fpu/fma/fmaexpadd.sv
+++ b/pipelined/src/fpu/fma/fmaexpadd.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fmaexpadd.sv
 //
 // Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
 // Modified: 
--- a/pipelined/src/fpu/fma/fmalza.sv
+++ b/pipelined/src/fpu/fma/fmalza.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fmalza.sv
 //
 // Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
 // Modified: 
--- a/pipelined/src/fpu/fma/fmamult.sv
+++ b/pipelined/src/fpu/fma/fmamult.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fmamult.sv
 //
 // Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
 // Modified: 
--- a/pipelined/src/fpu/fma/fmasign.sv
+++ b/pipelined/src/fpu/fma/fmasign.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fmasign.sv
 //
 // Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
 // Modified: 
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fpu.sv
 //
 // Written: me@KatherineParry.com, James Stine, Brett Mathis
 // Modified: 6/23/2021
@ -123,11 +124,10 @@ module fpu (
   logic [`CVTLEN-1:0]     CvtLzcInE, CvtLzcInM;      // input to the Leading Zero Counter (priority encoder)
   
   //divide signals
-   logic [`DIVb-(`RADIX/4):0] QmM;
+   logic [`DIVb:0]      QmM;
   logic [`NE+1:0]      QeE, QeM; 
   logic                DivSE, DivSM;
   logic                DivDoneM;
-   logic [`DURLEN-1:0]  EarlyTermShiftM;

   // result and flag signals
   logic [`XLEN-1:0] ClassResE;               // classify result
@ -260,7 +260,7 @@ module fpu (
   fdivsqrt fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
                  .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .DivStartE(DivStartE), .XsE,
                  .StallE, .StallM, .DivSM, .DivBusy(FDivBusyE), .QeM, //***change divbusyE to M signal
-                  .EarlyTermShiftM, .QmM, .DivDone(DivDoneM));
+                  .QmM, .DivDone(DivDoneM));
   // compare
   //    - fmin/fmax
   //    - flt/fle/feq
@ -364,7 +364,7 @@ module fpu (

   assign FpLoadStoreM = FResSelM[1];

-   postprocess postprocess(.Xs(XsM), .Ys(YsM), .Ze(ZeM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), .FmaPe(PeM), .DivEarlyTermShift(EarlyTermShiftM),
+   postprocess postprocess(.Xs(XsM), .Ys(YsM), .Ze(ZeM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), .FmaPe(PeM), 
                           .FmaZmS(ZmStickyM), .FmaKillProd(KillProdM), .XZero(XZeroM), .YZero(YZeroM), .ZZero(ZZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
                           .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SmM), .DivQe(QeM), .DivDone(DivDoneM),
                           .FmaNegSum(NegSumM), .FmaInvA(InvAM), .ZDenorm(ZDenormM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
--- a/pipelined/src/fpu/fregfile.sv
+++ b/pipelined/src/fpu/fregfile.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fregfile.sv
 //
 // Written: David_Harris@hmc.edu 9 January 2021
 // Modified: James Stine 
--- a/pipelined/src/fpu/fsgninj.sv
+++ b/pipelined/src/fpu/fsgninj.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fsgninj.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 6/23/2021
--- a/pipelined/src/fpu/normshift.sv
+++ b/pipelined/src/fpu/normshift.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// normshift.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
@ -65,7 +66,7 @@
    //              - plus 1 to shift out the first 1

 module normshift(
-    input logic  [$clog2(`NORMSHIFTSZ)-1:0]      ShiftAmt,   // normalization shift count
+    input logic  [`LOGNORMSHIFTSZ-1:0]      ShiftAmt,   // normalization shift count
    input logic  [`NORMSHIFTSZ-1:0]              ShiftIn,        // is the sum zero
    output logic [`NORMSHIFTSZ-1:0]             Shifted        // is the sum zero
 );
--- a/pipelined/src/fpu/otfc.sv
+++ b/pipelined/src/fpu/otfc.sv
@ -1,177 +0,0 @@
-///////////////////////////////////////////
-// otfc.sv
-//
-// Written: me@KatherineParry.com, cturek@hmc.edu 
-// Modified:7/14/2022
-//
-// Purpose: On the fly conversion
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// MIT LICENSE
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
-// software and associated documentation files (the "Software"), to deal in the Software 
-// without restriction, including without limitation the rights to use, copy, modify, merge, 
-// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
-// to whom the Software is furnished to do so, subject to the following conditions:
-//
-//   The above copyright notice and this permission notice shall be included in all copies or 
-//   substantial portions of the Software.
-//
-//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
-//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
-//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
-//   OR OTHER DEALINGS IN THE SOFTWARE.
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module otfc2 (
-  input  logic         qp, qz,
-  input  logic [`DIVb:0] Q, QM,
-  output logic [`DIVb:0] QNext, QMNext
-);
-  //  The on-the-fly converter transfers the quotient 
-  //  bits to the quotient as they come.
-  //  Use this otfc for division only.
-  logic [`DIVb-1:0] QR, QMR;
-
-  assign QR  = Q[`DIVb-1:0];
-  assign QMR = QM[`DIVb-1:0];     // Shifted Q and QM
-
-  always_comb begin
-    if (qp) begin
-      QNext  = {QR,  1'b1};
-      QMNext = {QR,  1'b0};
-    end else if (qz) begin
-      QNext  = {QR,  1'b0};
-      QMNext = {QMR, 1'b1};
-    end else begin        // If qp and qz are not true, then qn is
-      QNext  = {QMR, 1'b1};
-      QMNext = {QMR, 1'b0};
-    end 
-  end
-
-endmodule
-
-///////////////////////////////
-// Square Root OTFC, Radix 2 //
-///////////////////////////////
-module sotfc2(
-  input  logic         sp, sz,
-  input  logic [`DIVb-1:0] C,
-  input logic [`DIVb:0] S, SM,
-  output logic [`DIVb:0] SNext, SMNext
-);
-  //  The on-the-fly converter transfers the square root 
-  //  bits to the quotient as they come.
-  //  Use this otfc for division and square root.
-  logic [`DIVb:0] CExt;
-
-  assign CExt = {1'b1, C};
-
-  always_comb begin
-    if (sp) begin
-      SNext  = S | (CExt & ~(CExt << 1));
-      SMNext = S;
-    end else if (sz) begin
-      SNext  = S;
-      SMNext = SM | (CExt & ~(CExt << 1));
-    end else begin        // If sp and sz are not true, then sn is
-      SNext  = SM | (CExt & ~(CExt << 1));
-      SMNext = SM;
-    end 
-  end
-
-endmodule
-
-module otfc4 (
-  input  logic [3:0]   q,
-  input  logic [`DIVb:0] Q, QM,
-  output logic [`DIVb:0] QNext, QMNext
-);
-
-  //  The on-the-fly converter transfers the quotient 
-  //  bits to the quotient as they come. 
-  //
-  //  This code follows the psuedocode presented in the 
-  //  floating point chapter of the book. Right now, 
-  //  it is written for Radix-4 division.
-  //
-  //  QM is Q-1. It allows us to write negative bits 
-  //  without using a costly CPA. 
-
-  //  QR and QMR are the shifted versions of Q and QM.
-  //  They are treated as [N-1:r] size signals, and 
-  //  discard the r most significant bits of Q and QM. 
-  logic [`DIVb-2:0] QR, QMR;
-
-  // shift Q (quotent) and QM (quotent-1)
-		// if 	q = 2  	    Q = {Q, 10} 	QM = {Q, 01}		
-		// else if 	q = 1   Q = {Q, 01} 	QM = {Q, 00}	
-		// else if 	q = 0   Q = {Q, 00} 	QM = {QM, 11}	
-		// else if 	q = -1	Q = {QM, 11} 	QM = {QM, 10}
-		// else if 	q = -2	Q = {QM, 10} 	QM = {QM, 01}
-
-  assign QR  = Q[`DIVb-2:0];
-  assign QMR = QM[`DIVb-2:0];     // Shifted Q and QM
-  always_comb begin
-    if (q[3]) begin // +2
-      QNext  = {QR,  2'b10};
-      QMNext = {QR,  2'b01};
-    end else if (q[2]) begin // +1
-      QNext  = {QR,  2'b01};
-      QMNext = {QR,  2'b00};
-    end else if (q[1]) begin // -1
-      QNext  = {QMR,  2'b11};
-      QMNext = {QMR,  2'b10};
-    end else if (q[0]) begin // -2
-      QNext  = {QMR,  2'b10};
-      QMNext = {QMR,  2'b01};
-    end else begin           // 0
-      QNext  = {QR,  2'b00};
-      QMNext = {QMR, 2'b11};
-    end 
-  end
-  // Final Qmeint is in the range [.5, 2)
-
-endmodule
-
-///////////////////////////////
-// Square Root OTFC, Radix 4 //
-///////////////////////////////
-module sotfc4(
-  input  logic [3:0]   s,
-  input  logic         Sqrt,
-  input  logic [`DIVb:0] S, SM,
-  input  logic [`DIVb:0] C,
-  output logic [`DIVb:0] SNext, SMNext
-);
-  //  The on-the-fly converter transfers the square root 
-  //  bits to the quotient as they come.
-  //  Use this otfc for division and square root.
-
-  always_comb begin
-    if (s[3]) begin
-      SNext  = S | ((C << 1)&~(C << 2));
-      SMNext = S | (C&~(C << 1));
-    end else if (s[2]) begin
-      SNext  = S | (C&~(C << 1));
-      SMNext = S;
-    end else if (s[1]) begin
-      SNext  = SM | (C&~(C << 2));
-      SMNext = SM | ((C << 1)&~(C << 2));
-    end else if (s[0]) begin
-      SNext  = SM | ((C << 1)&~(C << 2));
-      SMNext = SM | (C&~(C << 1));
-    end else begin        // If sp and sn are not true, then sz is
-      SNext  = S;
-      SMNext = SM | (C & ~(C << 2));
-    end 
-  end
-
-endmodule
--- a/pipelined/src/fpu/postproc/cvtshiftcalc.sv
+++ b/pipelined/src/fpu/postproc/cvtshiftcalc.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// cvtshiftcalc.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/postproc/divshiftcalc.sv
+++ b/pipelined/src/fpu/postproc/divshiftcalc.sv
@ -0,0 +1,80 @@
+///////////////////////////////////////////
+// divshiftcalc.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: Division shift calculation
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module divshiftcalc(
+    input logic  [`DIVb:0] DivQm,                    // divide/sqrt result bits fed to the shifter
+    input logic  [`FMTBITS-1:0] Fmt,                 // not referenced in this module
+    input logic Sqrt,                                // not referenced in this module
+    input logic [`NE+1:0] DivQe,                     // divide/sqrt exponent; bit NE+1 is tested as the sign
+    output logic [`LOGNORMSHIFTSZ-1:0] DivShiftAmt,  // selected left-shift amount
+    output logic [`NORMSHIFTSZ-1:0] DivShiftIn,      // DivQm positioned for the shifter
+    output logic DivResDenorm,                       // set when DivQe is negative or zero
+    output logic DivDenormShiftPos                   // set when NF + DivQe is non-negative
+);
+    // Computes the shift amount and the shifter input used to
+    // normalize a divide/square-root result.
+    logic [`LOGNORMSHIFTSZ-1:0] NormShift, DivDenormShiftAmt;
+    logic [`NE+1:0] DivDenormShift;
+
+    // is the result denormalized
+    // the result is treated as denormalized when the exponent DivQe is negative or zero
+    assign DivResDenorm = DivQe[`NE+1]|(~|DivQe[`NE+1:0]);
+
+    // if the result is denormalized
+    //  00000000x.xxxxxx...                     Exp = DivQe
+    //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
+    //  .00xxxxxxxxxxxxx... << DivQe+NF+1  Exp = +1
+    //  .0000xxxxxxxxxxx... >> 1                Exp = 1
+    // Left shift amount  = DivQe+NF+1-1
+    assign DivDenormShift = (`NE+2)'(`NF)+DivQe;
+    assign DivDenormShiftPos = ~DivDenormShift[`NE+1];
+
+    // if the result is normalized
+    //  00000000x.xxxxxx...                     Exp = DivQe
+    //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
+    //  00000000.xxxxxxx... << NF               Exp = DivQe+1
+    //  00000000x.xxxxxx... << NF               Exp = DivQe (extra shift done afterwards)
+    //  00000000xx.xxxxx... << 1?               Exp = DivQe-1 (determined after)
+    // initial left shift amount = NF
+    // shift one more if it's a minimally redundant radix 4 - one entire cycle needed for integer bit
+    assign NormShift = (`LOGNORMSHIFTSZ)'(`NF);
+
+    // if the shift amount is negative then don't shift (keep sticky bit)
+    // denormalized results use the clamped denormal shift; all other results shift by NF
+    assign DivDenormShiftAmt = DivDenormShiftPos ? DivDenormShift[`LOGNORMSHIFTSZ-1:0] : '0;
+    assign DivShiftAmt = DivResDenorm ? DivDenormShiftAmt : NormShift;
+
+    // radix 4 takes one fewer bit of DivQm and pads one more zero on the right than radix 2
+    // NOTE(review): if the width macros are fully parenthesized, both concatenations are
+    // NORMSHIFTSZ+2 bits wide and the assignment drops the two top zeros - confirm against wally-config.vh
+    if (`RADIX == 4)
+        assign DivShiftIn = {{`NF{1'b0}}, DivQm[`DIVb-1:0], {`NORMSHIFTSZ-`DIVb+2-`NF{1'b0}}};
+    else
+        assign DivShiftIn = {{`NF{1'b0}}, DivQm, {`NORMSHIFTSZ-`DIVb+1-`NF{1'b0}}};
+endmodule
--- a/pipelined/src/fpu/postproc/fmashiftcalc.sv
+++ b/pipelined/src/fpu/postproc/fmashiftcalc.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// fmashiftcalc.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/postproc/negateintres.sv
+++ b/pipelined/src/fpu/postproc/negateintres.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// negateintres.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/postproc/postprocess.sv
+++ b/pipelined/src/fpu/postproc/postprocess.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// postprocess.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
@ -56,11 +57,10 @@ module postprocess (
    input logic                             FmaSs,
    input logic  [$clog2(3*`NF+7)-1:0]      FmaSCnt,   // the normalization shift count
    //divide signals
-    input logic  [`DURLEN-1:0]              DivEarlyTermShift,
    input logic                             DivS,
    input logic                             DivDone,
    input logic  [`NE+1:0]                  DivQe,
-    input logic  [`DIVb-(`RADIX/4):0]                DivQm,
+    input logic  [`DIVb:0]                  DivQm,
    // conversion signals
    input logic                             CvtCs,     // the result's sign
    input logic  [`NE:0]                    CvtCe,    // the calculated expoent
@ -84,7 +84,7 @@ module postprocess (
    logic [`CORRSHIFTSZ-1:0] Mf; // corectly shifted fraction
    logic [`NE+1:0] FullRe;  // Re with bits to determine sign and overflow
    logic UfPlus1;                    // do you add one (for determining underflow flag)
-    logic [$clog2(`NORMSHIFTSZ)-1:0] ShiftAmt;   // normalization shift count
+    logic [`LOGNORMSHIFTSZ-1:0] ShiftAmt;   // normalization shift count
    logic [`NORMSHIFTSZ-1:0] ShiftIn;        // is the sum zero
    logic [`NORMSHIFTSZ-1:0] Shifted;    // the shifted result
    logic Plus1;      // add one to the final result?
@ -99,12 +99,12 @@ module postprocess (
    logic FmaPreResultDenorm;    // is the result denormalized - calculated before LZA corection
    logic [$clog2(3*`NF+7)-1:0] FmaShiftAmt;   // normalization shift count
    // division singals
-    logic [$clog2(`NORMSHIFTSZ)-1:0] DivShiftAmt;
+    logic [`LOGNORMSHIFTSZ-1:0] DivShiftAmt;
    logic [`NORMSHIFTSZ-1:0] DivShiftIn;
    logic [`NE+1:0] Qe;
    logic DivByZero;
    logic DivResDenorm;
-    logic [`NE+1:0] DivDenormShift;
+    logic DivDenormShiftPos;
    // conversion signals
    logic [`CVTLEN+`NF:0] CvtShiftIn;    // number to be shifted
    logic [1:0] CvtNegResMsbs;
@ -152,16 +152,16 @@ module postprocess (
                              .XZero, .IntToFp, .OutFmt, .CvtResUf, .CvtShiftIn);
    fmashiftcalc fmashiftcalc(.FmaSm, .Ze, .FmaPe, .FmaSCnt, .Fmt, .FmaKillProd, .NormSumExp, .FmaSe,
                          .FmaSZero, .FmaPreResultDenorm, .FmaShiftAmt, .FmaShiftIn);
-    divshiftcalc divshiftcalc(.Fmt, .Sqrt, .DivQe, .DivQm, .DivEarlyTermShift, .DivResDenorm, .DivDenormShift, .DivShiftAmt, .DivShiftIn);
+    divshiftcalc divshiftcalc(.Fmt, .Sqrt, .DivQe, .DivQm, .DivResDenorm, .DivDenormShiftPos, .DivShiftAmt, .DivShiftIn);

    always_comb
        case(PostProcSel)
            2'b10: begin // fma
-                ShiftAmt = {{$clog2(`NORMSHIFTSZ)-$clog2(3*`NF+7){1'b0}}, FmaShiftAmt};
+                ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(3*`NF+7){1'b0}}, FmaShiftAmt};
                ShiftIn =  {FmaShiftIn, {`NORMSHIFTSZ-(3*`NF+8){1'b0}}};
            end
            2'b00: begin // cvt
-                ShiftAmt = {{$clog2(`NORMSHIFTSZ)-$clog2(`CVTLEN+1){1'b0}}, CvtShiftAmt};
+                ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(`CVTLEN+1){1'b0}}, CvtShiftAmt};
                ShiftIn =  {CvtShiftIn, {`NORMSHIFTSZ-`CVTLEN-`NF-1{1'b0}}};
            end
            2'b01: begin //div
@ -174,7 +174,7 @@ module postprocess (
                end
            end
            default: begin 
-                ShiftAmt = {$clog2(`NORMSHIFTSZ){1'bx}}; 
+                ShiftAmt = {`LOGNORMSHIFTSZ{1'bx}}; 
                ShiftIn = {`NORMSHIFTSZ{1'bx}}; 
            end
        endcase
@ -182,7 +182,7 @@ module postprocess (
    normshift normshift (.ShiftIn, .ShiftAmt, .Shifted);

    shiftcorrection shiftcorrection(.FmaOp, .FmaPreResultDenorm, .NormSumExp,
-                                .DivResDenorm, .DivDenormShift, .DivOp, .DivQe,
+                                .DivResDenorm, .DivDenormShiftPos, .DivOp, .DivQe,
                                .Qe, .FmaSZero, .Shifted, .FmaMe, .Mf);

    ///////////////////////////////////////////////////////////////////////////////
--- a/pipelined/src/fpu/postproc/resultsign.sv
+++ b/pipelined/src/fpu/postproc/resultsign.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// resultsign.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/postproc/round.sv
+++ b/pipelined/src/fpu/postproc/round.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// round.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/postproc/roundsign.sv
+++ b/pipelined/src/fpu/postproc/roundsign.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// roundsign.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/postproc/shiftcorrection.sv
+++ b/pipelined/src/fpu/postproc/shiftcorrection.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// shiftcorrection.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
@ -34,7 +35,7 @@ module shiftcorrection(
    input logic                     DivOp,
    input logic                     DivResDenorm,
    input logic  [`NE+1:0]          DivQe,
-    input logic  [`NE+1:0]          DivDenormShift,
+    input logic                     DivDenormShiftPos,
    input logic  [`NE+1:0]          NormSumExp,          // exponent of the normalized sum not taking into account denormal or zero results
    input logic                     FmaPreResultDenorm,    // is the result denormalized - calculated before LZA corection
    input logic                     FmaSZero,
@ -66,5 +67,5 @@ module shiftcorrection(

    // the quotent is in the range [.5,2) if there is no early termination
    // if the quotent < 1 and not denormal then subtract 1 to account for the normalization shift
-    assign Qe = ((DivResDenorm)&~DivDenormShift[`NE+1]) ? (`NE+2)'(0) : DivQe - {(`NE+1)'(0), ~LZAPlus1};
+    assign Qe = (DivResDenorm & DivDenormShiftPos) ? '0 : DivQe - {(`NE+1)'(0), ~LZAPlus1};
 endmodule
--- a/pipelined/src/fpu/postproc/specialcase.sv
+++ b/pipelined/src/fpu/postproc/specialcase.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// specialcase.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/qsel.sv
+++ b/pipelined/src/fpu/qsel.sv
@ -1,277 +0,0 @@
-///////////////////////////////////////////
-// srt.sv
-//
-// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
-// Modified:13 January 2022
-//
-// Purpose: Combined Divide and Square Root Floating Point and Integer Unit
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// MIT LICENSE
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
-// software and associated documentation files (the "Software"), to deal in the Software 
-// without restriction, including without limitation the rights to use, copy, modify, merge, 
-// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
-// to whom the Software is furnished to do so, subject to the following conditions:
-//
-//   The above copyright notice and this permission notice shall be included in all copies or 
-//   substantial portions of the Software.
-//
-//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
-//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
-//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
-//   OR OTHER DEALINGS IN THE SOFTWARE.
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module qsel2 ( // *** eventually just change to 4 bits
-  input  logic [3:0] ps, pc, 
-  output logic         qp, qz, qn
-);
- 
-  logic [3:0]  p, g;
-  logic          magnitude, sign, cout;
-
-  // The quotient selection logic is presented for simplicity, not
-  // for efficiency.  You can probably optimize your logic to
-  // select the proper divisor with less delay.
-
-  // Qmient equations from EE371 lecture notes 13-20
-  assign p = ps ^ pc;
-  assign g = ps & pc;
-
-  //assign magnitude = ~(&p[2:0]);
-  assign cout = g[2] | (p[2] & (g[1] | p[1] & g[0]));
-  //assign sign = p[3] ^ cout;
-  assign magnitude = ~((ps[2]^pc[2]) & (ps[1]^pc[1]) & 
-			  (ps[0]^pc[0]));
-  assign sign = (ps[3]^pc[3])^
-      (ps[2] & pc[2] | ((ps[2]^pc[2]) &
-			    (ps[1]&pc[1] | ((ps[1]^pc[1]) &
-						(ps[0]&pc[0])))));
-
-  // Produce quotient = +1, 0, or -1
-  assign qp = magnitude & ~sign;
-  assign qz = ~magnitude;
-  assign qn = magnitude & sign;
-endmodule
-
-////////////////////////////////////
-// Adder Input Generation, Radix 2 //
-////////////////////////////////////
-module fgen2 (
-  input  logic sp, sz,
-  input  logic [`DIVb-1:0] C,
-  input  logic [`DIVb:0] S, SM,
-  output logic [`DIVb+3:0] F
-);
-  logic [`DIVb+3:0] FP, FN, FZ;
-  logic [`DIVb+3:0] SExt, SMExt, CExt;
-
-  assign SExt = {3'b0, S};
-  assign SMExt = {3'b0, SM};
-  assign CExt = {4'hf, C}; // extend C from U0.k to Q4.k
-
-  // Generate for both positive and negative bits
-  assign FP = ~(SExt << 1) & CExt;
-  assign FN = (SMExt << 1) | (CExt & ~(CExt << 2));
-  assign FZ = '0;
-
-  // Choose which adder input will be used
-
-  always_comb
-    if (sp)       F = FP;
-    else if (sz)  F = FZ;
-    else          F = FN;
-
-endmodule
-
-module qsel4 (
-	input logic [`DIVN-2:0] D,
-  input logic [4:0] Smsbs,
-	input logic [`DIVb+3:0] WS, WC,
-  input logic Sqrt, j1,
-	output logic [3:0] q
-);
-	logic [6:0] Wmsbs;
-	logic [7:0] PreWmsbs;
-	logic [2:0] Dmsbs, A;
-
-	assign PreWmsbs = WC[`DIVb+3:`DIVb-4] + WS[`DIVb+3:`DIVb-4];
-	assign Wmsbs = PreWmsbs[7:1];
-	assign Dmsbs = D[`DIVN-2:`DIVN-4];//|{3{D[`DIVN-2]&Sqrt}};
-	// D = 0001.xxx...
-	// Dmsbs = |   |
-  // W =      xxxx.xxx...
-	// Wmsbs = |        |
-
-	logic [3:0] QSel4[1023:0];
-
-  always_comb begin 
-    integer a, w, i, w2;
-    for(a=0; a<8; a++)
-      for(w=0; w<128; w++)begin
-        i = a*128+w;
-        w2 = w-128*(w>=64); // convert to two's complement
-        case(a)
-          0: if($signed(w2)>=$signed(12))      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-4)  QSel4[i] = 4'b0000; 
-            else if(w2>=-13) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          1: if(w2>=14)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100;  
-            else if(w2>=-4)  QSel4[i] = 4'b0000; 
-            else if(w2>=-14) QSel4[i] = 4'b0010;  
-            else             QSel4[i] = 4'b0001; 
-          2: if(w2>=16)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-16) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          3: if(w2>=16)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-17) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          4: if(w2>=18)      QSel4[i] = 4'b1000;
-            else if(w2>=6)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-18) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          5: if(w2>=20)      QSel4[i] = 4'b1000;
-            else if(w2>=6)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-20) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          6: if(w2>=20)      QSel4[i] = 4'b1000;
-            else if(w2>=8)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-22) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          7: if(w2>=24)      QSel4[i] = 4'b1000; 
-            else if(w2>=8)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-22) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-        endcase
-      end
-  end
-  always_comb
-    if (Sqrt) begin 
-      if (j1) A = 3'b101;
-      else if (Smsbs == 5'b10000) A = 3'b111;
-      else A = Smsbs[2:0];
-    end else A = Dmsbs;
-	assign q = QSel4[{A,Wmsbs}];
-	
-endmodule
-
-// qsel4old was working for divide
-module qsel4old (
-	input logic [`DIVN-2:0] D,
-	input logic [`DIVb+3:0] WS, WC,
-  input logic Sqrt,
-	output logic [3:0] q
-);
-	logic [6:0] Wmsbs;
-	logic [7:0] PreWmsbs;
-	logic [2:0] Dmsbs;
-	assign PreWmsbs = WC[`DIVb+3:`DIVb-4] + WS[`DIVb+3:`DIVb-4];
-	assign Wmsbs = PreWmsbs[7:1];
-	assign Dmsbs = D[`DIVN-2:`DIVN-4];//|{3{D[`DIVN-2]&Sqrt}};
-	// D = 0001.xxx...
-	// Dmsbs = |   |
-  // W =      xxxx.xxx...
-	// Wmsbs = |        |
-
-	logic [3:0] QSel4[1023:0];
-
-  always_comb begin 
-    integer d, w, i, w2;
-    for(d=0; d<8; d++)
-      for(w=0; w<128; w++)begin
-        i = d*128+w;
-        w2 = w-128*(w>=64); // convert to two's complement
-        case(d)
-          0: if($signed(w2)>=$signed(12))      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-4)  QSel4[i] = 4'b0000; 
-            else if(w2>=-13) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          1: if(w2>=14)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-5)  QSel4[i] = 4'b0000; // was -6
-            else if(~Sqrt&(w2>=-15)) QSel4[i] = 4'b0010; // divide case
-            else if( Sqrt&(w2>=-14)) QSel4[i] = 4'b0010; // sqrt case
-            else             QSel4[i] = 4'b0001; 
-          2: if(w2>=15)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-16) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          3: if(w2>=16)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-17) QSel4[i] = 4'b0010; // was -18
-            else             QSel4[i] = 4'b0001; 
-          4: if(w2>=18)      QSel4[i] = 4'b1000;
-            else if(w2>=6)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; // was -8
-            else if(~Sqrt&(w2>=-20)) QSel4[i] = 4'b0010; // divide case
-            else if( Sqrt&(w2>=-18)) QSel4[i] = 4'b0010; // sqrt case
-            else             QSel4[i] = 4'b0001; 
-          5: if(w2>=20)      QSel4[i] = 4'b1000;
-            else if(w2>=6)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-20) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          6: if(w2>=20)      QSel4[i] = 4'b1000;
-            else if(w2>=8)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-22) QSel4[i] = 4'b0010; 
-            else             QSel4[i] = 4'b0001; 
-          7: if(w2>=22)      QSel4[i] = 4'b1000; // was 24
-            else if(w2>=8)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-23) QSel4[i] = 4'b0010; // was -24 ***use -22
-            else             QSel4[i] = 4'b0001; 
-        endcase
-      end
-  end
-	assign q = QSel4[{Dmsbs,Wmsbs}];
-	
-endmodule
-
-////////////////////////////////////
-// Adder Input Generation, Radix 4 //
-////////////////////////////////////
-module fgen4 (
-  input  logic [3:0] s,
-  input  logic [`DIVb+3:0] C, S, SM,
-  output logic [`DIVb+3:0] F
-);
-  logic [`DIVb+3:0] F2, F1, F0, FN1, FN2;
-  
-  // Generate for both positive and negative bits
-  assign F2  = (~S << 2) & (C << 2);
-  assign F1  = ~(S << 1) & C;
-  assign F0  = '0;
-  assign FN1 = (SM << 1) | (C & ~(C << 3));
-  assign FN2 = (SM << 2) | ((C << 2)&~(C << 4));
-
-  // Choose which adder input will be used
-
-  always_comb
-    if (s[3])       F = F2;
-    else if (s[2])  F = F1;
-    else if (s[1])  F = FN1;
-    else if (s[0])  F = FN2;
-    else            F = F0;
-endmodule
--- a/pipelined/src/fpu/unpack.sv
+++ b/pipelined/src/fpu/unpack.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// unpack.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/fpu/unpackinput.sv
+++ b/pipelined/src/fpu/unpackinput.sv
@ -1,4 +1,5 @@
 ///////////////////////////////////////////
+// unpackinput.sv
 //
 // Written: me@KatherineParry.com
 // Modified: 7/5/2022
--- a/pipelined/src/uncore/uartPC16550D.sv
+++ b/pipelined/src/uncore/uartPC16550D.sv
@ -82,7 +82,7 @@ module uartPC16550D(
  logic 	   DLAB; // Divisor Latch Access Bit (LCR bit 7)

  // Baud and rx/tx timing
-  logic 	   baudpulse, txbaudpulse, rxbaudpulse; // high one system clk cycle each baud/16 period
+  (* mark_debug = "true" *) logic 	   baudpulse, txbaudpulse, rxbaudpulse; // high one system clk cycle each baud/16 period
  logic [16+`UART_PRESCALE-1:0] baudcount;
  logic [3:0] 					rxoversampledcnt, txoversampledcnt; // count oversampled-by-16
  logic [3:0] 					rxbitsreceived, txbitssent;
@ -90,8 +90,8 @@ module uartPC16550D(

  // shift registrs and FIFOs
  logic [9:0] 					rxshiftreg;
-  logic [10:0] 					rxfifo[15:0];
-  logic [7:0] 					txfifo[15:0];
+  (* mark_debug = "true" *) logic [10:0] 					rxfifo[15:0];
+  (* mark_debug = "true" *) logic [7:0] 					txfifo[15:0];
  logic [4:0] 					rxfifotailunwrapped;
 (* mark_debug = "true" *)  logic [3:0] 					rxfifohead, rxfifotail, txfifohead, txfifotail, rxfifotriggerlevel;
 (* mark_debug = "true" *)  logic [3:0] 					rxfifoentries, txfifoentries;
@ -99,7 +99,7 @@ module uartPC16550D(

  // receive data
   (* mark_debug = "true" *)  logic [10:0] 					RXBR;
-  logic [6:0] 					rxtimeoutcnt;
+  (* mark_debug = "true" *) logic [6:0] 					rxtimeoutcnt;
  logic 						rxcentered;
  logic 						rxparity, rxparitybit, rxstopbit;
   (* mark_debug = "true" *)  logic 						rxparityerr, rxoverrunerr, rxframingerr, rxbreak, rxfifohaserr;
@ -107,16 +107,16 @@ module uartPC16550D(
 (* mark_debug = "true" *)  logic 						rxfifoempty, rxfifotriggered, rxfifotimeout;
  logic 						rxfifodmaready;
  logic [8:0] 					rxdata9;
-  logic [7:0] 					rxdata;
-  logic [15:0] 					RXerrbit, rxfullbit;
-  logic [31:0] 					rxfullbitunwrapped;
+  (* mark_debug = "true" *) logic [7:0] 					rxdata;
+  (* mark_debug = "true" *) logic [15:0] 					RXerrbit, rxfullbit;
+  (* mark_debug = "true" *) logic [31:0] 					rxfullbitunwrapped;

  // transmit data
  logic [7:0] 					TXHR, nexttxdata;
-  logic [11:0] 					txdata, txsr;
-  logic 						txnextbit, txhrfull, txsrfull;
+  (* mark_debug = "true" *) logic [11:0] 					txdata, txsr;
+  (* mark_debug = "true" *) logic 						txnextbit, txhrfull, txsrfull;
  logic 						txparity;
-  logic 						txfifoempty, txfifofull, txfifodmaready;
+  (* mark_debug = "true" *) logic 						txfifoempty, txfifofull, txfifodmaready;

  // control signals
 (* mark_debug = "true" *)  logic 						fifoenabled, fifodmamodesel, evenparitysel;
@ -154,7 +154,7 @@ module uartPC16550D(
 		//DLL <= #1 8'd38; // 35Mhz
 		//DLL <= #1 8'd11; // 10 Mhz
 		//DLL <= #1 8'd33; // 30 Mhz
-		DLL <= #1 8'd8; // 30 Mhz 230400
+		DLL <= #1 8'd11; // NOTE(review): 30 MHz / 230400 baud / 16 ~= 8, not 11 -- confirm intended clock/baud for this divisor
 		DLM <= #1 8'b0;
      end else begin
 		DLL <= #1 8'd1; // this cannot be zero with DLM also zer0.
@ -178,7 +178,7 @@ module uartPC16550D(
 		  // freq /baud / 16 = div
          //3'b000: if (DLAB) DLL <= #1 8'd38; //else TXHR <= #1 Din; // TX handled in TX register/FIFO section
 		  //3'b000: if (DLAB) DLL <= #1 8'd11; //else TXHR <= #1 Din; // TX handled in
-		      3'b000: if (DLAB) DLL <= #1 8'd8; //else TXHR <= #1 Din; // TX handled in 		  
+		      3'b000: if (DLAB) DLL <= #1 8'd11; //else TXHR <= #1 Din; // TX handled in 		  
          3'b001: if (DLAB) DLM <= #1 8'b0; else IER <= #1 Din[3:0];
          3'b010: FCR <= #1 {Din[7:6], 2'b0, Din[3], 2'b0, Din[0]}; // Write only FIFO Control Register; 4:5 reserved and 2:1 self-clearing
          3'b011: LCR <= #1 Din;
@ -275,7 +275,7 @@ module uartPC16550D(
        rxstate <= #1 UART_ACTIVE;
        rxoversampledcnt <= #1 0;
        rxbitsreceived <= #1 0;
-        rxtimeoutcnt <= #1 0; // reset timeout when new character is arriving
+        if (~rxfifotimeout) rxtimeoutcnt <= #1 0; // reset timeout when new character is arriving. Jacob Pease: Only if the timeout was not already reached. p.16 PC16550D.pdf
      end else if (rxbaudpulse & (rxstate == UART_ACTIVE)) begin
        rxoversampledcnt <= #1 rxoversampledcnt + 1;  // 16x oversampled counter
        if (rxcentered) rxbitsreceived <= #1 rxbitsreceived + 1;
@ -357,8 +357,8 @@ module uartPC16550D(
                         (rxfifohead + 16 - rxfifotail);
  // verilator lint_on WIDTH
  assign rxfifotriggered = rxfifoentries >= rxfifotriggerlevel;
-  //assign rxfifotimeout = rxtimeoutcnt[6]; // time out after 4 character periods; *** probably not right yet
-  assign rxfifotimeout = 0; // disabled pending fix
+  assign rxfifotimeout = rxtimeoutcnt[6]; // time out after 4 character periods; *** probably not right yet
+  //assign rxfifotimeout = 0; // disabled pending fix

  // detect any errors in rx fifo
  // although rxfullbit looks like a combinational loop, in one bit rxfifotail == i and breaks the loop
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@ -80,9 +80,8 @@ module testbenchfp;
  logic CvtResSgnE;
  logic [`NE:0]           CvtCalcExpE;    // the calculated expoent
 	logic [`LOGCVTLEN-1:0] CvtShiftAmtE;  // how much to shift by
-	logic [`DIVb-(`RADIX/4):0] Quot;
+	logic [`DIVb:0] Quot;
  logic CvtResDenormUfE;
-  logic [`DURLEN-1:0] EarlyTermShift;
  logic DivStart, DivBusy;
  logic reset = 1'b0;
  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
@ -575,13 +574,20 @@ module testbenchfp;
      end
      if (TEST === "div"   | TEST === "all") begin // if division is being tested
        // add the correct tests/op-ctrls/unit/fmt to their lists
-        Tests = {Tests, f16div};
+        Tests = {f16div, Tests};
+        OpCtrl = {`DIV_OPCTRL, OpCtrl};
+        WriteInt = {1'b0, WriteInt};
+        for(int i = 0; i<5; i++) begin
+          Unit = {`DIVUNIT, Unit};
+          Fmt = {2'b10, Fmt};
+        end
+        /* Tests = {Tests, f16div};
        OpCtrl = {OpCtrl, `DIV_OPCTRL};
        WriteInt = {WriteInt, 1'b0};
        for(int i = 0; i<5; i++) begin
          Unit = {Unit, `DIVUNIT};
          Fmt = {Fmt, 2'b10};
-        end
+        end */
      end
      if (TEST === "sqrt"  | TEST === "all") begin // if sqrt is being tested
        // add the correct tests/op-ctrls/unit/fmt to their lists
@ -694,7 +700,7 @@ module testbenchfp;
              .XInf(XInf), .YInf(YInf), .ZInf(ZInf), .CvtCs(CvtResSgnE), .ToInt(WriteIntVal),
              .XSNaN(XSNaN), .YSNaN(YSNaN), .ZSNaN(ZSNaN), .CvtLzcIn(CvtLzcInE), .IntZero,
              .FmaKillProd(KillProd), .FmaZmS(ZmSticky), .FmaPe(Pe), .DivDone, .FmaSe(Se),
-              .FmaSm(Sm), .FmaNegSum(NegSum), .FmaInvA(InvA), .FmaSCnt(SCnt), .DivEarlyTermShift(EarlyTermShift), .FmaAs(As), .FmaPs(Ps), .Fmt(ModFmt), .Frm(FrmVal), 
+              .FmaSm(Sm), .FmaNegSum(NegSum), .FmaInvA(InvA), .FmaSCnt(SCnt), .FmaAs(As), .FmaPs(Ps), .Fmt(ModFmt), .Frm(FrmVal), 
              .PostProcFlg(Flg), .PostProcRes(FpRes), .FCvtIntRes(IntRes));
  
  if (TEST === "cvtfp" | TEST === "cvtint" | TEST === "all") begin : fcvt
@ -712,7 +718,7 @@ module testbenchfp;
    fdivsqrt fdivsqrt(.clk, .reset, .XsE(Xs), .FmtE(ModFmt), .XmE(Xm), .YmE(Ym), .XeE(Xe), .YeE(Ye), .SqrtE(OpCtrlVal[0]), .SqrtM(OpCtrlVal[0]),
                    .XInfE(XInf), .YInfE(YInf), .XZeroE(XZero), .YZeroE(YZero), .XNaNE(XNaN), .YNaNE(YNaN), .DivStartE(DivStart), 
                    .StallE(1'b0), .StallM(1'b0), .DivSM(DivSticky), .DivBusy, .QeM(DivCalcExp),
-                    .EarlyTermShiftM(EarlyTermShift), .QmM(Quot), .DivDone);
+                    .QmM(Quot), .DivDone);
  end

  assign CmpFlg[3:0] = 0;
@ -801,6 +807,8 @@ always_comb begin
    endcase
 end

+  logic ResMatch, FlagMatch, CheckNow;
+
 // check results on falling edge of clk
 always @(negedge clk) begin

@ -870,7 +878,11 @@ always @(negedge clk) begin
    // check if result is correct
    //  - wait till the division result is done or one extra cylcle for early termination (to simulate the EM pipline stage)
   // if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&~((DivBusy===1'b1)|DivStart)&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT)) begin
-    if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&(DivDone | (TEST != "sqrt" & TEST != "div"))&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT)) begin
+    ResMatch = (Res === Ans | NaNGood | NaNGood === 1'bx); // result equals expected, or NaN accepted / NaNGood is X; blocking assignment instead of procedural continuous 'assign'
+    FlagMatch = (ResFlg === AnsFlg | AnsFlg === 5'bx); // flags equal expected, or expected flags are all X
+    CheckNow = (DivDone | (TEST != "sqrt" & TEST != "div"))&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT); // div/sqrt tests wait for DivDone; CVTINT and CMP units are excluded from this check
+    if(~(ResMatch & FlagMatch) & CheckNow) begin
+//    if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&(DivDone | (TEST != "sqrt" & TEST != "div"))&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT)) begin
      errors += 1;
      $display("Error in %s", Tests[TestNum]);
      $display("inputs: %h %h %h\nSrcA: %h\n Res: %h %h\n Expected: %h %h", X, Y, Z, SrcA, Res, ResFlg, Ans, AnsFlg);