Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally into main

2022-07-18 13:30:50 -07:00 · 2022-07-18 13:30:50 -07:00 · a190bc4471
commit a190bc4471
parent 877d0b7364 c65aa54a1e
56 changed files with 1103 additions and 3474 deletions
--- a/pipelined/config/rv64fp/wally-config.vh
+++ b/pipelined/config/rv64fp/wally-config.vh
@ -39,7 +39,7 @@

 // MISA RISC-V configuration per specification
 //                    ZYXWVUTSRQPONMLKJIHGFEDCBA
-`define MISA 32'b0000000000101000001000100100101
+`define MISA 32'b0000000000101000001000100101101
 `define ZICSR_SUPPORTED 1
 `define ZIFENCEI_SUPPORTED 1
 `define COUNTERS 32
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@ -95,11 +95,25 @@

 // largest length in IEU/FPU
 `define CVTLEN ((`NF<`XLEN) ? (`XLEN) : (`NF))
-`define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF))
 `define LLEN ((`FLEN<`XLEN) ? (`XLEN) : (`FLEN))
 `define LOGCVTLEN $unsigned($clog2(`CVTLEN+1))
-`define NORMSHIFTSZ ((`DIVLEN+`NF+3) > (3*`NF+8) ? (`DIVLEN+`NF+3) : (3*`NF+9))
-`define CORRSHIFTSZ ((`DIVLEN+`NF+3) > (3*`NF+8) ? (`DIVLEN+`NF+3) : (3*`NF+6))
+`define NORMSHIFTSZ ((`QLEN+`NF+3) > (3*`NF+8) ? (`QLEN+`NF+1) : (3*`NF+9))
+`define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+8) ? (`DIVRESLEN+`NF) : (3*`NF+6))
+
+// division constants
+`define RADIX 32'h2
+`define DIVCOPIES 32'h1
+`define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF + 3))
+`define EXTRAFRACBITS ((`NF<(`XLEN)) ? (`XLEN - `NF) : 3)
+`define EXTRAINTBITS ((`NF<(`XLEN)) ? 0 : (`NF - `XLEN + 3))
+`define DIVRESLEN ((`NF>`XLEN) ? `NF+4 : `XLEN)
+`define LOGR ((`RADIX==2) ? 32'h1 : 32'h2)
+// FPDUR = ceil(DIVLEN/(LOGR*DIVCOPIES)) + RADIX/4
+// one extra iteration is required for the integer bit for minimally redundant radix-4
+`define FPDUR ((`DIVLEN+(`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES)+(`RADIX/4))
+`define DURLEN ($clog2(`FPDUR+1))
+`define QLEN (`FPDUR*`LOGR*`DIVCOPIES)
+

 `define USE_SRAM 0

--- a/pipelined/regression/sim-testfloat
+++ b/pipelined/regression/sim-testfloat
@ -6,7 +6,7 @@
 # fma    - test fma
 # sub    - test subtraction
 # div    - test division
-# sqrt   - test square ro
+# sqrt   - test square root
 # all    - test everything

-vsim -do "do testfloat.do rv64fp mul"
+vsim -do "do testfloat.do rv64fp $1"
--- a/pipelined/regression/sim-testfloat-batch
+++ b/pipelined/regression/sim-testfloat-batch
@ -1,7 +1,9 @@
+
 # cvtint - test integer conversion unit (fcvtint)
 # cvtfp  - test floating-point conversion unit (fcvtfp)
 # cmp    - test comparison unit's LT, LE, EQ operations (fcmp)
 # add    - test addition
+# fma    - test fma
 # sub    - test subtraction
 # div    - test division
 # sqrt   - test square root
--- a/pipelined/regression/sim-wally
+++ b/pipelined/regression/sim-wally
@ -1,2 +1,2 @@
-vsim -do "do wally-pipelined.do rv32gc arch32i"
+vsim -do "do wally-pipelined.do rv32gc wally32periph"

--- a/pipelined/regression/sim-wally-batch
+++ b/pipelined/regression/sim-wally-batch
@ -1 +1 @@
-vsim -c -do "do wally-pipelined-batch.do rv64gc imperas64f"
+vsim -c -do "do wally-pipelined-batch.do rv32gc wally32d"
--- a/pipelined/regression/wave-fpu.do
+++ b/pipelined/regression/wave-fpu.do
@ -9,22 +9,31 @@ add wave -noupdate /testbenchfp/Res
 add wave -noupdate /testbenchfp/Ans
 add wave -noupdate /testbenchfp/DivStart
 add wave -noupdate /testbenchfp/DivBusy
-add wave -noupdate /testbenchfp/srtfsm/state
+add wave -noupdate /testbenchfp/divsqrt/srtfsm/state
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/*
-add wave -group {PostProc} -noupdate /testbenchfp/postprocess/resultselect/*
+add wave -group {PostProc} -noupdate /testbenchfp/postprocess/specialcase/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/flags/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/normshift/*
-add wave -group {PostProc} -noupdate /testbenchfp/postprocess/lzacorrection/*
+add wave -group {PostProc} -noupdate /testbenchfp/postprocess/shiftcorrection/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/resultsign/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/round/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/fmashiftcalc/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/divshiftcalc/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/cvtshiftcalc/*
-add wave -group {Divide} -noupdate /testbenchfp/srtradix4/*
-add wave -group {Divide} -noupdate /testbenchfp/srtradix4/qsel4/*
-add wave -group {Divide} -noupdate /testbenchfp/srtradix4/otfc4/*
-add wave -group {Divide} -noupdate /testbenchfp/srtpreproc/*
-add wave -group {Divide} -noupdate /testbenchfp/srtradix4/expcalc/*
-add wave -group {Divide} -noupdate /testbenchfp/srtfsm/*
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/WC
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/WS
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/WCA
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/WSA
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/Q
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/QM
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/QNext
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/QMNext
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/*
+add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/*
+# add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/otfc/otfc2/*
+# add wave -group {Divide} -group inter0 -noupdate /testbenchfp/divsqrt/srt/interations[0]/divinteration/qsel/qsel2/*
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtpreproc/*
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srt/expcalc/*
+add wave -group {Divide} -noupdate /testbenchfp/divsqrt/srtfsm/*
 add wave -group {Testbench} -noupdate /testbenchfp/*
 add wave -group {Testbench} -noupdate /testbenchfp/readvectors/*
--- a/pipelined/src/cache/cache.sv
+++ b/pipelined/src/cache/cache.sv
@ -42,10 +42,8 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTER
  input logic [11:0]          NextAdr, // virtual address, but we only use the lower 12 bits.
  input logic [`PA_BITS-1:0]  PAdr, // physical address
  input logic [(`XLEN-1)/8:0] ByteMask,
-  input logic [`XLEN-1:0]     FinalWriteData,
-  input logic [`FLEN-1:0]     FWriteDataM,
-  input logic                        FLoad2,
-  input logic                 FpLoadStoreM,
+  input logic [WORDLEN-1:0]     FinalWriteData,
+  input logic                        FStore2,
  output logic                CacheCommitted,
  output logic                CacheStall,
   // to performance counters to cpu
@ -72,7 +70,7 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTER
  localparam                  SETLEN = $clog2(NUMLINES);
  localparam                  SETTOP = SETLEN+OFFSETLEN;
  localparam                  TAGLEN = `PA_BITS - SETTOP;
-  localparam                  WORDSPERLINE = LINELEN/`XLEN;
+  localparam                  WORDSPERLINE = LINELEN/WORDLEN;
  localparam                  FlushAdrThreshold   = NUMLINES - 1;

  logic                       SelAdr;
@ -123,7 +121,7 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTER

  // Array of cache ways, along with victim, hit, dirty, and read merging logic
  cacheway #(NUMLINES, LINELEN, TAGLEN, OFFSETLEN, SETLEN) 
-    CacheWays[NUMWAYS-1:0](.clk, .reset, .RAdr, .PAdr, .CacheWriteData, .ByteMask, .FLoad2,
+    CacheWays[NUMWAYS-1:0](.clk, .reset, .RAdr, .PAdr, .CacheWriteData, .ByteMask, .FStore2,
    .SetValidWay, .ClearValidWay, .SetDirtyWay, .ClearDirtyWay, .SelEvict, .VictimWay,
    .FlushWay, .SelFlush, .ReadDataLineWay, .HitWay, .VictimDirtyWay, .VictimTagWay, 
    .Invalidate(InvalidateCacheM));
@ -162,12 +160,8 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTER
  /////////////////////////////////////////////////////////////////////////////////////////////
  // Write Path: Write data and address. Muxes between writes from bus and writes from CPU.
  /////////////////////////////////////////////////////////////////////////////////////////////
-  if (`LLEN>`XLEN)
-    mux3 #(LINELEN) WriteDataMux(.d0({WORDSPERLINE{FinalWriteData}}),
-      .d1({WORDSPERLINE/2{FWriteDataM}}),	.d2(CacheBusWriteData),	.s({SetValid,FpLoadStoreM&~SetValid}), .y(CacheWriteData));
-  else
-    mux2 #(LINELEN) WriteDataMux(.d0({WORDSPERLINE{FinalWriteData}}),
-      .d1(CacheBusWriteData),	.s(SetValid), .y(CacheWriteData));
+  mux2 #(LINELEN) WriteDataMux(.d0({WORDSPERLINE{FinalWriteData}}),
+  .d1(CacheBusWriteData),	.s(SetValid), .y(CacheWriteData));
  mux3 #(`PA_BITS) CacheBusAdrMux(.d0({PAdr[`PA_BITS-1:OFFSETLEN], {OFFSETLEN{1'b0}}}),
 		.d1({VictimTag, PAdr[SETTOP-1:OFFSETLEN], {OFFSETLEN{1'b0}}}),
 		.d2({VictimTag, FlushAdr, {OFFSETLEN{1'b0}}}),
--- a/pipelined/src/cache/cacheway.sv
+++ b/pipelined/src/cache/cacheway.sv
@ -38,7 +38,7 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26,
  input logic [$clog2(NUMLINES)-1:0] RAdr,
  input logic [`PA_BITS-1:0]         PAdr,
  input logic [LINELEN-1:0]          CacheWriteData,
-  input logic                        FLoad2,
+  input logic                        FStore2,
  input logic                        SetValidWay,
  input logic                        ClearValidWay,
  input logic                        SetDirtyWay,
@ -79,7 +79,7 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26,
    logic [2**LOGWPL-1:0] MemPAdrDecodedtmp;
    onehotdecoder #(LOGWPL) adrdec(
      .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecodedtmp));
-    assign MemPAdrDecoded = MemPAdrDecodedtmp|{MemPAdrDecodedtmp[2**LOGWPL-2:0]&{2**LOGWPL-1{FLoad2}}, 1'b0};
+    assign MemPAdrDecoded = MemPAdrDecodedtmp|{MemPAdrDecodedtmp[2**LOGWPL-2:0]&{2**LOGWPL-1{FStore2}}, 1'b0};
  end else
    onehotdecoder #(LOGWPL) adrdec(
      .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecoded));
--- a/pipelined/src/fpu/divshiftcalc.sv
+++ b/pipelined/src/fpu/divshiftcalc.sv
@ -1,10 +1,10 @@
 `include "wally-config.vh"

 module divshiftcalc(
-    input logic  [`DIVLEN+2:0] Quot,
+    input logic  [`QLEN-1-(`RADIX/4):0] DivQm,
    input logic  [`FMTBITS-1:0] Fmt,
-    input logic [$clog2(`DIVLEN/2+3)-1:0] DivEarlyTermShiftDiv2,
-    input logic [`NE+1:0] DivCalcExp,
+    input logic [`DURLEN-1:0] DivEarlyTermShift,
+    input logic [`NE+1:0] DivQe,
    output logic [$clog2(`NORMSHIFTSZ)-1:0] DivShiftAmt,
    output logic [`NORMSHIFTSZ-1:0] DivShiftIn,
    output logic DivResDenorm,
@ -14,27 +14,28 @@ module divshiftcalc(

    // is the result denormalized
    // if the exponent is 1 then the result needs to be normalized then the result is denormalized
-    assign DivResDenorm = DivCalcExp[`NE+1]|(~|DivCalcExp[`NE+1:0]);
+    assign DivResDenorm = DivQe[`NE+1]|(~|DivQe[`NE+1:0]);

    // if the result is denormalized
-    //  00000000x.xxxxxx...                     Exp = DivCalcExp
-    //  .00000000xxxxxxx... >> NF+1             Exp = DivCalcExp+NF+1
-    //  .00xxxxxxxxxxxxx... << DivCalcExp+NF+1  Exp = +1
+    //  00000000x.xxxxxx...                     Exp = DivQe
+    //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
+    //  .00xxxxxxxxxxxxx... << DivQe+NF+1  Exp = +1
    //  .0000xxxxxxxxxxx... >> 1                Exp = 1
-    // Left shift amount  = DivCalcExp+NF+1-1
-    assign DivDenormShift = (`NE+2)'(`NF)+DivCalcExp;
+    // Left shift amount  = DivQe+NF+1-1
+    assign DivDenormShift = (`NE+2)'(`NF)+DivQe;
    // if the result is normalized
-    //  00000000x.xxxxxx...                     Exp = DivCalcExp
-    //  .00000000xxxxxxx... >> NF+1             Exp = DivCalcExp+NF+1
-    //  00000000.xxxxxxx... << NF               Exp = DivCalcExp+1
-    //  00000000x.xxxxxx... << NF               Exp = DivCalcExp (extra shift done afterwards)
-    //  00000000xx.xxxxx... << 1?               Exp = DivCalcExp-1 (determined after)
+    //  00000000x.xxxxxx...                     Exp = DivQe
+    //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
+    //  00000000.xxxxxxx... << NF               Exp = DivQe+1
+    //  00000000x.xxxxxx... << NF               Exp = DivQe (extra shift done afterwards)
+    //  00000000xx.xxxxx... << 1?               Exp = DivQe-1 (determined after)
    // inital Left shift amount  = NF
+    // shift one more if it's a minimally redundant radix 4 - one entire cycle needed for the integer bit
    assign NormShift = (`NE+2)'(`NF);
    // if the shift amount is negitive then dont shift (keep sticky bit)
-    assign DivShiftAmt = (DivResDenorm ?  DivDenormShift[$clog2(`NORMSHIFTSZ)-1:0]&{$clog2(`NORMSHIFTSZ){~DivDenormShift[`NE+1]}} : NormShift[$clog2(`NORMSHIFTSZ)-1:0])+{{$clog2(`NORMSHIFTSZ)-$clog2(`DIVLEN/2+3)-1{1'b0}}, DivEarlyTermShiftDiv2&{$clog2(`DIVLEN/2+3){~DivDenormShift[`NE+1]}}, 1'b0};
+    // need to multiply the early termination shift by LOGR*DIVCOPIES =  left shift of log2(LOGR*DIVCOPIES)
+    assign DivShiftAmt = (DivResDenorm ?  DivDenormShift[$clog2(`NORMSHIFTSZ)-1:0]&{$clog2(`NORMSHIFTSZ){~DivDenormShift[`NE+1]}} : NormShift[$clog2(`NORMSHIFTSZ)-1:0])+{{$clog2(`NORMSHIFTSZ)-`DURLEN-$clog2(`LOGR*`DIVCOPIES){1'b0}}, DivEarlyTermShift&{`DURLEN{~DivDenormShift[`NE+1]}}, {$clog2(`LOGR*`DIVCOPIES){1'b0}}};

-    // *** may be able to reduce shifter size
-    assign DivShiftIn = {{`NF{1'b0}}, Quot[`DIVLEN+2:0], {`NORMSHIFTSZ-`DIVLEN-3-`NF{1'b0}}};
+    assign DivShiftIn = {{`NF{1'b0}}, DivQm, {`NORMSHIFTSZ-`QLEN+(`RADIX/4)-`NF{1'b0}}};

 endmodule
--- a/pipelined/src/fpu/divsqrt.sv
+++ b/pipelined/src/fpu/divsqrt.sv
@ -43,26 +43,27 @@ module divsqrt(
  input  logic StallM,
  input logic StallE,
  output logic DivStickyM,
-  output logic DivNegStickyM,
  output logic DivBusy,
  output logic DivDone,
  output logic [`NE+1:0] DivCalcExpM,
-  output logic [$clog2(`DIVLEN/2+3)-1:0] EarlyTermShiftDiv2M,
-  output logic [`DIVLEN+2:0] QuotM
+  output logic [`DURLEN-1:0] EarlyTermShiftM,
+  output logic [`QLEN-1-(`RADIX/4):0] QuotM
 //   output logic [`XLEN-1:0] RemM,
 );

-  logic [`DIVLEN+3:0]  WSN, WCN;
+  logic [`DIVLEN+3:0]  NextWSN, NextWCN;
  logic [`DIVLEN+3:0]  WS, WC;
+  logic [`DIVLEN+3:0] StickyWSA;
  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
  logic [`DIVLEN-1:0] X;
  logic [`DIVLEN-1:0] Dpreproc;
-  logic [$clog2(`DIVLEN/2+3)-1:0] Dur;
+  logic [`DURLEN-1:0] Dur;
+  logic NegSticky;

-  srtpreproc srtpreproc(.XManE, .Dur, .YManE,.X,.Dpreproc, .XZeroCnt, .YZeroCnt);
+  srtpreproc srtpreproc(.Xm(XManE), .Dur, .Ym(YManE), .X,.Dpreproc, .XZeroCnt, .YZeroCnt);

-  srtfsm srtfsm(.reset, .WSN, .WCN, .WS, .WC, .Dur, .DivBusy, .clk, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, .DivStickyE(DivStickyM), .XNaNE, .YNaNE,
-                .XInfE, .YInfE, .DivNegStickyE(DivNegStickyM), .EarlyTermShiftDiv2E(EarlyTermShiftDiv2M));
-  srtradix4 srtradix4(.clk, .FmtE, .X,.Dpreproc, .XZeroCnt, .YZeroCnt, .WS, .WC, .WSN, .WCN, .DivStart(DivStartE), .XExpE, .YExpE, .XZeroE, .YZeroE,
-                .DivBusy, .Quot(QuotM), .Rem(), .DivCalcExpM);
+  srtfsm srtfsm(.reset, .NextWSN, .NextWCN, .WS, .WC, .Dur, .DivBusy, .clk, .DivStart(DivStartE),.StallE, .StallM, .DivDone, .XZeroE, .YZeroE, .DivStickyE(DivStickyM), .XNaNE, .YNaNE,
+               .StickyWSA, .XInfE, .YInfE, .NegSticky(NegSticky), .EarlyTermShiftE(EarlyTermShiftM));
+  srt srt(.clk, .FmtE, .X,.Dpreproc, .NegSticky, .XZeroCnt, .YZeroCnt, .FirstWS(WS), .FirstWC(WC), .NextWSN, .NextWCN, .DivStart(DivStartE), .Xe(XExpE), .Ye(YExpE), .XZeroE, .YZeroE,
+                .StickyWSA, .DivBusy, .Quot(QuotM), .Rem(), .DivCalcExpM);
 endmodule
--- a/pipelined/src/fpu/fcvt.sv
+++ b/pipelined/src/fpu/fcvt.sv
@ -68,7 +68,8 @@ module fcvt (
    logic                   Signed;     // is the opperation with a signed integer?
    logic                   Int64;      // is the integer 64 bits?
    logic                   IntToFp;       // is the opperation an int->fp conversion?
-    logic [`LOGCVTLEN-1:0] LeadingZeros; // output from the LZC
+    logic [`CVTLEN:0]       LzcInFull;      // input to the Leading Zero Counter (priority encoder)
+    logic [`LOGCVTLEN-1:0]  LeadingZeros; // output from the LZC


    // seperate OpCtrl for code readability
@ -102,10 +103,11 @@ module fcvt (
    // choose the input to the leading zero counter i.e. priority encoder
    //             int -> fp : | positive integer | 00000... (if needed) | 
    //             fp  -> fp : | fraction         | 00000... (if needed) | 
-    assign LzcIn = IntToFp ? {TrimInt, {`CVTLEN-`XLEN{1'b0}}} :
-                             {Xm[`NF-1:0], {`CVTLEN-`NF{1'b0}}};
+    assign LzcInFull = IntToFp ? {1'b0, TrimInt, {`CVTLEN-`XLEN{1'b0}}} :
+                             {Xm, {`CVTLEN-`NF{1'b0}}};
+    assign LzcIn = LzcInFull[`CVTLEN-1:0];
    
-    lzc #(`CVTLEN) lzc (.num(LzcIn), .ZeroCnt(LeadingZeros));
+    lzc #(`CVTLEN+1) lzc (.num(LzcInFull), .ZeroCnt(LeadingZeros));

    ///////////////////////////////////////////////////////////////////////////
    // shifter
@ -119,13 +121,13 @@ module fcvt (
    //      denormalized/undeflowed result fp -> fp:
    //          - shift left by NF-1+CalcExp - to shift till the biased expoenent is 0
    //      ??? -> fp: 
-    //          - shift left by LeadingZeros+1 - to shift till the result is normalized
+    //          - shift left by LeadingZeros - to shift till the result is normalized
    //              - only shift fp -> fp if the intital value is denormalized
    //                  - this is a problem because the input to the lzc was the fraction rather than the mantissa
    //                  - rather have a few and-gates than an extra bit in the priority encoder??? *** is this true?
    assign ShiftAmt = ToInt ? Ce[`LOGCVTLEN-1:0]&{`LOGCVTLEN{~Ce[`NE]}} :
                    ResDenormUf&~IntToFp ? (`LOGCVTLEN)'(`NF-1)+Ce[`LOGCVTLEN-1:0] : 
-                              (LeadingZeros+1)&{`LOGCVTLEN{XDenorm|IntToFp}};
+                              (LeadingZeros);
    
    ///////////////////////////////////////////////////////////////////////////
    // exp calculations
@ -197,14 +199,14 @@ module fcvt (
    //                  |  0's |     Mantissa      |      0's if nessisary     |
    //                  |     keep        |
    //
-    //              - if the input is denormalized then we dont shift... so the  "- (LeadingZeros+1)" is just leftovers from other options
-    //      int -> fp : largest bias +  XLEN - Largest bias + new bias - 1 - LeadingZeros = XLEN + NewBias - 1 - LeadingZeros
+    //              - if the input is denormalized then we dont shift... so the  "- LeadingZeros" is just leftovers from other options
+    //      int -> fp : largest bias +  XLEN - Largest bias + new bias - LeadingZeros = XLEN + NewBias - LeadingZeros
    //              Process:
    //                  - shifted right by XLEN (XLEN)
-    //                  - shift left to normilize (-1-LeadingZeros)
+    //                  - shift left to normalize (-LeadingZeros)
    //                  - newBias to make the biased exponent
-    //          oldexp - biasold +newbias - (LeadingZeros+1)&(XDenorm|IntToFp)
-    assign Ce = {1'b0, OldExp} - (`NE+1)'(`BIAS) + {2'b0, NewBias} - {{`NE{1'b0}}, XDenorm|IntToFp} - {{`NE-`LOGCVTLEN+1{1'b0}}, (LeadingZeros&{`LOGCVTLEN{XDenorm|IntToFp}})};
+    //          oldexp - biasold +newbias - LeadingZeros&(XDenorm|IntToFp)
+    assign Ce = {1'b0, OldExp} - (`NE+1)'(`BIAS) + {2'b0, NewBias} - {{`NE-`LOGCVTLEN+1{1'b0}}, (LeadingZeros&{`LOGCVTLEN{XDenorm|IntToFp}})};
    // find if the result is dnormal or underflows
    //      - if Calculated expoenent is 0 or negitive (and the input/result is not exactaly 0)
    //      - can't underflow an integer to Fp conversion
--- a/pipelined/src/fpu/flags.sv
+++ b/pipelined/src/fpu/flags.sv
@ -34,24 +34,24 @@ module flags(
    input logic                 XInf, YInf, ZInf,    // inputs are infinity
    input logic                 Plus1,
    input logic                 InfIn,                  // is a Inf input being used
+    input logic                 NaNIn,                  // is a NaN input being used
+    input logic [`FMTBITS-1:0]  OutFmt,                 // output format
    input logic                 XZero, YZero,         // inputs are zero
    input logic                 XNaN, YNaN,           // inputs are NaN
-    input logic                 NaNIn,                  // is a NaN input being used
    input logic                 Sqrt,                   // Sqrt?
    input logic                 ToInt,                  // convert to integer
    input logic                 IntToFp,                // convert integer to floating point
    input logic                 Int64,                  // convert to 64 bit integer
    input logic                 Signed,                 // convert to a signed integer
-    input logic [`FMTBITS-1:0]  OutFmt,                 // output format
    input logic [`NE:0]         CvtCe,            // the calculated expoent - Cvt
    input logic                 CvtOp,                  // conversion opperation?
     input logic                 DivOp,                  // division operation?
    input logic                 FmaOp,                  // Fma opperation?
-    input logic  [`NE+1:0]      FullResExp,             // Re with bits to determine sign and overflow
-    input logic  [`NE+1:0]      Nexp,               // exponent of the normalized sum
+    input logic  [`NE+1:0]      FullRe,             // Re with bits to determine sign and overflow
+    input logic  [`NE+1:0]      Me,               // exponent of the normalized sum
    input logic  [1:0]          CvtNegResMsbs,             // the negitive integer result's most significant bits
    input logic                 FmaAs, FmaPs,        // the product and modified Z signs
-    input logic                 R, UfLSBRes, S, UfPlus1, // bits used to determine rounding
+    input logic                 R, UfL, S, UfPlus1, // bits used to determine rounding
    output logic                DivByZero,
    output logic                IntInvalid, Invalid, Overflow, // flags used to select the res
    output logic [4:0]          PostProcFlg // flags
@ -73,30 +73,30 @@ module flags(


   if (`FPSIZES == 1) begin
-        assign ResExpGteMax = &FullResExp[`NE-1:0] | FullResExp[`NE];
-        assign ShiftGtIntSz = (|FullResExp[`NE:7]|(FullResExp[6]&~Int64)) | ((|FullResExp[4:0]|(FullResExp[5]&Int64))&((FullResExp[5]&~Int64) | FullResExp[6]&Int64));
+        assign ResExpGteMax = &FullRe[`NE-1:0] | FullRe[`NE];
+        assign ShiftGtIntSz = (|FullRe[`NE:7]|(FullRe[6]&~Int64)) | ((|FullRe[4:0]|(FullRe[5]&Int64))&((FullRe[5]&~Int64) | FullRe[6]&Int64));

    end else if (`FPSIZES == 2) begin    
-        assign ResExpGteMax = OutFmt ? &FullResExp[`NE-1:0] | FullResExp[`NE] : &FullResExp[`NE1-1:0] | (|FullResExp[`NE:`NE1]);
+        assign ResExpGteMax = OutFmt ? &FullRe[`NE-1:0] | FullRe[`NE] : &FullRe[`NE1-1:0] | (|FullRe[`NE:`NE1]);

-        assign ShiftGtIntSz = (|FullResExp[`NE:7]|(FullResExp[6]&~Int64)) | ((|FullResExp[4:0]|(FullResExp[5]&Int64))&((FullResExp[5]&~Int64) | FullResExp[6]&Int64));
+        assign ShiftGtIntSz = (|FullRe[`NE:7]|(FullRe[6]&~Int64)) | ((|FullRe[4:0]|(FullRe[5]&Int64))&((FullRe[5]&~Int64) | FullRe[6]&Int64));
    end else if (`FPSIZES == 3) begin
        always_comb
            case (OutFmt)
-                `FMT: ResExpGteMax = &FullResExp[`NE-1:0] | FullResExp[`NE];
-                `FMT1: ResExpGteMax = &FullResExp[`NE1-1:0] | (|FullResExp[`NE:`NE1]);
-                `FMT2: ResExpGteMax = &FullResExp[`NE2-1:0] | (|FullResExp[`NE:`NE2]);
+                `FMT: ResExpGteMax = &FullRe[`NE-1:0] | FullRe[`NE];
+                `FMT1: ResExpGteMax = &FullRe[`NE1-1:0] | (|FullRe[`NE:`NE1]);
+                `FMT2: ResExpGteMax = &FullRe[`NE2-1:0] | (|FullRe[`NE:`NE2]);
                default: ResExpGteMax = 1'bx;
            endcase
-            assign ShiftGtIntSz = (|FullResExp[`NE:7]|(FullResExp[6]&~Int64)) | ((|FullResExp[4:0]|(FullResExp[5]&Int64))&((FullResExp[5]&~Int64) | FullResExp[6]&Int64));
+            assign ShiftGtIntSz = (|FullRe[`NE:7]|(FullRe[6]&~Int64)) | ((|FullRe[4:0]|(FullRe[5]&Int64))&((FullRe[5]&~Int64) | FullRe[6]&Int64));

    end else if (`FPSIZES == 4) begin        
        always_comb
            case (OutFmt)
-                `Q_FMT: ResExpGteMax = &FullResExp[`Q_NE-1:0] | FullResExp[`Q_NE];
-                `D_FMT: ResExpGteMax = &FullResExp[`D_NE-1:0] | (|FullResExp[`Q_NE:`D_NE]);
-                `S_FMT: ResExpGteMax = &FullResExp[`S_NE-1:0] | (|FullResExp[`Q_NE:`S_NE]);
-                `H_FMT: ResExpGteMax = &FullResExp[`H_NE-1:0] | (|FullResExp[`Q_NE:`H_NE]);
+                `Q_FMT: ResExpGteMax = &FullRe[`Q_NE-1:0] | FullRe[`Q_NE];
+                `D_FMT: ResExpGteMax = &FullRe[`D_NE-1:0] | (|FullRe[`Q_NE:`D_NE]);
+                `S_FMT: ResExpGteMax = &FullRe[`S_NE-1:0] | (|FullRe[`Q_NE:`S_NE]);
+                `H_FMT: ResExpGteMax = &FullRe[`H_NE-1:0] | (|FullRe[`Q_NE:`H_NE]);
            endcase
            // a left shift of intlen+1 is still in range but any more than that is an overflow
            //           inital: |      64 0's         |    XLEN     |
@ -110,14 +110,14 @@ module flags(
            //      - any of the bits after the most significan 1 is one
            //      - the most signifcant in 65 or 33 is still a one in the number and
            //        one of the later bits is one
-            assign ShiftGtIntSz = (|FullResExp[`Q_NE:7]|(FullResExp[6]&~Int64)) | ((|FullResExp[4:0]|(FullResExp[5]&Int64))&((FullResExp[5]&~Int64) | FullResExp[6]&Int64));
+            assign ShiftGtIntSz = (|FullRe[`Q_NE:7]|(FullRe[6]&~Int64)) | ((|FullRe[4:0]|(FullRe[5]&Int64))&((FullRe[5]&~Int64) | FullRe[6]&Int64));
    end

    //                 if the result is greater than or equal to the max exponent(not taking into account sign)
    //                 |           and the exponent isn't negitive
    //                 |           |                   if the input isnt infinity or NaN
    //                 |           |                   |            
-    assign Overflow = ResExpGteMax & ~FullResExp[`NE+1]&~(InfIn|NaNIn|DivByZero);
+    assign Overflow = ResExpGteMax & ~FullRe[`NE+1]&~(InfIn|NaNIn|DivByZero);

    // detecting tininess after rounding
    //                  the exponent is negitive
@ -127,11 +127,11 @@ module flags(
    //                  |                    |                    |                                      |                     and if the result is not exact
    //                  |                    |                    |                                      |                     |               and if the input isnt infinity or NaN
    //                  |                    |                    |                                      |                     |               |
-    assign Underflow = ((FullResExp[`NE+1] | (FullResExp == 0) | ((FullResExp == 1) & (Nexp == 0) & ~(UfPlus1&UfLSBRes)))&(R|S))&~(InfIn|NaNIn|DivByZero);
+    assign Underflow = ((FullRe[`NE+1] | (FullRe == 0) | ((FullRe == 1) & (Me == 0) & ~(UfPlus1&UfL)))&(R|S))&~(InfIn|NaNIn|DivByZero);

    // Set Inexact flag if the res is diffrent from what would be outputed given infinite precision
    //      - Don't set the underflow flag if an underflowed res isn't outputed
-    assign FpInexact = (S|Overflow|R|Underflow)&~(InfIn|NaNIn|DivByZero);
+    assign FpInexact = (S|Overflow|R)&~(InfIn|NaNIn|DivByZero);

    //                  if the res is too small to be represented and not 0
    //                  |                                     and if the res is not invalid (outside the integer bounds)
@ -153,7 +153,7 @@ module flags(
    //                  |           |                                  |                    |               or the res rounds up out of bounds
    //                  |           |                                  |                    |                       and the res didn't underflow
    //                  |           |                                  |                    |                       |
-    assign IntInvalid = XNaN|XInf|(ShiftGtIntSz&~FullResExp[`NE+1])|((Xs&~Signed)&(~((CvtCe[`NE]|(~|CvtCe))&~Plus1)))|(CvtNegResMsbs[1]^CvtNegResMsbs[0]);
+    assign IntInvalid = XNaN|XInf|(ShiftGtIntSz&~FullRe[`NE+1])|((Xs&~Signed)&(~((CvtCe[`NE]|(~|CvtCe))&~Plus1)))|(CvtNegResMsbs[1]^CvtNegResMsbs[0]);
    //                                                                                                     |
    //                                                                                                     or when the positive res rounds up out of range
    assign SigNaN = (XSNaN&~(IntToFp&CvtOp)) | (YSNaN&~CvtOp) | (ZSNaN&FmaOp);
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@ -51,7 +51,6 @@ module fma(
    logic [3*`NF+5:0]   Am;     // addend aligned's mantissa for addition in U(NF+5.2NF+1)
    logic [3*`NF+6:0]   AmInv;   // aligned addend's mantissa possibly inverted
    logic [2*`NF+1:0]   PmKilled;      // the product's mantissa possibly killed
-    logic [3*`NF+6:0]   PreSum, NegPreSum;  // positive and negitve versions of the sum
    ///////////////////////////////////////////////////////////////////////////////
    // Calculate the product
    //      - When multipliying two fp numbers, add the exponents
@ -70,20 +69,21 @@ module fma(
    ///////////////////////////////////////////////////////////////////////////////
    // Alignment shifter
    ///////////////////////////////////////////////////////////////////////////////
-
-    align align(.Ze, .Zm, .XZero, .YZero, .ZZero, .Xe, .Ye,
-                        .Am, .ZmSticky, .KillProd);
-                        
    // calculate the signs and take the opperation into account
    sign sign(.FOpCtrl, .Xs, .Ys, .Zs, .Ps, .As);

+    align align(.Ze, .Zm, .XZero, .YZero, .ZZero, .Xe, .Ye,
+                .Am, .ZmSticky, .KillProd);
+                        
+
+
    // ///////////////////////////////////////////////////////////////////////////////
    // // Addition/LZA
    // ///////////////////////////////////////////////////////////////////////////////
        
-    add add(.Am, .Pm, .Ps, .As, .KillProd, .AmInv, .PmKilled, .NegSum, .PreSum, .NegPreSum, .InvA, .XZero, .YZero, .Sm);
+    add add(.Am, .Pm, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm);
    
-    loa loa(.A(AmInv+{(3*`NF+6)'(0),InvA}), .P(PmKilled), .NCnt);
+    loa loa(.A(AmInv+{(3*`NF+6)'(0),InvA&~((ZmSticky&~KillProd))}), .P({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .NCnt);
 endmodule


@ -172,7 +172,7 @@ module align(
    // the 1'b0 before the added is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
    assign ZmPreshifted = {Zm,(3*`NF+5)'(0)};
    
-    assign KillProd = ACnt[`NE+1]|XZero|YZero;
+    assign KillProd = (ACnt[`NE+1]&~ZZero)|XZero|YZero;
    assign KillZ = $signed(ACnt)>$signed((`NE+2)'(3)*(`NE+2)'(`NF)+(`NE+2)'(5));

    always_comb
@ -183,7 +183,7 @@ module align(
        //          |   54'b0    |  106'b(product)  | 2'b0 |
        //  | addnend |
        if (KillProd) begin
-            ZmShifted = ZmPreshifted;
+            ZmShifted = {(`NF+3)'(0), Zm, (2*`NF+2)'(0)};
            ZmSticky = ~(XZero|YZero);

        // If the addend is too small to effect the addition        
@ -221,14 +221,14 @@ module add(
    input logic  [2*`NF+1:0]    Pm,       // the product's mantissa
    input logic                 Ps, As,// the product sign and the alligend addeded's sign (Modified Z sign for other opperations)
    input logic                 KillProd,      // should the product be set to 0
-    input logic                 XZero, YZero, // is the input zero
+    input logic                 ZmSticky,
    output logic [3*`NF+6:0]    AmInv,  // aligned addend possibly inverted
    output logic [2*`NF+1:0]    PmKilled,     // the product's mantissa possibly killed
    output logic                NegSum,        // was the sum negitive
    output logic                InvA,          // do you invert the aligned addend
-    output logic [3*`NF+5:0]    Sm,           // the positive sum
-    output logic [3*`NF+6:0]    PreSum, NegPreSum// possibly negitive sum
+    output logic [3*`NF+5:0]    Sm           // the positive sum
 );
+    logic [3*`NF+6:0]    PreSum, NegPreSum; // possibly negitive sum

    ///////////////////////////////////////////////////////////////////////////////
    // Addition
@ -243,13 +243,14 @@ module add(
    assign AmInv = InvA ? {1'b1, ~Am} : {1'b0, Am};
    // Kill the product if the product is too small to effect the addition (determined in fma1.sv)
    assign PmKilled = Pm&{2*`NF+2{~KillProd}};
-
-
-
    // Do the addition
    //      - calculate a positive and negitive sum in parallel
-    assign PreSum = {{`NF+3{1'b0}}, PmKilled, 2'b0} + AmInv + {{3*`NF+6{1'b0}}, InvA};
-    assign NegPreSum = {1'b0, Am} + {{`NF+3{1'b1}}, ~PmKilled, 2'b0} + {(3*`NF+7)'(4)};
+    //              Zsticky             Psticky
+    // PreSum    -1 = don't add 1     +1 = add 2
+    // NegPreSum +1 = add 2           -1 = don't add 1
+    // for NegPreSum the product is set to -1 whenever the product is killed, therefore add 1, 2 or 0
+    assign PreSum = {{`NF+3{1'b0}}, PmKilled, 1'b0, InvA&ZmSticky&KillProd} + AmInv + {{3*`NF+6{1'b0}}, InvA&~((ZmSticky&~KillProd))};
+    assign NegPreSum = {1'b0, Am} + {{`NF+3{1'b1}}, ~PmKilled, 2'b11} + {(3*`NF+5)'(0), ZmSticky&~KillProd, ~(ZmSticky)};
     
    // Is the sum negitive
    assign NegSum = PreSum[3*`NF+6];
@ -261,7 +262,7 @@ endmodule

 module loa( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
    input logic  [3*`NF+6:0] A,     // addend
-    input logic  [2*`NF+1:0] P,     // product
+    input logic  [2*`NF+3:0] P,     // product
    output logic [$clog2(3*`NF+7)-1:0]       NCnt   // normalization shift count for the positive result
    ); 
    
@ -273,12 +274,9 @@ module loa( // [Schmookler & Nowka, Leading zero anticipation and detection, IEE
    assign T[3*`NF+6:2*`NF+4] = A[3*`NF+6:2*`NF+4];
    assign G[3*`NF+6:2*`NF+4] = 0;
    assign Z[3*`NF+6:2*`NF+4] = ~A[3*`NF+6:2*`NF+4];
-    assign T[2*`NF+3:2] = A[2*`NF+3:2]^P;
-    assign G[2*`NF+3:2] = A[2*`NF+3:2]&P;
-    assign Z[2*`NF+3:2] = ~A[2*`NF+3:2]&~P;
-    assign T[1:0] = A[1:0];
-    assign G[1:0] = 0;
-    assign Z[1:0] = ~A[1:0];
+    assign T[2*`NF+3:0] = A[2*`NF+3:0]^P;
+    assign G[2*`NF+3:0] = A[2*`NF+3:0]&P;
+    assign Z[2*`NF+3:0] = ~A[2*`NF+3:0]&~P;


    // Apply function to determine Leading pattern
--- a/pipelined/src/fpu/fmashiftcalc.sv
+++ b/pipelined/src/fpu/fmashiftcalc.sv
@ -35,9 +35,8 @@ module fmashiftcalc(
    input logic  [$clog2(3*`NF+7)-1:0]  FmaNCnt,   // normalization shift count
    input logic  [`FMTBITS-1:0]         Fmt,       // precision 1 = double 0 = single
    input logic                         FmaKillProd,  // is the product set to zero
-    input logic 			            ZDenorm,
-    output logic [`NE+1:0]              FmaConvNormSumExp,          // exponent of the normalized sum not taking into account denormal or zero results
-    output logic                        FmaSmZero,    // is the result denormalized - calculated before LZA corection
+    output logic [`NE+1:0]              FmaNe,          // exponent of the normalized sum not taking into account denormal or zero results
+    output logic                        FmaSZero,    // is the result denormalized - calculated before LZA corection
    output logic                        FmaPreResultDenorm,    // is the result denormalized - calculated before LZA corection
    output logic [$clog2(3*`NF+7)-1:0]  FmaShiftAmt,   // normalization shift count
    output logic [3*`NF+8:0]            FmaShiftIn        // is the sum zero
@ -50,35 +49,36 @@ module fmashiftcalc(
    ///////////////////////////////////////////////////////////////////////////////
    //*** insert bias-bias simplification in fcvt.sv/phone pictures
    // Determine if the sum is zero
-    assign FmaSmZero = ~(|FmaSm);
+    assign FmaSZero = ~(|FmaSm);

    // calculate the sum's exponent
-    assign NormSumExp = FmaKillProd ? {2'b0, Ze[`NE-1:1], Ze[0]&~ZDenorm} : FmaPe + -{{`NE+2-$unsigned($clog2(3*`NF+7)){1'b0}}, FmaNCnt} - 1 + (`NE+2)'(`NF+4);
+    //                                                                      ProdExp - NormCnt - 1 + NF+4 = ProdExp + ~NormCnt + 1 - 1 + NF+4 = ProdExp + ~NormCnt + NF+4
+    assign NormSumExp = (FmaKillProd ? {2'b0, Ze} : FmaPe) + {{`NE+2-$unsigned($clog2(3*`NF+7)){1'b1}}, ~FmaNCnt} + (`NE+2)'(`NF+4);

    //convert the sum's exponent into the proper percision
    if (`FPSIZES == 1) begin
-        assign FmaConvNormSumExp = NormSumExp;
+        assign FmaNe = NormSumExp;

    end else if (`FPSIZES == 2) begin
-        assign FmaConvNormSumExp = Fmt ? NormSumExp : (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`BIAS1))&{`NE+2{|NormSumExp}};
+        assign FmaNe = Fmt ? NormSumExp : (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`BIAS1))&{`NE+2{|NormSumExp}};

    end else if (`FPSIZES == 3) begin
        always_comb begin
            case (Fmt)
-                `FMT: FmaConvNormSumExp = NormSumExp;
-                `FMT1: FmaConvNormSumExp = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`BIAS1))&{`NE+2{|NormSumExp}};
-                `FMT2: FmaConvNormSumExp = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`BIAS2))&{`NE+2{|NormSumExp}};
-                default: FmaConvNormSumExp = {`NE+2{1'bx}};
+                `FMT: FmaNe = NormSumExp;
+                `FMT1: FmaNe = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`BIAS1))&{`NE+2{|NormSumExp}};
+                `FMT2: FmaNe = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`BIAS2))&{`NE+2{|NormSumExp}};
+                default: FmaNe = {`NE+2{1'bx}};
            endcase
        end

    end else if (`FPSIZES == 4) begin
        always_comb begin
            case (Fmt)
-                2'h3: FmaConvNormSumExp = NormSumExp;
-                2'h1: FmaConvNormSumExp = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`D_BIAS))&{`NE+2{|NormSumExp}};
-                2'h0: FmaConvNormSumExp = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`S_BIAS))&{`NE+2{|NormSumExp}};
-                2'h2: FmaConvNormSumExp = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`H_BIAS))&{`NE+2{|NormSumExp}};
+                2'h3: FmaNe = NormSumExp;
+                2'h1: FmaNe = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`D_BIAS))&{`NE+2{|NormSumExp}};
+                2'h0: FmaNe = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`S_BIAS))&{`NE+2{|NormSumExp}};
+                2'h2: FmaNe = (NormSumExp-(`NE+2)'(`BIAS)+(`NE+2)'(`H_BIAS))&{`NE+2{|NormSumExp}};
            endcase
        end

@ -90,7 +90,7 @@ module fmashiftcalc(
        logic Sum0LEZ, Sum0GEFL;
        assign Sum0LEZ  = NormSumExp[`NE+1] | ~|NormSumExp;
        assign Sum0GEFL = $signed(NormSumExp) >= $signed(-(`NE+2)'(`NF)-(`NE+2)'(2));
-        assign FmaPreResultDenorm = Sum0LEZ & Sum0GEFL & ~FmaSmZero;
+        assign FmaPreResultDenorm = Sum0LEZ & Sum0GEFL & ~FmaSZero;

    end else if (`FPSIZES == 2) begin
        logic Sum0LEZ, Sum0GEFL, Sum1LEZ, Sum1GEFL;
@ -98,7 +98,7 @@ module fmashiftcalc(
        assign Sum0GEFL = $signed(NormSumExp) >= $signed(-(`NE+2)'(`NF)-(`NE+2)'(2));
        assign Sum1LEZ  = $signed(NormSumExp) <= $signed( (`NE+2)'(`BIAS)-(`NE+2)'(`BIAS1));
        assign Sum1GEFL = $signed(NormSumExp) >= $signed(-(`NE+2)'(`NF1+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`BIAS1)) | ~|NormSumExp;
-        assign FmaPreResultDenorm = (Fmt ? Sum0LEZ : Sum1LEZ) & (Fmt ? Sum0GEFL : Sum1GEFL) & ~FmaSmZero;
+        assign FmaPreResultDenorm = (Fmt ? Sum0LEZ : Sum1LEZ) & (Fmt ? Sum0GEFL : Sum1GEFL) & ~FmaSZero;

    end else if (`FPSIZES == 3) begin
        logic Sum0LEZ, Sum0GEFL, Sum1LEZ, Sum1GEFL, Sum2LEZ, Sum2GEFL;
@ -110,9 +110,9 @@ module fmashiftcalc(
        assign Sum2GEFL = $signed(NormSumExp) >= $signed(-(`NE+2)'(`NF2+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`BIAS2)) | ~|NormSumExp;
        always_comb begin
            case (Fmt)
-                `FMT: FmaPreResultDenorm = Sum0LEZ & Sum0GEFL & ~FmaSmZero;
-                `FMT1: FmaPreResultDenorm = Sum1LEZ & Sum1GEFL & ~FmaSmZero;
-                `FMT2: FmaPreResultDenorm = Sum2LEZ & Sum2GEFL & ~FmaSmZero;
+                `FMT: FmaPreResultDenorm = Sum0LEZ & Sum0GEFL & ~FmaSZero;
+                `FMT1: FmaPreResultDenorm = Sum1LEZ & Sum1GEFL & ~FmaSZero;
+                `FMT2: FmaPreResultDenorm = Sum2LEZ & Sum2GEFL & ~FmaSZero;
                default: FmaPreResultDenorm = 1'bx;
            endcase
        end
@ -129,10 +129,10 @@ module fmashiftcalc(
        assign Sum3GEFL = $signed(NormSumExp) >= $signed(-(`NE+2)'(`H_NF+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`H_BIAS)) | ~|NormSumExp;
        always_comb begin
            case (Fmt)
-                2'h3: FmaPreResultDenorm = Sum0LEZ & Sum0GEFL & ~FmaSmZero;
-                2'h1: FmaPreResultDenorm = Sum1LEZ & Sum1GEFL & ~FmaSmZero;
-                2'h0: FmaPreResultDenorm = Sum2LEZ & Sum2GEFL & ~FmaSmZero;
-                2'h2: FmaPreResultDenorm = Sum3LEZ & Sum3GEFL & ~FmaSmZero;
+                2'h3: FmaPreResultDenorm = Sum0LEZ & Sum0GEFL & ~FmaSZero;
+                2'h1: FmaPreResultDenorm = Sum1LEZ & Sum1GEFL & ~FmaSZero;
+                2'h0: FmaPreResultDenorm = Sum2LEZ & Sum2GEFL & ~FmaSZero;
+                2'h2: FmaPreResultDenorm = Sum3LEZ & Sum3GEFL & ~FmaSZero;
            endcase // *** remove checking to see if it's underflowed and only check for less than zero for denorm checking
        end

@ -144,13 +144,13 @@ module fmashiftcalc(
    //      - if kill prod dont add to exp

    // Determine if the result is denormal
-    // assign FmaPreResultDenorm = $signed(FmaConvNormSumExp)<=0 & ($signed(FmaConvNormSumExp)>=$signed(-FracLen)) & ~FmaSmZero;
+    // assign FmaPreResultDenorm = $signed(FmaNe)<=0 & ($signed(FmaNe)>=$signed(-FracLen)) & ~FmaSZero;

    // Determine the shift needed for denormal results
    //  - if not denorm add 1 to shift out the leading 1
-    assign DenormShift = FmaPreResultDenorm&~FmaKillProd ? FmaConvNormSumExp[$clog2(3*`NF+7)-1:0] : 1;
+    assign DenormShift = FmaPreResultDenorm ? FmaNe[$clog2(3*`NF+7)-1:0] : 1;
    // set and calculate the shift input and amount
    //  - shift once if killing a product and the result is denormalized
    assign FmaShiftIn = {3'b0, FmaSm};
-    assign FmaShiftAmt = (FmaNCnt&{$clog2(3*`NF+7){~FmaKillProd}})+DenormShift;
+    assign FmaShiftAmt = FmaNCnt+DenormShift;
 endmodule
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@ -42,7 +42,7 @@ module fpu (
  input logic [1:0]        STATUS_FS, // Is floating-point enabled?
  output logic 		   FRegWriteM, // FP register write enable
  output logic 		   FpLoadStoreM, // Fp load instruction?
-  output logic              FLoad2,
+  output logic              FStore2,
  output logic 		   FStallD, // Stall the decode stage
  output logic 		   FWriteIntE, // integer register write enables
  output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
@ -125,12 +125,11 @@ module fpu (
   logic [`CVTLEN-1:0]      CvtLzcInE, CvtLzcInM;      // input to the Leading Zero Counter (priority encoder)
   
   //divide signals
-   logic [`DIVLEN+2:0] QuotE, QuotM;
+   logic [`QLEN-1-(`RADIX/4):0] QuotM;
   logic [`NE+1:0] DivCalcExpE, DivCalcExpM; 
-   logic DivNegStickyE, DivNegStickyM;
   logic DivStickyE, DivStickyM;
   logic DivDoneM;
-   logic [$clog2(`DIVLEN/2+3)-1:0] EarlyTermShiftDiv2E, EarlyTermShiftDiv2M;
+   logic [`DURLEN-1:0] EarlyTermShiftM;

   // result and flag signals
   logic [63:0] 	  FDivResM, FDivResW;                 // divide/squareroot result
@ -288,8 +287,8 @@ module fpu (
   //       .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
   divsqrt divsqrt(.clk, .reset, .FmtE, .XManE, .YManE, .XExpE, .YExpE, 
                  .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .DivStartE(FDivStartE), 
-                  .StallE, .StallM, .DivStickyM, .DivNegStickyM, .DivBusy(FDivBusyE), .DivCalcExpM, //***change divbusyE to M signal
-                  .EarlyTermShiftDiv2M, .QuotM, .DivDone(DivDoneM));
+                  .StallE, .StallM, .DivStickyM, .DivBusy(FDivBusyE), .DivCalcExpM, //***change divbusyE to M signal
+                  .EarlyTermShiftM, .QuotM, .DivDone(DivDoneM));
   // other FP execution units
   fcmp fcmp (.FmtE, .FOpCtrlE, .XSgnE, .YSgnE, .XExpE, .YExpE, .XManE, .YManE, 
            .XZeroE, .YZeroE, .XNaNE, .YNaNE, .XSNaNE, .YSNaNE, .FSrcXE, .FSrcYE, .CmpNVE, .CmpFpResE, .CmpIntResE);
@ -308,8 +307,8 @@ module fpu (
      assign FWriteDataE = FSrcYE[`XLEN-1:0]; 
   end else begin
      logic [`FLEN-1:0] FWriteDataE;
-      if(`FMTBITS == 2) assign FLoad2 = FmtM == `FMT;
-      else assign FLoad2 = FmtM;
+      if(`FMTBITS == 2) assign FStore2 = FmtM == `FMT;
+      else assign FStore2 = FmtM;

      if (`FPSIZES==1) assign FWriteDataE = FSrcYE;
      else if (`FPSIZES==2) assign FWriteDataE = FmtE ? FSrcYE : {2{FSrcYE[`LEN1-1:0]}};
@ -381,12 +380,12 @@ module fpu (

   assign FpLoadStoreM = FResSelM[1];

-   postprocess postprocess(.Xs(XSgnM), .Ys(YSgnM), .Ze(ZExpM), .Xm(XManM), .Ym(YManM), .Zm(ZManM), .Frm(FrmM), .Fmt(FmtM), .FmaPe(ProdExpM), .DivEarlyTermShiftDiv2(EarlyTermShiftDiv2M),
-                           .FmaZmSticky(AddendStickyM), .FmaKillProd(KillProdM), .XZero(XZeroM), .YZero(YZeroM), .ZZero(ZZeroM), .XInf(XInfM), .YInf(YInfM), .Quot(QuotM),
-                           .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SumM), .DivCalcExp(DivCalcExpM), .DivDone(DivDoneM),
-                           .FmaNegSum(NegSumM), .FmaInvA(InvAM), .ZDenorm(ZDenormM), .FmaAs(ZSgnEffM), .FmaPs(PSgnM), .FOpCtrl(FOpCtrlM), .FmaNCnt(FmaNormCntM), .DivNegSticky(DivNegStickyM),
-                           .CvtCe(CvtCalcExpM), .CvtResDenormUf(CvtResDenormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CvtResSgnM), .ToInt(FWriteIntM), .DivSticky(DivStickyM),
-                           .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), .PostProcSel(PostProcSelM), .W(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));
+   postprocess postprocess(.Xs(XSgnM), .Ys(YSgnM), .Ze(ZExpM), .Xm(XManM), .Ym(YManM), .Zm(ZManM), .Frm(FrmM), .Fmt(FmtM), .FmaPe(ProdExpM), .DivEarlyTermShift(EarlyTermShiftM),
+                           .FmaZmS(AddendStickyM), .FmaKillProd(KillProdM), .XZero(XZeroM), .YZero(YZeroM), .ZZero(ZZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QuotM),
+                           .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SumM), .DivQe(DivCalcExpM), .DivDone(DivDoneM),
+                           .FmaNegSum(NegSumM), .FmaInvA(InvAM), .ZDenorm(ZDenormM), .FmaAs(ZSgnEffM), .FmaPs(PSgnM), .FOpCtrl(FOpCtrlM), .FmaNCnt(FmaNormCntM),
+                           .CvtCe(CvtCalcExpM), .CvtResDenormUf(CvtResDenormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CvtResSgnM), .ToInt(FWriteIntM), .DivS(DivStickyM),
+                           .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));

   // FPU flag selection - to privileged
   mux2  #(5)  FPUFlgMux ({PreNVM&~FResSelM[1], 4'b0}, PostProcFlgM, ~FResSelM[1]&FResSelM[0], SetFflagsM);
--- a/pipelined/src/fpu/otfc.sv
+++ b/pipelined/src/fpu/otfc.sv
@ -0,0 +1,112 @@
+///////////////////////////////////////////
+// otfc.sv
+//
+// Written: me@KatherineParry.com, cturek@hmc.edu 
+// Modified:7/14/2022
+//
+// Purpose: On the fly conversion
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module otfc2 (
+  input  logic         qp, qz,              // digit selects: qp = +1, qz = 0; neither set = -1
+  input  logic [`QLEN-1:0] Q, QM,           // quotient accumulated so far, and its companion QM
+  output logic [`QLEN-1:0] QNext, QMNext    // Q and QM after the new digit is shifted in
+);
+  //  Radix-2 on-the-fly converter: each new quotient digit is
+  //  shifted into the LSB of Q and QM as it is produced.
+  //  Use this otfc for division only.
+  logic [`QLEN-2:0] QR, QMR;
+
+  assign QR  = Q[`QLEN-2:0];
+  assign QMR = QM[`QLEN-2:0];     // Q and QM with the MSB dropped, ready to take one new LSB
+
+  always_comb begin
+    if (qp) begin            // digit +1: both outputs build on Q
+      QNext  = {QR,  1'b1};
+      QMNext = {QR,  1'b0};
+    end else if (qz) begin   // digit 0: QNext builds on Q, QMNext builds on QM
+      QNext  = {QR,  1'b0};
+      QMNext = {QMR, 1'b1};
+    end else begin        // neither qp nor qz is set, so the digit is -1: both outputs build on QM
+      QNext  = {QMR, 1'b1};
+      QMNext = {QMR, 1'b0};
+    end 
+  end
+
+endmodule
+
+
+module otfc4 (
+  input  logic [3:0]   q,                   // digit select: q[3] = +2, q[2] = +1, q[1] = -1, q[0] = -2, none set = 0
+  input  logic [`QLEN-1:0] Q, QM,           // quotient accumulated so far, and QM = Q-1
+  output logic [`QLEN-1:0] QNext, QMNext    // Q and QM after the new two-bit digit is shifted in
+);
+
+  //  The on-the-fly converter transfers the quotient 
+  //  bits to the quotient as they come. 
+  //
+  //  This code follows the pseudocode presented in the 
+  //  floating point chapter of the book. Right now, 
+  //  it is written for Radix-4 division.
+  //
+  //  QM is Q-1. It allows us to write negative digits 
+  //  without using a costly carry-propagate adder (CPA). 
+
+  //  QR and QMR are the shifted versions of Q and QM.
+  //  They keep bits [QLEN-3:0], i.e. they discard the 
+  //  2 most significant bits of Q and QM. 
+  logic [`QLEN-3:0] QR, QMR;
+
+  // shift Q (quotient) and QM (quotient-1)
+  //   if      q = 2    Q = {Q, 10}    QM = {Q, 01}
+  //   else if q = 1    Q = {Q, 01}    QM = {Q, 00}
+  //   else if q = 0    Q = {Q, 00}    QM = {QM, 11}
+  //   else if q = -1   Q = {QM, 11}   QM = {QM, 10}
+  //   else if q = -2   Q = {QM, 10}   QM = {QM, 01}
+
+  assign QR  = Q[`QLEN-3:0];
+  assign QMR = QM[`QLEN-3:0];     // Q and QM with the top 2 bits dropped, ready to take 2 new LSBs
+  always_comb begin
+    if (q[3]) begin // +2
+      QNext  = {QR,  2'b10};
+      QMNext = {QR,  2'b01};
+    end else if (q[2]) begin // +1
+      QNext  = {QR,  2'b01};
+      QMNext = {QR,  2'b00};
+    end else if (q[1]) begin // -1
+      QNext  = {QMR,  2'b11};
+      QMNext = {QMR,  2'b10};
+    end else if (q[0]) begin // -2
+      QNext  = {QMR,  2'b10};
+      QMNext = {QMR,  2'b01};
+    end else begin           // 0
+      QNext  = {QR,  2'b00};
+      QMNext = {QMR, 2'b11};
+    end 
+  end
+  // Final Quotient is in the range [.5, 2)
+
+endmodule
--- a/pipelined/src/fpu/postprocess.sv
+++ b/pipelined/src/fpu/postprocess.sv
@ -29,7 +29,7 @@

 `include "wally-config.vh"

-module postprocess(
+module postprocess (
    // general signals
    input logic                             Xs, Ys,  // input signs
    input logic  [`NE-1:0]                  Ze, // input exponents
@ -48,18 +48,17 @@ module postprocess(
    input logic                             FmaPs,      // the product's sign
    input logic  [`NE+1:0]                  FmaPe,       // Product exponent
    input logic  [3*`NF+5:0]                FmaSm,       // the positive sum
-    input logic                             FmaZmSticky,  // sticky bit that is calculated during alignment
+    input logic                             FmaZmS,  // sticky bit that is calculated during alignment
    input logic                             FmaKillProd,      // set the product to zero before addition if the product is too small to matter
    input logic                             FmaNegSum,    // was the sum negitive
    input logic                             FmaInvA,      // do you invert Z
    input logic  [$clog2(3*`NF+7)-1:0]      FmaNCnt,   // the normalization shift count
    //divide signals
-    input logic  [$clog2(`DIVLEN/2+3)-1:0]  DivEarlyTermShiftDiv2,
-    input logic                             DivSticky,
-    input logic                             DivNegSticky,
+    input logic  [`DURLEN-1:0]              DivEarlyTermShift,
+    input logic                             DivS,
    input logic                             DivDone,
-    input logic  [`NE+1:0]                  DivCalcExp,
-    input logic  [`DIVLEN+2:0]              Quot,
+    input logic  [`NE+1:0]                  DivQe,
+    input logic  [`QLEN-1-(`RADIX/4):0]                DivQm,
    // conversion signals
    input logic                             CvtCs,     // the result's sign
    input logic  [`NE:0]                    CvtCe,    // the calculated expoent
@ -69,7 +68,7 @@ module postprocess(
    input logic  [`CVTLEN-1:0]              CvtLzcIn,      // input to the Leading Zero Counter (priority encoder)
    input logic                             IntZero,         // is the input zero
    // final results
-    output logic [`FLEN-1:0]                W,    // FMA final result
+    output logic [`FLEN-1:0]                PostProcRes,    // FMA final result
    output logic [4:0]                      PostProcFlg,
    output logic [`XLEN-1:0]                FCvtIntRes    // the int conversion result
    );
@ -78,32 +77,31 @@ module postprocess(
    logic Ws;
    logic [`NF-1:0] Rf; // Result fraction
    logic [`NE-1:0] Re;  // Result exponent
-    logic Nsgn;
-    logic [`NE+1:0] Nexp;
-    logic [`CORRSHIFTSZ-1:0] Nfrac; // corectly shifted fraction
-    logic [`NE+1:0] FullResExp;  // Re with bits to determine sign and overflow
+    logic Ms;
+    logic [`NE+1:0] Me;
+    logic [`CORRSHIFTSZ-1:0] Mf; // corectly shifted fraction
+    logic [`NE+1:0] FullRe;  // Re with bits to determine sign and overflow
    logic S;           // S bit
    logic UfPlus1;                    // do you add one (for determining underflow flag)
    logic R;   // bits needed to determine rounding
-    logic [`FLEN:0] RoundAdd;       // how much to add to the result
    logic [$clog2(`NORMSHIFTSZ)-1:0] ShiftAmt;   // normalization shift count
    logic [`NORMSHIFTSZ-1:0] ShiftIn;        // is the sum zero
    logic [`NORMSHIFTSZ-1:0] Shifted;    // the shifted result
    logic Plus1;      // add one to the final result?
    logic IntInvalid, Overflow, Invalid; // flags
-    logic UfLSBRes;
+    logic UfL;
    logic [`FMTBITS-1:0] OutFmt;
    // fma signals
    logic [`NE+1:0] FmaSe;     // exponent of the normalized sum
-    logic FmaSmZero;        // is the sum zero
+    logic FmaSZero;        // is the sum zero
    logic [3*`NF+8:0] FmaShiftIn;        // shift input
-    logic [`NE+1:0] FmaConvNormSumExp;          // exponent of the normalized sum not taking into account denormal or zero results
+    logic [`NE+1:0] FmaNe;          // exponent of the normalized sum not taking into account denormal or zero results
    logic FmaPreResultDenorm;    // is the result denormalized - calculated before LZA corection
    logic [$clog2(3*`NF+7)-1:0] FmaShiftAmt;   // normalization shift count
    // division singals
    logic [$clog2(`NORMSHIFTSZ)-1:0] DivShiftAmt;
    logic [`NORMSHIFTSZ-1:0] DivShiftIn;
-    logic [`NE+1:0] DivCorrExp;
+    logic [`NE+1:0] Qe;
    logic DivByZero;
    logic DivResDenorm;
    logic [`NE+1:0] DivDenormShift;
@ -152,9 +150,9 @@ module postprocess(

    cvtshiftcalc cvtshiftcalc(.ToInt, .CvtCe, .CvtResDenormUf, .Xm, .CvtLzcIn,  
                              .XZero, .IntToFp, .OutFmt, .CvtResUf, .CvtShiftIn);
-    fmashiftcalc fmashiftcalc(.FmaSm, .Ze, .FmaPe, .FmaNCnt, .Fmt, .FmaKillProd, .FmaConvNormSumExp,
-                          .ZDenorm, .FmaSmZero, .FmaPreResultDenorm, .FmaShiftAmt, .FmaShiftIn);
-    divshiftcalc divshiftcalc(.Fmt, .DivCalcExp, .Quot, .DivEarlyTermShiftDiv2, .DivResDenorm, .DivDenormShift, .DivShiftAmt, .DivShiftIn);
+    fmashiftcalc fmashiftcalc(.FmaSm, .Ze, .FmaPe, .FmaNCnt, .Fmt, .FmaKillProd, .FmaNe,
+                          .FmaSZero, .FmaPreResultDenorm, .FmaShiftAmt, .FmaShiftIn);
+    divshiftcalc divshiftcalc(.Fmt, .DivQe, .DivQm, .DivEarlyTermShift, .DivResDenorm, .DivDenormShift, .DivShiftAmt, .DivShiftIn);

    always_comb
        case(PostProcSel)
@ -183,9 +181,9 @@ module postprocess(
    
    normshift normshift (.ShiftIn, .ShiftAmt, .Shifted);

-    lzacorrection lzacorrection(.FmaOp, .FmaKillProd, .FmaPreResultDenorm, .FmaConvNormSumExp,
-                                .DivResDenorm, .DivDenormShift, .DivOp, .DivCalcExp,
-                                .DivCorrExp, .FmaSmZero, .Shifted, .FmaSe, .Nfrac);
+    shiftcorrection shiftcorrection(.FmaOp, .FmaPreResultDenorm, .FmaNe,
+                                .DivResDenorm, .DivDenormShift, .DivOp, .DivQe,
+                                .Qe, .FmaSZero, .Shifted, .FmaSe, .Mf);

    ///////////////////////////////////////////////////////////////////////////////
    // Rounding
@ -199,19 +197,19 @@ module postprocess(

                          
    roundsign roundsign(.FmaPs, .FmaAs, .FmaInvA, .FmaOp, .DivOp, .CvtOp, .FmaNegSum, 
-                          .Xs, .Ys, .CvtCs, .Nsgn);
+                          .Xs, .Ys, .CvtCs, .Ms);

-    round round(.OutFmt, .Frm, .S, .FmaZmSticky, .ZZero, .Plus1, .PostProcSel, .CvtCe, .DivCorrExp,
-                .FmaInvA, .Nsgn, .FmaSe, .FmaOp, .CvtOp, .CvtResDenormUf, .Nfrac, .ToInt,  .CvtResUf,
-                .DivSticky, .DivNegSticky, .DivDone,
-                .DivOp, .UfPlus1, .FullResExp, .Rf, .Re, .R, .RoundAdd, .UfLSBRes, .Nexp);
+    round round(.OutFmt, .Frm, .S, .FmaZmS, .Plus1, .PostProcSel, .CvtCe, .Qe,
+                .Ms, .FmaSe, .FmaOp, .CvtOp, .CvtResDenormUf, .Mf, .ToInt,  .CvtResUf,
+                .DivS, .DivDone,
+                .DivOp, .UfPlus1, .FullRe, .Rf, .Re, .R, .UfL, .Me);

    ///////////////////////////////////////////////////////////////////////////////
    // Sign calculation
    ///////////////////////////////////////////////////////////////////////////////

    resultsign resultsign(.Frm, .FmaPs, .FmaAs, .FmaSe, .R, .S,
-                          .FmaOp, .ZInf, .InfIn, .FmaSmZero, .Mult, .Nsgn, .Ws);
+                          .FmaOp, .ZInf, .InfIn, .FmaSZero, .Mult, .Ms, .Ws);

    ///////////////////////////////////////////////////////////////////////////////
    // Flags
@ -220,18 +218,18 @@ module postprocess(
    flags flags(.XSNaN, .YSNaN, .ZSNaN, .XInf, .YInf, .ZInf, .InfIn, .XZero, .YZero, 
                .Xs, .Sqrt, .ToInt, .IntToFp, .Int64, .Signed, .OutFmt, .CvtCe,
                .XNaN, .YNaN, .NaNIn, .FmaAs, .FmaPs, .R, .IntInvalid, .DivByZero,
-                .UfLSBRes, .S, .UfPlus1, .CvtOp, .DivOp, .FmaOp, .FullResExp, .Plus1,
-                .Nexp, .CvtNegResMsbs, .Invalid, .Overflow, .PostProcFlg);
+                .UfL, .S, .UfPlus1, .CvtOp, .DivOp, .FmaOp, .FullRe, .Plus1,
+                .Me, .CvtNegResMsbs, .Invalid, .Overflow, .PostProcFlg);

    ///////////////////////////////////////////////////////////////////////////////
    // Select the result
    ///////////////////////////////////////////////////////////////////////////////

    negateintres negateintres(.Xs, .Shifted, .Signed, .Int64, .Plus1, .CvtNegResMsbs, .CvtNegRes);
-    resultselect resultselect(.Xs, .Xm, .Ym, .Zm, .XZero, .IntInvalid,
+    specialcase specialcase(.Xs, .Xm, .Ym, .Zm, .XZero, .IntInvalid,
        .IntZero, .Frm, .OutFmt, .XNaN, .YNaN, .ZNaN, .CvtResUf, 
        .NaNIn, .IntToFp, .Int64, .Signed, .CvtOp, .FmaOp, .Plus1, .Invalid, .Overflow, .InfIn, .CvtNegRes,
        .XInf, .YInf, .DivOp,
-        .DivByZero, .FullResExp, .CvtCe, .Ws, .Re, .Rf, .W, .FCvtIntRes);
+        .DivByZero, .FullRe, .CvtCe, .Ws, .Re, .Rf, .PostProcRes, .FCvtIntRes);

 endmodule
--- a/pipelined/src/fpu/qsel.sv
+++ b/pipelined/src/fpu/qsel.sv
@ -0,0 +1,135 @@
+///////////////////////////////////////////
+// qsel.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
+// Modified:13 January 2022
+//
+// Purpose: Quotient digit selection for radix-2 and radix-4 division
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module qsel2 ( // *** eventually just change to 4 bits
+  input  logic [`DIVLEN+3:`DIVLEN] ps, pc, // top 4 bits of two words that are added below (presumably the carry-save partial remainder - confirm against the caller)
+  output logic         qp, qz // digit selects: qp = +1, qz = 0; qn (-1) is not output and is implied when both are low
+);
+ 
+  logic [`DIVLEN+3:`DIVLEN]  p, g;        // per-bit propagate and generate of ps + pc
+  logic          magnitude, sign, cout;
+
+  // The quotient selection logic is presented for simplicity, not
+  // for efficiency.  You can probably optimize your logic to
+  // select the proper divisor with less delay.
+
+  // Quotient equations from EE371 lecture notes 13-20
+  assign p = ps ^ pc;
+  assign g = ps & pc;
+
+  assign magnitude = ~(&p[`DIVLEN+2:`DIVLEN]); // 0 only when all three low propagate bits are 1
+  assign cout = g[`DIVLEN+2] | (p[`DIVLEN+2] & (g[`DIVLEN+1] | p[`DIVLEN+1] & g[`DIVLEN])); // carry out of the three low bits (no carry in)
+  assign sign = p[`DIVLEN+3] ^ cout; // top bit of the 4-bit sum ps + pc
+/*  assign #1 magnitude = ~((ps[54]^pc[54]) & (ps[53]^pc[53]) & 
+			  (ps[52]^pc[52]));
+  assign #1 sign = (ps[55]^pc[55])^
+      (ps[54] & pc[54] | ((ps[54]^pc[54]) &
+			    (ps[53]&pc[53] | ((ps[53]^pc[53]) &
+						(ps[52]&pc[52]))))); */
+
+  // Produce quotient = +1, 0, or -1
+  assign qp = magnitude & ~sign;
+  assign qz = ~magnitude;
+//   assign #1 qn = magnitude & sign;
+endmodule
+
+module qsel4 (
+	input logic [`DIVLEN+3:0] D,       // presumably the divisor; only bits [DIVLEN-1:DIVLEN-3] are used here
+	input logic [`DIVLEN+3:0] WS, WC,  // two words whose top 8 bits are added below to estimate W
+	output logic [3:0] q               // selected digit: 1000, 0100, 0000, 0010, 0001 from most positive to most negative W estimate
+);
+	logic [6:0] Wmsbs;
+	logic [7:0] PreWmsbs;
+	logic [2:0] Dmsbs;
+	assign PreWmsbs = WC[`DIVLEN+3:`DIVLEN-4] + WS[`DIVLEN+3:`DIVLEN-4]; // 8-bit sum of the top bits of WC and WS
+	assign Wmsbs = PreWmsbs[7:1];          // keep the top 7 bits of that sum
+	assign Dmsbs = D[`DIVLEN-1:`DIVLEN-3]; // the 3 bits of D just below bit DIVLEN
+	// D = 0001.xxx...
+	// Dmsbs = |   |
+  // W =      xxxx.xxx...
+	// Wmsbs = |        |
+
+	logic [3:0] QSel4[1023:0]; // lookup table with 8 Dmsbs rows x 128 Wmsbs values, indexed by {Dmsbs, Wmsbs}
+
+  always_comb begin 
+    integer d, w, i, w2;
+    for(d=0; d<8; d++)
+      for(w=0; w<128; w++)begin
+        i = d*128+w; // same index as {Dmsbs, Wmsbs}
+        w2 = w-128*(w>=64); // reinterpret the 7-bit value w as two's complement, range -64..63
+        case(d)
+          0: if($signed(w2)>=$signed(12))      QSel4[i] = 4'b1000;
+            else if(w2>=4)   QSel4[i] = 4'b0100; 
+            else if(w2>=-4)  QSel4[i] = 4'b0000; 
+            else if(w2>=-13) QSel4[i] = 4'b0010; 
+            else            QSel4[i] = 4'b0001; 
+          1: if(w2>=14)      QSel4[i] = 4'b1000;
+            else if(w2>=4)   QSel4[i] = 4'b0100; 
+            else if(w2>=-6)  QSel4[i] = 4'b0000; 
+            else if(w2>=-15) QSel4[i] = 4'b0010; 
+            else            QSel4[i] = 4'b0001; 
+          2: if(w2>=15)      QSel4[i] = 4'b1000;
+            else if(w2>=4)   QSel4[i] = 4'b0100; 
+            else if(w2>=-6)  QSel4[i] = 4'b0000; 
+            else if(w2>=-16) QSel4[i] = 4'b0010; 
+            else            QSel4[i] = 4'b0001; 
+          3: if(w2>=16)      QSel4[i] = 4'b1000;
+            else if(w2>=4)   QSel4[i] = 4'b0100; 
+            else if(w2>=-6)  QSel4[i] = 4'b0000; 
+            else if(w2>=-18) QSel4[i] = 4'b0010; 
+            else            QSel4[i] = 4'b0001; 
+          4: if(w2>=18)      QSel4[i] = 4'b1000;
+            else if(w2>=6)   QSel4[i] = 4'b0100; 
+            else if(w2>=-8)  QSel4[i] = 4'b0000; 
+            else if(w2>=-20) QSel4[i] = 4'b0010; 
+            else            QSel4[i] = 4'b0001; 
+          5: if(w2>=20)      QSel4[i] = 4'b1000;
+            else if(w2>=6)   QSel4[i] = 4'b0100; 
+            else if(w2>=-8)  QSel4[i] = 4'b0000; 
+            else if(w2>=-20) QSel4[i] = 4'b0010; 
+            else            QSel4[i] = 4'b0001; 
+          6: if(w2>=20)      QSel4[i] = 4'b1000;
+            else if(w2>=8)   QSel4[i] = 4'b0100; 
+            else if(w2>=-8)  QSel4[i] = 4'b0000; 
+            else if(w2>=-22) QSel4[i] = 4'b0010; 
+            else            QSel4[i] = 4'b0001; 
+          7: if(w2>=24)      QSel4[i] = 4'b1000;
+            else if(w2>=8)   QSel4[i] = 4'b0100; 
+            else if(w2>=-8)  QSel4[i] = 4'b0000; 
+            else if(w2>=-24) QSel4[i] = 4'b0010; 
+            else            QSel4[i] = 4'b0001; 
+        endcase
+      end
+  end
+	assign q = QSel4[{Dmsbs,Wmsbs}]; // look up the digit for the current divisor/residual estimate
+	
+endmodule
--- a/pipelined/src/fpu/resultsign.sv
+++ b/pipelined/src/fpu/resultsign.sv
@ -35,32 +35,29 @@ module resultsign(
    input logic         InfIn,
    input logic         FmaOp,
    input logic [`NE+1:0] FmaSe,
-    input logic         FmaSmZero,
+    input logic         FmaSZero,
    input logic         Mult,
    input logic         R,
    input logic         S,
-    input logic         Nsgn,
+    input logic         Ms,
    output logic        Ws
 );

-    logic ZeroSgn;
-    logic InfSgn;
-    logic Underflow;
-    // logic ResultSgnTmp;
+    logic Zeros;
+    logic Infs;

    // Determine the sign if the sum is zero
    //      if cancellation (product and addend signs differ, no underflow, not a multiply)
    //          then 0 (positive) unless rounding down (toward -infinity)
    //      otherwise the sign of the product (FmaPs)
-    assign Underflow = FmaSe[`NE+1] | ((FmaSe == 0) & (R|S));
-    assign ZeroSgn = (FmaPs^FmaAs)&~Underflow&~Mult ? Frm[1:0] == 2'b10 : FmaPs;
+    assign Zeros = (FmaPs^FmaAs)&~(FmaSe[`NE+1] | ((FmaSe == 0) & (R|S)))&~Mult ? Frm[1:0] == 2'b10 : FmaPs;


    // is the result negative
    //  if p - z is the Sum negative
    //  if -p + z is the Sum positive
    //  if -p - z then the Sum is negative
-    assign InfSgn = ZInf ? FmaAs : FmaPs;
-    assign Ws = InfIn&FmaOp ? InfSgn : FmaSmZero&FmaOp ? ZeroSgn : Nsgn;
+    assign Infs = ZInf ? FmaAs : FmaPs;
+    assign Ws = InfIn&FmaOp ? Infs : FmaSZero&FmaOp ? Zeros : Ms;

 endmodule
--- a/pipelined/src/fpu/round.sv
+++ b/pipelined/src/fpu/round.sv
@ -46,36 +46,32 @@ module round(
    input logic  [1:0]              PostProcSel,
    input logic                     CvtResDenormUf,
    input logic                     CvtResUf,
-    input logic  [`CORRSHIFTSZ-1:0] Nfrac,
-    input logic                     FmaZmSticky,  // addend's sticky bit
-    input logic                     ZZero,         // is Z zero
-    input logic                     FmaInvA,          // invert Z
+    input logic  [`CORRSHIFTSZ-1:0] Mf,
+    input logic                     FmaZmS,  // addend's sticky bit
    input logic  [`NE+1:0]          FmaSe,         // exponent of the normalized sum
-    input logic                     Nsgn,      // the result's sign
+    input logic                     Ms,      // the result's sign
    input logic  [`NE:0]            CvtCe,    // the calculated expoent
-    input logic  [`NE+1:0]          DivCorrExp,    // the calculated expoent
-    input logic                     DivSticky,             // sticky bit
-    input logic                     DivNegSticky,
+    input logic  [`NE+1:0]          Qe,    // the calculated expoent
+    input logic                     DivS,             // sticky bit
    output logic                    UfPlus1,  // do you add or subtract on from the result
-    output logic [`NE+1:0]          FullResExp,      // Re with bits to determine sign and overflow
+    output logic [`NE+1:0]          FullRe,      // Re with bits to determine sign and overflow
    output logic [`NF-1:0]          Rf,         // Result fraction
    output logic [`NE-1:0]          Re,          // Result exponent
    output logic                    S,             // sticky bit
-    output logic [`NE+1:0]          Nexp,
+    output logic [`NE+1:0]          Me,
    output logic                    Plus1,
-    output logic [`FLEN:0]          RoundAdd,           // how much to add to the result
-    output logic                    R, UfLSBRes // bits needed to calculate rounding
+    output logic                    R, UfL // bits needed to calculate rounding
 );
-    logic           LSBRes;         // bit used for rounding - least significant bit of the normalized sum
-    logic           SubBySmallNum, UfSubBySmallNum;  // was there supposed to be a subtraction by a small number
-    logic           UfCalcPlus1, CalcMinus1, Minus1; // do you add or subtract on from the result
-    logic           NormSumSticky;  // normalized sum's sticky bit
-    logic           UfSticky;   // sticky bit for underlow calculation
+    logic           L;         // bit used for rounding - least significant bit of the normalized sum
+    logic           UfCalcPlus1; 
+    logic           NormS;  // normalized sum's sticky bit
+    logic           UfS;   // sticky bit for underlow calculation
    logic [`NF-1:0] RoundFrac;
    logic           FpRes, IntRes;
-    logic           UfRound;
+    logic           UfR;
    logic           FpRound, FpLSBRes, FpUfRound;
    logic           CalcPlus1, FpPlus1;
+    logic [`FLEN:0] RoundAdd;           // how much to add to the result

    ///////////////////////////////////////////////////////////////////////////////
    // Rounding
@ -118,61 +114,61 @@ module round(
    //      |    NF     |1|1|
    //                     ^    ^ if floating point result
    //                     ^ if not an FMA result
-        if (`XLENPOS == 1)assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
-                                                 (|Nfrac[`CORRSHIFTSZ-`XLEN-2:0]);
+        if (`XLENPOS == 1)assign NormS = (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
+                                                 (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
    //     2: NF > XLEN
-        if (`XLENPOS == 2)assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&IntRes) |
-                                                 (|Nfrac[`CORRSHIFTSZ-`NF-2:0]);
+        if (`XLENPOS == 2)assign NormS = (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&IntRes) |
+                                                 (|Mf[`CORRSHIFTSZ-`NF-2:0]);

    end else if (`FPSIZES == 2) begin
        // XLEN is either 64 or 32
        // so half and single are always smaller then XLEN

        // 1: XLEN > NF   > NF1
-        if (`XLENPOS == 1) assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&FpRes&~OutFmt) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`XLEN-2:0]);
+        if (`XLENPOS == 1) assign NormS = (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&FpRes&~OutFmt) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
+                                                  (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
        // 2: NF   > XLEN > NF1
-        if (`XLENPOS == 2) assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~OutFmt) | 
-                                                  (|Nfrac[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&(IntRes|~OutFmt)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF-2:0]);
+        if (`XLENPOS == 2) assign NormS = (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~OutFmt) | 
+                                                  (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&(IntRes|~OutFmt)) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF-2:0]);
        // 3: NF   > NF1  > XLEN
-        if (`XLENPOS == 3) assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF1-1]&IntRes) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&(~OutFmt|IntRes)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF-2:0]);
+        if (`XLENPOS == 3) assign NormS = (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF1-1]&IntRes) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&(~OutFmt|IntRes)) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF-2:0]);

    end else if (`FPSIZES == 3) begin
        // 1: XLEN > NF   > NF1
-        if (`XLENPOS == 1) assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`NF1-1]&FpRes&(OutFmt==`FMT1)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&FpRes&~(OutFmt==`FMT)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`XLEN-2:0]);
+        if (`XLENPOS == 1) assign NormS = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`NF1-1]&FpRes&(OutFmt==`FMT1)) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&FpRes&~(OutFmt==`FMT)) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes) |
+                                                  (|Mf[`CORRSHIFTSZ-`XLEN-2:0]);
        // 2: NF   > XLEN > NF1
-        if (`XLENPOS == 2) assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`NF1-1]&FpRes&(OutFmt==`FMT1)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~(OutFmt==`FMT)) | 
-                                                  (|Nfrac[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&(IntRes|~(OutFmt==`FMT))) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF-2:0]);
+        if (`XLENPOS == 2) assign NormS = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`NF1-1]&FpRes&(OutFmt==`FMT1)) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~(OutFmt==`FMT)) | 
+                                                  (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF-1]&(IntRes|~(OutFmt==`FMT))) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF-2:0]);
        // 3: NF   > NF1  > XLEN
-        if (`XLENPOS == 3) assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&(OutFmt==`FMT1)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF1-1]&((OutFmt==`FMT1)|IntRes)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&(~(OutFmt==`FMT)|IntRes)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`NF-2:0]);
+        if (`XLENPOS == 3) assign NormS = (|Mf[`CORRSHIFTSZ-`NF2-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&(OutFmt==`FMT1)) |
+                                                  (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`NF1-1]&((OutFmt==`FMT1)|IntRes)) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF1-2:`CORRSHIFTSZ-`NF-1]&(~(OutFmt==`FMT)|IntRes)) |
+                                                  (|Mf[`CORRSHIFTSZ-`NF-2:0]);

    end else if (`FPSIZES == 4) begin
        // Quad precision will always be greater than XLEN
        // 2: NF   > XLEN > NF1
-        if (`XLENPOS == 2) assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`H_NF-2:`CORRSHIFTSZ-`S_NF-1]&FpRes&(OutFmt==`H_FMT)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`S_NF-2:`CORRSHIFTSZ-`D_NF-1]&FpRes&((OutFmt==`S_FMT)|(OutFmt==`H_FMT))) | 
-                                                  (|Nfrac[`CORRSHIFTSZ-`D_NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~(OutFmt==`Q_FMT)) | 
-                                                  (|Nfrac[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`Q_NF-1]&(~(OutFmt==`Q_FMT)|IntRes)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`Q_NF-2:0]);
+        if (`XLENPOS == 2) assign NormS = (|Mf[`CORRSHIFTSZ-`H_NF-2:`CORRSHIFTSZ-`S_NF-1]&FpRes&(OutFmt==`H_FMT)) |
+                                                  (|Mf[`CORRSHIFTSZ-`S_NF-2:`CORRSHIFTSZ-`D_NF-1]&FpRes&((OutFmt==`S_FMT)|(OutFmt==`H_FMT))) | 
+                                                  (|Mf[`CORRSHIFTSZ-`D_NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&~(OutFmt==`Q_FMT)) | 
+                                                  (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`Q_NF-1]&(~(OutFmt==`Q_FMT)|IntRes)) |
+                                                  (|Mf[`CORRSHIFTSZ-`Q_NF-2:0]);
        // 3: NF   > NF1  > XLEN
        // The extra XLEN bit will be ored later when calculating the final sticky bit - the ufplus1 not needed for integer
-        if (`XLENPOS == 3) assign NormSumSticky = (|Nfrac[`CORRSHIFTSZ-`H_NF-2:`CORRSHIFTSZ-`S_NF-1]&FpRes&(OutFmt==`H_FMT)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`S_NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&((OutFmt==`S_FMT)|(OutFmt==`H_FMT))) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`D_NF-1]&((OutFmt==`S_FMT)|(OutFmt==`H_FMT)|IntRes)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`D_NF-2:`CORRSHIFTSZ-`Q_NF-1]&(~(OutFmt==`Q_FMT)|IntRes)) |
-                                                  (|Nfrac[`CORRSHIFTSZ-`Q_NF-2:0]);
+        if (`XLENPOS == 3) assign NormS = (|Mf[`CORRSHIFTSZ-`H_NF-2:`CORRSHIFTSZ-`S_NF-1]&FpRes&(OutFmt==`H_FMT)) |
+                                                  (|Mf[`CORRSHIFTSZ-`S_NF-2:`CORRSHIFTSZ-`XLEN-1]&FpRes&((OutFmt==`S_FMT)|(OutFmt==`H_FMT))) |
+                                                  (|Mf[`CORRSHIFTSZ-`XLEN-2:`CORRSHIFTSZ-`D_NF-1]&((OutFmt==`S_FMT)|(OutFmt==`H_FMT)|IntRes)) |
+                                                  (|Mf[`CORRSHIFTSZ-`D_NF-2:`CORRSHIFTSZ-`Q_NF-1]&(~(OutFmt==`Q_FMT)|IntRes)) |
+                                                  (|Mf[`CORRSHIFTSZ-`Q_NF-2:0]);

    end
    
@ -180,37 +176,37 @@ module round(

    // only add the Addend sticky if doing an FMA operation
    //      - the shifter shifts too far left when there's an underflow (shifting out all possible sticky bits)
-    assign UfSticky = FmaZmSticky&FmaOp | NormSumSticky | CvtResUf&CvtOp | FmaSe[`NE+1]&FmaOp | DivSticky&DivOp;
+    assign UfS = FmaZmS&FmaOp | NormS | CvtResUf&CvtOp | FmaSe[`NE+1]&FmaOp | DivS&DivOp;
    
    // determine round and LSB of the rounded value
    //      - underflow round bit is used to determine the underflow flag
    if (`FPSIZES == 1) begin
-        assign FpRound = Nfrac[`CORRSHIFTSZ-`NF-1];
-        assign FpLSBRes = Nfrac[`CORRSHIFTSZ-`NF];
-        assign FpUfRound = Nfrac[`CORRSHIFTSZ-`NF-2];
+        assign FpRound = Mf[`CORRSHIFTSZ-`NF-1];
+        assign FpLSBRes = Mf[`CORRSHIFTSZ-`NF];
+        assign FpUfRound = Mf[`CORRSHIFTSZ-`NF-2];

    end else if (`FPSIZES == 2) begin
-        assign FpRound = OutFmt ? Nfrac[`CORRSHIFTSZ-`NF-1] : Nfrac[`CORRSHIFTSZ-`NF1-1];
-        assign FpLSBRes = OutFmt ? Nfrac[`CORRSHIFTSZ-`NF] : Nfrac[`CORRSHIFTSZ-`NF1];
-        assign FpUfRound = OutFmt ? Nfrac[`CORRSHIFTSZ-`NF-2] : Nfrac[`CORRSHIFTSZ-`NF1-2];
+        assign FpRound = OutFmt ? Mf[`CORRSHIFTSZ-`NF-1] : Mf[`CORRSHIFTSZ-`NF1-1];
+        assign FpLSBRes = OutFmt ? Mf[`CORRSHIFTSZ-`NF] : Mf[`CORRSHIFTSZ-`NF1];
+        assign FpUfRound = OutFmt ? Mf[`CORRSHIFTSZ-`NF-2] : Mf[`CORRSHIFTSZ-`NF1-2];

    end else if (`FPSIZES == 3) begin
        always_comb
            case (OutFmt)
                `FMT: begin
-                    FpRound = Nfrac[`CORRSHIFTSZ-`NF-1];
-                    FpLSBRes = Nfrac[`CORRSHIFTSZ-`NF];
-                    FpUfRound = Nfrac[`CORRSHIFTSZ-`NF-2];
+                    FpRound = Mf[`CORRSHIFTSZ-`NF-1];
+                    FpLSBRes = Mf[`CORRSHIFTSZ-`NF];
+                    FpUfRound = Mf[`CORRSHIFTSZ-`NF-2];
                end
                `FMT1: begin
-                    FpRound = Nfrac[`CORRSHIFTSZ-`NF1-1];
-                    FpLSBRes = Nfrac[`CORRSHIFTSZ-`NF1];
-                    FpUfRound = Nfrac[`CORRSHIFTSZ-`NF1-2];
+                    FpRound = Mf[`CORRSHIFTSZ-`NF1-1];
+                    FpLSBRes = Mf[`CORRSHIFTSZ-`NF1];
+                    FpUfRound = Mf[`CORRSHIFTSZ-`NF1-2];
                end
                `FMT2: begin
-                    FpRound = Nfrac[`CORRSHIFTSZ-`NF2-1];
-                    FpLSBRes = Nfrac[`CORRSHIFTSZ-`NF2];
-                    FpUfRound = Nfrac[`CORRSHIFTSZ-`NF2-2];
+                    FpRound = Mf[`CORRSHIFTSZ-`NF2-1];
+                    FpLSBRes = Mf[`CORRSHIFTSZ-`NF2];
+                    FpUfRound = Mf[`CORRSHIFTSZ-`NF2-2];
                end
                default: begin
                    FpRound = 1'bx;
@ -222,130 +218,97 @@ module round(
        always_comb
            case (OutFmt)
                2'h3: begin
-                    FpRound = Nfrac[`CORRSHIFTSZ-`Q_NF-1];
-                    FpLSBRes = Nfrac[`CORRSHIFTSZ-`Q_NF];
-                    FpUfRound = Nfrac[`CORRSHIFTSZ-`Q_NF-2];
+                    FpRound = Mf[`CORRSHIFTSZ-`Q_NF-1];
+                    FpLSBRes = Mf[`CORRSHIFTSZ-`Q_NF];
+                    FpUfRound = Mf[`CORRSHIFTSZ-`Q_NF-2];
                end
                2'h1: begin
-                    FpRound = Nfrac[`CORRSHIFTSZ-`D_NF-1];
-                    FpLSBRes = Nfrac[`CORRSHIFTSZ-`D_NF];
-                    FpUfRound = Nfrac[`CORRSHIFTSZ-`D_NF-2];
+                    FpRound = Mf[`CORRSHIFTSZ-`D_NF-1];
+                    FpLSBRes = Mf[`CORRSHIFTSZ-`D_NF];
+                    FpUfRound = Mf[`CORRSHIFTSZ-`D_NF-2];
                end
                2'h0: begin
-                    FpRound = Nfrac[`CORRSHIFTSZ-`S_NF-1];
-                    FpLSBRes = Nfrac[`CORRSHIFTSZ-`S_NF];
-                    FpUfRound = Nfrac[`CORRSHIFTSZ-`S_NF-2];
+                    FpRound = Mf[`CORRSHIFTSZ-`S_NF-1];
+                    FpLSBRes = Mf[`CORRSHIFTSZ-`S_NF];
+                    FpUfRound = Mf[`CORRSHIFTSZ-`S_NF-2];
                end
                2'h2: begin
-                    FpRound = Nfrac[`CORRSHIFTSZ-`H_NF-1];
-                    FpLSBRes = Nfrac[`CORRSHIFTSZ-`H_NF];
-                    FpUfRound = Nfrac[`CORRSHIFTSZ-`H_NF-2];
+                    FpRound = Mf[`CORRSHIFTSZ-`H_NF-1];
+                    FpLSBRes = Mf[`CORRSHIFTSZ-`H_NF];
+                    FpUfRound = Mf[`CORRSHIFTSZ-`H_NF-2];
                end
            endcase
    end

-    assign R = ToInt&CvtOp ? Nfrac[`CORRSHIFTSZ-`XLEN-1] : FpRound;
-    assign LSBRes = ToInt&CvtOp ? Nfrac[`CORRSHIFTSZ-`XLEN] : FpLSBRes;
-    assign UfRound = ToInt&CvtOp ? Nfrac[`CORRSHIFTSZ-`XLEN-2] : FpUfRound;
+    assign R = ToInt&CvtOp ? Mf[`CORRSHIFTSZ-`XLEN-1] : FpRound;
+    assign L = ToInt&CvtOp ? Mf[`CORRSHIFTSZ-`XLEN] : FpLSBRes;
+    assign UfR = ToInt&CvtOp ? Mf[`CORRSHIFTSZ-`XLEN-2] : FpUfRound;

    // used to determine underflow flag
-    assign UfLSBRes = FpRound;
+    assign UfL = FpRound;
    // determine sticky
-    assign S = UfSticky | UfRound;
-
-
-    // Deterimine if a small number was supposed to be subtrated
-    //  - for FMA or if division has a negitive sticky bit
-    assign SubBySmallNum = ((FmaZmSticky&FmaOp&~ZZero&FmaInvA) | (DivNegSticky&DivOp)) & ~(NormSumSticky|UfRound);
-    assign UfSubBySmallNum = ((FmaZmSticky&FmaOp&~ZZero&FmaInvA) | (DivNegSticky&DivOp)) & ~NormSumSticky;
+    assign S = UfS | UfR;


    always_comb begin
        // Determine if you add 1
        case (Frm)
-            3'b000: CalcPlus1 = R & ((S| LSBRes)&~SubBySmallNum);//round to nearest even
+            3'b000: CalcPlus1 = R & (S| L);//round to nearest even
            3'b001: CalcPlus1 = 0;//round to zero
-            3'b010: CalcPlus1 = Nsgn & ~(SubBySmallNum & ~R);//round down
-            3'b011: CalcPlus1 = ~Nsgn & ~(SubBySmallNum & ~R);//round up
-            3'b100: CalcPlus1 = R & ~SubBySmallNum;//round to nearest max magnitude
+            3'b010: CalcPlus1 = Ms;//round down
+            3'b011: CalcPlus1 = ~Ms;//round up
+            3'b100: CalcPlus1 = R;//round to nearest max magnitude
            default: CalcPlus1 = 1'bx;
        endcase
        // Determine if you add 1 (for underflow flag)
        case (Frm)
-            3'b000: UfCalcPlus1 = UfRound & ((UfSticky| UfLSBRes)&~UfSubBySmallNum);//round to nearest even
+            3'b000: UfCalcPlus1 = UfR & (UfS| UfL);//round to nearest even
            3'b001: UfCalcPlus1 = 0;//round to zero
-            3'b010: UfCalcPlus1 = Nsgn & ~(UfSubBySmallNum & ~UfRound);//round down
-            3'b011: UfCalcPlus1 = ~Nsgn & ~(UfSubBySmallNum & ~UfRound);//round up
-            3'b100: UfCalcPlus1 = UfRound & ~UfSubBySmallNum;//round to nearest max magnitude
+            3'b010: UfCalcPlus1 = Ms;//round down
+            3'b011: UfCalcPlus1 = ~Ms;//round up
+            3'b100: UfCalcPlus1 = UfR;//round to nearest max magnitude
            default: UfCalcPlus1 = 1'bx;
        endcase
-        // Determine if you subtract 1
-        case (Frm)
-            3'b000: CalcMinus1 = 0;//round to nearest even
-            3'b001: CalcMinus1 = SubBySmallNum & ~R;//round to zero
-            3'b010: CalcMinus1 = ~Nsgn & ~R & SubBySmallNum;//round down
-            3'b011: CalcMinus1 = Nsgn & ~R & SubBySmallNum;//round up
-            3'b100: CalcMinus1 = 0;//round to nearest max magnitude
-            default: CalcMinus1 = 1'bx;
-        endcase
   
    end

    // If an answer is exact don't round
    assign Plus1 = CalcPlus1 & (S | R);
    assign FpPlus1 = Plus1&~(ToInt&CvtOp);
-    assign UfPlus1 = UfCalcPlus1 & S; // UfRound is part of sticky
-    assign Minus1 = CalcMinus1 & (S | R);
+    assign UfPlus1 = UfCalcPlus1 & S; // UfR is part of sticky

    // Compute rounded result
    if (`FPSIZES == 1) begin
-        assign RoundAdd = Minus1 ? {`FLEN+1{1'b1}} : {{`FLEN{1'b0}}, FpPlus1};
+        assign RoundAdd = {{`FLEN{1'b0}}, FpPlus1};

    end else if (`FPSIZES == 2) begin
        // \/FLEN+1
        //  | NE+2 |        NF      |
        //  '-NE+2-^----NF1----^
        // `FLEN+1-`NE-2-`NF1 = FLEN-1-NE-NF1
-        assign RoundAdd = OutFmt ? Minus1 ? {`FLEN+1{1'b1}} : {{{`FLEN{1'b0}}}, FpPlus1} :
-                                   Minus1 ? {{`NE+2+`NF1{1'b1}}, (`FLEN-1-`NE-`NF1)'(0)} : {(`NE+1+`NF1)'(0), FpPlus1, (`FLEN-1-`NE-`NF1)'(0)};
+        assign RoundAdd = {(`NE+1+`NF1)'(0), FpPlus1&~OutFmt, (`NF-`NF1-1)'(0), FpPlus1&OutFmt};

    end else if (`FPSIZES == 3) begin
-        always_comb begin
-            case (OutFmt)
-                `FMT:  RoundAdd = Minus1 ? {`FLEN+1{1'b1}} : {{{`FLEN{1'b0}}}, FpPlus1};
-                `FMT1: RoundAdd = Minus1 ? {{`NE+2+`NF1{1'b1}}, (`FLEN-1-`NE-`NF1)'(0)} : {(`NE+1+`NF1)'(0), FpPlus1, (`FLEN-1-`NE-`NF1)'(0)};
-                `FMT2: RoundAdd = Minus1 ? {{`NE+2+`NF2{1'b1}}, (`FLEN-1-`NE-`NF2)'(0)} : {(`NE+1+`NF2)'(0), FpPlus1, (`FLEN-1-`NE-`NF2)'(0)};
-                default: RoundAdd = (`FLEN+1)'(0);
-            endcase
-        end
+        assign RoundAdd = {(`NE+1+`NF2)'(0), FpPlus1&(OutFmt==`FMT2), (`NF1-`NF2-1)'(0), FpPlus1&(OutFmt==`FMT1), (`NF-`NF1-1)'(0), FpPlus1&(OutFmt==`FMT)};

-    end else if (`FPSIZES == 4) begin        
-        always_comb begin
-            case (OutFmt)
-                2'h3: RoundAdd = Minus1 ? {`FLEN+1{1'b1}} : {{{`FLEN{1'b0}}}, FpPlus1};
-                2'h1: RoundAdd = Minus1 ? {{`NE+2+`D_NF{1'b1}}, (`FLEN-1-`NE-`D_NF)'(0)} : {(`NE+1+`D_NF)'(0), FpPlus1, (`FLEN-1-`NE-`D_NF)'(0)};
-                2'h0: RoundAdd = Minus1 ? {{`NE+2+`S_NF{1'b1}}, (`FLEN-1-`NE-`S_NF)'(0)} : {(`NE+1+`S_NF)'(0), FpPlus1, (`FLEN-1-`NE-`S_NF)'(0)};
-                2'h2: RoundAdd = Minus1 ? {{`NE+2+`H_NF{1'b1}}, (`FLEN-1-`NE-`H_NF)'(0)} : {(`NE+1+`H_NF)'(0), FpPlus1, (`FLEN-1-`NE-`H_NF)'(0)};
-            endcase
-        end
-
-    end
+    end else if (`FPSIZES == 4)      
+        assign RoundAdd = {(`Q_NE+1+`H_NF)'(0), FpPlus1&(OutFmt==`H_FMT), (`S_NF-`H_NF-1)'(0), FpPlus1&(OutFmt==`S_FMT), (`D_NF-`S_NF-1)'(0), FpPlus1&(OutFmt==`D_FMT), (`Q_NF-`D_NF-1)'(0), FpPlus1&(OutFmt==`Q_FMT)};

    // determine the result to be rounded
-    assign RoundFrac = Nfrac[`CORRSHIFTSZ-1:`CORRSHIFTSZ-`NF];
+    assign RoundFrac = Mf[`CORRSHIFTSZ-1:`CORRSHIFTSZ-`NF];
    
    always_comb
        case(PostProcSel)
-            2'b10: Nexp = FmaSe; // fma
-            2'b00: Nexp = {CvtCe[`NE], CvtCe}&{`NE+2{~CvtResDenormUf|CvtResUf}}; // cvt
-            2'b01: Nexp = DivDone ? DivCorrExp : '0; // divide
-            default: Nexp = '0; 
+            2'b10: Me = FmaSe; // fma
+            2'b00: Me = {CvtCe[`NE], CvtCe}&{`NE+2{~CvtResDenormUf|CvtResUf}}; // cvt
+            2'b01: Me = DivDone ? Qe : '0; // divide
+            default: Me = '0; 
        endcase

    // round the result
    //      - if the fraction overflows one should be added to the exponent
-    assign {FullResExp, Rf} = {Nexp, RoundFrac} + RoundAdd;
-    assign Re = FullResExp[`NE-1:0];
+    assign {FullRe, Rf} = {Me, RoundFrac} + RoundAdd;
+    assign Re = FullRe[`NE-1:0];


 endmodule
--- a/pipelined/src/fpu/roundsign.sv
+++ b/pipelined/src/fpu/roundsign.sv
@ -38,11 +38,11 @@ module roundsign(
    input logic         DivOp,
    input logic         CvtOp,
    input logic         CvtCs,
-    output logic        Nsgn
+    output logic        Ms
 );

    logic FmaResSgnTmp;
-    logic DivSgn;
+    logic Qs;

    // is the result negative
    //  if p - z is the Sum negative
@ -52,9 +52,9 @@ module roundsign(

    // assign FmaResSgnTmp = FmaInvA&(FmaAs)&FmaNegSum | FmaInvA&FmaPs&~FmaNegSum | (FmaAs&FmaPs);

-    assign DivSgn = Xs^Ys;
+    assign Qs = Xs^Ys;

    // Sign for rounding calculation
-    assign Nsgn = (FmaResSgnTmp&FmaOp) | (CvtCs&CvtOp) | (DivSgn&DivOp);
+    assign Ms = (FmaResSgnTmp&FmaOp) | (CvtCs&CvtOp) | (Qs&DivOp);

 endmodule
--- a/pipelined/src/fpu/shiftcorrection.sv
+++ b/pipelined/src/fpu/shiftcorrection.sv
@ -28,23 +28,22 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 `include "wally-config.vh"

-module lzacorrection(
+module shiftcorrection(
    input logic  [`NORMSHIFTSZ-1:0] Shifted,         // the shifted sum before LZA correction
    input logic                     FmaOp,
    input logic                     DivOp,
    input logic                     DivResDenorm,
-    input logic  [`NE+1:0]          DivCalcExp,
+    input logic  [`NE+1:0]          DivQe,
    input logic  [`NE+1:0]          DivDenormShift,
-    input logic  [`NE+1:0]          FmaConvNormSumExp,          // exponent of the normalized sum not taking into account denormal or zero results
+    input logic  [`NE+1:0]          FmaNe,          // exponent of the normalized sum not taking into account denormal or zero results
    input logic                     FmaPreResultDenorm,    // is the result denormalized - calculated before LZA corection
-    input logic                     FmaKillProd,  // is the product set to zero
-    input logic                     FmaSmZero,
-    output logic [`CORRSHIFTSZ-1:0] Nfrac,         // the shifted sum before LZA correction
-    output logic [`NE+1:0]          DivCorrExp,
+    input logic                     FmaSZero,
+    output logic [`CORRSHIFTSZ-1:0] Mf,         // the shifted sum before LZA correction
+    output logic [`NE+1:0]          Qe,
    output logic [`NE+1:0]          FmaSe         // exponent of the normalized sum
 );
    logic [3*`NF+5:0]      CorrSumShifted;     // the shifted sum after LZA correction
-    logic [`CORRSHIFTSZ:0] CorrQuotShifted;
+    logic [`CORRSHIFTSZ-1:0] CorrQuotShifted;
    logic                  ResDenorm;    // is the result denormalized
    logic                  LZAPlus1, LZAPlus2; // add one or two to the sum's exponent due to LZA correction

@ -54,16 +53,16 @@ module lzacorrection(
 	// the only possible mantissa for a plus two is all zeroes - a one has to propigate all the way through a sum. so we can leave the bottom statement alone
    assign CorrSumShifted =  LZAPlus1 ? Shifted[`NORMSHIFTSZ-3:1] : Shifted[`NORMSHIFTSZ-4:0];
    //                        if the msb is 1 or the exponent was one, but the shifted quotient was < 1 (Denorm)
-    assign CorrQuotShifted =  {LZAPlus2|(DivCalcExp==1&~LZAPlus2) ? Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ] : {Shifted[`NORMSHIFTSZ-2:`NORMSHIFTSZ-`CORRSHIFTSZ], 1'b0}, 1'b0};
+    assign CorrQuotShifted = (LZAPlus2|(DivQe==1&~LZAPlus2)) ? Shifted[`NORMSHIFTSZ-2:`NORMSHIFTSZ-`CORRSHIFTSZ-1] : Shifted[`NORMSHIFTSZ-3:`NORMSHIFTSZ-`CORRSHIFTSZ-2];
    // if the result of the divider was calculated to be denormalized, then the result was correctly normalized, so select the top shifted bits
-    assign Nfrac = FmaOp ? {CorrSumShifted, {`CORRSHIFTSZ-(3*`NF+6){1'b0}}} : DivOp&~DivResDenorm ? CorrQuotShifted[`CORRSHIFTSZ-1:0] : Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ];
+    assign Mf = FmaOp ? {CorrSumShifted, {`CORRSHIFTSZ-(3*`NF+6){1'b0}}} : DivOp&~DivResDenorm ? CorrQuotShifted : Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ];
    // Determine sum's exponent
    //                          if plus1                     If plus2                                      if said denorm but norm plus 1           if said denorm but norm plus 2
-    assign FmaSe = (FmaConvNormSumExp+{{`NE+1{1'b0}}, LZAPlus1&~FmaKillProd}+{{`NE{1'b0}}, LZAPlus2&~FmaKillProd, 1'b0}+{{`NE+1{1'b0}}, ~ResDenorm&FmaPreResultDenorm&~FmaKillProd}+{{`NE+1{1'b0}}, &FmaConvNormSumExp&Shifted[3*`NF+6]&~FmaKillProd}) & {`NE+2{~(FmaSmZero|ResDenorm)}};
+    assign FmaSe = (FmaNe+{{`NE+1{1'b0}}, LZAPlus1}+{{`NE{1'b0}}, LZAPlus2, 1'b0}+{{`NE+1{1'b0}}, ~ResDenorm&FmaPreResultDenorm}+{{`NE+1{1'b0}}, &FmaNe&Shifted[3*`NF+6]}) & {`NE+2{~(FmaSZero|ResDenorm)}};
    // recalculate if the result is denormalized
    assign ResDenorm = FmaPreResultDenorm&~Shifted[`NORMSHIFTSZ-3]&~Shifted[`NORMSHIFTSZ-2];

    // the quotient is in the range [.5,2) if there is no early termination
    // if the quotient < 1 and not denormal then subtract 1 to account for the normalization shift
-    assign DivCorrExp = ((DivResDenorm)&~DivDenormShift[`NE+1]) ? (`NE+2)'(0) : DivCalcExp - {(`NE+1)'(0), ~LZAPlus2};
+    assign Qe = ((DivResDenorm)&~DivDenormShift[`NE+1]) ? (`NE+2)'(0) : DivQe - {(`NE+1)'(0), ~LZAPlus2};
 endmodule
--- a/pipelined/src/fpu/resultselect.sv
+++ b/pipelined/src/fpu/resultselect.sv
@ -29,17 +29,17 @@

 `include "wally-config.vh"

-module resultselect(
+module specialcase(
    input logic                 Xs,        // input signs
    input logic  [`NF:0]        Xm, Ym, Zm, // input mantissas
    input logic                 XNaN, YNaN, ZNaN,    // inputs are NaN
    input logic  [2:0]          Frm,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
    input logic  [`FMTBITS-1:0] OutFmt,       // output format
    input logic                 InfIn,
+    input logic                 NaNIn,
    input logic                 XInf, YInf,
    input logic                 XZero,
    input logic                 IntZero,
-    input logic                 NaNIn,
    input logic                 IntToFp,
    input logic                 Int64,
    input logic                 Signed,
@ -53,10 +53,10 @@ module resultselect(
    input logic                 IntInvalid, Invalid, Overflow,  // flags
    input logic                 CvtResUf,
    input logic  [`NE-1:0]      Re,          // Res exponent
-    input logic  [`NE+1:0]      FullResExp,          // Res exponent
+    input logic  [`NE+1:0]      FullRe,          // Res exponent
    input logic  [`NF-1:0]      Rf,         // Res fraction
    input logic  [`XLEN+1:0]    CvtNegRes,     // the negation of the result
-    output logic [`FLEN-1:0]    W,     // final res
+    output logic [`FLEN-1:0]    PostProcRes,     // final res
    output logic [`XLEN-1:0]    FCvtIntRes     // final res
 );
    logic [`FLEN-1:0]   XNaNRes, YNaNRes, ZNaNRes, InvalidRes, OfRes, UfRes, NormRes; // possible results
@ -231,11 +231,11 @@ module resultselect(
    //      - do so if the res underflows, is zero (the exp doesnt calculate correctly). or the integer input is 0
    //      - dont set to zero if fp input is zero but not using the fp input
    //      - dont set to zero if int input is zero but not using the int input
-    assign KillRes = CvtOp ? (CvtResUf|(XZero&~IntToFp)|(IntZero&IntToFp)) : FullResExp[`NE+1] | (((YInf&~XInf)|XZero)&DivOp);//Underflow & ~ResDenorm & (Re!=1);
+    assign KillRes = CvtOp ? (CvtResUf|(XZero&~IntToFp)|(IntZero&IntToFp)) : FullRe[`NE+1] | (((YInf&~XInf)|XZero)&DivOp);//Underflow & ~ResDenorm & (Re!=1);
    assign SelOfRes = Overflow|DivByZero|(InfIn&~(YInf&DivOp));
    // output infinity with result sign if divide by zero
    if(`IEEE754) begin
-        assign W = XNaN&~(IntToFp&CvtOp) ? XNaNRes :
+        assign PostProcRes = XNaN&~(IntToFp&CvtOp) ? XNaNRes :
                         YNaN&~CvtOp ? YNaNRes :
                         ZNaN&FmaOp ? ZNaNRes :
                         Invalid ? InvalidRes : 
@ -243,7 +243,7 @@ module resultselect(
                         KillRes ? UfRes :  
                         NormRes;
    end else begin
-        assign W = NaNIn|Invalid ? InvalidRes :
+        assign PostProcRes = NaNIn|Invalid ? InvalidRes :
                         SelOfRes ? OfRes :
                         KillRes ? UfRes :  
                         NormRes;
--- a/pipelined/src/fpu/srt-radix4.sv
+++ b/pipelined/src/fpu/srt-radix4.sv
@ -1,312 +0,0 @@
-///////////////////////////////////////////
-// srt.sv
-//
-// Written: David_Harris@hmc.edu, me@KatherineParry.com, Cedar Turek
-// Modified:13 January 2022
-//
-// Purpose: Combined Divide and Square Root Floating Point and Integer Unit
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// MIT LICENSE
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
-// software and associated documentation files (the "Software"), to deal in the Software 
-// without restriction, including without limitation the rights to use, copy, modify, merge, 
-// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
-// to whom the Software is furnished to do so, subject to the following conditions:
-//
-//   The above copyright notice and this permission notice shall be included in all copies or 
-//   substantial portions of the Software.
-//
-//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
-//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
-//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
-//   OR OTHER DEALINGS IN THE SOFTWARE.
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module srtradix4 (
-  input  logic clk,
-  input  logic DivStart, 
-  input  logic DivBusy, 
-  input logic  [`FMTBITS-1:0] FmtE,
-  input  logic [`NE-1:0] XExpE, YExpE,
-  input  logic XZeroE, YZeroE, 
-  input logic [`DIVLEN-1:0] X,
-  input logic [`DIVLEN-1:0] Dpreproc,
-  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
-  output logic [`DIVLEN+2:0] Quot,
-  output logic [`DIVLEN+3:0]  WSN, WCN,
-  output logic [`DIVLEN+3:0]  WS, WC,
-  output logic  [`NE+1:0] DivCalcExpM,
-  output logic [`XLEN-1:0] Rem
-);
-
-  logic [3:0]     q;
-  logic [`DIVLEN+3:0]  WSA;
-  logic [`DIVLEN+3:0]  WCA;
-  logic [`DIVLEN+3:0]  D, DBar, D2, DBar2, Dsel;
-  logic [`NE+1:0] DivCalcExp;
-  logic [$clog2(`XLEN+1)-1:0] intExp;
-  logic           intSign;
-
-  // Top Muxes and Registers
-  // When start is asserted, the inputs are loaded into the divider.
-  // Otherwise, the divisor is retained and the partial remainder
-  // is fed back for the next iteration.
-  //  - when the start signal is asserted X and 0 are loaded into WS and WC
-  //  - otherwise load WSA into the flipflop
-  //  - the assumed one is added to D since it's always normalized (and X/0 is a special case handeled by result selection)
-  //  - XZeroE is used as the assumed one to avoid creating a sticky bit - all other numbers are normalized
-  mux2   #(`DIVLEN+4) wsmux({WSA[`DIVLEN+1:0], 2'b0}, {3'b000, ~XZeroE, X}, DivStart, WSN);
-  flop   #(`DIVLEN+4) wsflop(clk, WSN, WS);
-  mux2   #(`DIVLEN+4) wcmux({WCA[`DIVLEN+1:0], 2'b0}, {`DIVLEN+4{1'b0}}, DivStart, WCN);
-  flop   #(`DIVLEN+4) wcflop(clk, WCN, WC);
-  flopen #(`DIVLEN+4) dflop(clk, DivStart, {4'b0001, Dpreproc}, D);
-  flopen #(`NE+2) expflop(clk, DivStart, DivCalcExp, DivCalcExpM);
-
-  // Quotient Selection logic
-  // Given partial remainder, select quotient of +1, 0, or -1 (qp, qz, pm)
-  // *** change this for radix 4 - generate w/ stine code
-  // q encoding:
-	// 1000 = +2
-	// 0100 = +1
-	// 0000 =  0
-	// 0010 = -1
-	// 0001 = -2
-  qsel4 qsel4(.D, .WS, .WC, .q);
-
-  // Divisor Selection logic
-  // *** radix 4 change to choose -2 to 2
-  // - choose the negitive version of what's being selected
-  assign DBar = ~D;
-  assign DBar2 = {~D[`DIVLEN+2:0], 1'b1};
-  assign D2 = {D[`DIVLEN+2:0], 1'b0};
-
-  always_comb
-    case (q)
-      4'b1000: Dsel = DBar2;
-      4'b0100: Dsel = DBar;
-      4'b0000: Dsel = {(`DIVLEN+4){1'b0}};
-      4'b0010: Dsel = D;
-      4'b0001: Dsel = D2;
-      default: Dsel = {`DIVLEN+4{1'bx}};
-    endcase
-
-  // Partial Product Generation
-  //  WSA, WCA = WS + WC - qD
-  csa    #(`DIVLEN+4) csa(WS, WC, Dsel, |q[3:2], WSA, WCA);
-  
-  //*** change for radix 4
-  otfc4 otfc4(.clk, .DivStart, .DivBusy, .q, .Quot);
-
-  expcalc expcalc(.FmtE, .XExpE, .YExpE, .XZeroE, .XZeroCnt, .YZeroCnt, .DivCalcExp);
-
-endmodule
-
-////////////////
-// Submodules //
-////////////////
-
-
-
-module qsel4 (
-	input logic [`DIVLEN+3:0] D,
-	input logic [`DIVLEN+3:0] WS, WC,
-	output logic [3:0] q
-);
-	logic [6:0] Wmsbs;
-	logic [7:0] PreWmsbs;
-	logic [2:0] Dmsbs;
-	assign PreWmsbs = WC[`DIVLEN+3:`DIVLEN-4] + WS[`DIVLEN+3:`DIVLEN-4];
-	assign Wmsbs = PreWmsbs[7:1];
-	assign Dmsbs = D[`DIVLEN-1:`DIVLEN-3];
-	// D = 0001.xxx...
-	// Dmsbs = |   |
-  // W =      xxxx.xxx...
-	// Wmsbs = |        |
-
-	logic [3:0] QSel4[1023:0];
-
-  initial begin 
-    integer d, w, i, w2;
-    for(d=0; d<8; d++)
-      for(w=0; w<128; w++)begin
-        i = d*128+w;
-        w2 = w-128*(w>=64); // convert to two's complement
-        case(d)
-          0: if($signed(w2)>=$signed(12))      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-4)  QSel4[i] = 4'b0000; 
-            else if(w2>=-13) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-          1: if(w2>=14)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-15) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-          2: if(w2>=15)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-16) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-          3: if(w2>=16)      QSel4[i] = 4'b1000;
-            else if(w2>=4)   QSel4[i] = 4'b0100; 
-            else if(w2>=-6)  QSel4[i] = 4'b0000; 
-            else if(w2>=-18) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-          4: if(w2>=18)      QSel4[i] = 4'b1000;
-            else if(w2>=6)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-20) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-          5: if(w2>=20)      QSel4[i] = 4'b1000;
-            else if(w2>=6)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-20) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-          6: if(w2>=20)      QSel4[i] = 4'b1000;
-            else if(w2>=8)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-22) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-          7: if(w2>=24)      QSel4[i] = 4'b1000;
-            else if(w2>=8)   QSel4[i] = 4'b0100; 
-            else if(w2>=-8)  QSel4[i] = 4'b0000; 
-            else if(w2>=-24) QSel4[i] = 4'b0010; 
-            else            QSel4[i] = 4'b0001; 
-        endcase
-      end
-  end
-	assign q = QSel4[{Dmsbs,Wmsbs}];
-	
-endmodule
-
-///////////////////////////////////
-// On-The-Fly Converter, Radix 2 //
-///////////////////////////////////
-module otfc4 (
-  input  logic         clk,
-  input  logic         DivStart,
-  input  logic         DivBusy,
-  input  logic [3:0]   q,
-  output logic [`DIVLEN+2:0] Quot
-);
-
-  //  The on-the-fly converter transfers the quotient 
-  //  bits to the quotient as they come. 
-  //
-  //  This code follows the psuedocode presented in the 
-  //  floating point chapter of the book. Right now, 
-  //  it is written for Radix-4 division.
-  //
-  //  QM is Q-1. It allows us to write negative bits 
-  //  without using a costly CPA. 
-  logic [`DIVLEN+2:0] QM, QNext, QMNext, QMux, QMMux;
-  //  QR and QMR are the shifted versions of Q and QM.
-  //  They are treated as [N-1:r] size signals, and 
-  //  discard the r most significant bits of Q and QM. 
-  logic [`DIVLEN:0] QR, QMR;
-  // if starting a new divison set Q to 0 and QM to -1
-  mux2 #(`DIVLEN+3) Qmux(QNext, {`DIVLEN+3{1'b0}}, DivStart, QMux);
-  mux2 #(`DIVLEN+3) QMmux(QMNext, {`DIVLEN+3{1'b1}}, DivStart, QMMux);
-  flopen #(`DIVLEN+3) Qreg(clk, DivBusy|DivStart, QMux, Quot); // *** have to connect Quot directly to M stage
-  flop #(`DIVLEN+3) QMreg(clk, QMMux, QM);
-
-  // shift Q (quotent) and QM (quotent-1)
-		// if 	q = 2  	    Q = {Q, 10} 	QM = {Q, 01}		
-		// else if 	q = 1   Q = {Q, 01} 	QM = {Q, 00}	
-		// else if 	q = 0   Q = {Q, 00} 	QM = {QM, 11}	
-		// else if 	q = -1	Q = {QM, 11} 	QM = {QM, 10}
-		// else if 	q = -2	Q = {QM, 10} 	QM = {QM, 01}
-    // *** how does the 0 concatination numbers work?
-
-  always_comb begin
-    QR  = Quot[`DIVLEN:0];
-    QMR = QM[`DIVLEN:0];     // Shift Q and QM
-    if (q[3]) begin // +2
-      QNext  = {QR,  2'b10};
-      QMNext = {QR,  2'b01};
-    end else if (q[2]) begin // +1
-      QNext  = {QR,  2'b01};
-      QMNext = {QR,  2'b00};
-    end else if (q[1]) begin // -1
-      QNext  = {QMR,  2'b11};
-      QMNext = {QMR,  2'b10};
-    end else if (q[0]) begin // -2
-      QNext  = {QMR,  2'b10};
-      QMNext = {QMR,  2'b01};
-    end else begin           // 0
-      QNext  = {QR,  2'b00};
-      QMNext = {QMR, 2'b11};
-    end 
-  end
-  // Final Quoteint is in the range [.5, 2)
-
-endmodule
-
-
-
-/////////
-// csa //
-/////////
-module csa #(parameter N=69) (
-  input  logic [N-1:0] in1, in2, in3, 
-  input  logic         cin, 
-  output logic [N-1:0] out1, out2
-);
-
-  // This block adds in1, in2, in3, and cin to produce 
-  // a result out1 / out2 in carry-save redundant form.
-  // cin is just added to the least significant bit and
-  // is Startuired to handle adding a negative divisor.
-  // Fortunately, the carry (out2) is shifted left by one
-  // bit, leaving room in the least significant bit to 
-  // insert cin.
-
-  assign out1 = in1 ^ in2 ^ in3;
-  assign out2 = {in1[N-2:0] & (in2[N-2:0] | in3[N-2:0]) | 
-		    (in2[N-2:0] & in3[N-2:0]), cin};
-endmodule
-
-module expcalc(
-  input logic  [`FMTBITS-1:0] FmtE,
-  input  logic [`NE-1:0] XExpE, YExpE,
-  input logic XZeroE, 
-  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
-  output logic  [`NE+1:0] DivCalcExp
-  );
-    logic [`NE-2:0] Bias;
-    
-    if (`FPSIZES == 1) begin
-        assign Bias = (`NE-1)'(`BIAS); 
-
-    end else if (`FPSIZES == 2) begin
-        assign Bias = FmtE ? (`NE-1)'(`BIAS) : (`NE-1)'(`BIAS1); 
-
-    end else if (`FPSIZES == 3) begin
-        always_comb
-            case (FmtE)
-                `FMT: Bias  =  (`NE-1)'(`BIAS);
-                `FMT1: Bias = (`NE-1)'(`BIAS1);
-                `FMT2: Bias = (`NE-1)'(`BIAS2);
-                default: Bias = 'x;
-            endcase
-
-    end else if (`FPSIZES == 4) begin        
-        always_comb
-            case (FmtE)
-                2'h3: Bias =  (`NE-1)'(`Q_BIAS);
-                2'h1: Bias =  (`NE-1)'(`D_BIAS);
-                2'h0: Bias =  (`NE-1)'(`S_BIAS);
-                2'h2: Bias =  (`NE-1)'(`H_BIAS);
-            endcase
-    end
-    // correct exponent for denormalized input's normalization shifts
-    assign DivCalcExp = ({2'b0, XExpE} - {{`NE+1-$clog2(`NF+2){1'b0}}, XZeroCnt} - {2'b0, YExpE} + {{`NE+1-$clog2(`NF+2){1'b0}}, YZeroCnt} + {3'b0, Bias})&{`NE+2{~XZeroE}};
-    endmodule
--- a/pipelined/src/fpu/srt.sv
+++ b/pipelined/src/fpu/srt.sv
@ -0,0 +1,259 @@
+///////////////////////////////////////////
+// srt.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
+// Modified:13 January 2022
+//
+// Purpose: Combined Divide and Square Root Floating Point and Integer Unit
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// srt: datapath of the iterative SRT divider.
+//   - WS[0]/WC[0] hold the partial remainder in carry-save form. On DivStart they are
+//     loaded with {3'b000, ~XZeroE, X} and 0; otherwise with the shifted WSA/WCA of the
+//     last divinteration copy.
+//   - D holds {4'b0001, Dpreproc}, captured on DivStart.
+//   - `DIVCOPIES divinteration stages are chained combinationally between the registers.
+//   - Q[0]/QM[0] are the quotient registers; NegSticky selects which one drives Quot.
+module srt(
+  input  logic clk,
+  input  logic DivStart,                        // load operands and restart the quotient registers
+  input  logic DivBusy,                         // enables the W, Q and QM registers while iterating
+  input logic  [`FMTBITS-1:0] FmtE,             // forwarded to expcalc
+  input  logic [`NE-1:0] Xe, Ye,                // forwarded to expcalc
+  input  logic XZeroE, YZeroE,                  // XZeroE clears the leading one loaded with X; YZeroE is not read here
+  input logic [`DIVLEN-1:0] X,                  // loaded into WS on DivStart
+  input logic [`DIVLEN-1:0] Dpreproc,           // loaded into D on DivStart
+  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt, // forwarded to expcalc
+  input logic NegSticky,                        // 1: Quot is taken from QM, 0: from Q
+  output logic [`QLEN-1-(`RADIX/4):0] Quot,
+  output logic [`DIVLEN+3:0]  NextWSN, NextWCN, // shifted WSA/WCA of the last copy (next register value)
+  output logic [`DIVLEN+3:0]  StickyWSA,        // only driven when `RADIX == 2
+  output logic [`DIVLEN+3:0]  FirstWS, FirstWC, // current register values WS[0], WC[0]
+  output logic  [`NE+1:0] DivCalcExpM,          // DivCalcExp captured on DivStart
+  output logic [`XLEN-1:0] Rem                  // not driven inside this module
+);
+
+
+ /* verilator lint_off UNOPTFLAT */
+  logic [`DIVLEN+3:0]  WSA[`DIVCOPIES-1:0];
+  logic [`DIVLEN+3:0]  WCA[`DIVCOPIES-1:0];
+  logic [`DIVLEN+3:0]  WS[`DIVCOPIES-1:0];
+  logic [`DIVLEN+3:0]  WC[`DIVCOPIES-1:0];
+  logic [`QLEN-1:0] Q[`DIVCOPIES-1:0];
+  logic [`QLEN-1:0] QM[`DIVCOPIES-1:0];
+  logic [`QLEN-1:0] QNext[`DIVCOPIES-1:0];
+  logic [`QLEN-1:0] QMNext[`DIVCOPIES-1:0];
+ /* verilator lint_on UNOPTFLAT */
+  logic [`DIVLEN+3:0]  WSN, WCN;
+  logic [`DIVLEN+3:0]  D, DBar, D2, DBar2;
+  logic [`NE+1:0] DivCalcExp;
+  logic [`QLEN-1:0] QMMux;
+
+  // Top Muxes and Registers
+  // When start is asserted, the inputs are loaded into the divider.
+  // Otherwise, the divisor is retained and the partial remainder
+  // is fed back for the next iteration.
+  //  - when the start signal is asserted X and 0 are loaded into WS and WC
+  //  - otherwise load the shifted WSA/WCA into the flipflops
+  //  - the assumed one is added to D since it's always normalized (and X/0 is a special case handled by result selection)
+  //  - XZeroE is used as the assumed one to avoid creating a sticky bit - all other numbers are normalized
+  // The feedback value is shifted left by one bit for radix 2 and by two bits otherwise.
+  if (`RADIX == 2) begin : nextw
+    assign NextWSN = {WSA[`DIVCOPIES-1][`DIVLEN+2:0], 1'b0};
+    assign NextWCN = {WCA[`DIVCOPIES-1][`DIVLEN+2:0], 1'b0};
+  end else begin
+    assign NextWSN = {WSA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0};
+    assign NextWCN = {WCA[`DIVCOPIES-1][`DIVLEN+1:0], 2'b0};
+  end
+
+  mux2   #(`DIVLEN+4) wsmux(NextWSN, {3'b000, ~XZeroE, X}, DivStart, WSN);
+  flopen   #(`DIVLEN+4) wsflop(clk, DivStart|DivBusy, WSN, WS[0]);
+  mux2   #(`DIVLEN+4) wcmux(NextWCN, {`DIVLEN+4{1'b0}}, DivStart, WCN);
+  flopen   #(`DIVLEN+4) wcflop(clk, DivStart|DivBusy, WCN, WC[0]);
+  flopen #(`DIVLEN+4) dflop(clk, DivStart, {4'b0001, Dpreproc}, D);
+  flopen #(`NE+2) expflop(clk, DivStart, DivCalcExp, DivCalcExpM);
+
+
+  // Divisor Selections
+  // - choose the negative version of what's being selected
+  // - D2/DBar2 (twice the divisor and its complement) are only driven for radix 4
+  assign DBar = ~D;
+  if(`RADIX == 4) begin : d2
+    assign DBar2 = {~D[`DIVLEN+2:0], 1'b1};
+    assign D2 = {D[`DIVLEN+2:0], 1'b0};
+  end
+
+  // Chain of `DIVCOPIES iteration stages. Copy i+1 consumes the shifted WSA/WCA
+  // and the QNext/QMNext of copy i.
+  genvar i;
+  generate
+    for(i=0; $unsigned(i)<`DIVCOPIES; i++) begin : interations
+      divinteration divinteration(.D, .DBar, .D2, .DBar2, 
+      .WS(WS[i]), .WC(WC[i]), .WSA(WSA[i]), .WCA(WCA[i]), .Q(Q[i]), .QM(QM[i]), .QNext(QNext[i]), .QMNext(QMNext[i]));
+      if(i<(`DIVCOPIES-1)) begin 
+        if (`RADIX==2)begin 
+          // shift left by one: keep bits DIVLEN+2:0 so that all DIVLEN+4 bits are filled
+          // (the same slice nextw uses for NextWSN/NextWCN)
+          assign WS[i+1] = {WSA[i][`DIVLEN+2:0], 1'b0};
+          assign WC[i+1] = {WCA[i][`DIVLEN+2:0], 1'b0};
+        end else begin
+          assign WS[i+1] = {WSA[i][`DIVLEN+1:0], 2'b0};
+          assign WC[i+1] = {WCA[i][`DIVLEN+1:0], 2'b0};
+        end
+        assign Q[i+1] = QNext[i];
+        assign QM[i+1] = QMNext[i];
+      end
+    end
+  endgenerate
+
+  // if starting a new division set Q to 0 and QM to -1
+  mux2 #(`QLEN) QMmux(QMNext[`DIVCOPIES-1], {`QLEN{1'b1}}, DivStart, QMMux);
+  flopenr #(`QLEN) Qreg(clk, DivStart, DivBusy, QNext[`DIVCOPIES-1], Q[0]);
+  flopen #(`QLEN) QMreg(clk, DivBusy, QMMux, QM[0]); // NOTE(review): enabled by DivBusy only, so the all-ones input of QMmux is captured only if DivStart overlaps DivBusy - confirm intended
+
+  assign Quot = NegSticky ? QM[0][`QLEN-1-(`RADIX/4):0] : Q[0][`QLEN-1-(`RADIX/4):0];
+  assign FirstWS = WS[0];
+  assign FirstWC = WC[0];
+  // StickyWSA is only generated for radix 2; the else below belongs to the `DIVCOPIES test
+  if(`RADIX==2) begin
+    if (`DIVCOPIES == 1) begin
+      assign StickyWSA = {WSA[0][`DIVLEN+2:0], 1'b0};
+    end else begin
+      assign StickyWSA = {WSA[1][`DIVLEN+2:0], 1'b0};
+    end
+  end
+
+  expcalc expcalc(.FmtE, .Xe, .Ye, .XZeroE, .XZeroCnt, .YZeroCnt, .DivCalcExp);
+
+endmodule
+
+////////////////
+// Submodules //
+////////////////
+
+ /* verilator lint_off UNOPTFLAT */
+// divinteration: one combinational SRT iteration step.
+// From the current partial remainder (WS, WC) it selects a quotient digit, picks the
+// matching divisor multiple (Dsel), adds it to the partial remainder with a carry-save
+// adder (WSA, WCA) and lets the on-the-fly converter produce QNext/QMNext from Q/QM.
+// `RADIX == 2 uses qsel2/otfc2 with the qp/qz digit signals; any other `RADIX uses
+// qsel4/otfc4 with the 4-bit q signal.
+module divinteration (
+  input logic [`DIVLEN+3:0] D,
+  input logic [`DIVLEN+3:0]  DBar, D2, DBar2,
+  input logic [`QLEN-1:0] Q, QM,
+  input logic [`DIVLEN+3:0]  WS, WC,
+  output logic [`QLEN-1:0] QNext, QMNext, 
+  output logic [`DIVLEN+3:0]  WSA, WCA
+);
+ /* verilator lint_on UNOPTFLAT */
+
+  logic [`DIVLEN+3:0]  Dsel;   // divisor multiple fed to the carry-save adder
+  logic [3:0]     q;           // radix-4 quotient digit (encoding below); unused for radix 2
+  logic qp, qz;//, qn;         // radix-2 digit signals from qsel2; unused for radix 4
+
+  // Quotient Selection logic
+  // Radix 2: qsel2 derives qp and qz from the top four bits of WS and WC.
+  // Radix 4: qsel4 derives q from D, WS and WC.
+  // q encoding:
+	// 1000 = +2
+	// 0100 = +1
+	// 0000 =  0
+	// 0010 = -1
+	// 0001 = -2
+  if(`RADIX == 2) begin : qsel
+    qsel2 qsel2(WS[`DIVLEN+3:`DIVLEN], WC[`DIVLEN+3:`DIVLEN], qp, qz);//, qn);
+  end else begin
+    qsel4 qsel4(.D, .WS, .WC, .q);
+  end
+
+  // Divisor multiple selection
+  // Radix 2: 0 when qz is set, otherwise DBar when qp is set and D when it is not.
+  // Radix 4: positive digits select DBar2/DBar, negative digits select D/D2,
+  //          any encoding not listed drives x.
+  if(`RADIX == 2) begin : dsel
+    assign Dsel = {`DIVLEN+4{~qz}}&(qp ? DBar : D);
+  end else begin
+    always_comb
+      case (q)
+        4'b1000: Dsel = DBar2;
+        4'b0100: Dsel = DBar;
+        4'b0000: Dsel = '0;
+        4'b0010: Dsel = D;
+        4'b0001: Dsel = D2;
+        default: Dsel = 'x;
+      endcase
+  end
+  // Partial Product Generation
+  //  WSA, WCA = WS + WC - qD
+  //  the csa carry-in is qp (radix 2) or |q[3:2] (radix 4), i.e. it is set for the
+  //  digits that select DBar/DBar2
+  if (`RADIX == 2) begin : csa
+    csa #(`DIVLEN+4) csa(WS, WC, Dsel, qp, WSA, WCA);
+  end else begin
+    csa #(`DIVLEN+4) csa(WS, WC, Dsel, |q[3:2], WSA, WCA);
+  end
+
+  // On-the-fly conversion: produce the next Q and QM from the selected digit
+  if (`RADIX == 2) begin : otfc
+    otfc2 otfc2(.qp, .qz, .Q, .QM, .QNext, .QMNext);
+  end else begin
+    otfc4 otfc4(.q, .Q, .QM, .QNext, .QMNext);
+  end
+
+endmodule
+
+
+/////////
+// csa //
+/////////
+module csa #(parameter N=69) (
+  input  logic [N-1:0] in1, in2, in3, 
+  input  logic         cin, 
+  output logic [N-1:0] out1, out2
+);
+
+  // 3:2 carry-save adder over in1, in2, in3 plus a single carry-in bit.
+  //   out1 - bitwise sum of the three inputs
+  //   out2 - carries of bits N-2:0 moved up by one position, with cin placed in
+  //          the vacated least significant bit
+  // cin supplies the extra one needed when the caller adds a complemented
+  // (negative) divisor.
+  logic [N-2:0] carry;
+
+  // a carry is generated wherever at least two of the three inputs are set
+  assign carry = (in1[N-2:0] & in2[N-2:0]) |
+                 (in1[N-2:0] & in3[N-2:0]) |
+                 (in2[N-2:0] & in3[N-2:0]);
+
+  assign out1 = in1 ^ in2 ^ in3;
+  assign out2 = {carry, cin};
+endmodule
+
+// expcalc: exponent for the divide result.
+//   DivCalcExp = Xe - XZeroCnt - Ye + YZeroCnt + Bias, forced to zero when XZeroE is set.
+// Bias is chosen by FmtE according to the number of supported formats (`FPSIZES).
+module expcalc(
+  input logic  [`FMTBITS-1:0] FmtE,
+  input  logic [`NE-1:0] Xe, Ye,
+  input logic XZeroE, 
+  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
+  output logic  [`NE+1:0] DivCalcExp
+  );
+    logic [`NE-2:0] Bias;
+    // all operands zero-extended to the `NE+2 bit result width
+    logic [`NE+1:0] XeWide, YeWide, XCntWide, YCntWide, BiasWide, ExpSum;
+
+    // bias selection
+    if (`FPSIZES == 1) begin
+        assign Bias = (`NE-1)'(`BIAS); 
+
+    end else if (`FPSIZES == 2) begin
+        assign Bias = FmtE ? (`NE-1)'(`BIAS) : (`NE-1)'(`BIAS1); 
+
+    end else if (`FPSIZES == 3) begin
+        always_comb
+            case (FmtE)
+                `FMT2: Bias = (`NE-1)'(`BIAS2);
+                `FMT1: Bias = (`NE-1)'(`BIAS1);
+                `FMT: Bias  = (`NE-1)'(`BIAS);
+                default: Bias = 'x;
+            endcase
+
+    end else if (`FPSIZES == 4) begin        
+        always_comb
+            case (FmtE)
+                2'h0: Bias =  (`NE-1)'(`S_BIAS);
+                2'h1: Bias =  (`NE-1)'(`D_BIAS);
+                2'h2: Bias =  (`NE-1)'(`H_BIAS);
+                2'h3: Bias =  (`NE-1)'(`Q_BIAS);
+            endcase
+    end
+
+    assign XeWide   = {2'b0, Xe};
+    assign YeWide   = {2'b0, Ye};
+    assign XCntWide = {{(`NE+2-$unsigned($clog2(`NF+2))){1'b0}}, XZeroCnt};
+    assign YCntWide = {{(`NE+2-$unsigned($clog2(`NF+2))){1'b0}}, YZeroCnt};
+    assign BiasWide = {3'b0, Bias};
+
+    // correct exponent for denormalized input's normalization shifts
+    assign ExpSum = XeWide - XCntWide - YeWide + YCntWide + BiasWide;
+    // a zero X forces the calculated exponent to zero
+    assign DivCalcExp = XZeroE ? '0 : ExpSum;
+    endmodule
--- a/pipelined/src/fpu/srtfsm.sv
+++ b/pipelined/src/fpu/srtfsm.sv
@ -33,37 +33,45 @@
 module srtfsm(
  input  logic clk, 
  input  logic reset, 
-  input logic [`DIVLEN+3:0] WSN, WCN, WS, WC,
+  input logic [`DIVLEN+3:0] NextWSN, NextWCN, WS, WC,
  input  logic XInfE, YInfE, 
  input  logic XZeroE, YZeroE, 
  input  logic XNaNE, YNaNE, 
  input  logic DivStart, 
-  input logic StallE,
-  input logic StallM,
-  input  logic [$clog2(`DIVLEN/2+3)-1:0] Dur,
-  output logic [$clog2(`DIVLEN/2+3)-1:0] EarlyTermShiftDiv2E,
+  input  logic StallE,
+  input  logic StallM,
+  input  logic [`DIVLEN+3:0] StickyWSA,
+  input  logic [`DURLEN-1:0] Dur,
+  output logic [`DURLEN-1:0] EarlyTermShiftE,
  output logic DivStickyE,
  output logic DivDone,
-  output logic DivNegStickyE,
+  output logic NegSticky,
  output logic DivBusy
  );
  
  typedef enum logic [1:0] {IDLE, BUSY, DONE} statetype;
  statetype state;

-  logic [$clog2(`DIVLEN/2+3)-1:0] step;
+  logic [`DURLEN-1:0] step;
  logic WZero;
  //logic [$clog2(`DIVLEN/2+3)-1:0] Dur;
  logic [`DIVLEN+3:0] W;

  //flopen #($clog2(`DIVLEN/2+3)) durflop(clk, DivStart, CalcDur, Dur);
  assign DivBusy = (state == BUSY);
-  assign WZero = ((WSN^WCN)=={WSN[`DIVLEN+2:0]|WCN[`DIVLEN+2:0], 1'b0});
-  assign DivStickyE = ~WZero;
+  assign WZero = ((NextWSN^NextWCN)=={NextWSN[`DIVLEN+2:0]|NextWCN[`DIVLEN+2:0], 1'b0});
+  // calculate sticky bit
+  //    - there is a chance that a value is subtracted infinitely, resulting in an exact QM result
+  //      this is only a problem on radix 2 (and possibly maximally redundant 4) since minimally redundant
+  //      radix-4 division can't create a QM that continually adds 0's
+  if (`RADIX == 2)
+    assign DivStickyE = |W&~(StickyWSA == WS);
+  else
+    assign DivStickyE = |W;
  assign DivDone = (state == DONE);
  assign W = WC+WS;
-  assign DivNegStickyE = W[`DIVLEN+3]; //*** is there a better way to do this???
-  assign EarlyTermShiftDiv2E = step;
+  assign NegSticky = W[`DIVLEN+3]; //*** is there a better way to do this???
+  assign EarlyTermShiftE = step;

  always_ff @(posedge clk) begin
      if (reset) begin
@ -73,7 +81,7 @@ module srtfsm(
          if (XZeroE|YZeroE|XInfE|YInfE|XNaNE|YNaNE) state <= #1 DONE;
          else         state <= #1 BUSY;
      end else if (state == BUSY) begin
-          if ((~|step[$clog2(`DIVLEN/2+3)-1:1]&step[0])|WZero) begin
+          if ((~|step[`DURLEN-1:1]&step[0])|WZero) begin
              state <= #1 DONE;
          end
          step <= step - 1;
--- a/pipelined/src/fpu/srtpreproc.sv
+++ b/pipelined/src/fpu/srtpreproc.sv
@ -31,11 +31,11 @@
 `include "wally-config.vh"

 module srtpreproc (
-  input  logic [`NF:0] XManE, YManE,
+  input  logic [`NF:0] Xm, Ym,
  output logic [`DIVLEN-1:0] X,
  output logic [`DIVLEN-1:0] Dpreproc,
  output logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
-  output logic [$clog2(`DIVLEN/2+3)-1:0] Dur
+  output logic [`DURLEN-1:0] Dur
 );
  // logic  [`XLEN-1:0] PosA, PosB;
  // logic  [`DIVLEN-1:0] ExtraA, ExtraB, PreprocA, PreprocB, PreprocX, PreprocY;
@ -49,24 +49,33 @@ module srtpreproc (

  // ***can probably merge X LZC with conversion
  // cout the number of leading zeros
-  lzc #(`NF+1) lzcA (XManE, XZeroCnt);
-  lzc #(`NF+1) lzcB (YManE, YZeroCnt);
+  lzc #(`NF+1) lzcA (Xm, XZeroCnt);
+  lzc #(`NF+1) lzcB (Ym, YZeroCnt);

  // assign ExtraA = {PosA, {`DIVLEN-`XLEN{1'b0}}};
  // assign ExtraB = {PosB, {`DIVLEN-`XLEN{1'b0}}};

  // assign PreprocA = ExtraA << zeroCntA;
  // assign PreprocB = ExtraB << (zeroCntB + 1);
-  assign PreprocX = {XManE[`NF-1:0]<<XZeroCnt, {`DIVLEN-`NF{1'b0}}};
-  assign PreprocY = {YManE[`NF-1:0]<<YZeroCnt, {`DIVLEN-`NF{1'b0}}};
+  assign PreprocX = {Xm[`NF-1:0]<<XZeroCnt, {`DIVLEN-`NF{1'b0}}};
+  assign PreprocY = {Ym[`NF-1:0]<<YZeroCnt, {`DIVLEN-`NF{1'b0}}};

  
  assign X = PreprocX;
  assign Dpreproc = PreprocY;
-
-  assign Dur = ($clog2(`DIVLEN/2+3))'(`DIVLEN/2+2);
+  assign Dur = (`DURLEN)'(`FPDUR);
  // assign intExp = zeroCntB - zeroCntA + 1;
  // assign intSign = Signed & (SrcA[`XLEN - 1] ^ SrcB[`XLEN - 1]);

+  //           radix 2     radix 4
+  // 1 copies  DIVLEN+2    DIVLEN+2/2
+  // 2 copies  DIVLEN+2/2  DIVLEN+2/2*2
+  // 4 copies  DIVLEN+2/4  DIVLEN+2/2*4
+  // 8 copies  DIVLEN+2/8  DIVLEN+2/2*8
+
+  // DIVRESLEN = DIVLEN or DIVLEN+2
+  // r = 1 or 2
+  // DIVRESLEN/(r*`DIVCOPIES)
+

 endmodule
--- a/pipelined/src/generic/lzc.sv
+++ b/pipelined/src/generic/lzc.sv
@ -34,7 +34,7 @@ module lzc #(parameter WIDTH = 1) (
 /* verilator lint_off CMPCONST */
 /* verilator lint_off WIDTH */
    
-    int i;
+    logic [31:0] i;
    always_comb begin
        i = 0;
        while (~num[WIDTH-1-i] & (i < WIDTH)) i = i+1;  // search for leading one
--- a/pipelined/src/ifu/ifu.sv
+++ b/pipelined/src/ifu/ifu.sv
@ -226,7 +226,7 @@ module ifu (
      icache(.clk, .reset, .CPUBusy, .IgnoreRequestTLB(ITLBMissF), .TrapM(TrapM), .IgnoreRequestTrapM('0),
             .CacheBusWriteData(ICacheBusWriteData), .CacheBusAck(ICacheBusAck),
             .CacheBusAdr(ICacheBusAdr), .CacheStall(ICacheStallF), 
-             .CacheFetchLine(ICacheFetchLine), .FWriteDataM(), .FpLoadStoreM(), .FLoad2(),
+             .CacheFetchLine(ICacheFetchLine), .FStore2(),
             .CacheWriteLine(), .ReadDataWord(FinalInstrRawF),
             .Cacheable(CacheableF),
             .CacheMiss(ICacheMiss), .CacheAccess(ICacheAccess),
--- a/pipelined/src/lsu/lsu.sv
+++ b/pipelined/src/lsu/lsu.sv
@ -58,7 +58,7 @@ module lsu (
   input logic              sfencevmaM,
   // fpu
   input logic [`FLEN-1:0]  FWriteDataM,
-   input logic              FLoad2,
+   input logic              FStore2,
   input logic              FpLoadStoreM,
   // faults
   output logic             LoadPageFaultM, StoreAmoPageFaultM,
@ -192,7 +192,8 @@ module lsu (
  //  Memory System
  //  Either Data Cache or Data Tightly Integrated Memory or just bus interface
  /////////////////////////////////////////////////////////////////////////////////////////////
-  logic [`XLEN-1:0]    AMOWriteDataM, FinalWriteDataM, LittleEndianWriteDataM;
+  logic [`XLEN-1:0]    AMOWriteDataM, IEUWriteDataM, LittleEndianWriteDataM;
+  logic [`LLEN-1:0]    FinalWriteDataM;
  logic [`LLEN-1:0]    ReadDataWordM, LittleEndianReadDataWordM;
  logic [`LLEN-1:0]    ReadDataWordMuxM;
  logic                IgnoreRequest;
@ -202,7 +203,7 @@ module lsu (
  if (`DMEM == `MEM_TIM) begin : dtim
    // *** directly instantiate RAM or ROM here.  Instantiate SRAM1P1RW.  
    // Merge SimpleRAM and SRAM1p1rw into one that is good for synthesis and RAM libraries and flops
-    dtim dtim(.clk, .reset, .CPUBusy, .LSURWM, .IEUAdrM, .IEUAdrE, .TrapM, .FinalWriteDataM, 
+    dtim dtim(.clk, .reset, .CPUBusy, .LSURWM, .IEUAdrM, .IEUAdrE, .TrapM, .FinalWriteDataM(IEUWriteDataM), //*** fix the dtim FinalWriteData
              .ReadDataWordM(ReadDataWordM[`XLEN-1:0]), .BusStall, .LSUBusWrite,.LSUBusRead, .BusCommittedM,
              .DCacheStallM, .DCacheCommittedM, .ByteMaskM, .Cacheable(CacheableM),
              .DCacheMiss, .DCacheAccess);
@ -230,15 +231,19 @@ module lsu (

    mux2 #(`LLEN) UnCachedDataMux(.d0(LittleEndianReadDataWordM), .d1({{`LLEN-`XLEN{1'b0}}, DCacheBusWriteData[`XLEN-1:0]}),
      .s(SelUncachedAdr), .y(ReadDataWordMuxM));
-    mux2 #(`XLEN) LsuBushwdataMux(.d0(ReadDataWordM[`XLEN-1:0]), .d1(FinalWriteDataM),
+    mux2 #(`XLEN) LsuBushwdataMux(.d0(ReadDataWordM[`XLEN-1:0]), .d1(IEUWriteDataM),
      .s(SelUncachedAdr), .y(LSUBusHWDATA));
    
    if(CACHE_ENABLED) begin : dcache
+      if (`LLEN>`XLEN)
+        mux2 #(`LLEN) datamux({IEUWriteDataM, IEUWriteDataM}, FWriteDataM, FpLoadStoreM, FinalWriteDataM);
+      else
+        assign FinalWriteDataM = {{`LLEN-`XLEN{1'b0}}, IEUWriteDataM};
      cache #(.LINELEN(`DCACHE_LINELENINBITS), .NUMLINES(`DCACHE_WAYSIZEINBYTES*8/LINELEN),
              .NUMWAYS(`DCACHE_NUMWAYS), .LOGWPL(LOGWPL), .WORDLEN(`LLEN), .MUXINTERVAL(`XLEN), .DCACHE(1)) dcache(
        .clk, .reset, .CPUBusy, .LSUBusWriteCrit, .RW(LSURWM), .Atomic(LSUAtomicM),
        .FlushCache(FlushDCacheM), .NextAdr(LSUAdrE), .PAdr(LSUPAdrM), 
-        .ByteMask(ByteMaskM), .WordCount, .FpLoadStoreM, .FWriteDataM, .FLoad2,
+        .ByteMask(ByteMaskM), .WordCount, .FStore2,
        .FinalWriteData(FinalWriteDataM), .Cacheable(CacheableM),
        .CacheStall(DCacheStallM), .CacheMiss(DCacheMiss), .CacheAccess(DCacheAccess),
        .IgnoreRequestTLB, .IgnoreRequestTrapM, .TrapM(1'b0), .CacheCommitted(DCacheCommittedM), 
@ -286,10 +291,10 @@ module lsu (
  //  swap the bytes when read from big-endian memory
  /////////////////////////////////////////////////////////////////////////////////////////////
  if (`BIGENDIAN_SUPPORTED) begin:endian
-    bigendianswap #(`XLEN) storeswap(.BigEndianM, .a(LittleEndianWriteDataM), .y(FinalWriteDataM));
+    bigendianswap #(`XLEN) storeswap(.BigEndianM, .a(LittleEndianWriteDataM), .y(IEUWriteDataM));
    bigendianswap #(`LLEN) loadswap(.BigEndianM, .a(ReadDataWordM), .y(LittleEndianReadDataWordM));
  end else begin
-    assign FinalWriteDataM = LittleEndianWriteDataM;
+    assign IEUWriteDataM = LittleEndianWriteDataM;
    assign LittleEndianReadDataWordM = ReadDataWordM;
  end

--- a/pipelined/src/uncore/plic_apb.sv
+++ b/pipelined/src/uncore/plic_apb.sv
@ -172,8 +172,8 @@ module plic_apb (
  end

  // pending interrupt requests
-  //assign nextIntPending = (intPending | requests) & ~intInProgress; // 
-  assign nextIntPending = requests; // DH: RT made this change May 2022, but it seems to be a bug to not consider intInProgress; see May 23, 2022 slack discussion
+  assign nextIntPending = (intPending | requests) & ~intInProgress; // dh changed back 7/9/22 see if Buildroot still boots.  Confirmed to boot successfully.
+  //assign nextIntPending = requests; // DH: RT made this change May 2022, but it seems to be a bug to not consider intInProgress; see May 23, 2022 slack discussion
  flopr #(`N) intPendingFlop(PCLK,~PRESETn,nextIntPending,intPending);

  // context-dependent signals
--- a/pipelined/src/wally/wallypipelinedcore.sv
+++ b/pipelined/src/wally/wallypipelinedcore.sv
@ -93,7 +93,7 @@ module wallypipelinedcore (
  logic             FStallD;
  logic             FWriteIntE;
  logic [`XLEN-1:0]         FWriteDataE;
-  logic                     FLoad2;
+  logic                     FStore2;
  logic [`FLEN-1:0]         FWriteDataM;
  logic [`XLEN-1:0]         FIntResM;  
  logic [`XLEN-1:0]         FCvtIntResW;  
@ -259,7 +259,7 @@ module wallypipelinedcore (
  .CommittedM, .DCacheMiss, .DCacheAccess,
  .SquashSCW,            
  .FpLoadStoreM,
-  .FWriteDataM, .FLoad2,
+  .FWriteDataM, .FStore2,
  //.DataMisalignedM(DataMisalignedM),
  .IEUAdrE, .IEUAdrM, .WriteDataE,
  .ReadDataW, .FlushDCacheM,
@ -400,7 +400,7 @@ module wallypipelinedcore (
         .STATUS_FS, // is floating-point enabled?
         .FRegWriteM, // FP register write enable
         .FpLoadStoreM,
-         .FLoad2,
+         .FStore2,
         .FStallD, // Stall the decode stage
         .FWriteIntE, // integer register write enable
         .FWriteDataE, // Data to be written to memory
--- a/pipelined/srt/exptestgen.c
+++ b/pipelined/srt/exptestgen.c
@ -96,6 +96,10 @@ void output(FILE *fptr, int aSign, int aExp, double aFrac, int bSign, int bExp,
  // Print r in standard double format
  fprintf(fptr, "%03x", rExp|(rSign<<11));
  printhex(fptr, rFrac);
+  fprintf(fptr, "_");
+
+  // Spacing for testbench, value doesn't matter
+  fprintf(fptr, "%016x", 0);
  fprintf(fptr, "\n");
 }

--- a/pipelined/srt/lint-srt
+++ b/pipelined/srt/lint-srt
@ -1,2 +1 @@
 verilator --lint-only --top-module srt srt.sv -I../config/rv64gc -I../config/shared ../src/generic/*.sv ../src/generic/flop/*.sv
-verilator --lint-only --top-module srtradix4 srt-radix4.sv qsel4.sv -I../config/rv64gc -I../config/shared ../src/generic/*.sv ../src/generic/flop/*.sv
--- a/pipelined/srt/qslc_r4a2.c
+++ b/pipelined/srt/qslc_r4a2.c
@ -1,198 +0,0 @@
-/*
-  Program:      qslc_r4a2.c
-  Description:  Prints out Quotient Selection Table (assumes CPA is utilized to reduce memory)
-  User:         James E. Stine
-
-*/
-
-#include <stdio.h>
-#include <math.h>
-
-#define DIVISOR_SIZE 3
-#define CARRY_SIZE 7
-#define SUM_SIZE 7
-#define TOT_SIZE 7
-
-void disp_binary(double, int, int);
-
-struct bits {
-  unsigned int divisor : DIVISOR_SIZE;
-  int tot : TOT_SIZE;
-} pla;
-
-/* 
-
-   Function:      disp_binary
-   Description:   This function displays a Double-Precision number into
-   four 16 bit integers using the global union variable 
-   dp_number
-   Argument List: double x            The value to be converted
-   int bits_to_left    Number of bits left of radix point
-   int bits_to_right   Number of bits right of radix point
-   Return value:  none
-
-*/
-void disp_binary(double x, int bits_to_left, int bits_to_right) {
-  int i; 
-  double diff;
-
-  if (fabs(x) <  pow(2.0, ((double) -bits_to_right)) ) {
-    for (i = -bits_to_left + 1; i <= bits_to_right; i++) {
-      printf("0");
-    }
-    if (i == bits_to_right+1) 
-      ;
-    
-    return;
-  }
-
-  if (x < 0.0) 
-    x = pow(2.0, ((double) bits_to_left)) + x;
-
-  for (i = -bits_to_left + 1; i <= bits_to_right; i++) {
-    diff = pow(2.0, ((double) -i) );
-    if (x < diff) 
-      printf("0");
-    else {
-      printf("1");
-      x -= diff;
-    }
-    if (i == 0) 
-      ;
-    
-  }
-
-}
-
-int main() {
-  int m;
-  int n;
-  int o;
-  pla.divisor = 0;
-  pla.tot = 0;
-  printf("\tcase({D[5:3],Wmsbs})\n");
-  for (o=0; o < pow(2.0, DIVISOR_SIZE); o++) {
-    for (m=0; m < pow(2.0, TOT_SIZE); m++) {
-      printf("\t\t10'b");
-      disp_binary((double) pla.divisor, DIVISOR_SIZE, 0);
-      printf("_");
-      disp_binary((double) pla.tot, TOT_SIZE, 0);
-      printf(": q = 4'b");
-
-      /*
-	4 bits for Radix 4 (a=2)
-	1000 = +2
-	0100 = +1
-	0000 =  0
-	0010 = -1
-	0001 = -2		
-      */
-      switch (pla.divisor) {
-      case 0:
-	if ((pla.tot) >= 12)
-	  printf("1000");
-	else if ((pla.tot) >= 4)
-	  printf("0100");
-	else if ((pla.tot) >= -4)
-	  printf("0000");
-	else if ((pla.tot) >= -13)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 1:
-	if ((pla.tot) >= 14)
-	  printf("1000");
-	else if ((pla.tot) >= 4)
-	  printf("0100");
-	else if ((pla.tot) >= -6)
-	  printf("0000");
-	else if ((pla.tot) >= -15)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 2:
-	if ((pla.tot) >= 15)
-	  printf("1000");
-	else if ((pla.tot) >= 4)
-	  printf("0100");
-	else if ((pla.tot) >= -6)
-	  printf("0000");
-	else if ((pla.tot) >= -16)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 3:
-	if ((pla.tot) >= 16)
-	  printf("1000");
-	else if ((pla.tot) >= 4)
-	  printf("0100");
-	else if ((pla.tot) >= -6)
-	  printf("0000");
-	else if ((pla.tot) >= -18)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 4:
-	if ((pla.tot) >= 18)
-	  printf("1000");
-	else if ((pla.tot) >= 6)
-	  printf("0100");
-	else if ((pla.tot) >= -8)
-	  printf("0000");
-	else if ((pla.tot) >= -20)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 5:
-	if ((pla.tot) >= 20)
-	  printf("1000");
-	else if ((pla.tot) >= 6)
-	  printf("0100");
-	else if ((pla.tot) >= -8)
-	  printf("0000");
-	else if ((pla.tot) >= -20)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 6:
-	if ((pla.tot) >= 20)
-	  printf("1000");
-	else if ((pla.tot) >= 8)
-	  printf("0100");
-	else if ((pla.tot) >= -8)
-	  printf("0000");
-	else if ((pla.tot) >= -22)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 7:
-	if ((pla.tot) >= 24)
-	  printf("1000");
-	else if ((pla.tot) >= 8)
-	  printf("0100");
-	else if ((pla.tot) >= -8)
-	  printf("0000");
-	else if ((pla.tot) >= -24)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      default: printf ("XXX");
-			
-      }
-			
-      printf(";\n");
-      (pla.tot)++;
-    }
-    (pla.divisor)++;
-  }
-  printf("\tendcase\n");
-  
-}
--- a/pipelined/srt/qslc_r4a2b
+++ b/pipelined/srt/qslc_r4a2b
--- a/pipelined/srt/qslc_r4a2b.c
+++ b/pipelined/srt/qslc_r4a2b.c
@ -1,190 +0,0 @@
-/*
-  Program:      qslc_r4a2.c
-  Description:  Prints out Quotient Selection Table (assumes CPA is utilized to reduce memory)
-  User:         James E. Stine
-
-*/
-
-#include <stdio.h>
-#include <math.h>
-
-#define DIVISOR_SIZE 3
-#define CARRY_SIZE 7
-#define SUM_SIZE 7
-#define TOT_SIZE 7
-
-void disp_binary(double, int, int);
-
-struct bits {
-  unsigned int divisor : DIVISOR_SIZE;
-  int tot : TOT_SIZE;
-} pla;
-
-/* 
-
-   Function:      disp_binary
-   Description:   This function displays a Double-Precision number into
-   four 16 bit integers using the global union variable 
-   dp_number
-   Argument List: double x            The value to be converted
-   int bits_to_left    Number of bits left of radix point
-   int bits_to_right   Number of bits right of radix point
-   Return value:  none
-
-*/
-void disp_binary(double x, int bits_to_left, int bits_to_right) {
-  int i; 
-  double diff;
-
-  if (fabs(x) <  pow(2.0, ((double) -bits_to_right)) ) {
-    for (i = -bits_to_left + 1; i <= bits_to_right; i++) {
-      printf("0");
-    }
-    if (i == bits_to_right+1) 
-      ;
-    
-    return;
-  }
-
-  if (x < 0.0) 
-    x = pow(2.0, ((double) bits_to_left)) + x;
-
-  for (i = -bits_to_left + 1; i <= bits_to_right; i++) {
-    diff = pow(2.0, ((double) -i) );
-    if (x < diff) 
-      printf("0");
-    else {
-      printf("1");
-      x -= diff;
-    }
-    if (i == 0) 
-      ;
-    
-  }
-
-}
-
-int main() {
-  int m;
-  int n;
-  int o;
-  pla.divisor = 0;
-  pla.tot = 0;
-  for (o=0; o < pow(2.0, DIVISOR_SIZE); o++) {
-    for (m=0; m < pow(2.0, TOT_SIZE); m++) {
-      /*
-	4 bits for Radix 4 (a=2)
-	1000 = +2
-	0100 = +1
-	0000 =  0
-	0010 = -1
-	0001 = -2		
-      */
-      switch (pla.divisor) {
-      case 0:
-	if ((pla.tot) >= 12)
-	  printf("8");
-	else if ((pla.tot) >= 4)
-	  printf("4");
-	else if ((pla.tot) >= -4)
-	  printf("0");
-	else if ((pla.tot) >= -13)
-	  printf("2");
-	else
-	  printf("1");
-	break;
-      case 1:
-	if ((pla.tot) >= 14)
-	  printf("8");
-	else if ((pla.tot) >= 4)
-	  printf("4");
-	else if ((pla.tot) >= -6)
-	  printf("0");
-	else if ((pla.tot) >= -15)
-	  printf("2");
-	else
-	  printf("1");
-	break;
-      case 2:
-	if ((pla.tot) >= 15)
-	  printf("8");
-	else if ((pla.tot) >= 4)
-	  printf("4");
-	else if ((pla.tot) >= -6)
-	  printf("0");
-	else if ((pla.tot) >= -16)
-	  printf("2");
-	else
-	  printf("1");
-	break;
-      case 3:
-	if ((pla.tot) >= 16)
-	  printf("8");
-	else if ((pla.tot) >= 4)
-	  printf("4");
-	else if ((pla.tot) >= -6)
-	  printf("0");
-	else if ((pla.tot) >= -18)
-	  printf("2");
-	else
-	  printf("1");
-	break;
-      case 4:
-	if ((pla.tot) >= 18)
-	  printf("8");
-	else if ((pla.tot) >= 6)
-	  printf("4");
-	else if ((pla.tot) >= -8)
-	  printf("0");
-	else if ((pla.tot) >= -20)
-	  printf("2");
-	else
-	  printf("1");
-	break;
-      case 5:
-	if ((pla.tot) >= 20)
-	  printf("8");
-	else if ((pla.tot) >= 6)
-	  printf("4");
-	else if ((pla.tot) >= -8)
-	  printf("0");
-	else if ((pla.tot) >= -20)
-	  printf("2");
-	else
-	  printf("1");
-	break;
-      case 6:
-	if ((pla.tot) >= 20)
-	  printf("8");
-	else if ((pla.tot) >= 8)
-	  printf("4");
-	else if ((pla.tot) >= -8)
-	  printf("0");
-	else if ((pla.tot) >= -22)
-	  printf("2");
-	else
-	  printf("1");
-	break;
-      case 7:
-	if ((pla.tot) >= 24)
-	  printf("8");
-	else if ((pla.tot) >= 8)
-	  printf("4");
-	else if ((pla.tot) >= -8)
-	  printf("0");
-	else if ((pla.tot) >= -24)
-	  printf("2");
-	else
-	  printf("1");
-	break;
-      default: printf ("X");
-			
-      }
-			
-      printf("\n");
-      (pla.tot)++;
-    }
-    (pla.divisor)++;
-  }
-  
-}
--- a/pipelined/srt/qslc_r4a2b.tv
+++ b/pipelined/srt/qslc_r4a2b.tv
--- a/pipelined/srt/qslc_sqrt_r4a2
+++ b/pipelined/srt/qslc_sqrt_r4a2
--- a/pipelined/srt/qslc_sqrt_r4a2.c
+++ b/pipelined/srt/qslc_sqrt_r4a2.c
@ -1,198 +0,0 @@
-/*
-  Program:      qslc_r4a2.c
-  Description:  Prints out Quotient Selection Table (assumes CPA is utilized to reduce memory)
-  User:         James E. Stine
-
-*/
-
-#include <stdio.h>
-#include <math.h>
-
-#define DIVISOR_SIZE 3
-#define CARRY_SIZE 7
-#define SUM_SIZE 7
-#define TOT_SIZE 7
-
-void disp_binary(double, int, int);
-
-struct bits {
-  unsigned int divisor : DIVISOR_SIZE;
-  int tot : TOT_SIZE;
-} pla;
-
-/* 
-
-   Function:      disp_binary
-   Description:   This function displays a Double-Precision number into
-   four 16 bit integers using the global union variable 
-   dp_number
-   Argument List: double x            The value to be converted
-   int bits_to_left    Number of bits left of radix point
-   int bits_to_right   Number of bits right of radix point
-   Return value:  none
-
-*/
-void disp_binary(double x, int bits_to_left, int bits_to_right) {
-  int i; 
-  double diff;
-
-  if (fabs(x) <  pow(2.0, ((double) -bits_to_right)) ) {
-    for (i = -bits_to_left + 1; i <= bits_to_right; i++) {
-      printf("0");
-    }
-    if (i == bits_to_right+1) 
-      ;
-    
-    return;
-  }
-
-  if (x < 0.0) 
-    x = pow(2.0, ((double) bits_to_left)) + x;
-
-  for (i = -bits_to_left + 1; i <= bits_to_right; i++) {
-    diff = pow(2.0, ((double) -i) );
-    if (x < diff) 
-      printf("0");
-    else {
-      printf("1");
-      x -= diff;
-    }
-    if (i == 0) 
-      ;
-    
-  }
-
-}
-
-int main() {
-  int m;
-  int n;
-  int o;
-  pla.divisor = 0;
-  pla.tot = 0;
-  printf("\tcase({D[5:3],Wmsbs})\n");
-  for (o=0; o < pow(2.0, DIVISOR_SIZE); o++) {
-    for (m=0; m < pow(2.0, TOT_SIZE); m++) {
-      printf("\t\t11'b");
-      disp_binary((double) pla.divisor, DIVISOR_SIZE, 0);
-      printf("_");
-      disp_binary((double) pla.tot, TOT_SIZE, 0);
-      printf(": q = 4'b");
-
-      /*
-	4 bits for Radix 4 (a=2)
-	1000 = +2
-	0100 = +1
-	0000 =  0
-	0010 = -1
-	0001 = -2		
-      */
-      switch (pla.divisor) {
-      case 0:
-	if ((pla.tot) >= 24)
-	  printf("1000");
-	else if ((pla.tot) >= 8)
-	  printf("0100");
-	else if ((pla.tot) >= -8)
-	  printf("0000");
-	else if ((pla.tot) >= -26)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 1:
-	if ((pla.tot) >= 28)
-	  printf("1000");
-	else if ((pla.tot) >= 8)
-	  printf("0100");
-	else if ((pla.tot) >= -10)
-	  printf("0000");
-	else if ((pla.tot) >= -28)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 2:
-	if ((pla.tot) >= 32)
-	  printf("1000");
-	else if ((pla.tot) >= 8)
-	  printf("0100");
-	else if ((pla.tot) >= -12)
-	  printf("0000");
-	else if ((pla.tot) >= -32)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 3:
-	if ((pla.tot) >= 32)
-	  printf("1000");
-	else if ((pla.tot) >= 8)
-	  printf("0100");
-	else if ((pla.tot) >= -12)
-	  printf("0000");
-	else if ((pla.tot) >= -34)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 4:
-	if ((pla.tot) >= 36)
-	  printf("1000");
-	else if ((pla.tot) >= 12)
-	  printf("0100");
-	else if ((pla.tot) >= -12)
-	  printf("0000");
-	else if ((pla.tot) >= -36)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 5:
-	if ((pla.tot) >= 40)
-	  printf("1000");
-	else if ((pla.tot) >= 12)
-	  printf("0100");
-	else if ((pla.tot) >= -16)
-	  printf("0000");
-	else if ((pla.tot) >= -40)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 6:
-	if ((pla.tot) >= 40)
-	  printf("1000");
-	else if ((pla.tot) >= 16)
-	  printf("0100");
-	else if ((pla.tot) >= -16)
-	  printf("0000");
-	else if ((pla.tot) >= -44)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      case 7:
-	if ((pla.tot) >= 44)
-	  printf("1000");
-	else if ((pla.tot) >= 16)
-	  printf("0100");
-	else if ((pla.tot) >= -16)
-	  printf("0000");
-	else if ((pla.tot) >= -46)
-	  printf("0010");
-	else
-	  printf("0001");
-	break;
-      default: printf ("XXX");
-			
-      }
-			
-      printf(";\n");
-      (pla.tot)++;
-    }
-    (pla.divisor)++;
-  }
-  printf("\tendcase\n");
-  
-}
--- a/pipelined/srt/qslc_sqrt_r4a2.sv
+++ b/pipelined/srt/qslc_sqrt_r4a2.sv
--- a/pipelined/srt/sqrttestgen
+++ b/pipelined/srt/sqrttestgen
--- a/pipelined/srt/sqrttestgen.c
+++ b/pipelined/srt/sqrttestgen.c
@ -30,15 +30,11 @@ void main(void)
  FILE *fptr;
  double aFrac, rFrac;
  int    aExp,  rExp;
-  double mans[ENTRIES] = {1, 1.5, 1.25, 1.125, 1.0625,
+  double mans[ENTRIES] = {1, 1849.0/1024, 1.25, 1.125, 1.0625,
 			  1.75, 1.875, 1.99999,
-			  1.1, 1.2, 1.01, 1.001, 1.0001,
-<<<<<<< Updated upstream
-			  1/1.1, 1/1.5, 1/1.25, 1/1.125};
-=======
+			  1.1, 1.5, 1.01, 1.001, 1.0001,
 			  2/1.1, 2/1.5, 2/1.25, 2/1.125};
->>>>>>> Stashed changes
-  double exps[ENTRIES] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+  double exps[ENTRIES] = {0, 0, 2, 3, 4, 5, 6, 7, 8, 1, 10,
        11, 12, 13, 14, 15, 16};
  int i;
  int bias = 1023;
@ -51,10 +47,19 @@ void main(void)
  for (i=0; i<ENTRIES; i++) {
    aFrac = mans[i];
    aExp  = exps[i] + bias;
-    rFrac = sqrt(aFrac * pow(2, aExp - bias));
+    rFrac = sqrt(aFrac * pow(2, exps[i]));
    rExp  = (int) (log(rFrac)/log(2) + bias);
    output(fptr, aExp, aFrac, rExp, rFrac);
  }
+
+  //                                  WS
+  // Test 1: sqrt(1) = 1              0000 0000 0000 00
+  // Test 2: sqrt(1849/1024) = 43/32  0000 1100 1110 01
+  // Test 3: sqrt(5)                  0000 0100 0000 00
+  // Test 4: sqrt(9) = 3              1111 1001 0000 00
+  // Test 5: sqrt(17)                 0000 0001 0000 00
+  // Test 6: sqrt(56)                 1111 1110 0000 00
+  // Test 7: sqrt(120)                0000 1110 0000 00
  
  // for (i = 0; i< RANDOM_VECS; i++) {
  //   a = random_input();
@ -69,14 +74,23 @@ void main(void)

 void output(FILE *fptr, int aExp, double aFrac, int rExp, double rFrac)
 {
+  // Print a in standard double format
  fprintf(fptr, "%03x", aExp);
  printhex(fptr, aFrac);
  fprintf(fptr, "_");
+
+  // Spacing for testbench, value doesn't matter
+  fprintf(fptr, "%016x", 0);
+  fprintf(fptr, "_");
+
+  // Print r in standard double format
  fprintf(fptr, "%03x", rExp);
  printhex(fptr, rFrac);
+  fprintf(fptr, "_");
+
+  // Spacing for testbench, value doesn't matter
+  fprintf(fptr, "%016x", 0);
  fprintf(fptr, "\n");
-
-
 }

 void printhex(FILE *fptr, double m)
--- a/pipelined/srt/srt-waves.do
+++ b/pipelined/srt/srt-waves.do
@ -1,5 +1,5 @@
 add wave -noupdate /testbench/*
 add wave -noupdate /testbench/srt/*
-add wave -noupdate /testbench/srt/otfc2/*
+add wave -noupdate /testbench/srt/sotfc2/*
 add wave -noupdate /testbench/srt/preproc/*
 add wave -noupdate /testbench/srt/divcounter/*
--- a/pipelined/srt/srt.sv
+++ b/pipelined/srt/srt.sv
@ -29,8 +29,6 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 `include "wally-config.vh"
-`define EXTRAFRACBITS ((`NF<(`XLEN)) ? (`XLEN - `NF) : 0)
-`define EXTRAINTBITS ((`NF<(`XLEN)) ? 0 : (`NF - `XLEN))

 module srt (
  input  logic clk,
@ -49,18 +47,19 @@ module srt (
  input  logic       Int, // Choose integer inputs
  input  logic       Sqrt, // perform square root, not divide
  output logic       rsign, done,
-  output logic [`DIVLEN-1:0] Rem, Quot, // *** later handle integers
+  output logic [`DIVLEN-2:0] Rem, Quot, // *** later handle integers
  output logic [`NE-1:0] rExp,
  output logic [3:0] Flags
 );

-  logic           qp, qz, qm; // quotient is +1, 0, or -1
-  logic [`NE-1:0] calcExp;
-  logic           calcSign;
-  logic [`DIVLEN+3:0]  X, Dpreproc;
-  logic [`DIVLEN+3:0]  WS, WSA, WSN, WC, WCA, WCN, D, Db, Dsel;
+  logic                       qp, qz, qn; // quotient is +1, 0, or -1
+  logic [`NE-1:0]             calcExp;
+  logic                       calcSign;
+  logic [`DIVLEN+3:0]         X, Dpreproc, C, F, AddIn;
+  logic [`DIVLEN+3:0]         WS, WSA, WSN, WC, WCA, WCN, D, Db, Dsel;
  logic [$clog2(`XLEN+1)-1:0] intExp, dur, calcDur;
-  logic           intSign;
+  logic                       intSign;
+  logic                       cin;
 
  srtpreproc preproc(SrcA, SrcB, SrcXFrac, SrcYFrac, XExp, Fmt, W64, Signed, Int, Sqrt, X, Dpreproc, intExp, calcDur, intSign);

@ -76,23 +75,31 @@ module srt (

  // Quotient Selection logic
  // Given partial remainder, select quotient of +1, 0, or -1 (qp, qz, qn)
-  qsel2 qsel2(WS[`DIVLEN+3:`DIVLEN], WC[`DIVLEN+3:`DIVLEN], qp, qz, qm);
+  qsel2 qsel2(WS[`DIVLEN+3:`DIVLEN-1], WC[`DIVLEN+3:`DIVLEN-1], Sqrt, qp, qz, qn);

  flopen #(`NE) expflop(clk, Start, calcExp, rExp);
  flopen #(1) signflop(clk, Start, calcSign, rsign);
  flopen #(7) durflop(clk, Start, calcDur, dur);
  
-  counter divcounter(clk, Start, dur, done);
+  srtcounter divcounter(clk, Start, dur, done);

  // Divisor Selection logic
  assign Db = ~D;
-  mux3onehot #(`DIVLEN) divisorsel(Db, {(`DIVLEN+4){1'b0}}, D, qp, qz, qm, Dsel);
+  mux3onehot #(`DIVLEN) divisorsel(Db, {(`DIVLEN+4){1'b0}}, D, qp, qz, qn, Dsel);
+
+  // If only implementing division, use divide otfc
+  // otfc2  #(`DIVLEN) otfc2(clk, Start, qp, qz, qn, Quot);
+  // otherwise use sotfc
+  creg   sotfcC(clk, Start, C);
+  sotfc2 sotfc2(clk, Start, qp, qn, C, Quot, F);
+
+  // Adder input selection
+  assign AddIn = Sqrt ? F : Dsel;

  // Partial Product Generation
-  csa    #(`DIVLEN+4) csa(WS, WC, Dsel, qp, WSA, WCA);
+  assign cin = ~Sqrt & qp;
+  csa    #(`DIVLEN+4) csa(WS, WC, AddIn, cin, WSA, WCA);
  
-  otfc2  #(`DIVLEN) otfc2(clk, Start, qp, qz, qm, Quot);
-
  expcalc expcalc(.XExp, .YExp, .calcExp, .Sqrt);

  signcalc signcalc(.XSign, .YSign, .calcSign);
@ -121,42 +128,53 @@ module srtpreproc (

  logic  [$clog2(`XLEN+1)-1:0] zeroCntA, zeroCntB;
  logic  [`XLEN-1:0] PosA, PosB;
-  logic  [`DIVLEN-1:0] ExtraA, ExtraB, PreprocA, PreprocB, PreprocX, PreprocY, DivX, SqrtX;
+  logic  [`DIVLEN-1:0] ExtraA, ExtraB, PreprocA, PreprocB, PreprocX, PreprocY, DivX;
+  logic  [`NF+4:0] SqrtX;

+  // Generate positive integer inputs if they are signed
  assign PosA = (Signed & SrcA[`XLEN - 1]) ? -SrcA : SrcA;
  assign PosB = (Signed & SrcB[`XLEN - 1]) ? -SrcB : SrcB;

+  // Calculate leading zeros of integer inputs
  lzc #(`XLEN) lzcA (PosA, zeroCntA);
  lzc #(`XLEN) lzcB (PosB, zeroCntB);

+  // Make integers have DIVLEN bits
  assign ExtraA = {PosA, {`EXTRAINTBITS{1'b0}}};
  assign ExtraB = {PosB, {`EXTRAINTBITS{1'b0}}};

+  // Shift integers to have leading ones
  assign PreprocA = ExtraA << (zeroCntA + 1);
  assign PreprocB = ExtraB << (zeroCntB + 1);
+
+  // Make mantissas have DIVLEN bits
  assign PreprocX = {SrcXFrac, {`EXTRAFRACBITS{1'b0}}};
  assign PreprocY = {SrcYFrac, {`EXTRAFRACBITS{1'b0}}};

+  // Selecting correct divider inputs
  assign DivX = Int ? PreprocA : PreprocX;
-  assign SqrtX = {XExp[0] ? 4'b0000 : 4'b1111, SrcXFrac};
-
-  assign X = Sqrt ? SqrtX : {4'b0001, DivX};
+  assign SqrtX = XExp[0] ? {4'b0000, SrcXFrac, 1'b0} : {5'b11111, SrcXFrac};
+  assign X = Sqrt ? {SqrtX, {(`EXTRAFRACBITS-1){1'b0}}} : {4'b0001, DivX};
  assign D = {4'b0001, Int ? PreprocB : PreprocY};
+
+  // Integer exponent and sign calculations
  assign intExp = zeroCntB - zeroCntA + 1;
  assign intSign = Signed & (SrcA[`XLEN - 1] ^ SrcB[`XLEN - 1]);

-  assign dur = Int ? (intExp & {7{~intExp[6]}}) : (`DIVLEN + 2);
+  // Number of cycles of divider
+  assign dur = Int ? (intExp & {7{~intExp[6]}}) : (7)'(`DIVLEN);
 endmodule

 /////////////////////////////////
 // Quotient Selection, Radix 2 //
 /////////////////////////////////
 module qsel2 ( // *** eventually just change to 4 bits
-  input  logic [`DIVLEN+3:`DIVLEN] ps, pc, 
-  output logic         qp, qz, qm
+  input  logic [`DIVLEN+3:`DIVLEN-1] ps, pc, 
+  input  logic         Sqrt,
+  output logic         qp, qz, qn
 );
 
-  logic [`DIVLEN+3:`DIVLEN]  p, g;
+  logic [`DIVLEN+3:`DIVLEN-1]  p, g;
  logic          magnitude, sign, cout;

  // The quotient selection logic is presented for simplicity, not
@ -167,8 +185,8 @@ module qsel2 ( // *** eventually just change to 4 bits
  assign p = ps ^ pc;
  assign g = ps & pc;

-  assign #1 magnitude = ~(&p[`DIVLEN+2:`DIVLEN]);
-  assign #1 cout = g[`DIVLEN+2] | (p[`DIVLEN+2] & (g[`DIVLEN+1] | p[`DIVLEN+1] & g[`DIVLEN]));
+  assign #1 magnitude = ~(&p[`DIVLEN+2:`DIVLEN-1]);
+  assign #1 cout = g[`DIVLEN+2] | (p[`DIVLEN+2] & (g[`DIVLEN+1] | p[`DIVLEN+1] & (g[`DIVLEN] | (Sqrt & (p[`DIVLEN] & g[`DIVLEN-1])))));
  assign #1 sign = p[`DIVLEN+3] ^ cout;
 /*  assign #1 magnitude = ~((ps[54]^pc[54]) & (ps[53]^pc[53]) & 
 			  (ps[52]^pc[52]));
@ -180,7 +198,7 @@ module qsel2 ( // *** eventually just change to 4 bits
  // Produce quotient = +1, 0, or -1
  assign #1 qp = magnitude & ~sign;
  assign #1 qz = ~magnitude;
-  assign #1 qm = magnitude & sign;
+  assign #1 qn = magnitude & sign;
 endmodule

 ////////////////////////////////////
@ -191,45 +209,36 @@ module fsel2 (
  input  logic [`DIVLEN+3:0] C, S, SM,
  output logic [`DIVLEN+3:0] F
 );
-  logic [`DIVLEN+3:0] FP, FN;
+  logic [`DIVLEN+3:0] FP, FN, FZ;
  
  // Generate for both positive and negative bits
  assign FP = ~S & C;
  assign FN = SM | (C & (~C << 2));
+  assign FZ = {(`DIVLEN+4){1'b0}};

  // Choose which adder input will be used

-  assign F = sp ? FP : (sn ? FN : (`DIVLEN+4){1'b0});
+  assign F = sp ? FP : (sn ? FN : FZ);

 endmodule

 ///////////////////////////////////
 // On-The-Fly Converter, Radix 2 //
 ///////////////////////////////////
-module otfc2 #(parameter N=64) (
+module otfc2 #(parameter N=66) (
  input  logic         clk,
  input  logic         Start,
-  input  logic         qp, qz, qm,
-  output logic [N-1:0] r
+  input  logic         qp, qz, qn,
+  output logic [N-3:0] r
 );
-
  //  The on-the-fly converter transfers the quotient 
-  //  bits to the quotient as they come. 
-  //
-  //  This code follows the psuedocode presented in the 
-  //  floating point chapter of the book. Right now, 
-  //  it is written for Radix-2 division.
-  //
-  //  QM is Q-1. It allows us to write negative bits 
-  //  without using a costly CPA. 
+  //  bits to the quotient as they come.
+  //  Use this otfc for division only.
  logic [N+2:0] Q, QM, QNext, QMNext, QMMux;
-  //  QR and QMR are the shifted versions of Q and QM.
-  //  They are treated as [N-1:r] size signals, and 
-  //  discard the r most significant bits of Q and QM. 
  logic [N+1:0] QR, QMR;

  flopr #(N+3) Qreg(clk, Start, QNext, Q);
-  mux2 #(`DIVLEN+3) QMmux(QMNext, {`DIVLEN+3{1'b1}}, Start, QMMux);
+  mux2 #(`DIVLEN+3) Qmux(QMNext, {(`DIVLEN+3){1'b1}}, Start, QMMux);
  flop #(`DIVLEN+3) QMreg(clk, QMMux, QM);

  always_comb begin
@ -241,35 +250,76 @@ module otfc2 #(parameter N=64) (
    end else if (qz) begin
      QNext  = {QR,  1'b0};
      QMNext = {QMR, 1'b1};
-    end else begin        // If qp and qz are not true, then qm is
+    end else begin        // If qp and qz are not true, then qn is
      QNext  = {QMR, 1'b1};
      QMNext = {QMR, 1'b0};
    end 
  end
-  assign r = Q[N+2] ? Q[N+1:2] : Q[N:1];
+  assign r = Q[N] ? Q[N-1:2] : Q[N-2:1];

 endmodule

 ///////////////////////////////
 // Square Root OTFC, Radix 2 //
 ///////////////////////////////
-module softc2(
-  input  logic clk,
-  input  logic Start,
-  input  logic sp, sn,
-  output logic S,
+module sotfc2(
+  input  logic         clk,
+  input  logic         Start,
+  input  logic         sp, sn,
+  input  logic [`DIVLEN+3:0] C,
+  output logic [`DIVLEN-2:0] Sq,
+  output logic [`DIVLEN+3:0] F
 );
+  //  The on-the-fly converter transfers the square root 
+  //  bits to the quotient as they come.
+  //  Use this otfc for division and square root.
+  logic [`DIVLEN+3:0] S, SM, SNext, SMNext, SMux;
+
+  flopr #(`DIVLEN+4) SMreg(clk, Start, SMNext, SM);
+  mux2 #(`DIVLEN+4) Smux(SNext, {4'b0001, {(`DIVLEN){1'b0}}}, Start, SMux);
+  flop #(`DIVLEN+4) Sreg(clk, SMux, S);
+
+  always_comb begin
+    if (sp) begin
+      SNext  = S | ((C << 1) & ~(C << 2));
+      SMNext = S;
+    end else if (sn) begin
+      SNext  = SM | ((C << 1) & ~(C << 2));
+      SMNext = SM;
+    end else begin        // If sp and sn are not true, then sz is
+      SNext  = S;
+      SMNext = SM | ((C << 1) & ~(C << 2));
+    end 
+  end
+  assign Sq = S[`DIVLEN] ? S[`DIVLEN-1:1] : S[`DIVLEN-2:0];
+
+  fsel2 fsel(sp, sn, C, S, SM, F);

 endmodule
+
+//////////////////////////
+// C Register for SOTFC //
+//////////////////////////
+module creg(input  logic clk,
+            input  logic Start,
+            output logic [`DIVLEN+3:0] C
+);
+  logic [`DIVLEN+3:0] CMux;
+
+  mux2 #(`DIVLEN+4) Cmux({1'b1, C[`DIVLEN+3:1]}, {6'b111111, {(`DIVLEN-2){1'b0}}}, Start, CMux);
+  flop #(`DIVLEN+4) cflop(clk, CMux, C);
+endmodule
+
 /////////////
 // counter //
 /////////////
-module counter(input  logic clk, 
-               input  logic req, 
-               input  logic [$clog2(`XLEN+1)-1:0] dur,
-               output logic done);
+module srtcounter(input  logic clk, 
+                  input  logic req, 
+                  input  logic [$clog2(`XLEN+1)-1:0] dur,
+                  output logic done
+);
 
-   logic    [$clog2(`XLEN+1)-1:0]  count;
+  logic    [$clog2(`XLEN+1)-1:0]  count;

  // This block of control logic sequences the divider
  // through its iterations.  You may modify it if you
--- a/pipelined/srt/testbench.sv
+++ b/pipelined/srt/testbench.sv
@ -1,4 +1,4 @@
-`define DIVLEN 64
+`include "wally-config.vh"

 /////////////
 // counter //
@ -39,37 +39,27 @@ endmodule
 // testbench //
 //////////
 module testbench;
-  logic              clk;
-  logic              req;
-  logic              done;
-  logic              Int;
-  logic [63:0]       a, b;
-  logic [51:0]       afrac, bfrac;
-  logic [10:0]       aExp, bExp;
-  logic              asign, bsign;
-  logic [51:0]       r;
-  logic [63:0]       rInt;
-  logic [`DIVLEN-1:0]  Quot;
+  logic               clk;
+  logic               req;
+  logic               done;
+  logic               Int;
+  logic [`XLEN-1:0]   a, b;
+  logic [`NF-1:0]     afrac, bfrac;
+  logic [`NE-1:0]     aExp, bExp;
+  logic               asign, bsign;
+  logic [`NF-1:0]     r;
+  logic [`XLEN-1:0]   rInt;
+  logic [`DIVLEN-2:0] Quot;
 
  // Test parameters
  parameter MEM_SIZE = 40000;
  parameter MEM_WIDTH = 64+64+64+64;
 
-  // INT TEST SIZES
-  // `define memrem  63:0 
-  // `define memr  127:64
-  // `define memb  191:128
-  // `define mema  255:192
-
-  // FLOAT TEST SIZES
-  // `define memr  63:0 
-  // `define memb  127:64
-  // `define mema  191:128
-
-  // SQRT TEST SIZES 
-  `define memr  63:0 
-  `define mema  127:64
+  // Test sizes
+  `define memrem  63:0 
+  `define memr  127:64
  `define memb  191:128
+  `define mema  255:192

  // Test registers
  logic [MEM_WIDTH-1:0] Tests [0:MEM_SIZE];  // Space for input file
@ -118,16 +108,16 @@ module testbench;
      b = Vec[`memb];
      {bsign, bExp, bfrac} = b;
      nextr = Vec[`memr];
-      r = Quot[(`DIVLEN - 1):(`DIVLEN - 52)];
-      rInt = Quot;
+      r = Quot[(`DIVLEN - 2):(`DIVLEN - `NF - 1)];
+      rInt = {1'b1, Quot};
      req <= #5 1;
    end
  
  // Apply directed test vectors read from file.

  always @(posedge clk) begin
-    r = Quot[(`DIVLEN - 1):(`DIVLEN - 52)];
-    rInt = Quot;
+    r = Quot[(`DIVLEN - 2):(`DIVLEN - `NF - 1)];
+    rInt = {1'b1, Quot};
    if (done) begin
      if (~Int & ~Sqrt) begin
        req <= #5 1;
@ -165,15 +155,14 @@ module testbench;
        req <= #5 1;
        diffp = correctr[51:0] - r;
        diffn = r - correctr[51:0];
-        if (rExp !== correctr[62:52]) // check if accurate to 1 ulp
+        if ((rExp !== correctr[62:52]) | ($signed(diffn) > 1) | ($signed(diffp) > 1) | (diffn === 64'bx) | (diffp === 64'bx)) // check if accurate to 1 ulp
          begin
            errors = errors + 1;
            $display("result was %h, should be %h %h %h\n", r, correctr, diffn, diffp);
            $display("failed\n");
-            $stop;
          end
        if (afrac === 52'hxxxxxxxxxxxxx) begin 
-          $display("%d Tests completed successfully", testnum);
+          $display("%d Tests completed successfully", testnum-errors);
          $stop; end 
      end
    end
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@ -80,17 +80,17 @@ module testbenchfp;
  logic CvtResSgnE;
  logic [`NE:0]           CvtCalcExpE;    // the calculated exponent
 	logic [`LOGCVTLEN-1:0] CvtShiftAmtE;  // how much to shift by
-	logic [`DIVLEN+2:0] Quot;
+	logic [`QLEN-1-(`RADIX/4):0] Quot;
  logic CvtResDenormUfE;
-  logic [$clog2(`DIVLEN/2+3)-1:0] EarlyTermShiftDiv2;
+  logic [`DURLEN-1:0] EarlyTermShift;
  logic DivStart, DivBusy;
  logic reset = 1'b0;
  logic [`DIVLEN-1:0]    DivX;
  logic [`DIVLEN-1:0]  Dpreproc;
-  logic [`DIVLEN+3:0]  WSN, WS;
-  logic [`DIVLEN+3:0]  WCN, WC;
+  logic [`DIVLEN+3:0]  NextWSN, WS;
+  logic [`DIVLEN+3:0]  NextWCN, WC;
  logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
-  logic [$clog2(`DIVLEN/2+3)-1:0] Dur;
+  logic [`DURLEN-1:0] Dur;

  // in-between FMA signals
  logic                 Mult;
@ -679,15 +679,15 @@ module testbenchfp;
          .Pe, .ZmSticky, .KillProd); 
              
  postprocess postprocess(.Xs(XSgn), .Ys(YSgn), .PostProcSel(UnitVal[1:0]),
-              .Ze(ZExp),  .ZDenorm(ZDenorm), .FOpCtrl(OpCtrlVal), .Quot, .DivCalcExp(DivCalcExp),
-              .Xm(XMan), .Ym(YMan), .Zm(ZMan), .CvtCe(CvtCalcExpE), .DivSticky(DivSticky),
-              .XNaN(XNaN), .YNaN(YNaN), .ZNaN(ZNaN), .CvtResDenormUf(CvtResDenormUfE), .DivNegSticky,
+              .Ze(ZExp),  .ZDenorm(ZDenorm), .FOpCtrl(OpCtrlVal), .DivQm(Quot), .DivQe(DivCalcExp),
+              .Xm(XMan), .Ym(YMan), .Zm(ZMan), .CvtCe(CvtCalcExpE), .DivS(DivSticky),
+              .XNaN(XNaN), .YNaN(YNaN), .ZNaN(ZNaN), .CvtResDenormUf(CvtResDenormUfE),
              .XZero(XZero), .YZero(YZero), .ZZero(ZZero), .CvtShiftAmt(CvtShiftAmtE),
              .XInf(XInf), .YInf(YInf), .ZInf(ZInf), .CvtCs(CvtResSgnE), .ToInt(WriteIntVal),
              .XSNaN(XSNaN), .YSNaN(YSNaN), .ZSNaN(ZSNaN), .CvtLzcIn(CvtLzcInE), .IntZero,
-              .FmaKillProd(KillProd), .FmaZmSticky(ZmSticky), .FmaPe(Pe), .DivDone,
-              .FmaSm(Sm), .FmaNegSum(NegSum), .FmaInvA(InvA), .FmaNCnt(NCnt), .DivEarlyTermShiftDiv2(EarlyTermShiftDiv2), .FmaAs(As), .FmaPs(Ps), .Fmt(ModFmt), .Frm(FrmVal), 
-              .PostProcFlg(Flg), .W(FpRes), .FCvtIntRes(IntRes));
+              .FmaKillProd(KillProd), .FmaZmS(ZmSticky), .FmaPe(Pe), .DivDone,
+              .FmaSm(Sm), .FmaNegSum(NegSum), .FmaInvA(InvA), .FmaNCnt(NCnt), .DivEarlyTermShift(EarlyTermShift), .FmaAs(As), .FmaPs(Ps), .Fmt(ModFmt), .Frm(FrmVal), 
+              .PostProcFlg(Flg), .PostProcRes(FpRes), .FCvtIntRes(IntRes));
  
  fcvt fcvt (.Xs(XSgn), .Xe(XExp), .Xm(XMan), .Int(SrcA), .ToInt(WriteIntVal), 
            .XZero(XZero), .XDenorm(XDenorm), .FOpCtrl(OpCtrlVal), .IntZero,
@ -695,11 +695,10 @@ module testbenchfp;
  fcmp fcmp   (.FmtE(ModFmt), .FOpCtrlE(OpCtrlVal), .XSgnE(XSgn), .YSgnE(YSgn), .XExpE(XExp), .YExpE(YExp), 
              .XManE(XMan), .YManE(YMan), .XZeroE(XZero), .YZeroE(YZero), .CmpIntResE(CmpRes),
              .XNaNE(XNaN), .YNaNE(YNaN), .XSNaNE(XSNaN), .YSNaNE(YSNaN), .FSrcXE(X), .FSrcYE(Y), .CmpNVE(CmpFlg[4]), .CmpFpResE(FpCmpRes));
-  srtpreproc srtpreproc(.XManE(XMan), .Dur, .YManE(YMan),.X(DivX),.Dpreproc, .XZeroCnt, .YZeroCnt);
-  srtfsm srtfsm(.reset, .WSN, .WCN, .WS, .WC, .Dur, .DivBusy, .DivDone, .clk, .DivStart, .StallM(1'b0), .StallE(1'b0), .XZeroE(XZero), .YZeroE(YZero), .DivStickyE(DivSticky), .XNaNE(XNaN), .YNaNE(YNaN),
-                .XInfE(XInf), .YInfE(YInf), .DivNegStickyE(DivNegSticky), .EarlyTermShiftDiv2E(EarlyTermShiftDiv2));
-  srtradix4 srtradix4(.clk, .FmtE(ModFmt), .X(DivX),.Dpreproc, .DivBusy, .XZeroCnt, .YZeroCnt, .WS, .WC, .WSN, .WCN, .DivStart, .XExpE(XExp), .YExpE(YExp), .XZeroE(XZero), .YZeroE(YZero),
-                .Quot, .Rem(), .DivCalcExpM(DivCalcExp));
+  divsqrt divsqrt(.clk, .reset, .FmtE(ModFmt), .XManE(XMan), .YManE(YMan), .XExpE(XExp), .YExpE(YExp), 
+                  .XInfE(XInf), .YInfE(YInf), .XZeroE(XZero), .YZeroE(YZero), .XNaNE(XNaN), .YNaNE(YNaN), .DivStartE(DivStart), 
+                  .StallE(1'b0), .StallM(1'b0), .DivStickyM(DivSticky), .DivBusy, .DivCalcExpM(DivCalcExp),
+                  .EarlyTermShiftM(EarlyTermShift), .QuotM(Quot), .DivDone);

  assign CmpFlg[3:0] = 0;

@ -854,7 +853,7 @@ end

    // check if result is correct
    //  - wait till the division result is done or one extra cycle for early termination (to simulate the EM pipeline stage)
-    if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&~(DivBusy|DivStart)&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT)) begin
+    if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&~((DivBusy===1'b1)|DivStart)&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT)) begin
      errors += 1;
      $display("There is an error in %s", Tests[TestNum]);
      $display("inputs: %h %h %h\nSrcA: %h\n Res: %h %h\n Ans: %h %h", X, Y, Z, SrcA, Res, ResFlg, Ans, AnsFlg);
--- a/pipelined/testbench/testbench.sv
+++ b/pipelined/testbench/testbench.sv
@ -114,6 +114,7 @@ logic [3:0] dummy;
        "arch32f":      if (`F_SUPPORTED) tests = arch32f;
        "imperas32i":                     tests = imperas32i;
        "imperas32f":   if (`F_SUPPORTED) tests = imperas32f;
+        // "wally32d":     if (`D_SUPPORTED) tests = wally32d;
        "imperas32m":   if (`M_SUPPORTED) tests = imperas32m;
        "wally32a":     if (`A_SUPPORTED) tests = wally32a;
        "imperas32c":   if (`C_SUPPORTED) tests = imperas32c;
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
@ -34,7 +34,7 @@
 string tvpaths[] = '{
    "../../addins/imperas-riscv-tests/work/",
    "../../tests/riscof/work/riscv-arch-test/",
-    "../../tests/wally-riscv-arch-test/work/", //"../../tests/riscof/work/wally-riscv-arch-test/", 
+    "../../tests/wally-riscv-arch-test/work/", //"../../tests/riscof/work/wally-riscv-arch-test/", //
    "../../tests/imperas-riscv-tests/work/",
    "../../benchmarks/coremark/work/",
    "../../addins/embench-iot/"
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@ -107,7 +107,7 @@ ifeq ($(SAIFPOWER), 1)
 	cp -f ../pipelined/regression/power.saif .
 endif
 	dc_shell-xg-t -64bit -f scripts/$(NAME).tcl | tee $(OUTPUTDIR)/$(NAME).out
-	rm -rf $(OUTPUTDIR)/hdl
+#	rm -rf $(OUTPUTDIR)/hdl
 	rm -rf $(OUTPUTDIR)/WORK
 	rm -rf $(OUTPUTDIR)/alib-52

--- a/synthDC/extractSummary.py
+++ b/synthDC/extractSummary.py
@ -11,6 +11,7 @@ import numpy as np
 from ppa.ppaAnalyze import noOutliers
 from matplotlib import ticker
 import argparse
+import os


 def synthsintocsv():
@ -59,6 +60,7 @@ def synthsintocsv():
            writer.writerow([width, config, special, tech, freq, delay, area])
    file.close()

+	
 def synthsfromcsv(filename):
    Synth = namedtuple("Synth", "width config special tech freq delay area")
    with open(filename, newline='') as csvfile:
@ -74,10 +76,16 @@ def synthsfromcsv(filename):
            allSynths[i] = Synth(*allSynths[i])
    return allSynths

+
 def freqPlot(tech, width, config):
    ''' plots delay, area for syntheses with specified tech, module, width
    '''

+    current_directory = os.getcwd()
+    final_directory = os.path.join(current_directory, 'plots/wally')
+    if not os.path.exists(final_directory):
+        os.makedirs(final_directory)
+
    freqsL, delaysL, areasL = ([[], []] for i in range(3))
    for oneSynth in allSynths:
        if (width == oneSynth.width) & (config == oneSynth.config) & (tech == oneSynth.tech) & ('' == oneSynth.special):
@ -151,6 +159,7 @@ def areaDelay(tech, delays, areas, labels, fig, ax, norm=False):

    return fig

+
 def plotFeatures(tech, width, config):
    delays, areas, labels = ([] for i in range(3))
    freq = techdict[tech].targfreq
@ -168,7 +177,8 @@ def plotFeatures(tech, width, config):
    titlestr = tech+'_'+width+config
    plt.title(titlestr)
    plt.savefig('./plots/wally/features_'+titlestr+'.png')
-    
+
+	
 def plotConfigs(tech, special=''):
    delays, areas, labels = ([] for i in range(3))
    freq = techdict[tech].targfreq
@ -207,7 +217,8 @@ def normAreaDelay(special=''):
    ax.set_ylabel('Area (add32)')        
    ax.legend(handles = fullLeg, loc='upper left')
    plt.savefig('./plots/wally/normAreaDelay.png')
-    
+
+	
 def addFO4axis(fig, ax, tech):
    fo4 = techdict[tech].fo4

--- a/synthDC/scripts/synth.tcl
+++ b/synthDC/scripts/synth.tcl
@ -56,7 +56,7 @@ set vhdlout_show_unconnected_pins "true"
 # Due to parameterized Verilog must use analyze/elaborate and not 
 # read_verilog/vhdl (change to pull in Verilog and/or VHDL)
 #
-set alib_library_analysis_path ./$outputDir
+#set alib_library_analysis_path ./$outputDir
 define_design_lib WORK -path ./$outputDir/WORK
 analyze -f sverilog -lib WORK $my_verilog_files
 elaborate $my_toplevel -lib WORK 
@ -347,7 +347,7 @@ redirect -append $filename { report_timing -capacitance -transition_time -nets -
 redirect -append $filename { echo "\n\n\n//// Critical paths through fma2 ////\n\n\n" }
 redirect -append $filename { report_timing -capacitance -transition_time -nets -through {postprocess/*} -nworst 1 }
 redirect -append $filename { echo "\n\n\n//// Critical paths through fpdiv ////\n\n\n" }
-redirect -append $filename { report_timing -capacitance -transition_time -nets -through {fdivsqrt/*} -nworst 1 }
+redirect -append $filename { report_timing -capacitance -transition_time -nets -through {divsqrt/*} -nworst 1 }
 redirect -append $filename { echo "\n\n\n//// Critical paths through fcvt ////\n\n\n" }
 redirect -append $filename { report_timing -capacitance -transition_time -nets -through {fcvt/*} -nworst 1 }

--- a/tests/riscof/Makefile
+++ b/tests/riscof/Makefile
@ -8,7 +8,7 @@ wally_workdir = $(work)/wally-riscv-arch-test
 current_dir = $(shell pwd)
 XLEN    ?= 64

-all: root build_arch # build_wally memfile
+all: root build_arch #build_wally memfile

 root:
 	mkdir -p $(work_dir)
--- a/tests/riscof/spike/riscof_spike.py
+++ b/tests/riscof/spike/riscof_spike.py
@ -108,7 +108,7 @@ class spike(pluginTemplate):

      #TODO: The following assumes you are using the riscv-gcc toolchain. If
      #      not please change appropriately
-      self.compile_cmd = self.compile_cmd+' -mabi='+('lp64 ' if 64 in ispec['supported_xlen'] else 'ilp32 ')
+      self.compile_cmd = self.compile_cmd+' -mabi='+('lp64 ' if 64 in ispec['supported_xlen'] else ('ilp32e ' if "E" in ispec["ISA"] else 'ilp32 '))

    def runTests(self, testList):

@ -158,7 +158,12 @@ class spike(pluginTemplate):
 	  # echo statement.
          if self.target_run:
            # set up the simulation command. Template is for spike. Please change.
-            simcmd = self.dut_exe + ' --isa={0} +signature={1} +signature-granularity=4 {2}'.format(self.isa, sig_file, elf)
+            if ('NO_SAIL=True' in testentry['macros']):
+                # if the tests can't run on SAIL we copy the reference output to the src directory
+                reference_output = re.sub("/src/","/references/", re.sub(".S",".reference_output", test))
+                simcmd = 'cut -c-{0:g} {1} > {2}'.format(8, reference_output, sig_file) #use cut to remove comments when copying
+            else:
+                simcmd = self.dut_exe + ' --isa={0} +signature={1} +signature-granularity=4 {2}'.format(self.isa, sig_file, elf)
          else:
            simcmd = 'echo "NO RUN"'

--- a/tests/riscof/spike/spike_rv32imc_isa.yaml
+++ b/tests/riscof/spike/spike_rv32imc_isa.yaml
@ -1,11 +1,11 @@
 hart_ids: [0]
 hart0:
-  ISA: RV32IMAFCZicsr_Zifencei
+  ISA: RV32IMAFDCZicsr_Zifencei
  physical_addr_sz: 32
  User_Spec_Version: '2.3'
  supported_xlen: [32]
  misa:
-   reset-val: 0x40001125
+   reset-val: 0x4000112D
   rv32:
     accessible: true
     mxl:
@ -23,6 +23,6 @@ hart0:
           warl:
              dependency_fields: []
              legal:
-                - extensions[25:0] bitmask [0x0001125, 0x0000000]
+                - extensions[25:0] bitmask [0x000112D, 0x0000000]
              wr_illegal:
                - Unchanged