From 961cf91482b1da5bc617c1036cadb988981ae8da Mon Sep 17 00:00:00 2001
From: Jacob Pease <jacobpease@protonmail.com>
Date: Tue, 26 Sep 2023 15:16:45 -0500
Subject: [PATCH 01/62] Linux Makefile: Fixed find utility crashing for real
 this time.

---
 linux/Makefile | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/linux/Makefile b/linux/Makefile
index 433bf0e73..093913eee 100644
--- a/linux/Makefile
+++ b/linux/Makefile
@@ -27,23 +27,6 @@ BINARIES := fw_jump.elf vmlinux busybox
 OBJDUMPS := $(foreach name, $(BINARIES), $(basename $(name) .elf))
 OBJDUMPS := $(foreach name, $(OBJDUMPS), $(DIS)/$(name).objdump)
 
-# LINUXDIR := $(shell ls $(BUILDROOT)/output/build | grep -e '^linux-[0-9]\+\.[0-9]\+\.[0-9]\+$$' )
-# LINUXDIR := $(BUILDROOT)/output/build/$(LINUXDIR)
-# BUSYBOXDIR := $(shell ls $(BUILDROOT)/output/build | grep -e '^linux-[0-9]\+\.[0-9]\+\.[0-9]\+$$' )
-# BUSYBOXDIR := $(BUILDROOT)/output/build/$(BUSYBOXDIR)
-
-# Gets Linux and Busybox output folders for objedect dumps
-# LINUXDIR ?= $(shell find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/linux-[0-9]+\.[0-9]+\.[0-9]+$$")
-# BUSYBOXDIR ?= $(shell find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/busybox-[0-9]+\.[0-9]+\.[0-9]+$$")
-
-define linuxDir =
-$(shell find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/linux-[0-9]+\.[0-9]+\.[0-9]+$$")
-endef
-
-define busyboxDir =
-$(shell find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/busybox-[0-9]+\.[0-9]+\.[0-9]+$$")
-endef
-
 .PHONY: all generate disassemble install clean cleanDTB cleanDriver test
 
 # Generate all device trees -------------------------------------------
@@ -59,8 +42,7 @@ all:
 
 # Temp rule for debugging
 test:
-	@echo $(linuxDir)
-	@echo $(busyboxDir)
+	echo $(shell find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/linux-[0-9]+\.[0-9]+\.[0-9]+$$")
 
 generate: $(DTB) $(IMAGES)
 
@@ -87,11 +69,13 @@ $(DIS)/%.objdump: $(IMAGES)/%.elf
 $(DIS)/%.objdump: $(IMAGES)/%
 	riscv64-unknown-elf-objdump -S $< >> $@
 
-$(IMAGES)/vmlinux: $(call linuxDir)/vmlinux
-	cp $< $@
+$(IMAGES)/vmlinux:
+	linuxDir=$$(find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/linux-[0-9]+\.[0-9]+\.[0-9]+$$") && \
+	cp "$${linuxDir:?linux build directory not found}/vmlinux" $@
 
-$(IMAGES)/busybox: $(call busyboxDir)/busybox
-	cp $< $@
+$(IMAGES)/busybox:
+	busyboxDir=$$(find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/busybox-[0-9]+\.[0-9]+\.[0-9]+$$") && \
+	cp "$${busyboxDir:?busybox build directory not found}/busybox" $@
 
 # Generating new Buildroot directories --------------------------------
 

From 2b1c604016b5ddd878b64b18debadf6ea4e676e5 Mon Sep 17 00:00:00 2001
From: Jacob Pease <jacobpease@protonmail.com>
Date: Tue, 17 Oct 2023 14:13:18 -0500
Subject: [PATCH 02/62] Slight modification to testbench.sv

---
 testbench/testbench.sv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/testbench/testbench.sv b/testbench/testbench.sv
index 74077e547..dd83f7610 100644
--- a/testbench/testbench.sv
+++ b/testbench/testbench.sv
@@ -387,6 +387,7 @@ module testbench;
 
     assign SDCCmd = SDCCmdOE ? SDCCmdOut : 1'bz;
     assign SDCCmdIn = SDCCmd;
+    assign SDCDat = sd_dat_reg_t ? sd_dat_reg_o : sd_dat_i;
     assign SDCDatIn = SDCDat;
  -----/\----- EXCLUDED -----/\----- */
     assign SDCIntr = '0;

From 4c106215f44cecc055a2e88fe47d0b683d0282e0 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 08:46:55 -0800
Subject: [PATCH 03/62] Started cleaning up shifting leading 1 in fdivsqrt

---
 config/shared/config-shared.vh      | 18 +++++++++---------
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 18 +++++++++---------
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 48f02b848..acc7996cb 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -94,15 +94,15 @@ localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
 // division constants
-localparam DIVN        = (((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2); // standard length of input
-localparam LOGR        = ($clog2(RADIX));           // r = log(R)
-localparam RK          = (LOGR*DIVCOPIES);         // r*k used for intdiv preproc
-localparam LOGRK       = ($clog2(RK));               // log2(r*k)
-localparam FPDUR       = ((DIVN+1+(LOGR*DIVCOPIES))/(LOGR*DIVCOPIES)+(RADIX/4));
-localparam DURLEN      = ($clog2(FPDUR+1));
-localparam DIVb        = (FPDUR*LOGR*DIVCOPIES-1); // canonical fdiv size (b)
-localparam DIVBLEN     = ($clog2(DIVb+1)-1);
-localparam DIVa        = (DIVb+1-XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
+localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input
+localparam LOGR        = $clog2(RADIX);           // r = log(R)
+localparam RK          = LOGR*DIVCOPIES;         // r*k used for intdiv preproc
+localparam LOGRK       = $clog2(RK);               // log2(r*k)
+localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);
+localparam DURLEN      = $clog2(FPDUR+1);
+localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
+localparam DIVBLEN     = $clog2(DIVb+1)-1;
+localparam DIVa        = DIVb+1-XLEN; // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
 
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 6c397576a..8f3c477c4 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -52,7 +52,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.DIVb:0]             PreSqrtX;
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             QeE;                                 // Quotient Exponent (FP only)
-  logic [P.DIVb-1:0]           IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
+  logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
   logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
@@ -89,12 +89,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.XLEN) posbmux(BE, -BE, BsE, PosB);
 
     // Select integer or floating point inputs
-    mux2 #(P.DIVb) ifxmux({Xm, {(P.DIVb-P.NF-1){1'b0}}}, {PosA, {(P.DIVb-P.XLEN){1'b0}}}, IntDivE, IFX);
-    mux2 #(P.DIVb) ifdmux({Ym, {(P.DIVb-P.NF-1){1'b0}}}, {PosB, {(P.DIVb-P.XLEN){1'b0}}}, IntDivE, IFD);
+    mux2 #(P.DIVb+1) ifxmux({Xm, {(P.DIVb-P.NF){1'b0}}}, {PosA, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFX);
+    mux2 #(P.DIVb+1) ifdmux({Ym, {(P.DIVb-P.NF){1'b0}}}, {PosB, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFD);
     mux2 #(1)    numzmux(XZeroE, AZeroE, IntDivE, NumerZeroE);
   end else begin // Int not supported
-    assign IFX = {Xm, {(P.DIVb-P.NF-1){1'b0}}};
-    assign IFD = {Ym, {(P.DIVb-P.NF-1){1'b0}}};
+    assign IFX = {Xm, {(P.DIVb-P.NF){1'b0}}};
+    assign IFD = {Ym, {(P.DIVb-P.NF){1'b0}}};
     assign NumerZeroE = XZeroE;
   end
 
@@ -103,12 +103,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////////////////////////////////
 
   // count leading zeros for Subnorm FP and to normalize integer inputs
-  lzc #(P.DIVb) lzcX (IFX, ell);
-  lzc #(P.DIVb) lzcY (IFD, mE);
+  lzc #(P.DIVb) lzcX (IFX[P.DIVb:1], ell);
+  lzc #(P.DIVb) lzcY (IFD[P.DIVb:1], mE);
 
   // Normalization shift: shift off leading one
-  assign Xfract = (IFX << ell) << 1;
-  assign Dfract = (IFD << mE)  << 1; 
+  assign Xfract = (IFX[P.DIVb:1] << ell) << 1;
+  assign Dfract = (IFD[P.DIVb:1] << mE)  << 1; 
 
   //////////////////////////////////////////////////////
   // Integer Right Shift to digit boundary

From 953c53d065127ec688c07b0ac45362551328ce75 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 09:11:15 -0800
Subject: [PATCH 04/62] fdivsqrt parameter cleanup

---
 config/shared/config-shared.vh       | 13 ++++++-------
 config/shared/parameter-defs.vh      |  3 +--
 src/cvw.sv                           |  1 -
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv |  3 ++-
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 18 +++++++++---------
 5 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index acc7996cb..17b1ede83 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -94,15 +94,14 @@ localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
 // division constants
-localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input
-localparam LOGR        = $clog2(RADIX);           // r = log(R)
-localparam RK          = LOGR*DIVCOPIES;         // r*k used for intdiv preproc
-localparam LOGRK       = $clog2(RK);               // log2(r*k)
-localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);
+localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input: max(XLEN, NF+2)
+localparam LOGR        = $clog2(RADIX);                             // r = log(R)
+localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
+localparam LOGRK       = $clog2(RK);                                // log2(r*k)
+localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // 
 localparam DURLEN      = $clog2(FPDUR+1);
 localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
-localparam DIVBLEN     = $clog2(DIVb+1)-1;
-localparam DIVa        = DIVb+1-XLEN; // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
+localparam DIVBLEN     = $clog2(DIVb+2)-1;
 
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh
index d04b35e56..0c377c02d 100644
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@@ -184,6 +184,5 @@ localparam cvw_t P = '{
   FPDUR       : FPDUR,
   DURLEN      : DURLEN,
   DIVb        : DIVb,
-  DIVBLEN     : DIVBLEN,
-  DIVa        : DIVa
+  DIVBLEN     : DIVBLEN
 };
diff --git a/src/cvw.sv b/src/cvw.sv
index 4cbf67b28..02105823e 100644
--- a/src/cvw.sv
+++ b/src/cvw.sv
@@ -277,7 +277,6 @@ typedef struct packed {
   int DURLEN     ;
   int DIVb       ;
   int DIVBLEN    ;
-  int DIVa       ;
 
 } cvw_t;
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index c3c558902..9f887d4ab 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -110,7 +110,8 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVb+4) quotresmux(UnsignedQuotM, -UnsignedQuotM, NegQuotM, NormQuotM);
 
     // Select quotient or remainder and do normalization shift
-    mux2 #(P.DIVBLEN+1) normshiftmux(((P.DIVBLEN+1)'(P.DIVb) - (nM * (P.DIVBLEN+1)'(P.LOGR))), (mM + (P.DIVBLEN+1)'(P.DIVa)), RemOpM, NormShiftM);
+    localparam DIVa        = (P.DIVb+1-P.XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
+    mux2 #(P.DIVBLEN+1) normshiftmux(((P.DIVBLEN+1)'(P.DIVb) - (nM * (P.DIVBLEN+1)'(P.LOGR))), (mM + (P.DIVBLEN+1)'(DIVa)), RemOpM, NormShiftM);
     mux2 #(P.DIVb+4)    presresultmux(NormQuotM, NormRemM, RemOpM, PreResultM);
     assign PreIntResultM = $signed(PreResultM >>> NormShiftM); 
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 8f3c477c4..0e716ac20 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -48,7 +48,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   output logic [P.XLEN-1:0]    AM
 );
 
-  logic [P.DIVb-1:0]           Xfract, Dfract;
+  logic [P.DIVb:0]             Xfract, Dfract;
   logic [P.DIVb:0]             PreSqrtX;
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             QeE;                                 // Quotient Exponent (FP only)
@@ -103,12 +103,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////////////////////////////////
 
   // count leading zeros for Subnorm FP and to normalize integer inputs
-  lzc #(P.DIVb) lzcX (IFX[P.DIVb:1], ell);
-  lzc #(P.DIVb) lzcY (IFD[P.DIVb:1], mE);
+  lzc #(P.DIVb+1) lzcX (IFX, ell);
+  lzc #(P.DIVb+1) lzcY (IFD, mE);
 
   // Normalization shift: shift off leading one
-  assign Xfract = (IFX[P.DIVb:1] << ell) << 1;
-  assign Dfract = (IFD[P.DIVb:1] << mE)  << 1; 
+  assign Xfract = (IFX << ell);
+  assign Dfract = (IFD << mE); 
 
   //////////////////////////////////////////////////////
   // Integer Right Shift to digit boundary
@@ -158,10 +158,10 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   //  it comes out in the wash and gives the right answer.  Investigate later if possible.
   //////////////////////////////////////////////////////
 
-  assign DivX = {3'b000, ~NumerZeroE, Xfract};
+  assign DivX = {3'b000, Xfract};
 
   // Sqrt is initialized on step one as R(X-1), so depends on Radix
-  mux2 #(P.DIVb+1) sqrtxmux({~XZeroE, Xfract}, {1'b0, ~XZeroE, Xfract[P.DIVb-1:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
+  mux2 #(P.DIVb+1) sqrtxmux(Xfract, {1'b0, Xfract[P.DIVb:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
   if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};
   else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
   mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);
@@ -176,8 +176,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     assign X = PreShiftX;
   end
 
-   // Divisior register
-  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {4'b0001, Dfract}, D);
+  // Divisior register
+  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dfract}, D);
  
   // Floating-point exponent
   fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .XZero(XZeroE), .ell, .m(mE), .Qe(QeE));

From 255873a50cbc1b5af537130097fb318f0d17d8e8 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 11:21:02 -0800
Subject: [PATCH 05/62] Divsqrt cleanup: change Q to U, commenting code

---
 src/fpu/fdivsqrt/fdivsqrt.sv         |  8 +++----
 src/fpu/fdivsqrt/fdivsqrtcycles.sv   |  2 +-
 src/fpu/fdivsqrt/fdivsqrtexpcalc.sv  | 11 ++++++---
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 12 +++++-----
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 36 +++++++++++++++++-----------
 src/fpu/fpu.sv                       | 12 +++++-----
 src/fpu/postproc/divshiftcalc.sv     | 28 +++++++++++-----------
 src/fpu/postproc/postprocess.sv      | 12 +++++-----
 src/fpu/postproc/round.sv            |  6 ++---
 src/fpu/postproc/shiftcorrection.sv  |  8 +++----
 10 files changed, 74 insertions(+), 61 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrt.sv b/src/fpu/fdivsqrt/fdivsqrt.sv
index 5c5fa0f57..60e42f457 100644
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -45,8 +45,8 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   input  logic                 IntDivE, W64E,
   output logic                 DivStickyM,
   output logic                 FDivBusyE, IFDivStartE, FDivDoneE,
-  output logic [P.NE+1:0]      QeM,
-  output logic [P.DIVb:0]      QmM,
+  output logic [P.NE+1:0]      UeM,                         // Exponent result 
+  output logic [P.DIVb:0]      UmM,                         // Significand result
   output logic [P.XLEN-1:0]    FIntDivResultM
 );
 
@@ -74,7 +74,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
 
   fdivsqrtpreproc #(P) fdivsqrtpreproc(                          // Preprocessor
     .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE),
-    .FmtE, .SqrtE, .XZeroE, .Funct3E, .QeM, .X, .D, .CyclesE,
+    .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
     // Int-specific 
     .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
     .BZeroM, .nM, .mM, .AM, 
@@ -94,7 +94,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   fdivsqrtpostproc #(P) fdivsqrtpostproc(                        // Postprocessor
     .clk, .reset, .StallM, .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, 
     .SqrtE, .Firstun, .SqrtM, .SpecialCaseM, 
-    .QmM, .WZeroE, .DivStickyM, 
+    .UmM, .WZeroE, .DivStickyM, 
     // Int-specific 
     .nM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
     .FIntDivResultM);
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index ed28c9355..2122317fe 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -68,7 +68,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
     if (SqrtE) fbits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1; is it related to DIVCOPIES logic below?
     // if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
     else       fbits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
-    if (P.IDIV_ON_FPU) CyclesE =  IntDivE ? ((nE + 1)/P.DIVCOPIES) : (fbits + (P.LOGR*P.DIVCOPIES)-1)/(P.LOGR*P.DIVCOPIES);
+    if (P.IDIV_ON_FPU) CyclesE =  IntDivE ? ((nE + 1)/P.DIVCOPIES) : (fbits -1)/(P.RK) + 1;
     else              CyclesE = (fbits + (P.LOGR*P.DIVCOPIES)-1)/(P.LOGR*P.DIVCOPIES);
   end 
   /* verilator lint_on WIDTH */
diff --git a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
index 5531276df..113f2b2dd 100644
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@@ -32,8 +32,9 @@ module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 Sqrt,
   input  logic                 XZero, 
   input  logic [P.DIVBLEN:0]   ell, m,
-  output logic [P.NE+1:0]      Qe
+  output logic [P.NE+1:0]      Ue
   );
+  
   logic [P.NE-2:0] Bias;
   logic [P.NE+1:0] SXExp;
   logic [P.NE+1:0] SExp;
@@ -63,10 +64,14 @@ module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
       2'h2: Bias =  (P.NE-1)'(P.H_BIAS);
     endcase
   end
+
+  // Square root exponent = (Xe - l - bias) / 2 + bias; l accounts for subnorms
   assign SXExp = {2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - (P.NE+2)'(P.BIAS);
   assign SExp  = {SXExp[P.NE+1], SXExp[P.NE+1:1]} + {2'b0, Bias};
   
-  // correct exponent for subnormal input's normalization shifts
+  // division exponent = (Xe-l) - (Ye-m) + bias; l and m account for subnorms
   assign DExp  = ({2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - {2'b0, Ye} + {{(P.NE+1-P.DIVBLEN){1'b0}}, m} + {3'b0, Bias}); 
-  assign Qe = Sqrt ? SExp : DExp;
+
+  // Select square root or division exponent
+  assign Ue = Sqrt ? SExp : DExp;
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 9f887d4ab..2b9be54a7 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -38,14 +38,14 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.XLEN-1:0]  AM,
   input  logic               RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
   input  logic [P.DIVBLEN:0] nM, mM,
-  output logic [P.DIVb:0]    QmM, 
+  output logic [P.DIVb:0]    UmM,               // result significand
   output logic               WZeroE,
   output logic               DivStickyM,
   output logic [P.XLEN-1:0]  FIntDivResultM
 );
   
   logic [P.DIVb+3:0]         W, Sum;
-  logic [P.DIVb:0]           PreQmM;
+  logic [P.DIVb:0]           PreUmM;
   logic                      NegStickyM;
   logic                      weq0E, WZeroM;
   logic [P.XLEN-1:0]         IntDivResultM;
@@ -91,17 +91,17 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   // Determine if sticky bit is negative  // *** look for ways to optimize this.  Shift shouldn't be needed.
   assign Sum = WC + WS;
   assign NegStickyM = Sum[P.DIVb+3];
-  mux2 #(P.DIVb+1) preqmmux(FirstU, FirstUM, NegStickyM, PreQmM); // Select U or U-1 depending on negative sticky bit
-  mux2 #(P.DIVb+1)    qmmux(PreQmM, (PreQmM << 1), SqrtM, QmM);
+  mux2 #(P.DIVb+1) preummux(FirstU, FirstUM, NegStickyM, PreUmM); // Select U or U-1 depending on negative sticky bit
+  mux2 #(P.DIVb+1)    ummux(PreUmM, (PreUmM << 1), SqrtM, UmM);
 
-  // Integer quotient or remainder correctoin, normalization, and special cases
+  // Integer quotient or remainder correction, normalization, and special cases
   if (P.IDIV_ON_FPU) begin:intpostproc // Int supported
     logic [P.DIVBLEN:0] NormShiftM;
     logic [P.DIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM;
     logic signed [P.DIVb+3:0] PreResultM, PreIntResultM;
 
     assign W = $signed(Sum) >>> P.LOGR;
-    assign UnsignedQuotM = {3'b000, PreQmM};
+    assign UnsignedQuotM = {3'b000, PreUmM};
 
     // Integer remainder: sticky and sign correction muxes
     assign NegQuotM = AsM ^ BsM; // Integer Quotient is negative
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 0e716ac20..2255aafb1 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -35,7 +35,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 SqrtE,
   input  logic                 XZeroE,
   input  logic [2:0]           Funct3E,
-  output logic [P.NE+1:0]      QeM,
+  output logic [P.NE+1:0]      UeM,
   output logic [P.DIVb+3:0]    X, D,
   // Int-specific
   input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
@@ -48,10 +48,10 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   output logic [P.XLEN-1:0]    AM
 );
 
-  logic [P.DIVb:0]             Xfract, Dfract;
+  logic [P.DIVb:0]             Xnorm, Dnorm;
   logic [P.DIVb:0]             PreSqrtX;
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
-  logic [P.NE+1:0]             QeE;                                 // Quotient Exponent (FP only)
+  logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
   logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
@@ -106,9 +106,9 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   lzc #(P.DIVb+1) lzcX (IFX, ell);
   lzc #(P.DIVb+1) lzcY (IFD, mE);
 
-  // Normalization shift: shift off leading one
-  assign Xfract = (IFX << ell);
-  assign Dfract = (IFD << mE); 
+  // Normalization shift: shift leading one into most significant bit
+  assign Xnorm = (IFX << ell);
+  assign Dnorm = (IFD << mE); 
 
   //////////////////////////////////////////////////////
   // Integer Right Shift to digit boundary
@@ -133,10 +133,11 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
       logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
       logic [P.DIVBLEN:0] TotalIntBits, IntSteps;
       /* verilator lint_off WIDTH */
+      // n = k*ceil((r+p)/rk) - 1
       assign TotalIntBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
       assign IntTrunc = TotalIntBits % P.RK;                       // Truncation check for ceiling operator
       assign IntSteps = (TotalIntBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
-      assign nE = (IntSteps * P.DIVCOPIES) - 1;                    // Fractional digits
+      assign nE = (IntSteps * P.DIVCOPIES) - 1;                    // Fractional digits = total digits - 1 integer digit
       assign RightShiftX = P.RK - 1 - ((TotalIntBits - 1) % P.RK); // Right shift amount
       assign DivXShifted = DivX >> RightShiftX;                    // shift X by up to R*K-1 to complete in nE steps
       /* verilator lint_on WIDTH */
@@ -150,18 +151,25 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
   //////////////////////////////////////////////////////
   // Floating-Point Preprocessing
-  // append leading 1 (for nonzero inputs)
+  // Extend to Q4.b format
   // shift square root to be in range [1/4, 1)
   // Normalized numbers are shifted right by 1 if the exponent is odd
   // Subnormal numbers have Xe = 0 and an unbiased exponent of 1-BIAS.  They are shifted right if the number of leading zeros is odd.
   // NOTE: there might be a discrepancy that X is never right shifted by 2.  However
-  //  it comes out in the wash and gives the right answer.  Investigate later if possible.
+  //  it comes out in the wash and gives the right answer.  Investigate later if possible. ***
   //////////////////////////////////////////////////////
 
-  assign DivX = {3'b000, Xfract};
+  assign DivX = {3'b000, Xnorm}; // Zero-extend numerator for division
 
   // Sqrt is initialized on step one as R(X-1), so depends on Radix
-  mux2 #(P.DIVb+1) sqrtxmux(Xfract, {1'b0, Xfract[P.DIVb:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
+  // If X = 0, then special case logic sets sqrt = 0 so this portion doesn't matter
+  // Otherwise, X has a leading 1 after possible normalization shift and is now in range [1, 2)
+  // Next X is shifted right by 1 or 2 bits to range [1/4, 1) and exponent will be adjusted accordingly to be even
+  // Now (X-1) is negative.  Formed by placing all 1s in all four integer bits (in Q4.b form), keeping X in fraction bits
+  // Then multiply by R is left shift by r (1 or 2 for radix 2 or 4)
+  // For Radix 2, this gives 3 leading 1s, followed by the fraction bits
+  // For Radix 4, this gives 2 leading 1s, followed by the fraction bits (and a zero in the lsb)
+  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
   if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};
   else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
   mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);
@@ -177,11 +185,11 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   end
 
   // Divisior register
-  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dfract}, D);
+  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dnorm}, D);
  
   // Floating-point exponent
-  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .XZero(XZeroE), .ell, .m(mE), .Qe(QeE));
-  flopen #(P.NE+2) expreg(clk, IFDivStartE, QeE, QeM);
+  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .XZero(XZeroE), .ell, .m(mE), .Ue(UeE));
+  flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
 
   // Number of FSM cycles (to FSM)
   fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .nE, .CyclesE);
diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index f71999471..ffd9cf49a 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -133,8 +133,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic [P.XLEN-1:0]           FCvtIntResM;                        // fcvt integer result (for IEU)
   
   // divide signals
-  logic [P.DIVb:0]             QmM;                                // fdivsqrt signifcand
-  logic [P.NE+1:0]             QeM;                                // fdivsqrt exponent
+  logic [P.DIVb:0]             UmM;                                // fdivsqrt signifcand
+  logic [P.NE+1:0]             UeM;                                // fdivsqrt exponent
   logic                        DivStickyM;                         // fdivsqrt sticky bit
   logic                        FDivDoneE, IFDivStartE;             // fdivsqrt control signals
   logic [P.XLEN-1:0]           FIntDivResultM;                     // fdivsqrt integer division result (for IEU)
@@ -242,8 +242,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   fdivsqrt #(P) fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
     .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
     .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .IntDivE, .W64E,
-    .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .QeM, 
-    .QmM, .FIntDivResultM);
+    .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .UeM, 
+    .UmM, .FIntDivResultM);
 
   // compare: fmin/fmax, flt/fle/feq
   fcmp #(P) fcmp (.Fmt(FmtE), .OpCtrl(OpCtrlE), .Xs(XsE), .Ys(YsE), .Xe(XeE), .Ye(YeE), 
@@ -326,9 +326,9 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////////////////////////////////////////////////////////////////////
 
   postprocess #(P) postprocess(.Xs(XsM), .Ys(YsM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), 
-    .FmaASticky(FmaAStickyM), .XZero(XZeroM), .YZero(YZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
+    .FmaASticky(FmaAStickyM), .XZero(XZeroM), .YZero(YZeroM), .XInf(XInfM), .YInf(YInfM), .DivUm(UmM), .FmaSs(SsM),
     .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), 
-    .FmaSm(SmM), .DivQe(QeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
+    .FmaSm(SmM), .DivUe(UeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
     .CvtCe(CeM), .CvtResSubnormUf(CvtResSubnormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), 
     .ToInt(FWriteIntM), .DivSticky(DivStickyM), .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), 
     .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));
diff --git a/src/fpu/postproc/divshiftcalc.sv b/src/fpu/postproc/divshiftcalc.sv
index d560714db..380f8f5e6 100644
--- a/src/fpu/postproc/divshiftcalc.sv
+++ b/src/fpu/postproc/divshiftcalc.sv
@@ -27,8 +27,8 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.DIVb:0]              DivQm,              // divsqrt significand
-  input  logic [P.NE+1:0]              DivQe,              // divsqrt exponent
+  input  logic [P.DIVb:0]              DivUm,              // divsqrt significand
+  input  logic [P.NE+1:0]              DivUe,              // divsqrt exponent
   output logic [P.LOGNORMSHIFTSZ-1:0]  DivShiftAmt,        // divsqrt shift amount
   output logic [P.NORMSHIFTSZ-1:0]     DivShiftIn,         // divsqrt shift input
   output logic                         DivResSubnorm,      // is the divsqrt result subnormal
@@ -41,23 +41,23 @@ module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
 
   // is the result subnormal
   // if the exponent is 1 then the result needs to be normalized then the result is Subnormalizes
-  assign DivResSubnorm = DivQe[P.NE+1]|(~|DivQe[P.NE+1:0]);
+  assign DivResSubnorm = DivUe[P.NE+1]|(~|DivUe[P.NE+1:0]);
 
   // if the result is subnormal
-  //  00000000x.xxxxxx...                     Exp = DivQe
-  //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
-  //  .00xxxxxxxxxxxxx... << DivQe+NF+1       Exp = +1
+  //  00000000x.xxxxxx...                     Exp = DivUe
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
+  //  .00xxxxxxxxxxxxx... << DivUe+NF+1       Exp = +1
   //  .0000xxxxxxxxxxx... >> 1                Exp = 1
-  // Left shift amount      = DivQe+NF+1-1
-  assign DivSubnormShift    = (P.NE+2)'(P.NF)+DivQe;
+  // Left shift amount      = DivUe+NF+1-1
+  assign DivSubnormShift    = (P.NE+2)'(P.NF)+DivUe;
   assign DivSubnormShiftPos = ~DivSubnormShift[P.NE+1];
 
   // if the result is normalized
-  //  00000000x.xxxxxx...                     Exp = DivQe
-  //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
-  //  00000000.xxxxxxx... << NF               Exp = DivQe+1
-  //  00000000x.xxxxxx... << NF               Exp = DivQe (extra shift done afterwards)
-  //  00000000xx.xxxxx... << 1?               Exp = DivQe-1 (determined after)
+  //  00000000x.xxxxxx...                     Exp = DivUe
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
+  //  00000000.xxxxxxx... << NF               Exp = DivUe+1
+  //  00000000x.xxxxxx... << NF               Exp = DivUe (extra shift done afterwards)
+  //  00000000xx.xxxxx... << 1?               Exp = DivUe-1 (determined after)
   // inital Left shift amount  = NF
   // shift one more if the it's a minimally redundent radix 4 - one entire cycle needed for integer bit
   assign NormShift = (P.LOGNORMSHIFTSZ)'(P.NF);
@@ -68,5 +68,5 @@ module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
   assign DivShiftAmt        = DivResSubnorm ? DivSubnormShiftAmt : NormShift;
 
   // pre-shift the divider result for normalization
-  assign DivShiftIn = {{P.NF{1'b0}}, DivQm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
+  assign DivShiftIn = {{P.NF{1'b0}}, DivUm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
 endmodule
diff --git a/src/fpu/postproc/postprocess.sv b/src/fpu/postproc/postprocess.sv
index ee96b34d2..05db352cd 100644
--- a/src/fpu/postproc/postprocess.sv
+++ b/src/fpu/postproc/postprocess.sv
@@ -48,8 +48,8 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   input logic  [$clog2(3*P.NF+5)-1:0]      FmaSCnt,             // the normalization shift count
   //divide signals
   input logic                              DivSticky,           // divider sticky bit
-  input logic  [P.NE+1:0]                  DivQe,               // divsqrt exponent
-  input logic  [P.DIVb:0]                  DivQm,               // divsqrt significand
+  input logic  [P.NE+1:0]                  DivUe,               // divsqrt exponent
+  input logic  [P.DIVb:0]                  DivUm,               // divsqrt significand
   // conversion signals
   input logic                              CvtCs,               // the result's sign
   input logic  [P.NE:0]                    CvtCe,               // the calculated expoent
@@ -91,7 +91,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   // division singals
   logic [P.LOGNORMSHIFTSZ-1:0] DivShiftAmt;          // divsqrt shif amount
   logic [P.NORMSHIFTSZ-1:0]    DivShiftIn;           // divsqrt shift input
-  logic [P.NE+1:0]             Qe;                   // divsqrt corrected exponent after corretion shift
+  logic [P.NE+1:0]             Ue;                   // divsqrt corrected exponent after corretion shift
   logic                        DivByZero;            // divide by zero flag
   logic                        DivResSubnorm;        // is the divsqrt result subnormal
   logic                        DivSubnormShiftPos;   // is the divsqrt subnorm shift amout positive (not underflowed)
@@ -146,7 +146,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   fmashiftcalc #(P) fmashiftcalc(.FmaSm, .FmaSCnt, .Fmt, .NormSumExp, .FmaSe,
       .FmaSZero, .FmaPreResultSubnorm, .FmaShiftAmt, .FmaShiftIn);
 
-  divshiftcalc #(P) divshiftcalc(.DivQe, .DivQm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
+  divshiftcalc #(P) divshiftcalc(.DivUe, .DivUm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
 
   // select which unit's output to shift
   always_comb
@@ -174,7 +174,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
 
   // correct for LZA/divsqrt error
   shiftcorrection #(P) shiftcorrection(.FmaOp, .FmaPreResultSubnorm, .NormSumExp,
-      .DivResSubnorm, .DivSubnormShiftPos, .DivOp, .DivQe, .Qe, .FmaSZero, .Shifted, .FmaMe, .Mf);
+      .DivResSubnorm, .DivSubnormShiftPos, .DivOp, .DivUe, .Ue, .FmaSZero, .Shifted, .FmaMe, .Mf);
 
   ///////////////////////////////////////////////////////////////////////////////
   // Rounding
@@ -189,7 +189,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
   // calulate result sign used in rounding unit
   roundsign roundsign(.FmaOp, .DivOp, .CvtOp, .Sqrt, .FmaSs, .Xs, .Ys, .CvtCs, .Ms);
 
-  round #(P) round(.OutFmt, .Frm, .FmaASticky, .Plus1, .PostProcSel, .CvtCe, .Qe,
+  round #(P) round(.OutFmt, .Frm, .FmaASticky, .Plus1, .PostProcSel, .CvtCe, .Ue,
       .Ms, .FmaMe, .FmaOp, .CvtOp, .CvtResSubnormUf, .Mf, .ToInt,  .CvtResUf,
       .DivSticky, .DivOp, .UfPlus1, .FullRe, .Rf, .Re, .Sticky, .Round, .Guard, .Me);
 
diff --git a/src/fpu/postproc/round.sv b/src/fpu/postproc/round.sv
index 0a5d9ecc5..e01ff376b 100644
--- a/src/fpu/postproc/round.sv
+++ b/src/fpu/postproc/round.sv
@@ -39,7 +39,7 @@ module round import cvw::*;  #(parameter cvw_t P) (
   // divsqrt
   input  logic                     DivOp,              // is a division opperation being done
   input  logic                     DivSticky,          // divsqrt sticky bit
-  input  logic [P.NE+1:0]          Qe,                 // the divsqrt calculated expoent
+  input  logic [P.NE+1:0]          Ue,                 // the divsqrt calculated expoent
   // cvt
   input  logic                     CvtOp,              // is a convert opperation being done
   input  logic                     ToInt,              // is the cvt op a cvt to integer
@@ -300,8 +300,8 @@ module round import cvw::*;  #(parameter cvw_t P) (
       case(PostProcSel)
           2'b10:    Me = FmaMe; // fma
           2'b00:    Me = {CvtCe[P.NE], CvtCe}&{P.NE+2{~CvtResSubnormUf|CvtResUf}}; // cvt
-          // 2'b01: Me = DivDone ? Qe : '0; // divide
-          2'b01:    Me = Qe; // divide
+          // 2'b01: Me = DivDone ? Ue : '0; // divide
+          2'b01:    Me = Ue; // divide
           default:  Me = '0; 
       endcase
 
diff --git a/src/fpu/postproc/shiftcorrection.sv b/src/fpu/postproc/shiftcorrection.sv
index 9e0473667..f5860b42d 100644
--- a/src/fpu/postproc/shiftcorrection.sv
+++ b/src/fpu/postproc/shiftcorrection.sv
@@ -31,7 +31,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
   // divsqrt
   input logic                      DivOp,                  // is it a divsqrt opperation
   input logic                      DivResSubnorm,          // is the divsqrt result subnormal
-  input logic  [P.NE+1:0]          DivQe,                  // the divsqrt result's exponent
+  input logic  [P.NE+1:0]          DivUe,                  // the divsqrt result's exponent
   input logic                      DivSubnormShiftPos,     // is the subnorm divider shift amount positive (ie not underflowed)
   //fma
   input logic                      FmaOp,                  // is it an fma opperation
@@ -41,7 +41,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
   // output
   output logic [P.NE+1:0]          FmaMe,                  // exponent of the normalized sum
   output logic [P.CORRSHIFTSZ-1:0] Mf,                     // the shifted sum before LZA correction
-  output logic [P.NE+1:0]          Qe                      // corrected exponent for divider
+  output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
 );
 
   logic [3*P.NF+3:0]               CorrSumShifted;         // the shifted sum after LZA correction
@@ -61,7 +61,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
 
   // correct the shifting of the divsqrt caused by producing a result in (2, .5] range
   // condition: if the msb is 1 or the exponent was one, but the shifted quotent was < 1 (Subnorm)
-  assign LeftShiftQm = (LZAPlus1|(DivQe==1&~LZAPlus1));
+  assign LeftShiftQm = (LZAPlus1|(DivUe==1&~LZAPlus1));
   assign CorrQm0     = Shifted[P.NORMSHIFTSZ-3:P.NORMSHIFTSZ-P.CORRSHIFTSZ-2];
   assign CorrQm1     = Shifted[P.NORMSHIFTSZ-2:P.NORMSHIFTSZ-P.CORRSHIFTSZ-1];
   mux2 #(P.CORRSHIFTSZ) divcorrmux(CorrQm0, CorrQm1, LeftShiftQm, CorrQmShifted);
@@ -87,5 +87,5 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
 
   // the quotent is in the range [.5,2) if there is no early termination
   // if the quotent < 1 and not Subnormal then subtract 1 to account for the normalization shift
-  assign Qe = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivQe - {(P.NE+1)'(0), ~LZAPlus1};
+  assign Ue = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivUe - {(P.NE+1)'(0), ~LZAPlus1};
 endmodule

From 8f87860146fa2f58cc6d3cc42020d4199d0334b2 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 11:25:54 -0800
Subject: [PATCH 06/62] Reduced duplicated logic in fdivsqrtcycles

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 2122317fe..e9fbc6042 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -33,7 +33,10 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.DIVBLEN:0]   nE,
   output logic [P.DURLEN-1:0]  CyclesE
 );
+
   logic [P.DURLEN+1:0] Nf, fbits; // number of fractional bits
+  logic [P.DURLEN-1:0] fpcycles;  // number of cycles for floating-point operation
+
   // DIVN = P.NF+3
   // NS = NF + 1
   // N = NS or NS+2 for div/sqrt.
@@ -68,8 +71,10 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
     if (SqrtE) fbits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1; is it related to DIVCOPIES logic below?
     // if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
     else       fbits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
-    if (P.IDIV_ON_FPU) CyclesE =  IntDivE ? ((nE + 1)/P.DIVCOPIES) : (fbits -1)/(P.RK) + 1;
-    else              CyclesE = (fbits + (P.LOGR*P.DIVCOPIES)-1)/(P.LOGR*P.DIVCOPIES);
+    assign     fpcycles = (fbits-1)/(P.RK) + 1;
+
+    if (P.IDIV_ON_FPU) CyclesE = IntDivE ? ((nE + 1)/P.DIVCOPIES) : fpcycles;
+    else               CyclesE = fpcycles;
   end 
   /* verilator lint_on WIDTH */
 

From 2903791820e56cc02516ad24fab358b6f9d35ba7 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 14:00:27 -0800
Subject: [PATCH 07/62] Simplified cycle count logic

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv  | 18 +++++++++---------
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 22 +++++++++++++---------
 2 files changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index e9fbc6042..df581701b 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -30,12 +30,12 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 IntDivE,
-  input  logic [P.DIVBLEN:0]   nE,
+  input  logic [P.DIVBLEN:0]   IntResultBits,
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
-  logic [P.DURLEN+1:0] Nf, fbits; // number of fractional bits
-  logic [P.DURLEN-1:0] fpcycles;  // number of cycles for floating-point operation
+  logic [P.DURLEN+1:0] Nf, FPResultBits; // number of fractional bits
+  logic [P.DIVBLEN:0]  ResultBits; // number of result bits;
 
   // DIVN = P.NF+3
   // NS = NF + 1
@@ -68,13 +68,13 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
       endcase 
 
   always_comb begin 
-    if (SqrtE) fbits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1; is it related to DIVCOPIES logic below?
-    // if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
-    else       fbits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
-    assign     fpcycles = (fbits-1)/(P.RK) + 1;
+    if (SqrtE) FPResultBits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 rather than +2; is it related to DIVCOPIES logic below?
+    else       FPResultBits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
 
-    if (P.IDIV_ON_FPU) CyclesE = IntDivE ? ((nE + 1)/P.DIVCOPIES) : fpcycles;
-    else               CyclesE = fpcycles;
+    if (P.IDIV_ON_FPU) ResultBits = IntDivE ? IntResultBits : FPResultBits;
+    else               ResultBits = FPResultBits;
+
+    assign CyclesE = (ResultBits-1)/(P.RK) + 1;
   end 
   /* verilator lint_on WIDTH */
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 2255aafb1..ab0941aca 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -54,6 +54,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
   logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
+  logic [P.DIVBLEN:0]          IntResultBits;                       // bits in integer result
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
   logic                        SignedDivE;                          // signed division
@@ -122,7 +123,11 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     // calculate number of fractional bits p
     assign ZeroDiff = mE - ell;         // Difference in number of leading zeros
     assign ALTBE = ZeroDiff[P.DIVBLEN];  // A less than B (A has more leading zeros)
-    mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);              
+    mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);          
+
+    /* verilator lint_off WIDTH */
+    assign IntResultBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
+    /* verilator lint_on WIDTH */
 
     // Integer special cases (terminate immediately)
     assign ISpecialCaseE = BZeroE | ALTBE;
@@ -131,15 +136,14 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
     if (P.LOGRK > 0) begin // more than 1 bit per cycle
       logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
-      logic [P.DIVBLEN:0] TotalIntBits, IntSteps;
+      logic [P.DIVBLEN:0] IntSteps;
       /* verilator lint_off WIDTH */
       // n = k*ceil((r+p)/rk) - 1
-      assign TotalIntBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
-      assign IntTrunc = TotalIntBits % P.RK;                       // Truncation check for ceiling operator
-      assign IntSteps = (TotalIntBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
-      assign nE = (IntSteps * P.DIVCOPIES) - 1;                    // Fractional digits = total digits - 1 integer digit
-      assign RightShiftX = P.RK - 1 - ((TotalIntBits - 1) % P.RK); // Right shift amount
-      assign DivXShifted = DivX >> RightShiftX;                    // shift X by up to R*K-1 to complete in nE steps
+      assign IntTrunc = IntResultBits % P.RK;                       // Truncation check for ceiling operator
+      assign IntSteps = (IntResultBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
+      assign nE = (IntSteps * P.DIVCOPIES) - 1;                     // Fractional digits = total digits - 1 integer digit
+      assign RightShiftX = P.RK - 1 - ((IntResultBits - 1) % P.RK); // Right shift amount
+      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in nE steps
       /* verilator lint_on WIDTH */
     end else begin // radix 2 1 copy doesn't require shifting
       assign nE = p; 
@@ -192,7 +196,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
 
   // Number of FSM cycles (to FSM)
-  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .nE, .CyclesE);
+  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBits, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
     // pipeline registers

From b315ead57507cc884d070c0f77e0a05f875f705b Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 14:28:57 -0800
Subject: [PATCH 08/62] Simplified IntDivNormShift

---
 src/fpu/fdivsqrt/fdivsqrt.sv         |  6 ++--
 src/fpu/fdivsqrt/fdivsqrtcycles.sv   |  9 +++++-
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv |  4 +--
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 41 ++++++++++++++--------------
 4 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrt.sv b/src/fpu/fdivsqrt/fdivsqrt.sv
index 60e42f457..751486f86 100644
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -67,7 +67,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   // Integer div/rem signals                                
   logic                        BZeroM;                       // Denominator is zero
   logic                        IntDivM;                      // Integer operation
-  logic [P.DIVBLEN:0]          nM, mM;                       // Shift amounts
+  logic [P.DIVBLEN:0]          mM, IntDivNormShiftM;         // Shift amounts
   logic                        ALTBM, AsM, BsM, W64M;        // Special handling for postprocessor
   logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
   logic                        ISpecialCaseE;                // Integer div/remainder special cases
@@ -77,7 +77,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
     .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
     // Int-specific 
     .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
-    .BZeroM, .nM, .mM, .AM, 
+    .BZeroM, .IntDivNormShiftM, .mM, .AM, 
     .IntDivM, .W64M, .ALTBM, .AsM, .BsM);
 
   fdivsqrtfsm #(P) fdivsqrtfsm(                                  // FSM
@@ -96,6 +96,6 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
     .SqrtE, .Firstun, .SqrtM, .SpecialCaseM, 
     .UmM, .WZeroE, .DivStickyM, 
     // Int-specific 
-    .nM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
+    .IntDivNormShiftM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
     .FIntDivResultM);
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index df581701b..bba6e8005 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -67,6 +67,13 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
         P.Q_FMT: Nf = P.Q_NF;
       endcase 
 
+  // Cycle logic
+  // P.DIVCOPIES = k. P.LOGR = log(R) = r.  P.RK = rk.  
+  // Integer division needs p fractional + r integer result bits
+  // FP Division needs at least Nf fractional bits + 2 guard/round bits and one integer digit (LOG R integer bits) = Nf + 2 + r bits
+  // FP Sqrt needs at least Nf fractional bits, 2 guard/round bits, and *** shift bits
+  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBits / rk)
+
   always_comb begin 
     if (SqrtE) FPResultBits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 rather than +2; is it related to DIVCOPIES logic below?
     else       FPResultBits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
@@ -74,7 +81,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
     if (P.IDIV_ON_FPU) ResultBits = IntDivE ? IntResultBits : FPResultBits;
     else               ResultBits = FPResultBits;
 
-    assign CyclesE = (ResultBits-1)/(P.RK) + 1;
+    assign CyclesE = (ResultBits-1)/(P.RK) + 1; // ceil (ResultBits/rk)
   end 
   /* verilator lint_on WIDTH */
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 2b9be54a7..58649e3a8 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -37,7 +37,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   input  logic               Firstun, SqrtM, SpecialCaseM, 
   input  logic [P.XLEN-1:0]  AM,
   input  logic               RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
-  input  logic [P.DIVBLEN:0] nM, mM,
+  input  logic [P.DIVBLEN:0] mM, IntDivNormShiftM,
   output logic [P.DIVb:0]    UmM,               // result significand
   output logic               WZeroE,
   output logic               DivStickyM,
@@ -111,7 +111,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
 
     // Select quotient or remainder and do normalization shift
     localparam DIVa        = (P.DIVb+1-P.XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
-    mux2 #(P.DIVBLEN+1) normshiftmux(((P.DIVBLEN+1)'(P.DIVb) - (nM * (P.DIVBLEN+1)'(P.LOGR))), (mM + (P.DIVBLEN+1)'(DIVa)), RemOpM, NormShiftM);
+    mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftM, (mM + (P.DIVBLEN+1)'(DIVa)), RemOpM, NormShiftM);
     mux2 #(P.DIVb+4)    presresultmux(NormQuotM, NormRemM, RemOpM, PreResultM);
     assign PreIntResultM = $signed(PreResultM >>> NormShiftM); 
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index ab0941aca..35757e480 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -42,7 +42,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 IntDivE, W64E,
   output logic                 ISpecialCaseE,
   output logic [P.DURLEN-1:0]  CyclesE,
-  output logic [P.DIVBLEN:0]   nM, mM,
+  output logic [P.DIVBLEN:0]   mM, IntDivNormShiftM,
   output logic                 ALTBM, IntDivM, W64M,
   output logic                 AsM, BsM, BZeroM,
   output logic [P.XLEN-1:0]    AM
@@ -53,7 +53,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
-  logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
+  logic [P.DIVBLEN:0]          mE, ell;                             // Leading zeros of inputs
   logic [P.DIVBLEN:0]          IntResultBits;                       // bits in integer result
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
@@ -126,27 +126,21 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);          
 
     /* verilator lint_off WIDTH */
-    assign IntResultBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
+    assign IntResultBits = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
     /* verilator lint_on WIDTH */
 
     // Integer special cases (terminate immediately)
     assign ISpecialCaseE = BZeroE | ALTBE;
 
-    // calculate number of fractional digits nE and right shift amount RightShiftX to complete in discrete number of steps
-
+    // calculate right shift amount RightShiftX to complete in discrete number of steps
     if (P.LOGRK > 0) begin // more than 1 bit per cycle
       logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
       logic [P.DIVBLEN:0] IntSteps;
-      /* verilator lint_off WIDTH */
-      // n = k*ceil((r+p)/rk) - 1
-      assign IntTrunc = IntResultBits % P.RK;                       // Truncation check for ceiling operator
-      assign IntSteps = (IntResultBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
-      assign nE = (IntSteps * P.DIVCOPIES) - 1;                     // Fractional digits = total digits - 1 integer digit
+      /* verilator lint_off WIDTH */
       assign RightShiftX = P.RK - 1 - ((IntResultBits - 1) % P.RK); // Right shift amount
-      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in nE steps
+      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps
       /* verilator lint_on WIDTH */
     end else begin // radix 2 1 copy doesn't require shifting
-      assign nE = p; 
       assign DivXShifted = DivX;
     end
   end else begin
@@ -199,17 +193,22 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBits, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
+    logic [P.DIVBLEN:0] IntDivNormShiftE;
+    /* verilator lint_off WIDTH */
+    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    /* verilator lint_on WIDTH */
+
     // pipeline registers
-    flopen #(1)        mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
-    flopen #(1)       altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
-    flopen #(1)      bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
-    flopen #(1)      asignreg(clk, IFDivStartE, AsE,      AsM);
-    flopen #(1)      bsignreg(clk, IFDivStartE, BsE,      BsM);
-    flopen #(P.DIVBLEN+1) nreg(clk, IFDivStartE, nE,       nM); 
-    flopen #(P.DIVBLEN+1) mreg(clk, IFDivStartE, mE,       mM);
-    flopen #(P.XLEN)   srcareg(clk, IFDivStartE, AE,       AM);
+    flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
+    flopen #(1)         altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
+    flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
+    flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
+    flopen #(1)        bsignreg(clk, IFDivStartE, BsE,      BsM);
+    flopen #(P.DIVBLEN+1) nsreg(clk, IFDivStartE, IntDivNormShiftE, IntDivNormShiftM); 
+    flopen #(P.DIVBLEN+1)  mreg(clk, IFDivStartE, mE,       mM);
+    flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
     if (P.XLEN==64) 
-      flopen #(1)      w64reg(clk, IFDivStartE, W64E,     W64M);
+      flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
   end
 
 endmodule

From 3108b58290d7dfe0f05d1ee47d5c7b078873b453 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 14:55:36 -0800
Subject: [PATCH 09/62] Simplified integer postnormalization shift

---
 src/fpu/fdivsqrt/fdivsqrt.sv         |  6 +++---
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv |  7 ++-----
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 12 ++++++++----
 3 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrt.sv b/src/fpu/fdivsqrt/fdivsqrt.sv
index 751486f86..ac5c2c338 100644
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -67,7 +67,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   // Integer div/rem signals                                
   logic                        BZeroM;                       // Denominator is zero
   logic                        IntDivM;                      // Integer operation
-  logic [P.DIVBLEN:0]          mM, IntDivNormShiftM;         // Shift amounts
+  logic [P.DIVBLEN:0]          IntNormShiftM;                // Integer normalization shift amount
   logic                        ALTBM, AsM, BsM, W64M;        // Special handling for postprocessor
   logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
   logic                        ISpecialCaseE;                // Integer div/remainder special cases
@@ -77,7 +77,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
     .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
     // Int-specific 
     .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
-    .BZeroM, .IntDivNormShiftM, .mM, .AM, 
+    .BZeroM, .IntNormShiftM, .AM, 
     .IntDivM, .W64M, .ALTBM, .AsM, .BsM);
 
   fdivsqrtfsm #(P) fdivsqrtfsm(                                  // FSM
@@ -96,6 +96,6 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
     .SqrtE, .Firstun, .SqrtM, .SpecialCaseM, 
     .UmM, .WZeroE, .DivStickyM, 
     // Int-specific 
-    .IntDivNormShiftM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
+    .IntNormShiftM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
     .FIntDivResultM);
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 58649e3a8..3b6115201 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -37,7 +37,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   input  logic               Firstun, SqrtM, SpecialCaseM, 
   input  logic [P.XLEN-1:0]  AM,
   input  logic               RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
-  input  logic [P.DIVBLEN:0] mM, IntDivNormShiftM,
+  input  logic [P.DIVBLEN:0] IntNormShiftM,
   output logic [P.DIVb:0]    UmM,               // result significand
   output logic               WZeroE,
   output logic               DivStickyM,
@@ -96,7 +96,6 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
 
   // Integer quotient or remainder correction, normalization, and special cases
   if (P.IDIV_ON_FPU) begin:intpostproc // Int supported
-    logic [P.DIVBLEN:0] NormShiftM;
     logic [P.DIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM;
     logic signed [P.DIVb+3:0] PreResultM, PreIntResultM;
 
@@ -110,10 +109,8 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVb+4) quotresmux(UnsignedQuotM, -UnsignedQuotM, NegQuotM, NormQuotM);
 
     // Select quotient or remainder and do normalization shift
-    localparam DIVa        = (P.DIVb+1-P.XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
-    mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftM, (mM + (P.DIVBLEN+1)'(DIVa)), RemOpM, NormShiftM);
     mux2 #(P.DIVb+4)    presresultmux(NormQuotM, NormRemM, RemOpM, PreResultM);
-    assign PreIntResultM = $signed(PreResultM >>> NormShiftM); 
+    assign PreIntResultM = $signed(PreResultM >>> IntNormShiftM); 
 
     // special case logic
     // terminates immediately when B is Zero (div 0) or |A| has more leading 0s than |B|
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 35757e480..137f54d99 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -42,7 +42,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 IntDivE, W64E,
   output logic                 ISpecialCaseE,
   output logic [P.DURLEN-1:0]  CyclesE,
-  output logic [P.DIVBLEN:0]   mM, IntDivNormShiftM,
+  output logic [P.DIVBLEN:0]   IntNormShiftM,
   output logic                 ALTBM, IntDivM, W64M,
   output logic                 AsM, BsM, BZeroM,
   output logic [P.XLEN-1:0]    AM
@@ -193,10 +193,15 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBits, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
-    logic [P.DIVBLEN:0] IntDivNormShiftE;
+    logic [P.DIVBLEN:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
+    logic               RemOpE;
+
     /* verilator lint_off WIDTH */
     assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    assign IntRemNormShiftE = mE + (P.DIVb+1-P.XLEN);             // m + b - (N-1) for remainder normalization shift
     /* verilator lint_on WIDTH */
+    assign RemOpE = Funct3E[1];
+    mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);
 
     // pipeline registers
     flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
@@ -204,8 +209,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
     flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
     flopen #(1)        bsignreg(clk, IFDivStartE, BsE,      BsM);
-    flopen #(P.DIVBLEN+1) nsreg(clk, IFDivStartE, IntDivNormShiftE, IntDivNormShiftM); 
-    flopen #(P.DIVBLEN+1)  mreg(clk, IFDivStartE, mE,       mM);
+    flopen #(P.DIVBLEN+1) nsreg(clk, IFDivStartE, IntNormShiftE, IntNormShiftM); 
     flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
     if (P.XLEN==64) 
       flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);

From 03864642a7c434f7a638522e5c70845a62142a15 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 16:42:32 -0800
Subject: [PATCH 10/62] fdivsqrt cleanup

---
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 2 +-
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 3b6115201..e9fd2fd2c 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -118,7 +118,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
       if (BZeroM) begin         // Divide by zero
         if (RemOpM) IntDivResultM = AM;  
         else        IntDivResultM = {(P.XLEN){1'b1}};
-     end else if (ALTBM) begin // Numerator is zero
+     end else if (ALTBM) begin // Numerator is small
         if (RemOpM) IntDivResultM = AM;
         else        IntDivResultM = '0;
      end else       IntDivResultM = PreIntResultM[P.XLEN-1:0];
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 137f54d99..66ba957e8 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -198,7 +198,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
     /* verilator lint_off WIDTH */
     assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
-    assign IntRemNormShiftE = mE + (P.DIVb+1-P.XLEN);             // m + b - (N-1) for remainder normalization shift
+    assign IntRemNormShiftE = mE + (P.DIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
     /* verilator lint_on WIDTH */
     assign RemOpE = Funct3E[1];
     mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);

From 7d0d9dcebe704a464cb156e635bf2d215762daa6 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 18:01:13 -0800
Subject: [PATCH 11/62] divider cleanup

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv  | 18 +++++++++---------
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv |  8 ++++----
 src/fpu/unpackinput.sv              |  6 ------
 3 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index bba6e8005..d5c571940 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -30,12 +30,12 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 IntDivE,
-  input  logic [P.DIVBLEN:0]   IntResultBits,
+  input  logic [P.DIVBLEN:0]   IntResultBitsE,
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
-  logic [P.DURLEN+1:0] Nf, FPResultBits; // number of fractional bits
-  logic [P.DIVBLEN:0]  ResultBits; // number of result bits;
+  logic [P.DURLEN+1:0] Nf, FPResultBitsE; // number of fractional bits
+  logic [P.DIVBLEN:0]  ResultBitsE; // number of result bits;
 
   // DIVN = P.NF+3
   // NS = NF + 1
@@ -72,16 +72,16 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   // Integer division needs p fractional + r integer result bits
   // FP Division needs at least Nf fractional bits + 2 guard/round bits and one integer digit (LOG R integer bits) = Nf + 2 + r bits
   // FP Sqrt needs at least Nf fractional bits, 2 guard/round bits, and *** shift bits
-  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBits / rk)
+  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
 
   always_comb begin 
-    if (SqrtE) FPResultBits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 rather than +2; is it related to DIVCOPIES logic below?
-    else       FPResultBits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
+    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 and +0 rather than +2; is it related to DIVCOPIES logic below?
+    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
 
-    if (P.IDIV_ON_FPU) ResultBits = IntDivE ? IntResultBits : FPResultBits;
-    else               ResultBits = FPResultBits;
+    if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
+    else               ResultBitsE = FPResultBitsE;
 
-    assign CyclesE = (ResultBits-1)/(P.RK) + 1; // ceil (ResultBits/rk)
+    assign CyclesE = (ResultBitsE-1)/(P.RK) + 1; // ceil (ResultBitsE/rk)
   end 
   /* verilator lint_on WIDTH */
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 66ba957e8..e950a40bd 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -54,7 +54,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
   logic [P.DIVBLEN:0]          mE, ell;                             // Leading zeros of inputs
-  logic [P.DIVBLEN:0]          IntResultBits;                       // bits in integer result
+  logic [P.DIVBLEN:0]          IntResultBitsE;                      // bits in integer result
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
   logic                        SignedDivE;                          // signed division
@@ -126,7 +126,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);          
 
     /* verilator lint_off WIDTH */
-    assign IntResultBits = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
+    assign IntResultBitsE = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
     /* verilator lint_on WIDTH */
 
     // Integer special cases (terminate immediately)
@@ -137,7 +137,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
       logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
       logic [P.DIVBLEN:0] IntSteps;
       /* verilator lint_offf WIDTH */
-      assign RightShiftX = P.RK - 1 - ((IntResultBits - 1) % P.RK); // Right shift amount
+      assign RightShiftX = P.RK - 1 - ((IntResultBitsE - 1) % P.RK); // Right shift amount
       assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps
       /* verilator lint_on WIDTH */
     end else begin // radix 2 1 copy doesn't require shifting
@@ -190,7 +190,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
 
   // Number of FSM cycles (to FSM)
-  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBits, .CyclesE);
+  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
     logic [P.DIVBLEN:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
diff --git a/src/fpu/unpackinput.sv b/src/fpu/unpackinput.sv
index c551e8173..b3d7f901e 100644
--- a/src/fpu/unpackinput.sv
+++ b/src/fpu/unpackinput.sv
@@ -83,7 +83,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
       assign BadNaNBox = ~(Fmt|(&In[P.FLEN-1:P.LEN1])); // Check NaN boxing
       always_comb
         if (BadNaNBox) begin
-//          PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, In[P.LEN1-P.NE1-3:0]};
           PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, {(P.LEN1-P.NE1-2){1'b0}}};
         end else 
           PostBox = In;
@@ -143,8 +142,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
         if (BadNaNBox) begin
           case (Fmt)
             P.FMT: PostBox = In;
-//            P.FMT1: PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, In[P.LEN1-P.NE1-3:0]};
-//            P.FMT2: PostBox = {{(P.FLEN-P.LEN2){1'b1}}, 1'b1, {(P.NE2+1){1'b1}}, In[P.LEN2-P.NE2-3:0]};
             P.FMT1: PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, {(P.LEN1-P.NE1-2){1'b0}}};
             P.FMT2: PostBox = {{(P.FLEN-P.LEN2){1'b1}}, 1'b1, {(P.NE2+1){1'b1}}, {(P.LEN2-P.NE2-2){1'b0}}};
             default: PostBox = 'x;
@@ -230,9 +227,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
         if (BadNaNBox) begin
           case (Fmt)
             2'b11: PostBox = In;
-//            2'b01: PostBox = {{(P.Q_LEN-P.D_LEN){1'b1}}, 1'b1, {(P.D_NE+1){1'b1}}, In[P.D_LEN-P.D_NE-3:0]};
-//            2'b00: PostBox = {{(P.Q_LEN-P.S_LEN){1'b1}}, 1'b1, {(P.S_NE+1){1'b1}}, In[P.S_LEN-P.S_NE-3:0]};
-//            2'b10: PostBox = {{(P.Q_LEN-P.H_LEN){1'b1}}, 1'b1, {(P.H_NE+1){1'b1}}, In[P.H_LEN-P.H_NE-3:0]};
             2'b01: PostBox = {{(P.Q_LEN-P.D_LEN){1'b1}}, 1'b1, {(P.D_NE+1){1'b1}}, {(P.D_LEN-P.D_NE-2){1'b0}}};
             2'b00: PostBox = {{(P.Q_LEN-P.S_LEN){1'b1}}, 1'b1, {(P.S_NE+1){1'b1}}, {(P.S_LEN-P.S_NE-2){1'b0}}};
             2'b10: PostBox = {{(P.Q_LEN-P.H_LEN){1'b1}}, 1'b1, {(P.H_NE+1){1'b1}}, {(P.H_LEN-P.H_NE-2){1'b0}}};

From 3cae2385ab00e31887656ccf8c81bdbd75124396 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 18:19:41 -0800
Subject: [PATCH 12/62] Simplified out LOGRK parameter

---
 config/shared/config-shared.vh      | 5 ++---
 config/shared/parameter-defs.vh     | 1 -
 src/cvw.sv                          | 1 -
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 5 ++---
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 17b1ede83..10b56f24e 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -97,11 +97,10 @@ localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input: max(XLEN, NF+2)
 localparam LOGR        = $clog2(RADIX);                             // r = log(R)
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
-localparam LOGRK       = $clog2(RK);                                // log2(r*k)
-localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // 
+localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // *** relate to algorithm for rest of these
 localparam DURLEN      = $clog2(FPDUR+1);
 localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
-localparam DIVBLEN     = $clog2(DIVb+2)-1;
+localparam DIVBLEN     = $clog2(DIVb+2)-1;                          // *** where is 2 coming from?
 
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh
index 0c377c02d..85c9d1c19 100644
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@@ -180,7 +180,6 @@ localparam cvw_t P = '{
   DIVN        : DIVN,
   LOGR        : LOGR,
   RK          : RK,
-  LOGRK       : LOGRK,
   FPDUR       : FPDUR,
   DURLEN      : DURLEN,
   DIVb        : DIVb,
diff --git a/src/cvw.sv b/src/cvw.sv
index 02105823e..3c32982bd 100644
--- a/src/cvw.sv
+++ b/src/cvw.sv
@@ -272,7 +272,6 @@ typedef struct packed {
   int DIVN       ;
   int LOGR       ;
   int RK         ;
-  int LOGRK      ;
   int FPDUR      ;
   int DURLEN     ;
   int DIVb       ;
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index e950a40bd..97ceeb085 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -133,9 +133,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     assign ISpecialCaseE = BZeroE | ALTBE;
 
     // calculate right shift amount RightShiftX to complete in discrete number of steps
-    if (P.LOGRK > 0) begin // more than 1 bit per cycle
-      logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
-      logic [P.DIVBLEN:0] IntSteps;
+    if (P.RK > 1) begin // more than 1 bit per cycle
+      logic [$clog2(P.RK)-1:0] RightShiftX;
       /* verilator lint_offf WIDTH */
       assign RightShiftX = P.RK - 1 - ((IntResultBitsE - 1) % P.RK); // Right shift amount
       assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps

From d5ba8fc5e6f4b54a98dc7629e651ed8f6742b7b8 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 10 Nov 2023 18:33:08 -0800
Subject: [PATCH 13/62] fdivsqrt parameter cleanup

---
 config/shared/config-shared.vh  | 10 ++++++++--
 config/shared/parameter-defs.vh |  1 -
 src/cvw.sv                      |  1 -
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 10b56f24e..12967764f 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -93,11 +93,17 @@ localparam NF2   = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_NF   : H_NF);
 localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
+// intermediate division parameters not directly used in Divider
+localparam FPDIVN      = NF+3; // length of floating-point inputs: Ns + 2 = Nf + 3 for 1 integer bit, Nf fractional bits, 2 extra bits to shift sqrt into [1/4, 1)
+localparam DIVN        = ((FPDIVN<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVN+3; // standard length of input: max(XLEN, NF+2) ***
+
 // division constants
-localparam DIVN        = ((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2; // standard length of input: max(XLEN, NF+2)
+
+// *** define NF+2, justify, use in DIVN
 localparam LOGR        = $clog2(RADIX);                             // r = log(R)
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
-localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // *** relate to algorithm for rest of these
+//localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // *** relate to algorithm for rest of these
+localparam FPDUR       = (DIVN+LOGR-1)/RK + 1 ;               // ceiling((DIVN+LOGR)/RK)
 localparam DURLEN      = $clog2(FPDUR+1);
 localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
 localparam DIVBLEN     = $clog2(DIVb+2)-1;                          // *** where is 2 coming from?
diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh
index 85c9d1c19..57d61fc00 100644
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@@ -177,7 +177,6 @@ localparam cvw_t P = '{
   NORMSHIFTSZ : NORMSHIFTSZ,
   LOGNORMSHIFTSZ : LOGNORMSHIFTSZ,
   CORRSHIFTSZ : CORRSHIFTSZ,
-  DIVN        : DIVN,
   LOGR        : LOGR,
   RK          : RK,
   FPDUR       : FPDUR,
diff --git a/src/cvw.sv b/src/cvw.sv
index 3c32982bd..cc968b803 100644
--- a/src/cvw.sv
+++ b/src/cvw.sv
@@ -269,7 +269,6 @@ typedef struct packed {
   int CORRSHIFTSZ;
 
 // division constants
-  int DIVN       ;
   int LOGR       ;
   int RK         ;
   int FPDUR      ;

From 91d77902511fffc10ca631e9e64c37c1ab0db8b8 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Fri, 10 Nov 2023 21:05:42 -0600
Subject: [PATCH 14/62] update README for ppaSynth.py

---
 synthDC/README.md | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/synthDC/README.md b/synthDC/README.md
index edbd57340..30a98a76b 100644
--- a/synthDC/README.md
+++ b/synthDC/README.md
@@ -5,7 +5,7 @@ This subdirectory contains synthesis scripts for use with Synopsys
 scripts/synth.tcl.
 
 Example Usage
-make synth DESIGN=wallypipelinedcore FREQ=500
+make synth DESIGN=wallypipelinedcore FREQ=500 CONFIG=rv32e
 
 environment variables
 
@@ -38,5 +38,25 @@ To run ppa analysis that hones into target frequency, you can type:
 python3 ppa/ppaSynth.py from the synthDC directory.  This runs a sweep
 across all modules listed at the bottom of the ppaSynth.py file.
 
+There are two options for running the sweep.  The first option runs
+all modules for all techs around a given frequency (i.e., freqs).  The
+second option runs a frequency sweep for one specific module, width,
+and tech based on bestSynths.csv values.  Because the second option
+comes last, it takes priority; if it is commented out, all widths run.
 
+WARNING:  The first option may launch many runs, which could consume all
+of the available tool licenses.  Therefore, care must be taken to be
+sure that enough licenses are available before using this first option.
 
+##### Run specific syntheses
+	widths = [8, 16, 32, 64, 128] 
+	modules = ['mul', 'adder', 'shifter', 'flop', 'comparator', 'binencoder', 'csa', 'mux2', 'mux4', 'mux8']
+	techs = ['sky90', 'sky130', 'tsmc28', 'tsmc28psyn']
+	freqs = [5000]
+	synthsToRun = allCombos(widths, modules, techs, freqs)
+
+##### Run a sweep based on best delay found in existing syntheses
+	module = 'adder'
+	width = 32
+	tech = 'tsmc28psyn'
+ 	synthsToRun = freqSweep(module, width, tech)
\ No newline at end of file

From e1c935bd9bf70dca456d84dac92164338e5645c4 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Fri, 10 Nov 2023 21:06:24 -0600
Subject: [PATCH 15/62] Add bestSynths.csv that are the initial values.  If
 this is re-run after ppaAnalysis.py is run, more refinement can be made

---
 synthDC/ppa/bestSynths.csv | 180 ++++++++++++++++++++++++++++---------
 1 file changed, 140 insertions(+), 40 deletions(-)

diff --git a/synthDC/ppa/bestSynths.csv b/synthDC/ppa/bestSynths.csv
index 7e3d35569..d57ac6924 100644
--- a/synthDC/ppa/bestSynths.csv
+++ b/synthDC/ppa/bestSynths.csv
@@ -1,24 +1,74 @@
 Module,Tech,Width,Target Freq,Delay,Area,L Power (nW),D energy (nJ)
-priorityencoder,sky90,8,7683,0.12508649056358195,50.960001,24.761,0.010685929975270078
-priorityencoder,sky90,16,5773,0.16977016282695304,136.220003,77.243,0.021773774467348
-priorityencoder,sky90,32,4500,0.2218912222222222,372.400007,189.626,0.04371111111111111
-priorityencoder,sky90,64,4098,0.2439914738897023,797.720015,382.205,0.07393850658857981
-priorityencoder,sky90,128,3409,0.2933331557641537,1602.300031,610.009,0.1261366969785861
-add,sky90,8,3658,0.27337042810278844,253.820005,154.438,0.10825587752870422
-add,sky90,16,2942,0.3393218266485384,722.260013,485.109,0.32460910944935417
-add,sky90,32,2468,0.40496338573743923,1440.600027,714.057,0.6580226904376014
-add,sky90,64,2139,0.4674681813931744,2781.240054,1050.0,0.9392239364188874
-add,sky90,128,1885,0.5304949787798409,6186.740118,2230.0,2.1480106100795755
+binencoder,sky130,8,1000,1.0000,50.960001,24.761,0.010685929975270078
+binencoder,sky130,16,1000,1.0000,136.220003,77.243,0.021773774467348
+binencoder,sky130,32,1000,1.0000,372.400007,189.626,0.04371111111111111
+binencoder,sky130,64,1000,1.0000,797.720015,382.205,0.07393850658857981
+binencoder,sky130,128,1000,1.0000,1602.300031,610.009,0.1261366969785861
+adder,sky130,8,1000,1.0000,253.820005,154.438,0.10825587752870422
+adder,sky130,16,1000,1.0000,722.260013,485.109,0.32460910944935417
+adder,sky130,32,1000,1.0000,1440.600027,714.057,0.6580226904376014
+adder,sky130,64,1000,1.0000,2781.240054,1050.0,0.9392239364188874
+adder,sky130,128,1000,1.0000,6186.740118,2230.0,2.1480106100795755
+csa,sky130,8,1000,1.0000,266.560005,154.202,0.13650573115665163
+csa,sky130,16,1000,1.0000,533.12001,308.404,0.27263530601922104
+csa,sky130,32,1000,1.0000,1066.240021,616.808,0.5448072247308093
+csa,sky130,64,1000,1.0000,2132.480042,1230.0,1.0905412240768841
+csa,sky130,128,1000,1.0000,4264.960083,2470.0,2.178553363682347
+shifter,sky130,8,1000,1.0000,259.700005,196.451,0.07534088282874972
+shifter,sky130,16,1000,1.0000,666.400006,558.433,0.19552906110283155
+shifter,sky130,32,1000,1.0000,1475.880027,768.262,0.3807431082700759
+shifter,sky130,64,1000,1.0000,3914.120062,2680.0,1.144802541988198
+shifter,sky130,128,1000,1.0000,9192.400136,6080.0,2.9008914525432616
+comparator,sky130,8,1000,1.0000,200.900004,136.6,0.05001033271337053
+comparator,sky130,16,1000,1.0000,358.680007,189.253,0.06321553011448482
+comparator,sky130,32,1000,1.0000,690.900013,315.709,0.10771793448084398
+comparator,sky130,64,1000,1.0000,1372.980026,508.393,0.2048577820389901
+comparator,sky130,128,1000,1.0000,2744.980052,796.047,0.34396273737011823
+flop,sky130,8,1000,1.0000,133.279999,64.8145,0.193835
+flop,sky130,16,1000,1.0000,266.5599975,129.629,0.38715000000000005
+flop,sky130,32,1000,1.0000,533.119995,259.258,0.7723000000000001
+flop,sky130,64,1000,1.0000,1066.23999,520.0,1.54955
+flop,sky130,128,1000,1.0000,2132.4799805,1035.0,3.094
+mux2,sky130,8,1000,1.0000,63.700001,21.541,0.01932440083034535
+mux2,sky130,16,1000,1.0000,119.560002,32.354,0.03884536082474227
+mux2,sky130,32,1000,1.0000,375.340008,259.372,0.13671796921846893
+mux2,sky130,64,1000,1.0000,479.220009,115.22,0.15148539160324087
+mux2,sky130,128,1000,1.0000,1302.420025,767.078,0.4665334665334665
+mux4,sky130,8,1000,1.0000,148.960002,66.984,0.04026661024121879
+mux4,sky130,16,1000,1.0000,392.0,398.313,0.1037037037037037
+mux4,sky130,32,1000,1.0000,594.860011,331.197,0.131617289946576
+mux4,sky130,64,1000,1.0000,899.640016,344.331,0.2862533692722372
+mux4,sky130,128,1000,1.0000,2013.900038,818.249,0.6094182825484764
+mux8,sky130,8,1000,1.0000,287.140006,116.648,0.06089260808926081
+mux8,sky130,16,1000,1.0000,582.120003,282.366,0.14455681142177274
+mux8,sky130,32,1000,1.0000,1319.079995,670.683,0.35777218376337316
+mux8,sky130,64,1000,1.0000,2132.48004,808.482,0.44287680660701995
+mux8,sky130,128,1000,1.0000,4575.620089,1830.0,0.9786276715410572
+mul,sky130,8,1000,1.0000,2194.220041,1440.0,1.421374045801527
+mul,sky130,16,1000,1.0000,7519.540137,4940.0,6.376128385155466
+mul,sky130,32,1000,1.0000,25200.700446,14900.0,24.931847968545217
+mul,sky130,64,1000,1.0000,86011.661365,42600.0,88.84651898734177
+mul,sky130,128,1000,1.0000,296198.144128,114000.0,273.3148854961832
+binencoder,sky90,8,7683,0.12508649056358195,50.960001,24.761,0.010685929975270078
+binencoder,sky90,16,5773,0.16977016282695304,136.220003,77.243,0.021773774467348
+binencoder,sky90,32,4500,0.2218912222222222,372.400007,189.626,0.04371111111111111
+binencoder,sky90,64,4098,0.2439914738897023,797.720015,382.205,0.07393850658857981
+binencoder,sky90,128,3409,0.2933331557641537,1602.300031,610.009,0.1261366969785861
+adder,sky90,8,3658,0.27337042810278844,253.820005,154.438,0.10825587752870422
+adder,sky90,16,2942,0.3393218266485384,722.260013,485.109,0.32460910944935417
+adder,sky90,32,2468,0.40496338573743923,1440.600027,714.057,0.6580226904376014
+adder,sky90,64,2139,0.4674681813931744,2781.240054,1050.0,0.9392239364188874
+adder,sky90,128,1885,0.5304949787798409,6186.740118,2230.0,2.1480106100795755
 csa,sky90,8,5758,0.16536141368530738,266.560005,154.202,0.13650573115665163
 csa,sky90,16,5931,0.1654056314280897,533.12001,308.404,0.27263530601922104
 csa,sky90,32,5758,0.16536141368530738,1066.240021,616.808,0.5448072247308093
 csa,sky90,64,5931,0.1654056314280897,2132.480042,1230.0,1.0905412240768841
 csa,sky90,128,5931,0.1654056314280897,4264.960083,2470.0,2.178553363682347
-shiftleft,sky90,8,4327,0.23025600254217704,259.700005,196.451,0.07534088282874972
-shiftleft,sky90,16,3355,0.29803959314456036,666.400006,558.433,0.19552906110283155
-shiftleft,sky90,32,2503,0.39951757530962845,1475.880027,768.262,0.3807431082700759
-shiftleft,sky90,64,2203,0.45385946391284615,3914.120062,2680.0,1.144802541988198
-shiftleft,sky90,128,1907,0.5242938489774515,9192.400136,6080.0,2.9008914525432616
+shifter,sky90,8,4327,0.23025600254217704,259.700005,196.451,0.07534088282874972
+shifter,sky90,16,3355,0.29803959314456036,666.400006,558.433,0.19552906110283155
+shifter,sky90,32,2503,0.39951757530962845,1475.880027,768.262,0.3807431082700759
+shifter,sky90,64,2203,0.45385946391284615,3914.120062,2680.0,1.144802541988198
+shifter,sky90,128,1907,0.5242938489774515,9192.400136,6080.0,2.9008914525432616
 comparator,sky90,8,4839,0.20629126741062204,200.900004,136.6,0.05001033271337053
 comparator,sky90,16,4018,0.24806303982080635,358.680007,189.253,0.06321553011448482
 comparator,sky90,32,3602,0.276293542476402,690.900013,315.709,0.10771793448084398
@@ -44,31 +94,31 @@ mux8,sky90,16,3362,0.295237998810232,582.120003,282.366,0.14455681142177274
 mux8,sky90,32,3178,0.3140553102580239,1319.079995,670.683,0.35777218376337316
 mux8,sky90,64,2906,0.3440756228492774,2132.48004,808.482,0.44287680660701995
 mux8,sky90,128,2667,0.3749401308586427,4575.620089,1830.0,0.9786276715410572
-mult,sky90,8,1310,0.7631557786259543,2194.220041,1440.0,1.421374045801527
-mult,sky90,16,997,1.0029260270812437,7519.540137,4940.0,6.376128385155466
-mult,sky90,32,763,1.3106129895150722,25200.700446,14900.0,24.931847968545217
-mult,sky90,64,632,1.5822664810126583,86011.661365,42600.0,88.84651898734177
-mult,sky90,128,524,1.9083759465648855,296198.144128,114000.0,273.3148854961832
-priorityencoder,tsmc28,8,31335,0.031912196106590074,8.316,34.836,0.001716929950534546
-priorityencoder,tsmc28,16,21253,0.04703118086858326,21.672,78.026,0.004008845810003294
-priorityencoder,tsmc28,32,16464,0.06071258114674442,61.614,207.499,0.009323372206025266
-priorityencoder,tsmc28,64,13804,0.07239877021153289,137.466,425.592,0.01847290640394089
-priorityencoder,tsmc28,128,11440,0.0874065874125874,317.646,973.649,0.041171328671328666
-add,tsmc28,8,13838,0.07207477814713109,34.272,187.089,0.013311172134701546
-add,tsmc28,16,11521,0.08678002100512108,90.972001,475.207,0.03367763214998698
-add,tsmc28,32,9812,0.1018860211985324,209.286002,1060.0,0.08153281695882594
-add,tsmc28,64,8206,0.12185605215695831,388.836003,1770.0,0.1409943943456008
-add,tsmc28,128,7354,0.13597341881968997,907.452008,4360.0,0.3451183029643731
+mul,sky90,8,1310,0.7631557786259543,2194.220041,1440.0,1.421374045801527
+mul,sky90,16,997,1.0029260270812437,7519.540137,4940.0,6.376128385155466
+mul,sky90,32,763,1.3106129895150722,25200.700446,14900.0,24.931847968545217
+mul,sky90,64,632,1.5822664810126583,86011.661365,42600.0,88.84651898734177
+mul,sky90,128,524,1.9083759465648855,296198.144128,114000.0,273.3148854961832
+binencoder,tsmc28,8,31335,0.031912196106590074,8.316,34.836,0.001716929950534546
+binencoder,tsmc28,16,21253,0.04703118086858326,21.672,78.026,0.004008845810003294
+binencoder,tsmc28,32,16464,0.06071258114674442,61.614,207.499,0.009323372206025266
+binencoder,tsmc28,64,13804,0.07239877021153289,137.466,425.592,0.01847290640394089
+binencoder,tsmc28,128,11440,0.0874065874125874,317.646,973.649,0.041171328671328666
+adder,tsmc28,8,13838,0.07207477814713109,34.272,187.089,0.013311172134701546
+adder,tsmc28,16,11521,0.08678002100512108,90.972001,475.207,0.03367763214998698
+adder,tsmc28,32,9812,0.1018860211985324,209.286002,1060.0,0.08153281695882594
+adder,tsmc28,64,8206,0.12185605215695831,388.836003,1770.0,0.1409943943456008
+adder,tsmc28,128,7354,0.13597341881968997,907.452008,4360.0,0.3451183029643731
 csa,tsmc28,8,24524,0.040663382319360626,52.416,482.462,0.02173381177621921
 csa,tsmc28,16,24524,0.040663382319360626,104.832,964.99,0.04346762355243842
 csa,tsmc28,32,24524,0.040663382319360626,209.664,1930.0,0.08677214157559941
 csa,tsmc28,64,24524,0.040663382319360626,419.327999,3860.0,0.17342195400424076
 csa,tsmc28,128,24524,0.040663382319360626,838.655998,7720.0,0.3471701190670363
-shiftleft,tsmc28,8,15202,0.0656078183133798,50.652,367.074,0.016991185370346006
-shiftleft,tsmc28,16,11804,0.08465604506946797,127.511999,602.29,0.03388681802778719
-shiftleft,tsmc28,32,9587,0.10430391697089808,384.803997,1940.0,0.10180452696359654
-shiftleft,tsmc28,64,8272,0.12086674854932303,1041.263998,5460.0,0.2895309477756286
-shiftleft,tsmc28,128,7023,0.14238329232521713,1836.953994,8670.0,0.566566994162039
+shifter,tsmc28,8,15202,0.0656078183133798,50.652,367.074,0.016991185370346006
+shifter,tsmc28,16,11804,0.08465604506946797,127.511999,602.29,0.03388681802778719
+shifter,tsmc28,32,9587,0.10430391697089808,384.803997,1940.0,0.10180452696359654
+shifter,tsmc28,64,8272,0.12086674854932303,1041.263998,5460.0,0.2895309477756286
+shifter,tsmc28,128,7023,0.14238329232521713,1836.953994,8670.0,0.566566994162039
 comparator,tsmc28,8,17422,0.05733769130983814,35.784,170.595,0.009488003673516243
 comparator,tsmc28,16,13736,0.07273839778683751,54.558,250.167,0.014349155503785673
 comparator,tsmc28,32,12139,0.08236710865804432,145.782,622.975,0.03567015404893319
@@ -94,8 +144,58 @@ mux8,tsmc28,16,12264,0.08147446510110894,128.771998,548.714,0.02666340508806262
 mux8,tsmc28,32,11713,0.08517122410996329,172.115999,823.633,0.046956373260479814
 mux8,tsmc28,64,11014,0.09067453550027238,304.163999,1460.0,0.08498274922825495
 mux8,tsmc28,128,10474,0.09542350830628223,683.045996,2820.0,0.15705556616383426
-mult,tsmc28,8,5200,0.1922996923076923,577.206,4340.0,0.37769230769230766
-mult,tsmc28,16,3819,0.26184265147944485,1634.472002,11800.0,1.4553548049227547
-mult,tsmc28,32,3033,0.3295775611605671,6343.721998,47200.0,6.303330036267723
-mult,tsmc28,64,2390,0.4184090418410042,16045.092071,109000.0,18.54602510460251
-mult,tsmc28,128,1868,0.5353279057815846,44272.49428,262000.0,50.01177730192719
+mul,tsmc28,8,5200,0.1922996923076923,577.206,4340.0,0.37769230769230766
+mul,tsmc28,16,3819,0.26184265147944485,1634.472002,11800.0,1.4553548049227547
+mul,tsmc28,32,3033,0.3295775611605671,6343.721998,47200.0,6.303330036267723
+mul,tsmc28,64,2390,0.4184090418410042,16045.092071,109000.0,18.54602510460251
+mul,tsmc28,128,1868,0.5353279057815846,44272.49428,262000.0,50.01177730192719
+binencoder,tsmc28psyn,8,31335,0.031912196106590074,8.316,34.836,0.001716929950534546
+binencoder,tsmc28psyn,16,21253,0.04703118086858326,21.672,78.026,0.004008845810003294
+binencoder,tsmc28psyn,32,16464,0.06071258114674442,61.614,207.499,0.009323372206025266
+binencoder,tsmc28psyn,64,13804,0.07239877021153289,137.466,425.592,0.01847290640394089
+binencoder,tsmc28psyn,128,11440,0.0874065874125874,317.646,973.649,0.041171328671328666
+adder,tsmc28psyn,8,13838,0.07207477814713109,34.272,187.089,0.013311172134701546
+adder,tsmc28psyn,16,11521,0.08678002100512108,90.972001,475.207,0.03367763214998698
+adder,tsmc28psyn,32,9812,0.1018860211985324,209.286002,1060.0,0.08153281695882594
+adder,tsmc28psyn,64,8206,0.12185605215695831,388.836003,1770.0,0.1409943943456008
+adder,tsmc28psyn,128,7354,0.13597341881968997,907.452008,4360.0,0.3451183029643731
+csa,tsmc28psyn,8,24524,0.040663382319360626,52.416,482.462,0.02173381177621921
+csa,tsmc28psyn,16,24524,0.040663382319360626,104.832,964.99,0.04346762355243842
+csa,tsmc28psyn,32,24524,0.040663382319360626,209.664,1930.0,0.08677214157559941
+csa,tsmc28psyn,64,24524,0.040663382319360626,419.327999,3860.0,0.17342195400424076
+csa,tsmc28psyn,128,24524,0.040663382319360626,838.655998,7720.0,0.3471701190670363
+shifter,tsmc28psyn,8,15202,0.0656078183133798,50.652,367.074,0.016991185370346006
+shifter,tsmc28psyn,16,11804,0.08465604506946797,127.511999,602.29,0.03388681802778719
+shifter,tsmc28psyn,32,9587,0.10430391697089808,384.803997,1940.0,0.10180452696359654
+shifter,tsmc28psyn,64,8272,0.12086674854932303,1041.263998,5460.0,0.2895309477756286
+shifter,tsmc28psyn,128,7023,0.14238329232521713,1836.953994,8670.0,0.566566994162039
+comparator,tsmc28psyn,8,17422,0.05733769130983814,35.784,170.595,0.009488003673516243
+comparator,tsmc28psyn,16,13736,0.07273839778683751,54.558,250.167,0.014349155503785673
+comparator,tsmc28psyn,32,12139,0.08236710865804432,145.782,622.975,0.03567015404893319
+comparator,tsmc28psyn,64,11080,0.09024670758122744,294.21,1250.0,0.0684115523465704
+comparator,tsmc28psyn,128,9371,0.10671119720414043,558.432,2400.0,0.12794792444776437
+flop,tsmc28psyn,8,10,0.048889000000002625,15.12,78.6345,0.027246000000000003
+flop,tsmc28psyn,16,10,0.048889000000002625,30.24,157.29,0.054290000000000005
+flop,tsmc28psyn,32,10,0.048889000000002625,60.4799995,314.5805,0.10908000000000001
+flop,tsmc28psyn,64,10,0.048889000000002625,120.959999,630.0,0.21765500000000004
+flop,tsmc28psyn,128,10,0.048889000000002625,241.919998,1260.0,0.43579999999999997
+mux2,tsmc28psyn,8,29614,0.03374481252110488,16.758,114.564,0.005436617815897886
+mux2,tsmc28psyn,16,18767,0.053046021580433735,15.75,88.025,0.005142004582511856
+mux2,tsmc28psyn,32,17903,0.05585556035301346,32.130001,171.146,0.009897782494553985
+mux2,tsmc28psyn,64,18568,0.05371109651012495,91.35,523.884,0.027574321413183972
+mux2,tsmc28psyn,128,16637,0.05991099044298852,176.525999,941.106,0.05012923002945243
+mux4,tsmc28psyn,8,18151,0.055092383284667513,27.971999,133.963,0.008032615282904523
+mux4,tsmc28psyn,16,16486,0.06057952759917506,39.438,186.231,0.012556108213029236
+mux4,tsmc28psyn,32,15196,0.06580579126085812,69.174,324.969,0.023229797315082915
+mux4,tsmc28psyn,64,13926,0.07180612868016659,137.465999,648.086,0.04574177796926612
+mux4,tsmc28psyn,128,13090,0.07636619404125286,294.335997,1420.0,0.09358288770053477
+mux8,tsmc28psyn,8,12902,0.07750336319950395,44.604,214.286,0.0117501162610448
+mux8,tsmc28psyn,16,12264,0.08147446510110894,128.771998,548.714,0.02666340508806262
+mux8,tsmc28psyn,32,11713,0.08517122410996329,172.115999,823.633,0.046956373260479814
+mux8,tsmc28psyn,64,11014,0.09067453550027238,304.163999,1460.0,0.08498274922825495
+mux8,tsmc28psyn,128,10474,0.09542350830628223,683.045996,2820.0,0.15705556616383426
+mul,tsmc28psyn,8,5200,0.1922996923076923,577.206,4340.0,0.37769230769230766
+mul,tsmc28psyn,16,3819,0.26184265147944485,1634.472002,11800.0,1.4553548049227547
+mul,tsmc28psyn,32,3033,0.3295775611605671,6343.721998,47200.0,6.303330036267723
+mul,tsmc28psyn,64,2390,0.4184090418410042,16045.092071,109000.0,18.54602510460251
+mul,tsmc28psyn,128,1868,0.5353279057815846,44272.49428,262000.0,50.01177730192719

From 65e536e4014c77c479f56dbda54a7fc27b545d89 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Fri, 10 Nov 2023 21:07:36 -0600
Subject: [PATCH 16/62] Update ppa/ppaSynth.py for sky130 and better sweep
 parameterization

---
 synthDC/ppa/ppaSynth.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/synthDC/ppa/ppaSynth.py b/synthDC/ppa/ppaSynth.py
index d9d07c10d..528c851a0 100755
--- a/synthDC/ppa/ppaSynth.py
+++ b/synthDC/ppa/ppaSynth.py
@@ -12,8 +12,6 @@ from ppaAnalyze import synthsfromcsv
 
 def runCommand(module, width, tech, freq):
     command = "make synth DESIGN={} WIDTH={} TECH={} DRIVE=INV FREQ={} MAXOPT=1 MAXCORES=1".format(module, width, tech, freq)
-    print('here we go')
-
     subprocess.Popen(command, shell=True)
 
 def deleteRedundant(synthsToRun):
@@ -61,15 +59,15 @@ if __name__ == '__main__':
     
     ##### Run specific syntheses
 	widths = [8, 16, 32, 64, 128] 
-	modules = ['mult', 'add', 'shiftleft', 'flop', 'comparator', 'priorityencoder', 'add', 'csa', 'mux2', 'mux4', 'mux8']
-	techs = ['sky90', 'tsmc28']
+	modules = ['mul', 'adder', 'shifter', 'flop', 'comparator', 'binencoder', 'csa', 'mux2', 'mux4', 'mux8']
+	techs = ['sky90', 'sky130', 'tsmc28', 'tsmc28psyn']
 	freqs = [5000]
 	synthsToRun = allCombos(widths, modules, techs, freqs)
     
     ##### Run a sweep based on best delay found in existing syntheses
-	module = 'add'
+	module = 'adder'
 	width = 32
-	tech = 'sky90'
+	tech = 'tsmc28psyn'
 	synthsToRun = freqSweep(module, width, tech)
         
     ##### Only do syntheses for which a run doesn't already exist

From 7b79d8edeb59d676fcef4b54d841d00ff2930f02 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Fri, 10 Nov 2023 21:10:35 -0600
Subject: [PATCH 17/62] Update scripts/synth.tcl to add a parameter for
 width and also check wrapper to see if running CONFIG=rv32e to run without
 WIDTH

---
 synthDC/Makefile          |  6 +++---
 synthDC/scripts/synth.tcl | 14 +++++++++++---
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/synthDC/Makefile b/synthDC/Makefile
index e7918e3dc..8e1b09d01 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -20,11 +20,11 @@ export MAXCORES ?= 1
 export MAXOPT ?= 0
 export DRIVE ?= FLOP
 export USESRAM ?= 0
-
+export WIDTH ?= 32
 
 time := $(shell date +%F-%H-%M)
 hash := $(shell git rev-parse --short HEAD)
-export OUTPUTDIR := runs/$(DESIGN)_$(CONFIG)_$(MOD)_$(TECH)nm_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
+export OUTPUTDIR := runs/$(DESIGN)_$(WIDTH)_$(CONFIG)_$(MOD)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
 export SAIFPOWER ?= 0
 
 OLDCONFIGDIR ?= ${WALLY}/config
@@ -147,4 +147,4 @@ clean:
 	rm -f power.saif
 	rm -f Synopsys_stack_trace_*.txt
 	rm -f crte_*.txt
-	
\ No newline at end of file
+	
diff --git a/synthDC/scripts/synth.tcl b/synthDC/scripts/synth.tcl
index 9be076edf..cd4d6ff27 100755
--- a/synthDC/scripts/synth.tcl
+++ b/synthDC/scripts/synth.tcl
@@ -18,7 +18,6 @@ suppress_message {VER-274}
 # Enable Multicore
 set_host_options -max_cores $::env(MAXCORES)
 
-
 # get outputDir and configDir from environment (Makefile)
 set outputDir $::env(OUTPUTDIR)
 set cfg $::env(CONFIGDIR)
@@ -26,6 +25,7 @@ set hdl_src "../src"
 set saifpower $::env(SAIFPOWER)
 set maxopt $::env(MAXOPT)
 set drive $::env(DRIVE)
+set width $::env(WIDTH)
 
 eval file copy -force [glob ${cfg}/*.vh] {$outputDir/hdl/}
 eval file copy -force [glob ${hdl_src}/cvw.sv] {$outputDir/hdl/}
@@ -88,7 +88,13 @@ if { [shell_is_in_topographical_mode] } {
 #set alib_library_analysis_path ./$outputDir
 define_design_lib WORK -path ./$outputDir/WORK
 analyze -f sverilog -lib WORK $my_verilog_files
-elaborate $my_toplevel -lib WORK 
+# If wrapper=0, we want to run against a specific module and pass
+# width to DC
+if { $wrapper == 1 } {
+    elaborate $my_toplevel -lib WORK 
+} else {
+    elaborate $my_toplevel -lib WORK -parameters WIDTH=$width
+}
 
 # Set the current_design 
 current_design $my_toplevel
@@ -308,6 +314,8 @@ set filename [format "%s%s" $outputDir  "/reports/mindelay.rep"]
 redirect $filename { report_timing -capacitance -transition_time -nets -delay_type min -nworst 1 }
 
 set filename [format "%s%s" $outputDir  "/reports/per_module_timing.rep"]
+redirect -append $filename { echo "\n\n\n//// Critical paths through Stall ////\n\n\n" }
+redirect -append $filename { report_timing -capacitance -transition_time -nets -through {Stall*} -nworst 1 }
 redirect -append $filename { echo "\n\n\n//// Critical paths through ifu ////\n\n\n" }
 redirect -append $filename { report_timing -capacitance -transition_time -nets -through {ifu/*} -nworst 1 }
 redirect -append $filename { echo "\n\n\n//// Critical paths through ieu ////\n\n\n" }
@@ -445,4 +453,4 @@ set t2 [clock seconds]
 set t [expr $t2 - $t1]
 echo [expr $t/60]
 
-quit 
\ No newline at end of file
+quit 

From 448ced00c51cbe2b3d2433bec633c4d51b988206 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sat, 11 Nov 2023 04:05:34 -0800
Subject: [PATCH 18/62] Fixed testbench-fp to reflect signal name changes

---
 config/shared/config-shared.vh |  9 +++------
 testbench/testbench-fp.sv      | 10 +++++-----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 12967764f..61bf461eb 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -95,16 +95,13 @@ localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
 // intermediate division parameters not directly used in Divider
 localparam FPDIVN      = NF+3; // length of floating-point inputs: Ns + 2 = Nf + 3 for 1 integer bit, Nf fracitonal bits, 2 extra bits to shift sqrt into [1/4, 1)]
-localparam DIVN        = ((FPDIVN<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVN+3; // standard length of input: max(XLEN, NF+2) ***
+localparam DIVN        = ((FPDIVN<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVN; // standard length of input: max(XLEN, NF+2) ***
 
 // division constants
-
-// *** define NF+2, justify, use in DIVN
 localparam LOGR        = $clog2(RADIX);                             // r = log(R)
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
-//localparam FPDUR       = (DIVN+1)/RK + 1 + (RADIX/4);               // *** relate to algorithm for rest of these
-localparam FPDUR       = (DIVN+LOGR-1)/RK + 1 ;               // ceiling((DIVN+LOGR)/RK)
-localparam DURLEN      = $clog2(FPDUR+1);
+localparam FPDUR       = (DIVN+LOGR-1)/RK + 1 ;                     // ceiling((n+r)/rk)
+localparam DURLEN      = $clog2(FPDUR+1);                           // number of bits to represent the duration
 localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
 localparam DIVBLEN     = $clog2(DIVb+2)-1;                          // *** where is 2 coming from?
 
diff --git a/testbench/testbench-fp.sv b/testbench/testbench-fp.sv
index e5f215e07..5ef752691 100644
--- a/testbench/testbench-fp.sv
+++ b/testbench/testbench-fp.sv
@@ -115,8 +115,8 @@ module testbenchfp;
    logic 			FlushE;
    logic 			IFDivStartE;
    logic 			FDivDoneE;
-   logic [P.NE+1:0] 		QeM;
-   logic [P.DIVb:0] 		QmM;
+   logic [P.NE+1:0] 		UeM;
+   logic [P.DIVb:0] 		UmM;
    logic [P.XLEN-1:0] 		FIntDivResultM;
    logic 			ResMatch;                   // Check if result match
    logic 			FlagMatch;                  // Check if IEEE flags match
@@ -705,7 +705,7 @@ module testbenchfp;
    end
    
    postprocess #(P) postprocess(.Xs(Xs), .Ys(Ys), .PostProcSel(UnitVal[1:0]),
-				.OpCtrl(OpCtrlVal), .DivQm(Quot), .DivQe(DivCalcExp),
+				.OpCtrl(OpCtrlVal), .DivUm(Quot), .DivUe(DivCalcExp),
 				.Xm(Xm), .Ym(Ym), .Zm(Zm), .CvtCe(CvtCalcExpE), .DivSticky(DivSticky), .FmaSs(Ss),
 				.XNaN(XNaN), .YNaN(YNaN), .ZNaN(ZNaN), .CvtResSubnormUf(CvtResSubnormUfE),
 				.XZero(XZero), .YZero(YZero), .CvtShiftAmt(CvtShiftAmtE),
@@ -734,8 +734,8 @@ module testbenchfp;
 			     .XInfE(XInf), .YInfE(YInf), .XZeroE(XZero), .YZeroE(YZero), 
 			     .XNaNE(XNaN), .YNaNE(YNaN), 
 			     .FDivStartE(DivStart), .IDivStartE(1'b0), .W64E(1'b0),
-			     .StallM(1'b0), .DivStickyM(DivSticky), .FDivBusyE, .QeM(DivCalcExp),
-			     .QmM(Quot),
+			     .StallM(1'b0), .DivStickyM(DivSticky), .FDivBusyE, .UeM(DivCalcExp),
+			     .UmM(Quot),
 			     .FlushE(1'b0), .ForwardedSrcAE('0), .ForwardedSrcBE('0), .Funct3M(Funct3M),
 			     .Funct3E(Funct3E), .IntDivE(1'b0), .FIntDivResultM(FIntDivResultM),
 			     .FDivDoneE(FDivDoneE), .IFDivStartE(IFDivStartE));

From 2bf51431637e52e467c7ce9660e97aa087759e2e Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sat, 11 Nov 2023 05:58:53 -0800
Subject: [PATCH 19/62] Bug fixes related to size of fpdivsqrt bit count and
 number of cycles

---
 config/shared/config-shared.vh     | 6 ++++--
 src/fpu/fdivsqrt/fdivsqrtcycles.sv | 2 +-
 testbench/testbench-fp.sv          | 3 +++
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 61bf461eb..cc230ef3e 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -93,6 +93,7 @@ localparam NF2   = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_NF   : H_NF);
 localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
+
 // intermediate division parameters not directly used in Divider
 localparam FPDIVN      = NF+3; // length of floating-point inputs: Ns + 2 = Nf + 3 for 1 integer bit, Nf fracitonal bits, 2 extra bits to shift sqrt into [1/4, 1)]
 localparam DIVN        = ((FPDIVN<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVN; // standard length of input: max(XLEN, NF+2) ***
@@ -101,10 +102,11 @@ localparam DIVN        = ((FPDIVN<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVN; // stand
 localparam LOGR        = $clog2(RADIX);                             // r = log(R)
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
 localparam FPDUR       = (DIVN+LOGR-1)/RK + 1 ;                     // ceiling((n+r)/rk)
-localparam DURLEN      = $clog2(FPDUR+1);                           // number of bits to represent the duration
-localparam DIVb        = FPDUR*RK - 1; // canonical fdiv size (b)
+localparam DIVb        = FPDUR*RK - 1;                              // canonical fdiv size (b)
+localparam DURLEN      = $clog2(FPDUR+1);
 localparam DIVBLEN     = $clog2(DIVb+2)-1;                          // *** where is 2 coming from?
 
+
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
 localparam LLEN = (($unsigned(FLEN)<$unsigned(XLEN)) ? ($unsigned(XLEN)) : ($unsigned(FLEN)));
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index d5c571940..9d7f05fc8 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -34,7 +34,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
-  logic [P.DURLEN+1:0] Nf, FPResultBitsE; // number of fractional bits
+  logic [P.DIVBLEN:0] Nf, FPResultBitsE; // number of fractional bits
   logic [P.DIVBLEN:0]  ResultBitsE; // number of result bits;
 
   // DIVN = P.NF+3
diff --git a/testbench/testbench-fp.sv b/testbench/testbench-fp.sv
index 5ef752691..9e602cab0 100644
--- a/testbench/testbench-fp.sv
+++ b/testbench/testbench-fp.sv
@@ -148,6 +148,9 @@ module testbenchfp;
       $display("\nThe start of simulation...");      
       $display("This simulation for TEST is %s", TEST);
       $display("This simulation for TEST is of the operand size of %s", TEST_SIZE);      
+
+      $display("FPDUR %d %d DIVN %d LOGR %d RK %d RADIX %d DURLEN %d", FPDUR, DIVN, LOGR, RK, RADIX, DURLEN);
+
       if (P.Q_SUPPORTED & (TEST_SIZE == "QP" | TEST_SIZE == "all")) begin // if Quad percision is supported
 	 if (TEST === "cvtint" | TEST === "all") begin  // if testing integer conversion
             // add the 128-bit cvtint tests to the to-be-tested list

From 6ac83c776ec1e06320f1c2c65ee490b08d738fee Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sat, 11 Nov 2023 15:50:06 -0800
Subject: [PATCH 20/62] Cleaned up number of bits in fdivsqrt

---
 config/shared/config-shared.vh       | 23 +++++++++--------
 src/fpu/fdivsqrt/fdivsqrt.sv         |  2 +-
 src/fpu/fdivsqrt/fdivsqrtcycles.sv   | 11 +++-----
 src/fpu/fdivsqrt/fdivsqrtexpcalc.sv  |  2 +-
 src/fpu/fdivsqrt/fdivsqrtiter.sv     | 38 ++++++++++++++--------------
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 30 +++++++++++-----------
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 18 ++++++-------
 src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv |  6 ++---
 src/fpu/fdivsqrt/fdivsqrtstage2.sv   | 22 ++++++++--------
 src/fpu/fdivsqrt/fdivsqrtstage4.sv   | 30 +++++++++++-----------
 src/fpu/fdivsqrt/fdivsqrtuotfc2.sv   | 10 ++++----
 src/fpu/fdivsqrt/fdivsqrtuotfc4.sv   |  8 +++---
 testbench/testbench-fp.sv            | 10 ++++----
 13 files changed, 103 insertions(+), 107 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index cc230ef3e..14de5187e 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -93,19 +93,20 @@ localparam NF2   = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_NF   : H_NF);
 localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 
+// divider r and rk (bits per digit, bits per cycle)
+localparam LOGR        = $clog2(RADIX);                             // r = log(R) bits per digit
+localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
 
-// intermediate division parameters not directly used in Divider
-localparam FPDIVN      = NF+3; // length of floating-point inputs: Ns + 2 = Nf + 3 for 1 integer bit, Nf fracitonal bits, 2 extra bits to shift sqrt into [1/4, 1)]
-localparam DIVN        = ((FPDIVN<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVN; // standard length of input: max(XLEN, NF+2) ***
+// intermediate division parameters not directly used in fdivsqrt hardware
+localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit because square root could be shifted right *** explain better
+localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
+localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional
 
 // division constants
-localparam LOGR        = $clog2(RADIX);                             // r = log(R)
-localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
-localparam FPDUR       = (DIVN+LOGR-1)/RK + 1 ;                     // ceiling((n+r)/rk)
-localparam DIVb        = FPDUR*RK - 1;                              // canonical fdiv size (b)
-localparam DURLEN      = $clog2(FPDUR+1);
-localparam DIVBLEN     = $clog2(DIVb+2)-1;                          // *** where is 2 coming from?
-
+localparam FPDUR       = (RESBITS-1)/RK + 1 ;                       // ceiling((r+b)/rk)
+localparam DIVb        = FPDUR*RK - LOGR;                           // divsqrt fractional bits, so total number of bits is a multiple of rk after r integer bits
+localparam DURLEN      = $clog2(FPDUR);                             // enough bits to count the duration
+localparam DIVBLEN     = $clog2(DIVb);                              // enough bits to count number of fractional bits
 
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
@@ -113,7 +114,7 @@ localparam LLEN = (($unsigned(FLEN)<$unsigned(XLEN)) ? ($unsigned(XLEN)) : ($uns
 localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));
 localparam NORMSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVb + 1 +NF+1) > (3*NF+6) ? (DIVb + 1 +NF+1) : (3*NF+6)));
 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));
-localparam CORRSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVN+1+NF) > (3*NF+4) ? (DIVN+1+NF) : (3*NF+4)));
+localparam CORRSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVMINb+1+NF) > (3*NF+4) ? (DIVMINb+1+NF) : (3*NF+4)));
 
 
 // Disable spurious Verilator warnings
diff --git a/src/fpu/fdivsqrt/fdivsqrt.sv b/src/fpu/fdivsqrt/fdivsqrt.sv
index ac5c2c338..a4e20f229 100644
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -67,7 +67,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   // Integer div/rem signals                                
   logic                        BZeroM;                       // Denominator is zero
   logic                        IntDivM;                      // Integer operation
-  logic [P.DIVBLEN:0]          IntNormShiftM;                // Integer normalizatoin shift amount
+  logic [P.DIVBLEN-1:0]        IntNormShiftM;                // Integer normalizatoin shift amount
   logic                        ALTBM, AsM, BsM, W64M;        // Special handling for postprocessor
   logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
   logic                        ISpecialCaseE;                // Integer div/remainder special cases
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 9d7f05fc8..20fb16f62 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -30,16 +30,11 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 IntDivE,
-  input  logic [P.DIVBLEN:0]   IntResultBitsE,
+  input  logic [P.DIVBLEN-1:0] IntResultBitsE,
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
-  logic [P.DIVBLEN:0] Nf, FPResultBitsE; // number of fractional bits
-  logic [P.DIVBLEN:0]  ResultBitsE; // number of result bits;
-
-  // DIVN = P.NF+3
-  // NS = NF + 1
-  // N = NS or NS+2 for div/sqrt.
+  logic [P.DIVBLEN-1:0] Nf, FPResultBitsE, ResultBitsE; // number of fractional (result) bits
 
   /* verilator lint_off WIDTH */
   if (P.FPSIZES == 1)
@@ -75,7 +70,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
 
   always_comb begin 
-    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1 and +0 rather than +2; is it related to DIVCOPIES logic below?
+    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard; integer bit implicit
     else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
 
     if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
diff --git a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
index 113f2b2dd..a1dd82e35 100644
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@@ -31,7 +31,7 @@ module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.NE-1:0]      Xe, Ye,
   input  logic                 Sqrt,
   input  logic                 XZero, 
-  input  logic [P.DIVBLEN:0]   ell, m,
+  input  logic [P.DIVBLEN-1:0] ell, m,
   output logic [P.NE+1:0]      Ue
   );
   
diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 1d40e8d9a..0f66982ab 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -31,31 +31,31 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
   input  logic              IFDivStartE, 
   input  logic              FDivBusyE, 
   input  logic              SqrtE,
-  input  logic [P.DIVb+3:0] X, D,
-  output logic [P.DIVb:0]   FirstU, FirstUM,
-  output logic [P.DIVb+1:0] FirstC,
+  input  logic [P.DIVb+3:0] X, D,                  // Q4.DIVb
+  output logic [P.DIVb:0]   FirstU, FirstUM,       // U1.DIVb
+  output logic [P.DIVb+1:0] FirstC,                // Q2.DIVb
   output logic              Firstun,
-  output logic [P.DIVb+3:0] FirstWS, FirstWC
+  output logic [P.DIVb+3:0] FirstWS, FirstWC       // Q4.DIVb
 );
 
   /* verilator lint_off UNOPTFLAT */
-  logic [P.DIVb+3:0]      WSNext[P.DIVCOPIES-1:0]; // Q4.b
-  logic [P.DIVb+3:0]      WCNext[P.DIVCOPIES-1:0]; // Q4.b
-  logic [P.DIVb+3:0]      WS[P.DIVCOPIES:0];       // Q4.b
-  logic [P.DIVb+3:0]      WC[P.DIVCOPIES:0];       // Q4.b
-  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.b
-  logic [P.DIVb:0]        UM[P.DIVCOPIES:0];       // U1.b
-  logic [P.DIVb:0]        UNext[P.DIVCOPIES-1:0];  // U1.b
-  logic [P.DIVb:0]        UMNext[P.DIVCOPIES-1:0]; // U1.b
-  logic [P.DIVb+1:0]      C[P.DIVCOPIES:0];        // Q2.b
-  logic [P.DIVb+1:0]      initC;                   // Q2.b
+  logic [P.DIVb+3:0]      WSNext[P.DIVCOPIES-1:0]; // Q4.DIVb
+  logic [P.DIVb+3:0]      WCNext[P.DIVCOPIES-1:0]; // Q4.DIVb
+  logic [P.DIVb+3:0]      WS[P.DIVCOPIES:0];       // Q4.DIVb
+  logic [P.DIVb+3:0]      WC[P.DIVCOPIES:0];       // Q4.DIVb
+  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb
+  logic [P.DIVb:0]        UM[P.DIVCOPIES:0];       // U1.DIVb
+  logic [P.DIVb:0]        UNext[P.DIVCOPIES-1:0];  // U1.DIVb
+  logic [P.DIVb:0]        UMNext[P.DIVCOPIES-1:0]; // U1.DIVb
+  logic [P.DIVb+1:0]      C[P.DIVCOPIES:0];        // Q2.DIVb
+  logic [P.DIVb+1:0]      initC;                   // Q2.DIVb
   logic [P.DIVCOPIES-1:0] un; 
 
-  logic [P.DIVb+3:0]      WSN, WCN;                // Q4.b
-  logic [P.DIVb+3:0]      DBar, D2, DBar2;         // Q4.b
-  logic [P.DIVb+1:0]      NextC;
-  logic [P.DIVb:0]        UMux, UMMux;
-  logic [P.DIVb:0]        initU, initUM;
+  logic [P.DIVb+3:0]      WSN, WCN;                // Q4.DIVb
+  logic [P.DIVb+3:0]      DBar, D2, DBar2;         // Q4.DIVb
+  logic [P.DIVb+1:0]      NextC;                   // Q2.DIVb
+  logic [P.DIVb:0]        UMux, UMMux;             // U1.DIVb
+  logic [P.DIVb:0]        initU, initUM;           // U1.DIVb
   /* verilator lint_on UNOPTFLAT */
 
   // Top Muxes and Registers
diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index e9fd2fd2c..cb1f56db7 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -27,21 +27,21 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
-  input  logic               clk, reset,
-  input  logic               StallM,
-  input  logic [P.DIVb+3:0]  WS, WC,
-  input  logic [P.DIVb+3:0]  D, 
-  input  logic [P.DIVb:0]    FirstU, FirstUM, 
-  input  logic [P.DIVb+1:0]  FirstC,
-  input  logic               SqrtE,
-  input  logic               Firstun, SqrtM, SpecialCaseM, 
-  input  logic [P.XLEN-1:0]  AM,
-  input  logic               RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
-  input  logic [P.DIVBLEN:0] IntNormShiftM,
-  output logic [P.DIVb:0]    UmM,               // result significand
-  output logic               WZeroE,
-  output logic               DivStickyM,
-  output logic [P.XLEN-1:0]  FIntDivResultM
+  input  logic                 clk, reset,
+  input  logic                 StallM,
+  input  logic [P.DIVb+3:0]    WS, WC,            // Q4.DIVb
+  input  logic [P.DIVb+3:0]    D,                 // Q4.DIVb
+  input  logic [P.DIVb:0]      FirstU, FirstUM,   // U1.DIVb
+  input  logic [P.DIVb+1:0]    FirstC,            // Q2.DIVb
+  input  logic                 SqrtE,
+  input  logic                 Firstun, SqrtM, SpecialCaseM, 
+  input  logic [P.XLEN-1:0]    AM,                // U/Q(XLEN.0)
+  input  logic                 RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
+  input  logic [P.DIVBLEN-1:0] IntNormShiftM,     
+  output logic [P.DIVb:0]      UmM,               // U1.DIVb result significand
+  output logic                 WZeroE,
+  output logic                 DivStickyM,
+  output logic [P.XLEN-1:0]    FIntDivResultM     // U/Q(XLEN.0)
 );
   
   logic [P.DIVb+3:0]         W, Sum;
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 97ceeb085..8d6e565b1 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -42,7 +42,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 IntDivE, W64E,
   output logic                 ISpecialCaseE,
   output logic [P.DURLEN-1:0]  CyclesE,
-  output logic [P.DIVBLEN:0]   IntNormShiftM,
+  output logic [P.DIVBLEN-1:0] IntNormShiftM,
   output logic                 ALTBM, IntDivM, W64M,
   output logic                 AsM, BsM, BZeroM,
   output logic [P.XLEN-1:0]    AM
@@ -53,8 +53,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
-  logic [P.DIVBLEN:0]          mE, ell;                             // Leading zeros of inputs
-  logic [P.DIVBLEN:0]          IntResultBitsE;                      // bits in integer result
+  logic [P.DIVBLEN-1:0]        mE, ell;                             // Leading zeros of inputs
+  logic [P.DIVBLEN-1:0]        IntResultBitsE;                      // bits in integer result
   logic                        NumerZeroE;                          // Numerator is zero (X or A)
   logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
   logic                        SignedDivE;                          // signed division
@@ -118,12 +118,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////////////////////////////////
 
   if (P.IDIV_ON_FPU) begin:intrightshift // Int Supported
-    logic [P.DIVBLEN:0] ZeroDiff, p;
+    logic [P.DIVBLEN-1:0] ZeroDiff, p;
 
     // calculate number of fractional bits p
     assign ZeroDiff = mE - ell;         // Difference in number of leading zeros
-    assign ALTBE = ZeroDiff[P.DIVBLEN];  // A less than B (A has more leading zeros)
-    mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);          
+    assign ALTBE = ZeroDiff[P.DIVBLEN-1];  // A less than B (A has more leading zeros)
+    mux2 #(P.DIVBLEN) pmux(ZeroDiff, '0, ALTBE, p);          
 
     /* verilator lint_off WIDTH */
     assign IntResultBitsE = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
@@ -192,7 +192,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
-    logic [P.DIVBLEN:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
+    logic [P.DIVBLEN-1:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
     logic               RemOpE;
 
     /* verilator lint_off WIDTH */
@@ -200,7 +200,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     assign IntRemNormShiftE = mE + (P.DIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
     /* verilator lint_on WIDTH */
     assign RemOpE = Funct3E[1];
-    mux2 #(P.DIVBLEN+1) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);
+    mux2 #(P.DIVBLEN) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);
 
     // pipeline registers
     flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
@@ -208,7 +208,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
     flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
     flopen #(1)        bsignreg(clk, IFDivStartE, BsE,      BsM);
-    flopen #(P.DIVBLEN+1) nsreg(clk, IFDivStartE, IntNormShiftE, IntNormShiftM); 
+    flopen #(P.DIVBLEN)   nsreg(clk, IFDivStartE, IntNormShiftE, IntNormShiftM); 
     flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
     if (P.XLEN==64) 
       flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv
index 0eb3b71c0..fe436413e 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv
@@ -27,9 +27,9 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fdivsqrtqsel4cmp (
-  input  logic [2:0] Dmsbs,
-  input  logic [4:0] Smsbs,
-  input  logic [7:0] WSmsbs, WCmsbs,
+  input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
+  input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
+  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4
   input  logic       SqrtE, j1,
   output logic [3:0] udigit
 );
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
index bb8d87234..5e319a7c1 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@@ -29,23 +29,23 @@
 
 /* verilator lint_off UNOPTFLAT */
 module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.DIVb+3:0] D, DBar, 
-  input  logic [P.DIVb:0]   U, UM,
-  input  logic [P.DIVb+3:0] WS, WC,
-  input  logic [P.DIVb+1:0] C,
+  input  logic [P.DIVb+3:0] D, DBar,        // Q4.DIVb
+  input  logic [P.DIVb:0]   U, UM,          // U1.DIVb
+  input  logic [P.DIVb+3:0] WS, WC,         // Q4.DIVb
+  input  logic [P.DIVb+1:0] C,              // Q2.DIVb
   input  logic             SqrtE,
   output logic             un,
-  output logic [P.DIVb+1:0] CNext,
-  output logic [P.DIVb:0]   UNext, UMNext, 
-  output logic [P.DIVb+3:0] WSNext, WCNext
+  output logic [P.DIVb+1:0] CNext,          // Q2.DIVb
+  output logic [P.DIVb:0]   UNext, UMNext,  // U1.DIVb
+  output logic [P.DIVb+3:0] WSNext, WCNext  // Q4.DIVb
 );
  /* verilator lint_on UNOPTFLAT */
 
-  logic [P.DIVb+3:0]        Dsel;
+  logic [P.DIVb+3:0]        Dsel;     // Q4.DIVb
   logic                    up, uz;
-  logic [P.DIVb+3:0]        F;
-  logic [P.DIVb+3:0]        AddIn;
-  logic [P.DIVb+3:0]        WSA, WCA;
+  logic [P.DIVb+3:0]        F;        // Q4.DIVb
+  logic [P.DIVb+3:0]        AddIn;    // Q4.DIVb
+  logic [P.DIVb+3:0]        WSA, WCA; // Q4.DIVb
 
   // Qmient Selection logic
   // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un)
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index c6477ec68..fea2851b6 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -27,26 +27,26 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.DIVb+3:0] D, DBar, D2, DBar2,
-  input  logic [P.DIVb:0]   U,UM,
-  input  logic [P.DIVb+3:0] WS, WC,
-  input  logic [P.DIVb+1:0] C,
+  input  logic [P.DIVb+3:0] D, DBar, D2, DBar2, // Q4.DIVb
+  input  logic [P.DIVb:0]   U,UM,               // U1.DIVb
+  input  logic [P.DIVb+3:0] WS, WC,             // Q4.DIVb
+  input  logic [P.DIVb+1:0] C,                  // Q2.DIVb
   input  logic             SqrtE, j1,
-  output logic [P.DIVb+1:0] CNext,
+  output logic [P.DIVb+1:0] CNext,              // Q2.DIVb
   output logic             un,
-  output logic [P.DIVb:0]   UNext, UMNext, 
-  output logic [P.DIVb+3:0] WSNext, WCNext
+  output logic [P.DIVb:0]   UNext, UMNext,      // U1.DIVb
+  output logic [P.DIVb+3:0] WSNext, WCNext      // Q4.DIVb
 );
 
-  logic [P.DIVb+3:0]        Dsel;
+  logic [P.DIVb+3:0]        Dsel;               // Q4.DIVb
   logic [3:0]              udigit;
-  logic [P.DIVb+3:0]        F;
-  logic [P.DIVb+3:0]        AddIn;
+  logic [P.DIVb+3:0]        F;                  // Q4.DIVb
+  logic [P.DIVb+3:0]        AddIn;              // Q4.DIVb
   logic [4:0]              Smsbs;
   logic [2:0]              Dmsbs;
   logic [7:0]              WCmsbs, WSmsbs;
   logic                    CarryIn;
-  logic [P.DIVb+3:0]        WSA, WCA;
+  logic [P.DIVb+3:0]        WSA, WCA;           // Q4.DIVb
 
   // Digit Selection logic
   // u encoding:
@@ -55,10 +55,10 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
   // 0000 =  0
   // 0010 = -1
   // 0001 = -2
-  assign Smsbs  = U[P.DIVb:P.DIVb-4];
-  assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];
-  assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];
-  assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];
+  assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
+  assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
+  assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
+  assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
 
   fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
   assign un = 1'b0; // unused for radix 4
diff --git a/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv b/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
index bde28cfba..c895fa2ce 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
@@ -31,15 +31,15 @@
 ///////////////////////////////
 module fdivsqrtuotfc2 import cvw::*;  #(parameter cvw_t P) (
   input  logic             up, un,
-  input  logic [P.DIVb+1:0] C,
-  input  logic [P.DIVb:0]   U, UM,
-  output logic [P.DIVb:0]   UNext, UMNext
+  input  logic [P.DIVb+1:0] C,                // Q2.DIVb
+  input  logic [P.DIVb:0]   U, UM,            // U1.DIVb
+  output logic [P.DIVb:0]   UNext, UMNext     // U1.DIVb
 );
   //  The on-the-fly converter transfers the divsqrt
   //  bits to the quotient as they come.
-  logic [P.DIVb:0] K;
+  logic [P.DIVb:0] K;                         // U1.DIVb one-hot 
 
-  assign K = (C[P.DIVb:0] & ~(C[P.DIVb:0] << 1)); // Thermometer to one hot encoding
+  assign K = (C[P.DIVb:0] & ~(C[P.DIVb:0] << 1)); // Thermometer to one hot encoding  
 
   always_comb begin
     if (up) begin
diff --git a/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv b/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
index 403ccf051..b12b9174b 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
@@ -28,15 +28,15 @@
 
 module fdivsqrtuotfc4 import cvw::*;  #(parameter cvw_t P) (
   input  logic [3:0]     udigit,
-  input  logic [P.DIVb:0] U, UM,
-  input  logic [P.DIVb:0] C,
-  output logic [P.DIVb:0] UNext, UMNext
+  input  logic [P.DIVb:0] U, UM,          // U1.DIVb
+  input  logic [P.DIVb:0] C,              // Q1.DIVb
+  output logic [P.DIVb:0] UNext, UMNext   // U1.DIVb
 );
   //  The on-the-fly converter transfers the square root 
   //  bits to the quotient as they come.
   //  Use this otfc for division and square root.
 
-  logic [P.DIVb:0] K1, K2, K3;       
+  logic [P.DIVb:0] K1, K2, K3;            // U1.DIVb
   assign K1 = (C&~(C << 1));        // K
   assign K2 = ((C << 1)&~(C << 2)); // 2K
   assign K3 = (C & ~(C << 2));      // 3K
diff --git a/testbench/testbench-fp.sv b/testbench/testbench-fp.sv
index 9e602cab0..662036439 100644
--- a/testbench/testbench-fp.sv
+++ b/testbench/testbench-fp.sv
@@ -145,11 +145,11 @@ module testbenchfp;
    
    initial begin
       // Information displayed for user on what is simulating
-      $display("\nThe start of simulation...");      
-      $display("This simulation for TEST is %s", TEST);
-      $display("This simulation for TEST is of the operand size of %s", TEST_SIZE);      
+      //$display("\nThe start of simulation...");      
+      //$display("This simulation for TEST is %s", TEST);
+      //$display("This simulation for TEST is of the operand size of %s", TEST_SIZE);      
 
-      $display("FPDUR %d %d DIVN %d LOGR %d RK %d RADIX %d DURLEN %d", FPDUR, DIVN, LOGR, RK, RADIX, DURLEN);
+      // $display("FPDUR %d %d DIVN %d LOGR %d RK %d RADIX %d DURLEN %d", FPDUR, DIVN, LOGR, RK, RADIX, DURLEN);
 
       if (P.Q_SUPPORTED & (TEST_SIZE == "QP" | TEST_SIZE == "all")) begin // if Quad percision is supported
 	 if (TEST === "cvtint" | TEST === "all") begin  // if testing integer conversion
@@ -652,7 +652,7 @@ module testbenchfp;
       string tt0;
       tt0 = $psprintf("%s", Tests[TestNum]);
       testname = {pp, tt0};
-      $display("Here you are %s", testname);     
+      //$display("Here you are %s", testname);     
       $display("\n\nRunning %s vectors ", Tests[TestNum]);
       $readmemh(testname, TestVectors);
       // set the test index to 0

From 002034845a685c1dcb3668538658c5ed52978597 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 06:15:47 -0800
Subject: [PATCH 21/62] fdivsqrt comment improvements

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv  |  2 +-
 src/fpu/fdivsqrt/fdivsqrtexpcalc.sv |  8 +++++---
 src/fpu/fdivsqrt/fdivsqrtfgen2.sv   |  8 ++++----
 src/fpu/fdivsqrt/fdivsqrtfgen4.sv   | 12 ++++++------
 src/fpu/fdivsqrt/fdivsqrtfsm.sv     |  2 +-
 src/fpu/fdivsqrt/fdivsqrtiter.sv    |  8 ++++----
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 11 ++++++-----
 src/fpu/fdivsqrt/fdivsqrtqsel2.sv   | 25 ++++++++-----------------
 src/fpu/fdivsqrt/fdivsqrtstage2.sv  | 16 +++++-----------
 9 files changed, 40 insertions(+), 52 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 20fb16f62..6043ebb4a 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -30,7 +30,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 IntDivE,
-  input  logic [P.DIVBLEN-1:0] IntResultBitsE,
+  input  logic [P.DIVBLEN-1:0] IntResultBitsE,    
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
index a1dd82e35..cf243a84b 100644
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@@ -28,17 +28,19 @@
 
 module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] Fmt,
-  input  logic [P.NE-1:0]      Xe, Ye,
+  input  logic [P.NE-1:0]      Xe, Ye,    // input exponents
   input  logic                 Sqrt,
   input  logic                 XZero, 
-  input  logic [P.DIVBLEN-1:0] ell, m,
-  output logic [P.NE+1:0]      Ue
+  input  logic [P.DIVBLEN-1:0] ell, m,    // number of leading 0s in the X and Y operands (presumably significands, not exponents Xe/Ye -- confirm)
+  output logic [P.NE+1:0]      Ue         // result exponent
   );
   
   logic [P.NE-2:0] Bias;
   logic [P.NE+1:0] SXExp;
   logic [P.NE+1:0] SExp;
   logic [P.NE+1:0] DExp;
+
+  // Determine exponent bias according to the format
   
   if (P.FPSIZES == 1) begin
     assign Bias = (P.NE-1)'(P.BIAS); 
diff --git a/src/fpu/fdivsqrt/fdivsqrtfgen2.sv b/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
index 990e3f19f..cf398f570 100644
--- a/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
@@ -28,12 +28,12 @@
 
 module fdivsqrtfgen2 import cvw::*;  #(parameter cvw_t P) (
   input  logic              up, uz,
-  input  logic [P.DIVb+3:0] C, U, UM,
-  output logic [P.DIVb+3:0] F
+  input  logic [P.DIVb+3:0] C, U, UM,   // Q4.DIVb (extended from shorter forms)
+  output logic [P.DIVb+3:0] F           // Q4.DIVb
 );
-  logic [P.DIVb+3:0]        FP, FN, FZ;
+  logic [P.DIVb+3:0]        FP, FN, FZ;  // Q4.DIVb
 
-  // Generate for both positive and negative bits
+  // Generate for both positive and negative quotient digits
   assign FP = ~(U << 1) & C;
   assign FN = (UM << 1) | (C & ~(C << 2));
   assign FZ = '0;
diff --git a/src/fpu/fdivsqrt/fdivsqrtfgen4.sv b/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
index fc648f5bd..e2cec1ab4 100644
--- a/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
@@ -27,14 +27,14 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fdivsqrtfgen4 import cvw::*;  #(parameter cvw_t P) (
-  input  logic [3:0]        udigit,
-  input  logic [P.DIVb+3:0] C, U, UM,
-  output logic [P.DIVb+3:0] F
+  input  logic [3:0]        udigit,           // {2, 1, -1, -2}; all cold for zero
+  input  logic [P.DIVb+3:0] C, U, UM,         // Q4.DIVb (extended from shorter forms)
+  output logic [P.DIVb+3:0] F                 // Q4.DIVb
 );
-  logic [P.DIVb+3:0]        F2, F1, F0, FN1, FN2;
+  logic [P.DIVb+3:0]        F2, F1, F0, FN1, FN2; // Q4.DIVb
   
-  // Generate for both positive and negative bits
-  assign F2  = (~U << 2) & (C << 2);
+  // Generate for both positive and negative digits
+  assign F2  = (~U << 2) & (C << 2);              // 
   assign F1  = ~(U << 1) & C;
   assign F0  = '0;
   assign FN1 = (UM << 1) | (C & ~(C << 3));
diff --git a/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index 0e2cba90e..862d53b25 100644
--- a/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -57,7 +57,7 @@ module fdivsqrtfsm import cvw::*;  #(parameter cvw_t P) (
   // terminate immediately on special cases
   assign FSpecialCaseE = XZeroE | XInfE  | XNaNE |  (XsE&SqrtE) | (YZeroE | YInfE | YNaNE)&~SqrtE;
   if (P.IDIV_ON_FPU) assign SpecialCaseE = IntDivE ? ISpecialCaseE : FSpecialCaseE;
-  else              assign SpecialCaseE = FSpecialCaseE;
+  else               assign SpecialCaseE = FSpecialCaseE;
   flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
 
   always_ff @(posedge clk) begin
diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 0f66982ab..863d94837 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -104,14 +104,14 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
     for(i=0; $unsigned(i)<P.DIVCOPIES; i++) begin : iterations
       if (P.RADIX == 2) begin: stage
         fdivsqrtstage2 #(P) fdivsqrtstage(.D, .DBar, .SqrtE,
-        .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
-        .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
+          .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
+          .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end else begin: stage
         logic j1;
         assign j1 = (i == 0 & ~C[0][P.DIVb-1]);
         fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1,
-        .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
-        .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
+          .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
+          .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end
       assign WS[i+1] = WSNext[i];
       assign WC[i+1] = WCNext[i];
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 8d6e565b1..c65f26fd8 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -29,17 +29,18 @@
 module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic                 clk,
   input  logic                 IFDivStartE, 
-  input  logic [P.NF:0]        Xm, Ym,
-  input  logic [P.NE-1:0]      Xe, Ye,
+  input  logic [P.NF:0]        Xm, Ym,      // Floating-point significands
+  input  logic [P.NE-1:0]      Xe, Ye,      // Floating-point exponents
   input  logic [P.FMTBITS-1:0] FmtE,
   input  logic                 SqrtE,
   input  logic                 XZeroE,
   input  logic [2:0]           Funct3E,
-  output logic [P.NE+1:0]      UeM,
-  output logic [P.DIVb+3:0]    X, D,
+  output logic [P.NE+1:0]      UeM,         // biased exponent of result
+  output logic [P.DIVb+3:0]    X, D,        // Q4.DIVb
   // Int-specific
-  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // U(XLEN.0) inputs from IEU 
   input  logic                 IntDivE, W64E,
+  // Outputs
   output logic                 ISpecialCaseE,
   output logic [P.DURLEN-1:0]  CyclesE,
   output logic [P.DIVBLEN-1:0] IntNormShiftM,
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel2.sv b/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
index fe32924e1..de64bafc9 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
@@ -18,7 +18,7 @@
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
-// https://solderpad.org/licenses/SHL-2.1/
+// https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
@@ -27,27 +27,18 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fdivsqrtqsel2 ( 
-  input  logic [3:0] ps, pc, 
+  input  logic [3:0] WS, WC, 
   output logic       up, uz, un
 );
  
-  logic [3:0]  p, g;
   logic        magnitude, sign;
  
-  // The quotient selection logic is presented for simplicity, not
-  // for efficiency.  You can probably optimize your logic to
-  // select the proper divisor with less delay.
-
-  // Quotient equations from EE371 lecture notes 13-20
-  assign p = ps ^ pc;
-  assign g = ps & pc;
-
-  assign magnitude = ~((ps[2]^pc[2]) & (ps[1]^pc[1]) & 
-        (ps[0]^pc[0]));
-  assign sign = (ps[3]^pc[3])^
-      (ps[2] & pc[2] | ((ps[2]^pc[2]) &
-          (ps[1]&pc[1] | ((ps[1]^pc[1]) &
-            (ps[0]&pc[0])))));
+  assign magnitude = ~((WS[2]^WC[2]) & (WS[1]^WC[1]) & 
+        (WS[0]^WC[0]));
+  assign sign = (WS[3]^WC[3])^
+      (WS[2] & WC[2] | ((WS[2]^WC[2]) &
+          (WS[1]&WC[1] | ((WS[1]^WC[1]) &
+            (WS[0]&WC[0])))));
 
   // Produce digit = +1, 0, or -1
   assign up = magnitude & ~sign;
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
index 5e319a7c1..ad0c828e9 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@@ -33,8 +33,8 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.DIVb:0]   U, UM,          // U1.DIVb
   input  logic [P.DIVb+3:0] WS, WC,         // Q4.DIVb
   input  logic [P.DIVb+1:0] C,              // Q2.DIVb
-  input  logic             SqrtE,
-  output logic             un,
+  input  logic              SqrtE,
+  output logic              un,
   output logic [P.DIVb+1:0] CNext,          // Q2.DIVb
   output logic [P.DIVb:0]   UNext, UMNext,  // U1.DIVb
   output logic [P.DIVb+3:0] WSNext, WCNext  // Q4.DIVb
@@ -42,19 +42,13 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
  /* verilator lint_on UNOPTFLAT */
 
   logic [P.DIVb+3:0]        Dsel;     // Q4.DIVb
-  logic                    up, uz;
+  logic                     up, uz;
   logic [P.DIVb+3:0]        F;        // Q4.DIVb
   logic [P.DIVb+3:0]        AddIn;    // Q4.DIVb
   logic [P.DIVb+3:0]        WSA, WCA; // Q4.DIVb
 
-  // Qmient Selection logic
+  // Quotient Selection logic
   // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un)
-  // q encoding:
-  // 1000 = +2
-  // 0100 = +1
-  // 0000 =  0
-  // 0010 = -1
-  // 0001 = -2
   fdivsqrtqsel2 qsel2(WS[P.DIVb+3:P.DIVb], WC[P.DIVb+3:P.DIVb], up, uz, un);
 
   // Sqrt F generation.  Extend C, U, UM to Q4.k
@@ -66,7 +60,7 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
     else if (uz) Dsel = '0;
     else         Dsel = D; // un
 
-  // Partial Product Generation
+  // Residual Generation
   //  WSA, WCA = WS + WC - qD
   mux2 #(P.DIVb+4) addinmux(Dsel, F, SqrtE, AddIn);
   csa #(P.DIVb+4) csa(WS, WC, AddIn, up&~SqrtE, WSA, WCA);

From 7c50b2c571ba5cc2b9c8ac7a5ec3255d115d5ba7 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 06:36:57 -0800
Subject: [PATCH 22/62] Renamed qsel to uslc and simplified radix2 uslc

---
 src/fpu/fdivsqrt/fdivsqrtstage2.sv            |  4 +--
 src/fpu/fdivsqrt/fdivsqrtstage4.sv            | 23 +++++--------
 .../{fdivsqrtqsel2.sv => fdivsqrtuslc2.sv}    | 24 +++++++------
 .../{fdivsqrtqsel4.sv => fdivsqrtuslc4.sv}    | 34 +++++++++----------
 ...divsqrtqsel4cmp.sv => fdivsqrtuslc4cmp.sv} | 10 +++---
 5 files changed, 46 insertions(+), 49 deletions(-)
 rename src/fpu/fdivsqrt/{fdivsqrtqsel2.sv => fdivsqrtuslc2.sv} (69%)
 rename src/fpu/fdivsqrt/{fdivsqrtqsel4.sv => fdivsqrtuslc4.sv} (72%)
 rename src/fpu/fdivsqrt/{fdivsqrtqsel4cmp.sv => fdivsqrtuslc4cmp.sv} (90%)

diff --git a/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
index ad0c828e9..40a2a5a01 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@@ -49,7 +49,7 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
 
   // Quotient Selection logic
   // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un)
-  fdivsqrtqsel2 qsel2(WS[P.DIVb+3:P.DIVb], WC[P.DIVb+3:P.DIVb], up, uz, un);
+  fdivsqrtuslc2 uslc2(.WS(WS[P.DIVb+3:P.DIVb]), .WC(WC[P.DIVb+3:P.DIVb]), .up, .uz, .un);
 
   // Sqrt F generation.  Extend C, U, UM to Q4.k
   fdivsqrtfgen2 #(P) fgen2(.up, .uz, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F);
@@ -60,7 +60,7 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
     else if (uz) Dsel = '0;
     else         Dsel = D; // un
 
-  // Residual Generation
+  // Residual Update
   //  WSA, WCA = WS + WC - qD
   mux2 #(P.DIVb+4) addinmux(Dsel, F, SqrtE, AddIn);
   csa #(P.DIVb+4) csa(WS, WC, AddIn, up&~SqrtE, WSA, WCA);
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index fea2851b6..a24c1155f 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -31,36 +31,29 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.DIVb:0]   U,UM,               // U1.DIVb
   input  logic [P.DIVb+3:0] WS, WC,             // Q4.DIVb
   input  logic [P.DIVb+1:0] C,                  // Q2.DIVb
-  input  logic             SqrtE, j1,
+  input  logic              SqrtE, j1,
   output logic [P.DIVb+1:0] CNext,              // Q2.DIVb
-  output logic             un,
+  output logic              un,
   output logic [P.DIVb:0]   UNext, UMNext,      // U1.DIVb
   output logic [P.DIVb+3:0] WSNext, WCNext      // Q4.DIVb
 );
 
   logic [P.DIVb+3:0]        Dsel;               // Q4.DIVb
-  logic [3:0]              udigit;
+  logic [3:0]               udigit;             // {+2, +1, -1, -2} or 0000 for 0
   logic [P.DIVb+3:0]        F;                  // Q4.DIVb
   logic [P.DIVb+3:0]        AddIn;              // Q4.DIVb
-  logic [4:0]              Smsbs;
-  logic [2:0]              Dmsbs;
-  logic [7:0]              WCmsbs, WSmsbs;
-  logic                    CarryIn;
+  logic [4:0]               Smsbs;              // U1.4
+  logic [2:0]               Dmsbs;              // U0.3   drop leading 1 from D
+  logic [7:0]               WCmsbs, WSmsbs;     // Q4.4
+  logic                     CarryIn;
   logic [P.DIVb+3:0]        WSA, WCA;           // Q4.DIVb
 
   // Digit Selection logic
-  // u encoding:
-  // 1000 = +2
-  // 0100 = +1
-  // 0000 =  0
-  // 0010 = -1
-  // 0001 = -2
   assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
   assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
   assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
   assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
-
-  fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
+  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
   assign un = 1'b0; // unused for radix 4
 
   // F generation logic
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel2.sv b/src/fpu/fdivsqrt/fdivsqrtuslc2.sv
similarity index 69%
rename from src/fpu/fdivsqrt/fdivsqrtqsel2.sv
rename to src/fpu/fdivsqrt/fdivsqrtuslc2.sv
index de64bafc9..e4fcfeadf 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc2.sv
@@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel2.sv
+// fdivsqrtuslc2.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Radix 2 Quotient Digit Selection
+// Purpose: Radix 2 Unified Quotient/Square Root Digit Selection
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@@ -26,22 +26,26 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module fdivsqrtqsel2 ( 
-  input  logic [3:0] WS, WC, 
-  output logic       up, uz, un
+module fdivsqrtuslc2 ( 
+  input  logic [3:0] WS, WC,      // Q4.0 most significant bits of redundant residual
+  output logic       up, uz, un   // {+1, 0, -1}
 );
  
-  logic        magnitude, sign;
+  logic        sign;
+
+  // Carry chain logic determines if W = WS + WC = -1, < -1, > -1 to choose 0, -1, 1 respectively
  
-  assign magnitude = ~((WS[2]^WC[2]) & (WS[1]^WC[1]) & 
+  //if p2 * p1 * p0, W = -1 and choose digit of 0
+  assign uz = ((WS[2]^WC[2]) & (WS[1]^WC[1]) & 
         (WS[0]^WC[0]));
+
+  // Otherwise determine sign using carry chain: sign = p3 ^ g_2:0
   assign sign = (WS[3]^WC[3])^
       (WS[2] & WC[2] | ((WS[2]^WC[2]) &
           (WS[1]&WC[1] | ((WS[1]^WC[1]) &
             (WS[0]&WC[0])))));
 
   // Produce digit = +1, 0, or -1
-  assign up = magnitude & ~sign;
-  assign uz = ~magnitude;
-  assign un = magnitude & sign;
+  assign up = ~uz & ~sign;
+  assign un = ~uz & sign;
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel4.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
similarity index 72%
rename from src/fpu/fdivsqrt/fdivsqrtqsel4.sv
rename to src/fpu/fdivsqrt/fdivsqrtuslc4.sv
index de520bef2..268ca9ea2 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
@@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel4.sv
+// fdivsqrtuslc4.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Radix 4 Quotient Digit Selection
+// Purpose: Table-based Radix 4 Unified Quotient/Square Root Digit Selection
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@@ -26,25 +26,25 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module fdivsqrtqsel4 (
-  input  logic [2:0] Dmsbs,
-  input  logic [4:0] Smsbs,
-  input  logic [7:0] WSmsbs, WCmsbs,
+module fdivsqrtuslc4 (
+  input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
+  input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
+  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 redundant residual most significant bits
   input  logic       Sqrt, j1,
-  output logic [3:0] udigit
+  output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
-  logic [6:0] Wmsbs;
-  logic [7:0] PreWmsbs;
-  logic [2:0] A;
+  logic [7:0] PreWmsbs;                 // Q4.4 nonredundant residual msbs
+  logic [6:0] Wmsbs;                    // Q4.3 truncated nonredundant residual
+  logic [2:0] A;                        // U0.3 upper bits of D or Smsbs, discarding integer bit
 
-  assign PreWmsbs = WCmsbs + WSmsbs;
-  assign Wmsbs = PreWmsbs[7:1];
+  assign PreWmsbs = WCmsbs + WSmsbs;    // add redundant residual to find msbs
+  assign Wmsbs = PreWmsbs[7:1];         // truncate least significant bit to Q4.3 to index table
   // D = 0001.xxx...
   // Dmsbs = |   |
   // W =      xxxx.xxx...
   // Wmsbs = |        |
 
-  logic [3:0] USel4[1023:0];
+  logic [3:0] USel4[1023:0];            // 1024-entry table of 4-bit digits indexed with 3 bits of A and 7 bits of Wmsbs
 
   // Prepopulate selection table; this is constant at compile time
   always_comb begin 
@@ -101,10 +101,10 @@ module fdivsqrtqsel4 (
   // Select A
   always_comb
     if (Sqrt) begin 
-      if (j1) A = 3'b101;
-      else if (Smsbs == 5'b10000) A = 3'b111;
-      else A = Smsbs[2:0];
-    end else A = Dmsbs;
+      if (j1) A = 3'b101;                       // on first sqrt iteration        A = .101
+      else if (Smsbs == 5'b10000) A = 3'b111;   // if S = 1.0, use                A = .111
+      else A = Smsbs[2:0];                      // otherwise use                  A = S (in U0.3 format)
+    end else A = Dmsbs;                         // for division (not sqrt)        A = D (in U0.3 format, dropping leading 1)
 
   // Select quotient digit from lookup table based on A and W
   assign udigit = USel4[{A,Wmsbs}];
diff --git a/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
similarity index 90%
rename from src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv
rename to src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
index fe436413e..ccb5e618a 100644
--- a/src/fpu/fdivsqrt/fdivsqrtqsel4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel4cmp.sv
+// fdivsqrtuslc4cmp.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Comparator-based Radix 4 Quotient Digit Selection
+// Purpose: Comparator-based Radix 4 Unified Quotient/Square Root Digit Selection 
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@@ -26,12 +26,12 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module fdivsqrtqsel4cmp (
+module fdivsqrtuslc4cmp (
   input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
   input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
-  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4
+  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 residual most significant bits
   input  logic       SqrtE, j1,
-  output logic [3:0] udigit
+  output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
   logic [6:0] Wmsbs;
   logic [7:0] PreWmsbs;

From f437336540ddcd084fd40c395eb6c1f51af130c6 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 10:05:54 -0800
Subject: [PATCH 23/62] Explained sqrt preshifting

---
 config/shared/config-shared.vh      |  1 +
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 41 ++++++++++++++++++++---------
 2 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 14de5187e..9635d706b 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -99,6 +99,7 @@ localparam RK          = LOGR*DIVCOPIES;                            // r*k bits
 
 // intermediate division parameters not directly used in fdivsqrt hardware
 localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit because square root could be shifted right *** explain better
+//localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right
 localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
 localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index c65f26fd8..ecdf10f8b 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -50,7 +50,6 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 );
 
   logic [P.DIVb:0]             Xnorm, Dnorm;
-  logic [P.DIVb:0]             PreSqrtX;
   logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
   logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
   logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
@@ -61,7 +60,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   logic                        SignedDivE;                          // signed division
   logic                        AsE, BsE;                            // Signs of integer inputs
   logic [P.XLEN-1:0]           AE;                                  // input A after W64 adjustment
-  logic  ALTBE;
+  logic                        ALTBE;
+  logic                        EvenExp;
 
   //////////////////////////////////////////////////////
   // Integer Preprocessing
@@ -153,9 +153,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   // shift square root to be in range [1/4, 1)
   // Normalized numbers are shifted right by 1 if the exponent is odd
   // Subnormal numbers have Xe = 0 and an unbiased exponent of 1-BIAS.  They are shifted right if the number of leading zeros is odd.
-  // NOTE: there might be a discrepancy that X is never right shifted by 2.  However
-  //  it comes out in the wash and gives the right answer.  Investigate later if possible. ***
-  //////////////////////////////////////////////////////
+   //////////////////////////////////////////////////////
 
   assign DivX = {3'b000, Xnorm}; // Zero-extend numerator for division
 
@@ -165,13 +163,32 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   // Next X is shifted right by 1 or 2 bits to range [1/4, 1) and exponent will be adjusted accordingly to be even
   // Now (X-1) is negative.  Formed by placing all 1s in all four integer bits (in Q4.b) form, keeping X in fraciton bits
   // Then multiply by R is left shift by r (1 or 2 for radix 2 or 4)
-  // For Radix 2, this gives 3 leading 1s, followed by the fraction bits
-  // For Radix 4, this gives 2 leading 1s, followed by the fraction bits (and a zero in the lsb)
-  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
-  if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};
-  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
-  mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);
-  
+  // This is optimized in hardware by first right shifting by 0 or 1 bit (instead of 1 or 2), then left shifting by (r-1), then subtracting 2 or 4
+  // Subtracting 2 is equivalent to adding 1110.  Subtracting 4 is equivalent to adding 1100.  Prepend leading 1s to do a free subtraction.
+  // This also means only one extra fractional bit is needed becaue we never shift right by more than 1.
+  // Radix      Exponent odd          Exponent Even
+  // 2          x-2 = 2(x/2 - 1)      x/2 - 2 = 2(x/4 - 1)
+  // 4          2x-4 = 4(x/2 - 1))    x-4 = 4(x/4 - 1)
+  // Summary: PreSqrtX = r(x/2or4 - 1)
+
+  assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
+/*  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
+  if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
+  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) */
+
+  if (P.RADIX == 2) begin
+    logic [P.DIVb:0] PreSqrtX;    // U1.DIVb
+    mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
+    assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
+  end else begin
+    logic [P.DIVb+1:0] PreSqrtX;  // U2.DIVb
+    mux2 #(P.DIVb+2) sqrtxmux({Xnorm, 1'b0}, {1'b0, Xnorm}, EvenExp, PreSqrtX); // 2X if exponent odd, X if exponent even
+    assign SqrtX = {2'b11, PreSqrtX};                     // PreSqrtX - 4 = 4(PreSqrtX/4 - 1)
+  end
+
+  // Initialize X for division or square root
+  mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);                    
+
   //////////////////////////////////////////////////////
   // Selet integer or floating-point operands
   //////////////////////////////////////////////////////

From 571c7d3be4687d7cfdda585ce58508f1175c07ef Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 19:41:12 -0800
Subject: [PATCH 24/62] Divider cleanup

---
 config/shared/config-shared.vh      |  4 ++--
 src/fpu/fdivsqrt/fdivsqrtcycles.sv  |  6 +++---
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 13 ++++++++++---
 src/fpu/fdivsqrt/fdivsqrtuslc4.sv   |  2 +-
 4 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 9635d706b..55bca569f 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -98,8 +98,8 @@ localparam LOGR        = $clog2(RADIX);                             // r = log(R
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
 
 // intermediate division parameters not directly used in fdivsqrt hardware
-localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit because square root could be shifted right *** explain better
-//localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right
+localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit to allow sqrt being shifted right
+//localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right, if radix4 doesn't use a right shift.  This version saves one cycle on double-precision with R=4,k=4.  However, it doesn't work yet because C is too short, so k is incorrectly calculated as a 1 in the lsb after the last step.
 localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
 localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 6043ebb4a..e8a430a91 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -66,12 +66,12 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   // P.DIVCOPIES = k. P.LOGR = log(R) = r.  P.RK = rk.  
   // Integer division needs p fractional + r integer result bits
   // FP Division needs at least Nf fractional bits + 2 guard/round bits and one integer digit (LOG R integer bits) = Nf + 2 + r bits
-  // FP Sqrt needs at least Nf fractional bits, 2 guard/round bits, and *** shift bits
+  // FP Sqrt needs at least Nf fractional bits and 2 guard/round bits.  The integer bit is always initialized to 1 and does not need a cycle.
   // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
 
   always_comb begin 
-    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard; integer bit implicit
-    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
+    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
+    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits 
 
     if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
     else               ResultBitsE = FPResultBitsE;
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index ecdf10f8b..145bf9a68 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -168,14 +168,20 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   // This also means only one extra fractional bit is needed becaue we never shift right by more than 1.
   // Radix      Exponent odd          Exponent Even
   // 2          x-2 = 2(x/2 - 1)      x/2 - 2 = 2(x/4 - 1)
-  // 4          2x-4 = 4(x/2 - 1))    x-4 = 4(x/4 - 1)
+  // 4          2(x)-4 = 4(x/2 - 1))  2(x/2)-4 = 4(x/4 - 1)
   // Summary: PreSqrtX = r(x/2or4 - 1)
 
+  logic [P.DIVb:0] PreSqrtX;
   assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
-/*  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
+  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
   if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
-  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) */
+  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) 
 
+/*  
+  // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift
+  // This saves one bit in DIVb because there is no initial right shift.
+  // However, C needs to be extended further, lest it create a k with a 1 in the lsb when C is all 1s.
+  // That is an optimization for another day.
   if (P.RADIX == 2) begin
     logic [P.DIVb:0] PreSqrtX;    // U1.DIVb
     mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
@@ -185,6 +191,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     mux2 #(P.DIVb+2) sqrtxmux({Xnorm, 1'b0}, {1'b0, Xnorm}, EvenExp, PreSqrtX); // 2X if exponent odd, X if exponent even
     assign SqrtX = {2'b11, PreSqrtX};                     // PreSqrtX - 4 = 4(PreSqrtX/4 - 1)
   end
+*/
 
   // Initialize X for division or square root
   mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);                    
diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
index 268ca9ea2..b44b34a35 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
@@ -103,7 +103,7 @@ module fdivsqrtuslc4 (
     if (Sqrt) begin 
       if (j1) A = 3'b101;                       // on first sqrt iteration        A = .101
       else if (Smsbs == 5'b10000) A = 3'b111;   // if S = 1.0, use                A = .111
-      else A = Smsbs[2:0];                      // otherwise use                  A = S (in U0.3 format)
+      else A = Smsbs[2:0];                      // otherwise use                  A = 2S (in U0.3 format)
     end else A = Dmsbs;                         // division Unless                A = D (IN U0.3 format, dropping leading 1)
 
   // Select quotient digit from lookup table based on A and W

From 065f3f3f6df62d13b0cad84712b1942797c3d7b3 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 20:23:14 -0800
Subject: [PATCH 25/62] DivStickyM no longer mysteriously needs to be gated
 with SqrtM after divider improvements

---
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index cb1f56db7..0b358909a 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -86,9 +86,10 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////
 
   //  If the result is not exact, the sticky should be set
-  assign DivStickyM = ~WZeroM & ~(SpecialCaseM & SqrtM); // ***unsure why SpecialCaseM has to be gated by SqrtM, but otherwise fails regression on divide
+//  assign DivStickyM = ~WZeroM & ~(SpecialCaseM & SqrtM); // ***unsure why SpecialCaseM has to be gated by SqrtM, but otherwise fails regression on divide
+  assign DivStickyM = ~WZeroM & ~(SpecialCaseM); 
 
-  // Determine if sticky bit is negative  // *** look for ways to optimize this.  Shift shouldn't be needed.
+  // Determine if sticky bit is negative 
   assign Sum = WC + WS;
   assign NegStickyM = Sum[P.DIVb+3];
   mux2 #(P.DIVb+1) preummux(FirstU, FirstUM, NegStickyM, PreUmM); // Select U or U-1 depending on negative sticky bit

From c44ae93e22aeb0842ed21e18208d52a43c04bdab Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 12 Nov 2023 20:23:27 -0800
Subject: [PATCH 26/62] DivStickyM no longer mysteriously needs to be gated
 with SqrtM after divider improvements

---
 src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 0b358909a..5a40a3bdc 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -86,8 +86,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////
 
   //  If the result is not exact, the sticky should be set
-//  assign DivStickyM = ~WZeroM & ~(SpecialCaseM & SqrtM); // ***unsure why SpecialCaseM has to be gated by SqrtM, but otherwise fails regression on divide
-  assign DivStickyM = ~WZeroM & ~(SpecialCaseM); 
+  assign DivStickyM = ~WZeroM & ~SpecialCaseM; 
 
   // Determine if sticky bit is negative 
   assign Sum = WC + WS;

From 46bfdf5df9d0553daa01cf0e6457a17f84e42196 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 13 Nov 2023 02:39:25 -0600
Subject: [PATCH 27/62] Update ppaAnalyze to correctly analyze freqSweep

---
 synthDC/Makefile           |  4 +--
 synthDC/ppa/bestSynths.csv | 16 +++++------
 synthDC/ppa/ppaAnalyze.py  | 57 ++++++++++++++++++++------------------
 synthDC/ppa/ppaSynth.py    | 24 ++++++++++++++--
 4 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/synthDC/Makefile b/synthDC/Makefile
index 8e1b09d01..e6332e60f 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -11,7 +11,7 @@ export MOD ?= orig
 # title to add a note in the synth's directory name
 TITLE = 
 # tsmc28, sky130, and sky90 presently supported
-export TECH ?= sky90
+export TECH ?= sky130
 # MAXCORES allows parallel compilation, which is faster but less CPU-efficient
 # Avoid when doing sweeps of many optimization points in parallel
 export MAXCORES ?= 1
@@ -24,7 +24,7 @@ export WIDTH ?= 32
 
 time := $(shell date +%F-%H-%M)
 hash := $(shell git rev-parse --short HEAD)
-export OUTPUTDIR := runs/$(DESIGN)_$(WIDTH)_$(CONFIG)_$(MOD)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
+export OUTPUTDIR := runs/ppa_$(DESIGN)_$(WIDTH)_$(CONFIG)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
 export SAIFPOWER ?= 0
 
 OLDCONFIGDIR ?= ${WALLY}/config
diff --git a/synthDC/ppa/bestSynths.csv b/synthDC/ppa/bestSynths.csv
index d57ac6924..885eeb962 100644
--- a/synthDC/ppa/bestSynths.csv
+++ b/synthDC/ppa/bestSynths.csv
@@ -3,12 +3,12 @@ binencoder,sky130,8,1000,1.0000,50.960001,24.761,0.010685929975270078
 binencoder,sky130,16,1000,1.0000,136.220003,77.243,0.021773774467348
 binencoder,sky130,32,1000,1.0000,372.400007,189.626,0.04371111111111111
 binencoder,sky130,64,1000,1.0000,797.720015,382.205,0.07393850658857981
-binencoder,sky130,128,1000,1.0000,1602.300031,610.009,0.1261366969785861
+binencoder,sky130,128,900,1.1111,1602.300031,610.009,0.1261366969785861
 adder,sky130,8,1000,1.0000,253.820005,154.438,0.10825587752870422
 adder,sky130,16,1000,1.0000,722.260013,485.109,0.32460910944935417
 adder,sky130,32,1000,1.0000,1440.600027,714.057,0.6580226904376014
 adder,sky130,64,1000,1.0000,2781.240054,1050.0,0.9392239364188874
-adder,sky130,128,1000,1.0000,6186.740118,2230.0,2.1480106100795755
+adder,sky130,128,900,1.1111,6186.740118,2230.0,2.1480106100795755
 csa,sky130,8,1000,1.0000,266.560005,154.202,0.13650573115665163
 csa,sky130,16,1000,1.0000,533.12001,308.404,0.27263530601922104
 csa,sky130,32,1000,1.0000,1066.240021,616.808,0.5448072247308093
@@ -18,12 +18,12 @@ shifter,sky130,8,1000,1.0000,259.700005,196.451,0.07534088282874972
 shifter,sky130,16,1000,1.0000,666.400006,558.433,0.19552906110283155
 shifter,sky130,32,1000,1.0000,1475.880027,768.262,0.3807431082700759
 shifter,sky130,64,1000,1.0000,3914.120062,2680.0,1.144802541988198
-shifter,sky130,128,1000,1.0000,9192.400136,6080.0,2.9008914525432616
+shifter,sky130,128,900,1.1111,9192.400136,6080.0,2.9008914525432616
 comparator,sky130,8,1000,1.0000,200.900004,136.6,0.05001033271337053
 comparator,sky130,16,1000,1.0000,358.680007,189.253,0.06321553011448482
-comparator,sky130,32,1000,1.0000,690.900013,315.709,0.10771793448084398
-comparator,sky130,64,1000,1.0000,1372.980026,508.393,0.2048577820389901
-comparator,sky130,128,1000,1.0000,2744.980052,796.047,0.34396273737011823
+comparator,sky130,32,1500,0.666666,690.900013,315.709,0.10771793448084398
+comparator,sky130,64,1300,0.7692307,1372.980026,508.393,0.2048577820389901
+comparator,sky130,128,1100,0.909090,2744.980052,796.047,0.34396273737011823
 flop,sky130,8,1000,1.0000,133.279999,64.8145,0.193835
 flop,sky130,16,1000,1.0000,266.5599975,129.629,0.38715000000000005
 flop,sky130,32,1000,1.0000,533.119995,259.258,0.7723000000000001
@@ -48,7 +48,7 @@ mul,sky130,8,1000,1.0000,2194.220041,1440.0,1.421374045801527
 mul,sky130,16,1000,1.0000,7519.540137,4940.0,6.376128385155466
 mul,sky130,32,1000,1.0000,25200.700446,14900.0,24.931847968545217
 mul,sky130,64,1000,1.0000,86011.661365,42600.0,88.84651898734177
-mul,sky130,128,1000,1.0000,296198.144128,114000.0,273.3148854961832
+mul,sky130,128,800,1.2500,296198.144128,114000.0,273.3148854961832
 binencoder,sky90,8,7683,0.12508649056358195,50.960001,24.761,0.010685929975270078
 binencoder,sky90,16,5773,0.16977016282695304,136.220003,77.243,0.021773774467348
 binencoder,sky90,32,4500,0.2218912222222222,372.400007,189.626,0.04371111111111111
@@ -158,7 +158,7 @@ adder,tsmc28psyn,8,13838,0.07207477814713109,34.272,187.089,0.013311172134701546
 adder,tsmc28psyn,16,11521,0.08678002100512108,90.972001,475.207,0.03367763214998698
 adder,tsmc28psyn,32,9812,0.1018860211985324,209.286002,1060.0,0.08153281695882594
 adder,tsmc28psyn,64,8206,0.12185605215695831,388.836003,1770.0,0.1409943943456008
-adder,tsmc28psyn,128,7354,0.13597341881968997,907.452008,4360.0,0.3451183029643731
+adder,tsmc28psyn,128,7000,0.142857142857,907.452008,4360.0,0.3451183029643731
 csa,tsmc28psyn,8,24524,0.040663382319360626,52.416,482.462,0.02173381177621921
 csa,tsmc28psyn,16,24524,0.040663382319360626,104.832,964.99,0.04346762355243842
 csa,tsmc28psyn,32,24524,0.040663382319360626,209.664,1930.0,0.08677214157559941
diff --git a/synthDC/ppa/ppaAnalyze.py b/synthDC/ppa/ppaAnalyze.py
index 459a8520d..80cd57604 100755
--- a/synthDC/ppa/ppaAnalyze.py
+++ b/synthDC/ppa/ppaAnalyze.py
@@ -38,7 +38,7 @@ def synthsintocsv():
         each line contains the module, tech, width, target freq, and resulting metrics
     '''
     print("This takes a moment...")
-    bashCommand = "find . -path '*runs/ppa*rv32e*' -prune"
+    bashCommand = "find . -path '*runs/ppa*' -prune"
     output = subprocess.check_output(['bash','-c', bashCommand])
     allSynths = output.decode("utf-8").split('\n')[:-1]
 
@@ -51,7 +51,6 @@ def synthsintocsv():
 
     for oneSynth in allSynths:
         module, width, risc, tech, freq = specReg.findall(oneSynth)[2:7]
-        tech = tech[:-2]
         metrics = []
         for phrase in [['Path Slack', 'qor'], ['Design Area', 'qor'], ['100', 'power']]:
             bashCommand = 'grep "{}" '+ oneSynth[2:]+'/reports/*{}*'
@@ -87,7 +86,7 @@ def cleanup():
             output = subprocess.check_output(['bash','-c', bc])
     except: pass
 
-    bashCommand = "find . -path '*runs/ppa*rv32e*' -prune"
+    bashCommand = "find . -path '*runs/ppa*' -prune"
     output = subprocess.check_output(['bash','-c', bashCommand])
     allSynths = output.decode("utf-8").split('\n')[:-1]
     for oneSynth in allSynths:
@@ -186,7 +185,7 @@ def genLegend(fits, coefs, r2=None, spec=None, ale=False):
         legend_elements += [lines.Line2D([0], [0], color=spec.color, ls='', marker=spec.shape, label='$R^2$='+ str(round(r2, 4)))]
         return legend_elements
 
-def oneMetricPlot(module, var, freq=None, ax=None, fits='clsgn', norm=True, color=None):
+def oneMetricPlot(module, widths, var, freq=None, ax=None, fits='clsgn', norm=True, color=None):
     ''' module: string module name
         freq: int freq (MHz)
         var: string delay, area, lpower, or denergy
@@ -519,7 +518,7 @@ def squarify(fig):
         l = (1.-axs/h)/2
         fig.subplots_adjust(bottom=l, top=1-l)
 
-def plotPPA(mod, freq=None, norm=True, aleOpt=False):
+def plotPPA(mod, widths, freq=None, norm=True, aleOpt=False):
     ''' for the module specified, plots width vs delay, area, leakage power, and dynamic energy with fits
         if no freq specified, uses the synthesis with best achievable delay for each width
         overlays data from both techs
@@ -539,7 +538,7 @@ def plotPPA(mod, freq=None, norm=True, aleOpt=False):
                 if (arr[i][j]=='delay') and (f==10):
                     pass
                 else:
-                    r2 = oneMetricPlot(mod, arr[i][j], ax=axs[i, j], freq=f, norm=norm)
+                    r2 = oneMetricPlot(mod, widths, arr[i][j], ax=axs[i, j], freq=f, norm=norm)
                     ls = '--' if f else '-'
                     leg += [lines.Line2D([0], [0], color='red', label='$R^2$='+str(round(r2, 4)), linestyle=ls)]
 
@@ -568,6 +567,7 @@ def makeLineLegend():
     fullLeg = [lines.Line2D([0], [0], color='black', label='fastest', linestyle='-')]
     fullLeg += [lines.Line2D([0], [0], color='black', label='smallest', linestyle='--')]
     fullLeg += [lines.Line2D([0], [0], color='blue', label='tsmc28', marker='^')]
+    fullLeg += [lines.Line2D([0], [0], color='blue', label='tsmc28psyn', marker='x')]	
     fullLeg += [lines.Line2D([0], [0], color='green', label='sky90', marker='o')]
     fullLeg += [lines.Line2D([0], [0], color='green', label='sky130', marker='+')]	
     fullLeg += [lines.Line2D([0], [0], color='red', label='combined', marker='_')]
@@ -694,7 +694,7 @@ def makePlotDirectory():
             os.makedirs(new_directory)
         os.chdir(new_directory)
         if 'freq' in folder:
-            for tech in ['sky90', 'sky130', 'tsmc28']:
+            for tech in ['sky90', 'sky130', 'tsmc28', 'tsmc28psyn']:
                 for mod in modules:
                     tech_directory = os.path.join(new_directory, tech)
                     mod_directory = os.path.join(tech_directory, mod)
@@ -707,24 +707,26 @@ def makePlotDirectory():
 if __name__ == '__main__':
     ##############################
     # set up stuff, global variables
-	widths = [8, 16, 32, 64, 128]
-    modules = ['priorityencoder', 'add', 'csa', 'shiftleft', 'comparator', 'flop', 'mux2', 'mux4', 'mux8', 'mult'] 
-    normAddWidth = 32 # divisor to use with N since normalizing to add_32
+	widths = [64, 128]
+	modules = ['adder', 'comparator']
 
-    fitDict = {'add': ['cg', 'l', 'l'], 'mult': ['cg', 's', 's'], 'comparator': ['cg', 'l', 'l'], 'csa': ['c', 'l', 'l'], 'shiftleft': ['cg', 'l', 'ln'], 'flop': ['c', 'l', 'l'], 'priorityencoder': ['cg', 'l', 'l']}  fitDict.update(dict.fromkeys(['mux2', 'mux4', 'mux8'], ['cg', 'l', 'l']))
+	normAddWidth = 32 # divisor to use with N since normalizing to add_32
 
-    TechSpec = namedtuple("TechSpec", "tech color shape delay area lpower denergy")
-    techSpecs = [['sky90', 'green', 'o', 43.2e-3, 1440.600027, 714.057, 0.658022690438],  ['sky130', 'red', 'o', 43.2e-3, 1440.600027, 714.057, 0.658022690438], ['tsmc28', 'blue', '^', 12.2e-3, 209.286002, 1060.0, .08153281695882594]]
-    techSpecs = [TechSpec(*t) for t in techSpecs]
-    combined = TechSpec('combined fit', 'red', '_', 0, 0, 0, 0)
+	fitDict = {'adder': ['cg', 'l', 'l'], 'mul': ['cg', 's', 's'], 'comparator': ['cg', 'l', 'l'], 'csa': ['c', 'l', 'l'], 'shifter': ['cg', 'l', 'ln'], 'flop': ['c', 'l', 'l'], 'binencoder': ['cg', 'l', 'l']}
+	fitDict.update(dict.fromkeys(['mux2', 'mux4', 'mux8'], ['cg', 'l', 'l']))
+
+	TechSpec = namedtuple("TechSpec", "tech color shape delay area lpower denergy")
+	techSpecs = [['sky90', 'green', 'o', 43.2e-3, 1440.600027, 714.057, 0.658022690438],  ['sky130', 'red', 'o', 43.2e-3, 1440.600027, 714.057, 0.658022690438], ['tsmc28', 'blue', '^', 12.2e-3, 209.286002, 1060.0, .08153281695882594], ['tsmc28psyn', 'blue', '^', 12.2e-3, 209.286002, 1060.0, .08153281695882594]]
+	techSpecs = [TechSpec(*t) for t in techSpecs]
+	combined = TechSpec('combined fit', 'red', '_', 0, 0, 0, 0)
     ##############################
 
     # cleanup() # run to remove garbage synth runs
-    synthsintocsv() # slow, run only when new synth runs to add to csv
+	synthsintocsv() # slow, run only when new synth runs to add to csv
   
-    allSynths = synthsfromcsv('ppaData.csv') # your csv here!
-    bestSynths = csvOfBest('bestSynths.csv')
-    makePlotDirectory()
+	allSynths = synthsfromcsv('ppaData.csv') # your csv here!
+	bestSynths = csvOfBest('bestSynths.csv')
+	makePlotDirectory()
 
     # ### other functions
     # makeCoefTable()
@@ -732,11 +734,12 @@ if __name__ == '__main__':
     # muxPlot()
     # stdDevError()
 
-    for mod in modules:
-        for w in widths:
-            freqPlot('sky90', mod, w)
-            #freqPlot('sky130', mod, w)			
-            #freqPlot('tsmc28', mod, w)
-        #plotPPA(mod, norm=False)
-        #plotPPA(mod, aleOpt=True)
-        plt.close('all')
+	for mod in modules:
+		for w in widths:
+			#freqPlot('sky90', mod, w)
+			freqPlot('sky130', mod, w)			
+			#freqPlot('tsmc28', mod, w)
+			#freqPlot('tsmc28psyn', mod, w)			
+			#plotPPA(mod, widths, norm=False)
+			#plotPPA(mod, aleOpt=True)
+			plt.close('all')
diff --git a/synthDC/ppa/ppaSynth.py b/synthDC/ppa/ppaSynth.py
index 528c851a0..ceb6edbd2 100755
--- a/synthDC/ppa/ppaSynth.py
+++ b/synthDC/ppa/ppaSynth.py
@@ -32,6 +32,19 @@ def freqSweep(module, width, tech):
                 synthsToRun += [[synth.module, str(synth.width), synth.tech, str(freq)]]
     return synthsToRun
 
+def freqModuleSweep(widths, modules, tech):  # build sweep jobs for every (width, module) pair in one tech
+    synthsToRun = []
+    arr = [-8, -6, -4, -2, 0, 2, 4, 6, 8]  # percent offsets applied to each baseline frequency
+    allSynths = synthsfromcsv('ppa/bestSynths.csv')  # baseline rows; synthsfromcsv is defined outside this hunk
+    for w in widths:
+        for module in modules:
+            for synth in allSynths:
+                if (synth.module == str(module)) & (synth.tech == tech) & (synth.width == w):
+                    f = 1000/synth.delay  # baseline frequency from recorded delay (MHz if delay is in ns -- TODO confirm units)
+                    for freq in [round(f+f*x/100) for x in arr]:  # one job per offset, rounded to an integer frequency
+                        synthsToRun += [[synth.module, str(synth.width), synth.tech, str(freq)]]
+    return synthsToRun  # list of [module, width, tech, freq] entries (width and freq as strings)
+
 def filterRedundant(synthsToRun):
     bashCommand = "find . -path '*runs/ppa*rv32e*' -prune"
     output = subprocess.check_output(['bash','-c', bashCommand])
@@ -57,7 +70,7 @@ def allCombos(widths, modules, techs, freqs):
 
 if __name__ == '__main__':
     
-    ##### Run specific syntheses
+    ##### Run specific syntheses for a specific frequency
 	widths = [8, 16, 32, 64, 128] 
 	modules = ['mul', 'adder', 'shifter', 'flop', 'comparator', 'binencoder', 'csa', 'mux2', 'mux4', 'mux8']
 	techs = ['sky90', 'sky130', 'tsmc28', 'tsmc28psyn']
@@ -69,9 +82,16 @@ if __name__ == '__main__':
 	width = 32
 	tech = 'tsmc28psyn'
 	synthsToRun = freqSweep(module, width, tech)
+
+    ##### Run a sweep for multiple modules/widths based on best delay found in existing syntheses
+	modules = ['adder', 'comparator']
+	widths = [64, 128] 
+	tech = 'sky130'
+	synthsToRun = freqModuleSweep(widths, modules, tech)	
         
     ##### Only do syntheses for which a run doesn't already exist
 	synthsToRun = filterRedundant(synthsToRun)
 	
 	pool = Pool(processes=25)
-	pool.starmap(runCommand, synthsToRun)
+
+	pool.starmap(runCommand, synthsToRun)  # keep inside the __main__ guard, where pool is created
\ No newline at end of file

From 121f685fa27c451e535d8d25c65b23260470649c Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Mon, 13 Nov 2023 07:23:15 -0800
Subject: [PATCH 28/62] Removed assign statement inside always block

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index e8a430a91..1e6eda56c 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -76,7 +76,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
     if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
     else               ResultBitsE = FPResultBitsE;
 
-    assign CyclesE = (ResultBitsE-1)/(P.RK) + 1; // ceil (ResultBitsE/rk)
+    CyclesE = (ResultBitsE-1)/(P.RK) + 1; // ceil (ResultBitsE/rk)
   end 
   /* verilator lint_on WIDTH */
 

From 74056246d4a57a5fd1067b8edc8448f2d6befdaf Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 13 Nov 2023 10:02:10 -0600
Subject: [PATCH 29/62] Remove ppa_ prefix and modify ppaAnalyze.py to handle
 correct vector

---
 synthDC/Makefile          | 2 +-
 synthDC/ppa/ppaAnalyze.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/synthDC/Makefile b/synthDC/Makefile
index e6332e60f..d43a36b50 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -24,7 +24,7 @@ export WIDTH ?= 32
 
 time := $(shell date +%F-%H-%M)
 hash := $(shell git rev-parse --short HEAD)
-export OUTPUTDIR := runs/ppa_$(DESIGN)_$(WIDTH)_$(CONFIG)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
+export OUTPUTDIR := runs/$(DESIGN)_$(WIDTH)_$(CONFIG)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
 export SAIFPOWER ?= 0
 
 OLDCONFIGDIR ?= ${WALLY}/config
diff --git a/synthDC/ppa/ppaAnalyze.py b/synthDC/ppa/ppaAnalyze.py
index 80cd57604..73cd353c4 100755
--- a/synthDC/ppa/ppaAnalyze.py
+++ b/synthDC/ppa/ppaAnalyze.py
@@ -38,7 +38,7 @@ def synthsintocsv():
         each line contains the module, tech, width, target freq, and resulting metrics
     '''
     print("This takes a moment...")
-    bashCommand = "find . -path '*runs/ppa*' -prune"
+    bashCommand = "find . -path '*runs/*' -prune"
     output = subprocess.check_output(['bash','-c', bashCommand])
     allSynths = output.decode("utf-8").split('\n')[:-1]
 
@@ -50,7 +50,7 @@ def synthsintocsv():
     writer.writerow(['Module', 'Tech', 'Width', 'Target Freq', 'Delay', 'Area', 'L Power (nW)', 'D energy (nJ)'])
 
     for oneSynth in allSynths:
-        module, width, risc, tech, freq = specReg.findall(oneSynth)[2:7]
+        module, width, risc, tech, freq = specReg.findall(oneSynth)[1:6]
         metrics = []
         for phrase in [['Path Slack', 'qor'], ['Design Area', 'qor'], ['100', 'power']]:
             bashCommand = 'grep "{}" '+ oneSynth[2:]+'/reports/*{}*'
@@ -86,7 +86,7 @@ def cleanup():
             output = subprocess.check_output(['bash','-c', bc])
     except: pass
 
-    bashCommand = "find . -path '*runs/ppa*' -prune"
+    bashCommand = "find . -path '*runs/*' -prune"
     output = subprocess.check_output(['bash','-c', bashCommand])
     allSynths = output.decode("utf-8").split('\n')[:-1]
     for oneSynth in allSynths:

From 6374d1a200329fcd4dd758833f75a7a13a155a28 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Tue, 14 Nov 2023 01:04:37 -0600
Subject: [PATCH 30/62] Modify ppaSynth.py to be able to not issue excess
 number of operations with Pool command.  This is due to the original command
 using the Popen command, whereas using the subprocess.call command solves
 this issue.  This relieves the python script from issuing a ton of synthesis
 commands and using up all the licenses

---
 synthDC/ppa/ppaSynth.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/synthDC/ppa/ppaSynth.py b/synthDC/ppa/ppaSynth.py
index ceb6edbd2..07a342e26 100755
--- a/synthDC/ppa/ppaSynth.py
+++ b/synthDC/ppa/ppaSynth.py
@@ -12,11 +12,11 @@ from ppaAnalyze import synthsfromcsv
 
 def runCommand(module, width, tech, freq):
     command = "make synth DESIGN={} WIDTH={} TECH={} DRIVE=INV FREQ={} MAXOPT=1 MAXCORES=1".format(module, width, tech, freq)
-    subprocess.Popen(command, shell=True)
+    subprocess.call(command, shell=True)
 
 def deleteRedundant(synthsToRun):
     '''removes any previous runs for the current synthesis specifications'''
-    synthStr = "rm -rf runs/ppa_{}_{}_rv32e_{}nm_{}_*"
+    synthStr = "rm -rf runs/{}_{}_rv32e_{}_{}_*"
     for synth in synthsToRun:   
         bashCommand = synthStr.format(*synth)
         outputCPL = subprocess.check_output(['bash','-c', bashCommand])
@@ -46,7 +46,7 @@ def freqModuleSweep(widths, modules, tech):
     return synthsToRun
 
 def filterRedundant(synthsToRun):
-    bashCommand = "find . -path '*runs/ppa*rv32e*' -prune"
+    bashCommand = "find . -path '*runs/*' -prune"
     output = subprocess.check_output(['bash','-c', bashCommand])
     specReg = re.compile('[a-zA-Z0-9]+')
     allSynths = output.decode("utf-8").split('\n')[:-1]
@@ -84,14 +84,15 @@ if __name__ == '__main__':
 	synthsToRun = freqSweep(module, width, tech)
 
     ##### Run a sweep for multiple modules/widths based on best delay found in existing syntheses
-	modules = ['adder', 'comparator']
-	widths = [64, 128] 
+	modules = ['adder', "comparator"]
+	widths = [8, 16, 32, 64, 128]
 	tech = 'sky130'
 	synthsToRun = freqModuleSweep(widths, modules, tech)	
         
     ##### Only do syntheses for which a run doesn't already exist
-	synthsToRun = filterRedundant(synthsToRun)
-	
+	synthsToRun = filterRedundant(synthsToRun)	
 	pool = Pool(processes=25)
 
-pool.starmap(runCommand, synthsToRun)
\ No newline at end of file
+pool.starmap(runCommand, synthsToRun)
+pool.close()
+pool.join()
\ No newline at end of file

From c722e2c59da4e9473194d4abd4eda8b36277416c Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Tue, 14 Nov 2023 01:06:14 -0600
Subject: [PATCH 31/62] fix plotPPA and other excruciatingly painful problems
 related to using allWidths and causing empty arrays to be used.  This
 generates the normalized/unnormalized plots

---
 synthDC/ppa/bestSynths.csv |  18 +-
 synthDC/ppa/ppaAnalyze.py  | 798 +++++++++++++++++++++++--------------
 2 files changed, 504 insertions(+), 312 deletions(-)

diff --git a/synthDC/ppa/bestSynths.csv b/synthDC/ppa/bestSynths.csv
index 885eeb962..655f171a1 100644
--- a/synthDC/ppa/bestSynths.csv
+++ b/synthDC/ppa/bestSynths.csv
@@ -4,10 +4,10 @@ binencoder,sky130,16,1000,1.0000,136.220003,77.243,0.021773774467348
 binencoder,sky130,32,1000,1.0000,372.400007,189.626,0.04371111111111111
 binencoder,sky130,64,1000,1.0000,797.720015,382.205,0.07393850658857981
 binencoder,sky130,128,900,1.1111,1602.300031,610.009,0.1261366969785861
-adder,sky130,8,1000,1.0000,253.820005,154.438,0.10825587752870422
-adder,sky130,16,1000,1.0000,722.260013,485.109,0.32460910944935417
-adder,sky130,32,1000,1.0000,1440.600027,714.057,0.6580226904376014
-adder,sky130,64,1000,1.0000,2781.240054,1050.0,0.9392239364188874
+adder,sky130,8,1700,0.588235,253.820005,154.438,0.10825587752870422
+adder,sky130,16,1300,0.7692307,722.260013,485.109,0.32460910944935417
+adder,sky130,32,1100,0.90909,1440.600027,714.057,0.6580226904376014
+adder,sky130,64,950,1.0526315,2781.240054,1050.0,0.9392239364188874
 adder,sky130,128,900,1.1111,6186.740118,2230.0,2.1480106100795755
 csa,sky130,8,1000,1.0000,266.560005,154.202,0.13650573115665163
 csa,sky130,16,1000,1.0000,533.12001,308.404,0.27263530601922104
@@ -19,11 +19,11 @@ shifter,sky130,16,1000,1.0000,666.400006,558.433,0.19552906110283155
 shifter,sky130,32,1000,1.0000,1475.880027,768.262,0.3807431082700759
 shifter,sky130,64,1000,1.0000,3914.120062,2680.0,1.144802541988198
 shifter,sky130,128,900,1.1111,9192.400136,6080.0,2.9008914525432616
-comparator,sky130,8,1000,1.0000,200.900004,136.6,0.05001033271337053
-comparator,sky130,16,1000,1.0000,358.680007,189.253,0.06321553011448482
-comparator,sky130,32,1500,0.666666,690.900013,315.709,0.10771793448084398
-comparator,sky130,64,1300,0.7692307,1372.980026,508.393,0.2048577820389901
-comparator,sky130,128,1100,0.909090,2744.980052,796.047,0.34396273737011823
+comparator,sky130,8,1700,0.588235,200.900004,136.6,0.05001033271337053
+comparator,sky130,16,1500,0.6666667,358.680007,189.253,0.06321553011448482
+comparator,sky130,32,1300,0.7692307,690.900013,315.709,0.10771793448084398
+comparator,sky130,64,1200,0.8333333,1372.980026,508.393,0.2048577820389901
+comparator,sky130,128,1150,0.869565,2744.980052,796.047,0.34396273737011823
 flop,sky130,8,1000,1.0000,133.279999,64.8145,0.193835
 flop,sky130,16,1000,1.0000,266.5599975,129.629,0.38715000000000005
 flop,sky130,32,1000,1.0000,533.119995,259.258,0.7723000000000001
diff --git a/synthDC/ppa/ppaAnalyze.py b/synthDC/ppa/ppaAnalyze.py
index 73cd353c4..9af15fd80 100755
--- a/synthDC/ppa/ppaAnalyze.py
+++ b/synthDC/ppa/ppaAnalyze.py
@@ -18,92 +18,117 @@ from collections import namedtuple
 import sklearn.metrics as skm  # depricated, will need to replace with scikit-learn
 import os
 
+
 def synthsfromcsv(filename):
     Synth = namedtuple("Synth", "module tech width freq delay area lpower denergy")
-    with open(filename, newline='') as csvfile:
+    with open(filename, newline="") as csvfile:
         csvreader = csv.reader(csvfile)
         global allSynths
         allSynths = list(csvreader)[1:]
         for i in range(len(allSynths)):
             for j in range(len(allSynths[0])):
-                try: allSynths[i][j] = int(allSynths[i][j])
-                except: 
-                    try: allSynths[i][j] = float(allSynths[i][j])
-                    except: pass
+                try:
+                    allSynths[i][j] = int(allSynths[i][j])
+                except:
+                    try:
+                        allSynths[i][j] = float(allSynths[i][j])
+                    except:
+                        pass
             allSynths[i] = Synth(*allSynths[i])
     return allSynths
-    
+
+
 def synthsintocsv():
-    ''' writes a CSV with one line for every available synthesis
-        each line contains the module, tech, width, target freq, and resulting metrics
-    '''
+    """writes a CSV with one line for every available synthesis
+    each line contains the module, tech, width, target freq, and resulting metrics
+    """
     print("This takes a moment...")
     bashCommand = "find . -path '*runs/*' -prune"
-    output = subprocess.check_output(['bash','-c', bashCommand])
-    allSynths = output.decode("utf-8").split('\n')[:-1]
+    output = subprocess.check_output(["bash", "-c", bashCommand])
+    allSynths = output.decode("utf-8").split("\n")[:-1]
 
-    specReg = re.compile('[a-zA-Z0-9]+')
-    metricReg = re.compile('-?\d+\.\d+[e]?[-+]?\d*')
+    specReg = re.compile("[a-zA-Z0-9]+")
+    metricReg = re.compile("-?\d+\.\d+[e]?[-+]?\d*")
 
     file = open("ppaData.csv", "w")
     writer = csv.writer(file)
-    writer.writerow(['Module', 'Tech', 'Width', 'Target Freq', 'Delay', 'Area', 'L Power (nW)', 'D energy (nJ)'])
+    writer.writerow(
+        [
+            "Module",
+            "Tech",
+            "Width",
+            "Target Freq",
+            "Delay",
+            "Area",
+            "L Power (nW)",
+            "D energy (nJ)",
+        ]
+    )
 
     for oneSynth in allSynths:
         module, width, risc, tech, freq = specReg.findall(oneSynth)[1:6]
         metrics = []
-        for phrase in [['Path Slack', 'qor'], ['Design Area', 'qor'], ['100', 'power']]:
-            bashCommand = 'grep "{}" '+ oneSynth[2:]+'/reports/*{}*'
+        for phrase in [["Path Slack", "qor"], ["Design Area", "qor"], ["100", "power"]]:
+            bashCommand = 'grep "{}" ' + oneSynth[2:] + "/reports/*{}*"
             bashCommand = bashCommand.format(*phrase)
-            try: output = subprocess.check_output(['bash','-c', bashCommand])
-            except: 
+            try:
+                output = subprocess.check_output(["bash", "-c", bashCommand])
+            except:
                 print(module + width + tech + freq + " doesn't have reports")
                 print("Consider running cleanup() first")
             nums = metricReg.findall(str(output))
             nums = [float(m) for m in nums]
             metrics += nums
-        delay = 1000/int(freq) - metrics[0]
+        delay = 1000 / int(freq) - metrics[0]
         area = metrics[1]
         lpower = metrics[4]
-        denergy = (metrics[2] + metrics[3])/int(freq)*1000 # (switching + internal powers)*delay, more practical units for regression coefs
+        # switching, internal power in mW and leakage in nW
+        tpower = metrics[2] + metrics[3] + metrics[4]*0.000001
+        # EDP (fJ/GHz)
+        denergy = (
+            (metrics[2] + metrics[3] + metrics[4]*0.000001) / int(freq)
+        )  # (switching + internal powers)*delay, more practical units for regression coefs
 
-        if ('flop' in module): # since two flops in each module 
-            [area, lpower, denergy] = [n/2 for n in [area, lpower, denergy]] 
+        if "flop" in module:  # since two flops in each module
+            [area, lpower, denergy] = [n / 2 for n in [area, lpower, denergy]]
 
         writer.writerow([module, tech, width, freq, delay, area, lpower, denergy])
     file.close()
 
+
 def cleanup():
-    ''' removes runs that didn't work
-    '''
+    """removes runs that didn't work"""
     bashCommand = 'grep -r "Error" runs/ppa*/reports/*qor*'
-    try: 
-        output = subprocess.check_output(['bash','-c', bashCommand])
-        allSynths = output.decode("utf-8").split('\n')[:-1]
+    try:
+        output = subprocess.check_output(["bash", "-c", bashCommand])
+        allSynths = output.decode("utf-8").split("\n")[:-1]
         for run in allSynths:
-            run = run.split('MHz')[0]
-            bc = 'rm -r '+ run + '*'
-            output = subprocess.check_output(['bash','-c', bc])
-    except: pass
+            run = run.split("MHz")[0]
+            bc = "rm -r " + run + "*"
+            output = subprocess.check_output(["bash", "-c", bc])
+    except:
+        pass
 
     bashCommand = "find . -path '*runs/*' -prune"
-    output = subprocess.check_output(['bash','-c', bashCommand])
-    allSynths = output.decode("utf-8").split('\n')[:-1]
+    output = subprocess.check_output(["bash", "-c", bashCommand])
+    allSynths = output.decode("utf-8").split("\n")[:-1]
     for oneSynth in allSynths:
-        for phrase in [['Path Length', 'qor']]:
-            bashCommand = 'grep "{}" '+ oneSynth[2:]+'/reports/*{}*'
+        for phrase in [["Path Length", "qor"]]:
+            bashCommand = 'grep "{}" ' + oneSynth[2:] + "/reports/*{}*"
             bashCommand = bashCommand.format(*phrase)
-            try: output = subprocess.check_output(['bash','-c', bashCommand])
-            except: 
-                bc = 'rm -r '+ oneSynth[2:]
-                output = subprocess.check_output(['bash','-c', bc])
+            try:
+                output = subprocess.check_output(["bash", "-c", bashCommand])
+            except:
+                bc = "rm -r " + oneSynth[2:]
+                output = subprocess.check_output(["bash", "-c", bc])
     print("All cleaned up!")
 
+
 def getVals(tech, module, var, freq=None, width=None):
-    ''' for a specified tech, module, and variable/metric
-        returns a list of values for that metric in ascending width order
-        works at a specified target frequency or if none is given, uses the synthesis with the best achievable delay for each width
-    '''
+    """for a specified tech, module, and variable/metric
+    returns a list of values for that metric in ascending width order
+    works at a specified target frequency or if none is given, uses the synthesis with the best achievable delay for each width
+    """
 
     if width != None:
         widthsToGet = width
@@ -113,85 +138,132 @@ def getVals(tech, module, var, freq=None, width=None):
     metric = []
     widthL = []
 
-    if (freq != None):
+    if freq != None:
         for oneSynth in allSynths:
-            if (oneSynth.freq == freq) & (oneSynth.tech == tech) & (oneSynth.module == module) & (oneSynth.width != 1):
+            if (
+                (oneSynth.freq == freq)
+                & (oneSynth.tech == tech)
+                & (oneSynth.module == module)
+                & (oneSynth.width != 1)
+            ):
                 widthL += [oneSynth.width]
                 osdict = oneSynth._asdict()
                 metric += [osdict[var]]
-        metric = [x for _, x in sorted(zip(widthL, metric))] # ordering
+        metric = [x for _, x in sorted(zip(widthL, metric))]  # ordering
     else:
         for w in widthsToGet:
             for oneSynth in bestSynths:
-                if (oneSynth.width == w) & (oneSynth.tech == tech) & (oneSynth.module == module):
+                if (
+                    (oneSynth.width == w)
+                    & (oneSynth.tech == tech)
+                    & (oneSynth.module == module)
+                ):
                     osdict = oneSynth._asdict()
                     met = osdict[var]
                     metric += [met]
     return metric
 
+
 def csvOfBest(filename):
     bestSynths = []
     for tech in [x.tech for x in techSpecs]:
         for mod in modules:
             for w in widths:
-                m = np.Inf # large number to start
+                m = np.Inf  # large number to start
                 best = None
-                for oneSynth in allSynths: # best achievable, rightmost green
-                    if (oneSynth.width == w) & (oneSynth.tech == tech) & (oneSynth.module == mod):
-                        if (oneSynth.delay < m) & (1000/oneSynth.delay > oneSynth.freq): 
+                for oneSynth in allSynths:  # best achievable, rightmost green
+                    if (
+                        (oneSynth.width == w)
+                        & (oneSynth.tech == tech)
+                        & (oneSynth.module == mod)
+                    ):
+                        if (oneSynth.delay < m) & (
+                            1000 / oneSynth.delay > oneSynth.freq
+                        ):
                             m = oneSynth.delay
                             best = oneSynth
 
                 if (best != None) & (best not in bestSynths):
                     bestSynths += [best]
-    
+
     file = open(filename, "w")
     writer = csv.writer(file)
-    writer.writerow(['Module', 'Tech', 'Width', 'Target Freq', 'Delay', 'Area', 'L Power (nW)', 'D energy (nJ)'])
+    writer.writerow(
+        [
+            "Module",
+            "Tech",
+            "Width",
+            "Target Freq",
+            "Delay",
+            "Area",
+            "L Power (nW)",
+            "D energy (nJ)",
+        ]
+    )
     for synth in bestSynths:
         writer.writerow(list(synth))
     file.close()
     return bestSynths
-    
+
+
 def genLegend(fits, coefs, r2=None, spec=None, ale=False):
-    ''' generates a list of two legend elements (or just an equation if no r2 or spec)
-        labels line with fit equation and dots with r squared of the fit
-    '''
+    """generates a list of two legend elements (or just an equation if no r2 or spec)
+    labels line with fit equation and dots with r squared of the fit
+    """
 
     coefsr = [str(sigfig(c, 2)) for c in coefs]
     if ale:
-        if (normAddWidth == 32):
-            sub = 'S'
+        if normAddWidth == 32:
+            sub = "S"
         elif normAddWidth != 1:
-            print('Equations are wrong, check normAddWidth')
+            print("Equations are wrong, check normAddWidth")
     else:
-        sub = 'N'
+        sub = "N"
 
-    eqDict = {'c': '', 'l': sub, 's': '$'+sub+'^2$', 'g': '$log_2$('+sub+')', 'n': ''+sub+'$log_2$('+sub+')'}
-    eq = ''
-    ind = 0    
+    eqDict = {
+        "c": "",
+        "l": sub,
+        "s": "$" + sub + "^2$",
+        "g": "$log_2$(" + sub + ")",
+        "n": "" + sub + "$log_2$(" + sub + ")",
+    }
+    eq = ""
+    ind = 0
 
     for k in eqDict.keys():
         if k in fits:
-            if str(coefsr[ind]) != '0': eq += " + " + coefsr[ind] + eqDict[k]
+            if str(coefsr[ind]) != "0":
+                eq += " + " + coefsr[ind] + eqDict[k]
             ind += 1
 
-    eq = eq[3:] # chop off leading ' + '
+    eq = eq[3:]  # chop off leading ' + '
 
-    if (r2==None) or (spec==None):
+    if (r2 == None) or (spec == None):
         return eq
     else:
         legend_elements = [lines.Line2D([0], [0], color=spec.color, label=eq)]
-        legend_elements += [lines.Line2D([0], [0], color=spec.color, ls='', marker=spec.shape, label='$R^2$='+ str(round(r2, 4)))]
+        legend_elements += [
+            lines.Line2D(
+                [0],
+                [0],
+                color=spec.color,
+                ls="",
+                marker=spec.shape,
+                label="$R^2$=" + str(round(r2, 4)),
+            )
+        ]
         return legend_elements
 
-def oneMetricPlot(module, widths, var, freq=None, ax=None, fits='clsgn', norm=True, color=None):
-    ''' module: string module name
-        freq: int freq (MHz)
-        var: string delay, area, lpower, or denergy
-        fits: constant, linear, square, log2, Nlog2
-        plots given variable vs width for all matching syntheses with regression
-    '''
+
+def oneMetricPlot(
+    module, widths, var, freq=None, ax=None, fits="clsgn", norm=True, color=None
+):
+    """module: string module name
+    freq: int freq (MHz)
+    var: string delay, area, lpower, or denergy
+    fits: constant, linear, square, log2, Nlog2
+    plots given variable vs width for all matching syntheses with regression
+    """
     singlePlot = True
     if ax or (freq == 10):
         singlePlot = False
@@ -202,24 +274,27 @@ def oneMetricPlot(module, widths, var, freq=None, ax=None, fits='clsgn', norm=Tr
     allWidths = []
     allMetrics = []
 
-    ale = (var != 'delay') # if not delay, must be area, leakage, or energy
+    ale = var != "delay"  # if not delay, must be area, leakage, or energy
     modFit = fitDict[module]
     fits = modFit[ale]
 
     if freq:
-        ls = '--'
+        ls = "--"
     else:
-        ls = '-'
+        ls = "-"
 
     for spec in techSpecs:
+        # print(f"Searching for module of spec {spec} and module {module} and var {var}")
         metric = getVals(spec.tech, module, var, freq=freq)
-        
+        # print(f"Found metric : {metric}")
         if norm:
             techdict = spec._asdict()
             norm = techdict[var]
-            metric = [m/norm for m in metric]
+            metric = [m / norm for m in metric]
 
-        if len(metric) == 5: # don't include the spec if we don't have points for all widths
+        if len(widths) == len(metric):
+            # don't include the spec if we don't have points for all widths
+            # print(f"Width \neq Metric")
             xp, pred, coefs, r2 = regress(widths, metric, fits, ale)
             fullLeg += genLegend(fits, coefs, r2, spec, ale=ale)
             c = color if color else spec.color
@@ -228,44 +303,78 @@ def oneMetricPlot(module, widths, var, freq=None, ax=None, fits='clsgn', norm=Tr
             allWidths += widths
             allMetrics += metric
 
-    xp, pred, coefs, r2 = regress(allWidths, allMetrics, fits)
-    ax.plot(xp, pred, color='red', linestyle=ls)
+        # print(f"Widths passed into regress : {allWidths}")
+        # Not sure why this works (jes) - if allWidths doesn't have data widths does
+        if len(allWidths) > 0:
+            xp, pred, coefs, r2 = regress(allWidths, allMetrics, fits)
+            ax.plot(xp, pred, color="orange", linestyle=ls)
+        else:
+            xp, pred, coefs, r2 = regress(widths, metric, fits)
+            ax.plot(xp, pred, color="orange", linestyle=ls)
 
     if norm:
-        ylabeldic = {"lpower": "Leakage Power (add32)", "denergy": "Energy/Op (add32)", "area": "Area (add32)", "delay": "Delay (FO4)"}
+        ylabeldic = {
+            "lpower": "Leakage Power (add32)",
+            "denergy": "Energy/Op (add32)",
+            "area": "Area (add32)",
+            "delay": "Delay (FO4)",
+        }
     else:
-        ylabeldic = {"lpower": "Leakage Power (nW)", "denergy": "Dynamic Energy (nJ)", "area": "Area (sq microns)", "delay": "Delay (ns)"}
+        ylabeldic = {
+            "lpower": "Leakage Power (nW)",
+            "denergy": "EDP (fJ/GHz)",
+            "area": "Area (sq microns)",
+            "delay": "Delay (ns)",
+        }
 
     ax.set_ylabel(ylabeldic[var])
     ax.set_xticks(widths)
 
-    if singlePlot or (var == 'lpower') or (var == 'denergy'):
+    if singlePlot or (var == "lpower") or (var == "denergy"):
         ax.set_xlabel("Width (bits)")
-    if not singlePlot and ((var == 'delay') or (var == 'area')):
-        ax.tick_params(labelbottom=False)    
+    if not singlePlot and ((var == "delay") or (var == "area")):
+        ax.tick_params(labelbottom=False)
 
     if singlePlot:
         fullLeg += genLegend(fits, coefs, r2, combined, ale=ale)
-        legLoc = 'upper left' if ale else 'center right'
+        legLoc = "upper left" if ale else "center right"
         ax.add_artist(ax.legend(handles=fullLeg, loc=legLoc))
-        titleStr = "  (target  " + str(freq)+ "MHz)" if freq != None else " (best achievable delay)"
+        titleStr = (
+            "  (target  " + str(freq) + "MHz)"
+            if freq != None
+            else " (best achievable delay)"
+        )
         ax.set_title(module + titleStr)
-        plt.savefig('.plots/'+ module + '_' + var + '.png')
+        plt.savefig(".plots/" + module + "_" + var + ".png")
         # plt.show()
     return r2
 
-def regress(widths, var, fits='clsgn', ale=False):
-    ''' fits a curve to the given points
-        returns lists of x and y values to plot that curve and coefs for the eq with r2
-    '''
 
+def regress(widths, var, fits="clsgn", ale=False):
+    """fits a curve to the given points
+    returns lists of x and y values to plot that curve and coefs for the eq with r2
+    """
+    if len(var) != len(widths):
+        print(
+            f"There are not enough variables to match widths. Widths : {widths} Variables Found : {var}, padding to match may affect correctness (doing it anyways)\n"
+        )
+        if len(widths) > len(var):
+            while len(widths) > len(var):
+                var.append(0.0)
+        if len(var) > len(widths):
+            while len(var) > len(widths):
+                widths.append(0)
+
+    # widths = [8, 16, 32, 64, 128]
+    # print(f"Regress var : {var}")
+    # print(f"Regress widths : {widths}")
     funcArr = genFuncs(fits)
-    xp = np.linspace(min(widths)/2, max(widths)*1.1, 200)
+    xp = np.linspace(min(widths) / 2, max(widths) * 1.1, 200)
     xpToCalc = xp
 
     if ale:
-        widths = [w/normAddWidth for w in widths]
-        xpToCalc = [x/normAddWidth for x in xp]
+        widths = [w / normAddWidth for w in widths]
+        xpToCalc = [x / normAddWidth for x in xp]
 
     mat = []
     for w in widths:
@@ -273,8 +382,9 @@ def regress(widths, var, fits='clsgn', ale=False):
         for func in funcArr:
             row += [func(w)]
         mat += [row]
-    
-    y = np.array(var, dtype=np.float)
+
+    # var = [0, 1, 2, 3, 4]
+    y = np.array(var, dtype=np.float64)
     coefs = opt.nnls(mat, y)[0]
 
     yp = []
@@ -290,19 +400,22 @@ def regress(widths, var, fits='clsgn', ale=False):
 
     return xp, pred, coefs, r2
 
+
 def makeCoefTable():
-    ''' writes CSV with each line containing the coefficients for a regression fit 
-        to a particular combination of module, metric (including both techs, normalized)
-    '''
+    """writes CSV with each line containing the coefficients for a regression fit
+    to a particular combination of module, metric (including both techs, normalized)
+    """
     file = open("ppaFitting.csv", "w")
     writer = csv.writer(file)
-    writer.writerow(['Module', 'Metric', 'Target', '1', 'N', 'N^2', 'log2(N)', 'Nlog2(N)', 'R^2'])
+    writer.writerow(
+        ["Module", "Metric", "Target", "1", "N", "N^2", "log2(N)", "Nlog2(N)", "R^2"]
+    )
 
     for module in modules:
         for freq in [10, None]:
-            target = 'easy' if freq else 'hard'
-            for var in ['delay', 'area', 'lpower', 'denergy']:
-                ale = (var != 'delay')
+            target = "easy" if freq else "hard"
+            for var in ["delay", "area", "lpower", "denergy"]:
+                ale = var != "delay"
                 metL = []
                 modFit = fitDict[module]
                 fits = modFit[ale]
@@ -311,12 +424,12 @@ def makeCoefTable():
                     metric = getVals(spec.tech, module, var, freq=freq)
                     techdict = spec._asdict()
                     norm = techdict[var]
-                    metL += [m/norm for m in metric]
+                    metL += [m / norm for m in metric]
 
-                xp, pred, coefs, r2 = regress(widths*2, metL, fits, ale)
+                xp, pred, coefs, r2 = regress(widths * 2, metL, fits, ale)
                 coefs = np.ndarray.tolist(coefs)
-                coefsToWrite  = [None]*5
-                fitTerms = 'clsgn'
+                coefsToWrite = [None] * 5
+                fitTerms = "clsgn"
                 ind = 0
                 for i in range(len(fitTerms)):
                     if fitTerms[i] in fits:
@@ -327,25 +440,38 @@ def makeCoefTable():
 
     file.close()
 
+
 def sigfig(num, figs):
-    return '{:g}'.format(float('{:.{p}g}'.format(num, p=figs)))
+    return "{:g}".format(float("{:.{p}g}".format(num, p=figs)))
+
 
 def makeEqTable():
-    ''' writes CSV with each line containing the equations for fits for each metric 
-        to a particular module (including both techs, normalized)
-    '''
+    """writes CSV with each line containing the equations for fits for each metric
+    to a particular module (including both techs, normalized)
+    """
     file = open("ppaEquations.csv", "w")
     writer = csv.writer(file)
-    writer.writerow(['Element', 'Best delay', 'Fast area', 'Fast leakage', 'Fast energy', 'Small area', 'Small leakage', 'Small energy'])
+    writer.writerow(
+        [
+            "Element",
+            "Best delay",
+            "Fast area",
+            "Fast leakage",
+            "Fast energy",
+            "Small area",
+            "Small leakage",
+            "Small energy",
+        ]
+    )
 
     for module in modules:
         eqs = []
         for freq in [None, 10]:
-            for var in ['delay', 'area', 'lpower', 'denergy']:
-                if (var == 'delay') and (freq == 10):
+            for var in ["delay", "area", "lpower", "denergy"]:
+                if (var == "delay") and (freq == 10):
                     pass
                 else:
-                    ale = (var != 'delay')
+                    ale = var != "delay"
                     metL = []
                     modFit = fitDict[module]
                     fits = modFit[ale]
@@ -354,9 +480,9 @@ def makeEqTable():
                         metric = getVals(spec.tech, module, var, freq=freq)
                         techdict = spec._asdict()
                         norm = techdict[var]
-                        metL += [m/norm for m in metric]
+                        metL += [m / norm for m in metric]
 
-                    xp, pred, coefs, r2 = regress(widths*2, metL, fits, ale)
+                    xp, pred, coefs, r2 = regress(widths * 2, metL, fits, ale)
                     coefs = np.ndarray.tolist(coefs)
                     eqs += [genLegend(fits, coefs, ale=ale)]
         row = [module] + eqs
@@ -364,93 +490,113 @@ def makeEqTable():
 
     file.close()
 
-def genFuncs(fits='clsgn'):
-    ''' helper function for regress()
-        returns array of functions with one for each term desired in the regression fit
-    '''
+
+def genFuncs(fits="clsgn"):
+    """helper function for regress()
+    returns array of functions with one for each term desired in the regression fit
+    """
     funcArr = []
-    if 'c' in fits:
+    if "c" in fits:
         funcArr += [lambda x: 1]
-    if 'l' in fits:
+    if "l" in fits:
         funcArr += [lambda x: x]
-    if 's' in fits:
+    if "s" in fits:
         funcArr += [lambda x: x**2]
-    if 'g' in fits:
+    if "g" in fits:
         funcArr += [lambda x: np.log2(x)]
-    if 'n' in fits:
-        funcArr += [lambda x: x*np.log2(x)]
+    if "n" in fits:
+        funcArr += [lambda x: x * np.log2(x)]
     return funcArr
 
+
 def noOutliers(median, freqs, delays, areas):
-    ''' returns a pared down list of freqs, delays, and areas 
-        cuts out any syntheses in which target freq isn't within 75% of the min delay target to focus on interesting area
-        helper function to freqPlot()
-    '''
-    f=[]
-    d=[]
-    a=[]
+    """returns a pared down list of freqs, delays, and areas
+    cuts out any syntheses in which target freq isn't within 75% of the min delay target to focus on interesting area
+    helper function to freqPlot()
+    """
+    f = []
+    d = []
+    a = []
     for i in range(len(freqs)):
-        norm = freqs[i]/median
-        if (norm > 0.4) & (norm<1.4):
+        norm = freqs[i] / median
+        if (norm > 0.4) & (norm < 1.4):
             f += [freqs[i]]
             d += [delays[i]]
             a += [areas[i]]
-    
+
     return f, d, a
 
+
 def freqPlot(tech, mod, width):
-    ''' plots delay, area, area*delay, and area*delay^2 for syntheses with specified tech, module, width
-    '''
+    """plots delay, area, area*delay, and area*delay^2 for syntheses with specified tech, module, width"""
 
     freqsL, delaysL, areasL = ([[], []] for i in range(3))
     for oneSynth in allSynths:
-        if (mod == oneSynth.module) & (width == oneSynth.width) & (tech == oneSynth.tech):
-            ind = (1000/oneSynth.delay < oneSynth.freq) # when delay is within target clock period
+        if (
+            (mod == oneSynth.module)
+            & (width == oneSynth.width)
+            & (tech == oneSynth.tech)
+        ):
+            ind = (
+                1000 / oneSynth.delay < oneSynth.freq
+            )  # when delay is within target clock period
             freqsL[ind] += [oneSynth.freq]
             delaysL[ind] += [oneSynth.delay]
             areasL[ind] += [oneSynth.area]
 
     median = np.median(list(flatten(freqsL)))
-    
+
     f, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
     for ax in (ax1, ax2):
-        ax.ticklabel_format(useOffset=False, style='plain')
+        ax.ticklabel_format(useOffset=False, style="plain")
 
-    for ind in [0,1]:
+    for ind in [0, 1]:
         areas = areasL[ind]
         delays = delaysL[ind]
         freqs = freqsL[ind]
 
-        freqs, delays, areas = noOutliers(median, freqs, delays, areas) # comment out to see all syntheses
+        freqs, delays, areas = noOutliers(
+            median, freqs, delays, areas
+        )  # comment out to see all syntheses
 
-        c = 'blue' if ind else 'green'
+        c = "blue" if ind else "green"
         ax1.scatter(freqs, delays, color=c)
         ax2.scatter(freqs, areas, color=c)
 
-    legend_elements = [lines.Line2D([0], [0], color='green', ls='', marker='o', label='timing achieved'),
-                       lines.Line2D([0], [0], color='blue', ls='', marker='o', label='slack violated')]
+    legend_elements = [
+        lines.Line2D(
+            [0], [0], color="green", ls="", marker="o", label="timing achieved"
+        ),
+        lines.Line2D([0], [0], color="blue", ls="", marker="o", label="slack violated"),
+    ]
 
     ax1.legend(handles=legend_elements)
     width = str(width)
-    
+
     ax2.set_xlabel("Target Freq (MHz)")
-    ax1.set_ylabel('Delay (ns)')
-    ax2.set_ylabel('Area (sq microns)')
-    ax1.set_title(mod + '_' + width)
-    if ('mux' in mod) & ('d' in mod):
+    ax1.set_ylabel("Delay (ns)")
+    ax2.set_ylabel("Area (sq microns)")
+    ax1.set_title(mod + "_" + width)
+    if ("mux" in mod) & ("d" in mod):
         width = mod
-        mod = 'muxd'
-    plt.savefig('./plots/freqBuckshot/' + tech + '/' + mod + '/' + width + '.png')
+        mod = "muxd"
+    plt.savefig("./plots/freqBuckshot/" + tech + "/" + mod + "/" + width + ".png")
     # plt.show()
 
+
 def squareAreaDelay(tech, mod, width):
-    ''' plots delay, area, area*delay, and area*delay^2 for syntheses with specified tech, module, width
-    '''
+    """plots delay, area, area*delay, and area*delay^2 for syntheses with specified tech, module, width"""
     global allSynths
     freqsL, delaysL, areasL = ([[], []] for i in range(3))
     for oneSynth in allSynths:
-        if (mod == oneSynth.module) & (width == oneSynth.width) & (tech == oneSynth.tech):
-            ind = (1000/oneSynth.delay < oneSynth.freq) # when delay is within target clock period
+        if (
+            (mod == oneSynth.module)
+            & (width == oneSynth.width)
+            & (tech == oneSynth.tech)
+        ):
+            ind = (
+                1000 / oneSynth.delay < oneSynth.freq
+            )  # when delay is within target clock period
             freqsL[ind] += [oneSynth.freq]
             delaysL[ind] += [oneSynth.delay]
             areasL[ind] += [oneSynth.area]
@@ -458,182 +604,212 @@ def squareAreaDelay(tech, mod, width):
     f, (ax1) = plt.subplots(1, 1)
     ax2 = ax1.twinx()
 
-    for ind in [0,1]:
+    for ind in [0, 1]:
         areas = areasL[ind]
         delays = delaysL[ind]
         targets = freqsL[ind]
-        targets = [1000/f for f in targets]
-        
-        targets, delays, areas = noOutliers(targets, delays, areas) # comment out to see all 
-        
+        targets = [1000 / f for f in targets]
+
+        targets, delays, areas = noOutliers(
+            targets, delays, areas
+        )  # comment out to see all
+
         if not ind:
             achievedDelays = delays
 
-        c = 'blue' if ind else 'green'
-        ax1.scatter(targets, delays, marker='^', color=c)
-        ax2.scatter(targets, areas, marker='s', color=c)
-    
-    bestAchieved = min(achievedDelays)
-        
-    legend_elements = [lines.Line2D([0], [0], color='green', ls='', marker='^', label='delay (timing achieved)'),
-                       lines.Line2D([0], [0], color='green', ls='', marker='s', label='area (timing achieved)'),
-                       lines.Line2D([0], [0], color='blue', ls='', marker='^', label='delay (timing violated)'),
-                       lines.Line2D([0], [0], color='blue', ls='', marker='s', label='area (timing violated)')]
+        c = "blue" if ind else "green"
+        ax1.scatter(targets, delays, marker="^", color=c)
+        ax2.scatter(targets, areas, marker="s", color=c)
+
+    bestAchieved = min(achievedDelays)
+
+    legend_elements = [
+        lines.Line2D(
+            [0], [0], color="green", ls="", marker="^", label="delay (timing achieved)"
+        ),
+        lines.Line2D(
+            [0], [0], color="green", ls="", marker="s", label="area (timing achieved)"
+        ),
+        lines.Line2D(
+            [0], [0], color="blue", ls="", marker="^", label="delay (timing violated)"
+        ),
+        lines.Line2D(
+            [0], [0], color="blue", ls="", marker="s", label="area (timing violated)"
+        ),
+    ]
+
+    ax2.legend(handles=legend_elements, loc="upper left")
 
-    ax2.legend(handles=legend_elements, loc='upper left')
-    
     ax1.set_xlabel("Delay Targeted (ns)")
     ax1.set_ylabel("Delay Achieved (ns)")
-    ax2.set_ylabel('Area (sq microns)')
-    ax1.set_title(mod + '_' + str(width))
+    ax2.set_ylabel("Area (sq microns)")
+    ax1.set_title(mod + "_" + str(width))
 
     squarify(f)
 
     xvals = np.array(ax1.get_xlim())
-    frac = (min(flatten(delaysL))-xvals[0])/(xvals[1]-xvals[0])
-    areaLowerLim = min(flatten(areasL))-100
-    areaUpperLim = max(flatten(areasL))/frac + areaLowerLim
+    frac = (min(flatten(delaysL)) - xvals[0]) / (xvals[1] - xvals[0])
+    areaLowerLim = min(flatten(areasL)) - 100
+    areaUpperLim = max(flatten(areasL)) / frac + areaLowerLim
     ax2.set_ylim([areaLowerLim, areaUpperLim])
     ax1.plot(xvals, xvals, ls="--", c=".3")
-    ax1.hlines(y=bestAchieved, xmin=xvals[0], xmax=xvals[1], color="black", ls='--')
+    ax1.hlines(y=bestAchieved, xmin=xvals[0], xmax=xvals[1], color="black", ls="--")
 
-    plt.savefig('./plots/squareareadelay_' + mod + '_' + str(width) + '.png')
+    plt.savefig("./plots/squareareadelay_" + mod + "_" + str(width) + ".png")
     # plt.show()
 
+
 def squarify(fig):
-    ''' helper function for squareAreaDelay()
-        forces matplotlib figure to be a square
-    '''
+    """helper function for squareAreaDelay()
+    forces matplotlib figure to be a square
+    """
     w, h = fig.get_size_inches()
     if w > h:
         t = fig.subplotpars.top
         b = fig.subplotpars.bottom
-        axs = h*(t-b)
-        l = (1.-axs/w)/2
-        fig.subplots_adjust(left=l, right=1-l)
+        axs = h * (t - b)
+        l = (1.0 - axs / w) / 2
+        fig.subplots_adjust(left=l, right=1 - l)
     else:
         t = fig.subplotpars.right
         b = fig.subplotpars.left
-        axs = w*(t-b)
-        l = (1.-axs/h)/2
-        fig.subplots_adjust(bottom=l, top=1-l)
+        axs = w * (t - b)
+        l = (1.0 - axs / h) / 2
+        fig.subplots_adjust(bottom=l, top=1 - l)
 
-def plotPPA(mod, widths, freq=None, norm=True, aleOpt=False):
-    ''' for the module specified, plots width vs delay, area, leakage power, and dynamic energy with fits
-        if no freq specified, uses the synthesis with best achievable delay for each width
-        overlays data from both techs
-    '''
-    with mpl.rc_context({"figure.figsize": (7,3.46)}):
+
+def plotPPA(mod, freq=None, norm=True, aleOpt=False):
+    """for the module specified, plots width vs delay, area, leakage power, and dynamic energy with fits
+    if no freq specified, uses the synthesis with best achievable delay for each width
+    overlays data from both techs
+    """
+    with mpl.rc_context({"figure.figsize": (7, 3.46)}):
         fig, axs = plt.subplots(2, 2)
 
-    arr = [['delay', 'area'], ['lpower', 'denergy']]
+    arr = [["delay", "area"], ["lpower", "denergy"]]
 
     freqs = [freq]
-    if aleOpt: freqs += [10]
+    if aleOpt:
+        freqs += [10]
 
     for i in [0, 1]:
         for j in [0, 1]:
             leg = []
             for f in freqs:
-                if (arr[i][j]=='delay') and (f==10):
+                if (arr[i][j] == "delay") and (f == 10):
                     pass
                 else:
-                    r2 = oneMetricPlot(mod, widths, arr[i][j], ax=axs[i, j], freq=f, norm=norm)
-                    ls = '--' if f else '-'
-                    leg += [lines.Line2D([0], [0], color='red', label='$R^2$='+str(round(r2, 4)), linestyle=ls)]
+                    # print(f"Pasing in widths {widths}")
+                    r2 = oneMetricPlot(
+                        mod, widths, arr[i][j], ax=axs[i, j], freq=f, norm=norm
+                    )
+                    ls = "--" if f else "-"
+                    leg += [
+                        lines.Line2D(
+                            [0],
+                            [0],
+                            color="orange",
+                            label="$R^2$=" + str(round(r2, 4)),
+                            linestyle=ls,
+                        )
+                    ]
 
-            if (mod in ['flop', 'csa']) & (arr[i][j] == 'delay'):
+            if (mod in ["flop", "csa"]) & (arr[i][j] == "delay"):
                 axs[i, j].set_ylim(ymin=0)
                 ytop = axs[i, j].get_ylim()[1]
-                axs[i, j].set_ylim(ymax=1.1*ytop)
+                axs[i, j].set_ylim(ymax=1.1 * ytop)
             else:
                 axs[i, j].legend(handles=leg, handlelength=1.5)
-    
-    titleStr = "  (target  " + str(freq)+ "MHz)" if freq != None else ""
-    plt.suptitle(mod + titleStr)
-    plt.tight_layout(pad=0.05, w_pad=1, h_pad=0.5, rect=(0,0,1,0.97))
 
-    if freq != 10: 
-        n = 'normalized' if norm else 'unnormalized'
-        saveStr = './plots/'+ n + '/' + mod + '.png'
+    titleStr = "  (target  " + str(freq) + "MHz)" if freq != None else ""
+    plt.suptitle(mod + titleStr)
+    plt.tight_layout(pad=0.05, w_pad=1, h_pad=0.5, rect=(0, 0, 1, 0.97))
+
+    if freq != 10:
+        n = "normalized" if norm else "unnormalized"
+        saveStr = "./plots/" + n + "/" + mod + "_" + ".png"
+        print(f"Saving to {saveStr}")
         plt.savefig(saveStr)
     # plt.show()
 
+
 def makeLineLegend():
-    ''' generates legend to accompany normalized plots
-    '''
-    plt.rcParams["figure.figsize"] = (5.5,0.3)
+    """generates legend to accompany normalized plots"""
+    plt.rcParams["figure.figsize"] = (5.5, 0.3)
     fig = plt.figure()
-    fullLeg = [lines.Line2D([0], [0], color='black', label='fastest', linestyle='-')]
-    fullLeg += [lines.Line2D([0], [0], color='black', label='smallest', linestyle='--')]
-    fullLeg += [lines.Line2D([0], [0], color='blue', label='tsmc28', marker='^')]
-    fullLeg += [lines.Line2D([0], [0], color='blue', label='tsmc28psyn', marker='x')]	
-    fullLeg += [lines.Line2D([0], [0], color='green', label='sky90', marker='o')]
-    fullLeg += [lines.Line2D([0], [0], color='green', label='sky130', marker='+')]	
-    fullLeg += [lines.Line2D([0], [0], color='red', label='combined', marker='_')]
-    fig.legend(handles=fullLeg, ncol=5, handlelength=1.4, loc='center') 
-    saveStr = './plots/legend.png'
+    fullLeg = [lines.Line2D([0], [0], color="black", label="fastest", linestyle="-")]
+    fullLeg += [lines.Line2D([0], [0], color="black", label="smallest", linestyle="--")]
+    fullLeg += [lines.Line2D([0], [0], color="blue", label="tsmc28", marker="^")]
+    fullLeg += [lines.Line2D([0], [0], color="blue", label="tsmc28psyn", marker="x")]
+    fullLeg += [lines.Line2D([0], [0], color="green", label="sky90", marker="o")]
+    fullLeg += [lines.Line2D([0], [0], color="purple", label="sky130", marker="+")]
+    fullLeg += [lines.Line2D([0], [0], color="orange", label="combined", marker="_")]
+    fig.legend(handles=fullLeg, ncol=5, handlelength=1.4, loc="center")
+    saveStr = "./plots/legend.png"
     plt.savefig(saveStr)
 
-def muxPlot(fits='clsgn', norm=True):
-    ''' module: string module name
-        freq: int freq (MHz)
-        var: string delay, area, lpower, or denergy
-        fits: constant, linear, square, log2, Nlog2
-        plots given variable vs width for all matching syntheses with regression
-    '''
+
+def muxPlot(fits="clsgn", norm=True):
+    """module: string module name
+    freq: int freq (MHz)
+    var: string delay, area, lpower, or denergy
+    fits: constant, linear, square, log2, Nlog2
+    plots given variable vs width for all matching syntheses with regression
+    """
     ax = plt.gca()
 
     inputs = [2, 4, 8]
-    allInputs = inputs*2
+    allInputs = inputs * 2
     fullLeg = []
 
-    for crit in ['data', 'control']:
+    for crit in ["data", "control"]:
         allMetrics = []
-        muxes = ['mux2', 'mux4', 'mux8']
+        muxes = ["mux2", "mux4", "mux8"]
 
-        if crit == 'data':
-            ls = '--'
-            muxes = [m + 'd' for m in muxes]
-        elif crit == 'control':
-            ls = '-'
+        if crit == "data":
+            ls = "--"
+            muxes = [m + "d" for m in muxes]
+        elif crit == "control":
+            ls = "-"
 
         for spec in techSpecs:
             metric = []
             for module in muxes:
-                metric += getVals(spec.tech, module, 'delay', width=[1])
-            
+                metric += getVals(spec.tech, module, "delay", width=[1])
+
             if norm:
                 techdict = spec._asdict()
-                norm = techdict['delay']
-                metric = [m/norm for m in metric]
+                norm = techdict["delay"]
+                metric = [m / norm for m in metric]
                 # print(spec.tech, ' ', metric)
 
-            if len(metric) == 3: # don't include the spec if we don't have points for all
+            if (
+                len(metric) == 3
+            ):  # don't include the spec if we don't have points for all
                 xp, pred, coefs, r2 = regress(inputs, metric, fits, ale=False)
                 ax.scatter(inputs, metric, color=spec.color, marker=spec.shape)
                 ax.plot(xp, pred, color=spec.color, linestyle=ls)
                 allMetrics += metric
 
         xp, pred, coefs, r2 = regress(allInputs, allMetrics, fits)
-        ax.plot(xp, pred, color='red', linestyle=ls)
-        fullLeg += [lines.Line2D([0], [0], color='red', label=crit, linestyle=ls)]
-    
-    ax.set_ylabel('Delay (FO4)')
+        ax.plot(xp, pred, color="orange", linestyle=ls)
+        fullLeg += [lines.Line2D([0], [0], color="orange", label=crit, linestyle=ls)]
+
+    ax.set_ylabel("Delay (FO4)")
     ax.set_xticks(inputs)
     ax.set_xlabel("Number of inputs")
-    ax.set_title('mux timing')
-    
-    ax.legend(handles = fullLeg)
-    plt.savefig('./plots/mux.png')
+    ax.set_title("mux timing")
+
+    ax.legend(handles=fullLeg)
+    plt.savefig("./plots/mux.png")
+
 
 def stdDevError():
-    ''' calculates std deviation and error for paper-writing purposes
-    '''
-    for var in ['delay', 'area', 'lpower', 'denergy']:
+    """calculates std deviation and error for paper-writing purposes"""
+    for var in ["delay", "area", "lpower", "denergy"]:
         errlist = []
         for module in modules:
-            ale = (var != 'delay')
+            ale = var != "delay"
             metL = []
             modFit = fitDict[module]
             fits = modFit[ale]
@@ -643,20 +819,20 @@ def stdDevError():
                 metric = getVals(spec.tech, module, var)
                 techdict = spec._asdict()
                 norm = techdict[var]
-                metL += [m/norm for m in metric]
+                metL += [m / norm for m in metric]
 
             if ale:
-                ws = [w/normAddWidth for w in widths]
+                ws = [w / normAddWidth for w in widths]
             else:
                 ws = widths
-            ws = ws*2
+            ws = ws * 2
             mat = []
             for w in ws:
                 row = []
                 for func in funcArr:
                     row += [func(w)]
                 mat += [row]
-            
+
             y = np.array(metL, dtype=np.float)
             coefs = opt.nnls(mat, y)[0]
 
@@ -665,68 +841,84 @@ def stdDevError():
                 n = [func(w) for func in funcArr]
                 yp += [sum(np.multiply(coefs, n))]
 
-            if (var == 'delay') & (module == 'flop'):
+            if (var == "delay") & (module == "flop"):
                 pass
-            elif (module == 'mult') & ale:
+            elif (module == "mult") & ale:
                 pass
             else:
                 for i in range(len(y)):
-                    errlist += [abs(y[i]/yp[i]-1)]
+                    errlist += [abs(y[i] / yp[i] - 1)]
                 # print(module, ' ', var, ' ', np.mean(errlist[-10:]))
-            
+
         avgErr = np.mean(errlist)
         stdv = np.std(errlist)
 
-        print(var, ' ', avgErr, ' ', stdv)
+        print(var, " ", avgErr, " ", stdv)
+
 
 def makePlotDirectory():
-    ''' creates plots directory in same level as this script to store plots in
-    '''
+    """creates plots directory in same level as this script to store plots in"""
     current_directory = os.getcwd()
-    final_directory = os.path.join(current_directory, 'plots')
+    final_directory = os.path.join(current_directory, "plots")
     if not os.path.exists(final_directory):
         os.makedirs(final_directory)
     os.chdir(final_directory)
 
-    for folder in ['freqBuckshot', 'normalized', 'unnormalized']:
+    for folder in ["freqBuckshot", "normalized", "unnormalized"]:
         new_directory = os.path.join(final_directory, folder)
         if not os.path.exists(new_directory):
             os.makedirs(new_directory)
         os.chdir(new_directory)
-        if 'freq' in folder:
-            for tech in ['sky90', 'sky130', 'tsmc28', 'tsmc28psyn']:
+        if "freq" in folder:
+            for tech in ["sky90", "sky130", "tsmc28", "tsmc28psyn"]:
                 for mod in modules:
                     tech_directory = os.path.join(new_directory, tech)
                     mod_directory = os.path.join(tech_directory, mod)
                     if not os.path.exists(mod_directory):
                         os.makedirs(mod_directory)
-                os.chdir('..')
-    
+                os.chdir("..")
+
     os.chdir(current_directory)
-    
-if __name__ == '__main__':
+
+
+if __name__ == "__main__":
     ##############################
     # set up stuff, global variables
-	widths = [64, 128]
-	modules = ['adder', 'comparator']
+    widths = [8, 16, 32, 64, 128]
+    modules = ["adder", "comparator"]
 
-	normAddWidth = 32 # divisor to use with N since normalizing to add_32
+    normAddWidth = 32  # divisor to use with N since normalizing to add_32
 
-	fitDict = {'adder': ['cg', 'l', 'l'], 'mul': ['cg', 's', 's'], 'comparator': ['cg', 'l', 'l'], 'csa': ['c', 'l', 'l'], 'shifter': ['cg', 'l', 'ln'], 'flop': ['c', 'l', 'l'], 'binencoder': ['cg', 'l', 'l']}
-	fitDict.update(dict.fromkeys(['mux2', 'mux4', 'mux8'], ['cg', 'l', 'l']))
+    fitDict = {
+        "adder": ["cg", "l", "l"],
+        "mul": ["cg", "s", "s"],
+        "comparator": ["cg", "l", "l"],
+        "csa": ["c", "l", "l"],
+        "shifter": ["cg", "l", "ln"],
+        "flop": ["c", "l", "l"],
+        "binencoder": ["cg", "l", "l"],
+    }
+    fitDict.update(dict.fromkeys(["mux2", "mux4", "mux8"], ["cg", "l", "l"]))
 
-	TechSpec = namedtuple("TechSpec", "tech color shape delay area lpower denergy")
-	techSpecs = [['sky90', 'green', 'o', 43.2e-3, 1440.600027, 714.057, 0.658022690438],  ['sky130', 'red', 'o', 43.2e-3, 1440.600027, 714.057, 0.658022690438], ['tsmc28', 'blue', '^', 12.2e-3, 209.286002, 1060.0, .08153281695882594], ['tsmc28psyn', 'blue', '^', 12.2e-3, 209.286002, 1060.0, .08153281695882594]]
-	techSpecs = [TechSpec(*t) for t in techSpecs]
-	combined = TechSpec('combined fit', 'red', '_', 0, 0, 0, 0)
+    TechSpec = namedtuple("TechSpec", "tech color shape delay area lpower denergy")
+    # FO4 delay information information
+    techSpecs = [
+        # ["sky90", "green", "o", 43.2e-3, 1440.600027, 714.057, 0.658022690438],
+        # Area/Lpower/Denergy needs to be corrected here (jes)
+        ["sky130", "orange", "o", 99.5e-3, 1440.600027, 714.057, 0.658022690438],
+        # ["tsmc28", "blue", "^", 12.2e-3, 209.286002, 1060.0, 0.08153281695882594],
+        # ["tsmc28psyn", "blue", "^", 12.2e-3, 209.286002, 1060.0, 0.08153281695882594],
+    ]
+    techSpecs = [TechSpec(*t) for t in techSpecs]
+    combined = TechSpec("combined fit", "orange", "_", 0, 0, 0, 0)
     ##############################
 
     # cleanup() # run to remove garbage synth runs
-	synthsintocsv() # slow, run only when new synth runs to add to csv
-  
-	allSynths = synthsfromcsv('ppaData.csv') # your csv here!
-	bestSynths = csvOfBest('bestSynths.csv')
-	makePlotDirectory()
+    synthsintocsv()  # slow, run only when new synth runs to add to csv
+
+    allSynths = synthsfromcsv("ppaData.csv")  # your csv here!
+    bestSynths = csvOfBest("bestSynths.csv")
+    makePlotDirectory()
 
     # ### other functions
     # makeCoefTable()
@@ -734,12 +926,12 @@ if __name__ == '__main__':
     # muxPlot()
     # stdDevError()
 
-	for mod in modules:
-		for w in widths:
-			#freqPlot('sky90', mod, w)
-			freqPlot('sky130', mod, w)			
-			#freqPlot('tsmc28', mod, w)
-			#freqPlot('tsmc28psyn', mod, w)			
-			#plotPPA(mod, widths, norm=False)
-			#plotPPA(mod, aleOpt=True)
-			plt.close('all')
+    for mod in modules:
+        for w in widths:
+            # freqPlot('sky90', mod, w)
+            # freqPlot("sky130", mod, w)
+            # freqPlot('tsmc28', mod, w)
+            # freqPlot('tsmc28psyn', mod, w)
+            plotPPA(mod, norm=False)
+            # plotPPA(mod, aleOpt=True)
+            plt.close("all")

From 9dce08a743060ceae695a544f9e7b038041a33e5 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Tue, 14 Nov 2023 02:41:44 -0600
Subject: [PATCH 32/62] minor typo on ppaSynth and ppaAnalyze

---
 synthDC/ppa/ppaAnalyze.py | 31 ++++++++++++++-----------------
 synthDC/ppa/ppaSynth.py   |  2 +-
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/synthDC/ppa/ppaAnalyze.py b/synthDC/ppa/ppaAnalyze.py
index 9af15fd80..bd98e79be 100755
--- a/synthDC/ppa/ppaAnalyze.py
+++ b/synthDC/ppa/ppaAnalyze.py
@@ -82,11 +82,9 @@ def synthsintocsv():
         delay = 1000 / int(freq) - metrics[0]
         area = metrics[1]
         lpower = metrics[4]
-        # switching, internal power in mW and leakage in nW
-        tpower = metrics[2] + metrics[3] + metrics[4]*0.000001
-        # EDP (fJ/GHz)
+        tpower = (metrics[2] + metrics[3] + metrics[4]*.000001)
         denergy = (
-            (metrics[2] + metrics[3] + metrics[4]*0.000001) / int(freq)
+            (tpower) / int(freq) * 1000
         )  # (switching + internal powers)*delay, more practical units for regression coefs
 
         if "flop" in module:  # since two flops in each module
@@ -304,7 +302,6 @@ def oneMetricPlot(
             allMetrics += metric
 
         # print(f"Widths passed into regress : {allWidths}")
-        # Not sure why this works (jes) - if allWidths doesn't have data widths does
         if len(allWidths) > 0:
             xp, pred, coefs, r2 = regress(allWidths, allMetrics, fits)
             ax.plot(xp, pred, color="orange", linestyle=ls)
@@ -322,7 +319,7 @@ def oneMetricPlot(
     else:
         ylabeldic = {
             "lpower": "Leakage Power (nW)",
-            "denergy": "EDP (fJ/GHz)",
+            "denergy": "Dynamic Energy (nJ)",
             "area": "Area (sq microns)",
             "delay": "Delay (ns)",
         }
@@ -355,9 +352,9 @@ def regress(widths, var, fits="clsgn", ale=False):
     returns lists of x and y values to plot that curve and coefs for the eq with r2
     """
     if len(var) != len(widths):
-        print(
-            f"There are not enough variables to match widths. Widths : {widths} Variables Found : {var}, padding to match may affect correctness (doing it anyways)\n"
-        )
+        # print(
+        #    f"There are not enough variables to match widths. Widths : {widths} Variables Found : {var}, padding to match may affect correctness (doing it anyways)\n"
+        # )
         if len(widths) > len(var):
             while len(widths) > len(var):
                 var.append(0.0)
@@ -792,8 +789,8 @@ def muxPlot(fits="clsgn", norm=True):
                 allMetrics += metric
 
         xp, pred, coefs, r2 = regress(allInputs, allMetrics, fits)
-        ax.plot(xp, pred, color="orange", linestyle=ls)
-        fullLeg += [lines.Line2D([0], [0], color="orange", label=crit, linestyle=ls)]
+        ax.plot(xp, pred, color="red", linestyle=ls)
+        fullLeg += [lines.Line2D([0], [0], color="red", label=crit, linestyle=ls)]
 
     ax.set_ylabel("Delay (FO4)")
     ax.set_xticks(inputs)
@@ -885,7 +882,7 @@ if __name__ == "__main__":
     ##############################
     # set up stuff, global variables
     widths = [8, 16, 32, 64, 128]
-    modules = ["adder", "comparator"]
+    modules = ["adder"]
 
     normAddWidth = 32  # divisor to use with N since normalizing to add_32
 
@@ -903,14 +900,14 @@ if __name__ == "__main__":
     TechSpec = namedtuple("TechSpec", "tech color shape delay area lpower denergy")
     # FO4 delay information information
     techSpecs = [
-        # ["sky90", "green", "o", 43.2e-3, 1440.600027, 714.057, 0.658022690438],
+        #["sky90", "green", "o", 43.2e-3, 1440.600027, 714.057, 0.658022690438],
         # Area/Lpower/Denergy needs to be corrected here (jes)
         ["sky130", "orange", "o", 99.5e-3, 1440.600027, 714.057, 0.658022690438],
         # ["tsmc28", "blue", "^", 12.2e-3, 209.286002, 1060.0, 0.08153281695882594],
         # ["tsmc28psyn", "blue", "^", 12.2e-3, 209.286002, 1060.0, 0.08153281695882594],
     ]
     techSpecs = [TechSpec(*t) for t in techSpecs]
-    combined = TechSpec("combined fit", "orange", "_", 0, 0, 0, 0)
+    combined = TechSpec("combined fit", "red", "_", 0, 0, 0, 0)
     ##############################
 
     # cleanup() # run to remove garbage synth runs
@@ -928,10 +925,10 @@ if __name__ == "__main__":
 
     for mod in modules:
         for w in widths:
-            # freqPlot('sky90', mod, w)
-            # freqPlot("sky130", mod, w)
+            #freqPlot('sky90', mod, w)
+            freqPlot("sky130", mod, w)
             # freqPlot('tsmc28', mod, w)
             # freqPlot('tsmc28psyn', mod, w)
             plotPPA(mod, norm=False)
-            # plotPPA(mod, aleOpt=True)
+            plotPPA(mod, aleOpt=True)
             plt.close("all")
diff --git a/synthDC/ppa/ppaSynth.py b/synthDC/ppa/ppaSynth.py
index 07a342e26..30fe1254f 100755
--- a/synthDC/ppa/ppaSynth.py
+++ b/synthDC/ppa/ppaSynth.py
@@ -84,7 +84,7 @@ if __name__ == '__main__':
 	synthsToRun = freqSweep(module, width, tech)
 
     ##### Run a sweep for multiple modules/widths based on best delay found in existing syntheses
-	modules = ['adder', "comparator"]
+	modules = ['adder']
 	widths = [8, 16, 32, 64, 128]
 	tech = 'sky130'
 	synthsToRun = freqModuleSweep(widths, modules, tech)	

From 8ba0336c6f231eca478244f51121678786c0803d Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Tue, 14 Nov 2023 11:01:58 -0800
Subject: [PATCH 33/62] Removed unused addins, cleaned up configuration to
 support half precision on RV64gc, gate unused hazard inputs to reduce
 critical path in rv32e

---
 .gitmodules                     |  3 ---
 addins/embench-iot              |  2 +-
 addins/riscv-arch-test          |  2 +-
 addins/riscv-tests              |  1 -
 config/rv64gc/config.vh         |  2 +-
 src/hazard/hazard.sv            | 37 +++++++++++++++++++++++++--------
 src/wally/wallypipelinedcore.sv |  2 +-
 7 files changed, 32 insertions(+), 17 deletions(-)
 delete mode 160000 addins/riscv-tests

diff --git a/.gitmodules b/.gitmodules
index 9a4c7fbb8..1e56898c8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,9 +8,6 @@
 [submodule "addins/imperas-riscv-tests"]
 	path = addins/imperas-riscv-tests
 	url = https://github.com/riscv-ovpsim/imperas-riscv-tests
-[submodule "addins/riscv-tests"]
-	path = addins/riscv-tests
-	url = https://github.com/riscv-software-src/riscv-tests
 [submodule "addins/riscv-dv"]
 	path = addins/riscv-dv
 	url = https://github.com/google/riscv-dv
diff --git a/addins/embench-iot b/addins/embench-iot
index 1480febc3..4c5eb8798 160000
--- a/addins/embench-iot
+++ b/addins/embench-iot
@@ -1 +1 @@
-Subproject commit 1480febc3ace5f471baeee4b1ae0d8fea16e4762
+Subproject commit 4c5eb87983f51ca7fcf7855306877b3d1c3aabf1
diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
index 197179fdc..2c5675d7a 160000
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@@ -1 +1 @@
-Subproject commit 197179fdc9dfeeca821e848f373c897a3fdae86c
+Subproject commit 2c5675d7a58e98d47bef3a6cf5a8373397b0d0be
diff --git a/addins/riscv-tests b/addins/riscv-tests
deleted file mode 160000
index cf04274f5..000000000
--- a/addins/riscv-tests
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit cf04274f50621fd9ef9147793cca6dd1657985c7
diff --git a/config/rv64gc/config.vh b/config/rv64gc/config.vh
index 8decf60d5..564b32f5d 100644
--- a/config/rv64gc/config.vh
+++ b/config/rv64gc/config.vh
@@ -42,7 +42,7 @@ localparam ZIFENCEI_SUPPORTED = 1;
 localparam COUNTERS = 12'd32;
 localparam ZICNTR_SUPPORTED = 1;
 localparam ZIHPM_SUPPORTED = 1;
-localparam ZFH_SUPPORTED = 0;
+localparam ZFH_SUPPORTED = 1;
 localparam SSTC_SUPPORTED = 1;
 localparam ZICBOM_SUPPORTED = 1;
 localparam ZICBOZ_SUPPORTED = 1;
diff --git a/src/hazard/hazard.sv b/src/hazard/hazard.sv
index cb70605c0..028dbf61d 100644
--- a/src/hazard/hazard.sv
+++ b/src/hazard/hazard.sv
@@ -26,7 +26,7 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module hazard (
+module hazard import cvw::*;  #(parameter cvw_t P) (
   // Detect hazards
   input  logic  BPWrongE, CSRWriteFenceM, RetM, TrapM,   
   input  logic  LoadStallD, StoreStallD, MDUStallD, CSRRdStallD,
@@ -46,9 +46,28 @@ module hazard (
 
   logic WFIStallM, WFIInterruptedM;
 
+  logic ValidWfiM, ValidTrapM, ValidRetM, ValidCSRWriteFenceM, ValidCSRRdStallD;
+  logic ValidFPUStallD, ValidFCvtIntStallD, ValidFDivBusyE, ValidMDUStallD, ValidDivBusyE;
+
+  // Gate Stall/Flush sources with supported features 
+  // This is not logically necessary because the original signals are already 0 when the feature is unsupported
+  // However, synthesis does not propagate the constant 0 across modules
+  // By gating these signals, synthesis eliminates unnecessary stall/flush logic, saving about 10% cycle time for rv32e
+  // These lines of code gating with a compile-time constant generate no hardware.
+  assign ValidWfiM = wfiM & P.ZICSR_SUPPORTED;
+  assign ValidTrapM = TrapM & P.ZICSR_SUPPORTED;
+  assign ValidRetM = RetM & P.ZICSR_SUPPORTED;
+  assign ValidCSRWriteFenceM = CSRWriteFenceM & P.ZICSR_SUPPORTED;
+  assign ValidCSRRdStallD = CSRRdStallD & P.ZICSR_SUPPORTED;
+  assign ValidFPUStallD = FPUStallD & P.F_SUPPORTED;          // each Valid* gates its own same-named source signal
+  assign ValidFCvtIntStallD = FCvtIntStallD & P.F_SUPPORTED;
+  assign ValidFDivBusyE = FDivBusyE & P.F_SUPPORTED;
+  assign ValidMDUStallD = MDUStallD & P.M_SUPPORTED;
+  assign ValidDivBusyE = DivBusyE & P.M_SUPPORTED;  
+
   // WFI logic
-  assign WFIStallM = wfiM & ~IntPendingM;         // WFI waiting for an interrupt or timeout
-  assign WFIInterruptedM = wfiM & IntPendingM;    // WFI detects a pending interrupt.  Retire WFI; trap if interrupt is enabled.
+  assign WFIStallM = ValidWfiM & ~IntPendingM;         // WFI waiting for an interrupt or timeout
+  assign WFIInterruptedM = ValidWfiM & IntPendingM;    // WFI detects a pending interrupt.  Retire WFI; trap if interrupt is enabled.
   
   // stalls and flushes
   // loads: stall for one cycle if the subsequent instruction depends on the load
@@ -70,10 +89,10 @@ module hazard (
   // Branch misprediction is found in the Execute stage and must flush the next two instructions.
   //   However, an active division operation resides in the Execute stage, and when the BP incorrectly mispredicts the divide as a taken branch, the divde must still complete
   // When a WFI is interrupted and causes a trap, it flushes the rest of the pipeline but not the W stage, because the WFI needs to commit
-  assign FlushDCause = TrapM | RetM | CSRWriteFenceM | BPWrongE;
-  assign FlushECause = TrapM | RetM | CSRWriteFenceM |(BPWrongE & ~(DivBusyE | FDivBusyE));
-  assign FlushMCause = TrapM | RetM | CSRWriteFenceM;
-  assign FlushWCause = TrapM & ~WFIInterruptedM;
+  assign FlushDCause = ValidTrapM | ValidRetM | ValidCSRWriteFenceM | BPWrongE;
+  assign FlushECause = ValidTrapM | ValidRetM | ValidCSRWriteFenceM |(BPWrongE & ~(ValidDivBusyE | ValidFDivBusyE));
+  assign FlushMCause = ValidTrapM | ValidRetM | ValidCSRWriteFenceM;
+  assign FlushWCause = ValidTrapM & ~WFIInterruptedM;
 
   // Stall causes
   //  Most data depenency stalls are identified in the decode stage
@@ -84,8 +103,8 @@ module hazard (
   //    The IFU stalls the entire pipeline rather than just Fetch to avoid complications with instructions later in the pipeline causing Exceptions
   //    A trap could be asserted at the start of a IFU/LSU stall, and should flush the memory operation
   assign StallFCause = '0;
-  assign StallDCause = (LoadStallD | StoreStallD | MDUStallD | CSRRdStallD | FCvtIntStallD | FPUStallD) & ~FlushDCause;
-  assign StallECause = (DivBusyE | FDivBusyE) & ~FlushECause; 
+  assign StallDCause = (LoadStallD | StoreStallD | ValidMDUStallD | ValidCSRRdStallD | ValidFCvtIntStallD | ValidFPUStallD) & ~FlushDCause;
+  assign StallECause = (ValidDivBusyE | ValidFDivBusyE) & ~FlushECause; 
   assign StallMCause = WFIStallM & ~FlushMCause;
   // Need to gate IFUStallF when the equivalent FlushFCause = FlushDCause = 1.
   // assign StallWCause = ((IFUStallF & ~FlushDCause) | LSUStallM) & ~FlushWCause;
diff --git a/src/wally/wallypipelinedcore.sv b/src/wally/wallypipelinedcore.sv
index 00b348660..46ffcac09 100644
--- a/src/wally/wallypipelinedcore.sv
+++ b/src/wally/wallypipelinedcore.sv
@@ -264,7 +264,7 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
   end
 
   // global stall and flush control  
-  hazard  hzu(
+  hazard #(P) hzu(
     .BPWrongE, .CSRWriteFenceM, .RetM, .TrapM,
     .LoadStallD, .StoreStallD, .MDUStallD, .CSRRdStallD,
     .LSUStallM, .IFUStallF,

From 1ab7c926ea4ec7ac7a1ca9f96c1f60a3ac722b3a Mon Sep 17 00:00:00 2001
From: naichewa <nwhyteaguayo@g.hmc.edu>
Date: Tue, 14 Nov 2023 13:44:59 -0800
Subject: [PATCH 34/62] Final Code Review

---
 src/uncore/spi_apb.sv | 321 +++++++++++++++++++-----------------------
 1 file changed, 143 insertions(+), 178 deletions(-)

diff --git a/src/uncore/spi_apb.sv b/src/uncore/spi_apb.sv
index 4db435be6..b0649bf93 100644
--- a/src/uncore/spi_apb.sv
+++ b/src/uncore/spi_apb.sv
@@ -2,10 +2,14 @@
 // spi_apb.sv
 //
 // Written: Naiche Whyte-Aguayo nwhyteaguayo@g.hmc.edu 11/16/2022
-
 //
 // Purpose: SPI peripheral
-//   See FU540-C000-v1.0 for specifications
+//
+// SPI module is written to the specifications described in FU540-C000-v1.0. At the top level, it consists of synchronous 8-byte transmit and receive FIFOs connected to shift registers.
+// The FIFOs are connected to WALLY by an apb control register interface, which includes various control registers for modifying the SPI transmission along with registers for writing
+// to the transmit FIFO and reading from the receive FIFO. The transmissions themselves are then controlled by a finite state machine. The SPI module uses 4 tristate pins for SPI input/output, 
+// along with a 4 bit Chip Select signal, a clock signal, and an interrupt signal to WALLY.
+// Current limitations: Flash read sequencer mode not implemented, dual and quad mode not supported
 // 
 // A component of the Wally configurable RISC-V project.
 // 
@@ -25,19 +29,6 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-// Current limitations: Flash read sequencer mode not implemented, dual and quad modes untestable with current test plan.
-
-// Attempt to move from >= comparisons by initializing in FSM differently
-// Parameterize SynchFIFO
-// look at ReadIncrement/WriteIncrement delay necessity 
-
-/* 
-SPI module is written to the specifications described in FU540-C000-v1.0. At the top level, it is consists of synchronous 8 byte transmit and recieve FIFOs connected to shift registers. 
-The FIFOs are connected to WALLY by an apb control register interface, which includes various control registers for modifying the SPI transmission along with registers for writing
-to the transmit FIFO and reading from the receive FIFO. The transmissions themselves are then controlled by a finite state machine. The SPI module uses 4 tristate pins for SPI input/output, 
-along with a 4 bit Chip Select signal, a clock signal, and an interrupt signal to WALLY. 
-*/
-
 module spi_apb import cvw::*; #(parameter cvw_t P) (
     input  logic                PCLK, PRESETn,
     input  logic                PSEL,
@@ -54,27 +45,27 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
     output logic                SPIIntr
 );
 
-    //SPI control registers. Refer to SiFive FU540-C000 manual 
+    // SPI control registers. Refer to SiFive FU540-C000 manual 
     logic [11:0] SckDiv;
-    logic [1:0] SckMode;
-    logic [1:0] ChipSelectID;
-    logic [3:0] ChipSelectDef; 
-    logic [1:0] ChipSelectMode;
+    logic [1:0]  SckMode;
+    logic [1:0]  ChipSelectID;
+    logic [3:0]  ChipSelectDef; 
+    logic [1:0]  ChipSelectMode;
     logic [15:0] Delay0, Delay1;
-    logic [4:0] Format;
-    logic [7:0] ReceiveData;
-    logic [2:0] TransmitWatermark, ReceiveWatermark;
-    logic [8:0] TransmitData;
-    logic [1:0] InterruptEnable, InterruptPending;
+    logic [4:0]  Format;
+    logic [7:0]  ReceiveData;
+    logic [2:0]  TransmitWatermark, ReceiveWatermark;
+    logic [8:0]  TransmitData;
+    logic [1:0]  InterruptEnable, InterruptPending;
 
-    //Bus interface signals
+    // Bus interface signals
     logic [7:0] Entry;
     logic Memwrite;
     logic [31:0] Din, Dout;
-    logic TransmitInactive;                         //High when there is no transmission, used as hardware interlock signal
+    logic TransmitInactive;                         // High when there is no transmission, used as hardware interlock signal
 
-    //FIFO FSM signals
-    //Watermark signals - TransmitReadMark = ip[0], ReceiveWriteMark = ip[1]
+    // FIFO FSM signals
+    // Watermark signals - TransmitReadMark = ip[0], ReceiveWriteMark = ip[1]
     logic TransmitWriteMark, TransmitReadMark, RecieveWriteMark, RecieveReadMark; 
     logic TransmitFIFOWriteFull, TransmitFIFOReadEmpty;
     logic TransmitFIFOReadIncrement;
@@ -83,75 +74,68 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
     logic ReceiveFIFOWriteFull, ReceiveFIFOReadEmpty;
     logic [7:0] TransmitFIFOReadData, ReceiveFIFOWriteData;
     logic [2:0] TransmitWriteWatermarkLevel, ReceiveReadWatermarkLevel;
-    logic [7:0] ReceiveShiftRegEndian;              //reverses ReceiveShiftReg if Format[2] set (little endian transmission)
+    logic [7:0] ReceiveShiftRegEndian;              // Reverses ReceiveShiftReg if Format[2] set (little endian transmission)
 
-    //Transmission signals
+    // Transmission signals
     logic sck;
-    logic [11:0] DivCounter;                        //counter for sck 
-    logic SCLKenable;                               //flip flop enable high every sclk edge
+    logic [11:0] DivCounter;                        // Counter for sck 
+    logic SCLKenable;                               // Flip flop enable high every sclk edge
 
-    //Delay signals
-    logic [8:0] ImplicitDelay1;                     //Adds implicit delay to cs-sck delay counter based on phase  
-    logic [8:0] ImplicitDelay2;                     //Adds implicit delay to sck-cs delay counter based on phase 
-    logic [8:0] CS_SCKCount;                        //Counter for cs-sck delay
-    logic [8:0] SCK_CSCount;                        //Counter for sck-cs delay
-    logic [8:0] InterCSCount;                       //Counter for inter cs delay
-    logic [8:0] InterXFRCount;                      //Counter for inter xfr delay 
-    logic CS_SCKCompare;                            //Boolean comparison signal, high when CS_SCKCount >= cs-sck delay
-    logic SCK_CSCompare;                            //Boolean comparison signal, high when SCK_CSCount >= sck-cs delay
-    logic InterCSCompare;                           //Boolean comparison signal, high when InterCSCount >= inter cs delay
-    logic InterXFRCompare;                          //Boolean comparison signal, high when InterXFRCount >= inter xfr delay
-    logic ZeroDelayHoldMode;                        //High when ChipSelectMode is hold and Delay1[15:8] (InterXFR delay) is 0
+    // Delay signals
+    logic [8:0] ImplicitDelay1;                     // Adds implicit delay to cs-sck delay counter based on phase  
+    logic [8:0] ImplicitDelay2;                     // Adds implicit delay to sck-cs delay counter based on phase 
+    logic [8:0] CS_SCKCount;                        // Counter for cs-sck delay
+    logic [8:0] SCK_CSCount;                        // Counter for sck-cs delay
+    logic [8:0] InterCSCount;                       // Counter for inter cs delay
+    logic [8:0] InterXFRCount;                      // Counter for inter xfr delay 
+    logic ZeroDelayHoldMode;                        // High when ChipSelectMode is hold and Delay1[15:8] (InterXFR delay) is 0
 
-    //Frame counting signals
-    logic [3:0] FrameCount;                         //Counter for number of frames in transmission
-    logic FrameCompare;                             //Boolean comparison signal, high when FrameCount = Format[7:4]
-    logic [3:0] ReceivePenultimateFrame;            //Frame number - 1
-    logic [3:0] ReceivePenultimateFrameCount;       //Counter
-    logic ReceivePenultimateFrameBoolean;           //High when penultimate frame in transmission has been reached
+    // Frame counting signals
+    logic [3:0] FrameCount;                         // Counter for number of frames in transmission
+    logic [3:0] ReceivePenultimateFrameCount;       // Counter
+    logic ReceivePenultimateFrame;                  // High when penultimate frame in transmission has been reached
 
-    //State fsm signals
-    logic Active;                                   //High when state is either Active1 or Active0 (during transmission)
-    logic Active0;                                  //High when state is Active0
+    // State fsm signals
+    logic Active;                                   // High when state is either Active1 or Active0 (during transmission)
+    logic Active0;                                  // High when state is Active0
 
-    //Shift reg signals
-    logic ShiftEdge;                                //Determines which edge of sck to shift from TransmitShiftReg
-    logic [7:0] TransmitShiftReg;                   //Transmit shift register
-    logic [7:0] ReceiveShiftReg;                    //Receive shift register
-    logic SampleEdge;                               //Determines which edge of sck to sample from ReceiveShiftReg
-    logic [7:0] TransmitDataEndian;                 //Reverses TransmitData from txFIFO if littleendian, since TransmitReg always shifts MSB
-    logic TransmitShiftRegLoad;                     //Determines when to load TransmitShiftReg
-    logic ReceiveShiftFull;                         //High when receive shift register is full
-    logic TransmitShiftEmpty;                       //High when transmit shift register is empty
-    logic ShiftIn;                                  //Determines whether to shift from SPIIn or SPIOut (if SPI_LOOPBACK_TEST)  
-    logic [3:0] LeftShiftAmount;                    //Determines left shift amount to left-align data when little endian              
-    logic [7:0] ASR;                                //AlignedReceiveShiftReg    
+    // Shift reg signals
+    logic ShiftEdge;                                // Determines which edge of sck to shift from TransmitShiftReg
+    logic [7:0] TransmitShiftReg;                   // Transmit shift register
+    logic [7:0] ReceiveShiftReg;                    // Receive shift register
+    logic SampleEdge;                               // Determines which edge of sck to sample from ReceiveShiftReg
+    logic [7:0] TransmitDataEndian;                 // Reverses TransmitData from txFIFO if littleendian, since TransmitReg always shifts MSB
+    logic TransmitShiftRegLoad;                     // Determines when to load TransmitShiftReg
+    logic ReceiveShiftFull;                         // High when receive shift register is full
+    logic TransmitShiftEmpty;                       // High when transmit shift register is empty
+    logic ShiftIn;                                  // Determines whether to shift from SPIIn or SPIOut (if SPI_LOOPBACK_TEST)  
+    logic [3:0] LeftShiftAmount;                    // Determines left shift amount to left-align data when little endian              
+    logic [7:0] ASR;                                // AlignedReceiveShiftReg    
 
-    //CS signals
-    logic [3:0] ChipSelectAuto;                     //Assigns ChipSelect value to selected CS signal based on CS ID
-    logic [3:0] ChipSelectInternal;                 //Defines what each ChipSelect signal should be based on transmission status and ChipSelectDef
-    logic DelayMode;                                //Determines where to place implicit half cycle delay based on sck phase for CS assertion
+    // CS signals
+    logic [3:0] ChipSelectAuto;                     // Assigns ChipSelect value to selected CS signal based on CS ID
+    logic [3:0] ChipSelectInternal;                 // Defines what each ChipSelect signal should be based on transmission status and ChipSelectDef
+    logic DelayMode;                                // Determines where to place implicit half cycle delay based on sck phase for CS assertion
 
-    //Miscellaneous signals delayed/early by 1 PCLK cycle
-    logic ReceiveShiftFullDelay;                    //Delays ReceiveShiftFull signal by 1 PCLK cycle
-    logic TransmitFIFOWriteIncrementDelay;          //TransmitFIFOWriteIncrement delayed by 1 PCLK cycle
-    logic ReceiveShiftFullDelayPCLK;                //ReceiveShiftFull delayed by 1 PCLK cycle
+    // Miscellaneous signals delayed/early by 1 PCLK cycle
+    logic ReceiveShiftFullDelay;                    // Delays ReceiveShiftFull signal by 1 PCLK cycle
+    logic ReceiveShiftFullDelayPCLK;                // ReceiveShiftFull delayed by 1 PCLK cycle
     logic TransmitFIFOReadEmptyDelay;
-    logic SCLKenableEarly;                          //SCLKenable 1 PCLK cycle early, needed for on time register changes when ChipSelectMode is hold and Delay1[15:8] (InterXFR delay) is 0
+    logic SCLKenableEarly;                          // SCLKenable 1 PCLK cycle early, needed for on time register changes when ChipSelectMode is hold and Delay1[15:8] (InterXFR delay) is 0
 
-    //APB access
-    assign Entry = {PADDR[7:2],2'b00};  // 32-bit word-aligned accesses
-    assign Memwrite = PWRITE & PENABLE & PSEL;  // only write in access phase
-    assign PREADY = TransmitInactive; // tie PREADY to transmission for hardware interlock
+    // APB access
+    assign Entry = {PADDR[7:2],2'b00};  //  32-bit word-aligned accesses
+    assign Memwrite = PWRITE & PENABLE & PSEL;  // Only write in access phase
+    assign PREADY = TransmitInactive; // Tie PREADY to transmission for hardware interlock
 
-    //Account for subword read/write circuitry
+    // Account for subword read/write circuitry
     // -- Note SPI registers are 32 bits no matter what; access them with LW SW.
    
     assign Din = PWDATA[31:0]; 
     if (P.XLEN == 64) assign PRDATA = {Dout, Dout}; 
     else              assign PRDATA = Dout;  
 
-    //Register access  
+    // Register access  
     always_ff@(posedge PCLK, negedge PRESETn)
         if (~PRESETn) begin 
             SckDiv <= #1 12'd3;
@@ -167,13 +151,12 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
             ReceiveWatermark <= #1 3'b0;
             InterruptEnable <= #1 2'b0;
             InterruptPending <= #1 2'b0;
-        end else begin //writes
-            //According to FU540 spec: Once interrupt is pending, it will remain set until number 
-            //of entries in tx/rx fifo is strictly more/less than tx/rxmark
+        end else begin // writes
+            
 
             /* verilator lint_off CASEINCOMPLETE */
             if (Memwrite & TransmitInactive)
-                case(Entry) //flop to sample inputs
+                case(Entry) // flop to sample inputs
                     8'h00: SckDiv <= Din[11:0];
                     8'h04: SckMode <= Din[1:0];
                     8'h10: ChipSelectID <= Din[1:0];
@@ -188,18 +171,21 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                     8'h70: InterruptEnable <= Din[1:0];
                 endcase
             /* verilator lint_off CASEINCOMPLETE */
-            //interrupt clearance
+
+            // According to FU540 spec: Once interrupt is pending, it will remain set until number 
+            // of entries in tx/rx fifo is strictly more/less than tx/rxmark
             InterruptPending[0] <= TransmitReadMark;
             InterruptPending[1] <= RecieveWriteMark;  
-            case(Entry) // flop to sample inputs
+
+            case(Entry) // Flop to sample inputs
                 8'h00: Dout <= #1 {20'b0, SckDiv};
                 8'h04: Dout <= #1 {30'b0, SckMode};
                 8'h10: Dout <= #1 {30'b0, ChipSelectID};
                 8'h14: Dout <= #1 {28'b0, ChipSelectDef};
                 8'h18: Dout <= #1 {30'b0, ChipSelectMode};
-                8'h28: Dout <= {8'b0, Delay0[15:8], 8'b0, Delay0[7:0]};
-                8'h2C: Dout <= {8'b0, Delay1[15:8], 8'b0, Delay1[7:0]};
-                8'h40: Dout <= {12'b0, Format[4:1], 13'b0, Format[0], 2'b0};
+                8'h28: Dout <= #1 {8'b0, Delay0[15:8], 8'b0, Delay0[7:0]};
+                8'h2C: Dout <= #1 {8'b0, Delay1[15:8], 8'b0, Delay1[7:0]};
+                8'h40: Dout <= #1 {12'b0, Format[4:1], 13'b0, Format[0], 2'b0};
                 8'h48: Dout <= #1 {23'b0, TransmitFIFOWriteFull, 8'b0};
                 8'h4C: Dout <= #1 {23'b0, ReceiveFIFOReadEmpty, ReceiveData[7:0]};
                 8'h50: Dout <= #1 {29'b0, TransmitWatermark};
@@ -210,8 +196,9 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
             endcase
         end
 
-    //SPI enable generation, where SCLK = PCLK/(2*(SckDiv + 1))
-    //Generates a high signal at the rising and falling edge of SCLK by counting from 0 to SckDiv
+    // SPI enable generation, where SCLK = PCLK/(2*(SckDiv + 1))
+    // Asserts SCLKenable at the rising and falling edge of SCLK by counting from 0 to SckDiv
+    // Active at 2x SCLK frequency to account for implicit half cycle delays and actions on both clock edges depending on phase
     assign SCLKenable = (DivCounter == SckDiv);
     assign SCLKenableEarly = ((DivCounter + 12'b1) == SckDiv);
     always_ff @(posedge PCLK, negedge PRESETn)
@@ -219,44 +206,38 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
         else if (SCLKenable) DivCounter <= 0;
         else DivCounter <= DivCounter + 12'b1;
 
-    //Boolean logic that tracks frame progression
-    assign FrameCompare = (FrameCount < Format[4:1]);    
-    assign ReceivePenultimateFrameBoolean = ((FrameCount + 4'b0001) == Format[4:1]);
+    // Asserts when transmission is one frame before complete
+    assign ReceivePenultimateFrame = ((FrameCount + 4'b0001) == Format[4:1]);
 
-    //Computing delays
+    // Computing delays
     // When sckmode.pha = 0, an extra half-period delay is implicit in the cs-sck delay, and vice-versa for sck-cs
     assign ImplicitDelay1 = SckMode[0] ? 9'b0 : 9'b1;
     assign ImplicitDelay2 = SckMode[0] ? 9'b1 : 9'b0;
 
-    assign CS_SCKCompare = CS_SCKCount >= (({Delay0[7:0], 1'b0}) + ImplicitDelay1);
-    assign SCK_CSCompare = SCK_CSCount >= (({Delay0[15:8], 1'b0}) + ImplicitDelay2);
-    assign InterCSCompare = (InterCSCount >= ({Delay1[7:0],1'b0}));
-    assign InterXFRCompare = (InterXFRCount >= ({Delay1[15:8], 1'b0}));
+    // Calculate when tx/rx shift registers are full/empty
+    TransmitShiftFSM TransmitShiftFSM(PCLK, PRESETn, TransmitFIFOReadEmpty, ReceivePenultimateFrame, Active0, TransmitShiftEmpty);
+    ReceiveShiftFSM ReceiveShiftFSM(PCLK, PRESETn, SCLKenable, ReceivePenultimateFrame, SampleEdge, SckMode[0], ReceiveShiftFull);
 
-    //Calculate when tx/rx shift registers are full/empty
-    TransmitShiftFSM TransmitShiftFSM_1 (PCLK, PRESETn, TransmitFIFOReadEmpty, ReceivePenultimateFrameBoolean, Active0, TransmitShiftEmpty);
-    ReceiveShiftFSM ReceiveShiftFSM_1 (PCLK, PRESETn, SCLKenable, ReceivePenultimateFrameBoolean, SampleEdge, SckMode[0], ReceiveShiftFull);
-
-    //Calculate tx/rx fifo write and recieve increment signals 
-    assign TransmitFIFOWriteIncrement = (Memwrite & (Entry == 8'h48) & ~TransmitFIFOWriteFull & TransmitInactive);
+    // Calculate tx/rx fifo write and receive increment signals 
 
     always_ff @(posedge PCLK, negedge PRESETn)
-        if (~PRESETn) TransmitFIFOWriteIncrementDelay <= 0;
-        else TransmitFIFOWriteIncrementDelay <= TransmitFIFOWriteIncrement;
+        if (~PRESETn) TransmitFIFOWriteIncrement <= 0;
+        else TransmitFIFOWriteIncrement <= (Memwrite & (Entry == 8'h48) & ~TransmitFIFOWriteFull & TransmitInactive);
 
     always_ff @(posedge PCLK, negedge PRESETn)
         if (~PRESETn) ReceiveFIFOReadIncrement <= 0;
         else ReceiveFIFOReadIncrement <= ((Entry == 8'h4C) & ~ReceiveFIFOReadEmpty & PSEL & ~ReceiveFIFOReadIncrement);
     
-    //Tx/Rx FIFOs
-    SynchFIFO #(3,8) txFIFO(PCLK, 1'b1, SCLKenable, PRESETn, TransmitFIFOWriteIncrementDelay, TransmitShiftEmpty, TransmitData[7:0], TransmitWriteWatermarkLevel, TransmitWatermark[2:0], TransmitFIFOReadData[7:0], TransmitFIFOWriteFull, TransmitFIFOReadEmpty, TransmitWriteMark, TransmitReadMark);
-    SynchFIFO #(3,8) rxFIFO(PCLK, SCLKenable, 1'b1, PRESETn, ReceiveShiftFullDelay, ReceiveFIFOReadIncrement, ReceiveShiftRegEndian, ReceiveWatermark[2:0], ReceiveReadWatermarkLevel, ReceiveData[7:0], ReceiveFIFOWriteFull, ReceiveFIFOReadEmpty, RecieveWriteMark, RecieveReadMark);
+    // Tx/Rx FIFOs
+    SynchFIFO #(3,8) txFIFO(PCLK, 1'b1, SCLKenable, PRESETn, TransmitFIFOWriteIncrement, TransmitShiftEmpty, TransmitData[7:0], TransmitWriteWatermarkLevel, TransmitWatermark[2:0],
+                            TransmitFIFOReadData[7:0], TransmitFIFOWriteFull, TransmitFIFOReadEmpty, TransmitWriteMark, TransmitReadMark);
+    SynchFIFO #(3,8) rxFIFO(PCLK, SCLKenable, 1'b1, PRESETn, ReceiveShiftFullDelay, ReceiveFIFOReadIncrement, ReceiveShiftRegEndian, ReceiveWatermark[2:0], ReceiveReadWatermarkLevel, 
+                            ReceiveData[7:0], ReceiveFIFOWriteFull, ReceiveFIFOReadEmpty, RecieveWriteMark, RecieveReadMark);
 
     always_ff @(posedge PCLK, negedge PRESETn)
         if (~PRESETn) TransmitFIFOReadEmptyDelay <= 1;
         else  if (SCLKenable) TransmitFIFOReadEmptyDelay <= TransmitFIFOReadEmpty;
 
-    
     always_ff @(posedge PCLK, negedge PRESETn)
         if (~PRESETn) ReceiveShiftFullDelay <= 0;
         else if (SCLKenable) ReceiveShiftFullDelay <= ReceiveShiftFull;
@@ -266,16 +247,16 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
 
     assign TransmitShiftRegLoad = ~TransmitShiftEmpty & ~Active | (((ChipSelectMode == 2'b10) & ~|(Delay1[15:8])) & ((ReceiveShiftFullDelay | ReceiveShiftFull) & ~SampleEdge & ~TransmitFIFOReadEmpty));
 
-    //Main FSM which controls SPI transmission
+    // Main FSM which controls SPI transmission
     typedef enum logic [2:0] {CS_INACTIVE, DELAY_0, ACTIVE_0, ACTIVE_1, DELAY_1,INTER_CS, INTER_XFR} statetype;
     statetype state;
 
     always_ff @(posedge PCLK, negedge PRESETn)
-        if (~PRESETn) begin state <= CS_INACTIVE;
+        if (~PRESETn) begin 
+                        state <= CS_INACTIVE;
                         FrameCount <= 4'b0;                      
-
-        /* verilator lint_off CASEINCOMPLETE */
         end else if (SCLKenable) begin
+            /* verilator lint_off CASEINCOMPLETE */
             case (state)
                 CS_INACTIVE: begin
                         CS_SCKCount <= 9'b1;
@@ -288,7 +269,7 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                         end
                 DELAY_0: begin
                         CS_SCKCount <= CS_SCKCount + 9'b1;
-                        if (CS_SCKCompare) state <= ACTIVE_0;
+                        if (CS_SCKCount >= (({Delay0[7:0], 1'b0}) + ImplicitDelay1)) state <= ACTIVE_0;
                         end
                 ACTIVE_0: begin 
                         FrameCount <= FrameCount + 4'b1;
@@ -296,7 +277,7 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                         end
                 ACTIVE_1: begin
                         InterXFRCount <= 9'b1;
-                        if (FrameCompare) state <= ACTIVE_0;
+                        if (FrameCount < Format[4:1]) state <= ACTIVE_0;
                         else if ((ChipSelectMode[1:0] == 2'b10) & ~|(Delay1[15:8]) & (~TransmitFIFOReadEmpty)) begin
                             state <= ACTIVE_0;
                             CS_SCKCount <= 9'b1;
@@ -310,11 +291,11 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                         end
                 DELAY_1: begin
                         SCK_CSCount <= SCK_CSCount + 9'b1;
-                        if (SCK_CSCompare) state <= INTER_CS;
+                        if (SCK_CSCount >= (({Delay0[15:8], 1'b0}) + ImplicitDelay2)) state <= INTER_CS;
                         end
                 INTER_CS: begin
                         InterCSCount <= InterCSCount + 9'b1;
-                        if (InterCSCompare ) state <= CS_INACTIVE;
+                        if (InterCSCount >= ({Delay1[7:0],1'b0})) state <= CS_INACTIVE;
                         end
                 INTER_XFR: begin
                         CS_SCKCount <= 9'b1;
@@ -322,13 +303,14 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                         FrameCount <= 4'b0;
                         InterCSCount <= 9'b10;
                         InterXFRCount <= InterXFRCount + 9'b1;
-                        if (InterXFRCompare & ~TransmitFIFOReadEmptyDelay) state <= ACTIVE_0;
+                        if ((InterXFRCount >= ({Delay1[15:8], 1'b0})) & ~TransmitFIFOReadEmptyDelay) state <= ACTIVE_0;
                         else if (~|ChipSelectMode[1:0]) state <= CS_INACTIVE;
                         end
             endcase
+            /* verilator lint_on CASEINCOMPLETE */
         end
 
-            /* verilator lint_off CASEINCOMPLETE */
+            
 
     assign DelayMode = SckMode[0] ? (state == DELAY_1) : (state == ACTIVE_1 & ReceiveShiftFull);
     assign ChipSelectInternal = (state == CS_INACTIVE | state == INTER_CS | DelayMode & ~|(Delay0[15:8])) ? ChipSelectDef : ~ChipSelectDef;
@@ -339,7 +321,7 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
     assign TransmitInactive = ((state == INTER_CS) | (state == CS_INACTIVE) | (state == INTER_XFR) | (ReceiveShiftFullDelayPCLK & ZeroDelayHoldMode));
     assign Active0 = (state == ACTIVE_0);
 
-    //Signal tracks which edge of sck to shift data
+    // Signal tracks which edge of sck to shift data
     always_comb
         case(SckMode[1:0])
             2'b00: ShiftEdge = ~sck & SCLKenable;
@@ -349,36 +331,36 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
             default: ShiftEdge = sck & SCLKenable;
         endcase
 
-    //Transmit shift register
-    assign TransmitDataEndian =  Format[0] ? {TransmitFIFOReadData[0], TransmitFIFOReadData[1], TransmitFIFOReadData[2], TransmitFIFOReadData[3], TransmitFIFOReadData[4], TransmitFIFOReadData[5], TransmitFIFOReadData[6], TransmitFIFOReadData[7]} : TransmitFIFOReadData[7:0];
+    // Transmit shift register
+    assign TransmitDataEndian = Format[0] ? {TransmitFIFOReadData[0], TransmitFIFOReadData[1], TransmitFIFOReadData[2], TransmitFIFOReadData[3], TransmitFIFOReadData[4], TransmitFIFOReadData[5], TransmitFIFOReadData[6], TransmitFIFOReadData[7]} : TransmitFIFOReadData[7:0];
     always_ff @(posedge PCLK, negedge PRESETn)
         if(~PRESETn)                        TransmitShiftReg <= 8'b0; 
         else if (TransmitShiftRegLoad)      TransmitShiftReg <= TransmitDataEndian;
-        else if (ShiftEdge & Active)   TransmitShiftReg <= {TransmitShiftReg[6:0], 1'b0};
+        else if (ShiftEdge & Active)        TransmitShiftReg <= {TransmitShiftReg[6:0], 1'b0};
     
     assign SPIOut = TransmitShiftReg[7];
 
-    //If in loopback mode, receive shift register is connected directly to module's output pins. Else, connected to SPIIn
-    //There are no setup/hold time issues because transmit shift register and receive shift register always shift/sample on opposite edges
+    // If in loopback mode, receive shift register is connected directly to module's output pins. Else, connected to SPIIn
+    // There are no setup/hold time issues because transmit shift register and receive shift register always shift/sample on opposite edges
     assign ShiftIn = P.SPI_LOOPBACK_TEST ? SPIOut : SPIIn;
 
-    //Receive shift register
+    // Receive shift register
     always_ff @(posedge PCLK, negedge PRESETn)
         if(~PRESETn)  ReceiveShiftReg <= 8'b0;
         else if (SampleEdge & SCLKenable) begin
-            if (~Active) ReceiveShiftReg <= 8'b0;
-            else ReceiveShiftReg <= {ReceiveShiftReg[6:0], ShiftIn};
+            if (~Active)    ReceiveShiftReg <= 8'b0;
+            else            ReceiveShiftReg <= {ReceiveShiftReg[6:0], ShiftIn};
         end
 
-    //Aligns received data and reverses if little-endian
+    // Aligns received data and reverses if little-endian
     assign LeftShiftAmount = 4'h8 - Format[4:1];
     assign ASR = ReceiveShiftReg << LeftShiftAmount[2:0];
     assign ReceiveShiftRegEndian = Format[0] ? {ASR[0], ASR[1], ASR[2], ASR[3], ASR[4], ASR[5], ASR[6], ASR[7]} : ASR[7:0];
 
-    //Interrupt logic: raise interrupt if any enabled interrupts are pending
+    // Interrupt logic: raise interrupt if any enabled interrupts are pending
     assign SPIIntr = |(InterruptPending & InterruptEnable);
 
-    //Chip select logic
+    // Chip select logic
     always_comb
         case(ChipSelectID[1:0])
             2'b00: ChipSelectAuto = {ChipSelectDef[3], ChipSelectDef[2], ChipSelectDef[1], ChipSelectInternal[0]};
@@ -390,14 +372,14 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
     assign SPICS = ChipSelectMode[0] ? ChipSelectDef : ChipSelectAuto;
 endmodule
 
-module SynchFIFO #(parameter M =3 , N= 8)(
-    input logic PCLK, wen, ren, PRESETn,
-    input logic winc,rinc,
-    input logic [N-1:0] wdata,
-    input logic [M-1:0] wwatermarklevel, rwatermarklevel,
+module SynchFIFO #(parameter M=3, N=8)(                 // 2^M entries of N bits each
+    input  logic         PCLK, wen, ren, PRESETn,
+    input  logic         winc, rinc,
+    input  logic [N-1:0] wdata,
+    input  logic [M-1:0] wwatermarklevel, rwatermarklevel,
     output logic [N-1:0] rdata,
-    output logic wfull, rempty,
-    output logic wwatermark, rwatermark);
+    output logic         wfull, rempty,
+    output logic         wwatermark, rwatermark);
 
     /* Pointer FIFO using design elements from "Simulation and Synthesis Techniques
        for Asynchronous FIFO Design" by Clifford E. Cummings. Namely, M bit read and write pointers
@@ -409,8 +391,6 @@ module SynchFIFO #(parameter M =3 , N= 8)(
     logic [N-1:0] mem[2**M];
     logic [M:0] rptr, wptr;
     logic [M:0] rptrnext, wptrnext;
-    logic rempty_val;
-    logic wfull_val;
     logic [M-1:0] raddr;
     logic [M-1:0] waddr;
  
@@ -428,53 +408,43 @@ module SynchFIFO #(parameter M =3 , N= 8)(
         end
         else begin 
             if (wen) begin
-                wfull <= wfull_val;
+                wfull <= ({~wptrnext[M], wptrnext[M-1:0]} == rptr);
                 wptr  <= wptrnext;
             end
             if (ren) begin 
                 rptr <= rptrnext;
-                rempty <= rempty_val;
+                rempty <= (wptr == rptrnext);
             end
         end 
-
+    
     assign raddr = rptr[M-1:0];
-    assign rptrnext = rptr + {3'b0, (rinc & ~rempty)};      
-    assign rempty_val = (wptr == rptrnext);
+    assign rptrnext = rptr + {{(M){1'b0}}, (rinc & ~rempty)};      
     assign rwatermark = ((waddr - raddr) < rwatermarklevel) & ~wfull;
     assign waddr = wptr[M-1:0];
     assign wwatermark = ((waddr - raddr) > wwatermarklevel) | wfull;
-    assign wptrnext = wptr + {3'b0, (winc & ~wfull)};
-    assign wfull_val = ({~wptrnext[M], wptrnext[M-1:0]} == rptr);
+    assign wptrnext = wptr + {{(M){1'b0}}, (winc & ~wfull)};
 endmodule
 
 module TransmitShiftFSM(
-    input logic PCLK, PRESETn,
-    input logic TransmitFIFOReadEmpty, ReceivePenultimateFrameBoolean, Active0,
+    input  logic PCLK, PRESETn,
+    input  logic TransmitFIFOReadEmpty, ReceivePenultimateFrame, Active0,
     output logic TransmitShiftEmpty);
 
-    typedef enum logic [1:0] {TransmitShiftEmptyState, TransmitShiftHoldState, TransmitShiftNotEmptyState} statetype;
-    statetype TransmitState, TransmitNextState;
     always_ff @(posedge PCLK, negedge PRESETn)
-        if (~PRESETn) TransmitState <= TransmitShiftEmptyState;
-        else          TransmitState <= TransmitNextState;
+        if (~PRESETn) TransmitShiftEmpty <= 1;
+        else if (TransmitShiftEmpty) begin        
+            if (TransmitFIFOReadEmpty | (~TransmitFIFOReadEmpty & (ReceivePenultimateFrame & Active0))) TransmitShiftEmpty <= 1;
+            else if (~TransmitFIFOReadEmpty) TransmitShiftEmpty <= 0;
+        end else begin
+            if (ReceivePenultimateFrame & Active0) TransmitShiftEmpty <= 1;
+            else TransmitShiftEmpty <= 0;
+        end
 
-        always_comb
-            case(TransmitState)
-                TransmitShiftEmptyState: begin
-                    if (TransmitFIFOReadEmpty | (~TransmitFIFOReadEmpty & (ReceivePenultimateFrameBoolean & Active0))) TransmitNextState = TransmitShiftEmptyState;
-                    else if (~TransmitFIFOReadEmpty) TransmitNextState = TransmitShiftNotEmptyState;
-                end
-                TransmitShiftNotEmptyState: begin
-                    if (ReceivePenultimateFrameBoolean & Active0) TransmitNextState = TransmitShiftEmptyState;
-                    else TransmitNextState = TransmitShiftNotEmptyState;
-                end
-            endcase
-        assign TransmitShiftEmpty = (TransmitNextState == TransmitShiftEmptyState);
 endmodule
 
 module ReceiveShiftFSM(
-    input logic PCLK, PRESETn, SCLKenable,
-    input logic ReceivePenultimateFrameBoolean, SampleEdge, SckMode,
+    input  logic PCLK, PRESETn, SCLKenable,
+    input  logic ReceivePenultimateFrame, SampleEdge, SckMode,
     output logic ReceiveShiftFull
 );
     typedef enum logic [1:0] {ReceiveShiftFullState, ReceiveShiftNotFullState, ReceiveShiftDelayState} statetype;
@@ -484,17 +454,12 @@ module ReceiveShiftFSM(
         else if (SCLKenable) begin
             case (ReceiveState)
                 ReceiveShiftFullState: ReceiveState <= ReceiveShiftNotFullState;
-                ReceiveShiftNotFullState: if (ReceivePenultimateFrameBoolean & (SampleEdge)) ReceiveState <= ReceiveShiftDelayState;
+                ReceiveShiftNotFullState: if (ReceivePenultimateFrame & (SampleEdge)) ReceiveState <= ReceiveShiftDelayState;
                                           else ReceiveState <= ReceiveShiftNotFullState;
-                ReceiveShiftDelayState: ReceiveState <= ReceiveShiftFullState;
+                ReceiveShiftDelayState:   ReceiveState <= ReceiveShiftFullState;
             endcase
         end
 
-        assign ReceiveShiftFull = SckMode ? (ReceiveState == ReceiveShiftFullState) : (ReceiveState == ReceiveShiftDelayState);
+    assign ReceiveShiftFull = SckMode ? (ReceiveState == ReceiveShiftFullState) : (ReceiveState == ReceiveShiftDelayState);
 endmodule
 
-
-
-
-
-

From 5e9157244b841b912fd01999b350a32b3acd29a9 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Tue, 14 Nov 2023 15:18:16 -0800
Subject: [PATCH 35/62] Restored Zfh to 0 for rv64gc because it breaks
 floating-point tests

---
 config/rv64gc/config.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/rv64gc/config.vh b/config/rv64gc/config.vh
index 564b32f5d..8decf60d5 100644
--- a/config/rv64gc/config.vh
+++ b/config/rv64gc/config.vh
@@ -42,7 +42,7 @@ localparam ZIFENCEI_SUPPORTED = 1;
 localparam COUNTERS = 12'd32;
 localparam ZICNTR_SUPPORTED = 1;
 localparam ZIHPM_SUPPORTED = 1;
-localparam ZFH_SUPPORTED = 1;
+localparam ZFH_SUPPORTED = 0;
 localparam SSTC_SUPPORTED = 1;
 localparam ZICBOM_SUPPORTED = 1;
 localparam ZICBOZ_SUPPORTED = 1;

From 18c29dd7d0d309b56b72fd083f2077fb0de61f89 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 05:46:38 -0800
Subject: [PATCH 36/62] Removed riscv-arch-test submodule that appears
 corrupted

---
 .gitmodules            | 4 ----
 addins/riscv-arch-test | 1 -
 2 files changed, 5 deletions(-)
 delete mode 160000 addins/riscv-arch-test

diff --git a/.gitmodules b/.gitmodules
index 1e56898c8..dfb5fcf20 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,10 +1,6 @@
 [submodule "sky130/sky130_osu_sc_t12"]
 	path = sky130/sky130_osu_sc_t12
 	url = https://foss-eda-tools.googlesource.com/skywater-pdk/libs/sky130_osu_sc_t12/
-[submodule "addins/riscv-arch-test"]
-	path = addins/riscv-arch-test
-	url = https://github.com/riscv-non-isa/riscv-arch-test
-	ignore = dirty
 [submodule "addins/imperas-riscv-tests"]
 	path = addins/imperas-riscv-tests
 	url = https://github.com/riscv-ovpsim/imperas-riscv-tests
diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
deleted file mode 160000
index 2c5675d7a..000000000
--- a/addins/riscv-arch-test
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 2c5675d7a58e98d47bef3a6cf5a8373397b0d0be

From 90cf128349408643056907f63cfc5f29a8089784 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 05:48:33 -0800
Subject: [PATCH 37/62] Added back riscv-arch-test fresh

---
 .gitmodules            | 3 +++
 addins/riscv-arch-test | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 addins/riscv-arch-test

diff --git a/.gitmodules b/.gitmodules
index dfb5fcf20..361441a18 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -26,3 +26,6 @@
 [submodule "addins/vivado-risc-v"]
 	path = addins/vivado-risc-v
 	url = https://github.com/eugene-tarassov/vivado-risc-v.git
+[submodule "addins/riscv-arch-test"]
+	path = addins/riscv-arch-test
+	url = https://github.com/riscv-non-isa/riscv-arch-test
diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
new file mode 160000
index 000000000..4eea0a0f0
--- /dev/null
+++ b/addins/riscv-arch-test
@@ -0,0 +1 @@
+Subproject commit 4eea0a0f0e21f2613a114e45a5ad738e721c4044

From 1c4b3e37b1e16855568f6873266980953057fbd7 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 06:05:55 -0800
Subject: [PATCH 38/62] Removed riscv-arch-test submodule that was corrupted

---
 .gitmodules            | 3 ---
 addins/riscv-arch-test | 1 -
 2 files changed, 4 deletions(-)
 delete mode 160000 addins/riscv-arch-test

diff --git a/.gitmodules b/.gitmodules
index 361441a18..dfb5fcf20 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -26,6 +26,3 @@
 [submodule "addins/vivado-risc-v"]
 	path = addins/vivado-risc-v
 	url = https://github.com/eugene-tarassov/vivado-risc-v.git
-[submodule "addins/riscv-arch-test"]
-	path = addins/riscv-arch-test
-	url = https://github.com/riscv-non-isa/riscv-arch-test
diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
deleted file mode 160000
index 4eea0a0f0..000000000
--- a/addins/riscv-arch-test
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 4eea0a0f0e21f2613a114e45a5ad738e721c4044

From 20afaa558a2630042401250e6a5f8dc72b5c4259 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 06:07:57 -0800
Subject: [PATCH 39/62] Added back in riscv-arch-test

---
 .gitmodules            | 3 +++
 addins/riscv-arch-test | 1 +
 2 files changed, 4 insertions(+)
 create mode 160000 addins/riscv-arch-test

diff --git a/.gitmodules b/.gitmodules
index dfb5fcf20..361441a18 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -26,3 +26,6 @@
 [submodule "addins/vivado-risc-v"]
 	path = addins/vivado-risc-v
 	url = https://github.com/eugene-tarassov/vivado-risc-v.git
+[submodule "addins/riscv-arch-test"]
+	path = addins/riscv-arch-test
+	url = https://github.com/riscv-non-isa/riscv-arch-test
diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
new file mode 160000
index 000000000..4eea0a0f0
--- /dev/null
+++ b/addins/riscv-arch-test
@@ -0,0 +1 @@
+Subproject commit 4eea0a0f0e21f2613a114e45a5ad738e721c4044

From 79d6fe8c936466d8c1b684c25c670c35b4d6ef15 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Wed, 15 Nov 2023 08:45:25 -0600
Subject: [PATCH 40/62] Add wrapper passing automatically for individual
 designs vs. Wally

---
 synthDC/Makefile        | 40 ++++++++++++++++++++++++++++++++++------
 synthDC/ppa/ppaSynth.py |  4 ++--
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/synthDC/Makefile b/synthDC/Makefile
index d43a36b50..3e344e8d2 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -1,7 +1,28 @@
-#  
-# Makefile for synthesis
-# Shreya Sanghai (ssanghai@hmc.edu) 2/28/2022
-# Madeleine Masser-Frye (mmasserfrye@hmc.edu) 1/27/2023
+#####################
+# Makefile
+#
+# Written: ssanghai@hmc.edu, mmasserfrye@hmc.edu, james.stine@okstate.edu 15 November 2023
+#
+# Purpose: Makefile to be used for synthesis using DC
+#
+# A component of the Wally configurable RISC-V project.
+#
+# Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+#
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+#
+# Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+# except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+# may obtain a copy of the License at
+#
+# https://solderpad.org/licenses/SHL-2.1/
+#
+# Unless required by applicable law or agreed to in writing, any work distributed under the 
+# License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+# either express or implied. See the License for the specific language governing permissions 
+# and limitations under the License.
+################################################
+
 NAME := synth
 # defaults
 export DESIGN ?= wallypipelinedcore
@@ -21,11 +42,18 @@ export MAXOPT ?= 0
 export DRIVE ?= FLOP
 export USESRAM ?= 0
 export WIDTH ?= 32
+export WRAPPER ?= 1
+export SAIFPOWER ?= 0
 
 time := $(shell date +%F-%H-%M)
 hash := $(shell git rev-parse --short HEAD)
-export OUTPUTDIR := runs/$(DESIGN)_$(WIDTH)_$(CONFIG)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
-export SAIFPOWER ?= 0
+# Pick the run-directory naming scheme: WRAPPER=0 (passed by ppa/ppaSynth.py) keeps WIDTH
+# in the name, otherwise MOD is used instead. TODO: clean up with better parsing/lexing
+ifeq ($(WRAPPER), 0)
+  export OUTPUTDIR := runs/$(DESIGN)_$(WIDTH)_$(CONFIG)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
+else
+  export OUTPUTDIR := runs/$(DESIGN)_$(CONFIG)_$(MOD)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
+endif
 
 OLDCONFIGDIR ?= ${WALLY}/config
 export CONFIGDIR ?= $(OUTPUTDIR)/config
diff --git a/synthDC/ppa/ppaSynth.py b/synthDC/ppa/ppaSynth.py
index 30fe1254f..0c4744c26 100755
--- a/synthDC/ppa/ppaSynth.py
+++ b/synthDC/ppa/ppaSynth.py
@@ -11,7 +11,7 @@ from multiprocessing import Pool
 from ppaAnalyze import synthsfromcsv
 
 def runCommand(module, width, tech, freq):
-    command = "make synth DESIGN={} WIDTH={} TECH={} DRIVE=INV FREQ={} MAXOPT=1 MAXCORES=1".format(module, width, tech, freq)
+    command = "make synth DESIGN={} WIDTH={} TECH={} DRIVE=INV FREQ={} MAXOPT=1 MAXCORES=1 WRAPPER=0".format(module, width, tech, freq)
     subprocess.call(command, shell=True)
 
 def deleteRedundant(synthsToRun):
@@ -95,4 +95,4 @@ if __name__ == '__main__':
 
 pool.starmap(runCommand, synthsToRun)
 pool.close()
-pool.join()
\ No newline at end of file
+pool.join()

From 8ca1e3ba374aa50e8664473e3a2d1712ab519518 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Wed, 15 Nov 2023 08:48:07 -0600
Subject: [PATCH 41/62] missing synth.tcl added for use with wrapper

---
 synthDC/scripts/synth.tcl | 28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/synthDC/scripts/synth.tcl b/synthDC/scripts/synth.tcl
index cd4d6ff27..668b1c215 100755
--- a/synthDC/scripts/synth.tcl
+++ b/synthDC/scripts/synth.tcl
@@ -1,7 +1,27 @@
+#####################
+# synth.tcl
 #
-# Synthesis Synopsys Flow
-# james.stine@okstate.edu 27 Sep 2015
+# Written: james.stine@okstate.edu 15 November 2023
 #
+# Purpose: Baseline DC Tcl file
+#
+# A component of the Wally configurable RISC-V project.
+#
+# Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+#
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+#
+# Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+# except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+# may obtain a copy of the License at
+#
+# https://solderpad.org/licenses/SHL-2.1/
+#
+# Unless required by applicable law or agreed to in writing, any work distributed under the 
+# License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+# either express or implied. See the License for the specific language governing permissions 
+# and limitations under the License.
+################################################
 
 # start run clock
 set t1 [clock seconds]
@@ -26,6 +46,7 @@ set saifpower $::env(SAIFPOWER)
 set maxopt $::env(MAXOPT)
 set drive $::env(DRIVE)
 set width $::env(WIDTH)
+set wrapper $::env(WRAPPER)
 
 eval file copy -force [glob ${cfg}/*.vh] {$outputDir/hdl/}
 eval file copy -force [glob ${hdl_src}/cvw.sv] {$outputDir/hdl/}
@@ -33,7 +54,6 @@ eval file copy -force [glob ${hdl_src}/*/*.sv] {$outputDir/hdl/}
 eval file copy -force [glob ${hdl_src}/*/*/*.sv] {$outputDir/hdl/}
 
 # Check if a wrapper is needed and create it (to pass parameters when cvw_t parameters are used)
-set wrapper 0
 if {[catch {eval exec grep "cvw_t" $outputDir/hdl/$::env(DESIGN).sv}] == 0} {
     echo "Creating wrapper"
     set wrapper 1
@@ -440,7 +460,7 @@ set filename [format "%s%s" $outputDir  "/reports/cell.rep"]
 #redirect $filename { report_cell [get_cells -hier *] }  # not too useful
 
 set filename [format "%s%s" $outputDir  "/reports/power.rep"]
-redirect $filename { report_power -hierarchy -levels 1 }
+redirect $filename { report_power -analysis_effort high -hierarchy -levels 1 }
 
 set filename [format "%s%s" $outputDir  "/reports/constraint.rep"]
 redirect $filename { report_constraint }

From 98176665de32e50407470d1ef5e8944781270e9c Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 08:05:41 -0800
Subject: [PATCH 42/62] Fixed messed-up hazard.sv

---
 src/hazard/hazard.sv | 38 +++++++++-----------------------------
 1 file changed, 9 insertions(+), 29 deletions(-)

diff --git a/src/hazard/hazard.sv b/src/hazard/hazard.sv
index 028dbf61d..12bd83bc5 100644
--- a/src/hazard/hazard.sv
+++ b/src/hazard/hazard.sv
@@ -26,8 +26,7 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module hazard import cvw::*;  #(parameter cvw_t P) (
-  // Detect hazards
+module hazard import cvw::*;  #(parameter cvw_t P) ( 
   input  logic  BPWrongE, CSRWriteFenceM, RetM, TrapM,   
   input  logic  LoadStallD, StoreStallD, MDUStallD, CSRRdStallD,
   input  logic  LSUStallM, IFUStallF,
@@ -46,28 +45,9 @@ module hazard import cvw::*;  #(parameter cvw_t P) (
 
   logic WFIStallM, WFIInterruptedM;
 
-  logic ValidWfiM, ValidTrapM, ValidRetM, ValidCSRWriteFenceM, ValidCSRRdStallD;
-  logic ValidFPUStallD, ValidFCvtIntStallD, ValidFDivBusyE, ValidMDUStallD, ValidDivBusyE;
-
-  // Gate Stall/Flush sources with supported features 
-  // This is not logically necessary because the original signals are already 0 when the feature is unsupported
-  // However, synthesis does not propagate the constant 0 across modules
-  // By gating these signals, synthesis eliminates unnecessary stall/flush logic, saving about 10% cycle time for rv32e
-  // These lines of code gating with a compile-time constant generate no hardware.
-  assign ValidWfiM = wfiM & P.ZICSR_SUPPORTED;
-  assign ValidTrapM = TrapM & P.ZICSR_SUPPORTED;
-  assign ValidRetM = RetM & P.ZICSR_SUPPORTED;
-  assign ValidCSRWriteFenceM = CSRWriteFenceM & P.ZICSR_SUPPORTED;
-  assign ValidCSRRdStallD = CSRRdStallD & P.ZICSR_SUPPORTED;
-  assign ValidFPUStallD = RetM & P.F_SUPPORTED;
-  assign ValidFCvtIntStallD = RetM & P.F_SUPPORTED;
-  assign ValidFDivBusyE = FDivBusyE & P.F_SUPPORTED;
-  assign ValidMDUStallD = MDUStallD & P.M_SUPPORTED;
-  assign ValidDivBusyE = DivBusyE & P.M_SUPPORTED;  
-
   // WFI logic
-  assign WFIStallM = ValidWfiM & ~IntPendingM;         // WFI waiting for an interrupt or timeout
-  assign WFIInterruptedM = ValidWfiM & IntPendingM;    // WFI detects a pending interrupt.  Retire WFI; trap if interrupt is enabled.
+  assign WFIStallM = wfiM & ~IntPendingM;         // WFI waiting for an interrupt or timeout
+  assign WFIInterruptedM = wfiM & IntPendingM;    // WFI detects a pending interrupt.  Retire WFI; trap if interrupt is enabled.
   
   // stalls and flushes
   // loads: stall for one cycle if the subsequent instruction depends on the load
@@ -89,10 +69,10 @@ module hazard import cvw::*;  #(parameter cvw_t P) (
   // Branch misprediction is found in the Execute stage and must flush the next two instructions.
   //   However, an active division operation resides in the Execute stage, and when the BP incorrectly mispredicts the divide as a taken branch, the divde must still complete
   // When a WFI is interrupted and causes a trap, it flushes the rest of the pipeline but not the W stage, because the WFI needs to commit
-  assign FlushDCause = ValidTrapM | ValidRetM | ValidCSRWriteFenceM | BPWrongE;
-  assign FlushECause = ValidTrapM | ValidRetM | ValidCSRWriteFenceM |(BPWrongE & ~(ValidDivBusyE | ValidFDivBusyE));
-  assign FlushMCause = ValidTrapM | ValidRetM | ValidCSRWriteFenceM;
-  assign FlushWCause = ValidTrapM & ~WFIInterruptedM;
+  assign FlushDCause = TrapM | RetM | CSRWriteFenceM | BPWrongE;
+  assign FlushECause = TrapM | RetM | CSRWriteFenceM |(BPWrongE & ~(DivBusyE | FDivBusyE));
+  assign FlushMCause = TrapM | RetM | CSRWriteFenceM;
+  assign FlushWCause = TrapM & ~WFIInterruptedM;
 
   // Stall causes
   //  Most data depenency stalls are identified in the decode stage
@@ -103,8 +83,8 @@ module hazard import cvw::*;  #(parameter cvw_t P) (
   //    The IFU stalls the entire pipeline rather than just Fetch to avoid complications with instructions later in the pipeline causing Exceptions
   //    A trap could be asserted at the start of a IFU/LSU stall, and should flush the memory operation
   assign StallFCause = '0;
-  assign StallDCause = (LoadStallD | StoreStallD | ValidMDUStallD | ValidCSRRdStallD | ValidFCvtIntStallD | ValidFPUStallD) & ~FlushDCause;
-  assign StallECause = (ValidDivBusyE | ValidFDivBusyE) & ~FlushECause; 
+  assign StallDCause = (LoadStallD | StoreStallD | MDUStallD | CSRRdStallD | FCvtIntStallD | FPUStallD) & ~FlushDCause;
+  assign StallECause = (DivBusyE | FDivBusyE) & ~FlushECause; 
   assign StallMCause = WFIStallM & ~FlushMCause;
   // Need to gate IFUStallF when the equivalent FlushFCause = FlushDCause = 1.
   // assign StallWCause = ((IFUStallF & ~FlushDCause) | LSUStallM) & ~FlushWCause;

From cfaeeae25a44dfd2c95f4fbdb0b06abb1622c5ba Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 08:15:01 -0800
Subject: [PATCH 43/62] Added cmoz support to imperas.ic and adjusted imperas
 testbench to no longer need FPGA parameter

---
 sim/imperas.ic                 | 5 ++++-
 testbench/testbench-imperas.sv | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/sim/imperas.ic b/sim/imperas.ic
index adb10dcad..8d20cdd8f 100644
--- a/sim/imperas.ic
+++ b/sim/imperas.ic
@@ -22,6 +22,9 @@
 --override cpu/Zicbom=T
 --override cpu/Zicbop=T
 --override cpu/Zicboz=T
+--override cmomp_bytes=64  # Zic64b
+--override cmoz_bytes=64   # Zic64b
+--override lr_sc_grain=64  # Za64rs
 
 # 64 KiB continuous huge pages supported
 --override cpu/Svpbmt=T
@@ -40,7 +43,7 @@
 
 --override cpu/reset_address=0x80000000
 
---override cpu/unaligned=F
+--override cpu/unaligned=T  # Zicclsm (should be true)
 --override cpu/ignore_non_leaf_DAU=1
 --override cpu/wfi_is_nop=T
 --override cpu/misa_Extensions_mask=0x0
diff --git a/testbench/testbench-imperas.sv b/testbench/testbench-imperas.sv
index b503372d4..c27722f9f 100644
--- a/testbench/testbench-imperas.sv
+++ b/testbench/testbench-imperas.sv
@@ -237,7 +237,7 @@ module testbench;
     assign HRDATAEXT = 0;
   end
 
-  if(P.FPGA) begin : sdcard
+  if(P.SDC_SUPPORTED) begin : sdcard
     // *** fix later
 /* -----\/----- EXCLUDED -----\/-----
     sdModel sdcard

From 817ddbc7c5cef82f1987eac1c9e3847d47e205cb Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 08:19:50 -0800
Subject: [PATCH 44/62] Adjusted LSU misaligned buffer to fix synthesis warning

---
 src/lsu/lsu.sv | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/lsu/lsu.sv b/src/lsu/lsu.sv
index ba7d8e119..d872e0114 100644
--- a/src/lsu/lsu.sv
+++ b/src/lsu/lsu.sv
@@ -92,7 +92,8 @@ module lsu import cvw::*;  #(parameter cvw_t P) (
   input var logic [7:0]           PMPCFG_ARRAY_REGW[P.PMP_ENTRIES-1:0], // PMP configuration from privileged unit
   input var logic [P.PA_BITS-3:0] PMPADDR_ARRAY_REGW[P.PMP_ENTRIES-1:0] // PMP address from privileged unit
 );
-  localparam MISALIGN_SUPPORT = P.ZICCLSM_SUPPORTED & P.DCACHE_SUPPORTED;
+  localparam logic MISALIGN_SUPPORT = P.ZICCLSM_SUPPORTED & P.DCACHE_SUPPORTED;
+  localparam MLEN = MISALIGN_SUPPROT ? 2*P.LLEN : P.LLEN; // widen buffer for misaligned accessess
 
   logic [P.XLEN+1:0]     IEUAdrExtM;                             // Memory stage address zero-extended to PA_BITS or XLEN whichever is longer
   logic [P.XLEN+1:0]     IEUAdrExtE;                             // Execution stage address zero-extended to PA_BITS or XLEN whichever is longer
@@ -118,9 +119,9 @@ module lsu import cvw::*;  #(parameter cvw_t P) (
 
   logic [P.LLEN-1:0]     DTIMReadDataWordM;                      // DTIM read data
   /* verilator lint_off WIDTHEXPAND */  
-  logic [(MISALIGN_SUPPORT+1)*P.LLEN-1:0]     DCacheReadDataWordM;                    // D$ read data
-  logic [(MISALIGN_SUPPORT+1)*P.LLEN-1:0]   LSUWriteDataSpillM;                     // Final write data
-  logic [((MISALIGN_SUPPORT+1)*P.LLEN-1)/8:0] ByteMaskSpillM;                       // Selects which bytes within a word to write
+  logic [MLEN-1:0]       DCacheReadDataWordM;                    // D$ read data
+  logic [MLEN-1:0]       LSUWriteDataSpillM;                     // Final write data
+  logic [MLEN/8-1:0]     ByteMaskSpillM;                         // Selects which bytes within a word to write
   /* verilator lint_on WIDTHEXPAND */
   logic [P.LLEN-1:0]     DCacheReadDataWordSpillM;               // D$ read data
   logic [P.LLEN-1:0]     ReadDataWordMuxM;                       // DTIM or D$ read data

From eef39bd49546ed66de44cfec32acc1ea18264463 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 08:30:48 -0800
Subject: [PATCH 45/62] Fixed typo in lsu parameter

---
 src/lsu/lsu.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lsu/lsu.sv b/src/lsu/lsu.sv
index d872e0114..f01dc609b 100644
--- a/src/lsu/lsu.sv
+++ b/src/lsu/lsu.sv
@@ -93,7 +93,7 @@ module lsu import cvw::*;  #(parameter cvw_t P) (
   input var logic [P.PA_BITS-3:0] PMPADDR_ARRAY_REGW[P.PMP_ENTRIES-1:0] // PMP address from privileged unit
 );
   localparam logic MISALIGN_SUPPORT = P.ZICCLSM_SUPPORTED & P.DCACHE_SUPPORTED;
-  localparam MLEN = MISALIGN_SUPPROT ? 2*P.LLEN : P.LLEN; // widen buffer for misaligned accessess
+  localparam MLEN = MISALIGN_SUPPORT ? 2*P.LLEN : P.LLEN; // widen buffer for misaligned accesses
 
   logic [P.XLEN+1:0]     IEUAdrExtM;                             // Memory stage address zero-extended to PA_BITS or XLEN whichever is longer
   logic [P.XLEN+1:0]     IEUAdrExtE;                             // Execution stage address zero-extended to PA_BITS or XLEN whichever is longer

From 7b2bb86ced1d59f0639f1f7f589d7b09e21d72de Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Wed, 15 Nov 2023 09:48:13 -0800
Subject: [PATCH 46/62] changed to head of riscv-arch-test

---
 addins/riscv-arch-test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
index 4eea0a0f0..9f9bdd62d 160000
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@@ -1 +1 @@
-Subproject commit 4eea0a0f0e21f2613a114e45a5ad738e721c4044
+Subproject commit 9f9bdd62d3e37fcd8ad1b1a39d71694ccf1d74f3

From ff73f798edf00a4a8f9a1e2730d6af51b6c4d021 Mon Sep 17 00:00:00 2001
From: Jacob Pease <jacobpease@protonmail.com>
Date: Thu, 16 Nov 2023 13:59:12 -0600
Subject: [PATCH 47/62] Replaced vivado-risc-v addins directory with new SDC
 repo.

---
 .gitmodules              | 6 +++---
 fpga/generator/wally.tcl | 8 ++------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 361441a18..54d9dd970 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -23,9 +23,9 @@
 [submodule "addins/vivado-boards"]
 	path = addins/vivado-boards
 	url = https://github.com/Digilent/vivado-boards/
-[submodule "addins/vivado-risc-v"]
-	path = addins/vivado-risc-v
-	url = https://github.com/eugene-tarassov/vivado-risc-v.git
+[submodule "addins/ahbsdc"]
+	path = addins/ahbsdc
+	url = https://github.com/JacobPease/ahbsdc.git
 [submodule "addins/riscv-arch-test"]
 	path = addins/riscv-arch-test
 	url = https://github.com/riscv-non-isa/riscv-arch-test
diff --git a/fpga/generator/wally.tcl b/fpga/generator/wally.tcl
index d699c3d21..bad9981df 100644
--- a/fpga/generator/wally.tcl
+++ b/fpga/generator/wally.tcl
@@ -42,13 +42,9 @@ if {$board=="ArtyA7"} {
 # read in all other rtl
 read_verilog -sv [glob -type f  ../src/CopiedFiles_do_not_add_to_repo/*/*.sv ../src/CopiedFiles_do_not_add_to_repo/*/*/*.sv]
 # *** Once the sdc is updated to use ahb changes these to system verilog.
-read_verilog [glob -type f ../src/axi_sdc_controller.v]
-read_verilog [glob -type f ../../addins/vivado-risc-v/sdc/sd_cmd_master.v]
-read_verilog [glob -type f ../../addins/vivado-risc-v/sdc/sd_cmd_serial_host.v]
-read_verilog [glob -type f ../../addins/vivado-risc-v/sdc/sd_data_master.v]
-read_verilog [glob -type f ../../addins/vivado-risc-v/sdc/sd_data_serial_host.v]
+read_verilog [glob -type f ../../addins/ahbsdc/sdc/*.v]
 
-set_property include_dirs {../src/CopiedFiles_do_not_add_to_repo/config ../../config/shared ../../addins/vivado-risc-v/sdc} [current_fileset]
+set_property include_dirs {../src/CopiedFiles_do_not_add_to_repo/config ../../config/shared ../../addins/ahbsdc/sdc} [current_fileset]
 
 if {$board=="ArtyA7"} {
     add_files -fileset constrs_1 -norecurse ../constraints/constraints-$board.xdc

From 9df87872ef73d8ec4aaa7db36f39424f3e6b6e6c Mon Sep 17 00:00:00 2001
From: Jacob Pease <jacobpease@protonmail.com>
Date: Thu, 16 Nov 2023 15:13:12 -0600
Subject: [PATCH 48/62] Deleted vivado-risc-v directory and added ahbsdc.

---
 addins/vivado-risc-v | 1 -
 1 file changed, 1 deletion(-)
 delete mode 160000 addins/vivado-risc-v

diff --git a/addins/vivado-risc-v b/addins/vivado-risc-v
deleted file mode 160000
index c76a8613a..000000000
--- a/addins/vivado-risc-v
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit c76a8613a177b3a04face2cb8e15dd07a8d2fc40

From 38cf7f0fb74553f19898a267753258d49574d4e8 Mon Sep 17 00:00:00 2001
From: Jacob Pease <jacobpease@protonmail.com>
Date: Thu, 16 Nov 2023 17:46:48 -0600
Subject: [PATCH 49/62] ahbsdc submodule actually added this time.

---
 .gitmodules   | 2 +-
 addins/ahbsdc | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)
 create mode 160000 addins/ahbsdc

diff --git a/.gitmodules b/.gitmodules
index 54d9dd970..054afa6fb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -25,7 +25,7 @@
 	url = https://github.com/Digilent/vivado-boards/
 [submodule "addins/ahbsdc"]
 	path = addins/ahbsdc
-	url = https://github.com/JacobPease/ahbsdc.git
+	url = git@github.com:jacobpease/ahbsdc.git
 [submodule "addins/riscv-arch-test"]
 	path = addins/riscv-arch-test
 	url = https://github.com/riscv-non-isa/riscv-arch-test
diff --git a/addins/ahbsdc b/addins/ahbsdc
new file mode 160000
index 000000000..5df21aa66
--- /dev/null
+++ b/addins/ahbsdc
@@ -0,0 +1 @@
+Subproject commit 5df21aa6625eca120e64ea353ca641aff37d90b2

From f4f389f3736e9a51d08e6e206c6e987bbd214fe8 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 17 Nov 2023 13:27:57 -0800
Subject: [PATCH 50/62] Initial version of embench_arch_sweep.py

---
 benchmarks/embench/embench_arch_sweep.py | 86 ++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100755 benchmarks/embench/embench_arch_sweep.py

diff --git a/benchmarks/embench/embench_arch_sweep.py b/benchmarks/embench/embench_arch_sweep.py
new file mode 100755
index 000000000..ad629320a
--- /dev/null
+++ b/benchmarks/embench/embench_arch_sweep.py
@@ -0,0 +1,86 @@
+#!/usr/bin/python3
+# embench_arch_sweep.py
+# David_Harris@hmc.edu 16 November 2023
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+
+# Run embench on a variety of architectures and collate results
+
+import os
+from datetime import datetime
+import re
+import collections
+
+archs = ["rv32i_zicsr", "rv32im_zicsr", "rv32imc_zicsr", "rv32imc_zba_zbb_zbc_zbs_zicsr", "rv32imafdc_zba_zbb_zbc_zbs_zicsr"]
+
+def calcgeomean(d, arch):  # geometric mean of d[arch]'s scores over the fixed program list below
+    progs = ["aha-mont64", "crc32", "cubic", "edn", "huffbench", "matmult-int", "minver", "nbody", "nettle-aes", "nettle-sha256", "nsichneu", "picojpeg", "qrduino", "sglib-combined", "slre", "st", "statemate", "ud", "wikisort"]
+    result = 1.0
+    for p in progs:
+        # a program with no recorded score contributes a neutral factor of 1.0
+        val = d[arch].get(p, 1.0)
+        result = result *float(val)  # float() because the caller in this file stores scores as strings
+    result = pow(result, (1.0/float(len(progs))))  # len(progs)-th root of the product
+    return result
+
+def tabulate_arch_sweep(directory):
+    """Print a tab-separated results table for each optimization case found in directory.
+    Results are read from <case>_<arch>.json; an architecture whose file is missing or
+    unreadable is skipped, so its entries print as "n/a"."""
+    pattern = re.compile(r'"([^"]*)" : ([^,\n]+)')  # a line of the form "name" : value
+    for case in ["wallySizeOpt_size", "wallySpeedOpt_speed"]:
+        d = collections.defaultdict(dict)
+        for arch in archs:
+            file_path = os.path.join(directory, case+"_"+arch+".json")
+            try:
+                # the with-block closes the file even if reading fails part way
+                with open(file_path, "r") as f:
+                    lines = f.readlines()
+            except (OSError, UnicodeError):
+                # no usable results for this architecture; there is no handle to close
+                lines = []
+            for line in lines:
+                match = pattern.search(line)
+                if match:
+                    prog, result = match.group(1), match.group(2)
+                    d[arch][prog] = result
+        # header row: empty corner cell followed by the architecture names
+        for arch in [""] + archs:
+            print (arch, end="\t")
+        print("")
+        # one row per program; the set of rows comes from the first architecture's results
+        for prog in d[archs[0]]:
+            print(prog, end="\t")
+            for arch in archs:
+                entry = d[arch].get(prog, "n/a")
+                print (entry, end="\t")
+            print("")
+        # summary row: geometric mean per architecture (see calcgeomean)
+        print("New geo mean", end="\t")
+        for arch in archs:
+            geomean = calcgeomean(d, arch)
+            print(geomean, end="\t")
+        print("")
+ 
+def run_arch_sweep():
+    # Run embench once per architecture in archs, collect the JSON results into a
+    # new timestamped directory, and return that directory's name.
+    current_date = datetime.now()
+    # Timestamp has the form YYYYMMDD_HHMMSS
+    date_string = current_date.strftime('%Y%m%d_%H%M%S')
+    dir = "run_"+date_string
+    # Create the run directory (os.mkdir raises if it already exists)
+    os.mkdir(dir)
+
+    # NOTE(review): os.system exit statuses below are ignored, so a failed make or mv goes unnoticed
+
+    # sweep the runs and save the results in the run directory
+    for arch in archs:
+        os.system("make clean")
+        os.system("make run ARCH="+arch)
+        for res in ["SizeOpt_size", "SizeOpt_speed", "SpeedOpt_size", "SpeedOpt_speed"]:
+            os.system("mv -f wally"+res+".json "+dir+"/wally"+res+"_"+arch+".json")
+    return dir
+
+#directory = run_arch_sweep()
+directory = "run_20231116_071322"  # hard-coded earlier run; swap with the line above to rerun the sweep
+tabulate_arch_sweep(directory)
\ No newline at end of file

From 3dc7b93f57178c7d03a65347ccc5a7ece7df7d9d Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Fri, 17 Nov 2023 16:25:35 -0600
Subject: [PATCH 51/62] Revert removal of WRAPPER option that is not prudent

---
 synthDC/Makefile          | 40 ++++++---------------------------------
 synthDC/ppa/ppaSynth.py   |  4 ++--
 synthDC/scripts/synth.tcl | 28 ++++-----------------------
 3 files changed, 12 insertions(+), 60 deletions(-)

diff --git a/synthDC/Makefile b/synthDC/Makefile
index 3e344e8d2..470ec8f47 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -1,28 +1,7 @@
-#####################
-# Makefile
-#
-# Written: ssanghai@hmc.edu, mmasserfrye@hmc.edu, james.stine@okstate.edu 15 November 2023
-#
-# Purpose: Makefile to be used for synthesis using DC
-#
-# A component of the Wally configurable RISC-V project.
-#
-# Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-#
-# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
-#
-# Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
-# except in compliance with the License, or, at your option, the Apache License version 2.0. You 
-# may obtain a copy of the License at
-#
-# https:#solderpad.org/licenses/SHL-2.1/
-#
-# Unless required by applicable law or agreed to in writing, any work distributed under the 
-# License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
-# either express or implied. See the License for the specific language governing permissions 
-# and limitations under the License.
-################################################
-
+#  
+# Makefile for synthesis
+# Shreya Sanghai (ssanghai@hmc.edu) 2/28/2022
+# Madeleine Masser-Frye (mmasserfrye@hmc.edu) 1/27/2023
 NAME := synth
 # defaults
 export DESIGN ?= wallypipelinedcore
@@ -42,18 +21,11 @@ export MAXOPT ?= 0
 export DRIVE ?= FLOP
 export USESRAM ?= 0
 export WIDTH ?= 32
-export WRAPPER ?= 1
-export SAIFPOWER ?= 0
 
 time := $(shell date +%F-%H-%M)
 hash := $(shell git rev-parse --short HEAD)
-# This is done to create different naming conventions to help the PPA python 
-# TODO: cleanup later to utilize better parsing/lexing
-ifeq ($(WRAPPER), 0)
-	export OUTPUTDIR := runs/$(DESIGN)_$(WIDTH)_$(CONFIG)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
-else
-	export OUTPUTDIR := runs/$(DESIGN)_$(CONFIG)_$(MOD)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
-endif
+export OUTPUTDIR := runs/$(DESIGN)_$(CONFIG)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
+export SAIFPOWER ?= 0
 
 OLDCONFIGDIR ?= ${WALLY}/config
 export CONFIGDIR ?= $(OUTPUTDIR)/config
diff --git a/synthDC/ppa/ppaSynth.py b/synthDC/ppa/ppaSynth.py
index 0c4744c26..30fe1254f 100755
--- a/synthDC/ppa/ppaSynth.py
+++ b/synthDC/ppa/ppaSynth.py
@@ -11,7 +11,7 @@ from multiprocessing import Pool
 from ppaAnalyze import synthsfromcsv
 
 def runCommand(module, width, tech, freq):
-    command = "make synth DESIGN={} WIDTH={} TECH={} DRIVE=INV FREQ={} MAXOPT=1 MAXCORES=1 WRAPPER=0".format(module, width, tech, freq)
+    command = "make synth DESIGN={} WIDTH={} TECH={} DRIVE=INV FREQ={} MAXOPT=1 MAXCORES=1".format(module, width, tech, freq)
     subprocess.call(command, shell=True)
 
 def deleteRedundant(synthsToRun):
@@ -95,4 +95,4 @@ if __name__ == '__main__':
 
 pool.starmap(runCommand, synthsToRun)
 pool.close()
-pool.join()
+pool.join()
\ No newline at end of file
diff --git a/synthDC/scripts/synth.tcl b/synthDC/scripts/synth.tcl
index 668b1c215..cd4d6ff27 100755
--- a/synthDC/scripts/synth.tcl
+++ b/synthDC/scripts/synth.tcl
@@ -1,27 +1,7 @@
-#####################
-# synth.tcl
 #
-# Written: james.stine@okstate.edu 15 November 2023
+# Synthesis Synopsys Flow
+# james.stine@okstate.edu 27 Sep 2015
 #
-# Purpose: Baseline DC Tcl file
-#
-# A component of the Wally configurable RISC-V project.
-#
-# Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-#
-# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
-#
-# Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
-# except in compliance with the License, or, at your option, the Apache License version 2.0. You 
-# may obtain a copy of the License at
-#
-# https:#solderpad.org/licenses/SHL-2.1/
-#
-# Unless required by applicable law or agreed to in writing, any work distributed under the 
-# License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
-# either express or implied. See the License for the specific language governing permissions 
-# and limitations under the License.
-################################################
 
 # start run clock
 set t1 [clock seconds]
@@ -46,7 +26,6 @@ set saifpower $::env(SAIFPOWER)
 set maxopt $::env(MAXOPT)
 set drive $::env(DRIVE)
 set width $::env(WIDTH)
-set wrapper $::env(WRAPPER)
 
 eval file copy -force [glob ${cfg}/*.vh] {$outputDir/hdl/}
 eval file copy -force [glob ${hdl_src}/cvw.sv] {$outputDir/hdl/}
@@ -54,6 +33,7 @@ eval file copy -force [glob ${hdl_src}/*/*.sv] {$outputDir/hdl/}
 eval file copy -force [glob ${hdl_src}/*/*/*.sv] {$outputDir/hdl/}
 
 # Check if a wrapper is needed and create it (to pass parameters when cvw_t parameters are used)
+set wrapper 0
 if {[catch {eval exec grep "cvw_t" $outputDir/hdl/$::env(DESIGN).sv}] == 0} {
     echo "Creating wrapper"
     set wrapper 1
@@ -460,7 +440,7 @@ set filename [format "%s%s" $outputDir  "/reports/cell.rep"]
 #redirect $filename { report_cell [get_cells -hier *] }  # not too useful
 
 set filename [format "%s%s" $outputDir  "/reports/power.rep"]
-redirect $filename { report_power -analysis_effort high -hierarchy -levels 1 }
+redirect $filename { report_power -hierarchy -levels 1 }
 
 set filename [format "%s%s" $outputDir  "/reports/constraint.rep"]
 redirect $filename { report_constraint }

From 7b33331cf73ed0daa7dcf21647dc8323592f92f3 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 17 Nov 2023 15:10:57 -0800
Subject: [PATCH 52/62] Got Wally sweep running again

---
 synthDC/Makefile        | 2 +-
 synthDC/ppa/ppaSynth.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/synthDC/Makefile b/synthDC/Makefile
index 470ec8f47..2183cab83 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -24,7 +24,7 @@ export WIDTH ?= 32
 
 time := $(shell date +%F-%H-%M)
 hash := $(shell git rev-parse --short HEAD)
-export OUTPUTDIR := runs/$(DESIGN)_$(CONFIG)_$(TECH)_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
+export OUTPUTDIR := runs/$(DESIGN)_$(CONFIG)_$(MOD)_$(TECH)nm_$(FREQ)_MHz_$(time)_$(TITLE)_$(hash)
 export SAIFPOWER ?= 0
 
 OLDCONFIGDIR ?= ${WALLY}/config
diff --git a/synthDC/ppa/ppaSynth.py b/synthDC/ppa/ppaSynth.py
index 30fe1254f..315fa554a 100755
--- a/synthDC/ppa/ppaSynth.py
+++ b/synthDC/ppa/ppaSynth.py
@@ -85,7 +85,8 @@ if __name__ == '__main__':
 
     ##### Run a sweep for multiple modules/widths based on best delay found in existing syntheses
 	modules = ['adder']
-	widths = [8, 16, 32, 64, 128]
+#	widths = [8, 16, 32, 64, 128]
+	widths = [32]
 	tech = 'sky130'
 	synthsToRun = freqModuleSweep(widths, modules, tech)	
         

From 96f9409da4d0153459067f331dd75591feb94b89 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 17 Nov 2023 15:11:52 -0800
Subject: [PATCH 53/62] Embench Makefile to sweep experiments across configs

---
 benchmarks/embench/Makefile | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/benchmarks/embench/Makefile b/benchmarks/embench/Makefile
index 97c7660c5..d7a18b7e2 100644
--- a/benchmarks/embench/Makefile
+++ b/benchmarks/embench/Makefile
@@ -3,6 +3,7 @@
 # Compile Embench for Wally
 
 embench_dir = ../../addins/embench-iot
+ARCH=rv32imac_zicsr
 
 all: build 
 run: build size sim
@@ -15,7 +16,7 @@ buildsize: build_speedopt_size build_sizeopt_size
 
 # uses the build_all.py python file to build the tests in addins/embench-iot/bd_speed/ optimized for speed and size
 build_speedopt_speed:
-	$(embench_dir)/build_all.py --builddir=bd_speedopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S" --cflags="-O2 -nostartfiles" 
+	$(embench_dir)/build_all.py --builddir=bd_speedopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S -march=$(ARCH)" --cflags="-O2 -nostartfiles -march=$(ARCH)" 
 	# remove files not used in embench1.0  When changing to 2.0, restore these files		
 	#rm -rf $(embench_dir)/bd_speedopt_speed/src/md5sum
 	#rm -rf $(embench_dir)/bd_speedopt_speed/src/tarfind
@@ -23,7 +24,7 @@ build_speedopt_speed:
 	find $(embench_dir)/bd_speedopt_speed/ -type f ! -name "*.*" | while read f; do cp "$$f" "$$f.elf"; done
 
 build_sizeopt_speed:
-	$(embench_dir)/build_all.py --builddir=bd_sizeopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S" --cflags="-Os -nostartfiles" 
+	$(embench_dir)/build_all.py --builddir=bd_sizeopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S -march=$(ARCH)" --cflags="-Os -nostartfiles -march=$(ARCH)" 
 	# remove files not used in embench1.0  When changing to 2.0, restore these files		
 	#rm -rf $(embench_dir)/bd_sizeopt_speed/src/md5sum
 	#rm -rf $(embench_dir)/bd_sizeopt_speed/src/tarfind
@@ -32,10 +33,10 @@ build_sizeopt_speed:
 
 # uses the build_all.py python file to build the tests in addins/embench-iot/bd_speed/ optimized for speed and size
 build_speedopt_size:
-	$(embench_dir)/build_all.py --builddir=bd_speedopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S" --cflags="-O2 -msave-restore" --dummy-libs="libgcc libm libc crt0"
+	$(embench_dir)/build_all.py --builddir=bd_speedopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S -march=$(ARCH)" --cflags="-O2 -msave-restore -march=$(ARCH)" --dummy-libs="libgcc libm libc crt0"
 
 build_sizeopt_size:
-	$(embench_dir)/build_all.py --builddir=bd_sizeopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S" --cflags="-Os -msave-restore" --dummy-libs="libgcc libm libc crt0"
+	$(embench_dir)/build_all.py --builddir=bd_sizeopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S -march=$(ARCH)" --cflags="-Os -msave-restore -march=$(ARCH)" --dummy-libs="libgcc libm libc crt0"
 
 # builds dependencies, then launches modelsim and finally runs python wrapper script to present results
 sim: modelsim_build_memfile modelsim_run speed

From 423ae2bb761b4e37ebdfe5cad45a4b2735135f25 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 17 Nov 2023 17:02:32 -0800
Subject: [PATCH 54/62] Ignore benchmark results

---
 .gitignore | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index bd7e800df..3990c3823 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,7 @@ __pycache__/
 addins/riscv-arch-test/Makefile.include
 addins/riscv-tests/target
 addins/TestFloat-3e/build/Linux-x86_64-GCC/*
-benchmarks/embench/wally*.json
+
 
 #vsim work files to ignore
 transcript
@@ -175,3 +175,6 @@ tests/fp/combined_IF_vectors/IF_vectors/*.tv
 sim/bp-results/*.log
 sim/branch*.log
 /tests/custom/fpga-test-sdc/bin/fpga-test-sdc
+benchmarks/embench/wally*.json
+benchmarks/embench/run*
+sim/cfi.log

From 96556064a4db013548a84d90ade807af9db31820 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 17 Nov 2023 18:31:44 -0800
Subject: [PATCH 55/62] Restored RV64GC BPRED_SIZE=10 for consistent synthesis
 results

---
 config/rv64gc/config.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/rv64gc/config.vh b/config/rv64gc/config.vh
index e00c9153d..84f4de599 100644
--- a/config/rv64gc/config.vh
+++ b/config/rv64gc/config.vh
@@ -150,7 +150,7 @@ localparam PLIC_SDC_ID = 32'd9;
 localparam BPRED_SUPPORTED = 1;
 localparam BPRED_TYPE = `BP_GSHARE; // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT
 localparam BPRED_NUM_LHR = 32'd6;
-localparam BPRED_SIZE = 32'd6;
+localparam BPRED_SIZE = 32'd10;
 localparam BTB_SIZE = 32'd10;
 localparam RAS_SIZE = 32'd16;
 

From acc2db256f606e253253867d9af353762b01583d Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Fri, 17 Nov 2023 20:25:24 -0800
Subject: [PATCH 56/62] turn off IDIVONFPU when FSUPPORTED=0.  Already checked
 in sim, but need it in synth too for feature sweep

---
 config/rv32gc/config.vh | 4 ++--
 src/ieu/datapath.sv     | 2 +-
 src/mdu/mdu.sv          | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/rv32gc/config.vh b/config/rv32gc/config.vh
index 3b306a005..e095e6252 100644
--- a/config/rv32gc/config.vh
+++ b/config/rv32gc/config.vh
@@ -74,8 +74,8 @@ localparam ICACHE_LINELENINBITS = 32'd512;
 
 // Integer Divider Configuration
 // IDIV_BITSPERCYCLE must be 1, 2, or 4
-localparam IDIV_BITSPERCYCLE = 32'd4;
-localparam IDIV_ON_FPU = 1;
+localparam IDIV_BITSPERCYCLE = 32'd2;
+localparam IDIV_ON_FPU = 0;
 
 // Legal number of PMP entries are 0, 16, or 64
 localparam PMP_ENTRIES = 32'd16;
diff --git a/src/ieu/datapath.sv b/src/ieu/datapath.sv
index bb7638514..8c366a2ef 100644
--- a/src/ieu/datapath.sv
+++ b/src/ieu/datapath.sv
@@ -131,7 +131,7 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
   if (P.F_SUPPORTED) begin:fpmux
     mux2  #(P.XLEN)  resultmuxM(IEUResultM, FIntResM, FWriteIntM, IFResultM);
     mux2  #(P.XLEN)  cvtresultmuxW(IFResultW, FCvtIntResW, FCvtIntW, IFCvtResultW);
-    if (P.IDIV_ON_FPU) begin
+    if (P.IDIV_ON_FPU & P.F_SUPPORTED) begin
       mux2  #(P.XLEN)  divresultmuxW(MDUResultW, FIntDivResultW, IntDivW, MulDivResultW);
     end else begin 
       assign MulDivResultW = MDUResultW;
diff --git a/src/mdu/mdu.sv b/src/mdu/mdu.sv
index 83327a460..e152fc6de 100644
--- a/src/mdu/mdu.sv
+++ b/src/mdu/mdu.sv
@@ -57,7 +57,7 @@ module mdu import cvw::*;  #(parameter cvw_t P) (
   // Start a divide when a new division instruction is received and the divider isn't already busy or finishing
   // When IDIV_ON_FPU is set, use the FPU divider instead
   // In ZMMUL, with M_SUPPORTED = 0, omit the divider
-  if ((P.IDIV_ON_FPU) || (!P.M_SUPPORTED)) begin:nodiv  
+  if ((P.IDIV_ON_FPU & P.F_SUPPORTED) || (!P.M_SUPPORTED)) begin:nodiv  
     assign QuotM = 0;
     assign RemM = 0;
     assign DivBusyE = 0;

From 87e6a5ccf2b16c4fa956a506f252c87cc477b1bb Mon Sep 17 00:00:00 2001
From: Jacob Pease <jacobpease@protonmail.com>
Date: Sat, 18 Nov 2023 19:15:39 -0600
Subject: [PATCH 57/62] Updated ROM to preload bootloader from file and infer a
 block ram when building for FPGA.

---
 fpga/constraints/marked_debug.txt |   1 +
 fpga/src/boot.mem                 | 513 ++++++++++++++++++++++++++++++
 src/generic/mem/rom1p1r.sv        |  22 +-
 3 files changed, 529 insertions(+), 7 deletions(-)
 create mode 100644 fpga/src/boot.mem

diff --git a/fpga/constraints/marked_debug.txt b/fpga/constraints/marked_debug.txt
index 3973fc451..7d5636f31 100644
--- a/fpga/constraints/marked_debug.txt
+++ b/fpga/constraints/marked_debug.txt
@@ -45,6 +45,7 @@ ifu/ifu.sv: logic            PCPF
 ifu/ifu.sv: logic     PostSpillInstrRawF
 mmu/hptw.sv: logic	   ITLBWriteF
 mmu/hptw.sv:	 statetype WalkerState
+mmu/hptw.sv: logic ValidPTE
 privileged/csrs.sv: logic        CSRSReadValM
 privileged/csrs.sv: logic        SEPC_REGW
 privileged/csrs.sv: logic        MIP_REGW
diff --git a/fpga/src/boot.mem b/fpga/src/boot.mem
new file mode 100644
index 000000000..4ad2f0657
--- /dev/null
+++ b/fpga/src/boot.mem
@@ -0,0 +1,513 @@
+8001819300002197
+4281420141014081
+4481440143814301
+4681460145814501
+4881480147814701
+4a814a0149814901
+4c814c014b814b01
+4e814e014d814d01
+0110011b4f814f01
+059b45011161016e
+0004063705fe0010
+1f6000ef8006061b
+0ff003930000100f
+4e952e3110060e37
+c602829b0053f2b7
+2023fe02dfe312fd
+829b0053f2b7007e
+fe02dfe312fdc602
+4de31efd000e2023
+059bf1402573fdd0
+0000061705e20870
+0010029b01260613
+68110002806702fe
+0085179bf0080813
+038008130107f7b3
+480508a86c632781
+1533357902a87963
+38030000181700a8
+1c6301057833f268
+081a403018370808
+0105783342280813
+1815751308081063
+00367513c295e14d
+654ded510207e793
+c1701ff00613f130
+0637c530fff6861b
+664dcd10167d0200
+17fd001007b7c25c
+859b5a5cc20cd21c
+02062a23dfed0007
+4785fffd561c664d
+4501461c06f59063
+4a1cc35c465cc31c
+e29dc75c4a5cc71c
+0c63086008138082
+1ae30a9008130105
+b7710017e793f905
+e793b75901d7e793
+5f5c674db7410197
+66cd02072e23dffd
+fff78513ff7d5698
+40a0053300a03533
+bfb100a7e7938082
+e0a2715d8082557d
+e486f052f44ef84a
+fa13e85aec56fc26
+843289ae892a0086
+00959993000a1463
+864ac4396b054a85
+0009859b4549870a
+0004049b05540363
+86a66485008b7363
+870a87aaec7ff0ef
+4531458146014681
+f0ef0207c9639c05
+17820094979beb1f
+873e020541639381
+993e99ba020a1963
+870aa8094501f85d
+e8bff0ef45454685
+60a64505fe0559e3
+79a2794274e26406
+61616b426ae27a02
+9301020497138082
+f40647057179b7f1
+d79867cdec26f022
+dff58b85571c674d
+2423d35c03600793
+fffd571c674d0207
+0007a737b00026f3
+b00027f311f70713
+674dfef77de38f95
+4f5ccf9d8b895b1c
+26f3cf5c0027e793
+071305f5e737b000
+8f95b00027f30ff7
+4f5c674dfef77de3
+b00026f3cf5c9bf5
+67f7071300989737
+7de38f95b00027f3
+458146014681fef7
+ddbff0ef4501870a
+059346014681870a
+dcbff0ef45211aa0
+1aa007134782e939
+816393d117d24411
+85220ff0041302e7
+614564e270a27402
+46e3da5ff0efa0cd
+0207c7634782fe05
+458146014681870a
+d8bff0ef03700513
+46014681870a87aa
+0a900513403005b7
+4409bf7dfc07d9e3
+c3998b8583f9bfe1
+4681870a00846413
+f0ef450945814601
+870afa0540e3d59f
+123405b746014681
+46e3d45ff0ef450d
+870a77c14482f805
+85a6460146818cfd
+4ae3d2dff0ef451d
+d3d8470567cdf605
+000f4737b00026f3
+b00027f323f70713
+67cdfef77de38f95
+4681870a0007ae23
+0370051385a64601
+f2054fe3cf7ff0ef
+458146014681870a
+ce3ff0ef08600513
+4681870af20545e3
+4541200005934601
+f0055de3ccfff0ef
+3023bf010113bf09
+4605842a86aa4081
+40113423850a4585
+86a265a6da5ff0ef
+d99ff0ef04084605
+2201358322813603
+86a2260508700513
+d81ff0ef05629e0d
+2a0135832a813603
+9e0d86a226054505
+3603d6bff0ef057e
+0513320135833281
+9e0d86a226054010
+3083d53ff0ef0556
+4501400134034081
+0000808241010113
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+00600100d2e3ca40
diff --git a/src/generic/mem/rom1p1r.sv b/src/generic/mem/rom1p1r.sv
index 93f8c82df..5a45e354a 100644
--- a/src/generic/mem/rom1p1r.sv
+++ b/src/generic/mem/rom1p1r.sv
@@ -33,7 +33,7 @@ module rom1p1r #(parameter ADDR_WIDTH = 8, DATA_WIDTH = 32, PRELOAD_ENABLED = 0)
 );
 
    // Core Memory
-   logic [DATA_WIDTH-1:0]    ROM [(2**ADDR_WIDTH)-1:0];
+   (*rom_style="block" *) logic [DATA_WIDTH-1:0]    ROM [(2**ADDR_WIDTH)-1:0];
    
    // dh 10/30/23 ROM macros are presently commented out
    // because they don't point to a generated ROM
@@ -41,15 +41,23 @@ module rom1p1r #(parameter ADDR_WIDTH = 8, DATA_WIDTH = 32, PRELOAD_ENABLED = 0)
       rom1p1r_128x64 rom1 (.CLK(clk), .CEB(~ce), .A(addr[6:0]), .Q(dout));
 
    end if ((`USE_SRAM == 1) & (ADDR_WDITH == 7) & (DATA_WIDTH == 32)) begin
-      rom1p1r_128x32 rom1 (.CLK(clk), .CEB(~ce), .A(addr[6:0]), .Q(dout));      
+ rom1p1r_128x32 rom1 (.CLK(clk), .CEB(~ce), .A(addr[6:0]), .Q(dout));
 
-   end else begin */
-   always @ (posedge clk) 
-      if(ce) dout <= ROM[addr];    
+  end else begin */
+
+  initial begin
+    if (PRELOAD_ENABLED) begin
+      $readmemh("../../../fpga/src/boot.mem", ROM, 0);
+    end
+  end
+  
+  always @ (posedge clk) begin
+    if(ce) dout <= ROM[addr];
+  end
    
    
    // for FPGA, initialize with zero-stage bootloader
-   if(PRELOAD_ENABLED) begin
+   /*if(PRELOAD_ENABLED) begin
       initial begin
         ROM[0]=64'h8001819300002197;
         ROM[1]=64'h4281420141014081;
@@ -195,6 +203,6 @@ module rom1p1r #(parameter ADDR_WIDTH = 8, DATA_WIDTH = 32, PRELOAD_ENABLED = 0)
         ROM[141]=64'h0000808241010113;
         
       end // if (PRELOAD_ENABLED)  
-   end 
+   end*/
 
 endmodule 

From b692c913c4908bbaf7e3407727681eb216c69f0b Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sat, 18 Nov 2023 20:56:50 -0800
Subject: [PATCH 58/62] Changed rv32gc to do IDIV in MDU and have k=2 copies of
 FDIV stages; added correct sky130 adder data; fixed feature substitution in
 synthesis makefile

---
 config/rv32gc/config.vh   | 2 +-
 synthDC/Makefile          | 4 ++--
 synthDC/extractSummary.py | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/config/rv32gc/config.vh b/config/rv32gc/config.vh
index e095e6252..52baad796 100644
--- a/config/rv32gc/config.vh
+++ b/config/rv32gc/config.vh
@@ -169,7 +169,7 @@ localparam ZMMUL_SUPPORTED = 0;
 
 // FPU division architecture
 localparam RADIX = 32'd4;
-localparam DIVCOPIES = 32'd4;
+localparam DIVCOPIES = 32'd2;
 
 // bit manipulation
 localparam ZBA_SUPPORTED = 1;
diff --git a/synthDC/Makefile b/synthDC/Makefile
index 2183cab83..7968a7b52 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -94,10 +94,10 @@ endif
 
 ifneq ($(MOD), orig)
 	# PMP 0
-	sed -i 's/PMP_ENTRIES \(64\|16\|0\)/PMP_ENTRIES = 0;/' $(CONFIGDIR)/config.vh
+	sed -i 's/PMP_ENTRIES.*\(64\|16\)/PMP_ENTRIES = 0;/' $(CONFIGDIR)/config.vh
 ifneq ($(MOD), PMP0)
 	# no priv
-	sed -i 's/ZICSR_SUPPORTED *1/ZICSR_SUPPORTED = 0;/' $(CONFIGDIR)/config.vh
+	sed -i 's/ZICSR_SUPPORTED.*1/ZICSR_SUPPORTED = 0;/' $(CONFIGDIR)/config.vh
 ifneq ($(MOD), noPriv)
 	# turn off FPU 
 	sed -i 's/1 *<< *3/0 << 3/' $(CONFIGDIR)/config.vh
diff --git a/synthDC/extractSummary.py b/synthDC/extractSummary.py
index 7a3f45ddd..d6f5933a9 100755
--- a/synthDC/extractSummary.py
+++ b/synthDC/extractSummary.py
@@ -252,7 +252,7 @@ if __name__ == '__main__':
 
     TechSpec = namedtuple("TechSpec", "color shape targfreq fo4 add32area add32lpower add32denergy")
     techdict = {}
-    techdict['sky130'] = TechSpec('green', 'o', args.sky130freq, 99.5e-3, 1440.600027, 714.057, 0.658023)
+    techdict['sky130'] = TechSpec('green', 'o', args.sky130freq, 99.5e-3, 2581, 18, 0.685)
     techdict['sky90'] = TechSpec('gray', 'o', args.sky90freq, 43.2e-3, 1440.600027, 714.057, 0.658023)
     techdict['tsmc28psyn'] = TechSpec('blue', 's', args.tsmcfreq, 12.2e-3, 209.286002, 1060.0, .081533)
 

From 887cf935dce54c77af64f71ca6d49cbb6e012f6c Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 19 Nov 2023 06:49:07 -0800
Subject: [PATCH 59/62] wallySynthAll.sh automates running all synthesis
 experiments without maxopt

---
 benchmarks/embench/embench_arch_sweep.py |  7 ++++---
 synthDC/wallySynthAll.sh                 | 13 +++++++++++++
 2 files changed, 17 insertions(+), 3 deletions(-)
 create mode 100755 synthDC/wallySynthAll.sh

diff --git a/benchmarks/embench/embench_arch_sweep.py b/benchmarks/embench/embench_arch_sweep.py
index ad629320a..130a70581 100755
--- a/benchmarks/embench/embench_arch_sweep.py
+++ b/benchmarks/embench/embench_arch_sweep.py
@@ -10,7 +10,8 @@ from datetime import datetime
 import re
 import collections
 
-archs = ["rv32i_zicsr", "rv32im_zicsr", "rv32imc_zicsr", "rv32imc_zba_zbb_zbc_zbs_zicsr", "rv32imafdc_zba_zbb_zbc_zbs_zicsr"]
+#archs = ["rv32i_zicsr", "rv32im_zicsr", "rv32imc_zicsr", "rv32imc_zba_zbb_zbc_zbs_zicsr", "rv32imafdc_zba_zbb_zbc_zbs_zicsr"]
+archs = ["rv32imafdc_zba_zbb_zbc_zbs_zicsr", "rv32i_zicsr", "rv32im_zicsr", "rv32imc_zicsr", "rv32imc_zba_zbb_zbc_zbs_zicsr"]
 
 def calcgeomean(d, arch):
     progs = ["aha-mont64", "crc32", "cubic", "edn", "huffbench", "matmult-int", "minver", "nbody", "nettle-aes", "nettle-sha256", "nsichneu", "picojpeg", "qrduino", "sglib-combined", "slre", "st", "statemate", "ud", "wikisort"]
@@ -81,6 +82,6 @@ def run_arch_sweep():
             os.system("mv -f wally"+res+".json "+dir+"/wally"+res+"_"+arch+".json")
     return dir
 
-#directory = run_arch_sweep()
-directory = "run_20231116_071322"
+directory = run_arch_sweep()
+#directory = "run_20231117_082325"
 tabulate_arch_sweep(directory)
\ No newline at end of file
diff --git a/synthDC/wallySynthAll.sh b/synthDC/wallySynthAll.sh
new file mode 100755
index 000000000..9c085601c
--- /dev/null
+++ b/synthDC/wallySynthAll.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+# Run all Wally synthesis experiments from chapter 8 (invoke from the directory that holds wallySynth.py)
+./wallySynth.py --freqsweep 330 --tech sky130
+./wallySynth.py --freqsweep 870 --tech sky90
+./wallySynth.py --freqsweep 2800 --tech tsmc28psyn --usesram
+./wallySynth.py --configsweep --tech sky130 --targetfreq 330
+./wallySynth.py --configsweep --tech sky90 --targetfreq 870
+./wallySynth.py --configsweep --tech tsmc28psyn --targetfreq 2800 --usesram
+./wallySynth.py --featuresweep --tech sky130 --targetfreq 330
+./wallySynth.py --featuresweep --tech sky90 --targetfreq 870
+./wallySynth.py --featuresweep --tech tsmc28psyn --targetfreq 2800 --usesram
+# Extract summary data (run this by hand after all experiments finish)
+#./extractSummary.py --sky130freq 330 --sky90freq 870 --tsmcfreq 2800

From cdd21d6635d19c264868f4e29f6848b30267a3a5 Mon Sep 17 00:00:00 2001
From: Rose Thompson <ross1728@gmail.com>
Date: Sun, 19 Nov 2023 13:44:22 -0600
Subject: [PATCH 60/62] Added menvcfg to debugger for checking what Linux has
 configured.

---
 fpga/constraints/marked_debug.txt | 139 +++---------------------------
 fpga/constraints/small-debug.xdc  |  20 +++++
 linux/devicetree/wally-vcu118.dts |   1 +
 3 files changed, 31 insertions(+), 129 deletions(-)

diff --git a/fpga/constraints/marked_debug.txt b/fpga/constraints/marked_debug.txt
index 3973fc451..582af32a8 100644
--- a/fpga/constraints/marked_debug.txt
+++ b/fpga/constraints/marked_debug.txt
@@ -1,131 +1,12 @@
-lsu/lsu.sv: logic      IEUAdrM
-lsu/lsu.sv: logic      WriteDataM
-lsu/lsu.sv: logic        LSUHADDR
-lsu/lsu.sv: logic        HRDATA
-lsu/lsu.sv: logic        LSUHWDATA
-lsu/lsu.sv: logic       LSUHREADY
-lsu/lsu.sv: logic       LSUHWRITE
-lsu/lsu.sv: logic        LSUHSIZE
-lsu/lsu.sv: logic        LSUHBURST
-lsu/lsu.sv: logic        LSUHTRANS
-lsu/lsu.sv: logic        LSUHWSTRB
-lsu/lsu.sv: logic      IHAdrM
-ieu/regfile.sv: logic    rf
-ieu/datapath.sv: logic                 RegWriteW
-hazard/hazard.sv: logic	         BPPredWrongE
-hazard/hazard.sv: logic	         LoadStallD
-hazard/hazard.sv: logic          FCvtIntStallD
-hazard/hazard.sv: logic	         DivBusyE
-hazard/hazard.sv: logic	         EcallFaultM
-hazard/hazard.sv: logic          WFIStallM
-hazard/hazard.sv: logic	        StallF
-hazard/hazard.sv: logic	        FlushD
-cache/cachefsm.sv:   statetype CurrState
-wally/wallypipelinedcore.sv: logic    TrapM
-wally/wallypipelinedcore.sv: logic            SrcAM
-wally/wallypipelinedcore.sv: logic                 InstrM
 wally/wallypipelinedcore.sv: logic             PCM
-wally/wallypipelinedcore.sv: logic           MemRWM
+wally/wallypipelinedcore.sv: logic    TrapM
 wally/wallypipelinedcore.sv: logic                InstrValidM
-wally/wallypipelinedcore.sv: logic     WriteDataM
-wally/wallypipelinedcore.sv: logic     IEUAdrM
-wally/wallypipelinedcore.sv: logic   HRDATA
-ifu/spill.sv:    statetype CurrState
-ifu/ifu.sv: logic    				IFUStallF
-ifu/ifu.sv: logic     IFUHADDR
-ifu/ifu.sv: logic     	HRDATA
-ifu/ifu.sv: logic      IFUHREADY
-ifu/ifu.sv: logic     IFUHWRITE
-ifu/ifu.sv: logic      IFUHSIZE
-ifu/ifu.sv: logic      IFUHBURST
-ifu/ifu.sv: logic      IFUHTRANS
-ifu/ifu.sv: logic     PCF
-ifu/ifu.sv: logic                 PCNextF
-ifu/ifu.sv: logic            PCPF
-ifu/ifu.sv: logic     PostSpillInstrRawF
-mmu/hptw.sv: logic	   ITLBWriteF
-mmu/hptw.sv:	 statetype WalkerState
-privileged/csrs.sv: logic        CSRSReadValM
-privileged/csrs.sv: logic        SEPC_REGW
-privileged/csrs.sv: logic        MIP_REGW
-privileged/csrs.sv: logic      SSCRATCH_REGW
-privileged/csrs.sv: logic     SCAUSE_REGW      
-privileged/csr.sv: logic    CSRReadValM  
-privileged/csr.sv: logic    CSRSrcM
-privileged/csr.sv: logic    CSRWriteValM
-privileged/csr.sv: logic    MSTATUS_REGW
-privileged/trap.sv: logic      		   InstrMisalignedFaultM
-privileged/trap.sv: logic      		   BreakpointFaultM
-privileged/trap.sv: logic      		   LoadAccessFaultM
-privileged/trap.sv: logic      		   LoadPageFaultM
-privileged/trap.sv: logic      		   mretM
-privileged/trap.sv: logic      MIP_REGW
-privileged/trap.sv: logic           PendingIntsM
-privileged/privileged.sv: logic      CSRReadM
-privileged/privileged.sv: logic    InterruptM
-privileged/csrc.sv: logic      HPMCOUNTER_REGW
-privileged/csri.sv: logic       MExtInt
-privileged/csri.sv: logic        MIP_REGW_writeabl
-privileged/csrm.sv: logic        	     MIP_REGW
-privileged/csrm.sv: logic       MEPC_REGW
-privileged/csrm.sv: logic     MEDELEG_REGW
-privileged/csrm.sv: logic          MIDELEG_REGW
-privileged/csrm.sv: logic      MSCRATCH_REGW
-privileged/csrm.sv: logic       MCAUSE_REGW
-uncore/uart_apb.sv: logic                  SIN
-uncore/uart_apb.sv: logic                 SOUT
-uncore/uart_apb.sv: logic                 OUT1b
-uncore/uartPC16550D.sv: logic       RBR
-uncore/uartPC16550D.sv: logic        FCR
-uncore/uartPC16550D.sv: logic        IER
-uncore/uartPC16550D.sv: logic        MCR
-uncore/uartPC16550D.sv: logic    	   baudpulse
-uncore/uartPC16550D.sv:     statetype rxstate
-uncore/uartPC16550D.sv: logic     					rxfifo
-uncore/uartPC16550D.sv: logic     					txfifo
-uncore/uartPC16550D.sv: logic    					rxfifohead
-uncore/uartPC16550D.sv: logic    					rxfifoentries
-uncore/uartPC16550D.sv: logic       					RXBR
-uncore/uartPC16550D.sv: logic     					rxtimeoutcnt
-uncore/uartPC16550D.sv: logic      						rxparityerr
-uncore/uartPC16550D.sv: logic   						rxdataready
-uncore/uartPC16550D.sv: logic   						rxfifoempty
-uncore/uartPC16550D.sv: logic     					rxdata
-uncore/uartPC16550D.sv: logic     					RXerrbit
-uncore/uartPC16550D.sv: logic     					rxfullbitunwrapped
-uncore/uartPC16550D.sv: logic     					txdata
-uncore/uartPC16550D.sv: logic    						txnextbit
-uncore/uartPC16550D.sv: logic    						txfifoempty
-uncore/uartPC16550D.sv: logic   						fifoenabled
-uncore/uartPC16550D.sv: logic   						RXerr
-uncore/uartPC16550D.sv: logic   						THRE
-uncore/uartPC16550D.sv: logic   						rxdataavailintr
-uncore/uartPC16550D.sv: logic    					intrID
-uncore/uncore.sv: logic HSELEXTSDCD
-uncore/plic_apb.sv: logic                    MExtInt
-uncore/plic_apb.sv: logic     Din
-uncore/plic_apb.sv: logic             requests
-uncore/plic_apb.sv: logic        intPriority
-uncore/plic_apb.sv: logic             intInProgress
-uncore/plic_apb.sv: logic           intThreshold
-uncore/plic_apb.sv: logic             intEn
-uncore/plic_apb.sv: logic           intClaim
-uncore/plic_apb.sv: logic        irqMatrix
-uncore/plic_apb.sv: logic           priorities_with_irqs
-uncore/plic_apb.sv: logic           max_priority_with_irqs
-uncore/plic_apb.sv: logic          irqs_at_max_priority
-uncore/plic_apb.sv: logic           threshMask
-uncore/clint_apb.sv: logic      MTIME
-uncore/clint_apb.sv: logic         MTIMECMP
-ebu/ebu.sv: logic     HCLK
-ebu/ebu.sv: logic      HREADY
-ebu/ebu.sv: logic      HRESP
-ebu/ebu.sv: logic      HADDR
-ebu/ebu.sv: logic     HWRITE
-ebu/ebu.sv: logic      HSIZE
-ebu/ebu.sv: logic      HBURST
-ebu/ebu.sv: logic      HPROT
-ebu/ebu.sv: logic      HTRANS
-ebu/ebu.sv: logic     HMASTLOC
-ebu/buscachefsm.sv:   busstatetype CurrState
-ebu/busfsm.sv:   busstatetype CurrState
+wally/wallypipelinedcore.sv: logic                 InstrM
+lsu/lsu.sv: logic        IEUAdrM
+lsu/lsu.sv: logic        PAdrM
+lsu/lsu.sv: logic        ReadDataM
+lsu/lsu.sv: logic        WriteDataM
+lsu/lsu.sv: logic       MemRWM
+mmu/hptw.sv: logic	   SATP_REGW
+privileged/csr.sv: logic       MENVCFG_REGW
+privileged/csr.sv: logic       SENVCFG_REGW
diff --git a/fpga/constraints/small-debug.xdc b/fpga/constraints/small-debug.xdc
index 7bf498a79..8400b7281 100644
--- a/fpga/constraints/small-debug.xdc
+++ b/fpga/constraints/small-debug.xdc
@@ -53,6 +53,26 @@ set_property port_width 48 [get_debug_ports u_ila_0/probe6]
 set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe6]
 connect_debug_port u_ila_0/probe6 [get_nets [list {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[0]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[1]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[2]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[3]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[4]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[5]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[6]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[7]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[8]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[9]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[10]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[11]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[12]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[13]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[14]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[15]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[16]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[17]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[18]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[19]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[20]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[21]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[22]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[23]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[24]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[25]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[26]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[27]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[28]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[29]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[30]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[31]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[32]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[33]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[34]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[35]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[36]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[37]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[38]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[39]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[40]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[41]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[42]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[43]}  {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[60]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[61]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[62]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/hptw.hptw/SATP_REGW[63]}]]
 
+create_debug_port u_ila_0 probe
+set_property port_width 64 [get_debug_ports u_ila_0/probe7]
+set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe7]
+connect_debug_port u_ila_0/probe7 [get_nets [list {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[0]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[1]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[2]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[3]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[4]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[5]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[6]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[7]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[8]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[9]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[10]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[11]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[12]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[13]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[14]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[15]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[16]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[17]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[18]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[19]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[20]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[21]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[22]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[23]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[24]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[25]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[26]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[27]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[28]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[29]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[30]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[31]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[32]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[33]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[34]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[35]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[36]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[37]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[38]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[39]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[40]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[41]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[42]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[43]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[44]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[45]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[46]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[47]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[48]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[49]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[50]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[51]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[52]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[53]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[54]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[55]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[56]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[57]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[58]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[59]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[60]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[61]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[62]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/ReadDataM[63]} ]]
+
+create_debug_port u_ila_0 probe
+set_property port_width 64 [get_debug_ports u_ila_0/probe8]
+set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe8]
+connect_debug_port u_ila_0/probe8 [get_nets [list {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[0]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[1]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[2]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[3]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[4]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[5]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[6]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[7]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[8]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[9]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[10]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[11]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[12]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[13]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[14]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[15]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[16]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[17]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[18]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[19]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[20]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[21]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[22]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[23]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[24]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[25]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[26]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[27]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[28]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[29]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[30]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[31]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[32]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[33]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[34]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[35]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[36]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[37]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[38]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[39]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[40]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[41]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[42]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[43]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[44]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[45]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[46]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[47]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[48]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[49]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[50]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[51]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[52]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[53]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[54]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[55]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[56]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[57]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[58]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[59]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[60]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[61]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[62]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/lsu/WriteDataM[63]} ]]
+
+create_debug_port u_ila_0 probe
+set_property port_width 64 [get_debug_ports u_ila_0/probe9]
+set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe9]
+connect_debug_port u_ila_0/probe9 [get_nets [list {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[0]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[1]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[2]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[3]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[4]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[5]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[6]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[7]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[8]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[9]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[10]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[11]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[12]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[13]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[14]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[15]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[16]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[17]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[18]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[19]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[20]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[21]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[22]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[23]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[24]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[25]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[26]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[27]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[28]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[29]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[30]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[31]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[32]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[33]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[34]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[35]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[36]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[37]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[38]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[39]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[40]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[41]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[42]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[43]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[44]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[45]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[46]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[47]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[48]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[49]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[50]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[51]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[52]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[53]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[54]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[55]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[56]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[57]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[58]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[59]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[60]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[61]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[62]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/MENVCFG_REGW[63]} ]]
+
+create_debug_port u_ila_0 probe
+set_property port_width 64 [get_debug_ports u_ila_0/probe10]
+set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe10]
+connect_debug_port u_ila_0/probe10 [get_nets [list {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[0]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[1]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[2]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[3]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[4]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[5]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[6]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[7]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[8]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[9]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[10]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[11]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[12]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[13]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[14]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[15]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[16]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[17]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[18]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[19]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[20]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[21]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[22]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[23]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[24]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[25]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[26]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[27]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[28]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[29]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[30]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[31]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[32]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[33]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[34]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[35]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[36]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[37]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[38]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[39]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[40]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[41]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[42]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[43]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[44]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[45]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[46]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[47]} 
{wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[48]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[49]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[50]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[51]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[52]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[53]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[54]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[55]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[56]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[57]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[58]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[59]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[60]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[61]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[62]} {wallypipelinedsocwrapper/wallypipelinedsoc/core/priv.priv/csr/SENVCFG_REGW[63]} ]]
+
 # the debug hub has issues with the clocks from the mmcm so lets give up an connect to the 100Mhz input clock.
 #connect_debug_port dbg_hub/clk [get_nets default_100mhz_clk]
 connect_debug_port dbg_hub/clk [get_nets CPUCLK]
diff --git a/linux/devicetree/wally-vcu118.dts b/linux/devicetree/wally-vcu118.dts
index e0257c9a6..20448609b 100644
--- a/linux/devicetree/wally-vcu118.dts
+++ b/linux/devicetree/wally-vcu118.dts
@@ -31,6 +31,7 @@
 			status = "okay";
 			compatible = "riscv";
 			riscv,isa = "rv64imafdcsu";
+                        riscv,isa-extensions = "imafdc", "sstc", "svinval", "svnapot", "svpbmt", "zba", "zbb", "zbc", "zbs", "zicbom", "zicbop", "zicboz", "zicntr", "zicsr", "zifencei", "zihpm";
 			mmu-type = "riscv,sv48";
 
 			interrupt-controller {

From 8cb433cb66045512b885d1f08335c8c4d39bc1b9 Mon Sep 17 00:00:00 2001
From: David Harris <David_Harris@hmc.edu>
Date: Sun, 19 Nov 2023 19:33:57 -0800
Subject: [PATCH 61/62] Commented IROM preloading

---
 addins/riscv-arch-test     | 2 +-
 src/generic/mem/rom1p1r.sv | 2 +-
 src/ifu/irom.sv            | 4 +++-
 synthDC/wallySynthAll.sh   | 7 ++++---
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
index 9f9bdd62d..eb0a38922 160000
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@@ -1 +1 @@
-Subproject commit 9f9bdd62d3e37fcd8ad1b1a39d71694ccf1d74f3
+Subproject commit eb0a3892215ad2384702db02da1551a59701ec67
diff --git a/src/generic/mem/rom1p1r.sv b/src/generic/mem/rom1p1r.sv
index 5a45e354a..617a779ff 100644
--- a/src/generic/mem/rom1p1r.sv
+++ b/src/generic/mem/rom1p1r.sv
@@ -47,7 +47,7 @@ module rom1p1r #(parameter ADDR_WIDTH = 8, DATA_WIDTH = 32, PRELOAD_ENABLED = 0)
 
   initial begin
     if (PRELOAD_ENABLED) begin
-      $readmemh("../../../fpga/src/boot.mem", ROM, 0);
+      $readmemh("$WALLY/fpga/src/boot.mem", ROM, 0);
     end
   end
   
diff --git a/src/ifu/irom.sv b/src/ifu/irom.sv
index 0d4286e43..0b29c72cf 100644
--- a/src/ifu/irom.sv
+++ b/src/ifu/irom.sv
@@ -39,7 +39,9 @@ module irom import cvw::*;  #(parameter cvw_t P) (
   logic [31:0]       RawIROMInstrF;
   logic [2:1]        AdrD;
 
-  rom1p1r #(ADDR_WDITH, P.XLEN) rom(.clk, .ce, .addr(Adr[ADDR_WDITH+OFFSET-1:OFFSET]), .dout(IROMInstrFFull));
+  // preload IROM with the FPGA bootloader by default so that it synthesizes to something, avoiding having the IEU optimized away because instructions are all 0
+  // the testbench replaces these dummy contents with the actual program of interest during simulation
+  rom1p1r #(ADDR_WDITH, P.XLEN, 1) rom(.clk, .ce, .addr(Adr[ADDR_WDITH+OFFSET-1:OFFSET]), .dout(IROMInstrFFull));
   if (P.XLEN == 32) assign RawIROMInstrF = IROMInstrFFull;
   else              begin
   // IROM is aligned to XLEN words, but instructions are 32 bits.  Select between the two
diff --git a/synthDC/wallySynthAll.sh b/synthDC/wallySynthAll.sh
index 9c085601c..9af40a379 100755
--- a/synthDC/wallySynthAll.sh
+++ b/synthDC/wallySynthAll.sh
@@ -1,7 +1,8 @@
 # Run all Wally synthesis experiments from chapter 8
-./wallySynth.py --freqsweep 330 --tech sky130 
-./wallySynth.py --freqsweep 870 --tech sky90 
-./wallySynth.py --freqsweep 2800 --tech tsmc28psyn --usesram
+# However, trying to run the freqsweeps at the same time maxes out licenses and some runs fail
+#./wallySynth.py --freqsweep 330 --tech sky130 
+#./wallySynth.py --freqsweep 870 --tech sky90 
+#./wallySynth.py --freqsweep 2800 --tech tsmc28psyn --usesram
 ./wallySynth.py --configsweep --tech sky130 --targetfreq 330
 ./wallySynth.py --configsweep --tech sky90 --targetfreq 870
 ./wallySynth.py --configsweep --tech tsmc28psyn --targetfreq 2800 --usesram

From 3594c08d4b70fa6b97b1d0063abdbd1eaf145974 Mon Sep 17 00:00:00 2001
From: Rose Thompson <ross1728@gmail.com>
Date: Mon, 20 Nov 2023 10:30:35 -0600
Subject: [PATCH 62/62] Modified linux imperas tests to 1. enable zicclsm 2.
 enable logging at 7000 ms

---
 config/buildroot/config.vh | 2 +-
 sim/imperas.ic             | 5 +++--
 sim/run-imperas-linux.sh   | 2 +-
 sim/wally-linux-imperas.do | 6 ++++--
 4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/config/buildroot/config.vh b/config/buildroot/config.vh
index 0015e2bba..35be93fd6 100644
--- a/config/buildroot/config.vh
+++ b/config/buildroot/config.vh
@@ -45,7 +45,7 @@ localparam SSTC_SUPPORTED = 1;
 localparam ZICBOM_SUPPORTED = 1;
 localparam ZICBOZ_SUPPORTED = 1;
 localparam ZICBOP_SUPPORTED = 1;
-localparam ZICCLSM_SUPPORTED = 0;
+localparam ZICCLSM_SUPPORTED = 1;
 localparam SVPBMT_SUPPORTED = 1;
 localparam SVNAPOT_SUPPORTED = 1;
 localparam SVINVAL_SUPPORTED = 1;
diff --git a/sim/imperas.ic b/sim/imperas.ic
index adb10dcad..d9226c4bb 100644
--- a/sim/imperas.ic
+++ b/sim/imperas.ic
@@ -18,6 +18,8 @@
 # More extensions
 --override cpu/Zcb=T
 
+--override cpu/unaligned=T
+
 # Cache block operations
 --override cpu/Zicbom=T
 --override cpu/Zicbop=T
@@ -40,7 +42,6 @@
 
 --override cpu/reset_address=0x80000000
 
---override cpu/unaligned=F
 --override cpu/ignore_non_leaf_DAU=1
 --override cpu/wfi_is_nop=T
 --override cpu/misa_Extensions_mask=0x0
@@ -88,7 +89,7 @@
 
 # Add Imperas simulator application instruction tracing
 --verbose
---trace --tracechange --traceshowicount --tracemode -tracemem ASX --monitornetschange --traceafter 0
+#--trace --tracechange --traceshowicount --tracemode -tracemem ASX --monitornetschange --traceafter 300000000
 --override cpu/debugflags=6 --override cpu/verbose=1
 --override cpu/show_c_prefix=T
 
diff --git a/sim/run-imperas-linux.sh b/sim/run-imperas-linux.sh
index fd265cb9e..6a49f46e9 100755
--- a/sim/run-imperas-linux.sh
+++ b/sim/run-imperas-linux.sh
@@ -7,4 +7,4 @@ export OTHERFLAGS="+TRACE2LOG_ENABLE=1"
 #export OTHERFLAGS="+TRACE2LOG_ENABLE=1  +TRACE2LOG_AFTER=10500000"
 export OTHERFLAGS=""
 
-vsim -c -do "do wally-linux-imperas.do buildroot buildroot-no-trace $::env(RISCV) 0 0 0"
+vsim -c  -do "do wally-linux-imperas.do buildroot buildroot-no-trace $::env(RISCV) 0 0 0"
diff --git a/sim/wally-linux-imperas.do b/sim/wally-linux-imperas.do
index f173f67c9..196c780be 100644
--- a/sim/wally-linux-imperas.do
+++ b/sim/wally-linux-imperas.do
@@ -40,6 +40,7 @@ if {$2 eq "buildroot" || $2 eq "buildroot-checkpoint"} {
 
     #-- Run the Simulation
     #run -all
+    run 7000 ms
     add log -recursive /*
     do linux-wave.do
     run -all
@@ -87,9 +88,10 @@ if {$2 eq "buildroot" || $2 eq "buildroot-checkpoint"} {
     #run 100 ns
     #force -deposit testbench/dut/core/priv/priv/csr/csri/IE_REGW 16'h2aa
     #force -deposit testbench/dut/uncore/uncore/clint/clint/MTIMECMP 64'h1000
+    run 7000 ms
+    add log -recursive /testbench/dut/*
+    do wave.do
     run 14000 ms
-    #add log -recursive /*
-    #do linux-wave.do
     #run -all
 
     exec ./slack-notifier/slack-notifier.py