Merge branch 'main' of https://github.com/openhwgroup/cvw

2025-02-11 06:05:49 +00:00 · 2023-11-20 10:34:36 -06:00 · 2023-11-20 10:34:36 -06:00 · b137759b45
commit b137759b45
parent 64e627841a 17e91f31f6
58 changed files with 1871 additions and 924 deletions
--- a/.gitignore
+++ b/.gitignore
@ -10,7 +10,7 @@ __pycache__/
 addins/riscv-arch-test/Makefile.include
 addins/riscv-tests/target
 addins/TestFloat-3e/build/Linux-x86_64-GCC/*
-benchmarks/embench/wally*.json
+
 #vsim work files to ignore
 transcript
@ -175,3 +175,6 @@ tests/fp/combined_IF_vectors/IF_vectors/*.tv
 sim/bp-results/*.log
 sim/branch*.log
 /tests/custom/fpga-test-sdc/bin/fpga-test-sdc
 benchmarks/embench/wally*.json
 benchmarks/embench/run*
 sim/cfi.log
--- a/.gitmodules
+++ b/.gitmodules
@ -1,16 +1,9 @@
 [submodule "sky130/sky130_osu_sc_t12"]
 	path = sky130/sky130_osu_sc_t12
 	url = https://foss-eda-tools.googlesource.com/skywater-pdk/libs/sky130_osu_sc_t12/
 [submodule "addins/riscv-arch-test"]
 	path = addins/riscv-arch-test
 	url = https://github.com/riscv-non-isa/riscv-arch-test
 	ignore = dirty
 [submodule "addins/imperas-riscv-tests"]
 	path = addins/imperas-riscv-tests
 	url = https://github.com/riscv-ovpsim/imperas-riscv-tests
 [submodule "addins/riscv-tests"]
 	path = addins/riscv-tests
 	url = https://github.com/riscv-software-src/riscv-tests
 [submodule "addins/riscv-dv"]
 	path = addins/riscv-dv
 	url = https://github.com/google/riscv-dv
@ -30,6 +23,9 @@
 [submodule "addins/vivado-boards"]
 	path = addins/vivado-boards
 	url = https://github.com/Digilent/vivado-boards/
-[submodule "addins/vivado-risc-v"]
+[submodule "addins/ahbsdc"]
-	path = addins/vivado-risc-v
+	path = addins/ahbsdc
-	url = https://github.com/eugene-tarassov/vivado-risc-v.git
+	url = git@github.com:jacobpease/ahbsdc.git
 [submodule "addins/riscv-arch-test"]
 	path = addins/riscv-arch-test
 	url = https://github.com/riscv-non-isa/riscv-arch-test
--- a/addins/ahbsdc
+++ b/addins/ahbsdc
@ -0,0 +1 @@
 Subproject commit 5df21aa6625eca120e64ea353ca641aff37d90b2
--- a/addins/embench-iot
+++ b/addins/embench-iot
@ -1 +1 @@
-Subproject commit 1480febc3ace5f471baeee4b1ae0d8fea16e4762
+Subproject commit 4c5eb87983f51ca7fcf7855306877b3d1c3aabf1
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@ -1 +1 @@
-Subproject commit 197179fdc9dfeeca821e848f373c897a3fdae86c
+Subproject commit eb0a3892215ad2384702db02da1551a59701ec67
--- a/addins/riscv-tests
+++ b/addins/riscv-tests
@ -1 +0,0 @@
 Subproject commit cf04274f50621fd9ef9147793cca6dd1657985c7
--- a/addins/vivado-risc-v
+++ b/addins/vivado-risc-v
@ -1 +0,0 @@
 Subproject commit c76a8613a177b3a04face2cb8e15dd07a8d2fc40
--- a/benchmarks/embench/Makefile
+++ b/benchmarks/embench/Makefile
@ -3,6 +3,7 @@
 # Compile Embench for Wally
 embench_dir = ../../addins/embench-iot
 ARCH=rv32imac_zicsr
 all: build 
 run: build size sim
@ -15,7 +16,7 @@ buildsize: build_speedopt_size build_sizeopt_size
 # uses the build_all.py python file to build the tests in addins/embench-iot/bd_speed/ optimized for speed and size
 build_speedopt_speed:
-	$(embench_dir)/build_all.py --builddir=bd_speedopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S" --cflags="-O2 -nostartfiles" 
+	$(embench_dir)/build_all.py --builddir=bd_speedopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S -march=$(ARCH)" --cflags="-O2 -nostartfiles -march=$(ARCH)" 
 	# remove files not used in embench1.0  When changing to 2.0, restore these files		
 	#rm -rf $(embench_dir)/bd_speedopt_speed/src/md5sum
 	#rm -rf $(embench_dir)/bd_speedopt_speed/src/tarfind
@ -23,7 +24,7 @@ build_speedopt_speed:
 	find $(embench_dir)/bd_speedopt_speed/ -type f ! -name "*.*" | while read f; do cp "$$f" "$$f.elf"; done
 build_sizeopt_speed:
-	$(embench_dir)/build_all.py --builddir=bd_sizeopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S" --cflags="-Os -nostartfiles" 
+	$(embench_dir)/build_all.py --builddir=bd_sizeopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S -march=$(ARCH)" --cflags="-Os -nostartfiles -march=$(ARCH)" 
 	# remove files not used in embench1.0  When changing to 2.0, restore these files		
 	#rm -rf $(embench_dir)/bd_sizeopt_speed/src/md5sum
 	#rm -rf $(embench_dir)/bd_sizeopt_speed/src/tarfind
@ -32,10 +33,10 @@ build_sizeopt_speed:
 # uses the build_all.py python file to build the tests in addins/embench-iot/bd_speed/ optimized for speed and size
 build_speedopt_size:
-	$(embench_dir)/build_all.py --builddir=bd_speedopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S" --cflags="-O2 -msave-restore" --dummy-libs="libgcc libm libc crt0"
+	$(embench_dir)/build_all.py --builddir=bd_speedopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S -march=$(ARCH)" --cflags="-O2 -msave-restore -march=$(ARCH)" --dummy-libs="libgcc libm libc crt0"
 build_sizeopt_size:
-	$(embench_dir)/build_all.py --builddir=bd_sizeopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S" --cflags="-Os -msave-restore" --dummy-libs="libgcc libm libc crt0"
+	$(embench_dir)/build_all.py --builddir=bd_sizeopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S -march=$(ARCH)" --cflags="-Os -msave-restore -march=$(ARCH)" --dummy-libs="libgcc libm libc crt0"
 # builds dependencies, then launches modelsim and finally runs python wrapper script to present results
 sim: modelsim_build_memfile modelsim_run speed
--- a/benchmarks/embench/embench_arch_sweep.py
+++ b/benchmarks/embench/embench_arch_sweep.py
@ -0,0 +1,87 @@
 #!/usr/bin/python3
 # embench_arch_sweep.py
 # David_Harris@hmc.edu 16 November 2023
 # SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
 # Run embench on a variety of architectures and collate results
 import os
 from datetime import datetime
 import re
 import collections
 # Architecture strings to sweep.  run_arch_sweep passes each one to make as ARCH=<arch>,
 # and tabulate_arch_sweep prints one column per entry, in this order.
 # The first entry also decides which programs get a row in the printed tables.
 # Alternative ordering, from the smallest ISA to the largest:
 #archs = ["rv32i_zicsr", "rv32im_zicsr", "rv32imc_zicsr", "rv32imc_zba_zbb_zbc_zbs_zicsr", "rv32imafdc_zba_zbb_zbc_zbs_zicsr"]
 archs = ["rv32imafdc_zba_zbb_zbc_zbs_zicsr", "rv32i_zicsr", "rv32im_zicsr", "rv32imc_zicsr", "rv32imc_zba_zbb_zbc_zbs_zicsr"]
 def calcgeomean(d, arch):
     """Return the geometric mean of the benchmark scores stored for one architecture.

     d maps an architecture name to a {program name: score} mapping; every score is
     converted with float().  A program that has no entry for arch contributes 1.0,
     so it leaves the product unchanged but is still counted when taking the root.
     """
     benchmarks = ["aha-mont64", "crc32", "cubic", "edn", "huffbench", "matmult-int", "minver", "nbody", "nettle-aes", "nettle-sha256", "nsichneu", "picojpeg", "qrduino", "sglib-combined", "slre", "st", "statemate", "ud", "wikisort"]
     scores = d[arch]
     product = 1.0
     for name in benchmarks:
         product *= float(scores.get(name, 1.0))
     # n-th root of the product, n being the number of benchmarks in the list
     return pow(product, 1.0 / float(len(benchmarks)))
 def tabulate_arch_sweep(directory):
     """Print tab-separated result tables for the sweep stored in directory.

     For each of the cases "wallySizeOpt_size" and "wallySpeedOpt_speed", reads
     <case>_<arch>.json from directory for every entry of archs and prints:
       - a header row listing the architectures,
       - one row per program found in the results of archs[0]; an architecture
         with no value for that program shows "n/a",
       - a final "New geo mean" row computed with calcgeomean.
     A result file that is missing or cannot be read is treated as having no entries.
     """
     # matches lines of the form  "name" : value  and captures the name and the value
     entry_pattern = re.compile(r'"([^"]*)" : ([^,\n]+)')
     for case in ["wallySizeOpt_size", "wallySpeedOpt_speed"]:
         d = collections.defaultdict(dict)
         for arch in archs:
             file = case+"_"+arch+".json"
             file_path = os.path.join(directory, file)
             try:
                 # the with-block closes the file on every path; the previous code called
                 # f.close() on a name that was never bound when open() itself failed
                 with open(file_path, "r") as f:
                     lines = f.readlines()
             except (OSError, UnicodeError):
                 # best effort: no usable results for this architecture
                 lines = []
             for line in lines:
                 match = entry_pattern.search(line)
                 if match:
                     # values are kept as the strings found in the file
                     d[arch][match.group(1)] = match.group(2)
         for arch in [""] + archs:
             print (arch, end="\t")
         print("")
         for prog in d[archs[0]]:
             print(prog, end="\t")
             for arch in archs:
                 entry = d[arch].get(prog, "n/a")
                 print (entry, end="\t")
             print("")
         print("New geo mean", end="\t")
         for arch in archs:
             geomean = calcgeomean(d, arch)
             print(geomean, end="\t")
         print("")
 def run_arch_sweep():
     """Run the Embench flow once per architecture and gather the results in a new directory.

     Creates run_<YYYYMMDD_HHMMSS> in the current directory.  For every entry of archs it
     runs "make clean" and "make run ARCH=<arch>", then moves the four wally*.json result
     files into that directory with the architecture appended to each name.
     Returns the name of the directory.
     """
     # directory named after the current date and time
     stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
     run_dir = "run_" + stamp
     os.mkdir(run_dir)
     variants = ("SizeOpt_size", "SizeOpt_speed", "SpeedOpt_size", "SpeedOpt_speed")
     for arch in archs:
         os.system("make clean")
         os.system("make run ARCH=" + arch)
         # file each result under a name that records which architecture produced it
         for variant in variants:
             produced = "wally" + variant + ".json"
             archived = run_dir + "/wally" + variant + "_" + arch + ".json"
             os.system("mv -f " + produced + " " + archived)
     return run_dir
 # Run the full sweep, then print tables from the directory it produced.
 directory = run_arch_sweep()
 # To tabulate an earlier run instead, replace the call above with a fixed directory name:
 #directory = "run_20231117_082325"
 tabulate_arch_sweep(directory)
--- a/config/rv32gc/config.vh
+++ b/config/rv32gc/config.vh
@ -74,8 +74,8 @@ localparam ICACHE_LINELENINBITS = 32'd512;
 // Integer Divider Configuration
 // IDIV_BITSPERCYCLE must be 1, 2, or 4
-localparam IDIV_BITSPERCYCLE = 32'd4;
+localparam IDIV_BITSPERCYCLE = 32'd2;
-localparam IDIV_ON_FPU = 1;
+localparam IDIV_ON_FPU = 0;
 // Legal number of PMP entries are 0, 16, or 64
 localparam PMP_ENTRIES = 32'd16;
@ -169,7 +169,7 @@ localparam ZMMUL_SUPPORTED = 0;
 // FPU division architecture
 localparam RADIX = 32'd4;
-localparam DIVCOPIES = 32'd4;
+localparam DIVCOPIES = 32'd2;
 // bit manipulation
 localparam ZBA_SUPPORTED = 1;
--- a/config/rv64gc/config.vh
+++ b/config/rv64gc/config.vh
@ -150,7 +150,7 @@ localparam PLIC_SDC_ID = 32'd9;
 localparam BPRED_SUPPORTED = 1;
 localparam BPRED_TYPE = `BP_GSHARE; // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT
 localparam BPRED_NUM_LHR = 32'd6;
-localparam BPRED_SIZE = 32'd6;
+localparam BPRED_SIZE = 32'd10;
 localparam BTB_SIZE = 32'd10;
 localparam RAS_SIZE = 32'd16;
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@ -93,16 +93,21 @@ localparam NF2   = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_NF   : H_NF);
 localparam FMT2  = ((F_SUPPORTED & (LEN1 != S_LEN)) ? 2'd0    : 2'd2);
 localparam BIAS2 = ((F_SUPPORTED & (LEN1 != S_LEN)) ? S_BIAS : H_BIAS);
 // divider r and rk (bits per digit, bits per cycle)
 localparam LOGR        = $clog2(RADIX);                             // r = log(R) bits per digit
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
 // intermediate division parameters not directly used in fdivsqrt hardware
 localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit to allow sqrt being shifted right
 //localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right, if radix4 doesn't use a right shift.  This version saves one cycle on double-precision with R=4,k=4.  However, it doesn't work yet because C is too short, so k is incorrectly calculated as a 1 in the lsb after the last step.
 localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
 localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional
 // division constants
-localparam DIVN        = (((NF+2<XLEN) & IDIV_ON_FPU) ? XLEN : NF+2); // standard length of input
+localparam FPDUR       = (RESBITS-1)/RK + 1 ;                       // ceiling((r+b)/rk)
-localparam LOGR        = ($clog2(RADIX));           // r = log(R)
+localparam DIVb        = FPDUR*RK - LOGR;                           // divsqrt fractional bits, so total number of bits is a multiple of rk after r integer bits
-localparam RK          = (LOGR*DIVCOPIES);         // r*k used for intdiv preproc
+localparam DURLEN      = $clog2(FPDUR);                             // enough bits to count the duration
-localparam LOGRK       = ($clog2(RK));               // log2(r*k)
+localparam DIVBLEN     = $clog2(DIVb);                              // enough bits to count number of fractional bits
 localparam FPDUR       = ((DIVN+1+(LOGR*DIVCOPIES))/(LOGR*DIVCOPIES)+(RADIX/4));
 localparam DURLEN      = ($clog2(FPDUR+1));
 localparam DIVb        = (FPDUR*LOGR*DIVCOPIES-1); // canonical fdiv size (b)
 localparam DIVBLEN     = ($clog2(DIVb+1)-1);
 localparam DIVa        = (DIVb+1-XLEN); // used for idiv on fpu: Shift residual right by b - (XLEN-1) to put remainder in lsbs of integer result
 // largest length in IEU/FPU
 localparam CVTLEN = ((NF<XLEN) ? (XLEN) : (NF));  // max(XLEN, NF)
@ -110,7 +115,7 @@ localparam LLEN = (($unsigned(FLEN)<$unsigned(XLEN)) ? ($unsigned(XLEN)) : ($uns
 localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));
 localparam NORMSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVb + 1 +NF+1) > (3*NF+6) ? (DIVb + 1 +NF+1) : (3*NF+6)));
 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));
-localparam CORRSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVN+1+NF) > (3*NF+4) ? (DIVN+1+NF) : (3*NF+4)));
+localparam CORRSHIFTSZ = (((CVTLEN+NF+1)>(DIVb + 1 +NF+1) & (CVTLEN+NF+1)>(3*NF+6)) ? (CVTLEN+NF+1) : ((DIVMINb+1+NF) > (3*NF+4) ? (DIVMINb+1+NF) : (3*NF+4)));
 // Disable spurious Verilator warnings
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@ -179,13 +179,10 @@ localparam cvw_t P = '{
  NORMSHIFTSZ : NORMSHIFTSZ,
  LOGNORMSHIFTSZ : LOGNORMSHIFTSZ,
  CORRSHIFTSZ : CORRSHIFTSZ,
  DIVN        : DIVN,
  LOGR        : LOGR,
  RK          : RK,
  LOGRK       : LOGRK,
  FPDUR       : FPDUR,
  DURLEN      : DURLEN,
  DIVb        : DIVb,
-  DIVBLEN     : DIVBLEN,
+  DIVBLEN     : DIVBLEN
  DIVa        : DIVa
 };
--- a/fpga/generator/wally.tcl
+++ b/fpga/generator/wally.tcl
@ -42,13 +42,9 @@ if {$board=="ArtyA7"} {
 # read in all other rtl
 read_verilog -sv [glob -type f  ../src/CopiedFiles_do_not_add_to_repo/*/*.sv ../src/CopiedFiles_do_not_add_to_repo/*/*/*.sv]
 # *** Once the sdc is updated to use ahb changes these to system verilog.
-read_verilog [glob -type f ../src/axi_sdc_controller.v]
+read_verilog [glob -type f ../../addins/ahbsdc/sdc/*.v]
 read_verilog [glob -type f ../../addins/vivado-risc-v/sdc/sd_cmd_master.v]
 read_verilog [glob -type f ../../addins/vivado-risc-v/sdc/sd_cmd_serial_host.v]
 read_verilog [glob -type f ../../addins/vivado-risc-v/sdc/sd_data_master.v]
 read_verilog [glob -type f ../../addins/vivado-risc-v/sdc/sd_data_serial_host.v]
-set_property include_dirs {../src/CopiedFiles_do_not_add_to_repo/config ../../config/shared ../../addins/vivado-risc-v/sdc} [current_fileset]
+set_property include_dirs {../src/CopiedFiles_do_not_add_to_repo/config ../../config/shared ../../addins/ahbsdc/sdc} [current_fileset]
 if {$board=="ArtyA7"} {
    add_files -fileset constrs_1 -norecurse ../constraints/constraints-$board.xdc
--- a/fpga/src/boot.mem
+++ b/fpga/src/boot.mem
@ -0,0 +1,513 @@
 8001819300002197
 4281420141014081
 4481440143814301
 4681460145814501
 4881480147814701
 4a814a0149814901
 4c814c014b814b01
 4e814e014d814d01
 0110011b4f814f01
 059b45011161016e
 0004063705fe0010
 1f6000ef8006061b
 0ff003930000100f
 4e952e3110060e37
 c602829b0053f2b7
 2023fe02dfe312fd
 829b0053f2b7007e
 fe02dfe312fdc602
 4de31efd000e2023
 059bf1402573fdd0
 0000061705e20870
 0010029b01260613
 68110002806702fe
 0085179bf0080813
 038008130107f7b3
 480508a86c632781
 1533357902a87963
 38030000181700a8
 1c6301057833f268
 081a403018370808
 0105783342280813
 1815751308081063
 00367513c295e14d
 654ded510207e793
 c1701ff00613f130
 0637c530fff6861b
 664dcd10167d0200
 17fd001007b7c25c
 859b5a5cc20cd21c
 02062a23dfed0007
 4785fffd561c664d
 4501461c06f59063
 4a1cc35c465cc31c
 e29dc75c4a5cc71c
 0c63086008138082
 1ae30a9008130105
 b7710017e793f905
 e793b75901d7e793
 5f5c674db7410197
 66cd02072e23dffd
 fff78513ff7d5698
 40a0053300a03533
 bfb100a7e7938082
 e0a2715d8082557d
 e486f052f44ef84a
 fa13e85aec56fc26
 843289ae892a0086
 00959993000a1463
 864ac4396b054a85
 0009859b4549870a
 0004049b05540363
 86a66485008b7363
 870a87aaec7ff0ef
 4531458146014681
 f0ef0207c9639c05
 17820094979beb1f
 873e020541639381
 993e99ba020a1963
 870aa8094501f85d
 e8bff0ef45454685
 60a64505fe0559e3
 79a2794274e26406
 61616b426ae27a02
 9301020497138082
 f40647057179b7f1
 d79867cdec26f022
 dff58b85571c674d
 2423d35c03600793
 fffd571c674d0207
 0007a737b00026f3
 b00027f311f70713
 674dfef77de38f95
 4f5ccf9d8b895b1c
 26f3cf5c0027e793
 071305f5e737b000
 8f95b00027f30ff7
 4f5c674dfef77de3
 b00026f3cf5c9bf5
 67f7071300989737
 7de38f95b00027f3
 458146014681fef7
 ddbff0ef4501870a
 059346014681870a
 dcbff0ef45211aa0
 1aa007134782e939
 816393d117d24411
 85220ff0041302e7
 614564e270a27402
 46e3da5ff0efa0cd
 0207c7634782fe05
 458146014681870a
 d8bff0ef03700513
 46014681870a87aa
 0a900513403005b7
 4409bf7dfc07d9e3
 c3998b8583f9bfe1
 4681870a00846413
 f0ef450945814601
 870afa0540e3d59f
 123405b746014681
 46e3d45ff0ef450d
 870a77c14482f805
 85a6460146818cfd
 4ae3d2dff0ef451d
 d3d8470567cdf605
 000f4737b00026f3
 b00027f323f70713
 67cdfef77de38f95
 4681870a0007ae23
 0370051385a64601
 f2054fe3cf7ff0ef
 458146014681870a
 ce3ff0ef08600513
 4681870af20545e3
 4541200005934601
 f0055de3ccfff0ef
 3023bf010113bf09
 4605842a86aa4081
 40113423850a4585
 86a265a6da5ff0ef
 d99ff0ef04084605
 2201358322813603
 86a2260508700513
 d81ff0ef05629e0d
 2a0135832a813603
 9e0d86a226054505
 3603d6bff0ef057e
 0513320135833281
 9e0d86a226054010
 3083d53ff0ef0556
 4501400134034081
 0000808241010113
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 0000000000000000
 00600100d2e3ca40
--- a/linux/Makefile
+++ b/linux/Makefile
@ -27,14 +27,6 @@ BINARIES := fw_jump.elf vmlinux busybox
 OBJDUMPS := $(foreach name, $(BINARIES), $(basename $(name) .elf))
 OBJDUMPS := $(foreach name, $(OBJDUMPS), $(DIS)/$(name).objdump)
 define linuxDir =
 $(shell find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/linux-[0-9]+\.[0-9]+\.[0-9]+$$")
 endef
 define busyboxDir =
 $(shell find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/busybox-[0-9]+\.[0-9]+\.[0-9]+$$")
 endef
 .PHONY: all generate disassemble install clean cleanDTB cleanDriver test
 all:
@ -46,8 +38,7 @@ all:
 # Temp rule for debugging
 test:
-	@echo $(linuxDir)
+	echo $(shell find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/linux-[0-9]+\.[0-9]+\.[0-9]+$$")
 	@echo $(busyboxDir)
 generate: $(DTB) $(IMAGES)
@ -74,11 +65,13 @@ $(DIS)/%.objdump: $(IMAGES)/%.elf
 $(DIS)/%.objdump: $(IMAGES)/%
 	riscv64-unknown-elf-objdump -S $< >> $@
-$(IMAGES)/vmlinux: $(call linuxDir)/vmlinux
+# Copy the kernel image out of Buildroot's versioned linux-X.Y.Z build directory.
+# The directory is looked up by find when the recipe runs.  The two commands are joined
+# with && so cp is skipped if find fails, and the last line carries no continuation,
+# so the recipe cannot absorb whatever line follows it.
+$(IMAGES)/vmlinux:
-	cp $< $@
+	linuxDir=$$(find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/linux-[0-9]+\.[0-9]+\.[0-9]+$$") && \
 	cp "$$linuxDir/vmlinux" $@
-$(IMAGES)/busybox: $(call busyboxDir)/busybox
+# Copy the busybox binary out of Buildroot's versioned busybox-X.Y.Z build directory.
+# The directory is looked up by find when the recipe runs.  The two commands are joined
+# with && so cp is skipped if find fails, and the last line carries no continuation,
+# so the recipe cannot absorb whatever line follows it.
+$(IMAGES)/busybox:
-	cp $< $@
+	busyboxDir=$$(find $(BUILDROOT)/output/build -maxdepth 2 -type d -regex ".*/busybox-[0-9]+\.[0-9]+\.[0-9]+$$") && \
 	cp "$$busyboxDir/busybox" $@
 # Generating new Buildroot directories --------------------------------
--- a/sim/imperas.ic
+++ b/sim/imperas.ic
@ -18,12 +18,13 @@
 # More extensions
 --override cpu/Zcb=T
 --override cpu/unaligned=T
 # Cache block operations
 --override cpu/Zicbom=T
 --override cpu/Zicbop=T
 --override cpu/Zicboz=T
 --override cmomp_bytes=64  # Zic64b
 --override cmoz_bytes=64   # Zic64b
 --override lr_sc_grain=64  # Za64rs
 # 64 KiB continuous huge pages supported
 --override cpu/Svpbmt=T
@ -42,6 +43,7 @@
 --override cpu/reset_address=0x80000000
 --override cpu/unaligned=T  # Zicclsm (should be true)
 --override cpu/ignore_non_leaf_DAU=1
 --override cpu/wfi_is_nop=T
 --override cpu/misa_Extensions_mask=0x0
--- a/src/cvw.sv
+++ b/src/cvw.sv
@ -271,15 +271,12 @@ typedef struct packed {
  int CORRSHIFTSZ;
 // division constants
  int DIVN       ;
  int LOGR       ;
  int RK         ;
  int LOGRK      ;
  int FPDUR      ;
  int DURLEN     ;
  int DIVb       ;
  int DIVBLEN    ;
  int DIVa       ;
 } cvw_t;
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@ -45,8 +45,8 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
  input  logic                 IntDivE, W64E,
  output logic                 DivStickyM,
  output logic                 FDivBusyE, IFDivStartE, FDivDoneE,
-  output logic [P.NE+1:0]      QeM,
+  output logic [P.NE+1:0]      UeM,                         // Exponent result 
-  output logic [P.DIVb:0]      QmM,
+  output logic [P.DIVb:0]      UmM,                         // Significand result
  output logic [P.XLEN-1:0]    FIntDivResultM
 );
@ -67,17 +67,17 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
  // Integer div/rem signals                                
  logic                        BZeroM;                       // Denominator is zero
  logic                        IntDivM;                      // Integer operation
-  logic [P.DIVBLEN:0]          nM, mM;                       // Shift amounts
+  logic [P.DIVBLEN-1:0]        IntNormShiftM;                // Integer normalization shift amount
  logic                        ALTBM, AsM, BsM, W64M;        // Special handling for postprocessor
  logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
  logic                        ISpecialCaseE;                // Integer div/remainder special cases
  fdivsqrtpreproc #(P) fdivsqrtpreproc(                          // Preprocessor
    .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE),
-    .FmtE, .SqrtE, .XZeroE, .Funct3E, .QeM, .X, .D, .CyclesE,
+    .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
    // Int-specific 
    .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
-    .BZeroM, .nM, .mM, .AM, 
+    .BZeroM, .IntNormShiftM, .AM, 
    .IntDivM, .W64M, .ALTBM, .AsM, .BsM);
  fdivsqrtfsm #(P) fdivsqrtfsm(                                  // FSM
@ -94,8 +94,8 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
  fdivsqrtpostproc #(P) fdivsqrtpostproc(                        // Postprocessor
    .clk, .reset, .StallM, .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, 
    .SqrtE, .Firstun, .SqrtM, .SpecialCaseM, 
-    .QmM, .WZeroE, .DivStickyM, 
+    .UmM, .WZeroE, .DivStickyM, 
    // Int-specific 
-    .nM, .mM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
+    .IntNormShiftM, .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
    .FIntDivResultM);
 endmodule
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@ -30,13 +30,11 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.FMTBITS-1:0] FmtE,
  input  logic                 SqrtE,
  input  logic                 IntDivE,
-  input  logic [P.DIVBLEN:0]   nE,
+  input  logic [P.DIVBLEN-1:0] IntResultBitsE,    
  output logic [P.DURLEN-1:0]  CyclesE
 );
-  logic [P.DURLEN+1:0] Nf, fbits; // number of fractional bits
+
-  // DIVN = P.NF+3
+  logic [P.DIVBLEN-1:0] Nf, FPResultBitsE, ResultBitsE; // number of fractional (result) bits
  // NS = NF + 1
  // N = NS or NS+2 for div/sqrt.
  /* verilator lint_off WIDTH */
  if (P.FPSIZES == 1)
@ -64,12 +62,21 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
        P.Q_FMT: Nf = P.Q_NF;
      endcase 
  // Cycle logic
  // P.DIVCOPIES = k. P.LOGR = log(R) = r.  P.RK = rk.  
  // Integer division needs p fractional + r integer result bits
  // FP Division needs at least Nf fractional bits + 2 guard/round bits and one integer digit (LOG R integer bits) = Nf + 2 + r bits
  // FP Sqrt needs at least Nf fractional bits and 2 guard/round bits.  The integer bit is always initialized to 1 and does not need a cycle.
  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
  always_comb begin 
-    if (SqrtE) fbits = Nf + 2 + 1; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2 *** unclear why it works with just +1; is it related to DIVCOPIES logic below?
+    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
-    // if (SqrtE) fbits = Nf + 2 + 2; // Nf + two fractional bits for round/guard + 2 for right shift by up to 2
+    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits 
-    else       fbits = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits - try this when placing results in msbs
+
-    if (P.IDIV_ON_FPU) CyclesE =  IntDivE ? ((nE + 1)/P.DIVCOPIES) : (fbits + (P.LOGR*P.DIVCOPIES)-1)/(P.LOGR*P.DIVCOPIES);
+    if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
-    else              CyclesE = (fbits + (P.LOGR*P.DIVCOPIES)-1)/(P.LOGR*P.DIVCOPIES);
+    else               ResultBitsE = FPResultBitsE;
    CyclesE = (ResultBitsE-1)/(P.RK) + 1; // ceil (ResultBitsE/rk)
  end 
  /* verilator lint_on WIDTH */
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@ -28,16 +28,19 @@
 module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.FMTBITS-1:0] Fmt,
-  input  logic [P.NE-1:0]      Xe, Ye,
+  input  logic [P.NE-1:0]      Xe, Ye,    // input exponents
  input  logic                 Sqrt,
  input  logic                 XZero, 
-  input  logic [P.DIVBLEN:0]   ell, m,
+  input  logic [P.DIVBLEN-1:0] ell, m,    // number of leading 0s in significands Xm and Ym
-  output logic [P.NE+1:0]      Qe
+  output logic [P.NE+1:0]      Ue         // result exponent
  );
  logic [P.NE-2:0] Bias;
  logic [P.NE+1:0] SXExp;
  logic [P.NE+1:0] SExp;
  logic [P.NE+1:0] DExp;
  // Determine exponent bias according to the format
  if (P.FPSIZES == 1) begin
    assign Bias = (P.NE-1)'(P.BIAS); 
@ -63,10 +66,14 @@ module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
      2'h2: Bias =  (P.NE-1)'(P.H_BIAS);
    endcase
  end
  // Square root exponent = (Xe - l - bias) / 2 + bias; l accounts for subnorms
  assign SXExp = {2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - (P.NE+2)'(P.BIAS);
  assign SExp  = {SXExp[P.NE+1], SXExp[P.NE+1:1]} + {2'b0, Bias};
-  // correct exponent for subnormal input's normalization shifts
+  // division exponent = (Xe-l) - (Ye-m) + bias; l and m account for subnorms
  assign DExp  = ({2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - {2'b0, Ye} + {{(P.NE+1-P.DIVBLEN){1'b0}}, m} + {3'b0, Bias}); 
-  assign Qe = Sqrt ? SExp : DExp;
+
  // Select square root or division exponent
  assign Ue = Sqrt ? SExp : DExp;
 endmodule
--- a/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
@ -28,12 +28,12 @@
 module fdivsqrtfgen2 import cvw::*;  #(parameter cvw_t P) (
  input  logic              up, uz,
-  input  logic [P.DIVb+3:0] C, U, UM,
+  input  logic [P.DIVb+3:0] C, U, UM,   // Q4.DIVb (extended from shorter forms)
-  output logic [P.DIVb+3:0] F
+  output logic [P.DIVb+3:0] F           // Q4.DIVb
 );
-  logic [P.DIVb+3:0]        FP, FN, FZ;
+  logic [P.DIVb+3:0]        FP, FN, FZ;  // Q4.DIVb
-  // Generate for both positive and negative bits
+  // Generate for both positive and negative quotient digits
  assign FP = ~(U << 1) & C;
  assign FN = (UM << 1) | (C & ~(C << 2));
  assign FZ = '0;
--- a/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
@ -27,14 +27,14 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 module fdivsqrtfgen4 import cvw::*;  #(parameter cvw_t P) (
-  input  logic [3:0]        udigit,
+  input  logic [3:0]        udigit,           // {2, 1, -1, -2}; all cold for zero
-  input  logic [P.DIVb+3:0] C, U, UM,
+  input  logic [P.DIVb+3:0] C, U, UM,         // Q4.DIVb (extended from shorter forms)
-  output logic [P.DIVb+3:0] F
+  output logic [P.DIVb+3:0] F                 // Q4.DIVb
 );
-  logic [P.DIVb+3:0]        F2, F1, F0, FN1, FN2;
+  logic [P.DIVb+3:0]        F2, F1, F0, FN1, FN2; // Q4.DIVb
-  // Generate for both positive and negative bits
+  // Generate for both positive and negative digits
-  assign F2  = (~U << 2) & (C << 2);
+  assign F2  = (~U << 2) & (C << 2);              // 
  assign F1  = ~(U << 1) & C;
  assign F0  = '0;
  assign FN1 = (UM << 1) | (C & ~(C << 3));
--- a/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@ -57,7 +57,7 @@ module fdivsqrtfsm import cvw::*;  #(parameter cvw_t P) (
  // terminate immediately on special cases
  assign FSpecialCaseE = XZeroE | XInfE  | XNaNE |  (XsE&SqrtE) | (YZeroE | YInfE | YNaNE)&~SqrtE;
  if (P.IDIV_ON_FPU) assign SpecialCaseE = IntDivE ? ISpecialCaseE : FSpecialCaseE;
-  else              assign SpecialCaseE = FSpecialCaseE;
+  else               assign SpecialCaseE = FSpecialCaseE;
  flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
  always_ff @(posedge clk) begin
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@ -31,31 +31,31 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
  input  logic              IFDivStartE, 
  input  logic              FDivBusyE, 
  input  logic              SqrtE,
-  input  logic [P.DIVb+3:0] X, D,
+  input  logic [P.DIVb+3:0] X, D,                  // Q4.DIVb
-  output logic [P.DIVb:0]   FirstU, FirstUM,
+  output logic [P.DIVb:0]   FirstU, FirstUM,       // U1.DIVb
-  output logic [P.DIVb+1:0] FirstC,
+  output logic [P.DIVb+1:0] FirstC,                // Q2.DIVb
  output logic              Firstun,
-  output logic [P.DIVb+3:0] FirstWS, FirstWC
+  output logic [P.DIVb+3:0] FirstWS, FirstWC       // Q4.DIVb
 );
  /* verilator lint_off UNOPTFLAT */
-  logic [P.DIVb+3:0]      WSNext[P.DIVCOPIES-1:0]; // Q4.b
+  logic [P.DIVb+3:0]      WSNext[P.DIVCOPIES-1:0]; // Q4.DIVb
-  logic [P.DIVb+3:0]      WCNext[P.DIVCOPIES-1:0]; // Q4.b
+  logic [P.DIVb+3:0]      WCNext[P.DIVCOPIES-1:0]; // Q4.DIVb
-  logic [P.DIVb+3:0]      WS[P.DIVCOPIES:0];       // Q4.b
+  logic [P.DIVb+3:0]      WS[P.DIVCOPIES:0];       // Q4.DIVb
-  logic [P.DIVb+3:0]      WC[P.DIVCOPIES:0];       // Q4.b
+  logic [P.DIVb+3:0]      WC[P.DIVCOPIES:0];       // Q4.DIVb
-  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.b
+  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb
-  logic [P.DIVb:0]        UM[P.DIVCOPIES:0];       // U1.b
+  logic [P.DIVb:0]        UM[P.DIVCOPIES:0];       // U1.DIVb
-  logic [P.DIVb:0]        UNext[P.DIVCOPIES-1:0];  // U1.b
+  logic [P.DIVb:0]        UNext[P.DIVCOPIES-1:0];  // U1.DIVb
-  logic [P.DIVb:0]        UMNext[P.DIVCOPIES-1:0]; // U1.b
+  logic [P.DIVb:0]        UMNext[P.DIVCOPIES-1:0]; // U1.DIVb
-  logic [P.DIVb+1:0]      C[P.DIVCOPIES:0];        // Q2.b
+  logic [P.DIVb+1:0]      C[P.DIVCOPIES:0];        // Q2.DIVb
-  logic [P.DIVb+1:0]      initC;                   // Q2.b
+  logic [P.DIVb+1:0]      initC;                   // Q2.DIVb
  logic [P.DIVCOPIES-1:0] un; 
-  logic [P.DIVb+3:0]      WSN, WCN;                // Q4.b
+  logic [P.DIVb+3:0]      WSN, WCN;                // Q4.DIVb
-  logic [P.DIVb+3:0]      DBar, D2, DBar2;         // Q4.b
+  logic [P.DIVb+3:0]      DBar, D2, DBar2;         // Q4.DIVb
-  logic [P.DIVb+1:0]      NextC;
+  logic [P.DIVb+1:0]      NextC;                   // Q2.DIVb
-  logic [P.DIVb:0]        UMux, UMMux;
+  logic [P.DIVb:0]        UMux, UMMux;             // U1.DIVb
-  logic [P.DIVb:0]        initU, initUM;
+  logic [P.DIVb:0]        initU, initUM;           // U1.DIVb
  /* verilator lint_on UNOPTFLAT */
  // Top Muxes and Registers
@ -104,14 +104,14 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
    for(i=0; $unsigned(i)<P.DIVCOPIES; i++) begin : iterations
      if (P.RADIX == 2) begin: stage
        fdivsqrtstage2 #(P) fdivsqrtstage(.D, .DBar, .SqrtE,
-        .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
+          .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
-        .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
+          .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
      end else begin: stage
        logic j1;
        assign j1 = (i == 0 & ~C[0][P.DIVb-1]);
        fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1,
-        .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
+          .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
-        .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
+          .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
      end
      assign WS[i+1] = WSNext[i];
      assign WC[i+1] = WCNext[i];
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@ -27,25 +27,25 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
-  input  logic               clk, reset,
+  input  logic                 clk, reset,
-  input  logic               StallM,
+  input  logic                 StallM,
-  input  logic [P.DIVb+3:0]  WS, WC,
+  input  logic [P.DIVb+3:0]    WS, WC,            // Q4.DIVb
-  input  logic [P.DIVb+3:0]  D, 
+  input  logic [P.DIVb+3:0]    D,                 // Q4.DIVb
-  input  logic [P.DIVb:0]    FirstU, FirstUM, 
+  input  logic [P.DIVb:0]      FirstU, FirstUM,   // U1.DIVb
-  input  logic [P.DIVb+1:0]  FirstC,
+  input  logic [P.DIVb+1:0]    FirstC,            // Q2.DIVb
-  input  logic               SqrtE,
+  input  logic                 SqrtE,
-  input  logic               Firstun, SqrtM, SpecialCaseM, 
+  input  logic                 Firstun, SqrtM, SpecialCaseM, 
-  input  logic [P.XLEN-1:0]  AM,
+  input  logic [P.XLEN-1:0]    AM,                // U/Q(XLEN.0)
-  input  logic               RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
+  input  logic                 RemOpM, ALTBM, BZeroM, AsM, BsM, W64M,
-  input  logic [P.DIVBLEN:0] nM, mM,
+  input  logic [P.DIVBLEN-1:0] IntNormShiftM,     
-  output logic [P.DIVb:0]    QmM, 
+  output logic [P.DIVb:0]      UmM,               // U1.DIVb result significand
-  output logic               WZeroE,
+  output logic                 WZeroE,
-  output logic               DivStickyM,
+  output logic                 DivStickyM,
-  output logic [P.XLEN-1:0]  FIntDivResultM
+  output logic [P.XLEN-1:0]    FIntDivResultM     // U/Q(XLEN.0)
 );
  logic [P.DIVb+3:0]         W, Sum;
-  logic [P.DIVb:0]           PreQmM;
+  logic [P.DIVb:0]           PreUmM;
  logic                      NegStickyM;
  logic                      weq0E, WZeroM;
  logic [P.XLEN-1:0]         IntDivResultM;
@ -86,22 +86,21 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
  //////////////////////////
  //  If the result is not exact, the sticky should be set
-  assign DivStickyM = ~WZeroM & ~(SpecialCaseM & SqrtM); // ***unsure why SpecialCaseM has to be gated by SqrtM, but otherwise fails regression on divide
+  assign DivStickyM = ~WZeroM & ~SpecialCaseM; 
-  // Determine if sticky bit is negative  // *** look for ways to optimize this.  Shift shouldn't be needed.
+  // Determine if sticky bit is negative 
  assign Sum = WC + WS;
  assign NegStickyM = Sum[P.DIVb+3];
-  mux2 #(P.DIVb+1) preqmmux(FirstU, FirstUM, NegStickyM, PreQmM); // Select U or U-1 depending on negative sticky bit
+  mux2 #(P.DIVb+1) preummux(FirstU, FirstUM, NegStickyM, PreUmM); // Select U or U-1 depending on negative sticky bit
-  mux2 #(P.DIVb+1)    qmmux(PreQmM, (PreQmM << 1), SqrtM, QmM);
+  mux2 #(P.DIVb+1)    ummux(PreUmM, (PreUmM << 1), SqrtM, UmM);
-  // Integer quotient or remainder correctoin, normalization, and special cases
+  // Integer quotient or remainder correction, normalization, and special cases
  if (P.IDIV_ON_FPU) begin:intpostproc // Int supported
    logic [P.DIVBLEN:0] NormShiftM;
    logic [P.DIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM;
    logic signed [P.DIVb+3:0] PreResultM, PreIntResultM;
    assign W = $signed(Sum) >>> P.LOGR;
-    assign UnsignedQuotM = {3'b000, PreQmM};
+    assign UnsignedQuotM = {3'b000, PreUmM};
    // Integer remainder: sticky and sign correction muxes
    assign NegQuotM = AsM ^ BsM; // Integer Quotient is negative
@ -110,9 +109,8 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
    mux2 #(P.DIVb+4) quotresmux(UnsignedQuotM, -UnsignedQuotM, NegQuotM, NormQuotM);
    // Select quotient or remainder and do normalization shift
    mux2 #(P.DIVBLEN+1) normshiftmux(((P.DIVBLEN+1)'(P.DIVb) - (nM * (P.DIVBLEN+1)'(P.LOGR))), (mM + (P.DIVBLEN+1)'(P.DIVa)), RemOpM, NormShiftM);
    mux2 #(P.DIVb+4)    presresultmux(NormQuotM, NormRemM, RemOpM, PreResultM);
-    assign PreIntResultM = $signed(PreResultM >>> NormShiftM); 
+    assign PreIntResultM = $signed(PreResultM >>> IntNormShiftM); 
    // special case logic
    // terminates immediately when B is Zero (div 0) or |A| has more leading 0s than |B|
@ -120,7 +118,7 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
      if (BZeroM) begin         // Divide by zero
        if (RemOpM) IntDivResultM = AM;  
        else        IntDivResultM = {(P.XLEN){1'b1}};
-     end else if (ALTBM) begin // Numerator is zero
+     end else if (ALTBM) begin // Numerator is small
        if (RemOpM) IntDivResultM = AM;
        else        IntDivResultM = '0;
     end else       IntDivResultM = PreIntResultM[P.XLEN-1:0];
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@ -29,37 +29,39 @@
 module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  input  logic                 clk,
  input  logic                 IFDivStartE, 
-  input  logic [P.NF:0]        Xm, Ym,
+  input  logic [P.NF:0]        Xm, Ym,      // Floating-point significands
-  input  logic [P.NE-1:0]      Xe, Ye,
+  input  logic [P.NE-1:0]      Xe, Ye,      // Floating-point exponents
  input  logic [P.FMTBITS-1:0] FmtE,
  input  logic                 SqrtE,
  input  logic                 XZeroE,
  input  logic [2:0]           Funct3E,
-  output logic [P.NE+1:0]      QeM,
+  output logic [P.NE+1:0]      UeM,         // biased exponent of result
-  output logic [P.DIVb+3:0]    X, D,
+  output logic [P.DIVb+3:0]    X, D,        // Q4.DIVb
  // Int-specific
-  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // U(XLEN.0) inputs from IEU 
  input  logic                 IntDivE, W64E,
  // Outputs
  output logic                 ISpecialCaseE,
  output logic [P.DURLEN-1:0]  CyclesE,
-  output logic [P.DIVBLEN:0]   nM, mM,
+  output logic [P.DIVBLEN-1:0] IntNormShiftM,
  output logic                 ALTBM, IntDivM, W64M,
  output logic                 AsM, BsM, BZeroM,
  output logic [P.XLEN-1:0]    AM
 );
-  logic [P.DIVb-1:0]           Xfract, Dfract;
+  logic [P.DIVb:0]             Xnorm, Dnorm;
  logic [P.DIVb:0]             PreSqrtX;
  logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
-  logic [P.NE+1:0]             QeE;                                 // Quotient Exponent (FP only)
+  logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
-  logic [P.DIVb-1:0]           IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
+  logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
-  logic [P.DIVBLEN:0]          mE, nE, ell;                         // Leading zeros of inputs
+  logic [P.DIVBLEN-1:0]        mE, ell;                             // Leading zeros of inputs
  logic [P.DIVBLEN-1:0]        IntResultBitsE;                      // bits in integer result
  logic                        NumerZeroE;                          // Numerator is zero (X or A)
  logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
  logic                        SignedDivE;                          // signed division
  logic                        AsE, BsE;                            // Signs of integer inputs
  logic [P.XLEN-1:0]           AE;                                  // input A after W64 adjustment
-  logic  ALTBE;
+  logic                        ALTBE;
  logic                        EvenExp;
  //////////////////////////////////////////////////////
  // Integer Preprocessing
@ -89,12 +91,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
    mux2 #(P.XLEN) posbmux(BE, -BE, BsE, PosB);
    // Select integer or floating point inputs
-    mux2 #(P.DIVb) ifxmux({Xm, {(P.DIVb-P.NF-1){1'b0}}}, {PosA, {(P.DIVb-P.XLEN){1'b0}}}, IntDivE, IFX);
+    mux2 #(P.DIVb+1) ifxmux({Xm, {(P.DIVb-P.NF){1'b0}}}, {PosA, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFX);
-    mux2 #(P.DIVb) ifdmux({Ym, {(P.DIVb-P.NF-1){1'b0}}}, {PosB, {(P.DIVb-P.XLEN){1'b0}}}, IntDivE, IFD);
+    mux2 #(P.DIVb+1) ifdmux({Ym, {(P.DIVb-P.NF){1'b0}}}, {PosB, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFD);
    mux2 #(1)    numzmux(XZeroE, AZeroE, IntDivE, NumerZeroE);
  end else begin // Int not supported
-    assign IFX = {Xm, {(P.DIVb-P.NF-1){1'b0}}};
+    assign IFX = {Xm, {(P.DIVb-P.NF){1'b0}}};
-    assign IFD = {Ym, {(P.DIVb-P.NF-1){1'b0}}};
+    assign IFD = {Ym, {(P.DIVb-P.NF){1'b0}}};
    assign NumerZeroE = XZeroE;
  end
@ -103,12 +105,12 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  //////////////////////////////////////////////////////
  // count leading zeros for Subnorm FP and to normalize integer inputs
-  lzc #(P.DIVb) lzcX (IFX, ell);
+  lzc #(P.DIVb+1) lzcX (IFX, ell);
-  lzc #(P.DIVb) lzcY (IFD, mE);
+  lzc #(P.DIVb+1) lzcY (IFD, mE);
-  // Normalization shift: shift off leading one
+  // Normalization shift: shift leading one into most significant bit
-  assign Xfract = (IFX << ell) << 1;
+  assign Xnorm = (IFX << ell);
-  assign Dfract = (IFD << mE)  << 1; 
+  assign Dnorm = (IFD << mE); 
  //////////////////////////////////////////////////////
  // Integer Right Shift to digit boundary
@ -117,31 +119,28 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  //////////////////////////////////////////////////////
  if (P.IDIV_ON_FPU) begin:intrightshift // Int Supported
-    logic [P.DIVBLEN:0] ZeroDiff, p;
+    logic [P.DIVBLEN-1:0] ZeroDiff, p;
    // calculate number of fractional bits p
    assign ZeroDiff = mE - ell;         // Difference in number of leading zeros
-    assign ALTBE = ZeroDiff[P.DIVBLEN];  // A less than B (A has more leading zeros)
+    assign ALTBE = ZeroDiff[P.DIVBLEN-1];  // A less than B (A has more leading zeros)
-    mux2 #(P.DIVBLEN+1) pmux(ZeroDiff, '0, ALTBE, p);              
+    mux2 #(P.DIVBLEN) pmux(ZeroDiff, '0, ALTBE, p);          
    /* verilator lint_off WIDTH */
    assign IntResultBitsE = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
    /* verilator lint_on WIDTH */
    // Integer special cases (terminate immediately)
    assign ISpecialCaseE = BZeroE | ALTBE;
-    // calculate number of fractional digits nE and right shift amount RightShiftX to complete in discrete number of steps
+    // calculate right shift amount RightShiftX to complete in discrete number of steps
-
+    if (P.RK > 1) begin // more than 1 bit per cycle
-    if (P.LOGRK > 0) begin // more than 1 bit per cycle
+      logic [$clog2(P.RK)-1:0] RightShiftX;
-      logic [P.LOGRK-1:0] IntTrunc, RightShiftX;
+      /* verilator lint_off WIDTH */
-      logic [P.DIVBLEN:0] TotalIntBits, IntSteps;
+      assign RightShiftX = P.RK - 1 - ((IntResultBitsE - 1) % P.RK); // Right shift amount
-      /* verilator lint_off WIDTH */
+      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps
      assign TotalIntBits = P.LOGR + p;                            // Total number of result bits (r integer bits plus p fractional bits)
      assign IntTrunc = TotalIntBits % P.RK;                       // Truncation check for ceiling operator
      assign IntSteps = (TotalIntBits >> P.LOGRK) + |IntTrunc;     // Number of steps for int div
      assign nE = (IntSteps * P.DIVCOPIES) - 1;                    // Fractional digits
      assign RightShiftX = P.RK - 1 - ((TotalIntBits - 1) % P.RK); // Right shift amount
      assign DivXShifted = DivX >> RightShiftX;                    // shift X by up to R*K-1 to complete in nE steps
      /* verilator lint_on WIDTH */
    end else begin // radix 2 1 copy doesn't require shifting
      assign nE = p; 
      assign DivXShifted = DivX;
    end
  end else begin
@ -150,22 +149,53 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  //////////////////////////////////////////////////////
  // Floating-Point Preprocessing
-  // append leading 1 (for nonzero inputs)
+  // Extend to Q4.b format
  // shift square root to be in range [1/4, 1)
  // Normalized numbers are shifted right by 1 if the exponent is odd
  // Subnormal numbers have Xe = 0 and an unbiased exponent of 1-BIAS.  They are shifted right if the number of leading zeros is odd.
-  // NOTE: there might be a discrepancy that X is never right shifted by 2.  However
+   //////////////////////////////////////////////////////
  //  it comes out in the wash and gives the right answer.  Investigate later if possible.
  //////////////////////////////////////////////////////
-  assign DivX = {3'b000, ~NumerZeroE, Xfract};
+  assign DivX = {3'b000, Xnorm}; // Zero-extend numerator for division
  // Sqrt is initialized on step one as R(X-1), so depends on Radix
-  mux2 #(P.DIVb+1) sqrtxmux({~XZeroE, Xfract}, {1'b0, ~XZeroE, Xfract[P.DIVb-1:1]}, (Xe[0] ^ ell[0]), PreSqrtX);
+  // If X = 0, then special case logic sets sqrt = 0 so this portion doesn't matter
-  if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};
+  // Otherwise, X has a leading 1 after possible normalization shift and is now in range [1, 2)
-  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
+  // Next X is shifted right by 1 or 2 bits to range [1/4, 1) and exponent will be adjusted accordingly to be even
-  mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);
+  // Now (X-1) is negative.  Formed by placing all 1s in all four integer bits (in Q4.b form), keeping X in fraction bits
-  
+  // Then multiply by R is left shift by r (1 or 2 for radix 2 or 4)
  // This is optimized in hardware by first right shifting by 0 or 1 bit (instead of 1 or 2), then left shifting by (r-1), then subtracting 2 or 4
  // Subtracting 2 is equivalent to adding 1110.  Subtracting 4 is equivalent to adding 1100.  Prepend leading 1s to do a free subtraction.
  // This also means only one extra fractional bit is needed because we never shift right by more than 1.
  // Radix      Exponent odd          Exponent Even
  // 2          x-2 = 2(x/2 - 1)      x/2 - 2 = 2(x/4 - 1)
  // 4          2(x)-4 = 4(x/2 - 1)   2(x/2)-4 = 4(x/4 - 1)
  // Summary: PreSqrtX = r(x/2or4 - 1)
  logic [P.DIVb:0] PreSqrtX;
  assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
  if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) 
 /*  
  // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift
  // This saves one bit in DIVb because there is no initial right shift.
  // However, C needs to be extended further, lest it create a k with a 1 in the lsb when C is all 1s.
  // That is an optimization for another day.
  if (P.RADIX == 2) begin
    logic [P.DIVb:0] PreSqrtX;    // U1.DIVb
    mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
    assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
  end else begin
    logic [P.DIVb+1:0] PreSqrtX;  // U2.DIVb
    mux2 #(P.DIVb+2) sqrtxmux({Xnorm, 1'b0}, {1'b0, Xnorm}, EvenExp, PreSqrtX); // 2X if exponent odd, X if exponent even
    assign SqrtX = {2'b11, PreSqrtX};                     // PreSqrtX - 4 = 4(PreSqrtX/4 - 1)
  end
 */
  // Initialize X for division or square root
  mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);                    
  //////////////////////////////////////////////////////
  // Select integer or floating-point operands
  //////////////////////////////////////////////////////
@ -176,28 +206,37 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
    assign X = PreShiftX;
  end
-   // Divisior register
+  // Divisor register
-  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {4'b0001, Dfract}, D);
+  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dnorm}, D);
  // Floating-point exponent
-  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .XZero(XZeroE), .ell, .m(mE), .Qe(QeE));
+  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .XZero(XZeroE), .ell, .m(mE), .Ue(UeE));
-  flopen #(P.NE+2) expreg(clk, IFDivStartE, QeE, QeM);
+  flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
  // Number of FSM cycles (to FSM)
-  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .nE, .CyclesE);
+  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
  if (P.IDIV_ON_FPU) begin:intpipelineregs
    logic [P.DIVBLEN-1:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
    logic               RemOpE;
    /* verilator lint_off WIDTH */
    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
    assign IntRemNormShiftE = mE + (P.DIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
    /* verilator lint_on WIDTH */
    assign RemOpE = Funct3E[1];
    mux2 #(P.DIVBLEN) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);
    // pipeline registers
-    flopen #(1)        mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
+    flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
-    flopen #(1)       altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
+    flopen #(1)         altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
-    flopen #(1)      bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
+    flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
-    flopen #(1)      asignreg(clk, IFDivStartE, AsE,      AsM);
+    flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
-    flopen #(1)      bsignreg(clk, IFDivStartE, BsE,      BsM);
+    flopen #(1)        bsignreg(clk, IFDivStartE, BsE,      BsM);
-    flopen #(P.DIVBLEN+1) nreg(clk, IFDivStartE, nE,       nM); 
+    flopen #(P.DIVBLEN)   nsreg(clk, IFDivStartE, IntNormShiftE, IntNormShiftM); 
-    flopen #(P.DIVBLEN+1) mreg(clk, IFDivStartE, mE,       mM);
+    flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
    flopen #(P.XLEN)   srcareg(clk, IFDivStartE, AE,       AM);
    if (P.XLEN==64) 
-      flopen #(1)      w64reg(clk, IFDivStartE, W64E,     W64M);
+      flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
  end
 endmodule
--- a/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@ -29,33 +29,27 @@
 /* verilator lint_off UNOPTFLAT */
 module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.DIVb+3:0] D, DBar, 
+  input  logic [P.DIVb+3:0] D, DBar,        // Q4.DIVb
-  input  logic [P.DIVb:0]   U, UM,
+  input  logic [P.DIVb:0]   U, UM,          // U1.DIVb
-  input  logic [P.DIVb+3:0] WS, WC,
+  input  logic [P.DIVb+3:0] WS, WC,         // Q4.DIVb
-  input  logic [P.DIVb+1:0] C,
+  input  logic [P.DIVb+1:0] C,              // Q2.DIVb
-  input  logic             SqrtE,
+  input  logic              SqrtE,
-  output logic             un,
+  output logic              un,
-  output logic [P.DIVb+1:0] CNext,
+  output logic [P.DIVb+1:0] CNext,          // Q2.DIVb
-  output logic [P.DIVb:0]   UNext, UMNext, 
+  output logic [P.DIVb:0]   UNext, UMNext,  // U1.DIVb
-  output logic [P.DIVb+3:0] WSNext, WCNext
+  output logic [P.DIVb+3:0] WSNext, WCNext  // Q4.DIVb
 );
 /* verilator lint_on UNOPTFLAT */
-  logic [P.DIVb+3:0]        Dsel;
+  logic [P.DIVb+3:0]        Dsel;     // Q4.DIVb
-  logic                    up, uz;
+  logic                     up, uz;
-  logic [P.DIVb+3:0]        F;
+  logic [P.DIVb+3:0]        F;        // Q4.DIVb
-  logic [P.DIVb+3:0]        AddIn;
+  logic [P.DIVb+3:0]        AddIn;    // Q4.DIVb
-  logic [P.DIVb+3:0]        WSA, WCA;
+  logic [P.DIVb+3:0]        WSA, WCA; // Q4.DIVb
-  // Qmient Selection logic
+  // Quotient Selection logic
  // Given partial remainder, select digit of +1, 0, or -1 (up, uz, un)
-  // q encoding:
+  fdivsqrtuslc2 uslc2(.WS(WS[P.DIVb+3:P.DIVb]), .WC(WC[P.DIVb+3:P.DIVb]), .up, .uz, .un);
  // 1000 = +2
  // 0100 = +1
  // 0000 =  0
  // 0010 = -1
  // 0001 = -2
  fdivsqrtqsel2 qsel2(WS[P.DIVb+3:P.DIVb], WC[P.DIVb+3:P.DIVb], up, uz, un);
  // Sqrt F generation.  Extend C, U, UM to Q4.k
  fdivsqrtfgen2 #(P) fgen2(.up, .uz, .C({2'b11, CNext}), .U({3'b000, U}), .UM({3'b000, UM}), .F);
@ -66,7 +60,7 @@ module fdivsqrtstage2 import cvw::*;  #(parameter cvw_t P) (
    else if (uz) Dsel = '0;
    else         Dsel = D; // un
-  // Partial Product Generation
+  // Residual Update
  //  WSA, WCA = WS + WC - qD
  mux2 #(P.DIVb+4) addinmux(Dsel, F, SqrtE, AddIn);
  csa #(P.DIVb+4) csa(WS, WC, AddIn, up&~SqrtE, WSA, WCA);
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@ -27,40 +27,33 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.DIVb+3:0] D, DBar, D2, DBar2,
+  input  logic [P.DIVb+3:0] D, DBar, D2, DBar2, // Q4.DIVb
-  input  logic [P.DIVb:0]   U,UM,
+  input  logic [P.DIVb:0]   U,UM,               // U1.DIVb
-  input  logic [P.DIVb+3:0] WS, WC,
+  input  logic [P.DIVb+3:0] WS, WC,             // Q4.DIVb
-  input  logic [P.DIVb+1:0] C,
+  input  logic [P.DIVb+1:0] C,                  // Q2.DIVb
-  input  logic             SqrtE, j1,
+  input  logic              SqrtE, j1,
-  output logic [P.DIVb+1:0] CNext,
+  output logic [P.DIVb+1:0] CNext,              // Q2.DIVb
-  output logic             un,
+  output logic              un,
-  output logic [P.DIVb:0]   UNext, UMNext, 
+  output logic [P.DIVb:0]   UNext, UMNext,      // U1.DIVb
-  output logic [P.DIVb+3:0] WSNext, WCNext
+  output logic [P.DIVb+3:0] WSNext, WCNext      // Q4.DIVb
 );
-  logic [P.DIVb+3:0]        Dsel;
+  logic [P.DIVb+3:0]        Dsel;               // Q4.DIVb
-  logic [3:0]              udigit;
+  logic [3:0]               udigit;             // {+2, +1, -1, -2} or 0000 for 0
-  logic [P.DIVb+3:0]        F;
+  logic [P.DIVb+3:0]        F;                  // Q4.DIVb
-  logic [P.DIVb+3:0]        AddIn;
+  logic [P.DIVb+3:0]        AddIn;              // Q4.DIVb
-  logic [4:0]              Smsbs;
+  logic [4:0]               Smsbs;              // U1.4
-  logic [2:0]              Dmsbs;
+  logic [2:0]               Dmsbs;              // U0.3   drop leading 1 from D
-  logic [7:0]              WCmsbs, WSmsbs;
+  logic [7:0]               WCmsbs, WSmsbs;     // U4.4
-  logic                    CarryIn;
+  logic                     CarryIn;
-  logic [P.DIVb+3:0]        WSA, WCA;
+  logic [P.DIVb+3:0]        WSA, WCA;           // Q4.DIVb
  // Digit Selection logic
-  // u encoding:
+  assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
-  // 1000 = +2
+  assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
-  // 0100 = +1
+  assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
-  // 0000 =  0
+  assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
-  // 0010 = -1
+  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
  // 0001 = -2
  assign Smsbs  = U[P.DIVb:P.DIVb-4];
  assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];
  assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];
  assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];
  fdivsqrtqsel4cmp qsel4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
  assign un = 1'b0; // unused for radix 4
  // F generation logic
--- a/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
@ -31,15 +31,15 @@
 ///////////////////////////////
 module fdivsqrtuotfc2 import cvw::*;  #(parameter cvw_t P) (
  input  logic             up, un,
-  input  logic [P.DIVb+1:0] C,
+  input  logic [P.DIVb+1:0] C,                // Q2.DIVb
-  input  logic [P.DIVb:0]   U, UM,
+  input  logic [P.DIVb:0]   U, UM,            // U1.DIVb
-  output logic [P.DIVb:0]   UNext, UMNext
+  output logic [P.DIVb:0]   UNext, UMNext     // U1.DIVb
 );
  //  The on-the-fly converter transfers the divsqrt
  //  bits to the quotient as they come.
-  logic [P.DIVb:0] K;
+  logic [P.DIVb:0] K;                         // U1.DIVb one-hot 
-  assign K = (C[P.DIVb:0] & ~(C[P.DIVb:0] << 1)); // Thermometer to one hot encoding
+  assign K = (C[P.DIVb:0] & ~(C[P.DIVb:0] << 1)); // Thermometer to one hot encoding  
  always_comb begin
    if (up) begin
--- a/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
@ -28,15 +28,15 @@
 module fdivsqrtuotfc4 import cvw::*;  #(parameter cvw_t P) (
  input  logic [3:0]     udigit,
-  input  logic [P.DIVb:0] U, UM,
+  input  logic [P.DIVb:0] U, UM,          // U1.DIVb
-  input  logic [P.DIVb:0] C,
+  input  logic [P.DIVb:0] C,              // Q1.DIVb
-  output logic [P.DIVb:0] UNext, UMNext
+  output logic [P.DIVb:0] UNext, UMNext   // U1.DIVb
 );
  //  The on-the-fly converter transfers the square root 
  //  bits to the quotient as they come.
  //  Use this otfc for division and square root.
-  logic [P.DIVb:0] K1, K2, K3;       
+  logic [P.DIVb:0] K1, K2, K3;            // U1.DIVb
  assign K1 = (C&~(C << 1));        // K
  assign K2 = ((C << 1)&~(C << 2)); // 2K
  assign K3 = (C & ~(C << 2));      // 3K
--- a/src/fpu/fdivsqrt/fdivsqrtuslc2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc2.sv
@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel2.sv
+// fdivsqrtuslc2.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Radix 2 Quotient Digit Selection
+// Purpose: Radix 2 Unified Quotient/Square Root Digit Selection
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@ -18,7 +18,7 @@
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
-// https://solderpad.org/licenses/SHL-2.1/
+// https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
@ -26,31 +26,26 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
-module fdivsqrtqsel2 ( 
+module fdivsqrtuslc2 ( 
-  input  logic [3:0] ps, pc, 
+  input  logic [3:0] WS, WC,      // Q4.0 most significant bits of redundant residual
-  output logic       up, uz, un
+  output logic       up, uz, un   // {+1, 0, -1}
 );
-  logic [3:0]  p, g;
+  logic        sign;
-  logic        magnitude, sign;
+
  // Carry chain logic determines if W = WS + WC = -1, < -1, > -1 to choose 0, -1, 1 respectively
-  // The quotient selection logic is presented for simplicity, not
+  //if p2 * p1 * p0, W = -1 and choose digit of 0
-  // for efficiency.  You can probably optimize your logic to
+  assign uz = ((WS[2]^WC[2]) & (WS[1]^WC[1]) & 
-  // select the proper divisor with less delay.
+        (WS[0]^WC[0]));
-  // Quotient equations from EE371 lecture notes 13-20
+  // Otherwise determine sign using carry chain: sign = p3 ^ g_2:0
-  assign p = ps ^ pc;
+  assign sign = (WS[3]^WC[3])^
-  assign g = ps & pc;
+      (WS[2] & WC[2] | ((WS[2]^WC[2]) &
-
+          (WS[1]&WC[1] | ((WS[1]^WC[1]) &
-  assign magnitude = ~((ps[2]^pc[2]) & (ps[1]^pc[1]) & 
+            (WS[0]&WC[0])))));
        (ps[0]^pc[0]));
  assign sign = (ps[3]^pc[3])^
      (ps[2] & pc[2] | ((ps[2]^pc[2]) &
          (ps[1]&pc[1] | ((ps[1]^pc[1]) &
            (ps[0]&pc[0])))));
  // Produce digit = +1, 0, or -1
-  assign up = magnitude & ~sign;
+  assign up = ~uz & ~sign;
-  assign uz = ~magnitude;
+  assign un = ~uz & sign;
  assign un = magnitude & sign;
 endmodule
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel4.sv
+// fdivsqrtuslc4.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Radix 4 Quotient Digit Selection
+// Purpose: Table-based Radix 4 Unified Quotient/Square Root Digit Selection
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@ -26,25 +26,25 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
-module fdivsqrtqsel4 (
+module fdivsqrtuslc4 (
-  input  logic [2:0] Dmsbs,
+  input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
-  input  logic [4:0] Smsbs,
+  input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
-  input  logic [7:0] WSmsbs, WCmsbs,
+  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 redundant residual most significant bits
  input  logic       Sqrt, j1,
-  output logic [3:0] udigit
+  output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
-  logic [6:0] Wmsbs;
+  logic [7:0] PreWmsbs;                 // Q4.4 nonredundant residual msbs
-  logic [7:0] PreWmsbs;
+  logic [6:0] Wmsbs;                    // Q4.3 truncated nonredundant residual
-  logic [2:0] A;
+  logic [2:0] A;                        // U0.3 upper bits of D or Smsbs, discarding integer bit
-  assign PreWmsbs = WCmsbs + WSmsbs;
+  assign PreWmsbs = WCmsbs + WSmsbs;    // add redundant residual to find msbs
-  assign Wmsbs = PreWmsbs[7:1];
+  assign Wmsbs = PreWmsbs[7:1];         // truncate least significant bit to Q4.3 to index table
  // D = 0001.xxx...
  // Dmsbs = |   |
  // W =      xxxx.xxx...
  // Wmsbs = |        |
-  logic [3:0] USel4[1023:0];
+  logic [3:0] USel4[1023:0];            // 1024-entry table of 4-bit digits indexed with 3 bits of A and 7 bits of Wmsbs
  // Prepopulate selection table; this is constant at compile time
  always_comb begin 
@ -101,10 +101,10 @@ module fdivsqrtqsel4 (
  // Select A
  always_comb
    if (Sqrt) begin 
-      if (j1) A = 3'b101;
+      if (j1) A = 3'b101;                       // on first sqrt iteration        A = .101
-      else if (Smsbs == 5'b10000) A = 3'b111;
+      else if (Smsbs == 5'b10000) A = 3'b111;   // if S = 1.0, use                A = .111
-      else A = Smsbs[2:0];
+      else A = Smsbs[2:0];                      // otherwise use                  A = 2S (in U0.3 format)
-    end else A = Dmsbs;
+    end else A = Dmsbs;                         // for division, use              A = D (in U0.3 format, dropping leading 1)
  // Select quotient digit from lookup table based on A and W
  assign udigit = USel4[{A,Wmsbs}];
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@ -1,10 +1,10 @@
 ///////////////////////////////////////////
-// fdivsqrtqsel4cmp.sv
+// fdivsqrtuslc4cmp.sv
 //
 // Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu 
 // Modified:13 January 2022
 //
-// Purpose: Comparator-based Radix 4 Quotient Digit Selection
+// Purpose: Comparator-based Radix 4 Unified Quotient/Square Root Digit Selection 
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
@ -26,12 +26,12 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
-module fdivsqrtqsel4cmp (
+module fdivsqrtuslc4cmp (
-  input  logic [2:0] Dmsbs,
+  input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
-  input  logic [4:0] Smsbs,
+  input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
-  input  logic [7:0] WSmsbs, WCmsbs,
+  input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 residual most significant bits
  input  logic       SqrtE, j1,
-  output logic [3:0] udigit
+  output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
  logic [6:0] Wmsbs;
  logic [7:0] PreWmsbs;
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@ -133,8 +133,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  logic [P.XLEN-1:0]           FCvtIntResM;                        // fcvt integer result (for IEU)
  // divide signals
-  logic [P.DIVb:0]             QmM;                                // fdivsqrt signifcand
+  logic [P.DIVb:0]             UmM;                                // fdivsqrt significand
-  logic [P.NE+1:0]             QeM;                                // fdivsqrt exponent
+  logic [P.NE+1:0]             UeM;                                // fdivsqrt exponent
  logic                        DivStickyM;                         // fdivsqrt sticky bit
  logic                        FDivDoneE, IFDivStartE;             // fdivsqrt control signals
  logic [P.XLEN-1:0]           FIntDivResultM;                     // fdivsqrt integer division result (for IEU)
@ -242,8 +242,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  fdivsqrt #(P) fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
    .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
    .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .IntDivE, .W64E,
-    .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .QeM, 
+    .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .UeM, 
-    .QmM, .FIntDivResultM);
+    .UmM, .FIntDivResultM);
  // compare: fmin/fmax, flt/fle/feq
  fcmp #(P) fcmp (.Fmt(FmtE), .OpCtrl(OpCtrlE), .Xs(XsE), .Ys(YsE), .Xe(XeE), .Ye(YeE), 
@ -326,9 +326,9 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  //////////////////////////////////////////////////////////////////////////////////////////
  postprocess #(P) postprocess(.Xs(XsM), .Ys(YsM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), 
-    .FmaASticky(FmaAStickyM), .XZero(XZeroM), .YZero(YZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
+    .FmaASticky(FmaAStickyM), .XZero(XZeroM), .YZero(YZeroM), .XInf(XInfM), .YInf(YInfM), .DivUm(UmM), .FmaSs(SsM),
    .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), 
-    .FmaSm(SmM), .DivQe(QeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
+    .FmaSm(SmM), .DivUe(UeM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
    .CvtCe(CeM), .CvtResSubnormUf(CvtResSubnormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), 
    .ToInt(FWriteIntM), .DivSticky(DivStickyM), .CvtLzcIn(CvtLzcInM), .IntZero(IntZeroM), 
    .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));
--- a/src/fpu/postproc/divshiftcalc.sv
+++ b/src/fpu/postproc/divshiftcalc.sv
@ -27,8 +27,8 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.DIVb:0]              DivQm,              // divsqrt significand
+  input  logic [P.DIVb:0]              DivUm,              // divsqrt significand
-  input  logic [P.NE+1:0]              DivQe,              // divsqrt exponent
+  input  logic [P.NE+1:0]              DivUe,              // divsqrt exponent
  output logic [P.LOGNORMSHIFTSZ-1:0]  DivShiftAmt,        // divsqrt shift amount
  output logic [P.NORMSHIFTSZ-1:0]     DivShiftIn,         // divsqrt shift input
  output logic                         DivResSubnorm,      // is the divsqrt result subnormal
@ -41,23 +41,23 @@ module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
  // is the result subnormal
  // if the exponent is 1 then the result needs to be normalized then the result is Subnormalizes
-  assign DivResSubnorm = DivQe[P.NE+1]|(~|DivQe[P.NE+1:0]);
+  assign DivResSubnorm = DivUe[P.NE+1]|(~|DivUe[P.NE+1:0]);
  // if the result is subnormal
-  //  00000000x.xxxxxx...                     Exp = DivQe
+  //  00000000x.xxxxxx...                     Exp = DivUe
-  //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
-  //  .00xxxxxxxxxxxxx... << DivQe+NF+1       Exp = +1
+  //  .00xxxxxxxxxxxxx... << DivUe+NF+1       Exp = +1
  //  .0000xxxxxxxxxxx... >> 1                Exp = 1
-  // Left shift amount      = DivQe+NF+1-1
+  // Left shift amount      = DivUe+NF+1-1
-  assign DivSubnormShift    = (P.NE+2)'(P.NF)+DivQe;
+  assign DivSubnormShift    = (P.NE+2)'(P.NF)+DivUe;
  assign DivSubnormShiftPos = ~DivSubnormShift[P.NE+1];
  // if the result is normalized
-  //  00000000x.xxxxxx...                     Exp = DivQe
+  //  00000000x.xxxxxx...                     Exp = DivUe
-  //  .00000000xxxxxxx... >> NF+1             Exp = DivQe+NF+1
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
-  //  00000000.xxxxxxx... << NF               Exp = DivQe+1
+  //  00000000.xxxxxxx... << NF               Exp = DivUe+1
-  //  00000000x.xxxxxx... << NF               Exp = DivQe (extra shift done afterwards)
+  //  00000000x.xxxxxx... << NF               Exp = DivUe (extra shift done afterwards)
-  //  00000000xx.xxxxx... << 1?               Exp = DivQe-1 (determined after)
+  //  00000000xx.xxxxx... << 1?               Exp = DivUe-1 (determined after)
  // inital Left shift amount  = NF
  // shift one more if the it's a minimally redundent radix 4 - one entire cycle needed for integer bit
  assign NormShift = (P.LOGNORMSHIFTSZ)'(P.NF);
@ -68,5 +68,5 @@ module divshiftcalc import cvw::*;  #(parameter cvw_t P) (
  assign DivShiftAmt        = DivResSubnorm ? DivSubnormShiftAmt : NormShift;
  // pre-shift the divider result for normalization
-  assign DivShiftIn = {{P.NF{1'b0}}, DivQm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
+  assign DivShiftIn = {{P.NF{1'b0}}, DivUm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
 endmodule
--- a/src/fpu/postproc/postprocess.sv
+++ b/src/fpu/postproc/postprocess.sv
@ -48,8 +48,8 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
  input logic  [$clog2(3*P.NF+5)-1:0]      FmaSCnt,             // the normalization shift count
  //divide signals
  input logic                              DivSticky,           // divider sticky bit
-  input logic  [P.NE+1:0]                  DivQe,               // divsqrt exponent
+  input logic  [P.NE+1:0]                  DivUe,               // divsqrt exponent
-  input logic  [P.DIVb:0]                  DivQm,               // divsqrt significand
+  input logic  [P.DIVb:0]                  DivUm,               // divsqrt significand
  // conversion signals
  input logic                              CvtCs,               // the result's sign
  input logic  [P.NE:0]                    CvtCe,               // the calculated expoent
@ -91,7 +91,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
  // division singals
  logic [P.LOGNORMSHIFTSZ-1:0] DivShiftAmt;          // divsqrt shif amount
  logic [P.NORMSHIFTSZ-1:0]    DivShiftIn;           // divsqrt shift input
-  logic [P.NE+1:0]             Qe;                   // divsqrt corrected exponent after corretion shift
+  logic [P.NE+1:0]             Ue;                   // divsqrt corrected exponent after correction shift
  logic                        DivByZero;            // divide by zero flag
  logic                        DivResSubnorm;        // is the divsqrt result subnormal
  logic                        DivSubnormShiftPos;   // is the divsqrt subnorm shift amout positive (not underflowed)
@ -146,7 +146,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
  fmashiftcalc #(P) fmashiftcalc(.FmaSm, .FmaSCnt, .Fmt, .NormSumExp, .FmaSe,
      .FmaSZero, .FmaPreResultSubnorm, .FmaShiftAmt, .FmaShiftIn);
-  divshiftcalc #(P) divshiftcalc(.DivQe, .DivQm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
+  divshiftcalc #(P) divshiftcalc(.DivUe, .DivUm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
  // select which unit's output to shift
  always_comb
@ -174,7 +174,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
  // correct for LZA/divsqrt error
  shiftcorrection #(P) shiftcorrection(.FmaOp, .FmaPreResultSubnorm, .NormSumExp,
-      .DivResSubnorm, .DivSubnormShiftPos, .DivOp, .DivQe, .Qe, .FmaSZero, .Shifted, .FmaMe, .Mf);
+      .DivResSubnorm, .DivSubnormShiftPos, .DivOp, .DivUe, .Ue, .FmaSZero, .Shifted, .FmaMe, .Mf);
  ///////////////////////////////////////////////////////////////////////////////
  // Rounding
@ -189,7 +189,7 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
  // calulate result sign used in rounding unit
  roundsign roundsign(.FmaOp, .DivOp, .CvtOp, .Sqrt, .FmaSs, .Xs, .Ys, .CvtCs, .Ms);
-  round #(P) round(.OutFmt, .Frm, .FmaASticky, .Plus1, .PostProcSel, .CvtCe, .Qe,
+  round #(P) round(.OutFmt, .Frm, .FmaASticky, .Plus1, .PostProcSel, .CvtCe, .Ue,
      .Ms, .FmaMe, .FmaOp, .CvtOp, .CvtResSubnormUf, .Mf, .ToInt,  .CvtResUf,
      .DivSticky, .DivOp, .UfPlus1, .FullRe, .Rf, .Re, .Sticky, .Round, .Guard, .Me);
--- a/src/fpu/postproc/round.sv
+++ b/src/fpu/postproc/round.sv
@ -39,7 +39,7 @@ module round import cvw::*;  #(parameter cvw_t P) (
  // divsqrt
  input  logic                     DivOp,              // is a division opperation being done
  input  logic                     DivSticky,          // divsqrt sticky bit
-  input  logic [P.NE+1:0]          Qe,                 // the divsqrt calculated expoent
+  input  logic [P.NE+1:0]          Ue,                 // the divsqrt calculated exponent
  // cvt
  input  logic                     CvtOp,              // is a convert opperation being done
  input  logic                     ToInt,              // is the cvt op a cvt to integer
@ -300,8 +300,8 @@ module round import cvw::*;  #(parameter cvw_t P) (
      case(PostProcSel)
          2'b10:    Me = FmaMe; // fma
          2'b00:    Me = {CvtCe[P.NE], CvtCe}&{P.NE+2{~CvtResSubnormUf|CvtResUf}}; // cvt
-          // 2'b01: Me = DivDone ? Qe : '0; // divide
+          // 2'b01: Me = DivDone ? Ue : '0; // divide
-          2'b01:    Me = Qe; // divide
+          2'b01:    Me = Ue; // divide
          default:  Me = '0; 
      endcase
--- a/src/fpu/postproc/shiftcorrection.sv
+++ b/src/fpu/postproc/shiftcorrection.sv
@ -31,7 +31,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  // divsqrt
  input logic                      DivOp,                  // is it a divsqrt opperation
  input logic                      DivResSubnorm,          // is the divsqrt result subnormal
-  input logic  [P.NE+1:0]          DivQe,                  // the divsqrt result's exponent
+  input logic  [P.NE+1:0]          DivUe,                  // the divsqrt result's exponent
  input logic                      DivSubnormShiftPos,     // is the subnorm divider shift amount positive (ie not underflowed)
  //fma
  input logic                      FmaOp,                  // is it an fma opperation
@ -41,7 +41,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  // output
  output logic [P.NE+1:0]          FmaMe,                  // exponent of the normalized sum
  output logic [P.CORRSHIFTSZ-1:0] Mf,                     // the shifted sum before LZA correction
-  output logic [P.NE+1:0]          Qe                      // corrected exponent for divider
+  output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
 );
  logic [3*P.NF+3:0]               CorrSumShifted;         // the shifted sum after LZA correction
@ -61,7 +61,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  // correct the shifting of the divsqrt caused by producing a result in (2, .5] range
  // condition: if the msb is 1 or the exponent was one, but the shifted quotent was < 1 (Subnorm)
-  assign LeftShiftQm = (LZAPlus1|(DivQe==1&~LZAPlus1));
+  assign LeftShiftQm = (LZAPlus1|(DivUe==1&~LZAPlus1));
  assign CorrQm0     = Shifted[P.NORMSHIFTSZ-3:P.NORMSHIFTSZ-P.CORRSHIFTSZ-2];
  assign CorrQm1     = Shifted[P.NORMSHIFTSZ-2:P.NORMSHIFTSZ-P.CORRSHIFTSZ-1];
  mux2 #(P.CORRSHIFTSZ) divcorrmux(CorrQm0, CorrQm1, LeftShiftQm, CorrQmShifted);
@ -87,5 +87,5 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  // the quotent is in the range [.5,2) if there is no early termination
  // if the quotent < 1 and not Subnormal then subtract 1 to account for the normalization shift
-  assign Qe = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivQe - {(P.NE+1)'(0), ~LZAPlus1};
+  assign Ue = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivUe - {(P.NE+1)'(0), ~LZAPlus1};
 endmodule
--- a/src/fpu/unpackinput.sv
+++ b/src/fpu/unpackinput.sv
@ -83,7 +83,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
      assign BadNaNBox = ~(Fmt|(&In[P.FLEN-1:P.LEN1])); // Check NaN boxing
      always_comb
        if (BadNaNBox) begin
 //          PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, In[P.LEN1-P.NE1-3:0]};
          PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, {(P.LEN1-P.NE1-2){1'b0}}};
        end else 
          PostBox = In;
@ -143,8 +142,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
        if (BadNaNBox) begin
          case (Fmt)
            P.FMT: PostBox = In;
 //            P.FMT1: PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, In[P.LEN1-P.NE1-3:0]};
 //            P.FMT2: PostBox = {{(P.FLEN-P.LEN2){1'b1}}, 1'b1, {(P.NE2+1){1'b1}}, In[P.LEN2-P.NE2-3:0]};
            P.FMT1: PostBox = {{(P.FLEN-P.LEN1){1'b1}}, 1'b1, {(P.NE1+1){1'b1}}, {(P.LEN1-P.NE1-2){1'b0}}};
            P.FMT2: PostBox = {{(P.FLEN-P.LEN2){1'b1}}, 1'b1, {(P.NE2+1){1'b1}}, {(P.LEN2-P.NE2-2){1'b0}}};
            default: PostBox = 'x;
@ -230,9 +227,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
        if (BadNaNBox) begin
          case (Fmt)
            2'b11: PostBox = In;
 //            2'b01: PostBox = {{(P.Q_LEN-P.D_LEN){1'b1}}, 1'b1, {(P.D_NE+1){1'b1}}, In[P.D_LEN-P.D_NE-3:0]};
 //            2'b00: PostBox = {{(P.Q_LEN-P.S_LEN){1'b1}}, 1'b1, {(P.S_NE+1){1'b1}}, In[P.S_LEN-P.S_NE-3:0]};
 //            2'b10: PostBox = {{(P.Q_LEN-P.H_LEN){1'b1}}, 1'b1, {(P.H_NE+1){1'b1}}, In[P.H_LEN-P.H_NE-3:0]};
            2'b01: PostBox = {{(P.Q_LEN-P.D_LEN){1'b1}}, 1'b1, {(P.D_NE+1){1'b1}}, {(P.D_LEN-P.D_NE-2){1'b0}}};
            2'b00: PostBox = {{(P.Q_LEN-P.S_LEN){1'b1}}, 1'b1, {(P.S_NE+1){1'b1}}, {(P.S_LEN-P.S_NE-2){1'b0}}};
            2'b10: PostBox = {{(P.Q_LEN-P.H_LEN){1'b1}}, 1'b1, {(P.H_NE+1){1'b1}}, {(P.H_LEN-P.H_NE-2){1'b0}}};
--- a/src/generic/mem/rom1p1r.sv
+++ b/src/generic/mem/rom1p1r.sv
@ -33,7 +33,7 @@ module rom1p1r #(parameter ADDR_WIDTH = 8, DATA_WIDTH = 32, PRELOAD_ENABLED = 0)
 );
   // Core Memory
-   logic [DATA_WIDTH-1:0]    ROM [(2**ADDR_WIDTH)-1:0];
+   (*rom_style="block" *) logic [DATA_WIDTH-1:0]    ROM [(2**ADDR_WIDTH)-1:0];
   // dh 10/30/23 ROM macros are presently commented out
   // because they don't point to a generated ROM
@ -41,15 +41,23 @@ module rom1p1r #(parameter ADDR_WIDTH = 8, DATA_WIDTH = 32, PRELOAD_ENABLED = 0)
      rom1p1r_128x64 rom1 (.CLK(clk), .CEB(~ce), .A(addr[6:0]), .Q(dout));
   end if ((`USE_SRAM == 1) & (ADDR_WDITH == 7) & (DATA_WIDTH == 32)) begin
-      rom1p1r_128x32 rom1 (.CLK(clk), .CEB(~ce), .A(addr[6:0]), .Q(dout));      
+ rom1p1r_128x32 rom1 (.CLK(clk), .CEB(~ce), .A(addr[6:0]), .Q(dout));
-   end else begin */
+  end else begin */
-   always @ (posedge clk) 
+
-      if(ce) dout <= ROM[addr];    
+  initial begin
    if (PRELOAD_ENABLED) begin
      $readmemh("$WALLY/fpga/src/boot.mem", ROM, 0);
    end
  end
  always @ (posedge clk) begin
    if(ce) dout <= ROM[addr];
  end
   // for FPGA, initialize with zero-stage bootloader
-   if(PRELOAD_ENABLED) begin
+   /*if(PRELOAD_ENABLED) begin
      initial begin
        ROM[0]=64'h8001819300002197;
        ROM[1]=64'h4281420141014081;
@ -195,6 +203,6 @@ module rom1p1r #(parameter ADDR_WIDTH = 8, DATA_WIDTH = 32, PRELOAD_ENABLED = 0)
        ROM[141]=64'h0000808241010113;
      end // if (PRELOAD_ENABLED)  
-   end 
+   end*/
 endmodule 
--- a/src/hazard/hazard.sv
+++ b/src/hazard/hazard.sv
@ -26,8 +26,7 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
-module hazard (
+module hazard import cvw::*;  #(parameter cvw_t P) ( 
  // Detect hazards
  input  logic  BPWrongE, CSRWriteFenceM, RetM, TrapM,   
  input  logic  LoadStallD, StoreStallD, MDUStallD, CSRRdStallD,
  input  logic  LSUStallM, IFUStallF,
--- a/src/ieu/datapath.sv
+++ b/src/ieu/datapath.sv
@ -131,7 +131,7 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
  if (P.F_SUPPORTED) begin:fpmux
    mux2  #(P.XLEN)  resultmuxM(IEUResultM, FIntResM, FWriteIntM, IFResultM);
    mux2  #(P.XLEN)  cvtresultmuxW(IFResultW, FCvtIntResW, FCvtIntW, IFCvtResultW);
-    if (P.IDIV_ON_FPU) begin
+    if (P.IDIV_ON_FPU & P.F_SUPPORTED) begin
      mux2  #(P.XLEN)  divresultmuxW(MDUResultW, FIntDivResultW, IntDivW, MulDivResultW);
    end else begin 
      assign MulDivResultW = MDUResultW;
--- a/src/ifu/irom.sv
+++ b/src/ifu/irom.sv
@ -39,7 +39,9 @@ module irom import cvw::*;  #(parameter cvw_t P) (
  logic [31:0]       RawIROMInstrF;
  logic [2:1]        AdrD;
-  rom1p1r #(ADDR_WDITH, P.XLEN) rom(.clk, .ce, .addr(Adr[ADDR_WDITH+OFFSET-1:OFFSET]), .dout(IROMInstrFFull));
+  // preload IROM with the FPGA bootloader by default so that it synthesizes to something, avoiding having the IEU optimized away because instructions are all 0
  // the testbench replaces these dummy contents with the actual program of interest during simulation
  rom1p1r #(ADDR_WDITH, P.XLEN, 1) rom(.clk, .ce, .addr(Adr[ADDR_WDITH+OFFSET-1:OFFSET]), .dout(IROMInstrFFull));
  if (P.XLEN == 32) assign RawIROMInstrF = IROMInstrFFull;
  else              begin
  // IROM is aligned to XLEN words, but instructions are 32 bits.  Select between the two
--- a/src/lsu/lsu.sv
+++ b/src/lsu/lsu.sv
@ -92,7 +92,8 @@ module lsu import cvw::*;  #(parameter cvw_t P) (
  input var logic [7:0]           PMPCFG_ARRAY_REGW[P.PMP_ENTRIES-1:0], // PMP configuration from privileged unit
  input var logic [P.PA_BITS-3:0] PMPADDR_ARRAY_REGW[P.PMP_ENTRIES-1:0] // PMP address from privileged unit
 );
-  localparam MISALIGN_SUPPORT = P.ZICCLSM_SUPPORTED & P.DCACHE_SUPPORTED;
+  localparam logic MISALIGN_SUPPORT = P.ZICCLSM_SUPPORTED & P.DCACHE_SUPPORTED;
  localparam MLEN = MISALIGN_SUPPORT ? 2*P.LLEN : P.LLEN; // widen buffer for misaligned accessess
  logic [P.XLEN+1:0]     IEUAdrExtM;                             // Memory stage address zero-extended to PA_BITS or XLEN whichever is longer
  logic [P.XLEN+1:0]     IEUAdrExtE;                             // Execution stage address zero-extended to PA_BITS or XLEN whichever is longer
@ -118,9 +119,9 @@ module lsu import cvw::*;  #(parameter cvw_t P) (
  logic [P.LLEN-1:0]     DTIMReadDataWordM;                      // DTIM read data
  /* verilator lint_off WIDTHEXPAND */  
-  logic [(MISALIGN_SUPPORT+1)*P.LLEN-1:0]     DCacheReadDataWordM;                    // D$ read data
+  logic [MLEN-1:0]       DCacheReadDataWordM;                    // D$ read data
-  logic [(MISALIGN_SUPPORT+1)*P.LLEN-1:0]   LSUWriteDataSpillM;                     // Final write data
+  logic [MLEN-1:0]       LSUWriteDataSpillM;                     // Final write data
-  logic [((MISALIGN_SUPPORT+1)*P.LLEN-1)/8:0] ByteMaskSpillM;                       // Selects which bytes within a word to write
+  logic [MLEN/8-1:0]     ByteMaskSpillM;                         // Selects which bytes within a word to write
  /* verilator lint_on WIDTHEXPAND */
  logic [P.LLEN-1:0]     DCacheReadDataWordSpillM;               // D$ read data
  logic [P.LLEN-1:0]     ReadDataWordMuxM;                       // DTIM or D$ read data
--- a/src/mdu/mdu.sv
+++ b/src/mdu/mdu.sv
@ -57,7 +57,7 @@ module mdu import cvw::*;  #(parameter cvw_t P) (
  // Start a divide when a new division instruction is received and the divider isn't already busy or finishing
  // When IDIV_ON_FPU is set, use the FPU divider instead
  // In ZMMUL, with M_SUPPORTED = 0, omit the divider
-  if ((P.IDIV_ON_FPU) || (!P.M_SUPPORTED)) begin:nodiv  
+  if ((P.IDIV_ON_FPU & P.F_SUPPORTED) || (!P.M_SUPPORTED)) begin:nodiv  
    assign QuotM = 0;
    assign RemM = 0;
    assign DivBusyE = 0;
--- a/src/uncore/spi_apb.sv
+++ b/src/uncore/spi_apb.sv
@ -2,10 +2,14 @@
 // spi_apb.sv
 //
 // Written: Naiche Whyte-Aguayo nwhyteaguayo@g.hmc.edu 11/16/2022
 //
 // Purpose: SPI peripheral
-//   See FU540-C000-v1.0 for specifications
+//
 // SPI module is written to the specifications described in FU540-C000-v1.0. At the top level, it is consists of synchronous 8 byte transmit and recieve FIFOs connected to shift registers. 
 // The FIFOs are connected to WALLY by an apb control register interface, which includes various control registers for modifying the SPI transmission along with registers for writing
 // to the transmit FIFO and reading from the receive FIFO. The transmissions themselves are then controlled by a finite state machine. The SPI module uses 4 tristate pins for SPI input/output, 
 // along with a 4 bit Chip Select signal, a clock signal, and an interrupt signal to WALLY.
 // Current limitations: Flash read sequencer mode not implemented, dual and quad mode not supported
 // 
 // A component of the Wally configurable RISC-V project.
 // 
@ -25,19 +29,6 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 // Current limitations: Flash read sequencer mode not implemented, dual and quad modes untestable with current test plan.
 // Attempt to move from >= comparisons by initializing in FSM differently
 // Parameterize SynchFIFO
 // look at ReadIncrement/WriteIncrement delay necessity 
 /* 
 SPI module is written to the specifications described in FU540-C000-v1.0. At the top level, it is consists of synchronous 8 byte transmit and recieve FIFOs connected to shift registers. 
 The FIFOs are connected to WALLY by an apb control register interface, which includes various control registers for modifying the SPI transmission along with registers for writing
 to the transmit FIFO and reading from the receive FIFO. The transmissions themselves are then controlled by a finite state machine. The SPI module uses 4 tristate pins for SPI input/output, 
 along with a 4 bit Chip Select signal, a clock signal, and an interrupt signal to WALLY. 
 */
 module spi_apb import cvw::*; #(parameter cvw_t P) (
    input  logic                PCLK, PRESETn,
    input  logic                PSEL,
@ -54,27 +45,27 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
    output logic                SPIIntr
 );
-    //SPI control registers. Refer to SiFive FU540-C000 manual 
+    // SPI control registers. Refer to SiFive FU540-C000 manual 
    logic [11:0] SckDiv;
-    logic [1:0] SckMode;
+    logic [1:0]  SckMode;
-    logic [1:0] ChipSelectID;
+    logic [1:0]  ChipSelectID;
-    logic [3:0] ChipSelectDef; 
+    logic [3:0]  ChipSelectDef; 
-    logic [1:0] ChipSelectMode;
+    logic [1:0]  ChipSelectMode;
    logic [15:0] Delay0, Delay1;
-    logic [4:0] Format;
+    logic [4:0]  Format;
-    logic [7:0] ReceiveData;
+    logic [7:0]  ReceiveData;
-    logic [2:0] TransmitWatermark, ReceiveWatermark;
+    logic [2:0]  TransmitWatermark, ReceiveWatermark;
-    logic [8:0] TransmitData;
+    logic [8:0]  TransmitData;
-    logic [1:0] InterruptEnable, InterruptPending;
+    logic [1:0]  InterruptEnable, InterruptPending;
-    //Bus interface signals
+    // Bus interface signals
    logic [7:0] Entry;
    logic Memwrite;
    logic [31:0] Din, Dout;
-    logic TransmitInactive;                         //High when there is no transmission, used as hardware interlock signal
+    logic TransmitInactive;                         // High when there is no transmission, used as hardware interlock signal
-    //FIFO FSM signals
+    // FIFO FSM signals
-    //Watermark signals - TransmitReadMark = ip[0], ReceiveWriteMark = ip[1]
+    // Watermark signals - TransmitReadMark = ip[0], ReceiveWriteMark = ip[1]
    logic TransmitWriteMark, TransmitReadMark, RecieveWriteMark, RecieveReadMark; 
    logic TransmitFIFOWriteFull, TransmitFIFOReadEmpty;
    logic TransmitFIFOReadIncrement;
@ -83,75 +74,68 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
    logic ReceiveFIFOWriteFull, ReceiveFIFOReadEmpty;
    logic [7:0] TransmitFIFOReadData, ReceiveFIFOWriteData;
    logic [2:0] TransmitWriteWatermarkLevel, ReceiveReadWatermarkLevel;
-    logic [7:0] ReceiveShiftRegEndian;              //reverses ReceiveShiftReg if Format[2] set (little endian transmission)
+    logic [7:0] ReceiveShiftRegEndian;              // Reverses ReceiveShiftReg if Format[2] set (little endian transmission)
-    //Transmission signals
+    // Transmission signals
    logic sck;
-    logic [11:0] DivCounter;                        //counter for sck 
+    logic [11:0] DivCounter;                        // Counter for sck 
-    logic SCLKenable;                               //flip flop enable high every sclk edge
+    logic SCLKenable;                               // Flip flop enable high every sclk edge
-    //Delay signals
+    // Delay signals
-    logic [8:0] ImplicitDelay1;                     //Adds implicit delay to cs-sck delay counter based on phase  
+    logic [8:0] ImplicitDelay1;                     // Adds implicit delay to cs-sck delay counter based on phase  
-    logic [8:0] ImplicitDelay2;                     //Adds implicit delay to sck-cs delay counter based on phase 
+    logic [8:0] ImplicitDelay2;                     // Adds implicit delay to sck-cs delay counter based on phase 
-    logic [8:0] CS_SCKCount;                        //Counter for cs-sck delay
+    logic [8:0] CS_SCKCount;                        // Counter for cs-sck delay
-    logic [8:0] SCK_CSCount;                        //Counter for sck-cs delay
+    logic [8:0] SCK_CSCount;                        // Counter for sck-cs delay
-    logic [8:0] InterCSCount;                       //Counter for inter cs delay
+    logic [8:0] InterCSCount;                       // Counter for inter cs delay
-    logic [8:0] InterXFRCount;                      //Counter for inter xfr delay 
+    logic [8:0] InterXFRCount;                      // Counter for inter xfr delay 
-    logic CS_SCKCompare;                            //Boolean comparison signal, high when CS_SCKCount >= cs-sck delay
+    logic ZeroDelayHoldMode;                        // High when ChipSelectMode is hold and Delay1[15:8] (InterXFR delay) is 0
    logic SCK_CSCompare;                            //Boolean comparison signal, high when SCK_CSCount >= sck-cs delay
    logic InterCSCompare;                           //Boolean comparison signal, high when InterCSCount >= inter cs delay
    logic InterXFRCompare;                          //Boolean comparison signal, high when InterXFRCount >= inter xfr delay
    logic ZeroDelayHoldMode;                        //High when ChipSelectMode is hold and Delay1[15:8] (InterXFR delay) is 0
-    //Frame counting signals
+    // Frame counting signals
-    logic [3:0] FrameCount;                         //Counter for number of frames in transmission
+    logic [3:0] FrameCount;                         // Counter for number of frames in transmission
-    logic FrameCompare;                             //Boolean comparison signal, high when FrameCount = Format[7:4]
+    logic [3:0] ReceivePenultimateFrameCount;       // Counter
-    logic [3:0] ReceivePenultimateFrame;            //Frame number - 1
+    logic ReceivePenultimateFrame;                  // High when penultimate frame in transmission has been reached
    logic [3:0] ReceivePenultimateFrameCount;       //Counter
    logic ReceivePenultimateFrameBoolean;           //High when penultimate frame in transmission has been reached
-    //State fsm signals
+    // State fsm signals
-    logic Active;                                   //High when state is either Active1 or Active0 (during transmission)
+    logic Active;                                   // High when state is either Active1 or Active0 (during transmission)
-    logic Active0;                                  //High when state is Active0
+    logic Active0;                                  // High when state is Active0
-    //Shift reg signals
+    // Shift reg signals
-    logic ShiftEdge;                                //Determines which edge of sck to shift from TransmitShiftReg
+    logic ShiftEdge;                                // Determines which edge of sck to shift from TransmitShiftReg
-    logic [7:0] TransmitShiftReg;                   //Transmit shift register
+    logic [7:0] TransmitShiftReg;                   // Transmit shift register
-    logic [7:0] ReceiveShiftReg;                    //Receive shift register
+    logic [7:0] ReceiveShiftReg;                    // Receive shift register
-    logic SampleEdge;                               //Determines which edge of sck to sample from ReceiveShiftReg
+    logic SampleEdge;                               // Determines which edge of sck to sample from ReceiveShiftReg
-    logic [7:0] TransmitDataEndian;                 //Reverses TransmitData from txFIFO if littleendian, since TransmitReg always shifts MSB
+    logic [7:0] TransmitDataEndian;                 // Reverses TransmitData from txFIFO if littleendian, since TransmitReg always shifts MSB
-    logic TransmitShiftRegLoad;                     //Determines when to load TransmitShiftReg
+    logic TransmitShiftRegLoad;                     // Determines when to load TransmitShiftReg
-    logic ReceiveShiftFull;                         //High when receive shift register is full
+    logic ReceiveShiftFull;                         // High when receive shift register is full
-    logic TransmitShiftEmpty;                       //High when transmit shift register is empty
+    logic TransmitShiftEmpty;                       // High when transmit shift register is empty
-    logic ShiftIn;                                  //Determines whether to shift from SPIIn or SPIOut (if SPI_LOOPBACK_TEST)  
+    logic ShiftIn;                                  // Determines whether to shift from SPIIn or SPIOut (if SPI_LOOPBACK_TEST)  
-    logic [3:0] LeftShiftAmount;                    //Determines left shift amount to left-align data when little endian              
+    logic [3:0] LeftShiftAmount;                    // Determines left shift amount to left-align data when little endian              
-    logic [7:0] ASR;                                //AlignedReceiveShiftReg    
+    logic [7:0] ASR;                                // AlignedReceiveShiftReg    
-    //CS signals
+    // CS signals
-    logic [3:0] ChipSelectAuto;                     //Assigns ChipSelect value to selected CS signal based on CS ID
+    logic [3:0] ChipSelectAuto;                     // Assigns ChipSelect value to selected CS signal based on CS ID
-    logic [3:0] ChipSelectInternal;                 //Defines what each ChipSelect signal should be based on transmission status and ChipSelectDef
+    logic [3:0] ChipSelectInternal;                 // Defines what each ChipSelect signal should be based on transmission status and ChipSelectDef
-    logic DelayMode;                                //Determines where to place implicit half cycle delay based on sck phase for CS assertion
+    logic DelayMode;                                // Determines where to place implicit half cycle delay based on sck phase for CS assertion
-    //Miscellaneous signals delayed/early by 1 PCLK cycle
+    // Miscellaneous signals delayed/early by 1 PCLK cycle
-    logic ReceiveShiftFullDelay;                    //Delays ReceiveShiftFull signal by 1 PCLK cycle
+    logic ReceiveShiftFullDelay;                    // Delays ReceiveShiftFull signal by 1 PCLK cycle
-    logic TransmitFIFOWriteIncrementDelay;          //TransmitFIFOWriteIncrement delayed by 1 PCLK cycle
+    logic ReceiveShiftFullDelayPCLK;                // ReceiveShiftFull delayed by 1 PCLK cycle
    logic ReceiveShiftFullDelayPCLK;                //ReceiveShiftFull delayed by 1 PCLK cycle
    logic TransmitFIFOReadEmptyDelay;
-    logic SCLKenableEarly;                          //SCLKenable 1 PCLK cycle early, needed for on time register changes when ChipSelectMode is hold and Delay1[15:8] (InterXFR delay) is 0
+    logic SCLKenableEarly;                          // SCLKenable 1 PCLK cycle early, needed for on time register changes when ChipSelectMode is hold and Delay1[15:8] (InterXFR delay) is 0
-    //APB access
+    // APB access
-    assign Entry = {PADDR[7:2],2'b00};  // 32-bit word-aligned accesses
+    assign Entry = {PADDR[7:2],2'b00};  //  32-bit word-aligned accesses
-    assign Memwrite = PWRITE & PENABLE & PSEL;  // only write in access phase
+    assign Memwrite = PWRITE & PENABLE & PSEL;  // Only write in access phase
-    assign PREADY = TransmitInactive; // tie PREADY to transmission for hardware interlock
+    assign PREADY = TransmitInactive; // Tie PREADY to transmission for hardware interlock
-    //Account for subword read/write circuitry
+    // Account for subword read/write circuitry
    // -- Note SPI registers are 32 bits no matter what; access them with LW SW.
    assign Din = PWDATA[31:0]; 
    if (P.XLEN == 64) assign PRDATA = {Dout, Dout}; 
    else              assign PRDATA = Dout;  
-    //Register access  
+    // Register access  
    always_ff@(posedge PCLK, negedge PRESETn)
        if (~PRESETn) begin 
            SckDiv <= #1 12'd3;
@ -167,13 +151,12 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
            ReceiveWatermark <= #1 3'b0;
            InterruptEnable <= #1 2'b0;
            InterruptPending <= #1 2'b0;
-        end else begin //writes
+        end else begin // writes
-            //According to FU540 spec: Once interrupt is pending, it will remain set until number 
+            
            //of entries in tx/rx fifo is strictly more/less than tx/rxmark
            /* verilator lint_off CASEINCOMPLETE */
            if (Memwrite & TransmitInactive)
-                case(Entry) //flop to sample inputs
+                case(Entry) // flop to sample inputs
                    8'h00: SckDiv <= Din[11:0];
                    8'h04: SckMode <= Din[1:0];
                    8'h10: ChipSelectID <= Din[1:0];
@ -188,18 +171,21 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                    8'h70: InterruptEnable <= Din[1:0];
                endcase
            /* verilator lint_off CASEINCOMPLETE */
-            //interrupt clearance
+
            // According to FU540 spec: Once interrupt is pending, it will remain set until number 
            // of entries in tx/rx fifo is strictly more/less than tx/rxmark
            InterruptPending[0] <= TransmitReadMark;
            InterruptPending[1] <= RecieveWriteMark;  
-            case(Entry) // flop to sample inputs
+
            case(Entry) // Flop to sample inputs
                8'h00: Dout <= #1 {20'b0, SckDiv};
                8'h04: Dout <= #1 {30'b0, SckMode};
                8'h10: Dout <= #1 {30'b0, ChipSelectID};
                8'h14: Dout <= #1 {28'b0, ChipSelectDef};
                8'h18: Dout <= #1 {30'b0, ChipSelectMode};
-                8'h28: Dout <= {8'b0, Delay0[15:8], 8'b0, Delay0[7:0]};
+                8'h28: Dout <= #1 {8'b0, Delay0[15:8], 8'b0, Delay0[7:0]};
-                8'h2C: Dout <= {8'b0, Delay1[15:8], 8'b0, Delay1[7:0]};
+                8'h2C: Dout <= #1 {8'b0, Delay1[15:8], 8'b0, Delay1[7:0]};
-                8'h40: Dout <= {12'b0, Format[4:1], 13'b0, Format[0], 2'b0};
+                8'h40: Dout <= #1 {12'b0, Format[4:1], 13'b0, Format[0], 2'b0};
                8'h48: Dout <= #1 {23'b0, TransmitFIFOWriteFull, 8'b0};
                8'h4C: Dout <= #1 {23'b0, ReceiveFIFOReadEmpty, ReceiveData[7:0]};
                8'h50: Dout <= #1 {29'b0, TransmitWatermark};
@ -210,8 +196,9 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
            endcase
        end
-    //SPI enable generation, where SCLK = PCLK/(2*(SckDiv + 1))
+    // SPI enable generation, where SCLK = PCLK/(2*(SckDiv + 1))
-    //Generates a high signal at the rising and falling edge of SCLK by counting from 0 to SckDiv
+    // Asserts SCLKenable at the rising and falling edge of SCLK by counting from 0 to SckDiv
    // Active at 2x SCLK frequency to account for implicit half cycle delays and actions on both clock edges depending on phase
    assign SCLKenable = (DivCounter == SckDiv);
    assign SCLKenableEarly = ((DivCounter + 12'b1) == SckDiv);
    always_ff @(posedge PCLK, negedge PRESETn)
@ -219,44 +206,38 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
        else if (SCLKenable) DivCounter <= 0;
        else DivCounter <= DivCounter + 12'b1;
-    //Boolean logic that tracks frame progression
+    // Asserts when transmission is one frame before complete
-    assign FrameCompare = (FrameCount < Format[4:1]);    
+    assign ReceivePenultimateFrame = ((FrameCount + 4'b0001) == Format[4:1]);
    assign ReceivePenultimateFrameBoolean = ((FrameCount + 4'b0001) == Format[4:1]);
-    //Computing delays
+    // Computing delays
    // When sckmode.pha = 0, an extra half-period delay is implicit in the cs-sck delay, and vice-versa for sck-cs
    assign ImplicitDelay1 = SckMode[0] ? 9'b0 : 9'b1;
    assign ImplicitDelay2 = SckMode[0] ? 9'b1 : 9'b0;
-    assign CS_SCKCompare = CS_SCKCount >= (({Delay0[7:0], 1'b0}) + ImplicitDelay1);
+    // Calculate when tx/rx shift registers are full/empty
-    assign SCK_CSCompare = SCK_CSCount >= (({Delay0[15:8], 1'b0}) + ImplicitDelay2);
+    TransmitShiftFSM TransmitShiftFSM(PCLK, PRESETn, TransmitFIFOReadEmpty, ReceivePenultimateFrame, Active0, TransmitShiftEmpty);
-    assign InterCSCompare = (InterCSCount >= ({Delay1[7:0],1'b0}));
+    ReceiveShiftFSM ReceiveShiftFSM(PCLK, PRESETn, SCLKenable, ReceivePenultimateFrame, SampleEdge, SckMode[0], ReceiveShiftFull);
    assign InterXFRCompare = (InterXFRCount >= ({Delay1[15:8], 1'b0}));
-    //Calculate when tx/rx shift registers are full/empty
+    // Calculate tx fifo write and rx fifo read increment signals 
    TransmitShiftFSM TransmitShiftFSM_1 (PCLK, PRESETn, TransmitFIFOReadEmpty, ReceivePenultimateFrameBoolean, Active0, TransmitShiftEmpty);
    ReceiveShiftFSM ReceiveShiftFSM_1 (PCLK, PRESETn, SCLKenable, ReceivePenultimateFrameBoolean, SampleEdge, SckMode[0], ReceiveShiftFull);
    //Calculate tx/rx fifo write and recieve increment signals 
    assign TransmitFIFOWriteIncrement = (Memwrite & (Entry == 8'h48) & ~TransmitFIFOWriteFull & TransmitInactive);
    always_ff @(posedge PCLK, negedge PRESETn)
-        if (~PRESETn) TransmitFIFOWriteIncrementDelay <= 0;
+        if (~PRESETn) TransmitFIFOWriteIncrement <= 0;
-        else TransmitFIFOWriteIncrementDelay <= TransmitFIFOWriteIncrement;
+        else TransmitFIFOWriteIncrement <= (Memwrite & (Entry == 8'h48) & ~TransmitFIFOWriteFull & TransmitInactive);
    always_ff @(posedge PCLK, negedge PRESETn)
        if (~PRESETn) ReceiveFIFOReadIncrement <= 0;
        else ReceiveFIFOReadIncrement <= ((Entry == 8'h4C) & ~ReceiveFIFOReadEmpty & PSEL & ~ReceiveFIFOReadIncrement);
-    //Tx/Rx FIFOs
+    // Tx/Rx FIFOs
-    SynchFIFO #(3,8) txFIFO(PCLK, 1'b1, SCLKenable, PRESETn, TransmitFIFOWriteIncrementDelay, TransmitShiftEmpty, TransmitData[7:0], TransmitWriteWatermarkLevel, TransmitWatermark[2:0], TransmitFIFOReadData[7:0], TransmitFIFOWriteFull, TransmitFIFOReadEmpty, TransmitWriteMark, TransmitReadMark);
+    SynchFIFO #(3,8) txFIFO(PCLK, 1'b1, SCLKenable, PRESETn, TransmitFIFOWriteIncrement, TransmitShiftEmpty, TransmitData[7:0], TransmitWriteWatermarkLevel, TransmitWatermark[2:0],
-    SynchFIFO #(3,8) rxFIFO(PCLK, SCLKenable, 1'b1, PRESETn, ReceiveShiftFullDelay, ReceiveFIFOReadIncrement, ReceiveShiftRegEndian, ReceiveWatermark[2:0], ReceiveReadWatermarkLevel, ReceiveData[7:0], ReceiveFIFOWriteFull, ReceiveFIFOReadEmpty, RecieveWriteMark, RecieveReadMark);
+                            TransmitFIFOReadData[7:0], TransmitFIFOWriteFull, TransmitFIFOReadEmpty, TransmitWriteMark, TransmitReadMark);
    SynchFIFO #(3,8) rxFIFO(PCLK, SCLKenable, 1'b1, PRESETn, ReceiveShiftFullDelay, ReceiveFIFOReadIncrement, ReceiveShiftRegEndian, ReceiveWatermark[2:0], ReceiveReadWatermarkLevel, 
                            ReceiveData[7:0], ReceiveFIFOWriteFull, ReceiveFIFOReadEmpty, RecieveWriteMark, RecieveReadMark);
    always_ff @(posedge PCLK, negedge PRESETn)
        if (~PRESETn) TransmitFIFOReadEmptyDelay <= 1;
        else  if (SCLKenable) TransmitFIFOReadEmptyDelay <= TransmitFIFOReadEmpty;
    always_ff @(posedge PCLK, negedge PRESETn)
        if (~PRESETn) ReceiveShiftFullDelay <= 0;
        else if (SCLKenable) ReceiveShiftFullDelay <= ReceiveShiftFull;
@ -266,16 +247,16 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
    assign TransmitShiftRegLoad = ~TransmitShiftEmpty & ~Active | (((ChipSelectMode == 2'b10) & ~|(Delay1[15:8])) & ((ReceiveShiftFullDelay | ReceiveShiftFull) & ~SampleEdge & ~TransmitFIFOReadEmpty));
-    //Main FSM which controls SPI transmission
+    // Main FSM which controls SPI transmission
    typedef enum logic [2:0] {CS_INACTIVE, DELAY_0, ACTIVE_0, ACTIVE_1, DELAY_1,INTER_CS, INTER_XFR} statetype;
    statetype state;
    always_ff @(posedge PCLK, negedge PRESETn)
-        if (~PRESETn) begin state <= CS_INACTIVE;
+        if (~PRESETn) begin 
                        state <= CS_INACTIVE;
                        FrameCount <= 4'b0;                      
        /* verilator lint_off CASEINCOMPLETE */
        end else if (SCLKenable) begin
            /* verilator lint_off CASEINCOMPLETE */
            case (state)
                CS_INACTIVE: begin
                        CS_SCKCount <= 9'b1;
@ -288,7 +269,7 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                        end
                DELAY_0: begin
                        CS_SCKCount <= CS_SCKCount + 9'b1;
-                        if (CS_SCKCompare) state <= ACTIVE_0;
+                        if (CS_SCKCount >= (({Delay0[7:0], 1'b0}) + ImplicitDelay1)) state <= ACTIVE_0;
                        end
                ACTIVE_0: begin 
                        FrameCount <= FrameCount + 4'b1;
@ -296,7 +277,7 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                        end
                ACTIVE_1: begin
                        InterXFRCount <= 9'b1;
-                        if (FrameCompare) state <= ACTIVE_0;
+                        if (FrameCount < Format[4:1]) state <= ACTIVE_0;
                        else if ((ChipSelectMode[1:0] == 2'b10) & ~|(Delay1[15:8]) & (~TransmitFIFOReadEmpty)) begin
                            state <= ACTIVE_0;
                            CS_SCKCount <= 9'b1;
@ -310,11 +291,11 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                        end
                DELAY_1: begin
                        SCK_CSCount <= SCK_CSCount + 9'b1;
-                        if (SCK_CSCompare) state <= INTER_CS;
+                        if (SCK_CSCount >= (({Delay0[15:8], 1'b0}) + ImplicitDelay2)) state <= INTER_CS;
                        end
                INTER_CS: begin
                        InterCSCount <= InterCSCount + 9'b1;
-                        if (InterCSCompare ) state <= CS_INACTIVE;
+                        if (InterCSCount >= ({Delay1[7:0],1'b0})) state <= CS_INACTIVE;
                        end
                INTER_XFR: begin
                        CS_SCKCount <= 9'b1;
@ -322,13 +303,14 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
                        FrameCount <= 4'b0;
                        InterCSCount <= 9'b10;
                        InterXFRCount <= InterXFRCount + 9'b1;
-                        if (InterXFRCompare & ~TransmitFIFOReadEmptyDelay) state <= ACTIVE_0;
+                        if ((InterXFRCount >= ({Delay1[15:8], 1'b0})) & ~TransmitFIFOReadEmptyDelay) state <= ACTIVE_0;
                        else if (~|ChipSelectMode[1:0]) state <= CS_INACTIVE;
                        end
            endcase
            /* verilator lint_off CASEINCOMPLETE */
        end
-            /* verilator lint_off CASEINCOMPLETE */
+            
    assign DelayMode = SckMode[0] ? (state == DELAY_1) : (state == ACTIVE_1 & ReceiveShiftFull);
    assign ChipSelectInternal = (state == CS_INACTIVE | state == INTER_CS | DelayMode & ~|(Delay0[15:8])) ? ChipSelectDef : ~ChipSelectDef;
@ -339,7 +321,7 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
    assign TransmitInactive = ((state == INTER_CS) | (state == CS_INACTIVE) | (state == INTER_XFR) | (ReceiveShiftFullDelayPCLK & ZeroDelayHoldMode));
    assign Active0 = (state == ACTIVE_0);
-    //Signal tracks which edge of sck to shift data
+    // Signal tracks which edge of sck to shift data
    always_comb
        case(SckMode[1:0])
            2'b00: ShiftEdge = ~sck & SCLKenable;
@ -349,36 +331,36 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
            default: ShiftEdge = sck & SCLKenable;
        endcase
-    //Transmit shift register
+    // Transmit shift register
-    assign TransmitDataEndian =  Format[0] ? {TransmitFIFOReadData[0], TransmitFIFOReadData[1], TransmitFIFOReadData[2], TransmitFIFOReadData[3], TransmitFIFOReadData[4], TransmitFIFOReadData[5], TransmitFIFOReadData[6], TransmitFIFOReadData[7]} : TransmitFIFOReadData[7:0];
+    assign TransmitDataEndian = Format[0] ? {TransmitFIFOReadData[0], TransmitFIFOReadData[1], TransmitFIFOReadData[2], TransmitFIFOReadData[3], TransmitFIFOReadData[4], TransmitFIFOReadData[5], TransmitFIFOReadData[6], TransmitFIFOReadData[7]} : TransmitFIFOReadData[7:0];
    always_ff @(posedge PCLK, negedge PRESETn)
        if(~PRESETn)                        TransmitShiftReg <= 8'b0; 
        else if (TransmitShiftRegLoad)      TransmitShiftReg <= TransmitDataEndian;
-        else if (ShiftEdge & Active)   TransmitShiftReg <= {TransmitShiftReg[6:0], 1'b0};
+        else if (ShiftEdge & Active)        TransmitShiftReg <= {TransmitShiftReg[6:0], 1'b0};
    assign SPIOut = TransmitShiftReg[7];
-    //If in loopback mode, receive shift register is connected directly to module's output pins. Else, connected to SPIIn
+    // If in loopback mode, receive shift register is connected directly to module's output pins. Else, connected to SPIIn
-    //There are no setup/hold time issues because transmit shift register and receive shift register always shift/sample on opposite edges
+    // There are no setup/hold time issues because transmit shift register and receive shift register always shift/sample on opposite edges
    assign ShiftIn = P.SPI_LOOPBACK_TEST ? SPIOut : SPIIn;
-    //Receive shift register
+    // Receive shift register
    always_ff @(posedge PCLK, negedge PRESETn)
        if(~PRESETn)  ReceiveShiftReg <= 8'b0;
        else if (SampleEdge & SCLKenable) begin
-            if (~Active) ReceiveShiftReg <= 8'b0;
+            if (~Active)    ReceiveShiftReg <= 8'b0;
-            else ReceiveShiftReg <= {ReceiveShiftReg[6:0], ShiftIn};
+            else            ReceiveShiftReg <= {ReceiveShiftReg[6:0], ShiftIn};
        end
-    //Aligns received data and reverses if little-endian
+    // Aligns received data and reverses if little-endian
    assign LeftShiftAmount = 4'h8 - Format[4:1];
    assign ASR = ReceiveShiftReg << LeftShiftAmount[2:0];
    assign ReceiveShiftRegEndian = Format[0] ? {ASR[0], ASR[1], ASR[2], ASR[3], ASR[4], ASR[5], ASR[6], ASR[7]} : ASR[7:0];
-    //Interrupt logic: raise interrupt if any enabled interrupts are pending
+    // Interrupt logic: raise interrupt if any enabled interrupts are pending
    assign SPIIntr = |(InterruptPending & InterruptEnable);
-    //Chip select logic
+    // Chip select logic
    always_comb
        case(ChipSelectID[1:0])
            2'b00: ChipSelectAuto = {ChipSelectDef[3], ChipSelectDef[2], ChipSelectDef[1], ChipSelectInternal[0]};
@ -390,14 +372,14 @@ module spi_apb import cvw::*; #(parameter cvw_t P) (
    assign SPICS = ChipSelectMode[0] ? ChipSelectDef : ChipSelectAuto;
 endmodule
-module SynchFIFO #(parameter M =3 , N= 8)(
+module SynchFIFO #(parameter M=3, N=8)(                 // 2^M entries of N bits each
-    input logic PCLK, wen, ren, PRESETn,
+    input  logic         PCLK, wen, ren, PRESETn,
-    input logic winc,rinc,
+    input  logic         winc, rinc,
-    input logic [N-1:0] wdata,
+    input  logic [N-1:0] wdata,
-    input logic [M-1:0] wwatermarklevel, rwatermarklevel,
+    input  logic [M-1:0] wwatermarklevel, rwatermarklevel,
    output logic [N-1:0] rdata,
-    output logic wfull, rempty,
+    output logic         wfull, rempty,
-    output logic wwatermark, rwatermark);
+    output logic         wwatermark, rwatermark);
    /* Pointer FIFO using design elements from "Simulation and Synthesis Techniques
       for Asynchronous FIFO Design" by Clifford E. Cummings. Namely, M bit read and write pointers
@ -409,8 +391,6 @@ module SynchFIFO #(parameter M =3 , N= 8)(
    logic [N-1:0] mem[2**M];
    logic [M:0] rptr, wptr;
    logic [M:0] rptrnext, wptrnext;
    logic rempty_val;
    logic wfull_val;
    logic [M-1:0] raddr;
    logic [M-1:0] waddr;
@ -428,53 +408,43 @@ module SynchFIFO #(parameter M =3 , N= 8)(
        end
        else begin 
            if (wen) begin
-                wfull <= wfull_val;
+                wfull <= ({~wptrnext[M], wptrnext[M-1:0]} == rptr);
                wptr  <= wptrnext;
            end
            if (ren) begin 
                rptr <= rptrnext;
-                rempty <= rempty_val;
+                rempty <= (wptr == rptrnext);
            end
        end 
-
+    
    assign raddr = rptr[M-1:0];
-    assign rptrnext = rptr + {3'b0, (rinc & ~rempty)};      
+    assign rptrnext = rptr + {{(M){1'b0}}, (rinc & ~rempty)};      
    assign rempty_val = (wptr == rptrnext);
    assign rwatermark = ((waddr - raddr) < rwatermarklevel) & ~wfull;
    assign waddr = wptr[M-1:0];
    assign wwatermark = ((waddr - raddr) > wwatermarklevel) | wfull;
-    assign wptrnext = wptr + {3'b0, (winc & ~wfull)};
+    assign wptrnext = wptr + {{(M){1'b0}}, (winc & ~wfull)};
    assign wfull_val = ({~wptrnext[M], wptrnext[M-1:0]} == rptr);
 endmodule
 module TransmitShiftFSM(
-    input logic PCLK, PRESETn,
+    input  logic PCLK, PRESETn,
-    input logic TransmitFIFOReadEmpty, ReceivePenultimateFrameBoolean, Active0,
+    input  logic TransmitFIFOReadEmpty, ReceivePenultimateFrame, Active0,
    output logic TransmitShiftEmpty);
    typedef enum logic [1:0] {TransmitShiftEmptyState, TransmitShiftHoldState, TransmitShiftNotEmptyState} statetype;
    statetype TransmitState, TransmitNextState;
    always_ff @(posedge PCLK, negedge PRESETn)
-        if (~PRESETn) TransmitState <= TransmitShiftEmptyState;
+        if (~PRESETn) TransmitShiftEmpty <= 1;
-        else          TransmitState <= TransmitNextState;
+        else if (TransmitShiftEmpty) begin        
            if (TransmitFIFOReadEmpty | (~TransmitFIFOReadEmpty & (ReceivePenultimateFrame & Active0))) TransmitShiftEmpty <= 1;
            else if (~TransmitFIFOReadEmpty) TransmitShiftEmpty <= 0;
        end else begin
            if (ReceivePenultimateFrame & Active0) TransmitShiftEmpty <= 1;
            else TransmitShiftEmpty <= 0;
        end
        always_comb
            case(TransmitState)
                TransmitShiftEmptyState: begin
                    if (TransmitFIFOReadEmpty | (~TransmitFIFOReadEmpty & (ReceivePenultimateFrameBoolean & Active0))) TransmitNextState = TransmitShiftEmptyState;
                    else if (~TransmitFIFOReadEmpty) TransmitNextState = TransmitShiftNotEmptyState;
                end
                TransmitShiftNotEmptyState: begin
                    if (ReceivePenultimateFrameBoolean & Active0) TransmitNextState = TransmitShiftEmptyState;
                    else TransmitNextState = TransmitShiftNotEmptyState;
                end
            endcase
        assign TransmitShiftEmpty = (TransmitNextState == TransmitShiftEmptyState);
 endmodule
 module ReceiveShiftFSM(
-    input logic PCLK, PRESETn, SCLKenable,
+    input  logic PCLK, PRESETn, SCLKenable,
-    input logic ReceivePenultimateFrameBoolean, SampleEdge, SckMode,
+    input  logic ReceivePenultimateFrame, SampleEdge, SckMode,
    output logic ReceiveShiftFull
 );
    typedef enum logic [1:0] {ReceiveShiftFullState, ReceiveShiftNotFullState, ReceiveShiftDelayState} statetype;
@ -484,17 +454,12 @@ module ReceiveShiftFSM(
        else if (SCLKenable) begin
            case (ReceiveState)
                ReceiveShiftFullState: ReceiveState <= ReceiveShiftNotFullState;
-                ReceiveShiftNotFullState: if (ReceivePenultimateFrameBoolean & (SampleEdge)) ReceiveState <= ReceiveShiftDelayState;
+                ReceiveShiftNotFullState: if (ReceivePenultimateFrame & (SampleEdge)) ReceiveState <= ReceiveShiftDelayState;
                                          else ReceiveState <= ReceiveShiftNotFullState;
-                ReceiveShiftDelayState: ReceiveState <= ReceiveShiftFullState;
+                ReceiveShiftDelayState:   ReceiveState <= ReceiveShiftFullState;
            endcase
        end
-        assign ReceiveShiftFull = SckMode ? (ReceiveState == ReceiveShiftFullState) : (ReceiveState == ReceiveShiftDelayState);
+    assign ReceiveShiftFull = SckMode ? (ReceiveState == ReceiveShiftFullState) : (ReceiveState == ReceiveShiftDelayState);
 endmodule
--- a/src/wally/wallypipelinedcore.sv
+++ b/src/wally/wallypipelinedcore.sv
@ -264,7 +264,7 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
  end
  // global stall and flush control  
-  hazard  hzu(
+  hazard #(P) hzu(
    .BPWrongE, .CSRWriteFenceM, .RetM, .TrapM,
    .LoadStallD, .StoreStallD, .MDUStallD, .CSRRdStallD,
    .LSUStallM, .IFUStallF,
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@ -11,7 +11,7 @@ export MOD ?= orig
 # title to add a note in the synth's directory name
 TITLE = 
 # tsmc28, sky130, and sky90 presently supported
-export TECH ?= sky90
+export TECH ?= sky130
 # MAXCORES allows parallel compilation, which is faster but less CPU-efficient
 # Avoid when doing sweeps of many optimization points in parallel
 export MAXCORES ?= 1
@ -20,7 +20,7 @@ export MAXCORES ?= 1
 export MAXOPT ?= 0
 export DRIVE ?= FLOP
 export USESRAM ?= 0
-
+export WIDTH ?= 32
 time := $(shell date +%F-%H-%M)
 hash := $(shell git rev-parse --short HEAD)
@ -94,10 +94,10 @@ endif
 ifneq ($(MOD), orig)
 	# PMP 0
-	sed -i 's/PMP_ENTRIES \(64\|16\|0\)/PMP_ENTRIES = 0;/' $(CONFIGDIR)/config.vh
+	sed -i 's/PMP_ENTRIES.*\(64\|16\)/PMP_ENTRIES = 0;/' $(CONFIGDIR)/config.vh
 ifneq ($(MOD), PMP0)
 	# no priv
-	sed -i 's/ZICSR_SUPPORTED *1/ZICSR_SUPPORTED = 0;/' $(CONFIGDIR)/config.vh
+	sed -i 's/ZICSR_SUPPORTED.*1/ZICSR_SUPPORTED = 0;/' $(CONFIGDIR)/config.vh
 ifneq ($(MOD), noPriv)
 	# turn off FPU 
 	sed -i 's/1 *<< *3/0 << 3/' $(CONFIGDIR)/config.vh
@ -147,4 +147,4 @@ clean:
 	rm -f power.saif
 	rm -f Synopsys_stack_trace_*.txt
 	rm -f crte_*.txt
-	
+	
--- a/synthDC/README.md
+++ b/synthDC/README.md
@ -5,7 +5,7 @@ This subdirectory contains synthesis scripts for use with Synopsys
 scripts/synth.tcl.
 Example Usage
-make synth DESIGN=wallypipelinedcore FREQ=500
+make synth DESIGN=wallypipelinedcore FREQ=500 CONFIG=rv32e
 environment variables
@ -38,5 +38,25 @@ To run ppa analysis that hones into target frequency, you can type:
 python3 ppa/ppaSynth.py from the synthDC directory.  This runs a sweep
 across all modules listed at the bottom of the ppaSynth.py file.
 There are two options for running the sweep.  The first option runs all
 modules for all techs around a given frequency (i.e., freqs).  The second
 option runs all designs for the specific module based on its bestSynths.csv
 values.  Because the second option appears second in the file, it takes
 priority.  If the second set of values is commented out, the first option
 is used and all widths are run.
 WARNING:  The first option may launch many runs, which could use up all
 of the available tool licenses.  Therefore, take care to ensure that
 enough licenses are available before using the first option.
 ##### Run specific syntheses
 	widths = [8, 16, 32, 64, 128] 
 	modules = ['mul', 'adder', 'shifter', 'flop', 'comparator', 'binencoder', 'csa', 'mux2', 'mux4', 'mux8']
 	techs = ['sky90', 'sky130', 'tsmc28', 'tsmc28psyn']
 	freqs = [5000]
 	synthsToRun = allCombos(widths, modules, techs, freqs)
 ##### Run a sweep based on best delay found in existing syntheses
 	module = 'adder'
 	width = 32
 	tech = 'tsmc28psyn'
 	synthsToRun = freqSweep(module, width, tech)
--- a/synthDC/extractSummary.py
+++ b/synthDC/extractSummary.py
@ -252,7 +252,7 @@ if __name__ == '__main__':
    TechSpec = namedtuple("TechSpec", "color shape targfreq fo4 add32area add32lpower add32denergy")
    techdict = {}
-    techdict['sky130'] = TechSpec('green', 'o', args.sky130freq, 99.5e-3, 1440.600027, 714.057, 0.658023)
+    techdict['sky130'] = TechSpec('green', 'o', args.sky130freq, 99.5e-3, 2581, 18, 0.685)
    techdict['sky90'] = TechSpec('gray', 'o', args.sky90freq, 43.2e-3, 1440.600027, 714.057, 0.658023)
    techdict['tsmc28psyn'] = TechSpec('blue', 's', args.tsmcfreq, 12.2e-3, 209.286002, 1060.0, .081533)
--- a/synthDC/ppa/bestSynths.csv
+++ b/synthDC/ppa/bestSynths.csv
@ -1,24 +1,74 @@
 Module,Tech,Width,Target Freq,Delay,Area,L Power (nW),D energy (nJ)
-priorityencoder,sky90,8,7683,0.12508649056358195,50.960001,24.761,0.010685929975270078
+binencoder,sky130,8,1000,1.0000,50.960001,24.761,0.010685929975270078
-priorityencoder,sky90,16,5773,0.16977016282695304,136.220003,77.243,0.021773774467348
+binencoder,sky130,16,1000,1.0000,136.220003,77.243,0.021773774467348
-priorityencoder,sky90,32,4500,0.2218912222222222,372.400007,189.626,0.04371111111111111
+binencoder,sky130,32,1000,1.0000,372.400007,189.626,0.04371111111111111
-priorityencoder,sky90,64,4098,0.2439914738897023,797.720015,382.205,0.07393850658857981
+binencoder,sky130,64,1000,1.0000,797.720015,382.205,0.07393850658857981
-priorityencoder,sky90,128,3409,0.2933331557641537,1602.300031,610.009,0.1261366969785861
+binencoder,sky130,128,900,1.1111,1602.300031,610.009,0.1261366969785861
-add,sky90,8,3658,0.27337042810278844,253.820005,154.438,0.10825587752870422
+adder,sky130,8,1700,0.588235,253.820005,154.438,0.10825587752870422
-add,sky90,16,2942,0.3393218266485384,722.260013,485.109,0.32460910944935417
+adder,sky130,16,1300,0.7692307,722.260013,485.109,0.32460910944935417
-add,sky90,32,2468,0.40496338573743923,1440.600027,714.057,0.6580226904376014
+adder,sky130,32,1100,0.90909,1440.600027,714.057,0.6580226904376014
-add,sky90,64,2139,0.4674681813931744,2781.240054,1050.0,0.9392239364188874
+adder,sky130,64,950,1.0526315,2781.240054,1050.0,0.9392239364188874
-add,sky90,128,1885,0.5304949787798409,6186.740118,2230.0,2.1480106100795755
+adder,sky130,128,900,1.1111,6186.740118,2230.0,2.1480106100795755
 csa,sky130,8,1000,1.0000,266.560005,154.202,0.13650573115665163
 csa,sky130,16,1000,1.0000,533.12001,308.404,0.27263530601922104
 csa,sky130,32,1000,1.0000,1066.240021,616.808,0.5448072247308093
 csa,sky130,64,1000,1.0000,2132.480042,1230.0,1.0905412240768841
 csa,sky130,128,1000,1.0000,4264.960083,2470.0,2.178553363682347
 shifter,sky130,8,1000,1.0000,259.700005,196.451,0.07534088282874972
 shifter,sky130,16,1000,1.0000,666.400006,558.433,0.19552906110283155
 shifter,sky130,32,1000,1.0000,1475.880027,768.262,0.3807431082700759
 shifter,sky130,64,1000,1.0000,3914.120062,2680.0,1.144802541988198
 shifter,sky130,128,900,1.1111,9192.400136,6080.0,2.9008914525432616
 comparator,sky130,8,1700,0.588235,200.900004,136.6,0.05001033271337053
 comparator,sky130,16,1500,0.6666667,358.680007,189.253,0.06321553011448482
 comparator,sky130,32,1300,0.7692307,690.900013,315.709,0.10771793448084398
 comparator,sky130,64,1200,0.8333333,1372.980026,508.393,0.2048577820389901
 comparator,sky130,128,1150,0.869565,2744.980052,796.047,0.34396273737011823
 flop,sky130,8,1000,1.0000,133.279999,64.8145,0.193835
 flop,sky130,16,1000,1.0000,266.5599975,129.629,0.38715000000000005
 flop,sky130,32,1000,1.0000,533.119995,259.258,0.7723000000000001
 flop,sky130,64,1000,1.0000,1066.23999,520.0,1.54955
 flop,sky130,128,1000,1.0000,2132.4799805,1035.0,3.094
 mux2,sky130,8,1000,1.0000,63.700001,21.541,0.01932440083034535
 mux2,sky130,16,1000,1.0000,119.560002,32.354,0.03884536082474227
 mux2,sky130,32,1000,1.0000,375.340008,259.372,0.13671796921846893
 mux2,sky130,64,1000,1.0000,479.220009,115.22,0.15148539160324087
 mux2,sky130,128,1000,1.0000,1302.420025,767.078,0.4665334665334665
 mux4,sky130,8,1000,1.0000,148.960002,66.984,0.04026661024121879
 mux4,sky130,16,1000,1.0000,392.0,398.313,0.1037037037037037
 mux4,sky130,32,1000,1.0000,594.860011,331.197,0.131617289946576
 mux4,sky130,64,1000,1.0000,899.640016,344.331,0.2862533692722372
 mux4,sky130,128,1000,1.0000,2013.900038,818.249,0.6094182825484764
 mux8,sky130,8,1000,1.0000,287.140006,116.648,0.06089260808926081
 mux8,sky130,16,1000,1.0000,582.120003,282.366,0.14455681142177274
 mux8,sky130,32,1000,1.0000,1319.079995,670.683,0.35777218376337316
 mux8,sky130,64,1000,1.0000,2132.48004,808.482,0.44287680660701995
 mux8,sky130,128,1000,1.0000,4575.620089,1830.0,0.9786276715410572
 mul,sky130,8,1000,1.0000,2194.220041,1440.0,1.421374045801527
 mul,sky130,16,1000,1.0000,7519.540137,4940.0,6.376128385155466
 mul,sky130,32,1000,1.0000,25200.700446,14900.0,24.931847968545217
 mul,sky130,64,1000,1.0000,86011.661365,42600.0,88.84651898734177
 mul,sky130,128,800,1.2500,296198.144128,114000.0,273.3148854961832
 binencoder,sky90,8,7683,0.12508649056358195,50.960001,24.761,0.010685929975270078
 binencoder,sky90,16,5773,0.16977016282695304,136.220003,77.243,0.021773774467348
 binencoder,sky90,32,4500,0.2218912222222222,372.400007,189.626,0.04371111111111111
 binencoder,sky90,64,4098,0.2439914738897023,797.720015,382.205,0.07393850658857981
 binencoder,sky90,128,3409,0.2933331557641537,1602.300031,610.009,0.1261366969785861
 adder,sky90,8,3658,0.27337042810278844,253.820005,154.438,0.10825587752870422
 adder,sky90,16,2942,0.3393218266485384,722.260013,485.109,0.32460910944935417
 adder,sky90,32,2468,0.40496338573743923,1440.600027,714.057,0.6580226904376014
 adder,sky90,64,2139,0.4674681813931744,2781.240054,1050.0,0.9392239364188874
 adder,sky90,128,1885,0.5304949787798409,6186.740118,2230.0,2.1480106100795755
 csa,sky90,8,5758,0.16536141368530738,266.560005,154.202,0.13650573115665163
 csa,sky90,16,5931,0.1654056314280897,533.12001,308.404,0.27263530601922104
 csa,sky90,32,5758,0.16536141368530738,1066.240021,616.808,0.5448072247308093
 csa,sky90,64,5931,0.1654056314280897,2132.480042,1230.0,1.0905412240768841
 csa,sky90,128,5931,0.1654056314280897,4264.960083,2470.0,2.178553363682347
-shiftleft,sky90,8,4327,0.23025600254217704,259.700005,196.451,0.07534088282874972
+shifter,sky90,8,4327,0.23025600254217704,259.700005,196.451,0.07534088282874972
-shiftleft,sky90,16,3355,0.29803959314456036,666.400006,558.433,0.19552906110283155
+shifter,sky90,16,3355,0.29803959314456036,666.400006,558.433,0.19552906110283155
-shiftleft,sky90,32,2503,0.39951757530962845,1475.880027,768.262,0.3807431082700759
+shifter,sky90,32,2503,0.39951757530962845,1475.880027,768.262,0.3807431082700759
-shiftleft,sky90,64,2203,0.45385946391284615,3914.120062,2680.0,1.144802541988198
+shifter,sky90,64,2203,0.45385946391284615,3914.120062,2680.0,1.144802541988198
-shiftleft,sky90,128,1907,0.5242938489774515,9192.400136,6080.0,2.9008914525432616
+shifter,sky90,128,1907,0.5242938489774515,9192.400136,6080.0,2.9008914525432616
 comparator,sky90,8,4839,0.20629126741062204,200.900004,136.6,0.05001033271337053
 comparator,sky90,16,4018,0.24806303982080635,358.680007,189.253,0.06321553011448482
 comparator,sky90,32,3602,0.276293542476402,690.900013,315.709,0.10771793448084398
@ -44,31 +94,31 @@ mux8,sky90,16,3362,0.295237998810232,582.120003,282.366,0.14455681142177274
 mux8,sky90,32,3178,0.3140553102580239,1319.079995,670.683,0.35777218376337316
 mux8,sky90,64,2906,0.3440756228492774,2132.48004,808.482,0.44287680660701995
 mux8,sky90,128,2667,0.3749401308586427,4575.620089,1830.0,0.9786276715410572
-mult,sky90,8,1310,0.7631557786259543,2194.220041,1440.0,1.421374045801527
+mul,sky90,8,1310,0.7631557786259543,2194.220041,1440.0,1.421374045801527
-mult,sky90,16,997,1.0029260270812437,7519.540137,4940.0,6.376128385155466
+mul,sky90,16,997,1.0029260270812437,7519.540137,4940.0,6.376128385155466
-mult,sky90,32,763,1.3106129895150722,25200.700446,14900.0,24.931847968545217
+mul,sky90,32,763,1.3106129895150722,25200.700446,14900.0,24.931847968545217
-mult,sky90,64,632,1.5822664810126583,86011.661365,42600.0,88.84651898734177
+mul,sky90,64,632,1.5822664810126583,86011.661365,42600.0,88.84651898734177
-mult,sky90,128,524,1.9083759465648855,296198.144128,114000.0,273.3148854961832
+mul,sky90,128,524,1.9083759465648855,296198.144128,114000.0,273.3148854961832
-priorityencoder,tsmc28,8,31335,0.031912196106590074,8.316,34.836,0.001716929950534546
+binencoder,tsmc28,8,31335,0.031912196106590074,8.316,34.836,0.001716929950534546
-priorityencoder,tsmc28,16,21253,0.04703118086858326,21.672,78.026,0.004008845810003294
+binencoder,tsmc28,16,21253,0.04703118086858326,21.672,78.026,0.004008845810003294
-priorityencoder,tsmc28,32,16464,0.06071258114674442,61.614,207.499,0.009323372206025266
+binencoder,tsmc28,32,16464,0.06071258114674442,61.614,207.499,0.009323372206025266
-priorityencoder,tsmc28,64,13804,0.07239877021153289,137.466,425.592,0.01847290640394089
+binencoder,tsmc28,64,13804,0.07239877021153289,137.466,425.592,0.01847290640394089
-priorityencoder,tsmc28,128,11440,0.0874065874125874,317.646,973.649,0.041171328671328666
+binencoder,tsmc28,128,11440,0.0874065874125874,317.646,973.649,0.041171328671328666
-add,tsmc28,8,13838,0.07207477814713109,34.272,187.089,0.013311172134701546
+adder,tsmc28,8,13838,0.07207477814713109,34.272,187.089,0.013311172134701546
-add,tsmc28,16,11521,0.08678002100512108,90.972001,475.207,0.03367763214998698
+adder,tsmc28,16,11521,0.08678002100512108,90.972001,475.207,0.03367763214998698
-add,tsmc28,32,9812,0.1018860211985324,209.286002,1060.0,0.08153281695882594
+adder,tsmc28,32,9812,0.1018860211985324,209.286002,1060.0,0.08153281695882594
-add,tsmc28,64,8206,0.12185605215695831,388.836003,1770.0,0.1409943943456008
+adder,tsmc28,64,8206,0.12185605215695831,388.836003,1770.0,0.1409943943456008
-add,tsmc28,128,7354,0.13597341881968997,907.452008,4360.0,0.3451183029643731
+adder,tsmc28,128,7354,0.13597341881968997,907.452008,4360.0,0.3451183029643731
 csa,tsmc28,8,24524,0.040663382319360626,52.416,482.462,0.02173381177621921
 csa,tsmc28,16,24524,0.040663382319360626,104.832,964.99,0.04346762355243842
 csa,tsmc28,32,24524,0.040663382319360626,209.664,1930.0,0.08677214157559941
 csa,tsmc28,64,24524,0.040663382319360626,419.327999,3860.0,0.17342195400424076
 csa,tsmc28,128,24524,0.040663382319360626,838.655998,7720.0,0.3471701190670363
-shiftleft,tsmc28,8,15202,0.0656078183133798,50.652,367.074,0.016991185370346006
+shifter,tsmc28,8,15202,0.0656078183133798,50.652,367.074,0.016991185370346006
-shiftleft,tsmc28,16,11804,0.08465604506946797,127.511999,602.29,0.03388681802778719
+shifter,tsmc28,16,11804,0.08465604506946797,127.511999,602.29,0.03388681802778719
-shiftleft,tsmc28,32,9587,0.10430391697089808,384.803997,1940.0,0.10180452696359654
+shifter,tsmc28,32,9587,0.10430391697089808,384.803997,1940.0,0.10180452696359654
-shiftleft,tsmc28,64,8272,0.12086674854932303,1041.263998,5460.0,0.2895309477756286
+shifter,tsmc28,64,8272,0.12086674854932303,1041.263998,5460.0,0.2895309477756286
-shiftleft,tsmc28,128,7023,0.14238329232521713,1836.953994,8670.0,0.566566994162039
+shifter,tsmc28,128,7023,0.14238329232521713,1836.953994,8670.0,0.566566994162039
 comparator,tsmc28,8,17422,0.05733769130983814,35.784,170.595,0.009488003673516243
 comparator,tsmc28,16,13736,0.07273839778683751,54.558,250.167,0.014349155503785673
 comparator,tsmc28,32,12139,0.08236710865804432,145.782,622.975,0.03567015404893319
@ -94,8 +144,58 @@ mux8,tsmc28,16,12264,0.08147446510110894,128.771998,548.714,0.02666340508806262
 mux8,tsmc28,32,11713,0.08517122410996329,172.115999,823.633,0.046956373260479814
 mux8,tsmc28,64,11014,0.09067453550027238,304.163999,1460.0,0.08498274922825495
 mux8,tsmc28,128,10474,0.09542350830628223,683.045996,2820.0,0.15705556616383426
-mult,tsmc28,8,5200,0.1922996923076923,577.206,4340.0,0.37769230769230766
+mul,tsmc28,8,5200,0.1922996923076923,577.206,4340.0,0.37769230769230766
-mult,tsmc28,16,3819,0.26184265147944485,1634.472002,11800.0,1.4553548049227547
+mul,tsmc28,16,3819,0.26184265147944485,1634.472002,11800.0,1.4553548049227547
-mult,tsmc28,32,3033,0.3295775611605671,6343.721998,47200.0,6.303330036267723
+mul,tsmc28,32,3033,0.3295775611605671,6343.721998,47200.0,6.303330036267723
-mult,tsmc28,64,2390,0.4184090418410042,16045.092071,109000.0,18.54602510460251
+mul,tsmc28,64,2390,0.4184090418410042,16045.092071,109000.0,18.54602510460251
-mult,tsmc28,128,1868,0.5353279057815846,44272.49428,262000.0,50.01177730192719
+mul,tsmc28,128,1868,0.5353279057815846,44272.49428,262000.0,50.01177730192719
 binencoder,tsmc28psyn,8,31335,0.031912196106590074,8.316,34.836,0.001716929950534546
 binencoder,tsmc28psyn,16,21253,0.04703118086858326,21.672,78.026,0.004008845810003294
 binencoder,tsmc28psyn,32,16464,0.06071258114674442,61.614,207.499,0.009323372206025266
 binencoder,tsmc28psyn,64,13804,0.07239877021153289,137.466,425.592,0.01847290640394089
 binencoder,tsmc28psyn,128,11440,0.0874065874125874,317.646,973.649,0.041171328671328666
 adder,tsmc28psyn,8,13838,0.07207477814713109,34.272,187.089,0.013311172134701546
 adder,tsmc28psyn,16,11521,0.08678002100512108,90.972001,475.207,0.03367763214998698
 adder,tsmc28psyn,32,9812,0.1018860211985324,209.286002,1060.0,0.08153281695882594
 adder,tsmc28psyn,64,8206,0.12185605215695831,388.836003,1770.0,0.1409943943456008
 adder,tsmc28psyn,128,7000,0.142857142857,907.452008,4360.0,0.3451183029643731
 csa,tsmc28psyn,8,24524,0.040663382319360626,52.416,482.462,0.02173381177621921
 csa,tsmc28psyn,16,24524,0.040663382319360626,104.832,964.99,0.04346762355243842
 csa,tsmc28psyn,32,24524,0.040663382319360626,209.664,1930.0,0.08677214157559941
 csa,tsmc28psyn,64,24524,0.040663382319360626,419.327999,3860.0,0.17342195400424076
 csa,tsmc28psyn,128,24524,0.040663382319360626,838.655998,7720.0,0.3471701190670363
 shifter,tsmc28psyn,8,15202,0.0656078183133798,50.652,367.074,0.016991185370346006
 shifter,tsmc28psyn,16,11804,0.08465604506946797,127.511999,602.29,0.03388681802778719
 shifter,tsmc28psyn,32,9587,0.10430391697089808,384.803997,1940.0,0.10180452696359654
 shifter,tsmc28psyn,64,8272,0.12086674854932303,1041.263998,5460.0,0.2895309477756286
 shifter,tsmc28psyn,128,7023,0.14238329232521713,1836.953994,8670.0,0.566566994162039
 comparator,tsmc28psyn,8,17422,0.05733769130983814,35.784,170.595,0.009488003673516243
 comparator,tsmc28psyn,16,13736,0.07273839778683751,54.558,250.167,0.014349155503785673
 comparator,tsmc28psyn,32,12139,0.08236710865804432,145.782,622.975,0.03567015404893319
 comparator,tsmc28psyn,64,11080,0.09024670758122744,294.21,1250.0,0.0684115523465704
 comparator,tsmc28psyn,128,9371,0.10671119720414043,558.432,2400.0,0.12794792444776437
 flop,tsmc28psyn,8,10,0.048889000000002625,15.12,78.6345,0.027246000000000003
 flop,tsmc28psyn,16,10,0.048889000000002625,30.24,157.29,0.054290000000000005
 flop,tsmc28psyn,32,10,0.048889000000002625,60.4799995,314.5805,0.10908000000000001
 flop,tsmc28psyn,64,10,0.048889000000002625,120.959999,630.0,0.21765500000000004
 flop,tsmc28psyn,128,10,0.048889000000002625,241.919998,1260.0,0.43579999999999997
 mux2,tsmc28psyn,8,29614,0.03374481252110488,16.758,114.564,0.005436617815897886
 mux2,tsmc28psyn,16,18767,0.053046021580433735,15.75,88.025,0.005142004582511856
 mux2,tsmc28psyn,32,17903,0.05585556035301346,32.130001,171.146,0.009897782494553985
 mux2,tsmc28psyn,64,18568,0.05371109651012495,91.35,523.884,0.027574321413183972
 mux2,tsmc28psyn,128,16637,0.05991099044298852,176.525999,941.106,0.05012923002945243
 mux4,tsmc28psyn,8,18151,0.055092383284667513,27.971999,133.963,0.008032615282904523
 mux4,tsmc28psyn,16,16486,0.06057952759917506,39.438,186.231,0.012556108213029236
 mux4,tsmc28psyn,32,15196,0.06580579126085812,69.174,324.969,0.023229797315082915
 mux4,tsmc28psyn,64,13926,0.07180612868016659,137.465999,648.086,0.04574177796926612
 mux4,tsmc28psyn,128,13090,0.07636619404125286,294.335997,1420.0,0.09358288770053477
 mux8,tsmc28psyn,8,12902,0.07750336319950395,44.604,214.286,0.0117501162610448
 mux8,tsmc28psyn,16,12264,0.08147446510110894,128.771998,548.714,0.02666340508806262
 mux8,tsmc28psyn,32,11713,0.08517122410996329,172.115999,823.633,0.046956373260479814
 mux8,tsmc28psyn,64,11014,0.09067453550027238,304.163999,1460.0,0.08498274922825495
 mux8,tsmc28psyn,128,10474,0.09542350830628223,683.045996,2820.0,0.15705556616383426
 mul,tsmc28psyn,8,5200,0.1922996923076923,577.206,4340.0,0.37769230769230766
 mul,tsmc28psyn,16,3819,0.26184265147944485,1634.472002,11800.0,1.4553548049227547
 mul,tsmc28psyn,32,3033,0.3295775611605671,6343.721998,47200.0,6.303330036267723
 mul,tsmc28psyn,64,2390,0.4184090418410042,16045.092071,109000.0,18.54602510460251
 mul,tsmc28psyn,128,1868,0.5353279057815846,44272.49428,262000.0,50.01177730192719
--- a/synthDC/ppa/ppaAnalyze.py
+++ b/synthDC/ppa/ppaAnalyze.py
--- a/synthDC/ppa/ppaSynth.py
+++ b/synthDC/ppa/ppaSynth.py
@ -12,13 +12,11 @@ from ppaAnalyze import synthsfromcsv
 def runCommand(module, width, tech, freq):
    command = "make synth DESIGN={} WIDTH={} TECH={} DRIVE=INV FREQ={} MAXOPT=1 MAXCORES=1".format(module, width, tech, freq)
-    print('here we go')
+    subprocess.call(command, shell=True)
    subprocess.Popen(command, shell=True)
 def deleteRedundant(synthsToRun):
    '''removes any previous runs for the current synthesis specifications'''
-    synthStr = "rm -rf runs/ppa_{}_{}_rv32e_{}nm_{}_*"
+    synthStr = "rm -rf runs/{}_{}_rv32e_{}_{}_*"
    for synth in synthsToRun:   
        bashCommand = synthStr.format(*synth)
        outputCPL = subprocess.check_output(['bash','-c', bashCommand])
@ -34,8 +32,21 @@ def freqSweep(module, width, tech):
                synthsToRun += [[synth.module, str(synth.width), synth.tech, str(freq)]]
    return synthsToRun
 def freqModuleSweep(widths, modules, tech):
    '''builds a frequency sweep for every (width, module) pair in the given tech

    For each entry of ppa/bestSynths.csv whose module, tech and width match,
    a center frequency is derived as 1000/delay and one synthesis spec is
    emitted at each of -8%, -6%, ..., +6%, +8% around that center (rounded
    to an integer).  Units are presumably ns for delay and MHz for the
    frequency, matching the Delay / Target Freq columns -- TODO confirm.

    widths:  iterable of widths, compared directly against synth.width
    modules: iterable of module names (each is passed through str())
    tech:    technology name, compared directly against synth.tech
    returns: list of [module, width, tech, freq] with width and freq as strings
    '''
    bestSynths = synthsfromcsv('ppa/bestSynths.csv')
    percentSteps = range(-8, 9, 2)
    sweep = []
    for w in widths:
        for module in modules:
            moduleName = str(module)
            for synth in bestSynths:
                # skip entries that differ in module, tech or width
                if synth.module != moduleName or synth.tech != tech or synth.width != w:
                    continue
                f = 1000/synth.delay
                for x in percentSteps:
                    freq = round(f+f*x/100)
                    sweep.append([synth.module, str(synth.width), synth.tech, str(freq)])
    return sweep
 def filterRedundant(synthsToRun):
-    bashCommand = "find . -path '*runs/ppa*rv32e*' -prune"
+    bashCommand = "find . -path '*runs/*' -prune"
    output = subprocess.check_output(['bash','-c', bashCommand])
    specReg = re.compile('[a-zA-Z0-9]+')
    allSynths = output.decode("utf-8").split('\n')[:-1]
@ -59,21 +70,30 @@ def allCombos(widths, modules, techs, freqs):
 if __name__ == '__main__':
-    ##### Run specific syntheses
+    ##### Run specific syntheses for a specific frequency
 	widths = [8, 16, 32, 64, 128] 
-	modules = ['mult', 'add', 'shiftleft', 'flop', 'comparator', 'priorityencoder', 'add', 'csa', 'mux2', 'mux4', 'mux8']
+	modules = ['mul', 'adder', 'shifter', 'flop', 'comparator', 'binencoder', 'csa', 'mux2', 'mux4', 'mux8']
-	techs = ['sky90', 'tsmc28']
+	techs = ['sky90', 'sky130', 'tsmc28', 'tsmc28psyn']
 	freqs = [5000]
 	synthsToRun = allCombos(widths, modules, techs, freqs)
    ##### Run a sweep based on best delay found in existing syntheses
-	module = 'add'
+	module = 'adder'
 	width = 32
-	tech = 'sky90'
+	tech = 'tsmc28psyn'
 	synthsToRun = freqSweep(module, width, tech)
    ##### Run a sweep for multiple modules/widths based on best delay found in existing syntheses
 	modules = ['adder']
 #	widths = [8, 16, 32, 64, 128]
 	widths = [32]
 	tech = 'sky130'
 	synthsToRun = freqModuleSweep(widths, modules, tech)	
    ##### Only do syntheses for which a run doesn't already exist
-	synthsToRun = filterRedundant(synthsToRun)
+	synthsToRun = filterRedundant(synthsToRun)	
 	pool = Pool(processes=25)
-	pool.starmap(runCommand, synthsToRun)
+
 pool.starmap(runCommand, synthsToRun)
 pool.close()
 pool.join()
--- a/synthDC/scripts/synth.tcl
+++ b/synthDC/scripts/synth.tcl
@ -18,7 +18,6 @@ suppress_message {VER-274}
 # Enable Multicore
 set_host_options -max_cores $::env(MAXCORES)
 # get outputDir and configDir from environment (Makefile)
 set outputDir $::env(OUTPUTDIR)
 set cfg $::env(CONFIGDIR)
@ -26,6 +25,7 @@ set hdl_src "../src"
 set saifpower $::env(SAIFPOWER)
 set maxopt $::env(MAXOPT)
 set drive $::env(DRIVE)
 set width $::env(WIDTH)
 eval file copy -force [glob ${cfg}/*.vh] {$outputDir/hdl/}
 eval file copy -force [glob ${hdl_src}/cvw.sv] {$outputDir/hdl/}
@ -88,7 +88,13 @@ if { [shell_is_in_topographical_mode] } {
 #set alib_library_analysis_path ./$outputDir
 define_design_lib WORK -path ./$outputDir/WORK
 analyze -f sverilog -lib WORK $my_verilog_files
-elaborate $my_toplevel -lib WORK 
+# If wrapper=0, we want to run against a specific module and pass
 # width to DC
 if { $wrapper == 1 } {
    elaborate $my_toplevel -lib WORK 
 } else {
    elaborate $my_toplevel -lib WORK -parameters WIDTH=$width
 }
 # Set the current_design 
 current_design $my_toplevel
@ -447,4 +453,4 @@ set t2 [clock seconds]
 set t [expr $t2 - $t1]
 echo [expr $t/60]
-quit 
+quit 
--- a/synthDC/wallySynthAll.sh
+++ b/synthDC/wallySynthAll.sh
@ -0,0 +1,14 @@
 # Run all Wally synthesis experiments from chapter 8
 # Every experiment is launched through ./wallySynth.py with a --tech / frequency pair;
 # only the tsmc28psyn runs additionally pass --usesram.
 # However, trying to run the freqsweeps at the same time maxes out licenses and some runs fail
 # NOTE(review): presumably this is why the three freqsweep lines below are commented out -- confirm
 #./wallySynth.py --freqsweep 330 --tech sky130 
 #./wallySynth.py --freqsweep 870 --tech sky90 
 #./wallySynth.py --freqsweep 2800 --tech tsmc28psyn --usesram
 # Configuration sweeps (--configsweep), one per technology at its target frequency
 ./wallySynth.py --configsweep --tech sky130 --targetfreq 330
 ./wallySynth.py --configsweep --tech sky90 --targetfreq 870
 ./wallySynth.py --configsweep --tech tsmc28psyn --targetfreq 2800 --usesram
 # Feature sweeps (--featuresweep), using the same technology / target-frequency pairs
 ./wallySynth.py --featuresweep --tech sky130 --targetfreq 330
 ./wallySynth.py --featuresweep --tech sky90 --targetfreq 870
 ./wallySynth.py --featuresweep --tech tsmc28psyn --targetfreq 2800 --usesram
 # Extract summary data (run this by hand after all experiments finish)
 #./extractSummary.py --sky130freq 330 --sky90freq 870 --tsmcfreq 2800
--- a/testbench/testbench-fp.sv
+++ b/testbench/testbench-fp.sv
@ -115,8 +115,8 @@ module testbenchfp;
   logic 			FlushE;
   logic 			IFDivStartE;
   logic 			FDivDoneE;
-   logic [P.NE+1:0] 		QeM;
+   logic [P.NE+1:0] 		UeM;
-   logic [P.DIVb:0] 		QmM;
+   logic [P.DIVb:0] 		UmM;
   logic [P.XLEN-1:0] 		FIntDivResultM;
   logic 			ResMatch;                   // Check if result match
   logic 			FlagMatch;                  // Check if IEEE flags match
@ -145,9 +145,12 @@ module testbenchfp;
   initial begin
      // Information displayed for user on what is simulating
-      $display("\nThe start of simulation...");      
+      //$display("\nThe start of simulation...");      
-      $display("This simulation for TEST is %s", TEST);
+      //$display("This simulation for TEST is %s", TEST);
-      $display("This simulation for TEST is of the operand size of %s", TEST_SIZE);      
+      //$display("This simulation for TEST is of the operand size of %s", TEST_SIZE);      
      // $display("FPDUR %d %d DIVN %d LOGR %d RK %d RADIX %d DURLEN %d", FPDUR, DIVN, LOGR, RK, RADIX, DURLEN);
      if (P.Q_SUPPORTED & (TEST_SIZE == "QP" | TEST_SIZE == "all")) begin // if Quad percision is supported
 	 if (TEST === "cvtint" | TEST === "all") begin  // if testing integer conversion
            // add the 128-bit cvtint tests to the to-be-tested list
@ -649,7 +652,7 @@ module testbenchfp;
      string tt0;
      tt0 = $psprintf("%s", Tests[TestNum]);
      testname = {pp, tt0};
-      $display("Here you are %s", testname);     
+      //$display("Here you are %s", testname);     
      $display("\n\nRunning %s vectors ", Tests[TestNum]);
      $readmemh(testname, TestVectors);
      // set the test index to 0
@ -705,7 +708,7 @@ module testbenchfp;
   end
   postprocess #(P) postprocess(.Xs(Xs), .Ys(Ys), .PostProcSel(UnitVal[1:0]),
-				.OpCtrl(OpCtrlVal), .DivQm(Quot), .DivQe(DivCalcExp),
+				.OpCtrl(OpCtrlVal), .DivUm(Quot), .DivUe(DivCalcExp),
 				.Xm(Xm), .Ym(Ym), .Zm(Zm), .CvtCe(CvtCalcExpE), .DivSticky(DivSticky), .FmaSs(Ss),
 				.XNaN(XNaN), .YNaN(YNaN), .ZNaN(ZNaN), .CvtResSubnormUf(CvtResSubnormUfE),
 				.XZero(XZero), .YZero(YZero), .CvtShiftAmt(CvtShiftAmtE),
@ -734,8 +737,8 @@ module testbenchfp;
 			     .XInfE(XInf), .YInfE(YInf), .XZeroE(XZero), .YZeroE(YZero), 
 			     .XNaNE(XNaN), .YNaNE(YNaN), 
 			     .FDivStartE(DivStart), .IDivStartE(1'b0), .W64E(1'b0),
-			     .StallM(1'b0), .DivStickyM(DivSticky), .FDivBusyE, .QeM(DivCalcExp),
+			     .StallM(1'b0), .DivStickyM(DivSticky), .FDivBusyE, .UeM(DivCalcExp),
-			     .QmM(Quot),
+			     .UmM(Quot),
 			     .FlushE(1'b0), .ForwardedSrcAE('0), .ForwardedSrcBE('0), .Funct3M(Funct3M),
 			     .Funct3E(Funct3E), .IntDivE(1'b0), .FIntDivResultM(FIntDivResultM),
 			     .FDivDoneE(FDivDoneE), .IFDivStartE(IFDivStartE));
--- a/testbench/testbench.sv
+++ b/testbench/testbench.sv
@ -389,6 +389,7 @@ module testbench;
    assign SDCCmd = SDCCmdOE ? SDCCmdOut : 1'bz;
    assign SDCCmdIn = SDCCmd;
    assign SDCDat = sd_dat_reg_t ? sd_dat_reg_o : sd_dat_i;
    assign SDCDatIn = SDCDat;
 -----/\----- EXCLUDED -----/\----- */
    assign SDCIntr = '0;
		`@ -0,0 +1 @@`
							`Subproject commit 5df21aa6625eca120e64ea353ca641aff37d90b2`
		`@ -1 +1 @@`
			`Subproject commit 1480febc3ace5f471baeee4b1ae0d8fea16e4762`				`Subproject commit 4c5eb87983f51ca7fcf7855306877b3d1c3aabf1`
		`@ -1 +1 @@`
			`Subproject commit 197179fdc9dfeeca821e848f373c897a3fdae86c`				`Subproject commit eb0a3892215ad2384702db02da1551a59701ec67`
		`@ -1 +0,0 @@`
			`Subproject commit cf04274f50621fd9ef9147793cca6dd1657985c7`
		`@ -1 +0,0 @@`
			`Subproject commit c76a8613a177b3a04face2cb8e15dd07a8d2fc40`