Merge pull request #936 from kevindkim723/divremsqrtport

adding divremsqrt paper files
2025-02-11 06:05:49 +00:00 · 2024-08-29 15:01:37 -07:00 · 2024-08-29 15:01:37 -07:00 · 3a772416df
commit 3a772416df
parent 86aaf43306 4824b6c569
43 changed files with 6539 additions and 495 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,49 @@
+# divremsqrt
+This branch contains the relevant hardware and test/synthesis flows for cvw's unified integer/fp divide/sqrt recurrence unit. The recurrence unit can be generated in a variety of configurations, which span flavors of radix = {2,4}, floating-point precision = {float,double,quad}, integer width = {unsupported,32,64} and divider copies = {1,2,4,8}.
+
+The fpu postprocessor on cvw handles inputs not only from the div/sqrt unit, but also the fma and convert units. This branch's drsu unit contains a postprocessor with logic only relevant to division/sqrt.
+
+# file hierarchy
+
+The RTL files for the divider can be found under `cvw/src/fpu`
+
+The majority of divider modules are found in `cvw/src/fpu/divremsqrt`, which also borrows some modules from `cvw/src/fpu/fdivsqrt`
+
+`divremsqrt/drsu` describes the top-level unit for the divider, taking in unpacked floating point signals, including Xs, Xm, Xe, Ys, Ym, Ye.
+
+drsu first feeds signals to `divremsqrt/divremsqrt`, which contains the preprocessor, iteration units, fsm, and postprocessing logic. The postprocessor in `divremsqrt/divremsqrt` also contains all integer postprocessing logic. Outputs from `divremsqrt/divremsqrt` are then sent to `divremsqrt/divremsqrtpostprocess`, which handles rounding and flags.
+
+# verification flow
+
+drsu is verified with the risc-v arch test Berkeley SoftFloat floating point suite of test vectors for floating point square-root and division. In order to run the top-level regression script, run `regression-wally-intdiv -intdiv`
+
+The top-level regression python script is found accordingly in `cvw/bin/regression-wally-intdiv`. The testbench is found in `cvw/testbench/testbench_fp`, which runs drsu against testvectors. Batches of testvectors are stored within `cvw/testbench/tests-fp.vh`, and the raw binary test vectors are read from `tests/fp/vectors`
+
+Regression log files can be found in `cvw/sim/questa/logs` after running `regression-wally-intdiv -intdiv`. Files are named with `{precision}_ieee_div_{R}_{K}_{integer}_rv{XLEN}gc_{TESTNAME}.log`
+
+* precision denotes the floating-point precision types supported by the divider: f, fd, fdq, fdqh
+* R denotes the radix of the divider: 2,4
+* K denotes the number of divider copies in the unit: 1,2,4,8
+* integer denotes whether integer division/remainder is supported on the divider: i
+* XLEN denotes the width of integers: 32, 64 (this only matters if integer is supported on the divider)
+* TESTNAME denotes which tests are being run:
+    * fdivremsqrt: runs fdiv, fsqrt, intdiv, intrem
+    * fdiv: runs fdiv
+    * fsqrt: runs fsqrt
+   
+
+ 
+# synthesis flow
+To generate synthesis results for all flavors of the recurrence unit, go to `cvw/synthDC/scripts` and run `python3 synthdrsu.py`. This will execute a python script that runs the installed version of Synopsys Design Compiler on divider permutations for target frequencies of 5 GHz and 100 MHz. To then pipe area, delay and energy results to a CSV, run `./writeCSV.sh`. Results can then be viewed in `fp-synthresults_reordered.csv` in a format similar to the one presented in the paper.
+# start-up steps
+1) `git clone --recurse-submodules https://github.com/openhwgroup/cvw.git`
+2) `cd cvw`
+3) `git checkout divremsqrt`
+4) `source ./setup.sh`
+5) `make`
+6) `regression-wally-intdiv -intdiv`
+
+
 # core-v-wally

 Wally is a 5-stage pipelined processor configurable to support all the standard RISC-V options, including RV32/64, A, B, C, D, F, M, Q, and Zk* extensions, virtual memory, PMP, and the various privileged modes and CSRs. It provides optional caches, branch prediction, and standard RISC-V peripherals (CLINT, PLIC, UART, GPIO).   Wally is written in SystemVerilog.  It passes the [RISC-V Arch Tests](https://github.com/riscv-non-isa/riscv-arch-test) and boots Linux on an FPGA.  Configurations range from a minimal RV32E core to a fully featured RV64GC application processor.
--- a/bin/regression-wally
+++ b/bin/regression-wally
@ -371,6 +371,7 @@ args = parser.parse_args()
 if (args.nightly):
    nightMode = "--nightly";
    sims = ["questa", "verilator", "vcs"] # exercise all simulators; can omit a sim if no license is available
+#    sims = ["questa", "verilator"] # exercise all simulators; can omit a sim if no license is available
 else:
    nightMode = ""
    sims = [defaultsim]
@ -512,10 +513,12 @@ def main():
    elif args.fcov:
        TIMEOUT_DUR = 1*60
        os.system('rm -f questa/fcov_ucdb/* questa/fcov_logs/* questa/fcov/*')
-    elif args.nightly:
+    elif args.buildroot:
        TIMEOUT_DUR = 60*1440 # 1 day
    elif args.testfloat:
        TIMEOUT_DUR = 30*60 # seconds
+    elif args.nightly:
+        TIMEOUT_DUR = 30*60 # seconds
    else:
        TIMEOUT_DUR = 10*60 # seconds

--- a/bin/regression-wally-intdiv
+++ b/bin/regression-wally-intdiv
@ -0,0 +1,577 @@
+#!/usr/bin/python3
+##################################
+#
+# regression-wally-intdiv
+# David_Harris@Hmc.edu 25 January 2021
+# Modified by Jarred Allen <jaallen@g.hmc.edu>
+#
+# Run a regression with multiple configurations in parallel and exit with
+# non-zero status code if an error happened, as well as printing human-readable
+# output.
+#
+##################################
+import sys,os,shutil
+import multiprocessing
+
+
+
+class bcolors:
+    """ANSI escape sequences used to colorize the regression's printed output."""
+    # foreground colors
+    HEADER = "\033[95m"
+    OKBLUE = "\033[94m"
+    OKCYAN = "\033[96m"
+    OKGREEN = "\033[92m"
+    WARNING = "\033[93m"
+    FAIL = "\033[91m"
+    # text attributes
+    BOLD = "\033[1m"
+    UNDERLINE = "\033[4m"
+    # reset; emitted after a colored message
+    ENDC = "\033[0m"
+
+from collections import namedtuple
+
+# Everything below is resolved relative to $WALLY/sim.  Stop with a readable
+# message when WALLY is unset instead of failing on None + str (TypeError).
+WALLY = os.environ.get('WALLY')
+if WALLY is None:
+    sys.exit("Error: the WALLY environment variable is not set")
+regressionDir = WALLY + '/sim'
+os.chdir(regressionDir)
+
+# Mode flags: each is True when the corresponding switch appears anywhere
+# on the command line.
+coverage = '-coverage' in sys.argv
+fp = '-fp' in sys.argv
+nightly = '-nightly' in sys.argv
+softfloat = '-softfloat' in sys.argv
+intdiv = '-intdiv' in sys.argv
+
+TestCase = namedtuple("TestCase", ['name', 'variant', 'cmd', 'grepstr'])
+# name:     the name of this test configuration (used in printing human-readable
+#           output and picking logfile names)
+# variant:  the configuration the test is run against; the logfile is named
+#           <variant>_<name>.log and results are printed as <variant>_<name>
+# cmd:      the command to run to test (should include the logfile as '{}', and
+#           the command needs to write to that file)
+# grepstr:  the string to grep through the log file for. The test succeeds iff
+#           grep finds that string in the logfile (is used by grep, so it may
+#           be any pattern grep accepts, see `man 1 grep` for more info).
+
+# edit this list to add more test cases
+if (nightly):
+    nightMode = "-nightly"
+    configs = []
+else:
+    nightMode = ""
+    configs = [
+        TestCase(
+            name="lints",
+            variant="all",
+            cmd="./lint-wally " + nightMode + " | tee {}",
+            grepstr="lints run with no errors or warnings"
+        )
+    ]
+def getBuildrootTC(boot):
+    """Return the TestCase describing a buildroot simulation on rv64gc.
+
+    boot: when truthy, build the "buildrootboot" case, which passes once the
+          log contains the "WallyHostname login:" prompt.  Otherwise build the
+          "buildroot" case, which passes once the log reports INSTR_LIMIT
+          instructions; its command depends on the module-level `coverage`
+          flag, and the chosen flavor is printed.
+    """
+    INSTR_LIMIT = 1000000 # multiple of 100000; 4M is interesting because it gets into the kernel and enabling VM
+    MAX_EXPECTED = 246000000 # *** TODO: replace this with a search for the login prompt.
+    if boot:
+        return TestCase(
+            "buildrootboot",
+            variant="rv64gc",
+            cmd="vsim > {} -c <<!\ndo wally.do buildroot buildroot-no-trace $RISCV 0 1 0\n!",
+            grepstr="WallyHostname login:")
+
+    limit = str(INSTR_LIMIT)
+    if coverage:
+        print( "buildroot coverage")
+        runCmd = "vsim > {} -c <<!\ndo wally-batch.do buildroot buildroot $RISCV " + limit + " 1 0 -coverage\n!"
+    else:
+        print( "buildroot no coverage")
+        runCmd = "vsim > {} -c <<!\ndo wally-batch.do buildroot buildroot configOptions -GINSTR_LIMIT=" + limit + " \n!"
+    return TestCase("buildroot", variant="rv64gc", cmd=runCmd, grepstr=limit + " instructions")
+
+tests64gcimperas = ["imperas64i", "imperas64f", "imperas64d", "imperas64m", "imperas64c"] # unused
+
+tests64i = ["arch64i"] 
+for test in tests64i:
+  tc = TestCase(
+        name=test,
+        variant="rv64i",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv64i "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+tests32gcimperas = ["imperas32i", "imperas32f", "imperas32m", "imperas32c"] # unused
+tests32gc = ["arch32f", "arch32d", "arch32f_fma", "arch32d_fma", "arch32f_divsqrt", "arch32d_divsqrt", 
+             "arch32i", "arch32priv", "arch32c",  "arch32m", "arch32a", "arch32zifencei", "arch32zicond", 
+             "arch32zba", "arch32zbb", "arch32zbc", "arch32zbs", "arch32zfh", "arch32zfh_fma", 
+             "arch32zfh_divsqrt", "arch32zfaf", "wally32a", "wally32priv", "wally32periph", 
+             "arch32zbkb", "arch32zbkc", "arch32zbkx", "arch32zknd", "arch32zkne", "arch32zknh"]  # "arch32zbc", "arch32zfad",
+#tests32gc = ["arch32f", "arch32d", "arch32f_fma", "arch32d_fma", "arch32i", "arch32priv", "arch32c",  "arch32m", "arch32a", "arch32zifencei", "arch32zba", "arch32zbb", "arch32zbc", "arch32zbs", "arch32zicboz", "arch32zcb", "wally32a",  "wally32priv", "wally32periph"]  
+for test in tests32gc:
+  tc = TestCase(
+        name=test,
+        variant="rv32gc",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv32gc "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+tests32imcimperas = ["imperas32i", "imperas32c"] # unused
+tests32imc = ["arch32i", "arch32c", "arch32m", "wally32periph"] 
+for test in tests32imc:
+  tc = TestCase(
+        name=test,
+        variant="rv32imc",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv32imc "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+tests32i = ["arch32i"] 
+for test in tests32i:
+  tc = TestCase(
+        name=test,
+        variant="rv32i",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv32i "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+
+tests32e = ["arch32e"] 
+for test in tests32e:
+  tc = TestCase(
+        name=test,
+        variant="rv32e",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv32e "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+tests64gc = ["arch64f", "arch64d", "arch64f_fma", "arch64d_fma", "arch64f_divsqrt", "arch64d_divsqrt", "arch64i", "arch64zba", "arch64zbb", "arch64zbc", "arch64zbs",  "arch64zfh", "arch64zfh_divsqrt", "arch64zfh_fma", "arch64zfaf", "arch64zfad", "arch64zbkb", "arch64zbkc", "arch64zbkx", "arch64zknd", "arch64zkne", "arch64zknh",
+             "arch64priv", "arch64c",  "arch64m", "arch64a", "arch64zifencei", "arch64zicond", "wally64a", "wally64periph", "wally64priv"] # add arch64zfh_fma when available; arch64zicobz, arch64zcb when working
+#tests64gc = ["arch64f", "arch64d", "arch64f_fma", "arch64d_fma", "arch64i", "arch64zba", "arch64zbb", "arch64zbc", "arch64zbs", 
+#             "arch64priv", "arch64c",  "arch64m", "arch64a", "arch64zifencei", "wally64a", "wally64periph", "wally64priv", "arch64zicboz", "arch64zcb"] 
+if (coverage):  # delete all but 64gc tests when running coverage
+    configs = []
+    tests64gc = ["coverage64gc", "arch64i", "arch64priv", "arch64c",  "arch64m",
+                 "arch64zifencei", "arch64zicond", "arch64a", "wally64a", "wally64periph", "wally64priv", 
+                 "arch64zba",  "arch64zbb",  "arch64zbc", "arch64zbs"] # add when working: "arch64zcb", "arch64zicboz"
+    if (fp):
+       tests64gc.append("arch64f")
+       tests64gc.append("arch64d")
+       tests64gc.append("arch64zfh")
+       tests64gc.append("arch64f_fma")
+       tests64gc.append("arch64d_fma") 
+       tests64gc.append("arch64zfh_fma")
+       tests64gc.append("arch64f_divsqrt")
+       tests64gc.append("arch64d_divsqrt")
+       tests64gc.append("arch64zfh_divsqrt")
+       tests64gc.append("arch64zfaf")
+       tests64gc.append("arch64zfad")
+    coverStr = '-coverage'
+else:
+   coverStr = ''
+for test in tests64gc:
+  tc = TestCase(
+        name=test,
+        variant="rv64gc",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv64gc "+test+" " + coverStr + "\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+# run derivative configurations if requested  
+if (nightly):
+    derivconfigtests = [
+        ["tlb2_rv32gc", ["wally32priv"]],
+        ["tlb16_rv32gc", ["wally32priv"]],
+        ["tlb2_rv64gc", ["wally64priv"]],
+        ["tlb16_rv64gc", ["wally64priv"]],
+        ["way_1_4096_512_rv32gc", ["arch32i"]],
+        ["way_2_4096_512_rv32gc", ["arch32i"]],
+        ["way_8_4096_512_rv32gc", ["arch32i"]],
+        ["way_4_2048_512_rv32gc", ["arch32i"]],
+        ["way_4_4096_256_rv32gc", ["arch32i"]],
+        ["way_1_4096_512_rv64gc", ["arch64i"]],
+        ["way_2_4096_512_rv64gc", ["arch64i"]],
+        ["way_8_4096_512_rv64gc", ["arch64i"]],
+        ["way_4_2048_512_rv64gc", ["arch64i"]],
+        ["way_4_4096_256_rv64gc", ["arch64i"]],
+        ["way_4_4096_1024_rv64gc", ["arch64i"]],
+
+        ["ram_0_0_rv64gc", ["ahb64"]],
+        ["ram_1_0_rv64gc", ["ahb64"]],
+        ["ram_1_1_rv64gc", ["ahb64"]],
+        ["ram_2_0_rv64gc", ["ahb64"]],
+        ["ram_2_1_rv64gc", ["ahb64"]],
+        
+        ["noicache_rv32gc", ["ahb32"]],
+# cacheless designs will not work until DTIM supports FLEN > XLEN
+#        ["nodcache_rv32gc", ["ahb32"]],
+#        ["nocache_rv32gc", ["ahb32"]],
+        ["noicache_rv64gc", ["ahb64"]],
+        ["nodcache_rv64gc", ["ahb64"]],
+        ["nocache_rv64gc", ["ahb64"]],
+
+        ### add misaligned tests
+
+        ["div_2_1_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_1i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_2_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_2i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_4_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_4i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_1_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_1i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_2_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_2i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_4_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_4i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_1_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_1i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_1_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_1i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+
+        ### branch predictor simulation
+
+        # ["bpred_TWOBIT_6_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_8_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_10_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],        
+        # ["bpred_TWOBIT_12_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_14_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],        
+        # ["bpred_TWOBIT_16_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_6_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_8_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_10_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],        
+        # ["bpred_TWOBIT_12_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_14_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],        
+        # ["bpred_TWOBIT_16_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+
+        # ["bpred_GSHARE_6_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_6_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_8_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_8_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_12_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_12_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_14_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_14_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_16_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_16_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+
+        # # btb
+        # ["bpred_GSHARE_10_16_6_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_6_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_8_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_8_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_12_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_12_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+
+        # # ras
+        # ["bpred_GSHARE_10_2_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_2_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_3_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_3_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_4_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_4_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_6_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_6_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_10_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_10_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+
+#  enable floating-point tests when lint is fixed
+        ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma"]],
+        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32i"]],
+        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32i"]],
+        ["f_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma"]],
+        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfh", "arch64zfh_divsqrt"]], # hanging 1/31/24 dh; try again when lint is fixed
+        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
+        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64i"]],
+        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64i", "wally64q"]],
+        
+
+    ]
+    for test in derivconfigtests:
+        config = test[0];
+        tests = test[1];
+        if(len(test) >= 4 and test[2] == "configOptions"):
+            configOptions = test[3]
+            cmdPrefix = "vsim > {} -c <<!\ndo wally-batch.do "+config
+        else:
+            configOptions = ""
+            cmdPrefix = "vsim > {} -c <<!\ndo wally-batch.do "+config
+        for t in tests:
+            tc = TestCase(
+                    name=t,
+                    variant=config,
+                    cmd=cmdPrefix+" "+t+" configOptions "+configOptions+"\n!",
+                    grepstr="All tests ran without failures")
+            configs.append(tc)
+
+
+
+
+# softfloat tests
+if (softfloat):
+    testfloatsim = "questa" # change to Verilator when Issue #707 about testfloat not running Verilator is resolved
+    configs = []
+    softfloatconfigs = [
+    "fdh_ieee_div_2_1_rv32gc", "fdh_ieee_div_2_1_rv64gc", "fdh_ieee_div_2_2_rv32gc",
+    "fdh_ieee_div_2_2_rv64gc", "fdh_ieee_div_2_4_rv32gc", "fdh_ieee_div_2_4_rv64gc",
+    "fdh_ieee_div_4_1_rv32gc", "fdh_ieee_div_4_1_rv64gc", "fdh_ieee_div_4_2_rv32gc",
+    "fdh_ieee_div_4_2_rv64gc", "fdh_ieee_div_4_4_rv32gc", "fdh_ieee_div_4_4_rv64gc",
+    "fd_ieee_div_2_1_rv32gc", "fd_ieee_div_2_1_rv64gc", "fd_ieee_div_2_2_rv32gc",
+    "fd_ieee_div_2_2_rv64gc", "fd_ieee_div_2_4_rv32gc", "fd_ieee_div_2_4_rv64gc",
+    "fd_ieee_div_4_1_rv32gc", "fd_ieee_div_4_1_rv64gc", "fd_ieee_div_4_2_rv32gc",
+    "fd_ieee_div_4_2_rv64gc", "fd_ieee_div_4_4_rv32gc", "fd_ieee_div_4_4_rv64gc",
+    "fdqh_ieee_div_2_1_rv32gc", "fdqh_ieee_div_2_1_rv64gc", "fdqh_ieee_div_2_2_rv32gc",
+    "fdqh_ieee_div_2_2_rv64gc", "fdqh_ieee_div_2_4_rv32gc", "fdqh_ieee_div_2_4_rv64gc",
+    "fdqh_ieee_div_4_1_rv32gc", "fdqh_ieee_div_4_1_rv64gc", "fdqh_ieee_div_4_2_rv32gc",
+    "fdqh_ieee_div_4_2_rv64gc", "fdqh_ieee_div_4_4_rv32gc", "fdqh_ieee_div_4_4_rv64gc",
+    "fdq_ieee_div_2_1_rv32gc", "fdq_ieee_div_2_1_rv64gc", "fdq_ieee_div_2_2_rv32gc",
+    "fdq_ieee_div_2_2_rv64gc", "fdq_ieee_div_2_4_rv32gc", "fdq_ieee_div_2_4_rv64gc",
+    "fdq_ieee_div_4_1_rv32gc", "fdq_ieee_div_4_1_rv64gc", "fdq_ieee_div_4_2_rv32gc",
+    "fdq_ieee_div_4_2_rv64gc", "fdq_ieee_div_4_4_rv32gc", "fdq_ieee_div_4_4_rv64gc",
+    "fh_ieee_div_2_1_rv32gc", "fh_ieee_div_2_1_rv64gc", "fh_ieee_div_2_2_rv32gc",
+    "fh_ieee_div_2_2_rv64gc", "fh_ieee_div_2_4_rv32gc", "fh_ieee_div_2_4_rv64gc",
+    "fh_ieee_div_4_1_rv32gc", "fh_ieee_div_4_1_rv64gc", "fh_ieee_div_4_2_rv32gc",
+    "fh_ieee_div_4_2_rv64gc", "fh_ieee_div_4_4_rv32gc", "fh_ieee_div_4_4_rv64gc",
+    "f_ieee_div_2_1_rv32gc", "f_ieee_div_2_1_rv64gc", "f_ieee_div_2_2_rv32gc",
+    "f_ieee_div_2_2_rv64gc", "f_ieee_div_2_4_rv32gc", "f_ieee_div_2_4_rv64gc",
+    "f_ieee_div_4_1_rv32gc", "f_ieee_div_4_1_rv64gc", "f_ieee_div_4_2_rv32gc",
+    "f_ieee_div_4_2_rv64gc", "f_ieee_div_4_4_rv32gc", "f_ieee_div_4_4_rv64gc"
+    ]
+    for config in softfloatconfigs:
+        # div test case
+        divtest = TestCase(
+            name="div",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " div \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.insert(0,divtest)
+
+        # sqrt test case
+        sqrttest = TestCase(
+            name="sqrt",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " sqrt \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        #configs.append(sqrttest)
+        configs.insert(0,sqrttest)
+
+
+        # skip if divider variant config
+        if ("ieee" in config):
+            # cvtint test case
+            cvtinttest = TestCase(
+                name="cvtint",
+                variant=config,
+                cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " cvtint \n!",
+                grepstr="All Tests completed with          0 errors"
+                )
+            configs.append(cvtinttest)
+
+            # cvtfp test case
+            # WILL fail on F_only (refer to spec)
+            cvtfptest = TestCase(
+                name="cvtfp",
+                variant=config,
+                cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " cvtfp \n!",
+                grepstr="All Tests completed with          0 errors"
+            )
+            configs.append(cvtfptest)    
+      
+# intdiv verification
+if (intdiv):
+    configs = []
+    testfloatsim = "questa" # change to Verilator when Issue #707 about testfloat not running Verilator is resolved
+    # ***NOTE add to this
+    
+    intdivconfigs = [
+    "fdh_ieee_div_2_1i_rv32gc", "fdh_ieee_div_2_1i_rv64gc", "fdh_ieee_div_2_2i_rv32gc",
+    "fdh_ieee_div_2_2i_rv64gc", "fdh_ieee_div_2_4i_rv32gc", "fdh_ieee_div_2_4i_rv64gc",
+    "fdh_ieee_div_4_1i_rv32gc", "fdh_ieee_div_4_1i_rv64gc", "fdh_ieee_div_4_2i_rv32gc",
+    "fdh_ieee_div_4_2i_rv64gc", "fdh_ieee_div_4_4i_rv32gc", "fdh_ieee_div_4_4i_rv64gc",
+    "fd_ieee_div_2_1i_rv32gc", "fd_ieee_div_2_1i_rv64gc", "fd_ieee_div_2_2i_rv32gc",
+    "fd_ieee_div_2_2i_rv64gc", "fd_ieee_div_2_4i_rv32gc", "fd_ieee_div_2_4i_rv64gc",
+    "fd_ieee_div_4_1i_rv32gc", "fd_ieee_div_4_1i_rv64gc", "fd_ieee_div_4_2i_rv32gc",
+    "fd_ieee_div_4_2i_rv64gc", "fd_ieee_div_4_4i_rv32gc", "fd_ieee_div_4_4i_rv64gc",
+    "fdqh_ieee_div_2_1i_rv32gc", "fdqh_ieee_div_2_1i_rv64gc", "fdqh_ieee_div_2_2i_rv32gc",
+    "fdqh_ieee_div_2_2i_rv64gc", "fdqh_ieee_div_2_4i_rv32gc", "fdqh_ieee_div_2_4i_rv64gc",
+    "fdqh_ieee_div_4_1i_rv32gc", "fdqh_ieee_div_4_1i_rv64gc", "fdqh_ieee_div_4_2i_rv32gc",
+    "fdqh_ieee_div_4_2i_rv64gc", "fdqh_ieee_div_4_4i_rv32gc", "fdqh_ieee_div_4_4i_rv64gc",
+    "fdq_ieee_div_2_1i_rv32gc", "fdq_ieee_div_2_1i_rv64gc", "fdq_ieee_div_2_2i_rv32gc",
+    "fdq_ieee_div_2_2i_rv64gc", "fdq_ieee_div_2_4i_rv32gc", "fdq_ieee_div_2_4i_rv64gc",
+    "fdq_ieee_div_4_1i_rv32gc", "fdq_ieee_div_4_1i_rv64gc", "fdq_ieee_div_4_2i_rv32gc",
+    "fdq_ieee_div_4_2i_rv64gc", "fdq_ieee_div_4_4i_rv32gc", "fdq_ieee_div_4_4i_rv64gc",
+    "fh_ieee_div_2_1i_rv32gc", "fh_ieee_div_2_1i_rv64gc", "fh_ieee_div_2_2i_rv32gc",
+    "fh_ieee_div_2_2i_rv64gc", "fh_ieee_div_2_4i_rv32gc", "fh_ieee_div_2_4i_rv64gc",
+    "fh_ieee_div_4_1i_rv32gc", "fh_ieee_div_4_1i_rv64gc", "fh_ieee_div_4_2i_rv32gc",
+    "fh_ieee_div_4_2i_rv64gc", "fh_ieee_div_4_4i_rv32gc", "fh_ieee_div_4_4i_rv64gc",
+    "f_ieee_div_2_1i_rv32gc", "f_ieee_div_2_1i_rv64gc", "f_ieee_div_2_2i_rv32gc",
+    "f_ieee_div_2_2i_rv64gc", "f_ieee_div_2_4i_rv32gc", "f_ieee_div_2_4i_rv64gc",
+    "f_ieee_div_4_1i_rv32gc", "f_ieee_div_4_1i_rv64gc", "f_ieee_div_4_2i_rv32gc",
+    "f_ieee_div_4_2i_rv64gc", "f_ieee_div_4_4i_rv32gc", "f_ieee_div_4_4i_rv64gc",
+    "fd_ieee_div_2_8i_rv32gc",
+    "fd_ieee_div_2_8i_rv64gc",
+    "fdq_ieee_div_2_8i_rv64gc",
+    "fdq_ieee_div_2_8i_rv32gc",
+    "f_ieee_div_2_8i_rv64gc",
+    "f_ieee_div_2_8i_rv32gc"
+    ]
+    nointdivconfigs = [
+    "fdh_ieee_div_2_1_rv32gc", "fdh_ieee_div_2_1_rv64gc", "fdh_ieee_div_2_2_rv32gc",
+    "fdh_ieee_div_2_2_rv64gc", "fdh_ieee_div_2_4_rv32gc", "fdh_ieee_div_2_4_rv64gc",
+    "fdh_ieee_div_4_1_rv32gc", "fdh_ieee_div_4_1_rv64gc", "fdh_ieee_div_4_2_rv32gc",
+    "fdh_ieee_div_4_2_rv64gc", "fdh_ieee_div_4_4_rv32gc", "fdh_ieee_div_4_4_rv64gc",
+    "fd_ieee_div_2_1_rv32gc", "fd_ieee_div_2_1_rv64gc", "fd_ieee_div_2_2_rv32gc",
+    "fd_ieee_div_2_2_rv64gc", "fd_ieee_div_2_4_rv32gc", "fd_ieee_div_2_4_rv64gc",
+    "fd_ieee_div_4_1_rv32gc", "fd_ieee_div_4_1_rv64gc", "fd_ieee_div_4_2_rv32gc",
+    "fd_ieee_div_4_2_rv64gc", "fd_ieee_div_4_4_rv32gc", "fd_ieee_div_4_4_rv64gc",
+    "fdqh_ieee_div_2_1_rv32gc", "fdqh_ieee_div_2_1_rv64gc", "fdqh_ieee_div_2_2_rv32gc",
+    "fdqh_ieee_div_2_2_rv64gc", "fdqh_ieee_div_2_4_rv32gc", "fdqh_ieee_div_2_4_rv64gc",
+    "fdqh_ieee_div_4_1_rv32gc", "fdqh_ieee_div_4_1_rv64gc", "fdqh_ieee_div_4_2_rv32gc",
+    "fdqh_ieee_div_4_2_rv64gc", "fdqh_ieee_div_4_4_rv32gc", "fdqh_ieee_div_4_4_rv64gc",
+    "fdq_ieee_div_2_1_rv32gc", "fdq_ieee_div_2_1_rv64gc", "fdq_ieee_div_2_2_rv32gc",
+    "fdq_ieee_div_2_2_rv64gc", "fdq_ieee_div_2_4_rv32gc", "fdq_ieee_div_2_4_rv64gc",
+    "fdq_ieee_div_4_1_rv32gc", "fdq_ieee_div_4_1_rv64gc", "fdq_ieee_div_4_2_rv32gc",
+    "fdq_ieee_div_4_2_rv64gc", "fdq_ieee_div_4_4_rv32gc", "fdq_ieee_div_4_4_rv64gc",
+    "fh_ieee_div_2_1_rv32gc", "fh_ieee_div_2_1_rv64gc", "fh_ieee_div_2_2_rv32gc",
+    "fh_ieee_div_2_2_rv64gc", "fh_ieee_div_2_4_rv32gc", "fh_ieee_div_2_4_rv64gc",
+    "fh_ieee_div_4_1_rv32gc", "fh_ieee_div_4_1_rv64gc", "fh_ieee_div_4_2_rv32gc",
+    "fh_ieee_div_4_2_rv64gc", "fh_ieee_div_4_4_rv32gc", "fh_ieee_div_4_4_rv64gc",
+    "f_ieee_div_2_1_rv32gc", "f_ieee_div_2_1_rv64gc", "f_ieee_div_2_2_rv32gc",
+    "f_ieee_div_2_2_rv64gc", "f_ieee_div_2_4_rv32gc", "f_ieee_div_2_4_rv64gc",
+    "f_ieee_div_4_1_rv32gc", "f_ieee_div_4_1_rv64gc", "f_ieee_div_4_2_rv32gc",
+    "f_ieee_div_4_2_rv64gc", "f_ieee_div_4_4_rv32gc", "f_ieee_div_4_4_rv64gc"
+    ]
+    for config in intdivconfigs:
+        # fdivremsqrt test case
+        name = "div_drsu"
+        logname = WALLY + "/sim/" + testfloatsim + "/logs/"+config+"_"+name+".log" 
+        fdivremsqrttestcase = TestCase(
+            name=name,
+            variant=config,
+            cmd="wsim --tb testbench_fp " + " " + config + " " + name + " > " + logname,
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.insert(0,fdivremsqrttestcase)
+    for config in nointdivconfigs:
+        # div,sqrt test cases for no integer flavor of divider
+
+        name = "div_drsu"
+        logname = WALLY + "/sim/" + testfloatsim + "/logs/"+config+"_"+name+".log" 
+        divtestcase = TestCase(
+            name=name,
+            variant=config,
+            #cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " div_drsu \n!",
+            cmd="wsim --tb testbench_fp " + " " + config + " " + name + " > " + logname,
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.insert(0,divtestcase)
+
+        name = "sqrt_drsu"
+        logname = WALLY + "/sim/" + testfloatsim + "/logs/"+config+"_"+name+".log" 
+        sqrttestcase = TestCase(
+            name=name,
+            variant=config,
+            #cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " sqrt_drsu \n!",
+            cmd="wsim --tb testbench_fp  " + " " + config + " " + name + " > " + logname,
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.insert(0,sqrttestcase)
+
+import os
+from multiprocessing import Pool, TimeoutError
+
+def search_log_for_text(text, logfile):
+    """Search the given log file for text, returning True if found, else False.
+
+    text:    pattern handed to `grep -e`, so it may be any pattern grep accepts.
+    logfile: path of the file to search.
+
+    grep is started directly from an argument list rather than through a
+    shell command string, so quotes or shell metacharacters in either argument
+    cannot break or alter the command.  Any non-zero grep exit status (no
+    match, or the file could not be read) yields False.
+    """
+    import subprocess  # local import keeps this function self-contained
+    # -q: exit at the first match without printing; "--" ends option parsing
+    # so a path beginning with "-" is still treated as a file name.
+    completed = subprocess.run(
+        ["grep", "-q", "-e", text, "--", logfile],
+        stdout=subprocess.DEVNULL,
+    )
+    return completed.returncode == 0
+
+def run_test_case(config):
+    """Run the given test case, and return 0 if the test succeeds and 1 if it fails.
+
+    config: a TestCase.  Its cmd is run through the shell from regressionDir
+            after any '{}' placeholder is replaced by the logfile path; the
+            test succeeds iff config.grepstr is then found in that logfile.
+    """
+    testfloatsim = "questa" # change to Verilator when Issue #707 about testfloat not running Verilator is resolved
+    logname = WALLY + "/sim/" + testfloatsim + "/logs/"+config.variant+"_"+config.name+".log"
+    cmd = config.cmd.format(logname)
+    os.chdir(regressionDir)
+    os.system(cmd)
+    label = "%s_%s" % (config.variant, config.name)
+    if search_log_for_text(config.grepstr, logname):
+        print(f"{bcolors.OKGREEN}{label}: Success{bcolors.ENDC}")
+        return 0
+    print(f"{bcolors.FAIL}{label}: Failures detected in output{bcolors.ENDC}")
+    print("  Check %s" % logname)
+    return 1
+
+def main():
+    """Run the tests and count the failures.
+
+    Prepares the logs/ and wkdir/ directories under regressionDir, adjusts the
+    list of test cases and the timeout according to the command-line
+    switches, runs every test case in a process pool, and prints a summary.
+
+    Returns the number of test cases that failed or timed out (0 when all
+    passed).
+    """
+    global configs, coverage
+    os.chdir(regressionDir)
+    # Reuse an existing logs directory; only "already exists" is tolerated.
+    os.makedirs("logs", exist_ok=True)
+    # Always start from an empty wkdir; a missing one is not an error.
+    try:
+        shutil.rmtree("wkdir")
+    except FileNotFoundError:
+        pass
+    os.mkdir("wkdir")
+
+    if '-makeTests' in sys.argv:
+        os.chdir(regressionDir)
+        os.system('./make-tests.sh | tee ./logs/make-tests.log')
+
+    if '-all' in sys.argv:
+        TIMEOUT_DUR = 30*7200 # seconds
+        configs.append(getBuildrootTC(boot=True))
+    elif '-buildroot' in sys.argv:
+        TIMEOUT_DUR = 30*7200 # seconds
+        configs=[getBuildrootTC(boot=True)]
+    elif '-coverage' in sys.argv:
+        TIMEOUT_DUR = 20*60 # seconds
+    #   Presently don't run buildroot because it has a different config and can't be merged with the rv64gc coverage.
+    #   Also it is slow to run.
+    #    configs.append(getBuildrootTC(boot=False))
+        os.system('rm -f cov/*.ucdb')
+    elif '-nightly' in sys.argv:
+        TIMEOUT_DUR = 60*1440 # 1 day
+        configs.append(getBuildrootTC(boot=False))
+    elif '-softfloat' in sys.argv:
+        TIMEOUT_DUR = 60*60 # seconds
+    elif '-intdiv' in sys.argv:
+        TIMEOUT_DUR = 60*60 # seconds
+    else:
+        TIMEOUT_DUR = 10*60 # seconds
+        configs.append(getBuildrootTC(boot=False))
+
+    # Scale the number of concurrent processes to the number of test cases, but
+    # max out at a limited number of concurrent processes to not overwhelm the system.
+    # Pool rejects a process count below 1, so never ask for fewer than one worker.
+    num_workers = max(1, min(len(configs), multiprocessing.cpu_count()))
+    num_fail = 0
+    with Pool(processes=num_workers) as pool:
+        # A list of pairs (rather than a dict keyed by test case) keeps every
+        # submission, even if two test cases happen to compare equal.
+        pending = [(config, pool.apply_async(run_test_case, (config,)))
+                   for config in configs]
+        # Results are collected in submission order; the timeout applies to
+        # each individual wait.
+        for (config, result) in pending:
+            try:
+                num_fail += result.get(timeout=TIMEOUT_DUR)
+            except TimeoutError:
+                num_fail += 1
+                print(f"{bcolors.FAIL}%s_%s: Timeout - runtime exceeded %d seconds{bcolors.ENDC}" % (config.variant, config.name, TIMEOUT_DUR))
+
+    # Coverage report
+    if coverage:
+        os.system('make coverage')
+    # Count the number of failures
+    if num_fail:
+        print(f"{bcolors.FAIL}Regression failed with %s failed configurations{bcolors.ENDC}" % num_fail)
+    else:
+        print(f"{bcolors.OKGREEN}SUCCESS! All tests ran without failures{bcolors.ENDC}")
+    return num_fail
+
+if __name__ == '__main__':
+    # Use sys.exit (always available, unlike the site-provided exit helper) and
+    # clamp the failure count: a process exit status keeps only its low 8 bits,
+    # so an unclamped count that is a multiple of 256 would read as success.
+    sys.exit(min(main(), 255))
--- a/bin/wsim
+++ b/bin/wsim
@ -28,6 +28,7 @@ parser.add_argument("--tb", "-t", help="Testbench", choices=["testbench", "testb
 parser.add_argument("--gui", "-g", help="Simulate with GUI", action="store_true")
 parser.add_argument("--ccov", "-c", help="Code Coverage", action="store_true")
 parser.add_argument("--fcov", "-f", help="Functional Coverage, implies lockstep", action="store_true")
+parser.add_argument("--fcov2", "-f2", help="Functional Coverage, implies lockstep", action="store_true")
 parser.add_argument("--fcovrvvi", "-fr", help="Functional Coverage RVVI", action="store_true")
 parser.add_argument("--args", "-a", help="Optional arguments passed to simulator via $value$plusargs", default="")
 parser.add_argument("--vcd", "-v", help="Generate testbench.vcd", action="store_true")
@ -66,7 +67,7 @@ if(args.testsuite.endswith('.elf') and args.elf == ""): # No --elf argument; che


 # Validate arguments
-if (args.gui or args.ccov or args.fcov or args.fcovrvvi or args.lockstep):
+if (args.gui or args.ccov or args.fcov or args.fcov2 or args.fcovrvvi or args.lockstep):
    if args.sim not in ["questa", "vcs"]:
        print("Option only supported for Questa and VCS")
        exit(1)
@ -81,7 +82,7 @@ if (args.rvvi):
 if(int(args.locksteplog) >= 1): EnableLog = 1
 else: EnableLog = 0
 if (args.lockstep):
-    prefix = "IMPERAS_TOOLS=" + WALLY + "/sim/imperas.ic"
+    prefix = "IMPERAS_TOOLS=" + WALLY + "/config/"+args.config+"/imperas.ic"
    if(args.locksteplog != 0): ImperasPlusArgs = " +IDV_TRACE2LOG=" + str(EnableLog) + " +IDV_TRACE2LOG_AFTER=" + str(args.locksteplog) 
    else: ImperasPlusArgs = ""
    if(args.fcov):
@ -90,6 +91,12 @@ if (args.lockstep):
        else: EnableLog = 0
        ImperasPlusArgs = " +IDV_TRACE2COV=" + str(EnableLog) + " +TRACE2LOG_AFTER=" + str(args.covlog) + " +TRACE2COV_ENABLE=" + CovEnableStr;
        suffix = ""
+    if(args.fcov2):
+        CovEnableStr = "1" if int(args.covlog) > 0  else "0";
+        if(args.covlog >= 1): EnableLog = 1
+        else: EnableLog = 0
+        ImperasPlusArgs = " +IDV_TRACE2COV=" + str(EnableLog) + " +TRACE2LOG_AFTER=" + str(args.covlog) + " +TRACE2COV_ENABLE=" + CovEnableStr;
+        suffix = ""
    else:
        CovEnableStr = ""
        suffix = "--lockstep"
@ -104,6 +111,8 @@ if (args.ccov):
    flags += " --ccov"
 if (args.fcov):
    flags += " --fcov"
+if (args.fcov2):
+    flags += " --fcov2"
 if (args.fcovrvvi):
    flags += "--fcovrvvi"

--- a/config/derivlist.txt
+++ b/config/derivlist.txt
@ -950,6 +950,9 @@ D_SUPPORTED     0
 ZCD_SUPPORTED   0
 ZFH_SUPPORTED   0

+deriv f_div_2_8_rv64gc    f_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv f_div_4_1_rv64gc    div_4_1_rv64gc    
 D_SUPPORTED     0
 ZCD_SUPPORTED   0
@ -982,6 +985,9 @@ D_SUPPORTED     0
 ZCD_SUPPORTED   0
 ZFH_SUPPORTED   1

+deriv fh_div_2_8_rv32gc    fh_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fh_div_4_1_rv32gc    div_4_1_rv32gc    
 D_SUPPORTED     0
 ZCD_SUPPORTED   0
@ -1012,6 +1018,9 @@ D_SUPPORTED     0
 ZCD_SUPPORTED   0
 ZFH_SUPPORTED   1

+deriv fh_div_2_8_rv64gc    fh_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fh_div_4_1_rv64gc    div_4_1_rv64gc    
 D_SUPPORTED     0
 ZCD_SUPPORTED   0
@ -1038,6 +1047,9 @@ ZFH_SUPPORTED   0
 deriv fd_div_2_4_rv32gc    div_2_4_rv32gc
 ZFH_SUPPORTED   0

+deriv fd_div_2_8_rv32gc    fd_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fd_div_4_1_rv32gc    div_4_1_rv32gc    
 ZFH_SUPPORTED   0

@ -1056,6 +1068,9 @@ ZFH_SUPPORTED   0
 deriv fd_div_2_4_rv64gc    div_2_4_rv64gc    
 ZFH_SUPPORTED   0

+deriv fd_div_2_8_rv64gc    fd_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fd_div_4_1_rv64gc    div_4_1_rv64gc    
 ZFH_SUPPORTED   0

@ -1077,6 +1092,9 @@ ZFH_SUPPORTED   1
 deriv fdh_div_2_4_rv32gc    div_2_4_rv32gc
 ZFH_SUPPORTED   1

+deriv fdh_div_2_8_rv32gc    fdh_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fdh_div_4_1_rv32gc    div_4_1_rv32gc    
 ZFH_SUPPORTED   1

@ -1095,6 +1113,9 @@ ZFH_SUPPORTED   1
 deriv fdh_div_2_4_rv64gc    div_2_4_rv64gc    
 ZFH_SUPPORTED   1

+deriv fdh_div_2_8_rv64gc    fdh_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fdh_div_4_1_rv64gc    div_4_1_rv64gc    
 ZFH_SUPPORTED   1

@ -1118,6 +1139,9 @@ deriv fdq_div_2_4_rv32gc    div_2_4_rv32gc
 Q_SUPPORTED     1
 ZFH_SUPPORTED   0

+deriv fdq_div_2_8_rv32gc    fdq_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fdq_div_4_1_rv32gc    div_4_1_rv32gc    
 Q_SUPPORTED     1
 ZFH_SUPPORTED   0
@ -1142,6 +1166,9 @@ deriv fdq_div_2_4_rv64gc    div_2_4_rv64gc
 Q_SUPPORTED     1
 ZFH_SUPPORTED   0

+deriv fdq_div_2_8_rv64gc    fdq_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fdq_div_4_1_rv64gc    div_4_1_rv64gc    
 Q_SUPPORTED     1
 ZFH_SUPPORTED   0
@ -1168,6 +1195,9 @@ deriv fdqh_div_2_4_rv32gc    div_2_4_rv32gc
 Q_SUPPORTED     1
 ZFH_SUPPORTED   1

+deriv fdqh_div_2_8_rv32gc    fdqh_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fdqh_div_4_1_rv32gc    div_4_1_rv32gc    
 Q_SUPPORTED     1
 ZFH_SUPPORTED   1
@ -1192,6 +1222,9 @@ deriv fdqh_div_2_4_rv64gc    div_2_4_rv64gc
 Q_SUPPORTED     1
 ZFH_SUPPORTED   1

+deriv fdqh_div_2_8_rv64gc    fdqh_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fdqh_div_4_1_rv64gc    div_4_1_rv64gc    
 Q_SUPPORTED     1
 ZFH_SUPPORTED   1
@ -1215,6 +1248,9 @@ IEEE754         1
 deriv f_ieee_div_2_4_rv32gc    f_div_2_4_rv32gc    
 IEEE754         1

+deriv f_ieee_div_2_8_rv32gc    f_ieee_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv f_ieee_div_4_1_rv32gc    f_div_4_1_rv32gc    
 IEEE754         1

@ -1233,6 +1269,9 @@ IEEE754         1
 deriv f_ieee_div_2_4_rv64gc    f_div_2_4_rv64gc    
 IEEE754         1

+deriv f_ieee_div_2_8_rv64gc    f_ieee_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv f_ieee_div_4_1_rv64gc    f_div_4_1_rv64gc    
 IEEE754         1

@ -1252,6 +1291,9 @@ IEEE754         1
 deriv fh_ieee_div_2_4_rv32gc    fh_div_2_4_rv32gc    
 IEEE754         1

+deriv fh_ieee_div_2_8_rv32gc    fh_ieee_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fh_ieee_div_4_1_rv32gc    fh_div_4_1_rv32gc    
 IEEE754         1

@ -1270,6 +1312,9 @@ IEEE754         1
 deriv fh_ieee_div_2_4_rv64gc    fh_div_2_4_rv64gc    
 IEEE754         1

+deriv fh_ieee_div_2_8_rv64gc    fh_ieee_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fh_ieee_div_4_1_rv64gc    fh_div_4_1_rv64gc    
 IEEE754         1

@ -1289,6 +1334,9 @@ IEEE754         1
 deriv fd_ieee_div_2_4_rv32gc    fd_div_2_4_rv32gc    
 IEEE754         1

+deriv fd_ieee_div_2_8_rv32gc    fd_ieee_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fd_ieee_div_4_1_rv32gc    fd_div_4_1_rv32gc    
 IEEE754         1

@ -1307,6 +1355,9 @@ IEEE754         1
 deriv fd_ieee_div_2_4_rv64gc    fd_div_2_4_rv64gc    
 IEEE754         1

+deriv fd_ieee_div_2_8_rv64gc    fd_ieee_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fd_ieee_div_4_1_rv64gc    fd_div_4_1_rv64gc    
 IEEE754         1

@ -1327,6 +1378,9 @@ IEEE754         1
 deriv fdh_ieee_div_2_4_rv32gc    fdh_div_2_4_rv32gc    
 IEEE754         1

+deriv fdh_ieee_div_2_8_rv32gc    fdh_ieee_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fdh_ieee_div_4_1_rv32gc    fdh_div_4_1_rv32gc    
 IEEE754         1

@ -1345,6 +1399,9 @@ IEEE754         1
 deriv fdh_ieee_div_2_4_rv64gc    fdh_div_2_4_rv64gc    
 IEEE754         1

+deriv fdh_ieee_div_2_8_rv64gc    fdh_ieee_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fdh_ieee_div_4_1_rv64gc    fdh_div_4_1_rv64gc    
 IEEE754         1

@ -1364,6 +1421,9 @@ IEEE754         1
 deriv fdq_ieee_div_2_4_rv32gc    fdq_div_2_4_rv32gc    
 IEEE754         1

+deriv fdq_ieee_div_2_8_rv32gc    fdq_ieee_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fdq_ieee_div_4_1_rv32gc    fdq_div_4_1_rv32gc    
 IEEE754         1

@ -1382,6 +1442,9 @@ IEEE754         1
 deriv fdq_ieee_div_2_4_rv64gc    fdq_div_2_4_rv64gc    
 IEEE754         1

+deriv fdq_ieee_div_2_8_rv64gc    fdq_ieee_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fdq_ieee_div_4_1_rv64gc    fdq_div_4_1_rv64gc    
 IEEE754         1

@ -1402,6 +1465,9 @@ IEEE754         1
 deriv fdqh_ieee_div_2_4_rv32gc    fdqh_div_2_4_rv32gc    
 IEEE754         1

+deriv fdqh_ieee_div_2_8_rv32gc    fdqh_ieee_div_2_4_rv32gc    
+DIVCOPIES       32'd8
+
 deriv fdqh_ieee_div_4_1_rv32gc    fdqh_div_4_1_rv32gc    
 IEEE754         1

@ -1420,6 +1486,9 @@ IEEE754         1
 deriv fdqh_ieee_div_2_4_rv64gc    fdqh_div_2_4_rv64gc    
 IEEE754         1

+deriv fdqh_ieee_div_2_8_rv64gc    fdqh_ieee_div_2_4_rv64gc    
+DIVCOPIES       32'd8
+
 deriv fdqh_ieee_div_4_1_rv64gc    fdqh_div_4_1_rv64gc    
 IEEE754         1

@ -1440,6 +1509,9 @@ IDIV_ON_FPU     1
 deriv f_ieee_div_2_4i_rv32gc f_ieee_div_2_4_rv32gc        
 IDIV_ON_FPU     1

+deriv f_ieee_div_2_8i_rv32gc f_ieee_div_2_4i_rv32gc 
+DIVCOPIES       32'd8
+
 deriv f_ieee_div_4_1i_rv32gc f_ieee_div_4_1_rv32gc        
 IDIV_ON_FPU     1

@ -1458,6 +1530,9 @@ IDIV_ON_FPU     1
 deriv f_ieee_div_2_4i_rv64gc f_ieee_div_2_4_rv64gc        
 IDIV_ON_FPU     1

+deriv f_ieee_div_2_8i_rv64gc f_ieee_div_2_4i_rv64gc 
+DIVCOPIES       32'd8
+
 deriv f_ieee_div_4_1i_rv64gc f_ieee_div_4_1_rv64gc        
 IDIV_ON_FPU     1

@ -1477,6 +1552,9 @@ IDIV_ON_FPU     1
 deriv fh_ieee_div_2_4i_rv32gc fh_ieee_div_2_4_rv32gc        
 IDIV_ON_FPU     1

+deriv fh_ieee_div_2_8i_rv32gc fh_ieee_div_2_4i_rv32gc 
+DIVCOPIES       32'd8
+
 deriv fh_ieee_div_4_1i_rv32gc fh_ieee_div_4_1_rv32gc        
 IDIV_ON_FPU     1

@ -1495,6 +1573,9 @@ IDIV_ON_FPU     1
 deriv fh_ieee_div_2_4i_rv64gc fh_ieee_div_2_4_rv64gc        
 IDIV_ON_FPU     1

+deriv fh_ieee_div_2_8i_rv64gc fh_ieee_div_2_4i_rv64gc 
+DIVCOPIES       32'd8
+
 deriv fh_ieee_div_4_1i_rv64gc fh_ieee_div_4_1_rv64gc        
 IDIV_ON_FPU     1

@ -1515,6 +1596,9 @@ IDIV_ON_FPU     1
 deriv fd_ieee_div_2_4i_rv32gc fd_ieee_div_2_4_rv32gc        
 IDIV_ON_FPU     1

+deriv fd_ieee_div_2_8i_rv32gc fd_ieee_div_2_4i_rv32gc 
+DIVCOPIES       32'd8
+
 deriv fd_ieee_div_4_1i_rv32gc fd_ieee_div_4_1_rv32gc        
 IDIV_ON_FPU     1

@ -1533,6 +1617,9 @@ IDIV_ON_FPU     1
 deriv fd_ieee_div_2_4i_rv64gc fd_ieee_div_2_4_rv64gc        
 IDIV_ON_FPU     1

+deriv fd_ieee_div_2_8i_rv64gc fd_ieee_div_2_4i_rv64gc 
+DIVCOPIES       32'd8
+
 deriv fd_ieee_div_4_1i_rv64gc fd_ieee_div_4_1_rv64gc        
 IDIV_ON_FPU     1

@ -1553,6 +1640,9 @@ IDIV_ON_FPU     1
 deriv fdh_ieee_div_2_4i_rv32gc fdh_ieee_div_2_4_rv32gc        
 IDIV_ON_FPU     1

+deriv fdh_ieee_div_2_8i_rv32gc fdh_ieee_div_2_4i_rv32gc 
+DIVCOPIES       32'd8
+
 deriv fdh_ieee_div_4_1i_rv32gc fdh_ieee_div_4_1_rv32gc        
 IDIV_ON_FPU     1

@ -1571,6 +1661,9 @@ IDIV_ON_FPU     1
 deriv fdh_ieee_div_2_4i_rv64gc fdh_ieee_div_2_4_rv64gc        
 IDIV_ON_FPU     1

+deriv fdh_ieee_div_2_8i_rv64gc fdh_ieee_div_2_4i_rv64gc 
+DIVCOPIES       32'd8
+
 deriv fdh_ieee_div_4_1i_rv64gc fdh_ieee_div_4_1_rv64gc        
 IDIV_ON_FPU     1

@ -1591,6 +1684,9 @@ IDIV_ON_FPU     1
 deriv fdq_ieee_div_2_4i_rv32gc fdq_ieee_div_2_4_rv32gc        
 IDIV_ON_FPU     1

+deriv fdq_ieee_div_2_8i_rv32gc fdq_ieee_div_2_4i_rv32gc 
+DIVCOPIES       32'd8
+
 deriv fdq_ieee_div_4_1i_rv32gc fdq_ieee_div_4_1_rv32gc        
 IDIV_ON_FPU     1

@ -1609,6 +1705,9 @@ IDIV_ON_FPU     1
 deriv fdq_ieee_div_2_4i_rv64gc fdq_ieee_div_2_4_rv64gc        
 IDIV_ON_FPU     1

+deriv fdq_ieee_div_2_8i_rv64gc fdq_ieee_div_2_4i_rv64gc 
+DIVCOPIES       32'd8
+
 deriv fdq_ieee_div_4_1i_rv64gc fdq_ieee_div_4_1_rv64gc        
 IDIV_ON_FPU     1

@ -1629,6 +1728,9 @@ IDIV_ON_FPU     1
 deriv fdqh_ieee_div_2_4i_rv32gc fdqh_ieee_div_2_4_rv32gc        
 IDIV_ON_FPU     1

+deriv fdqh_ieee_div_2_8i_rv32gc fdqh_ieee_div_2_4i_rv32gc 
+DIVCOPIES       32'd8
+
 deriv fdqh_ieee_div_4_1i_rv32gc fdqh_ieee_div_4_1_rv32gc        
 IDIV_ON_FPU     1

@ -1647,6 +1749,9 @@ IDIV_ON_FPU     1
 deriv fdqh_ieee_div_2_4i_rv64gc fdqh_ieee_div_2_4_rv64gc        
 IDIV_ON_FPU     1

+deriv fdqh_ieee_div_2_8i_rv64gc fdqh_ieee_div_2_4i_rv64gc 
+DIVCOPIES       32'd8
+
 deriv fdqh_ieee_div_4_1i_rv64gc fdqh_ieee_div_4_1_rv64gc        
 IDIV_ON_FPU     1

--- a/config/rv32gc/imperas.ic
+++ b/config/rv32gc/imperas.ic
@ -9,6 +9,7 @@
 #--showcommands

 # Core settings
+--variant RV32GC # for RV32GC
 --override cpu/priv_version=1.12 
 --override cpu/user_version=20191213
 # arch
@ -38,11 +39,12 @@
 --override lr_sc_grain=8   # Za64rs requires <=64; we use native word size

 # 64 KiB continuous huge pages supported
--override cpu/Svpbmt=T
--override cpu/Svnapot_page_mask=65536
+#--override cpu/Svpbmt=F
+#--override cpu/Svnapot_page_mask=65536

-# SV39 and SV48 supported
--override cpu/Sv_modes=768
+# SV32 supported
+--override cpu/Sv_modes=3
+#--showoverrides

 --override cpu/Svinval=T

@ -59,7 +61,7 @@

 --override cpu/reset_address=0x80000000

--override cpu/unaligned=T  # Zicclsm (should be true)
+--override cpu/unaligned=F  # Zicclsm (should be true)
 --override cpu/ignore_non_leaf_DAU=1
 --override cpu/wfi_is_nop=T
 --override cpu/misa_Extensions_mask=0x0 # MISA not writable
@ -74,7 +76,7 @@
 --override cpu/PMP_undefined=T

 # mstatus.FS is set dirty on any write to a FPR, or when a fp operation signals an exception
--override cpu/mstatus_fs_mode=rvfs_write_nz
+--override cpu/mstatus_fs_mode=write_1

 # PMA Settings 
 # 'r': read access allowed
--- a/config/rv64gc/imperas.ic
+++ b/config/rv64gc/imperas.ic
@ -0,0 +1,117 @@
+# imperas.ic
+# Initialization file for ImperasDV lock step simulation
+# David_Harris@hmc.edu 15 August 2024
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+
+#--mpdconsole
+#--gdbconsole
+#--showoverrides
+#--showcommands
+
+# Core settings
+--override cpu/priv_version=1.12 
+--override cpu/user_version=20191213
+# arch
+--override cpu/mimpid=0x100
+--override cpu/mvendorid=0x602
+--override cpu/marchid=0x24
+--override refRoot/cpu/tvec_align=64
+--override refRoot/cpu/envcfg_mask=1   # dh 1/26/24 this should be deleted when ImperasDV is updated to allow envcfg.FIOM to be written
+
+# bit manipulation
+--override cpu/add_Extensions=B 
+--override cpu/bitmanip_version=1.0.0
+--override cpu/misa_B_Zba_Zbb_Zbs=T
+
+# More extensions
+--override cpu/Zcb=T
+--override cpu/Zicond=T
+--override cpu/Zfh=T
+--override cpu/Zfa=T
+
+# Cache block operations
+--override cpu/Zicbom=T
+--override cpu/Zicbop=T
+--override cpu/Zicboz=T
+--override cmomp_bytes=64  # Zic64b
+--override cmoz_bytes=64   # Zic64b
+--override lr_sc_grain=8   # Za64rs requires <=64; we use native word size
+
+# 64 KiB continuous huge pages supported
+--override cpu/Svpbmt=T
+--override cpu/Svnapot_page_mask=65536
+
+# SV39 and SV48 supported
+--override cpu/Sv_modes=768
+
+--override cpu/Svinval=T
+
+
+#  clarify
+#--override refRoot/cpu/mtvec_sext=F
+
+--override cpu/tval_ii_code=T
+
+#--override cpu/time_undefined=T
+#--override cpu/cycle_undefined=T
+#--override cpu/instret_undefined=T
+#--override cpu/hpmcounter_undefined=T
+
+--override cpu/reset_address=0x80000000
+
+--override cpu/unaligned=T  # Zicclsm (should be true)
+--override cpu/ignore_non_leaf_DAU=1
+--override cpu/wfi_is_nop=T
+--override cpu/misa_Extensions_mask=0x0 # MISA not writable
+--override cpu/Sstc=T
+
+# Enable SVADU hardware update of A/D bits when menvcfg.ADUE=1
+--override cpu/Svadu=T
+#--override cpu/updatePTEA=F
+#--override cpu/updatePTED=F
+
+--override cpu/PMP_registers=16
+--override cpu/PMP_undefined=T
+
+# mstatus.FS is set dirty on any write to a FPR, or when a fp operation signals an exception
+--override cpu/mstatus_fs_mode=write_1
+
+# PMA Settings 
+# 'r': read access allowed
+# 'w': write access allowed
+# 'x': execute access allowed
+# 'a': aligned access required
+# 'A': atomic instructions NOT allowed (actually USER1 privilege needed)
+# 'P': push/pop instructions NOT allowed (actually USER2 privilege needed)
+# '1': 1-byte accesses allowed
+# '2': 2-byte accesses allowed
+# '4': 4-byte accesses allowed
+# '8': 8-byte accesses allowed
+# '-', space: ignored (use for input string formatting).
+#
+# SVxx Memory 0x0000000000 0x7FFFFFFFFF
+#
+--callcommand refRoot/cpu/setPMA -lo 0x0000000000 -hi 0xFFFFFFFFFFFFFFFFFF -attributes " ---a-- ---- " # All memory inaccessible unless defined otherwise
+--callcommand refRoot/cpu/setPMA -lo 0x0000000000 -hi 0x7FFFFFFFFF -attributes " ---a-- ---- " # INITIAL
+--callcommand refRoot/cpu/setPMA -lo 0x0000001000 -hi 0x0000001FFF -attributes " r-x-A- 1248 " # BOOTROM
+--callcommand refRoot/cpu/setPMA -lo 0x0000012100 -hi 0x000001211F -attributes " rw-aA- --48 " # SDC
+--callcommand refRoot/cpu/setPMA -lo 0x0002000000 -hi 0x000200FFFF -attributes " rw-aA- 1248 " # CLINT
+--callcommand refRoot/cpu/setPMA -lo 0x000C000000 -hi 0x000FFFFFFF -attributes " rw-aA- --4- " # PLIC
+--callcommand refRoot/cpu/setPMA -lo 0x0010000000 -hi 0x0010000007 -attributes " rw-aA- 1--- " # UART0 error - 0x10000000 - 0x100000FF
+--callcommand refRoot/cpu/setPMA -lo 0x0010060000 -hi 0x00100600FF -attributes " rw-aA- --4- " # GPIO  error - 0x10060000 - 0x100600FF
+--callcommand refRoot/cpu/setPMA -lo 0x0010040000 -hi 0x0010040FFF -attributes " rw-aA- --4- " # SPI   error - 0x10040000 - 0x10040FFF
+--callcommand refRoot/cpu/setPMA -lo 0x0080000000 -hi 0x008FFFFFFF -attributes " rwx--- 1248 " # UNCORE_RAM
+
+# Enable the Imperas instruction coverage
+#-extlib    refRoot/cpu/cv=imperas.com/intercept/riscvInstructionCoverage/1.0
+#-override  refRoot/cpu/cv/cover=basic
+#-override  refRoot/cpu/cv/extensions=RV32I
+
+# Add Imperas simulator application instruction tracing
+# uncomment these to provide tracing
+#--verbose --trace --tracechange --traceshowicount --tracemode -tracemem ASX --monitornetschange # --traceafter 300000000
+#--override cpu/debugflags=6 --override cpu/verbose=1
+#--override cpu/show_c_prefix=T
+
+# Store simulator output to logfile
+--output imperas.log
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@ -123,6 +123,10 @@ localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (FMALEN

 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));                  // log_2(NORMSHIFTSZ)

+localparam CORRSHIFTSZ = `max((NORMSHIFTSZ-2), (DIVMINb + 1 + NF));
+localparam NORMSHIFTSZDRSU = DIVb+1+NF;                             // shift width for the divremsqrt path (sizes DivShiftIn in divremsqrtdivshiftcalc)
+localparam LOGNORMSHIFTSZDRSU = $clog2(NORMSHIFTSZDRSU);            // log_2(NORMSHIFTSZDRSU)
+
 // Disable spurious Verilator warnings

 /* verilator lint_off STMTDLY */
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@ -194,6 +194,8 @@ localparam cvw_t P = '{
  FMALEN : FMALEN,
  NORMSHIFTSZ : NORMSHIFTSZ,
  LOGNORMSHIFTSZ : LOGNORMSHIFTSZ,
+  NORMSHIFTSZDRSU : NORMSHIFTSZDRSU,
+  LOGNORMSHIFTSZDRSU : LOGNORMSHIFTSZDRSU,
  LOGR        : LOGR,
  RK          : RK,
  FPDUR       : FPDUR,
--- a/fpga/zsbl/boot.c
+++ b/fpga/zsbl/boot.c
@ -52,6 +52,42 @@ when 8 bytes are transferred

 */

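+// CRC16 lookup table (polynomial 0x1021, processed most-significant bit first): one entry per byte value, so the CRC is updated a byte at a time instead of bit by bit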
+static const uint16_t crctable[256] = {
+  0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, 
+  0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, 
+  0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6, 
+  0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de, 
+  0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485, 
+  0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d, 
+  0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4, 
+  0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc, 
+  0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823, 
+  0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b, 
+  0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12, 
+  0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a, 
+  0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41, 
+  0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49, 
+  0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70, 
+  0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78, 
+  0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f, 
+  0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067, 
+  0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e, 
+  0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256, 
+  0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d, 
+  0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, 
+  0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c, 
+  0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634, 
+  0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab, 
+  0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3, 
+  0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a, 
+  0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92, 
+  0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9, 
+  0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1, 
+  0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8, 
+  0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0 
+};
+
 int disk_read(BYTE * buf, LBA_t sector, UINT count) {
  uint64_t r;
  UINT i, j;
@ -86,6 +122,7 @@ int disk_read(BYTE * buf, LBA_t sector, UINT count) {
  for (i = 0; i < count; i++) {
    uint16_t crc, crc_exp;
    uint64_t n = 0;
+    uint64_t readCount = 0;

    // Wait for data token
    while((r = spi_dummy()) != SD_DATA_TOKEN);
@ -98,21 +135,45 @@ int disk_read(BYTE * buf, LBA_t sector, UINT count) {
    /*   crc = crc16(crc, x); */
    /* } while (--n > 0); */

-    n = 512/8;
-    do {
-      // Send 8 dummy bytes (fifo should be empty)
-      for (j = 0; j < 8; j++) {
+    /* n = 512/8; */
+    /* do { */
+    /*   // Send 8 dummy bytes (fifo should be empty) */
+    /*   for (j = 0; j < 8; j++) { */
+    /*     spi_sendbyte(0xff); */
+    /*   } */
+
+    /*   // Reset counter. Process bytes AS THEY COME IN. */
+    /*   for (j = 0; j < 8; j++) { */
+    /*     while (!(read_reg(SPI_IP) & 2)) {} */
+    /*     uint8_t x = spi_readbyte(); */
+    /*     *p++ = x; */
+    /*     // crc = crc16(crc, x); */
+    /*     crc = ((crc << 8) ^ crctable[x ^ (crc >> 8)]) & 0xffff; */
+    /*   } */
+    /* } while(--n > 0); */
+
+    n = 512;
+    // Initially fill the transmit fifo
+    for (j = 0; j < 8; j++) {
+      spi_sendbyte(0xff);
+    }
+
+    
+    while (n > 0) {
+      // Wait for bytes to be received
+      while (!(read_reg(SPI_IP) & 2)) {}
+      // Read byte
+      uint8_t x = spi_readbyte();
+      // Send another dummy byte
+      if (n > 8) {
        spi_sendbyte(0xff);
      }
-
-      // Reset counter. Process bytes AS THEY COME IN.
-      for (j = 0; j < 8; j++) {
-        while (!(read_reg(SPI_IP) & 2)) {}
-        uint8_t x = spi_readbyte();
-        *p++ = x;
-        crc = crc16(crc, x);
-      }
-    } while(--n > 0);
+      // Place received byte into memory
+      *p++ = x;
+      // Update CRC16 with fast table based method
+      crc = ((crc << 8) ^ crctable[x ^ (crc >> 8)]) & 0xffff;
+      n = n - 1;
+    }
    
    // Read CRC16 and check
    crc_exp = ((uint16_t)spi_dummy() << 8);
--- a/fpga/zsbl/spi.h
+++ b/fpga/zsbl/spi.h
@ -1,3 +1,32 @@
+///////////////////////////////////////////////////////////////////////
+// spi.h
+//
+// Written: Jacob Pease jacob.pease@okstate.edu 7/22/2024
+//
+// Purpose: Header file for interfacing with the SPI peripheral
+//
+// 
+//
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the
+// “License”); you may not use this file except in compliance with the
+// License, or, at your option, the Apache License version 2.0. You
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work
+// distributed under the License is distributed on an “AS IS” BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
 #pragma once
 #ifndef SPI_HEADER
 #define SPI_HEADER
--- a/sim/questa/wally.do
+++ b/sim/questa/wally.do
@ -149,10 +149,32 @@ if {$FunctCoverageIndex >= 0} {
    set lst [lreplace $lst $FunctCoverageIndex $FunctCoverageIndex]
 }\

+# --fcov2 handling: if the flag is present in $lst, turn on functional coverage
+# (FunctCoverage = 1), define the riscvISACOV include path and the coverage
+# +define/+plusarg variables, then remove the flag from $lst.
+# Of the per-extension cover defines only COVER_RV64I is set here; the others
+# are left commented out.
+set FunctCoverageIndex2 [lsearch -exact $lst "--fcov2"]
+if {$FunctCoverageIndex2 >= 0} {
+    set FunctCoverage 1
+    set riscvISACOVsrc +incdir+$env(IMPERAS_HOME)/ImpProprietary/source/host/riscvISACOV/source
+
+    set FCdefineINCLUDE_TRACE2COV "+define+INCLUDE_TRACE2COV"
+    set FCdefineCOVER_BASE_RV64I "+define+COVER_BASE_RV64I"
+    set FCdefineCOVER_LEVEL_DV_PR_EXT  "+define+COVER_LEVEL_DV_PR_EXT"
+    # Uncomment various cover statements below to control which extensions get functional coverage
+    set FCdefineCOVER_RV64I "+define+COVER_RV64I"
+    #set FCdefineCOVER_RV64M "+define+COVER_RV64M"
+    #set FCdefineCOVER_RV64A "+define+COVER_RV64A"
+    #set FCdefineCOVER_RV64F "+define+COVER_RV64F"
+    #set FCdefineCOVER_RV64D "+define+COVER_RV64D"
+    #set FCdefineCOVER_RV64ZICSR "+define+COVER_RV64ZICSR"
+    #set FCdefineCOVER_RV64C "+define+COVER_RV64C"
+    set FCdefineIDV_INCLUDE_TRACE2COV "+define+IDV_INCLUDE_TRACE2COV"
+    set FCTRACE2COV "+TRACE2COV_ENABLE=1"
+    set FCdefineIDV_TRACE2COV "+IDV_TRACE2COV=1"
+    # Drop --fcov2 from the argument list now that it has been consumed
+    set lst [lreplace $lst $FunctCoverageIndex2 $FunctCoverageIndex2]
+}\
+ 
 set LockStepIndex [lsearch -exact $lst "--lockstep"]
 # ugh.  can't have more than 9 arguments passed to vsim. why? I'll have to remove --lockstep when running
 # functional coverage and imply it.
-if {$LockStepIndex >= 0 || $FunctCoverageIndex >= 0} {
+if {$LockStepIndex >= 0 || $FunctCoverageIndex >= 0 || $FunctCoverageIndex2 >= 0} {
    set lockstep 1

    # ideally this would all be one or two variables, but questa is having a real hard time
--- a/site-setup.sh
+++ b/site-setup.sh
@ -11,6 +11,7 @@
 # Must edit these based on your local environment.
 export MGLS_LICENSE_FILE=27002@zircon.eng.hmc.edu                   # Change this to your Siemens license server for Questa
 export SNPSLMD_LICENSE_FILE=27020@zircon.eng.hmc.edu                # Change this to your Synopsys license server
+export IMPERASD_LICENSE_FILE=27020@zircon.eng.hmc.edu               # Change this to your Imperas license server
 export QUESTA_HOME=/cad/mentor/questa_sim-2023.4/questasim          # Change this for your path to Questa, excluding bin
 export DC_HOME=/cad/synopsys/SYN                                    # Change this for your path to Synopsys Design Compiler, excluding bin
 export VCS_HOME=/cad/synopsys/vcs/U-2023.03-SP2-4                   # Change this for your path to Synopsys VCS, excluding bin
--- a/src/cvw.sv
+++ b/src/cvw.sv
@ -285,6 +285,8 @@ typedef struct packed {
  int LOGCVTLEN;
  int NORMSHIFTSZ;
  int LOGNORMSHIFTSZ;
+  int NORMSHIFTSZDRSU;
+  int LOGNORMSHIFTSZDRSU;
  int FMALEN;

 // division constants
--- a/src/fpu/divremsqrt/arithrightshift.sv
+++ b/src/fpu/divremsqrt/arithrightshift.sv
@ -0,0 +1,9 @@
+
+// arithrightshift: arithmetic right shift of a (P.INTDIVb+4)-bit value by P.LOGR bit positions.
+// Both ports are declared signed, so >>> fills the vacated upper bits with the sign bit of shiftin.
+module arithrightshift import cvw::*;  #(parameter cvw_t P) (
+  input logic signed [P.INTDIVb+3:0] shiftin,    // value to be shifted
+  output logic signed [P.INTDIVb+3:0] shifted    // shiftin >>> P.LOGR
+);
+  assign shifted = $signed(shiftin) >>> P.LOGR;
+
+endmodule
+
--- a/src/fpu/divremsqrt/divremsqrt.sv
+++ b/src/fpu/divremsqrt/divremsqrt.sv
@ -0,0 +1,110 @@
+///////////////////////////////////////////
+// divremsqrt.sv
+//
+// Written: kekim@hmc.edu
+// Modified:19 May 2023
+//
+// Purpose: Combined Divide and Square Root Floating Point and Integer Unit with postprocessing
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// divremsqrt: structural wrapper that wires together four submodules:
+//   divremsqrtfdivsqrtpreproc  - takes the operands and connects to X, D, CyclesE, UeM and the integer side-band signals
+//   fdivsqrtfsm                - connects to the FDivBusyE / IFDivStartE / FDivDoneE handshake and SpecialCaseM
+//   fdivsqrtiter               - connects X and D to FirstU / FirstUM / FirstC and the residual WS / WC
+//   divremsqrtfdivsqrtpostproc - connects to UmM, DivStickyM, WZeroE, FIntDivResultM and PreResultM
+// There is no logic in this module beyond the instantiations.
+// NOTE(review): DivStartE, nM, mM and NegQuotM are declared below but not connected to anything here.
+ module divremsqrt import cvw::*;  #(parameter cvw_t P) (
+  input  logic                clk, 
+  input  logic                reset, 
+  input  logic [P.FMTBITS-1:0] FmtE,
+  input  logic                XsE,
+  input  logic [P.NF:0]        XmE, YmE,
+  input  logic [P.NE-1:0]      XeE, YeE,
+  input  logic                XInfE, YInfE, 
+  input  logic                XZeroE, YZeroE, 
+  input  logic                XNaNE, YNaNE, 
+  input  logic                FDivStartE, IDivStartE,
+  input  logic                StallM,
+  input  logic                FlushE,
+  input  logic                SqrtE, SqrtM,
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+  input  logic [2:0]          Funct3E, Funct3M,
+  input  logic                IntDivE, W64E,
+  output logic                DivStickyM,
+  output logic                FDivBusyE, IFDivStartE, FDivDoneE,
+  output logic [P.NE+1:0]      UeM,
+  output logic [P.DIVb:0]      UmM,
+  output logic [P.XLEN-1:0]    FIntDivResultM,
+  output logic                 IntDivM,
+  // integer normalization shifter signals
+  // PreResultM leaves this module and PreIntResultM comes back in (note it is an input);
+  // both connect only to the postprocessor. Presumably the shift between them is done
+  // by logic outside this module - confirm against the instantiating unit.
+  output logic [P.INTDIVb+3:0]          PreResultM,
+  input logic [P.XLEN-1:0]          PreIntResultM,
+  output logic [P.DIVBLEN-1:0]       IntNormShiftM
+
+);
+
+  // Floating-point division and square root module, with optional integer division and remainder
+  // Computes X/Y, sqrt(X), A/B, or A%B
+
+  logic [P.DIVb+3:0]           WS, WC;                       // Partial remainder components
+  logic [P.DIVb+3:0]           X;                            // Iterator Initial Value (from dividend)
+  logic [P.DIVb+3:0]           D;                            // Iterator Divisor
+  logic [P.DIVb:0]             FirstU, FirstUM;              // Intermediate result values
+  logic [P.DIVb+1:0]           FirstC;                       // Step tracker
+  logic                       WZeroE;                       // Early termination flag
+  logic [P.DURLEN:0]         CyclesE;                      // FSM cycles
+  logic                       SpecialCaseM;                 // Divide by zero, square root of negative, etc.
+  logic                       DivStartE;                    // Enable signal for flops during stall
+                                                            
+  // Integer div/rem signals                                
+  logic                       BZeroM;                       // Denominator is zero
+  logic [P.DIVBLEN:0]          nM, mM;                       // Shift amounts
+  logic                       NegQuotM, ALTBM, AsM, BsM, W64M, SIGNOVERFLOWM, ZeroDiffM;   // Special handling for postprocessor
+  logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
+  logic                       ISpecialCaseE;                // Integer div/remainder special cases
+
+
+  divremsqrtfdivsqrtpreproc #(P) divremsqrtfdivsqrtpreproc(                          // Preprocessor
+    .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE),
+    .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
+    // Int-specific 
+    .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
+    .BZeroM, .AM, 
+    .IntDivM, .W64M, .ALTBM, .AsM, .BsM, .IntNormShiftM, .SIGNOVERFLOWM, .ZeroDiffM);
+
+  fdivsqrtfsm #(P) fdivsqrtfsm(                                  // FSM
+    .clk, .reset, .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, 
+    .FDivStartE, .XsE, .SqrtE, .WZeroE, .FlushE, .StallM, 
+    .FDivBusyE, .IFDivStartE, .FDivDoneE, .SpecialCaseM, .CyclesE,
+    // Int-specific 
+    .IDivStartE, .ISpecialCaseE, .IntDivE);
+
+  // The iterator's FirstWS/FirstWC ports are connected to the local WS/WC nets used by the postprocessor
+  fdivsqrtiter #(P) fdivsqrtiter(                                // CSA Iterator
+    .clk, .IFDivStartE, .FDivBusyE, .SqrtE, .X, .D, 
+    .FirstU, .FirstUM, .FirstC, .FirstWS(WS), .FirstWC(WC));
+
+  // RemOpM is tied to bit 1 of Funct3M
+  divremsqrtfdivsqrtpostproc #(P) fdivsqrtpostproc(                        // Postprocessor
+    .clk, .reset, .StallM, .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, 
+    .SqrtE, .SqrtM, .SpecialCaseM, 
+    .UmM, .WZeroE, .DivStickyM, 
+    // Int-specific 
+    .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
+    .FIntDivResultM,  .PreResultM, .PreIntResultM, .SIGNOVERFLOWM, .ZeroDiffM, .IntDivM, .IntNormShiftM);
+  
+  
+endmodule
+
--- a/src/fpu/divremsqrt/divremsqrtdivshiftcalc.sv
+++ b/src/fpu/divremsqrt/divremsqrtdivshiftcalc.sv
@ -0,0 +1,73 @@
+///////////////////////////////////////////
+// divshiftcalc.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: Division shift calculation
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module divremsqrtdivshiftcalc import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.NF+2:0]                  DivUm,              // significand produced by divide/sqrt
+  input  logic [P.NE+1:0]                  DivUe,              // exponent produced by divide/sqrt
+  output logic [P.LOGNORMSHIFTSZDRSU-1:0]  DivShiftAmt,        // left-shift amount for the normalization shifter
+  output logic [P.NORMSHIFTSZDRSU-1:0]     DivShiftIn,         // significand pre-positioned for the normalization shifter
+  output logic                             DivResSubnorm,      // result is subnormal (DivUe negative or zero)
+  output logic                             DivSubnormShiftPos  // subnormal shift amount (NF + DivUe) is not negative
+);
+
+  logic                                    UeNeg, UeZero;        // DivUe has its top bit set / DivUe is all zeros
+  logic [P.NE+1:0]                         SubnormShiftFull;     // NF + DivUe at full exponent width
+  logic [P.LOGNORMSHIFTSZDRSU-1:0]         SubnormShiftClamped;  // NF + DivUe, replaced by zero when negative
+
+  // Shifter input: NF zeros above DivUm, zeros below to fill the remaining shifter width.
+  //  00000000x.xxxxxx...                     Exp = DivUe
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
+  assign DivShiftIn = {{P.NF{1'b0}}, DivUm, {(P.NORMSHIFTSZDRSU-2*P.NF-3){1'b0}}};
+
+  // Subnormal detection: the top bit of DivUe is set, or every bit of DivUe is zero.
+  assign UeNeg         = DivUe[P.NE+1];
+  assign UeZero        = ~|DivUe[P.NE+1:0];
+  assign DivResSubnorm = UeNeg | UeZero;
+
+  // Subnormal result:
+  //  .00xxxxxxxxxxxxx... << DivUe+NF+1       Exp = +1
+  //  .0000xxxxxxxxxxx... >> 1                Exp = 1
+  // Net left shift = DivUe+NF+1-1 = NF + DivUe
+  assign SubnormShiftFull   = DivUe + (P.NE+2)'(P.NF);
+  assign DivSubnormShiftPos = ~SubnormShiftFull[P.NE+1];
+
+  // A negative subnormal shift is replaced by zero (no left shift, so the sticky information is kept).
+  assign SubnormShiftClamped = DivSubnormShiftPos ? SubnormShiftFull[P.LOGNORMSHIFTSZDRSU-1:0] : '0;
+
+  // Normalized result:
+  //  00000000.xxxxxxx... << NF               Exp = DivUe+1
+  //  00000000x.xxxxxx... << NF               Exp = DivUe (extra shift done afterwards)
+  //  00000000xx.xxxxx... << 1?               Exp = DivUe-1 (determined after)
+  // so the initial left shift is the constant NF.
+  assign DivShiftAmt = DivResSubnorm ? SubnormShiftClamped : (P.LOGNORMSHIFTSZDRSU)'(P.NF);
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtearlyterm.sv
+++ b/src/fpu/divremsqrt/divremsqrtearlyterm.sv
@ -0,0 +1,27 @@
+// Early-termination detect: WZeroE is raised when WS+WC is reported zero, or (radix 2 only)
+// when WS+WC+F is reported zero, where F is chosen between a divide and a square-root form.
+module divremsqrtearlyterm import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.DIVb+3:0]    WS, WC,            // Q4.DIVb
+  input  logic [P.DIVb+3:0]    D,                 // Q4.DIVb
+  input  logic [P.DIVb:0]      FirstUM,           // U1.DIVb
+  input  logic [P.DIVb+1:0]    FirstC,            // Q2.DIVb
+  input  logic                 SqrtE,             // select the square-root F (1) or the divide F (0)
+  output logic                 WZeroE             // zero detected
+);
+  logic ResidualZero;       // zero flag from aplusbeq0 on (WS, WC)
+  logic ResidualPlusFZero;  // zero flag from aplusbeq0 on the carry-save sum of WS, WC and F
+
+  aplusbeq0 #(P.DIVb+4) wspluswceq0(WS, WC, ResidualZero);
+
+  if (P.RADIX == 2) begin: R2EarlyTerm
+    logic [P.DIVb+2:0] CExt, FirstK;
+    logic [P.DIVb+3:0] FDiv, FSqrt, FSel;
+    logic [P.DIVb+3:0] SumWithF, CarryWithF;
+
+    // FirstK keeps each 1 of {1, FirstC} whose lower neighbour bit is 0
+    assign CExt   = {1'b1, FirstC};
+    assign FirstK = CExt & ~(CExt << 1);
+
+    assign FDiv  = D << 1;                                              // F for divide
+    assign FSqrt = {FirstUM[P.DIVb], FirstUM, 2'b0} | {FirstK, 1'b0};   // F for square root
+    mux2 #(P.DIVb+4) fzeromux(FDiv, FSqrt, SqrtE, FSel);
+
+    // {CarryWithF, SumWithF} = WS + WC + F in carry-save form, then test that sum for zero
+    csa #(P.DIVb+4) fadd(WS, WC, FSel, 1'b0, SumWithF, CarryWithF);
+    aplusbeq0 #(P.DIVb+4) wcfpluswsfeq0(CarryWithF, SumWithF, ResidualPlusFZero);
+  end else begin
+    // other radices only use the WS+WC test
+    assign ResidualPlusFZero = 1'b0;
+  end
+
+  assign WZeroE = ResidualZero | ResidualPlusFZero;
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtfdivsqrtcycles.sv
+++ b/src/fpu/divremsqrt/divremsqrtfdivsqrtcycles.sv
@ -0,0 +1,83 @@
+///////////////////////////////////////////
+// fdivsqrtcycles.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu, amaiuolo@hmc.edu
+// Modified: 18 April 2022
+//
+// Purpose: Determine number of cycles for divsqrt
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module divremsqrtfdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.FMTBITS-1:0] FmtE,              // floating-point format select
+  input  logic                 SqrtE,             // not read by this module
+  input  logic                 IntDivE,           // integer divide selected
+  input  logic [P.DIVBLEN-1:0] IntResultBitsE,    // result bits needed by an integer divide
+  output logic [P.DURLEN:0]    CyclesE            // number of cycles = ceil(ResultBitsE / RK)
+);
+
+  logic [P.DIVBLEN-1:0] Nf;             // number of fraction bits of the selected format
+  logic [P.DIVBLEN-1:0] FPResultBitsE;  // result bits needed by a floating-point operation
+  logic [P.DIVBLEN-1:0] ResultBitsE;    // result bits needed by the selected operation
+
+  /* verilator lint_off WIDTH */
+
+  // Fraction width lookup; the shape depends on how many formats are configured
+  if (P.FPSIZES == 1) begin: onefmt
+    assign Nf = P.NF;
+  end else if (P.FPSIZES == 2) begin: twofmt
+    always_comb
+      case (FmtE)
+        1'b1: Nf = P.NF;
+        1'b0: Nf = P.NF1;
+      endcase
+  end else if (P.FPSIZES == 3) begin: threefmt
+    always_comb
+      case (FmtE)
+        P.FMT:   Nf = P.NF;
+        P.FMT1:  Nf = P.NF1;
+        P.FMT2:  Nf = P.NF2;
+        default: Nf = 'x; // shouldn't happen
+      endcase
+  end else if (P.FPSIZES == 4) begin: fourfmt
+    always_comb
+      case (FmtE)
+        P.S_FMT: Nf = P.S_NF;
+        P.D_FMT: Nf = P.D_NF;
+        P.H_FMT: Nf = P.H_NF;
+        P.Q_FMT: Nf = P.Q_NF;
+      endcase
+  end
+
+  // Cycle count
+  // P.DIVCOPIES = k.  P.LOGR = log2(R) = r.  P.RK = r*k result bits produced per cycle.
+  // Integer division:   IntResultBitsE is supplied by the caller.
+  // Floating point:     Nf fraction bits + 2 guard/round bits + r bits.
+  // Cycles = ceil(ResultBitsE / rk), computed as (ResultBitsE-1)/rk + 1.
+
+  assign FPResultBitsE = Nf + 2 + P.LOGR;
+
+  if (P.IDIV_ON_FPU) begin: intsel  // integer divide shares this unit: choose per operation
+    assign ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
+  end else begin: fponly
+    assign ResultBitsE = FPResultBitsE;
+  end
+
+  assign CyclesE = (ResultBitsE-1)/(P.RK) + 1;
+
+  /* verilator lint_on WIDTH */
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtfdivsqrtexpcalc.sv
+++ b/src/fpu/divremsqrt/divremsqrtfdivsqrtexpcalc.sv
@ -0,0 +1,79 @@
+///////////////////////////////////////////
+// fdivsqrtexpcalc.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
+// Modified:13 January 2022
+//
+// Purpose: Exponent calculation for divide and square root
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module divremsqrtfdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.FMTBITS-1:0] Fmt,       // floating-point format select
+  input  logic [P.NE-1:0]      Xe, Ye,    // input exponents
+  input  logic                 Sqrt,      // 1: square-root exponent, 0: division exponent
+  input  logic [P.DIVBLEN-1:0] ell, m,    // leading-zero counts for X and Y (subnormal adjustment)
+  output logic [P.NE+1:0]      Ue         // result exponent
+  );
+
+  logic [P.NE-2:0] FmtBias;       // bias of the selected format
+  logic [P.NE+1:0] BiasExt;       // FmtBias zero-extended to the working width
+  logic [P.NE+1:0] EllExt, MExt;  // ell and m zero-extended to the working width
+  logic [P.NE+1:0] XNormExp;      // Xe - ell
+  logic [P.NE+1:0] YNormExp;      // Ye - m
+  logic [P.NE+1:0] SqrtUnbiased;  // Xe - ell - P.BIAS
+  logic [P.NE+1:0] SqrtExp;       // square-root result exponent
+  logic [P.NE+1:0] DivExp;        // division result exponent
+
+  // Bias of the selected format; the lookup shape depends on how many formats are configured
+  if (P.FPSIZES == 1) begin
+    assign FmtBias = (P.NE-1)'(P.BIAS);
+
+  end else if (P.FPSIZES == 2) begin
+    assign FmtBias = Fmt ? (P.NE-1)'(P.BIAS) : (P.NE-1)'(P.BIAS1);
+
+  end else if (P.FPSIZES == 3) begin
+    always_comb
+      case (Fmt)
+        P.FMT:   FmtBias = (P.NE-1)'(P.BIAS);
+        P.FMT1:  FmtBias = (P.NE-1)'(P.BIAS1);
+        P.FMT2:  FmtBias = (P.NE-1)'(P.BIAS2);
+        default: FmtBias = 'x;
+      endcase
+
+  end else if (P.FPSIZES == 4) begin
+    always_comb
+      case (Fmt)
+        2'h0: FmtBias = (P.NE-1)'(P.S_BIAS);
+        2'h1: FmtBias = (P.NE-1)'(P.D_BIAS);
+        2'h2: FmtBias = (P.NE-1)'(P.H_BIAS);
+        2'h3: FmtBias = (P.NE-1)'(P.Q_BIAS);
+      endcase
+  end
+
+  // Bring every term to the NE+2-bit working width
+  assign BiasExt = {3'b0, FmtBias};
+  assign EllExt  = {{(P.NE+1-P.DIVBLEN){1'b0}}, ell};
+  assign MExt    = {{(P.NE+1-P.DIVBLEN){1'b0}}, m};
+
+  // Exponents after removing the leading-zero counts
+  assign XNormExp = {2'b0, Xe} - EllExt;
+  assign YNormExp = {2'b0, Ye} - MExt;
+
+  // Square root exponent = (Xe - l - bias) / 2 + bias
+  // P.BIAS is subtracted, the difference is halved with its top bit replicated,
+  // and the bias of the selected format is added back.
+  assign SqrtUnbiased = XNormExp - (P.NE+2)'(P.BIAS);
+  assign SqrtExp      = {SqrtUnbiased[P.NE+1], SqrtUnbiased[P.NE+1:1]} + BiasExt;
+
+  // Division exponent = (Xe - l) - (Ye - m) + bias
+  assign DivExp = XNormExp - YNormExp + BiasExt;
+
+  // Select square root or division exponent
+  assign Ue = Sqrt ? SqrtExp : DivExp;
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtfdivsqrtpostproc.sv
+++ b/src/fpu/divremsqrt/divremsqrtfdivsqrtpostproc.sv
@ -0,0 +1,116 @@
+///////////////////////////////////////////
+// fdivsqrtpostproc.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
+// Modified:13 January 2022
+//
+// Purpose: Divide/Square root postprocessing
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Divide/square-root postprocessor.
+//  Execute stage: early-termination (zero) detect, producing WZeroE.
+//  Memory stage:  sticky bit, selection between FirstU and FirstUM, and, when P.IDIV_ON_FPU
+//                 is set, integer quotient/remainder selection, shift, negation and special cases.
+// NOTE(review): inside the intpostproc generate block, PreResultM and PreIntResultM are redeclared
+//  as local signals. Those local declarations hide the ports of the same name, so nothing visible
+//  in this module drives the PreResultM output port or reads the PreIntResultM input port.
+//  Confirm whether that is intended.
+// NOTE(review): inputs SIGNOVERFLOWM, ZeroDiffM and IntDivM, and local weq0E, are not read here;
+//  NegQuotM is assigned but not read here.
+// NOTE(review): when P.IDIV_ON_FPU is not set, FIntDivResultM is not driven by this module.
+module divremsqrtfdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
+  input  logic                 clk, reset,
+  input  logic                 StallM,            // WZeroE is registered only while StallM is low
+  input  logic [P.DIVb+3:0]    WS, WC,            // Q4.DIVb
+  input  logic [P.DIVb+3:0]    D,                 // Q4.DIVb
+  input  logic [P.DIVb:0]      FirstU, FirstUM,   // U1.DIVb
+  input  logic [P.DIVb+1:0]    FirstC,            // Q2.DIVb
+  input  logic                 SqrtE,
+  input  logic                 SqrtM, SpecialCaseM, 
+  input  logic [P.XLEN-1:0]    AM,                // U/Q(XLEN.0)
+  input  logic                 RemOpM, ALTBM, BZeroM, AsM, BsM, W64M, SIGNOVERFLOWM, ZeroDiffM, IntDivM,
+  input  logic [P.DIVBLEN-1:0] IntNormShiftM,     // shift amount passed to intrightshift
+  input  logic [P.XLEN-1:0]    PreIntResultM,     // hidden by a local signal of the same name in intpostproc
+  output logic [P.DIVb:0]      UmM,               // U1.DIVb result significand
+  output logic                 WZeroE,            // early-termination zero detect (Execute stage)
+  output logic                 DivStickyM,        // sticky bit: registered zero flag is clear and not a special case
+  output logic [P.XLEN-1:0]    FIntDivResultM,     // U/Q(XLEN.0)
+  output logic [P.INTDIVb+3:0]    PreResultM      // hidden by a local signal of the same name in intpostproc
+
+);
+  
+  logic [P.DIVb+3:0]         Sum;             // WC + WS
+  logic [P.INTDIVb+3:0]         W;            // output of arithrightshift applied to SumTrunc
+  logic [P.DIVb:0]           PreUmM;          // FirstU or FirstUM, chosen by NegStickyM
+  logic                      NegStickyM;      // top bit of Sum
+  logic                      weq0E, WZeroM;   // WZeroM: registered copy of WZeroE (weq0E is unused)
+  logic [P.XLEN-1:0]         IntDivResultM;   // output of divremsqrtintspecialcase
+  logic                      NegQuotM; // Integer quotient is negative
+
+  //////////////////////////
+  // Execute Stage: Detect early termination for an exact result
+  //////////////////////////
+
+  // check for early termination on an exact result. 
+  divremsqrtearlyterm #(P) earlyterm(.FirstC, .FirstUM, .D, .SqrtE, .WC, .WS, .WZeroE);
+  
+
+  //////////////////////////
+  // E/M Pipeline register
+  //////////////////////////
+ 
+  // flopenr is defined elsewhere; presumably a resettable flop enabled by ~StallM (confirm)
+  flopenr #(1) WZeroMReg(clk, reset, ~StallM, WZeroE, WZeroM);
+
+  //////////////////////////
+  // Memory Stage: Postprocessing
+  //////////////////////////
+
+  //  If the result is not exact, the sticky should be set
+  //  (forced low for special cases)
+  assign DivStickyM = ~WZeroM & ~SpecialCaseM; 
+
+  // Determine if sticky bit is negative *** Full sum only needed for Integer
+  assign Sum = WC + WS;
+  assign NegStickyM = Sum[P.DIVb+3];
+  mux2 #(P.DIVb+1) preummux(FirstU, FirstUM, NegStickyM, PreUmM); // Select U or U-1 depending on negative sticky bit
+  mux2 #(P.DIVb+1)    ummux(PreUmM, (PreUmM << 1), SqrtM, UmM);   // second input is PreUmM shifted left by one, selected by SqrtM
+
+   // Integer quotient or remainder correction, normalization, and special cases
+  if (P.IDIV_ON_FPU) begin:intpostproc // Int supported
+    logic [P.INTDIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM; // NormRemM and NormQuotM are unused
+    // NOTE(review): PreResultM and PreIntResultM below hide the module ports of the same name
+    logic signed [P.INTDIVb+3:0] PreResultM, PreResultShiftedM, PreIntResultM;
+    logic [P.INTDIVb+3:0] DTrunc, SumTrunc;
+
+    // keep the upper INTDIVb+4 bits of Sum and D
+    assign SumTrunc = Sum[P.DIVb+3:P.DIVb-P.INTDIVb];
+    assign DTrunc = D[P.DIVb+3:P.DIVb-P.INTDIVb];
+    arithrightshift #(P) rshift(SumTrunc, W);
+
+    // upper INTDIVb+1 bits of PreUmM, zero-extended by three bits
+    assign UnsignedQuotM = {3'b000, PreUmM[P.DIVb:P.DIVb-P.INTDIVb]};
+
+    // Integer remainder: sticky and sign correction muxes
+    assign NegQuotM = AsM ^ BsM; // Integer Quotient is negative
+    mux2 #(P.INTDIVb+4) normremdmux(W, W+DTrunc, NegStickyM, NormRemDM); // add DTrunc back when the sticky is negative
+
+    // Select quotient or remainder and do normalization shift
+    mux2 #(P.INTDIVb+4)    presresultmux(UnsignedQuotM, NormRemDM, RemOpM, PreResultM);
+    intrightshift #(P) intnormshifter(PreResultM, IntNormShiftM, PreResultShiftedM);
+    // negate when AsM ^ (BsM & ~RemOpM): for a remainder that is AsM alone, otherwise AsM ^ BsM
+    mux2 #(P.INTDIVb+4)    preintresultmux(PreResultShiftedM, -PreResultShiftedM,AsM ^ (BsM&~RemOpM), PreIntResultM);
+
+    // positional connection to a module defined elsewhere; port order not visible here
+    divremsqrtintspecialcase #(P) intspecialcase(BZeroM,RemOpM, ALTBM,AM,PreIntResultM,IntDivResultM);
+    // sign extend result for W64
+    if (P.XLEN==64) begin
+      mux2 #(64) resmux(IntDivResultM[P.XLEN-1:0], 
+        {{(P.XLEN-32){IntDivResultM[31]}}, IntDivResultM[31:0]}, // Sign extending in case of W64
+        W64M, FIntDivResultM);
+    end else 
+      assign FIntDivResultM = IntDivResultM[P.XLEN-1:0];
+  end
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtfdivsqrtpreproc.sv
+++ b/src/fpu/divremsqrt/divremsqrtfdivsqrtpreproc.sv
@ -0,0 +1,250 @@
+///////////////////////////////////////////
+// fdivsqrtpreproc.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
+// Modified:13 January 2022
+//
+// Purpose: Divide/Square root preprocessing: integer absolute value and W64, normalization shift
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Divide/square-root preprocessor.
+//  - Optionally (P.IDIV_ON_FPU) takes integer operands: W64 handling, sign extraction, absolute value.
+//  - Counts leading zeros of both operands and shifts the leading one into the most significant bit.
+//  - Forms the initial X for division, square root, or integer division, and registers the divisor D.
+//  - Computes the result exponent (registered into UeM) and the cycle count CyclesE.
+//  - Registers integer side-band signals for the postprocessor.
+// NOTE(review): outputs SIGNOVERFLOWM and ZeroDiffM are never driven in this module (SIGNOVERFLOWE is
+//  tied to 0 but not registered). When P.IDIV_ON_FPU is not set, none of the integer outputs
+//  (IntNormShiftM, ALTBM, IntDivM, W64M, AsM, BsM, BZeroM, AM) are driven. W64M is also undriven
+//  when P.XLEN is not 64. Confirm that consumers tolerate this.
+module divremsqrtfdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
+  input  logic                 clk,
+  input  logic                 IFDivStartE,  // enable for every pipeline register in this module
+  input  logic [P.NF:0]        Xm, Ym,      // Floating-point significands
+  input  logic [P.NE-1:0]      Xe, Ye,      // Floating-point exponents
+  input  logic [P.FMTBITS-1:0] FmtE,
+  input  logic                 SqrtE,
+  input  logic                 XZeroE,
+  input  logic [2:0]           Funct3E,     // bit 0 low selects signed divide; bit 1 selects remainder
+  output logic [P.NE+1:0]      UeM,         // biased exponent of result
+  output logic [P.DIVb+3:0]    X, D,        // Q4.DIVb
+  // Int-specific
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // U(XLEN.0) inputs from IEU 
+  input  logic                 IntDivE, W64E,
+  // Outputs
+  output logic                 ISpecialCaseE,   // integer special case: B is zero or A < B
+  output logic [P.DURLEN:0]  CyclesE,
+  output logic [P.DIVBLEN-1:0] IntNormShiftM,
+  output logic                 ALTBM, IntDivM, W64M, SIGNOVERFLOWM, ZeroDiffM,
+  output logic                 AsM, BsM, BZeroM,
+  output logic [P.XLEN-1:0]    AM
+);
+
+  logic [P.DIVb:0]             Xnorm, Dnorm;                        // Operands after the normalization shift
+  logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
+  logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
+  logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
+  logic [P.DIVBLEN-1:0]        mE, ell;                             // Leading zeros of inputs
+  logic [P.DIVBLEN-1:0]        IntResultBitsE;                      // bits in integer result
+  logic                        NumerZeroE;                          // Numerator is zero (X or A)
+  logic                        SIGNOVERFLOWE;                       // tied to 0 below; not registered
+  logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
+  logic                        SignedDivE;                          // signed division
+  logic                        AsE, BsE;                            // Signs of integer inputs
+  logic [P.XLEN-1:0]           AE;                                  // input A after W64 adjustment
+  logic                        ALTBE;                               // A less than B
+  logic                        EvenExp;
+
+  logic [$clog2(P.RK):0] RightShiftX;   // right shift applied to the integer dividend
+  logic [P.DIVBLEN-1:0] ZeroDiff, p;    // mE - ell, and the same value forced to zero when A < B
+
+
+  //////////////////////////////////////////////////////
+  // Integer Preprocessing
+  //////////////////////////////////////////////////////
+
+  if (P.IDIV_ON_FPU) begin:intpreproc // Int Supported
+    logic [P.XLEN-1:0] BE, PosA, PosB;
+
+    // Extract inputs, signs, zero, depending on W64 mode if applicable
+    assign SignedDivE = ~Funct3E[0];
+  
+    // Source handling
+    if (P.XLEN==64) begin // 64-bit, supports W64
+      // W64: keep the low 32 bits; the upper half copies bit 31 for signed operations, zero otherwise
+      mux2 #(64)    amux(ForwardedSrcAE, {{32{ForwardedSrcAE[31] & SignedDivE}}, ForwardedSrcAE[31:0]}, W64E, AE);
+      mux2 #(64)    bmux(ForwardedSrcBE, {{32{ForwardedSrcBE[31] & SignedDivE}}, ForwardedSrcBE[31:0]}, W64E, BE);
+    end else begin // 32 bits only
+      assign AE = ForwardedSrcAE;
+      assign BE = ForwardedSrcBE;
+     end
+    assign AZeroE = ~(|AE);
+    assign BZeroE = ~(|BE);
+    assign AsE = AE[P.XLEN-1] & SignedDivE;
+    assign BsE = BE[P.XLEN-1] & SignedDivE; 
+
+    // Force integer inputs to be positive
+    mux2 #(P.XLEN) posamux(AE, -AE, AsE, PosA);
+    mux2 #(P.XLEN) posbmux(BE, -BE, BsE, PosB);
+
+    // Select integer or floating point inputs (both left-aligned and zero-padded to DIVb+1 bits)
+    mux2 #(P.DIVb+1) ifxmux({Xm, {(P.DIVb-P.NF){1'b0}}}, {PosA, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFX);
+    mux2 #(P.DIVb+1) ifdmux({Ym, {(P.DIVb-P.NF){1'b0}}}, {PosB, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFD);
+    mux2 #(1)    numzmux(XZeroE, AZeroE, IntDivE, NumerZeroE);
+  end else begin // Int not supported
+    assign IFX = {Xm, {(P.DIVb-P.NF){1'b0}}};
+    assign IFD = {Ym, {(P.DIVb-P.NF){1'b0}}};
+    assign NumerZeroE = XZeroE;
+  end
+
+  //////////////////////////////////////////////////////
+  // Integer & FP leading zero and normalization shift
+  //////////////////////////////////////////////////////
+
+  // count leading zeros for Subnorm FP and to normalize integer inputs
+  divremsqrtlzc #(P.DIVb+1) lzcX (IFX, ell);
+  divremsqrtlzc #(P.DIVb+1) lzcY (IFD, mE);
+
+  // Normalization shift: shift leading one into most significant bit
+  assign Xnorm = (IFX << ell);
+  assign Dnorm = (IFD << mE); 
+
+  //////////////////////////////////////////////////////
+  // Integer Right Shift to digit boundary
+  //  Determine DivXShifted (X shifted to digit boundary)
+  //  and IntResultBitsE (number of integer result bits)
+  //////////////////////////////////////////////////////
+
+  assign DivX = {3'b000, Xnorm}; // Zero-extend numerator for division
+
+  if (P.IDIV_ON_FPU) begin:intrightshift // Int Supported
+
+    // calculate number of result bits
+    assign ZeroDiff = mE - ell;         // Difference in number of leading zeros
+    assign ALTBE = ZeroDiff[P.DIVBLEN-1];  // A less than B (A has more leading zeros)
+    assign SIGNOVERFLOWE = 1'b0;
+
+    mux2 #(P.DIVBLEN) pmux(ZeroDiff, '0, ALTBE, p);          
+
+    /* verilator lint_off WIDTH */
+    assign IntResultBitsE = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
+   
+    /* verilator lint_on WIDTH */
+
+    // Integer special cases (terminate immediately)
+    assign ISpecialCaseE = BZeroE | ALTBE;
+
+    // calculate right shift amount RightShiftX to complete in discrete number of steps
+    if (P.RK > 1) begin // more than 1 bit per cycle
+      
+      // Fixed: this metacomment was misspelled "lint_offf", so the WIDTH waiver was never applied
+      // and the lint_on below had no matching lint_off.
+      /* verilator lint_off WIDTH */
+      assign RightShiftX = P.RK - 1 - ((IntResultBitsE - 1) % P.RK); // Right shift amount
+      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps
+      /* verilator lint_on WIDTH */
+    end else begin // radix 2 1 copy doesn't require shifting
+      assign DivXShifted = DivX;
+      assign RightShiftX = 0;
+    end
+  end else begin
+    assign ISpecialCaseE = 0;
+  end
+
+  //////////////////////////////////////////////////////
+  // Floating-Point Preprocessing
+  // Extend to Q4.b format
+  // shift square root to be in range [1/4, 1)
+  // Normalized numbers are shifted right by 1 if the exponent is odd
+  // Subnormal numbers have Xe = 0 and an unbiased exponent of 1-BIAS.  They are shifted right if the number of leading zeros is odd.
+  //////////////////////////////////////////////////////
+
+
+  // Sqrt is initialized on step one as R(X-1), so depends on Radix
+  // If X = 0, then special case logic sets sqrt = 0 so this portion doesn't matter
+  // Otherwise, X has a leading 1 after possible normalization shift and is now in range [1, 2)
+  // Next X is shifted right by 1 or 2 bits to range [1/4, 1) and exponent will be adjusted accordingly to be even
+  // Now (X-1) is negative.  Formed by placing all 1s in all four integer bits (in Q4.b) form, keeping X in fraction bits
+  // Then multiply by R is left shift by r (1 or 2 for radix 2 or 4)
+  // This is optimized in hardware by first right shifting by 0 or 1 bit (instead of 1 or 2), then left shifting by (r-1), then subtracting 2 or 4
+  // Subtracting 2 is equivalent to adding 1110.  Subtracting 4 is equivalent to adding 1100.  Prepend leading 1s to do a free subtraction.
+  // This also means only one extra fractional bit is needed because we never shift right by more than 1.
+  // Radix      Exponent odd          Exponent Even
+  // 2          x-2 = 2(x/2 - 1)      x/2 - 2 = 2(x/4 - 1)
+  // 4          2(x)-4 = 4(x/2 - 1))  2(x/2)-4 = 4(x/4 - 1)
+  // Summary: PreSqrtX = r(x/2or4 - 1)
+  // NOTE(review): the mux below prepends zeros (not ones) and shifts Xnorm right by 1 or 2 bits, so the
+  //  description above may belong to a different implementation. Confirm where the subtraction happens.
+
+  logic [P.DIVb:0] PreSqrtX;       // declared but not used by the active code below
+  assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
+  mux2 #(P.DIVb+4) sqrtxmux({4'b0,Xnorm[P.DIVb:1]}, {5'b00, Xnorm[P.DIVb:2]}, EvenExp, SqrtX); // X/2 if exponent odd, X/4 if exponent even
+
+/*  
+  // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift
+  // This saves one bit in DIVb because there is no initial right shift.
+  // However, C needs to be extended further, lest it create a k with a 1 in the lsb when C is all 1s.
+  // That is an optimization for another day.
+  if (P.RADIX == 2) begin
+    logic [P.DIVb:0] PreSqrtX;    // U1.DIVb
+    mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
+    assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
+  end else begin
+    logic [P.DIVb+1:0] PreSqrtX;  // U2.DIVb
+    mux2 #(P.DIVb+2) sqrtxmux({Xnorm, 1'b0}, {1'b0, Xnorm}, EvenExp, PreSqrtX); // 2X if exponent odd, X if exponent even
+    assign SqrtX = {2'b11, PreSqrtX};                     // PreSqrtX - 4 = 4(PreSqrtX/4 - 1)
+  end
+*/
+
+  // Initialize X for division or square root
+  mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);                    
+
+  //////////////////////////////////////////////////////
+  // Select integer or floating-point operands
+  //////////////////////////////////////////////////////
+ if (P.IDIV_ON_FPU) begin
+    mux2 #(P.DIVb+4) xmux(PreShiftX, DivXShifted, IntDivE, X);
+  end else begin
+    assign X = PreShiftX;
+  end
+
+  // Divisor register
+  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dnorm}, D);
+ 
+  // Floating-point exponent
+  divremsqrtfdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .ell, .m(mE), .Ue(UeE));
+  flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
+
+  // Number of FSM cycles (to FSM)
+  divremsqrtfdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
+
+  if (P.IDIV_ON_FPU) begin:intpipelineregs
+    logic [P.DIVBLEN-1:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
+    logic               RemOpE;
+
+    /* verilator lint_off WIDTH */
+    assign IntDivNormShiftE = P.INTDIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    assign IntRemNormShiftE = mE + (P.INTDIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
+    /* verilator lint_on WIDTH */
+    assign RemOpE = Funct3E[1];
+    mux2 #(P.DIVBLEN) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);
+
+    // pipeline registers (all enabled by IFDivStartE)
+    flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
+    flopen #(1)         altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
+    flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
+    flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
+    flopen #(1)        bsignreg(clk, IFDivStartE, BsE,      BsM);
+    flopen #(P.DIVBLEN)   nsreg(clk, IFDivStartE, IntNormShiftE, IntNormShiftM); 
+    flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
+    if (P.XLEN==64) 
+      flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
+  end
+
+endmodule
+
--- a/src/fpu/divremsqrt/divremsqrtflags.sv
+++ b/src/fpu/divremsqrt/divremsqrtflags.sv
@ -0,0 +1,183 @@
+
+///////////////////////////////////////////
+// flags.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: Post-Processing flag calculation
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// divremsqrtflags: computes the five floating-point exception flags for the
+// divide/sqrt post-processor from the rounded exponent, the rounding bits and
+// the operand classification signals.
+module divremsqrtflags import cvw::*;  #(parameter cvw_t P) (
+  input  logic                Xs,                     // X sign
+  input  logic [P.FMTBITS-1:0] OutFmt,                 // output format
+  input  logic                InfIn,                  // is an Inf input being used
+  input  logic                XInf, YInf,             // inputs are infinity
+  input  logic                NaNIn,                  // is a NaN input being used
+  input  logic                XSNaN, YSNaN,           // inputs are signaling NaNs
+  input  logic                XZero, YZero,           // inputs are zero
+  input  logic [P.NE+1:0]      FullRe,                 // Re with bits to determine sign and overflow
+  input  logic [P.NE+1:0]      Me,                     // normalized exponent
+  // rounding
+  input  logic                Plus1,                  // do you add one for rounding (not read inside this module)
+  input  logic                Round, Guard, Sticky,   // bits used to determine rounding
+  input  logic                UfPlus1,                // do you add one for rounding for the unbounded exponent result
+  // divsqrt
+  input  logic                DivOp,                  // is a divide/sqrt operation being done
+  input  logic                Sqrt,                   // Sqrt?
+  // flags
+  output logic                DivByZero,              // divide by zero flag
+  output logic                Overflow,               // overflow flag to select result
+  output logic                Invalid,                // invalid flag to select the result
+  output logic [4:0]          PostProcFlg             // flags
+);
+
+  logic               SigNaN;         // is an input a signaling NaN
+  logic               Inexact;        // final inexact flag
+  logic               FpInexact;      // floating point inexact flag
+  logic               DivInvalid;     // invalid divide/sqrt operand combination
+  logic               Underflow;      // Underflow flag
+  logic               ResExpGteMax;   // is the result greater than or equal to the maximum floating point exponent
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Overflow
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Determine if the result exponent is greater than or equal to the maximum
+  // exponent of the selected output format:
+  //      - all exponent bits of that format are one, or
+  //      - any bit above that format's exponent field (up to bit NE) is one
+  // The sign bit FullRe[NE+1] is checked separately in the Overflow expression.
+  if (P.FPSIZES == 1) begin
+      assign ResExpGteMax = &FullRe[P.NE-1:0] | FullRe[P.NE];
+
+  end else if (P.FPSIZES == 2) begin    
+      assign ResExpGteMax = OutFmt ? &FullRe[P.NE-1:0] | FullRe[P.NE] : &FullRe[P.NE1-1:0] | (|FullRe[P.NE:P.NE1]);
+
+  end else if (P.FPSIZES == 3) begin
+      always_comb
+          case (OutFmt)
+              P.FMT: ResExpGteMax = &FullRe[P.NE-1:0] | FullRe[P.NE];
+              P.FMT1: ResExpGteMax = &FullRe[P.NE1-1:0] | (|FullRe[P.NE:P.NE1]);
+              P.FMT2: ResExpGteMax = &FullRe[P.NE2-1:0] | (|FullRe[P.NE:P.NE2]);
+              default: ResExpGteMax = 1'bx;
+          endcase
+
+  end else if (P.FPSIZES == 4) begin        
+      always_comb
+          case (OutFmt)
+              P.Q_FMT: ResExpGteMax = &FullRe[P.Q_NE-1:0] | FullRe[P.Q_NE];
+              P.D_FMT: ResExpGteMax = &FullRe[P.D_NE-1:0] | (|FullRe[P.Q_NE:P.D_NE]);
+              P.S_FMT: ResExpGteMax = &FullRe[P.S_NE-1:0] | (|FullRe[P.Q_NE:P.S_NE]);
+              P.H_FMT: ResExpGteMax = &FullRe[P.H_NE-1:0] | (|FullRe[P.Q_NE:P.H_NE]);
+          endcase
+  end
+
+
+  // calculate overflow flag:
+  //                 if the result is greater than or equal to the max exponent (not taking into account sign)
+  //                 |           and the exponent isn't negative
+  //                 |           |                   and no input is infinity or NaN, and it is not a divide by zero
+  //                 |           |                   |
+  assign Overflow = ResExpGteMax & ~FullRe[P.NE+1]&~(InfIn|NaNIn|DivByZero);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Underflow
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // calculate underflow flag: detecting tininess after rounding
+  //      - the exponent is negative, or
+  //      - the result is subnormal (FullRe == 0), or
+  //      - the result is normal but was rounded up from a subnormal
+  //        (FullRe == 1, Me == 0) and would not have rounded given an unbounded exponent
+  //      - and the result is not exact (any of Round/Sticky/Guard set)
+  //      - and no other special case applies (Inf/NaN input, divide by zero, invalid)
+  //      - and X is not zero: FpInexact below is suppressed for a zero X, and
+  //        underflow must never be reported without inexact, so the same mask
+  //        is applied here to keep the two flags consistent
+  assign Underflow = ((FullRe[P.NE+1] | (FullRe == 0) | ((FullRe == 1) & (Me == 0) & ~(UfPlus1&Guard)))&(Round|(Sticky)|Guard))&~(InfIn|NaNIn|DivByZero|Invalid|XZero);
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Inexact
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Set Inexact flag if the result is different from what would be output given infinite precision
+  //      - an overflowed result is always inexact
+  //      - suppressed for Inf/NaN inputs, divide by zero, invalid operations and a zero X
+  assign FpInexact = (Sticky|Guard|Overflow|Round)&~(InfIn|NaNIn|DivByZero|Invalid|XZero);
+
+  // select the inexact flag to output (only the floating-point flag exists in this unit)
+  assign Inexact = FpInexact;
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Invalid
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Set Invalid flag for following cases:
+  //   1) any input is a signaling NaN
+  //   2) divide: Inf / Inf or 0 / 0
+  //   3) sqrt:   X is negative, not zero, and no input is NaN
+
+  
+  assign SigNaN = (XSNaN) | (YSNaN) ;
+  
+  // invalid operand combination for divide (first term) and sqrt (second term)
+  assign DivInvalid = ((XInf & YInf) | (XZero & YZero))&~Sqrt | (Xs&Sqrt&~NaNIn&~XZero);
+
+  assign Invalid = SigNaN | (DivInvalid&DivOp);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Divide by Zero
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // if dividing by zero and not 0/0
+  //  - don't set flag if an input is NaN or Inf (IEEE says has to be a finite numerator)
+  assign DivByZero = YZero&DivOp&~Sqrt&~(XZero|NaNIn|InfIn);  
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // final flags
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Combine flags, most significant bit first:
+  //      Invalid, DivByZero, Overflow, Underflow, Inexact
+  assign PostProcFlg = {Invalid, DivByZero, Overflow, Underflow, Inexact};
+
+endmodule
+
+
+
+
--- a/src/fpu/divremsqrt/divremsqrtintspecialcase.sv
+++ b/src/fpu/divremsqrt/divremsqrtintspecialcase.sv
@ -0,0 +1,15 @@
+// divremsqrtintspecialcase: selects the final integer divide/remainder result,
+// overriding the normally computed value for the divide-by-zero and
+// small-numerator special cases.
+module divremsqrtintspecialcase import cvw::*; #(parameter cvw_t P) (
+    input logic BZeroM,RemOpM, ALTBM,              // divisor is zero; remainder op; numerator is small
+    input logic [P.XLEN-1:0] AM,                   // value returned as the remainder in the special cases
+    input  signed [P.INTDIVb+3:0] PreIntResultM,   // result used when no special case applies
+    output logic [P.XLEN-1:0] IntDivResultM        // selected integer result
+);
+  always_comb begin
+    // Default: no special case, take the low XLEN bits of the computed result
+    IntDivResultM = PreIntResultM[P.XLEN-1:0];
+
+    if (BZeroM | ALTBM) begin
+      // Both special cases return AM for a remainder operation
+      if (RemOpM)      IntDivResultM = AM;
+      // Divide by zero takes priority over small numerator: quotient is all ones
+      else if (BZeroM) IntDivResultM = {(P.XLEN){1'b1}};
+      // Numerator is small: quotient is zero
+      else             IntDivResultM = 0;
+    end
+  end
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtlzc.sv
+++ b/src/fpu/divremsqrt/divremsqrtlzc.sv
@ -0,0 +1,39 @@
+///////////////////////////////////////////
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: Leading Zero Counter
+// 
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// divremsqrtlzc: counts the zero bits above the most significant one of num.
+// NOTE(review): ZeroCnt is $clog2(WIDTH) bits wide, so when WIDTH is a power
+// of two an all-zero num (count == WIDTH) wraps to 0 - confirm callers never
+// depend on that case.
+module divremsqrtlzc #(parameter WIDTH = 1) (
+  input  logic [WIDTH-1:0]            num,    // number to count the leading zeroes of
+  output logic [$clog2(WIDTH)-1:0]  ZeroCnt // the number of leading zeroes
+);
+
+  integer zeros;    // running count of leading zeroes
+  logic   seenOne;  // set once the scan has reached a bit that is not zero
+
+  always_comb begin
+    zeros   = 0;
+    seenOne = 1'b0;
+    // scan from the most significant bit down; stop counting at the first bit
+    // that is not a known zero
+    for (int pos = WIDTH-1; pos >= 0; pos--) begin
+      if (~seenOne) begin
+        if (~num[pos]) zeros = zeros + 1;
+        else           seenOne = 1'b1;
+      end
+    end
+    // truncate the count to the output width
+    ZeroCnt = zeros[$clog2(WIDTH)-1:0];
+  end
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtnormshift.sv
+++ b/src/fpu/divremsqrt/divremsqrtnormshift.sv
@ -0,0 +1,81 @@
+///////////////////////////////////////////
+// normshift.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: normalization shifter
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // convert shift
+    //      fp -> int:  | `XLEN  zeros      |     Mantissa      | 0's if necessary | << CalcExp
+    //          process:
+    //              - start - CalcExp = 1 + XExp - Largest Bias
+    //                  | `XLEN  zeros      |     Mantissa      | 0's if necessary |
+    //
+    //              - shift left 1 (1)
+    //                  | `XLEN-1 zeros |bit|     frac          | 0's if necessary |
+    //                                      . <- binary point
+    //
+    //              - shift left till unbiased exponent is 0 (XExp - Largest Bias)
+    //                  |  0's |     Mantissa      |      0's if necessary     |
+    //                  |     keep          |
+    //
+    //      fp -> fp:
+    //          - if result is subnormal or underflowed:
+    //              |  `NF-1  zeros   |     Mantissa      | 0's if necessary | << NF+CalcExp-1
+    //          process:
+    //             - start
+    //                 |     mantissa      | 0's |
+    //
+    //             - shift right by NF-1 (NF-1)
+    //                 |    `NF-1  zeros   |     mantissa      | 0's |
+    //
+    //             - shift left by CalcExp = XExp - Largest bias + new bias
+    //                 |   0's  |     mantissa      |     0's      |
+    //                 |       keep      |
+    //
+    //          - if the input is subnormal:
+    //                 |     lzcIn      | 0's if necessary | << ZeroCnt+1
+    //              - plus 1 to shift out the first 1
+    //
+    //      int -> fp: |     lzcIn      | 0's if necessary | << ZeroCnt+1
+    //              - plus 1 to shift out the first 1
+
+    // fma shift
+    //      |   00   |           Sm           | << LZA output
+    //             .
+    //      - two extra bits so we can correct for an LZA error of 1 or 2
+
+    // divsqrt shift
+    //      | Nf 0's |           Qm           | << calculated shift amount
+    //        .
+
+// divremsqrtnormshift: normalization shifter for the divide/sqrt post-processor.
+// Performs a logical left shift of ShiftIn by ShiftAmt; bits shifted past the
+// top of the NORMSHIFTSZDRSU-wide output are discarded and zeros fill from the right.
+module divremsqrtnormshift import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.LOGNORMSHIFTSZDRSU-1:0]  ShiftAmt,   // shift amount
+  input  logic [P.NORMSHIFTSZDRSU-1:0]     ShiftIn,    // number to be shifted
+  output logic [P.NORMSHIFTSZDRSU-1:0]     Shifted     // shifted result
+);
+
+  always_comb
+    Shifted = ShiftIn << ShiftAmt;
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtpostprocess.sv
+++ b/src/fpu/divremsqrt/divremsqrtpostprocess.sv
@ -0,0 +1,177 @@
+///////////////////////////////////////////
+// postprocess.sv
+//
+// Written: kekim@hmc.edu
+// Modified: 19 May 2023
+//
+// Purpose: Post-Processing: normalization, rounding, sign, flags, special cases
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// divremsqrtpostprocess: divide/sqrt-only floating-point post-processor.
+// Chains the normalization shift, shift correction, rounding, flag generation
+// and special-case result selection for the divsqrt result (DivUe, DivUm, DivSticky).
+//
+// NOTE(review): the output PreIntResultM is not driven anywhere in this module,
+// and the inputs IntNormShiftM, PreResultM, IntDivM and PostProcSel are not read.
+// Every sub-block below is instantiated with .DivOp(1'b1). Confirm whether the
+// integer path is meant to be handled here or entirely outside this module.
+module divremsqrtpostprocess import cvw::*;  #(parameter cvw_t P)  (
+  // general signals
+  input logic                             Xs, Ys,     // input signs
+  input logic  [P.NF:0]                    Xm, Ym,     // input mantissas
+  input logic  [2:0]                      Frm,        // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+  input logic  [P.FMTBITS-1:0]             Fmt,        // precision 1 = double 0 = single
+  input logic  [3:0]                      OpCtrl,     // operation select; only bit 0 (sqrt) is read here
+  input logic                             XZero, YZero,        // inputs are zero
+  input logic                             XInf, YInf,          // inputs are infinity
+  input logic                             XNaN, YNaN,          // inputs are NaN
+  input logic                             XSNaN, YSNaN,        // inputs are signaling NaNs
+  input logic  [1:0]                      PostProcSel,         // select result to be written to fp register
+  //divide signals
+  input logic                             DivSticky,  // divider sticky bit
+  input logic  [P.NE+1:0]                  DivUe,      // divsqrt exponent
+  input logic  [P.NF+2:0]                  DivUm,      // divsqrt significand
+  input logic  [P.DIVBLEN-1:0]             IntNormShiftM, // integer normalization left-shift amount (after pre-shifting right)
+  input logic  [P.INTDIVb+3:0]          PreResultM, // integer result to be shifted
+  input logic                              IntDivM,
+  // final results
+  output logic [P.FLEN-1:0]                PostProcRes,// postprocessor final result
+  output logic [4:0]                      PostProcFlg, // postprocessor flags
+  output logic [P.XLEN-1:0]  PreIntResultM // normalized integer result
+  );
+
+  
+  // general signals
+  logic                       Rs;         // result sign
+  logic [P.NF-1:0]             Rf;         // Result fraction
+  logic [P.NE-1:0]             Re;         // Result exponent
+  logic                       Ms;         // norMalized sign
+  logic [P.NORMSHIFTSZDRSU-1:0]    Mf;         // norMalized fraction
+  logic [P.NE+1:0]             Me;         // normalized exponent
+  logic [P.NE+1:0]             FullRe;     // Re with bits to determine sign and overflow
+  logic                       UfPlus1;    // do you add one (for determining underflow flag)
+  logic [P.LOGNORMSHIFTSZDRSU-1:0] ShiftAmt;   // normalization shift amount
+  logic [P.NORMSHIFTSZDRSU-1:0]    ShiftIn;    // input to normalization shift
+  logic [P.NORMSHIFTSZDRSU-1:0]    Shifted;    // the output of the normalized shifter (before shift correction)
+  logic                       Plus1;      // add one to the final result?
+  logic                       Overflow;   // overflow flag used to select results
+  logic                       Invalid;    // invalid flag used to select results
+  logic                       Guard, Round, Sticky; // bits needed to determine rounding
+  logic [P.FMTBITS-1:0]        OutFmt;     // output format
+  // division signals
+  logic [P.LOGNORMSHIFTSZDRSU-1:0] DivShiftAmt;        // divsqrt shift amount
+  logic [P.NORMSHIFTSZDRSU-1:0]    DivShiftIn;         // divsqrt shift input
+  logic [P.NE+1:0]             Ue;                 // divsqrt corrected exponent after correction shift
+  logic                       DivByZero;          // divide by zero flag
+  logic                       DivResSubnorm;      // is the divsqrt result subnormal
+  logic                       DivSubnormShiftPos; // is the divsqrt subnorm shift amount positive (not underflowed)
+  // readability signals
+  logic                       Sqrt;       // is the divsqrt operation sqrt
+  logic                       InfIn;      // are any of the inputs infinity
+  logic                       NaNIn;      // are any of the inputs NaN
+
+  // signals to help readability
+  assign Sqrt =  OpCtrl[0];
+
+  // is there an input of infinity or NaN being used
+  assign InfIn = XInf|YInf;
+  assign NaNIn = XNaN|YNaN;
+
+  // the output format is always the input format: this unit performs no
+  // format conversion, so Fmt is used for every FPSIZES configuration
+  assign OutFmt = Fmt;
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Normalization
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // final calculations before shifting
+
+  divremsqrtdivshiftcalc #(P) divremsqrtdivshiftcalc(.DivUe, .DivUm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
+
+  // only the divsqrt shift exists in this unit
+  assign ShiftAmt = DivShiftAmt;
+  assign ShiftIn = DivShiftIn;
+  
+  // main normalization shift
+  divremsqrtnormshift #(P) divremsqrtnormshift (.ShiftIn, .ShiftAmt, .Shifted);
+
+  // correct for divsqrt shift error
+  divremsqrtshiftcorrection #(P) shiftcorrection(.DivResSubnorm, .DivSubnormShiftPos, .DivOp(1'b1), .DivUe, .Ue, .Shifted, .Mf);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Rounding
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // round to nearest even
+  // round to zero
+  // round to -infinity
+  // round to infinity
+  // round to nearest max magnitude
+
+  // calculate result sign used in rounding unit
+  divremsqrtroundsign #(P) roundsign( .DivOp(1'b1), .Sqrt, .Xs, .Ys, .Ms);
+
+  divremsqrtround #(P) round(.OutFmt, .Frm, .Plus1, .Ue,
+      .Ms, .Mf, .DivSticky, .DivOp(1'b1), .UfPlus1, .FullRe, .Rf, .Re, .Sticky, .Round, .Guard, .Me);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Sign calculation
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // the result sign is the normalized sign
+  assign Rs = Ms;
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Flags
+  ///////////////////////////////////////////////////////////////////////////////
+
+  divremsqrtflags #(P) flags(.XSNaN, .YSNaN, .XInf, .YInf, .InfIn, .XZero, .YZero, 
+              .Xs, .OutFmt, .Sqrt,
+              .NaNIn, .Round, .DivByZero,
+              .Guard, .Sticky, .UfPlus1,.DivOp(1'b1), .FullRe, .Plus1,
+              .Me, .Invalid, .Overflow, .PostProcFlg);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Select the result
+  ///////////////////////////////////////////////////////////////////////////////
+
+  divremsqrtspecialcase #(P) specialcase(.Xs, .Xm, .Ym, .XZero, 
+      .Frm, .OutFmt, .XNaN, .YNaN,  
+      .NaNIn, .Plus1, .Invalid, .Overflow, .InfIn,
+      .XInf, .YInf, .DivOp(1'b1), .DivByZero, .FullRe, .Rs, .Re, .Rf, .PostProcRes );
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtround.sv
+++ b/src/fpu/divremsqrt/divremsqrtround.sv
@ -0,0 +1,268 @@
+///////////////////////////////////////////
+// divremsqrtround.sv
+//
+// Written: kekim@hmc.edu, me@KatherineParry.com
+// Modified: 19 May 2023
+//
+// Purpose: Rounder
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+module divremsqrtround import cvw::*;  #(parameter cvw_t P)  (
+  input  logic [P.FMTBITS-1:0]     OutFmt,             // output format
+  input  logic [2:0]              Frm,                // rounding mode
+  input  logic                    Ms,                 // normalized sign
+  input  logic [P.NORMSHIFTSZDRSU-1:0] Mf,                 // normalized fraction
+  // divsqrt
+  input  logic                    DivOp,              // is a division opperation being done
+  input  logic                    DivSticky,          // divsqrt sticky bit
+  input  logic [P.NE+1:0]          Ue,                 // the divsqrt calculated expoent
+  // outputs
+  output logic [P.NE+1:0]          Me,                 // normalied fraction
+  output logic                    UfPlus1,            // do you add one to the result if given an unbounded exponent
+  output logic [P.NE+1:0]          FullRe,             // Re with bits to determine sign and overflow
+  output logic [P.NE-1:0]          Re,                 // Result exponent
+  output logic [P.NF-1:0]          Rf,                 // Result fractionNormS
+  output logic                    Sticky,             // sticky bit
+  output logic                    Plus1,              // do you add one to the final result
+  output logic                    Round, Guard        // bits needed to calculate rounding
+);
+
+  logic           UfCalcPlus1;        // calculated plus one for unbounded exponent
+  logic           NormSticky;         // normalized sum's sticky bit
+  logic [P.NF-1:0] RoundFrac;          // rounded fraction
+  logic           FpGuard, FpRound;   // floating point round/guard bits
+  logic           FpLsbRes;           // least significant bit of floating point result
+  logic           LsbRes;             // lsb of result
+  logic           CalcPlus1;          // calculated plus1
+  logic           FpPlus1;            // do you add one to the fp result 
+  logic [P.FLEN:0] RoundAdd;           // how much to add to the result
+
+// what position is XLEN in?
+//  options: 
+//     1: XLEN > NF   > NF1
+//     2: NF   > XLEN > NF1
+//     3: NF   > NF1  > XLEN
+//  single and double will always be smaller than XLEN
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Rounding
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // round to nearest even
+  //      {Round, Sticky}
+  //      0x - do nothing
+  //      10 - tie - Plus1 if result is odd  (LSBNormSum = 1)
+  //          - don't add 1 if a small number was supposed to be subtracted
+  //      11 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
+  //         - plus 1 otherwise
+
+  //  round to zero - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
+
+  //  round to -infinity
+  //          - Plus1 if negative unless a small number was supposed to be subtracted from a result with guard and round bits of 0
+  //          - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
+
+  //  round to infinity
+  //          - Plus1 if positive unless a small number was supposed to be subtracted from a result with guard and round bits of 0
+  //          - subtract 1 if a small number was supposed to be subtracted from a negative result with guard and round bits of 0
+
+  //  round to nearest max magnitude
+  //      {Guard, Round, Sticky}
+  //      0x - do nothing
+  //      10 - tie - Plus1
+  //          - don't add 1 if a small number was supposed to be subtracted
+  //      11 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
+  //         - Plus 1 otherwise
+
+
+  // determine what format the final result is in: int or fp
+
+  // sticky bit calculation
+  if (P.FPSIZES == 1) begin
+    assign NormSticky = (|Mf[P.NORMSHIFTSZDRSU-P.NF-2:0]);
+
+  end else if (P.FPSIZES == 2) begin
+    assign NormSticky = (|Mf[P.NORMSHIFTSZDRSU-P.NF1-2:P.NORMSHIFTSZDRSU-P.NF-1]&(~OutFmt)) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.NF-2:0]);
+
+
+  end else if (P.FPSIZES == 3) begin
+
+    assign NormSticky = (|Mf[P.NORMSHIFTSZDRSU-P.NF2-2:P.NORMSHIFTSZDRSU-P.NF1-1]&(OutFmt==P.FMT2)) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.NF1-2:P.NORMSHIFTSZDRSU-P.NF-1]&(~(OutFmt==P.FMT))) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.NF-2:0]);
+
+  end else if (P.FPSIZES == 4) begin
+    assign NormSticky = (|Mf[P.NORMSHIFTSZDRSU-P.H_NF-2:P.NORMSHIFTSZDRSU-P.Q_NF-1]&(OutFmt==P.H_FMT)) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.S_NF-2:P.NORMSHIFTSZDRSU-P.Q_NF-1]&((OutFmt==P.S_FMT))) | 
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.D_NF-2:P.NORMSHIFTSZDRSU-P.Q_NF-1]&((OutFmt==P.D_FMT))) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.Q_NF-2:0]&(OutFmt==P.Q_FMT));
+  end
+  
+
+
+  // only add the Addend sticky if doing an FMA opperation
+  //      - the shifter shifts too far left when there's an underflow (shifting out all possible sticky bits)
+  //assign Sticky = DivSticky&DivOp | NormSticky | StickySubnorm;
+  assign Sticky = DivSticky&DivOp | NormSticky;
+  //assign Sticky = DivSticky&DivOp;
+  
+
+
+
+  // determine round and LSB of the rounded value
+  //      - underflow round bit is used to determint the underflow flag
+  if (P.FPSIZES == 1) begin
+      assign FpGuard = Mf[P.NORMSHIFTSZDRSU-P.NF-1];
+      assign FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.NF];
+      assign FpRound = Mf[P.NORMSHIFTSZDRSU-P.NF-2];
+
+  end else if (P.FPSIZES == 2) begin
+      assign FpGuard = OutFmt ? Mf[P.NORMSHIFTSZDRSU-P.NF-1] : Mf[P.NORMSHIFTSZDRSU-P.NF1-1];
+      assign FpLsbRes = OutFmt ? Mf[P.NORMSHIFTSZDRSU-P.NF] : Mf[P.NORMSHIFTSZDRSU-P.NF1];
+      assign FpRound = OutFmt ? Mf[P.NORMSHIFTSZDRSU-P.NF-2] : Mf[P.NORMSHIFTSZDRSU-P.NF1-2];
+
+  end else if (P.FPSIZES == 3) begin
+      always_comb
+          case (OutFmt)
+              P.FMT: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.NF-2];
+              end
+              P.FMT1: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.NF1-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.NF1];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.NF1-2];
+              end
+              P.FMT2: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.NF2-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.NF2];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.NF2-2];
+              end
+              default: begin
+                  FpGuard = 1'bx;
+                  FpLsbRes = 1'bx;
+                  FpRound = 1'bx;
+              end
+          endcase
+  end else if (P.FPSIZES == 4) begin
+      always_comb
+          case (OutFmt)
+              2'h3: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.Q_NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.Q_NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.Q_NF-2];
+              end
+              2'h1: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.D_NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.D_NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.D_NF-2];
+              end
+              2'h0: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.S_NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.S_NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.S_NF-2];
+              end
+              2'h2: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.H_NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.H_NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.H_NF-2];
+              end
+          endcase
+  end
+
+  
+  assign Guard =  FpGuard;
+  assign LsbRes = FpLsbRes;
+  assign Round =  FpRound;
+
+
+  always_comb begin
+      // Determine if you add 1
+      case (Frm)
+          3'b000: CalcPlus1 = Guard & (Round|Sticky|LsbRes);//round to nearest even
+          3'b001: CalcPlus1 = 0;//round to zero
+          3'b010: CalcPlus1 = Ms;//round down
+          3'b011: CalcPlus1 = ~Ms;//round up
+          3'b100: CalcPlus1 = Guard;//round to nearest max magnitude
+          default: CalcPlus1 = 1'bx;
+      endcase
+      // Determine if you add 1 (for underflow flag)
+      case (Frm)
+          3'b000: UfCalcPlus1 = Round & (Sticky|Guard);//round to nearest even
+          3'b001: UfCalcPlus1 = 0;//round to zero
+          3'b010: UfCalcPlus1 = Ms;//round down
+          3'b011: UfCalcPlus1 = ~Ms;//round up
+          3'b100: UfCalcPlus1 = Round;//round to nearest max magnitude
+          default: UfCalcPlus1 = 1'bx;
+      endcase
+  
+  end
+
+  // If an answer is exact don't round
+  assign Plus1 = CalcPlus1 & (Sticky|Round|Guard);
+  assign FpPlus1 = Plus1;
+  assign UfPlus1 = UfCalcPlus1 & (Sticky|Round);
+
+
+
+
+  // place Plus1 into the proper position for the format
+  if (P.FPSIZES == 1) begin
+      assign RoundAdd = {{P.FLEN{1'b0}}, FpPlus1};
+
+  end else if (P.FPSIZES == 2) begin
+      // \/FLEN+1
+      //  | NE+2 |        NF      |
+      //  '-NE+2-^----NF1----^
+      // P.FLEN+1-P.NE-2-P.NF1 = FLEN-1-NE-NF1
+      assign RoundAdd = {(P.NE+1+P.NF1)'(0), FpPlus1&~OutFmt, (P.NF-P.NF1-1)'(0), FpPlus1&OutFmt};
+
+  end else if (P.FPSIZES == 3) begin
+      assign RoundAdd = {(P.NE+1+P.NF2)'(0), FpPlus1&(OutFmt==P.FMT2), (P.NF1-P.NF2-1)'(0), FpPlus1&(OutFmt==P.FMT1), (P.NF-P.NF1-1)'(0), FpPlus1&(OutFmt==P.FMT)};
+
+  end else if (P.FPSIZES == 4)      
+      assign RoundAdd = {(P.Q_NE+1+P.H_NF)'(0), FpPlus1&(OutFmt==P.H_FMT), (P.S_NF-P.H_NF-1)'(0), FpPlus1&(OutFmt==P.S_FMT), (P.D_NF-P.S_NF-1)'(0), FpPlus1&(OutFmt==P.D_FMT), (P.Q_NF-P.D_NF-1)'(0), FpPlus1&(OutFmt==P.Q_FMT)};
+
+
+
+  // trim unneeded bits from fraction
+  assign RoundFrac = Mf[P.NORMSHIFTSZDRSU-1:P.NORMSHIFTSZDRSU-P.NF];
+  
+
+
+  // select the exponent
+  assign Me = Ue;
+
+
+
+  // round the result
+  //      - if the fraction overflows one should be added to the exponent
+  assign {FullRe, Rf} = {Me, RoundFrac} + RoundAdd;
+  assign Re = FullRe[P.NE-1:0];
+
+
+endmodule
+
--- a/src/fpu/divremsqrt/divremsqrtroundsign.sv
+++ b/src/fpu/divremsqrt/divremsqrtroundsign.sv
@ -0,0 +1,45 @@
+///////////////////////////////////////////
+// divremsqrtroundsign.sv
+//
+// Written: kekim@hmc.edu,me@KatherineParry.com
+// Modified: 19 May 2023
+//
+// Purpose: Sign calculation for rounding
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module divremsqrtroundsign import cvw::*;  #(parameter cvw_t P) (
+  input logic         Xs,     // sign of operand X
+  input logic         Ys,     // sign of operand Y
+  input logic         Sqrt,   // high for square root: the sign of Y is ignored
+  input logic         DivOp,  // high for a divsqrt operation; gates the output sign
+  output logic        Ms      // sign handed to the rounding logic
+);
+
+  logic               YsMasked; // sign of Y, forced to 0 for square root
+
+  // Y only contributes to the sign when the operation is not a square root
+  assign YsMasked = Ys & ~Sqrt;
+
+  // result sign is Xs ^ Ys for a divide and Xs for a square root;
+  // it is forced to 0 whenever DivOp is low
+  assign Ms = DivOp & (Xs ^ YsMasked);
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtshiftcorrection.sv
+++ b/src/fpu/divremsqrt/divremsqrtshiftcorrection.sv
@ -0,0 +1,94 @@
+///////////////////////////////////////////
+// divremsqrtshiftcorrection.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: shift correction
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+module divremsqrtshiftcorrection import cvw::*;  #(parameter cvw_t P) (
+  input logic  [P.NORMSHIFTSZDRSU-1:0] Shifted,                // the normalization shifter output, before the correction shift
+  // divsqrt
+  input logic                     DivOp,                  // is it a divsqrt operation (not read in this module; kept so existing instantiations still connect)
+  input logic                     DivResSubnorm,          // is the divsqrt result subnormal
+  input logic  [P.NE+1:0]          DivUe,                  // the divsqrt result's exponent
+  input logic                     DivSubnormShiftPos,     // is the subnorm divider shift amount positive (ie not underflowed)
+  // output
+  output logic [P.NORMSHIFTSZDRSU-1:0] Mf,                     // the significand after the correction shift
+  output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
+);
+
+  // Purpose: apply the final one- or two-bit left shift to the divsqrt significand
+  //          and adjust the exponent to match.
+  //   Mf - CorrQmShifted when the result is not subnormal, otherwise Shifted unchanged
+  //   Ue - zero when (DivResSubnorm & DivSubnormShiftPos), otherwise DivUe minus
+  //        one when the msb of Shifted is 0
+
+  logic [P.NORMSHIFTSZDRSU-1:0]    CorrQm0, CorrQm1;           // Shifted moved left by two bits / by one bit, zero filled
+  logic [P.NORMSHIFTSZDRSU-1:0]    CorrQmShifted;              // the shifted divsqrt result after the correction shift
+  logic                       LZAPlus1;                   // msb of Shifted (name kept from the shared postprocessor)
+  logic                       LeftShiftQm;                // should the divsqrt result be shifted one to the left
+
+  // the msb of the shifter output tells whether the leading one is already at the top
+  assign LZAPlus1 = Shifted[P.NORMSHIFTSZDRSU-1];
+
+  // correct the shifting of the divsqrt caused by producing a result in the [.5, 2) range
+  //    condition: if the msb is 1 or the exponent was one, but the shifted quotient was < 1 (Subnorm)
+  assign LeftShiftQm = (LZAPlus1|(DivUe==1&~LZAPlus1));
+  assign CorrQm0 = {Shifted[P.NORMSHIFTSZDRSU-3:0],{2'b00}};
+  assign CorrQm1 = {Shifted[P.NORMSHIFTSZDRSU-2:0],{1'b0}};
+  // NOTE(review): assumes mux2 passes its second data input when the select is 1 - confirm against the mux2 definition
+  mux2 #(P.NORMSHIFTSZDRSU) divcorrmux(CorrQm0, CorrQm1, LeftShiftQm, CorrQmShifted);
+  
+  // if the result of the divider was calculated to be subnormal, then the result was correctly normalized, so select the top shifted bits
+  always_comb
+    if (~DivResSubnorm)  Mf = CorrQmShifted;
+    else                       Mf = Shifted[P.NORMSHIFTSZDRSU-1:0];
+
+  // the quotient is in the range [.5,2) if there is no early termination
+  // if the quotient < 1 and not Subnormal then subtract 1 to account for the normalization shift
+  assign Ue = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivUe - {(P.NE+1)'(0), ~LZAPlus1};
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtspecialcase.sv
+++ b/src/fpu/divremsqrt/divremsqrtspecialcase.sv
@ -0,0 +1,240 @@
+///////////////////////////////////////////
+// divremsqrtspecialcase.sv
+//
+// Written: kekim@hmc.edu,me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: special case selection
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+module divremsqrtspecialcase import cvw::*;  #(parameter cvw_t P) (
+  input  logic                Xs,         // X sign (not read in this module)
+  input  logic [P.NF:0]        Xm, Ym, // input significands
+  input  logic                XNaN, YNaN, // are the inputs NaN
+  input  logic [2:0]          Frm,        // rounding mode
+  input  logic [P.FMTBITS-1:0] OutFmt,     // output format
+  input  logic                InfIn,      // are any inputs infinity
+  input  logic                NaNIn,      // are any input NaNs
+  input  logic                XInf, YInf, // are X or Y infinity
+  input  logic                XZero,      // is X zero
+  input  logic                Plus1,      // do you add one for rounding
+  input  logic                Rs,         // the result's sign
+  input  logic                Invalid, Overflow,  // flags to choose the result
+  input  logic [P.NE-1:0]      Re,         // Result exponent
+  input  logic [P.NE+1:0]      FullRe,     // Result full exponent
+  input  logic [P.NF-1:0]      Rf,         // Result fraction
+  // divsqrt
+  input  logic                DivOp,      // is it a divsqrt operation
+  input  logic                DivByZero,  // divide by zero flag
+  // outputs
+  output logic [P.FLEN-1:0]    PostProcRes // final result
+);
+
+  // Purpose: build one candidate result per special case (NaN, invalid, overflow,
+  //          underflow, normal) in the requested output format, then pick one of them.
+  //   - every candidate is P.FLEN bits wide; for an output format narrower than P.FLEN
+  //     the unused upper bits are filled with ones
+  //   - the candidates are built in a generate branch chosen by P.FPSIZES, the
+  //     number of supported formats
+
+  logic [P.FLEN-1:0]   XNaNRes;    // X is NaN result
+  logic [P.FLEN-1:0]   YNaNRes;    // Y is NaN result
+  logic [P.FLEN-1:0]   InvalidRes; // Invalid result
+  logic [P.FLEN-1:0]   UfRes;      // underflowed result
+  logic [P.FLEN-1:0]   OfRes;      // overflowed result
+  logic [P.FLEN-1:0]   NormRes;    // normal result
+  logic               OfResMax;   // does the of result output maximum norm fp number
+  logic               KillRes;    // kill the result for underflow
+  logic               SelOfRes;   // should the overflow result be selected
+
+
+  // does the overflow result output the maximum normalized floating point number
+  //                output infinity if the input is infinity
+  //      - never for an infinite input or a divide by zero
+  //      - otherwise when Frm[1:0] is 01, or 10 with a positive result, or 11 with a negative result
+  assign OfResMax = (~InfIn)&~DivByZero&((Frm[1:0]==2'b01) | (Frm[1:0]==2'b10&~Rs) | (Frm[1:0]==2'b11&Rs));
+
+  // select correct outputs for special cases
+  if (P.FPSIZES == 1) begin
+      //NaN res selection depending on standard
+      //      - with P.IEEE754 set, the NaN results carry the low bits of the input significand
+      //      - otherwise only InvalidRes is driven; XNaNRes and YNaNRes are not read in that configuration
+      if(P.IEEE754) begin
+          assign XNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Xm[P.NF-2:0]};
+          assign YNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Ym[P.NF-2:0]};
+          assign InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+      end else begin
+          assign InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+      end
+
+      // overflow: exponent all ones except the lsb with an all ones fraction when OfResMax, else exponent all ones with a zero fraction
+      assign OfRes =  OfResMax ? {Rs, {P.NE-1{1'b1}}, 1'b0, {P.NF{1'b1}}} : {Rs, {P.NE{1'b1}}, {P.NF{1'b0}}};
+      // underflow: signed zero, with the lsb set when Plus1 and Frm[1] are set and the operation is not a divide by infinity
+      assign UfRes = {Rs, {P.FLEN-2{1'b0}}, Plus1&Frm[1]&~(DivOp&YInf)};
+      assign NormRes = {Rs, Re, Rf};
+
+  end else if (P.FPSIZES == 2) begin
+      // OutFmt set selects the P.NE/P.NF wide format; OutFmt clear selects the P.NE1/P.NF1 format,
+      // which occupies the low P.LEN1 bits with the upper P.FLEN-P.LEN1 bits filled with ones
+      if(P.IEEE754) begin
+          assign XNaNRes = OutFmt ? {1'b0, {P.NE{1'b1}}, 1'b1, Xm[P.NF-2:0]} : {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.NF1]};
+          assign YNaNRes = OutFmt ? {1'b0, {P.NE{1'b1}}, 1'b1, Ym[P.NF-2:0]} : {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.NF1]};
+          assign InvalidRes = OutFmt ? {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}} : {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, (P.NF1-1)'(0)};
+      end else begin 
+          assign InvalidRes = OutFmt ? {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}} : {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, (P.NF1-1)'(0)};
+      end
+
+      always_comb
+          if(OutFmt)
+              if(OfResMax)    OfRes = {Rs, {P.NE-1{1'b1}}, 1'b0, {P.NF{1'b1}}};
+              else            OfRes = {Rs, {P.NE{1'b1}}, {P.NF{1'b0}}};
+          else
+              if(OfResMax)    OfRes = {{P.FLEN-P.LEN1{1'b1}}, Rs, {P.NE1-1{1'b1}}, 1'b0, {P.NF1{1'b1}}};
+              else            OfRes = {{P.FLEN-P.LEN1{1'b1}}, Rs, {P.NE1{1'b1}}, (P.NF1)'(0)};
+      assign UfRes = OutFmt ? {Rs, (P.FLEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)} : {{P.FLEN-P.LEN1{1'b1}}, Rs, (P.LEN1-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+      // the narrower result keeps the low exponent bits and the top P.NF1 fraction bits
+      assign NormRes = OutFmt ? {Rs, Re, Rf} : {{P.FLEN-P.LEN1{1'b1}}, Rs, Re[P.NE1-1:0], Rf[P.NF-1:P.NF-P.NF1]};
+
+  end else if (P.FPSIZES == 3) begin
+      // one case item per supported format; the default item drives zeros
+      always_comb
+          case (OutFmt)
+              P.FMT: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Xm[P.NF-2:0]};
+                      YNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Ym[P.NF-2:0]};
+                      InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+                  end else begin 
+                      InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+                  end
+                  
+                  OfRes = OfResMax ? {Rs, {P.NE-1{1'b1}}, 1'b0, {P.NF{1'b1}}} : {Rs, {P.NE{1'b1}}, {P.NF{1'b0}}};
+                  UfRes = {Rs, (P.FLEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {Rs, Re, Rf};
+              end
+              P.FMT1: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.NF1]};
+                      YNaNRes = {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.NF1]};
+                      InvalidRes = {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, (P.NF1-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, (P.NF1-1)'(0)};
+                  end
+                  OfRes = OfResMax ? {{P.FLEN-P.LEN1{1'b1}}, Rs, {P.NE1-1{1'b1}}, 1'b0, {P.NF1{1'b1}}} : {{P.FLEN-P.LEN1{1'b1}}, Rs, {P.NE1{1'b1}}, (P.NF1)'(0)};
+                  UfRes = {{P.FLEN-P.LEN1{1'b1}}, Rs, (P.LEN1-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.LEN1{1'b1}}, Rs, Re[P.NE1-1:0], Rf[P.NF-1:P.NF-P.NF1]};
+              end
+              P.FMT2: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.LEN2{1'b1}}, 1'b0, {P.NE2{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.NF2]};
+                      YNaNRes = {{P.FLEN-P.LEN2{1'b1}}, 1'b0, {P.NE2{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.NF2]};
+                      InvalidRes = {{P.FLEN-P.LEN2{1'b1}}, 1'b0, {P.NE2{1'b1}}, 1'b1, (P.NF2-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.LEN2{1'b1}}, 1'b0, {P.NE2{1'b1}}, 1'b1, (P.NF2-1)'(0)};
+                  end
+                  
+                  OfRes = OfResMax ? {{P.FLEN-P.LEN2{1'b1}}, Rs, {P.NE2-1{1'b1}}, 1'b0, {P.NF2{1'b1}}} : {{P.FLEN-P.LEN2{1'b1}}, Rs, {P.NE2{1'b1}}, (P.NF2)'(0)};
+                  UfRes = {{P.FLEN-P.LEN2{1'b1}}, Rs, (P.LEN2-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.LEN2{1'b1}}, Rs, Re[P.NE2-1:0], Rf[P.NF-1:P.NF-P.NF2]};
+              end
+              default: begin
+                  if(P.IEEE754) begin
+                      XNaNRes = (P.FLEN)'(0);
+                      YNaNRes = (P.FLEN)'(0);
+                      InvalidRes = (P.FLEN)'(0);
+                  end else begin 
+                      InvalidRes = (P.FLEN)'(0);
+                  end
+                  OfRes = (P.FLEN)'(0);
+                  UfRes = (P.FLEN)'(0);
+                  NormRes = (P.FLEN)'(0);
+              end
+          endcase
+
+  end else if (P.FPSIZES == 4) begin 
+      // all four OutFmt encodings are listed, so no default item is needed
+      always_comb
+          case (OutFmt)
+              2'h3: begin  // full width format (P.NE / P.NF)
+                  if(P.IEEE754) begin
+                      XNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Xm[P.NF-2:0]};
+                      YNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Ym[P.NF-2:0]};
+                      InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+                  end else begin 
+                      InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+                  end
+                  
+                  OfRes = OfResMax ? {Rs, {P.NE-1{1'b1}}, 1'b0, {P.NF{1'b1}}} : {Rs, {P.NE{1'b1}}, {P.NF{1'b0}}};
+                  UfRes = {Rs, (P.FLEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {Rs, Re, Rf};
+              end
+              2'h1: begin  // format sized by the P.D_* parameters
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.D_LEN{1'b1}}, 1'b0, {P.D_NE{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.D_NF]};
+                      YNaNRes = {{P.FLEN-P.D_LEN{1'b1}}, 1'b0, {P.D_NE{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.D_NF]};
+                      InvalidRes = {{P.FLEN-P.D_LEN{1'b1}}, 1'b0, {P.D_NE{1'b1}}, 1'b1, (P.D_NF-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.D_LEN{1'b1}}, 1'b0, {P.D_NE{1'b1}}, 1'b1, (P.D_NF-1)'(0)};
+                  end
+                  OfRes = OfResMax ? {{P.FLEN-P.D_LEN{1'b1}}, Rs, {P.D_NE-1{1'b1}}, 1'b0, {P.D_NF{1'b1}}} : {{P.FLEN-P.D_LEN{1'b1}}, Rs, {P.D_NE{1'b1}}, (P.D_NF)'(0)};
+                  UfRes = {{P.FLEN-P.D_LEN{1'b1}}, Rs, (P.D_LEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.D_LEN{1'b1}}, Rs, Re[P.D_NE-1:0], Rf[P.NF-1:P.NF-P.D_NF]};
+              end
+              2'h0: begin  // format sized by the P.S_* parameters
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.S_LEN{1'b1}}, 1'b0, {P.S_NE{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.S_NF]};
+                      YNaNRes = {{P.FLEN-P.S_LEN{1'b1}}, 1'b0, {P.S_NE{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.S_NF]};
+                      InvalidRes = {{P.FLEN-P.S_LEN{1'b1}}, 1'b0, {P.S_NE{1'b1}}, 1'b1, (P.S_NF-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.S_LEN{1'b1}}, 1'b0, {P.S_NE{1'b1}}, 1'b1, (P.S_NF-1)'(0)};
+                  end
+                  
+                  OfRes = OfResMax ? {{P.FLEN-P.S_LEN{1'b1}}, Rs, {P.S_NE-1{1'b1}}, 1'b0, {P.S_NF{1'b1}}} : {{P.FLEN-P.S_LEN{1'b1}}, Rs, {P.S_NE{1'b1}}, (P.S_NF)'(0)};
+                  UfRes = {{P.FLEN-P.S_LEN{1'b1}}, Rs, (P.S_LEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.S_LEN{1'b1}}, Rs, Re[P.S_NE-1:0], Rf[P.NF-1:P.NF-P.S_NF]};
+              end
+              2'h2: begin  // format sized by the P.H_* parameters
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.H_LEN{1'b1}}, 1'b0, {P.H_NE{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.H_NF]};
+                      YNaNRes = {{P.FLEN-P.H_LEN{1'b1}}, 1'b0, {P.H_NE{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.H_NF]};
+                      InvalidRes = {{P.FLEN-P.H_LEN{1'b1}}, 1'b0, {P.H_NE{1'b1}}, 1'b1, (P.H_NF-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.H_LEN{1'b1}}, 1'b0, {P.H_NE{1'b1}}, 1'b1, (P.H_NF-1)'(0)};
+                  end
+                  
+                  OfRes = OfResMax ? {{P.FLEN-P.H_LEN{1'b1}}, Rs, {P.H_NE-1{1'b1}}, 1'b0, {P.H_NF{1'b1}}} : {{P.FLEN-P.H_LEN{1'b1}}, Rs, {P.H_NE{1'b1}}, (P.H_NF)'(0)};      
+                // zero is exact if dividing by infinity so don't add 1
+                  UfRes = {{P.FLEN-P.H_LEN{1'b1}}, Rs, (P.H_LEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.H_LEN{1'b1}}, Rs, Re[P.H_NE-1:0], Rf[P.NF-1:P.NF-P.H_NF]};
+              end
+          endcase
+  end
+
+  // determine if you should kill the result (select UfRes instead of NormRes)
+  //      - do so if the top bit of the full exponent is set
+  //        NOTE(review): presumably this bit marks a negative (underflowed) exponent - confirm against the rounding module
+  //      - or, for a divsqrt operation, if Y is infinite while X is not, or if X is zero
+  assign KillRes = FullRe[P.NE+1] | (((YInf&~XInf)|XZero)&DivOp);//Underflow & ~ResSubnorm & (Re!=1);
+  
+  // calculate if the overflow result should be selected
+  //      - also selected for a divide by zero and for an infinite input, except a divsqrt operation whose Y is infinite
+  assign SelOfRes = Overflow|DivByZero|(InfIn&~(YInf&DivOp));
+  
+  // output infinity with result sign if divide by zero
+  //      - priority with P.IEEE754: X NaN, Y NaN, invalid, overflow result, underflow result, normal result
+  //      - without P.IEEE754 any NaN input or invalid operation selects InvalidRes
+  if(P.IEEE754)
+    always_comb
+      if(XNaN)                    PostProcRes = XNaNRes;
+      else if(YNaN)               PostProcRes = YNaNRes;
+      else if(Invalid)            PostProcRes = InvalidRes;
+      else if(SelOfRes)           PostProcRes = OfRes;
+      else if(KillRes)            PostProcRes = UfRes;
+      else                        PostProcRes = NormRes;
+  else
+    always_comb
+      if(NaNIn|Invalid)           PostProcRes = InvalidRes;
+      else if(SelOfRes)           PostProcRes = OfRes;
+      else if(KillRes)            PostProcRes = UfRes;
+      else                        PostProcRes = NormRes;
+
+endmodule
--- a/src/fpu/divremsqrt/drsu.sv
+++ b/src/fpu/divremsqrt/drsu.sv
@ -0,0 +1,102 @@
+///////////////////////////////////////////
+// drsu.sv
+//
+// Written: kekim@hmc.edu
+// Modified:19 May 2023
+//
+// Purpose: Combined Divide and Square Root Floating Point and Integer Unit with postprocessing
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+module drsu import cvw::*;  #(parameter cvw_t P) (
+  input  logic                clk, 
+  input  logic                reset, 
+  input  logic [P.FMTBITS-1:0] FmtE,
+  input  logic                XsE, YsE,
+  input  logic [P.NF:0]        XmE, YmE,
+  input  logic [P.NE-1:0]      XeE, YeE,
+  input  logic                XInfE, YInfE, 
+  input  logic                XZeroE, YZeroE, 
+  input  logic                XNaNE, YNaNE, 
+  input  logic                XSNaNE, YSNaNE,
+  input  logic                FDivStartE, IDivStartE,
+  input  logic                StallM,
+  input  logic                FlushE,
+  input  logic                SqrtE, SqrtM,
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+  input  logic [2:0]          Funct3E, Funct3M,
+  input  logic                IntDivE, W64E,
+  input  logic [2:0]          Frm,
+  input  logic [3:0]          OpCtrl,
+  input  logic [1:0]          PostProcSel,
+  output logic                FDivBusyE, IFDivStartE, FDivDoneE,
+  output logic [P.FLEN-1:0]    FResM,
+  output logic [P.XLEN-1:0]    FIntDivResultM,
+  output logic [4:0]          FlgM
+);
+
+  // Floating-point division and square root module, with optional integer division and remainder
+  // Computes X/Y, sqrt(X), A/B, or A%B
+  //   - divremsqrt supplies the handshake outputs (FDivBusyE, IFDivStartE, FDivDoneE) and FIntDivResultM
+  //   - divremsqrtpostprocess supplies the floating-point result FResM and the flags FlgM
+
+  // Signals shared by the divremsqrt and divremsqrtpostprocess instances
+  logic                       IntDivM;                      // Integer operation
+  logic [P.DIVb:0]             UmM;                          // full-width result from divremsqrt
+  logic [P.NF+2:0]             UmMexact; //U1.NF+2
+  logic [P.NE+1:0]             UeM;                          // result exponent, connected to DivUe of the postprocessor
+  logic                       DivStickyM;                   // sticky bit, connected to DivSticky of the postprocessor
+  logic [P.INTDIVb+3:0]          PreResultM;
+  logic [P.XLEN-1:0]          PreIntResultM;
+  logic [P.DIVBLEN-1:0]       IntNormShiftM;
+
+  divremsqrt #(P) divremsqrt(.clk, .reset, .XsE, .FmtE, .XmE, .YmE, 
+            .XeE, .YeE, .SqrtE, .SqrtM,
+                    .XInfE, .YInfE, .XZeroE, .YZeroE, 
+            .XNaNE, .YNaNE, 
+                    .FDivStartE, .IDivStartE, .W64E,
+                    .StallM, .DivStickyM, .FDivBusyE, .UeM,
+                    .UmM,
+                    .FlushE, .ForwardedSrcAE, .ForwardedSrcBE, .Funct3M,
+                    .Funct3E, .IntDivE, .FIntDivResultM, .IntDivM,
+                    .FDivDoneE, .IFDivStartE, .IntNormShiftM, .PreIntResultM, .PreResultM);
+  // only the upper bits of the result are handed to the postprocessor
+  assign UmMexact = UmM[P.DIVb:P.DIVb-(P.NF+3-1)]; // grabbing top 1+(NF+2) msbs
+  divremsqrtpostprocess #(P) divremsqrtpostprocess(.Xs(XsE), .Ys(YsE), .Xm(XmE), .Ym(YmE), .Frm(Frm), .Fmt(FmtE), .OpCtrl, .IntDivM,
+    .XZero(XZeroE), .YZero(YZeroE), .XInf(XInfE), .YInf(YInfE), .XNaN(XNaNE), .YNaN(YNaNE), .XSNaN(XSNaNE), 
+    .YSNaN(YSNaNE), .PostProcSel,.DivSticky(DivStickyM), .DivUe(UeM), .DivUm(UmMexact), .PostProcRes(FResM), .PostProcFlg(FlgM),
+    .PreIntResultM, .PreResultM, .IntNormShiftM);
+endmodule
+
--- a/src/fpu/divremsqrt/intrightshift.sv
+++ b/src/fpu/divremsqrt/intrightshift.sv
@ -0,0 +1,37 @@
+///////////////////////////////////////////
+// intrightshift.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
+// Modified:13 January 2022
+//
+// Purpose: Variable right shifter for the integer divide/remainder datapath
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// intrightshift: purely combinational variable right shifter.
+//   shiftin  - (P.INTDIVb+4)-bit value to be shifted
+//   shiftamt - P.DIVBLEN-bit shift distance
+//   shifted  - shiftin moved right by shiftamt bit positions
+module intrightshift import cvw::*;  #(parameter cvw_t P) (
+  input logic signed [P.INTDIVb+3:0] shiftin,
+  input logic [P.DIVBLEN-1:0] shiftamt,
+  output logic signed [P.INTDIVb+3:0] shifted
+);
+  // NOTE(review): ">>" is a logical shift in SystemVerilog even when the operand is declared
+  // signed, so zeros (not copies of the sign bit) are shifted in here. If sign extension of a
+  // negative value is intended, ">>>" would be needed -- confirm against the consumer of "shifted".
+  assign shifted = shiftin >> shiftamt;
+
+endmodule
--- a/testbench/common/wallyTracer.sv
+++ b/testbench/common/wallyTracer.sv
@ -169,12 +169,17 @@ module wallyTracer import cvw::*; #(parameter cvw_t P) (rvviTrace rvvi);
 	  CSRArray[12'h143] = testbench.dut.core.priv.priv.csr.csrs.csrs.STVAL_REGW;
 	  CSRArray[12'h142] = testbench.dut.core.priv.priv.csr.csrs.csrs.SCAUSE_REGW;
 	  CSRArray[12'h144] = testbench.dut.core.priv.priv.csr.csrm.MIP_REGW & 12'h222 & testbench.dut.core.priv.priv.csr.csrm.MIDELEG_REGW;
-	  CSRArray[12'h14D] = testbench.dut.core.priv.priv.csr.csrs.csrs.STIMECMP_REGW;
+	  CSRArray[12'h14D] = testbench.dut.core.priv.priv.csr.csrs.csrs.STIMECMP_REGW[P.XLEN-1:0];
 	  // user CSRs
 	  CSRArray[12'h001] = testbench.dut.core.priv.priv.csr.csru.csru.FFLAGS_REGW;
 	  CSRArray[12'h002] = testbench.dut.core.priv.priv.csr.csru.csru.FRM_REGW;
 	  CSRArray[12'h003] = {testbench.dut.core.priv.priv.csr.csru.csru.FRM_REGW, testbench.dut.core.priv.priv.csr.csru.csru.FFLAGS_REGW};
 	
+    if (P.XLEN == 32) begin
+      CSRArray[12'h310] = testbench.dut.core.priv.priv.csr.csrsr.MSTATUSH_REGW;
+      CSRArray[12'h31A] = testbench.dut.core.priv.priv.csr.csrm.MENVCFGH_REGW;
+      CSRArray[12'h15D] = testbench.dut.core.priv.priv.csr.csrs.csrs.STIMECMP_REGW[63:32];
+    end
 	end else begin // hold the old value if the pipeline is stalled.

      // PMP CFG 3A0 to 3AF
--- a/testbench/testbench-fp.sv
+++ b/testbench/testbench-fp.sv
--- a/testbench/testbench.sv
+++ b/testbench/testbench.sv
@ -762,7 +762,7 @@ end
    void'(rvviRefConfigSetString(IDV_CONFIG_MODEL_VENDOR,            "riscv.ovpworld.org"));
    void'(rvviRefConfigSetString(IDV_CONFIG_MODEL_NAME,              "riscv"));
    void'(rvviRefConfigSetString(IDV_CONFIG_MODEL_VARIANT,           "RV64GCK"));
-    void'(rvviRefConfigSetInt(IDV_CONFIG_MODEL_ADDRESS_BUS_WIDTH,     56));
+    void'(rvviRefConfigSetInt(IDV_CONFIG_MODEL_ADDRESS_BUS_WIDTH,     XLEN==64 ? 56 : 34));
    void'(rvviRefConfigSetInt(IDV_CONFIG_MAX_NET_LATENCY_RETIREMENTS, 6));

    if(elffilename == "buildroot") filename = "";    
@ -824,15 +824,25 @@ end
    void'(rvviRefCsrSetVolatile(0, 32'hC02));   // INSTRET
    void'(rvviRefCsrSetVolatile(0, 32'hB02));   // MINSTRET
    void'(rvviRefCsrSetVolatile(0, 32'hC01));   // TIME
-    
+    if (P.XLEN == 32) begin
+      void'(rvviRefCsrSetVolatile(0, 32'hC80));   // CYCLEH
+      void'(rvviRefCsrSetVolatile(0, 32'hB80));   // MCYCLEH
+      void'(rvviRefCsrSetVolatile(0, 32'hC82));   // INSTRETH
+      void'(rvviRefCsrSetVolatile(0, 32'hB82));   // MINSTRETH
+      void'(rvviRefCsrSetVolatile(0, 32'hC81));   // TIMEH 
+    end
    // User HPMCOUNTER3 - HPMCOUNTER31
    for (iter='hC03; iter<='hC1F; iter++) begin
      void'(rvviRefCsrSetVolatile(0, iter));   // HPMCOUNTERx
+      if (P.XLEN == 32) 
+        void'(rvviRefCsrSetVolatile(0, iter+128));   // HPMCOUNTERxH
    end       
    
    // Machine MHPMCOUNTER3 - MHPMCOUNTER31
    for (iter='hB03; iter<='hB1F; iter++) begin
      void'(rvviRefCsrSetVolatile(0, iter));   // MHPMCOUNTERx
+      if (P.XLEN == 32) 
+        void'(rvviRefCsrSetVolatile(0, iter+128));   // MHPMCOUNTERxH
    end       
    
    // cannot predict this register due to latency between
--- a/testbench/testbench_fp.sv
+++ b/testbench/testbench_fp.sv
--- a/testbench/tests-fp.vh
+++ b/testbench/tests-fp.vh
@ -0,0 +1,639 @@
+//////////////////////////////////////////
+// tests-fp.vh
+//
+// Written: Katherine Parry 2022
+// Modified: 
+//
+// Purpose: List of floating-point tests to apply
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021-3 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Directory holding the test-vector files named in the lists below
+// (presumably prepended to each file name by the testbench -- confirm there).
+`define PATH "../../tests/fp/vectors/"
+
+// 4-bit operation-control codes. Codes repeat between groups (e.g. FMA, DIV and TO_UI are
+// all 4'b0000), so a code is only meaningful together with one of the *UNIT selectors below.
+// Add / multiply / subtract / fused multiply-add
+`define ADD_OPCTRL     4'b0110
+`define MUL_OPCTRL     4'b0100
+`define SUB_OPCTRL     4'b0111
+`define FMA_OPCTRL     4'b0000
+// Floating-point divide and square root
+`define DIV_OPCTRL     4'b0000
+`define SQRT_OPCTRL    4'b0001
+// Comparisons
+`define LE_OPCTRL      4'b0011
+`define LT_OPCTRL      4'b0001
+`define EQ_OPCTRL      4'b0010
+// Float <-> integer conversions (UI/I = 32-bit unsigned/signed, UL/L = 64-bit unsigned/signed,
+// judging by the matching ui32/i32/ui64/i64 vector file names -- confirm in the testbench)
+`define TO_UI_OPCTRL   4'b0000
+`define TO_I_OPCTRL    4'b0001
+`define TO_UL_OPCTRL   4'b0010
+`define TO_L_OPCTRL    4'b0011
+`define FROM_UI_OPCTRL 4'b0100
+`define FROM_I_OPCTRL  4'b0101
+`define FROM_UL_OPCTRL 4'b0110
+`define FROM_L_OPCTRL  4'b0111
+// Integer divide / remainder operations; every code in this group has bit 3 set
+`define INTREMU_OPCTRL 4'b1001
+`define INTREM_OPCTRL  4'b1010
+`define INTDIV_OPCTRL  4'b1011
+`define INTDIVW_OPCTRL 4'b1100
+`define INTDIVU_OPCTRL 4'b1101
+`define INTREMW_OPCTRL 4'b1110
+`define INTREMUW_OPCTRL 4'b1111
+`define INTDIVUW_OPCTRL 4'b1000
+// 3-bit rounding-mode codes; the names match the _rne/_rz/_ru/_rd/_rnm vector file suffixes
+`define RNE            3'b000
+`define RZ             3'b001
+`define RU             3'b011
+`define RD             3'b010
+`define RNM            3'b100
+// Numeric identifiers for the unit a batch of vectors is aimed at
+`define FMAUNIT        2
+`define DIVUNIT        1
+`define CVTINTUNIT     0
+`define CVTFPUNIT      4
+`define CMPUNIT        3
+`define DIVREMSQRTUNIT 5
+`define INTDIVUNIT     6
+
+// Integer <-> floating-point conversion vector files.
+// Array names are f<fp width>rv<int width>cvtint. Each list holds unsigned-to-float,
+// signed-to-float, float-to-unsigned and float-to-signed files, one per rounding-mode suffix
+// (rne, rz, ru, rd, rnm).
+
+// half precision <-> 32-bit integers
+string f16rv32cvtint[] = '{
+	"ui32_to_f16_rne.tv",
+	"ui32_to_f16_rz.tv",
+	"ui32_to_f16_ru.tv",
+	"ui32_to_f16_rd.tv",
+	"ui32_to_f16_rnm.tv",
+	"i32_to_f16_rne.tv",
+	"i32_to_f16_rz.tv",
+	"i32_to_f16_ru.tv",
+	"i32_to_f16_rd.tv",
+	"i32_to_f16_rnm.tv",
+	"f16_to_ui32_rne.tv",
+	"f16_to_ui32_rz.tv",
+	"f16_to_ui32_ru.tv",
+	"f16_to_ui32_rd.tv",
+	"f16_to_ui32_rnm.tv",
+	"f16_to_i32_rne.tv",
+	"f16_to_i32_rz.tv",
+	"f16_to_i32_ru.tv",
+	"f16_to_i32_rd.tv",
+	"f16_to_i32_rnm.tv"
+};
+
+// half precision <-> 64-bit integers
+string f16rv64cvtint[] = '{
+	"ui64_to_f16_rne.tv",
+	"ui64_to_f16_rz.tv",
+	"ui64_to_f16_ru.tv",
+	"ui64_to_f16_rd.tv",
+	"ui64_to_f16_rnm.tv",
+	"i64_to_f16_rne.tv",
+	"i64_to_f16_rz.tv",
+	"i64_to_f16_ru.tv",
+	"i64_to_f16_rd.tv",
+	"i64_to_f16_rnm.tv",
+	"f16_to_ui64_rne.tv",
+	"f16_to_ui64_rz.tv",
+	"f16_to_ui64_ru.tv",
+	"f16_to_ui64_rd.tv",
+	"f16_to_ui64_rnm.tv",
+	"f16_to_i64_rne.tv",
+	"f16_to_i64_rz.tv",
+	"f16_to_i64_ru.tv",
+	"f16_to_i64_rd.tv",
+	"f16_to_i64_rnm.tv"
+};
+
+// single precision <-> 32-bit integers
+string f32rv32cvtint[] = '{
+	"ui32_to_f32_rne.tv",
+	"ui32_to_f32_rz.tv",
+	"ui32_to_f32_ru.tv",
+	"ui32_to_f32_rd.tv",
+	"ui32_to_f32_rnm.tv",
+	"i32_to_f32_rne.tv",
+	"i32_to_f32_rz.tv",
+	"i32_to_f32_ru.tv",
+	"i32_to_f32_rd.tv",
+	"i32_to_f32_rnm.tv",
+	"f32_to_ui32_rne.tv",
+	"f32_to_ui32_rz.tv",
+	"f32_to_ui32_ru.tv",
+	"f32_to_ui32_rd.tv",
+	"f32_to_ui32_rnm.tv",
+	"f32_to_i32_rne.tv",
+	"f32_to_i32_rz.tv",
+	"f32_to_i32_ru.tv",
+	"f32_to_i32_rd.tv",
+	"f32_to_i32_rnm.tv"
+};
+
+// single precision <-> 64-bit integers
+string f32rv64cvtint[] = '{
+	"ui64_to_f32_rne.tv",
+	"ui64_to_f32_rz.tv",
+	"ui64_to_f32_ru.tv",
+	"ui64_to_f32_rd.tv",
+	"ui64_to_f32_rnm.tv",
+	"i64_to_f32_rne.tv",
+	"i64_to_f32_rz.tv",
+	"i64_to_f32_ru.tv",
+	"i64_to_f32_rd.tv",
+	"i64_to_f32_rnm.tv",
+	"f32_to_ui64_rne.tv",
+	"f32_to_ui64_rz.tv",
+	"f32_to_ui64_ru.tv",
+	"f32_to_ui64_rd.tv",
+	"f32_to_ui64_rnm.tv",
+	"f32_to_i64_rne.tv",
+	"f32_to_i64_rz.tv",
+	"f32_to_i64_ru.tv",
+	"f32_to_i64_rd.tv",
+	"f32_to_i64_rnm.tv"
+};
+
+
+// double precision <-> 32-bit integers
+string f64rv32cvtint[] = '{
+	"ui32_to_f64_rne.tv",
+	"ui32_to_f64_rz.tv",
+	"ui32_to_f64_ru.tv",
+	"ui32_to_f64_rd.tv",
+	"ui32_to_f64_rnm.tv",
+	"i32_to_f64_rne.tv",
+	"i32_to_f64_rz.tv",
+	"i32_to_f64_ru.tv",
+	"i32_to_f64_rd.tv",
+	"i32_to_f64_rnm.tv",
+	"f64_to_ui32_rne.tv",
+	"f64_to_ui32_rz.tv",
+	"f64_to_ui32_ru.tv",
+	"f64_to_ui32_rd.tv",
+	"f64_to_ui32_rnm.tv",
+	"f64_to_i32_rne.tv",
+	"f64_to_i32_rz.tv",
+	"f64_to_i32_ru.tv",
+	"f64_to_i32_rd.tv",
+	"f64_to_i32_rnm.tv"
+};
+
+// double precision <-> 64-bit integers
+string f64rv64cvtint[] = '{
+	"ui64_to_f64_rne.tv",
+	"ui64_to_f64_rz.tv",
+	"ui64_to_f64_ru.tv",
+	"ui64_to_f64_rd.tv",
+	"ui64_to_f64_rnm.tv",
+	"i64_to_f64_rne.tv",
+	"i64_to_f64_rz.tv",
+	"i64_to_f64_ru.tv",
+	"i64_to_f64_rd.tv",
+	"i64_to_f64_rnm.tv",
+	"f64_to_ui64_rne.tv",
+	"f64_to_ui64_rz.tv",
+	"f64_to_ui64_ru.tv",
+	"f64_to_ui64_rd.tv",
+	"f64_to_ui64_rnm.tv",
+	"f64_to_i64_rne.tv",
+	"f64_to_i64_rz.tv",
+	"f64_to_i64_ru.tv",
+	"f64_to_i64_rd.tv",
+	"f64_to_i64_rnm.tv"
+};
+
+// quad precision <-> 64-bit integers
+string f128rv64cvtint[] = '{
+	"ui64_to_f128_rne.tv",
+	"ui64_to_f128_rz.tv",
+	"ui64_to_f128_ru.tv",
+	"ui64_to_f128_rd.tv",
+	"ui64_to_f128_rnm.tv",
+	"i64_to_f128_rne.tv",
+	"i64_to_f128_rz.tv",
+	"i64_to_f128_ru.tv",
+	"i64_to_f128_rd.tv",
+	"i64_to_f128_rnm.tv",
+	"f128_to_ui64_rne.tv",
+	"f128_to_ui64_rz.tv",
+	"f128_to_ui64_ru.tv",
+	"f128_to_ui64_rd.tv",
+	"f128_to_ui64_rnm.tv",
+	"f128_to_i64_rne.tv",
+	"f128_to_i64_rz.tv",
+	"f128_to_i64_ru.tv",
+	"f128_to_i64_rd.tv",
+	"f128_to_i64_rnm.tv"
+};
+
+// quad precision <-> 32-bit integers
+string f128rv32cvtint[] = '{
+	"ui32_to_f128_rne.tv",
+	"ui32_to_f128_rz.tv",
+	"ui32_to_f128_ru.tv",
+	"ui32_to_f128_rd.tv",
+	"ui32_to_f128_rnm.tv",
+	"i32_to_f128_rne.tv",
+	"i32_to_f128_rz.tv",
+	"i32_to_f128_ru.tv",
+	"i32_to_f128_rd.tv",
+	"i32_to_f128_rnm.tv",
+	"f128_to_ui32_rne.tv",
+	"f128_to_ui32_rz.tv",
+	"f128_to_ui32_ru.tv",
+	"f128_to_ui32_rd.tv",
+	"f128_to_ui32_rnm.tv",
+	"f128_to_i32_rne.tv",
+	"f128_to_i32_rz.tv",
+	"f128_to_i32_ru.tv",
+	"f128_to_i32_rd.tv",
+	"f128_to_i32_rnm.tv"
+};
+
+// Floating-point <-> floating-point conversion vector files.
+// Each f<wide>f<narrow>cvt list holds the narrowing files first, then the widening files,
+// one per rounding-mode suffix (rne, rz, ru, rd, rnm).
+
+// single <-> half
+string f32f16cvt[] = '{
+	"f32_to_f16_rne.tv",
+	"f32_to_f16_rz.tv",
+	"f32_to_f16_ru.tv",
+	"f32_to_f16_rd.tv",
+	"f32_to_f16_rnm.tv",
+	"f16_to_f32_rne.tv",
+	"f16_to_f32_rz.tv",
+	"f16_to_f32_ru.tv",
+	"f16_to_f32_rd.tv",
+	"f16_to_f32_rnm.tv"
+};
+
+// double <-> half
+string f64f16cvt[] = '{
+	"f64_to_f16_rne.tv",
+	"f64_to_f16_rz.tv",
+	"f64_to_f16_ru.tv",
+	"f64_to_f16_rd.tv",
+	"f64_to_f16_rnm.tv",
+	"f16_to_f64_rne.tv",
+	"f16_to_f64_rz.tv",
+	"f16_to_f64_ru.tv",
+	"f16_to_f64_rd.tv",
+	"f16_to_f64_rnm.tv"
+};
+
+// quad <-> half
+string f128f16cvt[] = '{
+	"f128_to_f16_rne.tv",
+	"f128_to_f16_rz.tv",
+	"f128_to_f16_ru.tv",
+	"f128_to_f16_rd.tv",
+	"f128_to_f16_rnm.tv",
+	"f16_to_f128_rne.tv",
+	"f16_to_f128_rz.tv",
+	"f16_to_f128_ru.tv",
+	"f16_to_f128_rd.tv",
+	"f16_to_f128_rnm.tv"
+};
+
+// double <-> single
+string f64f32cvt[] = '{
+	"f64_to_f32_rne.tv",
+	"f64_to_f32_rz.tv",
+	"f64_to_f32_ru.tv",
+	"f64_to_f32_rd.tv",
+	"f64_to_f32_rnm.tv",
+	"f32_to_f64_rne.tv",
+	"f32_to_f64_rz.tv",
+	"f32_to_f64_ru.tv",
+	"f32_to_f64_rd.tv",
+	"f32_to_f64_rnm.tv"
+};
+
+// quad <-> single
+string f128f32cvt[] = '{
+	"f128_to_f32_rne.tv",
+	"f128_to_f32_rz.tv",
+	"f128_to_f32_ru.tv",
+	"f128_to_f32_rd.tv",
+	"f128_to_f32_rnm.tv",
+	"f32_to_f128_rne.tv",
+	"f32_to_f128_rz.tv",
+	"f32_to_f128_ru.tv",
+	"f32_to_f128_rd.tv",
+	"f32_to_f128_rnm.tv"
+};
+
+// quad <-> double
+string f128f64cvt[] = '{
+	"f128_to_f64_rne.tv",
+	"f128_to_f64_rz.tv",
+	"f128_to_f64_ru.tv",
+	"f128_to_f64_rd.tv",
+	"f128_to_f64_rnm.tv",
+	"f64_to_f128_rne.tv",
+	"f64_to_f128_rz.tv",
+	"f64_to_f128_ru.tv",
+	"f64_to_f128_rd.tv",
+	"f64_to_f128_rnm.tv"
+};
+
+// Add, subtract and multiply vector files: one list per precision (f16/f32/f64/f128),
+// one file per rounding-mode suffix (rne, rz, ru, rd, rnm).
+
+// addition
+string f16add[] = '{
+	"f16_add_rne.tv",
+	"f16_add_rz.tv",
+	"f16_add_ru.tv",
+	"f16_add_rd.tv",
+	"f16_add_rnm.tv"
+};
+
+string f32add[] = '{
+	"f32_add_rne.tv",
+	"f32_add_rz.tv",
+	"f32_add_ru.tv",
+	"f32_add_rd.tv",
+	"f32_add_rnm.tv"
+};
+
+string f64add[] = '{
+	"f64_add_rne.tv",
+	"f64_add_rz.tv",
+	"f64_add_ru.tv",
+	"f64_add_rd.tv",
+	"f64_add_rnm.tv"
+};
+
+string f128add[] = '{
+	"f128_add_rne.tv",
+	"f128_add_rz.tv",
+	"f128_add_ru.tv",
+	"f128_add_rd.tv",
+	"f128_add_rnm.tv"
+};
+
+// subtraction
+string f16sub[] = '{
+	"f16_sub_rne.tv",
+	"f16_sub_rz.tv",
+	"f16_sub_ru.tv",
+	"f16_sub_rd.tv",
+	"f16_sub_rnm.tv"
+};
+
+string f32sub[] = '{
+	"f32_sub_rne.tv",
+	"f32_sub_rz.tv",
+	"f32_sub_ru.tv",
+	"f32_sub_rd.tv",
+	"f32_sub_rnm.tv"
+};
+
+string f64sub[] = '{
+	"f64_sub_rne.tv",
+	"f64_sub_rz.tv",
+	"f64_sub_ru.tv",
+	"f64_sub_rd.tv",
+	"f64_sub_rnm.tv"
+};
+
+string f128sub[] = '{
+	"f128_sub_rne.tv",
+	"f128_sub_rz.tv",
+	"f128_sub_ru.tv",
+	"f128_sub_rd.tv",
+	"f128_sub_rnm.tv"
+};
+
+// multiplication
+string f16mul[] = '{
+	"f16_mul_rne.tv",
+	"f16_mul_rz.tv",
+	"f16_mul_ru.tv",
+	"f16_mul_rd.tv",
+	"f16_mul_rnm.tv"
+};
+
+string f32mul[] = '{
+	"f32_mul_rne.tv",
+	"f32_mul_rz.tv",
+	"f32_mul_ru.tv",
+	"f32_mul_rd.tv",
+	"f32_mul_rnm.tv"
+};
+
+string f64mul[] = '{
+	"f64_mul_rne.tv",
+	"f64_mul_rz.tv",
+	"f64_mul_ru.tv",
+	"f64_mul_rd.tv",
+	"f64_mul_rnm.tv"
+};
+
+string f128mul[] = '{
+	"f128_mul_rne.tv",
+	"f128_mul_rz.tv",
+	"f128_mul_ru.tv",
+	"f128_mul_rd.tv",
+	"f128_mul_rnm.tv"
+};
+
+// Floating-point divide and square-root vector files: one list per precision
+// (f16/f32/f64/f128), one file per rounding-mode suffix (rne, rz, ru, rd, rnm).
+
+// division
+string f16div[] = '{
+	"f16_div_rne.tv",
+	"f16_div_rz.tv",
+	"f16_div_ru.tv",
+	"f16_div_rd.tv",
+	"f16_div_rnm.tv"
+};
+
+string f32div[] = '{
+	"f32_div_rne.tv",
+	"f32_div_rz.tv",
+	"f32_div_ru.tv",
+	"f32_div_rd.tv",
+	"f32_div_rnm.tv"
+};
+
+string f64div[] = '{
+	"f64_div_rne.tv",
+	"f64_div_rz.tv",
+	"f64_div_ru.tv",
+	"f64_div_rd.tv",
+	"f64_div_rnm.tv"
+};
+
+string f128div[] = '{
+	"f128_div_rne.tv",
+	"f128_div_rz.tv",
+	"f128_div_ru.tv",
+	"f128_div_rd.tv",
+	"f128_div_rnm.tv"
+};
+
+// square root
+string f16sqrt[] = '{
+	"f16_sqrt_rne.tv",
+	"f16_sqrt_rz.tv",
+	"f16_sqrt_ru.tv",
+	"f16_sqrt_rd.tv",
+	"f16_sqrt_rnm.tv"
+};
+
+string f32sqrt[] = '{
+	"f32_sqrt_rne.tv",
+	"f32_sqrt_rz.tv",
+	"f32_sqrt_ru.tv",
+	"f32_sqrt_rd.tv",
+	"f32_sqrt_rnm.tv"
+};
+
+string f64sqrt[] = '{
+	"f64_sqrt_rne.tv",
+	"f64_sqrt_rz.tv",
+	"f64_sqrt_ru.tv",
+	"f64_sqrt_rd.tv",
+	"f64_sqrt_rnm.tv"
+};
+
+string f128sqrt[] = '{
+	"f128_sqrt_rne.tv",
+	"f128_sqrt_rz.tv",
+	"f128_sqrt_ru.tv",
+	"f128_sqrt_rd.tv",
+	"f128_sqrt_rnm.tv"
+};
+
+// Comparison vector files: one list per precision, holding the eq, le and lt files
+// (in that order), each with the five rounding-mode suffixes (rne, rz, ru, rd, rnm).
+
+string f16cmp[] = '{
+	"f16_eq_rne.tv",
+	"f16_eq_rz.tv",
+	"f16_eq_ru.tv",
+	"f16_eq_rd.tv",
+	"f16_eq_rnm.tv",
+	"f16_le_rne.tv",
+	"f16_le_rz.tv",
+	"f16_le_ru.tv",
+	"f16_le_rd.tv",
+	"f16_le_rnm.tv",
+	"f16_lt_rne.tv",
+	"f16_lt_rz.tv",
+	"f16_lt_ru.tv",
+	"f16_lt_rd.tv",
+	"f16_lt_rnm.tv"
+};
+
+string f32cmp[] = '{
+	"f32_eq_rne.tv",
+	"f32_eq_rz.tv",
+	"f32_eq_ru.tv",
+	"f32_eq_rd.tv",
+	"f32_eq_rnm.tv",
+	"f32_le_rne.tv",
+	"f32_le_rz.tv",
+	"f32_le_ru.tv",
+	"f32_le_rd.tv",
+	"f32_le_rnm.tv",
+	"f32_lt_rne.tv",
+	"f32_lt_rz.tv",
+	"f32_lt_ru.tv",
+	"f32_lt_rd.tv",
+	"f32_lt_rnm.tv"
+};
+
+string f64cmp[] = '{
+	"f64_eq_rne.tv",
+	"f64_eq_rz.tv",
+	"f64_eq_ru.tv",
+	"f64_eq_rd.tv",
+	"f64_eq_rnm.tv",
+	"f64_le_rne.tv",
+	"f64_le_rz.tv",
+	"f64_le_ru.tv",
+	"f64_le_rd.tv",
+	"f64_le_rnm.tv",
+	"f64_lt_rne.tv",
+	"f64_lt_rz.tv",
+	"f64_lt_ru.tv",
+	"f64_lt_rd.tv",
+	"f64_lt_rnm.tv"
+};
+
+string f128cmp[] = '{
+	"f128_eq_rne.tv",
+	"f128_eq_rz.tv",
+	"f128_eq_ru.tv",
+	"f128_eq_rd.tv",
+	"f128_eq_rnm.tv",
+	"f128_le_rne.tv",
+	"f128_le_rz.tv",
+	"f128_le_ru.tv",
+	"f128_le_rd.tv",
+	"f128_le_rnm.tv",
+	"f128_lt_rne.tv",
+	"f128_lt_rz.tv",
+	"f128_lt_ru.tv",
+	"f128_lt_rd.tv",
+	"f128_lt_rnm.tv"
+};
+
+// Fused multiply-add (mulAdd) vector files: one list per precision (f16/f32/f64/f128),
+// one file per rounding-mode suffix (rne, rz, ru, rd, rnm).
+
+string f16fma[] = '{
+	"f16_mulAdd_rne.tv",
+	"f16_mulAdd_rz.tv",
+	"f16_mulAdd_ru.tv",
+	"f16_mulAdd_rd.tv",
+	"f16_mulAdd_rnm.tv"
+};
+
+string f32fma[] = '{
+	"f32_mulAdd_rne.tv",
+	"f32_mulAdd_rz.tv",
+	"f32_mulAdd_ru.tv",
+	"f32_mulAdd_rd.tv",
+	"f32_mulAdd_rnm.tv"
+};
+
+string f64fma[] = '{
+	"f64_mulAdd_rne.tv",
+	"f64_mulAdd_rz.tv",
+	"f64_mulAdd_ru.tv",
+	"f64_mulAdd_rd.tv",
+	"f64_mulAdd_rnm.tv"
+};
+
+string f128fma[] = '{
+	"f128_mulAdd_rne.tv",
+	"f128_mulAdd_rz.tv",
+	"f128_mulAdd_ru.tv",
+	"f128_mulAdd_rd.tv",
+	"f128_mulAdd_rnm.tv"
+};
+
+// Integer divide / remainder vector files, one file per operation.
+// 64-bit lists: rem, div, remu, divu plus the word (w) variants remw, remuw, divuw, divw.
+
+string int64rem[] = '{
+	"cvw_64_rem-01.tv"
+};
+
+string int64div[] = '{
+	"cvw_64_div-01.tv"
+};
+
+string int64remu[] = '{
+	"cvw_64_remu-01.tv"
+};
+
+string int64divu[] = '{
+	"cvw_64_divu-01.tv"
+};
+
+string int64remw[] = '{
+	"cvw_64_remw-01.tv"
+};
+
+string int64remuw[] = '{
+	"cvw_64_remuw-01.tv"
+};
+
+string int64divuw[] = '{
+	"cvw_64_divuw-01.tv"
+};
+
+string int64divw[] = '{
+	"cvw_64_divw-01.tv"
+};
+
+// 32-bit lists: only rem, div, remu and divu are provided (no word variants here).
+string int32rem[] = '{
+	"cvw_32_rem-01.tv"
+};
+
+string int32div[] = '{
+	"cvw_32_div-01.tv"
+};
+
+string int32remu[] = '{
+	"cvw_32_remu-01.tv"
+};
+
+string int32divu[] = '{
+	"cvw_32_divu-01.tv"
+};
--- a/tests/custom/spitest/Makefile
+++ b/tests/custom/spitest/Makefile
@ -0,0 +1,112 @@
+# Source-file extensions recognised by this Makefile and the regex that matches any of them.
+CEXT		:= c
+CPPEXT		:= cpp
+AEXT		:= s
+SEXT		:= S
+SRCEXT 		:= \([$(CEXT)$(AEXT)$(SEXT)]\|$(CPPEXT)\)
+OBJEXT		:= o
+DEPEXT		:= d
+SRCDIR		:= .
+BUILDDIR	:= OBJ
+
+# Every matching source under SRCDIR (SOURCES may be overridden from the environment or the
+# command line because of ?=); each source maps to an object of the same name under BUILDDIR.
+SOURCES		?= $(shell find $(SRCDIR) -type f -regex ".*\.$(SRCEXT)" | sort)
+OBJECTS		:= $(SOURCES:.$(CEXT)=.$(OBJEXT))
+OBJECTS		:= $(OBJECTS:.$(AEXT)=.$(OBJEXT))
+OBJECTS		:= $(OBJECTS:.$(SEXT)=.$(OBJEXT))
+OBJECTS		:= $(OBJECTS:.$(CPPEXT)=.$(OBJEXT))
+OBJECTS		:= $(patsubst $(SRCDIR)/%,$(BUILDDIR)/%,$(OBJECTS))
+
+# Final ELF location; ROOT is the parent directory, where the linker script is expected.
+TARGETDIR	:= bin
+TARGET		:= $(TARGETDIR)/spitest.elf
+ROOT		:= ..
+LIBRARY_DIRS	:= 
+LIBRARY_FILES	:=
+
+# Target ISA / ABI and the flags shared by the assemble, compile and link steps.
+MARCH           :=-march=rv64imfdc
+MABI            :=-mabi=lp64d
+LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles
+LINKER		:=$(ROOT)/linker8000-0000.x
+
+
+AFLAGS =$(MARCH) $(MABI) -W
+CFLAGS =$(MARCH) $(MABI) -mcmodel=medany  -O2
+# Cross toolchain; only CC is referenced by the rules visible in this Makefile.
+AS=riscv64-unknown-elf-as
+CC=riscv64-unknown-elf-gcc
+AR=riscv64-unknown-elf-ar
+
+
+# These targets name actions, not files. Declaring them phony keeps a stray file or directory
+# called "all", "clean", ... from making the target look up to date.
+.PHONY: all remake directories clean
+
+#Default Make: create the output directories, then build the memfile (and the ELF it needs)
+all: directories  $(TARGET).memfile
+
+#Remake: wipe all build products and rebuild from scratch
+remake: clean all
+
+#Make the Directories that hold the ELF and the object files
+directories:
+	@mkdir -p $(TARGETDIR)
+	@mkdir -p $(BUILDDIR)
+
+# Remove build directories and generated dump/memory files
+clean:
+	rm -rf $(BUILDDIR) $(TARGETDIR) *.memfile *.objdump
+
+
+#Needed for building additional library projects
+# Only active when LIBRARY_DIRS is non-empty: adds -L/-l/-I flags and builds each library
+# directory with a recursive make.
+ifdef LIBRARY_DIRS
+LIBS+=${LIBRARY_DIRS:%=-L%}  ${LIBRARY_FILES:%=-l%}
+INC+=${LIBRARY_DIRS:%=-I%}
+
+# $(MAKE) rather than a literal "make" so the sub-make uses the same make program and
+# inherits command-line flags (-n, -k, ...) from the parent invocation.
+${LIBRARY_DIRS}:
+	$(MAKE) -C $@ -j 1
+
+.PHONY: $(LIBRARY_DIRS) $(TARGET)
+endif
+
+
+#Pull in dependency info for *existing* .o files
+# (the leading '-' makes a missing .d file a non-error, e.g. on the first build)
+-include $(OBJECTS:.$(OBJEXT)=.$(DEPEXT))
+
+#Link all objects into the ELF using the linker script named by LINKER
+$(TARGET): $(OBJECTS) $(LIBRARY_DIRS)
+	$(CC) $(LINK_FLAGS) -g -o $(TARGET) $(OBJECTS) ${LIBS} -T ${LINKER}
+
+
+#Compile C sources.
+# The compiler's stdout is captured in <name>.list. The dependency file from -MM is then
+# rewritten twice: the first sed retargets the rule at the object under BUILDDIR, the second
+# appends every prerequisite as an empty rule of its own so that make does not fail when a
+# listed header is later deleted.
+$(BUILDDIR)/%.$(OBJEXT): $(SRCDIR)/%.$(CEXT)
+	@mkdir -p $(dir $@)
+	$(CC) $(CFLAGS) $(INC) -c -o $@ $< > $(BUILDDIR)/$*.list
+	@$(CC) $(CFLAGS) $(INC) -MM $(SRCDIR)/$*.$(CEXT) > $(BUILDDIR)/$*.$(DEPEXT)
+	@cp -f $(BUILDDIR)/$*.$(DEPEXT) $(BUILDDIR)/$*.$(DEPEXT).tmp
+	@sed -e 's|.*:|$(BUILDDIR)/$*.$(OBJEXT):|' < $(BUILDDIR)/$*.$(DEPEXT).tmp > $(BUILDDIR)/$*.$(DEPEXT)
+	@sed -e 's/.*://' -e 's/\\$$//' < $(BUILDDIR)/$*.$(DEPEXT).tmp | fmt -1 | sed -e 's/^ *//' -e 's/$$/:/' >> $(BUILDDIR)/$*.$(DEPEXT)
+	@rm -f $(BUILDDIR)/$*.$(DEPEXT).tmp
+
+# gcc won't output dependencies for assembly files for some reason
+# most asm files don't have dependencies so the echo will work for now.
+# Lower-case .s sources: built with CC and CFLAGS (INC is not passed here).
+$(BUILDDIR)/%.$(OBJEXT): $(SRCDIR)/%.$(AEXT)
+	@mkdir -p $(dir $@)
+	$(CC) $(CFLAGS) -c -o $@ $< > $(BUILDDIR)/$*.list
+	@echo $@: $< > $(BUILDDIR)/$*.$(DEPEXT)
+
+# Upper-case .S sources: same as above but with the include paths in INC.
+$(BUILDDIR)/%.$(OBJEXT): $(SRCDIR)/%.$(SEXT)
+	@mkdir -p $(dir $@)
+	$(CC) $(CFLAGS) $(INC) -c -o $@ $< > $(BUILDDIR)/$*.list
+	@echo $@: $< > $(BUILDDIR)/$*.$(DEPEXT)
+
+# C++ sources: same recipe and dependency post-processing as the C rule
+$(BUILDDIR)/%.$(OBJEXT): $(SRCDIR)/%.$(CPPEXT)
+	@mkdir -p $(dir $@)
+	$(CC) $(CFLAGS) $(INC) -c -o $@ $< > $(BUILDDIR)/$*.list
+	@$(CC) $(CFLAGS) $(INC) -MM $(SRCDIR)/$*.$(CPPEXT) > $(BUILDDIR)/$*.$(DEPEXT)
+	@cp -f $(BUILDDIR)/$*.$(DEPEXT) $(BUILDDIR)/$*.$(DEPEXT).tmp
+	@sed -e 's|.*:|$(BUILDDIR)/$*.$(OBJEXT):|' < $(BUILDDIR)/$*.$(DEPEXT).tmp > $(BUILDDIR)/$*.$(DEPEXT)
+	@sed -e 's/.*://' -e 's/\\$$//' < $(BUILDDIR)/$*.$(DEPEXT).tmp | fmt -1 | sed -e 's/^ *//' -e 's/$$/:/' >> $(BUILDDIR)/$*.$(DEPEXT)
+	@rm -f $(BUILDDIR)/$*.$(DEPEXT).tmp
+
+# convert to hex
+# From the linked ELF: write a disassembly next to it, convert it to a 64-bit-wide hex memory
+# file, post-process the disassembly with extractFunctionRadix.sh (must be on PATH), and copy
+# everything in TARGETDIR to ../work/.
+$(TARGET).memfile: $(TARGET)
+	@echo 'Making object dump file.'
+	@riscv64-unknown-elf-objdump -D $< > $<.objdump
+	@echo 'Making memory file'
+	riscv64-unknown-elf-elf2hex --bit-width 64 --input $^ --output $@
+	extractFunctionRadix.sh $<.objdump
+	mkdir -p ../work/
+	cp -f $(TARGETDIR)/* ../work/
--- a/tests/custom/spitest/spi.h
+++ b/tests/custom/spitest/spi.h
@ -0,0 +1,116 @@
+///////////////////////////////////////////////////////////////////////
+// spi.h
+//
+// Written: Jacob Pease jacob.pease@okstate.edu 7/22/2024
+//
+// Purpose: Header file for interfacing with the SPI peripheral
+//
+// 
+//
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the
+// “License”); you may not use this file except in compliance with the
+// License, or, at your option, the Apache License version 2.0. You
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work
+// distributed under the License is distributed on an “AS IS” BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#pragma once
+#ifndef SPI_HEADER
+#define SPI_HEADER
+
+#include <stdint.h>
+
+#define SPI_BASE              0x13000 /* Base address of SPI device used for SDC */
+
+/* Register addresses (base + offset).
+ * Each expansion is parenthesized so the macro behaves as a single operand inside a larger
+ * expression, e.g. a cast, a multiplication or a comparison. */
+#define SPI_SCKDIV            (SPI_BASE + 0x00) /* Serial clock divisor */
+#define SPI_SCKMODE           (SPI_BASE + 0x04) /* Serial clock mode */
+#define SPI_CSID              (SPI_BASE + 0x10) /* Chip select ID */
+#define SPI_CSDEF             (SPI_BASE + 0x14) /* Chip select default */
+#define SPI_CSMODE            (SPI_BASE + 0x18) /* Chip select mode */
+#define SPI_DELAY0            (SPI_BASE + 0x28) /* Delay control 0 */
+#define SPI_DELAY1            (SPI_BASE + 0x2c) /* Delay control 1 */
+#define SPI_FMT               (SPI_BASE + 0x40) /* Frame format */
+#define SPI_TXDATA            (SPI_BASE + 0x48) /* Tx FIFO data */
+#define SPI_RXDATA            (SPI_BASE + 0x4c) /* Rx FIFO data */
+#define SPI_TXMARK            (SPI_BASE + 0x50) /* Tx FIFO watermark */
+#define SPI_RXMARK            (SPI_BASE + 0x54) /* Rx FIFO watermark */
+
+/* Non-implemented
+#define SPI_FCTRL             (SPI_BASE + 0x60) // SPI flash interface control
+#define SPI_FFMT              (SPI_BASE + 0x64) // SPI flash instruction format
+*/
+#define SPI_IE                (SPI_BASE + 0x70) /* Interrupt Enable Register */
+#define SPI_IP                (SPI_BASE + 0x74) /* Interrupt Pendings Register */
+
+/* delay0 bits: CSSCK in bits 7:0, SCKCS in bits 23:16 */
+#define SIFIVE_SPI_DELAY0_CSSCK(x)       ((uint32_t)(x))
+#define SIFIVE_SPI_DELAY0_CSSCK_MASK     0xffU
+#define SIFIVE_SPI_DELAY0_SCKCS(x)       ((uint32_t)(x) << 16)
+#define SIFIVE_SPI_DELAY0_SCKCS_MASK     (0xffU << 16)
+
+/* delay1 bits: INTERCS in bits 7:0, INTERXFR in bits 23:16 */
+#define SIFIVE_SPI_DELAY1_INTERCS(x)     ((uint32_t)(x))
+#define SIFIVE_SPI_DELAY1_INTERCS_MASK   0xffU
+#define SIFIVE_SPI_DELAY1_INTERXFR(x)    ((uint32_t)(x) << 16)
+#define SIFIVE_SPI_DELAY1_INTERXFR_MASK  (0xffU << 16)
+
+/* csmode bits */
+#define SIFIVE_SPI_CSMODE_MODE_AUTO      0U
+#define SIFIVE_SPI_CSMODE_MODE_HOLD      2U
+#define SIFIVE_SPI_CSMODE_MODE_OFF       3U
+
+// Functions implemented in the accompanying C file. The register helpers
+// (write_reg, read_reg, spi_sendbyte, waittx, waitrx, spi_readbyte) are defined
+// as static inline functions further down in this header.
+uint8_t spi_txrx(uint8_t byte);
+uint8_t spi_dummy(void);
+// Takes the input clock frequency, matching the definition and its caller.
+void spi_init(uint32_t clkin);
+void spi_set_clock(uint32_t clkin, uint32_t clkout);
+
+// Store a 32-bit value to the memory-mapped register at addr.
+// The access goes through a volatile pointer so the compiler cannot drop or merge it.
+static inline void write_reg(uintptr_t addr, uint32_t value) {
+  *(volatile uint32_t *) addr = value;
+}
+
+// Load and return the 32-bit value of the memory-mapped register at addr
+// (volatile access, so every call performs a real read).
+static inline uint32_t read_reg(uintptr_t addr) {
+  volatile uint32_t * reg = (volatile uint32_t *) addr;
+  return *reg;
+}
+
+// Push one byte into the transmit FIFO by writing it (zero-extended to 32 bits)
+// to the TXDATA register. Does not wait for the transfer to finish.
+static inline void spi_sendbyte(uint8_t byte) {
+  const uint32_t word = byte;
+  write_reg(SPI_TXDATA, word);
+}
+
+// Busy-wait until bit 0 of the interrupt-pending register reads as 1
+// (presumably the transmit-watermark flag -- confirm against the SPI register map).
+static inline void waittx() {
+  while ((read_reg(SPI_IP) & 1) == 0) {
+  }
+}
+
+// Busy-wait until bit 1 of the interrupt-pending register reads as 1, i.e. until the
+// receive side reports data (presumably the receive-watermark flag -- confirm against the
+// SPI register map).
+// The loop previously spun while the bit was SET, which returns immediately when nothing has
+// been received and hangs once data is pending -- the opposite of waittx() and of the
+// receive polling loop used by the test program.
+static inline void waitrx() {
+  while(!(read_reg(SPI_IP) & 2)) {}
+}
+
+// Read the RXDATA register once and return its low 8 bits.
+static inline uint8_t spi_readbyte() {
+  const uint32_t raw = read_reg(SPI_RXDATA);
+  return (uint8_t) raw;
+}
+
+#endif
--- a/tests/custom/spitest/spitest.c
+++ b/tests/custom/spitest/spitest.c
@ -0,0 +1,107 @@
+///////////////////////////////////////////////////////////////////////
+// spitest.c
+//
+// Written: Jacob Pease jacob.pease@okstate.edu 8/27/2024
+//
+// Purpose: C code to test SPI bugs
+//
+// 
+//
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the
+// “License”); you may not use this file except in compliance with the
+// License, or, at your option, the Apache License version 2.0. You
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work
+// distributed under the License is distributed on an “AS IS” BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied. See the License for the specific language governing
+// permissions and limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "spi.h"
+
+// Testing SPI peripheral in loopback mode
+// TODO: Need to make sure the configuration I'm using uses loopback
+//       mode. This can be specified in derivlists.txt
+// TODO:
+
+// Send one byte and return the byte read back from the receive FIFO.
+// NOTE(review): the wait is on the transmit flag (waittx), not on a receive flag, before
+// RXDATA is read -- confirm that this guarantees the received byte is already available.
+uint8_t spi_txrx(uint8_t byte) {
+  uint8_t received;
+
+  spi_sendbyte(byte);
+  waittx();
+  received = spi_readbyte();
+  return received;
+}
+
+// Clock out a filler byte (0xff) and return whatever byte is received in exchange.
+uint8_t spi_dummy() {
+  const uint8_t filler = 0xff;
+  return spi_txrx(filler);
+}
+
+// Program the serial clock divisor so that clkout is approximately clkin / (2 * (div + 1)),
+// i.e. div = clkin / (2 * clkout) - 1.
+//   clkin  - input clock frequency
+//   clkout - requested serial clock frequency (same unit as clkin)
+// Edge cases:
+//   clkout == 0        : the request is meaningless; the divisor register is left untouched
+//                        instead of dividing by zero.
+//   clkin < 2 * clkout : the fastest setting (div = 0) is used; the subtraction would
+//                        otherwise wrap around to 0xFFFFFFFF.
+void spi_set_clock(uint32_t clkin, uint32_t clkout) {
+  if (clkout == 0) {
+    return;
+  }
+
+  // 64-bit arithmetic so that 2 * clkout cannot overflow for large clkout values.
+  uint64_t ratio = (uint64_t) clkin / (2 * (uint64_t) clkout);
+  uint32_t div = (ratio == 0) ? 0 : (uint32_t) (ratio - 1);
+  write_reg(SPI_SCKDIV, div);
+}
+
+// Initialize Sifive FU540 based SPI Controller
+// Writes the interrupt-enable, watermark, delay and clock-divisor registers.
+// NOTE(review): clkin is never read in this function; the divisor written at the end is the
+// constant 0x18 regardless of the argument.
+void spi_init(uint32_t clkin) {
+  // Enable interrupts (both low bits of the interrupt-enable register)
+  write_reg(SPI_IE, 0x3);
+
+  // Set TXMARK to 1. If the number of entries is < 1
+  // IP's txwm field will go high.
+  // Set RXMARK to 0. If the number of entries is > 0
+  // IP's rxwm field will go high.
+  write_reg(SPI_TXMARK, 1);
+  write_reg(SPI_RXMARK, 0);
+
+  // Set Delay 0 to default
+  write_reg(SPI_DELAY0,
+            SIFIVE_SPI_DELAY0_CSSCK(1) |
+			SIFIVE_SPI_DELAY0_SCKCS(1));
+
+  // Set Delay 1 to default
+  write_reg(SPI_DELAY1,
+            SIFIVE_SPI_DELAY1_INTERCS(1) |
+            SIFIVE_SPI_DELAY1_INTERXFR(0));
+
+  // Initialize the SPI controller clock to
+  // div = (20MHz/(2*400kHz)) - 1 = 24 = 0x18
+  // NOTE(review): this assumes a 20 MHz input clock, which does not match the 100 MHz value
+  // the caller passes as clkin -- confirm which is intended.
+  write_reg(SPI_SCKDIV, 0x18); 
+}
+
+// Test program: configure the SPI controller, then run 4 bursts. Each burst queues 8 bytes
+// (0xaa, 0xab, ... 0xb1) and stores the 8 bytes received in exchange to consecutive memory
+// locations starting at 0x8F000000. Chip select is held for the whole run.
+void main() {
+  spi_init(100000000);
+  spi_set_clock(100000000,50000000);
+
+  // Destination for received bytes; volatile so every store is actually performed.
+  volatile uint8_t *dst = (uint8_t *)(0x8F000000);
+
+  write_reg(SPI_CSMODE, SIFIVE_SPI_CSMODE_MODE_HOLD);
+
+  for (uint64_t burst = 4; burst > 0; burst--) {
+    // Queue 8 bytes back to back (the transmit fifo should be empty at this point).
+    for (int tx = 0; tx < 8; tx++) {
+      spi_sendbyte(0xaa + tx);
+    }
+
+    // Collect the 8 replies one at a time, as soon as each becomes available.
+    for (int rx = 0; rx < 8; rx++) {
+      // Spin until bit 1 of the interrupt-pending register is set.
+      while ((read_reg(SPI_IP) & 2) == 0) {}
+      *dst = spi_readbyte();
+      dst++;
+    }
+  }
+
+  write_reg(SPI_CSMODE, SIFIVE_SPI_CSMODE_MODE_AUTO);
+}
--- a/tests/custom/spitest/start.s
+++ b/tests/custom/spitest/start.s
@ -0,0 +1,59 @@
+# Program entry point, placed in the .init section.
+# Sets up gp, clears the integer registers, sets the stack pointer, calls main and then _halt.
+.section .init
+.global _start
+.type _start, @function
+
+_start:
+	  # Initialize global pointer
+	  # (relaxation is switched off around the sequence that loads gp itself)
+	.option push
+	.option norelax
+	1:auipc gp, %pcrel_hi(__global_pointer$)
+	addi  gp, gp, %pcrel_lo(1b)
+	.option pop
+	
+	# Clear x1..x31, skipping x3 (gp), which was just initialized.
+	# x2 (sp) is cleared here as well and given its real value further down.
+	li x1, 0
+	li x2, 0
+	li x4, 0
+	li x5, 0
+	li x6, 0
+	li x7, 0
+	li x8, 0
+	li x9, 0
+	li x10, 0
+	li x11, 0
+	li x12, 0
+	li x13, 0
+	li x14, 0
+	li x15, 0
+	li x16, 0
+	li x17, 0
+	li x18, 0
+	li x19, 0
+	li x20, 0
+	li x21, 0
+	li x22, 0
+	li x23, 0
+	li x24, 0
+	li x25, 0
+	li x26, 0
+	li x27, 0
+	li x28, 0
+	li x29, 0
+	li x30, 0
+	li x31, 0
+
+
+
+	# set the stack pointer to the top of memory - 8 bytes (pointer size)
+	li sp, 0x87FFFFF8
+
+	# Run the C program; when main returns, fall into the halt routine.
+	jal ra, main
+	jal ra, _halt
+
+# _halt: end-of-program routine. Loads gp = 1 and a0 = 0, executes ecall, and repeats forever.
+# NOTE(review): presumably the simulation environment treats this ecall (with gp = 1, a0 = 0)
+# as the "test finished" signal -- confirm against the testbench.
+.section .text
+.global _halt
+.type _halt, @function
+_halt:
+	li gp, 1
+	li a0, 0
+	ecall
+	j _halt
--- a/tests/fp/combined_IF_vectors/create_IF_vectors.sh
+++ b/tests/fp/combined_IF_vectors/create_IF_vectors.sh
@ -1,5 +1,7 @@
 #!/bin/sh
 # create test vectors for stand alone int

+mkdir IF_vectors
 ./extract_testfloat_vectors.py
 ./extract_arch_vectors.py
+cp IF_vectors/*  ../vectors