began porting over divremsqrt

2025-02-11 06:05:49 +00:00 · 2024-08-27 17:07:35 -07:00 · 2024-08-27 17:07:35 -07:00 · fda6305d1c
commit fda6305d1c
parent 0ce289c937
23 changed files with 4119 additions and 0 deletions
--- a/bin/regression-wally-intdiv
+++ b/bin/regression-wally-intdiv
@ -0,0 +1,561 @@
+#!/usr/bin/python3
+##################################
+#
+# regression-wally
+# David_Harris@Hmc.edu 25 January 2021
+# Modified by Jarred Allen <jaallen@g.hmc.edu>
+#
+# Run a regression with multiple configurations in parallel and exit with
+# non-zero status code if an error happened, as well as printing human-readable
+# output.
+#
+##################################
+import sys,os,shutil
+import multiprocessing
+
+
+
+class bcolors:
+    """Terminal escape sequences used to color and style the regression output."""
+    # Only OKGREEN, FAIL and ENDC are referenced by the reporting code in this
+    # script; ENDC is emitted after each colored message.
+    HEADER = '\033[95m'
+    OKBLUE = '\033[94m'
+    OKCYAN = '\033[96m'
+    OKGREEN = '\033[92m'
+    WARNING = '\033[93m'
+    FAIL = '\033[91m'
+    ENDC = '\033[0m'
+    BOLD = '\033[1m'
+    UNDERLINE = '\033[4m'
+
+from collections import namedtuple
+regressionDir = os.path.dirname(os.path.abspath(__file__))
+os.chdir(regressionDir)
+
+coverage = '-coverage' in sys.argv
+fp = '-fp' in sys.argv
+nightly = '-nightly' in sys.argv
+softfloat = '-softfloat' in sys.argv
+intdiv = '-intdiv' in sys.argv
+
+TestCase = namedtuple("TestCase", ['name', 'variant', 'cmd', 'grepstr'])
+# name:     the name of this test configuration (used in printing human-readable
+#           output and picking logfile names)
+# variant:  the configuration variant the test runs on (combined with name in
+#           the human-readable output and in the logfile name)
+# cmd:      the command to run to test (should include the logfile as '{}', and
+#           the command needs to write to that file)
+# grepstr:  the string to grep through the log file for. The test succeeds iff
+#           grep finds that string in the logfile (is used by grep, so it may
+#           be any pattern grep accepts, see `man 1 grep` for more info).
+
+# edit this list to add more test cases
+# Nightly runs start from an empty list; every other mode starts with the lint check.
+nightMode = "-nightly" if nightly else ""
+configs = [] if nightly else [
+    TestCase(
+        name="lints",
+        variant="all",
+        cmd="./lint-wally " + nightMode + " | tee {}",
+        grepstr="lints run with no errors or warnings"
+    )
+]
+
+def getBuildrootTC(boot):
+    """Build the buildroot TestCase (variant rv64gc).
+
+    boot true:  run wally.do with buildroot-no-trace and grep the log for the
+                login prompt.
+    boot false: run wally-batch.do, passing INSTR_LIMIT (plus -coverage when the
+                global coverage flag is set), and grep the log for the
+                "<INSTR_LIMIT> instructions" message.
+    """
+    INSTR_LIMIT = 1000000 # multiple of 100000; 4M is interesting because it gets into the kernel and enabling VM
+    MAX_EXPECTED = 246000000 # *** TODO: replace this with a search for the login prompt.
+    if boot:
+        return  TestCase(
+            "buildrootboot",
+            variant="rv64gc",
+            cmd="vsim > {} -c <<!\ndo wally.do buildroot buildroot-no-trace $RISCV 0 1 0\n!",
+            grepstr="WallyHostname login:")
+
+    limit = str(INSTR_LIMIT)
+    if coverage:
+        print( "buildroot coverage")
+        batchCmd = "vsim > {} -c <<!\ndo wally-batch.do buildroot buildroot $RISCV "+limit+" 1 0 -coverage\n!"
+    else:
+        print( "buildroot no coverage")
+        batchCmd = "vsim > {} -c <<!\ndo wally-batch.do buildroot buildroot configOptions -GINSTR_LIMIT=" +limit + " \n!"
+    return  TestCase("buildroot",variant="rv64gc",cmd=batchCmd,grepstr=limit+" instructions")
+
+tests64gcimperas = ["imperas64i", "imperas64f", "imperas64d", "imperas64m", "imperas64c"] # unused
+
+tests64i = ["arch64i"] 
+for test in tests64i:
+  tc = TestCase(
+        name=test,
+        variant="rv64i",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv64i "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+tests32gcimperas = ["imperas32i", "imperas32f", "imperas32m", "imperas32c"] # unused
+tests32gc = ["arch32f", "arch32d", "arch32f_fma", "arch32d_fma", "arch32f_divsqrt", "arch32d_divsqrt", 
+             "arch32i", "arch32priv", "arch32c",  "arch32m", "arch32a", "arch32zifencei", "arch32zicond", 
+             "arch32zba", "arch32zbb", "arch32zbc", "arch32zbs", "arch32zfh", "arch32zfh_fma", 
+             "arch32zfh_divsqrt", "arch32zfaf", "wally32a", "wally32priv", "wally32periph", 
+             "arch32zbkb", "arch32zbkc", "arch32zbkx", "arch32zknd", "arch32zkne", "arch32zknh"]  # "arch32zbc", "arch32zfad",
+#tests32gc = ["arch32f", "arch32d", "arch32f_fma", "arch32d_fma", "arch32i", "arch32priv", "arch32c",  "arch32m", "arch32a", "arch32zifencei", "arch32zba", "arch32zbb", "arch32zbc", "arch32zbs", "arch32zicboz", "arch32zcb", "wally32a",  "wally32priv", "wally32periph"]  
+for test in tests32gc:
+  tc = TestCase(
+        name=test,
+        variant="rv32gc",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv32gc "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+tests32imcimperas = ["imperas32i", "imperas32c"] # unused
+tests32imc = ["arch32i", "arch32c", "arch32m", "wally32periph"] 
+for test in tests32imc:
+  tc = TestCase(
+        name=test,
+        variant="rv32imc",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv32imc "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+tests32i = ["arch32i"] 
+for test in tests32i:
+  tc = TestCase(
+        name=test,
+        variant="rv32i",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv32i "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+
+tests32e = ["arch32e"] 
+for test in tests32e:
+  tc = TestCase(
+        name=test,
+        variant="rv32e",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv32e "+test+"\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+tests64gc = ["arch64f", "arch64d", "arch64f_fma", "arch64d_fma", "arch64f_divsqrt", "arch64d_divsqrt", "arch64i", "arch64zba", "arch64zbb", "arch64zbc", "arch64zbs",  "arch64zfh", "arch64zfh_divsqrt", "arch64zfh_fma", "arch64zfaf", "arch64zfad", "arch64zbkb", "arch64zbkc", "arch64zbkx", "arch64zknd", "arch64zkne", "arch64zknh",
+             "arch64priv", "arch64c",  "arch64m", "arch64a", "arch64zifencei", "arch64zicond", "wally64a", "wally64periph", "wally64priv"] # add arch64zfh_fma when available; arch64zicobz, arch64zcb when working
+#tests64gc = ["arch64f", "arch64d", "arch64f_fma", "arch64d_fma", "arch64i", "arch64zba", "arch64zbb", "arch64zbc", "arch64zbs", 
+#             "arch64priv", "arch64c",  "arch64m", "arch64a", "arch64zifencei", "wally64a", "wally64periph", "wally64priv", "arch64zicboz", "arch64zcb"] 
+# Extra argument appended to every rv64gc command below
+coverStr = '-coverage' if coverage else ''
+if coverage:  # delete all but 64gc tests when running coverage
+    configs = []
+    tests64gc = ["coverage64gc", "arch64i", "arch64priv", "arch64c",  "arch64m",
+                 "arch64zifencei", "arch64zicond", "arch64a", "wally64a", "wally64periph", "wally64priv", 
+                 "arch64zba",  "arch64zbb",  "arch64zbc", "arch64zbs"] # add when working: "arch64zcb", "arch64zicboz"
+    if fp:
+        # floating-point suites are only added to the coverage run on request
+        tests64gc.extend([
+            "arch64f",
+            "arch64d",
+            "arch64zfh",
+            "arch64f_fma",
+            "arch64d_fma",
+            "arch64zfh_fma",
+            "arch64f_divsqrt",
+            "arch64d_divsqrt",
+            "arch64zfh_divsqrt",
+            "arch64zfaf",
+            "arch64zfad",
+        ])
+for test in tests64gc:
+  tc = TestCase(
+        name=test,
+        variant="rv64gc",
+        cmd="vsim > {} -c <<!\ndo wally-batch.do rv64gc "+test+" " + coverStr + "\n!",
+        grepstr="All tests ran without failures")
+  configs.append(tc)
+
+# run derivative configurations if requested  
+if (nightly):
+    derivconfigtests = [
+        ["tlb2_rv32gc", ["wally32priv"]],
+        ["tlb16_rv32gc", ["wally32priv"]],
+        ["tlb2_rv64gc", ["wally64priv"]],
+        ["tlb16_rv64gc", ["wally64priv"]],
+        ["way_1_4096_512_rv32gc", ["arch32i"]],
+        ["way_2_4096_512_rv32gc", ["arch32i"]],
+        ["way_8_4096_512_rv32gc", ["arch32i"]],
+        ["way_4_2048_512_rv32gc", ["arch32i"]],
+        ["way_4_4096_256_rv32gc", ["arch32i"]],
+        ["way_1_4096_512_rv64gc", ["arch64i"]],
+        ["way_2_4096_512_rv64gc", ["arch64i"]],
+        ["way_8_4096_512_rv64gc", ["arch64i"]],
+        ["way_4_2048_512_rv64gc", ["arch64i"]],
+        ["way_4_4096_256_rv64gc", ["arch64i"]],
+        ["way_4_4096_1024_rv64gc", ["arch64i"]],
+
+        ["ram_0_0_rv64gc", ["ahb64"]],
+        ["ram_1_0_rv64gc", ["ahb64"]],
+        ["ram_1_1_rv64gc", ["ahb64"]],
+        ["ram_2_0_rv64gc", ["ahb64"]],
+        ["ram_2_1_rv64gc", ["ahb64"]],
+        
+        ["noicache_rv32gc", ["ahb32"]],
+# cacheless designs will not work until DTIM supports FLEN > XLEN
+#        ["nodcache_rv32gc", ["ahb32"]],
+#        ["nocache_rv32gc", ["ahb32"]],
+        ["noicache_rv64gc", ["ahb64"]],
+        ["nodcache_rv64gc", ["ahb64"]],
+        ["nocache_rv64gc", ["ahb64"]],
+
+        ### add misaligned tests
+
+        ["div_2_1_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_1i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_2_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_2i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_4_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_4i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_1_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_1i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_2_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_2i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_4_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_4i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_1_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_1i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_1_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_1i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+
+        ### branch predictor simulation
+
+        # ["bpred_TWOBIT_6_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_8_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_10_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],        
+        # ["bpred_TWOBIT_12_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_14_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],        
+        # ["bpred_TWOBIT_16_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_6_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_8_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_10_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],        
+        # ["bpred_TWOBIT_12_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_TWOBIT_14_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],        
+        # ["bpred_TWOBIT_16_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+
+        # ["bpred_GSHARE_6_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_6_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_8_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_8_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_12_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_12_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_14_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_14_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_16_16_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_16_16_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+
+        # # btb
+        # ["bpred_GSHARE_10_16_6_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_6_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_8_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_8_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_12_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_16_12_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+
+        # # ras
+        # ["bpred_GSHARE_10_2_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_2_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_3_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_3_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_4_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_4_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_6_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_6_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_10_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+        # ["bpred_GSHARE_10_10_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
+
+#  enable floating-point tests when lint is fixed
+        ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma"]],
+        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32i"]],
+        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32i"]],
+        ["f_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma"]],
+        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfh", "arch64zfh_divsqrt"]], # hanging 1/31/24 dh; try again when lint is fixed
+        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
+        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64i"]],
+        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64i", "wally64q"]],
+        
+
+    ]
+    # Each entry is [config, [tests]] optionally followed by "configOptions", <options>
+    for test in derivconfigtests:
+        config, tests = test[0], test[1]
+        hasOptions = len(test) >= 4 and test[2] == "configOptions"
+        configOptions = test[3] if hasOptions else ""
+        # the command prefix is the same whether or not options are supplied
+        cmdPrefix = "vsim > {} -c <<!\ndo wally-batch.do "+config
+        configs.extend([
+            TestCase(
+                name=t,
+                variant=config,
+                cmd=cmdPrefix+" "+t+" configOptions "+configOptions+"\n!",
+                grepstr="All tests ran without failures")
+            for t in tests
+        ])
+
+
+
+
+# softfloat tests
+if (softfloat):
+    configs = []
+    softfloatconfigs = [
+    "fdh_ieee_div_2_1_rv32gc", "fdh_ieee_div_2_1_rv64gc", "fdh_ieee_div_2_2_rv32gc",
+    "fdh_ieee_div_2_2_rv64gc", "fdh_ieee_div_2_4_rv32gc", "fdh_ieee_div_2_4_rv64gc",
+    "fdh_ieee_div_4_1_rv32gc", "fdh_ieee_div_4_1_rv64gc", "fdh_ieee_div_4_2_rv32gc",
+    "fdh_ieee_div_4_2_rv64gc", "fdh_ieee_div_4_4_rv32gc", "fdh_ieee_div_4_4_rv64gc",
+    "fd_ieee_div_2_1_rv32gc", "fd_ieee_div_2_1_rv64gc", "fd_ieee_div_2_2_rv32gc",
+    "fd_ieee_div_2_2_rv64gc", "fd_ieee_div_2_4_rv32gc", "fd_ieee_div_2_4_rv64gc",
+    "fd_ieee_div_4_1_rv32gc", "fd_ieee_div_4_1_rv64gc", "fd_ieee_div_4_2_rv32gc",
+    "fd_ieee_div_4_2_rv64gc", "fd_ieee_div_4_4_rv32gc", "fd_ieee_div_4_4_rv64gc",
+    "fdqh_ieee_div_2_1_rv32gc", "fdqh_ieee_div_2_1_rv64gc", "fdqh_ieee_div_2_2_rv32gc",
+    "fdqh_ieee_div_2_2_rv64gc", "fdqh_ieee_div_2_4_rv32gc", "fdqh_ieee_div_2_4_rv64gc",
+    "fdqh_ieee_div_4_1_rv32gc", "fdqh_ieee_div_4_1_rv64gc", "fdqh_ieee_div_4_2_rv32gc",
+    "fdqh_ieee_div_4_2_rv64gc", "fdqh_ieee_div_4_4_rv32gc", "fdqh_ieee_div_4_4_rv64gc",
+    "fdq_ieee_div_2_1_rv32gc", "fdq_ieee_div_2_1_rv64gc", "fdq_ieee_div_2_2_rv32gc",
+    "fdq_ieee_div_2_2_rv64gc", "fdq_ieee_div_2_4_rv32gc", "fdq_ieee_div_2_4_rv64gc",
+    "fdq_ieee_div_4_1_rv32gc", "fdq_ieee_div_4_1_rv64gc", "fdq_ieee_div_4_2_rv32gc",
+    "fdq_ieee_div_4_2_rv64gc", "fdq_ieee_div_4_4_rv32gc", "fdq_ieee_div_4_4_rv64gc",
+    "fh_ieee_div_2_1_rv32gc", "fh_ieee_div_2_1_rv64gc", "fh_ieee_div_2_2_rv32gc",
+    "fh_ieee_div_2_2_rv64gc", "fh_ieee_div_2_4_rv32gc", "fh_ieee_div_2_4_rv64gc",
+    "fh_ieee_div_4_1_rv32gc", "fh_ieee_div_4_1_rv64gc", "fh_ieee_div_4_2_rv32gc",
+    "fh_ieee_div_4_2_rv64gc", "fh_ieee_div_4_4_rv32gc", "fh_ieee_div_4_4_rv64gc",
+    "f_ieee_div_2_1_rv32gc", "f_ieee_div_2_1_rv64gc", "f_ieee_div_2_2_rv32gc",
+    "f_ieee_div_2_2_rv64gc", "f_ieee_div_2_4_rv32gc", "f_ieee_div_2_4_rv64gc",
+    "f_ieee_div_4_1_rv32gc", "f_ieee_div_4_1_rv64gc", "f_ieee_div_4_2_rv32gc",
+    "f_ieee_div_4_2_rv64gc", "f_ieee_div_4_4_rv32gc", "f_ieee_div_4_4_rv64gc"
+    ]
+    for config in softfloatconfigs:
+        # div test case
+        divtest = TestCase(
+            name="div",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " div \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.insert(0,divtest)
+
+        # sqrt test case
+        sqrttest = TestCase(
+            name="sqrt",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " sqrt \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        #configs.append(sqrttest)
+        configs.insert(0,sqrttest)
+
+
+        # skip if divider variant config
+        if ("ieee" in config):
+            # cvtint test case
+            cvtinttest = TestCase(
+                name="cvtint",
+                variant=config,
+                cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " cvtint \n!",
+                grepstr="All Tests completed with          0 errors"
+                )
+            configs.append(cvtinttest)
+
+            # cvtfp test case
+            # WILL fail on F_only (refer to spec)
+            cvtfptest = TestCase(
+                name="cvtfp",
+                variant=config,
+                cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " cvtfp \n!",
+                grepstr="All Tests completed with          0 errors"
+            )
+            configs.append(cvtfptest)    
+      
+# intdiv verification
+if (intdiv):
+    configs = []
+    # ***NOTE add to this
+    
+    intdivconfigs = [
+    "fdh_ieee_div_2_1i_rv32gc", "fdh_ieee_div_2_1i_rv64gc", "fdh_ieee_div_2_2i_rv32gc",
+    "fdh_ieee_div_2_2i_rv64gc", "fdh_ieee_div_2_4i_rv32gc", "fdh_ieee_div_2_4i_rv64gc",
+    "fdh_ieee_div_4_1i_rv32gc", "fdh_ieee_div_4_1i_rv64gc", "fdh_ieee_div_4_2i_rv32gc",
+    "fdh_ieee_div_4_2i_rv64gc", "fdh_ieee_div_4_4i_rv32gc", "fdh_ieee_div_4_4i_rv64gc",
+    "fd_ieee_div_2_1i_rv32gc", "fd_ieee_div_2_1i_rv64gc", "fd_ieee_div_2_2i_rv32gc",
+    "fd_ieee_div_2_2i_rv64gc", "fd_ieee_div_2_4i_rv32gc", "fd_ieee_div_2_4i_rv64gc",
+    "fd_ieee_div_4_1i_rv32gc", "fd_ieee_div_4_1i_rv64gc", "fd_ieee_div_4_2i_rv32gc",
+    "fd_ieee_div_4_2i_rv64gc", "fd_ieee_div_4_4i_rv32gc", "fd_ieee_div_4_4i_rv64gc",
+    "fdqh_ieee_div_2_1i_rv32gc", "fdqh_ieee_div_2_1i_rv64gc", "fdqh_ieee_div_2_2i_rv32gc",
+    "fdqh_ieee_div_2_2i_rv64gc", "fdqh_ieee_div_2_4i_rv32gc", "fdqh_ieee_div_2_4i_rv64gc",
+    "fdqh_ieee_div_4_1i_rv32gc", "fdqh_ieee_div_4_1i_rv64gc", "fdqh_ieee_div_4_2i_rv32gc",
+    "fdqh_ieee_div_4_2i_rv64gc", "fdqh_ieee_div_4_4i_rv32gc", "fdqh_ieee_div_4_4i_rv64gc",
+    "fdq_ieee_div_2_1i_rv32gc", "fdq_ieee_div_2_1i_rv64gc", "fdq_ieee_div_2_2i_rv32gc",
+    "fdq_ieee_div_2_2i_rv64gc", "fdq_ieee_div_2_4i_rv32gc", "fdq_ieee_div_2_4i_rv64gc",
+    "fdq_ieee_div_4_1i_rv32gc", "fdq_ieee_div_4_1i_rv64gc", "fdq_ieee_div_4_2i_rv32gc",
+    "fdq_ieee_div_4_2i_rv64gc", "fdq_ieee_div_4_4i_rv32gc", "fdq_ieee_div_4_4i_rv64gc",
+    "fh_ieee_div_2_1i_rv32gc", "fh_ieee_div_2_1i_rv64gc", "fh_ieee_div_2_2i_rv32gc",
+    "fh_ieee_div_2_2i_rv64gc", "fh_ieee_div_2_4i_rv32gc", "fh_ieee_div_2_4i_rv64gc",
+    "fh_ieee_div_4_1i_rv32gc", "fh_ieee_div_4_1i_rv64gc", "fh_ieee_div_4_2i_rv32gc",
+    "fh_ieee_div_4_2i_rv64gc", "fh_ieee_div_4_4i_rv32gc", "fh_ieee_div_4_4i_rv64gc",
+    "f_ieee_div_2_1i_rv32gc", "f_ieee_div_2_1i_rv64gc", "f_ieee_div_2_2i_rv32gc",
+    "f_ieee_div_2_2i_rv64gc", "f_ieee_div_2_4i_rv32gc", "f_ieee_div_2_4i_rv64gc",
+    "f_ieee_div_4_1i_rv32gc", "f_ieee_div_4_1i_rv64gc", "f_ieee_div_4_2i_rv32gc",
+    "f_ieee_div_4_2i_rv64gc", "f_ieee_div_4_4i_rv32gc", "f_ieee_div_4_4i_rv64gc",
+    "fd_ieee_div_2_8i_rv32gc",
+    "fd_ieee_div_2_8i_rv64gc",
+    "fdq_ieee_div_2_8i_rv64gc",
+    "fdq_ieee_div_2_8i_rv32gc",
+    "f_ieee_div_2_8i_rv64gc",
+    "f_ieee_div_2_8i_rv32gc"
+    ]
+    nointdivconfigs = [
+    "fdh_ieee_div_2_1_rv32gc", "fdh_ieee_div_2_1_rv64gc", "fdh_ieee_div_2_2_rv32gc",
+    "fdh_ieee_div_2_2_rv64gc", "fdh_ieee_div_2_4_rv32gc", "fdh_ieee_div_2_4_rv64gc",
+    "fdh_ieee_div_4_1_rv32gc", "fdh_ieee_div_4_1_rv64gc", "fdh_ieee_div_4_2_rv32gc",
+    "fdh_ieee_div_4_2_rv64gc", "fdh_ieee_div_4_4_rv32gc", "fdh_ieee_div_4_4_rv64gc",
+    "fd_ieee_div_2_1_rv32gc", "fd_ieee_div_2_1_rv64gc", "fd_ieee_div_2_2_rv32gc",
+    "fd_ieee_div_2_2_rv64gc", "fd_ieee_div_2_4_rv32gc", "fd_ieee_div_2_4_rv64gc",
+    "fd_ieee_div_4_1_rv32gc", "fd_ieee_div_4_1_rv64gc", "fd_ieee_div_4_2_rv32gc",
+    "fd_ieee_div_4_2_rv64gc", "fd_ieee_div_4_4_rv32gc", "fd_ieee_div_4_4_rv64gc",
+    "fdqh_ieee_div_2_1_rv32gc", "fdqh_ieee_div_2_1_rv64gc", "fdqh_ieee_div_2_2_rv32gc",
+    "fdqh_ieee_div_2_2_rv64gc", "fdqh_ieee_div_2_4_rv32gc", "fdqh_ieee_div_2_4_rv64gc",
+    "fdqh_ieee_div_4_1_rv32gc", "fdqh_ieee_div_4_1_rv64gc", "fdqh_ieee_div_4_2_rv32gc",
+    "fdqh_ieee_div_4_2_rv64gc", "fdqh_ieee_div_4_4_rv32gc", "fdqh_ieee_div_4_4_rv64gc",
+    "fdq_ieee_div_2_1_rv32gc", "fdq_ieee_div_2_1_rv64gc", "fdq_ieee_div_2_2_rv32gc",
+    "fdq_ieee_div_2_2_rv64gc", "fdq_ieee_div_2_4_rv32gc", "fdq_ieee_div_2_4_rv64gc",
+    "fdq_ieee_div_4_1_rv32gc", "fdq_ieee_div_4_1_rv64gc", "fdq_ieee_div_4_2_rv32gc",
+    "fdq_ieee_div_4_2_rv64gc", "fdq_ieee_div_4_4_rv32gc", "fdq_ieee_div_4_4_rv64gc",
+    "fh_ieee_div_2_1_rv32gc", "fh_ieee_div_2_1_rv64gc", "fh_ieee_div_2_2_rv32gc",
+    "fh_ieee_div_2_2_rv64gc", "fh_ieee_div_2_4_rv32gc", "fh_ieee_div_2_4_rv64gc",
+    "fh_ieee_div_4_1_rv32gc", "fh_ieee_div_4_1_rv64gc", "fh_ieee_div_4_2_rv32gc",
+    "fh_ieee_div_4_2_rv64gc", "fh_ieee_div_4_4_rv32gc", "fh_ieee_div_4_4_rv64gc",
+    "f_ieee_div_2_1_rv32gc", "f_ieee_div_2_1_rv64gc", "f_ieee_div_2_2_rv32gc",
+    "f_ieee_div_2_2_rv64gc", "f_ieee_div_2_4_rv32gc", "f_ieee_div_2_4_rv64gc",
+    "f_ieee_div_4_1_rv32gc", "f_ieee_div_4_1_rv64gc", "f_ieee_div_4_2_rv32gc",
+    "f_ieee_div_4_2_rv64gc", "f_ieee_div_4_4_rv32gc", "f_ieee_div_4_4_rv64gc"
+    ]
+
+    for config in intdivconfigs:
+        # fdivremsqrt test case
+        fdivremsqrttestcase = TestCase(
+            name="fdivremsqrt",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " fdivremsqrt \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.insert(0,fdivremsqrttestcase)
+    for config in nointdivconfigs:
+        # div,sqrt test cases for no integer flavor of divider
+        divtestcase = TestCase(
+            name="fdiv",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " div_drsu \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.insert(0,divtestcase)
+        sqrttestcase = TestCase(
+            name="fsqrt",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " sqrt_drsu \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.insert(0,sqrttestcase)
+
+import os
+from multiprocessing import Pool, TimeoutError
+
+def search_log_for_text(text, logfile):
+    """Search the given log file for text with grep.
+
+    text is passed to grep as a pattern (grep -e), so it may be any pattern grep
+    accepts. Returns True if grep finds a match, False otherwise (including when
+    the log file does not exist or grep cannot be run).
+    """
+    import subprocess  # local import: only this function needs it
+    # An argument list avoids the shell, so quotes or other shell metacharacters
+    # in the pattern or the file name cannot break (or alter) the command.
+    try:
+        grep = subprocess.run(["grep", "-e", text, logfile], stdout=subprocess.DEVNULL)
+    except OSError:
+        return False
+    return grep.returncode == 0
+
+def run_test_case(config):
+    """Run one test case; return 0 if its grep string shows up in the log, 1 otherwise"""
+    logname = "logs/"+config.variant+"_"+config.name+".log"
+    # the test command contains '{}' where the log file name belongs
+    cmd = config.cmd.format(logname)
+    os.chdir(regressionDir)
+    os.system(cmd)
+    passed = search_log_for_text(config.grepstr, logname)
+    if not passed:
+        print(f"{bcolors.FAIL}%s_%s: Failures detected in output{bcolors.ENDC}" % (config.variant, config.name))
+        print("  Check %s" % logname)
+        return 1
+    print(f"{bcolors.OKGREEN}%s_%s: Success{bcolors.ENDC}" % (config.variant, config.name))
+    return 0
+
+def main():
+    """Run the queued test cases in a process pool and return the failure count.
+
+    Flags in sys.argv select extra buildroot test cases and the per-test timeout.
+    A test counts as failed when run_test_case returns 1 or when its result is
+    not available within the timeout.
+    """
+    global configs
+    os.chdir(regressionDir)
+    # exist_ok / ignore_errors replace bare "except: pass" blocks, which also hid
+    # unrelated errors and could crash on mkdir when wkdir was left behind.
+    os.makedirs("logs", exist_ok=True)
+    shutil.rmtree("wkdir", ignore_errors=True)
+    os.makedirs("wkdir", exist_ok=True)
+
+    if '-makeTests' in sys.argv:
+        os.chdir(regressionDir)
+        os.system('./make-tests.sh | tee ./logs/make-tests.log')
+
+    if '-all' in sys.argv:
+        TIMEOUT_DUR = 30*7200 # seconds
+        configs.append(getBuildrootTC(boot=True))
+    elif '-buildroot' in sys.argv:
+        TIMEOUT_DUR = 30*7200 # seconds
+        configs=[getBuildrootTC(boot=True)]
+    elif '-coverage' in sys.argv:
+        TIMEOUT_DUR = 20*60 # seconds
+    #   Presently don't run buildroot because it has a different config and can't be merged with the rv64gc coverage.
+    #   Also it is slow to run.
+    #    configs.append(getBuildrootTC(boot=False))
+        os.system('rm -f cov/*.ucdb')
+    elif '-nightly' in sys.argv:
+        TIMEOUT_DUR = 60*1440 # 1 day
+        configs.append(getBuildrootTC(boot=False))
+    elif '-softfloat' in sys.argv:
+        TIMEOUT_DUR = 60*60 # seconds
+    elif '-intdiv' in sys.argv:
+        TIMEOUT_DUR = 60*60 # seconds
+    else:
+        TIMEOUT_DUR = 10*60 # seconds
+        configs.append(getBuildrootTC(boot=False))
+
+    # Scale the number of concurrent processes to the number of test cases, but
+    # max out at the CPU count to not overwhelm the system. Pool rejects a
+    # process count of 0, so keep at least one worker even with no test cases.
+    num_workers = max(1, min(len(configs), multiprocessing.cpu_count()))
+    num_fail = 0
+    with Pool(processes=num_workers) as pool:
+        # A list of pairs (not a dict keyed by test case) so that two identical
+        # test cases each keep their own result.
+        pending = [(config, pool.apply_async(run_test_case, (config,))) for config in configs]
+        for (config, result) in pending:
+            try:
+                num_fail += result.get(timeout=TIMEOUT_DUR)
+            except multiprocessing.TimeoutError:
+                num_fail += 1
+                print(f"{bcolors.FAIL}%s_%s: Timeout - runtime exceeded %d seconds{bcolors.ENDC}" % (config.variant, config.name, TIMEOUT_DUR))
+
+    # Coverage report
+    if coverage:
+        os.system('make coverage')
+    # Count the number of failures
+    if num_fail:
+        print(f"{bcolors.FAIL}Regression failed with %s failed configurations{bcolors.ENDC}" % num_fail)
+    else:
+        print(f"{bcolors.OKGREEN}SUCCESS! All tests ran without failures{bcolors.ENDC}")
+    return num_fail
+
+if __name__ == '__main__':
+    # Exit statuses wrap modulo 256, so clamp the failure count: otherwise a run
+    # with exactly 256 failures would exit with status 0 and look like a success.
+    sys.exit(min(main(), 255))
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@ -123,6 +123,10 @@ localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (FMALEN

 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));                  // log_2(NORMSHIFTSZ)

+localparam CORRSHIFTSZ = `max((NORMSHIFTSZ-2), (DIVMINb + 1 + NF));
+localparam NORMSHIFTSZDRSU = DIVb+1+NF;
+localparam LOGNORMSHIFTSZDRSU = $clog2(NORMSHIFTSZDRSU);
+
 // Disable spurious Verilator warnings

 /* verilator lint_off STMTDLY */
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@ -194,6 +194,8 @@ localparam cvw_t P = '{
  FMALEN : FMALEN,
  NORMSHIFTSZ : NORMSHIFTSZ,
  LOGNORMSHIFTSZ : LOGNORMSHIFTSZ,
+  NORMSHIFTSZDRSU : NORMSHIFTSZDRSU,
+  LOGNORMSHIFTSZDRSU : LOGNORMSHIFTSZDRSU,
  LOGR        : LOGR,
  RK          : RK,
  FPDUR       : FPDUR,
--- a/src/cvw.sv
+++ b/src/cvw.sv
@ -285,6 +285,8 @@ typedef struct packed {
  int LOGCVTLEN;
  int NORMSHIFTSZ;
  int LOGNORMSHIFTSZ;
+  int NORMSHIFTSZDRSU;
+  int LOGNORMSHIFTSZDRSU;
  int FMALEN;

 // division constants
--- a/src/fpu/divremsqrt/arithrightshift.sv
+++ b/src/fpu/divremsqrt/arithrightshift.sv
@ -0,0 +1,9 @@
+
+// arithrightshift: shifts a signed (P.INTDIVb+4)-bit value right by P.LOGR bits.
+module arithrightshift import cvw::*;  #(parameter cvw_t P) (
+  input logic signed [P.INTDIVb+3:0] shiftin,   // signed value to be shifted
+  output logic signed [P.INTDIVb+3:0] shifted   // shiftin shifted right by P.LOGR bits
+);
+  // >>> applied to a signed operand is an arithmetic shift: the vacated upper bits are filled with the sign bit
+  assign shifted = $signed(shiftin) >>> P.LOGR;
+
+endmodule
+
--- a/src/fpu/divremsqrt/divremsqrt.sv
+++ b/src/fpu/divremsqrt/divremsqrt.sv
@ -0,0 +1,111 @@
+///////////////////////////////////////////
+// divremsqrt.sv
+//
+// Written: kekim@hmc.edu
+// Modified:19 May 2023
+//
+// Purpose: Combined Divide and Square Root Floating Point and Integer Unit with postprocessing
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+ module divremsqrt import cvw::*;  #(parameter cvw_t P) (
+  input  logic                clk, 
+  input  logic                reset, 
+  input  logic [P.FMTBITS-1:0] FmtE,
+  input  logic                XsE,
+  input  logic [P.NF:0]        XmE, YmE,                     // floating-point significands (preprocessor Xm, Ym)
+  input  logic [P.NE-1:0]      XeE, YeE,                     // floating-point exponents (preprocessor Xe, Ye)
+  input  logic                XInfE, YInfE, 
+  input  logic                XZeroE, YZeroE, 
+  input  logic                XNaNE, YNaNE, 
+  input  logic                FDivStartE, IDivStartE,
+  input  logic                StallM,
+  input  logic                FlushE,
+  input  logic                SqrtE, SqrtM,
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+  input  logic [2:0]          Funct3E, Funct3M,             // Funct3E[0]=0 selects signed division in the preprocessor; Funct3M[1] drives the postprocessor RemOpM
+  input  logic                IntDivE, W64E,
+  output logic                DivStickyM,
+  output logic                FDivBusyE, IFDivStartE, FDivDoneE,  // all three are driven by the FSM; IFDivStartE also enables the preprocessor registers
+  output logic [P.NE+1:0]      UeM,                          // biased exponent of result (registered in the preprocessor)
+  output logic [P.DIVb:0]      UmM,                          // U1.DIVb result significand from the postprocessor
+  output logic [P.XLEN-1:0]    FIntDivResultM,               // integer result from the postprocessor
+  output logic                 IntDivM,
+  // integer normalization shifter signals
+  output logic [P.INTDIVb+3:0]          PreResultM,
+  input logic [P.XLEN-1:0]          PreIntResultM,
+  output logic [P.DIVBLEN-1:0]       IntNormShiftM
+
+);
+
+  // Floating-point division and square root module, with optional integer division and remainder
+  // Computes X/Y, sqrt(X), A/B, or A%B
+
+  logic [P.DIVb+3:0]           WS, WC;                       // Partial remainder components
+  logic [P.DIVb+3:0]           X;                            // Iterator Initial Value (from dividend)
+  logic [P.DIVb+3:0]           D;                            // Iterator Divisor
+  logic [P.DIVb:0]             FirstU, FirstUM;              // Intermediate result values
+  logic [P.DIVb+1:0]           FirstC;                       // Step tracker
+  logic                       Firstun;                      // Quotient selection
+  logic                       WZeroE;                       // Early termination flag
+  logic [P.DURLEN:0]         CyclesE;                      // FSM cycles
+  logic                       SpecialCaseM;                 // Divide by zero, square root of negative, etc.
+  logic                       DivStartE;                    // Enable signal for flops during stall; NOTE(review): not connected to any instance below
+                                                            
+  // Integer div/rem signals                                
+  logic                       BZeroM;                       // Denominator is zero
+  logic [P.DIVBLEN:0]          nM, mM;                       // Shift amounts; NOTE(review): not connected to any instance below
+  logic                       NegQuotM, ALTBM, AsM, BsM, W64M, SIGNOVERFLOWM, ZeroDiffM;   // Special handling for postprocessor; NOTE(review): NegQuotM is not connected below
+  logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
+  logic                       ISpecialCaseE;                // Integer div/remainder special cases
+
+  // Datapath: preprocessor -> iterator -> postprocessor, sequenced by the FSM
+  divremsqrtfdivsqrtpreproc #(P) divremsqrtfdivsqrtpreproc(                          // Preprocessor
+    .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE),
+    .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
+    // Int-specific 
+    .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
+    .BZeroM, .AM, 
+    .IntDivM, .W64M, .ALTBM, .AsM, .BsM, .IntNormShiftM, .SIGNOVERFLOWM, .ZeroDiffM);
+
+  fdivsqrtfsm #(P) fdivsqrtfsm(                                  // FSM
+    .clk, .reset, .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, 
+    .FDivStartE, .XsE, .SqrtE, .WZeroE, .FlushE, .StallM, 
+    .FDivBusyE, .IFDivStartE, .FDivDoneE, .SpecialCaseM, .CyclesE,
+    // Int-specific 
+    .IDivStartE, .ISpecialCaseE, .IntDivE);
+
+  fdivsqrtiter #(P) fdivsqrtiter(                                // CSA Iterator
+    .clk, .IFDivStartE, .FDivBusyE, .SqrtE, .X, .D, 
+    .FirstU, .FirstUM, .FirstC, .Firstun, .FirstWS(WS), .FirstWC(WC));
+
+  divremsqrtfdivsqrtpostproc #(P) fdivsqrtpostproc(                        // Postprocessor
+    .clk, .reset, .StallM, .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, 
+    .SqrtE, .Firstun, .SqrtM, .SpecialCaseM, 
+    .UmM, .WZeroE, .DivStickyM, 
+    // Int-specific 
+    .ALTBM, .AsM, .BsM, .BZeroM, .W64M, .RemOpM(Funct3M[1]), .AM, 
+    .FIntDivResultM,  .PreResultM, .PreIntResultM, .SIGNOVERFLOWM, .ZeroDiffM, .IntDivM, .IntNormShiftM);
+  
+  
+endmodule
+
--- a/src/fpu/divremsqrt/divremsqrtdivshiftcalc.sv
+++ b/src/fpu/divremsqrt/divremsqrtdivshiftcalc.sv
@ -0,0 +1,73 @@
+///////////////////////////////////////////
+// divremsqrtdivshiftcalc.sv (derived from divshiftcalc.sv)
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: Division shift calculation
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Division/square-root shift calculation: builds the pre-shifted significand (DivShiftIn) and the
+// left-shift amount (DivShiftAmt), with separate amounts for normal and subnormal results.
+module divremsqrtdivshiftcalc import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.NF+2:0]              DivUm,              // divsqrt significand
+  input  logic [P.NE+1:0]              DivUe,              // divsqrt exponent
+  output logic [P.LOGNORMSHIFTSZDRSU-1:0]  DivShiftAmt,        // divsqrt shift amount
+  output logic [P.NORMSHIFTSZDRSU-1:0]     DivShiftIn,         // divsqrt shift input
+  output logic                         DivResSubnorm,      // is the divsqrt result subnormal
+  output logic                         DivSubnormShiftPos  // is the subnormal shift amount positive
+);
+
+  // number of zero bits appended below DivUm so that DivShiftIn is exactly NORMSHIFTSZDRSU bits wide
+  localparam int LOWPAD = P.NORMSHIFTSZDRSU - 2*P.NF - 3;
+
+  logic [P.NE+1:0] DivSubnormShift;      // DivUe + NF, kept at full exponent width so the sign bit is visible
+  logic            DivUeNeg, DivUeZero;  // exponent is negative / exponent is exactly zero
+
+  // The result is subnormal when the exponent is zero or negative
+  assign DivUeNeg      = DivUe[P.NE+1];
+  assign DivUeZero     = ~|DivUe;
+  assign DivResSubnorm = DivUeNeg | DivUeZero;
+
+  // Subnormal result:
+  //  00000000x.xxxxxx...                     Exp = DivUe
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
+  //  .00xxxxxxxxxxxxx... << DivUe+NF+1       Exp = +1
+  //  .0000xxxxxxxxxxx... >> 1                Exp = 1
+  // so the left shift amount is DivUe+NF+1-1 = DivUe+NF
+  assign DivSubnormShift    = DivUe + (P.NE+2)'(P.NF);
+  assign DivSubnormShiftPos = ~DivSubnormShift[P.NE+1];
+
+  // Normal result:
+  //  00000000x.xxxxxx...                     Exp = DivUe
+  //  .00000000xxxxxxx... >> NF+1             Exp = DivUe+NF+1
+  //  00000000.xxxxxxx... << NF               Exp = DivUe+1
+  //  00000000x.xxxxxx... << NF               Exp = DivUe (extra shift done afterwards)
+  //  00000000xx.xxxxx... << 1?               Exp = DivUe-1 (determined after)
+  // so the initial left shift amount is the constant NF.
+  // A negative subnormal shift amount is replaced by zero (no shift, sticky bit is kept).
+  assign DivShiftAmt = DivResSubnorm ? (DivSubnormShiftPos ? DivSubnormShift[P.LOGNORMSHIFTSZDRSU-1:0] : '0)
+                                     : (P.LOGNORMSHIFTSZDRSU)'(P.NF);
+
+  // pre-shift the divider result for normalization: NF zeros above DivUm, LOWPAD zeros below it
+  assign DivShiftIn = {{P.NF{1'b0}}, DivUm, {LOWPAD{1'b0}}};
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtearlyterm.sv
+++ b/src/fpu/divremsqrt/divremsqrtearlyterm.sv
@ -0,0 +1,27 @@
+// Early-termination detector: WZeroE is set when the redundant residual WS+WC is zero.
+// For radix 2 it is also set when WS+WC+F is zero, with F selected for divide or square root.
+module divremsqrtearlyterm import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.DIVb+3:0]    WS, WC,            // Q4.DIVb
+  input  logic [P.DIVb+3:0]    D,                 // Q4.DIVb
+  input  logic [P.DIVb:0]      FirstUM,           // U1.DIVb
+  input  logic [P.DIVb+1:0]    FirstC,            // Q2.DIVb
+  input  logic                 Firstun, SqrtE,    // NOTE(review): Firstun is not read in this module
+  output logic                 WZeroE
+);
+  logic weq0E;                                    // WS + WC == 0
+
+  aplusbeq0 #(P.DIVb+4) wspluswceq0(WS, WC, weq0E);
+
+  if (P.RADIX != 2) begin
+    // only the plain residual check is used for other radices
+    assign WZeroE = weq0E;
+  end else begin: R2EarlyTerm
+    logic [P.DIVb+3:0] FZeroE, FZeroSqrtE, FZeroDivE;
+    logic [P.DIVb+3:0] WCF, WSF;
+    logic [P.DIVb+2:0] FirstK;
+    logic wfeq0E;                                 // WS + WC + FZeroE == 0
+
+    // {1,C} with every bit cleared whose lower neighbour in {1,C} is set (same as {1,C} & ~({1,C} << 1))
+    assign FirstK     = {1'b1, FirstC} & ~{FirstC, 1'b0};
+    assign FZeroDivE  = {D[P.DIVb+2:0], 1'b0};                                   // F for divide: D << 1
+    assign FZeroSqrtE = {FirstK, 1'b0} | {FirstUM[P.DIVb], FirstUM, 2'b0};       // F for square root
+    mux2 #(P.DIVb+4) fzeromux(FZeroDivE, FZeroSqrtE, SqrtE, FZeroE);
+    csa #(P.DIVb+4) fadd(WS, WC, FZeroE, 1'b0, WSF, WCF); // compute {WCF, WSF} = {WS + WC + FZero};
+    aplusbeq0 #(P.DIVb+4) wcfpluswsfeq0(WCF, WSF, wfeq0E);
+    assign WZeroE = wfeq0E | weq0E;
+  end
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtfdivsqrtpostproc.sv
+++ b/src/fpu/divremsqrt/divremsqrtfdivsqrtpostproc.sv
@ -0,0 +1,116 @@
+///////////////////////////////////////////
+// divremsqrtfdivsqrtpostproc.sv (derived from fdivsqrtpostproc.sv)
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
+// Modified:13 January 2022
+//
+// Purpose: Divide/Square root postprocessing
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module divremsqrtfdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
+  input  logic                 clk, reset,
+  input  logic                 StallM,
+  input  logic [P.DIVb+3:0]    WS, WC,            // Q4.DIVb
+  input  logic [P.DIVb+3:0]    D,                 // Q4.DIVb
+  input  logic [P.DIVb:0]      FirstU, FirstUM,   // U1.DIVb
+  input  logic [P.DIVb+1:0]    FirstC,            // Q2.DIVb
+  input  logic                 SqrtE,
+  input  logic                 Firstun, SqrtM, SpecialCaseM, 
+  input  logic [P.XLEN-1:0]    AM,                // U/Q(XLEN.0)
+  input  logic                 RemOpM, ALTBM, BZeroM, AsM, BsM, W64M, SIGNOVERFLOWM, ZeroDiffM, IntDivM, // NOTE(review): SIGNOVERFLOWM, ZeroDiffM and IntDivM are not read in this module
+  input  logic [P.DIVBLEN-1:0] IntNormShiftM,
+  input  logic [P.XLEN-1:0]    PreIntResultM,     // NOTE(review): shadowed by a local of the same name inside intpostproc, so this port is not read there
+  output logic [P.DIVb:0]      UmM,               // U1.DIVb result significand
+  output logic                 WZeroE,
+  output logic                 DivStickyM,
+  output logic [P.XLEN-1:0]    FIntDivResultM,     // U/Q(XLEN.0); only driven when P.IDIV_ON_FPU is set
+  output logic [P.INTDIVb+3:0]    PreResultM      // NOTE(review): shadowed by a local of the same name inside intpostproc, so this port has no visible driver
+
+);
+  
+  logic [P.DIVb+3:0]         Sum;                 // WC + WS
+  logic [P.INTDIVb+3:0]         W;                // truncated Sum after the arithmetic right shift
+  logic [P.DIVb:0]           PreUmM;              // FirstU or FirstUM, chosen by NegStickyM
+  logic                      NegStickyM;          // sign bit of Sum
+  logic                      weq0E, WZeroM;       // NOTE(review): weq0E is not used in this module
+  logic [P.XLEN-1:0]         IntDivResultM;
+  logic                      NegQuotM; // Integer quotient is negative
+
+  //////////////////////////
+  // Execute Stage: Detect early termination for an exact result
+  //////////////////////////
+
+  // check for early termination on an exact result. 
+  divremsqrtearlyterm #(P) earlyterm(.FirstC, .FirstUM, .D, .SqrtE, .WC, .WS,.Firstun, .WZeroE);
+  
+
+  //////////////////////////
+  // E/M Pipeline register
+  //////////////////////////
+ 
+  flopenr #(1) WZeroMReg(clk, reset, ~StallM, WZeroE, WZeroM);
+
+  //////////////////////////
+  // Memory Stage: Postprocessing
+  //////////////////////////
+
+  //  If the result is not exact, the sticky should be set
+  assign DivStickyM = ~WZeroM & ~SpecialCaseM; 
+
+  // Determine if sticky bit is negative *** Full sum only needed for Integer
+  assign Sum = WC + WS;
+  assign NegStickyM = Sum[P.DIVb+3];
+  mux2 #(P.DIVb+1) preummux(FirstU, FirstUM, NegStickyM, PreUmM); // Select U or U-1 depending on negative sticky bit
+  mux2 #(P.DIVb+1)    ummux(PreUmM, (PreUmM << 1), SqrtM, UmM);   // second input is PreUmM shifted left by one, selected by SqrtM
+
+   // Integer quotient or remainder correction, normalization, and special cases
+  if (P.IDIV_ON_FPU) begin:intpostproc // Int supported
+    logic [P.INTDIVb+3:0] UnsignedQuotM, NormRemM, NormRemDM, NormQuotM; // NOTE(review): NormRemM and NormQuotM are declared but unused
+    logic signed [P.INTDIVb+3:0] PreResultM, PreResultShiftedM, PreIntResultM; // NOTE(review): PreResultM and PreIntResultM redeclare (shadow) the module ports of the same names
+    logic [P.INTDIVb+3:0] DTrunc, SumTrunc;       // upper INTDIVb+4 bits of D and Sum
+
+    assign SumTrunc = Sum[P.DIVb+3:P.DIVb-P.INTDIVb];
+    assign DTrunc = D[P.DIVb+3:P.DIVb-P.INTDIVb];
+    arithrightshift #(P) rshift(SumTrunc, W);
+
+    assign UnsignedQuotM = {3'b000, PreUmM[P.DIVb:P.DIVb-P.INTDIVb]}; // upper INTDIVb+1 bits of PreUmM, zero-extended by 3 bits
+
+    // Integer remainder: sticky and sign correction muxes
+    assign NegQuotM = AsM ^ BsM; // Integer Quotient is negative
+    mux2 #(P.INTDIVb+4) normremdmux(W, W+DTrunc, NegStickyM, NormRemDM);
+
+    // Select quotient or remainder and do normalization shift
+    mux2 #(P.INTDIVb+4)    presresultmux(UnsignedQuotM, NormRemDM, RemOpM, PreResultM);
+    intrightshift #(P) intnormshifter(PreResultM, IntNormShiftM, PreResultShiftedM);
+    mux2 #(P.INTDIVb+4)    preintresultmux(PreResultShiftedM, -PreResultShiftedM,AsM ^ (BsM&~RemOpM), PreIntResultM);
+
+    divremsqrtintspecialcase #(P) intspecialcase(BZeroM,RemOpM, ALTBM,AM,PreIntResultM,IntDivResultM);
+    // sign extend result for W64
+    if (P.XLEN==64) begin
+      mux2 #(64) resmux(IntDivResultM[P.XLEN-1:0], 
+        {{(P.XLEN-32){IntDivResultM[31]}}, IntDivResultM[31:0]}, // Sign extending in case of W64
+        W64M, FIntDivResultM);
+    end else 
+      assign FIntDivResultM = IntDivResultM[P.XLEN-1:0];
+  end
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtfdivsqrtpreproc.sv
+++ b/src/fpu/divremsqrt/divremsqrtfdivsqrtpreproc.sv
@ -0,0 +1,250 @@
+///////////////////////////////////////////
+// divremsqrtfdivsqrtpreproc.sv (derived from fdivsqrtpreproc.sv)
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
+// Modified:13 January 2022
+//
+// Purpose: Divide/Square root preprocessing: integer absolute value and W64, normalization shift
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module divremsqrtfdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
+  input  logic                 clk,
+  input  logic                 IFDivStartE,  // enable for every register in this module
+  input  logic [P.NF:0]        Xm, Ym,      // Floating-point significands
+  input  logic [P.NE-1:0]      Xe, Ye,      // Floating-point exponents
+  input  logic [P.FMTBITS-1:0] FmtE,
+  input  logic                 SqrtE,
+  input  logic                 XZeroE,
+  input  logic [2:0]           Funct3E,     // bit 0 clear selects signed division; bit 1 selects remainder
+  output logic [P.NE+1:0]      UeM,         // biased exponent of result
+  output logic [P.DIVb+3:0]    X, D,        // Q4.DIVb
+  // Int-specific
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // U(XLEN.0) inputs from IEU 
+  input  logic                 IntDivE, W64E,
+  // Outputs
+  output logic                 ISpecialCaseE,
+  output logic [P.DURLEN:0]  CyclesE,
+  output logic [P.DIVBLEN-1:0] IntNormShiftM,
+  // NOTE(review): SIGNOVERFLOWM and ZeroDiffM have no driver anywhere in this module
+  output logic                 ALTBM, IntDivM, W64M, SIGNOVERFLOWM, ZeroDiffM,
+  output logic                 AsM, BsM, BZeroM,
+  output logic [P.XLEN-1:0]    AM
+);
+
+  logic [P.DIVb:0]             Xnorm, Dnorm;
+  logic [P.DIVb+3:0]           DivX, DivXShifted, SqrtX, PreShiftX; // Variations of dividend, to be muxed
+  logic [P.NE+1:0]             UeE;                                 // Result Exponent (FP only)
+  logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
+  logic [P.DIVBLEN-1:0]        mE, ell;                             // Leading zeros of inputs
+  logic [P.DIVBLEN-1:0]        IntResultBitsE;                      // bits in integer result
+  logic                        NumerZeroE;                          // Numerator is zero (X or A)
+  logic                        SIGNOVERFLOWE;                       // tied to 0 below and never registered
+  logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
+  logic                        SignedDivE;                          // signed division
+  logic                        AsE, BsE;                            // Signs of integer inputs
+  logic [P.XLEN-1:0]           AE;                                  // input A after W64 adjustment
+  logic                        ALTBE;
+  logic                        EvenExp;
+
+  logic [$clog2(P.RK):0] RightShiftX;
+  logic [P.DIVBLEN-1:0] ZeroDiff, p;
+
+
+  //////////////////////////////////////////////////////
+  // Integer Preprocessing
+  //////////////////////////////////////////////////////
+
+  if (P.IDIV_ON_FPU) begin:intpreproc // Int Supported
+    logic [P.XLEN-1:0] BE, PosA, PosB;
+
+    // Extract inputs, signs, zero, depending on W64 mode if applicable
+    assign SignedDivE = ~Funct3E[0];
+  
+    // Source handling
+    if (P.XLEN==64) begin // 64-bit, supports W64
+      mux2 #(64)    amux(ForwardedSrcAE, {{32{ForwardedSrcAE[31] & SignedDivE}}, ForwardedSrcAE[31:0]}, W64E, AE);
+      mux2 #(64)    bmux(ForwardedSrcBE, {{32{ForwardedSrcBE[31] & SignedDivE}}, ForwardedSrcBE[31:0]}, W64E, BE);
+    end else begin // 32 bits only
+      assign AE = ForwardedSrcAE;
+      assign BE = ForwardedSrcBE;
+     end
+    assign AZeroE = ~(|AE);
+    assign BZeroE = ~(|BE);
+    assign AsE = AE[P.XLEN-1] & SignedDivE;
+    assign BsE = BE[P.XLEN-1] & SignedDivE; 
+
+    // Force integer inputs to be positive
+    mux2 #(P.XLEN) posamux(AE, -AE, AsE, PosA);
+    mux2 #(P.XLEN) posbmux(BE, -BE, BsE, PosB);
+
+    // Select integer or floating point inputs
+    mux2 #(P.DIVb+1) ifxmux({Xm, {(P.DIVb-P.NF){1'b0}}}, {PosA, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFX);
+    mux2 #(P.DIVb+1) ifdmux({Ym, {(P.DIVb-P.NF){1'b0}}}, {PosB, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFD);
+    mux2 #(1)    numzmux(XZeroE, AZeroE, IntDivE, NumerZeroE);
+  end else begin // Int not supported
+    assign IFX = {Xm, {(P.DIVb-P.NF){1'b0}}};
+    assign IFD = {Ym, {(P.DIVb-P.NF){1'b0}}};
+    assign NumerZeroE = XZeroE;
+  end
+
+  //////////////////////////////////////////////////////
+  // Integer & FP leading zero and normalization shift
+  //////////////////////////////////////////////////////
+
+  // count leading zeros for Subnorm FP and to normalize integer inputs
+  divremsqrtlzc #(P.DIVb+1) lzcX (IFX, ell);
+  divremsqrtlzc #(P.DIVb+1) lzcY (IFD, mE);
+
+  // Normalization shift: shift leading one into most significant bit
+  assign Xnorm = (IFX << ell);
+  assign Dnorm = (IFD << mE); 
+
+  //////////////////////////////////////////////////////
+  // Integer Right Shift to digit boundary
+  //  Determine DivXShifted (X shifted to digit boundary)
+  //  and nE (number of fractional digits)
+  //////////////////////////////////////////////////////
+
+  assign DivX = {3'b000, Xnorm}; // Zero-extend numerator for division
+
+  if (P.IDIV_ON_FPU) begin:intrightshift // Int Supported
+
+    // calculate number of result bits
+    assign ZeroDiff = mE - ell;         // Difference in number of leading zeros
+    assign ALTBE = ZeroDiff[P.DIVBLEN-1];  // A less than B (A has more leading zeros)
+    assign SIGNOVERFLOWE = 1'b0;
+
+    mux2 #(P.DIVBLEN) pmux(ZeroDiff, '0, ALTBE, p);          
+
+    /* verilator lint_off WIDTH */
+    assign IntResultBitsE = P.LOGR + p;  // Total number of result bits (r integer bits plus p fractional bits)
+   
+    /* verilator lint_on WIDTH */
+
+    // Integer special cases (terminate immediately)
+    assign ISpecialCaseE = BZeroE | ALTBE;
+
+    // calculate right shift amount RightShiftX to complete in discrete number of steps
+    if (P.RK > 1) begin // more than 1 bit per cycle
+      
+      // directive was misspelled "lint_offf", which left the lint_on below without a matching lint_off
+      /* verilator lint_off WIDTH */
+      assign RightShiftX = P.RK - 1 - ((IntResultBitsE - 1) % P.RK); // Right shift amount
+      assign DivXShifted = DivX >> RightShiftX;                     // shift X by up to R*K-1 to complete in n steps
+      /* verilator lint_on WIDTH */
+    end else begin // radix 2 1 copy doesn't require shifting
+      assign DivXShifted = DivX;
+      assign RightShiftX = 0;
+    end
+  end else begin
+    // NOTE(review): IntResultBitsE, DivXShifted and RightShiftX are left undriven in this branch,
+    // yet IntResultBitsE is still connected to cyclecalc below
+    assign ISpecialCaseE = 0;
+  end
+
+  //////////////////////////////////////////////////////
+  // Floating-Point Preprocessing
+  // Extend to Q4.b format
+  // shift square root to be in range [1/4, 1)
+  // Normalized numbers are shifted right by 1 if the exponent is odd
+  // Subnormal numbers have Xe = 0 and an unbiased exponent of 1-BIAS.  They are shifted right if the number of leading zeros is odd.
+   //////////////////////////////////////////////////////
+
+
+  // Sqrt is initialized on step one as R(X-1), so depends on Radix
+  // If X = 0, then special case logic sets sqrt = 0 so this portion doesn't matter
+  // Otherwise, X has a leading 1 after possible normalization shift and is now in range [1, 2)
+  // Next X is shifted right by 1 or 2 bits to range [1/4, 1) and exponent will be adjusted accordingly to be even
+  // Now (X-1) is negative.  Formed by placing all 1s in all four integer bits (in Q4.b) form, keeping X in fraction bits
+  // Then multiply by R is left shift by r (1 or 2 for radix 2 or 4)
+  // This is optimized in hardware by first right shifting by 0 or 1 bit (instead of 1 or 2), then left shifting by (r-1), then subtracting 2 or 4
+  // Subtracting 2 is equivalent to adding 1110.  Subtracting 4 is equivalent to adding 1100.  Prepend leading 1s to do a free subtraction.
+  // This also means only one extra fractional bit is needed because we never shift right by more than 1.
+  // Radix      Exponent odd          Exponent Even
+  // 2          x-2 = 2(x/2 - 1)      x/2 - 2 = 2(x/4 - 1)
+  // 4          2(x)-4 = 4(x/2 - 1))  2(x/2)-4 = 4(x/4 - 1)
+  // Summary: PreSqrtX = r(x/2or4 - 1)
+  // NOTE(review): the description above does not match the mux below, which zero-extends X/2 or X/4
+  // and prepends no leading 1s -- confirm that the R(X-1) step is done elsewhere
+
+  logic [P.DIVb:0] PreSqrtX;   // NOTE(review): declared but not referenced by any active code
+  assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
+  mux2 #(P.DIVb+4) sqrtxmux({4'b0,Xnorm[P.DIVb:1]}, {5'b00, Xnorm[P.DIVb:2]}, EvenExp, SqrtX); // X/2 if exponent odd, X/4 if exponent even
+
+/*  
+  // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift
+  // This saves one bit in DIVb because there is no initial right shift.
+  // However, C needs to be extended further, lest it create a k with a 1 in the lsb when C is all 1s.
+  // That is an optimization for another day.
+  if (P.RADIX == 2) begin
+    logic [P.DIVb:0] PreSqrtX;    // U1.DIVb
+    mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
+    assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
+  end else begin
+    logic [P.DIVb+1:0] PreSqrtX;  // U2.DIVb
+    mux2 #(P.DIVb+2) sqrtxmux({Xnorm, 1'b0}, {1'b0, Xnorm}, EvenExp, PreSqrtX); // 2X if exponent odd, X if exponent even
+    assign SqrtX = {2'b11, PreSqrtX};                     // PreSqrtX - 4 = 4(PreSqrtX/4 - 1)
+  end
+*/
+
+  // Initialize X for division or square root
+  mux2 #(P.DIVb+4) prexmux(DivX, SqrtX, SqrtE, PreShiftX);                    
+
+  //////////////////////////////////////////////////////
+  // Select integer or floating-point operands
+  //////////////////////////////////////////////////////
+ if (P.IDIV_ON_FPU) begin
+    mux2 #(P.DIVb+4) xmux(PreShiftX, DivXShifted, IntDivE, X);
+  end else begin
+    assign X = PreShiftX;
+  end
+
+  // Divisor register
+  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dnorm}, D);
+ 
+  // Floating-point exponent
+  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .ell, .m(mE), .Ue(UeE));
+  flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
+
+  // Number of FSM cycles (to FSM)
+  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
+
+  if (P.IDIV_ON_FPU) begin:intpipelineregs
+    logic [P.DIVBLEN-1:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
+    logic               RemOpE;
+
+    /* verilator lint_off WIDTH */
+    assign IntDivNormShiftE = P.INTDIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    assign IntRemNormShiftE = mE + (P.INTDIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
+    /* verilator lint_on WIDTH */
+    assign RemOpE = Funct3E[1];
+    mux2 #(P.DIVBLEN) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);
+
+    // pipeline registers
+    flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
+    flopen #(1)         altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
+    flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
+    flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
+    flopen #(1)        bsignreg(clk, IFDivStartE, BsE,      BsM);
+    flopen #(P.DIVBLEN)   nsreg(clk, IFDivStartE, IntNormShiftE, IntNormShiftM); 
+    flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
+    if (P.XLEN==64) 
+      flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
+  end
+
+endmodule
+
--- a/src/fpu/divremsqrt/divremsqrtflags.sv
+++ b/src/fpu/divremsqrt/divremsqrtflags.sv
@ -0,0 +1,183 @@
+
+///////////////////////////////////////////
+// divremsqrtflags.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: Post-Processing flag calculation
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module divremsqrtflags import cvw::*;  #(parameter cvw_t P) (
+  input  logic                Xs,                     // X sign
+  input  logic [P.FMTBITS-1:0] OutFmt,                 // output format
+  input  logic                InfIn,                  // is an Inf input being used
+  input  logic                XInf, YInf,             // inputs are infinity
+  input  logic                NaNIn,                  // is a NaN input being used
+  input  logic                XSNaN, YSNaN,           // inputs are signaling NaNs
+  input  logic                XZero, YZero,           // inputs are zero
+  input  logic [P.NE+1:0]      FullRe,                 // Re with bits to determine sign and overflow
+  input  logic [P.NE+1:0]      Me,                     // normalized exponent (compared against 0 for the underflow flag)
+  // rounding
+  input  logic                Plus1,                  // do you add one for rounding (not read in this module)
+  input  logic                Round, Guard, Sticky,   // bits used to determine rounding
+  input  logic                UfPlus1,                // do you add one for rounding for the unbounded exponent result
+  // divsqrt
+  input  logic                DivOp,                  // divide/sqrt operation select: gates DivInvalid and DivByZero
+  input  logic                Sqrt,                   // Sqrt?
+  // flags
+  output logic                DivByZero,              // divide by zero flag
+  output logic                Overflow,               // overflow flag to select result
+  output logic                Invalid,                // invalid flag to select the result
+  output logic [4:0]          PostProcFlg             // flags: {Invalid, DivByZero, Overflow, Underflow, Inexact}
+);
+
+  logic               SigNaN;         // is an input a signaling NaN
+  logic               Inexact;        // final inexact flag
+  logic               FpInexact;      // floating point inexact flag
+  logic               DivInvalid;     // divide/sqrt invalid-operation condition
+  logic               Underflow;      // Underflow flag
+  logic               ResExpGteMax;   // is the result greater than or equal to the maximum floating point exponent
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Overflow
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Determine if the result exponent is greater than or equal to the maximum
+  // exponent of the selected output format.  For a format with an NEx-bit exponent
+  // this is true when:
+  //      - the low NEx bits of FullRe are all ones (exponent equals the maximum), or
+  //      - any bit of FullRe from bit NEx up to the widest format's exponent width is set
+  //        (exponent exceeds the maximum)
+  // The sign bit FullRe[P.NE+1] is excluded here and checked separately in Overflow.
+  // No integer-conversion shift check is performed in this module.
+  if (P.FPSIZES == 1) begin
+      assign ResExpGteMax = &FullRe[P.NE-1:0] | FullRe[P.NE];
+
+  end else if (P.FPSIZES == 2) begin    
+      // OutFmt set selects the P.NE-wide format, otherwise the P.NE1-wide format
+      assign ResExpGteMax = OutFmt ? &FullRe[P.NE-1:0] | FullRe[P.NE] : &FullRe[P.NE1-1:0] | (|FullRe[P.NE:P.NE1]);
+
+  end else if (P.FPSIZES == 3) begin
+      always_comb
+          case (OutFmt)
+              P.FMT: ResExpGteMax = &FullRe[P.NE-1:0] | FullRe[P.NE];
+              P.FMT1: ResExpGteMax = &FullRe[P.NE1-1:0] | (|FullRe[P.NE:P.NE1]);
+              P.FMT2: ResExpGteMax = &FullRe[P.NE2-1:0] | (|FullRe[P.NE:P.NE2]);
+              default: ResExpGteMax = 1'bx;
+          endcase
+
+  end else if (P.FPSIZES == 4) begin        
+      // NOTE(review): unlike the FPSIZES == 3 case there is no default arm here;
+      // presumably the four *_FMT encodings cover every OutFmt value -- confirm
+      always_comb
+          case (OutFmt)
+              P.Q_FMT: ResExpGteMax = &FullRe[P.Q_NE-1:0] | FullRe[P.Q_NE];
+              P.D_FMT: ResExpGteMax = &FullRe[P.D_NE-1:0] | (|FullRe[P.Q_NE:P.D_NE]);
+              P.S_FMT: ResExpGteMax = &FullRe[P.S_NE-1:0] | (|FullRe[P.Q_NE:P.S_NE]);
+              P.H_FMT: ResExpGteMax = &FullRe[P.H_NE-1:0] | (|FullRe[P.Q_NE:P.H_NE]);
+          endcase
+  end
+
+
+  // calculate overflow flag:
+  //                 if the result is greater than or equal to the max exponent (not taking into account sign)
+  //                 |           and the exponent isn't negative
+  //                 |           |                   and no input is infinity or NaN, and this is not a divide by zero
+  //                 |           |                   |            
+  assign Overflow = ResExpGteMax & ~FullRe[P.NE+1]&~(InfIn|NaNIn|DivByZero);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Underflow
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // calculate underflow flag: detecting tininess after rounding
+  //                  the exponent is negative
+  //                  |                    the result is subnormal
+  //                  |                    |                    the result is normal and rounded from a Subnorm
+  //                  |                    |                    |                                      and if given an unbounded exponent the result does not round
+  //                  |                    |                    |                                      |                     and if the result is not exact
+  //                  |                    |                    |                                      |                     |               and if the input isn't infinity or NaN
+  //                  |                    |                    |                                      |                     |               |
+  // alternative form (Sticky masked by ~XZero), kept commented out:
+  //assign Underflow = ((FullRe[P.NE+1] | (FullRe == 0) | ((FullRe == 1) & (Me == 0) & ~(UfPlus1&Guard)))&(Round|(Sticky&~XZero)|Guard))&~(InfIn|NaNIn|DivByZero|Invalid);
+  assign Underflow = ((FullRe[P.NE+1] | (FullRe == 0) | ((FullRe == 1) & (Me == 0) & ~(UfPlus1&Guard)))&(Round|(Sticky)|Guard))&~(InfIn|NaNIn|DivByZero|Invalid);
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Inexact
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Set Inexact flag if the result is different from what would be output given infinite precision,
+  // i.e. any of Sticky/Guard/Round is set or the result overflowed.
+  //      - suppressed when an input is Inf or NaN, on divide by zero, on an invalid operation,
+  //        or when X is zero
+  // alternative form (only Sticky masked by ~XZero), kept commented out:
+  //assign FpInexact = ((Sticky&~XZero)|Guard|Overflow|Round)&~(InfIn|NaNIn|DivByZero|Invalid);
+  assign FpInexact = (Sticky|Guard|Overflow|Round)&~(InfIn|NaNIn|DivByZero|Invalid|XZero);
+
+  // select the inexact flag to output
+  //      - only the floating-point inexact flag exists in this module, so it is passed straight through
+  assign Inexact = FpInexact;
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Invalid
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Set Invalid flag for following cases:
+  //   1) any input is a signaling NaN
+  //   2) divide (Sqrt = 0): Inf / Inf or 0 / 0
+  //   3) square root (Sqrt = 1): X is negative, not zero, and no input is NaN
+  // Cases 2 and 3 only apply when DivOp is set.
+
+
+  assign SigNaN = (XSNaN) | (YSNaN) ;
+
+  // invalid condition for division and square root
+  assign DivInvalid = ((XInf & YInf) | (XZero & YZero))&~Sqrt | (Xs&Sqrt&~NaNIn&~XZero);
+
+  assign Invalid = SigNaN | (DivInvalid&DivOp);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Divide by Zero
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // if dividing by zero and not 0/0
+  //  - don't set flag if an input is NaN or Inf (IEEE says has to be a finite numerator)
+  //  - never set for square root
+  assign DivByZero = YZero&DivOp&~Sqrt&~(XZero|NaNIn|InfIn);  
+
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // final flags
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // Combine flags, most significant bit first:
+  //      {Invalid, DivByZero, Overflow, Underflow, Inexact}
+  assign PostProcFlg = {Invalid, DivByZero, Overflow, Underflow, Inexact};
+
+endmodule
+
+
+
+
--- a/src/fpu/divremsqrt/divremsqrtintspecialcase.sv
+++ b/src/fpu/divremsqrt/divremsqrtintspecialcase.sv
@ -0,0 +1,15 @@
+module divremsqrtintspecialcase import cvw::*; #(parameter cvw_t P) (
+    input logic BZeroM,RemOpM, ALTBM,
+    input logic [P.XLEN-1:0] AM,
+    input  signed [P.INTDIVb+3:0] PreIntResultM,
+    output logic [P.XLEN-1:0] IntDivResultM
+);
+  // Integer divide/remainder special-case selection.
+  //   BZeroM : divide by zero      -> AM when RemOpM is set, otherwise all ones
+  //   ALTBM  : numerator is small  -> AM when RemOpM is set, otherwise zero
+  //   else                         -> low XLEN bits of PreIntResultM
+  // BZeroM takes priority over ALTBM.
+  always_comb begin
+    // ordinary case: pass the computed result through
+    IntDivResultM = PreIntResultM[P.XLEN-1:0];
+
+    if (BZeroM) begin
+      // divide by zero: start from all ones, override with AM for RemOpM
+      IntDivResultM = {(P.XLEN){1'b1}};
+      if (RemOpM) IntDivResultM = AM;
+    end else if (ALTBM) begin
+      // numerator is small: start from zero, override with AM for RemOpM
+      IntDivResultM = 0;
+      if (RemOpM) IntDivResultM = AM;
+    end
+  end
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtlzc.sv
+++ b/src/fpu/divremsqrt/divremsqrtlzc.sv
@ -0,0 +1,39 @@
+///////////////////////////////////////////
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: Leading Zero Counter
+// 
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Leading zero counter: scans num from its most significant bit downward and
+// reports how many zero bits precede the first one.
+//   WIDTH   - number of bits in num
+//   num     - value to examine
+//   ZeroCnt - count of leading zeroes, $clog2(WIDTH) bits wide
+// NOTE(review): when num is all zeroes the loop ends with i == WIDTH and only the
+// low $clog2(WIDTH) bits of i are kept, so the count is truncated (e.g. it reads
+// back as 0 when WIDTH is a power of two) -- confirm callers never rely on that case.
+// NOTE(review): with the default WIDTH = 1, $clog2(WIDTH)-1 is -1, giving a
+// [-1:0] range for ZeroCnt -- presumably every instance overrides WIDTH; verify.
+module divremsqrtlzc #(parameter WIDTH = 1) (
+  input  logic [WIDTH-1:0]            num,    // number to count the leading zeroes of
+  output logic [$clog2(WIDTH)-1:0]  ZeroCnt // the number of leading zeroes
+);
+
+  integer i; // loop index; after the loop it holds the number of leading zeroes found
+  
+  always_comb begin
+    i = 0;
+    // advance while still inside num and the bit at position WIDTH-1-i is zero;
+    // the (i < WIDTH) term stops the loop once every bit has been examined
+    while ((i < WIDTH) & ~num[WIDTH-1-i]) i = i+1;  // search for leading one
+    // keep only as many bits of the 32-bit index as the output port holds
+    ZeroCnt = i[$clog2(WIDTH)-1:0];
+  end
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtnormshift.sv
+++ b/src/fpu/divremsqrt/divremsqrtnormshift.sv
@ -0,0 +1,81 @@
+///////////////////////////////////////////
+// divremsqrtnormshift.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: normalization shifter
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+    // convert shift
+    //      fp -> int:  | `XLEN  zeros      |     Mantissa      | 0's if necessary | << CalcExp
+    //          process:
+    //              - start - CalcExp = 1 + XExp - Largest Bias
+    //                  | `XLEN  zeros      |     Mantissa      | 0's if necessary |
+    //
+    //              - shift left 1 (1)
+    //                  | `XLEN-1 zeros |bit|     frac          | 0's if necessary |
+    //                                      . <- binary point
+    //
+    //              - shift left till unbiased exponent is 0 (XExp - Largest Bias)
+    //                  |  0's |     Mantissa      |      0's if necessary     |
+    //                  |     keep          |
+    //
+    //      fp -> fp:
+    //          - if result is subnormal or underflowed:
+    //              |  `NF-1  zeros   |     Mantissa      | 0's if necessary | << NF+CalcExp-1
+    //          process:
+    //             - start
+    //                 |     mantissa      | 0's |
+    //
+    //             - shift right by NF-1 (NF-1)
+    //                 |    `NF-1  zeros   |     mantissa      | 0's |
+    //
+    //             - shift left by CalcExp = XExp - Largest bias + new bias
+    //                 |   0's  |     mantissa      |     0's      |
+    //                 |       keep      |
+    //
+    //          - if the input is subnormal:
+    //                 |     lzcIn      | 0's if necessary | << ZeroCnt+1
+    //              - plus 1 to shift out the first 1
+    //
+    //      int -> fp: |     lzcIn      | 0's if necessary | << ZeroCnt+1
+    //              - plus 1 to shift out the first 1
+
+    // fma shift
+    //      |   00   |           Sm           | << LZA output
+    //             .
+    //      - two extra bits so we can correct for an LZA error of 1 or 2
+
+    // divsqrt shift
+    //      | Nf 0's |           Qm           | << calculated shift amount
+    //        .
+
+module divremsqrtnormshift import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.LOGNORMSHIFTSZDRSU-1:0]  ShiftAmt,   // shift amount
+  input  logic [P.NORMSHIFTSZDRSU-1:0]     ShiftIn,    // number to be shifted
+  output logic [P.NORMSHIFTSZDRSU-1:0]     Shifted     // shifted result
+);
+
+  // Normalization shifter: logical left shift of ShiftIn by ShiftAmt bit positions.
+  // Bits shifted past the top of the NORMSHIFTSZDRSU-wide word are dropped and
+  // zeroes are shifted in at the bottom.
+  always_comb
+    Shifted = ShiftIn << ShiftAmt;
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtpostprocess.sv
+++ b/src/fpu/divremsqrt/divremsqrtpostprocess.sv
@ -0,0 +1,177 @@
+///////////////////////////////////////////
+// divremsqrtpostprocess.sv
+//
+// Written: kekim@hmc.edu
+// Modified: 19 May 2023
+//
+// Purpose: Post-Processing: normalization, rounding, sign, flags, special cases
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+module divremsqrtpostprocess import cvw::*;  #(parameter cvw_t P)  (
+  // general signals
+  input logic                             Xs, Ys,     // input signs
+  input logic  [P.NF:0]                    Xm, Ym,     // input mantissas
+  input logic  [2:0]                      Frm,        // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+  input logic  [P.FMTBITS-1:0]             Fmt,        // precision of the operation; forwarded unchanged as the output format
+  input logic  [3:0]                      OpCtrl,     // operation control; only OpCtrl[0] (square root select) is read here
+  input logic                             XZero, YZero,        // inputs are zero
+  input logic                             XInf, YInf,          // inputs are infinity
+  input logic                             XNaN, YNaN,          // inputs are NaN
+  input logic                             XSNaN, YSNaN,        // inputs are signaling NaNs
+  input logic  [1:0]                      PostProcSel,         // select result to be written to fp register
+  // divide signals
+  input logic                             DivSticky,  // divider sticky bit
+  input logic  [P.NE+1:0]                  DivUe,      // divsqrt exponent
+  input logic  [P.NF+2:0]                  DivUm,      // divsqrt significand
+  input logic  [P.DIVBLEN-1:0]             IntNormShiftM, // integer normalization left-shift amount (after pre-shifting right)
+  input logic  [P.INTDIVb+3:0]          PreResultM, // integer result to be shifted
+  input logic                              IntDivM,
+  // final results
+  output logic [P.FLEN-1:0]                PostProcRes,// postprocessor final result
+  output logic [4:0]                      PostProcFlg, // postprocessor flags
+  output logic [P.XLEN-1:0]  PreIntResultM // normalized integer result
+  );
+
+  // Post-processing for the divide/remainder/square-root unit:
+  //   shift calculation -> normalization shift -> shift correction -> rounding
+  //   -> flag generation -> special-case result selection.
+  //
+  // NOTE(review): IntNormShiftM, PreResultM and IntDivM are not referenced by any
+  // logic in this module, and nothing assigns PreIntResultM, so the integer
+  // result output is left undriven here -- confirm the integer path is still to
+  // be added.
+
+  // general signals
+  logic                       Rs;         // result sign
+  logic [P.NF-1:0]             Rf;         // Result fraction
+  logic [P.NE-1:0]             Re;         // Result exponent
+  logic                       Ms;         // norMalized sign
+  logic [P.NORMSHIFTSZDRSU-1:0]    Mf;         // norMalized fraction
+  logic [P.NE+1:0]             Me;         // normalized exponent
+  logic [P.NE+1:0]             FullRe;     // Re with bits to determine sign and overflow
+  logic                       UfPlus1;    // do you add one (for determining underflow flag)
+  logic [P.LOGNORMSHIFTSZDRSU-1:0] ShiftAmt;   // normalization shift amount
+  logic [P.NORMSHIFTSZDRSU-1:0]    ShiftIn;    // input to normalization shift
+  logic [P.NORMSHIFTSZDRSU-1:0]    Shifted;    // the output of the normalization shifter (before shift correction)
+  logic                       Plus1;      // add one to the final result?
+  logic                       Overflow;   // overflow flag used to select results
+  logic                       Invalid;    // invalid flag used to select results
+  logic                       Guard, Round, Sticky; // bits needed to determine rounding
+  logic [P.FMTBITS-1:0]        OutFmt;     // output format
+  // division signals
+  logic [P.LOGNORMSHIFTSZDRSU-1:0] DivShiftAmt;        // divsqrt shift amount
+  logic [P.NORMSHIFTSZDRSU-1:0]    DivShiftIn;         // divsqrt shift input
+  logic [P.NE+1:0]             Ue;                 // divsqrt corrected exponent after correction shift
+  logic                       DivByZero;          // divide by zero flag
+  logic                       DivResSubnorm;      // is the divsqrt result subnormal
+  logic                       DivSubnormShiftPos; // is the divsqrt subnorm shift amount positive (not underflowed)
+  // readability signals
+  logic                       Sqrt;       // is the divsqrt operation sqrt
+  logic                       DivOp;      // divider operation
+  logic                       InfIn;      // are any of the inputs infinity
+  logic                       NaNIn;      // are any of the inputs NaN
+
+  // signals to help readability
+
+  // DivOp is decoded from PostProcSel but is not read below: every submodule
+  // instance in this file ties its DivOp port to 1'b1.
+  assign DivOp = (PostProcSel == 2'b01);
+  assign Sqrt =  OpCtrl[0];
+
+  // is there an input of infinity or NaN being used
+  assign InfIn = XInf|YInf;
+  assign NaNIn = XNaN|YNaN;
+
+  // The output format is always the input format Fmt.  It is driven for every
+  // value of P.FPSIZES (including 1) so the OutFmt ports of the submodules below
+  // are never left floating.
+  assign OutFmt = Fmt;
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Normalization
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // final calculations before shifting
+
+  divremsqrtdivshiftcalc #(P) divremsqrtdivshiftcalc(.DivUe, .DivUm, .DivResSubnorm, .DivSubnormShiftPos, .DivShiftAmt, .DivShiftIn);
+
+  // the divsqrt shift is the only shift source in this postprocessor
+  assign ShiftAmt = DivShiftAmt;
+  assign ShiftIn = DivShiftIn;
+  
+  // main normalization shift
+  divremsqrtnormshift #(P) divremsqrtnormshift (.ShiftIn, .ShiftAmt, .Shifted);
+
+  // correct the shifted divsqrt result and its exponent
+  divremsqrtshiftcorrection #(P) shiftcorrection(.DivResSubnorm, .DivSubnormShiftPos, .DivOp(1'b1), .DivUe, .Ue, .Shifted, .Mf);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Rounding
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // round to nearest even
+  // round to zero
+  // round to -infinity
+  // round to infinity
+  // round to nearest max magnitude
+
+  // calculate result sign used in rounding unit
+  divremsqrtroundsign #(P) roundsign( .DivOp(1'b1), .Sqrt, .Xs, .Ys, .Ms);
+
+  divremsqrtround #(P) round(.OutFmt, .Frm, .Plus1, .Ue,
+      .Ms, .Mf, .DivSticky, .DivOp(1'b1), .UfPlus1, .FullRe, .Rf, .Re, .Sticky, .Round, .Guard, .Me);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Sign calculation
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // the result sign is the normalized sign from the round-sign unit
+  assign Rs = Ms;
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Flags
+  ///////////////////////////////////////////////////////////////////////////////
+
+  divremsqrtflags #(P) flags(.XSNaN, .YSNaN, .XInf, .YInf, .InfIn, .XZero, .YZero, 
+              .Xs, .OutFmt, .Sqrt,
+              .NaNIn, .Round, .DivByZero,
+              .Guard, .Sticky, .UfPlus1,.DivOp(1'b1), .FullRe, .Plus1,
+              .Me, .Invalid, .Overflow, .PostProcFlg);
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Select the result
+  ///////////////////////////////////////////////////////////////////////////////
+
+  divremsqrtspecialcase #(P) specialcase(.Xs, .Xm, .Ym, .XZero, 
+      .Frm, .OutFmt, .XNaN, .YNaN,  
+      .NaNIn, .Plus1, .Invalid, .Overflow, .InfIn,
+      .XInf, .YInf, .DivOp(1'b1), .DivByZero, .FullRe, .Rs, .Re, .Rf, .PostProcRes );
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtround.sv
+++ b/src/fpu/divremsqrt/divremsqrtround.sv
@ -0,0 +1,267 @@
+///////////////////////////////////////////
+// divremsqrtround.sv
+//
+// Written: kekim@hmc.edu, me@KatherineParry.com
+// Modified: 19 May 2023
+//
+// Purpose: Rounder
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+module divremsqrtround import cvw::*;  #(parameter cvw_t P)  (
+  input  logic [P.FMTBITS-1:0]     OutFmt,             // output format
+  input  logic [2:0]              Frm,                // rounding mode
+  input  logic                    Ms,                 // normalized sign
+  input  logic [P.NORMSHIFTSZDRSU-1:0] Mf,                 // normalized fraction
+  // divsqrt
+  input  logic                    DivOp,              // is a division opperation being done
+  input  logic                    DivSticky,          // divsqrt sticky bit
+  input  logic [P.NE+1:0]          Ue,                 // the divsqrt calculated expoent
+  // outputs
+  output logic [P.NE+1:0]          Me,                 // normalied fraction
+  output logic                    UfPlus1,            // do you add one to the result if given an unbounded exponent
+  output logic [P.NE+1:0]          FullRe,             // Re with bits to determine sign and overflow
+  output logic [P.NE-1:0]          Re,                 // Result exponent
+  output logic [P.NF-1:0]          Rf,                 // Result fractionNormS
+  output logic                    Sticky,             // sticky bit
+  output logic                    Plus1,              // do you add one to the final result
+  output logic                    Round, Guard        // bits needed to calculate rounding
+);
+
+  logic           UfCalcPlus1;        // calculated plus one for unbounded exponent
+  logic           NormSticky;         // normalized sum's sticky bit
+  logic [P.NF-1:0] RoundFrac;          // rounded fraction
+  logic           FpGuard, FpRound;   // floating point round/guard bits
+  logic           FpLsbRes;           // least significant bit of floating point result
+  logic           LsbRes;             // lsb of result
+  logic           CalcPlus1;          // calculated plus1
+  logic           FpPlus1;            // do you add one to the fp result 
+  logic [P.FLEN:0] RoundAdd;           // how much to add to the result
+
+// what position is XLEN in?
+//  options: 
+//     1: XLEN > NF   > NF1
+//     2: NF   > XLEN > NF1
+//     3: NF   > NF1  > XLEN
+//  single and double will always be smaller than XLEN
+
+  ///////////////////////////////////////////////////////////////////////////////
+  // Rounding
+  ///////////////////////////////////////////////////////////////////////////////
+
+  // round to nearest even
+  //      {Round, Sticky}
+  //      0x - do nothing
+  //      10 - tie - Plus1 if result is odd  (LSBNormSum = 1)
+  //          - don't add 1 if a small number was supposed to be subtracted
+  //      11 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
+  //         - plus 1 otherwise
+
+  //  round to zero - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
+
+  //  round to -infinity
+  //          - Plus1 if negative unless a small number was supposed to be subtracted from a result with guard and round bits of 0
+  //          - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
+
+  //  round to infinity
+  //          - Plus1 if positive unless a small number was supposed to be subtracted from a result with guard and round bits of 0
+  //          - subtract 1 if a small number was supposed to be subtracted from a negative result with guard and round bits of 0
+
+  //  round to nearest max magnitude
+  //      {Guard, Round, Sticky}
+  //      0x - do nothing
+  //      10 - tie - Plus1
+  //          - don't add 1 if a small number was supposed to be subtracted
+  //      11 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
+  //         - Plus 1 otherwise
+
+
+  // determine what format the final result is in: int or fp
+
+  // sticky bit calculation
+  if (P.FPSIZES == 1) begin
+    assign NormSticky = (|Mf[P.NORMSHIFTSZDRSU-P.NF-2:0]);
+
+  end else if (P.FPSIZES == 2) begin
+    assign NormSticky = (|Mf[P.NORMSHIFTSZDRSU-P.NF1-2:P.NORMSHIFTSZDRSU-P.NF-1]&(~OutFmt)) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.NF-2:0]);
+
+
+  end else if (P.FPSIZES == 3) begin
+
+    assign NormSticky = (|Mf[P.NORMSHIFTSZDRSU-P.NF2-2:P.NORMSHIFTSZDRSU-P.NF1-1]&(OutFmt==P.FMT2)) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.NF1-2:P.NORMSHIFTSZDRSU-P.NF-1]&(~(OutFmt==P.FMT))) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.NF-2:0]);
+
+  end else if (P.FPSIZES == 4) begin
+    assign NormSticky = (|Mf[P.NORMSHIFTSZDRSU-P.H_NF-2:P.NORMSHIFTSZDRSU-P.Q_NF-1]&(OutFmt==P.H_FMT)) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.S_NF-2:P.NORMSHIFTSZDRSU-P.Q_NF-1]&((OutFmt==P.S_FMT))) | 
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.D_NF-2:P.NORMSHIFTSZDRSU-P.Q_NF-1]&((OutFmt==P.D_FMT))) |
+                                                (|Mf[P.NORMSHIFTSZDRSU-P.Q_NF-2:0]&(OutFmt==P.Q_FMT));
+  end
+  
+
+
+  // only add the Addend sticky if doing an FMA opperation
+  //      - the shifter shifts too far left when there's an underflow (shifting out all possible sticky bits)
+  //assign Sticky = DivSticky&DivOp | NormSticky | StickySubnorm;
+  assign Sticky = DivSticky&DivOp | NormSticky;
+  //assign Sticky = DivSticky&DivOp;
+  
+
+
+
+  // determine round and LSB of the rounded value
+  //      - underflow round bit is used to determint the underflow flag
+  if (P.FPSIZES == 1) begin
+      assign FpGuard = Mf[P.NORMSHIFTSZDRSU-P.NF-1];
+      assign FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.NF];
+      assign FpRound = Mf[P.NORMSHIFTSZDRSU-P.NF-2];
+
+  end else if (P.FPSIZES == 2) begin
+      assign FpGuard = OutFmt ? Mf[P.NORMSHIFTSZDRSU-P.NF-1] : Mf[P.NORMSHIFTSZDRSU-P.NF1-1];
+      assign FpLsbRes = OutFmt ? Mf[P.NORMSHIFTSZDRSU-P.NF] : Mf[P.NORMSHIFTSZDRSU-P.NF1];
+      assign FpRound = OutFmt ? Mf[P.NORMSHIFTSZDRSU-P.NF-2] : Mf[P.NORMSHIFTSZDRSU-P.NF1-2];
+
+  end else if (P.FPSIZES == 3) begin
+      always_comb
+          case (OutFmt)
+              P.FMT: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.NF-2];
+              end
+              P.FMT1: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.NF1-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.NF1];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.NF1-2];
+              end
+              P.FMT2: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.NF2-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.NF2];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.NF2-2];
+              end
+              default: begin
+                  FpGuard = 1'bx;
+                  FpLsbRes = 1'bx;
+                  FpRound = 1'bx;
+              end
+          endcase
+  end else if (P.FPSIZES == 4) begin
+      always_comb
+          case (OutFmt)
+              2'h3: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.Q_NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.Q_NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.Q_NF-2];
+              end
+              2'h1: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.D_NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.D_NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.D_NF-2];
+              end
+              2'h0: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.S_NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.S_NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.S_NF-2];
+              end
+              2'h2: begin
+                  FpGuard = Mf[P.NORMSHIFTSZDRSU-P.H_NF-1];
+                  FpLsbRes = Mf[P.NORMSHIFTSZDRSU-P.H_NF];
+                  FpRound = Mf[P.NORMSHIFTSZDRSU-P.H_NF-2];
+              end
+          endcase
+  end
+
+  
+  assign Guard =  FpGuard;
+  assign LsbRes = FpLsbRes;
+  assign Round =  FpRound;
+
+
+  always_comb begin
+      // Determine if you add 1
+      case (Frm)
+          3'b000: CalcPlus1 = Guard & (Round|Sticky|LsbRes);//round to nearest even
+          3'b001: CalcPlus1 = 0;//round to zero
+          3'b010: CalcPlus1 = Ms;//round down
+          3'b011: CalcPlus1 = ~Ms;//round up
+          3'b100: CalcPlus1 = Guard;//round to nearest max magnitude
+          default: CalcPlus1 = 1'bx;
+      endcase
+      // Determine if you add 1 (for underflow flag)
+      case (Frm)
+          3'b000: UfCalcPlus1 = Round & (Sticky|Guard);//round to nearest even
+          3'b001: UfCalcPlus1 = 0;//round to zero
+          3'b010: UfCalcPlus1 = Ms;//round down
+          3'b011: UfCalcPlus1 = ~Ms;//round up
+          3'b100: UfCalcPlus1 = Round;//round to nearest max magnitude
+          default: UfCalcPlus1 = 1'bx;
+      endcase
+  
+  end
+
+  // If an answer is exact don't round
+  assign Plus1 = CalcPlus1 & (Sticky|Round|Guard);
+  assign FpPlus1 = Plus1;
+  assign UfPlus1 = UfCalcPlus1 & (Sticky|Round);
+
+
+
+
+  // place Plus1 into the proper position for the format
+  if (P.FPSIZES == 1) begin
+      assign RoundAdd = {{P.FLEN{1'b0}}, FpPlus1};
+
+  end else if (P.FPSIZES == 2) begin
+      // \/FLEN+1
+      //  | NE+2 |        NF      |
+      //  '-NE+2-^----NF1----^
+      // P.FLEN+1-P.NE-2-P.NF1 = FLEN-1-NE-NF1
+      assign RoundAdd = {(P.NE+1+P.NF1)'(0), FpPlus1&~OutFmt, (P.NF-P.NF1-1)'(0), FpPlus1&OutFmt};
+
+  end else if (P.FPSIZES == 3) begin
+      assign RoundAdd = {(P.NE+1+P.NF2)'(0), FpPlus1&(OutFmt==P.FMT2), (P.NF1-P.NF2-1)'(0), FpPlus1&(OutFmt==P.FMT1), (P.NF-P.NF1-1)'(0), FpPlus1&(OutFmt==P.FMT)};
+
+  end else if (P.FPSIZES == 4)      
+      assign RoundAdd = {(P.Q_NE+1+P.H_NF)'(0), FpPlus1&(OutFmt==P.H_FMT), (P.S_NF-P.H_NF-1)'(0), FpPlus1&(OutFmt==P.S_FMT), (P.D_NF-P.S_NF-1)'(0), FpPlus1&(OutFmt==P.D_FMT), (P.Q_NF-P.D_NF-1)'(0), FpPlus1&(OutFmt==P.Q_FMT)};
+
+
+
+  // trim unneeded bits from fraction
+  assign RoundFrac = Mf[P.NORMSHIFTSZDRSU-1:P.NORMSHIFTSZDRSU-P.NF];
+  
+
+
+  // select the exponent
+  assign Me = Ue;
+
+
+
+  // round the result
+  //      - if the fraction overflows one should be added to the exponent
+  assign {FullRe, Rf} = {Me, RoundFrac} + RoundAdd;
+  assign Re = FullRe[P.NE-1:0];
+
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtroundsign.sv
+++ b/src/fpu/divremsqrt/divremsqrtroundsign.sv
@ -0,0 +1,45 @@
+///////////////////////////////////////////
+// divremsqrtroundsign.sv
+//
+// Written: kekim@hmc.edu,me@KatherineParry.com
+// Modified: 19 May 2023
+//
+// Purpose: Sign calculation for rounding
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Sign selection for the divide/sqrt rounding logic.
+//   Ms = Xs ^ Ys for a divide, Xs for a square root, and 0 when DivOp is low.
+module divremsqrtroundsign import cvw::*;  #(parameter cvw_t P) (
+  input logic         Xs,     // x sign
+  input logic         Ys,     // y sign
+  input logic         Sqrt,   // sqrt operation? (when using divsqrt unit)
+  input logic         DivOp,  // is divsqrt operation
+  output logic        Ms      // normalized result sign
+);
+
+  logic               YsMasked; // y sign, forced to 0 for sqrt (y does not contribute)
+  logic               SignRaw;  // divsqrt result sign before gating with DivOp
+
+  // y's sign only participates in a divide
+  assign YsMasked = Sqrt ? 1'b0 : Ys;
+
+  // divsqrt result sign
+  assign SignRaw = Xs ^ YsMasked;
+
+  // sign handed to the rounding calculation; zero unless a divsqrt op is selected
+  assign Ms = DivOp ? SignRaw : 1'b0;
+
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtshiftcorrection.sv
+++ b/src/fpu/divremsqrt/divremsqrtshiftcorrection.sv
@ -0,0 +1,94 @@
+///////////////////////////////////////////
+// divremsqrtshiftcorrection.sv
+//
+// Written: me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: shift correction
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// divremsqrtshiftcorrection
+//   Post-normalization fix-up for the divide/sqrt result:
+//     - Mf: Shifted moved left by one or two bit positions (zero filled), or passed
+//           through unchanged when the result was flagged subnormal.
+//     - Ue: DivUe, decremented by one when the MSB of Shifted is clear, or forced
+//           to zero for a subnormal result with a positive subnormal shift.
+//   The FMA-related ports and logic are commented out; DivOp is still a port but
+//   is not referenced by any active statement in this module.
+module divremsqrtshiftcorrection import cvw::*;  #(parameter cvw_t P) (
+  input logic  [P.NORMSHIFTSZDRSU-1:0] Shifted,                // normalization shifter output, before correction
+  // divsqrt
+  input logic                     DivOp,                  // is it a divsqrt operation (currently unused here)
+  input logic                     DivResSubnorm,          // is the divsqrt result subnormal
+  input logic  [P.NE+1:0]          DivUe,                  // the divsqrt result's exponent
+  input logic                     DivSubnormShiftPos,     // is the subnorm divider shift amount positive (ie not underflowed)
+  //fma
+  //input logic                     FmaOp,                  // is it an fma opperation
+  //input logic  [P.NE+1:0]          NormSumExp,             // exponent of the normalized sum not taking into account Subnormal or zero results
+  //input logic                     FmaPreResultSubnorm,    // is the result subnormal - calculated before LZA corection
+  //input logic                     FmaSZero,
+  // output
+  //output logic [P.NE+1:0]          FmaMe,                  // exponent of the normalized sum
+  output logic [P.NORMSHIFTSZDRSU-1:0] Mf,                     // the shifted result after correction
+  output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
+);
+
+  logic [P.NORMSHIFTSZDRSU-1:0]    CorrQm0, CorrQm1;           // portions of Shifted to select for CorrQmShifted
+  logic [P.NORMSHIFTSZDRSU-1:0]    CorrQmShifted;              // the shifted divsqrt result after the correction shift
+  logic                       ResSubnorm;                 // is the result Subnormal (declared but never driven: its assign is commented out below)
+  logic                       LZAPlus1;                   // MSB of Shifted; name is inherited from the FMA LZA correction
+  logic                       LeftShiftQm;                // should the divsqrt result be shifted one to the left
+
+  // LZA correction
+  //  - here this is simply the most significant bit of the shifter output
+  assign LZAPlus1 = Shifted[P.NORMSHIFTSZDRSU-1];
+
+  // correct the shifting error caused by the LZA
+  //  - the only possible mantissa for a plus two is all zeroes 
+  //      - a one has to propagate all the way through a sum. so we can leave the bottom statement alone
+  //mux2 #(P.NORMSHIFTSZDRSU-2) lzacorrmux(Shifted[P.NORMSHIFTSZDRSU-3:0], Shifted[P.NORMSHIFTSZDRSU-2:1], LZAPlus1, CorrSumShifted);
+
+  // correct the shifting of the divsqrt caused by producing a result in (2, .5] range
+  //    condition: if the msb is 1 or the exponent was one, but the shifted quotient was < 1 (Subnorm)
+  //    (the expression below reduces to LZAPlus1 | (DivUe==1))
+  assign LeftShiftQm = (LZAPlus1|(DivUe==1&~LZAPlus1));
+  //assign LeftShiftQm = ((DivUe==1));
+  // CorrQm0: Shifted moved left by two bits; CorrQm1: Shifted moved left by one bit (both zero filled)
+  assign CorrQm0 = {Shifted[P.NORMSHIFTSZDRSU-3:0],{2'b00}};
+  assign CorrQm1 = {Shifted[P.NORMSHIFTSZDRSU-2:0],{1'b0}};
+  // NOTE(review): mux2 is defined elsewhere; presumably it selects CorrQm1 when LeftShiftQm is 1 - confirm
+  mux2 #(P.NORMSHIFTSZDRSU) divcorrmux(CorrQm0, CorrQm1, LeftShiftQm, CorrQmShifted);
+  
+  // if the result of the divider was calculated to be subnormal, then the result was correctly normalized, so select the top shifted bits
+  //  - otherwise use the corrected (one or two bit left shifted) value
+  always_comb
+    //if(FmaOp)                       Mf = {CorrSumShifted, {P.NORMSHIFTSZDRSU-(3*P.NF+4){1'b0}}};
+    //if (DivOp&~DivResSubnorm)  Mf = CorrQmShifted;
+    if (~DivResSubnorm)  Mf = CorrQmShifted;
+    else                       Mf = Shifted[P.NORMSHIFTSZDRSU-1:0];
+    
+  // Determine sum's exponent
+  //  main exponent issues: 
+  //      - LZA was one too large
+  //      - LZA was two too large
+  //      - if the result was calculated to be subnorm but it's norm and the LZA was off by 1
+  //      - if the result was calculated to be subnorm but it's norm and the LZA was off by 2
+  //                          if plus1                    If plus2                               kill if the result Zero or actually subnormal
+  //                          |                           |                                      |
+  //assign FmaMe = (NormSumExp+{{P.NE+1{1'b0}}, LZAPlus1} +{{P.NE+1{1'b0}}, FmaPreResultSubnorm}) & {P.NE+2{~(FmaSZero|ResSubnorm)}};
+  
+  // recalculate if the result is subnormal after LZA correction
+  //assign ResSubnorm = FmaPreResultSubnorm&~Shifted[P.NORMSHIFTSZDRSU-2]&~Shifted[P.NORMSHIFTSZDRSU-1];
+
+  // the quotient is in the range [.5,2) if there is no early termination
+  // if the quotient < 1 and not Subnormal then subtract 1 to account for the normalization shift
+  //  - the exponent is forced to zero when the result is subnormal and the subnormal shift is positive
+  assign Ue = (DivResSubnorm & DivSubnormShiftPos) ? '0 : DivUe - {(P.NE+1)'(0), ~LZAPlus1};
+  //assign Ue = (DivResSubnorm ) ? '0 : DivUe - {(P.NE+1)'(0), ~LZAPlus1};
+endmodule
--- a/src/fpu/divremsqrt/divremsqrtspecialcase.sv
+++ b/src/fpu/divremsqrt/divremsqrtspecialcase.sv
@ -0,0 +1,240 @@
+///////////////////////////////////////////
+// divremsqrtspecialcase.sv
+//
+// Written: kekim@hmc.edu,me@KatherineParry.com
+// Modified: 7/5/2022
+//
+// Purpose: special case selection
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// divremsqrtspecialcase
+//   Builds the candidate results (NaN, invalid, overflow, underflow, normal) for the
+//   selected output format and picks one of them as PostProcRes.
+//   Selection priority (see the always_comb blocks at the bottom):
+//     IEEE754:     XNaN > YNaN > Invalid > SelOfRes > KillRes > normal
+//     otherwise:   NaNIn|Invalid > SelOfRes > KillRes > normal
+//   For formats narrower than FLEN the upper FLEN-LEN bits of every candidate are
+//   filled with ones.
+//   The Xs input is not referenced by any statement in this module.
+module divremsqrtspecialcase import cvw::*;  #(parameter cvw_t P) (
+  input  logic                Xs,         // X sign
+  input  logic [P.NF:0]        Xm, Ym, // input significands
+  input  logic                XNaN, YNaN, // are the inputs NaN
+  input  logic [2:0]          Frm,        // rounding mode
+  input  logic [P.FMTBITS-1:0] OutFmt,     // output format
+  input  logic                InfIn,      // are any inputs infinity
+  input  logic                NaNIn,      // are any input NaNs
+  input  logic                XInf, YInf, // are X or Y infinity
+  input  logic                XZero,      // is X zero
+  input  logic                Plus1,      // do you add one for rounding
+  input  logic                Rs,         // the result's sign
+  input  logic                Invalid, Overflow,  // flags to choose the result
+  input  logic [P.NE-1:0]      Re,         // Result exponent
+  input  logic [P.NE+1:0]      FullRe,     // Result full exponent
+  input  logic [P.NF-1:0]      Rf,         // Result fraction
+  // divsqrt
+  input  logic                DivOp,      // is it a divsqrt operation
+  input  logic                DivByZero,  // divide by zero flag
+  // outputs
+  output logic [P.FLEN-1:0]    PostProcRes // final result
+);
+
+  logic [P.FLEN-1:0]   XNaNRes;    // X is NaN result (only driven when P.IEEE754)
+  logic [P.FLEN-1:0]   YNaNRes;    // Y is NaN result (only driven when P.IEEE754)
+  logic [P.FLEN-1:0]   InvalidRes; // Invalid result result
+  logic [P.FLEN-1:0]   UfRes;      // underflowed result result
+  logic [P.FLEN-1:0]   OfRes;      // overflowed result result
+  logic [P.FLEN-1:0]   NormRes;    // normal result
+  logic               OfResMax;   // does the of result output maximum norm fp number
+  logic               KillRes;    // kill the result for underflow
+  logic               SelOfRes;   // should the overflow result be selected
+
+
+  // does the overflow result output the maximum normalized floating point number
+  //                output infinity if the input is infinity
+  //  - only Frm[1:0] is examined: 01, 10 with a positive result, or 11 with a negative result
+  assign OfResMax = (~InfIn)&~DivByZero&((Frm[1:0]==2'b01) | (Frm[1:0]==2'b10&~Rs) | (Frm[1:0]==2'b11&Rs));
+
+  // select correct outputs for special cases
+  //  - one generate branch per number of supported formats (P.FPSIZES)
+  if (P.FPSIZES == 1) begin
+      //NaN res selection depending on standard
+      //  - IEEE754: keep the input NaN's payload bits below the forced-one fraction MSB
+      if(P.IEEE754) begin
+          assign XNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Xm[P.NF-2:0]};
+          assign YNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Ym[P.NF-2:0]};
+          assign InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+      end else begin
+          assign InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+      end
+
+      // overflow: largest finite value (exponent all ones except LSB, fraction all ones) or infinity
+      assign OfRes =  OfResMax ? {Rs, {P.NE-1{1'b1}}, 1'b0, {P.NF{1'b1}}} : {Rs, {P.NE{1'b1}}, {P.NF{1'b0}}};
+      // underflow: signed zero, with the LSB set when Plus1 & Frm[1] and not dividing by infinity
+      assign UfRes = {Rs, {P.FLEN-2{1'b0}}, Plus1&Frm[1]&~(DivOp&YInf)};
+      assign NormRes = {Rs, Re, Rf};
+
+  end else if (P.FPSIZES == 2) begin
+      // OutFmt set selects the full-width format; OutFmt clear selects the LEN1/NE1/NF1 format
+      if(P.IEEE754) begin
+          assign XNaNRes = OutFmt ? {1'b0, {P.NE{1'b1}}, 1'b1, Xm[P.NF-2:0]} : {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.NF1]};
+          assign YNaNRes = OutFmt ? {1'b0, {P.NE{1'b1}}, 1'b1, Ym[P.NF-2:0]} : {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.NF1]};
+          assign InvalidRes = OutFmt ? {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}} : {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, (P.NF1-1)'(0)};
+      end else begin 
+          assign InvalidRes = OutFmt ? {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}} : {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, (P.NF1-1)'(0)};
+      end
+
+      always_comb
+          if(OutFmt)
+              if(OfResMax)    OfRes = {Rs, {P.NE-1{1'b1}}, 1'b0, {P.NF{1'b1}}};
+              else            OfRes = {Rs, {P.NE{1'b1}}, {P.NF{1'b0}}};
+          else
+              if(OfResMax)    OfRes = {{P.FLEN-P.LEN1{1'b1}}, Rs, {P.NE1-1{1'b1}}, 1'b0, {P.NF1{1'b1}}};
+              else            OfRes = {{P.FLEN-P.LEN1{1'b1}}, Rs, {P.NE1{1'b1}}, (P.NF1)'(0)};
+      assign UfRes = OutFmt ? {Rs, (P.FLEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)} : {{P.FLEN-P.LEN1{1'b1}}, Rs, (P.LEN1-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+      // narrower format keeps the low NE1 exponent bits and the top NF1 fraction bits
+      assign NormRes = OutFmt ? {Rs, Re, Rf} : {{P.FLEN-P.LEN1{1'b1}}, Rs, Re[P.NE1-1:0], Rf[P.NF-1:P.NF-P.NF1]};
+
+  end else if (P.FPSIZES == 3) begin
+      // three formats: P.FMT (full width), P.FMT1 (LEN1) and P.FMT2 (LEN2)
+      always_comb
+          case (OutFmt)
+              P.FMT: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Xm[P.NF-2:0]};
+                      YNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Ym[P.NF-2:0]};
+                      InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+                  end else begin 
+                      InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+                  end
+                  
+                  OfRes = OfResMax ? {Rs, {P.NE-1{1'b1}}, 1'b0, {P.NF{1'b1}}} : {Rs, {P.NE{1'b1}}, {P.NF{1'b0}}};
+                  UfRes = {Rs, (P.FLEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {Rs, Re, Rf};
+              end
+              P.FMT1: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.NF1]};
+                      YNaNRes = {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.NF1]};
+                      InvalidRes = {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, (P.NF1-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.LEN1{1'b1}}, 1'b0, {P.NE1{1'b1}}, 1'b1, (P.NF1-1)'(0)};
+                  end
+                  OfRes = OfResMax ? {{P.FLEN-P.LEN1{1'b1}}, Rs, {P.NE1-1{1'b1}}, 1'b0, {P.NF1{1'b1}}} : {{P.FLEN-P.LEN1{1'b1}}, Rs, {P.NE1{1'b1}}, (P.NF1)'(0)};
+                  UfRes = {{P.FLEN-P.LEN1{1'b1}}, Rs, (P.LEN1-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.LEN1{1'b1}}, Rs, Re[P.NE1-1:0], Rf[P.NF-1:P.NF-P.NF1]};
+              end
+              P.FMT2: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.LEN2{1'b1}}, 1'b0, {P.NE2{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.NF2]};
+                      YNaNRes = {{P.FLEN-P.LEN2{1'b1}}, 1'b0, {P.NE2{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.NF2]};
+                      InvalidRes = {{P.FLEN-P.LEN2{1'b1}}, 1'b0, {P.NE2{1'b1}}, 1'b1, (P.NF2-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.LEN2{1'b1}}, 1'b0, {P.NE2{1'b1}}, 1'b1, (P.NF2-1)'(0)};
+                  end
+                  
+                  OfRes = OfResMax ? {{P.FLEN-P.LEN2{1'b1}}, Rs, {P.NE2-1{1'b1}}, 1'b0, {P.NF2{1'b1}}} : {{P.FLEN-P.LEN2{1'b1}}, Rs, {P.NE2{1'b1}}, (P.NF2)'(0)};
+                  UfRes = {{P.FLEN-P.LEN2{1'b1}}, Rs, (P.LEN2-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.LEN2{1'b1}}, Rs, Re[P.NE2-1:0], Rf[P.NF-1:P.NF-P.NF2]};
+              end
+              // unrecognized format encoding: drive every candidate to zero
+              default: begin
+                  if(P.IEEE754) begin
+                      XNaNRes = (P.FLEN)'(0);
+                      YNaNRes = (P.FLEN)'(0);
+                      InvalidRes = (P.FLEN)'(0);
+                  end else begin 
+                      InvalidRes = (P.FLEN)'(0);
+                  end
+                  OfRes = (P.FLEN)'(0);
+                  UfRes = (P.FLEN)'(0);
+                  NormRes = (P.FLEN)'(0);
+              end
+          endcase
+
+  end else if (P.FPSIZES == 4) begin 
+      // four formats, selected by the raw 2-bit OutFmt encoding:
+      //   2'h3 full width (P.NE/P.NF), 2'h1 D_* parameters, 2'h0 S_* parameters, 2'h2 H_* parameters
+      always_comb
+          case (OutFmt)
+              2'h3: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Xm[P.NF-2:0]};
+                      YNaNRes = {1'b0, {P.NE{1'b1}}, 1'b1, Ym[P.NF-2:0]};
+                      InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+                  end else begin 
+                      InvalidRes = {1'b0, {P.NE{1'b1}}, 1'b1, {P.NF-1{1'b0}}};
+                  end
+                  
+                  OfRes = OfResMax ? {Rs, {P.NE-1{1'b1}}, 1'b0, {P.NF{1'b1}}} : {Rs, {P.NE{1'b1}}, {P.NF{1'b0}}};
+                  UfRes = {Rs, (P.FLEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {Rs, Re, Rf};
+              end
+              2'h1: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.D_LEN{1'b1}}, 1'b0, {P.D_NE{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.D_NF]};
+                      YNaNRes = {{P.FLEN-P.D_LEN{1'b1}}, 1'b0, {P.D_NE{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.D_NF]};
+                      InvalidRes = {{P.FLEN-P.D_LEN{1'b1}}, 1'b0, {P.D_NE{1'b1}}, 1'b1, (P.D_NF-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.D_LEN{1'b1}}, 1'b0, {P.D_NE{1'b1}}, 1'b1, (P.D_NF-1)'(0)};
+                  end
+                  OfRes = OfResMax ? {{P.FLEN-P.D_LEN{1'b1}}, Rs, {P.D_NE-1{1'b1}}, 1'b0, {P.D_NF{1'b1}}} : {{P.FLEN-P.D_LEN{1'b1}}, Rs, {P.D_NE{1'b1}}, (P.D_NF)'(0)};
+                  UfRes = {{P.FLEN-P.D_LEN{1'b1}}, Rs, (P.D_LEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.D_LEN{1'b1}}, Rs, Re[P.D_NE-1:0], Rf[P.NF-1:P.NF-P.D_NF]};
+              end
+              2'h0: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.S_LEN{1'b1}}, 1'b0, {P.S_NE{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.S_NF]};
+                      YNaNRes = {{P.FLEN-P.S_LEN{1'b1}}, 1'b0, {P.S_NE{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.S_NF]};
+                      InvalidRes = {{P.FLEN-P.S_LEN{1'b1}}, 1'b0, {P.S_NE{1'b1}}, 1'b1, (P.S_NF-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.S_LEN{1'b1}}, 1'b0, {P.S_NE{1'b1}}, 1'b1, (P.S_NF-1)'(0)};
+                  end
+                  
+                  OfRes = OfResMax ? {{P.FLEN-P.S_LEN{1'b1}}, Rs, {P.S_NE-1{1'b1}}, 1'b0, {P.S_NF{1'b1}}} : {{P.FLEN-P.S_LEN{1'b1}}, Rs, {P.S_NE{1'b1}}, (P.S_NF)'(0)};
+                  UfRes = {{P.FLEN-P.S_LEN{1'b1}}, Rs, (P.S_LEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.S_LEN{1'b1}}, Rs, Re[P.S_NE-1:0], Rf[P.NF-1:P.NF-P.S_NF]};
+              end
+              2'h2: begin  
+                  if(P.IEEE754) begin
+                      XNaNRes = {{P.FLEN-P.H_LEN{1'b1}}, 1'b0, {P.H_NE{1'b1}}, 1'b1, Xm[P.NF-2:P.NF-P.H_NF]};
+                      YNaNRes = {{P.FLEN-P.H_LEN{1'b1}}, 1'b0, {P.H_NE{1'b1}}, 1'b1, Ym[P.NF-2:P.NF-P.H_NF]};
+                      InvalidRes = {{P.FLEN-P.H_LEN{1'b1}}, 1'b0, {P.H_NE{1'b1}}, 1'b1, (P.H_NF-1)'(0)};
+                  end else begin 
+                      InvalidRes = {{P.FLEN-P.H_LEN{1'b1}}, 1'b0, {P.H_NE{1'b1}}, 1'b1, (P.H_NF-1)'(0)};
+                  end
+                  
+                  OfRes = OfResMax ? {{P.FLEN-P.H_LEN{1'b1}}, Rs, {P.H_NE-1{1'b1}}, 1'b0, {P.H_NF{1'b1}}} : {{P.FLEN-P.H_LEN{1'b1}}, Rs, {P.H_NE{1'b1}}, (P.H_NF)'(0)};      
+                // zero is exact if dividing by infinity so don't add 1
+                  UfRes = {{P.FLEN-P.H_LEN{1'b1}}, Rs, (P.H_LEN-2)'(0), Plus1&Frm[1]&~(DivOp&YInf)};
+                  NormRes = {{P.FLEN-P.H_LEN{1'b1}}, Rs, Re[P.H_NE-1:0], Rf[P.NF-1:P.NF-P.H_NF]};
+              end
+          endcase
+  end
+
+  // determine if you should kill the res - Cvt
+  //      - do so if the res underflows, is zero (the exp doesnt calculate correctly). or the integer input is 0
+  //      - dont set to zero if fp input is zero but not using the fp input
+  //      - dont set to zero if int input is zero but not using the int input
+  //  as implemented: the top bit of FullRe is set, or a divsqrt op has (Y infinite and X finite) or X zero
+  assign KillRes = FullRe[P.NE+1] | (((YInf&~XInf)|XZero)&DivOp);//Underflow & ~ResSubnorm & (Re!=1);
+  
+  // calculate if the overflow result should be selected
+  //  - an infinite input selects it too, except when Y is the infinity in a divsqrt op
+  assign SelOfRes = Overflow|DivByZero|(InfIn&~(YInf&DivOp));
+  
+  // output infinity with result sign if divide by zero
+  //  - IEEE754 propagates the X or Y NaN; otherwise any NaN input yields InvalidRes
+  if(P.IEEE754)
+    always_comb
+      if(XNaN)                    PostProcRes = XNaNRes;
+      else if(YNaN)               PostProcRes = YNaNRes;
+      else if(Invalid)            PostProcRes = InvalidRes;
+      else if(SelOfRes)           PostProcRes = OfRes;
+      else if(KillRes)            PostProcRes = UfRes;
+      else                        PostProcRes = NormRes;
+  else
+    always_comb
+      if(NaNIn|Invalid)           PostProcRes = InvalidRes;
+      else if(SelOfRes)           PostProcRes = OfRes;
+      else if(KillRes)            PostProcRes = UfRes;
+      else                        PostProcRes = NormRes;
+
+endmodule
--- a/src/fpu/divremsqrt/drsu.sv
+++ b/src/fpu/divremsqrt/drsu.sv
@ -0,0 +1,102 @@
+///////////////////////////////////////////
+// drsu.sv
+//
+// Written: kekim@hmc.edu
+// Modified:19 May 2023
+//
+// Purpose: Combined Divide and Square Root Floating Point and Integer Unit with postprocessing
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+
+// drsu
+//   Structural wrapper that instantiates the divremsqrt unit and its postprocessor
+//   (divremsqrtpostprocess) and wires the divider outputs into the postprocessor.
+//   Contains no logic of its own apart from slicing the top NF+3 bits of UmM.
+//   NOTE(review): operand/classification inputs carry an E suffix while the divider
+//   results carry an M suffix - presumably pipeline stages; confirm the E-suffixed
+//   operands are held stable until the postprocessor consumes the result.
+module drsu import cvw::*;  #(parameter cvw_t P) (
+  input  logic                clk, 
+  input  logic                reset, 
+  input  logic [P.FMTBITS-1:0] FmtE,                          // format; also passed to the postprocessor as Fmt
+  input  logic                XsE, YsE,                      // operand signs
+  input  logic [P.NF:0]        XmE, YmE,                      // operand significands
+  input  logic [P.NE-1:0]      XeE, YeE,                      // operand exponents
+  input  logic                XInfE, YInfE,                  // operand is infinity
+  input  logic                XZeroE, YZeroE,                // operand is zero
+  input  logic                XNaNE, YNaNE,                  // operand is NaN
+  input  logic                XSNaNE, YSNaNE,                // operand is signaling NaN (postprocessor only)
+  input  logic                FDivStartE, IDivStartE,        // floating-point / integer start requests
+  input  logic                StallM,
+  input  logic                FlushE,
+  input  logic                SqrtE, SqrtM,
+  input  logic [P.XLEN-1:0]    ForwardedSrcAE, ForwardedSrcBE, // these are the src outputs before the mux choosing between them and PCE to put in srcA/B
+  input  logic [2:0]          Funct3E, Funct3M,
+  input  logic                IntDivE, W64E,
+  input  logic [2:0]          Frm,                           // rounding mode (postprocessor only)
+  input  logic [3:0]          OpCtrl,                        // forwarded to the postprocessor
+  input  logic [1:0]          PostProcSel,                   // forwarded to the postprocessor
+  output logic                FDivBusyE, IFDivStartE, FDivDoneE,
+  output logic [P.FLEN-1:0]    FResM,                         // postprocessor result (PostProcRes)
+  output logic [P.XLEN-1:0]    FIntDivResultM,                // integer result, driven by divremsqrt
+  output logic [4:0]          FlgM                           // postprocessor flags (PostProcFlg)
+);
+
+  // Floating-point division and square root module, with optional integer division and remainder
+  // Computes X/Y, sqrt(X), A/B, or A%B
+
+  // The signals from WS through ISpecialCaseE below (except IntDivM) are declared
+  // but not referenced by any statement or instance in this module.
+  logic [P.DIVb+3:0]           WS, WC;                       // Partial remainder components
+  logic [P.DIVb+3:0]           X;                            // Iterator Initial Value (from dividend)
+  logic [P.DIVb+3:0]           D;                            // Iterator Divisor
+  logic [P.DIVb:0]             FirstU, FirstUM;              // Intermediate result values
+  logic [P.DIVb+1:0]           FirstC;                       // Step tracker
+  logic                       Firstun;                      // Quotient selection
+  logic                       WZeroE;                       // Early termination flag
+  logic [P.DURLEN-1:0]         CyclesE;                      // FSM cycles
+  logic                       SpecialCaseM;                 // Divide by zero, square root of negative, etc.
+  logic                       DivStartE;                    // Enable signal for flops during stall
+                                                            
+  // Integer div/rem signals                                
+  logic                       BZeroM;                       // Denominator is zero
+  logic                       IntDivM;                      // Integer operation
+  logic [P.DIVBLEN:0]          nM, mM;                       // Shift amounts
+  logic                       NegQuotM, ALTBM, AsM, W64M;   // Special handling for postprocessor
+  logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
+  logic                       ISpecialCaseE;                // Integer div/remainder special cases
+  // divider outputs consumed by the postprocessor
+  logic [P.DIVb:0]             UmM;                          // divider result significand (full width)
+  logic [P.NF+2:0]             UmMexact; //U1.NF+2
+  logic [P.NE+1:0]             UeM;                          // divider result exponent
+  logic                       DivStickyM;                   // divider sticky bit
+  logic [P.INTDIVb+3:0]          PreResultM;
+  logic [P.XLEN-1:0]          PreIntResultM;
+  logic [P.DIVBLEN-1:0]       IntNormShiftM;
+
+  // divide / remainder / square root unit; all ports connect by name to same-named signals
+  divremsqrt #(P) divremsqrt(.clk, .reset, .XsE, .FmtE, .XmE, .YmE, 
+            .XeE, .YeE, .SqrtE, .SqrtM,
+                    .XInfE, .YInfE, .XZeroE, .YZeroE, 
+            .XNaNE, .YNaNE, 
+                    .FDivStartE, .IDivStartE, .W64E,
+                    .StallM, .DivStickyM, .FDivBusyE, .UeM,
+                    .UmM,
+                    .FlushE, .ForwardedSrcAE, .ForwardedSrcBE, .Funct3M,
+                    .Funct3E, .IntDivE, .FIntDivResultM, .IntDivM,
+                    .FDivDoneE, .IFDivStartE, .IntNormShiftM, .PreIntResultM, .PreResultM);
+  // only the upper bits of the divider significand are passed on to the postprocessor
+  assign UmMexact = UmM[P.DIVb:P.DIVb-(P.NF+3-1)]; // grabbing top 1+(NF+2) msbs
+  // postprocessing: produces the floating-point result FResM and flags FlgM
+  divremsqrtpostprocess #(P) divremsqrtpostprocess(.Xs(XsE), .Ys(YsE), .Xm(XmE), .Ym(YmE), .Frm(Frm), .Fmt(FmtE), .OpCtrl, .IntDivM,
+    .XZero(XZeroE), .YZero(YZeroE), .XInf(XInfE), .YInf(YInfE), .XNaN(XNaNE), .YNaN(YNaNE), .XSNaN(XSNaNE), 
+    .YSNaN(YSNaNE), .PostProcSel,.DivSticky(DivStickyM), .DivUe(UeM), .DivUm(UmMexact), .PostProcRes(FResM), .PostProcFlg(FlgM),
+    .PreIntResultM, .PreResultM, .IntNormShiftM);
+endmodule
+
--- a/src/fpu/divremsqrt/intrightshift.sv
+++ b/src/fpu/divremsqrt/intrightshift.sv
@ -0,0 +1,37 @@
+///////////////////////////////////////////
+// intrightshift.sv
+//
+// Written: David_Harris@hmc.edu, me@KatherineParry.com, cturek@hmc.edu
+// Modified:13 January 2022
+//
+// Purpose: Right shifter for integer divide/remainder postprocessing
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// intrightshift
+//   Combinational right shifter: shifted = shiftin >> shiftamt.
+//   NOTE(review): both data ports are declared signed, but `>>` is a logical
+//   shift in SystemVerilog (vacated bits are zero filled regardless of
+//   signedness). If sign extension of negative values is intended, `>>>`
+//   would be required - confirm against the caller.
+module intrightshift import cvw::*;  #(parameter cvw_t P) (
+  input logic signed [P.INTDIVb+3:0] shiftin,    // value to shift
+  input logic [P.DIVBLEN-1:0] shiftamt,          // number of bit positions to shift right
+  output logic signed [P.INTDIVb+3:0] shifted    // shiftin moved right by shiftamt
+);
+  assign shifted = shiftin >> shiftamt;
+
+endmodule
--- a/testbench/testbench-fp.sv
+++ b/testbench/testbench-fp.sv
--- a/tests/fp/combined_IF_vectors/create_IF_vectors.sh
+++ b/tests/fp/combined_IF_vectors/create_IF_vectors.sh
@ -1,5 +1,7 @@
 #!/bin/sh
 # create test vectors for stand alone int

+# -p: do not report an error when IF_vectors already exists from an earlier run
+mkdir -p IF_vectors
 ./extract_testfloat_vectors.py
 ./extract_arch_vectors.py
+# copy every file in IF_vectors into ../vectors
+cp IF_vectors/*  ../vectors