From b815f17560004993f80cd529047ed9a4cddc05a4 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Tue, 20 Feb 2024 17:16:29 -0800
Subject: [PATCH 01/19] regression-wally handles softfloat

---
 sim/regression-wally | 114 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 104 insertions(+), 10 deletions(-)

diff --git a/sim/regression-wally b/sim/regression-wally
index d06ac0b28..61f133fa9 100755
--- a/sim/regression-wally
+++ b/sim/regression-wally
@@ -33,6 +33,7 @@ os.chdir(regressionDir)
 coverage = '-coverage' in sys.argv
 fp = '-fp' in sys.argv
 nightly = '-nightly' in sys.argv
+softfloat = '-softfloat' in sys.argv
 
 TestCase = namedtuple("TestCase", ['name', 'variant', 'cmd', 'grepstr'])
 # name:     the name of this test configuration (used in printing human-readable
@@ -161,6 +162,45 @@ for test in tests64gc:
 
 # run derivative configurations if requested  
 if (nightly):
+    derivconfigtests = [
+        ["div_2_1_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_1i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_2_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_2i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_4_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_4i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_1_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_1i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_2_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_2i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_4_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_4_4i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
+        ["div_2_1_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_1i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_2_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_1_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_1i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+
+
+        ["f_rv32gc", ["arch32f", "arch32f_divsqrt"]],
+        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt"]],
+        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32d", "arch32d_divsqrt"]],
+        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32d", "arch32d_divsqrt" ]],
+        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32d", "arch32d_divsqrt"]],
+        ["f_rv64gc", ["arch64f", "arch64f_divsqrt"]],
+        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt"]], # hanging 1/31/24 dh; try again when lint is fixed
+        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64d", "arch64d_divsqrt"]],
+        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64d", "arch64d_divsqrt"]],
+        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64d", "arch64d_divsqrt"]],
+    ]
+    """
     derivconfigtests = [
         ["tlb2_rv32gc", ["wally32priv"]],
         ["tlb16_rv32gc", ["wally32priv"]],
@@ -269,16 +309,16 @@ if (nightly):
 
 
 #  enable floating-point tests when lint is fixed
-#        ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma"]],
-#        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt"]],
-#        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
-#        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
-#        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
-#        ["f_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma"]],
-#        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfh", "arch64zfh_divsqrt"]], # hanging 1/31/24 dh; try again when lint is fixed
-#        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
-#        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
-#        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
+        ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma"]],
+        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["f_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma"]],
+        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfh", "arch64zfh_divsqrt"]], # hanging 1/31/24 dh; try again when lint is fixed
+        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
+        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
+        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
         
 
     ]
@@ -309,6 +349,58 @@ for test in tests32e:
         grepstr="All tests ran without failures")
   configs.append(tc)
 
+
+# softfloat tests
+if (softfloat):
+    configs = []
+    softfloatconfigs = ['fdh_ieee_rv32gc', 'fdqh_ieee_rv32gc', 'fdq_ieee_rv32gc', \
+    'fh_ieee_v32gc', 'f_ieee_rv64gc', 'fdqh_ieee_rv64gc', \
+    'fdq_ieee_rv64gc', 'div_2_1_rv32gc', 'div_2_2_rv32gc', \
+    'div_2_4_rv32gc', 'div_4_1_rv32gc', 'div_4_2_rv32gc', \
+    'div_4_4_rv32gc', 'fd_ieee_rv32gc', 'fh_ieee_rv32gc', \
+    'div_2_1_rv64gc', 'div_2_2_rv64gc', 'div_2_4_rv64gc', \
+    'div_4_1_rv64gc', 'div_4_2_rv64gc', 'div_4_4_rv64gc', \
+    'fd_ieee_rv64gc', 'fh_ieee_rv64gc', 'f_ieee_rv32gc']
+    for config in softfloatconfigs:
+        # div test case
+        divtest = TestCase(
+            name="div",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat.do " + config + " div \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.append(divtest)
+
+        # sqrt test case
+        sqrttest = TestCase(
+            name="sqrt",
+            variant=config,
+            cmd="vsim > {} -c  <<!\ndo testfloat.do " + config + " sqrt \n!",
+            grepstr="All Tests completed with          0 errors"
+        )
+        configs.append(sqrttest)
+
+        # skip if divider variant config
+        if ("ieee" in config[0]):
+            # cvtint test case
+            cvtinttest = TestCase(
+                name="cvtint",
+                variant=config,
+                cmd="vsim > {} -c  <<!\ndo testfloat.do " + config + " cvtint \n!",
+                grepstr="All Tests completed with          0 errors"
+                )
+            configs.append(cvtinttest)
+
+            # cvtfp test case
+            cvtfptest = TestCase(
+                name="cvtfp",
+                variant=config,
+                cmd="vsim > {} -c  <<!\ndo testfloat.do " + config + " cvtfp \n!",
+                grepstr="All Tests completed with          0 errors"
+            )
+            configs.append(cvtfptest)    
+      
+
     
 
 import os
@@ -368,6 +460,8 @@ def main():
     elif '-nightly' in sys.argv:
         TIMEOUT_DUR = 60*1440 # 1 day
         configs.append(getBuildrootTC(boot=False))
+    elif '-softfloat' in sys.argv:
+        TIMEOUT_DUR = 60*60 # seconds
     else:
         TIMEOUT_DUR = 10*60 # seconds
         configs.append(getBuildrootTC(boot=False))

From 2b662565d7b6e2d4046337e86efa1feeca5b379c Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Tue, 20 Feb 2024 17:17:45 -0800
Subject: [PATCH 02/19] typo fix

---
 sim/regression-wally | 1 -
 1 file changed, 1 deletion(-)

diff --git a/sim/regression-wally b/sim/regression-wally
index 61f133fa9..b721d8899 100755
--- a/sim/regression-wally
+++ b/sim/regression-wally
@@ -200,7 +200,6 @@ if (nightly):
         ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64d", "arch64d_divsqrt"]],
         ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64d", "arch64d_divsqrt"]],
     ]
-    """
     derivconfigtests = [
         ["tlb2_rv32gc", ["wally32priv"]],
         ["tlb16_rv32gc", ["wally32priv"]],

From c8ff1bddec4bb87aae060c9b9da6094871bff55b Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Tue, 20 Feb 2024 17:21:29 -0800
Subject: [PATCH 03/19] formatting

---
 addins/riscv-arch-test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
index 8a52b016d..c955abf75 160000
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@@ -1 +1 @@
-Subproject commit 8a52b016dbe1e2733cc168b9d6e5c93e39059d4d
+Subproject commit c955abf757df98cf38809e40a62d2a6b448ea507

From 7e3df23f28c202a8eaf0aaa67e539e1c32c45fc4 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Tue, 20 Feb 2024 17:24:04 -0800
Subject: [PATCH 04/19] Revert "formatting"

This reverts commit c8ff1bddec4bb87aae060c9b9da6094871bff55b.
---
 addins/riscv-arch-test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
index c955abf75..8a52b016d 160000
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@@ -1 +1 @@
-Subproject commit c955abf757df98cf38809e40a62d2a6b448ea507
+Subproject commit 8a52b016dbe1e2733cc168b9d6e5c93e39059d4d

From 19a61e301ea4a1aea3382225b6fdd017c054d088 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Tue, 20 Feb 2024 17:24:15 -0800
Subject: [PATCH 05/19] formatting

---
 sim/regression-wally | 39 ---------------------------------------
 1 file changed, 39 deletions(-)

diff --git a/sim/regression-wally b/sim/regression-wally
index b721d8899..28c2e9a7a 100755
--- a/sim/regression-wally
+++ b/sim/regression-wally
@@ -162,44 +162,6 @@ for test in tests64gc:
 
 # run derivative configurations if requested  
 if (nightly):
-    derivconfigtests = [
-        ["div_2_1_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_2_1i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_2_2_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_2_2i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_2_4_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_2_4i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_4_1_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_4_1i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_4_2_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_4_2i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_4_4_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_4_4i_rv32gc", ["arch32f_divsqrt", "arch32d_divsqrt", "arch32m"]],
-        ["div_2_1_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_2_1i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_2_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_2_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_2_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_2_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_4_1_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_4_1i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_4_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_4_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_4_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_4_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-
-
-        ["f_rv32gc", ["arch32f", "arch32f_divsqrt"]],
-        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt"]],
-        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32d", "arch32d_divsqrt"]],
-        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32d", "arch32d_divsqrt" ]],
-        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32d", "arch32d_divsqrt"]],
-        ["f_rv64gc", ["arch64f", "arch64f_divsqrt"]],
-        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt"]], # hanging 1/31/24 dh; try again when lint is fixed
-        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64d", "arch64d_divsqrt"]],
-        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64d", "arch64d_divsqrt"]],
-        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64d", "arch64d_divsqrt"]],
-    ]
     derivconfigtests = [
         ["tlb2_rv32gc", ["wally32priv"]],
         ["tlb16_rv32gc", ["wally32priv"]],
@@ -306,7 +268,6 @@ if (nightly):
         ["bpred_GSHARE_10_10_10_0_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
         ["bpred_GSHARE_10_10_10_1_rv32gc", ["embench"], "configOptions", "-GPrintHPMCounters=1"],
 
-
 #  enable floating-point tests when lint is fixed
         ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma"]],
         ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt"]],

From 02081cac409fec665c0ed0de29648f8dbbca4fb4 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Wed, 21 Feb 2024 20:49:38 -0800
Subject: [PATCH 06/19] softfloat jobs now run concurrently with help of
 testfloat-batch.do directing compiled designs into individual folders for
 each config/test

---
 sim/regression-wally   | 15 +++++++-----
 sim/testfloat-batch.do | 55 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 sim/testfloat-batch.do

diff --git a/sim/regression-wally b/sim/regression-wally
index 28c2e9a7a..e53ebd0d8 100755
--- a/sim/regression-wally
+++ b/sim/regression-wally
@@ -326,7 +326,7 @@ if (softfloat):
         divtest = TestCase(
             name="div",
             variant=config,
-            cmd="vsim > {} -c  <<!\ndo testfloat.do " + config + " div \n!",
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " div \n!",
             grepstr="All Tests completed with          0 errors"
         )
         configs.append(divtest)
@@ -335,27 +335,30 @@ if (softfloat):
         sqrttest = TestCase(
             name="sqrt",
             variant=config,
-            cmd="vsim > {} -c  <<!\ndo testfloat.do " + config + " sqrt \n!",
+            cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " sqrt \n!",
             grepstr="All Tests completed with          0 errors"
         )
-        configs.append(sqrttest)
+        #configs.append(sqrttest)
+        configs.insert(0,sqrttest)
+
 
         # skip if divider variant config
-        if ("ieee" in config[0]):
+        if ("ieee" in config):
             # cvtint test case
             cvtinttest = TestCase(
                 name="cvtint",
                 variant=config,
-                cmd="vsim > {} -c  <<!\ndo testfloat.do " + config + " cvtint \n!",
+                cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " cvtint \n!",
                 grepstr="All Tests completed with          0 errors"
                 )
             configs.append(cvtinttest)
 
             # cvtfp test case
+            # WILL fail on F_only (cvtfp converts from one fpsize to another. refer to spec)
             cvtfptest = TestCase(
                 name="cvtfp",
                 variant=config,
-                cmd="vsim > {} -c  <<!\ndo testfloat.do " + config + " cvtfp \n!",
+                cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " cvtfp \n!",
                 grepstr="All Tests completed with          0 errors"
             )
             configs.append(cvtfptest)    
diff --git a/sim/testfloat-batch.do b/sim/testfloat-batch.do
new file mode 100644
index 000000000..f20867ddc
--- /dev/null
+++ b/sim/testfloat-batch.do
@@ -0,0 +1,55 @@
+# testfloat-batch.do 
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021; Kevin Kim 2024
+# Go Cowboys!!!!!!
+#
+# Compiles the FPU testbench into its own library (wkdir/work_<config>_<test>) and runs it,
+# so that several config/test jobs can run at once without sharing a work library.
+# run with: vsim -c, then "do testfloat-batch.do <config> <test>" (e.g. fdqh_ieee_rv64gc sqrt)
+
+onbreak {resume}
+
+# create a fresh per-job library, deleting any stale copy left by a previous run
+
+if [file exists wkdir/work_${1}_${2}] {
+    vdel -lib wkdir/work_${1}_${2} -all
+}
+vlib wkdir/work_${1}_${2}
+
+
+
+# compile source files
+# suppress spurious warnings about 
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# start and run simulation
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+# $1, $2 = the words added after the script name in the do command (config name, test name)
+
+vlog -lint -work wkdir/work_${1}_${2} +incdir+../config/$1 +incdir+../config/deriv/$1 +incdir+../config/shared ../src/cvw.sv ../testbench/testbench-fp.sv ../src/fpu/*.sv ../src/fpu/*/*.sv ../src/generic/*.sv  ../src/generic/flop/*.sv -suppress 2583,7063,8607,2697,7033 
+
+
+# Set WAV variable to avoid having any output to wave (to limit disk space)
+quietly set WAV 0;
+
+# WAV is hard-coded to 0 above, so the wave branch below is not taken;
+# skipping output to a wlf or wave window reduces disk space.
+if {$WAV eq 0} {
+    puts "No wave output is selected"
+} else {
+    puts "wave output is selected"
+    view wave
+    add log -recursive /*
+    do wave-fpu.do    
+}  
+
+# Change TEST_SIZE to only test certain FP width
+# values are QP, DP, SP, HP or all for all tests
+
+vopt +acc wkdir/work_${1}_${2}.testbenchfp -work wkdir/work_${1}_${2} -G TEST=$2 -G TEST_SIZE="all" -o testbenchopt
+vsim -lib wkdir/work_${1}_${2} testbenchopt  -fatal 7 -suppress 3829
+#-- Run the Simulation 
+run -all

From dd88b4765a0529054d16e295c6a91baf8b9f6d38 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Thu, 22 Feb 2024 10:22:23 -0800
Subject: [PATCH 07/19] updated configs list in regression-wally

---
 sim/regression-wally | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/sim/regression-wally b/sim/regression-wally
index e53ebd0d8..ad1720004 100755
--- a/sim/regression-wally
+++ b/sim/regression-wally
@@ -321,6 +321,32 @@ if (softfloat):
     'div_2_1_rv64gc', 'div_2_2_rv64gc', 'div_2_4_rv64gc', \
     'div_4_1_rv64gc', 'div_4_2_rv64gc', 'div_4_4_rv64gc', \
     'fd_ieee_rv64gc', 'fh_ieee_rv64gc', 'f_ieee_rv32gc']
+    softfloatconfigs = ['fdh_ieee_div_2_1_rv32gc', 'fdh_ieee_div_2_1_rv64gc', \
+    'fdh_ieee_div_2_2_rv32gc', 'fdh_ieee_div_2_2_rv64gc', 'fdh_ieee_div_2_4_rv32gc', \
+    'fdh_ieee_div_2_4_rv64gc', 'fdh_ieee_div_4_1_rv32gc', 'fdh_ieee_div_4_1_rv64gc', \
+    'fdh_ieee_div_4_2_rv32gc', 'fdh_ieee_div_4_2_rv64gc', 'fdh_ieee_div_4_4_rv64gc', \
+    'fdh_ieee_rv32gc', 'fd_ieee_div_2_1_rv32gc', 'fd_ieee_div_2_1_rv64gc', \
+    'fd_ieee_div_2_2_rv32gc', 'fd_ieee_div_2_2_rv64gc', 'fd_ieee_div_2_4_rv32gc', \
+    'fd_ieee_div_2_4_rv64gc', 'fd_ieee_div_4_1_rv32gc', 'fd_ieee_div_4_1_rv64gc', \
+    'fd_ieee_div_4_2_rv32gc', 'fd_ieee_div_4_2_rv64gc', 'fd_ieee_div_4_4_rv64gc', \
+    'fd_ieee_rv32gc', 'fd_ieee_rv64gc', 'fdqh_ieee_div_2_1_rv32gc', \
+    'fdqh_ieee_div_2_1_rv64gc', 'fdqh_ieee_div_2_2_rv32gc', 'fdqh_ieee_div_2_2_rv64gc', \
+    'fdqh_ieee_div_2_4_rv32gc', 'fdqh_ieee_div_2_4_rv64gc', 'fdqh_ieee_div_4_1_rv32gc', \
+    'fdqh_ieee_div_4_1_rv64gc', 'fdqh_ieee_div_4_2_rv32gc', 'fdqh_ieee_div_4_2_rv64gc',\
+    'fdqh_ieee_div_4_4_rv64gc', 'fdqh_ieee_rv32gc', 'fdqh_ieee_rv64gc', \
+    'fdq_ieee_div_2_1_rv32gc', 'fdq_ieee_div_2_1_rv64gc', 'fdq_ieee_div_2_2_rv32gc',\
+    'fdq_ieee_div_2_2_rv64gc', 'fdq_ieee_div_2_4_rv32gc', 'fdq_ieee_div_2_4_rv64gc', \
+    'fdq_ieee_div_4_1_rv32gc', 'fdq_ieee_div_4_1_rv64gc', 'fdq_ieee_div_4_2_rv32gc', \
+    'fdq_ieee_div_4_2_rv64gc', 'fdq_ieee_div_4_4_rv64gc', 'fdq_ieee_rv32gc', \
+    'fdq_ieee_rv64gc', 'fh_ieee_div_2_1_rv32gc', 'fh_ieee_div_2_1_rv64gc', \
+    'fh_ieee_div_2_2_rv32gc', 'fh_ieee_div_2_2_rv64gc', 'fh_ieee_div_2_4_rv32gc',\
+    'fh_ieee_div_2_4_rv64gc', 'fh_ieee_div_4_1_rv32gc', 'fh_ieee_div_4_1_rv64gc',\
+    'fh_ieee_div_4_2_rv32gc', 'fh_ieee_div_4_2_rv64gc', 'fh_ieee_div_4_4_rv64gc', \
+    'fh_ieee_rv32gc', 'fh_ieee_rv64gc', 'fh_ieee_v32gc', 'f_ieee_div_2_1_rv32gc', \
+    'f_ieee_div_2_1_rv64gc', 'f_ieee_div_2_2_rv32gc', 'f_ieee_div_2_2_rv64gc', \
+    'f_ieee_div_2_4_rv32gc', 'f_ieee_div_2_4_rv64gc', 'f_ieee_div_4_1_rv32gc', \
+    'f_ieee_div_4_1_rv64gc', 'f_ieee_div_4_2_rv32gc', 'f_ieee_div_4_2_rv64gc', \
+    'f_ieee_div_4_4_rv64gc', 'f_ieee_rv32gc', 'f_ieee_rv64gc']
     for config in softfloatconfigs:
         # div test case
         divtest = TestCase(
@@ -329,7 +355,7 @@ if (softfloat):
             cmd="vsim > {} -c  <<!\ndo testfloat-batch.do " + config + " div \n!",
             grepstr="All Tests completed with          0 errors"
         )
-        configs.append(divtest)
+        configs.insert(0,divtest)
 
         # sqrt test case
         sqrttest = TestCase(
@@ -354,7 +380,7 @@ if (softfloat):
             configs.append(cvtinttest)
 
             # cvtfp test case
-            # WILL fail on F_only (cvtfp converts from one fpsize to another. refer to spec)
+            # WILL fail on F_only (refer to spec)
             cvtfptest = TestCase(
                 name="cvtfp",
                 variant=config,

From b487477ecd0b8ba5b5e9aa0200f921000dea4590 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Thu, 22 Feb 2024 19:40:06 -0800
Subject: [PATCH 08/19] modified synth makefile to handle derived configs

---
 synthDC/Makefile | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/synthDC/Makefile b/synthDC/Makefile
index 7968a7b52..03c3c6612 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -51,7 +51,8 @@ configs: $(CONFIG)
 $(CONFIG):
 	@echo $(CONFIG)
 	cp -r $(OLDCONFIGDIR)/shared/*.vh $(CONFIGDIR)
-	cp -r $(OLDCONFIGDIR)/$(CONFIG)/* $(CONFIGDIR)
+#   cp -r $(OLDCONFIGDIR)/$(CONFIG)/* $(CONFIGDIR)
+	cp -r $(OLDCONFIGDIR)/deriv/$(CONFIG)/* $(CONFIGDIR)
 
 # adjust DTIM and IROM to reasonable values depending on config	
 ifneq ($(filter $(CONFIG), $(DIRS32)),)
@@ -61,8 +62,8 @@ else ifneq ($(filter $(CONFIG), $(DIRS64)),)
 	sed -i "s/DTIM_RANGE.*/DTIM_RANGE	= 56\'h01FF;/g" $(CONFIGDIR)/config.vh
 	sed -i "s/IROM_RANGE.*/IROM_RANGE	= 56\'h01FF;/g" $(CONFIGDIR)/config.vh
 else 
-    $(info $(CONFIG) does not exist in $(DIRS32) or $(DIRS64))
-    @echo "Config not in list, RAM_RANGE will be unmodified"
+	$(info $(CONFIG) does not exist in $(DIRS32) or $(DIRS64))
+	@echo "Config not in list, RAM_RANGE will be unmodified"
 endif
 
 # if USESRAM = 1, set that in the config file, otherwise reduce sizes

From 77ccc7b319536e267c0a33713586d6c924b3c989 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sat, 2 Mar 2024 15:55:34 -0800
Subject: [PATCH 09/19] removed square root pre-process muxes

---
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 1c56e04e5..cc77c47d0 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -174,9 +174,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
   logic [P.DIVb:0] PreSqrtX;
   assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
-  mux2 #(P.DIVb+1) sqrtxmux(Xnorm, {1'b0, Xnorm[P.DIVb:1]}, EvenExp, PreSqrtX); // X if exponent odd, X/2 if exponent even
-  if (P.RADIX == 2) assign SqrtX = {3'b111, PreSqrtX};                          // PreSqrtX - 2 = 2(PreSqrtX/2 - 1)
-  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};                     // 2PreSqrtX - 4 = 4(PreSqrtX/2 - 1) 
+  mux2 #(P.DIVb+1) sqrtxmux({1'b0,Xnorm[P.DIVb:1]}, {1'b00, Xnorm[P.DIVb:2]}, EvenExp, PreSqrtX); // X/2 if exponent odd, X/4 if exponent even
 
 /*  
   // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift

From c45d67f8ba4b9f3b4f1a3dfc874049a620ccb6a9 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sat, 2 Mar 2024 20:29:03 -0800
Subject: [PATCH 10/19] fdivsqrt changes

---
 src/fpu/fdivsqrt/fdivsqrtiter.sv     |  9 +++++----
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv  |  2 +-
 src/fpu/fdivsqrt/fdivsqrtstage4.sv   |  4 ++--
 src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv | 25 +++++++++++++++++--------
 4 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 20f88b6cb..30232a232 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -83,7 +83,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
   // Initialize C to -1 for sqrt and -R for division
   logic [1:0] initCUpper;
   if(P.RADIX == 4) begin
-    mux2 #(2) cuppermux4(2'b00, 2'b11, SqrtE, initCUpper);
+    mux2 #(2) cuppermux4(2'b00, 2'b00, SqrtE, initCUpper); // *** Remove this soon
   end else begin
     mux2 #(2) cuppermux2(2'b10, 2'b11, SqrtE, initCUpper);
   end
@@ -108,9 +108,10 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
           .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
           .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end else begin: stage
-        logic j1;
-        assign j1 = (i == 0 & ~C[0][P.DIVb-1]);
-        fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1,
+        logic j1,j0;
+        assign j0 = (i == 0 & ~C[0][P.DIVb+1]);
+        assign j1 = (i == 1 & ~C[0][P.DIVb+1]);
+        fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1, .j0,
           .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
           .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index cc77c47d0..e81f5c872 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -174,7 +174,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
   logic [P.DIVb:0] PreSqrtX;
   assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
-  mux2 #(P.DIVb+1) sqrtxmux({1'b0,Xnorm[P.DIVb:1]}, {1'b00, Xnorm[P.DIVb:2]}, EvenExp, PreSqrtX); // X/2 if exponent odd, X/4 if exponent even
+  mux2 #(P.DIVb+1) sqrtxmux({1'b0,Xnorm[P.DIVb:1]}, {1'b00, Xnorm[P.DIVb:2]}, EvenExp, SqrtX); // X/2 if exponent odd, X/4 if exponent even
 
 /*  
   // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index 0d7a722ff..e7df4399d 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -32,7 +32,7 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.DIVb:0]   U,UM,               // U1.DIVb
   input  logic [P.DIVb+3:0] WS, WC,             // Q4.DIVb
   input  logic [P.DIVb+1:0] C,                  // Q2.DIVb
-  input  logic              SqrtE, j1,
+  input  logic              SqrtE, j1,j0,
   output logic [P.DIVb+1:0] CNext,              // Q2.DIVb
   output logic              un,
   output logic [P.DIVb:0]   UNext, UMNext,      // U1.DIVb
@@ -54,7 +54,7 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
   assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
   assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
   assign WSmsbs = WS[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
-  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .udigit);
+  fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .j0, .udigit);
   assign un = 1'b0; // unused for radix 4
 
   // F generation logic
diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
index c0cbe9b1c..69571b105 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@@ -31,7 +31,7 @@ module fdivsqrtuslc4cmp (
   input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
   input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
   input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 residual most significant bits
-  input  logic       SqrtE, j1,
+  input  logic       SqrtE, j0, j1,
   output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
   logic [6:0] Wmsbs;
@@ -46,7 +46,9 @@ module fdivsqrtuslc4cmp (
   // Wmsbs = |        |
 
   logic [6:0] mk2, mk1, mk0, mkm1;
+  logic [6:0] mkj2, mkj1, mkj0, mkjm1;
   logic [6:0] mks2[7:0], mks1[7:0]; 
+  logic sqrtspecial;
 
   // Prepopulate table of mks0
   assign mks2[0] = 12;
@@ -65,20 +67,27 @@ module fdivsqrtuslc4cmp (
   assign mks1[5] = 8; // is the logic any cheaper if this is a 6?
   assign mks1[6] = 8;
   assign mks1[7] = 8;
+  
+  // handles special case when j = 0 or j = 1 for sqrt
+  assign mkj2 = 20; // when j = 1 use mk2[101] when j = 0 use anything bigger than 7.
+  assign mkj1 = j1 ? 8 : 0; // when j = 1 use mk1[101] = 8 and when j = 0 use 0 so we choose u_0 = 1
+  assign sqrtspecial = SqrtE & (j1 | j0);
 
-  // Choose A for current operation
+  // Choose A for current operation *** Come back to this
  always_comb
     if (SqrtE) begin 
-      if (j1) A = 3'b101;
-      else if (Smsbs == 5'b10000) A = 3'b111;
+      //if (j1) A = 3'b101;
+      if (Smsbs == 5'b10000) A = 3'b111; // *** can we get rid of SMSBs case?
       else A = Smsbs[2:0];
     end else A = Dmsbs;
 
+    
   // Choose selection constants based on a
-  assign mk2 = mks2[A];
-  assign mk1 = mks1[A];
-  assign mk0 = -mks1[A];
-  assign mkm1 = (A == 3'b000) ? -13 : -mks2[A]; // asymmetry in table
+  
+  assign mk2 = sqrtspecial ? mkj2 : mks2[A];
+  assign mk1 = sqrtspecial ? mkj1 : mks1[A];
+  assign mk0 = -mk1;
+  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table *** can we hide?
  
   // Compare residual W to selection constants to choose digit
   always_comb 

From 6c24afaf9898027559166d0cf8a624fc91e888d2 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sun, 3 Mar 2024 10:29:32 -0800
Subject: [PATCH 11/19] changed cycle count to account for integer bit
 generation for sqrt

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 2649632eb..2239bed40 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -71,7 +71,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
 
   always_comb begin 
-    if (SqrtE) FPResultBitsE = Nf + 2 + 0; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
+    if (SqrtE) FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
     else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits 
 
     if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;

From c32173f163a18437429d518368cea7a626c3c667 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sun, 3 Mar 2024 10:30:18 -0800
Subject: [PATCH 12/19] changed U/C initialization to account for integer bit
 generation on divider stage for sqrt. Quick and dirty j1 logic fix

---
 src/fpu/fdivsqrt/fdivsqrtiter.sv | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 30232a232..d3ee9a4f1 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -72,8 +72,8 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
 
   // UOTFC Result U and UM registers/initialization mux
   // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 otherwise
-  assign initU  = {SqrtE, {(P.DIVb){1'b0}}};
-  assign initUM = {~SqrtE, {(P.DIVb){1'b0}}};
+  assign initU  ={(P.DIVb+1){1'b0}};
+  assign initUM = {{1'b1}, {(P.DIVb){1'b0}}};
   mux2   #(P.DIVb+1)  Umux(UNext[P.DIVCOPIES-1],  initU,  IFDivStartE, UMux);
   mux2   #(P.DIVb+1) UMmux(UMNext[P.DIVCOPIES-1], initUM, IFDivStartE, UMMux);
   flopen #(P.DIVb+1)  UReg(clk, FDivBusyE, UMux,  U[0]);
@@ -85,7 +85,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
   if(P.RADIX == 4) begin
     mux2 #(2) cuppermux4(2'b00, 2'b00, SqrtE, initCUpper); // *** Remove this soon
   end else begin
-    mux2 #(2) cuppermux2(2'b10, 2'b11, SqrtE, initCUpper);
+    mux2 #(2) cuppermux2(2'b10, 2'b10, SqrtE, initCUpper);
   end
   
   assign initC = {initCUpper, {P.DIVb{1'b0}}};
@@ -110,7 +110,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
       end else begin: stage
         logic j1,j0;
         assign j0 = (i == 0 & ~C[0][P.DIVb+1]);
-        assign j1 = (i == 1 & ~C[0][P.DIVb+1]);
+        assign j1 = (i == 1 & ~C[0][P.DIVb+1]) || (i == 0 & (C[0][P.DIVb-1] ^ C[0][P.DIVb]));
         fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1, .j0,
           .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
           .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));

From 2547e4c6d1a9428638946771e5b732b293ec8d32 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sun, 3 Mar 2024 11:17:51 -0800
Subject: [PATCH 13/19] divider still works with NF+2

---
 config/shared/config-shared.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index be5543967..dd766f2fd 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -94,7 +94,7 @@ localparam LOGR        = $clog2(RADIX);                             // r = log(R
 localparam RK          = LOGR*DIVCOPIES;                            // r*k bits per cycle generated
 
 // intermediate division parameters not directly used in fdivsqrt hardware
-localparam FPDIVMINb   = NF + 3; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit to allow sqrt being shifted right
+localparam FPDIVMINb   = NF + 2; // minimum length of fractional part: Nf result bits + guard and round bits (extra bit for shifting sqrt right no longer included)
 //localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right, if radix4 doesn't use a right shift.  This version saves one cycle on double-precision with R=4,k=4.  However, it doesn't work yet because C is too short, so k is incorrectly calculated as a 1 in the lsb after the last step.
 localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
 localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional

From 0ff59ff1574e8b2eff7802c9ebc3c820b7b421a0 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sun, 3 Mar 2024 13:00:20 -0800
Subject: [PATCH 14/19] remove redundant mux

---
 src/fpu/fdivsqrt/fdivsqrtiter.sv | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index d3ee9a4f1..311565f56 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -80,12 +80,11 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
   flopen #(P.DIVb+1) UMReg(clk, FDivBusyE, UMMux, UM[0]);
 
   // C register/initialization mux
-  // Initialize C to -1 for sqrt and -R for division
   logic [1:0] initCUpper;
   if(P.RADIX == 4) begin
-    mux2 #(2) cuppermux4(2'b00, 2'b00, SqrtE, initCUpper); // *** Remove this soon
+    assign initCUpper = 2'b00;
   end else begin
-    mux2 #(2) cuppermux2(2'b10, 2'b10, SqrtE, initCUpper);
+    assign initCUpper = 2'b10;
   end
   
   assign initC = {initCUpper, {P.DIVb{1'b0}}};

From 9c95cba86591a1b77e7a239085d04a4ff0ce0d60 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Sun, 3 Mar 2024 18:51:10 -0800
Subject: [PATCH 15/19] remove sqrt cycle muxing

---
 src/fpu/fdivsqrt/fdivsqrtcycles.sv | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 2239bed40..72fe04249 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -71,8 +71,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
 
   always_comb begin 
-    if (SqrtE) FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
-    else       FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits 
+    FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard + integer bits (same count for sqrt and division)
 
     if (P.IDIV_ON_FPU) ResultBitsE = IntDivE ? IntResultBitsE : FPResultBitsE;
     else               ResultBitsE = FPResultBitsE;

From 7dec9cdf212bb82e48eed2bd0db2c08ae32491de Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Mon, 4 Mar 2024 10:46:16 -0800
Subject: [PATCH 16/19] optimization in uslc

---
 src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
index 69571b105..c8b065f31 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@@ -77,7 +77,8 @@ module fdivsqrtuslc4cmp (
  always_comb
     if (SqrtE) begin 
       //if (j1) A = 3'b101;
-      if (Smsbs == 5'b10000) A = 3'b111; // *** can we get rid of SMSBs case?
+      //if (Smsbs == 5'b10000) A = 3'b111; // *** can we get rid of SMSBs case?
+      if (Smsbs[4]) A = 3'b111; // *** can we get rid of SMSBs case?
       else A = Smsbs[2:0];
     end else A = Dmsbs;
 

From 587fdbdf8eab2accc0c1f0b49b2326221da199da Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Mon, 4 Mar 2024 14:30:05 -0800
Subject: [PATCH 17/19] removed j1,j0 from iteration and put inside divider
 stage

---
 src/fpu/fdivsqrt/fdivsqrtiter.sv   | 5 +----
 src/fpu/fdivsqrt/fdivsqrtstage4.sv | 5 ++++-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 311565f56..29b6d4fe6 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -107,10 +107,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
           .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]),
           .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end else begin: stage
-        logic j1,j0;
-        assign j0 = (i == 0 & ~C[0][P.DIVb+1]);
-        assign j1 = (i == 1 & ~C[0][P.DIVb+1]) || (i == 0 & (C[0][P.DIVb-1] ^ C[0][P.DIVb]));
-        fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, .j1, .j0,
+        fdivsqrtstage4 #(P) fdivsqrtstage(.D, .DBar, .D2, .DBar2, .SqrtE, 
           .WS(WS[i]), .WC(WC[i]), .WSNext(WSNext[i]), .WCNext(WCNext[i]), 
           .C(C[i]), .U(U[i]), .UM(UM[i]), .CNext(C[i+1]), .UNext(UNext[i]), .UMNext(UMNext[i]), .un(un[i]));
       end
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index e7df4399d..4323ee35c 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -32,7 +32,7 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.DIVb:0]   U,UM,               // U1.DIVb
   input  logic [P.DIVb+3:0] WS, WC,             // Q4.DIVb
   input  logic [P.DIVb+1:0] C,                  // Q2.DIVb
-  input  logic              SqrtE, j1,j0,
+  input  logic              SqrtE, 
   output logic [P.DIVb+1:0] CNext,              // Q2.DIVb
   output logic              un,
   output logic [P.DIVb:0]   UNext, UMNext,      // U1.DIVb
@@ -48,8 +48,11 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
   logic [7:0]               WCmsbs, WSmsbs;     // U4.4
   logic                     CarryIn;
   logic [P.DIVb+3:0]        WSA, WCA;           // Q4.DIVb
+  logic j0,j1;
 
   // Digit Selection logic
+  assign j0     = ~C[P.DIVb+1];             // first step of R digit selection: C = 00...0
+  assign j1     = C[P.DIVb] ^ C[P.DIVb-1];  // second step of R digit selection: C = 1100...0
   assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
   assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
   assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual

From 9b87a00698e7f49c0e42eb4baf831e1cfe040582 Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Mon, 4 Mar 2024 14:31:07 -0800
Subject: [PATCH 18/19] sqrt mux lint fixes

---
 src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index e81f5c872..0f0273c25 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -174,7 +174,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
 
   logic [P.DIVb:0] PreSqrtX;
   assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
-  mux2 #(P.DIVb+1) sqrtxmux({1'b0,Xnorm[P.DIVb:1]}, {1'b00, Xnorm[P.DIVb:2]}, EvenExp, SqrtX); // X/2 if exponent odd, X/4 if exponent even
+  mux2 #(P.DIVb+4) sqrtxmux({4'b0,Xnorm[P.DIVb:1]}, {5'b00, Xnorm[P.DIVb:2]}, EvenExp, SqrtX); // X/2 if exponent odd, X/4 if exponent even
 
 /*  
   // Attempt to optimize radix 4 to use a left shift by 1 or zero initially, followed by no more left shift

From 10ab07975fb21b2762ae7e9f3a648918da256a4e Mon Sep 17 00:00:00 2001
From: Kevin Kim <kevindkim723@gmail.com>
Date: Mon, 4 Mar 2024 14:31:21 -0800
Subject: [PATCH 19/19] uslc comments

---
 src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
index c8b065f31..7812248a9 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@@ -31,7 +31,8 @@ module fdivsqrtuslc4cmp (
   input  logic [2:0] Dmsbs,             // U0.3 fractional bits after implicit leading 1
   input  logic [4:0] Smsbs,             // U1.4 leading bits of square root approximation
   input  logic [7:0] WSmsbs, WCmsbs,    // Q4.4 residual most significant bits
-  input  logic       SqrtE, j0, j1,
+  input  logic       SqrtE, 
+  input  logic       j0,j1,             // are we on first (j0) or second step (j1) of digit selection
   output logic [3:0] udigit             // {2, 1, -1, -2} digit is 0 if none are hot
 );
   logic [6:0] Wmsbs;
@@ -73,11 +74,9 @@ module fdivsqrtuslc4cmp (
   assign mkj1 = j1 ? 8 : 0; // when j = 1 use mk1[101] = 8 and when j = 0 use 0 so we choose u_0 = 1
   assign sqrtspecial = SqrtE & (j1 | j0);
 
-  // Choose A for current operation *** Come back to this
+  // Choose A for current operation 
  always_comb
     if (SqrtE) begin 
-      //if (j1) A = 3'b101;
-      //if (Smsbs == 5'b10000) A = 3'b111; // *** can we get rid of SMSBs case?
       if (Smsbs[4]) A = 3'b111; // *** can we get rid of SMSBs case?
       else A = Smsbs[2:0];
     end else A = Dmsbs;