Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally

2025-02-11 06:05:49 +00:00 · 2022-06-24 00:23:53 +00:00 · 2022-06-24 00:23:53 +00:00 · c45fc8ecf9
commit c45fc8ecf9
parent d8c7122a75 51426ab71a
44 changed files with 2750 additions and 831 deletions
--- a/pipelined/config/buildroot/wally-config.vh
+++ b/pipelined/config/buildroot/wally-config.vh
@ -124,8 +124,6 @@
 `define PLIC_NUM_SRC 53
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/buildroot/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/buildroot/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv32e/wally-config.vh
+++ b/pipelined/config/rv32e/wally-config.vh
@ -130,8 +130,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/rv32ic/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv32ic/BTBPredictor.txt"
 `define BPRED_ENABLED 0
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv32gc/wally-config.vh
+++ b/pipelined/config/rv32gc/wally-config.vh
@ -128,8 +128,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/rv32ic/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv32ic/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv32i/wally-config.vh
+++ b/pipelined/config/rv32i/wally-config.vh
@ -130,8 +130,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/rv32i/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv32i/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv32ic/wally-config.vh
+++ b/pipelined/config/rv32ic/wally-config.vh
@ -128,8 +128,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/rv32ic/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv32ic/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv64BP/wally-config.vh
+++ b/pipelined/config/rv64BP/wally-config.vh
@ -130,8 +130,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/rv64BP/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv64BP/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 //`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE
 `define BPTYPE "BPGSHARE" // BPTWOBIT or "BPGLOBAL"  or BPLOCALPAg or BPGSHARE
--- a/pipelined/config/rv64fp/wally-config.vh
+++ b/pipelined/config/rv64fp/wally-config.vh
@ -32,7 +32,7 @@
 `define DESIGN_COMPILER 0
 // RV32 or RV64: XLEN = 32 or 64
-`define XLEN 64
+`define XLEN 32
 // IEEE 754 compliance
 `define IEEE754 0
@ -132,8 +132,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/shared/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/shared/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv64fpquad/wally-config.vh
+++ b/pipelined/config/rv64fpquad/wally-config.vh
@ -131,8 +131,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/shared/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/shared/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv64gc/wally-config.vh
+++ b/pipelined/config/rv64gc/wally-config.vh
@ -131,8 +131,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/shared/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/shared/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv64i/wally-config.vh
+++ b/pipelined/config/rv64i/wally-config.vh
@ -131,8 +131,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/rv64i/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv64i/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/rv64ic/wally-config.vh
+++ b/pipelined/config/rv64ic/wally-config.vh
@ -131,8 +131,6 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 `define TWO_BIT_PRELOAD "../config/rv64ic/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv64ic/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@ -94,11 +94,12 @@
 `define BIAS2 ((`F_SUPPORTED & (`LEN1 != `S_LEN)) ? `S_BIAS : `H_BIAS)
 // largest length in IEU/FPU
-`define LGLEN ((`NF<`XLEN) ? `XLEN : `NF)
+`define CVTLEN ((`NF<`XLEN) ? `XLEN : `NF)
 `define LLEN ((`FLEN<`XLEN) ? `XLEN : `FLEN)
-`define LOGLGLEN $unsigned($clog2(`LGLEN+1))
+`define LOGCVTLEN $unsigned($clog2(`CVTLEN+1))
-`define NORMSHIFTSZ ((`LGLEN+`NF) > (3*`NF+8) ? (`LGLEN+`NF+1) : (3*`NF+9))
+`define NORMSHIFTSZ ((`CVTLEN+`NF) > (3*`NF+8) ? (`CVTLEN+`NF+1) : (3*`NF+9))
-`define CORRSHIFTSZ ((`LGLEN+`NF) > (3*`NF+8) ? (`LGLEN+`NF+1) : (3*`NF+6))
+`define CORRSHIFTSZ ((`CVTLEN+`NF) > (3*`NF+8) ? (`CVTLEN+`NF+1) : (3*`NF+6))
 `define DIVLEN ((`NF < `XLEN) ? `XLEN : `NF)
 // Disable spurious Verilator warnings
--- a/pipelined/regression/Makefile
+++ b/pipelined/regression/Makefile
@ -1,30 +1,9 @@
-make allclean:
+all: archtests wallytests memfiles
 	make clean
 	make all
 make clean:
 	make clean -C ../../tests/riscof
 	make clean -C ../../tests/wally-riscv-arch-test
 #	make allclean -C ../../tests/imperas-riscv-tests
 make all:
 	# *** Build old tests/imperas-riscv-tests for now;
 	# Delete this part when the privileged tests transition over to tests/wally-riscv-arch-test
 	# DH: 2/27/22 temporarily commented out imperas-riscv-tests because license expired
 	#make -C ../../tests/imperas-riscv-tests --jobs
 	#make -C ../../tests/imperas-riscv-tests XLEN=64 --jobs
 	# Build riscv-arch-test 64 and 32-bit versions
 	make -C ../../tests/riscof/ --jobs
 	make -C ../../tests/riscof/ XLEN=32 --jobs
 	# Build wally-riscv-arch-test
 	make -C ../../tests/wally-riscv-arch-test/ --jobs
 	make -C ../../tests/wally-riscv-arch-test/ XLEN=32  --jobs
 # build the memfiles and address files.
 	make -f makefile-memfile wally-sim-files --jobs
 	# Only compile Imperas tests if they are installed locally.  
 	# They are usually a symlink to $RISCV/imperas-riscv-tests and only 
 	# get compiled there manually during installation
@ -36,4 +15,22 @@ make all:
 	# Link Linux test vectors (fix this later***)
 	#cd ../../tests/linux-testgen/linux-testvectors/;./tvLinker.sh
 # None of these targets produce a file with the same name, so mark them phony;
 # otherwise a stray file called e.g. "clean" would silently disable the rule.
 .PHONY: all allclean clean archtests wallytests memfiles
 # Rebuild everything from scratch.  The two steps are run as sequential
 # sub-makes so that "clean" always finishes before "all" starts, even when
 # this Makefile itself is invoked with -j.
 allclean:
 	$(MAKE) clean
 	$(MAKE) all
 # Remove the generated test artifacts in each test suite directory.
 # $(MAKE) (rather than a literal "make") propagates flags such as -n and -k
 # to the sub-makes and uses the same make binary as the parent.
 clean:
 	$(MAKE) clean -C ../../tests/riscof
 	$(MAKE) clean -C ../../tests/wally-riscv-arch-test
 #	$(MAKE) allclean -C ../../tests/imperas-riscv-tests
 # Build riscv-arch-test, first with the default XLEN and then with XLEN=32.
 archtests:
 	# Build riscv-arch-test 64 and 32-bit versions
 	$(MAKE) -C ../../tests/riscof/ --jobs
 	$(MAKE) -C ../../tests/riscof/ XLEN=32 --jobs
 # Build wally-riscv-arch-test, first with the default XLEN and then with XLEN=32.
 wallytests:
 	# Build wally-riscv-arch-test
 	$(MAKE) -C ../../tests/wally-riscv-arch-test/ --jobs
 	$(MAKE) -C ../../tests/wally-riscv-arch-test/ XLEN=32  --jobs
 # Build the wally-sim-files target of makefile-memfile.
 memfiles:
 	$(MAKE) -f makefile-memfile wally-sim-files --jobs
--- a/pipelined/regression/lint-wally
+++ b/pipelined/regression/lint-wally
@ -5,7 +5,7 @@ export PATH=$PATH:/usr/local/bin/
 verilator=`which verilator`
 basepath=$(dirname $0)/..
-for config in rv64fp rv64fpquad rv32e rv64gc rv32gc rv32ic; do
+for config in rv32e rv64gc rv32gc rv32ic rv64fpquad; do
    echo "$config linting..."
    if !($verilator --lint-only "$@" --top-module wallypipelinedsoc "-I$basepath/config/shared" "-I$basepath/config/$config" $basepath/src/*/*.sv $basepath/src/*/*/*.sv --relative-includes); then
        echo "Exiting after $config lint due to errors or warnings"
--- a/pipelined/regression/makefile-memfile
+++ b/pipelined/regression/makefile-memfile
@ -8,8 +8,9 @@ IMPERASDIR	:= $(ROOT)/tests/imperas-riscv-tests
 ALLDIRS := $(ARCHDIR)/$(SUFFIX) $(WALLYDIR)/$(SUFFIX) 
 ELFFILES	?= $(shell find $(ALLDIRS) -type f -regex ".*\.elf")
 OBJDUMPFILES	?= $(shell find $(ALLDIRS) -type f -regex ".*\.elf.objdump")
 MEMFILES ?= $(ELFFILES:.elf=.elf.memfile)
-ADDRFILES ?= $(ELFFILES:.elf=.elf.objdump.addr)
+ADDRFILES ?= $(OBJDUMPFILES:.objdump=.objdump.addr)
 print:
 	echo "files in $(ALLDIRS) are $(ELFFILES)."
--- a/pipelined/regression/regression-wally
+++ b/pipelined/regression/regression-wally
@ -71,7 +71,7 @@ for test in tests64gc:
        grepstr="All tests ran without failures")
  configs.append(tc)
-tests32gc = ["arch32i", "arch32priv", "arch32c",  "arch32m", "arch32f", "imperas32i", "imperas32f", "imperas32m", "wally32a",  "imperas32c", "wally32priv"]  #, "imperas32mmu""wally32i", 
+tests32gc = ["arch32i", "arch32priv", "arch32c",  "arch32m", "arch32f", "imperas32i", "imperas32f", "imperas32m", "wally32a",  "imperas32c", "wally32priv", "wally32periph"]  #, "imperas32mmu""wally32i", 
 for test in tests32gc:
  tc = TestCase(
        name=test,
--- a/pipelined/regression/testfloat.do
+++ b/pipelined/regression/testfloat.do
@ -32,7 +32,7 @@ vlib work
 # start and run simulation
 # remove +acc flag for faster sim during regressions if there is no need to access internal signals
 # $num = the added words after the call
-vlog +incdir+../config/$1 +incdir+../config/shared ../testbench/testbench-fp.sv ../src/fpu/*.sv ../src/generic/*.sv -suppress 2583,7063,8607,2697 
+vlog +incdir+../config/$1 +incdir+../config/shared ../testbench/testbench-fp.sv ../src/fpu/*.sv ../srt/srt-radix4.sv ../src/generic/*.sv  ../src/generic/flop/*.sv -suppress 2583,7063,8607,2697 
 vsim -voptargs=+acc work.testbenchfp -G TEST=$2
--- a/pipelined/regression/wave-fpu.do
+++ b/pipelined/regression/wave-fpu.do
@ -7,3 +7,22 @@ add wave -noupdate /testbenchfp/Y
 add wave -noupdate /testbenchfp/Z
 add wave -noupdate /testbenchfp/Res
 add wave -noupdate /testbenchfp/Ans
 add wave -noupdate /testbenchfp/DivStart
 add wave -noupdate /testbenchfp/DivDone
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/resultselect/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/flags/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/normshift/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/lzacorrection/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/resultsign/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/round/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/fmashiftcalc/*
 add wave -group {PostProc} -noupdate /testbenchfp/postprocess/cvtshiftcalc/*
 add wave -group {Divide} -noupdate /testbenchfp/srtradix4/*
 add wave -group {Divide} -noupdate /testbenchfp/srtradix4/qsel4/*
 add wave -group {Divide} -noupdate /testbenchfp/srtradix4/otfc4/*
 add wave -group {Divide} -noupdate /testbenchfp/srtradix4/preproc/*
 add wave -group {Divide} -noupdate /testbenchfp/srtradix4/divcounter/*
 add wave -group {Divide} -noupdate /testbenchfp/srtradix4/expcalc/*
 add wave -group {Testbench} -noupdate /testbenchfp/*
 add wave -group {Testbench} -noupdate /testbenchfp/readvectors/*
--- a/pipelined/src/fpu/cvtshiftcalc.sv
+++ b/pipelined/src/fpu/cvtshiftcalc.sv
@ -7,10 +7,10 @@ module cvtshiftcalc(
    input logic  [`NE:0]           CvtCalcExpM,    // the calculated expoent
    input logic  [`NF:0]           XManM,          // input mantissas
    input logic     [`FMTBITS-1:0]  OutFmt,       // output format
-    input logic  [`LGLEN-1:0]      CvtLzcInM,      // input to the Leading Zero Counter (priority encoder)
+    input logic  [`CVTLEN-1:0]      CvtLzcInM,      // input to the Leading Zero Counter (priority encoder)
    input logic CvtResDenormUfM,
    output logic CvtResUf,
-    output logic [`LGLEN+`NF:0]    CvtShiftIn    // number to be shifted
+    output logic [`CVTLEN+`NF:0]    CvtShiftIn    // number to be shifted
 );
    logic [$clog2(`NF):0]	ResNegNF;   // the result's fraction length negated (-NF)
@ -31,8 +31,8 @@ module cvtshiftcalc(
    //              |  `NF-1  zeros   |     Mantissa      | 0's if necessary |
    //          - otherwise:
    //              |     LzcInM      | 0's if necessary |
-    assign CvtShiftIn = ToInt ? {{`XLEN{1'b0}}, XManM[`NF]&~CvtCalcExpM[`NE], XManM[`NF-1]|(CvtCalcExpM[`NE]&XManM[`NF]), XManM[`NF-2:0], {`LGLEN-`XLEN{1'b0}}} : 
+    assign CvtShiftIn = ToInt ? {{`XLEN{1'b0}}, XManM[`NF]&~CvtCalcExpM[`NE], XManM[`NF-1]|(CvtCalcExpM[`NE]&XManM[`NF]), XManM[`NF-2:0], {`CVTLEN-`XLEN{1'b0}}} : 
-                     CvtResDenormUfM ? {{`NF-1{1'b0}}, XManM, {`LGLEN-`NF+1{1'b0}}} : 
+                     CvtResDenormUfM ? {{`NF-1{1'b0}}, XManM, {`CVTLEN-`NF+1{1'b0}}} : 
                                   {CvtLzcInM, {`NF+1{1'b0}}};
--- a/pipelined/src/fpu/divshiftcalc.sv
+++ b/pipelined/src/fpu/divshiftcalc.sv
@ -0,0 +1,15 @@
 `include "wally-config.vh"
 // divshiftcalc: derives the normalization shift amount and the corrected
 // exponent for a divide result from the top bit of the quotient.
 //   Quot        - quotient; only its most significant bit is examined here
 //   DivCalcExpM - calculated exponent before correction
 //   DivShiftAmt - shift amount, either 0 or 1
 //   CorrDivExp  - DivCalcExpM, decremented by 1 when a shift is needed
 module divshiftcalc(
    input logic  [`DIVLEN+2:0] Quot,
    input logic  [`NE:0] DivCalcExpM,
    output logic [$clog2(`NORMSHIFTSZ)-1:0] DivShiftAmt,
    output logic [`NE:0] CorrDivExp
 );
    // width of the shift-amount output
    localparam SHAMTW = $clog2(`NORMSHIFTSZ);

    // The quotient is in the range [.5,2).  A clear top bit means the
    // quotient is below 1 and must be shifted left by one to normalize it.
    // NOTE(review): the original comment adds "and not denormal", but no
    // denormal check is made in this module - confirm it is handled elsewhere.
    logic NeedsShift;
    assign NeedsShift = ~Quot[`DIVLEN+2];

    // zero-extend the one-bit flag to the full shift-amount width
    assign DivShiftAmt = SHAMTW'(NeedsShift);

    // subtract 1 from the exponent to account for the normalization shift
    assign CorrDivExp = DivCalcExpM - (`NE+1)'(NeedsShift);
 endmodule
--- a/pipelined/src/fpu/fcvt.sv
+++ b/pipelined/src/fpu/fcvt.sv
@ -12,11 +12,11 @@ module fcvt (
    input logic             XDenormE,   // is the input denormalized
    input logic [`FMTBITS-1:0] FmtE,        // the input's precision (11=quad 01=double 00=single 10=half)
    output logic [`NE:0]           CvtCalcExpE,    // the calculated expoent
-	output logic [`LOGLGLEN-1:0] CvtShiftAmtE,  // how much to shift by
+	output logic [`LOGCVTLEN-1:0] CvtShiftAmtE,  // how much to shift by
    output logic                   CvtResDenormUfE,// does the result underflow or is denormalized
    output logic                   CvtResSgnE,     // the result's sign
    output logic                   IntZeroE,      // is the integer zero?
-    output logic [`LGLEN-1:0]      CvtLzcInE      // input to the Leading Zero Counter (priority encoder)
+    output logic [`CVTLEN-1:0]      CvtLzcInE      // input to the Leading Zero Counter (priority encoder)
    );
    // OpCtrls:
@ -43,7 +43,7 @@ module fcvt (
    logic                   Int64;      // is the integer 64 bits?
    logic                   IntToFp;       // is the opperation an int->fp conversion?
    logic                   ToInt;      // is the opperation an fp->int conversion?
-    logic [`LOGLGLEN-1:0] ZeroCnt; // output from the LZC
+    logic [`LOGCVTLEN-1:0] ZeroCnt; // output from the LZC
    // seperate OpCtrl for code readability
@ -78,10 +78,10 @@ module fcvt (
    // choose the input to the leading zero counter i.e. priority encoder
    //             int -> fp : | positive integer | 00000... (if needed) | 
    //             fp  -> fp : | fraction         | 00000... (if needed) | 
-    assign CvtLzcInE = IntToFp ? {TrimInt, {`LGLEN-`XLEN{1'b0}}} :
+    assign CvtLzcInE = IntToFp ? {TrimInt, {`CVTLEN-`XLEN{1'b0}}} :
-                             {XManE[`NF-1:0], {`LGLEN-`NF{1'b0}}};
+                             {XManE[`NF-1:0], {`CVTLEN-`NF{1'b0}}};
-    lzc #(`LGLEN) lzc (.num(CvtLzcInE), .ZeroCnt);
+    lzc #(`CVTLEN) lzc (.num(CvtLzcInE), .ZeroCnt);
    ///////////////////////////////////////////////////////////////////////////
    // shifter
@ -99,9 +99,9 @@ module fcvt (
    //              - only shift fp -> fp if the initial value is denormalized
    //                  - this is a problem because the input to the lzc was the fraction rather than the mantissa
    //                  - rather have a few and-gates than an extra bit in the priority encoder??? *** is this true?
-    assign CvtShiftAmtE = ToInt ? CvtCalcExpE[`LOGLGLEN-1:0]&{`LOGLGLEN{~CvtCalcExpE[`NE]}} :
+    assign CvtShiftAmtE = ToInt ? CvtCalcExpE[`LOGCVTLEN-1:0]&{`LOGCVTLEN{~CvtCalcExpE[`NE]}} :
-                    CvtResDenormUfE&~IntToFp ? (`LOGLGLEN)'(`NF-1)+CvtCalcExpE[`LOGLGLEN-1:0] : 
+                    CvtResDenormUfE&~IntToFp ? (`LOGCVTLEN)'(`NF-1)+CvtCalcExpE[`LOGCVTLEN-1:0] : 
-                              (ZeroCnt+1)&{`LOGLGLEN{XDenormE|IntToFp}};
+                              (ZeroCnt+1)&{`LOGCVTLEN{XDenormE|IntToFp}};
    ///////////////////////////////////////////////////////////////////////////
    // exp calculations
@ -180,7 +180,7 @@ module fcvt (
    //                  - shift left to normilize (-1-ZeroCnt)
    //                  - newBias to make the biased exponent
    //          
-    assign CvtCalcExpE = {1'b0, OldExp} - (`NE+1)'(`BIAS) + {2'b0, NewBias} - {{`NE{1'b0}}, XDenormE|IntToFp} - {{`NE-`LOGLGLEN+1{1'b0}}, (ZeroCnt&{`LOGLGLEN{XDenormE|IntToFp}})};
+    assign CvtCalcExpE = {1'b0, OldExp} - (`NE+1)'(`BIAS) + {2'b0, NewBias} - {{`NE{1'b0}}, XDenormE|IntToFp} - {{`NE-`LOGCVTLEN+1{1'b0}}, (ZeroCnt&{`LOGCVTLEN{XDenormE|IntToFp}})};
    // find if the result is denormal or underflows
    //      - if the calculated exponent is 0 or negative (and the input/result is not exactly 0)
    //      - an integer to fp conversion can't underflow
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@ -82,7 +82,7 @@ module fpu (
   // unpacking signals
   logic 		  XSgnE, YSgnE, ZSgnE;                // input's sign - execute stage
-   logic 		  XSgnM;                       // input's sign - memory stage
+   logic 		  XSgnM, YSgnM;                       // input's sign - memory stage
   logic [`NE-1:0] 	  XExpE, YExpE, ZExpE;                // input's exponent - execute stage
   logic [`NE-1:0] 	  ZExpM;                              // input's exponent - memory stage
   logic [`NF:0] 	  XManE, YManE, ZManE;                // input's fraction - execute stage
@ -104,23 +104,27 @@ module fpu (
   logic 		  FOpCtrlQ;   
   // Fma Signals
-    logic [3*`NF+5:0]	SumE, SumM;                       
+   logic [3*`NF+5:0]	SumE, SumM;                       
-    logic [`NE+1:0]	    ProdExpE, ProdExpM;
+   logic [`NE+1:0]	    ProdExpE, ProdExpM;
-    logic 			    AddendStickyE, AddendStickyM;
+   logic 			    AddendStickyE, AddendStickyM;
-    logic 			    KillProdE, KillProdM;
+   logic 			    KillProdE, KillProdM;
-    logic 			    InvZE, InvZM;
+   logic 			    InvZE, InvZM;
-    logic 			    NegSumE, NegSumM;
+   logic 			    NegSumE, NegSumM;
-    logic 			    ZSgnEffE, ZSgnEffM;
+   logic 			    ZSgnEffE, ZSgnEffM;
-    logic 			    PSgnE, PSgnM;
+   logic 			    PSgnE, PSgnM;
-    logic [$clog2(3*`NF+7)-1:0]			FmaNormCntE, FmaNormCntM;
+   logic [$clog2(3*`NF+7)-1:0]			FmaNormCntE, FmaNormCntM;
   // Cvt Signals
-    logic [`NE:0]           CvtCalcExpE, CvtCalcExpM;    // the calculated expoent
+   logic [`NE:0]           CvtCalcExpE, CvtCalcExpM;    // the calculated expoent
-	 logic [`LOGLGLEN-1:0]   CvtShiftAmtE, CvtShiftAmtM;  // how much to shift by
+   logic [`LOGCVTLEN-1:0]   CvtShiftAmtE, CvtShiftAmtM;  // how much to shift by
-    logic                   CvtResDenormUfE, CvtResDenormUfM;// does the result underflow or is denormalized
+   logic                   CvtResDenormUfE, CvtResDenormUfM;// does the result underflow or is denormalized
-    logic                   CvtResSgnE, CvtResSgnM;     // the result's sign
+   logic                   CvtResSgnE, CvtResSgnM;     // the result's sign
-    logic                   IntZeroE, IntZeroM;      // is the integer zero?
+   logic                   IntZeroE, IntZeroM;      // is the integer zero?
-    logic [`LGLEN-1:0]      CvtLzcInE, CvtLzcInM;      // input to the Leading Zero Counter (priority encoder)
+   logic [`CVTLEN-1:0]      CvtLzcInE, CvtLzcInM;      // input to the Leading Zero Counter (priority encoder)
   //divide signals
   logic [`DIVLEN+2:0] Quot;
   logic [`NE:0] DivCalcExpM;
   // result and flag signals
   logic [63:0] 	  FDivResM, FDivResW;                 // divide/squareroot result
@ -317,7 +321,7 @@ module fpu (
   // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
   flopenrc #(`NF+2) EMFpReg2 (clk, reset, FlushM, ~StallM, {XSgnE,XManE}, {XSgnM,XManM});
-   flopenrc #(`NF+1) EMFpReg3 (clk, reset, FlushM, ~StallM, YManE, YManM);
+   flopenrc #(`NF+2) EMFpReg3 (clk, reset, FlushM, ~StallM, {YSgnE,YManE}, {YSgnM,YManM});
   flopenrc #(`FLEN) EMFpReg4 (clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
   flopenrc #(`XLEN) EMFpReg6 (clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
   flopenrc #(`FLEN) EMFpReg7 (clk, reset, FlushM, ~StallM, PreFpResE, PreFpResM);
@ -333,7 +337,7 @@ module fpu (
   flopenrc #($clog2(3*`NF+7)+6) EMRegFma4(clk, reset, FlushM, ~StallM, 
                           {AddendStickyE, KillProdE, InvZE, FmaNormCntE, NegSumE, ZSgnEffE, PSgnE},
                           {AddendStickyM, KillProdM, InvZM, FmaNormCntM, NegSumM, ZSgnEffM, PSgnM});
-   flopenrc #(`NE+`LOGLGLEN+`LGLEN+4) EMRegCvt(clk, reset, FlushM, ~StallM, 
+   flopenrc #(`NE+`LOGCVTLEN+`CVTLEN+4) EMRegCvt(clk, reset, FlushM, ~StallM, 
                           {CvtCalcExpE, CvtShiftAmtE, CvtResDenormUfE, CvtResSgnE, IntZeroE, CvtLzcInE},
                           {CvtCalcExpM, CvtShiftAmtM, CvtResDenormUfM, CvtResSgnM, IntZeroM, CvtLzcInM});
@ -351,9 +355,9 @@ module fpu (
   assign FpLoadM = FResSelM[1];
-   postprocess postprocess(.XSgnM, .ZExpM, .XManM, .YManM, .ZManM, .FrmM, .FmtM, .ProdExpM, 
+   postprocess postprocess(.XSgnM, .YSgnM, .ZExpM, .XManM, .YManM, .ZManM, .FrmM, .FmtM, .ProdExpM, 
-                           .AddendStickyM, .KillProdM, .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, 
+                           .AddendStickyM, .KillProdM, .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .Quot,
-                           .ZInfM, .XNaNM, .YNaNM, .ZNaNM, .XSNaNM, .YSNaNM, .ZSNaNM, .SumM, 
+                           .ZInfM, .XNaNM, .YNaNM, .ZNaNM, .XSNaNM, .YSNaNM, .ZSNaNM, .SumM, .DivCalcExpM,
                           .NegSumM, .InvZM, .ZDenormM, .ZSgnEffM, .PSgnM, .FOpCtrlM, .FmaNormCntM, 
                           .CvtCalcExpM, .CvtResDenormUfM,.CvtShiftAmtM, .CvtResSgnM, .FWriteIntM, 
                           .CvtLzcInM, .IntZeroM, .PostProcSelM, .PostProcResM, .PostProcFlgM, .FCvtIntResM);
--- a/pipelined/src/fpu/postprocess.sv
+++ b/pipelined/src/fpu/postprocess.sv
@ -30,7 +30,7 @@
 `include "wally-config.vh"
 module postprocess(
-    input logic                             XSgnM,  // input signs
+    input logic                             XSgnM, YSgnM,  // input signs
    input logic     [`NE-1:0]               ZExpM, // input exponents
    input logic     [`NF:0]                 XManM, YManM, ZManM, // input mantissas
    input logic     [2:0]                   FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
@ -51,13 +51,15 @@ module postprocess(
    input logic [2:0]                       FOpCtrlM,       // choose which opperation (look below for values)
    input logic     [$clog2(3*`NF+7)-1:0]   FmaNormCntM,   // the normalization shift count
    input logic [`NE:0]           CvtCalcExpM,    // the calculated expoent
    input logic [`NE:0]           DivCalcExpM,    // the calculated expoent
    input logic CvtResDenormUfM,
-	input logic [`LOGLGLEN-1:0] CvtShiftAmtM,  // how much to shift by
+	input logic [`LOGCVTLEN-1:0] CvtShiftAmtM,  // how much to shift by
    input logic                   CvtResSgnM,     // the result's sign
    input logic             FWriteIntM,     // is fp->int (since it's writting to the integer register)
-    input logic  [`LGLEN-1:0]      CvtLzcInM,      // input to the Leading Zero Counter (priority encoder)
+    input logic  [`CVTLEN-1:0]      CvtLzcInM,      // input to the Leading Zero Counter (priority encoder)
    input logic             IntZeroM,         // is the input zero
    input logic [1:0] PostProcSelM, // select result to be written to fp register
    input logic [`DIVLEN+2:0]   Quot,
    output logic    [`FLEN-1:0]    PostProcResM,    // FMA final result
    output logic    [4:0]          PostProcFlgM,
    output logic [`XLEN-1:0] FCvtIntResM    // the int conversion result
@ -75,13 +77,14 @@ module postprocess(
    logic [3*`NF+8:0]            FmaShiftIn;        // is the sum zero
    logic               UfPlus1;                    // do you add one (for determining underflow flag)
    logic               Round;   // bits needed to determine rounding
-    logic [`LGLEN+`NF:0]    CvtShiftIn;    // number to be shifted
+    logic [`CVTLEN+`NF:0]    CvtShiftIn;    // number to be shifted
    logic               Mult;       // multiply opperation
    logic [`FLEN:0]     RoundAdd;       // how much to add to the result
    logic [`NE+1:0]     ConvNormSumExp;          // exponent of the normalized sum not taking into account denormal or zero results
    logic               PreResultDenorm;    // is the result denormalized - calculated before LZA corection
    logic [$clog2(3*`NF+7)-1:0]  FmaShiftAmt;   // normalization shift count
    logic [$clog2(`NORMSHIFTSZ)-1:0]  ShiftAmt;   // normalization shift count
    logic [$clog2(`NORMSHIFTSZ)-1:0]  DivShiftAmt;
    logic [`NORMSHIFTSZ-1:0]            ShiftIn;        // is the sum zero
    logic [`NORMSHIFTSZ-1:0]    Shifted;    // the shifted result
    logic                   Plus1;      // add one to the final result?
@ -91,6 +94,7 @@ module postprocess(
    logic                   IntToFp;       // is the opperation an int->fp conversion?
    logic                   ToInt;      // is the opperation an fp->int conversion?
    logic [`NE+1:0] RoundExp;
    logic [`NE:0] CorrDivExp;
    logic [1:0] NegResMSBS;
    logic CvtOp;
    logic FmaOp;
@ -135,6 +139,7 @@ module postprocess(
                              .XZeroM, .IntToFp, .OutFmt, .CvtResUf, .CvtShiftIn);
    fmashiftcalc fmashiftcalc(.SumM, .ZExpM, .ProdExpM, .FmaNormCntM, .FmtM, .KillProdM, .ConvNormSumExp,
                          .ZDenormM, .SumZero, .PreResultDenorm, .FmaShiftAmt, .FmaShiftIn);
    divshiftcalc divshiftcalc(.Quot, .DivCalcExpM, .CorrDivExp, .DivShiftAmt);
    always_comb
        case(PostProcSelM)
@ -143,12 +148,12 @@ module postprocess(
                ShiftIn =  {FmaShiftIn, {`NORMSHIFTSZ-(3*`NF+9){1'b0}}};
            end
            2'b00: begin // cvt
-                ShiftAmt = {{$clog2(`NORMSHIFTSZ)-$clog2(`LGLEN+1){1'b0}}, CvtShiftAmtM};
+                ShiftAmt = {{$clog2(`NORMSHIFTSZ)-$clog2(`CVTLEN+1){1'b0}}, CvtShiftAmtM};
-                ShiftIn =  {CvtShiftIn, {`NORMSHIFTSZ-`LGLEN-`NF-1{1'b0}}};
+                ShiftIn =  {CvtShiftIn, {`NORMSHIFTSZ-`CVTLEN-`NF-1{1'b0}}};
            end
-            2'b01: begin //div
+            2'b01: begin //div ***prob can take out
-                ShiftAmt = 0;//{DivShiftAmt};
+                ShiftAmt = DivShiftAmt;
-                ShiftIn =  0;//{{`NORMSHIFTSZ-(3*`NF+8){1'b0}}, DivShiftIn};
+                ShiftIn =  {Quot[`DIVLEN+1:0], {`NORMSHIFTSZ-`DIVLEN-2{1'b0}}};
            end
            default: begin 
                ShiftAmt = {$clog2(`NORMSHIFTSZ){1'bx}}; 
@ -171,9 +176,9 @@ module postprocess(
    // round to infinity
    // round to nearest max magnitude
-    round round(.OutFmt, .FrmM, .Sticky, .AddendStickyM, .ZZeroM, .Plus1, .PostProcSelM, .CvtCalcExpM,
+    round round(.OutFmt, .FrmM, .Sticky, .AddendStickyM, .ZZeroM, .Plus1, .PostProcSelM, .CvtCalcExpM, .CorrDivExp,
                .InvZM, .RoundSgn, .SumExp, .FmaOp, .CvtOp, .CvtResDenormUfM, .CorrShifted, .ToInt,  .CvtResUf,
-                .UfPlus1, .FullResExp, .ResFrac, .ResExp, .Round, .RoundAdd, .UfLSBRes, .RoundExp);
+                .DivOp, .UfPlus1, .FullResExp, .ResFrac, .ResExp, .Round, .RoundAdd, .UfLSBRes, .RoundExp);
    ///////////////////////////////////////////////////////////////////////////////
    // Sign calculation
@ -181,7 +186,7 @@ module postprocess(
    resultsign resultsign(.FrmM, .PSgnM, .ZSgnEffM, .InvZM, .SumExp, .Round, .Sticky,
                          .FmaOp, .DivOp, .CvtOp, .ZInfM, .InfIn, .NegSumM, .SumZero, .Mult, 
-                          .CvtResSgnM, .RoundSgn, .ResSgn);
+                          .XSgnM, .YSgnM, .CvtResSgnM, .RoundSgn, .ResSgn);
    ///////////////////////////////////////////////////////////////////////////////
    // Flags
--- a/pipelined/src/fpu/resultsign.sv
+++ b/pipelined/src/fpu/resultsign.sv
@ -4,6 +4,8 @@ module resultsign(
    input logic [2:0]   FrmM,
    input logic         PSgnM, ZSgnEffM,
    input logic         InvZM,
    input logic         XSgnM,
    input logic         YSgnM,
    input logic         ZInfM,
    input logic         InfIn,
    input logic         NegSumM,
@ -25,6 +27,7 @@ module resultsign(
    logic FmaResSgn;
    logic FmaResSgnTmp;
    logic Underflow;
    logic DivSgn;
    // logic ResultSgnTmp;
    // Determine the sign if the sum is zero
@ -43,9 +46,10 @@ module resultsign(
    assign InfSgn = ZInfM ? ZSgnEffM : PSgnM;
    assign FmaResSgn = InfIn ? InfSgn : SumZero ? ZeroSgn : FmaResSgnTmp;
-    // Sign for rounding calulation
+    assign DivSgn = XSgnM^YSgnM;
    assign RoundSgn = (FmaResSgnTmp&FmaOp) | (CvtResSgnM&CvtOp) | (1'b0&DivOp);
-    assign ResSgn = (FmaResSgn&FmaOp) | (CvtResSgnM&CvtOp) | (1'b0&DivOp);
+    // Sign for rounding calulation
    assign RoundSgn = (FmaResSgnTmp&FmaOp) | (CvtResSgnM&CvtOp) | (DivSgn&DivOp);
    assign ResSgn = (FmaResSgn&FmaOp) | (CvtResSgnM&CvtOp) | (DivSgn&DivOp);
 endmodule
--- a/pipelined/src/fpu/round.sv
+++ b/pipelined/src/fpu/round.sv
@ -11,6 +11,7 @@ module round(
    input logic  [`FMTBITS-1:0] OutFmt,       // precision 1 = double 0 = single
    input logic  [2:0]          FrmM,       // rounding mode
    input logic                 FmaOp,
    input logic                 DivOp,
    input logic [1:0] PostProcSelM,
    input logic                 CvtResDenormUfM,
    input logic                 ToInt,
@ -23,6 +24,7 @@ module round(
    input logic  [`NE+1:0]      SumExp,         // exponent of the normalized sum
    input logic                 RoundSgn,      // the result's sign
    input logic [`NE:0]           CvtCalcExpM,    // the calculated expoent
    input logic [`NE:0]           CorrDivExp,    // the calculated expoent
    output logic                UfPlus1,  // do you add or subtract on from the result
    output logic [`NE+1:0]      FullResExp,      // ResExp with bits to determine sign and overflow
    output logic [`NF-1:0]      ResFrac,         // Result fraction
@ -303,7 +305,7 @@ module round(
        case(PostProcSelM)
            2'b10: RoundExp = SumExp; // fma
            2'b00: RoundExp = {CvtCalcExpM[`NE], CvtCalcExpM}&{`NE+2{~CvtResDenormUfM|CvtResUf}}; // cvt
-            2'b01: RoundExp = 0; // divide
+            2'b01: RoundExp = {CorrDivExp[`NE], CorrDivExp[`NE:0]}; // divide
            default: RoundExp = 0; 
        endcase
--- a/pipelined/src/generic/flop/bram1p1rw.sv
+++ b/pipelined/src/generic/flop/bram1p1rw.sv
@ -54,10 +54,6 @@ module bram1p1rw
  logic [DATA_WIDTH-1:0] 			 RAM [(2**ADDR_WIDTH)-1:0];
  integer 							 i;
  initial begin
 	$readmemh("big64.txt", RAM);
  end
  always @ (posedge clk) begin
 	dout <= RAM[addr];    
 	if(we) begin
--- a/pipelined/src/uncore/clint.sv
+++ b/pipelined/src/uncore/clint.sv
@ -60,7 +60,7 @@ module clint (
  flopr #(16) entrydflop(HCLK, ~HRESETn, entry, entryd);
  assign HRESPCLINT = 0; // OK
-  assign HREADYCLINT = 1'b1; // *** needs to depend on DONE during accesses 
+  assign HREADYCLINT = 1'b1; // *** needs to depend on DONE during asynchronous MTIME accesses 
  // word aligned reads
  if (`XLEN==64) assign #2 entry = {HADDR[15:3], 3'b000};
@ -87,8 +87,7 @@ module clint (
    always_ff @(posedge HCLK or negedge HRESETn) 
      if (~HRESETn) begin
        MSIP <= 0;
-        MTIMECMP <= 0;
+        MTIMECMP <= 64'hFFFFFFFFFFFFFFFF; // Spec says MTIMECMP is not reset, but we reset to maximum value to prevent spurious timer interrupts
        // MTIMECMP is not reset
      end else if (memwrite) begin
        if (entryd == 16'h0000) MSIP <= HWDATA[0];
        if (entryd == 16'h4000) begin
@ -104,7 +103,6 @@ module clint (
    always_ff @(posedge HCLK or negedge HRESETn) 
      if (~HRESETn) begin
        MTIME <= 0;
        // MTIMECMP is not reset
      end else if (memwrite & entryd == 16'hBFF8) begin
        // MTIME Counter.  Eventually change this to run off separate clock.  Synchronization then needed
        for(j=0;j<`XLEN/8;j++)
--- a/pipelined/srt/Makefile
+++ b/pipelined/srt/Makefile
@ -1,4 +1,4 @@
-all: exptestgen testgen qslc_r4a2 qslc_r4a2b
+all: exptestgen testgen qslc_r4a2 qslc_r4a2b qslc_sqrt_r4a2
 sqrttestgen: sqrttestgen.c
 	gcc sqrttestgen.c -o sqrttestgen -lm
@ -19,5 +19,9 @@ qslc_r4a2b: qslc_r4a2b.c
 	gcc qslc_r4a2b.c -o qslc_r4a2b -lm
 	./qslc_r4a2b > qslc_r4a2b.tv
 qslc_sqrt_r4a2: qslc_sqrt_r4a2.c
 	gcc qslc_sqrt_r4a2.c -o qslc_sqrt_r4a2 -lm
 	./qslc_sqrt_r4a2 > qslc_sqrt_r4a2.sv
 clean:
-	rm -f testgen exptestgen qslc_r4a2 
+	rm -f testgen exptestgen qslc_r4a2 qslc_r4a2b qslc_sqrt_r4a2
--- a/pipelined/srt/qsel4.dat
+++ b/pipelined/srt/qsel4.dat
--- a/pipelined/srt/qsel4.sv
+++ b/pipelined/srt/qsel4.sv
@ -11,7 +11,7 @@ module qsel4 (
 	logic [2:0] Dmsbs;
 	assign PreWmsbs = WC[`DIVLEN+3:`DIVLEN-4] + WS[`DIVLEN+3:`DIVLEN-4];
 	assign Wmsbs = PreWmsbs[7:1];
-	assign Dmsbs = D[`DIVLEN-1:`DIVLEN-3];
+        assign Dmsbs = D[`DIVLEN-1:`DIVLEN-3];
 	// D = 0001.xxx...
 	// Dmsbs = |   |
    // W =      xxxx.xxx...
--- a/pipelined/srt/qslc_sqrt_r4a2
+++ b/pipelined/srt/qslc_sqrt_r4a2
--- a/pipelined/srt/qslc_sqrt_r4a2.c
+++ b/pipelined/srt/qslc_sqrt_r4a2.c
@ -0,0 +1,198 @@
 /*
  Program:      qslc_sqrt_r4a2.c
  Description:  Prints out Quotient Selection Table (assumes CPA is utilized to reduce memory)
  User:         James E. Stine
 */
 #include <stdio.h>
 #include <math.h>
 /* Field widths, in bits, used when building the quotient-selection table. */
 #define DIVISOR_SIZE 3
 #define CARRY_SIZE 7
 #define SUM_SIZE 7
 #define TOT_SIZE 7
 void disp_binary(double, int, int);
 /* Table index: the divisor bits and the partial-remainder estimate.
    tot is declared explicitly signed because the signedness of a plain
    `int` bit-field is implementation-defined in C, and main() compares
    tot against negative thresholds. */
 struct bits {
   unsigned int divisor : DIVISOR_SIZE;
   signed int tot : TOT_SIZE;
 } pla;
 /* 
   Function:      disp_binary
   Description:   This function prints a Double-Precision number to stdout
   as a fixed-point binary string, most significant bit first; negative
   values are printed in two's complement
   Argument List: double x            The value to be converted
   int bits_to_left    Number of bits left of radix point
   int bits_to_right   Number of bits right of radix point
   Return value:  none
 */
 void disp_binary(double x, int bits_to_left, int bits_to_right) {
   /* Print x as a fixed-point bit string, most significant bit first:
      bits_to_left digits before the radix point and bits_to_right after
      it.  No radix point character is printed. */
   int pos = 1 - bits_to_left;  /* the digit at pos has weight 2^-pos */
   double weight;
   /* Magnitude below one LSB: the whole field is zeros. */
   if (fabs(x) < pow(2.0, (double) -bits_to_right)) {
     while (pos <= bits_to_right) {
       putchar('0');
       pos++;
     }
     return;
   }
   /* Negative values are shown in two's complement over bits_to_left
      integer bits. */
   if (x < 0.0)
     x += pow(2.0, (double) bits_to_left);
   /* Walk from the largest weight down, subtracting every weight that
      still fits into what is left of x. */
   while (pos <= bits_to_right) {
     weight = pow(2.0, (double) -pos);
     if (x < weight) {
       putchar('0');
     } else {
       putchar('1');
       x -= weight;
     }
     pos++;
   }
 }
 int main() {
   /* Emits a SystemVerilog case statement mapping {divisor bits, remainder
      estimate} to a one-hot radix-4 quotient digit:
        1000 = +2, 0100 = +1, 0000 = 0, 0010 = -1, 0001 = -2
      thresholds[d] lists, for divisor index d, the smallest estimate that
      selects +2, +1, 0 and -1 respectively; anything below the last entry
      selects -2.  The table has one row per divisor value, so it must be
      extended if DIVISOR_SIZE is ever increased. */
   static const int thresholds[1 << DIVISOR_SIZE][4] = {
     { 24,  8,  -8, -26 },
     { 28,  8, -10, -28 },
     { 32,  8, -12, -32 },
     { 32,  8, -12, -34 },
     { 36, 12, -12, -36 },
     { 40, 12, -16, -40 },
     { 40, 16, -16, -44 },
     { 44, 16, -16, -46 }
   };
   static const char *const digit[5] = { "1000", "0100", "0000", "0010", "0001" };
   /* Integer loop bounds; no need for floating-point pow() here. */
   const int num_divisors = 1 << DIVISOR_SIZE;
   const int num_tots = 1 << TOT_SIZE;
   int o;
   int m;
   int k;
   pla.divisor = 0;
   pla.tot = 0;
   printf("\tcase({D[5:3],Wmsbs})\n");
   for (o = 0; o < num_divisors; o++) {
     for (m = 0; m < num_tots; m++) {
       const int *row = thresholds[pla.divisor];
       /* The literal width is derived from the field sizes so that it
          matches the number of digits actually printed. */
       printf("\t\t%d'b", DIVISOR_SIZE + TOT_SIZE);
       disp_binary((double) pla.divisor, DIVISOR_SIZE, 0);
       printf("_");
       disp_binary((double) pla.tot, TOT_SIZE, 0);
       printf(": q = 4'b");
       /* First threshold that tot reaches picks the digit. */
       k = 0;
       while (k < 4 && pla.tot < row[k])
         k++;
       printf("%s", digit[k]);
       printf(";\n");
       /* tot walks 0..63 and then wraps to -64..-1 (7-bit signed field). */
       (pla.tot)++;
     }
     (pla.divisor)++;
   }
   printf("\tendcase\n");
   return 0;
 }
--- a/pipelined/srt/qslc_sqrt_r4a2.sv
+++ b/pipelined/srt/qslc_sqrt_r4a2.sv
--- a/pipelined/srt/srt-radix4.do
+++ b/pipelined/srt/srt-radix4.do
@ -17,7 +17,7 @@ if [file exists work] {
 }
 vlib work
-vlog +incdir+../config/rv64gc +incdir+../config/shared srt-radix4.sv testbench-radix4.sv qsel4.sv ../src/generic/flop/flop*.sv ../src/generic/mux.sv ../src/generic/lzc.sv
+vlog +incdir+../config/rv64gc +incdir+../config/shared srt-radix4.sv testbench-radix4.sv ../src/generic/flop/flop*.sv ../src/generic/mux.sv ../src/generic/lzc.sv
 vopt +acc work.testbenchradix4 -o workopt 
 vsim workopt
--- a/pipelined/srt/srt-radix4.sv
+++ b/pipelined/srt/srt-radix4.sv
@ -30,42 +30,35 @@
 `include "wally-config.vh"
 `define DIVLEN ((`NF<(`XLEN)) ? (`XLEN) : `NF)
 module srtradix4 (
  input  logic clk,
-  input  logic Start, 
+  input  logic DivStart, 
-  input  logic Stall, // *** multiple pipe stages
+  input  logic [`NE-1:0] XExpE, YExpE,
-  input  logic Flush, // *** multiple pipe stages
+  input  logic [`NF:0] XManE, YManE,
  // Floating Point Inputs
  // later add exponents, signs, special cases
  input  logic       XSign, YSign,
  input  logic [`NE-1:0] XExp, YExp,
  input  logic [`NF-1:0] XFrac, YFrac,
  input  logic [`XLEN-1:0] SrcA, SrcB,
-  input  logic [1:0] Fmt, // Floats: 00 = 16 bit, 01 = 32 bit, 10 = 64 bit, 11 = 128 bit
+  input  logic XZeroE,
  input  logic       W64, // 32-bit ints on XLEN=64
  input  logic       Signed, // Interpret integers as signed 2's complement
  input  logic       Int, // Choose integer inputs
  input  logic       Sqrt, // perform square root, not divide
-  output logic       rsign,
+  output logic       DivDone,
-  output logic [`DIVLEN-1:0] Quot, Rem, // *** later handle integers
+  output logic [`DIVLEN+2:0] Quot,
-  output logic [`NE-1:0] rExp,
+  output logic [`XLEN-1:0] Rem, // *** later handle integers
-  output logic [3:0] Flags
+  output logic [`NE:0] DivCalcExpE
 );
  // logic           qp, qz, qm; // quotient is +1, 0, or -1
  logic [3:0]     q;
-  logic [`NE-1:0] calcExp;
+  logic [`NE:0] DivCalcExp;
-  logic           calcSign;
+  logic [`DIVLEN:0]    X;
-  logic [`DIVLEN-1:0]  X, Dpreproc;
+  logic [`DIVLEN-1:0]  Dpreproc;
  logic [`DIVLEN+3:0]  WS, WSA, WSN;
  logic [`DIVLEN+3:0]  WC, WCA, WCN;
  logic [`DIVLEN+3:0]  D, DBar, D2, DBar2, Dsel;
  logic [$clog2(`XLEN+1)-1:0] intExp;
  logic           intSign;
-  srtpreproc preproc(SrcA, SrcB, XFrac, YFrac, Fmt, W64, Signed, Int, Sqrt, X, Dpreproc, intExp, intSign);
+  srtpreproc preproc(SrcA, SrcB, XManE, YManE, W64, Signed, Int, Sqrt, X, Dpreproc, intExp, intSign);
  // Top Muxes and Registers
  // When start is asserted, the inputs are loaded into the divider.
@ -77,11 +70,11 @@ module srtradix4 (
  //  - otherwise load WSA into the flipflop
  //  *** what does N and A stand for?
  //  *** change shift amount for radix4
-  mux2   #(`DIVLEN+4) wsmux({WSA[`DIVLEN+1:0], 2'b0}, {4'b0001, X}, Start, WSN);
+  mux2   #(`DIVLEN+4) wsmux({WSA[`DIVLEN+1:0], 2'b0}, {3'b000, X}, DivStart, WSN);
  flop   #(`DIVLEN+4) wsflop(clk, WSN, WS);
-  mux2   #(`DIVLEN+4) wcmux({WCA[`DIVLEN+1:0], 2'b0}, {`DIVLEN+4{1'b0}}, Start, WCN);
+  mux2   #(`DIVLEN+4) wcmux({WCA[`DIVLEN+1:0], 2'b0}, {`DIVLEN+4{1'b0}}, DivStart, WCN);
  flop   #(`DIVLEN+4) wcflop(clk, WCN, WC);
-  flopen #(`DIVLEN+4) dflop(clk, Start, {4'b0001, Dpreproc}, D);
+  flopen #(`DIVLEN+4) dflop(clk, DivStart, {4'b0001, Dpreproc}, D);
  // Quotient Selection logic
  // Given partial remainder, select quotient of +1, 0, or -1 (qp, qz, pm)
@ -94,9 +87,8 @@ module srtradix4 (
 	// 0001 = -2
  qsel4 qsel4(.D, .WS, .WC, .q);
-  // Store the expoenent and sign until division is done
+  // Store the exponent and sign until division is done
-  flopen #(`NE) expflop(clk, Start, calcExp, rExp);
+  flopen #(`NE+1) expflop(clk, DivStart, DivCalcExp, DivCalcExpE);
  flopen #(1) signflop(clk, Start, calcSign, rsign);
  // Divisor Selection logic
  // *** radix 4 change to choose -2 to 2
@ -120,11 +112,11 @@ module srtradix4 (
  csa    #(`DIVLEN+4) csa(WS, WC, Dsel, |q[3:2], WSA, WCA);
  //*** change for radix 4
-  otfc4  #(`DIVLEN) otfc4(clk, Start, q, Quot);
+  otfc4 otfc4(.clk, .DivStart, .q, .Quot);
-  expcalc expcalc(.XExp, .YExp, .calcExp);
+  expcalc expcalc(.XExpE, .YExpE, .XZeroE, .DivCalcExp);
-  signcalc signcalc(.XSign, .YSign, .calcSign);
+  divcounter divcounter(clk, DivStart, DivDone);
 endmodule
@ -132,91 +124,154 @@ endmodule
 // Submodules //
 ////////////////
 /////////////
 // counter //
 /////////////
 module divcounter(input  logic clk, 
               input  logic DivStart, 
               output logic DivDone);
   logic    [5:0]  count;
  // This block of control logic sequences the divider
  // through its iterations.  You may modify it if you
  // build a divider which completes in fewer iterations.
  // You are not responsible for the (trivial) circuit
  // design of the block.
  //
  // DivStart clears count; otherwise count advances every clock.
  // DivDone is high for the one cycle that follows count reaching
  // `DIVLEN/2+1 and is low otherwise (including while count is still
  // unknown before the first DivStart).
  // DivDone is driven only with nonblocking assignments so that other
  // posedge-clk processes sample it without a race.
  // NOTE: count is 6 bits and keeps running after completion, so it wraps
  // and DivDone pulses again 64 cycles later unless DivStart restarts it.
  always @(posedge clk)
    begin
      if (count == `DIVLEN/2+1) DivDone <= #1 1;
      else                      DivDone <= #1 0;
      if (DivStart) count <= #1 0;
      else     count <= #1 count+1;
    end
 endmodule
 module qsel4 (
   input logic [`DIVLEN+3:0] D,
   input logic [`DIVLEN+3:0] WS, WC,
   output logic [3:0] q
 );
   logic [2:0] Dmsbs;           // divisor bits that pick the table row
   logic [7:0] PreWmsbs;        // 8-bit sum of the top slices of WC and WS
   logic [6:0] Wmsbs;           // upper 7 bits of that sum
   logic [3:0] QSel4[1023:0];   // lookup table addressed by {Dmsbs,Wmsbs}
   // D = 0001.xxx...
   // Dmsbs = |   |
   // W =      xxxx.xxx...
   // Wmsbs = |        |
   assign Dmsbs    = D[`DIVLEN-1:`DIVLEN-3];
   assign PreWmsbs = WC[`DIVLEN+3:`DIVLEN-4] + WS[`DIVLEN+3:`DIVLEN-4];
   assign Wmsbs    = PreWmsbs[7:1];
   // Pick the table entry for a signed remainder estimate: the first
   // threshold that rem reaches decides, from 4'b1000 for the highest
   // range down to 4'b0001 when rem is below every threshold.
   function automatic logic [3:0] selDigit(input integer rem,
                                           input integer t3, input integer t2,
                                           input integer t1, input integer t0);
     if      (rem >= t3) return 4'b1000;
     else if (rem >= t2) return 4'b0100;
     else if (rem >= t1) return 4'b0000;
     else if (rem >= t0) return 4'b0010;
     else                return 4'b0001;
   endfunction
   // Fill the table once at time zero: 8 divisor rows x 128 remainder columns.
   initial begin
     integer row, col, rem;
     for (row = 0; row < 8; row++)
       for (col = 0; col < 128; col++) begin
         // Reinterpret the 7-bit column index as a two's complement value.
         rem = (col >= 64) ? col - 128 : col;
         case (row)
           0: QSel4[row*128+col] = selDigit(rem, 12, 4, -4, -13);
           1: QSel4[row*128+col] = selDigit(rem, 14, 4, -6, -15);
           2: QSel4[row*128+col] = selDigit(rem, 15, 4, -6, -16);
           3: QSel4[row*128+col] = selDigit(rem, 16, 4, -6, -18);
           4: QSel4[row*128+col] = selDigit(rem, 18, 6, -8, -20);
           5: QSel4[row*128+col] = selDigit(rem, 20, 6, -8, -20);
           6: QSel4[row*128+col] = selDigit(rem, 20, 8, -8, -22);
           7: QSel4[row*128+col] = selDigit(rem, 24, 8, -8, -24);
         endcase
       end
   end
   assign q = QSel4[{Dmsbs,Wmsbs}];
 endmodule
 ///////////////////
 // Preprocessing //
 ///////////////////
 module srtpreproc (
  input  logic [`XLEN-1:0] SrcA, SrcB,
-  input  logic [`NF-1:0] XFrac, YFrac,
+  input  logic [`NF:0] XManE, YManE,
  input  logic [1:0] Fmt, // Floats: 00 = 16 bit, 01 = 32 bit, 10 = 64 bit, 11 = 128 bit
  input  logic       W64, // 32-bit ints on XLEN=64
  input  logic       Signed, // Interpret integers as signed 2's complement
  input  logic       Int, // Choose integer inputs
  input  logic       Sqrt, // perform square root, not divide
-  output logic [`DIVLEN-1:0] X, D,
+  output logic [`DIVLEN:0] X,
  output logic [`DIVLEN-1:0] Dpreproc,
  output logic [$clog2(`XLEN+1)-1:0] intExp, // Quotient integer exponent
  output logic       intSign // Quotient integer sign
 );
-  logic  [$clog2(`XLEN+1)-1:0] zeroCntA, zeroCntB;
+  // logic  [$clog2(`XLEN+1)-1:0] zeroCntA, zeroCntB;
-  logic  [`XLEN-1:0] PosA, PosB;
+  // logic  [`XLEN-1:0] PosA, PosB;
-  logic  [`DIVLEN-1:0] ExtraA, ExtraB, PreprocA, PreprocB, PreprocX, PreprocY;
+  // logic  [`DIVLEN-1:0] ExtraA, ExtraB, PreprocA, PreprocB, PreprocX, PreprocY;
  logic  [`DIVLEN:0] PreprocA, PreprocX;
  logic  [`DIVLEN-1:0] PreprocB, PreprocY;
-  assign PosA = (Signed & SrcA[`XLEN - 1]) ? -SrcA : SrcA;
+  // assign PosA = (Signed & SrcA[`XLEN - 1]) ? -SrcA : SrcA;
-  assign PosB = (Signed & SrcB[`XLEN - 1]) ? -SrcB : SrcB;
+  // assign PosB = (Signed & SrcB[`XLEN - 1]) ? -SrcB : SrcB;
-  lzc #(`XLEN) lzcA (PosA, zeroCntA);
+  // lzc #(`XLEN) lzcA (PosA, zeroCntA);
-  lzc #(`XLEN) lzcB (PosB, zeroCntB);
+  // lzc #(`XLEN) lzcB (PosB, zeroCntB);
-  assign ExtraA = {PosA, {`DIVLEN-`XLEN{1'b0}}};
+  // assign ExtraA = {PosA, {`DIVLEN-`XLEN{1'b0}}};
-  assign ExtraB = {PosB, {`DIVLEN-`XLEN{1'b0}}};
+  // assign ExtraB = {PosB, {`DIVLEN-`XLEN{1'b0}}};
-  assign PreprocA = ExtraA << zeroCntA;
+  // assign PreprocA = ExtraA << zeroCntA;
-  assign PreprocB = ExtraB << (zeroCntB + 1);
+  // assign PreprocB = ExtraB << (zeroCntB + 1);
-  assign PreprocX = {XFrac, {`DIVLEN-`NF{1'b0}}};
+  assign PreprocX = {XManE, {`DIVLEN-`NF{1'b0}}};
-  assign PreprocY = {YFrac, {`DIVLEN-`NF{1'b0}}};
+  assign PreprocY = {YManE[`NF-1:0], {`DIVLEN-`NF{1'b0}}};
  assign X = Int ? PreprocA : PreprocX;
-  assign D = Int ? PreprocB : PreprocY;
+  assign Dpreproc = Int ? PreprocB : PreprocY;
-  assign intExp = zeroCntB - zeroCntA + 1;
+  // assign intExp = zeroCntB - zeroCntA + 1;
-  assign intSign = Signed & (SrcA[`XLEN - 1] ^ SrcB[`XLEN - 1]);
+  // assign intSign = Signed & (SrcA[`XLEN - 1] ^ SrcB[`XLEN - 1]);
 endmodule
 /////////////////////////////////
 // Quotient Selection, Radix 2 //
 /////////////////////////////////
 module qsel2 ( // *** eventually just change to 4 bits
   input  logic [`DIVLEN+3:`DIVLEN] ps, pc, 
   output logic         qp, qz, qm
 );
   // Bitwise propagate / generate terms of the 4-bit sum ps + pc
   logic [`DIVLEN+3:`DIVLEN]  prop, gen;
   logic          carry;     // carry out of the three low bits
   logic          negative;  // top bit of ps + pc
   logic          nonzero;   // low only when the three low propagate bits are all 1
   // The quotient selection logic is presented for simplicity, not
   // for efficiency.  Quotient equations from EE371 lecture notes 13-20.
   assign prop = ps ^ pc;
   assign gen  = ps & pc;
   assign #1 nonzero  = ~&prop[`DIVLEN+2:`DIVLEN];
   assign #1 carry    = gen[`DIVLEN+2] | (prop[`DIVLEN+2] & (gen[`DIVLEN+1] | (prop[`DIVLEN+1] & gen[`DIVLEN])));
   assign #1 negative = prop[`DIVLEN+3] ^ carry;
   // Produce quotient = +1, 0, or -1 (exactly one of qp/qz/qm is high)
   assign #1 qz = ~nonzero;
   assign #1 qp = nonzero & ~negative;
   assign #1 qm = nonzero & negative;
 endmodule
 ///////////////////////////////////
 // On-The-Fly Converter, Radix 4 //
 ///////////////////////////////////
-module otfc4 #(parameter N=65) (
+module otfc4 (
  input  logic         clk,
-  input  logic         Start,
+  input  logic         DivStart,
  input  logic [3:0]   q,
-  output logic [N-1:0] r
+  output logic [`DIVLEN+2:0] Quot
 );
  //  The on-the-fly converter transfers the quotient 
@ -224,20 +279,20 @@ module otfc4 #(parameter N=65) (
  //
  //  This code follows the pseudocode presented in the 
  //  floating point chapter of the book. Right now, 
-  //  it is written for Radix-2 division.
+  //  it is written for Radix-4 division.
  //
  //  QM is Q-1. It allows us to write negative bits 
  //  without using a costly CPA. 
-  logic [N+2:0] Q, QM, QNext, QMNext, QMux, QMMux;
+  logic [`DIVLEN+2:0] QM, QNext, QMNext, QMux, QMMux;
  //  QR and QMR are the shifted versions of Q and QM.
  //  They are treated as [N-1:r] size signals, and 
  //  discard the r most significant bits of Q and QM. 
-  logic [N:0] QR, QMR;
+  logic [`DIVLEN:0] QR, QMR;
  // if starting a new divison set Q to 0 and QM to -1
-  mux2 #(N+3) Qmux(QNext, {N+3{1'b0}}, Start, QMux);
+  mux2 #(`DIVLEN+3) Qmux(QNext, {`DIVLEN+3{1'b0}}, DivStart, QMux);
-  mux2 #(N+3) QMmux(QMNext, {N+3{1'b1}}, Start, QMMux);
+  mux2 #(`DIVLEN+3) QMmux(QMNext, {`DIVLEN+3{1'b1}}, DivStart, QMMux);
-  flop #(N+3) Qreg(clk, QMux, Q);
+  flop #(`DIVLEN+3) Qreg(clk, QMux, Quot);
-  flop #(N+3) QMreg(clk, QMMux, QM);
+  flop #(`DIVLEN+3) QMreg(clk, QMMux, QM);
  // shift Q (quotent) and QM (quotent-1)
 		// if 	q = 2  	    Q = {Q, 10} 	QM = {Q, 01}		
@ -247,11 +302,9 @@ module otfc4 #(parameter N=65) (
 		// else if 	q = -2	Q = {QM, 10} 	QM = {QM, 01}
    // *** how does the 0 concatination numbers work?
  always_comb begin
-    QR  = Q[N:0];
+    QR  = Quot[`DIVLEN:0];
-    QMR = QM[N:0];     // Shift Q and QM
+    QMR = QM[`DIVLEN:0];     // Shift Q and QM
    if (q[3]) begin // +2
      QNext  = {QR,  2'b10};
      QMNext = {QR,  2'b01};
@ -269,7 +322,8 @@ module otfc4 #(parameter N=65) (
      QMNext = {QMR, 2'b11};
    end 
  end
-  assign r = Q[N+2] ? Q[N+1:2] : Q[N:1];
+  // Quot is in the range [.5, 2) so normalize the result if necessary
  // assign Quot = Q[`DIVLEN+2] ? Q[`DIVLEN+1:2] : Q[`DIVLEN:1];
 endmodule
@ -287,7 +341,7 @@ module csa #(parameter N=69) (
  // This block adds in1, in2, in3, and cin to produce 
  // a result out1 / out2 in carry-save redundant form.
  // cin is just added to the least significant bit and
-  // is required to handle adding a negative divisor.
+  // is required to handle adding a negative divisor.
  // Fortunately, the carry (out2) is shifted left by one
  // bit, leaving room in the least significant bit to 
  // insert cin.
@ -302,22 +356,11 @@ endmodule
 // expcalc  //
 //////////////
 module expcalc(
-  input logic  [`NE-1:0] XExp, YExp,
+  input logic  [`NE-1:0] XExpE, YExpE,
-  output logic [`NE-1:0] calcExp
+  input logic XZeroE,
  output logic [`NE:0] DivCalcExp
 );
-  assign calcExp = XExp - YExp + (`NE)'(`BIAS);
+  assign DivCalcExp = (XExpE - YExpE + (`NE)'(`BIAS))&{`NE+1{~XZeroE}};
 endmodule
 //////////////
 // signcalc //
 //////////////
 module signcalc(
   input logic  XSign, YSign,
   output logic calcSign
 );
   // The result is negative exactly when the two operand signs differ.
   assign calcSign = (XSign != YSign);
 endmodule
--- a/pipelined/srt/testbench-radix4.sv
+++ b/pipelined/srt/testbench-radix4.sv
@ -2,30 +2,6 @@
 `include "wally-config.vh"
 `define DIVLEN ((`NF<`XLEN) ? `XLEN : `NF)
 /////////////
 // counter //
 /////////////
 module counter(input  logic clk, 
               input  logic req, 
               output logic done);
   logic    [5:0]  count;
  // Control logic that sequences the divider through its iterations.
  // It may be modified for a divider that completes in fewer iterations.
  //
  // Iteration count: cleared by req, otherwise advances on every clock.
  always @(posedge clk)
    if (req) count <= #1 0;
    else     count <= #1 count+1;
  // done is set once count reaches `DIVLEN/2+1 and is cleared on a later
  // clock, either because it is already set or because req is asserted.
  always @(posedge clk)
    if      (count == `DIVLEN/2+1) done <= #1 1'b1;
    else if (done | req)           done <= #1 1'b0;
 endmodule
 ///////////
 // clock //
 ///////////
@ -43,7 +19,7 @@ endmodule
 module testbenchradix4;
  logic              clk;
  logic              req;
-  logic              done;
+  logic              DivDone;
  logic [63:0]       a, b;
  logic [51:0]       afrac, bfrac;
  logic [10:0]       aExp, bExp;
@ -65,22 +41,20 @@ module testbenchradix4;
  logic [MEM_WIDTH-1:0] Vec;  // Verilog doesn't allow direct access to a
                            // bit field of an array 
  logic [63:0] correctr, nextr, diffn, diffp;
-  logic [10:0] rExp;
+  logic [10:0] DivExp;
-  logic        rsign;
+  logic        DivSgn;
  integer testnum, errors;
  // Divider
-  srtradix4 srtradix4(.clk, .Start(req), 
+  srtradix4 srtradix4(.clk, .DivStart(req), 
-                .Stall(1'b0), .Flush(1'b0), 
+                .XExpE(aExp), .YExpE(bExp), .DivExp,
-                .XExp(aExp), .YExp(bExp), .rExp,
+                .XSgnE(asign), .YSgnE(bsign), .DivSgn,
                .XSign(asign), .YSign(bsign), .rsign,
                .XFrac(afrac), .YFrac(bfrac), 
-                .SrcA('0), .SrcB('0), .Fmt(2'b00), 
+                .SrcA('0), .SrcB('0),
-                .W64(1'b0), .Signed(1'b0), .Int(1'b0), .Sqrt(1'b0), 
+                .W64(1'b0), .Signed(1'b0), .Int(1'b0), .Sqrt(1'b0), .DivDone,
-                .Quot, .Rem(), .Flags());
+                .Quot, .Rem());
  // Counter
  counter counter(clk, req, done);
    initial
@ -112,14 +86,14 @@ module testbenchradix4;
  always @(posedge clk)
    begin
      r = Quot[`DIVLEN-1:`DIVLEN - 52];
-      if (done) begin
+      if (DivDone) begin
        req <= 1;
        diffp = correctr[51:0] - r;
        diffn = r - correctr[51:0];
-        if ((rsign !== correctr[63]) | (rExp !== correctr[62:52]) | ($signed(diffn) > 1) | ($signed(diffp) > 1) | (diffn === 64'bx) | (diffp === 64'bx)) // check if accurate to 1 ulp
+        if ((DivSgn !== correctr[63]) | (DivExp !== correctr[62:52]) | ($signed(diffn) > 1) | ($signed(diffp) > 1) | (diffn === 64'bx) | (diffp === 64'bx)) // check if accurate to 1 ulp
          begin
            errors = errors+1;
-            $display("result was %h_%h, should be %h %h %h\n", rExp, r, correctr, diffn, diffp);
+            $display("result was %h_%h, should be %h %h %h\n", DivExp, r, correctr, diffn, diffp);
            $display("failed\n");
            $stop;
          end
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@ -48,13 +48,14 @@ module testbenchfp;
  logic                 XInf, YInf, ZInf;                   // is the input infinity
  logic                 XZero, YZero, ZZero;                // is the input zero
  logic                 XExpMax, YExpMax, ZExpMax;         // is the input's exponent all ones  
-  logic  [`LGLEN-1:0]      CvtLzcInE;      // input to the Leading Zero Counter (priority encoder)
+  logic  [`CVTLEN-1:0]      CvtLzcInE;      // input to the Leading Zero Counter (priority encoder)
  logic        IntZeroE;
  logic CvtResSgnE;
  logic [`XLEN-1:0] Empty1,Empty2,Empty3,Empty4,Empty5;
  logic [`NE:0]           CvtCalcExpE;    // the calculated expoent
-	logic [`LOGLGLEN-1:0] CvtShiftAmtE;  // how much to shift by
+	logic [`LOGCVTLEN-1:0] CvtShiftAmtE;  // how much to shift by
 	logic [`DIVLEN+2:0] Quot;
  logic CvtResDenormUfE;
  logic DivStart, DivDone;
  // in-between FMA signals
@ -68,6 +69,8 @@ module testbenchfp;
  logic 			          NegSumE;
  logic 			          ZSgnEffE;
  logic 			          PSgnE;
  logic       DivSgn;
  logic [`NE:0] DivCalcExp;
  ///////////////////////////////////////////////////////////////////////////////////////////////
@ -205,16 +208,16 @@ module testbenchfp;
            Fmt = {Fmt, 2'b11};
          end
      end
-      // if (TEST === "div"   | TEST === "all") begin // if division is being tested
+      if (TEST === "div"   | TEST === "all") begin // if division is being tested
-      //   // add the divide tests/op-ctrls/unit/fmt
+        // add the divide tests/op-ctrls/unit/fmt
-      //   Tests = {Tests, f128div};
+        Tests = {Tests, f128div};
-      //   OpCtrl = {OpCtrl, `DIV_OPCTRL};
+        OpCtrl = {OpCtrl, `DIV_OPCTRL};
-      //   WriteInt = {WriteInt, 1'b0};
+        WriteInt = {WriteInt, 1'b0};
-      //     for(int i = 0; i<5; i++) begin
+          for(int i = 0; i<5; i++) begin
-      //       Unit = {Unit, `DIVUNIT};
+            Unit = {Unit, `DIVUNIT};
-      //       Fmt = {Fmt, 2'b11};
+            Fmt = {Fmt, 2'b11};
-      //     end
+          end
-      // end
+      end
      // if (TEST === "sqrt"  | TEST === "all") begin // if square-root is being tested
      //   // add the square-root tests/op-ctrls/unit/fmt
      //   Tests = {Tests, f128sqrt};
@ -332,16 +335,16 @@ module testbenchfp;
          Fmt = {Fmt, 2'b01};
        end
      end
-      // if (TEST === "div"   | TEST === "all") begin // if division is being tested
+      if (TEST === "div"   | TEST === "all") begin // if division is being tested
-      //   // add the correct tests/op-ctrls/unit/fmt to their lists
+        // add the correct tests/op-ctrls/unit/fmt to their lists
-      //   Tests = {Tests, f64div};
+        Tests = {Tests, f64div};
-      //   OpCtrl = {OpCtrl, `DIV_OPCTRL};
+        OpCtrl = {OpCtrl, `DIV_OPCTRL};
-      //   WriteInt = {WriteInt, 1'b0};
+        WriteInt = {WriteInt, 1'b0};
-      //   for(int i = 0; i<5; i++) begin
+        for(int i = 0; i<5; i++) begin
-      //     Unit = {Unit, `DIVUNIT};
+          Unit = {Unit, `DIVUNIT};
-      //     Fmt = {Fmt, 2'b01};
+          Fmt = {Fmt, 2'b01};
-      //   end
+        end
-      // end
+      end
      // if (TEST === "sqrt"  | TEST === "all") begin // if square-root is being tessted
      //   // add the correct tests/op-ctrls/unit/fmt to their lists
      //   Tests = {Tests, f64sqrt};
@ -443,16 +446,16 @@ module testbenchfp;
          Fmt = {Fmt, 2'b00};
        end
      end
-      // if (TEST === "div"   | TEST === "all") begin // if division is being tested
+      if (TEST === "div"   | TEST === "all") begin // if division is being tested
-      //   // add the correct tests/op-ctrls/unit/fmt to their lists
+        // add the correct tests/op-ctrls/unit/fmt to their lists
-      //   Tests = {Tests, f32div};
+        Tests = {Tests, f32div};
-      //   OpCtrl = {OpCtrl, `DIV_OPCTRL};
+        OpCtrl = {OpCtrl, `DIV_OPCTRL};
-      //   WriteInt = {WriteInt, 1'b0};
+        WriteInt = {WriteInt, 1'b0};
-      //   for(int i = 0; i<5; i++) begin
+        for(int i = 0; i<5; i++) begin
-      //     Unit = {Unit, `DIVUNIT};
+          Unit = {Unit, `DIVUNIT};
-      //     Fmt = {Fmt, 2'b00};
+          Fmt = {Fmt, 2'b00};
-      //   end
+        end
-      // end
+      end
      // if (TEST === "sqrt"  | TEST === "all") begin // if sqrt is being tested
      //   // add the correct tests/op-ctrls/unit/fmt to their lists
      //   Tests = {Tests, f32sqrt};
@ -536,16 +539,16 @@ module testbenchfp;
          Fmt = {Fmt, 2'b10};
        end
      end
-      // if (TEST === "div"   | TEST === "all") begin // if division is being tested
+      if (TEST === "div"   | TEST === "all") begin // if division is being tested
-      //   // add the correct tests/op-ctrls/unit/fmt to their lists
+        // add the correct tests/op-ctrls/unit/fmt to their lists
-      //   Tests = {Tests, f16div};
+        Tests = {Tests, f16div};
-      //   OpCtrl = {OpCtrl, `DIV_OPCTRL};
+        OpCtrl = {OpCtrl, `DIV_OPCTRL};
-      //   WriteInt = {WriteInt, 1'b0};
+        WriteInt = {WriteInt, 1'b0};
-      //   for(int i = 0; i<5; i++) begin
+        for(int i = 0; i<5; i++) begin
-      //     Unit = {Unit, `DIVUNIT};
+          Unit = {Unit, `DIVUNIT};
-      //     Fmt = {Fmt, 2'b10};
+          Fmt = {Fmt, 2'b10};
-      //   end
+        end
-      // end
+      end
      // if (TEST === "sqrt"  | TEST === "all") begin // if sqrt is being tested
      //   // add the correct tests/op-ctrls/unit/fmt to their lists
      //   Tests = {Tests, f16sqrt};
@ -611,7 +614,7 @@ module testbenchfp;
  readvectors readvectors          (.clk, .Fmt(FmtVal), .ModFmt, .TestVector(TestVectors[VectorNum]), .VectorNum, .Ans(Ans), .AnsFlg(AnsFlg), .SrcA, 
                                    .XSgnE(XSgn), .YSgnE(YSgn), .ZSgnE(ZSgn), .Unit (UnitVal),
                                    .XExpE(XExp), .YExpE(YExp), .ZExpE(ZExp), .TestNum, .OpCtrl(OpCtrlVal),
-                                    .XManE(XMan), .YManE(YMan), .ZManE(ZMan),
+                                    .XManE(XMan), .YManE(YMan), .ZManE(ZMan), .DivStart,
                                    .XNaNE(XNaN), .YNaNE(YNaN), .ZNaNE(ZNaN),
                                    .XSNaNE(XSNaN), .YSNaNE(YSNaN), .ZSNaNE(ZSNaN), 
                                    .XDenormE(XDenorm), .ZDenormE(ZDenorm), 
@ -639,8 +642,8 @@ module testbenchfp;
              .FOpCtrlE(OpCtrlVal), .FmtE(ModFmt), .SumE, .NegSumE, .InvZE, .FmaNormCntE, .ZSgnEffE, .PSgnE,
              .ProdExpE, .AddendStickyE, .KillProdE); 
-  postprocess postprocess(.XSgnM(XSgn), .PostProcSelM(UnitVal[1:0]),
+  postprocess postprocess(.XSgnM(XSgn), .YSgnM(YSgn), .PostProcSelM(UnitVal[1:0]),
-              .ZExpM(ZExp),  .ZDenormM(ZDenorm), .FOpCtrlM(OpCtrlVal),
+              .ZExpM(ZExp),  .ZDenormM(ZDenorm), .FOpCtrlM(OpCtrlVal), .Quot, .DivCalcExpM(DivCalcExp),
              .XManM(XMan), .YManM(YMan), .ZManM(ZMan), .CvtCalcExpM(CvtCalcExpE),
              .XNaNM(XNaN), .YNaNM(YNaN), .ZNaNM(ZNaN), .CvtResDenormUfM(CvtResDenormUfE),
              .XZeroM(XZero), .YZeroM(YZero), .ZZeroM(ZZero), .CvtShiftAmtM(CvtShiftAmtE),
@ -650,20 +653,15 @@ module testbenchfp;
              .SumM(SumE), .NegSumM(NegSumE), .InvZM(InvZE), .FmaNormCntM(FmaNormCntE), .ZSgnEffM(ZSgnEffE), .PSgnM(PSgnE), .FmtM(ModFmt), .FrmM(FrmVal), 
              .PostProcFlgM(Flg), .PostProcResM(FpRes), .FCvtIntResM(IntRes));
-fcvt fcvt (.XSgnE(XSgn), .XExpE(XExp), .XManE(XMan), .ForwardedSrcAE(SrcA), .FWriteIntE(WriteIntVal), 
+  fcvt fcvt (.XSgnE(XSgn), .XExpE(XExp), .XManE(XMan), .ForwardedSrcAE(SrcA), .FWriteIntE(WriteIntVal), 
            .XZeroE(XZero), .XDenormE(XDenorm), .FOpCtrlE(OpCtrlVal), .IntZeroE,
            .FmtE(ModFmt), .CvtCalcExpE, .CvtShiftAmtE, .CvtResDenormUfE, .CvtResSgnE, .CvtLzcInE);
  fcmp fcmp   (.FmtE(ModFmt), .FOpCtrlE(OpCtrlVal), .XSgnE(XSgn), .YSgnE(YSgn), .XExpE(XExp), .YExpE(YExp), 
              .XManE(XMan), .YManE(YMan), .XZeroE(XZero), .YZeroE(YZero), .CmpIntResE(CmpRes),
              .XNaNE(XNaN), .YNaNE(YNaN), .XSNaNE(XSNaN), .YSNaNE(YSNaN), .FSrcXE(X), .FSrcYE(Y), .CmpNVE(CmpFlg[4]), .CmpFpResE(FpCmpRes));
-  // fcvtint fcvtint (.XSgnE(XSgn), .XExpE(XExp), .XManE(XMan), .XZeroE(XZero), .XNaNE(XNaN), .XInfE(XInf), 
+  srtradix4 srtradix4(.clk, .DivStart, .XExpE(XExp), .YExpE(YExp), .DivCalcExpE(DivCalcExp), .XZeroE(XZero),
-  //                 .XDenormE(XDenorm), .ForwardedSrcAE(SrcA), .FOpCtrlE, .FmtE(ModFmt), .FrmE(Frmal),
+                .XManE(XMan), .YManE(YMan), .SrcA('0), .SrcB('0), .W64(1'b0), .Signed(1'b0), .Int(1'b0), .Sqrt(OpCtrlVal[0]), 
-  //                 .CvtRes, .CvtFlgE);
+                .DivDone, .Quot, .Rem());
  // *** integrate divide and square root
  //  fpdiv_pipe fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmVal[1:0]), .op_type(FOpCtrlQ), 
  //        .reset, .clk(clk), .start(FDivStartE), .P(~FmtQ), .OvEn(1'b1), .UnEn(1'b1),
  //        .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ, .load_preload,
  //        .FDivBusyE, .done(FDivSqrtDoneE), .AS_Res(FDivRes), .Flg(FDivFlg));
  assign CmpFlg[3:0] = 0;
@ -817,7 +815,7 @@ end
  ///////////////////////////////////////////////////////////////////////////////////////////////
    // check if the non-fma test is correct
-    if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT)) begin
+    // Compare every non-CVTINT/non-CMP result against the expected answer; for the divide unit,
+    // hold off until DivDone is set because the divider takes several cycles to produce its result.
+    if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&(DivDone|(UnitVal !== `DIVUNIT))&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT)) begin
      errors += 1;
      $display("There is an error in %s", Tests[TestNum]);
      $display("inputs: %h %h %h\nSrcA: %h\n Res: %h %h\n Ans: %h %h", X, Y, Z, SrcA, Res, ResFlg, Ans, AnsFlg);
@ -840,8 +838,7 @@ end
      $stop;
    end
-
+    // Advance exactly once per completed test: immediately for single-cycle units,
+    // and only when DivDone is set while the divide unit is selected.
+    if(DivDone|(UnitVal != `DIVUNIT)) VectorNum += 1; // increment the vector
    if (TestVectors[VectorNum][0] === 1'bx & Tests[TestNum] !== "") begin // if reached the end of file
@ -895,15 +892,17 @@ module readvectors (
  output logic                    XDenormE, ZDenormE,   // is XYZ denormalized
  output logic                    XZeroE, YZeroE, ZZeroE,         // is XYZ zero
  output logic                    XInfE, YInfE, ZInfE,            // is XYZ infinity
-  output logic XExpMaxE,
+  output logic                    XExpMaxE,
  output logic                    DivStart,
  output logic [`FLEN-1:0] X, Y, Z
 );
  // apply test vectors on rising edge of clk
  // Format of vectors Inputs(1/2/3)_AnsFlg
-  always @(posedge clk) begin
+  always @(VectorNum) begin
    #1; 
    AnsFlg = TestVector[4:0];
    DivStart = 1'b0;
    case (Unit)
      `FMAUNIT:
        case (Fmt)
@ -972,21 +971,33 @@ module readvectors (
            X = TestVector[8+3*(`Q_LEN)-1:8+2*(`Q_LEN)];
            Y = TestVector[8+2*(`Q_LEN)-1:8+(`Q_LEN)];
            Ans = TestVector[8+(`Q_LEN-1):8];
            if (~clk) #5;
            DivStart = 1'b1; #10 // one clk cycle
            DivStart = 1'b0;
          end
          2'b01:	begin	  // double
            X = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+3*(`D_LEN)-1:8+2*(`D_LEN)]};
            Y = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+2*(`D_LEN)-1:8+(`D_LEN)]};
            Ans = {{`FLEN-`D_LEN{1'b1}}, TestVector[8+(`D_LEN-1):8]};
            if (~clk) #5;
            DivStart = 1'b1; #10
            DivStart = 1'b0;
          end
          2'b00:	begin	  // single
            X = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+3*(`S_LEN)-1:8+2*(`S_LEN)]};
            Y = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+2*(`S_LEN)-1:8+1*(`S_LEN)]};
            Ans = {{`FLEN-`S_LEN{1'b1}}, TestVector[8+(`S_LEN-1):8]};
            if (~clk) #5;
            DivStart = 1'b1; #10
            DivStart = 1'b0;
          end
          2'b10:	begin	  // half
            X = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+3*(`H_LEN)-1:8+2*(`H_LEN)]};
            Y = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+2*(`H_LEN)-1:8+(`H_LEN)]};
            Ans = {{`FLEN-`H_LEN{1'b1}}, TestVector[8+(`H_LEN-1):8]};
            if (~clk) #5;
            DivStart = 1'b1; #10
            DivStart = 1'b0;
          end
        endcase
      `CMPUNIT:
--- a/pipelined/testbench/testbench.sv.bak
+++ b/pipelined/testbench/testbench.sv.bak
@ -1,473 +0,0 @@
 ///////////////////////////////////////////
 // testbench.sv
 //
 // Written: David_Harris@hmc.edu 9 January 2021
 // Modified: 
 //
 // Purpose: Wally Testbench and helper modules
 //          Applies test programs from the riscv-arch-test and Imperas suites
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // MIT LICENSE
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this 
 // software and associated documentation files (the "Software"), to deal in the Software 
 // without restriction, including without limitation the rights to use, copy, modify, merge, 
 // publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
 // to whom the Software is furnished to do so, subject to the following conditions:
 //
 //   The above copyright notice and this permission notice shall be included in all copies or 
 //   substantial portions of the Software.
 //
 //   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
 //   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
 //   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 //   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
 //   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
 //   OR OTHER DEALINGS IN THE SOFTWARE.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 `include "wally-config.vh"
 `include "tests.vh"
 module testbench;
  parameter TESTSPERIPH = 0; // set to 0 for regression
  parameter TESTSPRIV = 0; // set to 0 for regression
  parameter DEBUG=0;
  parameter TEST="none";
  logic        clk;
  logic        reset_ext, reset;
  parameter SIGNATURESIZE = 5000000;
  int test, i, errors, totalerrors;
  logic [31:0] sig32[0:SIGNATURESIZE];
  logic [`XLEN-1:0] signature[0:SIGNATURESIZE];
  logic [`XLEN-1:0] testadr;
  string InstrFName, InstrDName, InstrEName, InstrMName, InstrWName;
  logic [31:0] InstrW;
 string tests[];
 logic [3:0] dummy;
  string ProgramAddrMapFile, ProgramLabelMapFile;
  logic [`AHBW-1:0] HRDATAEXT;
  logic             HREADYEXT, HRESPEXT;
  logic [31:0]      HADDR;
  logic [`AHBW-1:0] HWDATA;
  logic             HWRITE;
  logic [2:0]       HSIZE;
  logic [2:0]       HBURST;
  logic [3:0]       HPROT;
  logic [1:0]       HTRANS;
  logic             HMASTLOCK;
  logic             HCLK, HRESETn;
  logic [`XLEN-1:0] PCW;
  logic 	    DCacheFlushDone, DCacheFlushStart;
  flopenr #(`XLEN) PCWReg(clk, reset, ~dut.core.ieu.dp.StallW, dut.core.ifu.PCM, PCW);
  flopenr  #(32)   InstrWReg(clk, reset, ~dut.core.ieu.dp.StallW,  dut.core.ifu.InstrM, InstrW);
  // check assertions for a legal configuration
  riscvassertions riscvassertions();
  // pick tests based on modes supported
  initial begin
    $display("TEST is %s", TEST);
    //tests = '{};
    if (`XLEN == 64) begin // RV64
      case (TEST)
        "arch64i":                        tests = arch64i;
        "arch64priv":                     tests = arch64priv;
        "arch64c":      if (`C_SUPPORTED) 
                          if (`ZICSR_SUPPORTED) tests = {arch64c, arch64cpriv};
                          else                  tests = {arch64c};
        "arch64m":      if (`M_SUPPORTED) tests = arch64m;
        "arch64d":      if (`D_SUPPORTED) tests = arch64d;
        "imperas64i":                     tests = imperas64i;
        "imperas64p":                     tests = imperas64p;
 //        "imperas64mmu": if (`VIRTMEM_SUPPORTED) tests = imperas64mmu;
        "imperas64f":   if (`F_SUPPORTED) tests = imperas64f;
        "imperas64d":   if (`D_SUPPORTED) tests = imperas64d;
        "imperas64m":   if (`M_SUPPORTED) tests = imperas64m;
        "imperas64a":   if (`A_SUPPORTED) tests = imperas64a;
        "imperas64c":   if (`C_SUPPORTED) tests = imperas64c;
                        else              tests = imperas64iNOc;
        "testsBP64":                      tests = testsBP64;
        "wally64i":                       tests = wally64i; // *** redo
        "wally64priv":                    tests = wally64priv;// *** redo
        "imperas64periph":                tests = imperas64periph;
        "coremark":                       tests = coremark;
      endcase 
    end else begin // RV32
      case (TEST)
        "arch32i":                        tests = arch32i;
        "arch32priv":                     tests = arch32priv;
        "arch32c":      if (`C_SUPPORTED) 
                          if (`ZICSR_SUPPORTED) tests = {arch32c, arch32cpriv};
                          else                  tests = {arch32c};
        "arch32m":      if (`M_SUPPORTED) tests = arch32m;
        "arch32f":      if (`F_SUPPORTED) tests = arch32f;
        "imperas32i":                     tests = imperas32i;
        "imperas32p":                     tests = imperas32p;
 //        "imperas32mmu": if (`VIRTMEM_SUPPORTED) tests = imperas32mmu;
        "imperas32f":   if (`F_SUPPORTED) tests = imperas32f;
        "imperas32m":   if (`M_SUPPORTED) tests = imperas32m;
        "imperas32a":   if (`A_SUPPORTED) tests = imperas32a;
        "imperas32c":   if (`C_SUPPORTED) tests = imperas32c;
                        else              tests = imperas32iNOc;
        "wally32i":                       tests = wally32i; // *** redo
        "wally32e":                       tests = wally32e; 
        "wally32priv":                    tests = wally32priv; // *** redo
        "imperas32periph":                  tests = imperas32periph;
      endcase
    end
    if (tests.size() == 0) begin
      $display("TEST %s not supported in this configuration", TEST);
      $stop;
    end
  end
  string signame, memfilename, pathname;
  logic [31:0] GPIOPinsIn, GPIOPinsOut, GPIOPinsEn;
  logic UARTSin, UARTSout;
  logic SDCCLK;
  logic      SDCCmdIn;
  logic      SDCCmdOut;
  logic      SDCCmdOE;
  logic [3:0] SDCDatIn;
  logic             HREADY;
  logic 	    HSELEXT;
  // instantiate device to be tested
  assign GPIOPinsIn = 0;
  assign UARTSin = 1;
  assign HREADYEXT = 1;
  assign HRESPEXT = 0;
  assign HRDATAEXT = 0;
  wallypipelinedsoc dut(.clk, .reset_ext, .reset, .HRDATAEXT,.HREADYEXT, .HRESPEXT,.HSELEXT,
                        .HCLK, .HRESETn, .HADDR, .HWDATA, .HWRITE, .HSIZE, .HBURST, .HPROT,
                        .HTRANS, .HMASTLOCK, .HREADY, .TIMECLK(1'b0), .GPIOPinsIn, .GPIOPinsOut, .GPIOPinsEn,
                        .UARTSin, .UARTSout, .SDCCmdIn, .SDCCmdOut, .SDCCmdOE, .SDCDatIn, .SDCCLK); 
  // Track names of instructions
  instrTrackerTB it(clk, reset, dut.core.ieu.dp.FlushE,
                dut.core.ifu.FinalInstrRawF[31:0],
                dut.core.ifu.InstrD, dut.core.ifu.InstrE,
                dut.core.ifu.InstrM,  InstrW,
                InstrFName, InstrDName, InstrEName, InstrMName, InstrWName);
  // initialize tests
  localparam integer 	   MemStartAddr = `RAM_BASE>>(1+`XLEN/32);
  localparam integer 	   MemEndAddr = (`RAM_RANGE+`RAM_BASE)>>1+(`XLEN/32);
  initial
    begin
      test = 1;
      totalerrors = 0;
      testadr = 0;
      // fill memory with defined values to reduce Xs in simulation
      // Quick note the memory will need to be initialized.  The C library does not
      //  guarantee the  initialized reads.  For example a strcmp can read 6 byte
      //  strings, but uses a load double to read them in.  If the last 2 bytes are
      //  not initialized the compare results in an 'x' which propagates through 
      // the design.
      if (TEST == "coremark") 
        for (i=MemStartAddr; i<MemEndAddr; i = i+1) 
          dut.uncore.ram.ram.RAM[i] = 64'h0; 
      // read test vectors into memory
      pathname = tvpaths[tests[0].atoi()];
 /*      if (tests[0] == `IMPERASTEST)
        pathname = tvpaths[0];
      else pathname = tvpaths[1]; */
      memfilename = {pathname, tests[test], ".elf.memfile"};
      if (`IMEM == `MEM_TIM) $readmemh(memfilename, dut.core.ifu.irom.ram.RAM);
      else              $readmemh(memfilename, dut.uncore.ram.RAM);
      if (`DMEM == `MEM_TIM) $readmemh(memfilename, dut.core.lsu.dtim.ram.RAM);
      ProgramAddrMapFile = {pathname, tests[test], ".elf.objdump.addr"};
      ProgramLabelMapFile = {pathname, tests[test], ".elf.objdump.lab"};
      $display("Read memfile %s", memfilename);
      reset_ext = 1; # 42; reset_ext = 0;
    end
  // generate clock to sequence tests
  always
    begin
      clk = 1; # 5; clk = 0; # 5;
      // if ($time % 100000 == 0) $display("Time is %0t", $time);
    end
  // check results
  always @(negedge clk)
    begin    
      if (TEST == "coremark")
        if (dut.core.priv.priv.ecallM) begin
          $display("Benchmark: coremark is done.");
          $stop;
        end
      if (DCacheFlushDone) begin
        #600; // give time for instructions in pipeline to finish
        // clear signature to prevent contamination from previous tests
        for(i=0; i<SIGNATURESIZE; i=i+1) begin
          sig32[i] = 'bx;
        end
        // read signature, reformat in 64 bits if necessary
        signame = {pathname, tests[test], ".signature.output"};
        $readmemh(signame, sig32);
        i = 0;
        while (i < SIGNATURESIZE) begin
          if (`XLEN == 32) begin
            signature[i] = sig32[i];
            i = i+1;
          end else begin
            signature[i/2] = {sig32[i+1], sig32[i]};
            i = i + 2;
          end
          if (i >= 4 & sig32[i-4] === 'bx) begin
            if (i == 4) begin
              i = SIGNATURESIZE+1; // flag empty file
              $display("  Error: empty test file");
            end else i = SIGNATURESIZE; // skip over the rest of the x's for efficiency
          end
        end
        // Check errors
        errors = (i == SIGNATURESIZE+1); // error if file is empty
        i = 0;
        testadr = (`RAM_BASE+tests[test+1].atohex())/(`XLEN/8);
        /* verilator lint_off INFINITELOOP */
        while (signature[i] !== 'bx) begin
          logic [`XLEN-1:0] sig;
          if (`DMEM == `MEM_TIM) sig = dut.core.lsu.dtim.ram.RAM[testadr+i];
          else                   sig = dut.uncore.ram.RAM[testadr+i];
 //          $display("signature[%h] = %h sig = %h", i, signature[i], sig);
          if (signature[i] !== sig &
          //if (signature[i] !== dut.core.lsu.dtim.ram.RAM[testadr+i] &
 	      (signature[i] !== DCacheFlushFSM.ShadowRAM[testadr+i])) begin  // ***i+1?
            if ((signature[i] !== '0 | signature[i+4] !== 'x)) begin
 //            if (signature[i+4] !== 'bx | (signature[i] !== 32'hFFFFFFFF & signature[i] !== 32'h00000000)) begin
              // report errors unless they are garbage at the end of the sim
              // kind of hacky test for garbage right now
              $display("sig4 = %h ne %b", signature[i+4], signature[i+4] !== 'bx);
              errors = errors+1;
              $display("  Error on test %s result %d: adr = %h sim (D$) %h sim (DMEM) = %h, signature = %h", 
                    tests[test], i, (testadr+i)*(`XLEN/8), DCacheFlushFSM.ShadowRAM[testadr+i], sig, signature[i]);
                    //   tests[test], i, (testadr+i)*(`XLEN/8), DCacheFlushFSM.ShadowRAM[testadr+i], dut.core.lsu.dtim.ram.RAM[testadr+i], signature[i]);
              $stop;//***debug
            end
          end
          i = i + 1;
        end
        /* verilator lint_on INFINITELOOP */
        if (errors == 0) begin
          $display("%s succeeded.  Brilliant!!!", tests[test]);
        end
        else begin
          $display("%s failed with %d errors. :(", tests[test], errors);
          totalerrors = totalerrors+1;
        end
        test = test + 2;
        if (test == tests.size()) begin
          if (totalerrors == 0) $display("SUCCESS! All tests ran without failures.");
          else $display("FAIL: %d test programs had errors", totalerrors);
          $stop;
        end
        else begin
            //pathname = tvpaths[tests[0]];
            memfilename = {pathname, tests[test], ".elf.memfile"};
            //$readmemh(memfilename, dut.uncore.ram.ram.RAM);
            if (`IMEM == `MEM_TIM) $readmemh(memfilename, dut.core.ifu.irom.ram.RAM);
            else                   $readmemh(memfilename, dut.uncore.ram.RAM);
            if (`DMEM == `MEM_TIM) $readmemh(memfilename, dut.core.lsu.dtim.ram.RAM);
            ProgramAddrMapFile = {pathname, tests[test], ".elf.objdump.addr"};
            ProgramLabelMapFile = {pathname, tests[test], ".elf.objdump.lab"};
            $display("Read memfile %s", memfilename);
            reset_ext = 1; # 47; //reset_ext = 0;
        end
      end
    end // always @ (negedge clk)
  // track the current function or global label
  if (DEBUG == 1) begin : FunctionName
    FunctionName FunctionName(.reset(reset),
 			      .clk(clk),
 			      .ProgramAddrMapFile(ProgramAddrMapFile),
 			      .ProgramLabelMapFile(ProgramLabelMapFile));
  end
  // Termination condition
  // terminate on a specific ECALL after li x3,1 for old Imperas tests,  *** remove this when old imperas tests are removed
  // or sw	gp,-56(t0) for new Imperas tests
  // or sd gp, -56(t0) 
  // or on a jump to self infinite loop (6f) for RISC-V Arch tests
  logic ecf; // remove this once we don't rely on old Imperas tests with Ecalls
  if (`ZICSR_SUPPORTED) assign ecf = dut.core.priv.priv.EcallFaultM;
  else                  assign ecf = 0;
  assign DCacheFlushStart = ecf & 
 			    (dut.core.ieu.dp.regf.rf[3] == 1 | 
 			     (dut.core.ieu.dp.regf.we3 & 
 			      dut.core.ieu.dp.regf.a3 == 3 & 
 			      dut.core.ieu.dp.regf.wd3 == 1)) |
          (dut.core.ifu.InstrM == 32'h6f | dut.core.ifu.InstrM == 32'hfc32a423 | dut.core.ifu.InstrM == 32'hfc32a823) & dut.core.ieu.c.InstrValidM;
  DCacheFlushFSM DCacheFlushFSM(.clk(clk),
    			.reset(reset),
 	    		.start(DCacheFlushStart),
 		    	.done(DCacheFlushDone));
  // initialize the branch predictor
  if (`BPRED_ENABLED == 1) 
    initial begin
      $readmemb(`TWO_BIT_PRELOAD, dut.core.ifu.bpred.bpred.Predictor.DirPredictor.PHT.mem);
      $readmemb(`BTB_PRELOAD, dut.core.ifu.bpred.bpred.TargetPredictor.memory.mem);    
    end 
 endmodule
 module riscvassertions;
  initial begin
    assert (`PMP_ENTRIES == 0 | `PMP_ENTRIES==16 | `PMP_ENTRIES==64) else $error("Illegal number of PMP entries: PMP_ENTRIES must be 0, 16, or 64");
    assert (`S_SUPPORTED | `VIRTMEM_SUPPORTED == 0) else $error("Virtual memory requires S mode support");
    assert (`DIV_BITSPERCYCLE == 1 | `DIV_BITSPERCYCLE==2 | `DIV_BITSPERCYCLE==4) else $error("Illegal number of divider bits/cycle: DIV_BITSPERCYCLE must be 1, 2, or 4");
    assert (`F_SUPPORTED | ~`D_SUPPORTED) else $error("Can't support double (D) without supporting float (F)");
    assert (`I_SUPPORTED ^ `E_SUPPORTED) else $error("Exactly one of I and E must be supported");
    assert (`XLEN == 64 | ~`D_SUPPORTED) else $error("Wally does not yet support D extensions on RV32");
    assert (`DCACHE_WAYSIZEINBYTES <= 4096 | (`DMEM != `MEM_CACHE) | `VIRTMEM_SUPPORTED == 0) else $error("DCACHE_WAYSIZEINBYTES cannot exceed 4 KiB when caches and vitual memory is enabled (to prevent aliasing)");
    assert (`DCACHE_LINELENINBITS >= 128 | (`DMEM != `MEM_CACHE)) else $error("DCACHE_LINELENINBITS must be at least 128 when caches are enabled");
    assert (`DCACHE_LINELENINBITS < `DCACHE_WAYSIZEINBYTES*8) else $error("DCACHE_LINELENINBITS must be smaller than way size");
    assert (`ICACHE_WAYSIZEINBYTES <= 4096 | (`IMEM != `MEM_CACHE) | `VIRTMEM_SUPPORTED == 0) else $error("ICACHE_WAYSIZEINBYTES cannot exceed 4 KiB when caches and vitual memory is enabled (to prevent aliasing)");
    assert (`ICACHE_LINELENINBITS >= 32 | (`IMEM != `MEM_CACHE)) else $error("ICACHE_LINELENINBITS must be at least 32 when caches are enabled");
    assert (`ICACHE_LINELENINBITS < `ICACHE_WAYSIZEINBYTES*8) else $error("ICACHE_LINELENINBITS must be smaller than way size");
    assert (2**$clog2(`DCACHE_LINELENINBITS) == `DCACHE_LINELENINBITS | (`DMEM != `MEM_CACHE)) else $error("DCACHE_LINELENINBITS must be a power of 2");
    assert (2**$clog2(`DCACHE_WAYSIZEINBYTES) == `DCACHE_WAYSIZEINBYTES | (`DMEM != `MEM_CACHE)) else $error("DCACHE_WAYSIZEINBYTES must be a power of 2");
    assert (2**$clog2(`ICACHE_LINELENINBITS) == `ICACHE_LINELENINBITS | (`IMEM != `MEM_CACHE)) else $error("ICACHE_LINELENINBITS must be a power of 2");
    assert (2**$clog2(`ICACHE_WAYSIZEINBYTES) == `ICACHE_WAYSIZEINBYTES | (`IMEM != `MEM_CACHE)) else $error("ICACHE_WAYSIZEINBYTES must be a power of 2");
    assert (2**$clog2(`ITLB_ENTRIES) == `ITLB_ENTRIES | `VIRTMEM_SUPPORTED==0) else $error("ITLB_ENTRIES must be a power of 2");
    assert (2**$clog2(`DTLB_ENTRIES) == `DTLB_ENTRIES | `VIRTMEM_SUPPORTED==0) else $error("DTLB_ENTRIES must be a power of 2");
    assert (`RAM_RANGE >= 56'h07FFFFFF) else $warning("Some regression tests will fail if RAM_RANGE is less than 56'h07FFFFFF");
 	  assert (`ZICSR_SUPPORTED == 1 | (`PMP_ENTRIES == 0 & `VIRTMEM_SUPPORTED == 0)) else $error("PMP_ENTRIES and VIRTMEM_SUPPORTED must be zero if ZICSR not supported.");
    assert (`ZICSR_SUPPORTED == 1 | (`S_SUPPORTED == 0 & `U_SUPPORTED == 0)) else $error("S and U modes not supported if ZISR not supported");
    assert (`U_SUPPORTED | (`S_SUPPORTED == 0)) else $error ("S mode only supported if U also is supported");
 //    assert (`MEM_DCACHE == 0 | `MEM_DTIM == 0) else $error("Can't simultaneously have a data cache and TIM");
    assert (`DMEM == `MEM_CACHE | `VIRTMEM_SUPPORTED ==0) else $error("Virtual memory needs dcache");
    assert (`IMEM == `MEM_CACHE | `VIRTMEM_SUPPORTED ==0) else $error("Virtual memory needs icache");
  end
 endmodule
 /* verilator lint_on STMTDLY */
 /* verilator lint_on WIDTH */
 module DCacheFlushFSM
  (input logic clk,
   input logic reset,
   input logic start,
   output logic done);
  genvar adr;
  logic [`XLEN-1:0] ShadowRAM[`RAM_BASE>>(1+`XLEN/32):(`RAM_RANGE+`RAM_BASE)>>1+(`XLEN/32)];
 	if(`DMEM == `MEM_CACHE) begin
 	  localparam integer numlines = testbench.dut.core.lsu.bus.dcache.dcache.NUMLINES;
 	  localparam integer numways = testbench.dut.core.lsu.bus.dcache.dcache.NUMWAYS;
 	  localparam integer linebytelen = testbench.dut.core.lsu.bus.dcache.dcache.LINEBYTELEN;
 	  localparam integer numwords = testbench.dut.core.lsu.bus.dcache.dcache.LINELEN/`XLEN;  
 	  localparam integer lognumlines = $clog2(numlines);
 	  localparam integer loglinebytelen = $clog2(linebytelen);
 	  localparam integer lognumways = $clog2(numways);
 	  localparam integer tagstart = lognumlines + loglinebytelen;
 	  genvar 			 index, way, cacheWord;
 	  logic [`XLEN-1:0]  CacheData [numways-1:0] [numlines-1:0] [numwords-1:0];
 	  logic [`XLEN-1:0]  CacheTag [numways-1:0] [numlines-1:0] [numwords-1:0];
 	  logic 			 CacheValid  [numways-1:0] [numlines-1:0] [numwords-1:0];
 	  logic 			 CacheDirty  [numways-1:0] [numlines-1:0] [numwords-1:0];
 	  logic [`PA_BITS-1:0] CacheAdr [numways-1:0] [numlines-1:0] [numwords-1:0];
      for(index = 0; index < numlines; index++) begin
 		for(way = 0; way < numways; way++) begin
 		  for(cacheWord = 0; cacheWord < numwords; cacheWord++) begin
 			copyShadow #(.tagstart(tagstart),
 						 .loglinebytelen(loglinebytelen))
 			copyShadow(.clk,
 					   .start,
 					   .tag(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].CacheTagMem.StoredData[index]),
 					   .valid(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].ValidBits[index]),
 					   .dirty(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].DirtyBits[index]),
 					   .data(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].word[cacheWord].CacheDataMem.StoredData[index]),
 					   .index(index),
 					   .cacheWord(cacheWord),
 					   .CacheData(CacheData[way][index][cacheWord]),
 					   .CacheAdr(CacheAdr[way][index][cacheWord]),
 					   .CacheTag(CacheTag[way][index][cacheWord]),
 					   .CacheValid(CacheValid[way][index][cacheWord]),
 					   .CacheDirty(CacheDirty[way][index][cacheWord]));
 		  end
 		end
      end
 	  integer i, j, k;
 	  always @(posedge clk) begin
 		if (start) begin #1
 		  #1
 			for(i = 0; i < numlines; i++) begin
 			  for(j = 0; j < numways; j++) begin
 				for(k = 0; k < numwords; k++) begin
 				  if (CacheValid[j][i][k] & CacheDirty[j][i][k]) begin
 					ShadowRAM[CacheAdr[j][i][k] >> $clog2(`XLEN/8)] = CacheData[j][i][k];
 				  end
 				end	
 			  end
 			end
 		end
 	  end
 	end
  flop #(1) doneReg(.clk, .d(start), .q(done));
 endmodule
 module copyShadow
  #(parameter tagstart, loglinebytelen)
  (input logic clk,
   input logic 			     start,
   input logic [`PA_BITS-1:tagstart] tag,
   input logic 			     valid, dirty,
   input logic [`XLEN-1:0] 	     data,
   input logic [32-1:0] 	     index,
   input logic [32-1:0] 	     cacheWord,
   output logic [`XLEN-1:0] 	     CacheData,
   output logic [`PA_BITS-1:0] 	     CacheAdr,
   output logic [`XLEN-1:0] 	     CacheTag,
   output logic 		     CacheValid,
   output logic 		     CacheDirty);
  always_ff @(posedge clk) begin
    if(start) begin
      CacheTag = tag;
      CacheValid = valid;
      CacheDirty = dirty;
      CacheData = data;
      CacheAdr = (tag << tagstart) + (index << loglinebytelen) + (cacheWord << $clog2(`XLEN/8));
    end
  end
 endmodule		      
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@ -15,6 +15,7 @@ export MAXCORES ?= 4
 # MAXOPT turns on flattening, boundary optimization, and retiming
 # The output netlist is hard to interpret, but significantly better PPA
 export MAXOPT ?= 0
 export DRIVE ?= FLOP
 time := $(shell date +%F-%H-%M)
 hash := $(shell git rev-parse --short HEAD)
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-gpio-01.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-gpio-01.reference_output
@ -1,7 +1,23 @@
 00000000 # test reset to zero
 00000000
-00000000
+A5A5A5A5 # test output pins
 A5A5A5A5
 5A5AFFFF
-00000000
+00000000 # test input enables
 5A5A0000
-A55A0000
+A55A0000 # test XOR
 A55A0000 # Test interrupt pending bits: high_ip
 5AA5FFFF #   low_ip
 00000000 #   rise_ip
 00000000 #   fall_ip
 A4AA0000 #   input_val
 A5FA0000 #   high_ip
 5BF5FFFF #   low_ip
 00A00000 #   rise_ip
 01500000 #   fall_ip
 00000000 #   MEIP
 00000000 # Test interrupts can be enabled without being triggered: MIP = 0
 00000000 #   MIP = 0
 00000000 #   MIP = 0
 00000000 #   MIP = 0
 00000800 #  Test interrupts can be enabled and triggered: MEIP set
 00000000 #   MEIP = 0
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h
@ -827,6 +827,28 @@ trap_handler_end_\MODE\(): // place to jump to so we can skip the trap handler a
    addi a6, a6, 4 
 .endm
 // Place this macro in peripheral tests to setup all the PLIC registers to generate external interrupts
 // The macro emits no instructions: it defines PLIC address constants with .equ and then emits six
 // 3-word .4byte records of the form (address, value, write32_test), so it must be expanded inside a
 // table of such records (WALLY-gpio-01.S invokes it inside test_cases).
 // NOTE(review): presumably each record is consumed by the test loop as "write value to address" - confirm against write32_test.
 .macro SETUP_PLIC  
    # Setup PLIC with a series of register writes
    # Address constants (PLIC_INTPENDING0, PLIC_CLAIM0 and PLIC_CLAIM1 are defined here but not used by the records below)
    .equ PLIC_INTPRI_GPIO, 0x0C00000C       # GPIO is interrupt 3
    .equ PLIC_INTPRI_UART, 0x0C000028       # UART is interrupt 10
    .equ PLIC_INTPENDING0, 0x0C001000       # intPending0 register
    .equ PLIC_INTEN00,     0x0C002000       # interrupt enables for context 0 (machine mode) sources 31:1
    .equ PLIC_INTEN10,     0x0C002080       # interrupt enables for context 1 (supervisor mode) sources 31:1
    .equ PLIC_THRESH0,     0x0C200000       # Priority threshold for context 0 (machine mode)
    .equ PLIC_CLAIM0,      0x0C200004       # Claim/Complete register for context 0
    .equ PLIC_THRESH1,     0x0C201000       # Priority threshold for context 1 (supervisor mode)
    .equ PLIC_CLAIM1,      0x0C201004       # Claim/Complete register for context 1
    # Register-write records: thresholds first, then source priorities, then per-context enables
    .4byte PLIC_THRESH0, 0, write32_test    # Set PLIC machine mode interrupt threshold to 0 to accept all interrupts
    .4byte PLIC_THRESH1, 7, write32_test    # Set PLIC supervisor mode interrupt threshold to 7 to accept no interrupts
    .4byte PLIC_INTPRI_GPIO, 7, write32_test # Set GPIO to high priority
    .4byte PLIC_INTPRI_UART, 7, write32_test # Set UART to high priority
    .4byte PLIC_INTEN00, 0xFFFFFFFF, write32_test # Enable all interrupt sources for machine mode
    .4byte PLIC_INTEN10, 0x00000000, write32_test # Disable all interrupt sources for supervisor mode
 .endm
 .macro END_TESTS
    // invokes one final ecall to return to machine mode then terminates this program, so the output is
    //      0x8: termination called from U mode
@ -937,6 +959,20 @@ read08_test:
    addi a6, a6, 4
    j test_loop // go to next test case
 readmip_test:  // read the MIP into the signature
    csrr t2, mip        // t2 = current value of the mip CSR
    sw t2, 0(t1)        // store it at the address held in t1
    addi t1, t1, 4      // advance t1 by one 32-bit word
    addi a6, a6, 4      // advance a6 by 4 as well; NOTE(review): the role of a6 is not visible here, confirm against test_loop
    j test_loop // go to next test case
 readsip_test:  // read the SIP into the signature
    csrr t2, sip        // t2 = current value of the sip CSR
    sw t2, 0(t1)        // store it at the address held in t1
    addi t1, t1, 4      // advance t1 by one 32-bit word
    addi a6, a6, 4      // advance a6 by 4 as well; NOTE(review): the role of a6 is not visible here, confirm against test_loop
    j test_loop // go to next test case
 goto_s_mode:
    // return to address in t3, 
    li a0, 3 // Trap handler behavior (go to supervisor mode)
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-gpio-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-gpio-01.S
@ -72,6 +72,7 @@ test_cases:
 .4byte input_val, 0x00000000, read32_test  # input_val reset to zero
 .4byte input_en, 0x00000000, read32_test  # input_en reset to zero
 # *** add more
 # =========== Test output and input pins ===========
@ -86,14 +87,49 @@ test_cases:
 .4byte input_en, 0x00000000, write32_test       # disable all input pins
 .4byte input_val, 0x00000000, read32_test       # read 0 since input pins are disabled
 .4byte input_en, 0xFFFF0000, write32_test       # enable a few input pins
-.4byte input_val, 0x5A5A0000, read32_test      # read part of pattern set above.
+.4byte input_val, 0x5A5A0000, read32_test       # read part of pattern set above.
 # =========== Test output enables(?) ===========
 .4byte output_en, 0xFFFFFFFF, write32_test      # undo changes made to output enable
 # =========== Test XOR functionality ===========
 .4byte out_xor, 0xFF00FF00, write32_test        # invert certain pin values
-.4byte input_val, 0xA55A0000, read32_test           # read inverted pins and verify input enable is working
+.4byte input_val, 0xA55A0000, read32_test       # read inverted pins and verify input enable is working
 # =========== Test Interrupt Pending bits ===========
 SETUP_PLIC
 .4byte low_ip, 0xFFFFFFFF, write32_test             # clear pending low interrupts
 .4byte high_ip, 0xFFFFFFFF, write32_test            # clear pending high interrupts
 .4byte rise_ip, 0xFFFFFFFF, write32_test            # clear pending rise interrupts
 .4byte fall_ip, 0xFFFFFFFF, write32_test            # clear pending fall interrupts
 .4byte high_ip, 0xA55A0000, read32_test             # high pending re-asserts for pins currently high (input_val = 0xA55A0000)
 .4byte low_ip, 0x5AA5FFFF, read32_test              # low pending re-asserts for pins currently low (~0xA55A0000)
 .4byte rise_ip, 0x00000000, read32_test             # no edges since the clear, so no rise pending
 .4byte fall_ip, 0x00000000, read32_test             # no edges since the clear, so no fall pending
 .4byte output_val, 0x5BAA000F, write32_test         # change output pattern to check rise/fall interrupts
 .4byte input_val, 0xA4AA0000, read32_test           # (0x5BAA000F ^ out_xor 0xFF00FF00) masked by input_en 0xFFFF0000
 .4byte high_ip, 0xA5FA0000, read32_test             # pending bits accumulate: 0xA55A0000 (before) | 0xA4AA0000 (now); must fit in 32 bits
 .4byte low_ip, 0x5BF5FFFF, read32_test              # accumulates likewise: 0x5AA5FFFF (before) | 0x5B55FFFF (~0xA4AA0000)
 .4byte rise_ip, 0x00A00000, read32_test             # bits that went 0->1: 0xA4AA0000 & ~0xA55A0000
 .4byte fall_ip, 0x01500000, read32_test             # bits that went 1->0: 0xA55A0000 & ~0xA4AA0000
 .4byte 0x0, 0x00000000, readmip_test                # Check no external interrupt has been generated
 # =========== Test interrupts can be enabled without being triggered ===========
 .4byte high_ie, 0x00010000, write32_test            # enable high interrupt on bit 16, no pending interrupt
 .4byte 0x0, 0x00000000, readmip_test                # No external interrupt should be pending
 .4byte low_ie, 0x00020000, write32_test             # enable low interrupt on bit 17, no pending interrupt
 .4byte 0x0, 0x00000000, readmip_test                # No external interrupt should be pending
 .4byte rise_ie, 0x00010000, write32_test            # enable rise interrupt on bit 16, no pending interrupt
 .4byte 0x0, 0x00000000, readmip_test                # No external interrupt should be pending
 .4byte fall_ie, 0x00010000, write32_test            # enable fall interrupt on bit 16, no pending interrupt
 .4byte 0x0, 0x00000000, readmip_test                # No external interrupt should be pending
 # =========== Test interrupts can be enabled and triggered ===========
 .4byte high_ie, 0x00020000, write32_test            # enable high interrupt on bit 17, which is pending
 .4byte 0x0, 0x00000800, readmip_test                # MEIP should be raised
 .4byte high_ie, 0x00000000, write32_test             # disable high interrupt on bit 17, which is pending
 .4byte 0x0, 0x00000000, readmip_test                # MEIP should be released
 .4byte 0x0, 0x0, terminate_test # terminate tests
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-periph.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-periph.reference_output
@ -254,12 +254,12 @@ FFFFEE00
 FFFFEE00
 00000000
 00000000
-02BEEF10
+02BEEF10 # Something here is failing
 0000000B
 80000000
 00000003
 000000FF
-FFFFFFFF
+00000000
 000000FF
 00000000
 00000000
@ -270,20 +270,20 @@ FFFFFFFF
 FFFFFF00
 00000000
 00000000
-02BEEF11
+02BEEF11 # this might be wrong
 0000000B
 80000000
 00000003 
-000000CC
+00000033 # input
-CCCCCCCC
+00000000 # output
 00000000 # rise ip
 00000000 # serviced rise ip
 000000CC # fall ip
 00000000 
-00000000
+000000FF # high ip
-00000033
+00000033 # why is this 0x33?
-00000000
+FFFFFFCC # low ip
-000000FF
+FFFFFFCC # serviced low ip
 000000CC
 FFFFFF33
 FFFFFF33
 00000000
 00000000
 03BEEF12
@ -454,9 +454,9 @@ FFFFFF33
 00080000
 00080000
 00000000
 00000000 # is it this one that's failing?
 00000000
-00000000
+00080000 # failing
 00080000
 00080000
 FFFFFFFF
 FFF7FFFF
@ -478,7 +478,7 @@ FFFFFFFF
 FFFFFFFE
 00000000
 00000000
-04BEEF1E
+04BEEF1E # this might also be wrong
 00000009
 80000000
 0000000A
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-periph.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-periph.S
@ -271,7 +271,7 @@ main_code: #####
    sw t1, 0x04(t0)
    # raise all output_en
    sw t1, 0x08(t0)
-    # raise all input_en
+    # raise all rise_en
    sw t1, 0x18(t0)
    # ========== Execute Test ==========
    # set MEIE
@ -616,6 +616,9 @@ Intr02BEEF11:
    sw t1, 0x08(t0)
    # set initial output state
    sw x0, 0x0C(t0)
    # clear XOR
    li t1, 0x00000000
    sw t1, 0x40(t0)
    # clear all pending interrupts
    li t1, 0xFFFFFFFF
    sw t1, 0x1C(t0)
@ -843,7 +846,7 @@ Intr03BEEF1A:
    sw t1, 0x04(t0)
    # raise all output_en
    sw t1, 0x08(t0)
-    # raise all input_en
+    # raise all rise_en
    sw t1, 0x18(t0)
    # ========== Execute Test ==========
    # set MEIE and SEIE