Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

2025-02-11 06:05:49 +00:00 · 2022-03-22 21:28:50 -05:00 · 2022-03-22 21:28:50 -05:00 · b2487f4b72
commit b2487f4b72
parent 4ca9458534 23adb2dd03
62 changed files with 10046 additions and 671 deletions
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@ -1 +1 @@
-Subproject commit 307c77b26e070ae85ffea665ad9b642b40e33c86
+Subproject commit be67c99bd461742aa1c100bcc0732657faae2230
--- a/pipelined/config/buildroot/wally-config.vh
+++ b/pipelined/config/buildroot/wally-config.vh
@ -49,6 +49,8 @@
 `define UARCH_SINGLECYCLE 0
 `define DMEM `MEM_CACHE
 `define IMEM `MEM_CACHE
+`define DBUS 1
+`define IBUS 1
 `define VIRTMEM_SUPPORTED 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1 

--- a/pipelined/config/fpga/wally-config.vh
+++ b/pipelined/config/fpga/wally-config.vh
@ -49,6 +49,8 @@
 `define UARCH_SINGLECYCLE 0
 `define DMEM `MEM_CACHE
 `define IMEM `MEM_CACHE
+`define DBUS 1
+`define IBUS 1
 `define VIRTMEM_SUPPORTED 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1 

--- a/pipelined/config/rv32e/wally-config.vh
+++ b/pipelined/config/rv32e/wally-config.vh
@ -49,8 +49,10 @@
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
 // *** replace with MEM_BUS
-`define DMEM `MEM_BUS
-`define IMEM `MEM_BUS
+`define DMEM `MEM_NONE
+`define IMEM `MEM_NONE
+`define DBUS 1
+`define IBUS 1
 `define VIRTMEM_SUPPORTED 0
 `define VECTORED_INTERRUPTS_SUPPORTED 0 

--- a/pipelined/config/rv32gc/wally-config.vh
+++ b/pipelined/config/rv32gc/wally-config.vh
@ -49,6 +49,8 @@
 `define UARCH_SINGLECYCLE 0
 `define DMEM `MEM_CACHE
 `define IMEM `MEM_CACHE
+`define DBUS 1
+`define IBUS 1
 `define VIRTMEM_SUPPORTED 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1 

--- a/pipelined/config/rv32ic/wally-config.vh
+++ b/pipelined/config/rv32ic/wally-config.vh
@ -49,6 +49,8 @@
 `define UARCH_SINGLECYCLE 0
 `define DMEM `MEM_TIM
 `define IMEM `MEM_TIM
+`define DBUS 0
+`define IBUS 0
 `define VIRTMEM_SUPPORTED 0
 `define VECTORED_INTERRUPTS_SUPPORTED 1 

--- a/pipelined/config/rv64BP/wally-config.vh
+++ b/pipelined/config/rv64BP/wally-config.vh
@ -51,6 +51,8 @@
 `define UARCH_SINGLECYCLE 0
 `define DMEM `MEM_CACHE
 `define IMEM `MEM_CACHE
+`define DBUS 1
+`define IBUS 1
 `define VIRTMEM_SUPPORTED 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1 

--- a/pipelined/config/rv64fp/BTBPredictor.txt
+++ b/pipelined/config/rv64fp/BTBPredictor.txt
--- a/pipelined/config/rv64fp/twoBitPredictor.txt
+++ b/pipelined/config/rv64fp/twoBitPredictor.txt
--- a/pipelined/config/rv64fp/wally-config.vh
+++ b/pipelined/config/rv64fp/wally-config.vh
@ -0,0 +1,136 @
+//////////////////////////////////////////
+// wally-config.vh
+//
+// Written: David_Harris@hmc.edu 4 January 2021
+// Modified: 
+//
+// Purpose: Specify which features are configured
+//          Macros to determine which modes are supported based on MISA
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// include shared configuration
+`include "wally-shared.vh"
+
+`define FPGA 0
+`define QEMU 0
+`define DESIGN_COMPILER 0
+
+// RV32 or RV64: XLEN = 32 or 64
+`define XLEN 64
+
+// IEEE 754 compliance
+`define IEEE754 1
+
+// MISA RISC-V configuration per specification
+`define MISA (32'h00000104 | 1 << 5 | 0 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0 )
+`define ZICSR_SUPPORTED 1
+`define ZIFENCEI_SUPPORTED 1
+`define COUNTERS 32
+`define ZICOUNTERS_SUPPORTED 1
+
+/// Microarchitectural Features
+`define UARCH_PIPELINED 1
+`define UARCH_SUPERSCALR 0
+`define UARCH_SINGLECYCLE 0
+`define DMEM `MEM_CACHE
+`define IMEM `MEM_CACHE
+`define DBUS 1 // DBUS/IBUS set to 1 as in the other MEM_CACHE configurations
+`define IBUS 1
+`define VIRTMEM_SUPPORTED 1
+`define VECTORED_INTERRUPTS_SUPPORTED 1 
+
+// TLB configuration.  Entries should be a power of 2
+`define ITLB_ENTRIES 32
+`define DTLB_ENTRIES 32
+
+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more lines
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 4096
+`define DCACHE_LINELENINBITS 256
+`define ICACHE_NUMWAYS 4
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_LINELENINBITS 256
+
+// Integer Divider Configuration
+// DIV_BITSPERCYCLE must be 1, 2, or 4
+`define DIV_BITSPERCYCLE 4
+
+// Legal number of PMP entries are 0, 16, or 64
+`define PMP_ENTRIES 64
+
+// Address space
+`define RESET_VECTOR 64'h0000000080000000
+
+// Bus Interface width
+`define AHBW 64
+
+// Peripheral Physical Addresses
+// Peripheral memory space extends from BASE to BASE+RANGE
+// Range should be a thermometer code with 0's in the upper bits and 1s in the lower bits
+
+// *** each of these is `PA_BITS wide. is this parameterizable INSIDE the config file?
+`define BOOTROM_SUPPORTED 1'b1
+`define BOOTROM_BASE   56'h00001000 // spec had been 0x1000 to 0x2FFF, but dh truncated to 0x1000 to 0x1FFF because upper half seems to be all zeros and this is easier for decoder
+`define BOOTROM_RANGE  56'h00000FFF
+`define RAM_SUPPORTED 1'b1
+`define RAM_BASE       56'h80000000
+`define RAM_RANGE      56'h7FFFFFFF
+`define EXT_MEM_SUPPORTED 1'b0
+`define EXT_MEM_BASE       56'h80000000
+`define EXT_MEM_RANGE      56'h07FFFFFF
+`define CLINT_SUPPORTED 1'b1
+`define CLINT_BASE  56'h02000000
+`define CLINT_RANGE 56'h0000FFFF
+`define GPIO_SUPPORTED 1'b1
+`define GPIO_BASE   56'h10060000
+`define GPIO_RANGE  56'h000000FF
+`define UART_SUPPORTED 1'b1
+`define UART_BASE   56'h10000000
+`define UART_RANGE  56'h00000007
+`define PLIC_SUPPORTED 1'b1
+`define PLIC_BASE   56'h0C000000
+`define PLIC_RANGE  56'h03FFFFFF
+`define SDC_SUPPORTED 1'b0
+`define SDC_BASE   56'h00012100
+`define SDC_RANGE  56'h0000001F
+
+// Test modes
+
+// Tie GPIO outputs back to inputs
+`define GPIO_LOOPBACK_TEST 1
+
+// Hardware configuration
+`define UART_PRESCALE 1
+
+// Interrupt configuration
+`define PLIC_NUM_SRC 10
+// comment out the following if >=32 sources
+`define PLIC_NUM_SRC_LT_32
+`define PLIC_GPIO_ID 3
+`define PLIC_UART_ID 10
+
+`define TWO_BIT_PRELOAD "../config/rv64ic/twoBitPredictor.txt"
+`define BTB_PRELOAD "../config/rv64ic/BTBPredictor.txt"
+`define BPRED_ENABLED 1
+`define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
+`define TESTSBP 0
+
+`define REPLAY 0
+`define HPTW_WRITES_SUPPORTED 0
--- a/pipelined/config/rv64gc/wally-config.vh
+++ b/pipelined/config/rv64gc/wally-config.vh
@ -50,6 +50,8 @@
 `define UARCH_SINGLECYCLE 0
 `define DMEM `MEM_CACHE
 `define IMEM `MEM_CACHE
+`define DBUS 1
+`define IBUS 1
 `define VIRTMEM_SUPPORTED 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1 

--- a/pipelined/config/rv64ic/wally-config.vh
+++ b/pipelined/config/rv64ic/wally-config.vh
@ -50,6 +50,8 @@
 `define UARCH_SINGLECYCLE 0
 `define DMEM `MEM_TIM
 `define IMEM `MEM_TIM
+`define DBUS 0
+`define IBUS 0
 `define VIRTMEM_SUPPORTED 0
 `define VECTORED_INTERRUPTS_SUPPORTED 1 

--- a/pipelined/config/shared/wally-constants.vh
+++ b/pipelined/config/shared/wally-constants.vh
@ -50,7 +50,7 @@
 `define SV39 8
 `define SV48 9

-`define MEM_BUS 1
+`define MEM_NONE 1
 `define MEM_TIM 2
 `define MEM_CACHE 3

--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@ -50,10 +50,47 @@
 // Number of 64 bit PMP Configuration Register entries (or pairs of 32 bit entries)
 `define PMPCFG_ENTRIES (`PMP_ENTRIES/8)

+
+// Floating-point half-precision
+`define ZFH_SUPPORTED 0
+
+// Floating point constants for Quad, Double, Single, and Half precisions
+`define Q_LEN 128
+`define Q_NE 15
+`define Q_NF 112
+`define Q_BIAS 16383
+`define D_LEN 64
+`define D_NE 11
+`define D_NF 52
+`define D_BIAS 1023
+`define S_LEN 32
+`define S_NE 8
+`define S_NF 23
+`define S_BIAS 127
+`define H_LEN 16
+`define H_NE 5
+`define H_NF 10
+`define H_BIAS 15
+
 // Floating point length FLEN and number of exponent (NE) and fraction (NF) bits
-`define FLEN 64//(`Q_SUPPORTED ? 128 : `D_SUPPORTED ? 64 : 32)
-`define NE   11//(`Q_SUPPORTED ? 15 : `D_SUPPORTED ? 11 : 8)
-`define NF   52//(`Q_SUPPORTED ? 112 : `D_SUPPORTED ? 52 : 23)
+`define FLEN (`Q_SUPPORTED ? `Q_LEN  : `D_SUPPORTED ? `D_LEN  : `F_SUPPORTED ? `S_LEN  : `H_LEN)
+`define NE   (`Q_SUPPORTED ? `Q_NE   : `D_SUPPORTED ? `D_NE   : `F_SUPPORTED ? `S_NE   : `H_NE)
+`define NF   (`Q_SUPPORTED ? `Q_NF   : `D_SUPPORTED ? `D_NF   : `F_SUPPORTED ? `S_NF   : `H_NF)
+`define FMT  (`Q_SUPPORTED ? 3       : `D_SUPPORTED ? 1       : `F_SUPPORTED ? 0       : 2)
+`define BIAS (`Q_SUPPORTED ? `Q_BIAS : `D_SUPPORTED ? `D_BIAS : `F_SUPPORTED ? `S_BIAS : `H_BIAS)
+
+// Floating point constants needed for FPU parameterization
+`define FPSIZES (`Q_SUPPORTED+`D_SUPPORTED+`F_SUPPORTED+`ZFH_SUPPORTED)
+`define LEN1  ((`D_SUPPORTED & (`FLEN != `D_LEN)) ? `D_LEN   : (`F_SUPPORTED & (`FLEN != `S_LEN)) ? `S_LEN  : `H_LEN)
+`define NE1   ((`D_SUPPORTED & (`FLEN != `D_LEN)) ? `D_NE   : (`F_SUPPORTED & (`FLEN != `S_LEN)) ? `S_NE  : `H_NE)
+`define NF1   ((`D_SUPPORTED & (`FLEN != `D_LEN)) ? `D_NF  : (`F_SUPPORTED & (`FLEN != `S_LEN)) ? `S_NF : `H_NF)
+`define FMT1  ((`D_SUPPORTED & (`FLEN != `D_LEN)) ? 1        : (`F_SUPPORTED & (`FLEN != `S_LEN)) ? 0       : 2)
+`define BIAS1 ((`D_SUPPORTED & (`FLEN != `D_LEN)) ? `D_BIAS  : (`F_SUPPORTED & (`FLEN != `S_LEN)) ? `S_BIAS : `H_BIAS)
+`define LEN2  ((`F_SUPPORTED & (`LEN1 != `S_LEN)) ? `S_LEN   : `H_LEN)
+`define NE2   ((`F_SUPPORTED & (`LEN1 != `S_LEN)) ? `S_NE   : `H_NE)
+`define NF2   ((`F_SUPPORTED & (`LEN1 != `S_LEN)) ? `S_NF  : `H_NF)
+`define FMT2  ((`F_SUPPORTED & (`LEN1 != `S_LEN)) ? 0        : 2)
+`define BIAS2 ((`F_SUPPORTED & (`LEN1 != `S_LEN)) ? `S_BIAS  : `H_BIAS)

 // Disable spurious Verilator warnings

--- a/pipelined/fpu-testfloat/FMA/tbgen/tb.sv
+++ b/pipelined/fpu-testfloat/FMA/tbgen/tb.sv
@ -1,10 +1,33 @@

-//`include "../../../config/old/rv64icfd/wally-config.vh"
+`include "../../../config/old/rv64icfd/wally-config.vh"

-`define FLEN 64//(`Q_SUPPORTED ? 128 : `D_SUPPORTED ? 64 : 32)
-`define NE   11//(`Q_SUPPORTED ? 15 : `D_SUPPORTED ? 11 : 8)
-`define NF   52//(`Q_SUPPORTED ? 112 : `D_SUPPORTED ? 52 : 23)
-`define XLEN 64
+// `define FLEN (`Q_SUPPORTED ? 128 : `D_SUPPORTED ? 64 : `F_SUPPORTED ? 32 : 16)
+// `define NE   (`Q_SUPPORTED ? 15 : `D_SUPPORTED ? 11 : `F_SUPPORTED ? 8 : 5)
+// `define NF   (`Q_SUPPORTED ? 112 : `D_SUPPORTED ? 52 : `F_SUPPORTED ? 23 : 10)
+// `define FMT (`Q_SUPPORTED ? 3 : `D_SUPPORTED ? 1 : `F_SUPPORTED ? 0 : 2)
+// `define BIAS (`Q_SUPPORTED ? 16383 : `D_SUPPORTED ? 1023 : `F_SUPPORTED ? 127 : 15)
+// `define XLEN 64
+// `define IEEE754 1
+`define Q_SUPPORTED 1
+// `define D_SUPPORTED 0
+// `define F_SUPPORTED 0
+`define H_SUPPORTED 0
+`define FPSIZES ((`Q_SUPPORTED&`D_SUPPORTED&`F_SUPPORTED&`H_SUPPORTED) ? 4 : (`Q_SUPPORTED&`D_SUPPORTED&`F_SUPPORTED) | (`Q_SUPPORTED&`D_SUPPORTED&`H_SUPPORTED) | (`Q_SUPPORTED&`F_SUPPORTED&`H_SUPPORTED) | (`D_SUPPORTED&`F_SUPPORTED&`H_SUPPORTED) ? 3 : (`Q_SUPPORTED&`D_SUPPORTED) | (`Q_SUPPORTED&`F_SUPPORTED) | (`Q_SUPPORTED&`H_SUPPORTED) | (`D_SUPPORTED&`F_SUPPORTED) | (`D_SUPPORTED&`H_SUPPORTED) | (`F_SUPPORTED&`H_SUPPORTED) ? 2 : 1)
+`define LEN1  ((`D_SUPPORTED & (`FLEN !== 64)) ? 64   : (`F_SUPPORTED & (`FLEN !== 32)) ? 32  : 16)
+`define NE1   ((`D_SUPPORTED & (`FLEN !== 64)) ? 11   : (`F_SUPPORTED & (`FLEN !== 32)) ? 8   : 5)
+`define NF1   ((`D_SUPPORTED & (`FLEN !== 64)) ? 52   : (`F_SUPPORTED & (`FLEN !== 32)) ? 23  : 10)
+`define FMT1  ((`D_SUPPORTED & (`FLEN !== 64)) ? 1    : (`F_SUPPORTED & (`FLEN !== 32)) ? 0   : 2)
+`define BIAS1 ((`D_SUPPORTED & (`FLEN !== 64)) ? 1023 : (`F_SUPPORTED & (`FLEN !== 32)) ? 127 : 15)
+`define LEN2  ((`F_SUPPORTED & (`LEN1 !== 32)) ? 32   : 16)
+`define NE2   ((`F_SUPPORTED & (`LEN1 !== 32)) ? 8    : 5)
+`define NF2   ((`F_SUPPORTED & (`LEN1 !== 32)) ? 23   : 10)
+`define FMT2  ((`F_SUPPORTED & (`LEN1 !== 32)) ? 0    : 2)
+`define BIAS2 ((`F_SUPPORTED & (`LEN1 !== 32)) ? 127  : 15)
+`define LEN3 16
+`define NE3 5//make constants for the constants i.e. 11/8/5 etc.
+`define NF3 10 // always support less than max - maybe halves
+`define FMT3 2
+`define BIAS3 15
 module testbench3();

 logic [31:0] errors=0;
@ -15,33 +38,17 @@ module testbench3();
 logic 	[`FLEN-1:0]		ans;
 logic 	[7:0]	 	flags;
 logic 	[2:0]		FrmE;
- logic				FmtE;
+ logic	[`FPSIZES/3:0]			FmtE;
 logic  [`FLEN-1:0]      FMAResM;
 logic  [4:0]       FMAFlgM;
-integer fp;
 logic 	[2:0]		FOpCtrlE;
 logic 		[2*`NF+1:0]		ProdManE; 
 logic 		[3*`NF+5:0]		AlignedAddendE;	
 logic 		[`NE+1:0]		ProdExpE; 
 logic 					AddendStickyE;
 logic 					KillProdE; 
-// logic					XZeroE;
-// logic					YZeroE;
-// logic					ZZeroE;
-// logic					XDenormE;
-// logic					YDenormE;
-// logic					ZDenormE;
-// logic					XInfE;
-// logic					YInfE;
-// logic					ZInfE;
-// logic					XNaNE;
-// logic					YNaNE;
-// logic					ZNaNE;

 logic wnan;
-// logic XNaNE;
-// logic YNaNE;
-// logic ZNaNE;
 logic ansnan, clk;


@ -52,88 +59,86 @@ assign FOpCtrlE = 3'b0;
 // down - 010
 // up - 011
 // nearest max mag - 100  
-assign FrmE = 3'b000;
-assign FmtE = 1'b1;
+assign FrmE = 3'b010;
+assign FmtE = (`FPSIZES/3+1)'(1);

    logic  [`FLEN-1:0] X, Y, Z;
    // logic         FmtE;
    // logic  [2:0]  FOpCtrlE;
    logic        XSgnE, YSgnE, ZSgnE;
    logic [`NE-1:0] XExpE, YExpE, ZExpE;
-    logic [`NF-1:0] XFracE, YFracE, ZFracE;
-    logic        XAssumed1E, YAssumed1E, ZAssumed1E;
+    logic [`NF:0] XManE, YManE, ZManE;
    logic XNormE;
+    logic XExpMaxE;
    logic XNaNE, YNaNE, ZNaNE;
    logic XSNaNE, YSNaNE, ZSNaNE;
    logic XDenormE, YDenormE, ZDenormE;
    logic XZeroE, YZeroE, ZZeroE;
    logic [`NE-1:0] BiasE;
    logic XInfE, YInfE, ZInfE;
-    logic XExpMaxE;
- //***rename to make significand = 1.frac m = significand
-    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
-    logic           XExpZero, YExpZero, ZExpZero; // input exponent zero
    logic [`FLEN-1:0]    Addend; // value to add (Z or zero)
-    logic           YExpMaxE, ZExpMaxE;  // input exponent all 1s
+    logic           YExpMaxE, ZExpMaxE, Mult;  // input exponent all 1s

-    assign Addend = FOpCtrlE[2] ? (`FLEN)'(0) : Z; // Z is only used in the FMA, and is set to Zero if a multiply opperation
-    assign XSgnE = FmtE ? X[`FLEN-1] : X[31];
-    assign YSgnE = FmtE ? Y[`FLEN-1] : Y[31];
-    assign ZSgnE = FmtE ? Addend[`FLEN-1] : Addend[31];
+	assign Mult = 1'b0;
+  unpacking unpacking(.*);

-    assign XExpE = FmtE ? X[62:52] : {X[30], {3{~X[30]&~XExpZero|XExpMaxE}}, X[29:23]}; 
-    assign YExpE = FmtE ? Y[62:52] : {Y[30], {3{~Y[30]&~YExpZero|YExpMaxE}}, Y[29:23]}; 
-    assign ZExpE = FmtE ? Addend[62:52] : {Addend[30], {3{~Addend[30]&~ZExpZero|ZExpMaxE}}, Addend[29:23]}; 
+// assign	wnan = XNaNE|YNaNE|ZNaNE; 
+// assign	ansnan = FmtE ? &ans[`FLEN-2:`NF] && |ans[`NF-1:0] : &ans[30:23] && |ans[22:0]; 
+ 
+    if (`FPSIZES === 1) begin // one format: NaN = exponent field all ones and fraction nonzero
+      assign ansnan = &ans[`FLEN-2:`NF]&(|ans[`NF-1:0]);
+      assign wnan = &FMAResM[`FLEN-2:`NF]&(|FMAResM[`NF-1:0]);
+    end else if (`FPSIZES === 2) begin // two formats: FmtE=1 selects the FLEN-wide layout, FmtE=0 the LEN1 layout
+      assign ansnan = FmtE ? &ans[`FLEN-2:`NF]&(|ans[`NF-1:0]) : &ans[`LEN1-2:`NF1]&(|ans[`NF1-1:0]);
+      assign wnan = FmtE ? &FMAResM[`FLEN-2:`NF]&(|FMAResM[`NF-1:0]) : &FMAResM[`LEN1-2:`NF1]&(|FMAResM[`NF1-1:0]);
+    end else if (`FPSIZES === 3) begin // three formats: pick the field layout from the FmtE encoding
+        always_comb begin // blocking assignments; procedural "assign" is deprecated inside always blocks
+            case (FmtE)
+                `FMT: begin                  
+                  ansnan = &ans[`FLEN-2:`NF]&(|ans[`NF-1:0]);
+                  wnan = &FMAResM[`FLEN-2:`NF]&(|FMAResM[`NF-1:0]);
+
-    assign XFracE = FmtE ? X[`NF-1:0] : {X[22:0], 29'b0};
-    assign YFracE = FmtE ? Y[`NF-1:0] : {Y[22:0], 29'b0};
-    assign ZFracE = FmtE ? Addend[`NF-1:0] : {Addend[22:0], 29'b0};
+                end
+                `FMT1: begin                    
+                  ansnan = &ans[`LEN1-2:`NF1]&(|ans[`NF1-1:0]);
+                  wnan = &FMAResM[`LEN1-2:`NF1]&(|FMAResM[`NF1-1:0]);
+
-    assign XAssumed1E = FmtE ? |X[62:52] : |X[30:23]; 
-    assign YAssumed1E = FmtE ? |Y[62:52] : |Y[30:23];
-    assign ZAssumed1E = FmtE ? |Z[62:52] : |Z[30:23];
+                end
+                `FMT2: begin
+                    ansnan = &ans[`LEN2-2:`NF2]&(|ans[`NF2-1:0]);
+                    wnan = &FMAResM[`LEN2-2:`NF2]&(|FMAResM[`NF2-1:0]);
+                end
+                default: begin // FmtE value that matches none of the three formats
+                    ansnan = 0;
+                    wnan = 0;
+                end
+            endcase
+        end
+
-    assign XExpZero = ~XAssumed1E;
-    assign YExpZero = ~YAssumed1E;
-    assign ZExpZero = ~ZAssumed1E;
-   
-    assign XFracZero = ~|XFracE;
-    assign YFracZero = ~|YFracE;
-    assign ZFracZero = ~|ZFracE;
+    end else begin // remaining case: four formats, one case item per FmtE encoding
+        always_comb begin // blocking assignments; procedural "assign" is deprecated inside always blocks
+            case (FmtE)
+                `FMT: begin                  
+                  ansnan = &ans[`FLEN-2:`NF]&(|ans[`NF-1:0]);
+                  wnan = &FMAResM[`FLEN-2:`NF]&(|FMAResM[`NF-1:0]);
+
-    assign XExpMaxE = FmtE ? &X[62:52] : &X[30:23];
-    assign YExpMaxE = FmtE ? &Y[62:52] : &Y[30:23];
-    assign ZExpMaxE = FmtE ? &Z[62:52] : &Z[30:23];
-   
-    assign XNormE = ~(XExpMaxE|XExpZero);
-    
-    assign XNaNE = XExpMaxE & ~XFracZero;
-    assign YNaNE = YExpMaxE & ~YFracZero;
-    assign ZNaNE = ZExpMaxE & ~ZFracZero;
+                end
+                `FMT1: begin                    
+                  ansnan = &ans[`LEN1-2:`NF1]&(|ans[`NF1-1:0]);
+                  wnan = &FMAResM[`LEN1-2:`NF1]&(|FMAResM[`NF1-1:0]);
+
-    assign XSNaNE = XNaNE&~XFracE[`NF-1];
-    assign YSNaNE = YNaNE&~YFracE[`NF-1];
-    assign ZSNaNE = ZNaNE&~ZFracE[`NF-1];
-
-    assign XDenormE = XExpZero & ~XFracZero;
-    assign YDenormE = YExpZero & ~YFracZero;
-    assign ZDenormE = ZExpZero & ~ZFracZero;
-
-    assign XInfE = XExpMaxE & XFracZero;
-    assign YInfE = YExpMaxE & YFracZero;
-    assign ZInfE = ZExpMaxE & ZFracZero;
-
-    assign XZeroE = XExpZero & XFracZero;
-    assign YZeroE = YExpZero & YFracZero;
-    assign ZZeroE = ZExpZero & ZFracZero;
-
-    assign BiasE = 13'h3ff;
-
-assign	wnan = FmtE ? &FMAResM[`FLEN-2:`NF] & |FMAResM[`NF-1:0] : &FMAResM[30:23] & |FMAResM[22:0]; 
-// assign	XNaNE = FmtE ? &X[62:52] & |X[51:0] : &X[62:55] & |X[54:32]; 
-// assign	YNaNE = FmtE ? &Y[62:52] & |Y[51:0] : &Y[62:55] & |Y[54:32]; 
-// assign	ZNaNE = FmtE ? &Z[62:52] & |Z[51:0] : &Z[62:55] & |Z[54:32]; 
-assign	ansnan = FmtE ? &ans[`FLEN-2:`NF] & |ans[`NF-1:0] : &ans[30:23] & |ans[22:0]; 
+                end
+                `FMT2: begin
+                    ansnan = &ans[`LEN2-2:`NF2]&(|ans[`NF2-1:0]);
+                    wnan = &FMAResM[`LEN2-2:`NF2]&(|FMAResM[`NF2-1:0]);
+                end
+                `FMT3: begin
+                    ansnan = &ans[`LEN3-2:`NF3]&(|ans[`NF3-1:0]);
+                    wnan = &FMAResM[`LEN3-2:`NF3]&(|FMAResM[`NF3-1:0]);
+                end
+            endcase
+        end
+    end
 // instantiate device under test

    logic [3*`NF+5:0]	SumE, SumM;       
@ -141,16 +146,16 @@ assign	ansnan = FmtE ? &ans[`FLEN-2:`NF] & |ans[`NF-1:0] : &ans[30:23] & |ans[22
    logic 			    NegSumE, NegSumM;
    logic 			    ZSgnEffE, ZSgnEffM;
    logic 			    PSgnE, PSgnM;
-    logic [8:0]			NormCntE, NormCntM;
+    logic [$clog2(3*`NF+7)-1:0]			NormCntE, NormCntM;
    
-    fma1 fma1 (.XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE({XAssumed1E,XFracE}), .YManE({YAssumed1E,YFracE}), .ZManE({ZAssumed1E,ZFracE}),
+    fma1 fma1 (.XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE,
                 .XDenormE, .YDenormE, .ZDenormE,  .XZeroE, .YZeroE, .ZZeroE,
                .FOpCtrlE, .FmtE, .SumE, .NegSumE, .InvZE, .NormCntE, .ZSgnEffE, .PSgnE,
                .ProdExpE, .AddendStickyE, .KillProdE); 
-fma2 UUT2(.XSgnM(XSgnE), .YSgnM(YSgnE), .XExpM(XExpE), .YExpM(YExpE), .ZExpM(ZExpE), .XManM({XAssumed1E,XFracE}), .YManM({YAssumed1E,YFracE}), .ZManM({ZAssumed1E,ZFracE}), .XNaNM(XNaNE), .YNaNM(YNaNE), .ZNaNM(ZNaNE), .XZeroM(XZeroE), .YZeroM(YZeroE), .ZZeroM(ZZeroE), .XInfM(XInfE), .YInfM(YInfE), .ZInfM(ZInfE), .XSNaNM(XSNaNE), .YSNaNM(YSNaNE), .ZSNaNM(ZSNaNE),
+fma2 UUT2(.XSgnM(XSgnE), .YSgnM(YSgnE), .XExpM(XExpE), .YExpM(YExpE), .ZExpM(ZExpE), .XManM(XManE), .YManM(YManE), .ZManM(ZManE), .XNaNM(XNaNE), .YNaNM(YNaNE), .ZNaNM(ZNaNE), .XZeroM(XZeroE), .YZeroM(YZeroE), .ZZeroM(ZZeroE), .XInfM(XInfE), .YInfM(YInfE), .ZInfM(ZInfE), .XSNaNM(XSNaNE), .YSNaNM(YSNaNE), .ZSNaNM(ZSNaNE),
              //  .FSrcXE, .FSrcYE, .FSrcZE, .FSrcXM, .FSrcYM, .FSrcZM, 
                .KillProdM(KillProdE), .AddendStickyM(AddendStickyE), .ProdExpM(ProdExpE), .SumM(SumE), .NegSumM(NegSumE), .InvZM(InvZE), .NormCntM(NormCntE), .ZSgnEffM(ZSgnEffE), .PSgnM(PSgnE),
-               .FmtM(FmtE), .FrmM(FrmE), .FMAFlgM, .FMAResM);
+               .FmtM(FmtE), .FrmM(FrmE), .FMAFlgM, .FMAResM, .Mult);


 // produce clock
@ -168,61 +173,156 @@ fma2 UUT2(.XSgnM(XSgnE), .YSgnM(YSgnE), .XExpM(XExpE), .YExpM(YExpE), .ZExpM(ZEx
 always @(posedge clk) // load the next test vector into X, Y, Z, ans and flags
 begin
  #1; 
-  if (FmtE==1'b1) {X, Y, Z, ans, flags} = testvectors[vectornum];
-  else	begin	  X = {{32{1'b1}}, testvectors[vectornum][135:104]};
-  		  Y = {{32{1'b1}}, testvectors[vectornum][103:72]};
-  		  Z = {{32{1'b1}}, testvectors[vectornum][71:40]};
-  		  ans = {{32{1'b1}}, testvectors[vectornum][39:8]};
-  		  flags = testvectors[vectornum][7:0];
+  if (`FPSIZES === 3 | `FPSIZES === 4) begin // three or four formats: FmtE is compared as a 2-bit code
+    if (FmtE==2'b11) {X, Y, Z, ans, flags} = testvectors[vectornum]; // full-width operands fill the whole vector
+    else if (FmtE==2'b01)	begin	  // 64-bit operands: slices sit above the 8 flag bits, upper bits filled with 1s
+      X = {{`FLEN-64{1'b1}}, testvectors[vectornum][263:200]};
+      Y = {{`FLEN-64{1'b1}}, testvectors[vectornum][199:136]};
+      Z = {{`FLEN-64{1'b1}}, testvectors[vectornum][135:72]};
+      ans = {{`FLEN-64{1'b1}}, testvectors[vectornum][71:8]};
+      flags = testvectors[vectornum][7:0];
+    end
+    else if (FmtE==2'b00)	begin	  // 32-bit operands, upper bits filled with 1s
+      X = {{`FLEN-32{1'b1}}, testvectors[vectornum][135:104]};
+      Y = {{`FLEN-32{1'b1}}, testvectors[vectornum][103:72]};
+      Z = {{`FLEN-32{1'b1}}, testvectors[vectornum][71:40]};
+      ans = {{`FLEN-32{1'b1}}, testvectors[vectornum][39:8]};
+      flags = testvectors[vectornum][7:0];
+    end
+    else	begin	  // remaining FmtE value: 16-bit operands, upper bits filled with 1s
+      X = {{`FLEN-16{1'b1}}, testvectors[vectornum][71:56]};
+      Y = {{`FLEN-16{1'b1}}, testvectors[vectornum][55:40]};
+      Z = {{`FLEN-16{1'b1}}, testvectors[vectornum][39:24]};
+      ans = {{`FLEN-16{1'b1}}, testvectors[vectornum][23:8]};
+      flags = testvectors[vectornum][7:0];
+    end
+  end
+  else begin // one or two formats: FmtE is compared as a single bit
+    if (FmtE==1'b1) {X, Y, Z, ans, flags} = testvectors[vectornum]; // full-width operands fill the whole vector
+    else if (FmtE==1'b0)	begin	  // LEN1-wide operands: four LEN1 slices above the 8 flag bits, upper bits filled with 1s
+      X = {{`FLEN-`LEN1{1'b1}}, testvectors[vectornum][8+4*(`LEN1)-1:8+3*(`LEN1)]};
+      Y = {{`FLEN-`LEN1{1'b1}}, testvectors[vectornum][8+3*(`LEN1)-1:8+2*(`LEN1)]};
+      Z = {{`FLEN-`LEN1{1'b1}}, testvectors[vectornum][8+2*(`LEN1)-1:8+(`LEN1)]};
+      ans = {{`FLEN-`LEN1{1'b1}}, testvectors[vectornum][8+(`LEN1-1):8]};
+      flags = testvectors[vectornum][7:0];
+    end
  end
 end
 // check results on falling edge of clk
  always @(negedge clk) begin
- 
-	if((FmtE==1'b1) & (FMAFlgM != flags[4:0] | (!wnan & (FMAResM != ans)) | (wnan & ansnan & ~((XNaNE & (FMAResM[`FLEN-2:0] == {XExpE,1'b1,X[`NF-2:0]})) | (YNaNE & (FMAResM[`FLEN-2:0] == {YExpE,1'b1,Y[`NF-2:0]}))  | (ZNaNE & (FMAResM[`FLEN-2:0] == {ZExpE,1'b1,Z[`NF-2:0]})) | (FMAResM[`FLEN-2:0] == ans[`FLEN-2:0]))))) begin
-  //  fp = $fopen("/home/kparry/riscv-wally/pipelined/src/fpu/FMA/tbgen/results.dat","w");
-	// if((FmtE==1'b1) & (FMAFlgM != flags[4:0] | (FMAResM != ans))) begin
-        $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
-		if(FMAResM == 64'h8000000000000000) $display( "FMAResM=-zero ");
-		if(XDenormE) $display( "xdenorm ");
-		if(YDenormE) $display( "ydenorm ");
-		if(ZDenormE) $display( "zdenorm ");
-		if(FMAFlgM[4] != 0) $display( "invld ");
-		if(FMAFlgM[2] != 0) $display( "ovrflw ");
-		if(FMAFlgM[1] != 0) $display( "unflw ");
-		if(FMAResM[`FLEN] & FMAResM[`FLEN-2:`NF] == {`NE{1'b1}} & FMAResM[`NF-1:0] == 0) $display( "FMAResM=-inf ");
-		if(~FMAResM[`FLEN] & FMAResM[`FLEN-2:`NF] == {`NE{1'b1}} & FMAResM[`NF-1:0] == 0) $display( "FMAResM=+inf ");
-		if(FMAResM[`FLEN-2:`NF] == {`NE{1'b1}} & FMAResM[`NF-1:0] != 0 & ~FMAResM[`NF-1]) $display( "FMAResM=sigNaN ");
-		if(FMAResM[`FLEN-2:`NF] == {`NE{1'b1}} & FMAResM[`NF-1:0] != 0 & FMAResM[`NF-1]) $display( "FMAResM=qutNaN ");
-		if(ans[`FLEN] & ans[`FLEN-2:`NF] == {`NE{1'b1}} & ans[`NF-1:0] == 0) $display( "ans=-inf ");
-		if(~ans[`FLEN] & ans[`FLEN-2:`NF] == {`NE{1'b1}} & ans[`NF-1:0] == 0) $display( "ans=+inf ");
-		if(ans[`FLEN-2:`NF] == {`NE{1'b1}} & ans[`NF-1:0] != 0 & ~ans[`NF-1]) $display( "ans=sigNaN ");
-		if(ans[`FLEN-2:`NF] == {`NE{1'b1}} & ans[`NF-1:0] != 0 & ans[`NF-1]) $display( "ans=qutNaN ");
-        errors = errors + 1;
-	  //if (errors == 10)
-		$stop;
-    end
-    if((FmtE==1'b0)&(FMAFlgM != flags[4:0] | (!wnan & (FMAResM != ans)) | (wnan & ansnan & ~(((XNaNE & (FMAResM[30:0] == {X[30:23],1'b1,X[21:0]})) | (YNaNE & (FMAResM[30:0] == {Y[30:23],1'b1,Y[21:0]}))  | (ZNaNE & (FMAResM[30:0] == {Z[30:23],1'b1,Z[21:0]})) | (FMAResM[30:0] == ans[30:0]))) ))) begin
-        $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
-		if(FMAResM == 64'h8000000000000000) $display( "FMAResM=-zero ");
-		if(~(|X[30:23]) & |X[22:0]) $display( "xdenorm ");
-		if(~(|Y[30:23]) & |Y[22:0]) $display( "ydenorm ");
-		if(~(|Z[30:23]) & |Z[22:0]) $display( "zdenorm ");
-		if(FMAFlgM[4] != 0) $display( "invld ");
-		if(FMAFlgM[2] != 0) $display( "ovrflw ");
-		if(FMAFlgM[1] != 0) $display( "unflw ");
-		if(FMAResM == 64'hFF80000000000000) $display( "FMAResM=-inf ");
-		if(FMAResM == 64'h7F80000000000000) $display( "FMAResM=+inf ");
-		if(&FMAResM[30:23] & |FMAResM[22:0] & ~FMAResM[22]) $display( "FMAResM=sigNaN ");
-		if(&FMAResM[30:23] & |FMAResM[22:0] & FMAResM[22] ) $display( "FMAResM=qutNaN ");
-		if(ans == 64'hFF80000000000000) $display( "ans=-inf ");
-		if(ans == 64'h7F80000000000000) $display( "ans=+inf ");
-		if(&ans[30:23] & |ans[22:0] & ~ans[22] ) $display( "ans=sigNaN ");
-		if(&ans[30:23] & |ans[22:0] & ans[22]) $display( "ans=qutNaN ");
-        errors = errors + 1;
-	  if (errors == 10)
-		$stop;
-    end
+      if (`FPSIZES === 1 | `FPSIZES === 2) begin
+        if((FmtE==1'b1) & (FMAFlgM !== flags[4:0] || (!wnan && (FMAResM !== ans)) || (wnan && ansnan && ~((XNaNE && (FMAResM[`FLEN-2:0] === {X[`FLEN-2:`NF],1'b1,X[`NF-2:0]})) || (YNaNE && (FMAResM[`FLEN-2:0] === {Y[`FLEN-2:`NF],1'b1,Y[`NF-2:0]}))  || (ZNaNE && (FMAResM[`FLEN-2:0] === {Z[`FLEN-2:`NF],1'b1,Z[`NF-2:0]})) || (FMAResM[`FLEN-2:0] === ans[`FLEN-2:0]))))) begin
+        //  fp = $fopen("/home/kparry/riscv-wally/pipelined/src/fpu/FMA/tbgen/results.dat","w");
+        // if((FmtE==1'b1) & (FMAFlgM !== flags[4:0] || (FMAResM !== ans))) begin
+              $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          if(XDenormE) $display( "xdenorm ");
+          if(YDenormE) $display( "ydenorm ");
+          if(ZDenormE) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(FMAResM[`FLEN] && FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] === 0) $display( "FMAResM=-inf ");
+          if(~FMAResM[`FLEN] && FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] === 0) $display( "FMAResM=+inf ");
+          if(FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] !== 0 && ~FMAResM[`NF-1]) $display( "FMAResM=sigNaN ");
+          if(FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] !== 0 && FMAResM[`NF-1]) $display( "FMAResM=qutNaN ");
+          if(ans[`FLEN] && ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] === 0) $display( "ans=-inf ");
+          if(~ans[`FLEN] && ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] === 0) $display( "ans=+inf ");
+          if(ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] !== 0 && ~ans[`NF-1]) $display( "ans=sigNaN ");
+          if(ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] !== 0 && ans[`NF-1]) $display( "ans=qutNaN ");
+              errors = errors + 1;
+          //if (errors === 10)
+          $stop;
+          end
+          if((FmtE==1'b0)&(FMAFlgM !== flags[4:0] || (!wnan && (FMAResM !== ans)) || (wnan && ansnan && ~(((XNaNE && (FMAResM[`LEN1-2:0] === {X[`LEN1-2:`NF1],1'b1,X[`NF1-2:0]})) || (YNaNE && (FMAResM[`LEN1-2:0] === {Y[`LEN1-2:`NF1],1'b1,Y[`NF1-2:0]}))  || (ZNaNE && (FMAResM[`LEN1-2:0] === {Z[`LEN1-2:`NF1],1'b1,Z[`NF1-2:0]})) || (FMAResM[`LEN1-2:0] === ans[`LEN1-2:0]))) ))) begin
+              $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          if(~(|X[30:23]) && |X[22:0]) $display( "xdenorm ");
+          if(~(|Y[30:23]) && |Y[22:0]) $display( "ydenorm ");
+          if(~(|Z[30:23]) && |Z[22:0]) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(&FMAResM[30:23] && |FMAResM[22:0] && ~FMAResM[22]) $display( "FMAResM=sigNaN ");
+          if(&FMAResM[30:23] && |FMAResM[22:0] && FMAResM[22] ) $display( "FMAResM=qutNaN ");
+          if(&ans[30:23] && |ans[22:0] && ~ans[22] ) $display( "ans=sigNaN ");
+          if(&ans[30:23] && |ans[22:0] && ans[22]) $display( "ans=qutNaN ");
+              errors = errors + 1;
+        // if (errors === 9)
+          $stop;
+          end
+ end else begin
+   
+        if((FmtE==2'b11) & (FMAFlgM !== flags[4:0] || (!wnan && (FMAResM !== ans)) || (wnan && ansnan && ~((XNaNE && (FMAResM[`FLEN-2:0] === {X[`FLEN-2:`NF],1'b1,X[`NF-2:0]})) || (YNaNE && (FMAResM[`FLEN-2:0] === {Y[`FLEN-2:`NF],1'b1,Y[`NF-2:0]}))  || (ZNaNE && (FMAResM[`FLEN-2:0] === {Z[`FLEN-2:`NF],1'b1,Z[`NF-2:0]})) || (FMAResM[`FLEN-2:0] === ans[`FLEN-2:0]))))) begin // FLEN-wide format (FmtE==11): flags or result disagree with the test vector
+        //  fp = $fopen("/home/kparry/riscv-wally/pipelined/src/fpu/FMA/tbgen/results.dat","w");
+        // if((FmtE==1'b1) & (FMAFlgM !== flags[4:0] || (FMAResM !== ans))) begin
+              $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          if(XDenormE) $display( "xdenorm ");
+          if(YDenormE) $display( "ydenorm ");
+          if(ZDenormE) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(FMAResM[`FLEN-1] && FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] === 0) $display( "FMAResM=-inf "); // sign is bit FLEN-1; bits FLEN-2:0 are exponent and fraction
+          if(~FMAResM[`FLEN-1] && FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] === 0) $display( "FMAResM=+inf ");
+          if(FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] !== 0 && ~FMAResM[`NF-1]) $display( "FMAResM=sigNaN ");
+          if(FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] !== 0 && FMAResM[`NF-1]) $display( "FMAResM=qutNaN ");
+          if(ans[`FLEN-1] && ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] === 0) $display( "ans=-inf ");
+          if(~ans[`FLEN-1] && ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] === 0) $display( "ans=+inf ");
+          if(ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] !== 0 && ~ans[`NF-1]) $display( "ans=sigNaN ");
+          if(ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] !== 0 && ans[`NF-1]) $display( "ans=qutNaN ");
+              errors = errors + 1;
+          //if (errors === 10)
+          $stop; // halts on the first mismatch; the error-count threshold above is commented out
+          end
+          if((FmtE==2'b01)&(FMAFlgM !== flags[4:0] || (!wnan && (FMAResM !== ans)) || (wnan && ansnan && ~(((XNaNE && (FMAResM[64-2:0] === {X[64-2:52],1'b1,X[52-2:0]})) || (YNaNE && (FMAResM[64-2:0] === {Y[64-2:52],1'b1,Y[52-2:0]}))  || (ZNaNE && (FMAResM[64-2:0] === {Z[64-2:52],1'b1,Z[52-2:0]})) || (FMAResM[62:0] === ans[62:0]))) ))) begin // 64-bit format (FmtE==01): flags or result disagree with the test vector
+              $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          if(~(|X[62:52]) && |X[51:0]) $display( "xdenorm "); // 64-bit fields: exponent [62:52], fraction [51:0]
+          if(~(|Y[62:52]) && |Y[51:0]) $display( "ydenorm ");
+          if(~(|Z[62:52]) && |Z[51:0]) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(&FMAResM[62:52] && |FMAResM[51:0] && ~FMAResM[51]) $display( "FMAResM=sigNaN "); // fraction MSB [51] clear => signaling
+          if(&FMAResM[62:52] && |FMAResM[51:0] && FMAResM[51] ) $display( "FMAResM=qutNaN ");
+          if(&ans[62:52] && |ans[51:0] && ~ans[51] ) $display( "ans=sigNaN ");
+          if(&ans[62:52] && |ans[51:0] && ans[51]) $display( "ans=qutNaN ");
+              errors = errors + 1;
+        // if (errors === 9)
+          $stop; // halts on the first mismatch; the error-count threshold above is commented out
+          end
+          if((FmtE==2'b00)&(FMAFlgM !== flags[4:0] || (!wnan && (FMAResM !== ans)) || (wnan && ansnan && ~(((XNaNE && (FMAResM[32-2:0] === {X[32-2:23],1'b1,X[23-2:0]})) || (YNaNE && (FMAResM[32-2:0] === {Y[32-2:23],1'b1,Y[23-2:0]}))  || (ZNaNE && (FMAResM[32-2:0] === {Z[32-2:23],1'b1,Z[23-2:0]})) || (FMAResM[30:0] === ans[30:0]))) ))) begin // 32-bit format (FmtE==00): flags or result disagree with the test vector
+              $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          if(~(|X[30:23]) && |X[22:0]) $display( "xdenorm "); // 32-bit fields: exponent [30:23], fraction [22:0]
+          if(~(|Y[30:23]) && |Y[22:0]) $display( "ydenorm ");
+          if(~(|Z[30:23]) && |Z[22:0]) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(&FMAResM[30:23] && |FMAResM[22:0] && ~FMAResM[22]) $display( "FMAResM=sigNaN "); // fraction MSB [22] clear => signaling
+          if(&FMAResM[30:23] && |FMAResM[22:0] && FMAResM[22] ) $display( "FMAResM=qutNaN ");
+          if(&ans[30:23] && |ans[22:0] && ~ans[22] ) $display( "ans=sigNaN ");
+          if(&ans[30:23] && |ans[22:0] && ans[22]) $display( "ans=qutNaN ");
+              errors = errors + 1;
+        // if (errors === 9)
+          $stop; // halts on the first mismatch; the error-count threshold above is commented out
+          end
+          if((FmtE==2'b10)&(FMAFlgM !== flags[4:0] || (!wnan && (FMAResM !== ans)) || (wnan && ansnan && ~(((XNaNE && (FMAResM[16-2:0] === {X[16-2:10],1'b1,X[10-2:0]})) || (YNaNE && (FMAResM[16-2:0] === {Y[16-2:10],1'b1,Y[10-2:0]}))  || (ZNaNE && (FMAResM[16-2:0] === {Z[16-2:10],1'b1,Z[10-2:0]})) || (FMAResM[14:0] === ans[14:0]))) ))) begin // 16-bit format (FmtE==10): flags or result disagree with the test vector
+              $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          if(~(|X[14:10]) && |X[9:0]) $display( "xdenorm "); // 16-bit fields: exponent [14:10], fraction [9:0]
+          if(~(|Y[14:10]) && |Y[9:0]) $display( "ydenorm ");
+          if(~(|Z[14:10]) && |Z[9:0]) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(&FMAResM[14:10] && |FMAResM[9:0] && ~FMAResM[9]) $display( "FMAResM=sigNaN "); // fraction MSB [9] clear => signaling
+          if(&FMAResM[14:10] && |FMAResM[9:0] && FMAResM[9] ) $display( "FMAResM=qutNaN ");
+          if(&ans[14:10] && |ans[9:0] && ~ans[9] ) $display( "ans=sigNaN ");
+          if(&ans[14:10] && |ans[9:0] && ans[9]) $display( "ans=qutNaN ");
+              errors = errors + 1;
+        // if (errors === 9)
+          $stop; // halts on the first mismatch; the error-count threshold above is commented out
+          end
+ end
+	
 vectornum = vectornum + 1;
 if (testvectors[vectornum] === 194'bx) begin
 $display("%d tests completed with %d errors", vectornum, errors);
--- a/pipelined/fpu-testfloat/FMA/tbgen/test_gen.sh
+++ b/pipelined/fpu-testfloat/FMA/tbgen/test_gen.sh
@ -1,3 +1,3 @@
-testfloat_gen f64_mulAdd -tininessafter -n 6133248 -rnear_even  -seed 113355 -level 1 > testFloat
+testfloat_gen f128_mulAdd -tininessafter -n 6133248 -rmin  -seed 113355 -level 1 > testFloat
 tr -d ' ' < testFloat > testFloatNoSpace

--- a/pipelined/src/cache/cache.sv
+++ b/pipelined/src/cache/cache.sv
@ -30,7 +30,7 @@

 `include "wally-config.vh"

-module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, DCACHE = 1) (
+module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTERVAL, DCACHE) (
  input logic                 clk,
  input logic                 reset,
   // cpu side
@ -41,24 +41,26 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, DCACHE = 1) (
  input logic                 InvalidateCacheM,
  input logic [11:0]          NextAdr, // virtual address, but we only use the lower 12 bits.
  input logic [`PA_BITS-1:0]  PAdr, // physical address
+  input logic [(`XLEN-1)/8:0] ByteMask,
  input logic [`XLEN-1:0]     FinalWriteData,
  output logic                CacheCommitted,
  output logic                CacheStall,
   // to performance counters to cpu
  output logic                CacheMiss,
  output logic                CacheAccess,
-  output logic                save, restore,
   // lsu control
  input logic                 IgnoreRequestTLB,
-  input logic                 IgnoreRequestTrapM,                                                                    
+  input logic                 IgnoreRequestTrapM, 
  input logic                 Cacheable,
   // Bus fsm interface
  output logic                CacheFetchLine,
  output logic                CacheWriteLine,
  input logic                 CacheBusAck,
+  input logic [LOGWPL-1:0]    WordCount,
+  input logic                 LSUBusWriteCrit, 
  output logic [`PA_BITS-1:0] CacheBusAdr,
  input logic [LINELEN-1:0]   CacheBusWriteData,
-  output logic [LINELEN-1:0]  ReadDataLine);
+  output logic [WORDLEN-1:0]  ReadDataWord);

  // Cache parameters
  localparam                  LINEBYTELEN = LINELEN/8;
@ -101,6 +103,9 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, DCACHE = 1) (
  logic [NUMWAYS-1:0]         SelectedWay;
  logic [NUMWAYS-1:0]         SetValidWay, ClearValidWay, SetDirtyWay, ClearDirtyWay;
  logic [1:0]                 CacheRW, CacheAtomic;
+  logic [LINELEN-1:0]         ReadDataLine;
+  logic [$clog2(LINELEN/8) - $clog2(MUXINTERVAL/8) - 1:0]          WordOffsetAddr;
+  logic                       save, restore;
  
  /////////////////////////////////////////////////////////////////////////////////////////////
  // Read Path
@ -114,7 +119,7 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, DCACHE = 1) (

  // Array of cache ways, along with victim, hit, dirty, and read merging logic
  cacheway #(NUMLINES, LINELEN, TAGLEN, OFFSETLEN, SETLEN) CacheWays[NUMWAYS-1:0](
-    .clk, .reset, .RAdr, .PAdr, .CacheWriteData, 
+    .clk, .reset, .RAdr, .PAdr, .CacheWriteData, .ByteMask,
    .SetValidWay, .ClearValidWay, .SetDirtyWay, .ClearDirtyWay, .SelEvict, .VictimWay,
    .FlushWay, .SelFlush, .ReadDataLineWay, .HitWay, .VictimDirtyWay, .VictimTagWay, 
    .Invalidate(InvalidateCacheM));
@ -138,6 +143,17 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, DCACHE = 1) (
    flopenr #(NUMWAYS) wayhitsavereg(clk, save, reset, HitWay, HitWaySaved);
    mux2 #(NUMWAYS) saverestoremux(HitWay, HitWaySaved, restore, HitWayFinal);
  end else assign HitWayFinal = HitWay;
+
+  // like to fix this.
+  if(DCACHE) 
+    mux2 #(LOGWPL) WordAdrrMux(.d0(PAdr[$clog2(LINELEN/8) - 1 : $clog2(MUXINTERVAL/8)]), 
+      .d1(WordCount), .s(LSUBusWriteCrit),
+      .y(WordOffsetAddr)); 
+  else assign WordOffsetAddr = PAdr[$clog2(LINELEN/8) - 1 : $clog2(MUXINTERVAL/8)];
+  
+  subcachelineread #(LINELEN, WORDLEN, MUXINTERVAL, LOGWPL) subcachelineread(
+    .clk, .reset, .PAdr(WordOffsetAddr), .save, .restore,
+    .ReadDataLine, .ReadDataWord);
  
  /////////////////////////////////////////////////////////////////////////////////////////////
  // Write Path: Write data and address. Muxes between writes from bus and writes from CPU.
--- a/pipelined/src/cache/cacheway.sv
+++ b/pipelined/src/cache/cacheway.sv
@ -47,7 +47,7 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26,
  input logic                        VictimWay,
  input logic                        FlushWay,
  input logic                        Invalidate,
-
+  input logic [(`XLEN-1)/8:0]        ByteMask,

  output logic [LINELEN-1:0]         ReadDataLineWay,
  output logic                       HitWay,
@ -69,6 +69,7 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26,
  logic [$clog2(NUMLINES)-1:0]       RAdrD;
  logic [2**LOGWPL-1:0]              MemPAdrDecoded;
  logic [LINELEN/`XLEN-1:0]          SelectedWriteWordEn;
+  logic [(`XLEN-1)/8:0]              FinalByteMask;
  
  /////////////////////////////////////////////////////////////////////////////////////////////
  // Write Enable demux
@ -77,13 +78,14 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26,
    .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecoded));
  // If writing the whole line set all write enables to 1, else only set the correct word.
  assign SelectedWriteWordEn = SetValidWay ? '1 : SetDirtyWay ? MemPAdrDecoded : '0; // OR-AND
+  assign FinalByteMask = SetValidWay ? '1 : ByteMask; // OR

  /////////////////////////////////////////////////////////////////////////////////////////////
  // Tag Array
  /////////////////////////////////////////////////////////////////////////////////////////////

  sram1p1rw #(.DEPTH(NUMLINES), .WIDTH(TAGLEN)) CacheTagMem(.clk,
-    .Adr(RAdr), .ReadData(ReadTag),
+    .Adr(RAdr), .ReadData(ReadTag), .ByteMask('1),
    .CacheWriteData(PAdr[`PA_BITS-1:OFFSETLEN+INDEXLEN]), .WriteEnable(SetValidWay));

  // AND portion of distributed tag multiplexer
@ -102,7 +104,7 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26,
    sram1p1rw #(.DEPTH(NUMLINES), .WIDTH(`XLEN)) CacheDataMem(.clk, .Adr(RAdr),
      .ReadData(ReadDataLine[(words+1)*`XLEN-1:words*`XLEN] ),
      .CacheWriteData(CacheWriteData[(words+1)*`XLEN-1:words*`XLEN]),
-      .WriteEnable(SelectedWriteWordEn[words]));
+      .WriteEnable(SelectedWriteWordEn[words]), .ByteMask(FinalByteMask));
  end

  // AND portion of distributed read multiplexers
--- a/pipelined/src/cache/sram1p1rw.sv
+++ b/pipelined/src/cache/sram1p1rw.sv
@ -38,18 +38,31 @@ module sram1p1rw #(parameter DEPTH=128, WIDTH=256) (
  input logic [$clog2(DEPTH)-1:0] Adr,
  input logic [WIDTH-1:0]         CacheWriteData,
  input logic                     WriteEnable,
+  input logic [(WIDTH-1)/8:0]     ByteMask,
  output logic [WIDTH-1:0]        ReadData);

  logic [WIDTH-1:0]               StoredData[DEPTH-1:0];
  logic [$clog2(DEPTH)-1:0]       AdrD;
  logic                           WriteEnableD;

+  always_ff @(posedge clk)       AdrD <= Adr;
+
+  genvar                          index;
+  for(index = 0; index < WIDTH/8; index++) begin
    always_ff @(posedge clk) begin
-      AdrD <= Adr;
-      if (WriteEnable) begin
-        StoredData[Adr] <= #1 CacheWriteData;
+      if (WriteEnable & ByteMask[index]) begin
+        StoredData[Adr][8*(index+1)-1:8*index] <= #1 CacheWriteData[8*(index+1)-1:8*index];
      end
    end
+  end
+  // if not a multiple of 8, MSByte is not 8 bits long.
+  if(WIDTH%8 != 0) begin
+    always_ff @(posedge clk) begin
+      if (WriteEnable & ByteMask[WIDTH/8]) begin
+        StoredData[Adr][WIDTH-1:WIDTH-WIDTH%8] <= #1 CacheWriteData[WIDTH-1:WIDTH-WIDTH%8];
+      end
+    end
+  end

  assign ReadData = StoredData[AdrD];
 endmodule
--- a/pipelined/src/cache/subcachelineread.sv
+++ b/pipelined/src/cache/subcachelineread.sv
@ -30,20 +30,17 @@

 `include "wally-config.vh"

-module subcachelineread #(parameter LINELEN, WORDLEN, MUXINTERVAL)(
+module subcachelineread #(parameter LINELEN, WORDLEN, MUXINTERVAL, LOGWPL)(
  input logic                clk,
  input logic                reset,
-  input logic [`PA_BITS-1:0] PAdr,
+  input logic [$clog2(LINELEN/8) - $clog2(MUXINTERVAL/8) - 1 : 0]   PAdr,
  input logic                save, restore,
  input logic [LINELEN-1:0]  ReadDataLine,
  output logic [WORDLEN-1:0] ReadDataWord);

  localparam WORDSPERLINE = LINELEN/MUXINTERVAL;
+  // pad is for icache. Muxing extends over the cacheline boundary.
  localparam PADLEN = WORDLEN-MUXINTERVAL;
-  // Convert the Read data bus ReadDataSelectWay into sets of XLEN so we can
-  // easily build a variable input mux.
-  // *** move this to LSU and IFU, also remove mux from busdp into LSU. 
-  // *** give this a module name to match block diagram
  logic [LINELEN+(WORDLEN-MUXINTERVAL)-1:0] ReadDataLinePad;
  logic [WORDLEN-1:0]          ReadDataLineSets [(LINELEN/MUXINTERVAL)-1:0];
  logic [WORDLEN-1:0] ReadDataWordRaw, ReadDataWordSaved;
@ -60,7 +57,7 @@ module subcachelineread #(parameter LINELEN, WORDLEN, MUXINTERVAL)(
  end
  // variable input mux
  // *** maybe remove REPLAY config later after deciding which way is best
-  assign ReadDataWordRaw = ReadDataLineSets[PAdr[$clog2(LINELEN/8) - 1 : $clog2(MUXINTERVAL/8)]];
+  assign ReadDataWordRaw = ReadDataLineSets[PAdr];
  if(!`REPLAY) begin
    flopen #(WORDLEN) cachereaddatasavereg(clk, save, ReadDataWordRaw, ReadDataWordSaved);
    mux2 #(WORDLEN) readdatasaverestoremux(ReadDataWordRaw, ReadDataWordSaved,
--- a/pipelined/src/fpu/fcmp.sv
+++ b/pipelined/src/fpu/fcmp.sv
@ -42,6 +42,7 @@ module fcmp (
   //             - if negitive - no
   //             - if positive - yes
   // note: LT does -0 < 0
+   //*** compare Exp and Man together
   assign LT = XSgnE^YSgnE ? XSgnE : XExpE==YExpE ? ((XManE<YManE)^XSgnE)&~EQ : (XExpE<YExpE)^XSgnE;
   assign EQ = (FSrcXE == FSrcYE);

--- a/pipelined/src/fpu/fcvtfp.sv
+++ b/pipelined/src/fpu/fcvtfp.sv
@ -103,7 +103,7 @@ module cvtfp (
    assign LSBFrac = DSFrac[3];


-    always_comb begin
+    always_comb begin // ***remove guard bit
        // Determine if you add 1
        case (FrmE)
            3'b000: CalcPlus1 = Guard & (Round | (Sticky) | (~Round&~Sticky&LSBFrac));//round to nearest even
@ -166,6 +166,7 @@ module cvtfp (
                {XSgnE, DSResExp, DSResFrac};

        // select the final result based on the opperation
+        //*** in all units: before putting an expression into a ?: select, put it in a separate signal
        assign CvtFpResE = FmtE ? {{32{1'b1}},DSRes} : {XSgnE, SDExp, SDFrac[51]|XNaNE, SDFrac[50:0]};
    end else begin
        // select the double to single precision result
--- a/pipelined/src/fpu/fcvtint.sv
+++ b/pipelined/src/fpu/fcvtint.sv
@ -10,7 +10,6 @@ module fcvt (
    input logic             XNaNE,      // is X NaN 
    input logic             XInfE,      // is X infinity
    input logic             XDenormE,   // is X denormalized
-    input logic [10:0]      BiasE,      // bias - depends on precision (max exponent/2)
    input logic [`XLEN-1:0] ForwardedSrcAE,      // integer input
    input logic [2:0]       FOpCtrlE,   // chooses which instruction is done (full list below)
    input logic [2:0]       FrmE,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
@ -70,7 +69,7 @@ module fcvt (
    assign Bits = Res64 ? 8'd64 : 8'd32;

    // calulate the unbiased exponent
-    assign ExpVal = {1'b0,XExpE} - {1'b0,BiasE} + {12'b0, XDenormE};
+    assign ExpVal = {1'b0,XExpE} - {1'b0, (11)'(`BIAS)} + {12'b0, XDenormE};

 ////////////////////////////////////////////////////////

@ -121,7 +120,7 @@ module fcvt (
    assign Round = FOpCtrlE[0] ? ShiftedMan[0] : FmtE ? ShiftedMan[12] : ShiftedMan[41];
    assign LSB = FOpCtrlE[0] ? ShiftedMan[2] : FmtE ? ShiftedMan[14] : ShiftedMan[43];

-    always_comb begin
+    always_comb begin//*** remove guard bit
        // Determine if you add 1
        case (FrmE)
            3'b000: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky&LSB));//round to nearest even
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@ -29,17 +29,12 @@

 `include "wally-config.vh"

-//  `define FLEN 64//(`Q_SUPPORTED ? 128 : `D_SUPPORTED ? 64 : 32)
-//  `define NE   11//(`Q_SUPPORTED ? 15 : `D_SUPPORTED ? 11 : 8)
-//  `define NF   52//(`Q_SUPPORTED ? 112 : `D_SUPPORTED ? 52 : 23)
-//  `define XLEN 64
-//  `define IEEE754 1
 module fma(
    input logic                 clk,
    input logic                 reset,
    input logic                 FlushM,     // flush the memory stage
    input logic                 StallM,     // stall memory stage
-    input logic                 FmtE, FmtM, // precision 1 = double 0 = single
+    input logic  [`FPSIZES/3:0] FmtE, FmtM, // precision 1 = double 0 = single
    input logic  [2:0]          FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
    input logic  [2:0]          FrmM,               // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
    input logic                 XSgnE, YSgnE, ZSgnE,    // input signs - execute stage
@ -75,7 +70,7 @@ module fma(
    logic 			    NegSumE, NegSumM;
    logic 			    ZSgnEffE, ZSgnEffM;
    logic 			    PSgnE, PSgnM;
-    logic [8:0]			NormCntE, NormCntM;
+    logic [$clog2(3*`NF+7)-1:0]			NormCntE, NormCntM;
    logic               Mult;
    
    fma1 fma1 (.XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
@ -86,7 +81,7 @@ module fma(
    // E/M pipeline registers
    flopenrc #(3*`NF+6) EMRegFma2(clk, reset, FlushM, ~StallM, SumE, SumM); 
    flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
-    flopenrc #(16) EMRegFma4(clk, reset, FlushM, ~StallM, 
+    flopenrc #($clog2(3*`NF+7)+7) EMRegFma4(clk, reset, FlushM, ~StallM, 
                            {AddendStickyE, KillProdE, InvZE, NormCntE, NegSumE, ZSgnEffE, PSgnE, FOpCtrlE[2]&~FOpCtrlE[1]&~FOpCtrlE[0]},
                            {AddendStickyM, KillProdM, InvZM, NormCntM, NegSumM, ZSgnEffM, PSgnM, Mult});

@ -98,6 +93,7 @@ module fma(
 endmodule
      

+        //*** in all units: before putting an expression into a ?: select, put it in a separate signal

 module fma1(
    input logic                 XSgnE, YSgnE, ZSgnE,    // input's signs
@ -106,7 +102,7 @@ module fma1(
    input logic                 XDenormE, YDenormE, ZDenormE, // is the input denormal
    input logic                 XZeroE, YZeroE, ZZeroE, // is the input zero
    input logic  [2:0]          FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-    input logic                 FmtE,       // precision 1 = double 0 = single
+    input logic  [`FPSIZES/3:0] FmtE,       // precision 1 = double 0 = single
    output logic [`NE+1:0]      ProdExpE,       // X exponent + Y exponent - bias in B(NE+2.0) format; adds 2 bits to allow for size of number and negative sign
    output logic                AddendStickyE,  // sticky bit that is calculated during alignment
    output logic                KillProdE,      // set the product to zero before addition if the product is too small to matter
@ -115,7 +111,7 @@ module fma1(
    output logic                InvZE,          // intert Z
    output logic                ZSgnEffE,       // the modified Z sign
    output logic                PSgnE,          // the product's sign
-    output logic [8:0]          NormCntE        // normalization shift cnt
+    output logic [$clog2(3*`NF+7)-1:0]          NormCntE        // normalization shift cnt
    );

    logic [`NE-1:0]     Denorm;             // value of a denormaized number based on precision
@ -157,37 +153,63 @@ module fma1(
        
    add add(.AlignedAddendE, .ProdManE, .PSgnE, .ZSgnEffE, .KillProdE, .AlignedAddendInv, .ProdManKilled, .NegSumE, .PreSum, .NegPreSum, .InvZE, .XZeroE, .YZeroE);
    
-    loa loa(.A(AlignedAddendInv+{162'b0,InvZE}), .P(ProdManKilled), .NormCntE);
+    loa loa(.A(AlignedAddendInv+{(3*`NF+6)'(0),InvZE}), .P(ProdManKilled), .NormCntE);

    // Choose the positive sum and accompanying LZA result.
    assign SumE = NegSumE ? NegPreSum[3*`NF+5:0] : PreSum[3*`NF+5:0];
-    // assign NormCntE = NegSumE ? NNormCnt : PNormCnt;


 endmodule


 module expadd(    
-    input  logic            FmtE,          // precision
-    input  logic [`NE-1:0]  XExpE, YExpE,  // input exponents
-    input  logic            XDenormE, YDenormE,    // are the inputs denormalized
-    input  logic            XZeroE, YZeroE,        // are the inputs zero
-    output logic [`NE-1:0]  XExpVal, YExpVal,      // Exponent value after taking into account denormals
-    output logic [`NE-1:0]  Denorm,        // value of denormalized exponent
-    output logic [`NE+1:0]  ProdExpE       // product's exponent B^(1023)NE+2
+    input  logic [`FPSIZES/3:0] FmtE,          // precision select; 1 bit for up to two formats, 2 bits for three or four
+    input  logic [`NE-1:0]      XExpE, YExpE,  // input exponents
+    input  logic                XDenormE, YDenormE,    // are the inputs denormalized
+    input  logic                XZeroE, YZeroE,        // are the inputs zero
+    output logic [`NE-1:0]      XExpVal, YExpVal,      // Exponent value after taking into account denormals
+    output logic [`NE-1:0]      Denorm,        // exponent value substituted for a denormalized input
+    output logic [`NE+1:0]      ProdExpE       // product's exponent: XExpVal + YExpVal - BIAS, zeroed if X or Y is zero
 );


    // denormalized numbers have diffrent values depending on which precison it is.
-    //      double - 1
-    //      single - 1023-127+1 = 897
-    assign Denorm = FmtE ? 1 : 897;
+    //      FLEN - 1
+    //      Other - BIAS - other bias + 1
+    
+    if (`FPSIZES == 1) begin // only the FLEN format is supported
+        assign Denorm = 1;
+
+    end else if (`FPSIZES == 2) begin // FmtE set selects the FLEN format, clear selects the format biased by BIAS1
+        assign Denorm = FmtE ? (`NE)'(1) : (`NE)'(`BIAS)-(`NE)'(`BIAS1)+(`NE)'(1);
+
+    end else if (`FPSIZES == 3) begin // plain blocking assignments: `assign` is not used inside always_comb
+        always_comb begin
+            case (FmtE)
+                `FMT: Denorm = 1;
+                `FMT1: Denorm = `BIAS-`BIAS1+1;
+                `FMT2: Denorm = `BIAS-`BIAS2+1;
+                default: Denorm = 'x; // unused encoding: propagate x to all bits
+            endcase
+        end
+
+    end else begin // four formats: every 2-bit FmtE encoding is used, so no default is needed
+        always_comb begin
+            case (FmtE)
+                2'h3: Denorm = 1;
+                2'h1: Denorm = `BIAS-`D_BIAS+1;
+                2'h0: Denorm = `BIAS-`S_BIAS+1;
+                2'h2: Denorm = `BIAS-`H_BIAS+1;
+            endcase
+        end
+
+    end

    // pick denormalized value or exponent
    assign XExpVal = XDenormE ? Denorm : XExpE;
    assign YExpVal = YDenormE ? Denorm : YExpE;
    // kill the exponent if the product is zero - either X or Y is 0
-    assign ProdExpE = ({2'b0, XExpVal} + {2'b0, YExpVal} - {2'b0, `NE'h3ff})&{`NE+2{~(XZeroE|YZeroE)}};
+    assign ProdExpE = ({2'b0, XExpVal} + {2'b0, YExpVal} - {2'b0, (`NE)'(`BIAS)})&{`NE+2{~(XZeroE|YZeroE)}};

 endmodule

@ -261,7 +283,7 @@ module align(
    //      - Denormal numbers have a diffrent exponent value depending on the precision
    assign ZExpVal = ZDenormE ? Denorm : ZExpE;
    // assign AlignCnt = ProdExpE - {2'b0, ZExpVal} + (`NF+3);
-    assign AlignCnt = XZeroE|YZeroE ? -1 : {2'b0, XExpVal} + {2'b0, YExpVal} - 1020+`NF - {2'b0, ZExpVal};
+    assign AlignCnt = XZeroE|YZeroE ? -1 : {2'b0, XExpVal} + {2'b0, YExpVal} - {2'b0, (`NE)'(`BIAS)} + `NF+3 - {2'b0, ZExpVal};

    // Defualt Addition without shifting
    //          |   54'b0    |  106'b(product)  | 2'b0 |
@ -276,7 +298,7 @@ module align(

        //          |   54'b0    |  106'b(product)  | 2'b0 |
        //  | addnend |
-        if ($signed(AlignCnt) < $signed(13'b0)) begin
+        if ($signed(AlignCnt) < $signed((`NE+2)'(0))) begin
            KillProdE = 1;
            ZManShifted = ZManPreShifted;
            AddendStickyE = ~(XZeroE|YZeroE);
@ -284,7 +306,7 @@ module align(
        // If the Addend is shifted right
        //          |   54'b0    |  106'b(product)  | 2'b0 |
        //                                  | addnend |
-        end else if ($signed(AlignCnt)<=$signed(13'd3*13'd`NF+13'd4))  begin
+        end else if ($signed(AlignCnt)<=$signed((`NE+2)'(3)*(`NE+2)'(`NF)+(`NE+2)'(5)))  begin
            KillProdE = 0;
            ZManShifted = ZManPreShifted >> AlignCnt;
            AddendStickyE = |(ZManShifted[`NF-1:0]);
@ -356,7 +378,7 @@ endmodule
 module loa( //https://ieeexplore.ieee.org/abstract/document/930098
    input logic  [3*`NF+6:0] A,     // addend
    input logic  [2*`NF+1:0] P,     // product
-    output logic [8:0]       NormCntE   // normalization shift count for the positive result
+    output logic [$clog2(3*`NF+7)-1:0]       NormCntE   // normalization shift count for the positive result
    ); 
    
    logic [3*`NF+6:0] T;
@ -389,14 +411,14 @@ module loa( //https://ieeexplore.ieee.org/abstract/document/930098
 endmodule

 module lzc(
-    input logic  [3*`NF+6:0]    f,
-    output logic [8:0]          NormCntE    // normalization shift
+    input logic  [3*`NF+6:0]            f,         // vector to scan from the MSB down
+    output logic [$clog2(3*`NF+7)-1:0]    NormCntE    // normalization shift: number of leading zeros in f
 );
    
-    logic [8:0] i;
+    logic [$clog2(3*`NF+7)-1:0] i; // loop counter, same width as NormCntE
    always_comb begin
        i = 0;
-        while (~f[3*`NF+6-i] & $unsigned(i) <= $unsigned(9'd3*9'd`NF+9'd6)) i = i+1;  // search for leading one
+        while (~f[3*`NF+6-i] & $unsigned(i) <= $unsigned(($clog2(3*`NF+7))'(3)*($clog2(3*`NF+7))'(`NF)+($clog2(3*`NF+7))'(6))) i = i+1;  // search for leading one
        NormCntE = i;
    end
 endmodule
@ -410,27 +432,27 @@ endmodule

 module fma2(
    
-    input logic                 XSgnM, YSgnM,        // input signs
-    input logic     [`NE-1:0]   XExpM, YExpM, ZExpM, // input exponents
-    input logic     [`NF:0]     XManM, YManM, ZManM, // input mantissas
-    input logic     [2:0]       FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
-    input logic                 FmtM,       // precision 1 = double 0 = single
-    input logic     [`NE+1:0]   ProdExpM,       // X exponent + Y exponent - bias
-    input logic                 AddendStickyM,  // sticky bit that is calculated during alignment
-    input logic                 KillProdM,      // set the product to zero before addition if the product is too small to matter
-    input logic                 XZeroM, YZeroM, ZZeroM, // inputs are zero
-    input logic                 XInfM, YInfM, ZInfM,    // inputs are infinity
-    input logic                 XNaNM, YNaNM, ZNaNM,    // inputs are NaN
-    input logic                 XSNaNM, YSNaNM, ZSNaNM, // inputs are signaling NaNs
-    input logic     [3*`NF+5:0] SumM,       // the positive sum
-    input logic                 NegSumM,    // was the sum negitive
-    input logic                 InvZM,      // do you invert Z
-    input logic                 ZSgnEffM,   // the modified Z sign - depends on instruction
-    input logic                 PSgnM,      // the product's sign
-    input logic                 Mult,       // multiply opperation
-    input logic     [8:0]       NormCntM,   // the normalization shift count
-    output logic    [`FLEN-1:0] FMAResM,    // FMA final result
-    output logic    [4:0]       FMAFlgM);   // FMA flags {invalid, divide by zero, overflow, underflow, inexact}
+    input logic                             XSgnM, YSgnM,        // input signs
+    input logic     [`NE-1:0]               XExpM, YExpM, ZExpM, // input exponents
+    input logic     [`NF:0]                 XManM, YManM, ZManM, // input mantissas
+    input logic     [2:0]                   FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic     [`FPSIZES/3:0]          FmtM,       // precision 1 = double 0 = single
+    input logic     [`NE+1:0]               ProdExpM,       // X exponent + Y exponent - bias
+    input logic                             AddendStickyM,  // sticky bit that is calculated during alignment
+    input logic                             KillProdM,      // set the product to zero before addition if the product is too small to matter
+    input logic                             XZeroM, YZeroM, ZZeroM, // inputs are zero
+    input logic                             XInfM, YInfM, ZInfM,    // inputs are infinity
+    input logic                             XNaNM, YNaNM, ZNaNM,    // inputs are NaN
+    input logic                             XSNaNM, YSNaNM, ZSNaNM, // inputs are signaling NaNs
+    input logic     [3*`NF+5:0]             SumM,       // the positive sum
+    input logic                             NegSumM,    // was the sum negitive
+    input logic                             InvZM,      // do you invert Z
+    input logic                             ZSgnEffM,   // the modified Z sign - depends on instruction
+    input logic                             PSgnM,      // the product's sign
+    input logic                             Mult,       // multiply opperation
+    input logic     [$clog2(3*`NF+7)-1:0]   NormCntM,   // the normalization shift count
+    output logic    [`FLEN-1:0]             FMAResM,    // FMA final result
+    output logic    [4:0]                   FMAFlgM);   // FMA flags {invalid, divide by zero, overflow, underflow, inexact}
   


@ -548,28 +570,27 @@ endmodule


 module normalize(
-    input logic  [3*`NF+5:0]    SumM,       // the positive sum
-    input logic  [`NE-1:0]      ZExpM,      // exponent of Z
-    input logic  [`NE+1:0]      ProdExpM,   // X exponent + Y exponent - bias
-    input logic  [8:0]          NormCntM,   // normalization shift count
-    input logic                 FmtM,       // precision 1 = double 0 = single
-    input logic                 KillProdM,  // is the product set to zero
-    input logic                 AddendStickyM,  // the sticky bit caclulated from the aligned addend
-    input logic                 NegSumM,    // was the sum negitive
-    output logic [`NF+2:0]      NormSum,        // normalized sum
-    output logic                SumZero,        // is the sum zero
-    output logic                NormSumSticky, UfSticky,    // sticky bits
-    output logic [`NE+1:0]      SumExp,         // exponent of the normalized sum
-    output logic                ResultDenorm    // is the result denormalized
+    input logic  [3*`NF+5:0]            SumM,       // the positive sum
+    input logic  [`NE-1:0]              ZExpM,      // exponent of Z
+    input logic  [`NE+1:0]              ProdExpM,   // X exponent + Y exponent - bias
+    input logic  [$clog2(3*`NF+7)-1:0]  NormCntM,   // normalization shift count
+    input logic  [`FPSIZES/3:0]         FmtM,       // precision 1 = double 0 = single
+    input logic                         KillProdM,  // is the product set to zero
+    input logic                         AddendStickyM,  // the sticky bit caclulated from the aligned addend
+    input logic                         NegSumM,    // was the sum negitive
+    output logic [`NF+2:0]              NormSum,        // normalized sum
+    output logic                        SumZero,        // is the sum zero
+    output logic                        NormSumSticky, UfSticky,    // sticky bits
+    output logic [`NE+1:0]              SumExp,         // exponent of the normalized sum
+    output logic                        ResultDenorm    // is the result denormalized
 );
-    logic [`NE+1:0]     SumExpTmp;          // exponent of the normalized sum not taking into account denormal or zero results
-    logic [8:0]         DenormShift;        // right shift if the result is denormalized //***change this later
-    logic [3*`NF+5:0]   CorrSumShifted;     // the shifted sum after LZA correction
-    logic [3*`NF+8:0]   SumShifted;         // the shifted sum before LZA correction
-    logic [`NE+1:0]     SumExpTmpTmp;       // the exponent of the normalized sum with the `FLEN bias
-    logic               PreResultDenorm;    // is the result denormalized - calculated before LZA corection
-    logic               PreResultDenorm2;   // is the result denormalized - calculated before LZA corection
-    logic               LZAPlus1, LZAPlus2; // add one or two to the sum's exponent due to LZA correction
+    logic [`NE+1:0]             SumExpTmp;          // exponent of the normalized sum not taking into account denormal or zero results
+    logic [$clog2(3*`NF+7)-1:0] DenormShift;        // right shift if the result is denormalized //***change this later
+    logic [3*`NF+5:0]           CorrSumShifted;     // the shifted sum after LZA correction
+    logic [3*`NF+8:0]           SumShifted;         // the shifted sum before LZA correction
+    logic [`NE+1:0]             SumExpTmpTmp;       // the exponent of the normalized sum with the `FLEN bias
+    logic                       PreResultDenorm;    // is the result denormalized - calculated before LZA corection
+    logic                       LZAPlus1, LZAPlus2; // add one or two to the sum's exponent due to LZA correction

    ///////////////////////////////////////////////////////////////////////////////
    // Normalization
@ -580,14 +601,89 @@ module normalize(

    // calculate the sum's exponent
    assign SumExpTmpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCntM} + 1 - (`NF+4));
-    assign SumExpTmp = FmtM ? SumExpTmpTmp : (SumExpTmpTmp-1023+127)&{`NE+2{|SumExpTmpTmp}};
+
+    // convert the sum's exponent (held in the largest format's bias) into the bias of the precision selected by FmtM
+    if (`FPSIZES == 1) begin
+        assign SumExpTmp = SumExpTmpTmp; // single supported format: no rebias needed
+
+    end else if (`FPSIZES == 2) begin
+        assign SumExpTmp = FmtM ? SumExpTmpTmp : (SumExpTmpTmp-(`NE+2)'(`BIAS)+(`NE+2)'(`BIAS1))&{`NE+2{|SumExpTmpTmp}}; // the mask keeps an all-zero exponent at zero
+
+    end else if (`FPSIZES == 3) begin
+        always_comb begin // plain blocking assignments: procedural "assign" is deprecated and poorly supported by synthesis
+            case (FmtM)
+                `FMT: SumExpTmp = SumExpTmpTmp;
+                `FMT1: SumExpTmp = (SumExpTmpTmp-`BIAS+`BIAS1)&{`NE+2{|SumExpTmpTmp}};
+                `FMT2: SumExpTmp = (SumExpTmpTmp-`BIAS+`BIAS2)&{`NE+2{|SumExpTmpTmp}};
+                default: SumExpTmp = {`NE+2{1'bx}}; // unused format encoding: explicit (NE+2)-bit don't-care
+            endcase
+        end
+
+    end else begin
+        always_comb begin // all four FmtM encodings are listed, so no default is required
+            case (FmtM)
+                2'h3: SumExpTmp = SumExpTmpTmp;
+                2'h1: SumExpTmp = (SumExpTmpTmp-`BIAS+`D_BIAS)&{`NE+2{|SumExpTmpTmp}};
+                2'h0: SumExpTmp = (SumExpTmpTmp-`BIAS+`S_BIAS)&{`NE+2{|SumExpTmpTmp}};
+                2'h2: SumExpTmp = (SumExpTmpTmp-`BIAS+`H_BIAS)&{`NE+2{|SumExpTmpTmp}};
+            endcase
+        end
+
+    end
    
-    logic SumDLTEZ, SumDGEFL, SumSLTEZ, SumSGEFL;
-    assign SumDLTEZ = SumExpTmpTmp[`NE+1] | ~|SumExpTmpTmp;
-    assign SumDGEFL = ($signed(SumExpTmpTmp)>=$signed(-(13'd`NF+13'd2)));
-    assign SumSLTEZ = $signed(SumExpTmpTmp) <= $signed(13'd1023-13'd127);
-    assign SumSGEFL = ($signed(SumExpTmpTmp)>=$signed(-13'd25+13'd1023-13'd127)) | ~|SumExpTmpTmp;
-    assign PreResultDenorm2 = (FmtM ? SumDLTEZ : SumSLTEZ) & (FmtM ? SumDGEFL : SumSGEFL) & ~SumZero;
+    // determine if the result is denormalized (before LZA correction): for format N, SumNLEZ flags an exponent at or below that format's zero point and SumNGEFL flags one no more than (fraction length + 2) below it
+    
+    if (`FPSIZES == 1) begin
+        logic Sum0LEZ, Sum0GEFL;
+        assign Sum0LEZ  = SumExpTmpTmp[`NE+1] | ~|SumExpTmpTmp; // negative or zero
+        assign Sum0GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`NF)-(`NE+2)'(2));
+        assign PreResultDenorm = Sum0LEZ & Sum0GEFL & ~SumZero;
+
+    end else if (`FPSIZES == 2) begin
+        logic Sum0LEZ, Sum0GEFL, Sum1LEZ, Sum1GEFL;
+        assign Sum0LEZ  = SumExpTmpTmp[`NE+1] | ~|SumExpTmpTmp; // negative or zero
+        assign Sum0GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`NF)-(`NE+2)'(2));
+        assign Sum1LEZ  = $signed(SumExpTmpTmp) <= $signed( (`NE+2)'(`BIAS)-(`NE+2)'(`BIAS1));
+        assign Sum1GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`NF1+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`BIAS1)) | ~|SumExpTmpTmp;
+        assign PreResultDenorm = (FmtM ? Sum0LEZ : Sum1LEZ) & (FmtM ? Sum0GEFL : Sum1GEFL) & ~SumZero;
+
+    end else if (`FPSIZES == 3) begin
+        logic Sum0LEZ, Sum0GEFL, Sum1LEZ, Sum1GEFL, Sum2LEZ, Sum2GEFL;
+        assign Sum0LEZ  = SumExpTmpTmp[`NE+1] | ~|SumExpTmpTmp; // negative or zero
+        assign Sum0GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`NF)-(`NE+2)'(2));
+        assign Sum1LEZ  = $signed(SumExpTmpTmp) <= $signed( (`NE+2)'(`BIAS)-(`NE+2)'(`BIAS1));
+        assign Sum1GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`NF1+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`BIAS1)) | ~|SumExpTmpTmp;
+        assign Sum2LEZ  = $signed(SumExpTmpTmp) <= $signed( (`NE+2)'(`BIAS)-(`NE+2)'(`BIAS2));
+        assign Sum2GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`NF2+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`BIAS2)) | ~|SumExpTmpTmp;
+        always_comb begin // plain blocking assignments: procedural "assign" is deprecated and poorly supported by synthesis
+            case (FmtM)
+                `FMT: PreResultDenorm = Sum0LEZ & Sum0GEFL & ~SumZero;
+                `FMT1: PreResultDenorm = Sum1LEZ & Sum1GEFL & ~SumZero;
+                `FMT2: PreResultDenorm = Sum2LEZ & Sum2GEFL & ~SumZero;
+                default: PreResultDenorm = 1'bx; // unused format encoding
+            endcase
+        end
+
+    end else begin
+        logic Sum0LEZ, Sum0GEFL, Sum1LEZ, Sum1GEFL, Sum2LEZ, Sum2GEFL, Sum3LEZ, Sum3GEFL;
+        assign Sum0LEZ  = SumExpTmpTmp[`NE+1] | ~|SumExpTmpTmp; // negative or zero
+        assign Sum0GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`NF  )-(`NE+2)'(2));
+        assign Sum1LEZ  = $signed(SumExpTmpTmp) <= $signed( (`NE+2)'(`BIAS)-(`NE+2)'(`D_BIAS));
+        assign Sum1GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`D_NF+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`D_BIAS)) | ~|SumExpTmpTmp;
+        assign Sum2LEZ  = $signed(SumExpTmpTmp) <= $signed( (`NE+2)'(`BIAS)-(`NE+2)'(`S_BIAS));
+        assign Sum2GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`S_NF+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`S_BIAS)) | ~|SumExpTmpTmp;
+        assign Sum3LEZ  = $signed(SumExpTmpTmp) <= $signed( (`NE+2)'(`BIAS)-(`NE+2)'(`H_BIAS));
+        assign Sum3GEFL = $signed(SumExpTmpTmp) >= $signed(-(`NE+2)'(`H_NF+2)+(`NE+2)'(`BIAS)-(`NE+2)'(`H_BIAS)) | ~|SumExpTmpTmp;
+        always_comb begin // all four FmtM encodings are listed, so no default is required
+            case (FmtM)
+                2'h3: PreResultDenorm = Sum0LEZ & Sum0GEFL & ~SumZero;
+                2'h1: PreResultDenorm = Sum1LEZ & Sum1GEFL & ~SumZero;
+                2'h0: PreResultDenorm = Sum2LEZ & Sum2GEFL & ~SumZero;
+                2'h2: PreResultDenorm = Sum3LEZ & Sum3GEFL & ~SumZero;
+            endcase
+        end
+
+    end

    // 010. when should be 001.
    //      - shift left one
@ -599,45 +695,66 @@ module normalize(

    // Determine the shift needed for denormal results
    //  - if not denorm add 1 to shift out the leading 1
-    assign DenormShift = PreResultDenorm2 ? SumExpTmp[8:0] : 1;
+    assign DenormShift = PreResultDenorm ? SumExpTmp[$clog2(3*`NF+7)-1:0] : 1;
    // Normalize the sum
    assign SumShifted = {3'b0, SumM} << NormCntM+DenormShift;
    // LZA correction
    assign LZAPlus1 = SumShifted[3*`NF+7];
    assign LZAPlus2 = SumShifted[3*`NF+8];
 	// the only possible mantissa for a plus two is all zeroes - a one has to propigate all the way through a sum. so we can leave the bottom statement alone
-    assign CorrSumShifted =  LZAPlus1&~KillProdM ? SumShifted[3*`NF+6:1] : SumShifted[3*`NF+5:0];
+    assign CorrSumShifted =  LZAPlus1 ? SumShifted[3*`NF+6:1] : SumShifted[3*`NF+5:0];
    assign NormSum = CorrSumShifted[3*`NF+5:2*`NF+3];
+
    // Calculate the sticky bit
-    assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | (|CorrSumShifted[136:2*`NF+3]&~FmtM);
+    if (`FPSIZES == 1) begin
+        assign NormSumSticky = |CorrSumShifted[2*`NF+2:0]; // every bit below NormSum (CorrSumShifted[3*NF+5:2*NF+3])
+
+    end else if (`FPSIZES == 2) begin
+        // upper index 3*NF+2-NF1 = (3*NF+5) - NF1 - 3: the NormSum bits below its top NF1+3 bits also count as sticky when the smaller format is selected
+        assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | // bits below NormSum are always sticky
+        (|CorrSumShifted[3*`NF+2-`NF1:2*`NF+3]&~FmtM);
+
+    end else if (`FPSIZES == 3) begin
+        assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | // bits below NormSum are always sticky
+        (|CorrSumShifted[3*`NF+2-`NF1:2*`NF+3]&((FmtM==`FMT1)|(FmtM==`FMT2))) | // extra bits dropped by FMT1 and FMT2
+        (|CorrSumShifted[3*`NF+2-`NF2:3*`NF+3-`NF1]&(FmtM==`FMT2)); // further bits dropped only by FMT2
+
+    end else begin        
+        assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | // bits below NormSum are always sticky
+        (|CorrSumShifted[3*`NF+2-`D_NF:2*`NF+3]&((FmtM==1)|(FmtM==0)|(FmtM==2))) | // extra bits dropped by every format except FmtM==3
+        (|CorrSumShifted[3*`NF+2-`S_NF:3*`NF+3-`D_NF]&((FmtM==0)|(FmtM==2))) | // further bits dropped when FmtM is 0 or 2
+        (|CorrSumShifted[3*`NF+2-`H_NF:3*`NF+3-`S_NF]&(FmtM==2)); // further bits dropped only when FmtM is 2
+
+    end
+
    assign UfSticky = AddendStickyM | NormSumSticky;

    // Determine sum's exponent
    //                          if plus1                     If plus2                                      if said denorm but norm plus 1           if said denorm but norm plus 2
-    assign SumExp = (SumExpTmp+{12'b0, LZAPlus1&~KillProdM}+{11'b0, LZAPlus2&~KillProdM, 1'b0}+{12'b0, ~ResultDenorm&PreResultDenorm2&~KillProdM}+{12'b0, &SumExpTmp&SumShifted[3*`NF+6]&~KillProdM}) & {`NE+2{~(SumZero|ResultDenorm)}};
+    assign SumExp = (SumExpTmp+{{`NE+1{1'b0}}, LZAPlus1&~KillProdM}+{{`NE{1'b0}}, LZAPlus2&~KillProdM, 1'b0}+{{`NE+1{1'b0}}, ~ResultDenorm&PreResultDenorm&~KillProdM}+{{`NE+1{1'b0}}, &SumExpTmp&SumShifted[3*`NF+6]&~KillProdM}) & {`NE+2{~(SumZero|ResultDenorm)}}; // each increment is zero-padded to the (NE+2)-bit exponent width instead of a fixed 13 bits
    // recalculate if the result is denormalized
-    assign ResultDenorm = PreResultDenorm2&~SumShifted[3*`NF+6]&~SumShifted[3*`NF+7];
+    assign ResultDenorm = PreResultDenorm&~SumShifted[3*`NF+6]&~SumShifted[3*`NF+7];

 endmodule

 module fmaround(
-    input logic             FmtM,       // precision 1 = double 0 = single
-    input logic  [2:0]      FrmM,       // rounding mode
-    input logic             UfSticky,   // sticky bit for underlow calculation
-    input logic  [`NF+2:0]  NormSum,    // normalized sum
-    input logic             AddendStickyM,  // addend's sticky bit
-    input logic             NormSumSticky,  // normalized sum's sticky bit
-    input logic             ZZeroM,         // is Z zero
-    input logic             InvZM,          // invert Z
-    input logic  [`NE+1:0]  SumExp,         // exponent of the normalized sum
-    input logic             ResultSgnTmp,      // the result's sign
-    output logic            CalcPlus1, UfPlus1,  // do you add or subtract on from the result
-    output logic [`NE+1:0]  FullResultExp,      // ResultExp with bits to determine sign and overflow
-    output logic [`NF-1:0]  ResultFrac,         // Result fraction
-    output logic [`NE-1:0]  ResultExp,          // Result exponent
-    output logic            Sticky,             // sticky bit
-    output logic [`FLEN:0]  RoundAdd,           // how much to add to the result
-    output logic            Round, Guard, UfLSBNormSum // bits needed to calculate rounding
+    input logic  [`FPSIZES/3:0] FmtM,       // format select: with two formats 1 = larger (`NF) and 0 = smaller (`NF1); with more formats it is decoded by the case statements in this module
+    input logic  [2:0]          FrmM,       // rounding mode
+    input logic                 UfSticky,   // sticky bit for underflow calculation
+    input logic  [`NF+2:0]      NormSum,    // normalized sum
+    input logic                 AddendStickyM,  // addend's sticky bit
+    input logic                 NormSumSticky,  // normalized sum's sticky bit
+    input logic                 ZZeroM,         // is Z zero
+    input logic                 InvZM,          // invert Z
+    input logic  [`NE+1:0]      SumExp,         // exponent of the normalized sum
+    input logic                 ResultSgnTmp,      // the result's sign
+    output logic                CalcPlus1, UfPlus1,  // do you add or subtract one from the result
+    output logic [`NE+1:0]      FullResultExp,      // ResultExp with bits to determine sign and overflow
+    output logic [`NF-1:0]      ResultFrac,         // Result fraction
+    output logic [`NE-1:0]      ResultExp,          // Result exponent
+    output logic                Sticky,             // sticky bit
+    output logic [`FLEN:0]      RoundAdd,           // how much to add to the result
+    output logic                Round, Guard, UfLSBNormSum // bits needed to calculate rounding
 );
    logic           LSBNormSum;         // bit used for rounding - least significant bit of the normalized sum
    logic           SubBySmallNum, UfSubBySmallNum;  // was there supposed to be a subtraction by a small number
@ -676,18 +793,146 @@ module fmaround(
    //      101 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
    //      110/111 - Plus1

-    // determine guard, round, and least significant bit of the result
-    assign Guard = FmtM ? NormSum[2] : NormSum[31];
-    assign Round = FmtM ? NormSum[1] : NormSum[30];
-    assign LSBNormSum = FmtM ? NormSum[3] : NormSum[32];
+    if (`FPSIZES == 1) begin // pick the rounding bits out of NormSum at the position set by the selected format's fraction length
+        // determine guard, round, and least significant bit of the result
+        assign Guard = NormSum[2];
+        assign Round = NormSum[1];
+        assign LSBNormSum = NormSum[3];
+
+        // used to determine underflow flag (same bits, one position lower)
+        assign UfGuard = NormSum[1];
+        assign UfRound = NormSum[0];
+        assign UfLSBNormSum = NormSum[2];
+
+        // determine sticky
+        assign Sticky = UfSticky | NormSum[0];
+
+    end else if (`FPSIZES == 2) begin
+        //         \/-------------NF---------------,
+        //      |      NF1       | 3 |             |
+        //          '-------NF1------^
+
+        // determine guard, round, and least significant bit of the result
+        assign Guard = FmtM ? NormSum[2] : NormSum[`NF-`NF1+2];
+        assign Round = FmtM ? NormSum[1] : NormSum[`NF-`NF1+1];
+        assign LSBNormSum = FmtM ? NormSum[3] : NormSum[`NF-`NF1+3];
+
+        // used to determine underflow flag (same bits, one position lower)
+        assign UfGuard = FmtM ? NormSum[1] : NormSum[`NF-`NF1+1];
+        assign UfRound = FmtM ? NormSum[0] : NormSum[`NF-`NF1];
+        assign UfLSBNormSum = FmtM ? NormSum[2] : NormSum[`NF-`NF1+2];
+
+        // determine sticky
+        assign Sticky = UfSticky | (FmtM ? NormSum[0] : NormSum[`NF-`NF1]);
+
+    end else if (`FPSIZES == 3) begin
+        always_comb begin // plain blocking assignments: procedural "assign" is deprecated and poorly supported by synthesis
+            case (FmtM)
+                `FMT: begin
+                    // determine guard, round, and least significant bit of the result
+                    Guard = NormSum[2];
+                    Round = NormSum[1];
+                    LSBNormSum = NormSum[3];
+                    // used to determine underflow flag
+                    UfGuard = NormSum[1];
+                    UfRound = NormSum[0];
+                    UfLSBNormSum = NormSum[2];
+                    // determine sticky
+                    Sticky = UfSticky | NormSum[0];
+                end
+                `FMT1: begin
+                    // determine guard, round, and least significant bit of the result
+                    Guard = NormSum[`NF-`NF1+2];
+                    Round = NormSum[`NF-`NF1+1];
+                    LSBNormSum = NormSum[`NF-`NF1+3];
+                    // used to determine underflow flag
+                    UfGuard = NormSum[`NF-`NF1+1];
+                    UfRound = NormSum[`NF-`NF1];
+                    UfLSBNormSum = NormSum[`NF-`NF1+2];
+                    // determine sticky
+                    Sticky = UfSticky | NormSum[`NF-`NF1];
+                end
+                `FMT2: begin
+                    // determine guard, round, and least significant bit of the result
+                    Guard = NormSum[`NF-`NF2+2];
+                    Round = NormSum[`NF-`NF2+1];
+                    LSBNormSum = NormSum[`NF-`NF2+3];
+                    // used to determine underflow flag
+                    UfGuard = NormSum[`NF-`NF2+1];
+                    UfRound = NormSum[`NF-`NF2];
+                    UfLSBNormSum = NormSum[`NF-`NF2+2];
+                    // determine sticky
+                    Sticky = UfSticky | NormSum[`NF-`NF2];
+                end
+                default: begin // unused format encoding: drive don't-cares
+                    Guard = 1'bx;
+                    Round = 1'bx;
+                    LSBNormSum = 1'bx;
+                    UfGuard = 1'bx;
+                    UfRound = 1'bx;
+                    UfLSBNormSum = 1'bx;
+                    Sticky = 1'bx;
+                end
+            endcase
+        end
+
+    end else begin
+        always_comb begin // all four FmtM encodings are listed, so no default is required
+            case (FmtM)
+                2'h3: begin
+                    // determine guard, round, and least significant bit of the result
+                    Guard = NormSum[2];
+                    Round = NormSum[1];
+                    LSBNormSum = NormSum[3];
+                    // used to determine underflow flag
+                    UfGuard = NormSum[1];
+                    UfRound = NormSum[0];
+                    UfLSBNormSum = NormSum[2];
+                    // determine sticky
+                    Sticky = UfSticky | NormSum[0];
+                end
+                2'h1: begin
+                    // determine guard, round, and least significant bit of the result
+                    Guard = NormSum[`NF-`D_NF+2];
+                    Round = NormSum[`NF-`D_NF+1];
+                    LSBNormSum = NormSum[`NF-`D_NF+3];
+                    // used to determine underflow flag
+                    UfGuard = NormSum[`NF-`D_NF+1];
+                    UfRound = NormSum[`NF-`D_NF];
+                    UfLSBNormSum = NormSum[`NF-`D_NF+2];
+                    // determine sticky
+                    Sticky = UfSticky | NormSum[`NF-`D_NF];
+                end
+                2'h0: begin
+                    // determine guard, round, and least significant bit of the result
+                    Guard = NormSum[`NF-`S_NF+2];
+                    Round = NormSum[`NF-`S_NF+1];
+                    LSBNormSum = NormSum[`NF-`S_NF+3];
+                    // used to determine underflow flag
+                    UfGuard = NormSum[`NF-`S_NF+1];
+                    UfRound = NormSum[`NF-`S_NF];
+                    UfLSBNormSum = NormSum[`NF-`S_NF+2];
+                    // determine sticky
+                    Sticky = UfSticky | NormSum[`NF-`S_NF];
+                end
+                2'h2: begin
+                    // determine guard, round, and least significant bit of the result
+                    Guard = NormSum[`NF-`H_NF+2];
+                    Round = NormSum[`NF-`H_NF+1];
+                    LSBNormSum = NormSum[`NF-`H_NF+3];
+                    // used to determine underflow flag
+                    UfGuard = NormSum[`NF-`H_NF+1];
+                    UfRound = NormSum[`NF-`H_NF];
+                    UfLSBNormSum = NormSum[`NF-`H_NF+2];
+                    // determine sticky
+                    Sticky = UfSticky | NormSum[`NF-`H_NF];
+                end
+            endcase
+        end
+
+    end

-    // used to determine underflow flag
-    assign UfGuard = FmtM ? NormSum[1] : NormSum[30];
-    assign UfRound = FmtM ? NormSum[0] : NormSum[29];
-    assign UfLSBNormSum = FmtM ? NormSum[2] : NormSum[31];

-    // determine sticky
-    assign Sticky = UfSticky | NormSum[0];
    // Deterimine if a small number was supposed to be subtrated
    assign SubBySmallNum = AddendStickyM & InvZM & ~(NormSumSticky|UfRound) & ~ZZeroM; //***here
    assign UfSubBySmallNum = AddendStickyM & InvZM & ~(NormSumSticky) & ~ZZeroM; //***here
@ -729,10 +974,40 @@ module fmaround(
    assign Minus1 = CalcMinus1 & (Sticky | Guard | Round);

    // Compute rounded result
-    assign RoundAdd = FmtM ? Minus1 ? {`FLEN+1{1'b1}} : {{{`FLEN{1'b0}}}, Plus1} :
-                             Minus1 ? {{36{1'b1}}, 29'b0} : {35'b0, Plus1, 29'b0};
-    assign NormSumTruncated = {NormSum[`NF+2:32], NormSum[31:3]&{29{FmtM}}};
+    if (`FPSIZES == 1) begin // RoundAdd is all ones (-1) for Minus1, otherwise Plus1 placed at the selected format's fraction LSB
+        assign RoundAdd = Minus1 ? {`FLEN+1{1'b1}} : {{`FLEN{1'b0}}, Plus1};
 
+    end else if (`FPSIZES == 2) begin
+        // \/FLEN+1
+        //  | NE+2 |        NF      |
+        //  '-NE+2-^----NF1----^
+        // `FLEN+1-`NE-2-`NF1 = FLEN-1-NE-NF1 low bits are padded with zeros for the smaller format
+        assign RoundAdd = FmtM ? Minus1 ? {`FLEN+1{1'b1}} : {{{`FLEN{1'b0}}}, Plus1} :
+                                Minus1 ? {{`NE+2+`NF1{1'b1}}, (`FLEN-1-`NE-`NF1)'(0)} : {(`NE+1+`NF1)'(0), Plus1, (`FLEN-1-`NE-`NF1)'(0)};
+
+    end else if (`FPSIZES == 3) begin
+        always_comb begin // plain blocking assignments: procedural "assign" is deprecated and poorly supported by synthesis
+            case (FmtM)
+                `FMT: RoundAdd = Minus1 ? {`FLEN+1{1'b1}} : {{{`FLEN{1'b0}}}, Plus1};
+                `FMT1: RoundAdd = Minus1 ? {{`NE+2+`NF1{1'b1}}, (`FLEN-1-`NE-`NF1)'(0)} : {(`NE+1+`NF1)'(0), Plus1, (`FLEN-1-`NE-`NF1)'(0)};
+                `FMT2: RoundAdd = Minus1 ? {{`NE+2+`NF2{1'b1}}, (`FLEN-1-`NE-`NF2)'(0)} : {(`NE+1+`NF2)'(0), Plus1, (`FLEN-1-`NE-`NF2)'(0)};
+                default: RoundAdd = (`FLEN+1)'(0); // unused format encoding: add nothing
+            endcase
+        end
+
+    end else begin        
+        always_comb begin // all four FmtM encodings are listed, so no default is required
+            case (FmtM)
+                2'h3: RoundAdd = Minus1 ? {`FLEN+1{1'b1}} : {{{`FLEN{1'b0}}}, Plus1};
+                2'h1: RoundAdd = Minus1 ? {{`NE+2+`D_NF{1'b1}}, (`FLEN-1-`NE-`D_NF)'(0)} : {(`NE+1+`D_NF)'(0), Plus1, (`FLEN-1-`NE-`D_NF)'(0)};
+                2'h0: RoundAdd = Minus1 ? {{`NE+2+`S_NF{1'b1}}, (`FLEN-1-`NE-`S_NF)'(0)} : {(`NE+1+`S_NF)'(0), Plus1, (`FLEN-1-`NE-`S_NF)'(0)};
+                2'h2: RoundAdd = Minus1 ? {{`NE+2+`H_NF{1'b1}}, (`FLEN-1-`NE-`H_NF)'(0)} : {(`NE+1+`H_NF)'(0), Plus1, (`FLEN-1-`NE-`H_NF)'(0)};
+            endcase
+        end
+
+    end
+
+    assign NormSumTruncated = NormSum[`NF+2:3];
    assign {FullResultExp, ResultFrac} = {SumExp, NormSumTruncated} + RoundAdd;
    assign ResultExp = FullResultExp[`NE-1:0];

@ -748,7 +1023,7 @@ module fmaflags(
    input logic  [`NE+1:0]      SumExp,                 // exponent of the normalized sum
    input logic                 ZSgnEffM, PSgnM,        // the product and modified Z signs
    input logic                 Round, Guard, UfLSBNormSum, Sticky, UfPlus1, // bits used to determine rounding
-    input logic                 FmtM,                   // precision 1 = double 0 = single
+    input logic  [`FPSIZES/3:0] FmtM,                   // format select: with two formats 1 = larger (`NE) and 0 = smaller (`NE1); with more formats it is decoded by the case statements in this module
    output logic                Invalid, Overflow, Underflow, // flags used to select the result
    output logic [4:0]          FMAFlgM // FMA flags
 );
@ -771,8 +1046,34 @@ module fmaflags(
    assign Invalid = SigNaN | ((XInfM | YInfM) & ZInfM & (PSgnM ^ ZSgnEffM) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
   
    // Set Overflow flag if the number is too big to be represented
-    //      - Don't set the overflow flag if an overflowed result isn't outputed
-    assign GtMaxExp = FmtM ? &FullResultExp[`NE-1:0] | FullResultExp[`NE] : &FullResultExp[7:0] | FullResultExp[8];
+    //      - Don't set the overflow flag if an overflowed result isn't outputed    
+    if (`FPSIZES == 1) begin // GtMaxExp: the selected format's exponent field is all ones, or the bit just above it is set
+        assign GtMaxExp = &FullResultExp[`NE-1:0] | FullResultExp[`NE];
+
+    end else if (`FPSIZES == 2) begin
+        assign GtMaxExp = FmtM ? &FullResultExp[`NE-1:0] | FullResultExp[`NE] : &FullResultExp[`NE1-1:0] | FullResultExp[`NE1];
+
+    end else if (`FPSIZES == 3) begin
+        always_comb begin // plain blocking assignments: procedural "assign" is deprecated and poorly supported by synthesis
+            case (FmtM)
+                `FMT: GtMaxExp =  &FullResultExp[`NE-1:0] | FullResultExp[`NE];
+                `FMT1: GtMaxExp = &FullResultExp[`NE1-1:0] | FullResultExp[`NE1];
+                `FMT2: GtMaxExp = &FullResultExp[`NE2-1:0] | FullResultExp[`NE2];
+                default: GtMaxExp = 1'bx; // unused format encoding
+            endcase
+        end
+
+    end else begin        
+        always_comb begin // all four FmtM encodings are listed, so no default is required
+            case (FmtM)
+                2'h3: GtMaxExp =  &FullResultExp[`NE-1:0] | FullResultExp[`NE];
+                2'h1: GtMaxExp = &FullResultExp[`D_NE-1:0] | FullResultExp[`D_NE];
+                2'h0: GtMaxExp = &FullResultExp[`S_NE-1:0] | FullResultExp[`S_NE];
+                2'h2: GtMaxExp = &FullResultExp[`H_NE-1:0] | FullResultExp[`H_NE];
+            endcase
+        end
+
+    end
    assign Overflow = GtMaxExp & ~FullResultExp[`NE+1]&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);

    // Set Underflow flag if the number is too small to be represented in normal numbers
@ -793,57 +1094,227 @@ endmodule


 module resultselect(
-    input logic                 XSgnM, YSgnM,        // input signs
-    input logic     [`NE-1:0]   XExpM, YExpM, ZExpM, // input exponents
-    input logic     [`NF:0]     XManM, YManM, ZManM, // input mantissas
-    input logic     [2:0]       FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
-    input logic                 FmtM,       // precision 1 = double 0 = single
-    input logic                 AddendStickyM,  // sticky bit that is calculated during alignment
-    input logic                 KillProdM,      // set the product to zero before addition if the product is too small to matter
-    input logic                 XInfM, YInfM, ZInfM,    // inputs are infinity
-    input logic                 XNaNM, YNaNM, ZNaNM,    // inputs are NaN
-    input logic                 ZSgnEffM,   // the modified Z sign - depends on instruction
-    input logic                 PSgnM,      // the product's sign
-    input logic                 ResultSgn,  // the result's sign
-    input logic                 CalcPlus1,  // rounding bits
-    input logic     [`FLEN:0]   RoundAdd,   // how much to add to the result
-    input logic                 Invalid, Overflow, Underflow,  // flags
-    input logic                 ResultDenorm,       // is the result denormalized
-    input logic     [`NE-1:0]   ResultExp,          // Result exponent
-    input logic     [`NF-1:0]   ResultFrac,         // Result fraction
-    output logic    [`FLEN-1:0] FMAResM     // FMA final result
+    input logic                     XSgnM, YSgnM,        // input signs
+    input logic     [`NE-1:0]       XExpM, YExpM, ZExpM, // input exponents
+    input logic     [`NF:0]         XManM, YManM, ZManM, // input mantissas
+    input logic     [2:0]           FrmM,       // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic     [`FPSIZES/3:0]  FmtM,       // precision select - 1 bit when FPSIZES <= 2 (1 = largest format), 2 bits otherwise
+    input logic                     AddendStickyM,  // sticky bit that is calculated during alignment
+    input logic                     KillProdM,      // set the product to zero before addition if the product is too small to matter
+    input logic                     XInfM, YInfM, ZInfM,    // inputs are infinity
+    input logic                     XNaNM, YNaNM, ZNaNM,    // inputs are NaN
+    input logic                     ZSgnEffM,   // the modified Z sign - depends on instruction
+    input logic                     PSgnM,      // the product's sign
+    input logic                     ResultSgn,  // the result's sign
+    input logic                     CalcPlus1,  // rounding bits
+    input logic     [`FLEN:0]       RoundAdd,   // how much to add to the result
+    input logic                     Invalid, Overflow, Underflow,  // flags
+    input logic                     ResultDenorm,       // is the result denormalized
+    input logic     [`NE-1:0]       ResultExp,          // Result exponent
+    input logic     [`NF-1:0]       ResultFrac,         // Result fraction
+    output logic    [`FLEN-1:0]     FMAResM     // FMA final result
 );
-    logic [`FLEN-1:0]   XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results
+    // Builds each candidate result (NaN, invalid, infinity, killed product, overflow, underflow, normal) in the format
+    // picked by FmtM - narrower formats are padded on the left with 1s up to FLEN - then selects one by priority at the end.
+    logic               InfSgn;     // sign of an infinite result
+    logic               OvfMaxNum;  // an overflow produces the largest finite number rather than infinity
+    logic [`FLEN-1:0]   XNaNResult, YNaNResult, ZNaNResult, InfResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult, NormResult; // possible results
+    // KillProdResult: Z's exponent and fraction, plus RoundAdd when the alignment sticky bit is set
+    //      - narrower formats rebuild Z's exponent from its top bit and its low bits
+    // UnderflowResult: signed zero, plus one in the last place when CalcPlus1&(AddendStickyM|FrmM[1])
+    assign InfSgn = ZInfM ? ZSgnEffM : PSgnM; // an infinite Z supplies the sign, otherwise the product does
+    // round towards zero, round down with ResultSgn clear, or round up with ResultSgn set
+    assign OvfMaxNum = (FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn);
+
+    if (`FPSIZES == 1) begin // one supported format - results use the full FLEN
+        if(`IEEE754) begin // NaN results keep the input's sign and low fraction bits, with the top fraction bit forced to 1
+            assign XNaNResult = {XSgnM, {`NE{1'b1}}, 1'b1, XManM[`NF-2:0]};
+            assign YNaNResult = {YSgnM, {`NE{1'b1}}, 1'b1, YManM[`NF-2:0]};
+            assign ZNaNResult = {ZSgnEffM, {`NE{1'b1}}, 1'b1, ZManM[`NF-2:0]};
+            assign InvalidResult = {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}};
+        end else begin // all NaN results share one pattern: sign 0, exponent all 1s, only the top fraction bit set
+            assign XNaNResult = {1'b0, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}};
+        end
+        assign OverflowResult = OvfMaxNum ? {ResultSgn, {`NE-1{1'b1}}, 1'b0, {`NF{1'b1}}} : {ResultSgn, {`NE{1'b1}}, {`NF{1'b0}}};
+        assign KillProdResult = {ResultSgn, {ZExpM, ZManM[`NF-1:0]} + (RoundAdd[`FLEN-2:0]&{`FLEN-1{AddendStickyM}})};
+        assign UnderflowResult = {ResultSgn, {`FLEN-1{1'b0}}} + {(`FLEN-1)'(0),(CalcPlus1&(AddendStickyM|FrmM[1]))};
+        assign InfResult = {InfSgn, {`NE{1'b1}}, (`NF)'(0)};
+        assign NormResult = {ResultSgn, ResultExp, ResultFrac};
+
+    end else if (`FPSIZES == 2) begin // two formats - FmtM = 1 uses `NE/`NF, FmtM = 0 uses `NE1/`NF1
+        if(`IEEE754) begin
+            assign XNaNResult = FmtM ? {XSgnM, {`NE{1'b1}}, 1'b1, XManM[`NF-2:0]} : {{`FLEN-`LEN1{1'b1}}, XSgnM, {`NE1{1'b1}}, 1'b1, XManM[`NF-2:`NF-`NF1]};
+            assign YNaNResult = FmtM ? {YSgnM, {`NE{1'b1}}, 1'b1, YManM[`NF-2:0]} : {{`FLEN-`LEN1{1'b1}}, YSgnM, {`NE1{1'b1}}, 1'b1, YManM[`NF-2:`NF-`NF1]};
+            assign ZNaNResult = FmtM ? {ZSgnEffM, {`NE{1'b1}}, 1'b1, ZManM[`NF-2:0]} : {{`FLEN-`LEN1{1'b1}}, ZSgnEffM, {`NE1{1'b1}}, 1'b1, ZManM[`NF-2:`NF-`NF1]};
+            assign InvalidResult = FmtM ? {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}} : {{`FLEN-`LEN1{1'b1}}, ResultSgn, {`NE1{1'b1}}, 1'b1, (`NF1-1)'(0)};
+        end else begin
+            assign XNaNResult = FmtM ? {1'b0, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}} : {{`FLEN-`LEN1{1'b1}}, 1'b0, {`NE1{1'b1}}, 1'b1, (`NF1-1)'(0)};
+        end
+        assign OverflowResult = FmtM ? (OvfMaxNum ? {ResultSgn, {`NE-1{1'b1}}, 1'b0, {`NF{1'b1}}} : {ResultSgn, {`NE{1'b1}}, {`NF{1'b0}}}) :
+                                       (OvfMaxNum ? {{`FLEN-`LEN1{1'b1}}, ResultSgn, {`NE1-1{1'b1}}, 1'b0, {`NF1{1'b1}}} : {{`FLEN-`LEN1{1'b1}}, ResultSgn, {`NE1{1'b1}}, (`NF1)'(0)});
+        // *** will the format conversion in killprod work in other conversions?
+        assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZManM[`NF-1:0]} + (RoundAdd[`FLEN-2:0]&{`FLEN-1{AddendStickyM}})} : {{`FLEN-`LEN1{1'b1}}, ResultSgn, {ZExpM[`NE-1], ZExpM[`NE1-2:0], ZManM[`NF-1:`NF-`NF1]} + (RoundAdd[`NF-`NF1+`LEN1-2:`NF-`NF1]&{`LEN1-1{AddendStickyM}})};
+        assign UnderflowResult = FmtM ? {ResultSgn, {`FLEN-1{1'b0}}} + {(`FLEN-1)'(0),(CalcPlus1&(AddendStickyM|FrmM[1]))} : {{`FLEN-`LEN1{1'b1}}, {ResultSgn, (`LEN1-1)'(0)} + {(`LEN1-1)'(0), (CalcPlus1&(AddendStickyM|FrmM[1]))}};
+        assign InfResult = FmtM ? {InfSgn, {`NE{1'b1}}, (`NF)'(0)} : {{`FLEN-`LEN1{1'b1}}, InfSgn, {`NE1{1'b1}}, (`NF1)'(0)};
+        assign NormResult = FmtM ? {ResultSgn, ResultExp, ResultFrac} : {{`FLEN-`LEN1{1'b1}}, ResultSgn, ResultExp[`NE1-1:0], ResultFrac[`NF-1:`NF-`NF1]};
+
+    end else if (`FPSIZES == 3) begin // three formats selected by `FMT, `FMT1 and `FMT2
+        always_comb begin // blocking assignments only, so the block is plain combinational logic
+            case (FmtM)
+                `FMT: begin // largest format - `NE/`NF
+                    if(`IEEE754) begin
+                        XNaNResult = {XSgnM, {`NE{1'b1}}, 1'b1, XManM[`NF-2:0]};
+                        YNaNResult = {YSgnM, {`NE{1'b1}}, 1'b1, YManM[`NF-2:0]};
+                        ZNaNResult = {ZSgnEffM, {`NE{1'b1}}, 1'b1, ZManM[`NF-2:0]};
+                        InvalidResult = {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}};
+                    end else begin
+                        XNaNResult = {1'b0, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}};
+                    end
+                    OverflowResult = OvfMaxNum ? {ResultSgn, {`NE-1{1'b1}}, 1'b0, {`NF{1'b1}}} : {ResultSgn, {`NE{1'b1}}, {`NF{1'b0}}};
+                    KillProdResult = {ResultSgn, {ZExpM, ZManM[`NF-1:0]} + (RoundAdd[`FLEN-2:0]&{`FLEN-1{AddendStickyM}})};
+                    UnderflowResult = {ResultSgn, {`FLEN-1{1'b0}}} + {(`FLEN-1)'(0),(CalcPlus1&(AddendStickyM|FrmM[1]))};
+                    InfResult = {InfSgn, {`NE{1'b1}}, (`NF)'(0)};
+                    NormResult = {ResultSgn, ResultExp, ResultFrac};
+                end
+
+                `FMT1: begin // `NE1/`NF1/`LEN1
+                    if(`IEEE754) begin
+                        XNaNResult = {{`FLEN-`LEN1{1'b1}}, XSgnM, {`NE1{1'b1}}, 1'b1, XManM[`NF-2:`NF-`NF1]};
+                        YNaNResult = {{`FLEN-`LEN1{1'b1}}, YSgnM, {`NE1{1'b1}}, 1'b1, YManM[`NF-2:`NF-`NF1]};
+                        ZNaNResult = {{`FLEN-`LEN1{1'b1}}, ZSgnEffM, {`NE1{1'b1}}, 1'b1, ZManM[`NF-2:`NF-`NF1]};
+                        InvalidResult = {{`FLEN-`LEN1{1'b1}}, ResultSgn, {`NE1{1'b1}}, 1'b1, (`NF1-1)'(0)};
+                    end else begin
+                        XNaNResult = {{`FLEN-`LEN1{1'b1}}, 1'b0, {`NE1{1'b1}}, 1'b1, (`NF1-1)'(0)};
+                    end
+                    OverflowResult = OvfMaxNum ? {{`FLEN-`LEN1{1'b1}}, ResultSgn, {`NE1-1{1'b1}}, 1'b0, {`NF1{1'b1}}} : {{`FLEN-`LEN1{1'b1}}, ResultSgn, {`NE1{1'b1}}, (`NF1)'(0)};
+                    KillProdResult = {{`FLEN-`LEN1{1'b1}}, ResultSgn, {ZExpM[`NE-1], ZExpM[`NE1-2:0], ZManM[`NF-1:`NF-`NF1]} + (RoundAdd[`NF-`NF1+`LEN1-2:`NF-`NF1]&{`LEN1-1{AddendStickyM}})};
+                    UnderflowResult = {{`FLEN-`LEN1{1'b1}}, {ResultSgn, (`LEN1-1)'(0)} + {(`LEN1-1)'(0), (CalcPlus1&(AddendStickyM|FrmM[1]))}};
+                    InfResult = {{`FLEN-`LEN1{1'b1}}, InfSgn, {`NE1{1'b1}}, (`NF1)'(0)};
+                    NormResult = {{`FLEN-`LEN1{1'b1}}, ResultSgn, ResultExp[`NE1-1:0], ResultFrac[`NF-1:`NF-`NF1]};
+                end
+
+                `FMT2: begin // `NE2/`NF2/`LEN2
+                    if(`IEEE754) begin
+                        XNaNResult = {{`FLEN-`LEN2{1'b1}}, XSgnM, {`NE2{1'b1}}, 1'b1, XManM[`NF-2:`NF-`NF2]};
+                        YNaNResult = {{`FLEN-`LEN2{1'b1}}, YSgnM, {`NE2{1'b1}}, 1'b1, YManM[`NF-2:`NF-`NF2]};
+                        ZNaNResult = {{`FLEN-`LEN2{1'b1}}, ZSgnEffM, {`NE2{1'b1}}, 1'b1, ZManM[`NF-2:`NF-`NF2]};
+                        InvalidResult = {{`FLEN-`LEN2{1'b1}}, ResultSgn, {`NE2{1'b1}}, 1'b1, (`NF2-1)'(0)};
+                    end else begin
+                        XNaNResult = {{`FLEN-`LEN2{1'b1}}, 1'b0, {`NE2{1'b1}}, 1'b1, (`NF2-1)'(0)};
+                    end
+                    OverflowResult = OvfMaxNum ? {{`FLEN-`LEN2{1'b1}}, ResultSgn, {`NE2-1{1'b1}}, 1'b0, {`NF2{1'b1}}} : {{`FLEN-`LEN2{1'b1}}, ResultSgn, {`NE2{1'b1}}, (`NF2)'(0)};
+                    KillProdResult = {{`FLEN-`LEN2{1'b1}}, ResultSgn, {ZExpM[`NE-1], ZExpM[`NE2-2:0], ZManM[`NF-1:`NF-`NF2]} + (RoundAdd[`NF-`NF2+`LEN2-2:`NF-`NF2]&{`LEN2-1{AddendStickyM}})};
+                    UnderflowResult = {{`FLEN-`LEN2{1'b1}}, {ResultSgn, (`LEN2-1)'(0)} + {(`LEN2-1)'(0), (CalcPlus1&(AddendStickyM|FrmM[1]))}};
+                    InfResult = {{`FLEN-`LEN2{1'b1}}, InfSgn, {`NE2{1'b1}}, (`NF2)'(0)};
+                    NormResult = {{`FLEN-`LEN2{1'b1}}, ResultSgn, ResultExp[`NE2-1:0], ResultFrac[`NF-1:`NF-`NF2]};
+                end
+
+                // FmtM matches none of the three formats - drive every candidate to zero
+                default: begin
+                    if(`IEEE754) begin
+                        XNaNResult = (`FLEN)'(0);
+                        YNaNResult = (`FLEN)'(0);
+                        ZNaNResult = (`FLEN)'(0);
+                        InvalidResult = (`FLEN)'(0);
+                    end else begin
+                        XNaNResult = (`FLEN)'(0);
+                    end
+                    OverflowResult = (`FLEN)'(0);
+                    KillProdResult = (`FLEN)'(0);
+                    UnderflowResult = (`FLEN)'(0);
+                    InfResult = (`FLEN)'(0);
+                    NormResult = (`FLEN)'(0);
+                end
+            endcase
+        end
+
+    end else begin // four formats - FmtM 2'h3 = `NE/`NF, 2'h1 = D, 2'h0 = S, 2'h2 = H
+        always_comb begin // blocking assignments only, so the block is plain combinational logic
+            case (FmtM)
+                2'h3: begin // largest format - `NE/`NF
+                    if(`IEEE754) begin
+                        XNaNResult = {XSgnM, {`NE{1'b1}}, 1'b1, XManM[`NF-2:0]};
+                        YNaNResult = {YSgnM, {`NE{1'b1}}, 1'b1, YManM[`NF-2:0]};
+                        ZNaNResult = {ZSgnEffM, {`NE{1'b1}}, 1'b1, ZManM[`NF-2:0]};
+                        InvalidResult = {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}};
+                    end else begin
+                        XNaNResult = {1'b0, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}};
+                    end
+                    OverflowResult = OvfMaxNum ? {ResultSgn, {`NE-1{1'b1}}, 1'b0, {`NF{1'b1}}} : {ResultSgn, {`NE{1'b1}}, {`NF{1'b0}}};
+                    KillProdResult = {ResultSgn, {ZExpM, ZManM[`NF-1:0]} + (RoundAdd[`FLEN-2:0]&{`FLEN-1{AddendStickyM}})};
+                    UnderflowResult = {ResultSgn, {`FLEN-1{1'b0}}} + {(`FLEN-1)'(0),(CalcPlus1&(AddendStickyM|FrmM[1]))};
+                    InfResult = {InfSgn, {`NE{1'b1}}, (`NF)'(0)};
+                    NormResult = {ResultSgn, ResultExp, ResultFrac};
+                end
+
+                2'h1: begin // `D_NE/`D_NF/`D_LEN
+                    if(`IEEE754) begin
+                        XNaNResult = {{`FLEN-`D_LEN{1'b1}}, XSgnM, {`D_NE{1'b1}}, 1'b1, XManM[`NF-2:`NF-`D_NF]};
+                        YNaNResult = {{`FLEN-`D_LEN{1'b1}}, YSgnM, {`D_NE{1'b1}}, 1'b1, YManM[`NF-2:`NF-`D_NF]};
+                        ZNaNResult = {{`FLEN-`D_LEN{1'b1}}, ZSgnEffM, {`D_NE{1'b1}}, 1'b1, ZManM[`NF-2:`NF-`D_NF]};
+                        InvalidResult = {{`FLEN-`D_LEN{1'b1}}, ResultSgn, {`D_NE{1'b1}}, 1'b1, (`D_NF-1)'(0)};
+                    end else begin
+                        XNaNResult = {{`FLEN-`D_LEN{1'b1}}, 1'b0, {`D_NE{1'b1}}, 1'b1, (`D_NF-1)'(0)};
+                    end
+                    OverflowResult = OvfMaxNum ? {{`FLEN-`D_LEN{1'b1}}, ResultSgn, {`D_NE-1{1'b1}}, 1'b0, {`D_NF{1'b1}}} : {{`FLEN-`D_LEN{1'b1}}, ResultSgn, {`D_NE{1'b1}}, (`D_NF)'(0)};
+                    KillProdResult = {{`FLEN-`D_LEN{1'b1}}, ResultSgn, {ZExpM[`NE-1], ZExpM[`D_NE-2:0], ZManM[`NF-1:`NF-`D_NF]} + (RoundAdd[`NF-`D_NF+`D_LEN-2:`NF-`D_NF]&{`D_LEN-1{AddendStickyM}})};
+                    UnderflowResult = {{`FLEN-`D_LEN{1'b1}}, {ResultSgn, (`D_LEN-1)'(0)} + {(`D_LEN-1)'(0), (CalcPlus1&(AddendStickyM|FrmM[1]))}};
+                    InfResult = {{`FLEN-`D_LEN{1'b1}}, InfSgn, {`D_NE{1'b1}}, (`D_NF)'(0)};
+                    NormResult = {{`FLEN-`D_LEN{1'b1}}, ResultSgn, ResultExp[`D_NE-1:0], ResultFrac[`NF-1:`NF-`D_NF]};
+                end
+
+                2'h0: begin // `S_NE/`S_NF/`S_LEN
+                    if(`IEEE754) begin
+                        XNaNResult = {{`FLEN-`S_LEN{1'b1}}, XSgnM, {`S_NE{1'b1}}, 1'b1, XManM[`NF-2:`NF-`S_NF]};
+                        YNaNResult = {{`FLEN-`S_LEN{1'b1}}, YSgnM, {`S_NE{1'b1}}, 1'b1, YManM[`NF-2:`NF-`S_NF]};
+                        ZNaNResult = {{`FLEN-`S_LEN{1'b1}}, ZSgnEffM, {`S_NE{1'b1}}, 1'b1, ZManM[`NF-2:`NF-`S_NF]};
+                        InvalidResult = {{`FLEN-`S_LEN{1'b1}}, ResultSgn, {`S_NE{1'b1}}, 1'b1, (`S_NF-1)'(0)};
+                    end else begin
+                        XNaNResult = {{`FLEN-`S_LEN{1'b1}}, 1'b0, {`S_NE{1'b1}}, 1'b1, (`S_NF-1)'(0)};
+                    end
+                    OverflowResult = OvfMaxNum ? {{`FLEN-`S_LEN{1'b1}}, ResultSgn, {`S_NE-1{1'b1}}, 1'b0, {`S_NF{1'b1}}} : {{`FLEN-`S_LEN{1'b1}}, ResultSgn, {`S_NE{1'b1}}, (`S_NF)'(0)};
+                    KillProdResult = {{`FLEN-`S_LEN{1'b1}}, ResultSgn, {ZExpM[`NE-1], ZExpM[`S_NE-2:0], ZManM[`NF-1:`NF-`S_NF]} + (RoundAdd[`NF-`S_NF+`S_LEN-2:`NF-`S_NF]&{`S_LEN-1{AddendStickyM}})};
+                    UnderflowResult = {{`FLEN-`S_LEN{1'b1}}, {ResultSgn, (`S_LEN-1)'(0)} + {(`S_LEN-1)'(0), (CalcPlus1&(AddendStickyM|FrmM[1]))}};
+                    InfResult = {{`FLEN-`S_LEN{1'b1}}, InfSgn, {`S_NE{1'b1}}, (`S_NF)'(0)};
+                    NormResult = {{`FLEN-`S_LEN{1'b1}}, ResultSgn, ResultExp[`S_NE-1:0], ResultFrac[`NF-1:`NF-`S_NF]};
+                end
+
+                2'h2: begin // `H_NE/`H_NF/`H_LEN
+                    if(`IEEE754) begin
+                        XNaNResult = {{`FLEN-`H_LEN{1'b1}}, XSgnM, {`H_NE{1'b1}}, 1'b1, XManM[`NF-2:`NF-`H_NF]};
+                        YNaNResult = {{`FLEN-`H_LEN{1'b1}}, YSgnM, {`H_NE{1'b1}}, 1'b1, YManM[`NF-2:`NF-`H_NF]};
+                        ZNaNResult = {{`FLEN-`H_LEN{1'b1}}, ZSgnEffM, {`H_NE{1'b1}}, 1'b1, ZManM[`NF-2:`NF-`H_NF]};
+                        InvalidResult = {{`FLEN-`H_LEN{1'b1}}, ResultSgn, {`H_NE{1'b1}}, 1'b1, (`H_NF-1)'(0)};
+                    end else begin
+                        XNaNResult = {{`FLEN-`H_LEN{1'b1}}, 1'b0, {`H_NE{1'b1}}, 1'b1, (`H_NF-1)'(0)};
+                    end
+                    OverflowResult = OvfMaxNum ? {{`FLEN-`H_LEN{1'b1}}, ResultSgn, {`H_NE-1{1'b1}}, 1'b0, {`H_NF{1'b1}}} : {{`FLEN-`H_LEN{1'b1}}, ResultSgn, {`H_NE{1'b1}}, (`H_NF)'(0)};
+                    KillProdResult = {{`FLEN-`H_LEN{1'b1}}, ResultSgn, {ZExpM[`NE-1], ZExpM[`H_NE-2:0], ZManM[`NF-1:`NF-`H_NF]} + (RoundAdd[`NF-`H_NF+`H_LEN-2:`NF-`H_NF]&{`H_LEN-1{AddendStickyM}})};
+                    UnderflowResult = {{`FLEN-`H_LEN{1'b1}}, {ResultSgn, (`H_LEN-1)'(0)} + {(`H_LEN-1)'(0), (CalcPlus1&(AddendStickyM|FrmM[1]))}};
+                    InfResult = {{`FLEN-`H_LEN{1'b1}}, InfSgn, {`H_NE{1'b1}}, (`H_NF)'(0)};
+                    NormResult = {{`FLEN-`H_LEN{1'b1}}, ResultSgn, ResultExp[`H_NE-1:0], ResultFrac[`NF-1:`NF-`H_NF]};
+                end
+            endcase
+        end

-    if(`IEEE754) begin
-        assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, XManM[`NF-2:0]} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, XManM[50:29]};
-        assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, YManM[`NF-2:0]} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, YManM[50:29]};
-        assign ZNaNResult = FmtM ? {ZSgnEffM, ZExpM, 1'b1, ZManM[`NF-2:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], 1'b1, ZManM[50:29]};
-        assign InvalidResult = FmtM ? {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}} : {{32{1'b1}}, ResultSgn, 8'hff, 1'b1, 22'b0};
-      end else begin
-        assign XNaNResult = FmtM ? {1'b0, XExpM, 1'b1, 51'b0} : {{32{1'b1}}, 1'b0, XExpM[7:0], 1'b1, 22'b0};
-        assign YNaNResult = FmtM ? {1'b0, YExpM, 1'b1, 51'b0} : {{32{1'b1}}, 1'b0, YExpM[7:0], 1'b1, 22'b0};
-        assign ZNaNResult = FmtM ? {1'b0, ZExpM, 1'b1, 51'b0} : {{32{1'b1}}, 1'b0, ZExpM[7:0], 1'b1, 22'b0};
-        assign InvalidResult = FmtM ? {1'b0, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}} : {{32{1'b1}}, 1'b0, 8'hff, 1'b1, 22'b0};
    end
-     
-    assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, {`NE-1{1'b1}}, 1'b0, {`NF{1'b1}}} :
-                                                                                                                          {ResultSgn, {`NE{1'b1}}, {`NF{1'b0}}} :
-                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {{32{1'b1}}, ResultSgn, 8'hfe, {23{1'b1}}} :
-                                                                                                                          {{32{1'b1}}, ResultSgn, 8'hff, 23'b0};
-    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZManM[`NF-1:0]} + (RoundAdd[`FLEN-2:0]&{`FLEN-1{AddendStickyM}})} : {{32{1'b1}}, ResultSgn, {ZExpM[`NE-1],ZExpM[6:0], ZManM[51:29]} + (RoundAdd[59:29]&{31{AddendStickyM}})};
-    assign UnderflowResult = FmtM ? {ResultSgn, {`FLEN-1{1'b0}}} + {63'b0,(CalcPlus1&(AddendStickyM|FrmM[1]))} : {{32{1'b1}}, {ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}};
-    assign FMAResM = XNaNM ? XNaNResult :
-                        YNaNM ? YNaNResult :
-                        ZNaNM ? ZNaNResult :
-                        Invalid ? InvalidResult :
-                        XInfM ? FmtM ? {PSgnM, XExpM, XManM[`NF-1:0]} : {{32{1'b1}}, PSgnM,  XExpM[7:0], XManM[51:29]} : 
-                        YInfM ? FmtM ? {PSgnM, YExpM, YManM[`NF-1:0]} : {{32{1'b1}}, PSgnM,  YExpM[7:0], YManM[51:29]} :
-                        ZInfM ? FmtM ? {ZSgnEffM, ZExpM, ZManM[`NF-1:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], ZManM[51:29]} :
-                        KillProdM ? KillProdResult :  
-			            Overflow ? OverflowResult :
-                        Underflow & ~ResultDenorm & (ResultExp!=1) ? UnderflowResult :  
-                        FmtM ? {ResultSgn, ResultExp, ResultFrac} :
-                               {{32{1'b1}}, ResultSgn, ResultExp[7:0], ResultFrac[51:29]};
+    if(`IEEE754) begin // priority select - NaN inputs first, then invalid, infinity, killed product, overflow, underflow
+        assign FMAResM = XNaNM ? XNaNResult :
+                            YNaNM ? YNaNResult :
+                            ZNaNM ? ZNaNResult :
+                            Invalid ? InvalidResult :
+                            XInfM|YInfM|ZInfM ? InfResult :
+                            KillProdM ? KillProdResult :
+                            Overflow ? OverflowResult :
+                            Underflow & ~ResultDenorm & (ResultExp!=1) ? UnderflowResult :
+                            NormResult;
+    end else begin // any NaN input or invalid operation returns the single NaN pattern held in XNaNResult
+        assign FMAResM = XNaNM|YNaNM|ZNaNM|Invalid ? XNaNResult :
+                            XInfM|YInfM|ZInfM ? InfResult :
+                            KillProdM ? KillProdResult :
+                            Overflow ? OverflowResult :
+                            Underflow & ~ResultDenorm & (ResultExp!=1) ? UnderflowResult :
+                            NormResult;
+    end

 endmodule
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@ -89,7 +89,6 @@ module fpu (
   logic [10:0] 	  XExpM, YExpM, ZExpM;                // input's exponent - memory stage
   logic [52:0] 	  XManE, YManE, ZManE;                // input's fraction - execute stage
   logic [52:0] 	  XManM, YManM, ZManM;                // input's fraction - memory stage
-   logic [10:0] 	  BiasE;                              // bias based on precision (single=7f double=3ff)
   logic 		  XNaNE, YNaNE, ZNaNE;                // is the input a NaN - execute stage
   logic 		  XNaNM, YNaNM, ZNaNM;                // is the input a NaN - memory stage
   logic 		  XNaNQ, YNaNQ;                       // is the input a NaN - divide
@ -176,10 +175,10 @@ module fpu (
   // unpack unit
   //    - splits FP inputs into their various parts
   //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity)
-   unpack unpack (.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
+   unpack unpack (.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FmtE, 
         .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
         .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
-         .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
+         .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);

   // FMA
   //   - two stage FMA
@ -231,7 +230,7 @@ module fpu (
         .XSNaNE, .ClassResE);

   // Convert
-   fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .ForwardedSrcAE, .FOpCtrlE, .FmtE, .FrmE,
+   fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .ForwardedSrcAE, .FOpCtrlE, .FmtE, .FrmE,
   .CvtResE, .CvtFlgE);

   // data to be stored in memory - to IEU
--- a/pipelined/src/fpu/unpack.sv
+++ b/pipelined/src/fpu/unpack.sv
@ -0,0 +1,473 @@
+`include "wally-config.vh"
+// unpack: split FP operands X, Y, Z into sign, exponent and mantissa (in the largest supported precision) and classify them
+module unpack ( 
+    input logic  [`FLEN-1:0]        X, Y, Z,    // inputs from register file
+    input logic  [`FPSIZES/3:0]     FmtE,       // format signal 00 - single 01 - double 11 - quad 10 - half (1 bit when FPSIZES <= 2: 1 - larger 0 - smaller precision)
+    output logic                    XSgnE, YSgnE, ZSgnE,    // sign bits of XYZ
+    output logic [`NE-1:0]          XExpE, YExpE, ZExpE,    // exponents of XYZ (converted to largest supported precision)
+    output logic [`NF:0]            XManE, YManE, ZManE,    // mantissas of XYZ (converted to largest supported precision), MSB is the assumed 1 (0 if the exponent is zero)
+    output logic                    XNormE,                 // is X a normalized number (exponent neither all 0's nor all 1's)
+    output logic                    XNaNE, YNaNE, ZNaNE,    // is XYZ a NaN
+    output logic                    XSNaNE, YSNaNE, ZSNaNE, // is XYZ a signaling NaN
+    output logic                    XDenormE, YDenormE, ZDenormE,   // is XYZ denormalized
+    output logic                    XZeroE, YZeroE, ZZeroE,         // is XYZ zero
+    output logic                    XInfE, YInfE, ZInfE,            // is XYZ infinity
+    output logic                    XExpMaxE                        // does X have the maximum exponent (NaN or Inf)
+);
+ 
+    logic [`NF-1:0] XFracE, YFracE, ZFracE; // fraction of XYZ (no assumed 1, padded to the largest supported precision)
+    logic           XExpNonzero, YExpNonzero, ZExpNonzero; // is the exponent of XYZ non-zero
+    logic           XFracZero, YFracZero, ZFracZero; // is the fraction zero
+    logic           XExpZero, YExpZero, ZExpZero; // is the exponent zero
+    logic           YExpMaxE, ZExpMaxE;  // is the exponent of Y/Z all 1s
+    
+    if (`FPSIZES == 1) begin        // if there is only one floating point format supported
+
+        // Only one format is supported, so every field sits at a fixed position:
+        //      sign = bit `FLEN-1 | exponent = bits [`FLEN-2:`NF] | fraction = bits [`NF-1:0]
+        // Per operand: sign, exponent, fraction (no assumed 1), exponent != 0, exponent all 1's
+
+        // operand X
+        assign XSgnE       = X[`FLEN-1];
+        assign XExpE       = X[`FLEN-2:`NF];
+        assign XFracE      = X[`NF-1:0];
+        assign XExpNonzero = |XExpE;
+        assign XExpMaxE    = &XExpE;
+
+        // operand Y
+        assign YSgnE       = Y[`FLEN-1];
+        assign YExpE       = Y[`FLEN-2:`NF];
+        assign YFracE      = Y[`NF-1:0];
+        assign YExpNonzero = |YExpE;
+        assign YExpMaxE    = &YExpE;
+
+        // operand Z
+        assign ZSgnE       = Z[`FLEN-1];
+        assign ZExpE       = Z[`FLEN-2:`NF];
+        assign ZFracE      = Z[`NF-1:0];
+        assign ZExpNonzero = |ZExpE;
+        assign ZExpMaxE    = &ZExpE;
+    
+
+    end else if (`FPSIZES == 2) begin   // if there are 2 floating point formats supported
+
+        //***need better names for these constants
+        // largest format | smaller format
+        //----------------------------------
+        //      `FLEN     |     `LEN1       length of floating point number
+        //      `NE       |     `NE1        length of exponent
+        //      `NF       |     `NF1        length of fraction
+        //      `BIAS     |     `BIAS1      exponent's bias value
+        //      `FMT      |     `FMT1       precision's format value - Q=11 D=01 S=00 H=10
+
+        // Possible combinations specified by spec:
+        //      double and single
+        //      single and half
+
+        // Not needed but can also handle:
+        //      quad   and double
+        //      quad   and single
+        //      quad   and half
+        //      double and half
+
+        logic  [`LEN1-1:0]   XLen1, YLen1, ZLen1; // smaller precision value of each operand once the NaN boxing is checked and removed
+
+        // FmtE picks the format of the operands: 1 = larger precision (`FLEN bits),
+        // 0 = smaller precision (`LEN1 bits, taken from the low bits of the input).
+        //
+        // NaN boxing: a smaller precision value must have all of its upper bits
+        // [`FLEN-1:`LEN1] set. If it does not, it is replaced by a quiet NaN
+        // (sign 0, exponent all 1's, fraction MSB 1).
+        //
+        // Exponent conversion, e.g. single (bias 127) to double (bias 1023):
+        //      1023 - 127 = 896 = 0011 1000 0000
+        //      sexp = 0000 bbbb bbbb (add 896)   b = bit d = ~b
+        //      dexp = 0bdd dbbb bbbb
+        // d is forced to 0 for a zero exponent (zero/denorm) and to 1 for an all 1's
+        // exponent (inf/NaN) so those encodings survive the conversion.
+
+        // operand X: unbox, sign, exponent, fraction (trailing zeros added), exponent != 0, exponent all 1's
+        assign XLen1       = &X[`FLEN-1:`LEN1] ? X[`LEN1-1:0] : {1'b0, {`NE1+1{1'b1}}, (`NF1-1)'(0)};
+        assign XSgnE       = FmtE ? X[`FLEN-1] : XLen1[`LEN1-1];
+        assign XExpE       = FmtE ? X[`FLEN-2:`NF] : {XLen1[`LEN1-2], {`NE-`NE1{~XLen1[`LEN1-2]&~XExpZero|XExpMaxE}}, XLen1[`LEN1-3:`NF1]};
+        assign XFracE      = FmtE ? X[`NF-1:0] : {XLen1[`NF1-1:0], (`NF-`NF1)'(0)};
+        assign XExpNonzero = FmtE ? |X[`FLEN-2:`NF] : |XLen1[`LEN1-2:`NF1];
+        assign XExpMaxE    = FmtE ? &X[`FLEN-2:`NF] : &XLen1[`LEN1-2:`NF1];
+
+        // operand Y
+        assign YLen1       = &Y[`FLEN-1:`LEN1] ? Y[`LEN1-1:0] : {1'b0, {`NE1+1{1'b1}}, (`NF1-1)'(0)};
+        assign YSgnE       = FmtE ? Y[`FLEN-1] : YLen1[`LEN1-1];
+        assign YExpE       = FmtE ? Y[`FLEN-2:`NF] : {YLen1[`LEN1-2], {`NE-`NE1{~YLen1[`LEN1-2]&~YExpZero|YExpMaxE}}, YLen1[`LEN1-3:`NF1]};
+        assign YFracE      = FmtE ? Y[`NF-1:0] : {YLen1[`NF1-1:0], (`NF-`NF1)'(0)};
+        assign YExpNonzero = FmtE ? |Y[`FLEN-2:`NF] : |YLen1[`LEN1-2:`NF1];
+        assign YExpMaxE    = FmtE ? &Y[`FLEN-2:`NF] : &YLen1[`LEN1-2:`NF1];
+
+        // operand Z
+        assign ZLen1       = &Z[`FLEN-1:`LEN1] ? Z[`LEN1-1:0] : {1'b0, {`NE1+1{1'b1}}, (`NF1-1)'(0)};
+        assign ZSgnE       = FmtE ? Z[`FLEN-1] : ZLen1[`LEN1-1];
+        assign ZExpE       = FmtE ? Z[`FLEN-2:`NF] : {ZLen1[`LEN1-2], {`NE-`NE1{~ZLen1[`LEN1-2]&~ZExpZero|ZExpMaxE}}, ZLen1[`LEN1-3:`NF1]};
+        assign ZFracE      = FmtE ? Z[`NF-1:0] : {ZLen1[`NF1-1:0], (`NF-`NF1)'(0)};
+        assign ZExpNonzero = FmtE ? |Z[`FLEN-2:`NF] : |ZLen1[`LEN1-2:`NF1];
+        assign ZExpMaxE    = FmtE ? &Z[`FLEN-2:`NF] : &ZLen1[`LEN1-2:`NF1];
+    
+
+    end else if (`FPSIZES == 3) begin       // three floating point precsions supported
+
+        //***need better names for these constants
+        // largest format | larger format  | smallest format
+        //---------------------------------------------------
+        //      `FLEN     |     `LEN1      |    `LEN2       length of floating point number
+        //      `NE       |     `NE1       |    `NE2        length of exponent
+        //      `NF       |     `NF1       |    `NF2        length of fraction
+        //      `BIAS     |     `BIAS1     |    `BIAS2      exponent's bias value
+        //      `FMT      |     `FMT1      |    `FMT2       precision's format value - Q=11 D=01 S=00 H=10
+
+        // Possible combinations specified by spec:
+        //      quad   and double and single
+        //      double and single and half
+
+        // Not needed but can also handle:
+        //      quad   and double and half
+        //      quad   and single and half
+
+        logic  [`LEN1-1:0]   XLen1, YLen1, ZLen1; // larger precision input with the NaN boxing removed, or a quiet NaN if it was not properly NaN boxed
+        logic  [`LEN2-1:0]   XLen2, YLen2, ZLen2; // smallest precision input with the NaN boxing removed, or a quiet NaN if it was not properly NaN boxed
+
+        // Check NaN boxing. If the value is not properly NaN boxed, set the value to a quiet NaN - for larger precision
+        assign XLen1 = &X[`FLEN-1:`LEN1] ? X[`LEN1-1:0] : {1'b0, {`NE1+1{1'b1}}, (`NF1-1)'(0)};
+        assign YLen1 = &Y[`FLEN-1:`LEN1] ? Y[`LEN1-1:0] : {1'b0, {`NE1+1{1'b1}}, (`NF1-1)'(0)};
+        assign ZLen1 = &Z[`FLEN-1:`LEN1] ? Z[`LEN1-1:0] : {1'b0, {`NE1+1{1'b1}}, (`NF1-1)'(0)};
+
+        // Check NaN boxing. If the value is not properly NaN boxed, set the value to a quiet NaN - for smallest precision
+        assign XLen2 = &X[`FLEN-1:`LEN2] ? X[`LEN2-1:0] : {1'b0, {`NE2+1{1'b1}}, (`NF2-1)'(0)};
+        assign YLen2 = &Y[`FLEN-1:`LEN2] ? Y[`LEN2-1:0] : {1'b0, {`NE2+1{1'b1}}, (`NF2-1)'(0)};
+        assign ZLen2 = &Z[`FLEN-1:`LEN2] ? Z[`LEN2-1:0] : {1'b0, {`NE2+1{1'b1}}, (`NF2-1)'(0)};
+
+        always_comb begin
+            case (FmtE)
+                `FMT: begin // if input is largest precision (`FLEN - ie quad or double)
+                    // extract the sign bit
+                    XSgnE = X[`FLEN-1];
+                    YSgnE = Y[`FLEN-1];
+                    ZSgnE = Z[`FLEN-1];
+
+                    // extract the exponent (already in the largest precision, no conversion)
+                    XExpE = X[`FLEN-2:`NF];
+                    YExpE = Y[`FLEN-2:`NF];
+                    ZExpE = Z[`FLEN-2:`NF];
+
+                    // extract the fraction
+                    XFracE = X[`NF-1:0];
+                    YFracE = Y[`NF-1:0];
+                    ZFracE = Z[`NF-1:0];
+
+                    // is the exponent non-zero
+                    XExpNonzero = |X[`FLEN-2:`NF];
+                    YExpNonzero = |Y[`FLEN-2:`NF];
+                    ZExpNonzero = |Z[`FLEN-2:`NF];
+
+                    // is the exponent all 1's
+                    XExpMaxE = &X[`FLEN-2:`NF];
+                    YExpMaxE = &Y[`FLEN-2:`NF];
+                    ZExpMaxE = &Z[`FLEN-2:`NF];
+                end
+                `FMT1: begin    // if input is larger precision (`LEN1 - double or single)
+
+                    // extract the sign bit
+                    XSgnE = XLen1[`LEN1-1];
+                    YSgnE = YLen1[`LEN1-1];
+                    ZSgnE = ZLen1[`LEN1-1];
+
+                    // is the exponent non-zero
+                    XExpNonzero = |XLen1[`LEN1-2:`NF1];
+                    YExpNonzero = |YLen1[`LEN1-2:`NF1];
+                    ZExpNonzero = |ZLen1[`LEN1-2:`NF1];
+
+                    // is the exponent all 1's
+                    XExpMaxE = &XLen1[`LEN1-2:`NF1];
+                    YExpMaxE = &YLen1[`LEN1-2:`NF1];
+                    ZExpMaxE = &ZLen1[`LEN1-2:`NF1];
+
+                    // convert the larger precision's exponent to use the largest precision's bias
+                    // example single to double conversion:
+                    //      1023 - 127 = 896 = 0011 1000 0000
+                    //      sexp = 0000 bbbb bbbb (add 896)   b = bit d = ~b
+                    //      dexp = 0bdd dbbb bbbb
+                    // d is forced to 0 for a zero exponent (zero/denorm) and to 1 for an all 1's exponent (inf/NaN)
+                    // The ExpNonzero/ExpMax flags must be assigned above this point: always_comb is not
+                    // sensitive to variables it writes, so reading them before they are assigned
+                    // would use the values left over from the previous evaluation.
+                    XExpE = {XLen1[`LEN1-2], {`NE-`NE1{~XLen1[`LEN1-2]&XExpNonzero|XExpMaxE}}, XLen1[`LEN1-3:`NF1]};
+                    YExpE = {YLen1[`LEN1-2], {`NE-`NE1{~YLen1[`LEN1-2]&YExpNonzero|YExpMaxE}}, YLen1[`LEN1-3:`NF1]};
+                    ZExpE = {ZLen1[`LEN1-2], {`NE-`NE1{~ZLen1[`LEN1-2]&ZExpNonzero|ZExpMaxE}}, ZLen1[`LEN1-3:`NF1]};
+
+                    // extract the fraction and add the necessary trailing zeros
+                    XFracE = {XLen1[`NF1-1:0], (`NF-`NF1)'(0)};
+                    YFracE = {YLen1[`NF1-1:0], (`NF-`NF1)'(0)};
+                    ZFracE = {ZLen1[`NF1-1:0], (`NF-`NF1)'(0)};
+                end
+                `FMT2: begin        // if input is smallest precision (`LEN2 - single or half)
+
+                    // extract the sign bit
+                    XSgnE = XLen2[`LEN2-1];
+                    YSgnE = YLen2[`LEN2-1];
+                    ZSgnE = ZLen2[`LEN2-1];
+
+                    // is the exponent non-zero
+                    XExpNonzero = |XLen2[`LEN2-2:`NF2];
+                    YExpNonzero = |YLen2[`LEN2-2:`NF2];
+                    ZExpNonzero = |ZLen2[`LEN2-2:`NF2];
+
+                    // is the exponent all 1's
+                    XExpMaxE = &XLen2[`LEN2-2:`NF2];
+                    YExpMaxE = &YLen2[`LEN2-2:`NF2];
+                    ZExpMaxE = &ZLen2[`LEN2-2:`NF2];
+
+                    // convert the smallest precision's exponent to use the largest precision's bias
+                    // example single to double conversion:
+                    //      1023 - 127 = 896 = 0011 1000 0000
+                    //      sexp = 0000 bbbb bbbb (add 896)   b = bit d = ~b
+                    //      dexp = 0bdd dbbb bbbb
+                    // d is forced to 0 for a zero exponent (zero/denorm) and to 1 for an all 1's exponent (inf/NaN)
+                    // The ExpNonzero/ExpMax flags must be assigned above this point: always_comb is not
+                    // sensitive to variables it writes, so reading them before they are assigned
+                    // would use the values left over from the previous evaluation.
+                    XExpE = {XLen2[`LEN2-2], {`NE-`NE2{~XLen2[`LEN2-2]&XExpNonzero|XExpMaxE}}, XLen2[`LEN2-3:`NF2]};
+                    YExpE = {YLen2[`LEN2-2], {`NE-`NE2{~YLen2[`LEN2-2]&YExpNonzero|YExpMaxE}}, YLen2[`LEN2-3:`NF2]};
+                    ZExpE = {ZLen2[`LEN2-2], {`NE-`NE2{~ZLen2[`LEN2-2]&ZExpNonzero|ZExpMaxE}}, ZLen2[`LEN2-3:`NF2]};
+
+                    // extract the fraction and add the necessary trailing zeros
+                    XFracE = {XLen2[`NF2-1:0], (`NF-`NF2)'(0)};
+                    YFracE = {YLen2[`NF2-1:0], (`NF-`NF2)'(0)};
+                    ZFracE = {ZLen2[`NF2-1:0], (`NF-`NF2)'(0)};
+                end
+                default: begin  // FmtE is not one of the three supported formats: drive everything to 0
+                    XSgnE = 0;
+                    YSgnE = 0;
+                    ZSgnE = 0;
+                    XExpE = 0;
+                    YExpE = 0;
+                    ZExpE = 0;
+                    XFracE = 0;
+                    YFracE = 0;
+                    ZFracE = 0;
+                    XExpNonzero = 0;
+                    YExpNonzero = 0;
+                    ZExpNonzero = 0;
+                    XExpMaxE = 0;
+                    YExpMaxE = 0;
+                    ZExpMaxE = 0;
+                end
+            endcase
+        end
+
+    end else begin      // if all precsisons are supported - quad, double, single, and half
+    
+        //    quad   |  double  |  single  |  half    
+        //-------------------------------------------------------------------
+        //   `Q_LEN  |  `D_LEN  |  `S_LEN  |  `H_LEN     length of floating point number
+        //   `Q_NE   |  `D_NE   |  `S_NE   |  `H_NE      length of exponent
+        //   `Q_NF   |  `D_NF   |  `S_NF   |  `H_NF      length of fraction
+        //   `Q_BIAS |  `D_BIAS |  `S_BIAS |  `H_BIAS    exponent's bias value
+        //   `Q_FMT  |  `D_FMT  |  `S_FMT  |  `H_FMT     precision's format value - Q=11 D=01 S=00 H=10
+
+
+        logic  [`D_LEN-1:0]   XLen1, YLen1, ZLen1; // double precision input with the NaN boxing removed, or a quiet NaN if it was not properly NaN boxed
+        logic  [`S_LEN-1:0]   XLen2, YLen2, ZLen2; // single precision input with the NaN boxing removed, or a quiet NaN if it was not properly NaN boxed
+        logic  [`H_LEN-1:0]   XLen3, YLen3, ZLen3; // half precision input with the NaN boxing removed, or a quiet NaN if it was not properly NaN boxed
+
+        // Check NaN boxing. If the value is not properly NaN boxed, set the value to a quiet NaN - for double precision
+        assign XLen1 = &X[`Q_LEN-1:`D_LEN] ? X[`D_LEN-1:0] : {1'b0, {`D_NE+1{1'b1}}, (`D_NF-1)'(0)};
+        assign YLen1 = &Y[`Q_LEN-1:`D_LEN] ? Y[`D_LEN-1:0] : {1'b0, {`D_NE+1{1'b1}}, (`D_NF-1)'(0)};
+        assign ZLen1 = &Z[`Q_LEN-1:`D_LEN] ? Z[`D_LEN-1:0] : {1'b0, {`D_NE+1{1'b1}}, (`D_NF-1)'(0)};
+
+        // Check NaN boxing. If the value is not properly NaN boxed, set the value to a quiet NaN - for single precision
+        assign XLen2 = &X[`Q_LEN-1:`S_LEN] ? X[`S_LEN-1:0] : {1'b0, {`S_NE+1{1'b1}}, (`S_NF-1)'(0)};
+        assign YLen2 = &Y[`Q_LEN-1:`S_LEN] ? Y[`S_LEN-1:0] : {1'b0, {`S_NE+1{1'b1}}, (`S_NF-1)'(0)};
+        assign ZLen2 = &Z[`Q_LEN-1:`S_LEN] ? Z[`S_LEN-1:0] : {1'b0, {`S_NE+1{1'b1}}, (`S_NF-1)'(0)};
+
+        // Check NaN boxing. If the value is not properly NaN boxed, set the value to a quiet NaN - for half precision
+        assign XLen3 = &X[`Q_LEN-1:`H_LEN] ? X[`H_LEN-1:0] : {1'b0, {`H_NE+1{1'b1}}, (`H_NF-1)'(0)};
+        assign YLen3 = &Y[`Q_LEN-1:`H_LEN] ? Y[`H_LEN-1:0] : {1'b0, {`H_NE+1{1'b1}}, (`H_NF-1)'(0)};
+        assign ZLen3 = &Z[`Q_LEN-1:`H_LEN] ? Z[`H_LEN-1:0] : {1'b0, {`H_NE+1{1'b1}}, (`H_NF-1)'(0)};
+
+        always_comb begin
+            case (FmtE) // selected by the format code (Q=11 D=01 S=00 H=10), not by the bias value
+                2'b11: begin  // if input is quad precision
+                    // extract sign bit
+                    XSgnE = X[`Q_LEN-1];
+                    YSgnE = Y[`Q_LEN-1];
+                    ZSgnE = Z[`Q_LEN-1];
+
+                    // extract the exponent (already quad precision, no conversion)
+                    XExpE = X[`Q_LEN-2:`Q_NF];
+                    YExpE = Y[`Q_LEN-2:`Q_NF];
+                    ZExpE = Z[`Q_LEN-2:`Q_NF];
+
+                    // extract the fraction
+                    XFracE = X[`Q_NF-1:0];
+                    YFracE = Y[`Q_NF-1:0];
+                    ZFracE = Z[`Q_NF-1:0];
+
+                    // is the exponent non-zero
+                    XExpNonzero = |X[`Q_LEN-2:`Q_NF];
+                    YExpNonzero = |Y[`Q_LEN-2:`Q_NF];
+                    ZExpNonzero = |Z[`Q_LEN-2:`Q_NF];
+
+                    // is the exponent all 1's
+                    XExpMaxE = &X[`Q_LEN-2:`Q_NF];
+                    YExpMaxE = &Y[`Q_LEN-2:`Q_NF];
+                    ZExpMaxE = &Z[`Q_LEN-2:`Q_NF];
+                end
+                2'b01: begin  // if input is double precision
+                    // extract sign bit
+                    XSgnE = XLen1[`D_LEN-1];
+                    YSgnE = YLen1[`D_LEN-1];
+                    ZSgnE = ZLen1[`D_LEN-1];
+
+                    // is the exponent non-zero (exponent field is [`D_LEN-2:`D_NF])
+                    XExpNonzero = |XLen1[`D_LEN-2:`D_NF];
+                    YExpNonzero = |YLen1[`D_LEN-2:`D_NF];
+                    ZExpNonzero = |ZLen1[`D_LEN-2:`D_NF];
+
+                    // is the exponent all 1's
+                    XExpMaxE = &XLen1[`D_LEN-2:`D_NF];
+                    YExpMaxE = &YLen1[`D_LEN-2:`D_NF];
+                    ZExpMaxE = &ZLen1[`D_LEN-2:`D_NF];
+
+                    // convert the double precision exponent into quad precision
+                    // example single to double conversion:
+                    //      1023 - 127 = 896 = 0011 1000 0000
+                    //      sexp = 0000 bbbb bbbb (add 896)   b = bit d = ~b
+                    //      dexp = 0bdd dbbb bbbb
+                    // d is forced to 0 for a zero exponent (zero/denorm) and to 1 for an all 1's exponent (inf/NaN)
+                    // The ExpNonzero/ExpMax flags must be assigned above this point: always_comb is not
+                    // sensitive to variables it writes, so reading them before they are assigned
+                    // would use the values left over from the previous evaluation.
+                    XExpE = {XLen1[`D_LEN-2], {`Q_NE-`D_NE{~XLen1[`D_LEN-2]&XExpNonzero|XExpMaxE}}, XLen1[`D_LEN-3:`D_NF]};
+                    YExpE = {YLen1[`D_LEN-2], {`Q_NE-`D_NE{~YLen1[`D_LEN-2]&YExpNonzero|YExpMaxE}}, YLen1[`D_LEN-3:`D_NF]};
+                    ZExpE = {ZLen1[`D_LEN-2], {`Q_NE-`D_NE{~ZLen1[`D_LEN-2]&ZExpNonzero|ZExpMaxE}}, ZLen1[`D_LEN-3:`D_NF]};
+
+                    // extract the fraction (the low `D_NF bits) and add the necessary trailing zeros
+                    XFracE = {XLen1[`D_NF-1:0], (`Q_NF-`D_NF)'(0)};
+                    YFracE = {YLen1[`D_NF-1:0], (`Q_NF-`D_NF)'(0)};
+                    ZFracE = {ZLen1[`D_NF-1:0], (`Q_NF-`D_NF)'(0)};
+                end
+                2'b00: begin      // if input is single precision
+                    // extract sign bit
+                    XSgnE = XLen2[`S_LEN-1];
+                    YSgnE = YLen2[`S_LEN-1];
+                    ZSgnE = ZLen2[`S_LEN-1];
+
+                    // is the exponent non-zero
+                    XExpNonzero = |XLen2[`S_LEN-2:`S_NF];
+                    YExpNonzero = |YLen2[`S_LEN-2:`S_NF];
+                    ZExpNonzero = |ZLen2[`S_LEN-2:`S_NF];
+
+                    // is the exponent all 1's
+                    XExpMaxE = &XLen2[`S_LEN-2:`S_NF];
+                    YExpMaxE = &YLen2[`S_LEN-2:`S_NF];
+                    ZExpMaxE = &ZLen2[`S_LEN-2:`S_NF];
+
+                    // convert the single precision exponent into quad precision
+                    // example single to double conversion:
+                    //      1023 - 127 = 896 = 0011 1000 0000
+                    //      sexp = 0000 bbbb bbbb (add 896)   b = bit d = ~b
+                    //      dexp = 0bdd dbbb bbbb
+                    // d is forced to 0 for a zero exponent (zero/denorm) and to 1 for an all 1's exponent (inf/NaN)
+                    // The ExpNonzero/ExpMax flags must be assigned above this point: always_comb is not
+                    // sensitive to variables it writes, so reading them before they are assigned
+                    // would use the values left over from the previous evaluation.
+                    XExpE = {XLen2[`S_LEN-2], {`Q_NE-`S_NE{~XLen2[`S_LEN-2]&XExpNonzero|XExpMaxE}}, XLen2[`S_LEN-3:`S_NF]};
+                    YExpE = {YLen2[`S_LEN-2], {`Q_NE-`S_NE{~YLen2[`S_LEN-2]&YExpNonzero|YExpMaxE}}, YLen2[`S_LEN-3:`S_NF]};
+                    ZExpE = {ZLen2[`S_LEN-2], {`Q_NE-`S_NE{~ZLen2[`S_LEN-2]&ZExpNonzero|ZExpMaxE}}, ZLen2[`S_LEN-3:`S_NF]};
+
+                    // extract the fraction and add the necessary trailing zeros
+                    XFracE = {XLen2[`S_NF-1:0], (`Q_NF-`S_NF)'(0)};
+                    YFracE = {YLen2[`S_NF-1:0], (`Q_NF-`S_NF)'(0)};
+                    ZFracE = {ZLen2[`S_NF-1:0], (`Q_NF-`S_NF)'(0)};
+                end
+                2'b10: begin      // if input is half precision
+                    // extract sign bit
+                    XSgnE = XLen3[`H_LEN-1];
+                    YSgnE = YLen3[`H_LEN-1];
+                    ZSgnE = ZLen3[`H_LEN-1];
+
+                    // is the exponent non-zero
+                    XExpNonzero = |XLen3[`H_LEN-2:`H_NF];
+                    YExpNonzero = |YLen3[`H_LEN-2:`H_NF];
+                    ZExpNonzero = |ZLen3[`H_LEN-2:`H_NF];
+
+                    // is the exponent all 1's
+                    XExpMaxE = &XLen3[`H_LEN-2:`H_NF];
+                    YExpMaxE = &YLen3[`H_LEN-2:`H_NF];
+                    ZExpMaxE = &ZLen3[`H_LEN-2:`H_NF];
+
+                    // convert the half precision exponent into quad precision
+                    // example single to double conversion:
+                    //      1023 - 127 = 896 = 0011 1000 0000
+                    //      sexp = 0000 bbbb bbbb (add 896)   b = bit d = ~b
+                    //      dexp = 0bdd dbbb bbbb
+                    // d is forced to 0 for a zero exponent (zero/denorm) and to 1 for an all 1's exponent (inf/NaN)
+                    // The ExpNonzero/ExpMax flags must be assigned above this point: always_comb is not
+                    // sensitive to variables it writes, so reading them before they are assigned
+                    // would use the values left over from the previous evaluation.
+                    XExpE = {XLen3[`H_LEN-2], {`Q_NE-`H_NE{~XLen3[`H_LEN-2]&XExpNonzero|XExpMaxE}}, XLen3[`H_LEN-3:`H_NF]};
+                    YExpE = {YLen3[`H_LEN-2], {`Q_NE-`H_NE{~YLen3[`H_LEN-2]&YExpNonzero|YExpMaxE}}, YLen3[`H_LEN-3:`H_NF]};
+                    ZExpE = {ZLen3[`H_LEN-2], {`Q_NE-`H_NE{~ZLen3[`H_LEN-2]&ZExpNonzero|ZExpMaxE}}, ZLen3[`H_LEN-3:`H_NF]};
+
+                    // extract the fraction and add the necessary trailing zeros
+                    XFracE = {XLen3[`H_NF-1:0], (`Q_NF-`H_NF)'(0)};
+                    YFracE = {YLen3[`H_NF-1:0], (`Q_NF-`H_NF)'(0)};
+                    ZFracE = {ZLen3[`H_NF-1:0], (`Q_NF-`H_NF)'(0)};
+                end
+            endcase
+        end
+
+    end
+
+    // Classification, common to every format configuration. For each operand:
+    //      ExpZero  - is the exponent all 0's
+    //      FracZero - is the fraction zero
+    //      Man      - the mantissa: the fraction with the assumed one added
+    //                 (zero instead of one if denormal or zero)
+    //      Norm     - is the input normalized: exponent neither all 0's nor all 1's (X only)
+    //      NaN      - is the input a NaN: exponent all 1's and a non-zero fraction
+    //                 (an input that isn't properly NaN boxed is forced to be a NaN)
+    //      SNaN     - is the input a signaling NaN: a NaN with the fraction's MSB clear
+    //      Denorm   - is the input denormalized: exponent all 0's and a non-zero fraction
+    //      Inf      - is the input infinity: exponent all 1's and a zero fraction
+    //      Zero     - is the input zero: exponent all 0's and a zero fraction
+
+    // operand X
+    assign XExpZero  = ~XExpNonzero;
+    assign XFracZero = ~|XFracE;
+    assign XManE     = {XExpNonzero, XFracE};
+    assign XNormE    = ~XExpMaxE & ~XExpZero;
+    assign XNaNE     = XExpMaxE & ~XFracZero;
+    assign XSNaNE    = XNaNE & ~XFracE[`NF-1];
+    assign XDenormE  = XExpZero & ~XFracZero;
+    assign XInfE     = XExpMaxE & XFracZero;
+    assign XZeroE    = XExpZero & XFracZero;
+
+    // operand Y
+    assign YExpZero  = ~YExpNonzero;
+    assign YFracZero = ~|YFracE;
+    assign YManE     = {YExpNonzero, YFracE};
+    assign YNaNE     = YExpMaxE & ~YFracZero;
+    assign YSNaNE    = YNaNE & ~YFracE[`NF-1];
+    assign YDenormE  = YExpZero & ~YFracZero;
+    assign YInfE     = YExpMaxE & YFracZero;
+    assign YZeroE    = YExpZero & YFracZero;
+
+    // operand Z
+    assign ZExpZero  = ~ZExpNonzero;
+    assign ZFracZero = ~|ZFracE;
+    assign ZManE     = {ZExpNonzero, ZFracE};
+    assign ZNaNE     = ZExpMaxE & ~ZFracZero;
+    assign ZSNaNE    = ZNaNE & ~ZFracE[`NF-1];
+    assign ZDenormE  = ZExpZero & ~ZFracZero;
+    assign ZInfE     = ZExpMaxE & ZFracZero;
+    assign ZZeroE    = ZExpZero & ZFracZero;
+    
+endmodule
--- a/pipelined/src/fpu/unpacking.sv
+++ b/pipelined/src/fpu/unpacking.sv
@ -1,95 +0,0 @@
-`include "wally-config.vh"
-
-module unpack ( 
-    input logic  [63:0] X, Y, Z,
-    input logic         FmtE,
-    input logic  [2:0]  FOpCtrlE,
-    output logic        XSgnE, YSgnE, ZSgnE,
-    output logic [10:0] XExpE, YExpE, ZExpE,
-    output logic [52:0] XManE, YManE, ZManE,
-    output logic XNormE,
-    output logic XNaNE, YNaNE, ZNaNE,
-    output logic XSNaNE, YSNaNE, ZSNaNE,
-    output logic XDenormE, YDenormE, ZDenormE,
-    output logic XZeroE, YZeroE, ZZeroE,
-    output logic [10:0] BiasE,
-    output logic XInfE, YInfE, ZInfE,
-    output logic XExpMaxE
-);
- 
-    logic [51:0]    XFracE, YFracE, ZFracE;
-    logic           XExpNonzero, YExpNonzero, ZExpNonzero;
-    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
-    logic           XExpZero, YExpZero, ZExpZero; // input exponent zero
-    logic           YExpMaxE, ZExpMaxE;  // input exponent all 1s
-    logic  [31:0]   XFloat, YFloat, ZFloat; // Bottom half or NaN, if RV64 and not properly NaN boxed
-
-    // Determine if number is NaN as double precision to check single precision NaN boxing
-    if (`F_SUPPORTED & ~`D_SUPPORTED) begin  // eventually this should change to FLEN when FLEN isn't hardwared to 64
-        assign XFloat = X[31:0]; 
-        assign YFloat = Y[31:0];  
-        assign ZFloat = Z[31:0]; 
-    end else begin
-        assign XFloat = &X[`FLEN-1:32] ? X[31:0] : 32'h7fc00000; 
-        assign YFloat = &Y[`FLEN-1:32] ? Y[31:0] : 32'h7fc00000;
-        assign ZFloat = &Z[`FLEN-1:32] ? Z[31:0] : 32'h7fc00000;
-    end   
-
-    assign XSgnE = FmtE ? X[63] : XFloat[31];
-    assign YSgnE = FmtE ? Y[63] : YFloat[31];
-    assign ZSgnE = FmtE ? Z[63] : ZFloat[31];
-
-    assign XExpE = FmtE ? X[62:52] : {XFloat[30], {3{~XFloat[30]&~XExpZero|XExpMaxE}}, XFloat[29:23]}; 
-    assign YExpE = FmtE ? Y[62:52] : {YFloat[30], {3{~YFloat[30]&~YExpZero|YExpMaxE}}, YFloat[29:23]}; 
-    assign ZExpE = FmtE ? Z[62:52] : {ZFloat[30], {3{~ZFloat[30]&~ZExpZero|ZExpMaxE}}, ZFloat[29:23]}; 
-
-    assign XFracE = FmtE ? X[51:0] : {XFloat[22:0], 29'b0};
-    assign YFracE = FmtE ? Y[51:0] : {YFloat[22:0], 29'b0};
-    assign ZFracE = FmtE ? Z[51:0] : {ZFloat[22:0], 29'b0};
-
-    assign XExpNonzero = FmtE ? |X[62:52] : |XFloat[30:23]; 
-    assign YExpNonzero = FmtE ? |Y[62:52] : |YFloat[30:23];
-    assign ZExpNonzero = FmtE ? |Z[62:52] : |ZFloat[30:23];
-
-    assign XExpZero = ~XExpNonzero;
-    assign YExpZero = ~YExpNonzero;
-    assign ZExpZero = ~ZExpNonzero;
-   
-    assign XFracZero = ~|XFracE;
-    assign YFracZero = ~|YFracE;
-    assign ZFracZero = ~|ZFracE;
-
-    assign XManE = {XExpNonzero, XFracE};
-    assign YManE = {YExpNonzero, YFracE};
-    assign ZManE = {ZExpNonzero, ZFracE};
-
-    assign XExpMaxE = FmtE ? &X[62:52] : &XFloat[30:23];
-    assign YExpMaxE = FmtE ? &Y[62:52] : &YFloat[30:23];
-    assign ZExpMaxE = FmtE ? &Z[62:52] : &ZFloat[30:23];
-  
-    assign XNormE = ~(XExpMaxE|XExpZero);
-    
-    // force single precision input to be a NaN if it isn't properly Nan Boxed
-    assign XNaNE = XExpMaxE & ~XFracZero;
-    assign YNaNE = YExpMaxE & ~YFracZero;
-    assign ZNaNE = ZExpMaxE & ~ZFracZero;
-
-    assign XSNaNE = XNaNE&~XFracE[51];
-    assign YSNaNE = YNaNE&~YFracE[51];
-    assign ZSNaNE = ZNaNE&~ZFracE[51];
-
-    assign XDenormE = XExpZero & ~XFracZero;
-    assign YDenormE = YExpZero & ~YFracZero;
-    assign ZDenormE = ZExpZero & ~ZFracZero;
-
-    assign XInfE = XExpMaxE & XFracZero;
-    assign YInfE = YExpMaxE & YFracZero;
-    assign ZInfE = ZExpMaxE & ZFracZero;
-
-    assign XZeroE = XExpZero & XFracZero;
-    assign YZeroE = YExpZero & YFracZero;
-    assign ZZeroE = ZExpZero & ZFracZero;
-
-    assign BiasE = 11'h3ff; // always use 1023 because exponents are unpacked to double precision
-
-endmodule
--- a/pipelined/src/generic/flop/simpleram.sv
+++ b/pipelined/src/generic/flop/simpleram.sv
@ -34,6 +34,7 @@ module simpleram #(parameter BASE=0, RANGE = 65535) (
  input  logic             clk, 
  input  logic [31:0]      a,
  input  logic             we,
+  input  logic [`XLEN/8-1:0] ByteMask,
  input  logic [`XLEN-1:0] wd,
  output logic [`XLEN-1:0] rd
 );
@ -45,9 +46,14 @@ module simpleram #(parameter BASE=0, RANGE = 65535) (
  logic [31:adrlsb] adrmsbs;
  assign adrmsbs = a[31:adrlsb];

-  always_ff @(posedge clk) begin
+  always_ff @(posedge clk)
    rd <= RAM[adrmsbs];
-    if (we) RAM[adrmsbs] <= #1 wd;
+
+  genvar            index;
+  for(index = 0; index < `XLEN/8; index++) begin
+    always_ff @(posedge clk) begin
+      if (we & ByteMask[index]) RAM[adrmsbs][8*(index+1)-1:8*index] <= #1 wd[8*(index+1)-1:8*index];
+    end
  end
 endmodule

--- a/pipelined/src/ifu/SRAM2P1R1W.sv
+++ b/pipelined/src/ifu/SRAM2P1R1W.sv
@ -101,17 +101,8 @@ module SRAM2P1R1W
  
  // write port
  assign bwe = {WIDTH{WEN1Q}} & BitWEN1;
-  always_ff @(posedge clk) begin
+  always_ff @(posedge clk)
    mem[WA1Q] <= WD1Q & bwe | mem[WA1Q] & ~bwe;
-/*    
-  genvar       index;
-   for (index = 0; index < WIDTH; index = index + 1) begin:bitwrite
-    always_ff @(posedge clk) begin
-      if (WEN1Q & BitWEN1[index]) begin
-        mem[WA1Q][index] <= WD1Q[index];
-      end
-    end*/
-  end
 
 endmodule  

--- a/pipelined/src/ifu/ifu.sv
+++ b/pipelined/src/ifu/ifu.sv
@ -175,12 +175,13 @@ module ifu (
  
  if (`IMEM == `MEM_TIM) begin : irom // *** fix up dtim taking PA_BITS rather than XLEN, *** IEUAdr is a bad name.  Probably use a ROM rather than DTIM
    dtim irom(.clk, .reset, .CPUBusy, .LSURWM(2'b10), .IEUAdrM(PCPF[31:0]), .IEUAdrE(PCNextFSpill),
-              .TrapM(1'b0), .FinalWriteDataM(), 
-              .ReadDataWordM(AllInstrRawF), .BusStall, .LSUBusWrite(), .LSUBusRead(IFUBusRead),
-              .BusCommittedM(), .ReadDataWordMuxM(), .DCacheStallM(ICacheStallF), 
+              .TrapM(1'b0), .FinalWriteDataM(), .ByteMaskM('0),
+              .ReadDataWordM(FinalInstrRawF), .BusStall, .LSUBusWrite(), .LSUBusRead(IFUBusRead),
+              .BusCommittedM(), .DCacheStallM(ICacheStallF), 
              .DCacheCommittedM(), .DCacheMiss(ICacheMiss), .DCacheAccess(ICacheAccess));
    
-  end else begin : bus
+  end 
+  if (`IBUS) begin : bus
    localparam integer   WORDSPERLINE = (CACHE_ENABLED) ? `ICACHE_LINELENINBITS/`XLEN : 1;
    localparam integer   LINELEN = (CACHE_ENABLED) ? `ICACHE_LINELENINBITS : `XLEN;
    localparam integer   LOGWPL = (`DMEM == `MEM_CACHE) ? $clog2(WORDSPERLINE) : 1;
@ -188,7 +189,6 @@ module ifu (
    logic [LINELEN-1:0]  ICacheBusWriteData;
    logic [`PA_BITS-1:0] ICacheBusAdr;
    logic                ICacheBusAck;
-    logic                save,restore;
    logic [31:0]         temp;
    logic                SelUncachedAdr;
    
@ -212,14 +212,15 @@ module ifu (
    if(CACHE_ENABLED) begin : icache
      cache #(.LINELEN(`ICACHE_LINELENINBITS),
              .NUMLINES(`ICACHE_WAYSIZEINBYTES*8/`ICACHE_LINELENINBITS),
-              .NUMWAYS(`ICACHE_NUMWAYS), .DCACHE(0))
+              .NUMWAYS(`ICACHE_NUMWAYS), .LOGWPL(LOGWPL), .WORDLEN(32), .MUXINTERVAL(16), .DCACHE(0))
      icache(.clk, .reset, .CPUBusy, .IgnoreRequestTLB(ITLBMissF), .IgnoreRequestTrapM('0),
             .CacheBusWriteData(ICacheBusWriteData), .CacheBusAck(ICacheBusAck),
             .CacheBusAdr(ICacheBusAdr), .CacheStall(ICacheStallF), 
             .CacheFetchLine(ICacheFetchLine),
-             .CacheWriteLine(), .ReadDataLine(ReadDataLine),
-             .save, .restore, .Cacheable(CacheableF),
+             .CacheWriteLine(), .ReadDataWord(FinalInstrRawF),
+             .Cacheable(CacheableF),
             .CacheMiss(ICacheMiss), .CacheAccess(ICacheAccess),
+             .ByteMask('0), .WordCount('0), .LSUBusWriteCrit('0),
             .FinalWriteData('0),
             .RW(2'b10), 
             .Atomic('0), .FlushCache('0),
@ -227,15 +228,13 @@ module ifu (
             .PAdr(PCPF),
             .CacheCommitted(), .InvalidateCacheM(InvalidateICacheM));

-      subcachelineread #(LINELEN, 32, 16) subcachelineread(
-        .clk, .reset, .PAdr(PCPF), .save, .restore,
-        .ReadDataLine, .ReadDataWord(FinalInstrRawF));
-
    end else begin : passthrough
      assign {ICacheFetchLine, ICacheBusAdr, ICacheStallF, FinalInstrRawF} = '0;
      assign ICacheAccess = CacheableF; assign ICacheMiss = CacheableF;
    end
-  end  
+  end else begin : nobus // block: bus
+    assign AllInstrRawF = FinalInstrRawF;
+  end
  
  assign IFUCacheBusStallF = ICacheStallF | BusStall;
  assign IFUStallF = IFUCacheBusStallF | SelNextSpillF;
--- a/pipelined/src/lsu/atomic.sv
+++ b/pipelined/src/lsu/atomic.sv
@ -41,7 +41,6 @@ module atomic (
  input logic [1:0]          LSUAtomicM,
  input logic [1:0]          PreLSURWM,
  input logic                IgnoreRequest,
-  input logic                DTLBMissM,
  output logic [`XLEN-1:0]   FinalAMOWriteDataM,
  output logic               SquashSCW,
  output logic [1:0]         LSURWM);
@ -52,7 +51,7 @@ module atomic (
  amoalu amoalu(.srca(ReadDataM), .srcb(LSUWriteDataM), .funct(LSUFunct7M), .width(LSUFunct3M[1:0]), 
                .result(AMOResult));
  mux2 #(`XLEN) wdmux(LSUWriteDataM, AMOResult, LSUAtomicM[1], FinalAMOWriteDataM);
-  assign MemReadM = PreLSURWM[1] & ~(IgnoreRequest) & ~DTLBMissM; // *** is DTLBMiss needed; might be par tof ignorerequest
+  assign MemReadM = PreLSURWM[1] & ~IgnoreRequest;
  lrsc lrsc(.clk, .reset, .FlushW, .CPUBusy, .MemReadM, .PreLSURWM, .LSUAtomicM, .LSUPAdrM,
    .SquashSCW, .LSURWM);

--- a/pipelined/src/lsu/dtim.sv
+++ b/pipelined/src/lsu/dtim.sv
@ -37,19 +37,19 @@ module dtim(
  input logic [`XLEN-1:0]     IEUAdrE,
  input logic                 TrapM, 
  input logic [`XLEN-1:0]     FinalWriteDataM,
+  input logic [`XLEN/8-1:0]   ByteMaskM,
  output logic [`XLEN-1:0]    ReadDataWordM,
  output logic                BusStall,
  output logic                LSUBusWrite,
  output logic                LSUBusRead,
  output logic                BusCommittedM,
-  output logic [`XLEN-1:0]    ReadDataWordMuxM,
  output logic                DCacheStallM,
  output logic                DCacheCommittedM,
  output logic                DCacheMiss,
  output logic                DCacheAccess);

  simpleram #(.BASE(`RAM_BASE), .RANGE(`RAM_RANGE)) ram (
-      .clk, 
+      .clk, .ByteMask(ByteMaskM),
      .a(CPUBusy | LSURWM[0] | reset ? IEUAdrM[31:0] : IEUAdrE[31:0]), // move mux out; this shouldn't be needed when stails are handled differently ***
      .we(LSURWM[0] & ~TrapM),  // have to ignore write if Trap.
      .wd(FinalWriteDataM), .rd(ReadDataWordM));
@ -57,7 +57,6 @@ module dtim(
  // since we have a local memory the bus connections are all disabled.
  // There are no peripherals supported.
  assign {BusStall, LSUBusWrite, LSUBusRead, BusCommittedM} = '0;   
-  assign ReadDataWordMuxM = ReadDataWordM;
  assign {DCacheStallM, DCacheCommittedM} = '0;
  assign {DCacheMiss, DCacheAccess} = '0;

--- a/pipelined/src/lsu/interlockfsm.sv
+++ b/pipelined/src/lsu/interlockfsm.sv
@ -56,7 +56,7 @@ module interlockfsm(
  logic             AnyCPUReqM;

  typedef enum      logic[2:0]  {STATE_T0_READY,
-				                 STATE_T0_REPLAY,
+				                 STATE_T1_REPLAY,
 				                 STATE_T3_DTLB_MISS,
 				                 STATE_T4_ITLB_MISS,
 				                 STATE_T5_ITLB_MISS,
@ -82,13 +82,13 @@ module interlockfsm(
                      else if(ToITLBMiss)         InterlockNextState = STATE_T5_ITLB_MISS;
 	                  else if(ToBoth)             InterlockNextState = STATE_T7_DITLB_MISS;
 	                  else                        InterlockNextState = STATE_T0_READY;
-	  STATE_T0_REPLAY:     if(DCacheStallM)       InterlockNextState = STATE_T0_REPLAY;
+	  STATE_T1_REPLAY:     if(DCacheStallM)       InterlockNextState = STATE_T1_REPLAY;
 	                       else                   InterlockNextState = STATE_T0_READY;
-	  STATE_T3_DTLB_MISS:  if(DTLBWriteM)         InterlockNextState = STATE_T0_REPLAY;
+	  STATE_T3_DTLB_MISS:  if(DTLBWriteM)         InterlockNextState = STATE_T1_REPLAY;
 	                       else                   InterlockNextState = STATE_T3_DTLB_MISS;
 	  STATE_T4_ITLB_MISS:  if(ITLBWriteF)         InterlockNextState = STATE_T0_READY;
 	                       else                   InterlockNextState = STATE_T4_ITLB_MISS;
-	  STATE_T5_ITLB_MISS:  if(ITLBWriteF)         InterlockNextState = STATE_T0_REPLAY;
+	  STATE_T5_ITLB_MISS:  if(ITLBWriteF)         InterlockNextState = STATE_T1_REPLAY;
 	                       else                   InterlockNextState = STATE_T5_ITLB_MISS;
 	  STATE_T7_DITLB_MISS: if(DTLBWriteM)         InterlockNextState = STATE_T5_ITLB_MISS;
 	                       else                   InterlockNextState = STATE_T7_DITLB_MISS;
@ -122,12 +122,12 @@ module interlockfsm(
 	endcase
  end
  
-  assign SelReplayMemE = (InterlockCurrState == STATE_T0_REPLAY & DCacheStallM) |
+  assign SelReplayMemE = (InterlockCurrState == STATE_T1_REPLAY & DCacheStallM) |
                         (InterlockCurrState == STATE_T3_DTLB_MISS & DTLBWriteM) | 
                         (InterlockCurrState == STATE_T5_ITLB_MISS & ITLBWriteF);
  assign SelHPTW = (InterlockCurrState == STATE_T3_DTLB_MISS) | (InterlockCurrState == STATE_T4_ITLB_MISS) |
 				   (InterlockCurrState == STATE_T5_ITLB_MISS) | (InterlockCurrState == STATE_T7_DITLB_MISS);
  assign IgnoreRequestTLB = (InterlockCurrState == STATE_T0_READY & (ITLBMissOrDAFaultF | DTLBMissOrDAFaultM));
  assign IgnoreRequestTrapM = (InterlockCurrState == STATE_T0_READY & (TrapM)) |
-							  ((InterlockCurrState == STATE_T0_REPLAY) & (TrapM));
+							  ((InterlockCurrState == STATE_T1_REPLAY) & (TrapM));
 endmodule
--- a/pipelined/src/lsu/lsu.sv
+++ b/pipelined/src/lsu/lsu.sv
@ -82,7 +82,6 @@ module lsu (
   input var                logic [`XLEN-1:0] PMPADDR_ARRAY_REGW[`PMP_ENTRIES-1:0] // *** this one especially has a large note attached to it in pmpchecker.
  );

-  localparam                CACHE_ENABLED = `DMEM == `MEM_CACHE;
  logic [`XLEN+1:0]         IEUAdrExtM;
  logic [`PA_BITS-1:0]      LSUPAdrM;
  logic                     DTLBMissM;
@ -105,7 +104,8 @@ module lsu (
  logic                     LSUBusWriteCrit;
  logic                     DataDAPageFaultM;
  logic [`XLEN-1:0]         LSUWriteDataM;
-    
+  logic [(`XLEN-1)/8:0]     ByteMaskM;
+  
  // *** TO DO: Burst mode, byte write enables to DTIM, cache, exeternal memory, remove subword write from uncore, 

  flopenrc #(`XLEN) AddressMReg(clk, reset, FlushM, ~StallM, IEUAdrE, IEUAdrM);
@ -193,10 +193,11 @@ module lsu (
    // Merge SimpleRAM and SRAM1p1rw into one that is good for synthesis and RAM libraries and flops
    dtim dtim(.clk, .reset, .CPUBusy, .LSURWM, .IEUAdrM, .IEUAdrE, .TrapM, .FinalWriteDataM, 
              .ReadDataWordM, .BusStall, .LSUBusWrite,.LSUBusRead, .BusCommittedM,
-              .ReadDataWordMuxM, .DCacheStallM, .DCacheCommittedM,
+              .DCacheStallM, .DCacheCommittedM, .ByteMaskM,
              .DCacheMiss, .DCacheAccess);
-    assign SelUncachedAdr = '0; // value does not matter.
-  end else begin : bus  
+  end 
+  if (`DBUS) begin : bus  
+    localparam           CACHE_ENABLED = `DMEM == `MEM_CACHE;
    localparam integer   WORDSPERLINE = (CACHE_ENABLED) ? `DCACHE_LINELENINBITS/`XLEN : 1;
    localparam integer   LINELEN = (CACHE_ENABLED) ? `DCACHE_LINELENINBITS : `XLEN;
    localparam integer   LOGWPL = (CACHE_ENABLED) ? $clog2(WORDSPERLINE) : 1;
@ -206,8 +207,6 @@ module lsu (
    logic                DCacheWriteLine;
    logic                DCacheFetchLine;
    logic                DCacheBusAck;
-    logic                save, restore;
-    logic [`PA_BITS-1:0] WordOffsetAddr;
    logic                SelBus;
    logic [LOGWPL-1:0]   WordCount;
            
@ -224,58 +223,45 @@ module lsu (
      .s(SelUncachedAdr), .y(ReadDataWordMuxM));
    mux2 #(`XLEN) LsuBushwdataMux(.d0(ReadDataWordM), .d1(FinalWriteDataM),
      .s(SelUncachedAdr), .y(LSUBusHWDATA));
-    mux2 #(`PA_BITS) WordAdrrMux(.d0(LSUPAdrM), 
-      .d1({{`PA_BITS-LOGWPL{1'b0}}, WordCount} << $clog2(`XLEN/8)), .s(LSUBusWriteCrit),
-      .y(WordOffsetAddr)); // *** can reduce width of mux. only need the offset.  
    
-
    if(CACHE_ENABLED) begin : dcache
      cache #(.LINELEN(`DCACHE_LINELENINBITS), .NUMLINES(`DCACHE_WAYSIZEINBYTES*8/LINELEN),
-              .NUMWAYS(`DCACHE_NUMWAYS), .DCACHE(1)) dcache(
-        .clk, .reset, .CPUBusy, .save, .restore, .RW(LSURWM), .Atomic(LSUAtomicM),
+              .NUMWAYS(`DCACHE_NUMWAYS), .LOGWPL(LOGWPL), .WORDLEN(`XLEN), .MUXINTERVAL(`XLEN), .DCACHE(1)) dcache(
+        .clk, .reset, .CPUBusy, .LSUBusWriteCrit, .RW(LSURWM), .Atomic(LSUAtomicM),
        .FlushCache(FlushDCacheM), .NextAdr(LSUAdrE), .PAdr(LSUPAdrM), 
+        .ByteMask(ByteMaskM), .WordCount,
        .FinalWriteData(FinalWriteDataM), .Cacheable(CacheableM),
        .CacheStall(DCacheStallM), .CacheMiss(DCacheMiss), .CacheAccess(DCacheAccess),
        .IgnoreRequestTLB, .IgnoreRequestTrapM, .CacheCommitted(DCacheCommittedM), 
-        .CacheBusAdr(DCacheBusAdr), .ReadDataLine(ReadDataLineM), 
+        .CacheBusAdr(DCacheBusAdr), .ReadDataWord(ReadDataWordM), 
        .CacheBusWriteData(DCacheBusWriteData), .CacheFetchLine(DCacheFetchLine), 
        .CacheWriteLine(DCacheWriteLine), .CacheBusAck(DCacheBusAck), .InvalidateCacheM(1'b0));

-      subcachelineread #(LINELEN, `XLEN, `XLEN) subcachelineread(  // *** merge into cache
-        .clk, .reset, .PAdr(WordOffsetAddr), .save, .restore,
-        .ReadDataLine(ReadDataLineM), .ReadDataWord(ReadDataWordM));
-
    end else begin : passthrough
      assign {ReadDataWordM, DCacheStallM, DCacheCommittedM, DCacheFetchLine, DCacheWriteLine} = '0;
      assign DCacheMiss = CacheableM; assign DCacheAccess = CacheableM;
    end
+  end else begin: nobus // block: bus
+    assign {LSUBusHWDATA, SelUncachedAdr} = '0; 
+    assign ReadDataWordMuxM = ReadDataWordM;
  end

-  if(`DMEM != `MEM_BUS) begin // *** always, not just with no MEM_BUS.  Only produces byte write enable
-    logic [`XLEN-1:0] ReadDataWordMaskedM;
-    // ** there is definitely a sww bug with memory mapped i/o. check wally64priv.    
-    assign ReadDataWordMaskedM = SelUncachedAdr ? '0 : ReadDataWordM; // AND-gate
-    // *** consider moving this AND gate into the sww.
-    //assign ReadDataWordMaskedM = ReadDataWordM; // *** this change only works because the i/o devices dont' write bytes other than the ones specific to their address.
-    subwordwrite subwordwrite(.HRDATA(ReadDataWordMaskedM), .HADDRD(LSUPAdrM[2:0]),
-      .HSIZED({LSUFunct3M[2], 1'b0, LSUFunct3M[1:0]}),
-         .HWDATAIN(FinalAMOWriteDataM), .HWDATA(FinalWriteDataM));
-  end else 
-    assign FinalWriteDataM = FinalAMOWriteDataM;
-
  subwordread subwordread(.ReadDataWordMuxM, .LSUPAdrM(LSUPAdrM[2:0]),
 		.Funct3M(LSUFunct3M), .ReadDataM);

  /////////////////////////////////////////////////////////////////////////////////////////////
  // Atomic operations
  /////////////////////////////////////////////////////////////////////////////////////////////
-
-  // *** why does this need DTLBMissM?
  if (`A_SUPPORTED) begin:atomic
    atomic atomic(.clk, .reset, .FlushW, .CPUBusy, .ReadDataM, .LSUWriteDataM, .LSUPAdrM, 
      .LSUFunct7M, .LSUFunct3M, .LSUAtomicM, .PreLSURWM, .IgnoreRequest, 
-      .DTLBMissM, .FinalAMOWriteDataM, .SquashSCW, .LSURWM);
+      .FinalAMOWriteDataM, .SquashSCW, .LSURWM);
  end else begin:lrsc
    assign SquashSCW = 0; assign LSURWM = PreLSURWM; assign FinalAMOWriteDataM = LSUWriteDataM;
  end
+
+  subwordwrite subwordwrite(.LSUPAdrM(LSUPAdrM[2:0]),
+    .LSUFunct3M, .FinalAMOWriteDataM, .FinalWriteDataM, .ByteMaskM);
+
+  
 endmodule
--- a/pipelined/src/lsu/subwordwrite.sv
+++ b/pipelined/src/lsu/subwordwrite.sv
@ -0,0 +1,87 @@
+///////////////////////////////////////////
+// subwordwrite.sv
+//
+// Written: David_Harris@hmc.edu 9 January 2021
+// Modified: 
+//
+// Purpose: Masking and muxing for subword writes
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module subwordwrite (
+  input logic [2:0]          LSUPAdrM,           // low address bits; forwarded to swbytemask as HADDRD
+  input logic [2:0]          LSUFunct3M,         // [1:0] selects the store size used for replication below
+  input logic [`XLEN-1:0]    FinalAMOWriteDataM, // store data before subword replication
+  output logic [`XLEN-1:0]   FinalWriteDataM,    // replicated data in masked byte lanes, zero in all others
+  output logic [`XLEN/8-1:0] ByteMaskM           // per-byte-lane mask produced by swbytemask
+                     );
+                  
+  logic [`XLEN-1:0]          WriteDataSubwordDuplicated;
+
+  swbytemask swbytemask(.HSIZED({LSUFunct3M[2], 1'b0, LSUFunct3M[1:0]}), .HADDRD(LSUPAdrM),
+    .ByteMask(ByteMaskM));
+  
+  if (`XLEN == 64) begin:sww
+    // Replicate the low byte/halfword/word across the full 64-bit word so every lane holds valid data
+    always_comb 
+      case(LSUFunct3M[1:0])
+        2'b00:  WriteDataSubwordDuplicated = {8{FinalAMOWriteDataM[7:0]}};  // sb
+        2'b01:  WriteDataSubwordDuplicated = {4{FinalAMOWriteDataM[15:0]}}; // sh
+        2'b10:  WriteDataSubwordDuplicated = {2{FinalAMOWriteDataM[31:0]}}; // sw
+        2'b11:  WriteDataSubwordDuplicated = FinalAMOWriteDataM;            // sd
+      endcase
+
+    always_comb begin
+      FinalWriteDataM='0; // lanes not selected by ByteMaskM stay zero (no merge with read data)
+      if (ByteMaskM[0]) FinalWriteDataM[7:0]   = WriteDataSubwordDuplicated[7:0];
+      if (ByteMaskM[1]) FinalWriteDataM[15:8]  = WriteDataSubwordDuplicated[15:8];
+      if (ByteMaskM[2]) FinalWriteDataM[23:16] = WriteDataSubwordDuplicated[23:16];
+      if (ByteMaskM[3]) FinalWriteDataM[31:24] = WriteDataSubwordDuplicated[31:24];
+      if (ByteMaskM[4]) FinalWriteDataM[39:32] = WriteDataSubwordDuplicated[39:32];
+      if (ByteMaskM[5]) FinalWriteDataM[47:40] = WriteDataSubwordDuplicated[47:40];
+      if (ByteMaskM[6]) FinalWriteDataM[55:48] = WriteDataSubwordDuplicated[55:48];
+      if (ByteMaskM[7]) FinalWriteDataM[63:56] = WriteDataSubwordDuplicated[63:56];
+    end 
+
+  end else begin:sww // 32-bit
+    // Replicate the low byte/halfword across the full 32-bit word so every lane holds valid data
+    always_comb 
+      case(LSUFunct3M[1:0])
+        2'b00:  WriteDataSubwordDuplicated = {4{FinalAMOWriteDataM[7:0]}};  // sb
+        2'b01:  WriteDataSubwordDuplicated = {2{FinalAMOWriteDataM[15:0]}}; // sh
+        2'b10:  WriteDataSubwordDuplicated = FinalAMOWriteDataM;            // sw
+        default: WriteDataSubwordDuplicated = FinalAMOWriteDataM; // shouldn't happen
+      endcase
+
+    always_comb begin
+      FinalWriteDataM='0; // lanes not selected by ByteMaskM stay zero (no merge with read data)
+      if (ByteMaskM[0]) FinalWriteDataM[7:0]   = WriteDataSubwordDuplicated[7:0];
+      if (ByteMaskM[1]) FinalWriteDataM[15:8]  = WriteDataSubwordDuplicated[15:8];
+      if (ByteMaskM[2]) FinalWriteDataM[23:16] = WriteDataSubwordDuplicated[23:16];
+      if (ByteMaskM[3]) FinalWriteDataM[31:24] = WriteDataSubwordDuplicated[31:24];
+    end 
+
+  end
+endmodule
--- a/pipelined/src/lsu/swbytemask.sv
+++ b/pipelined/src/lsu/swbytemask.sv
@ -0,0 +1,66 @@
+///////////////////////////////////////////
+// swbytemask.sv
+//
+// Written: David_Harris@hmc.edu 9 January 2021
+// Modified: 
+//
+// Purpose: Byte mask generation for subword writes
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module swbytemask (
+  input logic [3:0]          HSIZED,    // access size; only HSIZED[1:0] is decoded below
+  input logic [2:0]         HADDRD,     // low address bits that position the access within the word
+  output logic [`XLEN/8-1:0] ByteMask); // one bit per byte lane of an XLEN-bit word; 1 = lane selected
+  
+
+  if(`XLEN == 64) begin // 8 byte lanes
+    always_comb begin
+      case(HSIZED[1:0])
+        2'b00: begin ByteMask = 8'b00000000; ByteMask[HADDRD[2:0]] = 1; end // sb: one-hot at the addressed byte
+        2'b01: case (HADDRD[2:1]) // sh: lane pair chosen by HADDRD[2:1]; HADDRD[0] is ignored
+                  2'b00: ByteMask = 8'b0000_0011;
+                  2'b01: ByteMask = 8'b0000_1100;
+                  2'b10: ByteMask = 8'b0011_0000;
+                  2'b11: ByteMask = 8'b1100_0000;
+                endcase
+        2'b10: if (HADDRD[2]) ByteMask = 8'b11110000; // sw: upper or lower half chosen by HADDRD[2]
+               else           ByteMask = 8'b00001111;
+        2'b11: ByteMask = 8'b1111_1111; // sd: all lanes
+      endcase
+    end
+  end else begin // 4 byte lanes; HADDRD[2] is unused in this branch
+    always_comb begin
+      case(HSIZED[1:0])
+        2'b00: begin ByteMask = 4'b0000; ByteMask[HADDRD[1:0]] = 1; end // sb
+        2'b01: if (HADDRD[1]) ByteMask = 4'b1100; // sh: upper or lower half chosen by HADDRD[1]
+               else           ByteMask = 4'b0011;
+        2'b10: ByteMask = 4'b1111; // sw: all lanes
+        default: ByteMask =  4'b1111; // HSIZED[1:0] == 2'b11: handled as a full-word access
+      endcase
+    end
+  end
+
+endmodule
--- a/pipelined/src/uncore/ram.sv
+++ b/pipelined/src/uncore/ram.sv
@ -38,6 +38,7 @@ module ram #(parameter BASE=0, RANGE = 65535) (
  input  logic             HREADY,
  input  logic [1:0]       HTRANS,
  input  logic [`XLEN-1:0] HWDATA,
+  input  logic [3:0]       HSIZED,
  output logic [`XLEN-1:0] HREADRam,
  output logic             HRESPRam, HREADYRam
 );
@ -53,6 +54,7 @@ module ram #(parameter BASE=0, RANGE = 65535) (
  logic        initTrans;
  logic        memwrite;
  logic [3:0]  busycount;
+  logic [`XLEN/8-1:0] ByteMaskM;

  if(`FPGA) begin:ram
    initial begin
@ -104,6 +106,8 @@ module ram #(parameter BASE=0, RANGE = 65535) (
    end // initial begin
  end // if (FPGA)

+  swbytemask swbytemask(.HSIZED, .HADDRD(A[2:0]), .ByteMask(ByteMaskM));
+  
  assign initTrans = HREADY & HSELRam & (HTRANS != 2'b00);

  // *** this seems like a weird way to use reset
@ -148,17 +152,24 @@ module ram #(parameter BASE=0, RANGE = 65535) (
 -----/\----- EXCLUDED -----/\----- */
  
  /* verilator lint_off WIDTH */
+  genvar index;
+  always_ff @(posedge HCLK)
+    HWADDR <= #1 A;
  if (`XLEN == 64)  begin:ramrw
-    always_ff @(posedge HCLK) begin
-      HWADDR <= #1 A;
+    always_ff @(posedge HCLK) 
      HREADRam0 <= #1 RAM[A[31:3]];
-      if (memwrite & risingHREADYRam) RAM[HWADDR[31:3]] <= #1 HWDATA;
+    for(index = 0; index < `XLEN/8; index++) begin
+      always_ff @(posedge HCLK) begin
+        if (memwrite & risingHREADYRam & ByteMaskM[index]) RAM[HWADDR[31:3]][8*(index+1)-1:8*index] <= #1 HWDATA[8*(index+1)-1:8*index];
+      end
    end
  end else begin 
-    always_ff @(posedge HCLK) begin:ramrw
-      HWADDR <= #1 A;  
+    always_ff @(posedge HCLK) 
      HREADRam0 <= #1 RAM[A[31:2]];
-      if (memwrite & risingHREADYRam) RAM[HWADDR[31:2]] <= #1 HWDATA;
+    for(index = 0; index < `XLEN/8; index++) begin
+      always_ff @(posedge HCLK) begin:ramrw
+        if (memwrite & risingHREADYRam & ByteMaskM[index]) RAM[HWADDR[31:2]][8*(index+1)-1:8*index] <= #1 HWDATA[8*(index+1)-1:8*index];
+      end
    end
  end
  /* verilator lint_on WIDTH */
--- a/pipelined/src/uncore/subwordwrite.sv
+++ b/pipelined/src/uncore/subwordwrite.sv
@ -1,111 +0,0 @@
-///////////////////////////////////////////
-// subwordwrite.sv
-//
-// Written: David_Harris@hmc.edu 9 January 2021
-// Modified: 
-//
-// Purpose: Masking and muxing for subword writes
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// MIT LICENSE
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
-// software and associated documentation files (the "Software"), to deal in the Software 
-// without restriction, including without limitation the rights to use, copy, modify, merge, 
-// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
-// to whom the Software is furnished to do so, subject to the following conditions:
-//
-//   The above copyright notice and this permission notice shall be included in all copies or 
-//   substantial portions of the Software.
-//
-//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
-//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
-//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
-//   OR OTHER DEALINGS IN THE SOFTWARE.
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module subwordwrite (
-  input  logic [`XLEN-1:0] HRDATA,
-  input  logic [2:0]       HADDRD,
-  input  logic [3:0]       HSIZED,
-  input  logic [`XLEN-1:0] HWDATAIN,
-  output logic [`XLEN-1:0] HWDATA
-);
-                  
-  logic [`XLEN-1:0] WriteDataSubwordDuplicated;
-  
-  if (`XLEN == 64) begin:sww
-    logic [7:0]      ByteMaskM;
-    // Compute write mask
-    always_comb 
-      case(HSIZED[1:0])
-        2'b00:  begin ByteMaskM = 8'b00000000; ByteMaskM[HADDRD[2:0]] = 1; end // sb
-        2'b01:  case (HADDRD[2:1])
-                  2'b00: ByteMaskM = 8'b00000011;
-                  2'b01: ByteMaskM = 8'b00001100;
-                  2'b10: ByteMaskM = 8'b00110000;
-                  2'b11: ByteMaskM = 8'b11000000;
-                endcase
-        2'b10:  if (HADDRD[2]) ByteMaskM = 8'b11110000;
-                  else        ByteMaskM = 8'b00001111;
-        2'b11:  ByteMaskM = 8'b11111111;
-      endcase
-
-    // Handle subword writes
-    always_comb 
-      case(HSIZED[1:0])
-        2'b00:  WriteDataSubwordDuplicated = {8{HWDATAIN[7:0]}};  // sb
-        2'b01:  WriteDataSubwordDuplicated = {4{HWDATAIN[15:0]}}; // sh
-        2'b10:  WriteDataSubwordDuplicated = {2{HWDATAIN[31:0]}}; // sw
-        2'b11:  WriteDataSubwordDuplicated = HWDATAIN;            // sw
-      endcase
-
-    always_comb begin
-      HWDATA=HRDATA;
-      if (ByteMaskM[0]) HWDATA[7:0]   = WriteDataSubwordDuplicated[7:0];
-      if (ByteMaskM[1]) HWDATA[15:8]  = WriteDataSubwordDuplicated[15:8];
-      if (ByteMaskM[2]) HWDATA[23:16] = WriteDataSubwordDuplicated[23:16];
-      if (ByteMaskM[3]) HWDATA[31:24] = WriteDataSubwordDuplicated[31:24];
-      if (ByteMaskM[4]) HWDATA[39:32] = WriteDataSubwordDuplicated[39:32];
-      if (ByteMaskM[5]) HWDATA[47:40] = WriteDataSubwordDuplicated[47:40];
-      if (ByteMaskM[6]) HWDATA[55:48] = WriteDataSubwordDuplicated[55:48];
-      if (ByteMaskM[7]) HWDATA[63:56] = WriteDataSubwordDuplicated[63:56];
-    end 
-
-  end else begin:sww // 32-bit
-    logic [3:0]      ByteMaskM;
-    // Compute write mask
-    always_comb 
-      case(HSIZED[1:0])
-        2'b00:  begin ByteMaskM = 4'b0000; ByteMaskM[HADDRD[1:0]] = 1; end // sb
-        2'b01:  if (HADDRD[1]) ByteMaskM = 4'b1100;
-                  else         ByteMaskM = 4'b0011;
-        2'b10:  ByteMaskM = 4'b1111;
-        default: ByteMaskM = 4'b111; // shouldn't happen
-      endcase
-
-    // Handle subword writes
-    always_comb 
-      case(HSIZED[1:0])
-        2'b00:  WriteDataSubwordDuplicated = {4{HWDATAIN[7:0]}};  // sb
-        2'b01:  WriteDataSubwordDuplicated = {2{HWDATAIN[15:0]}}; // sh
-        2'b10:  WriteDataSubwordDuplicated = HWDATAIN;            // sw
-        default: WriteDataSubwordDuplicated = HWDATAIN; // shouldn't happen
-      endcase
-
-    always_comb begin
-      HWDATA=HRDATA;
-      if (ByteMaskM[0]) HWDATA[7:0]   = WriteDataSubwordDuplicated[7:0];
-      if (ByteMaskM[1]) HWDATA[15:8]  = WriteDataSubwordDuplicated[15:8];
-      if (ByteMaskM[2]) HWDATA[23:16] = WriteDataSubwordDuplicated[23:16];
-      if (ByteMaskM[3]) HWDATA[31:24] = WriteDataSubwordDuplicated[31:24];
-    end 
-
-  end
-endmodule
--- a/pipelined/src/uncore/uncore.sv
+++ b/pipelined/src/uncore/uncore.sv
@ -38,7 +38,7 @@ module uncore (
  input  logic             HCLK, HRESETn,
  input  logic             TIMECLK,
  input  logic [31:0]      HADDR,
-  input  logic [`AHBW-1:0] HWDATAIN,
+  input  logic [`AHBW-1:0] HWDATA,
  input  logic             HWRITE,
  input  logic [2:0]       HSIZE,
  input  logic [2:0]       HBURST,
@ -68,7 +68,6 @@ module uncore (
  output logic [63:0]      MTIME_CLINT
 );
  
-  logic [`XLEN-1:0] HWDATA;
  logic [`XLEN-1:0] HREADRam, HREADCLINT, HREADPLIC, HREADGPIO, HREADUART, HREADSDC;

  logic [8:0]      HSELRegions;
@ -90,15 +89,6 @@ module uncore (
  // unswizzle HSEL signals
  assign {HSELEXT, HSELBootRom, HSELRam, HSELCLINT, HSELGPIO, HSELUART, HSELPLIC, HSELSDC} = HSELRegions[7:0];

-  // subword accesses: converts HWDATAIN to HWDATA only if no dtim or cache.
-  if(`DMEM == `MEM_BUS)
-    subwordwrite sww(
-      .HRDATA,
-      .HADDRD, .HSIZED, 
-      .HWDATAIN, .HWDATA);
-  else assign HWDATA = HWDATAIN;
-  
-
 //  generate
    // on-chip RAM
    if (`RAM_SUPPORTED) begin : ram
@ -106,7 +96,7 @@ module uncore (
        .BASE(`RAM_BASE), .RANGE(`RAM_RANGE)) ram (
        .HCLK, .HRESETn, 
        .HSELRam, .HADDR,
-        .HWRITE, .HREADY,
+        .HWRITE, .HREADY, .HSIZED,
        .HTRANS, .HWDATA, .HREADRam,
        .HRESPRam, .HREADYRam);
    end
@ -116,7 +106,7 @@ module uncore (
      bootrom(
        .HCLK, .HRESETn, 
        .HSELRam(HSELBootRom), .HADDR,
-        .HWRITE, .HREADY, .HTRANS,
+        .HWRITE, .HREADY, .HTRANS, .HSIZED,
        .HWDATA,
        .HREADRam(HREADBootRom), .HRESPRam(HRESPBootRom), .HREADYRam(HREADYBootRom));
    end
--- a/pipelined/src/wally/wallypipelinedsoc.sv
+++ b/pipelined/src/wally/wallypipelinedsoc.sv
@ -92,7 +92,7 @@ module wallypipelinedsoc (
   );

  uncore uncore(.HCLK, .HRESETn, .TIMECLK,
-    .HADDR, .HWDATAIN(HWDATA), .HWRITE, .HSIZE, .HBURST, .HPROT, .HTRANS, .HMASTLOCK, .HRDATAEXT,
+    .HADDR, .HWDATA, .HWRITE, .HSIZE, .HBURST, .HPROT, .HTRANS, .HMASTLOCK, .HRDATAEXT,
    .HREADYEXT, .HRESPEXT, .HRDATA, .HREADY, .HRESP, .HADDRD, .HSIZED, .HWRITED,
    .TimerIntM, .SwIntM, .ExtIntM, .GPIOPinsIn, .GPIOPinsOut, .GPIOPinsEn, .UARTSin, .UARTSout, .MTIME_CLINT, 
 		.HSELEXT,
--- a/pipelined/testbench/fp/tests/fma-testbench.sv
+++ b/pipelined/testbench/fp/tests/fma-testbench.sv
@ -0,0 +1,279 @@
+
+`include "wally-config.vh"
+`define PATH "../../../../tests/fp/vectors/"
+
+string tests[] = '{
+    "f16_mulAdd_rne.tv",
+    "f16_mulAdd_rz.tv",
+    "f16_mulAdd_ru.tv",
+    "f16_mulAdd_rd.tv",
+    "f16_mulAdd_rnm.tv",
+    "f32_mulAdd_rne.tv",
+    "f32_mulAdd_rz.tv",
+    "f32_mulAdd_ru.tv",
+    "f32_mulAdd_rd.tv",
+    "f32_mulAdd_rnm.tv",
+    "f64_mulAdd_rne.tv",
+    "f64_mulAdd_rz.tv",
+    "f64_mulAdd_ru.tv",
+    "f64_mulAdd_rd.tv",
+    "f64_mulAdd_rnm.tv",
+    "f128_mulAdd_rne.tv",
+    "f128_mulAdd_rz.tv",
+    "f128_mulAdd_ru.tv",
+    "f128_mulAdd_rd.tv",
+    "f128_mulAdd_rnm.tv"
+};
+
+// steps to run FMA tests
+//    1) create test vectors in riscv-wally/tests/fp with: ./run_all.sh
+//    2) go to riscv-wally/pipelined/testbench/fp/tests
+//    3) run ./sim-fma-batch (or ./sim-fma to run with the GUI)
+
+module fmatestbench();
+
+  logic clk;
+  logic [31:0] errors=0;
+  logic [31:0] vectornum=0;
+  logic [`FLEN*4+7+4+4:0] testvectors[6133248:0];
+  int i = `ZFH_SUPPORTED ? 0 : `F_SUPPORTED ? 5 : `D_SUPPORTED ? 10 : 15; // set i to the first test that is run
+
+  logic [`FLEN-1:0]     X, Y, Z;  // inputs read from TestFloat
+  logic [`FLEN-1:0]	    ans;      // result from TestFloat
+  logic [7:0]	 	        flags;    // flags read form testfloat
+  logic [2:0]		        FrmE;     // rounding mode
+  logic	[`FPSIZES/3:0]  FmtE;     // format - 10 = half, 00 = single, 01 = double, 11 = quad
+  logic [3:0]		        FrmRead;  // rounding mode read from testfloat
+  logic	[3:0]			      FmtRead;  // format read from testfloat
+  logic [`FLEN-1:0]     FMAResM;  // FMA's outputed result
+  logic [4:0]           FMAFlgM;  // FMA's outputed flags
+  logic [2:0]		        FOpCtrlE; // which opperation
+  logic                 wnan;     // is the outputed result NaN
+  logic                 ansnan;   // is the correct answer NaN
+  
+  // signals needed to connect modules
+  logic [`NE+1:0]	  ProdExpE;
+  logic 				    AddendStickyE;
+  logic 					  KillProdE; 
+  logic             XSgnE, YSgnE, ZSgnE;
+  logic [`NE-1:0]   XExpE, YExpE, ZExpE;
+  logic [`NF:0]     XManE, YManE, ZManE;
+  logic             XNormE;
+  logic             XExpMaxE;
+  logic             XNaNE, YNaNE, ZNaNE;
+  logic             XSNaNE, YSNaNE, ZSNaNE;
+  logic             XDenormE, YDenormE, ZDenormE;
+  logic             XInfE, YInfE, ZInfE;
+  logic             XZeroE, YZeroE, ZZeroE;
+  logic             YExpMaxE, ZExpMaxE, Mult;
+  logic [3*`NF+5:0]	SumE;       
+  logic 			      InvZE;
+  logic 			      NegSumE;
+  logic 			      ZSgnEffE;
+  logic 			      PSgnE;
+  logic [$clog2(3*`NF+7)-1:0]	NormCntE;
+
+
+  assign FOpCtrlE = 3'b0; // set to 0 because test float only tests fMADD
+  assign Mult = 1'b0;     // set to zero because not testing multiplication
+
+  // Decide whether the expected answer (ansnan) and the DUT result (wnan) are NaN
+  // for the format of the current vector: exponent field all ones and a non-zero
+  // fraction.
+  // Ordinary blocking assignments are used here. A procedural continuous "assign"
+  // inside always_comb stays in force after the block has finished, which is not
+  // what a combinational block is meant to describe. The default arm gives both
+  // flags a value for every FmtRead so no state is implied.
+  // NOTE(review): this block indexes with `FLEN/`LEN1/`LEN2 while the result checker
+  //   uses `D_LEN/`S_LEN/`H_LEN - confirm the two agree in every configuration.
+  always_comb begin
+    case (FmtRead)
+        4'b11: begin // quad
+          ansnan = &ans[`FLEN-2:`NF]&(|ans[`NF-1:0]);
+          wnan = &FMAResM[`FLEN-2:`NF]&(|FMAResM[`NF-1:0]);
+        end
+        4'b01: begin // double
+          ansnan = &ans[`LEN1-2:`NF1]&(|ans[`NF1-1:0]);
+          wnan = &FMAResM[`LEN1-2:`NF1]&(|FMAResM[`NF1-1:0]);
+        end
+        4'b00: begin // single
+          ansnan = &ans[`LEN2-2:`NF2]&(|ans[`NF2-1:0]);
+          wnan = &FMAResM[`LEN2-2:`NF2]&(|FMAResM[`NF2-1:0]);
+        end
+        4'b10: begin // half
+          ansnan = &ans[`H_LEN-2:`H_NF]&(|ans[`H_NF-1:0]);
+          wnan = &FMAResM[`H_LEN-2:`H_NF]&(|FMAResM[`H_NF-1:0]);
+        end
+        default: begin // no vector applied yet or unknown format code
+          ansnan = 1'bx;
+          wnan = 1'bx;
+        end
+    endcase
+  end
+
+  // instantiate devices under test
+  unpack unpack(.X, .Y, .Z, .FmtE, .FOpCtrlE, .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE,
+                .XManE, .YManE, .ZManE, .XNormE, .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE,
+                .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
+                .XExpMaxE);
+  fma1 fma1(.XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE,
+            .XDenormE, .YDenormE, .ZDenormE,  .XZeroE, .YZeroE, .ZZeroE,
+            .FOpCtrlE, .FmtE, .SumE, .NegSumE, .InvZE, .NormCntE, .ZSgnEffE, .PSgnE,
+            .ProdExpE, .AddendStickyE, .KillProdE); 
+  fma2 fma2(.XSgnM(XSgnE), .YSgnM(YSgnE), .XExpM(XExpE), .YExpM(YExpE), .ZExpM(ZExpE), .XManM(XManE), .YManM(YManE), .ZManM(ZManE), 
+            .XNaNM(XNaNE), .YNaNM(YNaNE), .ZNaNM(ZNaNE), .XZeroM(XZeroE), .YZeroM(YZeroE), .ZZeroM(ZZeroE), .XInfM(XInfE), .YInfM(YInfE), .ZInfM(ZInfE), 
+            .XSNaNM(XSNaNE), .YSNaNM(YSNaNE), .ZSNaNM(ZSNaNE), .KillProdM(KillProdE), .AddendStickyM(AddendStickyE), .ProdExpM(ProdExpE), 
+            .SumM(SumE), .NegSumM(NegSumE), .InvZM(InvZE), .NormCntM(NormCntE), .ZSgnEffM(ZSgnEffE), .PSgnM(PSgnE), .FmtM(FmtE), .FrmM(FrmE), 
+            .FMAFlgM, .FMAResM, .Mult);
+
+
+  // produce clock
+  always begin
+    clk = 1; #5; clk = 0; #5;
+  end
+  
+  // Read first test
+  initial begin
+      $display("\n\nRunning %s vectors", tests[i]);
+      $readmemh({`PATH, tests[i]}, testvectors);
+  end
+
+  // apply test vectors on rising edge of clk
+  always @(posedge clk) begin
+    #1; 
+    flags = testvectors[vectornum][15:8];
+    FrmRead = testvectors[vectornum][7:4];
+    FmtRead = testvectors[vectornum][3:0];
+    if (FmtRead==4'b11 & `Q_SUPPORTED) 	begin       // quad
+      X = testvectors[vectornum][16+4*(`Q_LEN)-1:16+3*(`Q_LEN)];
+      Y = testvectors[vectornum][16+3*(`Q_LEN)-1:16+2*(`Q_LEN)];
+      Z = testvectors[vectornum][16+2*(`Q_LEN)-1:16+`Q_LEN];
+      ans = testvectors[vectornum][16+(`Q_LEN-1):16];
+    end
+    else if (FmtRead==4'b01 & `D_SUPPORTED)	begin	  // double
+      X = {{`FLEN-`D_LEN{1'b1}}, testvectors[vectornum][16+4*(`D_LEN)-1:16+3*(`D_LEN)]};
+      Y = {{`FLEN-`D_LEN{1'b1}}, testvectors[vectornum][16+3*(`D_LEN)-1:16+2*(`D_LEN)]};
+      Z = {{`FLEN-`D_LEN{1'b1}}, testvectors[vectornum][16+2*(`D_LEN)-1:16+`D_LEN]};
+      ans = {{`FLEN-`D_LEN{1'b1}}, testvectors[vectornum][16+(`D_LEN-1):16]};
+    end
+    else if (FmtRead==4'b00 & `F_SUPPORTED)	begin	  // single
+      X = {{`FLEN-`S_LEN{1'b1}}, testvectors[vectornum][16+4*(`S_LEN)-1:16+3*(`S_LEN)]};
+      Y = {{`FLEN-`S_LEN{1'b1}}, testvectors[vectornum][16+3*(`S_LEN)-1:16+2*(`S_LEN)]};
+      Z = {{`FLEN-`S_LEN{1'b1}}, testvectors[vectornum][16+2*(`S_LEN)-1:16+`S_LEN]};
+      ans = {{`FLEN-`S_LEN{1'b1}}, testvectors[vectornum][16+(`S_LEN-1):16]};
+    end
+    else if (FmtRead==4'b10 & `ZFH_SUPPORTED)	begin	  // half
+      X = {{`FLEN-`H_LEN{1'b1}}, testvectors[vectornum][16+4*(`H_LEN)-1:16+3*(`H_LEN)]};
+      Y = {{`FLEN-`H_LEN{1'b1}}, testvectors[vectornum][16+3*(`H_LEN)-1:16+2*(`H_LEN)]};
+      Z = {{`FLEN-`H_LEN{1'b1}}, testvectors[vectornum][16+2*(`H_LEN)-1:16+`H_LEN]};
+      ans = {{`FLEN-`H_LEN{1'b1}}, testvectors[vectornum][16+(`H_LEN-1):16]};
+    end
+    else begin	  
+      X = {`FLEN{1'bx}};
+      Y = {`FLEN{1'bx}};
+      Z = {`FLEN{1'bx}};
+      ans = {`FLEN{1'bx}};
+    end
+
+    // trim format and rounding mode to appropriate size
+    if (`FPSIZES <= 2) FmtE = FmtRead === `FMT; // rewrite format if 2 or less floating formats are supported
+    else FmtE = FmtRead[1:0];
+    FrmE = FrmRead[2:0];
+  end
+
+  // check results on falling edge of clk
+    always @(negedge clk) begin
+      // quad: report a mismatch for a quad-precision vector (FmtRead == 4'b11)
+      // NOTE(review): the pass test ORs "flags match" with "result matches", so a vector is
+      //   reported only when the flags and the result are both wrong - confirm this is intended.
+        if((FmtRead==4'b11) & ~((FMAFlgM === flags[4:0]) | (FMAResM === ans) | (wnan & (FMAResM[`FLEN-2:0] === ans[`FLEN-2:0] | (XNaNE&(FMAResM[`FLEN-2:0] === {X[`FLEN-2:`NF],1'b1,X[`NF-2:0]})) | (YNaNE&(FMAResM[`FLEN-2:0] === {Y[`FLEN-2:`NF],1'b1,Y[`NF-2:0]})) | (ZNaNE&(FMAResM[`FLEN-2:0] === {Z[`FLEN-2:`NF],1'b1,Z[`NF-2:0]})))))) begin
+          $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          if(XDenormE) $display( "xdenorm ");
+          if(YDenormE) $display( "ydenorm ");
+          if(ZDenormE) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          // The sign is bit `FLEN-1. Index `FLEN lies outside the [`FLEN-1:0] vectors and
+          // always reads x, which would keep the +/-inf messages from ever printing.
+          if(FMAResM[`FLEN-1] && FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] === 0) $display( "FMAResM=-inf ");
+          if(~FMAResM[`FLEN-1] && FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] === 0) $display( "FMAResM=+inf ");
+          if(FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] !== 0 && ~FMAResM[`NF-1]) $display( "FMAResM=sigNaN ");
+          if(FMAResM[`FLEN-2:`NF] === {`NE{1'b1}} && FMAResM[`NF-1:0] !== 0 && FMAResM[`NF-1]) $display( "FMAResM=qutNaN ");
+          if(ans[`FLEN-1] && ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] === 0) $display( "ans=-inf ");
+          if(~ans[`FLEN-1] && ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] === 0) $display( "ans=+inf ");
+          if(ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] !== 0 && ~ans[`NF-1]) $display( "ans=sigNaN ");
+          if(ans[`FLEN-2:`NF] === {`NE{1'b1}} && ans[`NF-1:0] !== 0 && ans[`NF-1]) $display( "ans=qutNaN ");
+          errors = errors + 1;
+          if (errors === 1) $stop; // halt on the first mismatch
+        end
+      // double: report a mismatch for a double-precision vector (FmtRead == 4'b01)
+        if((FmtRead==4'b01) & ~((FMAFlgM === flags[4:0]) | (FMAResM === ans) | (wnan & (FMAResM[`D_LEN-2:0] === ans[`D_LEN-2:0] | (XNaNE&(FMAResM[`D_LEN-2:0] === {X[`D_LEN-2:`D_NF],1'b1,X[`D_NF-2:0]})) | (YNaNE&(FMAResM[`D_LEN-2:0] === {Y[`D_LEN-2:`D_NF],1'b1,Y[`D_NF-2:0]})) | (ZNaNE&(FMAResM[`D_LEN-2:0] === {Z[`D_LEN-2:`D_NF],1'b1,Z[`D_NF-2:0]})))))) begin
+          $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          // The diagnostics decode the double-precision fields (exponent [`D_LEN-2:`D_NF],
+          // fraction [`D_NF-1:0], quiet bit `D_NF-1), the same fields the comparison above
+          // uses, rather than the single-precision bit positions.
+          if(~(|X[`D_LEN-2:`D_NF]) && |X[`D_NF-1:0]) $display( "xdenorm ");
+          if(~(|Y[`D_LEN-2:`D_NF]) && |Y[`D_NF-1:0]) $display( "ydenorm ");
+          if(~(|Z[`D_LEN-2:`D_NF]) && |Z[`D_NF-1:0]) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(&FMAResM[`D_LEN-2:`D_NF] && |FMAResM[`D_NF-1:0] && ~FMAResM[`D_NF-1]) $display( "FMAResM=sigNaN ");
+          if(&FMAResM[`D_LEN-2:`D_NF] && |FMAResM[`D_NF-1:0] && FMAResM[`D_NF-1] ) $display( "FMAResM=qutNaN ");
+          if(&ans[`D_LEN-2:`D_NF] && |ans[`D_NF-1:0] && ~ans[`D_NF-1] ) $display( "ans=sigNaN ");
+          if(&ans[`D_LEN-2:`D_NF] && |ans[`D_NF-1:0] && ans[`D_NF-1]) $display( "ans=qutNaN ");
+          errors = errors + 1;
+          if (errors === 1) $stop; // halt on the first mismatch
+        end
+      // single: report a mismatch for a single-precision vector (FmtRead == 4'b00).
+      // The diagnostics decode X/Y/Z, the result and the answer with fixed 32-bit positions:
+      // exponent in bits [30:23], fraction in bits [22:0], and bit 22 telling quiet from signalling NaNs.
+        if((FmtRead==4'b00) & ~((FMAFlgM === flags[4:0]) | (FMAResM === ans) | (wnan & (FMAResM[`S_LEN-2:0] === ans[`S_LEN-2:0] | (XNaNE&(FMAResM[`S_LEN-2:0] === {X[`S_LEN-2:`S_NF],1'b1,X[`S_NF-2:0]})) | (YNaNE&(FMAResM[`S_LEN-2:0] === {Y[`S_LEN-2:`S_NF],1'b1,Y[`S_NF-2:0]})) | (ZNaNE&(FMAResM[`S_LEN-2:0] === {Z[`S_LEN-2:`S_NF],1'b1,Z[`S_NF-2:0]})))))) begin
+          $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          if(~(|X[30:23]) && |X[22:0]) $display( "xdenorm ");
+          if(~(|Y[30:23]) && |Y[22:0]) $display( "ydenorm ");
+          if(~(|Z[30:23]) && |Z[22:0]) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(&FMAResM[30:23] && |FMAResM[22:0] && ~FMAResM[22]) $display( "FMAResM=sigNaN ");
+          if(&FMAResM[30:23] && |FMAResM[22:0] && FMAResM[22] ) $display( "FMAResM=qutNaN ");
+          if(&ans[30:23] && |ans[22:0] && ~ans[22] ) $display( "ans=sigNaN ");
+          if(&ans[30:23] && |ans[22:0] && ans[22]) $display( "ans=qutNaN ");
+          errors = errors + 1;
+          if (errors === 1) $stop; // halt on the first mismatch
+        end
+      // half: report a mismatch for a half-precision vector.
+      // The format code for half is 4'b10 (4'b01 is double); with the double code here
+      // the half-precision vectors would never be compared.
+        if((FmtRead==4'b10) & ~((FMAFlgM === flags[4:0]) | (FMAResM === ans) | (wnan & (FMAResM[`H_LEN-2:0] === ans[`H_LEN-2:0] | (XNaNE&(FMAResM[`H_LEN-2:0] === {X[`H_LEN-2:`H_NF],1'b1,X[`H_NF-2:0]})) | (YNaNE&(FMAResM[`H_LEN-2:0] === {Y[`H_LEN-2:`H_NF],1'b1,Y[`H_NF-2:0]})) | (ZNaNE&(FMAResM[`H_LEN-2:0] === {Z[`H_LEN-2:`H_NF],1'b1,Z[`H_NF-2:0]})))))) begin
+          $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+          // The diagnostics decode the half-precision fields (exponent [`H_LEN-2:`H_NF],
+          // fraction [`H_NF-1:0], quiet bit `H_NF-1), matching the comparison above.
+          if(~(|X[`H_LEN-2:`H_NF]) && |X[`H_NF-1:0]) $display( "xdenorm ");
+          if(~(|Y[`H_LEN-2:`H_NF]) && |Y[`H_NF-1:0]) $display( "ydenorm ");
+          if(~(|Z[`H_LEN-2:`H_NF]) && |Z[`H_NF-1:0]) $display( "zdenorm ");
+          if(FMAFlgM[4] !== 0) $display( "invld ");
+          if(FMAFlgM[2] !== 0) $display( "ovrflw ");
+          if(FMAFlgM[1] !== 0) $display( "unflw ");
+          if(&FMAResM[`H_LEN-2:`H_NF] && |FMAResM[`H_NF-1:0] && ~FMAResM[`H_NF-1]) $display( "FMAResM=sigNaN ");
+          if(&FMAResM[`H_LEN-2:`H_NF] && |FMAResM[`H_NF-1:0] && FMAResM[`H_NF-1] ) $display( "FMAResM=qutNaN ");
+          if(&ans[`H_LEN-2:`H_NF] && |ans[`H_NF-1:0] && ~ans[`H_NF-1] ) $display( "ans=sigNaN ");
+          if(&ans[`H_LEN-2:`H_NF] && |ans[`H_NF-1:0] && ans[`H_NF-1]) $display( "ans=qutNaN ");
+          errors = errors + 1;
+          if (errors === 1) $stop; // halt on the first mismatch
+        end
+        
+	    // if ( vectornum === 3165862) $stop; // uncomment for specific test
+      vectornum = vectornum + 1; // increment test
+      if (testvectors[vectornum][0] === 1'bx) begin // if reached the end of file
+        if (errors) begin // if there were errors
+          $display("%s completed with %d tests and %d errors", tests[i], vectornum, errors);
+          $stop;
+        end
+        else begin // if no errors
+          if(tests[i] === "") begin // if no more tests
+            $display("\nAll tests completed with %d errors\n", errors);
+            $stop;
+          end
+
+          $display("%s completed successfully with %d tests and %d errors (across all tests)\n", tests[i], vectornum, errors);
+
+          // increment tests - skip some precisions if needed
+          if ((i === 4 & ~`F_SUPPORTED) | (i === 9 & ~`D_SUPPORTED) | (i === 14 & ~`Q_SUPPORTED)) i = i+5;
+          if ((i === 9 & ~`D_SUPPORTED) | (i === 14 & ~`Q_SUPPORTED)) i = i+5;
+          if ((i === 14 & ~`Q_SUPPORTED)) i = i+5;
+          i = i+1;
+
+          // if no more tests - finish
+          if(tests[i] === "") begin
+            $display("\nAll tests completed with %d errors\n", errors);
+            $stop;
+          end 
+
+          // read next files
+          $display("Running %s vectors", tests[i]);
+          $readmemh({`PATH, tests[i]}, testvectors);
+          vectornum = 0;
+        end
+      end
+  end
+endmodule
--- a/pipelined/testbench/fp/tests/fma.do
+++ b/pipelined/testbench/fp/tests/fma.do
@ -0,0 +1,50 @@
+# fma.do
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with the FMA testbench (fma-testbench.sv)
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# NOTE(review): this header was adapted from wally-pipelined.do; the run time quoted there
+# (1:10 to run the RV64IC tests using the gui) may not apply to this script.
+
+# run with vsim -do "do fma.do rv64fp"
+
+# Use this fma.do file to run the FMA testbench.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do fma.do rv64fp
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -c -do "do fma.do rv64fp"
+# (omit the "-c" to see the GUI while running from the shell)
+# The argument (rv64fp above) is $1 below: the directory under ../../../config to compile against.
+
+onbreak {resume}
+
+# create library
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# compile source files
+# suppress spurious warnings about
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# start and run simulation
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+# $1, $2, ... are the words given after the script name in the "do" command
+vlog +incdir+../../../config/$1 +incdir+../../../config/shared fma-testbench.sv ../../../src/fpu/fma.sv ../../../src/fpu/unpack.sv -suppress 2583 -suppress 7063
+
+vsim -voptargs=+acc work.fmatestbench
+
+view wave
+#-- display input and output signals as hexadecimal values
+#do ./wave-dos/peripheral-waves.do
+#add log -recursive /*
+#do wave.do deal with when ready
+
+#-- Run the Simulation 
+#run 3600 
+run -all
+noview fma-testbench.sv
+view wave
+
--- a/pipelined/testbench/fp/tests/sim-fma
+++ b/pipelined/testbench/fp/tests/sim-fma
@ -0,0 +1 @@
+vsim -do "do fma.do rv64fp"
--- a/pipelined/testbench/fp/tests/sim-fma-batch
+++ b/pipelined/testbench/fp/tests/sim-fma-batch
@ -0,0 +1 @@
+vsim -c -do "do fma.do rv64fp"
--- a/pipelined/testbench/testbench.sv
+++ b/pipelined/testbench/testbench.sv
@ -359,6 +359,8 @@ module riscvassertions;
 //    assert (`MEM_DCACHE == 0 | `MEM_DTIM == 0) else $error("Can't simultaneously have a data cache and TIM");
    assert (`DMEM == `MEM_CACHE | `VIRTMEM_SUPPORTED ==0) else $error("Virtual memory needs dcache");
    assert (`IMEM == `MEM_CACHE | `VIRTMEM_SUPPORTED ==0) else $error("Virtual memory needs icache");
+    // Fail only when a cache is selected while its bus is disabled. A non-cache memory with
+    // the bus enabled (e.g. DMEM = `MEM_NONE with DBUS = 1) is used by existing configurations.
+    assert (`DMEM != `MEM_CACHE | `DBUS == 1) else $error("Dcache requires DBUS.");
+    assert (`IMEM != `MEM_CACHE | `IBUS == 1) else $error("Icache requires IBUS.");
  end
 endmodule

--- a/tests/fp/create_vectors128fma.sh
+++ b/tests/fp/create_vectors128fma.sh
@ -0,0 +1,31 @@
+#!/bin/sh
+# Generate TestFloat f128_mulAdd vectors, one file per rounding mode, and tag
+# every line so the FMA testbench can tell rounding mode and format apart.
+
+BUILD="./TestFloat-3e/build/Linux-x86_64-GCC"
+OUTPUT="./vectors"
+
+$BUILD/testfloat_gen -rnear_even f128_mulAdd > $OUTPUT/f128_mulAdd_rne.tv
+$BUILD/testfloat_gen -rminMag f128_mulAdd > $OUTPUT/f128_mulAdd_rz.tv
+$BUILD/testfloat_gen -rmax f128_mulAdd > $OUTPUT/f128_mulAdd_ru.tv
+$BUILD/testfloat_gen -rmin f128_mulAdd > $OUTPUT/f128_mulAdd_rd.tv
+$BUILD/testfloat_gen -rnear_maxMag f128_mulAdd > $OUTPUT/f128_mulAdd_rnm.tv
+
+# format: X_Y_Z_answer_flags_Frm_Fmt
+# Each command turns the field separators into "_" and appends "_<Frm>_<Fmt>" (Fmt 3 = quad).
+# "-i -e" is written out on purpose: GNU sed reads "-ie" as "-i" with backup suffix "e"
+# and leaves a copy of every vector file behind as <name>.tve.
+sed -i -e 's/ /_/g' -e 's/$/_0_3/' $OUTPUT/f128_mulAdd_rne.tv
+sed -i -e 's/ /_/g' -e 's/$/_1_3/' $OUTPUT/f128_mulAdd_rz.tv
+sed -i -e 's/ /_/g' -e 's/$/_3_3/' $OUTPUT/f128_mulAdd_ru.tv
+sed -i -e 's/ /_/g' -e 's/$/_2_3/' $OUTPUT/f128_mulAdd_rd.tv
+sed -i -e 's/ /_/g' -e 's/$/_4_3/' $OUTPUT/f128_mulAdd_rnm.tv
--- a/tests/fp/create_vectors16fma.sh
+++ b/tests/fp/create_vectors16fma.sh
@ -0,0 +1,31 @@
+#!/bin/sh
+# Generate TestFloat f16_mulAdd vectors, one file per rounding mode, and tag
+# every line so the FMA testbench can tell rounding mode and format apart.
+
+BUILD="./TestFloat-3e/build/Linux-x86_64-GCC"
+OUTPUT="./vectors"
+
+$BUILD/testfloat_gen -rnear_even f16_mulAdd > $OUTPUT/f16_mulAdd_rne.tv
+$BUILD/testfloat_gen -rminMag f16_mulAdd > $OUTPUT/f16_mulAdd_rz.tv
+$BUILD/testfloat_gen -rmax f16_mulAdd > $OUTPUT/f16_mulAdd_ru.tv
+$BUILD/testfloat_gen -rmin f16_mulAdd > $OUTPUT/f16_mulAdd_rd.tv
+$BUILD/testfloat_gen -rnear_maxMag f16_mulAdd > $OUTPUT/f16_mulAdd_rnm.tv
+
+# format: X_Y_Z_answer_flags_Frm_Fmt
+# Each command turns the field separators into "_" and appends "_<Frm>_<Fmt>" (Fmt 2 = half).
+# "-i -e" is written out on purpose: GNU sed reads "-ie" as "-i" with backup suffix "e"
+# and leaves a copy of every vector file behind as <name>.tve.
+sed -i -e 's/ /_/g' -e 's/$/_0_2/' $OUTPUT/f16_mulAdd_rne.tv
+sed -i -e 's/ /_/g' -e 's/$/_1_2/' $OUTPUT/f16_mulAdd_rz.tv
+sed -i -e 's/ /_/g' -e 's/$/_3_2/' $OUTPUT/f16_mulAdd_ru.tv
+sed -i -e 's/ /_/g' -e 's/$/_2_2/' $OUTPUT/f16_mulAdd_rd.tv
+sed -i -e 's/ /_/g' -e 's/$/_4_2/' $OUTPUT/f16_mulAdd_rnm.tv
--- a/tests/fp/create_vectors32fma.sh
+++ b/tests/fp/create_vectors32fma.sh
@ -0,0 +1,31 @@
+#!/bin/sh
+# Generate TestFloat f32_mulAdd vectors, one file per rounding mode, and tag
+# every line so the FMA testbench can tell rounding mode and format apart.
+
+BUILD="./TestFloat-3e/build/Linux-x86_64-GCC"
+OUTPUT="./vectors"
+
+$BUILD/testfloat_gen -rnear_even f32_mulAdd > $OUTPUT/f32_mulAdd_rne.tv
+$BUILD/testfloat_gen -rminMag f32_mulAdd > $OUTPUT/f32_mulAdd_rz.tv
+$BUILD/testfloat_gen -rmax f32_mulAdd > $OUTPUT/f32_mulAdd_ru.tv
+$BUILD/testfloat_gen -rmin f32_mulAdd > $OUTPUT/f32_mulAdd_rd.tv
+$BUILD/testfloat_gen -rnear_maxMag f32_mulAdd > $OUTPUT/f32_mulAdd_rnm.tv
+
+# format: X_Y_Z_answer_flags_Frm_Fmt
+# Each command turns the field separators into "_" and appends "_<Frm>_<Fmt>" (Fmt 0 = single).
+# "-i -e" is written out on purpose: GNU sed reads "-ie" as "-i" with backup suffix "e"
+# and leaves a copy of every vector file behind as <name>.tve.
+sed -i -e 's/ /_/g' -e 's/$/_0_0/' $OUTPUT/f32_mulAdd_rne.tv
+sed -i -e 's/ /_/g' -e 's/$/_1_0/' $OUTPUT/f32_mulAdd_rz.tv
+sed -i -e 's/ /_/g' -e 's/$/_3_0/' $OUTPUT/f32_mulAdd_ru.tv
+sed -i -e 's/ /_/g' -e 's/$/_2_0/' $OUTPUT/f32_mulAdd_rd.tv
+sed -i -e 's/ /_/g' -e 's/$/_4_0/' $OUTPUT/f32_mulAdd_rnm.tv
--- a/tests/fp/create_vectors64fma.sh
+++ b/tests/fp/create_vectors64fma.sh
@ -0,0 +1,31 @@
+#!/bin/sh
+# Generate TestFloat f64_mulAdd vectors, one file per rounding mode, and tag
+# every line so the FMA testbench can tell rounding mode and format apart.
+
+BUILD="./TestFloat-3e/build/Linux-x86_64-GCC"
+OUTPUT="./vectors"
+
+$BUILD/testfloat_gen -rnear_even f64_mulAdd > $OUTPUT/f64_mulAdd_rne.tv
+$BUILD/testfloat_gen -rminMag f64_mulAdd > $OUTPUT/f64_mulAdd_rz.tv
+$BUILD/testfloat_gen -rmax f64_mulAdd > $OUTPUT/f64_mulAdd_ru.tv
+$BUILD/testfloat_gen -rmin f64_mulAdd > $OUTPUT/f64_mulAdd_rd.tv
+$BUILD/testfloat_gen -rnear_maxMag f64_mulAdd > $OUTPUT/f64_mulAdd_rnm.tv
+
+# format: X_Y_Z_answer_flags_Frm_Fmt
+# Each command turns the field separators into "_" and appends "_<Frm>_<Fmt>" (Fmt 1 = double).
+# "-i -e" is written out on purpose: GNU sed reads "-ie" as "-i" with backup suffix "e"
+# and leaves a copy of every vector file behind as <name>.tve.
+sed -i -e 's/ /_/g' -e 's/$/_0_1/' $OUTPUT/f64_mulAdd_rne.tv
+sed -i -e 's/ /_/g' -e 's/$/_1_1/' $OUTPUT/f64_mulAdd_rz.tv
+sed -i -e 's/ /_/g' -e 's/$/_3_1/' $OUTPUT/f64_mulAdd_ru.tv
+sed -i -e 's/ /_/g' -e 's/$/_2_1/' $OUTPUT/f64_mulAdd_rd.tv
+sed -i -e 's/ /_/g' -e 's/$/_4_1/' $OUTPUT/f64_mulAdd_rnm.tv
--- a/tests/fp/run_all.sh
+++ b/tests/fp/run_all.sh
@ -8,3 +8,7 @@
 ./create_vectors64cmp.sh
 ./create_vectors64.sh
 ./create_vectorsi.sh
+./create_vectors16fma.sh
+./create_vectors32fma.sh
+./create_vectors64fma.sh
+./create_vectors128fma.sh
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/Makefile
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/Makefile
@ -1,3 +1,5 @@
 include ../../Makefile.include

+RVTEST_DEFINES += -march=rv$(XLEN)ia # KMG: removed compressed instructions from privileged tests
+
 $(eval $(call compile_template,-march=rv64iac -mabi=lp64 -Drvtest_mtrap_routine=True -DXLEN=$(XLEN)))
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-mtvec-01.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-mtvec-01.reference_output
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-stvec-01.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-stvec-01.reference_output
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-01.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-01.reference_output
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-s-01.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-s-01.reference_output
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-u-01.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-u-01.reference_output
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h
@ -55,12 +55,17 @@ RVTEST_CODE_BEGIN

 .endm

-.macro TRAP_HANDLER MODE, VECTORED=1 // default to vectored tests
-    //   Set up the exception Handler, keeping the original handler in x4.
+.macro TRAP_HANDLER MODE, VECTORED=1, DEBUG=0
+    // MODE decides which mode this trap handler will be taken in (M or S mode)
+    // VECTORED decides whether interrupts are handled with the vector table at trap_handler_MODE (1)
+    //      vs. using the non-vectored approach that the rest of the trap handler takes (0)
+    // DEBUG decides whether we also write xtval and the masked status bits (xPP, xPIE, and xIE for the handler's mode) to the signature (1)
+    //      vs. not saving that info to the signature (0)

-    // trap handler setup
+
+    //   Set up the exception Handler, keeping the original handler in x4.
    la x1, trap_handler_\MODE\()
-.if (\VECTORED == 1)
+.if (\VECTORED\() == 1)
    ori x1, x1, 0x1 // set mode field of tvec to 1, forcing vectored interrupts
 .endif

@ -115,17 +120,18 @@ trap_handler_\MODE\():
    j trap_unvectored_\MODE\() // for the unvectored implimentation: jump past this table of addresses into the actual handler
    // *** ASSUMES that a cause value of 0 for an interrupt is unimplemented
    // otherwise, a vectored interrupt handler should jump to trap_handler_\MODE\() + 4 * Interrupt cause code
-    .4byte s_soft_interrupt_\MODE\()    // 1: instruction access fault // the zero spot is taken up by the instruction to skip this table.
-    .4byte segfault_\MODE\()            // 2: reserved
-    .4byte m_soft_interrupt_\MODE\()    // 3: breakpoint
-    .4byte segfault_\MODE\()            // 4: reserved
-    .4byte s_time_interrupt_\MODE\()    // 5: load access fault
-    .4byte segfault_\MODE\()            // 6: reserved
-    .4byte m_time_interrupt_\MODE\()    // 7: store access fault
-    .4byte segfault_\MODE\()            // 8: reserved
-    .4byte s_ext_interrupt_\MODE\()     // 9: ecall from S-mode
-    .4byte segfault_\MODE\()            // 10: reserved
-    .4byte m_ext_interrupt_\MODE\()     // 11: ecall from M-mode
+    // No matter the value of VECTORED, exceptions (not interrupts) are handled in an unvectored way
+    j s_soft_interrupt_\MODE\()    // 1: supervisor software interrupt // the zero spot is taken up by the instruction to skip this table.
+    j segfault_\MODE\()            // 2: reserved
+    j m_soft_interrupt_\MODE\()    // 3: machine software interrupt
+    j segfault_\MODE\()            // 4: reserved
+    j s_time_interrupt_\MODE\()    // 5: supervisor timer interrupt
+    j segfault_\MODE\()            // 6: reserved
+    j m_time_interrupt_\MODE\()    // 7: machine timer interrupt
+    j segfault_\MODE\()            // 8: reserved
+    j s_ext_interrupt_\MODE\()     // 9: supervisor external interrupt
+    j segfault_\MODE\()            // 10: reserved
+    j m_ext_interrupt_\MODE\()     // 11: machine external interrupt
    // 12 through >=16 are reserved or designated for platform use

 trap_unvectored_\MODE\():
@ -139,12 +145,34 @@ trap_unvectored_\MODE\():
    addi x6, x6, 8     
    addi x16, x16, 8    // update pointers for logging results

+.if (\DEBUG\() == 1) // record extra information (MTVAL, some status bits) about traps
+    csrr x1, \MODE\()tval
+    sd x1, 0(x16)
+    addi x6, x6, 8     
+    addi x16, x16, 8
+
+    csrr x1, \MODE\()status
+.if (\MODE\() == m) // Taking traps in different modes means we want to get different bits from the status register.
+    li x5, 0x1888 // mask bits to select MPP, MPIE, and MIE.
+.else
+    li x5, 0x122 // mask bits to select SPP, SPIE, and SIE.
+.endif
+
+    and x5, x5, x1
+    sd x5, 0(x16) // store masked out status bits to the output
+    addi x6, x6, 8
+    addi x16, x16, 8
+
+.endif
+
    // Respond to trap based on cause
    // All interrupts should return after being logged
+    csrr x1, \MODE\()cause
    li x5, 0x8000000000000000   // if msb is set, it is an interrupt
    and x5, x5, x1
    bnez x5, trapreturn_\MODE\()   // return from interrupt
    // Other trap handling is specified in the vector Table
+    csrr x1, \MODE\()cause
    slli x1, x1, 3      // multiply cause by 8 to get offset in vector Table
    la x5, exception_vector_table_\MODE\()
    add x5, x5, x1      // compute address of vector in Table
@ -171,16 +199,16 @@ trapreturn_\MODE\():
 //     lw x5, 0(x1)        // read the faulting instruction
 //     li x1, 3            // check bottom 2 bits of instruction to see if compressed
 //     and x5, x5, x1      // mask the other bits
-//     beq x5, x1, trapreturn_uncompressed  // if 11, the instruction is return_uncompressed
+//     beq x5, x1, trapreturn_uncompressed_\MODE\()  // if 11, the instruction is return_uncompressed

-// trapreturn_compressed:
+// trapreturn_compressed_\MODE\():
 //     csrr x1, mepc       // get the mepc again
 //     addi x1, x1, 2      // add 2 to find the next instruction
-//     j trapreturn_specified // and return
+//     j trapreturn_specified_\MODE\() // and return

-// trapreturn_uncompressed:
-//     csrr x1, mepc       // get the mepc again    
-//     addi x1, x1, 4      // add 4 to find the next instruction
+// trapreturn_uncompressed_\MODE\():
+//      csrr x1, mepc       // get the mepc again    
+//      addi x1, x1, 4      // add 4 to find the next instruction

 trapreturn_specified_\MODE\():
    // reset the necessary pointers and registers (x1, x5, x6, and the return address going to mepc)
@ -224,6 +252,7 @@ trapreturn_finished_\MODE\():
    csrw \MODE\()epc, x1       // update the epc with address of next instruction
    ld x5, -16(sp)      // restore registers from stack before returning
    ld x1, -8(sp)
+    csrw \MODE\()ip, 0x0 // clear interrupt pending register to indicate interrupt has been handled
    \MODE\()ret  // return from trap

 ecallhandler_\MODE\():
@ -257,10 +286,14 @@ ecallhandler_changetousermode_\MODE\():
    csrc mstatus, x1
    j trapreturn_\MODE\()

-instrfault_\MODE\():
-    ld x1, -8(sp) // load return address int x1 (the address AFTER the jal into faulting page)
+instrpagefault_\MODE\():
+    ld x1, -8(sp) // load return address int x1 (the address AFTER the jal to the faulting address)
    j trapreturn_finished_\MODE\() // puts x1 into mepc, restores stack and returns to program (outside of faulting page)

+instrfault_\MODE\():
+    ld x1, -8(sp) // load return address int x1 (the address AFTER the jal to the faulting address)
+    j trapreturn_finished_\MODE\() // return to the code after recording the mcause
+
 illegalinstr_\MODE\():
    j trapreturn_\MODE\() // return to the code after recording the mcause

@ -268,23 +301,63 @@ accessfault_\MODE\():
    // *** What do I have to do here?
    j trapreturn_\MODE\()

-s_soft_interrupt_\MODE\(): // these labels are here to make sure the code compiles, but don't actually do anything yet
+addr_misaligned_\MODE\():
    j trapreturn_\MODE\()

+breakpt_\MODE\():
+    j trapreturn_\MODE\()
+
+s_soft_interrupt_\MODE\(): // vectored entry for interrupt cause 1: log a marker, clear the interrupt source, then join the common handler
+    li x5, 0x7EC // write 0x7EC (looks like VEC) to the output before the mcause and extras to indicate that this trap was handled with a vector table.
+    sd x5, 0(x16)       // store the marker at the current output location
+    addi x6, x6, 8
+    addi x16, x16, 8    // update pointers for logging results
+    la x28, 0x02000000 // Reset by clearing MSIP interrupt from CLINT
+    sw x0, 0(x28)       // NOTE(review): x28 is overwritten here and is not visibly restored on return - confirm no test relies on it
+    j trap_unvectored_\MODE\() // continue with the logging and return path shared with unvectored traps
+
 m_soft_interrupt_\MODE\():
-    j trapreturn_\MODE\()
+    li x5, 0x7EC
+    sd x5, 0(x16)
+    addi x6, x6, 8
+    addi x16, x16, 8
+    la x28, 0x02000000 // Reset by clearing MSIP interrupt from CLINT
+    sw x0, 0(x28)
+    j trap_unvectored_\MODE\()

 s_time_interrupt_\MODE\():
-    j trapreturn_\MODE\()
+    li x5, 0x7EC
+    sd x5, 0(x16)
+    addi x6, x6, 8
+    addi x16, x16, 8
+    j trap_unvectored_\MODE\()

 m_time_interrupt_\MODE\():
-    j trapreturn_\MODE\()
+    li x5, 0x7EC
+    sd x5, 0(x16)
+    addi x6, x6, 8
+    addi x16, x16, 8
+    j trap_unvectored_\MODE\()

 s_ext_interrupt_\MODE\():
-    j trapreturn_\MODE\()
+    li x5, 0x7EC
+    sd x5, 0(x16)
+    addi x6, x6, 8
+    addi x16, x16, 8
+    li x28, 0x10060000 // reset interrupt by clearing all the GPIO bits
+    sw x0, 8(x28) // disable the first pin as an output
+    sw x0, 40(x28) // write a 0 to the first output pin (reset interrupt)
+    j trap_unvectored_\MODE\()

 m_ext_interrupt_\MODE\():
-    j trapreturn_\MODE\()
+    li x5, 0x7EC
+    sd x5, 0(x16)
+    addi x6, x6, 8
+    addi x16, x16, 8
+    li x28, 0x10060000 // reset interrupt by clearing all the GPIO bits
+    sw x0, 8(x28) // disable the first pin as an output
+    sw x0, 40(x28) // write a 0 to the first output pin (reset interrupt)
+    j trap_unvectored_\MODE\()


    // Table of trap behavior
@ -294,19 +367,19 @@ m_ext_interrupt_\MODE\():

    .align 3 // aligns this data table to an 8 byte boundary
 exception_vector_table_\MODE\():
-    .8byte segfault_\MODE\()      // 0: instruction address misaligned
+    .8byte addr_misaligned_\MODE\()      // 0: instruction address misaligned
    .8byte instrfault_\MODE\()    // 1: instruction access fault
    .8byte illegalinstr_\MODE\()  // 2: illegal instruction
-    .8byte segfault_\MODE\()      // 3: breakpoint
-    .8byte segfault_\MODE\()      // 4: load address misaligned
+    .8byte breakpt_\MODE\()      // 3: breakpoint
+    .8byte addr_misaligned_\MODE\()      // 4: load address misaligned
    .8byte accessfault_\MODE\()   // 5: load access fault
-    .8byte segfault_\MODE\()      // 6: store address misaligned
+    .8byte addr_misaligned_\MODE\()      // 6: store address misaligned
    .8byte accessfault_\MODE\()   // 7: store access fault
    .8byte ecallhandler_\MODE\()  // 8: ecall from U-mode
    .8byte ecallhandler_\MODE\()  // 9: ecall from S-mode
    .8byte segfault_\MODE\()      // 10: reserved
    .8byte ecallhandler_\MODE\()  // 11: ecall from M-mode
-    .8byte instrfault_\MODE\()    // 12: instruction page fault
+    .8byte instrpagefault_\MODE\()    // 12: instruction page fault
    .8byte trapreturn_\MODE\()    // 13: load page fault
    .8byte segfault_\MODE\()      // 14: reserved
    .8byte trapreturn_\MODE\()    // 15: store page fault
@ -438,7 +511,7 @@ trap_handler_end_\MODE\(): // place to jump to so we can skip the trap handler a
 // they generally do not fault or cause issues as long as these modes are enabled 
 // *** add functionality to check if modes are enabled before jumping? maybe cause a fault if not?

-.macro GOTO_M_MODE RETURN_VPN RETURN_PAGETYPE
+.macro GOTO_M_MODE RETURN_VPN=0x0 RETURN_PAGETYPE=0x0
    li a0, 2 // determine trap handler behavior (go to machine mode)
    li a1, \RETURN_VPN // return VPN
    li a2, \RETURN_PAGETYPE // return page types
@ -446,7 +519,7 @@ trap_handler_end_\MODE\(): // place to jump to so we can skip the trap handler a
    // now in S mode
 .endm

-.macro GOTO_S_MODE RETURN_VPN RETURN_PAGETYPE
+.macro GOTO_S_MODE RETURN_VPN=0x0 RETURN_PAGETYPE=0x0
    li a0, 3 // determine trap handler behavior (go to supervisor mode)
    li a1, \RETURN_VPN // return VPN
    li a2, \RETURN_PAGETYPE // return page types
@ -454,7 +527,7 @@ trap_handler_end_\MODE\(): // place to jump to so we can skip the trap handler a
    // now in S mode
 .endm

-.macro GOTO_U_MODE RETURN_VPN RETURN_PAGETYPE
+.macro GOTO_U_MODE RETURN_VPN=0x0 RETURN_PAGETYPE=0x0
    li a0, 4 // determine trap handler behavior (go to user mode)
    li a1, \RETURN_VPN // return VPN
    li a2, \RETURN_PAGETYPE // return page types
@ -554,6 +627,87 @@ trap_handler_end_\MODE\(): // place to jump to so we can skip the trap handler a
    addi x16, x16, 8 
 .endm

+// The following tests involve causing many of the interrupts and exceptions that are easily done in a few lines
+//      This effectively includes everything that isn't to do with page faults (virtual memory)
+
+.macro CAUSE_INSTR_ADDR_MISALIGNED
+    // cause an instruction address misaligned trap; only possible when compressed instructions are disabled. Uses x28.
+    auipc x28, 0      // get current PC, which is aligned
+    addi x28, x28, 0x2  // add 2, not 1: jalr clears bit 0 of its target, so PC+1 would just jump back to the auipc and loop
+    jalr x28 // target is 2-byte but not 4-byte aligned: instruction address misaligned trap
+.endm
+
+.macro CAUSE_INSTR_ACCESS // jump to an address with no memory behind it; uses x28
+    la x28, 0x0 // address zero is an address with no memory
+    jalr x28 // cause instruction access trap when the fetch from address 0 is attempted
+.endm
+
+.macro CAUSE_ILLEGAL_INSTR // emits one 32-bit word of data into the instruction stream; touches no registers
+    .word 0x00000000 // 32 bits of zeros is an illegal instruction
+.endm
+
+.macro CAUSE_BREAKPNT // **** raise a breakpoint exception from whatever mode is currently running
+    ebreak
+.endm
+
+.macro CAUSE_LOAD_ADDR_MISALIGNED // word load from an odd address; uses x28 and overwrites x29
+    auipc x28, 0      // get current PC, which is aligned
+    addi x28, x28, 1  // PC + 1 is not word aligned
+    lw x29, 0(x28)    // load from a misaligned address
+.endm
+
+.macro CAUSE_LOAD_ACC // word load from address 0; uses x28 and overwrites x29
+    la x28, 0         // 0 is an address with no memory
+    lw x29, 0(x28)    // load from unimplemented address
+.endm
+
+.macro CAUSE_STORE_ADDR_MISALIGNED // word store to an odd address; uses x28 and stores whatever x29 currently holds
+    auipc x28, 0      // get current PC, which is aligned
+    addi x28, x28, 1  // PC + 1 is not word aligned; note the target lies inside this macro's own code bytes
+    sw x29, 0(x28)     // store to a misaligned address
+.endm
+
+.macro CAUSE_STORE_ACC // word store to address 0; uses x28 and stores whatever x29 currently holds
+    la x28, 0         // 0 is an address with no memory
+    sw x29, 0(x28)     // store to unimplemented address
+.endm
+
+.macro CAUSE_ECALL // environment call from the current privilege mode; the macro itself does not switch modes
+    // *** ASSUMES you have already gone to the mode you need to call this from.
+    ecall
+.endm
+
+.macro CAUSE_TIME_INTERRUPT
+    // Set MTIMECMP to MTIME + 0x100 using 32-bit accesses only, so the same code works for both RV32 and RV64.
+    // Uses x7 and x28-x31. Labels carry the \@ expansion counter so the macro can be used more than once per file.
+    li x28, 0x100          // Desired offset from the present time
+    la x29, 0x02004000     // MTIMECMP register in CLINT
+    la x30, 0x0200BFF8     // MTIME register in CLINT
+    lw x7, 0(x30)          // low word of MTIME
+    lw x31, 4(x30)         // high word of MTIME
+    add x28, x7, x28       // add desired offset to the current time
+    bgtu x28, x7, nowrap_\@ // new low word exceeds the old one: no wraparound, so no carry into the high word
+    addi x31, x31, 1       // if wrap, increment most significant word
+nowrap_\@:
+    sw x31, 4(x29)         // always store the most significant word of MTIMECMP (it was skipped before when there was no wrap)
+    sw x28, 0(x29)         // store into least significant word of MTIMECMP
+time_loop_\@: j time_loop_\@ // wait until interrupt occurs
+.endm
+
+.macro CAUSE_SOFT_INTERRUPT // request a software interrupt by setting MSIP in the CLINT; uses x28 and x29
+    la x28, 0x02000000      // MSIP register in CLINT
+    li x29, 1               // 1 in the lsb
+    sw x29, 0(x28)          // Write MSIP bit
+.endm
+
+.macro CAUSE_EXT_INTERRUPT // drive GPIO pin 0 to request an external interrupt; uses x28 and x29
+    li x28, 0x10060000 // load base GPIO memory location
+    li x29, 0x1        // bit 0 selects the first pin
+    sw x29, 8(x28) // enable the first pin as an output
+    sw x29, 28(x28) // set first pin to high interrupt enable (NOTE(review): offset is decimal 28 = 0x1C; confirm against the GPIO register map, 0x28 may have been intended)
+    sw x29, 40(x28) // write a 1 to the first output pin (cause interrupt) (NOTE(review): offset is decimal 40 = 0x28; confirm this is the output-value register)
+.endm
+
 .macro END_TESTS
    // invokes one final ecall to return to machine mode then terminates this program, so the output is
    //      0x8: termination called from U mode
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-mtvec-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-mtvec-01.S
@ -0,0 +1,45 @@
+///////////////////////////////////////////
+//
+// WALLY-unvectored-interrupt
+//
+// Author: Kip Macsai-Goren <kmacsaigoren@g.hmc.edu>
+//
+// Created 2022-03-11
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+#include "WALLY-TEST-LIB-64.h"
+
+INIT_TESTS
+
+// test 5.3.1.5 Unvectored interrupt tests 
+
+TRAP_HANDLER m, VECTORED=0, DEBUG=1 // turn off vectored interrupts, while turning on recording of mstatus bits.
+
+li x28, 0x8
+csrs sstatus, x28 // sets bit 3 of sstatus. NOTE(review): bit 3 is the MIE position in mstatus, not an sstatus field (SIE is bit 1); confirm whether mstatus was intended // *** might be unnecessary for s mode
+// WRITE_READ_CSR mie, 0xFFFF *** commented out until I can get the trap handler (and spike for time interrupts) to work correctly with interrupts
+
+// cause traps, ensuring that we DON'T go through the vectored part of the trap handler
+// *** this assumes that interrupt code 0 remains reserved
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+END_TESTS
+
+TEST_STACK_AND_DATA
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-stvec-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-stvec-01.S
@ -0,0 +1,55 @@
+///////////////////////////////////////////
+//
+// WALLY-unvectored-interrupt
+//
+// Author: Kip Macsai-Goren <kmacsaigoren@g.hmc.edu>
+//
+// Created 2022-03-11
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+#include "WALLY-TEST-LIB-64.h"
+
+INIT_TESTS
+
+// test 5.3.1.5 Unvectored interrupt tests 
+
+TRAP_HANDLER s, VECTORED=0, DEBUG=1 // turn off vectored interrupts, while turning on recording of mstatus bits.
+
+// li x28, 0x8
+// csrs sstatus, x28 // set sstatus.MIE bit to 1 // *** might be unnecessary for s mode
+// WRITE_READ_CSR mie, 0xFFFF *** commented out until I can get the trap handler (and spike for time interrupts) to work correctly with interrupts
+
+WRITE_READ_CSR mideleg, 0xFFFFFFFFFFFFFFFF
+
+GOTO_S_MODE
+
+// cause traps, ensuring that we DON'T go through the vectored part of the trap handler
+// *** this assumes that interrupt code 0 remains reserved
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+GOTO_U_MODE 
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+END_TESTS
+
+TEST_STACK_AND_DATA
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-trap-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-trap-01.S
@ -0,0 +1,76 @@
+///////////////////////////////////////////
+//
+// WALLY-trap
+//
+// Author: Kip Macsai-Goren <kmacsaigoren@g.hmc.edu>
+//
+// Created 2022-02-20
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+#include "WALLY-TEST-LIB-64.h"
+
+INIT_TESTS
+
+TRAP_HANDLER m, DEBUG=1 // turn on recording mtval and status bits on traps
+
+li x28, 0x8
+csrs mstatus, x28 // set mstatus.MIE bit to 1
+// WRITE_READ_CSR mie, 0xFFFF *** commented out until I can get the trap handler (and spike for time interrupts) to work correctly with interrupts
+
+// test 5.3.1.4 Basic trap tests 
+
+// CAUSE_INSTR_ADDR_MISALIGNED // (skipped because this exception may be impossible when compressed instructions are enabled)
+CAUSE_INSTR_ACCESS
+CAUSE_ILLEGAL_INSTR
+CAUSE_BREAKPNT
+CAUSE_LOAD_ADDR_MISALIGNED
+CAUSE_LOAD_ACC
+CAUSE_STORE_ADDR_MISALIGNED
+CAUSE_STORE_ACC
+GOTO_U_MODE // Causes M mode ecall
+GOTO_S_MODE // Causes U mode ecall
+GOTO_M_MODE // Causes S mode ecall
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+// try the traps again with mideleg = medeleg = all 1's to ensure traps still go to M mode from M mode
+
+WRITE_READ_CSR medeleg, 0xFFFFFFFFFFFFFFFF
+WRITE_READ_CSR mideleg, 0xFFFFFFFFFFFFFFFF
+
+// CAUSE_INSTR_ADDR_MISALIGNED // (skipped because this exception may be impossible when compressed instructions are enabled)
+CAUSE_INSTR_ACCESS
+CAUSE_ILLEGAL_INSTR
+CAUSE_BREAKPNT
+CAUSE_LOAD_ADDR_MISALIGNED
+CAUSE_LOAD_ACC
+CAUSE_STORE_ADDR_MISALIGNED
+CAUSE_STORE_ACC
+CAUSE_ECALL // M mode ecall
+// GOTO_U_MODE // leave these untested since we only need to ensure that traps taken from M mode are not delegated
+// GOTO_S_MODE 
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+END_TESTS
+
+TEST_STACK_AND_DATA
+
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-trap-s-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-trap-s-01.S
@ -0,0 +1,85 @@
+///////////////////////////////////////////
+//
+// WALLY-trap-s
+//
+// Author: Kip Macsai-Goren <kmacsaigoren@g.hmc.edu>
+//
+// Created 2022-03-11
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+#include "WALLY-TEST-LIB-64.h"
+
+INIT_TESTS
+
+// test 5.3.1.4 Basic trap tests 
+
+TRAP_HANDLER m, DEBUG=1 // turn on recording mtval and status bits on traps
+TRAP_HANDLER s, DEBUG=1 // have S mode trap handler as well
+
+// Like WALLY-trap, cause all the same traps from S mode and make sure they go to machine mode with zeroed mideleg, medeleg
+
+GOTO_S_MODE
+
+li x28, 0x8
+csrs sstatus, x28 // sets bit 3 of sstatus. NOTE(review): bit 3 is the MIE position in mstatus, not an sstatus field (SIE is bit 1); confirm the intended bit // *** might be unnecessary for s mode
+// WRITE_READ_CSR mie, 0xFFFF *** commented out until I can get the trap handler (and spike for time interrupts) to work correctly with interrupts
+
+
+// CAUSE_INSTR_ADDR_MISALIGNED // (skipped because this exception may be impossible when compressed instructions are enabled)
+CAUSE_INSTR_ACCESS
+CAUSE_ILLEGAL_INSTR
+CAUSE_BREAKPNT
+CAUSE_LOAD_ADDR_MISALIGNED
+CAUSE_LOAD_ACC
+CAUSE_STORE_ADDR_MISALIGNED
+CAUSE_STORE_ACC
+CAUSE_ECALL
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+
+// Now delegate all traps to S mode and attempt them again, ensuring they now go to the S mode trap handler
+// We can tell which one because the different trap handler modes write different bits of the status register 
+// to the output when debug is on.
+
+GOTO_M_MODE // so we can write the delegate registers
+
+WRITE_READ_CSR medeleg, 0xFFFFFFFFFFFFFFFF
+WRITE_READ_CSR mideleg, 0xFFFFFFFFFFFFFFFF
+
+GOTO_S_MODE
+
+// CAUSE_INSTR_ADDR_MISALIGNED // (skipped because this exception may be impossible when compressed instructions are enabled)
+CAUSE_INSTR_ACCESS
+CAUSE_ILLEGAL_INSTR
+CAUSE_BREAKPNT
+CAUSE_LOAD_ADDR_MISALIGNED
+CAUSE_LOAD_ACC
+CAUSE_STORE_ADDR_MISALIGNED
+CAUSE_STORE_ACC
+CAUSE_ECALL
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+END_TESTS
+
+TEST_STACK_AND_DATA
+
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-trap-u-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-trap-u-01.S
@ -0,0 +1,84 @@
+///////////////////////////////////////////
+//
+// WALLY-trap-u
+//
+// Author: Kip Macsai-Goren <kmacsaigoren@g.hmc.edu>
+//
+// Created 2022-03-11
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+#include "WALLY-TEST-LIB-64.h"
+
+INIT_TESTS
+
+// test 5.3.1.4 Basic trap tests 
+
+TRAP_HANDLER m, DEBUG=1 // turn on recording mtval and status bits on traps
+TRAP_HANDLER s, DEBUG=1 // have S mode trap handler as well
+
+// Like WALLY-trap, cause all the same traps from U mode and make sure they go to machine mode with zeroed mideleg, medeleg
+
+GOTO_U_MODE
+
+// li x28, 0x8
+// csrs sstatus, x28 // set sstatus.MIE bit to 1 // *** might be unnecessary for s mode
+// WRITE_READ_CSR mie, 0xFFFF *** commented out until I can get the trap handler (and spike for time interrupts) to work correctly with interrupts
+
+
+// CAUSE_INSTR_ADDR_MISALIGNED // (skipped because this exception may be impossible when compressed instructions are enabled)
+CAUSE_INSTR_ACCESS
+CAUSE_ILLEGAL_INSTR
+CAUSE_BREAKPNT
+CAUSE_LOAD_ADDR_MISALIGNED
+CAUSE_LOAD_ACC
+CAUSE_STORE_ADDR_MISALIGNED
+CAUSE_STORE_ACC
+CAUSE_ECALL
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+
+// Now delegate all traps to S mode and attempt them again, ensuring they now go to the S mode trap handler
+// We can tell which one because the different trap handler modes write different bits of the status register 
+// to the output when debug is on.
+
+GOTO_M_MODE // so we can write the delegate registers
+
+WRITE_READ_CSR medeleg, 0xFFFFFFFFFFFFFFFF
+WRITE_READ_CSR mideleg, 0xFFFFFFFFFFFFFFFF
+
+GOTO_U_MODE
+
+// CAUSE_INSTR_ADDR_MISALIGNED // (skipped because this exception may be impossible when compressed instructions are enabled)
+CAUSE_INSTR_ACCESS
+CAUSE_ILLEGAL_INSTR
+CAUSE_BREAKPNT
+CAUSE_LOAD_ADDR_MISALIGNED
+CAUSE_LOAD_ACC
+CAUSE_STORE_ADDR_MISALIGNED
+CAUSE_STORE_ACC
+CAUSE_ECALL
+
+// CAUSE_TIME_INTERRUPT *** intentionally causing this trap seems difficult in spike, although it is possible for it to happen accidentally.
+// CAUSE_SOFT_INTERRUPT *** exiting out of the trap handler after these is currently broken
+// CAUSE_EXT_INTERRUPT
+
+END_TESTS
+
+TEST_STACK_AND_DATA