Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally into main

2025-02-11 06:05:49 +00:00 · 2021-07-15 10:52:39 -04:00 · 2021-07-15 10:52:39 -04:00 · 74e67df080
commit 74e67df080
parent dd313d57c0 c74d26eea4
28 changed files with 2774 additions and 748 deletions
--- a/wally-pipelined/config/rv32icfd/BTBPredictor.txt
+++ b/wally-pipelined/config/rv32icfd/BTBPredictor.txt
--- a/wally-pipelined/config/rv32icfd/twoBitPredictor.txt
+++ b/wally-pipelined/config/rv32icfd/twoBitPredictor.txt
--- a/wally-pipelined/config/rv32icfd/wally-config.vh
+++ b/wally-pipelined/config/rv32icfd/wally-config.vh
@ -0,0 +1,106 @@
+//////////////////////////////////////////
+// wally-config.vh
+//
+// Written: David_Harris@hmc.edu 4 January 2021
+// Modified: 
+//
+// Purpose: Specify which features are configured
+//          Macros to determine which modes are supported based on MISA
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// include the configuration shared with the other configs (wally-shared.vh)
+`include "wally-shared.vh"
+
+`define BUILDROOT 0
+`define BUSYBEAR 0
+
+// RV32 or RV64: XLEN = 32 or 64 (this configuration selects 32)
+`define XLEN 32
+
+`define MISA (32'h00000104 | 1 << 5 | 1 << 20 | 1 << 18 | 1 << 12) // 0x104 = C (bit 2) + I (bit 8); then F (5), U (20), S (18), M (12). NOTE(review): D (bit 3) is not set although this config is named rv32icfd - confirm
+`define ZCSR_SUPPORTED 1
+`define COUNTERS 32
+`define ZCOUNTERS_SUPPORTED 1
+
+// Microarchitectural Features
+`define UARCH_PIPELINED 1
+`define UARCH_SUPERSCALR 0
+`define UARCH_SINGLECYCLE 0
+`define MEM_DCACHE 0
+`define MEM_DTIM 1
+`define MEM_ICACHE 0
+`define MEM_VIRTMEM 1
+`define VECTORED_INTERRUPTS_SUPPORTED 1
+
+`define ITLB_ENTRIES 32
+`define DTLB_ENTRIES 32
+
+// Legal numbers of PMP entries are 0, 16, or 64
+`define PMP_ENTRIES 16
+
+// Address space
+`define RESET_VECTOR 32'h80000000
+
+// Peripheral Addresses
+// Peripheral memory space extends from BASE to BASE+RANGE
+// RANGE should be a thermometer code with 0s in the upper bits and 1s in the lower bits
+
+// *** each of these is `PA_BITS wide (written below as 34-bit literals). Is this parameterizable INSIDE the config file?
+`define BOOTTIM_SUPPORTED 1'b1
+`define BOOTTIM_BASE   34'h00001000 
+`define BOOTTIM_RANGE  34'h00000FFF
+`define TIM_SUPPORTED 1'b1
+`define TIM_BASE       34'h80000000
+`define TIM_RANGE      34'h07FFFFFF
+`define CLINT_SUPPORTED 1'b1
+`define CLINT_BASE  34'h02000000
+`define CLINT_RANGE 34'h0000FFFF
+`define GPIO_SUPPORTED 1'b1
+`define GPIO_BASE   34'h10012000
+`define GPIO_RANGE  34'h000000FF
+`define UART_SUPPORTED 1'b1
+`define UART_BASE   34'h10000000
+`define UART_RANGE  34'h00000007
+`define PLIC_SUPPORTED 1'b1
+`define PLIC_BASE   34'h0C000000
+`define PLIC_RANGE  34'h03FFFFFF
+
+// Bus Interface width (bits)
+`define AHBW 32
+
+// Test modes
+
+// Tie GPIO outputs back to inputs
+`define GPIO_LOOPBACK_TEST 1
+
+// Hardware configuration
+`define UART_PRESCALE 1
+
+// Interrupt configuration
+`define PLIC_NUM_SRC 4
+// comment out the following define if there are >= 32 sources
+`define PLIC_NUM_SRC_LT_32
+`define PLIC_GPIO_ID 3
+`define PLIC_UART_ID 4
+
+`define TWO_BIT_PRELOAD "../config/rv32icfd/twoBitPredictor.txt"
+`define BTB_PRELOAD "../config/rv32icfd/BTBPredictor.txt"
+`define BPRED_ENABLED 1
+`define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
+`define TESTSBP 0
--- a/wally-pipelined/config/rv64icfd/wally-config.vh
+++ b/wally-pipelined/config/rv64icfd/wally-config.vh
@ -1,109 +1,3 @@
-// //////////////////////////////////////////
-// // wally-config.vh
-// //
-// // Written: David_Harris@hmc.edu 4 January 2021
-// // Modified: 
-// //
-// // Purpose: Specify which features are configured
-// //          Macros to determine which modes are supported based on MISA
-// // 
-// // A component of the Wally configurable RISC-V project.
-// // 
-// // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-// //
-// // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// // is furnished to do so, subject to the following conditions:
-// //
-// // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-// //
-// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-// ///////////////////////////////////////////
-
-// // include shared configuration
-// `include "wally-shared.vh"
-
-// `define BUILDROOT 0
-// `define BUSYBEAR 0
-
-// // RV32 or RV64: XLEN = 32 or 64
-// `define XLEN 32
-
-// `define MISA (32'h00000104 | 1 << 5 | 1 << 20 | 1 << 18 | 1 << 12)
-// `define ZCSR_SUPPORTED 1
-// `define COUNTERS 32
-// `define ZCOUNTERS_SUPPORTED 1
-
-// // Microarchitectural Features
-// `define UARCH_PIPELINED 1
-// `define UARCH_SUPERSCALR 0
-// `define UARCH_SINGLECYCLE 0
-// `define MEM_DCACHE 0
-// `define MEM_DTIM 1
-// `define MEM_ICACHE 0
-// `define MEM_VIRTMEM 1
-// `define VECTORED_INTERRUPTS_SUPPORTED 1
-
-// `define ITLB_ENTRIES 32
-// `define DTLB_ENTRIES 32
-
-// // Legal number of PMP entries are 0, 16, or 64
-// `define PMP_ENTRIES 16
-
-// // Address space
-// `define RESET_VECTOR 32'h80000000
-
-// // Peripheral Addresses
-// // Peripheral memory space extends from BASE to BASE+RANGE
-// // Range should be a thermometer code with 0's in the upper bits and 1s in the lower bits
-
-// // *** each of these is `PA_BITS wide. is this paramaterizable INSIDE the config file?
-// `define BOOTTIM_SUPPORTED 1'b1
-// `define BOOTTIM_BASE   34'h00001000 
-// `define BOOTTIM_RANGE  34'h00000FFF
-// `define TIM_SUPPORTED 1'b1
-// `define TIM_BASE       34'h80000000
-// `define TIM_RANGE      34'h07FFFFFF
-// `define CLINT_SUPPORTED 1'b1
-// `define CLINT_BASE  34'h02000000
-// `define CLINT_RANGE 34'h0000FFFF
-// `define GPIO_SUPPORTED 1'b1
-// `define GPIO_BASE   34'h10012000
-// `define GPIO_RANGE  34'h000000FF
-// `define UART_SUPPORTED 1'b1
-// `define UART_BASE   34'h10000000
-// `define UART_RANGE  34'h00000007
-// `define PLIC_SUPPORTED 1'b1
-// `define PLIC_BASE   34'h0C000000
-// `define PLIC_RANGE  34'h03FFFFFF
-
-// // Bus Interface width
-// `define AHBW 32
-
-// // Test modes
-
-// // Tie GPIO outputs back to inputs
-// `define GPIO_LOOPBACK_TEST 1
-
-// // Hardware configuration
-// `define UART_PRESCALE 1
-
-// // Interrupt configuration
-// `define PLIC_NUM_SRC 4
-// // comment out the following if >=32 sources
-// `define PLIC_NUM_SRC_LT_32
-// `define PLIC_GPIO_ID 3
-// `define PLIC_UART_ID 4
-
-// `define TWO_BIT_PRELOAD "../config/rv32ic/twoBitPredictor.txt"
-// `define BTB_PRELOAD "../config/rv32ic/BTBPredictor.txt"
-// `define BPRED_ENABLED 1
-// `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
-// `define TESTSBP 0
 //////////////////////////////////////////
 // wally-config.vh
 //
--- a/wally-pipelined/regression/sim-wally-batch-rv32icfd
+++ b/wally-pipelined/regression/sim-wally-batch-rv32icfd
@ -0,0 +1,3 @@
+vsim -c <<!
+do wally-pipelined-batch-rv32icfd.do ../config/rv32icfd rv32icfd
+!
--- a/wally-pipelined/regression/sim-wally-rv32icfd
+++ b/wally-pipelined/regression/sim-wally-rv32icfd
@ -0,0 +1 @@
+vsim -do wally-pipelined-rv32icfd.do
--- a/wally-pipelined/regression/wally-pipelined-batch-rv32icfd.do
+++ b/wally-pipelined/regression/wally-pipelined-batch-rv32icfd.do
@ -0,0 +1,42 @@
+# wally-pipelined-batch-rv32icfd.do 
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Batch regression run of the Imperas testbench for the rv32icfd configuration.
+# Timing note inherited from the RV64IC script: takes 1:10 to run RV64IC tests using gui
+
+# Use this wally-pipelined-batch-rv32icfd.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do wally-pipelined-batch-rv32icfd.do [configdir [libsuffix]]
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do wally-pipelined-batch-rv32icfd.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Resolve the optional arguments once so that every command below refers to
+# the same library.  $1 = config include directory, $2 = suffix of the work
+# library.  Both default to rv32icfd; previously $2 was read unconditionally,
+# which left it undefined when fewer than two arguments were given.
+# For example:
+#     do wally-pipelined-batch-rv32icfd.do ../config/rv32icfd rv32icfd
+set configdir ../config/rv32icfd
+set libsuffix rv32icfd
+if {$argc >= 1} {set configdir $1}
+if {$argc >= 2} {set libsuffix $2}
+set worklib work_$libsuffix
+
+# create library (start from a clean one if it already exists)
+if [file exists $worklib] {
+    vdel -lib $worklib -all
+}
+vlib $worklib
+
+# compile source files
+# suppress spurious warnings about 
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+vlog -work $worklib +incdir+$configdir +incdir+../config/shared ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583
+
+# start and run simulation
+# (no +acc here: faster sim during regressions, internal signals are not accessible)
+vopt ${worklib}.testbench -work $worklib -o workopt_$libsuffix
+vsim -lib $worklib workopt_$libsuffix
+
+run -all
+quit
--- a/wally-pipelined/regression/wally-pipelined-rv32icfd.do
+++ b/wally-pipelined/regression/wally-pipelined-rv32icfd.do
@ -0,0 +1,50 @@
+# wally-pipelined-rv32icfd.do 
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Timing note inherited from the RV64IC script: takes 1:10 to run RV64IC tests using gui
+
+# Use this wally-pipelined-rv32icfd.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do wally-pipelined-rv32icfd.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do wally-pipelined-rv32icfd.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# create library
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# compile source files
+# suppress spurious warnings about 
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# default to config/rv32icfd, but allow this to be overridden at the command line.  For example:
+# do wally-pipelined-rv32icfd.do ../config/rv32ic
+# NOTE(review): only the 1-argument branch compiles function_radix.sv - confirm whether the default branch needs it too
+switch $argc {
+    0 {vlog +incdir+../config/rv32icfd +incdir+../config/shared ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+    1 {vlog +incdir+$1  +incdir+../config/shared ../testbench/testbench-imperas.sv ../testbench/function_radix.sv ../src/*/*.sv -suppress 2583}
+}
+# start and run simulation
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+vopt +acc work.testbench -o workopt 
+vsim workopt
+
+view wave
+# display input and output signals as hexadecimal values
+do ./wave-dos/default-waves.do
+
+# Run the Simulation 
+#run 5000 
+run -all
+#quit
+noview ../testbench/testbench-imperas.sv
+view wave
--- a/wally-pipelined/src/fpu/divconv.sv
+++ b/wally-pipelined/src/fpu/divconv.sv
@ -1,4 +1,3 @@
-`timescale 1ps/1ps
 module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
 		regr_out, d, n, sel_muxa, sel_muxb, sel_muxr, reset, clk, load_rega, load_regb, 
 		load_regc, load_regd, load_regr, load_regs, P, op_type, exp_odd);
@ -106,123 +105,123 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_o
   
 endmodule // divconv

-module adder #(parameter WIDTH=8)
-   (input  logic [WIDTH-1:0] a, b,
-    input logic 	     cin,
-    output logic [WIDTH-1:0] y,
-    output logic 	     cout);
+// module adder #(parameter WIDTH=8)
+//    (input  logic [WIDTH-1:0] a, b,
+//     input logic 	     cin,
+//     output logic [WIDTH-1:0] y,
+//     output logic 	     cout);
   
-   assign {cout, y} = a + b + cin;
+//    assign {cout, y} = a + b + cin;
   
-endmodule // adder
+// endmodule // adder

-module flopenr #(parameter WIDTH = 8)
-   (input  logic             clk, reset, en,
-    input  logic [WIDTH-1:0] d, 
-    output logic [WIDTH-1:0] q);
+// module flopenr #(parameter WIDTH = 8)
+//    (input  logic             clk, reset, en,
+//     input  logic [WIDTH-1:0] d, 
+//     output logic [WIDTH-1:0] q);

-   always_ff @(posedge clk, posedge reset)
-     if (reset)   q <= #10 0;
-     else if (en) q <= #10 d;
+//    always_ff @(posedge clk, posedge reset)
+//      if (reset)   q <= #10 0;
+//      else if (en) q <= #10 d;
   
-endmodule // flopenr
+// endmodule // flopenr

-module flopr #(parameter WIDTH = 8)
-   (input  logic             clk, reset,
-    input  logic [WIDTH-1:0] d, 
-    output logic [WIDTH-1:0] q);
+// module flopr #(parameter WIDTH = 8)
+//    (input  logic             clk, reset,
+//     input  logic [WIDTH-1:0] d, 
+//     output logic [WIDTH-1:0] q);

-   always_ff @(posedge clk, posedge reset)
-     if (reset) q <= #10 0;
-     else       q <= #10 d;
+//    always_ff @(posedge clk, posedge reset)
+//      if (reset) q <= #10 0;
+//      else       q <= #10 d;
   
-endmodule // flopr
+// endmodule // flopr

-module flopenrc #(parameter WIDTH = 8)
-   (input  logic             clk, reset, en, clear,
-    input  logic [WIDTH-1:0] d, 
-    output logic [WIDTH-1:0] q);
+// module flopenrc #(parameter WIDTH = 8)
+//    (input  logic             clk, reset, en, clear,
+//     input  logic [WIDTH-1:0] d, 
+//     output logic [WIDTH-1:0] q);

-   always_ff @(posedge clk, posedge reset)
-     if (reset)    q <= #10 0;
-     else if (en) 
-       if (clear) q <= #10 0;
-       else       q <= #10 d;
+//    always_ff @(posedge clk, posedge reset)
+//      if (reset)    q <= #10 0;
+//      else if (en) 
+//        if (clear) q <= #10 0;
+//        else       q <= #10 d;
   
-endmodule // flopenrc
+// endmodule // flopenrc

-module floprc #(parameter WIDTH = 8)
-   (input  logic             clk, reset, clear,
-    input  logic [WIDTH-1:0] d, 
-    output logic [WIDTH-1:0] q);
+// module floprc #(parameter WIDTH = 8)
+//    (input  logic             clk, reset, clear,
+//     input  logic [WIDTH-1:0] d, 
+//     output logic [WIDTH-1:0] q);

-   always_ff @(posedge clk, posedge reset)
-     if (reset) q <= #10 0;
-     else       
-       if (clear) q <= #10 0;
-       else       q <= #10 d;
+//    always_ff @(posedge clk, posedge reset)
+//      if (reset) q <= #10 0;
+//      else       
+//        if (clear) q <= #10 0;
+//        else       q <= #10 d;
   
-endmodule // floprc
+// endmodule // floprc

-module mux2 #(parameter WIDTH = 8)
-   (input  logic [WIDTH-1:0] d0, d1, 
-    input  logic             s, 
-    output logic [WIDTH-1:0] y);
+// module mux2 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, 
+//     input  logic             s, 
+//     output logic [WIDTH-1:0] y);

-   assign y = s ? d1 : d0;
+//    assign y = s ? d1 : d0;
   
-endmodule // mux2
+// endmodule // mux2

-module mux3 #(parameter WIDTH = 8)
-   (input  logic [WIDTH-1:0] d0, d1, d2,
-    input  logic [1:0]       s, 
-    output logic [WIDTH-1:0] y);
+// module mux3 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, d2,
+//     input  logic [1:0]       s, 
+//     output logic [WIDTH-1:0] y);

-   assign y = s[1] ? d2 : (s[0] ? d1 : d0);
+//    assign y = s[1] ? d2 : (s[0] ? d1 : d0);
   
-endmodule // mux3
+// endmodule // mux3

-module mux4 #(parameter WIDTH = 8)
-   (input  logic [WIDTH-1:0] d0, d1, d2, d3,
-    input  logic [1:0]       s, 
-    output logic [WIDTH-1:0] y);
+// module mux4 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, d2, d3,
+//     input  logic [1:0]       s, 
+//     output logic [WIDTH-1:0] y);

-   assign y = s[1] ? (s[0] ? d3 : d2) : (s[0] ? d1 : d0);
+//    assign y = s[1] ? (s[0] ? d3 : d2) : (s[0] ? d1 : d0);

-endmodule // mux4
+// endmodule // mux4

-module mux5 #(parameter WIDTH = 8)
-   (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4,
-    input  logic [2:0]       s,
-    output logic [WIDTH-1:0] y);
+// module mux5 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4,
+//     input  logic [2:0]       s,
+//     output logic [WIDTH-1:0] y);
   
-   always_comb
-     casez (s)
-       3'b000 : y = d0;       
-       3'b001 : y = d1;
-       3'b010 : y = d2;
-       3'b011 : y = d3;
-       3'b1?? : y = d4;
-     endcase // casez (s)
+//    always_comb
+//      casez (s)
+//        3'b000 : y = d0;       
+//        3'b001 : y = d1;
+//        3'b010 : y = d2;
+//        3'b011 : y = d3;
+//        3'b1?? : y = d4;
+//      endcase // casez (s)

-endmodule // mux5
+// endmodule // mux5

-module mux6 #(parameter WIDTH = 8)
-   (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4, d5,
-    input  logic [2:0]       s,
-    output logic [WIDTH-1:0] y);
+// module mux6 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4, d5,
+//     input  logic [2:0]       s,
+//     output logic [WIDTH-1:0] y);
   
-   always_comb
-     casez (s)
-       3'b000 : y = d0;       
-       3'b001 : y = d1;
-       3'b010 : y = d2;
-       3'b011 : y = d3;
-       3'b10? : y = d4;
-       3'b11? : y = d5;       
-     endcase // casez (s)
+//    always_comb
+//      casez (s)
+//        3'b000 : y = d0;       
+//        3'b001 : y = d1;
+//        3'b010 : y = d2;
+//        3'b011 : y = d3;
+//        3'b10? : y = d4;
+//        3'b11? : y = d5;       
+//      endcase // casez (s)

-endmodule // mux6
+// endmodule // mux6

 module eqcmp #(parameter WIDTH = 8)
   (input  logic [WIDTH-1:0] a, b,
@ -232,25 +231,25 @@ module eqcmp #(parameter WIDTH = 8)
   
 endmodule // eqcmp

-module fa (input logic a, b, c, output logic sum, carry);
+// module fa (input logic a, b, c, output logic sum, carry);

-   assign sum = a^b^c;
-   assign carry = a&b|a&c|b&c;   
+//    assign sum = a^b^c;
+//    assign carry = a&b|a&c|b&c;   

-endmodule // fa
+// endmodule // fa

-module csa #(parameter WIDTH=8) 
-   (input logic [WIDTH-1:0] a, b, c,
-    output logic [WIDTH-1:0] sum, carry);
+// module csa #(parameter WIDTH=8) 
+//    (input logic [WIDTH-1:0] a, b, c,
+//     output logic [WIDTH-1:0] sum, carry);

-   logic [WIDTH:0] 	     carry_temp;   
-   genvar 		     i;
-   generate
-      for (i=0;i<WIDTH;i=i+1)
-	begin : genbit
-	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
-	end
-   endgenerate
-   assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     
+//    logic [WIDTH:0] 	     carry_temp;   
+//    genvar 		     i;
+//    generate
+//       for (i=0;i<WIDTH;i=i+1)
+// 	begin : genbit
+// 	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
+// 	end
+//    endgenerate
+//    assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     
   
-endmodule // csa
+// endmodule // csa
--- a/wally-pipelined/src/fpu/faddcvt.sv
+++ b/wally-pipelined/src/fpu/faddcvt.sv
@ -31,8 +31,8 @@ module faddcvt(
   input logic          reset,
   input logic          FlushM,
   input logic          StallM,
-   input logic  [63:0]  SrcXE,		// 1st input operand (A)
-   input logic  [63:0]  SrcYE,		// 2nd input operand (B)
+   input logic  [63:0]  FSrcXE,		// 1st input operand (A)
+   input logic  [63:0]  FSrcYE,		// 2nd input operand (B)
   input logic  [3:0]   FOpCtrlE, FOpCtrlM,	// Function opcode
   input logic          FmtE, FmtM,   		// Result Precision (0 for double, 1 for single)
   input logic  [2:0] 	FrmM,		// Rounding mode - specify values 
@ -59,7 +59,7 @@ module faddcvt(
   logic [10:0] 	AddExponentE, AddExponentM;


-   fpuaddcvt1 fpadd1 (.SrcXE, .SrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
+   fpuaddcvt1 fpadd1 (.FSrcXE, .FSrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
                     .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
                     .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
                     .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
@ -83,10 +83,10 @@ module faddcvt(
                     .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
 endmodule

-module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, SrcXE, SrcYE, FOpCtrlE, FmtE);
+module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, FSrcXE, FSrcYE, FOpCtrlE, FmtE);

-   input logic [63:0] SrcXE;		// 1st input operand (A)
-   input logic [63:0] SrcYE;		// 2nd input operand (B)
+   input logic [63:0] FSrcXE;		// 1st input operand (A)
+   input logic [63:0] FSrcYE;		// 2nd input operand (B)
   input logic [3:0]	FOpCtrlE;	// Function opcode
   input logic 	FmtE;   		// Result Precision (1 for double, 0 for single)

@ -137,12 +137,12 @@ module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE,
   // and the sign of the first operand is set appropriately based on
   // if the operation is absolute value or negation. 

-   convert_inputs conv1 (AddFloat1E, AddFloat2E, SrcXE, SrcYE, FOpCtrlE, P);
+   convert_inputs conv1 (AddFloat1E, AddFloat2E, FSrcXE, FSrcYE, FOpCtrlE, P);

   // Test for exceptions and return the "Invalid Operation" and
   // "Denormalized" Input Flags. The "AddSelInvE" is used in
   // the third pipeline stage to select the result. Also, AddOp1NormE
-   // and AddOp2NormE are one if SrcXE and SrcYE are not zero or denormalized.
+   // and AddOp2NormE are one if FSrcXE and FSrcYE are not zero or denormalized.
   // sub is one if the effective operation is subtraction. 

   exception exc1 (AddSelInvE, AddInvalidE, AddDenormInE, AddOp1NormE, AddOp2NormE, sub, 
@ -215,8 +215,8 @@ module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE,

   // Place either the sign-extended 32-bit value or the original 64-bit value 
   // into IntValue (to be used for integer to floating point conversion)
-   // assign IntValue [31:0] = SrcXE[31:0];
-   // assign IntValue [63:32] = FOpCtrlE[0] ? {32{SrcXE[31]}} : SrcXE[63:32];
+   // assign IntValue [31:0] = FSrcXE[31:0];
+   // assign IntValue [63:32] = FOpCtrlE[0] ? {32{FSrcXE[31]}} : FSrcXE[63:32];

   // If doing an integer to floating point conversion, mantissaA3 is set to 
   // IntVal and the prenormalized exponent is set to 1084. Otherwise, 
--- a/wally-pipelined/src/fpu/fclassify.sv
+++ b/wally-pipelined/src/fpu/fclassify.sv
@ -2,45 +2,52 @@
 `include "wally-config.vh"

 module fclassify (
-    input  logic [63:0] SrcXE,
-    input  logic        FmtE,           // 0-Single 1-Double
+    input  logic XSgnE,
+    input  logic [51:0] XFracE,
+    input logic XNaNE, 
+    input logic XSNaNE,
+    input logic XNormE,
+    input logic XDenormE,
+    input logic XZeroE,
+    input logic XInfE,
+    // input  logic        FmtE,           // 0-Single 1-Double
    output logic [63:0] ClassResE
    );

-    logic Sgn;
-    logic Inf, NaN, Zero, Norm, Denorm;
-    logic PInf, QNaN, PZero, PNorm, PDenorm;
-    logic NInf, SNaN, NZero, NNorm, NDenorm;
-    logic MaxExp, ExpZero, ManZero, FirstBitFrac;
+    // logic XSgnE;
+    // logic Inf, NaN, Zero, Norm, Denorm;
+    logic PInf, PZero, PNorm, PDenorm;
+    logic NInf, NZero, NNorm, NDenorm;
+    // logic MaxExp, ExpZero, ManZero, FirstBitFrac;
   
    // Single and Double precision layouts
-    assign Sgn = FmtE ? SrcXE[63] : SrcXE[31];
+    // assign XSgnE = FmtE ? FSrcXE[63] : FSrcXE[31];

    // basic calculations for readability
    
-    assign ExpZero = FmtE ? ~|SrcXE[62:52] : ~|SrcXE[30:23];
-    assign MaxExp = FmtE ? &SrcXE[62:52] : &SrcXE[30:23];
-    assign ManZero = FmtE ? ~|SrcXE[51:0] : ~|SrcXE[22:0];
-    assign FirstBitFrac = FmtE ? SrcXE[51] : SrcXE[22];
+    // assign ExpZero = FmtE ? ~|FSrcXE[62:52] : ~|FSrcXE[30:23];
+    // assign MaxExp = FmtE ? &FSrcXE[62:52] : &FSrcXE[30:23];
+    // assign ManZero = FmtE ? ~|FSrcXE[51:0] : ~|FSrcXE[22:0];
+    // assign FirstBitFrac = FmtE ? FSrcXE[51] : FSrcXE[22];

    // determine the type of number
-    assign NaN      = MaxExp & ~ManZero;
-    assign Inf = MaxExp & ManZero;
-    assign Zero     = ExpZero & ManZero;
-    assign Denorm= ExpZero & ~ManZero;
-    assign Norm   = ~ExpZero;
+    // assign NaN      = MaxExp & ~ManZero;
+    // assign Inf = MaxExp & ManZero;
+    // assign Zero     = ExpZero & ManZero;
+    // assign Denorm= ExpZero & ~ManZero;
+    // assign Norm   = ~ExpZero;

    // determine the sub categories
-    assign QNaN = FirstBitFrac&NaN;
-    assign SNaN = ~FirstBitFrac&NaN;
-    assign PInf = ~Sgn&Inf;
-    assign NInf = Sgn&Inf;
-    assign PNorm = ~Sgn&Norm;
-    assign NNorm = Sgn&Norm;
-    assign PDenorm = ~Sgn&Denorm;
-    assign NDenorm = Sgn&Denorm;
-    assign PZero = ~Sgn&Zero;
-    assign NZero = Sgn&Zero;
+    // assign QNaN = FirstBitFrac&NaN;
+    // assign SNaN = ~FirstBitFrac&NaN;
+    assign PInf = ~XSgnE&XInfE;
+    assign NInf = XSgnE&XInfE;
+    assign PNorm = ~XSgnE&XNormE;
+    assign NNorm = XSgnE&XNormE;
+    assign PDenorm = ~XSgnE&XDenormE;
+    assign NDenorm = XSgnE&XDenormE;
+    assign PZero = ~XSgnE&XZeroE;
+    assign NZero = XSgnE&XZeroE;

    // determine sub category and combine into the result
    //  bit 0 - -Inf
@ -53,6 +60,6 @@ module fclassify (
    //  bit 7 - +Inf
    //  bit 8 - signaling NaN
    //  bit 9 - quiet NaN
-    assign ClassResE = {{54{1'b0}}, QNaN, SNaN, PInf, PNorm,  PDenorm, PZero, NZero, NDenorm, NNorm, NInf};
+    assign ClassResE = {{54{1'b0}}, XNaNE&~XSNaNE, XSNaNE, PInf, PNorm,  PDenorm, PZero, NZero, NDenorm, NNorm, NInf};

 endmodule
--- a/wally-pipelined/src/fpu/fcmp.sv
+++ b/wally-pipelined/src/fpu/fcmp.sv
@ -42,31 +42,32 @@
 module fcmp (   
   input logic [63:0] op1, 
   input logic [63:0] op2,
+   input logic XNaNE, YNaNE,
+   input logic XZeroE, YZeroE,
+   input logic [63:0] FSrcXE,
+   input logic [63:0] FSrcYE,
   input logic [2:0]  FOpCtrlE,
   input logic 	      FmtE,

   
   output logic       Invalid, 		 // Invalid Operation
-   // output logic [1:0] FCC,  		 // Condition Codes 
   output logic [63:0] CmpResE);
+
   // Perform magnitude comparison between the 63 least significant bits
   // of the input operands. Only LT and EQ are returned, since GT can
   // be determined from these values. 
   logic [1:0] FCC;  		 // Condition Codes 
   logic [7:0]	      w, x;
-   logic	      ANaN, BNaN;
-   logic	      Azero, Bzero;
+   // logic	      ANaN, BNaN;
+   // logic	      Azero, Bzero;
   logic 	      LT;                // magnitude op1 < magnitude op2
   logic 	      EQ;                // magnitude op1 = magnitude op2
-   logic [63:0]   PosOp1, PosOp2;
-   
-   assign PosOp1 = FmtE ? {~op1[63], op1[62:0]} : {~op1[31], op1[30:0], 32'b0};
-   assign PosOp2 = FmtE ? {~op2[63], op2[62:0]} : {~op2[31], op2[30:0], 32'b0};
-   magcompare64b_1 magcomp1 (w, x, PosOp1, PosOp2);
+
+
+   magcompare64b_1 magcomp1 (w, x, {~op1[63], op1[62:0]}, {~op2[63], op2[62:0]});

   // Determine final values based on output of magnitude comparison, 
   // sign bits, and special case testing. 
-   exception_cmp_1 exc1 (ANaN, BNaN, Azero, Bzero, op1, op2, FOpCtrlE);
   
   // Perform magnitude comparison between the 63 least significant bits
   // of the input operands. Only LT and EQ are returned, since GT can
@ -75,24 +76,10 @@ module fcmp (

   // Determine final values based on output of magnitude comparison, 
   // sign bits, and special case testing. 
-   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .FOpCtrlE(FOpCtrlE), .A(op1), .B(op2), .*);
+   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(XNaNE), .BNaN(YNaNE), .Azero(XZeroE), .Bzero(YZeroE), .FOpCtrlE(FOpCtrlE), .A(op1), .B(op2), .FSrcXE, .FSrcYE, .*);

 endmodule // fpcomp

-// module magcompare2b (LT, GT, A, B);
-
-//    input logic [1:0] A;
-//    input logic [1:0] B;
-   
-//    output logic     LT;
-//    output logic     GT;
-
-//    // Determine if A < B  using a minimized sum-of-products expression
-//    assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-//    // Determine if A > B  using a minimized sum-of-products expression
-//    assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-// endmodule // magcompare2b

 // 2-bit magnitude comparator
 // This module compares two 2-bit values A and B. LT is '1' if A < B 
@ -198,135 +185,6 @@ module magcompare64b_1 (w, x,  A, B);

 endmodule // magcompare64b

-// This module takes 64-bits inputs A and B, two magnitude comparison
-// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
-// operands being compared as indicated below.
-//	FOpCtrlE	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	bfloat precision numbers
-//
-// The comparator produces a 2-bit signal fcc, which
-// indicates the result of the comparison as follows:
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-// It also produces a invalid operation flag, which is one
-// if either of the input operands is a signaling NaN.
-
-module exception_cmp_1 (ANaN, BNaN, Azero, Bzero, A, B, FOpCtrlE);
-
-   input logic [63:0] A;
-   input logic [63:0] B;
-   input logic [2:0]  FOpCtrlE;
-
-   logic 		      dp, sp, hp;
-
-   output logic 	      ANaN;
-   output logic 	      BNaN;
-   output logic               Azero;
-   output logic               Bzero;
-
-   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
-   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
-   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
-
-   // Test if A or B is NaN.
-   assign ANaN = (A[62]&A[61]&A[60]&A[59]&A[58]) & 
-		 ((sp&A[57]&A[56]&A[55]&(A[54]|A[53])) | 
-		 (dp&A[57]&A[56]&A[55]&A[54]&A[53]&A[52]&(A[51]|A[50])) |
-		 (hp&(A[57]|A[56])));
-
-   assign BNaN = (B[62]&B[61]&B[60]&B[59]&B[58]) & 
-		 ((sp&B[57]&B[56]&B[55]&(B[54]|B[53])) | 
-		 (dp&B[57]&B[56]&B[55]&B[54]&B[53]&B[52]&(B[51]|B[50])) |
-		 (hp&(B[57]|B[56])));
-
-   // Test if A is +0 or -0 when viewed as a floating point number (i.e,
-   // the 63 least siginficant bits of A are zero). 
-   // Depending on how this synthesizes, it may work better to replace
-   // this with assign Azero = ~(A[62] | A[61] | ... | A[0])
-   assign Azero = (A[62:0] == 63'h0);
-   assign Bzero = (B[62:0] == 63'h0);
-
-endmodule // exception_cmp
-//
-// File name : fpcomp.v
-// Title     : Floating-Point Comparator
-// project   : FPU
-// Library   : fpcomp
-// Author(s) : James E. Stine
-// Purpose   : definition of main unit to floating-point comparator
-// notes :   
-//
-// Copyright Oklahoma State University
-//
-// Floating Point Comparator (Algorithm)
-//
-// 1.) Performs sign-extension if the inputs are 32-bit integers.
-// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
-// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
-//     and correct for sign bits
-//
-// This module takes 64-bits inputs op1 and op2, VSS, and VDD
-// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
-// operands being compared as indicated below.
-//	FOpCtrlE	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	(unused)
-//
-// The comparator produces a 2-bit signal FCC, which
-// indicates the result of the comparison:
-//
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-//
-// It also produces an invalid operation flag, which is one
-// if either of the input operands is a signaling NaN per 754
-
-
-/*module magcompare2b (LT, GT, A, B);
-
-   input logic [1:0] A;
-   input logic [1:0] B;
-   
-   output logic     LT;
-   output logic     GT;
-
-   // Determine if A < B  using a minimized sum-of-products expression
-   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-   // Determine if A > B  using a minimized sum-of-products expression
-   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-endmodule*/ // magcompare2b
-
-// 2-bit magnitude comparator
-// This module compares two 2-bit values A and B. LT is '1' if A < B 
-// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
-// this version actually incorporates don't cares into the equation to
-// simplify the optimization
-
-// module magcompare2c (LT, GT, A, B);
-
-//    input logic [1:0] A;
-//    input logic [1:0] B;
-   
-//    output logic      LT;
-//    output logic      GT;
-
-//    assign LT = B[1] | (!A[1]&B[0]);
-//    assign GT = A[1] | (!B[1]&A[0]);
-
-// endmodule // magcompare2b
-
 // This module compares two 64-bit values A and B. LT is '1' if A < B 
 // and EQ is '1'if A = B. LT and GT are both '0' if A > B.
 // This structure was modified so
@ -388,6 +246,8 @@ endmodule // magcompare64b
 module exception_cmp_2 (
   input logic [63:0] A,
   input logic [63:0] B,
+   input logic [63:0] FSrcXE,
+   input logic [63:0] FSrcYE,
   input logic 	      FmtE,
   input logic 	      LT_mag,
   input logic 	      EQ_mag,
@ -456,8 +316,8 @@ module exception_cmp_2 (

   always_comb begin
      case (FOpCtrlE[2:0])
-         3'b111: CmpResE = LT ? A : B;//min 
-         3'b101: CmpResE = GT ? A : B;//max
+         3'b111: CmpResE = LT ? FSrcXE : FSrcYE;//min 
+         3'b101: CmpResE = GT ? FSrcXE : FSrcYE;//max
         3'b010: CmpResE = {63'b0, EQ};//equal
         3'b001: CmpResE = {63'b0, LT};//less than
         3'b011: CmpResE = {63'b0, LT|EQ};//less than or equal
--- a/wally-pipelined/src/fpu/fcvt.sv
+++ b/wally-pipelined/src/fpu/fcvt.sv
@ -1,7 +1,15 @@

 `include "wally-config.vh"
 module fcvt (
-    input logic [63:0] X,           // floating point input
+	input logic        XSgnE,
+    input logic [10:0] XExpE,
+    input logic [51:0] XFracE,
+    input logic XAssumed1E,
+    input logic XZeroE,
+    input logic XNaNE,
+    input logic XInfE,
+    input logic XDenormE,
+    input logic [10:0] BiasE,
    input logic [`XLEN-1:0] SrcAE,  // integer input
    input logic [3:0] FOpCtrlE,     // chooses which instruction is done (full list below)
    input logic [2:0] FrmE,         // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
@ -9,15 +17,10 @@ module fcvt (
    output logic [63:0] CvtResE,    // convert final result
    output logic [4:0] CvtFlgE);     // convert flags {invalid, divide by zero, overflow, underflow, inexact}

-    logic               XSgn;   // FP input's sign
-    logic [10:0]        XExp;   // FP input's exponent
-    logic [51:0]        XFrac;  // FP input's fraction
    logic               ResSgn; // FP result's sign
    logic [10:0]        ResExp,TmpExp; // FP result's exponent
    logic [51:0]        ResFrac;    // FP result's fraction
    logic [5:0]         LZResP;     // lz output
-    // logic              LZResV;
-    logic [11:0]        Bias;       // 1023 for double, 127 for single
    logic [7:0]         Bits;       // how many bits are in the integer result
    logic [7:0]         SubBits;    // subtract these bits from the exponent (FP result)
    logic [64+51:0]  ShiftedManTmp; // Shifted mantissa
@ -31,11 +34,7 @@ module fcvt (
    logic [64-1:0]   PosInt;         // absolute value of the integer input
    logic [63:0]        CvtIntRes;      // interger result from the fp -> int instructions
    logic [63:0]        CvtFPRes;       // floating point result from the int -> fp instructions
-    logic               XFracZero;      // is the fraction of X zero?
    logic               Of, Uf;         // did the integer result underflow or overflow
-    logic               XExpZero;       // is X's exponent zero
-    logic               XExpMax;        // is the exponent all ones
-    logic               XNaN, XDenorm, XInf, XZero; // is X a special value
    logic               Guard, Round, LSB, Sticky;  // bits used to determine rounding
    logic               Plus1,CalcPlus1;    // do you add one for rounding
    logic               SgnRes;             // sign of the floating point result
@ -62,31 +61,15 @@ module fcvt (
      //  fcvt.d.lu = 1101
      //  {long, unsigned, to int, from int}
   
-    // split the input into it's various parts
-    assign XSgn = FmtE ? X[63] : X[31];
-    assign XExp = FmtE ? X[62:52] : {3'b0, X[30:23]};
-    assign XFrac = FmtE ? X[51:0] : {X[23:0], 29'b0};
-
-    // determine if the exponent and fraction are all zero or ones
-    assign XExpZero = ~|XExp;
-    assign XFracZero = ~|XFrac;
-    assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
-
-    // determine if X is a special value
-    assign XNaN = XExpMax & ~XFracZero;
-    assign XDenorm = XExpZero & ~XFracZero;
-    assign XInf = XExpMax & XFracZero;
-    assign XZero = XExpZero & XFracZero;
-
    // calculate signals based off the input and output's size
-    assign Bias = FmtE ? 12'h3ff : 12'h7f;
+    // assign Bias = FmtE ? 12'h3ff : 12'h7f;
    assign Res64 = ((FOpCtrlE==4'b1010 || FOpCtrlE==4'b1110) | (FmtE&(FOpCtrlE==4'b0001 | FOpCtrlE==4'b0101 | FOpCtrlE==4'b0000 | FOpCtrlE==4'b1001 | FOpCtrlE==4'b1101)));
    assign In64 = ((FOpCtrlE==4'b1001 || FOpCtrlE==4'b1101) | (FmtE&(FOpCtrlE==4'b0010 | FOpCtrlE==4'b0110 | FOpCtrlE==4'b1010 | FOpCtrlE==4'b1110) | (FOpCtrlE==4'b1101 & ~FmtE)));
    assign SubBits = In64 ? 8'd64 : 8'd32;
    assign Bits = Res64 ? 8'd64 : 8'd32;

    // calculate the unbiased exponent
-    assign ExpVal = XExp - Bias + XDenorm;
+    assign ExpVal = XExpE - BiasE + XDenormE;

 ////////////////////////////////////////////////////////

@ -97,11 +80,10 @@ module fcvt (
    // determine the integer's sign
    assign ResSgn = ~FOpCtrlE[2] ? IntIn[64-1] : 1'b0;
    
-    // This did not work \/
    // generate
-    //     if(64 == 64) 
+    //     if(`XLEN == 64) 
    //         lz64 lz(LZResP, LZResV, PosInt);
-    //     else if(64 == 32) begin
+    //     else if(`XLEN == 32) begin
    //         assign LZResP[5] = 1'b0;
    //         lz32 lz(LZResP[4:0], LZResV, PosInt);
    //     end 
@ -111,12 +93,12 @@ module fcvt (
 	logic [8:0]	i;
 	always_comb begin
 			i = 0;
-			while (~PosInt[64-1-i] && i <= `XLEN) i = i+1;  // search for leading one 
+			while (~PosInt[64-1-i] && i < `XLEN) i = i+1;  // search for leading one 
 			LZResP = i+1;    // compute shift count
 	end

    // if no one was found set to zero otherwise calculate the exponent
-    assign TmpExp = i==`XLEN ? 0 : Bias + SubBits - LZResP;
+    assign TmpExp = i==`XLEN ? 0 : BiasE + SubBits - LZResP;



@ -126,12 +108,12 @@ module fcvt (

    // select the shift value and amount based on operation (to fp or int)
    assign ShiftCnt = FOpCtrlE[1] ? ExpVal : LZResP;
-    assign ShiftVal = FOpCtrlE[1] ? {{64-2{1'b0}}, ~(XDenorm|XZero), XFrac} : {PosInt, 52'b0};
+    assign ShiftVal = FOpCtrlE[1] ? {{64-2{1'b0}}, XAssumed1E, XFracE} : {PosInt, 52'b0};

 	// if shift = -1 then shift one bit right for guard bit (right shifting twice never rounds)
 	// if the shift is negative add a bit for sticky bit calculation
 	// otherwise shift left
-    assign ShiftedManTmp = &ShiftCnt ? {{64-1{1'b0}}, ~(XDenorm|XZero), XFrac[51:1]} : ShiftCnt[12] ? {{64+51{1'b0}}, ~XZero} : ShiftVal << ShiftCnt;
+    assign ShiftedManTmp = &ShiftCnt ? {{64-1{1'b0}}, XAssumed1E, XFracE[51:1]} : ShiftCnt[12] ? {{64+51{1'b0}}, ~XZeroE} : ShiftVal << ShiftCnt;

    // truncate the shifted mantissa
    assign ShiftedMan = ShiftedManTmp[64+51:50];
@ -139,7 +121,7 @@ module fcvt (
    // calculate sticky bit 
    //  - take into account the possible right shift from before
    //  - the sticky bit calculation covers three different sizes depending on the operation
-    assign Sticky = |ShiftedManTmp[49:0] | &ShiftCnt&XFrac[0] | (FOpCtrlE[0]&|ShiftedManTmp[62:50]) | (FOpCtrlE[0]&~FmtE&|ShiftedManTmp[91:63]);
+    assign Sticky = |ShiftedManTmp[49:0] | &ShiftCnt&XFracE[0] | (FOpCtrlE[0]&|ShiftedManTmp[62:50]) | (FOpCtrlE[0]&~FmtE&|ShiftedManTmp[91:63]);

    
    // determine guard, round, and least significant bit of the result
@ -152,23 +134,23 @@ module fcvt (
        case (FrmE)
            3'b000: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky&LSB));//round to nearest even
            3'b001: CalcPlus1 = 0;//round to zero
-            3'b010: CalcPlus1 = (XSgn&FOpCtrlE[1]) | (ResSgn&FOpCtrlE[0]);//round down
-            3'b011: CalcPlus1 = (~XSgn&FOpCtrlE[1]) | (~ResSgn&FOpCtrlE[0]);//round up
+            3'b010: CalcPlus1 = (XSgnE&FOpCtrlE[1]) | (ResSgn&FOpCtrlE[0]);//round down
+            3'b011: CalcPlus1 = (~XSgnE&FOpCtrlE[1]) | (~ResSgn&FOpCtrlE[0]);//round up
            3'b100: CalcPlus1 = Guard & (Round | Sticky | (~Round&~Sticky));//round to nearest max magnitude
            default: CalcPlus1 = 1'bx;
        endcase
    end

    // don't round if the result is exact
-    assign Plus1 = CalcPlus1 & (Guard|Round|Sticky)&~(XZero&FOpCtrlE[1]);
+    assign Plus1 = CalcPlus1 & (Guard|Round|Sticky)&~(XZeroE&FOpCtrlE[1]);

    // round the shifted mantissa
    assign RoundedTmp = ShiftedMan[64+1:2] + Plus1;
    assign {ResExp, ResFrac} = FmtE ? {TmpExp, ShiftedMan[64+1:14]} + Plus1 :  {{TmpExp, ShiftedMan[64+1:43]} + Plus1, 29'b0} ;

    // fit the rounded result into the appropriate size and take the 2's complement if needed
-     assign Rounded = Res64 ? XSgn&FOpCtrlE[1] ? -RoundedTmp[63:0] : RoundedTmp[63:0] : 
-			      XSgn ? {{32{1'b1}}, -RoundedTmp[31:0]} : {32'b0, RoundedTmp[31:0]};
+     assign Rounded = Res64 ? XSgnE&FOpCtrlE[1] ? -RoundedTmp[63:0] : RoundedTmp[63:0] : 
+			      XSgnE ? {{32{1'b1}}, -RoundedTmp[31:0]} : {32'b0, RoundedTmp[31:0]};

    // extract the MSB and Sign for later use (will be used to determine underflow and overflow)
     assign RoundMSB = Res64 ? RoundedTmp[64] : RoundedTmp[32];
@ -176,10 +158,10 @@ module fcvt (


    // check if the result overflows
-    assign Of = (~XSgn&($signed(ShiftCnt) >= $signed(Bits))) | (~XSgn&RoundSgn&~FOpCtrlE[2]) | (RoundMSB&(ShiftCnt==(Bits-1))) | (~XSgn&XInf) | XNaN;
+    assign Of = (~XSgnE&($signed(ShiftCnt) >= $signed(Bits))) | (~XSgnE&RoundSgn&~FOpCtrlE[2]) | (RoundMSB&(ShiftCnt==(Bits-1))) | (~XSgnE&XInfE) | XNaNE;

    // check if the result underflows (this calculation changes if the result is signed or unsigned)
-    assign Uf = FOpCtrlE[2] ? XSgn&~XZero | (XSgn&XInf) | (XSgn&~XZero&(~ShiftCnt[12]|CalcPlus1)) | (ShiftCnt[12]&Plus1) : (XSgn&XInf) | (XSgn&($signed(ShiftCnt) >= $signed(Bits))) | (XSgn&~RoundSgn&~ShiftCnt[12]);    // assign CvtIntRes =  (XSgn | ShiftCnt[12]) ? {64{1'b0}}  : (ShiftCnt >= 64) ? {64{1'b1}} : Rounded;
+    assign Uf = FOpCtrlE[2] ? XSgnE&~XZeroE | (XSgnE&XInfE) | (XSgnE&~XZeroE&(~ShiftCnt[12]|CalcPlus1)) | (ShiftCnt[12]&Plus1) : (XSgnE&XInfE) | (XSgnE&($signed(ShiftCnt) >= $signed(Bits))) | (XSgnE&~RoundSgn&~ShiftCnt[12]);    // assign CvtIntRes =  (XSgnE | ShiftCnt[12]) ? {64{1'b0}}  : (ShiftCnt >= 64) ? {64{1'b1}} : Rounded;
    
    // calculate the result's sign
    assign SgnRes = ~FOpCtrlE[3] & FOpCtrlE[1];
--- a/wally-pipelined/src/fpu/fhazard.sv
+++ b/wally-pipelined/src/fpu/fhazard.sv
@ -31,36 +31,36 @@ module fhazard(
 	  input logic [4:0] RdM, RdW,
    input logic [2:0] FResultSelM,
    output logic FStallD,
-    output logic [1:0] ForwardXE, ForwardYE, ForwardZE
+    output logic [1:0] FForwardXE, FForwardYE, FForwardZE
 );


  always_comb begin
    // set ReadData as default
-    ForwardXE = 2'b00; // choose FRD1E
-    ForwardYE = 2'b00; // choose FRD2E
-    ForwardZE = 2'b00; // choose FRD3E
+    FForwardXE = 2'b00; // choose FRD1E
+    FForwardYE = 2'b00; // choose FRD2E
+    FForwardZE = 2'b00; // choose FRD3E
    FStallD = 0;

      if ((Adr1E == RdM) & FRegWriteM)
      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardXE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) FForwardXE = 2'b10; // choose FResM
        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr1E == RdW) & FRegWriteW) ForwardXE = 2'b01; // choose FPUResult64W
+      else if ((Adr1E == RdW) & FRegWriteW) FForwardXE = 2'b01; // choose FPUResult64W
    

      if ((Adr2E == RdM) & FRegWriteM)
      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardYE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) FForwardYE = 2'b10; // choose FResM
        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr2E == RdW) & FRegWriteW) ForwardYE = 2'b01; // choose FPUResult64W
+      else if ((Adr2E == RdW) & FRegWriteW) FForwardYE = 2'b01; // choose FPUResult64W

 
      if ((Adr3E == RdM) & FRegWriteM)
      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardZE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) FForwardZE = 2'b10; // choose FResM
        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr3E == RdW) & FRegWriteW) ForwardZE = 2'b01; // choose FPUResult64W
+      else if ((Adr3E == RdW) & FRegWriteW) FForwardZE = 2'b01; // choose FPUResult64W

  end 

--- a/wally-pipelined/src/fpu/fma.sv
+++ b/wally-pipelined/src/fpu/fma.sv
@ -3,12 +3,23 @@ module fma(
    input logic             reset,
    input logic             FlushM,
    input logic             StallM,
-    input logic  [63:0]     SrcXE, SrcXM,  // X
-    input logic  [63:0]     SrcYE, SrcYM,  // Y
-    input logic  [63:0]     SrcZE, SrcZM,  // Z
    input logic             FmtE, FmtM,       // precision 1 = double 0 = single
    input logic  [2:0]      FOpCtrlM, FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
    input logic  [2:0]      FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic        XSgnE, YSgnE, ZSgnE,
+    input logic [10:0] XExpE, YExpE, ZExpE,
+    input logic [51:0] XFracE, YFracE, ZFracE,
+    input logic        XSgnM, YSgnM, ZSgnM,
+    input logic [10:0] XExpM, YExpM, ZExpM,
+    input logic [51:0] XFracM, YFracM, ZFracM,
+    input logic        XAssumed1E, YAssumed1E, ZAssumed1E,
+    input logic XDenormE, YDenormE, ZDenormE,
+    input logic XZeroE, YZeroE, ZZeroE,
+    input logic XNaNM, YNaNM, ZNaNM,
+    input logic XSNaNM, YSNaNM, ZSNaNM,
+    input logic XZeroM, YZeroM, ZZeroM,
+    input logic XInfM, YInfM, ZInfM,
+    input logic [10:0] BiasE,
 	output logic [63:0]		FMAResM,
 	output logic [4:0]		FMAFlgM);
 	
@ -18,24 +29,23 @@ module fma(
    logic [12:0]	ProdExpE, ProdExpM;
    logic 			AddendStickyE, AddendStickyM;
    logic 			KillProdE, KillProdM;
-    logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
-    logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
-    logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
    
-    fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE, .FmtE, .ProdManE, .AlignedAddendE,
-                .ProdExpE, .AddendStickyE, .KillProdE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
-                .XNaNE, .YNaNE, .ZNaNE ); 
+    fma1 fma1 (.XExpE, .YExpE, .ZExpE, .XFracE, .YFracE, .ZFracE, 
+                .BiasE, .XAssumed1E, .YAssumed1E, .ZAssumed1E, .XDenormE, .YDenormE, .ZDenormE,  .XZeroE, .YZeroE, .ZZeroE,
+                .FOpCtrlE, .FmtE, .ProdManE, .AlignedAddendE,
+                .ProdExpE, .AddendStickyE, .KillProdE); 
                
    flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
    flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
    flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
-    flopenrc #(11) EMRegFma4(clk, reset, FlushM, ~StallM, 
-                            {AddendStickyE, KillProdE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE},
-                            {AddendStickyM, KillProdM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM});
+    flopenrc #(2) EMRegFma4(clk, reset, FlushM, ~StallM, 
+                            {AddendStickyE, KillProdE},
+                            {AddendStickyM, KillProdM});

-    fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM, .FrmM, .FmtM, 
+    fma2 fma2(.XSgnM, .YSgnM, .ZSgnM, .XExpM, .YExpM, .ZExpM, .XFracM, .YFracM, .ZFracM, 
+            .FOpCtrlM, .FrmM, .FmtM, 
            .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
-            .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
+            .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, .XSNaNM, .YSNaNM, .ZSNaNM,
            .FMAResM, .FMAFlgM);

 endmodule
@ -43,98 +53,27 @@ endmodule


 module fma1(
- 
-    input logic     [63:0]      X,  // X
-    input logic     [63:0]      Y,  // Y
-    input logic     [63:0]      Z,  // Z
+    // input logic        XSgnE, YSgnE, ZSgnE,
+    input logic [10:0] XExpE, YExpE, ZExpE,
+    input logic [51:0] XFracE, YFracE, ZFracE,
+    input logic        XAssumed1E, YAssumed1E, ZAssumed1E,
+    input logic        XDenormE, YDenormE, ZDenormE,
+    input logic XZeroE, YZeroE, ZZeroE,
+    input logic [10:0] BiasE,
    input logic     [2:0]       FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
    input logic                 FmtE,       // precision 1 = double 0 = single
    output logic    [105:0]     ProdManE,   // 1.X frac * 1.Y frac
    output logic    [161:0]     AlignedAddendE, // Z aligned for addition
    output logic    [12:0]      ProdExpE,       // X exponent + Y exponent - bias
    output logic                AddendStickyE,  // sticky bit that is calculated during alignment
-    output logic                KillProdE,      // set the product to zero before addition if the product is too small to matter
-    output logic                XZeroE, YZeroE, ZZeroE, // inputs are zero
-    output logic                XInfE, YInfE, ZInfE,    // inputs are infinity
-    output logic                XNaNE, YNaNE, ZNaNE);   // inputs are NaN
+    output logic                KillProdE      // set the product to zero before addition if the product is too small to matter
+    );

-    logic [51:0]    XFrac,YFrac,ZFrac;  // input fraction
-    logic [52:0]    XMan,YMan,ZMan;     // input mantissa (with leading one)
-    logic [12:0]    XExp,YExp,ZExp;     // input exponents
-    logic           XSgn,YSgn,ZSgn;     // input signs
    logic [12:0]    AlignCnt;           // how far to shift the addend to align with the product
    logic [213:0]   ZManShifted;                // output of the alignment shifter including sticky bit
    logic [213:0]   ZManPreShifted;     // input to the alignment shifter
-    logic           XDenorm, YDenorm, ZDenorm;  // inputs are denormal
-    logic [63:0]    Addend; // value to add (Z or zero)
-    logic [12:0]    Bias;   // 1023 for double, 127 for single
-    logic           XExpZero, YExpZero, ZExpZero;   // input exponent zero
-    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
-    logic           XExpMax, YExpMax, ZExpMax;  // input exponent all 1s
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // split inputs into the sign bit, fraction, and exponent to handle single or double precision
-    //      - single precision is in the top half of the inputs
-    ///////////////////////////////////////////////////////////////////////////////
-
-    // Set addend to zero if FMUL instruction
-    assign Addend = FOpCtrlE[2] ? 64'b0 : Z;
-
-    assign XSgn = X[63];
-    assign YSgn = Y[63];
-    assign ZSgn = Addend[63];
-
-    assign XExp = FmtE ? {2'b0, X[62:52]} : {5'b0, X[62:55]};
-    assign YExp = FmtE ? {2'b0, Y[62:52]} : {5'b0, Y[62:55]};
-    assign ZExp = FmtE ? {2'b0, Addend[62:52]} : {5'b0, Addend[62:55]};
-
-    assign XFrac = FmtE ? X[51:0] : {X[54:32], 29'b0};
-    assign YFrac = FmtE ? Y[51:0] : {Y[54:32], 29'b0};
-    assign ZFrac = FmtE ? Addend[51:0] : {Addend[54:32], 29'b0};
-   
-    assign XMan = {~XExpZero, XFrac};
-    assign YMan = {~YExpZero, YFrac};
-    assign ZMan = {~ZExpZero, ZFrac};
-
-    assign Bias = FmtE ? 13'h3ff : 13'h7f;
-
-
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // determine if an input is a special value
-    ///////////////////////////////////////////////////////////////////////////////
-
-    assign XExpZero = ~|XExp;
-    assign YExpZero = ~|YExp;
-    assign ZExpZero = ~|ZExp;
-   
-    assign XFracZero = ~|XFrac;
-    assign YFracZero = ~|YFrac;
-    assign ZFracZero = ~|ZFrac;
-
-    assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
-    assign YExpMax = FmtE ? &YExp[10:0] : &YExp[7:0];
-    assign ZExpMax = FmtE ? &ZExp[10:0] : &ZExp[7:0];
-   
-    assign XNaNE = XExpMax & ~XFracZero;
-    assign YNaNE = YExpMax & ~YFracZero;
-    assign ZNaNE = ZExpMax & ~ZFracZero;
-
-    assign XDenorm = XExpZero & ~XFracZero;
-    assign YDenorm = YExpZero & ~YFracZero;
-    assign ZDenorm = ZExpZero & ~ZFracZero;
-
-    assign XInfE = XExpMax & XFracZero;
-    assign YInfE = YExpMax & YFracZero;
-    assign ZInfE = ZExpMax & ZFracZero;
-
-    assign XZeroE = XExpZero & XFracZero;
-    assign YZeroE = YExpZero & YFracZero;
-    assign ZZeroE = ZExpZero & ZFracZero;
-
-
-
-
+    
+    
    ///////////////////////////////////////////////////////////////////////////////
    // Calculate the product
    //      - When multiplying two fp numbers, add the exponents
@ -145,11 +84,11 @@ module fma1(
   
    // verilator lint_off WIDTH
    assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 :
-                 XExp + YExp - Bias + XDenorm + YDenorm;
+                 XExpE + YExpE - BiasE + XDenormE + YDenormE;

    // Calculate the product's mantissa
    //      - Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
-    assign ProdManE =  XMan * YMan;
+    assign ProdManE =  {XAssumed1E, XFracE} * {YAssumed1E, YFracE};



@ -168,7 +107,7 @@ module fma1(
    //      - positive means the product is larger, so shift Z right
    //      - Denormal numbers have an exponent value of 1, however they are
    //        represented with an exponent of 0. Add one to the exponent if it is a denormal number
-    assign AlignCnt = ProdExpE - ZExp - ZDenorm;
+    assign AlignCnt = ProdExpE - ZExpE - ZDenormE;
    // verilator lint_on WIDTH


@ -177,7 +116,7 @@ module fma1(
    //                       |1'b0| addnend |

    // the 1'b0 before the addend is there because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
-    assign ZManPreShifted = {55'b0, ZMan, 106'b0};
+    assign ZManPreShifted = {55'b0, {ZAssumed1E, ZFracE}, 106'b0};
    always_comb
        begin
           
@ -187,7 +126,7 @@ module fma1(
        //  | addnend |
        if ($signed(AlignCnt) <= $signed(-13'd56)) begin
            KillProdE = 1;
-            ZManShifted = ZManPreShifted;//{107'b0, ZMan, 54'b0};
+            ZManShifted = ZManPreShifted;//{107'b0, {~ZAssumed1E, ZFrac}, 54'b0};
            AddendStickyE = ~(XZeroE|YZeroE);

        // If the Addend is shifted left (negative AlignCnt)
@ -229,10 +168,10 @@ endmodule


 module fma2(
- 
-    input logic     [63:0]      X,  // X
-    input logic     [63:0]      Y,  // Y
-    input logic     [63:0]      Z,  // Z
+    
+    input logic        XSgnM, YSgnM, ZSgnM,
+    input logic [10:0] XExpM, YExpM, ZExpM,
+    input logic [51:0] XFracM, YFracM, ZFracM,
    input logic     [2:0]       FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
    input logic     [2:0]       FOpCtrlM,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
    input logic                 FmtM,       // precision 1 = double 0 = single
@ -244,6 +183,7 @@ module fma2(
    input logic                 XZeroM, YZeroM, ZZeroM, // inputs are zero
    input logic                 XInfM, YInfM, ZInfM,    // inputs are infinity
    input logic                 XNaNM, YNaNM, ZNaNM,    // inputs are NaN
+    input logic                 XSNaNM, YSNaNM, ZSNaNM,    // inputs are signaling NaNs
    output logic    [63:0]      FMAResM,     // FMA final result
    output logic    [4:0]       FMAFlgM);     // FMA flags {invalid, divide by zero, overflow, underflow, inexact}
   
@ -252,8 +192,6 @@ module fma2(
    logic [51:0]    ResultFrac; // Result fraction
    logic [10:0]    ResultExp;  // Result exponent
    logic           ResultSgn;  // Result sign
-    logic [10:0]    ZExp;   // input exponent
-    logic           XSgn, YSgn, ZSgn;   // input sign
    logic           PSgn;       // product sign
    logic [105:0]   ProdMan2;   // product being added
    logic [162:0]   AlignedAddend2; // possibly inverted aligned Z
@ -289,28 +227,10 @@ module fma2(
    logic [63:0] XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results

   
-    ///////////////////////////////////////////////////////////////////////////////
-    // Select input fields
-    // The following logic duplicates fma1 because it's cheaper to recompute than provide registers
-    ///////////////////////////////////////////////////////////////////////////////
-
-    // Set addend to zero if FMUL instruction
-    assign Addend = FOpCtrlM[2] ? 64'b0 : Z;
-
-    // split inputs into the sign bit, and exponent to handle single or double precision
-    //      - single precision is in the top half of the inputs
-    assign XSgn = X[63];
-    assign YSgn = Y[63];
-    assign ZSgn = Addend[63]^FOpCtrlM[0]; //Negate Z if subtraction
-
-    assign ZExp = FmtM ? Addend[62:52] : {3'b0, Addend[62:55]};
-
-
-
-
+    
    // Calculate the product's sign
    //      Negate product's sign if FNMADD or FNMSUB
-    assign PSgn = XSgn ^ YSgn ^ FOpCtrlM[1];
+    assign PSgn = XSgnM ^ YSgnM ^ FOpCtrlM[1];



@ -321,7 +241,7 @@ module fma2(
    // Negate Z  when doing one of the following operations:
    //      -prod +  Z
    //       prod -  Z
-    assign InvZ = ZSgn ^ PSgn;
+    assign InvZ = ZSgnM ^ PSgn;

    // Choose an inverted or non-inverted addend - the one is added later
    assign AlignedAddend2 = InvZ ? ~{1'b0, AlignedAddendM} : {1'b0, AlignedAddendM};
@ -376,7 +296,7 @@ module fma2(
    assign FracLen = FmtM ? 13'd52 : 13'd23;

    // Determine if the result is denormal
-    assign SumExpTmp = KillProdM ? {2'b0, ZExp} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
+    assign SumExpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
    assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-FracLen)) & ~SumZero;

    // Determine the shift needed for denormal results
@ -501,13 +421,13 @@ module fma2(
    // Determine the sign if the sum is zero
    //      if cancelation then 0 unless round to -infinity
    //      otherwise psign
-    assign ZeroSgn = (PSgn^ZSgn)&~Underflow ? FrmM == 3'b010 : PSgn;
+    assign ZeroSgn = (PSgn^ZSgnM)&~Underflow ? FrmM == 3'b010 : PSgn;

    // is the result negative
    //  if p - z is the Sum negative
    //  if -p + z is the Sum positive
    //  if -p - z then the Sum is negative
-    assign ResultSgnTmp = InvZ&(ZSgn)&NegSum | InvZ&PSgn&~NegSum | ((ZSgn)&PSgn);
+    assign ResultSgnTmp = InvZ&(ZSgnM)&NegSum | InvZ&PSgn&~NegSum | ((ZSgnM)&PSgn);
    assign ResultSgn = SumZero ? ZeroSgn : ResultSgnTmp;
 

@ -525,9 +445,8 @@ module fma2(
    //   2) Inf - Inf (unless x or y is NaN)
    //   3) 0 * Inf
    assign MaxExp = FmtM ? 13'd2047 : 13'd255;
-    assign SigNaN = FmtM ? (XNaNM&~X[51]) | (YNaNM&~Y[51]) | (ZNaNM&~Addend[51]) :
-                           (XNaNM&~X[54]) | (YNaNM&~Y[54]) | (ZNaNM&~Addend[54]);
-    assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (PSgn ^ ZSgn) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
+    assign SigNaN = XSNaNM | YSNaNM | ZSNaNM;
+    assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (PSgn ^ ZSgnM) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
   
    // Set Overflow flag if the number is too big to be represented
    //      - Don't set the overflow flag if an overflowed result isn't outputed
@ -555,28 +474,28 @@ module fma2(
    ///////////////////////////////////////////////////////////////////////////////
    // Select the result
    ///////////////////////////////////////////////////////////////////////////////
-    assign XNaNResult = FmtM ? {XSgn, X[62:52], 1'b1,X[50:0]} : {XSgn, X[62:55], 1'b1,X[53:0]};
-    assign YNaNResult = FmtM ? {YSgn, Y[62:52], 1'b1,Y[50:0]} : {YSgn, Y[62:55], 1'b1,Y[53:0]};
-    assign ZNaNResult = FmtM ? {ZSgn, Addend[62:52], 1'b1,Addend[50:0]} : {ZSgn, Addend[62:55], 1'b1,Addend[53:0]};
+    assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, XFracM[50:0]} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, XFracM[50:29]};
+    assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, YFracM[50:0]} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, YFracM[50:29]};
+    assign ZNaNResult = FmtM ? {ZSgnM, ZExpM, 1'b1, ZFracM[50:0]} : {{32{1'b1}}, ZSgnM, ZExpM[7:0], 1'b1, ZFracM[50:29]};
    assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 11'h7fe, {52{1'b1}}} :
                                                                                                                          {ResultSgn, 11'h7ff, 52'b0} :
-                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 8'hfe, {23{1'b1}}, 32'b0} :
-                                                                                                                          {ResultSgn, 8'hff, 55'b0};
-    assign InvalidResult = FmtM ? {ResultSgn, 11'h7ff, 1'b1, 51'b0} : {ResultSgn, 8'hff, 1'b1, 54'b0};
-    assign KillProdResult = FmtM ?{ResultSgn, Addend[62:0] - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {ResultSgn, Addend[62:32] - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}, 32'b0};
-    assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))} : {{ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}, 32'b0};
+                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {{32{1'b1}}, ResultSgn, 8'hfe, {23{1'b1}}} :
+                                                                                                                          {{32{1'b1}}, ResultSgn, 8'hff, 23'b0};
+    assign InvalidResult = FmtM ? {ResultSgn, 11'h7ff, 1'b1, 51'b0} : {{32{1'b1}}, ResultSgn, 8'hff, 1'b1, 22'b0};
+    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZFracM} - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {{32{1'b1}}, ResultSgn, {ZExpM[7:0], ZFracM[51:29]} - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}};
+    assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))} : {{32{1'b1}}, {ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}};
    assign FMAResM = XNaNM ? XNaNResult :
                        YNaNM ? YNaNResult :
                        ZNaNM ? ZNaNResult :
                        Invalid ? InvalidResult : // has to be before inf
-                        XInfM ? {PSgn, X[62:0]} :
-                        YInfM ? {PSgn, Y[62:0]} :
-                        ZInfM ? {ZSgn, Addend[62:0]} :
+                        XInfM ? (FmtM ? {PSgn, XExpM, XFracM} : {{32{1'b1}}, PSgn, XExpM[7:0], XFracM[51:29]}) : // X is Inf: pass X through with the product sign
+                        YInfM ? (FmtM ? {PSgn, YExpM, YFracM} : {{32{1'b1}}, PSgn, YExpM[7:0], YFracM[51:29]}) : // Y is Inf: pass Y through with the product sign
+                        ZInfM ? (FmtM ? {ZSgnM, ZExpM, ZFracM} : {{32{1'b1}}, ZSgnM, ZExpM[7:0], ZFracM[51:29]}) : // Z is Inf: pass the addend through
                        Overflow ? OverflowResult :
                        KillProdM ? KillProdResult : // has to be after Underflow      
                        Underflow & ~ResultDenorm ? UnderflowResult :  
                        FmtM ? {ResultSgn, ResultExp, ResultFrac} :
-                               {ResultSgn, ResultExp[7:0], ResultFrac, 3'b0};
+                               {{32{1'b1}}, ResultSgn, ResultExp[7:0], ResultFrac[51:29]}; // single precision: upper 32 bits are all ones



--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@ -22,8 +22,8 @@
 // Step 7: Put quotient/remainder onto output.
 //

-`timescale 1ps/1ps
-module fpdiv (done, AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn,
+// `timescale 1ps/1ps
+module fpdiv (AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn,
 	      start, reset, clk);

   input [63:0] op1;		// 1st input operand (A)
@ -40,7 +40,8 @@ module fpdiv (done, AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, Un
   output [63:0] AS_Result;	// Result of operation
   output [4:0]  Flags;   	// IEEE exception flags 
   output 	 Denorm;   	// Denorm on input or output
-   output 	 done;
+   logic 	 done;
+   // output 	 done;

   supply1 	  vdd;
   supply0 	  vss;   
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -34,6 +34,7 @@ module fpu (
  input logic [`XLEN-1:0]  SrcAM,      // Integer input being written into fpreg
  input logic 		         StallE, StallM, StallW,
  input logic 		         FlushE, FlushM, FlushW,
+  input logic [4:0]        RdE, RdM, RdW, 
  output logic          FRegWriteM,
  output logic 		      FStallD,    // Stall the decode stage
  output logic 		      FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
@ -52,7 +53,7 @@ module fpu (
      logic 		   FmtD, FmtE, FmtM, FmtW;                                  // FP precision 0-single 1-double
      logic 		   FDivStartD, FDivStartE;                                  // Start division
      logic 		   FWriteIntD;                                              // Write to integer register
-      logic [1:0]    ForwardXE, ForwardYE, ForwardZE;                        // Input3 forwarding mux control signal
+      logic [1:0]    FForwardXE, FForwardYE, FForwardZE;                        // Input3 forwarding mux control signal
      logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
      logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM;                  // Select which opperation to do in each component
      logic [1:0]    FResSelD, FResSelE, FResSelM;  
@ -60,13 +61,34 @@ module fpu (
      logic [4:0] 	Adr1E, Adr2E, Adr3E;
      
      // regfile signals
-      logic [4:0]    RdE, RdM, RdW;                                           // what adress to write to    // ***Can take from ieu insted of pipelining
      logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
      logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
-      logic [`XLEN-1:0]   SrcXMAligned;
-      logic [63:0] 	SrcXE, SrcXM;                         // Input 1 to the various units (after forwarding)
-      logic [63:0] 	SrcYE, SrcYM;                                      // Input 2 to the various units (after forwarding)
-      logic [63:0] 	SrcZE, SrcZM;                                      // Input 3 to the various units (after forwarding)
+      logic [`XLEN-1:0]   FSrcXMAligned;
+      logic [63:0] 	FSrcXE, FSrcXM;                         // Input 1 to the various units (after forwarding)
+      logic [63:0] 	FSrcYE;                                      // Input 2 to the various units (after forwarding)
+      logic [63:0] 	FSrcZE;                                      // Input 3 to the various units (after forwarding)
+      
+      // unpacking signals
+      logic XSgnE, YSgnE, ZSgnE;
+      logic [10:0] XExpE, YExpE, ZExpE;
+      logic [51:0] XFracE, YFracE, ZFracE;
+      logic        XAssumed1E, YAssumed1E, ZAssumed1E;
+      logic XNaNE, YNaNE, ZNaNE;
+      logic XSNaNE, YSNaNE, ZSNaNE;
+      logic XDenormE, YDenormE, ZDenormE;
+      logic XZeroE, YZeroE, ZZeroE;
+      logic [10:0] BiasE;
+      logic XInfE, YInfE, ZInfE;
+      logic XExpMaxE;
+      logic XNormE;
+
+      logic XSgnM, YSgnM, ZSgnM;
+      logic [10:0] XExpM, YExpM, ZExpM;
+      logic [51:0] XFracM, YFracM, ZFracM;
+      logic XNaNM, YNaNM, ZNaNM;
+      logic XSNaNM, YSNaNM, ZSNaNM;
+      logic XZeroM, YZeroM, ZZeroM;
+      logic XInfM, YInfM, ZInfM;
      
      // div/sqrt signals
      logic [63:0] 	FDivResultM, FDivResultW;
@ -131,26 +153,28 @@ module fpu (
      flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
      flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
                                                            {Adr1E,         Adr2E,         Adr3E});
-      flopenrc #(22) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-                           {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD},
-                           {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE});
+      flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+                           {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD},
+                           {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE});


      //EXECUTION STAGE
      
      // Hazard unit for FPU
      fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, .FStallD, 
-                        .ForwardXE, .ForwardYE, .ForwardZE);
+                        .FForwardXE, .FForwardYE, .FForwardZE);

      // forwarding muxs
-      mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, ForwardXE, SrcXE);
-      mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, ForwardYE, SrcYE);
-      mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, ForwardZE, SrcZE);
+      mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
+      mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FSrcYE);
+      mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FSrcZE);

-      
+      unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XFracE, .YFracE, .ZFracE, .XAssumed1E, .YAssumed1E, .ZAssumed1E, .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
      // first of two-stage instance of floating-point fused multiply-add unit
      fma fma (.clk, .reset, .FlushM, .StallM, 
-               .SrcXE, .SrcYE, .SrcZE, .SrcXM, .SrcYM, .SrcZM, 
+               .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XFracE, .YFracE, .ZFracE, .XAssumed1E, .YAssumed1E, .ZAssumed1E, .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
+               .XSgnM, .YSgnM, .ZSgnM, .XExpM, .YExpM, .ZExpM, .XFracM, .YFracM, .ZFracM, .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
+              //  .FSrcXE, .FSrcYE, .FSrcZE, .FSrcXM, .FSrcYM, .FSrcZM, 
               .FOpCtrlE(FOpCtrlE[2:0]), .FOpCtrlM(FOpCtrlM[2:0]), 
               .FmtE, .FmtM, .FrmM, .FMAFlgM, .FMAResM);
      
@ -163,43 +187,50 @@ module fpu (
            .ECLK(fpdivClk));
      
      // capture the inputs for div/sqrt	 
-      flopenrc #(64) reg_input1 (.d(SrcXE), .q(DivInput1E),
+      flopenrc #(64) reg_input1 (.d(FSrcXE), .q(DivInput1E),
                  .en(~HoldInputs), .clear(FDivSqrtDoneE),
                  .reset(reset),  .clk(clk));
-      flopenrc #(64) reg_input2 (.d(SrcYE), .q(DivInput2E),
+      flopenrc #(64) reg_input2 (.d(FSrcYE), .q(DivInput2E),
                  .en(~HoldInputs), .clear(FDivSqrtDoneE),
                  .reset(reset),  .clk(clk));

-      fpdiv fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
-                        .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
-                        .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
-      
+      // fpdiv fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
+      //                   .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
+      //                   .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
+      assign FDivBusyE = 0;
      // first of two-stage instance of floating-point add/cvt unit
      faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM,
-                        .SrcXE, .SrcYE, .FOpCtrlE, .FAddResM, .FAddFlgM);
+                        .FSrcXE, .FSrcYE, .FOpCtrlE, .FAddResM, .FAddFlgM);
      
      // first and only instance of floating-point comparator
-      fcmp fcmp (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpNVE, CmpResE);
+      fcmp fcmp (.op1({XSgnE,XExpE,XFracE}), .op2({YSgnE,YExpE,YFracE}), .FSrcXE, .FSrcYE, .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .Invalid(CmpNVE), .CmpResE, .XNaNE, .YNaNE, .XZeroE, .YZeroE);
      
      // first and only instance of floating-point sign converter
-      fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .SrcXE, .SrcYE, .SgnResE, .SgnNVE);
+      fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .XExpE, .XFracE, .FmtE, .SgnResE, .SgnNVE, .XExpMaxE);
      
      // first and only instance of floating-point classify unit
-      fclassify fclassify (.SrcXE, .FmtE, .ClassResE);
+      fclassify fclassify (.XSgnE, .XFracE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, .XSNaNE, .ClassResE);


-      fcvt fcvt (.X(SrcXE), .SrcAE, .FOpCtrlE, .FmtE, .FrmE, .CvtResE, .CvtFlgE);
+      fcvt fcvt (.XSgnE, .XExpE, .XFracE, .XAssumed1E, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE, .CvtResE, .CvtFlgE);

      // output for store instructions
-      // mux2  #(`XLEN)  FWriteDataMux({{`XLEN-32{1'b0}}, SrcYE[63:32]}, SrcYE[63:64-`XLEN], FmtE, FWriteDataE);
-      assign FWriteDataE = SrcYE[`XLEN-1:0];
+      // mux2  #(`XLEN)  FWriteDataMux({{`XLEN-32{1'b0}}, FSrcYE[63:32]}, FSrcYE[63:64-`XLEN], FmtE, FWriteDataE);
+      assign FWriteDataE = FSrcYE[`XLEN-1:0];

      //*****************
      // E/M pipe registers
      //*****************
-      flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, SrcXE, SrcXM);
-      flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, SrcYE, SrcYM);
-      flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, SrcZE, SrcZM);
+      flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
+      // flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, FSrcYE, FSrcYM);
+      // flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, FSrcZE, FSrcZM);
+      flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XFracE}, {XSgnM,XExpM,XFracM});
+      flopenrc #(64) EMFpReg5(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YFracE}, {YSgnM,YExpM,YFracM});
+      flopenrc #(64) EMFpReg6(clk, reset, FlushM, ~StallM, {ZSgnE,ZExpE,ZFracE}, {ZSgnM,ZExpM,ZFracM});
+      flopenrc #(12) EMFpReg7(clk, reset, FlushM, ~StallM, 
+                          {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
+                          {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});
+
      
     
      flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
@ -211,9 +242,9 @@ module fpu (
      flopenrc #(64) EMRegCvt1(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
      flopenrc #(5) EMRegCvt2(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
      
-      flopenrc #(22) EMCtrlReg(clk, reset, FlushM, ~StallM,
-                           {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE, FOpCtrlE, FWriteIntE},
-                           {FRegWriteM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});
+      flopenrc #(17) EMCtrlReg(clk, reset, FlushM, ~StallM,
+                           {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
+                           {FRegWriteM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});

      flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);

@ -221,8 +252,8 @@ module fpu (
      mux4  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, CvtResM, FResSelM, FResM);
      mux4  #(5)  FFlgMux(5'b0, {4'b0, SgnNVM}, {4'b0, CmpNVM}, CvtFlgM, FResSelM, FFlgM);

-      // mux2  #(`XLEN)  SrcXAlignedMux({{`XLEN-32{1'b0}}, SrcXM[63:32]}, SrcXM[63:64-`XLEN], FmtM, SrcXMAligned);
-      mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXM[`XLEN-1:0], ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);
+      // mux2  #(`XLEN)  FSrcXAlignedMux({{`XLEN-32{1'b0}}, FSrcXM[63:32]}, FSrcXM[63:64-`XLEN], FmtM, FSrcXMAligned);
+      mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], FSrcXM[`XLEN-1:0], ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);
      
      // Align SrcA to MSB when single precicion
      mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAM[31:0]}, {{64-`XLEN{1'b1}}, SrcAM}, FmtM, AlignedSrcAM);
@ -241,9 +272,9 @@ module fpu (

      flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
      
-      flopenrc #(11) MWCtrlReg(clk, reset, FlushW, ~StallW,
-                           {FRegWriteM, FResultSelM, RdM, FmtM, FWriteIntM},
-                           {FRegWriteW, FResultSelW, RdW, FmtW, FWriteIntW});
+      flopenrc #(6) MWCtrlReg(clk, reset, FlushW, ~StallW,
+                           {FRegWriteM, FResultSelM, FmtM, FWriteIntM},
+                           {FRegWriteW, FResultSelW, FmtW, FWriteIntW});
      
   //#########################################
   // BEGIN WRITEBACK STAGE
--- a/wally-pipelined/src/fpu/fsgn.sv
+++ b/wally-pipelined/src/fpu/fsgn.sv
@ -1,30 +1,34 @@
 //performs the fsgnj/fsgnjn/fsgnjx RISCV instructions

-module fsgn (
-	input  logic [63:0]  SrcXE, SrcYE,
+module fsgn (  
+	input logic        XSgnE, YSgnE,   // unpacked sign bits of X and Y
+    input logic [10:0] XExpE,          // unpacked exponent of X (single precision uses bits [7:0])
+    input logic [51:0] XFracE,         // unpacked fraction of X (single precision uses bits [51:29])
+	input logic XExpMaxE,              // X exponent is all ones
+	input logic FmtE,                  // selects the result layout: 1 = 64-bit, 0 = 32-bit with upper bits set to ones
 	input  logic [1:0]   SgnOpCodeE,
 	output logic [63:0]  SgnResE,
 	output logic   SgnNVE);

 	logic AonesExp;
+	logic ResSgn;                      // sign bit of the result, independent of format

 	//op code designation:
 	//
-	//00 - fsgnj - directly copy over sign value of SrcYE
-	//01 - fsgnjn - negate sign value of SrcYE
-	//10 - fsgnjx - XOR sign values of SrcXE & SrcYE
+	//00 - fsgnj - directly copy over sign value of Y (YSgnE)
+	//01 - fsgnjn - negate sign value of Y (YSgnE)
+	//10 - fsgnjx - XOR sign values of X & Y (XSgnE ^ YSgnE)
 	//
 	
-	assign SgnResE[63] = SgnOpCodeE[1] ? (SrcXE[63] ^ SrcYE[63]) : (SrcYE[63] ^ SgnOpCodeE[0]);
-	assign SgnResE[62:0] = SrcXE[62:0];
+	assign ResSgn = SgnOpCodeE[1] ? (XSgnE ^ YSgnE) : (YSgnE ^ SgnOpCodeE[0]);
+	assign SgnResE = FmtE ? {ResSgn, XExpE, XFracE} : {{32{1'b1}}, ResSgn, XExpE[7:0], XFracE[51:29]}; // repack X with the new sign

 	//If the exponent is all ones, then the value is either Inf or NaN,
 	//both of which will produce a QNaN/SNaN value of some sort. This will 
 	//set the invalid flag high.
-	assign AonesExp = SrcXE[62]&SrcXE[61]&SrcXE[60]&SrcXE[59]&SrcXE[58]&SrcXE[57]&SrcXE[56]&SrcXE[55]&SrcXE[54]&SrcXE[53]&SrcXE[52];

 	//the only flag that can occur during this operation is invalid
 	//due to changing sign on already existing NaN
-	assign SgnNVE = AonesExp & SgnResE[63];
+	assign SgnNVE = XExpMaxE & ResSgn; // use ResSgn: SgnResE[63] is a constant 1 when FmtE is 0

 endmodule
--- a/wally-pipelined/src/fpu/fsm_div.v
+++ b/wally-pipelined/src/fpu/fsm_div.v
@ -1,3 +1,5 @@
+
+`timescale 1ps/1ps
 module fsm_div (done, load_rega, load_regb, load_regc, 
 		load_regd, load_regr, load_regs,
 		sel_muxa, sel_muxb, sel_muxr, 
--- a/wally-pipelined/src/fpu/sbtm.sv
+++ b/wally-pipelined/src/fpu/sbtm.sv
@ -1,33 +1,33 @@
-module sbtm (input logic [11:0] a, output logic [10:0] ia_out);
+// module sbtm (input logic [11:0] a, output logic [10:0] ia_out);

-   // bit partitions
-   logic [3:0] x0;
-   logic [2:0] x1;
-   logic [3:0] x2;
-   logic [2:0] x2_1cmp;   
-   // mem outputs
-   logic [12:0] y0;
-   logic [4:0] 	y1;
-   // input to CPA
-   logic [14:0] op1;
-   logic [14:0] op2;
-   logic [14:0] p;   
+//    // bit partitions
+//    logic [3:0] x0;
+//    logic [2:0] x1;
+//    logic [3:0] x2;
+//    logic [2:0] x2_1cmp;   
+//    // mem outputs
+//    logic [12:0] y0;
+//    logic [4:0] 	y1;
+//    // input to CPA
+//    logic [14:0] op1;
+//    logic [14:0] op2;
+//    logic [14:0] p;   

-   assign x0 = a[10:7];
-   assign x1 = a[6:4];
-   assign x2 = a[3:0];   
+//    assign x0 = a[10:7];
+//    assign x1 = a[6:4];
+//    assign x2 = a[3:0];   

-   sbtm_a0 mem1 ({x0, x1}, y0);
-   // 1s cmp per sbtm/stam
-   assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];   
-   sbtm_a1 mem2 ({x0, x2_1cmp}, y1);
-   assign op1 = {1'b0, y0, 1'b0};
-   // 1s cmp per sbtm/stam
-   assign op2 = x2[3] ? {1'b1, {8{1'b1}}, ~y1, 1'b1} :
-		{1'b0, 8'b0, y1, 1'b1};
-   // CPA
-   adder #(15) cp1 (op1, op2, 1'b0, p, cout);  
-   //assign ia_out = {p[14:4], {53{1'b0}}};
-   assign ia_out = p[14:4];
+//    sbtm_a0 mem1 ({x0, x1}, y0);
+//    // 1s cmp per sbtm/stam
+//    assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];   
+//    sbtm_a1 mem2 ({x0, x2_1cmp}, y1);
+//    assign op1 = {1'b0, y0, 1'b0};
+//    // 1s cmp per sbtm/stam
+//    assign op2 = x2[3] ? {1'b1, {8{1'b1}}, ~y1, 1'b1} :
+// 		{1'b0, 8'b0, y1, 1'b1};
+//    // CPA
+//    adder #(15) cp1 (op1, op2, 1'b0, p, cout);  
+//    //assign ia_out = {p[14:4], {53{1'b0}}};
+//    assign ia_out = p[14:4];

-endmodule // sbtm
+// endmodule // sbtm
--- a/wally-pipelined/src/fpu/sbtm2.sv
+++ b/wally-pipelined/src/fpu/sbtm2.sv
@ -1,39 +1,39 @@
  
-module sbtm2 (input logic [11:0] a, output logic [10:0] y);
+// module sbtm2 (input logic [11:0] a, output logic [10:0] y);

-   // bit partitions
-   logic [4:0] x0;
-   logic [2:0] x1;
-   logic [3:0] x2;
-   logic [2:0] x2_1cmp;   
-   // mem outputs
-   logic [12:0] y0;
-   logic [5:0] 	y1;
-   // input to CPA
-   logic [14:0] op1;
-   logic [14:0] op2;
-   logic [14:0] p; 
-   logic cout;  
+//    // bit partitions
+//    logic [4:0] x0;
+//    logic [2:0] x1;
+//    logic [3:0] x2;
+//    logic [2:0] x2_1cmp;   
+//    // mem outputs
+//    logic [12:0] y0;
+//    logic [5:0] 	y1;
+//    // input to CPA
+//    logic [14:0] op1;
+//    logic [14:0] op2;
+//    logic [14:0] p; 
+//    logic cout;  

-   assign x0 = a[11:7];
-   assign x1 = a[6:4];
-   assign x2 = a[3:0];   
+//    assign x0 = a[11:7];
+//    assign x1 = a[6:4];
+//    assign x2 = a[3:0];   

-   sbtm_a2 mem1 ({x0[3:0], x1}, y0);
-   assign op1 = {1'b0, y0, 1'b0};
+//    sbtm_a2 mem1 ({x0[3:0], x1}, y0);
+//    assign op1 = {1'b0, y0, 1'b0};
   
-   // 1s cmp per sbtm/stam
-   assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];   
-   sbtm_a3 mem2 ({x0, x2_1cmp}, y1);
-   // 1s cmp per sbtm/stam
-   assign op2 = x2[3] ? {{8{1'b1}}, ~y1, 1'b1} :
-		{8'b0, y1, 1'b1};
+//    // 1s cmp per sbtm/stam
+//    assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];   
+//    sbtm_a3 mem2 ({x0, x2_1cmp}, y1);
+//    // 1s cmp per sbtm/stam
+//    assign op2 = x2[3] ? {{8{1'b1}}, ~y1, 1'b1} :
+// 		{8'b0, y1, 1'b1};
   
-   // CPA
-   bk15 cp1 (cout, p, op1, op2, 1'b0);
-   assign y = p[14:4];
+//    // CPA
+//    bk15 cp1 (cout, p, op1, op2, 1'b0);
+//    assign y = p[14:4];

-endmodule // sbtm2
+// endmodule // sbtm2


   
--- a/wally-pipelined/src/fpu/sbtm3.sv
+++ b/wally-pipelined/src/fpu/sbtm3.sv
@ -1,37 +1,37 @@
-module sbtm2 (input logic [11:0] a, output logic [10:0] y);
+// module sbtm2 (input logic [11:0] a, output logic [10:0] y);

-   // bit partitions
-   logic [4:0] x0;
-   logic [2:0] x1;
-   logic [3:0] x2;
-   logic [2:0] x2_1cmp;   
-   // mem outputs
-   logic [13:0] y0;
-   logic [5:0] 	y1;
-   // input to CPA
-   logic [14:0] op1;
-   logic [14:0] op2;
-   logic [14:0] p;   
+//    // bit partitions
+//    logic [4:0] x0;
+//    logic [2:0] x1;
+//    logic [3:0] x2;
+//    logic [2:0] x2_1cmp;   
+//    // mem outputs
+//    logic [13:0] y0;
+//    logic [5:0] 	y1;
+//    // input to CPA
+//    logic [14:0] op1;
+//    logic [14:0] op2;
+//    logic [14:0] p;   

-   assign x0 = a[11:7];
-   assign x1 = a[6:4];
-   assign x2 = a[3:0];   
+//    assign x0 = a[11:7];
+//    assign x1 = a[6:4];
+//    assign x2 = a[3:0];   

-   sbtm_a2 mem1 ({x0, x1}, y0);
-   assign op1 = {y0, 1'b0};
+//    sbtm_a2 mem1 ({x0, x1}, y0);
+//    assign op1 = {y0, 1'b0};
   
-   // 1s cmp per sbtm/stam
-   assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];   
-   sbtm_a3 mem2 ({x0, x2_1cmp}, y1);
-   // 1s cmp per sbtm/stam
-   assign op2 = x2[3] ? {{8{1'b1}}, ~y1, 1'b1} :
-		{8'b0, y1, 1'b1};
+//    // 1s cmp per sbtm/stam
+//    assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];   
+//    sbtm_a3 mem2 ({x0, x2_1cmp}, y1);
+//    // 1s cmp per sbtm/stam
+//    assign op2 = x2[3] ? {{8{1'b1}}, ~y1, 1'b1} :
+// 		{8'b0, y1, 1'b1};
   
-   // CPA
-   adder #(15) cp1 (op1, op2, 1'b0, p, cout); 
-   assign y = p[14:4];
+//    // CPA
+//    adder #(15) cp1 (op1, op2, 1'b0, p, cout); 
+//    assign y = p[14:4];

-endmodule // sbtm2
+// endmodule // sbtm2


   
--- a/wally-pipelined/src/fpu/sbtm_a4.sv
+++ b/wally-pipelined/src/fpu/sbtm_a4.sv
@ -1,4 +1,4 @@
-module sbtm_a2 (input  logic [7:0] a,
+module sbtm_a4 (input  logic [7:0] a,
 		output logic [13:0] y);
   always_comb
     case(a)
--- a/wally-pipelined/src/fpu/sbtm_a5.sv
+++ b/wally-pipelined/src/fpu/sbtm_a5.sv
@ -1,4 +1,4 @@
-module sbtm_a3 (input  logic [7:0] a,
+module sbtm_a5 (input  logic [7:0] a,
 		output logic [5:0] y);
   always_comb
     case(a)
--- a/wally-pipelined/src/fpu/unpacking.sv
+++ b/wally-pipelined/src/fpu/unpacking.sv
@ -0,0 +1,77 @@
+module unpacking ( // splits FP operands X, Y, Z into sign/exponent/fraction and classifies them
+    input logic  [63:0] X, Y, Z,   // packed operands; single precision is read from bits [31:0]
+    input logic         FmtE,      // 1 = double-precision field layout, 0 = single
+    input logic  [2:0]  FOpCtrlE,  // [2] zeroes the addend, [0] inverts the addend sign
+    output logic        XSgnE, YSgnE, ZSgnE,
+    output logic [10:0] XExpE, YExpE, ZExpE,       // single-precision exponents are zero-extended
+    output logic [51:0] XFracE, YFracE, ZFracE,    // single-precision fractions are left-aligned
+    output logic        XAssumed1E, YAssumed1E, ZAssumed1E, // exponent field is nonzero
+    output logic XNormE,                           // X exponent is neither all ones nor zero
+    output logic XNaNE, YNaNE, ZNaNE,
+    output logic XSNaNE, YSNaNE, ZSNaNE,           // NaN whose fraction MSB (quiet bit) is clear
+    output logic XDenormE, YDenormE, ZDenormE,
+    output logic XZeroE, YZeroE, ZZeroE,
+    output logic [10:0] BiasE,                     // 1023 for double, 127 for single
+    output logic XInfE, YInfE, ZInfE,
+    output logic XExpMaxE                          // X exponent is all ones
+);
+
+    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
+    logic           XExpZero, YExpZero, ZExpZero; // input exponent zero
+    logic [63:0]    Addend; // value to add (Z or zero)
+    logic           YExpMaxE, ZExpMaxE;  // input exponent all 1s
+
+    assign Addend = FOpCtrlE[2] ? 64'b0 : Z; // Z is only used in the FMA, and is set to zero if a multiply operation
+    assign XSgnE = FmtE ? X[63] : X[31];
+    assign YSgnE = FmtE ? Y[63] : Y[31];
+    assign ZSgnE = FmtE ? Addend[63]^FOpCtrlE[0] : Addend[31]^FOpCtrlE[0]; // FOpCtrlE[0] flips the addend sign
+
+    assign XExpE = FmtE ? X[62:52] : {3'b0, X[30:23]};
+    assign YExpE = FmtE ? Y[62:52] : {3'b0, Y[30:23]};
+    assign ZExpE = FmtE ? Addend[62:52] : {3'b0, Addend[30:23]};
+
+    assign XFracE = FmtE ? X[51:0] : {X[22:0], 29'b0};
+    assign YFracE = FmtE ? Y[51:0] : {Y[22:0], 29'b0};
+    assign ZFracE = FmtE ? Addend[51:0] : {Addend[22:0], 29'b0};
+
+    assign XAssumed1E = |XExpE;
+    assign YAssumed1E = |YExpE;
+    assign ZAssumed1E = |ZExpE;
+
+    assign XExpZero = ~XAssumed1E;
+    assign YExpZero = ~YAssumed1E;
+    assign ZExpZero = ~ZAssumed1E;
+   
+    assign XFracZero = ~|XFracE;
+    assign YFracZero = ~|YFracE;
+    assign ZFracZero = ~|ZFracE;
+
+    assign XExpMaxE = FmtE ? &XExpE[10:0] : &XExpE[7:0];
+    assign YExpMaxE = FmtE ? &YExpE[10:0] : &YExpE[7:0];
+    assign ZExpMaxE = FmtE ? &ZExpE[10:0] : &ZExpE[7:0];
+   
+    assign XNormE = ~(XExpMaxE|XExpZero);
+    
+    assign XNaNE = XExpMaxE & ~XFracZero;
+    assign YNaNE = YExpMaxE & ~YFracZero;
+    assign ZNaNE = ZExpMaxE & ~ZFracZero;
+
+    assign XSNaNE = XNaNE&~XFracE[51]; // quiet bit is the fraction MSB (the 11-bit exponent has no bit 51)
+    assign YSNaNE = YNaNE&~YFracE[51];
+    assign ZSNaNE = ZNaNE&~ZFracE[51];
+
+    assign XDenormE = XExpZero & ~XFracZero;
+    assign YDenormE = YExpZero & ~YFracZero;
+    assign ZDenormE = ZExpZero & ~ZFracZero;
+
+    assign XInfE = XExpMaxE & XFracZero;
+    assign YInfE = YExpMaxE & YFracZero;
+    assign ZInfE = ZExpMaxE & ZFracZero;
+
+    assign XZeroE = XExpZero & XFracZero;
+    assign YZeroE = YExpZero & YFracZero;
+    assign ZZeroE = ZExpZero & ZFracZero;
+
+    assign BiasE = FmtE ? 11'h3ff : 11'h07f; // literals sized to match the 11-bit port
+
+endmodule
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@ -42,6 +42,7 @@ module ieu (
  output logic 		   MulDivE, W64E,
  output logic [2:0] 	   Funct3E,
  output logic [`XLEN-1:0] SrcAE, SrcBE,
+  output logic [4:0]    RdE,
  input logic 		   FWriteIntM,

  // Memory stage interface
@ -53,12 +54,14 @@ module ieu (

  output logic [2:0] 	   Funct3M, // size and signedness to LSU
  output logic [`XLEN-1:0] SrcAM, // to privilege and fpu
+  output logic [4:0]    RdM,
  input logic 		   DataAccessFaultM,
  input logic [`XLEN-1:0]  FIntResM, 

  // Writeback stage
  input logic [`XLEN-1:0]  CSRReadValW, ReadDataW, MulDivResultW,
  input logic 		   FWriteIntW,
+  output logic [4:0]    RdW,
  // input  logic [`XLEN-1:0] PCLinkW,
  output logic 		   InstrValidM, 
  // hazards
@ -82,7 +85,7 @@ module ieu (
  logic        InstrValidW;

  // forwarding signals
-  logic [4:0]       Rs1D, Rs2D, Rs1E, Rs2E, RdE, RdM, RdW;
+  logic [4:0]       Rs1D, Rs2D, Rs1E, Rs2E;
  logic [1:0]       ForwardAE, ForwardBE;
  logic             RegWriteM, RegWriteW;
  logic             MemReadE, CSRReadE;
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -94,6 +94,7 @@ module wallypipelinedhart
  // floating point unit signals
  logic [2:0] 		    FRM_REGW;
  logic [1:0] 		    FMemRWM, FMemRWE;
+  logic [4:0]        RdE, RdM, RdW;
  logic 		    FStallD;
  logic 		    FWriteIntE, FWriteIntM, FWriteIntW;
  logic [`XLEN-1:0] 	    FWriteDataE;
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -57,12 +57,8 @@ module testbench();
 string tests32f[] = '{
    "rv32f/I-FADD-S-01", "2000",
    "rv32f/I-FCLASS-S-01", "2000",
-    "rv32f/I-FCVT-S-L-01", "2000",
-    "rv32f/I-FCVT-S-LU-01", "2000",
    "rv32f/I-FCVT-S-W-01", "2000",
    "rv32f/I-FCVT-S-WU-01", "2000",
-    "rv32f/I-FCVT-L-S-01", "2000",
-    "rv32f/I-FCVT-LU-S-01", "2000",
    "rv32f/I-FCVT-W-S-01", "2000",
    "rv32f/I-FCVT-WU-S-01", "2000",
    // "rv32f/I-FDIV-S-01", "2000",