From 8eed89616cc0b3dc160ca11bb23566625ceb339c Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Wed, 23 Jun 2021 16:42:40 -0400
Subject: [PATCH 01/20] fpu clean-up

---
 wally-pipelined/src/fpu/fpu.sv                | 229 ++++++++----------
 .../testbench/testbench-imperas.sv            |   2 +-
 2 files changed, 100 insertions(+), 131 deletions(-)

diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index fc38b2f69..e886c66e3 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 //
-// Written: 
-// Modified: 
+// Written: Katherine Parry, Bret Mathis
+// Modified: 6/23/2021
 //
 // Purpose: FPU
 // 
@@ -25,23 +25,23 @@
 `include "wally-config.vh"
 
 module fpu (
-  input logic [2:0]        FRM_REGW,   // Rounding mode from CSR
-  input logic 		         reset,
   input logic 		         clk,
+  input logic 		         reset,
+  input logic [2:0]        FRM_REGW,   // Rounding mode from CSR
   input logic [31:0]       InstrD,
+  input logic [`XLEN-1:0]  ReadDataW,     // Read data from memory
+  input logic 		         RegWriteD,  // register write enable from ieu
   input logic [`XLEN-1:0]  SrcAE,      // Integer input being processed
   input logic [`XLEN-1:0]  SrcAM,      // Integer input being written into fpreg
   input logic 		         StallE, StallM, StallW,
   input logic 		         FlushE, FlushM, FlushW,
-  input logic [`XLEN-1:0]  ReadDataW,     // Read data from memory
-  input logic 		         RegWriteD,  // register write enable from ieu
-  output logic [4:0] 	   SetFflagsM, // FPU flags
   output logic [1:0] 	   FMemRWM,    // Read/write enable for memory {read, write}
   output logic 		      FStallD,    // Stall the decode stage if Div/Sqrt instruction
   output logic 		      FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
   output logic [`XLEN-1:0] FWriteDataM,      // Data to be written to memory
   output logic 		      FDivBusyE,        // Is the divison/sqrt unit busy
   output logic 		      IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+  output logic [4:0] 	   SetFflagsM,       // FPU flags
   output logic [`XLEN-1:0] FPUResultW);      // FPU result
 
    // control logic signal instantiation
@@ -58,10 +58,10 @@ module fpu (
    logic 		   FInput2UsedD;                                            // Is input 2 used
    logic 		   FInput3UsedD;                                            // Is input 3 used
    logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
-   logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM, FOpCtrlW;                            // Select which opperation to do in each component
+   logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM, FOpCtrlW;                  // Select which opperation to do in each component
    logic          SelLoadInputE, SelLoadInputM;                            // Select which adress to load when single precision
    
-   // regfile signals //*** KEP lint warning -  changed `XLEN-1 to 63 
+   // regfile signals
    logic [4:0]    RdE, RdM, RdW;                                           // what adress to write to    // ***Can take from ieu insted of pipelining
    logic [63:0] 	FWDM;                                                    // Write data for FP register
    logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
@@ -147,26 +147,6 @@ module fpu (
    logic [63:0] 	FPUResult64W, FPUResult64E;                                           
    logic [4:0] 	FPUFlagsW;
    
-   // pipeline control logic
-   logic 		   PipeEnableDE;
-   logic 		   PipeEnableEM;
-   logic 		   PipeEnableMW;
-   logic 		   PipeClearDE;
-   logic 		   PipeClearEM;
-   logic 		   PipeClearMW;
-   
-   // temporarily assign pipe clear and enable signals
-   // to never flush & always be running
-   localparam     PipeClear = 1'b0;
-   localparam     PipeEnable = 1'b1;
-   always_comb begin      
-      PipeEnableDE = ~StallE;
-      PipeEnableEM = ~StallM;
-      PipeEnableMW = ~StallW;
-      PipeClearDE = FlushE;
-      PipeClearEM = FlushM;
-      PipeClearMW = FlushW;      
-   end
    
    //DECODE STAGE
    
@@ -185,29 +165,18 @@ module fpu (
    //*****************
    // fpregfile D/E pipe registers
    //*****************
-   flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E);
-   flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E);
-   flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E);
+   flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
+   flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
+   flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
    
    //*****************
    // other  D/E pipe registers
    //*****************
-   flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE);
-   flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
-   flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
-   flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
-   flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
-   flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE);
-   flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE);
-   flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput1D, FForwardInput1E);
-   flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E);
-   flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E);
-   flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E);
-   flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
-   flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E);
-   flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
-   flopenrc #(1) DEReg18(clk, reset, PipeClearDE, PipeEnableDE, InstrD[15], SelLoadInputE);
-   
+   flopenrc #(64) DEReg14(clk, reset, FlushE, ~StallE, FPUResult64W, FPUResult64E);
+   flopenrc #(28) CtrlRegE(clk, reset, FlushE, ~StallE, 
+                        {FWriteEnD, FResultSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FDivStartD, FForwardInput1D, FForwardInput2D, FForwardInput3D, FWriteIntD, FOutputInput2D, FMemRWD, InstrD[15]},
+                        {FWriteEnE, FResultSelE, FrmE, FmtE, RdE,          FOpCtrlE, FDivStartE, FForwardInput1E, FForwardInput2E, FForwardInput3E, FWriteIntE, FOutputInput2E, FMemRWE, SelLoadInputE});
+  
    //EXECUTION STAGE
    
    // input muxs for forwarding   
@@ -253,91 +222,91 @@ module fpu (
    //*****************
    //fpregfile D/E pipe registers
    //*****************
-   flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M);
-   flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M);
-   flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M);
+   flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FInput1E, FInput1M);
+   flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, FInput2E, FInput2M);
+   flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, FInput3E, FInput3M);
    
    //*****************
    // fma E/M pipe registers
    //*****************  
-  flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, ProdManE, ProdManM); 
-  flopenrc #(162) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, AlignedAddendE, AlignedAddendM); 
-  flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, ProdExpE, ProdExpM);  
-  flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, AddendStickyE, AddendStickyM); 
-  flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, KillProdE, KillProdM); 
-  flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, XZeroE, XZeroM); 
-  flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, YZeroE, YZeroM); 
-  flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, ZZeroE, ZZeroM); 
-  flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, XInfE, XInfM); 
-  flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, YInfE, YInfM); 
-  flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, ZInfE, ZInfM); 
-  flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, XNaNE, XNaNM); 
-  flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, YNaNE, YNaNM); 
-  flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, ZNaNE, ZNaNM);  
+  flopenrc #(106) EMRegFma3(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
+  flopenrc #(162) EMRegFma4(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
+  flopenrc #(13) EMRegFma6(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
+  flopenrc #(1) EMRegFma7(clk, reset, FlushM, ~StallM, AddendStickyE, AddendStickyM); 
+  flopenrc #(1) EMRegFma8(clk, reset, FlushM, ~StallM, KillProdE, KillProdM); 
+  flopenrc #(1) EMRegFma10(clk, reset, FlushM, ~StallM, XZeroE, XZeroM); 
+  flopenrc #(1) EMRegFma11(clk, reset, FlushM, ~StallM, YZeroE, YZeroM); 
+  flopenrc #(1) EMRegFma12(clk, reset, FlushM, ~StallM, ZZeroE, ZZeroM); 
+  flopenrc #(1) EMRegFma16(clk, reset, FlushM, ~StallM, XInfE, XInfM); 
+  flopenrc #(1) EMRegFma17(clk, reset, FlushM, ~StallM, YInfE, YInfM); 
+  flopenrc #(1) EMRegFma18(clk, reset, FlushM, ~StallM, ZInfE, ZInfM); 
+  flopenrc #(1) EMRegFma19(clk, reset, FlushM, ~StallM, XNaNE, XNaNM); 
+  flopenrc #(1) EMRegFma20(clk, reset, FlushM, ~StallM, YNaNE, YNaNM); 
+  flopenrc #(1) EMRegFma21(clk, reset, FlushM, ~StallM, ZNaNE, ZNaNM);  
 
    //*****************
    // fpadd E/M pipe registers
    //*****************
-   flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
-   flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
-   flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
-   flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
-   flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
-   flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
-   flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
-   flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
-   flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
-   flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
-   flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
-   flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
-   flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
-   flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
-   flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); 
-   flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
-   flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
-   flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
-   flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
-   flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
-   flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
-   flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
-   flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
-   flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
-   flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
+   flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
+   flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
+   flopenrc #(4)  EMRegAdd3(clk, reset, FlushM, ~StallM, AddSelInvE, AddSelInvM); 
+   flopenrc #(11) EMRegAdd4(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
+   flopenrc #(1) EMRegAdd5(clk, reset, FlushM, ~StallM, AddCorrSignE, AddCorrSignM); 
+   flopenrc #(1) EMRegAdd6(clk, reset, FlushM, ~StallM, AddOp1NormE, AddOp1NormM); 
+   flopenrc #(1) EMRegAdd7(clk, reset, FlushM, ~StallM, AddOp2NormE, AddOp2NormM); 
+   flopenrc #(1) EMRegAdd8(clk, reset, FlushM, ~StallM, AddOpANormE, AddOpANormM); 
+   flopenrc #(1) EMRegAdd9(clk, reset, FlushM, ~StallM, AddOpBNormE, AddOpBNormM); 
+   flopenrc #(1) EMRegAdd10(clk, reset, FlushM, ~StallM, AddInvalidE, AddInvalidM); 
+   flopenrc #(1) EMRegAdd11(clk, reset, FlushM, ~StallM, AddDenormInE, AddDenormInM); 
+   flopenrc #(1) EMRegAdd12(clk, reset, FlushM, ~StallM, AddConvertE, AddConvertM); 
+   flopenrc #(1) EMRegAdd13(clk, reset, FlushM, ~StallM, AddSwapE, AddSwapM); 
+   flopenrc #(1) EMRegAdd14(clk, reset, FlushM, ~StallM, AddNormOvflowE, AddNormOvflowM); 
+   flopenrc #(1) EMRegAdd15(clk, reset, FlushM, ~StallM, AddSignAE, AddSignAM); 
+   flopenrc #(64) EMRegAdd16(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
+   flopenrc #(64) EMRegAdd17(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
+   flopenrc #(12) EMRegAdd18(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
+   flopenrc #(12) EMRegAdd19(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
+   flopenrc #(11) EMRegAdd20(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM); 
+   flopenrc #(3) EMRegAdd23(clk, reset, FlushM, ~StallM, AddRmE, AddRmM); 
+   flopenrc #(4) EMRegAdd24(clk, reset, FlushM, ~StallM, AddOpTypeE, AddOpTypeM); 
+   flopenrc #(1) EMRegAdd25(clk, reset, FlushM, ~StallM, AddPE, AddPM); 
+   flopenrc #(1) EMRegAdd26(clk, reset, FlushM, ~StallM, AddOvEnE, AddOvEnM); 
+   flopenrc #(1) EMRegAdd27(clk, reset, FlushM, ~StallM, AddUnEnE, AddUnEnM); 
    
    //*****************
    // fpcmp E/M pipe registers
    //*****************
-   flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
-   flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
-   flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
-   flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
-   flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
-   flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
+   flopenrc #(8) EMRegCmp1(clk, reset, FlushM, ~StallM, WE, WM); 
+   flopenrc #(8) EMRegCmp2(clk, reset, FlushM, ~StallM, XE, XM); 
+   flopenrc #(1) EMRegcmp3(clk, reset, FlushM, ~StallM, ANaNE, ANaNM); 
+   flopenrc #(1) EMRegCmp4(clk, reset, FlushM, ~StallM, BNaNE, BNaNM); 
+   flopenrc #(1) EMRegCmp5(clk, reset, FlushM, ~StallM, AzeroE, AzeroM); 
+   flopenrc #(1) EMRegCmp6(clk, reset, FlushM, ~StallM, BzeroE, BzeroM); 
    
    // put this in for the event we want to delay fsgn - will otherwise bypass
    //*****************
    // fpsgn E/M pipe registers
    //***************** 
-   flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
-   flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
+   flopenrc #(64) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnResultE, SgnResultM);
+   flopenrc #(5) EMRegSgn3(clk, reset, FlushM, ~StallM, SgnFlagsE, SgnFlagsM);
    
    //*****************
    // other E/M pipe registers
    //*****************
-   flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM);
-   flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
-   flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
-   flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
-   flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
-   flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM);
-   flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
-   flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
-   flopenrc #(1) EMReg9(clk, reset, PipeClearEM, PipeEnableEM, SelLoadInputE, SelLoadInputM);
+   flopenrc #(1) EMReg1(clk, reset, FlushM, ~StallM, FWriteEnE, FWriteEnM);
+   flopenrc #(3) EMReg2(clk, reset, FlushM, ~StallM, FResultSelE, FResultSelM);
+   flopenrc #(3) EMReg3(clk, reset, FlushM, ~StallM, FrmE, FrmM);
+   flopenrc #(1) EMReg4(clk, reset, FlushM, ~StallM, FmtE, FmtM);
+   flopenrc #(5) EMReg5(clk, reset, FlushM, ~StallM, RdE, RdM);
+   flopenrc #(4) EMReg6(clk, reset, FlushM, ~StallM, FOpCtrlE, FOpCtrlM);
+   flopenrc #(1) EMReg7(clk, reset, FlushM, ~StallM, FWriteIntE, FWriteIntM);
+   flopenrc #(2) EMReg8(clk, reset, FlushM, ~StallM, FMemRWE, FMemRWM);
+   flopenrc #(1) EMReg9(clk, reset, FlushM, ~StallM, SelLoadInputE, SelLoadInputM);
    
    //*****************
    // fpuclassify E/M pipe registers
    //***************** 
-   flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM);
+   flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResultE, ClassResultM);
    
    //BEGIN MEMORY STAGE
    
@@ -366,56 +335,56 @@ module fpu (
    //*****************
    //fpregfile M/W pipe registers
    //*****************
-   flopenrc #(64) MWFpReg1(clk, reset, PipeClearMW, PipeEnableMW, FInput1M, FInput1W);
+   flopenrc #(64) MWFpReg1(clk, reset, FlushW, ~StallW, FInput1M, FInput1W);
    
    //*****************
    // fma M/W pipe registers
    //*****************
-   flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
-   flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
+   flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FmaResultM, FmaResultW); 
+   flopenrc #(5) MWRegFma2(clk, reset, FlushW, ~StallW, FmaFlagsM, FmaFlagsW); 
    
    //*****************
    // fpdiv M/W pipe registers
    //*****************
-   flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); 
-   flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW);
-   flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
+   flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
+   flopenrc #(5) MWRegDiv2(clk, reset, FlushW, ~StallW, FDivFlagsM, FDivFlagsW);
+   flopenrc #(1) MWRegDiv3(clk, reset, FlushW, ~StallW, DivDenormM, DivDenormW); 
    
    //*****************
    // fpadd M/W pipe registers
    //*****************
-   flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); 
-   flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); 
+   flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResultM, FAddResultW); 
+   flopenrc #(5) MWRegAdd2(clk, reset, FlushW, ~StallW, FAddFlagsM, FAddFlagsW); 
    
    //*****************
    // fpcmp M/W pipe registers
    //*****************
-   flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
-   flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
-   flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); 
+   flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpInvalidM, CmpInvalidW); 
+   flopenrc #(2) MWRegCmp2(clk, reset, FlushW, ~StallW, CmpFCCM, CmpFCCW); 
+   flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, FCmpResultM, FCmpResultW); 
    
    //*****************
    // fpsgn M/W pipe registers
    //***************** 
-   flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
-   flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
+   flopenrc #(64) MWRegSgn1(clk, reset, FlushW, ~StallW, SgnResultM, SgnResultW);
+   flopenrc #(5) MWRegSgn2(clk, reset, FlushW, ~StallW, SgnFlagsM, SgnFlagsW);
    
    //*****************
    // other M/W pipe registers
    //*****************
-   flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW);
-   flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
-   flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
-   flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
-   flopenrc #(64) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, AlignedSrcAM, SrcAW);
-   // flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
-   flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
-   flopenrc #(4) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FOpCtrlM, FOpCtrlW);
+   flopenrc #(1) MWReg1(clk, reset, FlushW, ~StallW, FWriteEnM, FWriteEnW);
+   flopenrc #(3) MWReg2(clk, reset, FlushW, ~StallW, FResultSelM, FResultSelW);
+   flopenrc #(1) MWReg3(clk, reset, FlushW, ~StallW, FmtM, FmtW);
+   flopenrc #(5) MWReg4(clk, reset, FlushW, ~StallW, RdM, RdW);
+   flopenrc #(64) MWReg5(clk, reset, FlushW, ~StallW, AlignedSrcAM, SrcAW);
+   // flopenrc #(64) MWReg6(clk, reset, FlushW, ~StallW, FLoadStoreResultM, FLoadStoreResultW);
+   flopenrc #(1) MWReg7(clk, reset, FlushW, ~StallW, FWriteIntM, FWriteIntW);
+   flopenrc #(4) MWReg6(clk, reset, FlushW, ~StallW, FOpCtrlM, FOpCtrlW);
    
    //*****************
    // fpuclassify M/W pipe registers
    //***************** 
-   flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW);
+   flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, ClassResultM, ClassResultW);
    
 
 
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index 1bbe6124b..2b052dcdf 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -554,7 +554,7 @@ string tests32f[] = '{
           if (`C_SUPPORTED % 2 == 1) tests = {tests, tests32ic};    
           else                       tests = {tests, tests32iNOc};
           if (`M_SUPPORTED % 2 == 1) tests = {tests, tests32m};
-          // if (`F_SUPPORTED) tests = {tests32f, tests};
+          if (`F_SUPPORTED) tests = {tests32f, tests};
           if (`A_SUPPORTED) tests = {tests, tests32a};
           if (`MEM_VIRTMEM) tests = {tests, tests32mmu};
       end

From be962cb1ffcb04c2f4cb5e0bb16fb777b39fe514 Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Thu, 24 Jun 2021 01:42:35 -0400
Subject: [PATCH 02/20] overhauled linux testbench and spoofed MTTIME interrupt

---
 .../regression/regression-wally.py            |   2 +-
 .../regression/wally-buildroot-batch.do       |   1 -
 wally-pipelined/regression/wally-buildroot.do |   1 -
 .../regression/wave-dos/linux-waves.do        |   3 +-
 wally-pipelined/testbench/testbench-linux.sv  | 912 +++++++++---------
 5 files changed, 484 insertions(+), 435 deletions(-)

diff --git a/wally-pipelined/regression/regression-wally.py b/wally-pipelined/regression/regression-wally.py
index eac221cd6..fcd6d4be1 100755
--- a/wally-pipelined/regression/regression-wally.py
+++ b/wally-pipelined/regression/regression-wally.py
@@ -31,7 +31,7 @@ configs = [
     TestCase(
         name="buildroot",
         cmd="vsim -do wally-buildroot-batch.do -c > {}",
-        grepstr="# loaded 2000000 instructions"
+        grepstr="# loaded 2500000 instructions"
     ),
     TestCase(
         name="rv32ic",
diff --git a/wally-pipelined/regression/wally-buildroot-batch.do b/wally-pipelined/regression/wally-buildroot-batch.do
index c16655e1e..6eea258e6 100644
--- a/wally-pipelined/regression/wally-buildroot-batch.do
+++ b/wally-pipelined/regression/wally-buildroot-batch.do
@@ -36,5 +36,4 @@ vsim workopt -suppress 8852,12070
 
 run -all
 run -all
-exec ./slack-notifier/slack-notifier.py
 quit 
diff --git a/wally-pipelined/regression/wally-buildroot.do b/wally-pipelined/regression/wally-buildroot.do
index 452ba54d4..c2312f75e 100644
--- a/wally-pipelined/regression/wally-buildroot.do
+++ b/wally-pipelined/regression/wally-buildroot.do
@@ -39,5 +39,4 @@ vsim workopt -suppress 8852,12070
 run -all
 do ./wave-dos/linux-waves.do
 run -all
-exec ./slack-notifier/slack-notifier.py
 ##quit
diff --git a/wally-pipelined/regression/wave-dos/linux-waves.do b/wally-pipelined/regression/wave-dos/linux-waves.do
index b7dfd8c5a..b37276441 100644
--- a/wally-pipelined/regression/wave-dos/linux-waves.do
+++ b/wally-pipelined/regression/wave-dos/linux-waves.do
@@ -19,12 +19,13 @@ add wave /testbench/dut/hart/FlushW
 add wave -divider F
 add wave -hex /testbench/dut/hart/ifu/PCF
 add wave -divider D
-add wave -hex /testbench/pcExpected
+add wave -hex /testbench/PCDexpected
 add wave -hex /testbench/dut/hart/ifu/PCD
 add wave -hex /testbench/PCtextD
 add wave /testbench/InstrDName
 add wave -hex /testbench/dut/hart/ifu/InstrD
 add wave -hex /testbench/dut/hart/ieu/c/InstrValidD
+add wave -hex /testbench/PCDwrong
 add wave -divider E
 add wave -hex /testbench/dut/hart/ifu/PCE
 add wave -hex /testbench/PCtextE
diff --git a/wally-pipelined/testbench/testbench-linux.sv b/wally-pipelined/testbench/testbench-linux.sv
index df8fad8cd..15e0e3634 100644
--- a/wally-pipelined/testbench/testbench-linux.sv
+++ b/wally-pipelined/testbench/testbench-linux.sv
@@ -26,13 +26,15 @@
 `include "wally-config.vh"
 
 module testbench();
-  logic            clk, reset;
-  logic [31:0]     GPIOPinsIn;
-  logic [31:0]     GPIOPinsOut, GPIOPinsEn;
-
-  // instantiate device to be tested
-  logic [31:0] CheckInstrD;
+  
+  parameter waveOnICount = 2514000; // # of instructions at which to turn on waves in graphical sim
+  
 
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////////// DUT /////////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  logic             clk, reset;
+  
   logic [`AHBW-1:0] HRDATA;
   logic [31:0]      HADDR;
   logic [`AHBW-1:0] HWDATA;
@@ -45,155 +47,97 @@ module testbench();
   logic             HCLK, HRESETn;
   logic [`AHBW-1:0] HRDATAEXT;
   logic             HREADYEXT, HRESPEXT;
-  logic             UARTSout;
-
-  logic ignoreRFwrite;
-  
-  parameter waveOnICount = 2060000; // # of instructions at which to turn on waves in graphical sim
 
+  logic [31:0]      GPIOPinsIn;
+  logic [31:0]      GPIOPinsOut, GPIOPinsEn;
+  logic             UARTSin, UARTSout;
   assign GPIOPinsIn = 0;
   assign UARTSin = 1;
 
-  // instantiate processor and memories
   wallypipelinedsoc dut(.*);
 
-  /**
-   * Walk the page table stored in dtim according to sv39 logic and translate a
-   * virtual address to a physical address.
-   *
-   * See section 4.3.2 of the RISC-V Privileged specification for a full
-   * explanation of the below algorithm.
-   */
-  function logic [`XLEN-1:0] adrTranslator( 
-    input logic [`XLEN-1:0] adrIn);
-    begin
-      logic             SvMode, PTE_R, PTE_X;
-      logic [`XLEN-1:0] SATP, PTE;
-      logic [55:0]      BaseAdr, PAdr;
-      logic [8:0]       VPN [2:0];
-      logic [11:0]      Offset;
+  ///////////////////////////////////////////////////////////////////////////////
+  ////////////////////////   Signals & Shared Macros  //////////////////////////
+  //////////////////////// AKA stuff that comes first ///////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  // Sorry if these have gotten decontextualized.
+  // Verilog expects them to be defined before they are used.
 
-      int i;
+  // -------------------
+  // Signal Declarations
+  // -------------------
+  // Testbench Core
+  integer instrs;
+  integer warningCount = 0;
+  string trashString; // should never be read from
+  logic [31:0] InstrMask;
+  logic forcedInstr;
+  logic [63:0] lastPCD;
+  logic PCDwrong;
+  // PC, Instr Checking
+  logic [`XLEN-1:0] PCW;
+  logic [63:0] lastInstrDExpected, lastPC, lastPC2;
+  integer data_file_PCF, scan_file_PCF;
+  integer data_file_PCD, scan_file_PCD;
+  integer data_file_PCM, scan_file_PCM;
+  integer data_file_PCW, scan_file_PCW;
+  string PCtextF, PCtextF2;
+  string PCtextD, PCtextD2;
+  string PCtextE;
+  string PCtextM;
+  string PCtextW;
+  logic [31:0] InstrFExpected, InstrDExpected, InstrMExpected, InstrWExpected;
+  logic [63:0] PCFexpected, PCDexpected, PCMexpected, PCWexpected;
+  // RegFile Write Checking
+  logic ignoreRFwrite;
+  logic [63:0] regExpected;
+  integer regNumExpected;
+  integer data_file_rf, scan_file_rf;
+  // Bus Unit Read/Write Checking
+  logic [63:0] readMask;
+  logic [`XLEN-1:0] readAdrExpected, readAdrTranslated;
+  logic [`XLEN-1:0] writeDataExpected, writeAdrExpected, writeAdrTranslated;
+  integer data_file_memR, scan_file_memR;
+  integer data_file_memW, scan_file_memW;
+  // CSR Checking
+  integer totalCSR = 0;
+  logic [99:0] StartCSRexpected[63:0];
+  string StartCSRname[99:0];
+  integer data_file_csr, scan_file_csr;
+  
+  // -----------
+  // Error Macro
+  // -----------
+  `define ERROR \
+    #10; \
+    $display("processed %0d instructions with %0d warnings", instrs, warningCount); \
+    $stop;
 
-      // Grab the SATP register from privileged unit
-      SATP = dut.hart.priv.csr.SATP_REGW;
+  // ----------------
+  // PC Updater Macro
+  // ----------------
+  `define SCAN_PC(DATAFILE,SCANFILE,PCTEXT,PCTEXT2,CHECKINSTR,PCEXPECTED) \
+    SCANFILE = $fscanf(DATAFILE, "%s\n", PCTEXT); \
+    PCTEXT2 = ""; \
+    while (PCTEXT2 != "***") begin \
+      PCTEXT = {PCTEXT, " ", PCTEXT2}; \
+      SCANFILE = $fscanf(DATAFILE, "%s\n", PCTEXT2); \
+    end \
+    SCANFILE = $fscanf(DATAFILE, "%x\n", CHECKINSTR); \
+    SCANFILE = $fscanf(DATAFILE, "%x\n", PCEXPECTED);
 
-      // Split the virtual address into page number segments and offset
-      VPN[2] = adrIn[38:30];
-      VPN[1] = adrIn[29:21];
-      VPN[0] = adrIn[20:12];
-      Offset = adrIn[11:0];
-
-      // We do not support sv48; only sv39
-      SvMode = SATP[63];
-
-      // Only perform translation if translation is on and the processor is not
-      // in machine mode
-      if (SvMode && (dut.hart.priv.PrivilegeModeW != `M_MODE)) begin
-        BaseAdr = SATP[43:0] << 12;
-
-        for (i = 2; i >= 0; i--) begin
-          PAdr = BaseAdr + (VPN[i] << 3);
-          
-          // dtim.RAM is 64-bit addressed. PAdr specifies a byte. We right shift
-          // by 3 (the PTE size) to get the requested 64-bit PTE.
-          PTE = dut.uncore.dtim.RAM[PAdr >> 3];
-          PTE_R = PTE[1];
-          PTE_X = PTE[3];
-          if (PTE_R || PTE_X) begin
-            // Leaf page found
-            break;
-          end else begin
-            // Go to next level of table
-            BaseAdr = PTE[53:10] << 12;
-          end
-        end
-
-        // Determine which parts of the PTE page number to use based on the
-        // level of the page table we reached.
-        if (i == 2) begin
-          // Gigapage
-          assign adrTranslator = {8'b0, PTE[53:28], VPN[1], VPN[0], Offset};
-        end else if (i == 1) begin
-          // Megapage
-          assign adrTranslator = {8'b0, PTE[53:19], VPN[0], Offset};
-        end else begin
-          // Kilopage
-          assign adrTranslator = {8'b0, PTE[53:10], Offset};
-        end
-      end else begin
-        // Direct translation if address translation is not on
-        assign adrTranslator = adrIn;
-      end
-    end
-  endfunction
-
-  // initialize test
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////////////// Testbench Core ///////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  // --------------
+  // Initialization
+  // --------------
   initial
     begin
-      ignoreRFwrite <= 0;
+      instrs = 0;
+      PCDwrong = 0;
       reset <= 1; # 22; reset <= 0;
     end
-
-  // read pc trace file
-  integer data_file_PC, scan_file_PC;
-  initial begin
-    data_file_PC = $fopen({`LINUX_TEST_VECTORS,"parsedPC.txt"}, "r");
-    if (data_file_PC == 0) begin
-      $display("file couldn't be opened");
-      $stop;
-    end
-  end
-
-  integer data_file_PCW, scan_file_PCW;
-  initial begin
-    data_file_PCW = $fopen({`LINUX_TEST_VECTORS,"parsedPC.txt"}, "r");
-    if (data_file_PCW == 0) begin
-      $display("file couldn't be opened");
-      $stop;
-    end
-  end
-
-  // read register trace file
-  integer data_file_rf, scan_file_rf;
-  initial begin
-    data_file_rf = $fopen({`LINUX_TEST_VECTORS,"parsedRegs.txt"}, "r");
-    if (data_file_rf == 0) begin
-      $display("file couldn't be opened");
-      $stop;
-    end
-  end
-
-  // read CSR trace file
-  integer data_file_csr, scan_file_csr;
-  initial begin
-    data_file_csr = $fopen({`LINUX_TEST_VECTORS,"parsedCSRs.txt"}, "r");
-    if (data_file_csr == 0) begin
-      $display("file couldn't be opened");
-      $stop;
-    end
-  end
-
-  // read memreads trace file
-  integer data_file_memR, scan_file_memR;
-  initial begin
-    data_file_memR = $fopen({`LINUX_TEST_VECTORS,"parsedMemRead.txt"}, "r");
-    if (data_file_memR == 0) begin
-      $display("file couldn't be opened");
-      $stop;
-    end
-  end
-
-  // read memwrite trace file
-  integer data_file_memW, scan_file_memW;
-  initial begin
-    data_file_memW = $fopen({`LINUX_TEST_VECTORS,"parsedMemWrite.txt"}, "r");
-    if (data_file_memW == 0) begin
-      $display("file couldn't be opened");
-      $stop;
-    end
-  end
-
   // initial loading of memories
   initial begin
     $readmemh({`LINUX_TEST_VECTORS,"bootmem.txt"}, dut.uncore.bootdtim.RAM, 'h1000 >> 3);
@@ -201,49 +145,247 @@ module testbench();
     $readmemb(`TWO_BIT_PRELOAD, dut.hart.ifu.bpred.bpred.Predictor.DirPredictor.PHT.memory);
     $readmemb(`BTB_PRELOAD, dut.hart.ifu.bpred.bpred.TargetPredictor.memory.memory);
   end
-
-  integer warningCount = 0;
-  integer instrs;
-
-  //logic[63:0] adrTranslation[4:0];
-  //string translationType[4:0] = {"rf", "writeAdr", "PCW", "PC", "readAdr"};
-  //initial begin
-  //  for(int i=0; i<5; i++) begin
-  //    adrTranslation[i] = 64'b0;
-  //  end
-  //end
-
-  //function logic equal(logic[63:0] adr, logic[63:0] adrExpected, integer func);
-  //  if (adr[11:0] !== adrExpected[11:0]) begin
-  //    equal = 1'b0;
-  //  end else begin
-  //    equal = 1'b1;
-  //    if ((adr+adrTranslation[func]) !== adrExpected) begin
-  //      adrTranslation[func] = adrExpected - adr;
-  //      $display("warning: probably new address translation %x for %s at instr %0d", adrTranslation[func], translationType[func], instrs);
-  //      warningCount += 1;
-  //    end
-  //  end
-  //endfunction
-
-  // pretty sure this isn't necessary anymore, but keeping this for now since its easier
-  function logic equal(logic[63:0] adr, logic[63:0] adrExpected, integer func);
-    equal = adr === adrExpected;
-  endfunction
-
-
-  `define ERROR \
-    #10; \
-    $display("processed %0d instructions with %0d warnings", instrs, warningCount); \
-    $stop;
-
-  logic [63:0] pcExpected;
-  logic [63:0] regExpected;
-  integer regNumExpected;
-  logic [`XLEN-1:0] PCW;
   
+  // -------
+  // Running
+  // -------
+  always
+    begin
+      clk <= 1; # 5; clk <= 0; # 5;
+    end
+
+  // -------------------------------------
+  // Special warnings for important faults
+  // -------------------------------------
+  always @(dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW) begin
+    if (dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW == 2 && instrs > 1) begin
+      $display("!!!!!! illegal instruction !!!!!!!!!!");
+      $display("(as a reminder, MCAUSE and MEPC are set by this)");
+      $display("at %0t ps, PCM %x, instr %0d, HADDR %x", $time, dut.hart.ifu.PCM, instrs, HADDR);
+      `ERROR
+    end
+    if (dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW == 5 && instrs != 0) begin
+      $display("!!!!!! illegal (physical) memory access !!!!!!!!!!");
+      $display("(as a reminder, MCAUSE and MEPC are set by this)");
+      $display("at %0t ps, PCM %x, instr %0d, HADDR %x", $time, dut.hart.ifu.PCM, instrs, HADDR);
+      `ERROR
+    end
+  end
+
+  // -----------------------
+  // RegFile Write Hijacking
+  // -----------------------
+  always @(PCW or dut.hart.ieu.InstrValidW) begin
+    if(dut.hart.ieu.InstrValidW && PCW != 0) begin
+      // Hack to compensate for how Wally's MTIME may diverge from QEMU's MTIME (and that is okay)
+      if (PCtextW.substr(0,5) == "rdtime") begin
+        ignoreRFwrite <= 1;
+        scan_file_rf = $fscanf(data_file_rf, "%d\n", regNumExpected);
+        scan_file_rf = $fscanf(data_file_rf, "%x\n", regExpected);
+        force dut.hart.ieu.dp.regf.wd3 = regExpected;
+      // Hack to compensate for QEMU's incorrect MSTATUS
+      end else if (PCtextW.substr(0,3) == "csrr" && PCtextW.substr(10,16) == "mstatus") begin
+        force dut.hart.ieu.dp.regf.wd3 = dut.hart.ieu.dp.WriteDataW & ~64'ha00000000;
+      end else
+        release dut.hart.ieu.dp.regf.wd3;
+    end
+  end
+
+  // ----------------
+  // Big Chunky Block
+  // ----------------
+  always @(reset or dut.hart.ifu.InstrRawD or dut.hart.ifu.PCD) begin// or negedge dut.hart.ifu.StallE) begin // Why do we care about StallE? Everything seems to run fine without it.
+    if(~HWRITE) begin // *** Should this need to consider HWRITE?
+      #2;
+      // If PCD/InstrD aren't garbage
+      if (~reset && dut.hart.ifu.InstrRawD[15:0] !== {16{1'bx}} && dut.hart.ifu.PCD !== 64'h0) begin // && ~dut.hart.ifu.StallE) begin
+        // If Wally's PCD has updated
+        if (dut.hart.ifu.PCD !== lastPCD) begin
+          lastInstrDExpected = InstrDExpected;
+          lastPC <= dut.hart.ifu.PCD;
+          lastPC2 <= lastPC;
+          // If PCD isn't going to be flushed
+          if (~PCDwrong || lastPC == PCDexpected) begin
+
+            // Stop if we've reached the end
+            if($feof(data_file_PCF)) begin
+              $display("no more PC data to read... CONGRATULATIONS!!!");
+              `ERROR
+            end
+
+            // Increment PC
+            `SCAN_PC(data_file_PCF, scan_file_PCF, PCtextF, PCtextF2, InstrFExpected, PCFexpected);
+            `SCAN_PC(data_file_PCD, scan_file_PCD, PCtextD, PCtextD2, InstrDExpected, PCDexpected);
+
+            // NOP out certain instructions
+            if(dut.hart.ifu.PCD===PCDexpected) begin
+              if((dut.hart.ifu.PCD == 32'h80001dc6) || // for now, NOP out any stores to PLIC
+                 (dut.hart.ifu.PCD == 32'h80001de0) ||
+                 (dut.hart.ifu.PCD == 32'h80001de2)) begin
+                $display("warning: NOPing out %s at PCD=%0x, instr %0d, time %0t", PCtextD, dut.hart.ifu.PCD, instrs, $time);
+                force InstrDExpected = 32'b0010011;
+                force dut.hart.ifu.InstrRawD = 32'b0010011;
+                while (clk != 0) #1;
+                while (clk != 1) #1;                
+                release dut.hart.ifu.InstrRawD;
+                release InstrDExpected;
+                warningCount += 1;
+                forcedInstr = 1;
+              end else begin
+                forcedInstr = 0;
+              end
+            end
+
+            // Increment instruction count
+            if (instrs <= 10 || (instrs <= 100 && instrs % 10 == 0) ||
+               (instrs <= 1000 && instrs % 100 == 0) || (instrs <= 10000 && instrs % 1000 == 0) ||
+               (instrs <= 100000 && instrs % 10000 == 0) || (instrs % 100000 == 0)) begin
+              $display("loaded %0d instructions", instrs);
+            end
+            instrs += 1;
+            
+            // Stop before bugs so "do" file can turn on waves
+            if (instrs == waveOnICount) begin
+              $display("turning on waves at %0d instructions", instrs);
+              $stop;
+            end
+
+            // Check if PCD is going to be flushed due to a branch or jump
+            if (`BPRED_ENABLED) begin
+              PCDwrong = dut.hart.ifu.bpred.bpred.BPPredWrongE;
+            end else begin
+              casex (lastInstrDExpected[31:0])
+                32'b00000000001000000000000001110011, // URET
+                32'b00010000001000000000000001110011, // SRET
+                32'b00110000001000000000000001110011, // MRET
+                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1101111, // JAL
+                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1100111, // JALR
+                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1100011, // B
+                32'bXXXXXXXXXXXXXXXX110XXXXXXXXXXX01, // C.BEQZ
+                32'bXXXXXXXXXXXXXXXX111XXXXXXXXXXX01, // C.BNEZ
+                32'bXXXXXXXXXXXXXXXX101XXXXXXXXXXX01: // C.J
+                  PCDwrong = 1;
+                32'bXXXXXXXXXXXXXXXX1001000000000010, // C.EBREAK:
+                32'bXXXXXXXXXXXXXXXXX000XXXXX1110011: // Something that's not CSRR*
+                  PCDwrong = 0; // tbh don't really know what should happen here
+                32'b000110000000XXXXXXXXXXXXX1110011, // CSR* SATP, *
+                32'bXXXXXXXXXXXXXXXX1000XXXXX0000010, // C.JR
+                32'bXXXXXXXXXXXXXXXX1001XXXXX0000010: // C.JALR //this is RV64 only so no C.JAL
+                  PCDwrong = 1;
+                default:
+                  PCDwrong = 0;
+              endcase
+            end
+
+            // Check PCD, InstrD
+            if (~PCDwrong && ~(dut.hart.ifu.PCD === PCDexpected)) begin
+              $display("%0t ps, instr %0d: PC does not equal PC expected: %x, %x", $time, instrs, dut.hart.ifu.PCD, PCDexpected);
+              `ERROR
+            end
+            InstrMask = InstrDExpected[1:0] == 2'b11 ? 32'hFFFFFFFF : 32'h0000FFFF;
+            if ((~forcedInstr) && (~PCDwrong) && ((InstrMask & dut.hart.ifu.InstrRawD) !== (InstrMask & InstrDExpected))) begin
+              $display("%0t ps, PCD %x, instr %0d: InstrD %x %s does not equal InstrDExpected %x %s", $time, dut.hart.ifu.PCD, instrs, dut.hart.ifu.InstrRawD, InstrDName, InstrDExpected, PCtextD);
+              `ERROR
+            end
+
+            // Repeated instruction means QEMU had an interrupt which we need to spoof
+            if (PCFexpected == PCDexpected) begin
+              $display("Note at %0t ps, PCM %x %s, instr %0d: spoofing an interrupt", $time, dut.hart.ifu.PCM, PCtextM, instrs);
+              // Increment file pointers past the repeated instruction.
+              `SCAN_PC(data_file_PCF, scan_file_PCF, PCtextF, PCtextF2, InstrFExpected, PCFexpected);
+              `SCAN_PC(data_file_PCD, scan_file_PCD, PCtextD, PCtextD2, InstrDExpected, PCDexpected);
+              scan_file_memR = $fscanf(data_file_memR, "%x\n", readAdrExpected);
+              scan_file_memR = $fscanf(data_file_memR, "%x\n", HRDATA);
+              // Next force a timer interrupt (*** this may later need generalizing)
+              force dut.uncore.genblk1.clint.MTIME = dut.uncore.genblk1.clint.MTIMECMP + 1;
+              while (clk != 0) #1;
+              while (clk != 1) #1;
+              release dut.uncore.genblk1.clint.MTIME;
+            end
+          end
+        end
+        lastPCD = dut.hart.ifu.PCD;
+      end
+    end
+  end
+
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////// PC,Instr Checking ///////////////////////////////
+  /////////////////////// (outside of Big Chunky Block) /////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  // --------------
+  // Initialization
+  // --------------
+  initial begin
+    data_file_PCF = $fopen({`LINUX_TEST_VECTORS,"parsedPC.txt"}, "r");
+    data_file_PCD = $fopen({`LINUX_TEST_VECTORS,"parsedPC.txt"}, "r");
+    data_file_PCM = $fopen({`LINUX_TEST_VECTORS,"parsedPC.txt"}, "r");
+    data_file_PCW = $fopen({`LINUX_TEST_VECTORS,"parsedPC.txt"}, "r");
+    if (data_file_PCW == 0) begin
+      $display("file couldn't be opened");
+      $stop;
+    end
+    // This makes sure PCF is one instr ahead of PCD
+    `SCAN_PC(data_file_PCF, scan_file_PCF, PCtextF, PCtextF2, InstrFExpected, PCFexpected);
+    // This makes sure PCM is one instr ahead of PCW
+    `SCAN_PC(data_file_PCM, scan_file_PCM, trashString, trashString, InstrMExpected, PCMexpected);
+  end
+
+  // -------------------
+  // Additional Hardware
+  // -------------------
   flopenr #(`XLEN) PCWReg(clk, reset, ~dut.hart.ieu.dp.StallW, dut.hart.ifu.PCM, PCW);
 
+  // PCF stuff isn't actually checked
+  //   it only exists for helping detecting duplicate instructions in PCD
+  //   which are the result of interrupts hitting QEMU
+  // PCD checking already happens in "Big Chunky Block"
+  // PCM stuff isn't actually checked
+  //   it only exists for helping detecting duplicate instructions in PCW
+  //   which are the result of interrupts hitting QEMU
+  // ------------
+  // PCW Checking
+  // ------------
+  always @(PCW or dut.hart.ieu.InstrValidW) begin
+   if(dut.hart.ieu.InstrValidW && PCW != 0) begin
+      if($feof(data_file_PCW)) begin
+        $display("no more PC data to read");
+        `ERROR
+      end
+      `SCAN_PC(data_file_PCM, scan_file_PCM, trashString, trashString, InstrMExpected, PCMexpected);
+      `SCAN_PC(data_file_PCW, scan_file_PCW, trashString, trashString, InstrWExpected, PCWexpected);
+      // If repeated instr
+      if (PCMexpected == PCWexpected) begin
+        // Increment file pointers past the repeated instruction.
+        `SCAN_PC(data_file_PCM, scan_file_PCM, trashString, trashString, InstrMExpected, PCMexpected);
+        `SCAN_PC(data_file_PCW, scan_file_PCW, trashString, trashString, InstrWExpected, PCWexpected);
+      end
+      if(~(PCW === PCWexpected)) begin
+        $display("%0t ps, instr %0d: PCW does not equal PCW expected: %x, %x", $time, instrs, PCW, PCWexpected);
+        `ERROR
+      end
+    end
+  end
+  
+
+  ///////////////////////////////////////////////////////////////////////////////
+  /////////////////////////// RegFile Write Checking ////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  // --------------
+  // Initialization
+  // --------------
+  initial begin
+    data_file_rf = $fopen({`LINUX_TEST_VECTORS,"parsedRegs.txt"}, "r");
+    if (data_file_rf == 0) begin
+      $display("file couldn't be opened");
+      $stop;
+    end
+  end
+  initial
+      ignoreRFwrite <= 0;
+  // --------
+  // Checking
+  // --------
   genvar i;
   generate
     for(i=1; i<32; i++) begin
@@ -251,33 +393,32 @@ module testbench();
         if ($time == 0) begin
           scan_file_rf = $fscanf(data_file_rf, "%x\n", regExpected);
           if (dut.hart.ieu.dp.regf.rf[i] != regExpected) begin
-            $display("%0t ps, instr %0d: rf[%0d] does not equal rf expected: %x, %x", $time, instrs, i, dut.hart.ieu.dp.regf.rf[i], regExpected);
+            $display("%0t ps, PCW %x, instr %0d: rf[%0d] does not equal rf expected: %x, %x", $time, PCW, instrs, i, dut.hart.ieu.dp.regf.rf[i], regExpected);
             `ERROR
           end
         end else begin
-          if (ignoreRFwrite)
+          if (ignoreRFwrite) // this allows other testbench elements to force WriteData to take on the next regExpected
             ignoreRFwrite <= 0;
           else begin
             scan_file_rf = $fscanf(data_file_rf, "%d\n", regNumExpected);
             scan_file_rf = $fscanf(data_file_rf, "%x\n", regExpected);
           end
           if (i != regNumExpected) begin
-            $display("%0t ps, instr %0d: wrong register changed: %0d, %0d expected to switch to %x from %x", $time, instrs, i, regNumExpected, regExpected, dut.hart.ieu.dp.regf.rf[regNumExpected]);
+            $display("%0t ps, PCW %x %s, instr %0d: wrong register changed: %0d, %0d expected to switch to %x from %x", $time, PCW, PCtextW, instrs, i, regNumExpected, regExpected, dut.hart.ieu.dp.regf.rf[regNumExpected]);
             `ERROR
           end
-          if (~equal(dut.hart.ieu.dp.regf.rf[i],regExpected, 0)) begin
-            $display("%0t ps, instr %0d: rf[%0d] does not equal rf expected: %x, %x", $time, instrs, i, dut.hart.ieu.dp.regf.rf[i], regExpected);
+          if (~(dut.hart.ieu.dp.regf.rf[i] === regExpected)) begin
+            $display("%0t ps, PCW %x %s, instr %0d: rf[%0d] does not equal rf expected: %x, %x", $time, PCW, PCtextW, instrs, i, dut.hart.ieu.dp.regf.rf[i], regExpected);
             `ERROR
           end
-          //if (dut.hart.ieu.dp.regf.rf[i] !== regExpected) begin
-          //  force dut.hart.ieu.dp.regf.rf[i] = regExpected;
-          //  release dut.hart.ieu.dp.regf.rf[i];
-          //end
         end
       end
     end
   endgenerate
 
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////// Bus Unit Read/Write Checking /////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
   // RAM and bootram are addressed in 64-bit blocks - this logic handles R/W
   // including subwords. Brief explanation on signals:
   //
@@ -289,17 +430,33 @@ module testbench();
   // In the linux boot, the processor spends the first ~5 instructions in
   // bootram, before jr jumps to main RAM
 
-  logic [63:0] readMask;
+  // --------------
+  // Initialization
+  // --------------
+  initial begin
+    data_file_memR = $fopen({`LINUX_TEST_VECTORS,"parsedMemRead.txt"}, "r");
+    if (data_file_memR == 0) begin
+      $display("file couldn't be opened");
+      $stop;
+    end
+  end
+  initial begin
+    data_file_memW = $fopen({`LINUX_TEST_VECTORS,"parsedMemWrite.txt"}, "r");
+    if (data_file_memW == 0) begin
+      $display("file couldn't be opened");
+      $stop;
+    end
+  end
+
+  // ------------
+  // Read Checker
+  // ------------
   assign readMask = ((1 << (8*(1 << HSIZE))) - 1) << 8 * HADDR[2:0];
-
-  logic [`XLEN-1:0] readAdrExpected, readAdrTranslated;
-
   always @(dut.HRDATA) begin
     #2;
     if (dut.hart.MemRWM[1]
       && (dut.hart.ebu.CaptureDataM)
       && dut.HRDATA !== {64{1'bx}}) begin
-      //$display("%0t", $time);
       if($feof(data_file_memR)) begin
         $display("no more memR data to read");
         `ERROR
@@ -307,31 +464,29 @@ module testbench();
       scan_file_memR = $fscanf(data_file_memR, "%x\n", readAdrExpected);
       scan_file_memR = $fscanf(data_file_memR, "%x\n", HRDATA);
       assign readAdrTranslated = adrTranslator(readAdrExpected);
-      if (~equal(HADDR,readAdrTranslated,4)) begin
-        $display("%0t ps, instr %0d: HADDR does not equal readAdrExpected: %x, %x", $time, instrs, HADDR, readAdrTranslated);
+      if (~(HADDR === readAdrTranslated)) begin
+        $display("%0t ps, PCM %x %s, instr %0d: HADDR does not equal readAdrExpected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, HADDR, readAdrTranslated);
         `ERROR
       end
       if ((readMask & HRDATA) !== (readMask & dut.HRDATA)) begin
         if (HADDR inside `LINUX_FIX_READ) begin
-          //$display("warning %0t ps, instr %0d, adr %0d: forcing HRDATA to expected: %x, %x", $time, instrs, HADDR, HRDATA, dut.HRDATA);
+          if (HADDR != 'h10000005) // Suppress the warning for UART LSR so we can read UART output
+            $display("warning %0t ps, PCM %x %s, instr %0d, adr %0d: forcing HRDATA to expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, HADDR, HRDATA, dut.HRDATA);
           force dut.uncore.HRDATA = HRDATA;
           #9;
           release dut.uncore.HRDATA;
           warningCount += 1;
         end else begin
-          $display("%0t ps, instr %0d: ExpectedHRDATA does not equal dut.HRDATA: %x, %x from address %x, %x", $time, instrs, HRDATA, dut.HRDATA, HADDR, HSIZE);
+          $display("%0t ps, PCM %x %s, instr %0d: ExpectedHRDATA does not equal dut.HRDATA: %x, %x from address %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, HRDATA, dut.HRDATA, HADDR, HSIZE);
           `ERROR
         end
       end
-    //end else if(dut.hart.MemRWM[1]) begin
-    //  $display("%x, %x, %x, %t", HADDR, dut.PCF, dut.HRDATA, $time);
-
     end
-
   end
 
-  logic [`XLEN-1:0] writeDataExpected, writeAdrExpected, writeAdrTranslated;
-
+  // -------------
+  // Write Checker
+  // -------------
   // this might need to change
   //always @(HWDATA or HADDR or HSIZE or HWRITE) begin
   always @(negedge HWRITE) begin
@@ -346,20 +501,28 @@ module testbench();
       assign writeAdrTranslated = adrTranslator(writeAdrExpected);
 
       if (writeDataExpected != HWDATA && ~dut.uncore.HSELPLICD) begin
-        $display("%0t ps, instr %0d: HWDATA does not equal writeDataExpected: %x, %x", $time, instrs, HWDATA, writeDataExpected);
+        $display("%0t ps, PCM %x %s, instr %0d: HWDATA does not equal writeDataExpected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, HWDATA, writeDataExpected);
         `ERROR
       end
-      if (~equal(writeAdrTranslated,HADDR,1) && ~dut.uncore.HSELPLICD) begin
-        $display("%0t ps, instr %0d: HADDR does not equal writeAdrExpected: %x, %x", $time, instrs, HADDR, writeAdrTranslated);
+      if (~(writeAdrTranslated === HADDR) && ~dut.uncore.HSELPLICD) begin
+        $display("%0t ps, PCM %x %s, instr %0d: HADDR does not equal writeAdrExpected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, HADDR, writeAdrTranslated);
         `ERROR
       end
     end
   end
 
-  integer totalCSR = 0;
-  logic [99:0] StartCSRexpected[63:0];
-  string StartCSRname[99:0];
+  ///////////////////////////////////////////////////////////////////////////////
+  //////////////////////////////// CSR Checking /////////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  // --------------
+  // Initialization
+  // --------------
   initial begin
+    data_file_csr = $fopen({`LINUX_TEST_VECTORS,"parsedCSRs.txt"}, "r");
+    if (data_file_csr == 0) begin
+      $display("file couldn't be opened");
+      $stop;
+    end
     while(1) begin
       scan_file_csr = $fscanf(data_file_csr, "%s\n", StartCSRname[totalCSR]);
       if(StartCSRname[totalCSR] == "---") begin
@@ -370,22 +533,10 @@ module testbench();
     end
   end
 
-  always @(dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW) begin
-    if (dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW == 2 && instrs > 1) begin
-      $display("!!!!!! illegal instruction !!!!!!!!!!");
-      $display("(as a reminder, MCAUSE and MEPC are set by this)");
-      $display("at %0t ps, instr %0d, HADDR %x", $time, instrs, HADDR);
-      `ERROR
-    end
-    if (dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW == 5 && instrs != 0) begin
-      $display("!!!!!! illegal (physical) memory access !!!!!!!!!!");
-      $display("(as a reminder, MCAUSE and MEPC are set by this)");
-      $display("at %0t ps, instr %0d, HADDR %x", $time, instrs, HADDR);
-      `ERROR
-    end
-  end
-
-  string MSTATUSstring = "MSTATUS";
+  // --------------
+  // Checker Macros
+  // --------------
+  string MSTATUSstring = "MSTATUS"; //string variables seem to compare more reliably than string literals
   string SEPCstring = "SEPC";
   string SCAUSEstring = "SCAUSE";
   string SSTATUSstring = "SSTATUS";
@@ -394,7 +545,6 @@ module testbench();
     string CSR; \
     string ``CSR``name = `"CSR`"; \
     string expected``CSR``name; \
-    //CSR checking \
     always @(``PATH``.``CSR``_REGW) begin \
       if ($time > 1 && (`BUILDROOT != 1 || ``CSR``name != SSTATUSstring)) begin \
         if (``CSR``name == SEPCstring) begin #1; end \
@@ -403,16 +553,16 @@ module testbench();
         scan_file_csr = $fscanf(data_file_csr, "%s\n", expected``CSR``name); \
         scan_file_csr = $fscanf(data_file_csr, "%x\n", expected``CSR``); \
         if(expected``CSR``name.icompare(``CSR``name)) begin \
-          $display("%0t ps, instr %0d: %s changed, expected %s", $time, instrs, `"CSR`", expected``CSR``name); \
+          $display("%0t ps, PCM %x %s, instr %0d: %s changed, expected %s", $time, dut.hart.ifu.PCM, PCtextM, instrs, `"CSR`", expected``CSR``name); \
         end \
         if (``CSR``name == MSTATUSstring) begin \
           if (``PATH``.``CSR``_REGW != ((``expected``CSR) | 64'ha00000000)) begin \
-            $display("%0t ps, instr %0d: %s does not equal %s expected: %x, %x", $time, instrs, ``CSR``name, expected``CSR``name, ``PATH``.``CSR``_REGW, (``expected``CSR) | 64'ha00000000); \
+            $display("%0t ps, PCM %x %s, instr %0d: %s (should be MSTATUS) does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, expected``CSR``name, ``PATH``.``CSR``_REGW, (``expected``CSR) | 64'ha00000000); \
             `ERROR \
           end \
         end else \
           if (``PATH``.``CSR``_REGW != ``expected``CSR[$bits(``PATH``.``CSR``_REGW)-1:0]) begin \
-            $display("%0t ps, instr %0d: %s does not equal %s expected: %x, %x", $time, instrs, ``CSR``name, expected``CSR``name, ``PATH``.``CSR``_REGW, ``expected``CSR); \
+            $display("%0t ps, PCM %x %s, instr %0d: %s does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, expected``CSR``name, ``PATH``.``CSR``_REGW, ``expected``CSR); \
             `ERROR \
           end \
       end else begin \
@@ -420,7 +570,7 @@ module testbench();
           for(integer j=0; j<totalCSR; j++) begin \
             if(!StartCSRname[j].icompare(``CSR``name)) begin \
               if(``PATH``.``CSR``_REGW != StartCSRexpected[j]) begin \
-                $display("%0t ps, instr %0d: %s does not equal %s expected: %x, %x", $time, instrs, ``CSR``name, StartCSRname[j], ``PATH``.``CSR``_REGW, StartCSRexpected[j]); \
+                $display("%0t ps, PCM %x %s, instr %0d: %s does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, StartCSRname[j], ``PATH``.``CSR``_REGW, StartCSRexpected[j]); \
                 `ERROR \
               end \
             end \
@@ -428,12 +578,15 @@ module testbench();
         end \
       end \
     end
+  
   `define CHECK_CSR(CSR) \
      `CHECK_CSR2(CSR, dut.hart.priv.csr)
   `define CSRM dut.hart.priv.csr.genblk1.csrm
   `define CSRS dut.hart.priv.csr.genblk1.csrs.genblk1
 
-  
+  // --------
+  // Checking
+  // --------
   //`CHECK_CSR(FCSR)
   `CHECK_CSR2(MCAUSE, `CSRM)
   `CHECK_CSR(MCOUNTEREN)
@@ -460,193 +613,11 @@ module testbench();
   `CHECK_CSR2(STVAL, `CSRS)
   `CHECK_CSR(STVEC)
 
-  logic speculative;
-  initial begin
-    speculative = 0;
-  end
-  logic [63:0] lastCheckInstrD, lastPC, lastPC2;
-
-  string PCtextW, PCtext2W;
-  logic [31:0] InstrWExpected;
-  logic [63:0] PCWExpected;
-  always @(PCW or dut.hart.ieu.InstrValidW) begin
-   if(dut.hart.ieu.InstrValidW && PCW != 0) begin
-      if($feof(data_file_PCW)) begin
-        $display("no more PC data to read");
-        `ERROR
-      end
-      scan_file_PCW = $fscanf(data_file_PCW, "%s\n", PCtextW);
-      PCtext2W = "";
-      while (PCtext2W != "***") begin
-        PCtextW = {PCtextW, " ", PCtext2W};
-        scan_file_PC = $fscanf(data_file_PCW, "%s\n", PCtext2W);
-      end
-      scan_file_PCW = $fscanf(data_file_PCW, "%x\n", InstrWExpected);
-      // then expected PC value
-      scan_file_PCW = $fscanf(data_file_PCW, "%x\n", PCWExpected);
-      if(~equal(PCW,PCWExpected,2)) begin
-        $display("%0t ps, instr %0d: PCW does not equal PCW expected: %x, %x", $time, instrs, PCW, PCWExpected);
-        `ERROR
-      end
-      //if(it.InstrW != InstrWExpected) begin
-      //  $display("%0t ps, instr %0d: InstrW does not equal InstrW expected: %x, %x", $time, instrs, it.InstrW, InstrWExpected);
-      //end
-      //
-      // Hack to compensate for how Wally's MTIME may diverge from QEMU's MTIME (and that is okay)
-      if (PCtextW.substr(0,5) == "rdtime") begin
-        ignoreRFwrite <= 1;
-        scan_file_rf = $fscanf(data_file_rf, "%d\n", regNumExpected);
-        scan_file_rf = $fscanf(data_file_rf, "%x\n", regExpected);
-        force dut.hart.ieu.dp.regf.wd3 = regExpected;
-      // Hack to compensate for QEMU's incorrect MSTATUS
-      end else if (PCtextW.substr(0,3) == "csrr" && PCtextW.substr(10,16) == "mstatus") begin
-        force dut.hart.ieu.dp.regf.wd3 = dut.hart.ieu.dp.WriteDataW & ~64'ha00000000;
-      end else
-        release dut.hart.ieu.dp.regf.wd3;
-    end
-  end
-
-  string PCtextD,PCtextE,PCtextM,PCtext2;
-  always_ff @(posedge clk, posedge reset)
-    if (reset) begin
-      PCtextE = "(reset)";
-      PCtextM = "(reset)";
-    end else begin
-      if (~dut.hart.StallM) 
-        if (dut.hart.FlushM) PCtextM = "(flushed)";
-        else                 PCtextM = PCtextE;
-      if (~dut.hart.StallE) 
-        if (dut.hart.FlushE) PCtextE = "(flushed)";
-        else                 PCtextE = PCtextD;
-    end
-
-
-  initial begin
-    instrs = 0;
-  end
-  logic [31:0] InstrMask;
-  logic forcedInstr;
-  logic [63:0] lastPCD;
-  
-  always @(dut.hart.ifu.PCD or dut.hart.ifu.InstrRawD or reset or negedge dut.hart.ifu.StallE) begin
-    if(~HWRITE) begin
-      #2;
-      if (~reset && dut.hart.ifu.InstrRawD[15:0] !== {16{1'bx}} && dut.hart.ifu.PCD !== 64'h0 && ~dut.hart.ifu.StallE) begin
-        if (dut.hart.ifu.PCD !== lastPCD) begin
-          lastCheckInstrD = CheckInstrD;
-          lastPC <= dut.hart.ifu.PCD;
-          lastPC2 <= lastPC;
-          if (speculative && (lastPC != pcExpected)) begin
-            speculative = ~equal(dut.hart.ifu.PCD,pcExpected,3);
-            if(dut.hart.ifu.PCD===pcExpected) begin
-              //if((dut.hart.ifu.InstrRawD[6:0] == 7'b1010011) || // We no longer have to NOP out any float instrs!
-              if((dut.hart.ifu.PCD == 32'h80001dc6) ||          // for now, NOP out any stores to PLIC
-                 (dut.hart.ifu.PCD == 32'h80001de0) ||
-                 (dut.hart.ifu.PCD == 32'h80001de2)) begin 
-                $display("warning: NOPing out %s at PC=%0x, instr %0d, time %0t", PCtextD, dut.hart.ifu.PCD, instrs, $time);
-                force CheckInstrD = 32'b0010011;
-                force dut.hart.ifu.InstrRawD = 32'b0010011;
-                while (clk != 0) #1;
-                while (clk != 1) #1;
-                release dut.hart.ifu.InstrRawD;
-                release CheckInstrD;
-                warningCount += 1;
-                forcedInstr = 1;
-              end
-              else begin
-                forcedInstr = 0;
-              end
-            end
-          end
-          else begin
-            if($feof(data_file_PC)) begin
-              $display("no more PC data to read");
-              `ERROR
-            end
-            scan_file_PC = $fscanf(data_file_PC, "%s\n", PCtextD);
-            PCtext2 = "";
-            while (PCtext2 != "***") begin
-              PCtextD = {PCtextD, " ", PCtext2};
-              scan_file_PC = $fscanf(data_file_PC, "%s\n", PCtext2);
-            end
-            scan_file_PC = $fscanf(data_file_PC, "%x\n", CheckInstrD);
-            if(dut.hart.ifu.PCD === pcExpected) begin
-              if((dut.hart.ifu.InstrRawD[6:0] == 7'b1010011) || // for now, NOP out any float instrs
-                 (dut.hart.ifu.PCD == 32'h80001dc6) ||          // as well as stores to PLIC
-                 (dut.hart.ifu.PCD == 32'h80001de0) ||
-                 (dut.hart.ifu.PCD == 32'h80001de2)) begin 
-                $display("warning: NOPing out %s at PC=%0x, instr %0d, time %0t", PCtextD, dut.hart.ifu.PCD, instrs, $time);
-                force CheckInstrD = 32'b0010011;
-                force dut.hart.ifu.InstrRawD = 32'b0010011;
-                while (clk != 0) #1;
-                while (clk != 1) #1;
-                release dut.hart.ifu.InstrRawD;
-                release CheckInstrD;
-                warningCount += 1;
-                forcedInstr = 1;
-              end
-              else begin
-                forcedInstr = 0;
-              end
-            end
-            // then expected PC value
-            scan_file_PC = $fscanf(data_file_PC, "%x\n", pcExpected);
-            if (instrs <= 10 || (instrs <= 100 && instrs % 10 == 0) ||
-               (instrs <= 1000 && instrs % 100 == 0) || (instrs <= 10000 && instrs % 1000 == 0) ||
-               (instrs <= 100000 && instrs % 10000 == 0) || (instrs % 100000 == 0)) begin
-              $display("loaded %0d instructions", instrs);
-            end
-            if (instrs == waveOnICount) begin
-              $display("turning on waves at %0d instructions", instrs);
-              $stop; // do file will resume after this first stop
-            end
-            instrs += 1;
-            // are we at a branch/jump?
-            if (`BPRED_ENABLED) begin
-              speculative = dut.hart.ifu.bpred.bpred.BPPredWrongE;
-            end else begin
-              casex (lastCheckInstrD[31:0])
-                32'b00000000001000000000000001110011, // URET
-                32'b00010000001000000000000001110011, // SRET
-                32'b00110000001000000000000001110011, // MRET
-                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1101111, // JAL
-                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1100111, // JALR
-                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1100011, // B
-                32'bXXXXXXXXXXXXXXXX110XXXXXXXXXXX01, // C.BEQZ
-                32'bXXXXXXXXXXXXXXXX111XXXXXXXXXXX01, // C.BNEZ
-                32'bXXXXXXXXXXXXXXXX101XXXXXXXXXXX01: // C.J
-                  speculative = 1;
-                32'bXXXXXXXXXXXXXXXX1001000000000010, // C.EBREAK:
-                32'bXXXXXXXXXXXXXXXXX000XXXXX1110011: // Something that's not CSRR*
-                  speculative = 0; // tbh don't really know what should happen here
-                32'b000110000000XXXXXXXXXXXXX1110011, // CSR* SATP, *
-                32'bXXXXXXXXXXXXXXXX1000XXXXX0000010, // C.JR
-                32'bXXXXXXXXXXXXXXXX1001XXXXX0000010: // C.JALR //this is RV64 only so no C.JAL
-                  speculative = 1;
-                default:
-                  speculative = 0;
-              endcase
-            end
-
-            //check things!
-            if ((~speculative) && (~equal(dut.hart.ifu.PCD,pcExpected,3))) begin
-              $display("%0t ps, instr %0d: PC does not equal PC expected: %x, %x", $time, instrs, dut.hart.ifu.PCD, pcExpected);
-              `ERROR
-            end
-            InstrMask = CheckInstrD[1:0] == 2'b11 ? 32'hFFFFFFFF : 32'h0000FFFF;
-            if ((~forcedInstr) && (~speculative) && ((InstrMask & dut.hart.ifu.InstrRawD) !== (InstrMask & CheckInstrD))) begin
-              $display("%0t ps, instr %0d: InstrD does not equal CheckInstrD: %x, %x, PC: %x", $time, instrs, dut.hart.ifu.InstrRawD, CheckInstrD, dut.hart.ifu.PCD);
-              `ERROR
-            end
-          end
-        end
-        lastPCD = dut.hart.ifu.PCD;
-      end
-    end
-  end
-
-
-  // Track names of instructions
+  ///////////////////////////////////////////////////////////////////////////////
+  ///////////////////////////////// Miscellaneous ///////////////////////////////
+  ///////////////////////////////////////////////////////////////////////////////
+  // Instr Opcode Tracking
+  //   For waveview convenience
   string InstrFName, InstrDName, InstrEName, InstrMName, InstrWName;
   logic [31:0] InstrW;
   instrTrackerTB it(clk, reset,
@@ -655,21 +626,100 @@ module testbench();
                 dut.hart.ifu.InstrM,  dut.hart.ifu.InstrW,
                 InstrFName, InstrDName, InstrEName, InstrMName, InstrWName);
 
-  // generate clock to sequence tests
-  always
-    begin
-      clk <= 1; # 5; clk <= 0; # 5;
+  // Instr Assembly Tracking
+  //   For waveview convenience
+  //   PCtextF, PCtextD are read from testvectors
+  //   You could just as well read the others from testvectors,
+  //   but I really like how the pipeline synchronizes with Wally so cleanly
+  always_ff @(posedge clk, posedge reset)
+    if (reset) begin
+      PCtextE = "(reset)";
+      PCtextM = "(reset)";
+      PCtextW = "(reset)";
+    end else begin
+      if (~dut.hart.StallW) 
+        if (dut.hart.FlushW) PCtextW = "(flushed)";
+        else                 PCtextW = PCtextM;
+      if (~dut.hart.StallM) 
+        if (dut.hart.FlushM) PCtextM = "(flushed)";
+        else                 PCtextM = PCtextE;
+      if (~dut.hart.StallE) 
+        if (dut.hart.FlushE) PCtextE = "(flushed)";
+        else                 PCtextE = PCtextD;
     end
-
+  
+  // ------------------
+  // Address Translator
+  // ------------------
+   /**
+   * Walk the page table stored in dtim according to sv39 logic and translate a
+   * virtual address to a physical address.
+   *
+   * See section 4.3.2 of the RISC-V Privileged specification for a full
+   * explanation of the below algorithm.
+   */
+  function logic [`XLEN-1:0] adrTranslator( 
+    input logic [`XLEN-1:0] adrIn);
+    begin
+      logic             SvMode, PTE_R, PTE_X;
+      logic [`XLEN-1:0] SATP, PTE;
+      logic [55:0]      BaseAdr, PAdr;
+      logic [8:0]       VPN [2:0];
+      logic [11:0]      Offset;
+      int i;
+      // Grab the SATP register from privileged unit
+      SATP = dut.hart.priv.csr.SATP_REGW;
+      // Split the virtual address into page number segments and offset
+      VPN[2] = adrIn[38:30];
+      VPN[1] = adrIn[29:21];
+      VPN[0] = adrIn[20:12];
+      Offset = adrIn[11:0];
+      // We do not support sv48; only sv39
+      SvMode = SATP[63];
+      // Only perform translation if translation is on and the processor is not
+      // in machine mode
+      if (SvMode && (dut.hart.priv.PrivilegeModeW != `M_MODE)) begin
+        BaseAdr = SATP[43:0] << 12;
+        for (i = 2; i >= 0; i--) begin
+          PAdr = BaseAdr + (VPN[i] << 3);
+          // dtim.RAM is 64-bit addressed. PAdr specifies a byte. We right shift
+          // by 3 (the PTE size) to get the requested 64-bit PTE.
+          PTE = dut.uncore.dtim.RAM[PAdr >> 3];
+          PTE_R = PTE[1];
+          PTE_X = PTE[3];
+          if (PTE_R || PTE_X) begin
+            // Leaf page found
+            break;
+          end else begin
+            // Go to next level of table
+            BaseAdr = PTE[53:10] << 12;
+          end
+        end
+        // Determine which parts of the PTE page number to use based on the
+        // level of the page table we reached.
+        if (i == 2) begin
+          // Gigapage
+          assign adrTranslator = {8'b0, PTE[53:28], VPN[1], VPN[0], Offset};
+        end else if (i == 1) begin
+          // Megapage
+          assign adrTranslator = {8'b0, PTE[53:19], VPN[0], Offset};
+        end else begin
+          // Kilopage
+          assign adrTranslator = {8'b0, PTE[53:10], Offset};
+        end
+      end else begin
+        // Direct translation if address translation is not on
+        assign adrTranslator = adrIn;
+      end
+    end
+  endfunction
 endmodule
+
+
 module instrTrackerTB(
   input  logic            clk, reset,
   input  logic [31:0]     InstrF,InstrD,InstrE,InstrM,InstrW,
-  output string           InstrFName, InstrDName, InstrEName, InstrMName, InstrWName);
-        
-  // stage Instr to Writeback for visualization
-  //flopr  #(32) InstrWReg(clk, reset, InstrM, InstrW);
-
+  output string           InstrFName, InstrDName, InstrEName, InstrMName, InstrWName);     
   instrNameDecTB fdec(InstrF, InstrFName);
   instrNameDecTB ddec(InstrD, InstrDName);
   instrNameDecTB edec(InstrE, InstrEName);

From cee468b21aedfa9f4c1ec49c7e6fe4437d55f10c Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Thu, 24 Jun 2021 01:54:46 -0400
Subject: [PATCH 03/20] whoops meant to remove notifications from busybear, not
 buildroot

---
 wally-pipelined/regression/wally-buildroot-batch.do | 1 +
 wally-pipelined/regression/wally-buildroot.do       | 1 +
 wally-pipelined/regression/wally-busybear-batch.do  | 1 -
 wally-pipelined/regression/wally-busybear.do        | 1 -
 4 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/wally-pipelined/regression/wally-buildroot-batch.do b/wally-pipelined/regression/wally-buildroot-batch.do
index 6eea258e6..c16655e1e 100644
--- a/wally-pipelined/regression/wally-buildroot-batch.do
+++ b/wally-pipelined/regression/wally-buildroot-batch.do
@@ -36,4 +36,5 @@ vsim workopt -suppress 8852,12070
 
 run -all
 run -all
+exec ./slack-notifier/slack-notifier.py
 quit 
diff --git a/wally-pipelined/regression/wally-buildroot.do b/wally-pipelined/regression/wally-buildroot.do
index c2312f75e..452ba54d4 100644
--- a/wally-pipelined/regression/wally-buildroot.do
+++ b/wally-pipelined/regression/wally-buildroot.do
@@ -39,4 +39,5 @@ vsim workopt -suppress 8852,12070
 run -all
 do ./wave-dos/linux-waves.do
 run -all
+exec ./slack-notifier/slack-notifier.py
 ##quit
diff --git a/wally-pipelined/regression/wally-busybear-batch.do b/wally-pipelined/regression/wally-busybear-batch.do
index e819d7804..a4a80eb74 100644
--- a/wally-pipelined/regression/wally-busybear-batch.do
+++ b/wally-pipelined/regression/wally-busybear-batch.do
@@ -36,5 +36,4 @@ vopt work_busybear.testbench -o workopt_busybear
 vsim workopt_busybear -suppress 8852,12070
 
 run -all
-exec ./slack-notifier/slack-notifier.py
 quit
diff --git a/wally-pipelined/regression/wally-busybear.do b/wally-pipelined/regression/wally-busybear.do
index 8d6af28bc..11876dded 100644
--- a/wally-pipelined/regression/wally-busybear.do
+++ b/wally-pipelined/regression/wally-busybear.do
@@ -40,5 +40,4 @@ do ./wave-dos/linux-waves.do
 
 #-- Run the Simulation 
 run -all
-exec ./slack-notifier/slack-notifier.py
 ##quit

From 53d545cdfe1393de95152cbae842e11321190445 Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Thu, 24 Jun 2021 02:00:01 -0400
Subject: [PATCH 04/20] regression can overcome the fact that buildroots UART
 prints stuff

---
 wally-pipelined/regression/regression-wally.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wally-pipelined/regression/regression-wally.py b/wally-pipelined/regression/regression-wally.py
index fcd6d4be1..b1ded5e7a 100755
--- a/wally-pipelined/regression/regression-wally.py
+++ b/wally-pipelined/regression/regression-wally.py
@@ -26,12 +26,12 @@ configs = [
     TestCase(
         name="busybear",
         cmd="vsim -do wally-busybear-batch.do -c > {}",
-        grepstr="# loaded 100000 instructions"
+        grepstr="loaded 100000 instructions"
     ),
     TestCase(
         name="buildroot",
         cmd="vsim -do wally-buildroot-batch.do -c > {}",
-        grepstr="# loaded 2500000 instructions"
+        grepstr="loaded 2500000 instructions"
     ),
     TestCase(
         name="rv32ic",

From 2d9c91096b4f32372e6bdb827771763052cd2ded Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Thu, 24 Jun 2021 08:35:00 -0400
Subject: [PATCH 05/20] make linux testgen be nohup-friendly and make parser
 account for lr/sc memory accesses

---
 .gitignore                                        |  1 +
 wally-pipelined/linux-testgen/logAllBuildroot.sh  |  2 +-
 wally-pipelined/linux-testgen/parse_gdb_output.py | 11 ++++++++++-
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index c6cac56fd..fe21942d0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ testsBP/*/*/*.elf*
 testsBP/*/OBJ/*
 testsBP/*/*.a
 wally-pipelined/linux-testgen/linux-testvectors/*
+wally-pipelined/linux-testgen/nohup*
 !wally-pipelined/linux-testgen/linux-testvectors/tvCopier.py
 !wally-pipelined/linux-testgen/linux-testvectors/tvLinker.sh
 wally-pipelined/regression/slack-notifier/slack-webhook-url.txt
diff --git a/wally-pipelined/linux-testgen/logAllBuildroot.sh b/wally-pipelined/linux-testgen/logAllBuildroot.sh
index df8b506a8..d045ee98c 100755
--- a/wally-pipelined/linux-testgen/logAllBuildroot.sh
+++ b/wally-pipelined/linux-testgen/logAllBuildroot.sh
@@ -24,4 +24,4 @@
 # =========== Just Do the Thing ========== 
 # Uncomment this version for the whole thing 
 # - Logs info needed by buildroot testbench
-(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | pv -l | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot/") & riscv64-unknown-elf-gdb -x gdbinit_qemulog
+(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot_new/") & riscv64-unknown-elf-gdb -x gdbinit_qemulog
diff --git a/wally-pipelined/linux-testgen/parse_gdb_output.py b/wally-pipelined/linux-testgen/parse_gdb_output.py
index 739a97e31..7e48fa637 100755
--- a/wally-pipelined/linux-testgen/parse_gdb_output.py
+++ b/wally-pipelined/linux-testgen/parse_gdb_output.py
@@ -44,7 +44,7 @@ try:
                   instrs += 1
                   storeAMO = ''
                   if instrs % 10000 == 0:
-                    print(instrs)
+                    print(instrs,flush=True)
                   # Instr in human assembly
                   wPC.write('{} ***\n'.format(' '.join(l.split(':')[1].split()[0:2])))
                   if '\tld' in l or '\tlw' in l or '\tlh' in l or '\tlb' in l:
@@ -63,6 +63,15 @@ try:
                     storeLoc = readLoc
                     storeReg = l.split()[-1].split(',')[1]
                     storeAMO = l.split()[-2]
+                  if '\tlr' in l:
+                    currentRead = l.split()[-1].split(',')[0]
+                    readOffset = "0"
+                    readLoc = l.split()[-1].split('(')[1][:-1]
+                    readType = "0" # *** I don't see that readType or lastReadType are ever used; we can probably get rid of them
+                  if '\tsc' in l:
+                    storeOffset = "0"
+                    storeLoc = l.split()[-1].split('(')[1][:-1]
+                    storeReg = l.split()[-1].split(',')[1]
                   if '\tsd' in l or '\tsw' in l or '\tsh' in l or '\tsb' in l:
                     s = l.split('#')[0].split()[-1]
                     storeReg = s.split(',')[0]

From 86e369df5284d9357a390d8b2ac9fe91f8152a3e Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Thu, 24 Jun 2021 11:20:21 -0400
Subject: [PATCH 06/20] fixed forwarding

---
 wally-pipelined/src/ebu/ahblite.sv            |  4 ++--
 wally-pipelined/src/ieu/controller.sv         |  3 ++-
 wally-pipelined/src/ieu/datapath.sv           | 20 +++++++++++++------
 wally-pipelined/src/ieu/forward.sv            |  2 +-
 wally-pipelined/src/ieu/ieu.sv                |  6 ++++--
 wally-pipelined/src/lsu/lsu.sv                |  2 +-
 wally-pipelined/src/muldiv/muldiv.sv          |  4 ++--
 wally-pipelined/src/privileged/csr.sv         |  4 ++--
 wally-pipelined/src/privileged/privileged.sv  |  2 +-
 .../src/wally/wallypipelinedhart.sv           |  9 ++++-----
 10 files changed, 33 insertions(+), 23 deletions(-)

diff --git a/wally-pipelined/src/ebu/ahblite.sv b/wally-pipelined/src/ebu/ahblite.sv
index c59dfa9b5..9ace1077b 100644
--- a/wally-pipelined/src/ebu/ahblite.sv
+++ b/wally-pipelined/src/ebu/ahblite.sv
@@ -62,7 +62,7 @@ module ahblite (
   // Signals to PMA checker (metadata of proposed access)
   output logic             AtomicAccessM, ExecuteAccessF, WriteAccessM, ReadAccessM,
   // Return from bus
-  output logic [`XLEN-1:0] ReadDataW,
+  output logic [`XLEN-1:0] ReadDataM, ReadDataW,
   // AHB-Lite external signals
   input  logic [`AHBW-1:0] HRDATA,
   input  logic             HREADY, HRESP,
@@ -87,7 +87,7 @@ module ahblite (
   logic GrantData;
   logic [31:0] AccessAddress;
   logic [2:0] AccessSize, PTESize, ISize;
-  logic [`AHBW-1:0] HRDATAMasked, ReadDataM, CapturedData, ReadDataWnext, WriteData;
+  logic [`AHBW-1:0] HRDATAMasked, CapturedData, ReadDataWnext, WriteData;
   logic IReady, DReady;
   logic CaptureDataM,CapturedDataAvailable;
 
diff --git a/wally-pipelined/src/ieu/controller.sv b/wally-pipelined/src/ieu/controller.sv
index b27541d42..09ded48ba 100644
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@@ -53,6 +53,7 @@ module controller(
   output logic [1:0] AtomicM,
   output logic [2:0] Funct3M,
   output logic       RegWriteM,     // for Hazard Unit
+  output logic [2:0] ResultSrcM,
   output logic       InstrValidM,
   // Writeback stage control signals
   input  logic       StallW, FlushW,
@@ -72,7 +73,7 @@ module controller(
 
   // pipelined control signals
   logic 	    RegWriteE;
-  logic [2:0] ResultSrcD, ResultSrcE, ResultSrcM;
+  logic [2:0] ResultSrcD, ResultSrcE;
   logic [1:0] MemRWD, MemRWE;
   logic		    JumpD;
   logic		    BranchD, BranchE;
diff --git a/wally-pipelined/src/ieu/datapath.sv b/wally-pipelined/src/ieu/datapath.sv
index 13db65a37..848ed89a5 100644
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@@ -45,6 +45,9 @@ module datapath (
   // Memory stage signals
   input  logic             StallM, FlushM,
   input  logic [`XLEN-1:0] FWriteDataM,
+  input  logic             SquashSCM,
+  input  logic [2:0]       ResultSrcM,
+  input  logic [`XLEN-1:0] CSRReadValM, ReadDataM, MulDivResultM, 
   output logic [`XLEN-1:0] SrcAM,
   output logic [`XLEN-1:0] WriteDataM, MemAdrM,
   // Writeback stage signals
@@ -54,7 +57,6 @@ module datapath (
   input  logic             RegWriteW, 
   input  logic             SquashSCW,
   input  logic [2:0]       ResultSrcW,
-  // input  logic [`XLEN-1:0] PCLinkW,
   input  logic [`XLEN-1:0] CSRReadValW, ReadDataW, MulDivResultW, 
   // Hazard Unit signals 
   output logic [4:0]       Rs1D, Rs2D, Rs1E, Rs2E,
@@ -76,7 +78,9 @@ module datapath (
   logic [`XLEN-1:0] WriteDataE;
   logic [`XLEN-1:0] TargetBaseE;
   // Memory stage signals
+  logic [`XLEN-1:0] SCResultM;
   logic [`XLEN-1:0] ALUResultM;
+  logic [`XLEN-1:0] ResultM;
   // Writeback stage signals
   logic [`XLEN-1:0] SCResultW;
   logic [`XLEN-1:0] ALUResultW;
@@ -102,8 +106,8 @@ module datapath (
   flopenrc #(5)    Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
   flopenrc #(5)    RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);
 	
-  mux4  #(`XLEN)  faemux(RD1E, WriteDataW, ALUResultM, FWriteDataM, ForwardAE, PreSrcAE);
-  mux4  #(`XLEN)  fbemux(RD2E, WriteDataW, ALUResultM, FWriteDataM, ForwardBE, WriteDataE);
+  mux4  #(`XLEN)  faemux(RD1E, WriteDataW, ResultM, FWriteDataM, ForwardAE, PreSrcAE);
+  mux4  #(`XLEN)  fbemux(RD2E, WriteDataW, ResultM, FWriteDataM, ForwardBE, WriteDataE);
   mux2  #(`XLEN)  srcamux(PreSrcAE, PCE, ALUSrcAE, SrcAE);
   mux2  #(`XLEN)  srcamux2(SrcAE, PCLinkE, JumpE, SrcAE2);  
   mux2  #(`XLEN)  srcbmux(WriteDataE, ExtImmE, ALUSrcBE, SrcBE);
@@ -118,6 +122,7 @@ module datapath (
   assign MemAdrM = ALUResultM;
   flopenrc #(`XLEN) WriteDataMReg(clk, reset, FlushM, ~StallM, WriteDataE, WriteDataM);
   flopenrc #(5)    RdMEg(clk, reset, FlushM, ~StallM, RdE, RdM);
+  mux5  #(`XLEN) resultmuxM(ALUResultM, ReadDataM, CSRReadValM, MulDivResultM, SCResultM, ResultSrcM, ResultM);	
   
   // Writeback stage pipeline register and logic
   flopenrc #(`XLEN) ALUResultWReg(clk, reset, FlushW, ~StallW, ALUResultM, ALUResultW);
@@ -125,13 +130,16 @@ module datapath (
 
   // handle Store Conditional result if atomic extension supported
   generate 
-    if (`A_SUPPORTED)
+    if (`A_SUPPORTED) begin
+      assign SCResultM = SquashSCM ? {{(`XLEN-1){1'b0}}, 1'b1} : {{(`XLEN-1){1'b0}}, 1'b0};
       assign SCResultW = SquashSCW ? {{(`XLEN-1){1'b0}}, 1'b1} : {{(`XLEN-1){1'b0}}, 1'b0};
-    else 
+    end else begin
+      assign SCResultM = 0;
       assign SCResultW = 0;
+    end
   endgenerate
 
-  mux5  #(`XLEN) resultmux(ALUResultW, ReadDataW, CSRReadValW, MulDivResultW, SCResultW, ResultSrcW, ResultW);	
+  mux5  #(`XLEN) resultmuxW(ALUResultW, ReadDataW, CSRReadValW, MulDivResultW, SCResultW, ResultSrcW, ResultW);	
 /* -----\/----- EXCLUDED -----\/-----
   // This mux4:1 no longer needs to include PCLinkW.  This is set correctly in the execution stage.
   // *** need to look at how the decoder is coded to fix.
diff --git a/wally-pipelined/src/ieu/forward.sv b/wally-pipelined/src/ieu/forward.sv
index cdc6d2700..6729ed424 100644
--- a/wally-pipelined/src/ieu/forward.sv
+++ b/wally-pipelined/src/ieu/forward.sv
@@ -43,7 +43,7 @@ module forward(
     if (Rs1E != 5'b0)
       if      ((Rs1E == RdM) & RegWriteM) ForwardAE = 2'b10;
       else if ((Rs1E == RdW) & (RegWriteW|FWriteIntW)) ForwardAE = 2'b01;
-     else if ((Rs1E == RdM) & FWriteIntM) ForwardAE = 2'b11;
+      else if ((Rs1E == RdM) & FWriteIntM) ForwardAE = 2'b11;
  
     if (Rs2E != 5'b0)
       if      ((Rs2E == RdM) & RegWriteM) ForwardBE = 2'b10;
diff --git a/wally-pipelined/src/ieu/ieu.sv b/wally-pipelined/src/ieu/ieu.sv
index 0bd9d598f..73c619f62 100644
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@@ -43,9 +43,10 @@ module ieu (
   // Memory stage interface
   input logic 		   DataMisalignedM,
   input logic 		   DataAccessFaultM,
-  input logic 		   SquashSCW,
   input logic	     	   FWriteIntM,
   input  logic [`XLEN-1:0] FWriteDataM,
+  input logic 		       SquashSCM,
+  input  logic [`XLEN-1:0] CSRReadValM, ReadDataM, MulDivResultM, 
   output logic [1:0] 	   MemRWM,
   output logic [1:0] 	   AtomicM,
   output logic [`XLEN-1:0] MemAdrM, WriteDataM,
@@ -55,6 +56,7 @@ module ieu (
   input logic [`XLEN-1:0]  CSRReadValW, ReadDataW, MulDivResultW,
   input logic             FWriteIntW,
   input logic [`XLEN-1:0] FPUResultW,
+  input logic 		   SquashSCW,
   // input  logic [`XLEN-1:0] PCLinkW,
   output logic 		   InstrValidM, InstrValidW,
   // hazards
@@ -72,7 +74,7 @@ module ieu (
   logic [2:0]  FlagsE;
   logic [4:0]  ALUControlE;
   logic        ALUSrcAE, ALUSrcBE;
-  logic [2:0]  ResultSrcW;
+  logic [2:0]  ResultSrcM, ResultSrcW;
   logic       TargetSrcE;
 
   // forwarding signals
diff --git a/wally-pipelined/src/lsu/lsu.sv b/wally-pipelined/src/lsu/lsu.sv
index ffa79adfe..f44340a45 100644
--- a/wally-pipelined/src/lsu/lsu.sv
+++ b/wally-pipelined/src/lsu/lsu.sv
@@ -45,6 +45,7 @@ module lsu (
   output logic [1:0]       AtomicMaskedM,
   output logic             DataMisalignedM,
   output logic             CommittedM,
+  output logic             SquashSCM,
   // Writeback Stage
   input  logic             MemAckW,
   input  logic [`XLEN-1:0] ReadDataW,
@@ -81,7 +82,6 @@ module lsu (
   
 );
 
-  logic SquashSCM;
   logic DTLBPageFaultM;
   logic MemAccessM;
   logic [1:0] CurrState, NextState;
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index e10b0c55d..691b3b5ae 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -34,7 +34,7 @@ module muldiv (
 	       input logic [2:0] 	Funct3E,
 	       input logic 		MulDivE, W64E,
 	       // Writeback stage
-	       output logic [`XLEN-1:0] MulDivResultW,
+	       output logic [`XLEN-1:0] MulDivResultM, MulDivResultW,
 	       // Divide Done
 	       output logic 		DivDoneE,
 	       output logic 		DivBusyE, 
@@ -44,7 +44,7 @@ module muldiv (
 
    generate
       if (`M_SUPPORTED) begin
-	 logic [`XLEN-1:0] MulDivResultE, MulDivResultM;
+	 logic [`XLEN-1:0] MulDivResultE;
 	 logic [`XLEN-1:0] PrelimResultE;
 	 logic [`XLEN-1:0] QuotE, RemE;
 	 logic [`XLEN*2-1:0] ProdE; 
diff --git a/wally-pipelined/src/privileged/csr.sv b/wally-pipelined/src/privileged/csr.sv
index 213bcde33..e88cb561d 100644
--- a/wally-pipelined/src/privileged/csr.sv
+++ b/wally-pipelined/src/privileged/csr.sv
@@ -65,12 +65,12 @@ module csr #(parameter
   input  logic [4:0]       SetFflagsM,
   output logic [2:0]       FRM_REGW, 
 //  output logic [11:0]     MIP_REGW, SIP_REGW, UIP_REGW, MIE_REGW, SIE_REGW, UIE_REGW,
-  output logic [`XLEN-1:0] CSRReadValW,
+  output logic [`XLEN-1:0] CSRReadValM, CSRReadValW,
   output logic             IllegalCSRAccessM
 );
 
   localparam NOP = 32'h13;
-  logic [`XLEN-1:0] CSRMReadValM, CSRSReadValM, CSRUReadValM, CSRNReadValM, CSRCReadValM, CSRReadValM;
+  logic [`XLEN-1:0] CSRMReadValM, CSRSReadValM, CSRUReadValM, CSRNReadValM, CSRCReadValM;
   logic [`XLEN-1:0] CSRSrcM, CSRRWM, CSRRSM, CSRRCM, CSRWriteValM;
  
   logic [`XLEN-1:0] MSTATUS_REGW, SSTATUS_REGW, USTATUS_REGW;
diff --git a/wally-pipelined/src/privileged/privileged.sv b/wally-pipelined/src/privileged/privileged.sv
index 1275cd4b8..ab794a4ad 100644
--- a/wally-pipelined/src/privileged/privileged.sv
+++ b/wally-pipelined/src/privileged/privileged.sv
@@ -34,7 +34,7 @@ module privileged (
   input  logic [`XLEN-1:0] SrcAM,
   input  logic [`XLEN-1:0] PCF,PCD,PCE,PCM,
   input  logic [31:0]      InstrD, InstrE, InstrM, InstrW,
-  output logic [`XLEN-1:0] CSRReadValW,
+  output logic [`XLEN-1:0] CSRReadValM, CSRReadValW,
   output logic [`XLEN-1:0] PrivilegedNextPCM,
   output logic             RetM, TrapM, NonBusTrapM,
   output logic             ITLBFlushF, DTLBFlushM,
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index b32770b9a..1fd1408a4 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -71,7 +71,7 @@ module wallypipelinedhart (
   logic [31:0] InstrD, InstrE, InstrM, InstrW;
   logic [`XLEN-1:0] PCD, PCE, PCM, PCLinkE, PCLinkW;
   logic [`XLEN-1:0] PCTargetE;
-  logic [`XLEN-1:0] CSRReadValW, MulDivResultW;
+  logic [`XLEN-1:0] CSRReadValM, MulDivResultM, CSRReadValW, MulDivResultW;
   logic [`XLEN-1:0] PrivilegedNextPCM;
   logic [1:0] MemRWM;
   logic InstrValidM, InstrValidW;
@@ -96,7 +96,7 @@ module wallypipelinedhart (
   logic [1:0] FMemRWM;
   logic       RegWriteD;
   logic [`XLEN-1:0] FWriteDataM;
-  logic       SquashSCW;
+  logic       SquashSCM, SquashSCW;
   logic       FStallD;
   logic       FWriteIntE, FWriteIntW, FWriteIntM;
   logic             FDivBusyE;
@@ -136,7 +136,7 @@ module wallypipelinedhart (
   logic [2:0]       Funct3M;
   logic [`XLEN-1:0] MemAdrM, WriteDataM;
   logic [`PA_BITS-1:0] MemPAdrM;
-  logic [`XLEN-1:0] ReadDataW;
+  logic [`XLEN-1:0] ReadDataM, ReadDataW;
   logic [`PA_BITS-1:0] InstrPAdrF;
   logic [`XLEN-1:0] InstrRData;
   logic             InstrReadF;
@@ -153,8 +153,7 @@ module wallypipelinedhart (
   logic[`XLEN-1:0] WriteDatatmpM;
 
   logic [4:0]       InstrClassM;
-  
-           
+     
   ifu ifu(.InstrInF(InstrRData), .*); // instruction fetch unit: PC, branch prediction, instruction cache
 
   ieu ieu(.*); // integer execution unit: integer register file, datapath and controller

From 2155a4e485fd9b50a52a3f937fcf583b7543e1ee Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Thu, 24 Jun 2021 17:39:37 -0400
Subject: [PATCH 07/20] Revert "fixed forwarding"

This reverts commit 86e369df5284d9357a390d8b2ac9fe91f8152a3e.
---
 wally-pipelined/src/ebu/ahblite.sv            |  4 ++--
 wally-pipelined/src/ieu/controller.sv         |  3 +--
 wally-pipelined/src/ieu/datapath.sv           | 20 ++++++-------------
 wally-pipelined/src/ieu/forward.sv            |  2 +-
 wally-pipelined/src/ieu/ieu.sv                |  6 ++----
 wally-pipelined/src/lsu/lsu.sv                |  2 +-
 wally-pipelined/src/muldiv/muldiv.sv          |  4 ++--
 wally-pipelined/src/privileged/csr.sv         |  4 ++--
 wally-pipelined/src/privileged/privileged.sv  |  2 +-
 .../src/wally/wallypipelinedhart.sv           |  9 +++++----
 10 files changed, 23 insertions(+), 33 deletions(-)

diff --git a/wally-pipelined/src/ebu/ahblite.sv b/wally-pipelined/src/ebu/ahblite.sv
index 9ace1077b..c59dfa9b5 100644
--- a/wally-pipelined/src/ebu/ahblite.sv
+++ b/wally-pipelined/src/ebu/ahblite.sv
@@ -62,7 +62,7 @@ module ahblite (
   // Signals to PMA checker (metadata of proposed access)
   output logic             AtomicAccessM, ExecuteAccessF, WriteAccessM, ReadAccessM,
   // Return from bus
-  output logic [`XLEN-1:0] ReadDataM, ReadDataW,
+  output logic [`XLEN-1:0] ReadDataW,
   // AHB-Lite external signals
   input  logic [`AHBW-1:0] HRDATA,
   input  logic             HREADY, HRESP,
@@ -87,7 +87,7 @@ module ahblite (
   logic GrantData;
   logic [31:0] AccessAddress;
   logic [2:0] AccessSize, PTESize, ISize;
-  logic [`AHBW-1:0] HRDATAMasked, CapturedData, ReadDataWnext, WriteData;
+  logic [`AHBW-1:0] HRDATAMasked, ReadDataM, CapturedData, ReadDataWnext, WriteData;
   logic IReady, DReady;
   logic CaptureDataM,CapturedDataAvailable;
 
diff --git a/wally-pipelined/src/ieu/controller.sv b/wally-pipelined/src/ieu/controller.sv
index 09ded48ba..b27541d42 100644
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@@ -53,7 +53,6 @@ module controller(
   output logic [1:0] AtomicM,
   output logic [2:0] Funct3M,
   output logic       RegWriteM,     // for Hazard Unit
-  output logic [2:0] ResultSrcM,
   output logic       InstrValidM,
   // Writeback stage control signals
   input  logic       StallW, FlushW,
@@ -73,7 +72,7 @@ module controller(
 
   // pipelined control signals
   logic 	    RegWriteE;
-  logic [2:0] ResultSrcD, ResultSrcE;
+  logic [2:0] ResultSrcD, ResultSrcE, ResultSrcM;
   logic [1:0] MemRWD, MemRWE;
   logic		    JumpD;
   logic		    BranchD, BranchE;
diff --git a/wally-pipelined/src/ieu/datapath.sv b/wally-pipelined/src/ieu/datapath.sv
index 848ed89a5..13db65a37 100644
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@@ -45,9 +45,6 @@ module datapath (
   // Memory stage signals
   input  logic             StallM, FlushM,
   input  logic [`XLEN-1:0] FWriteDataM,
-  input  logic             SquashSCM,
-  input  logic [2:0]       ResultSrcM,
-  input  logic [`XLEN-1:0] CSRReadValM, ReadDataM, MulDivResultM, 
   output logic [`XLEN-1:0] SrcAM,
   output logic [`XLEN-1:0] WriteDataM, MemAdrM,
   // Writeback stage signals
@@ -57,6 +54,7 @@ module datapath (
   input  logic             RegWriteW, 
   input  logic             SquashSCW,
   input  logic [2:0]       ResultSrcW,
+  // input  logic [`XLEN-1:0] PCLinkW,
   input  logic [`XLEN-1:0] CSRReadValW, ReadDataW, MulDivResultW, 
   // Hazard Unit signals 
   output logic [4:0]       Rs1D, Rs2D, Rs1E, Rs2E,
@@ -78,9 +76,7 @@ module datapath (
   logic [`XLEN-1:0] WriteDataE;
   logic [`XLEN-1:0] TargetBaseE;
   // Memory stage signals
-  logic [`XLEN-1:0] SCResultM;
   logic [`XLEN-1:0] ALUResultM;
-  logic [`XLEN-1:0] ResultM;
   // Writeback stage signals
   logic [`XLEN-1:0] SCResultW;
   logic [`XLEN-1:0] ALUResultW;
@@ -106,8 +102,8 @@ module datapath (
   flopenrc #(5)    Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
   flopenrc #(5)    RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);
 	
-  mux4  #(`XLEN)  faemux(RD1E, WriteDataW, ResultM, FWriteDataM, ForwardAE, PreSrcAE);
-  mux4  #(`XLEN)  fbemux(RD2E, WriteDataW, ResultM, FWriteDataM, ForwardBE, WriteDataE);
+  mux4  #(`XLEN)  faemux(RD1E, WriteDataW, ALUResultM, FWriteDataM, ForwardAE, PreSrcAE);
+  mux4  #(`XLEN)  fbemux(RD2E, WriteDataW, ALUResultM, FWriteDataM, ForwardBE, WriteDataE);
   mux2  #(`XLEN)  srcamux(PreSrcAE, PCE, ALUSrcAE, SrcAE);
   mux2  #(`XLEN)  srcamux2(SrcAE, PCLinkE, JumpE, SrcAE2);  
   mux2  #(`XLEN)  srcbmux(WriteDataE, ExtImmE, ALUSrcBE, SrcBE);
@@ -122,7 +118,6 @@ module datapath (
   assign MemAdrM = ALUResultM;
   flopenrc #(`XLEN) WriteDataMReg(clk, reset, FlushM, ~StallM, WriteDataE, WriteDataM);
   flopenrc #(5)    RdMEg(clk, reset, FlushM, ~StallM, RdE, RdM);
-  mux5  #(`XLEN) resultmuxM(ALUResultM, ReadDataM, CSRReadValM, MulDivResultM, SCResultM, ResultSrcM, ResultM);	
   
   // Writeback stage pipeline register and logic
   flopenrc #(`XLEN) ALUResultWReg(clk, reset, FlushW, ~StallW, ALUResultM, ALUResultW);
@@ -130,16 +125,13 @@ module datapath (
 
   // handle Store Conditional result if atomic extension supported
   generate 
-    if (`A_SUPPORTED) begin
-      assign SCResultM = SquashSCM ? {{(`XLEN-1){1'b0}}, 1'b1} : {{(`XLEN-1){1'b0}}, 1'b0};
+    if (`A_SUPPORTED)
       assign SCResultW = SquashSCW ? {{(`XLEN-1){1'b0}}, 1'b1} : {{(`XLEN-1){1'b0}}, 1'b0};
-    end else begin
-      assign SCResultM = 0;
+    else 
       assign SCResultW = 0;
-    end
   endgenerate
 
-  mux5  #(`XLEN) resultmuxW(ALUResultW, ReadDataW, CSRReadValW, MulDivResultW, SCResultW, ResultSrcW, ResultW);	
+  mux5  #(`XLEN) resultmux(ALUResultW, ReadDataW, CSRReadValW, MulDivResultW, SCResultW, ResultSrcW, ResultW);	
 /* -----\/----- EXCLUDED -----\/-----
   // This mux4:1 no longer needs to include PCLinkW.  This is set correctly in the execution stage.
   // *** need to look at how the decoder is coded to fix.
diff --git a/wally-pipelined/src/ieu/forward.sv b/wally-pipelined/src/ieu/forward.sv
index 6729ed424..cdc6d2700 100644
--- a/wally-pipelined/src/ieu/forward.sv
+++ b/wally-pipelined/src/ieu/forward.sv
@@ -43,7 +43,7 @@ module forward(
     if (Rs1E != 5'b0)
       if      ((Rs1E == RdM) & RegWriteM) ForwardAE = 2'b10;
       else if ((Rs1E == RdW) & (RegWriteW|FWriteIntW)) ForwardAE = 2'b01;
-      else if ((Rs1E == RdM) & FWriteIntM) ForwardAE = 2'b11;
+     else if ((Rs1E == RdM) & FWriteIntM) ForwardAE = 2'b11;
  
     if (Rs2E != 5'b0)
       if      ((Rs2E == RdM) & RegWriteM) ForwardBE = 2'b10;
diff --git a/wally-pipelined/src/ieu/ieu.sv b/wally-pipelined/src/ieu/ieu.sv
index 73c619f62..0bd9d598f 100644
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@@ -43,10 +43,9 @@ module ieu (
   // Memory stage interface
   input logic 		   DataMisalignedM,
   input logic 		   DataAccessFaultM,
+  input logic 		   SquashSCW,
   input logic	     	   FWriteIntM,
   input  logic [`XLEN-1:0] FWriteDataM,
-  input logic 		       SquashSCM,
-  input  logic [`XLEN-1:0] CSRReadValM, ReadDataM, MulDivResultM, 
   output logic [1:0] 	   MemRWM,
   output logic [1:0] 	   AtomicM,
   output logic [`XLEN-1:0] MemAdrM, WriteDataM,
@@ -56,7 +55,6 @@ module ieu (
   input logic [`XLEN-1:0]  CSRReadValW, ReadDataW, MulDivResultW,
   input logic             FWriteIntW,
   input logic [`XLEN-1:0] FPUResultW,
-  input logic 		   SquashSCW,
   // input  logic [`XLEN-1:0] PCLinkW,
   output logic 		   InstrValidM, InstrValidW,
   // hazards
@@ -74,7 +72,7 @@ module ieu (
   logic [2:0]  FlagsE;
   logic [4:0]  ALUControlE;
   logic        ALUSrcAE, ALUSrcBE;
-  logic [2:0]  ResultSrcM, ResultSrcW;
+  logic [2:0]  ResultSrcW;
   logic       TargetSrcE;
 
   // forwarding signals
diff --git a/wally-pipelined/src/lsu/lsu.sv b/wally-pipelined/src/lsu/lsu.sv
index f44340a45..ffa79adfe 100644
--- a/wally-pipelined/src/lsu/lsu.sv
+++ b/wally-pipelined/src/lsu/lsu.sv
@@ -45,7 +45,6 @@ module lsu (
   output logic [1:0]       AtomicMaskedM,
   output logic             DataMisalignedM,
   output logic             CommittedM,
-  output logic             SquashSCM,
   // Writeback Stage
   input  logic             MemAckW,
   input  logic [`XLEN-1:0] ReadDataW,
@@ -82,6 +81,7 @@ module lsu (
   
 );
 
+  logic SquashSCM;
   logic DTLBPageFaultM;
   logic MemAccessM;
   logic [1:0] CurrState, NextState;
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index 691b3b5ae..e10b0c55d 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -34,7 +34,7 @@ module muldiv (
 	       input logic [2:0] 	Funct3E,
 	       input logic 		MulDivE, W64E,
 	       // Writeback stage
-	       output logic [`XLEN-1:0] MulDivResultM, MulDivResultW,
+	       output logic [`XLEN-1:0] MulDivResultW,
 	       // Divide Done
 	       output logic 		DivDoneE,
 	       output logic 		DivBusyE, 
@@ -44,7 +44,7 @@ module muldiv (
 
    generate
       if (`M_SUPPORTED) begin
-	 logic [`XLEN-1:0] MulDivResultE;
+	 logic [`XLEN-1:0] MulDivResultE, MulDivResultM;
 	 logic [`XLEN-1:0] PrelimResultE;
 	 logic [`XLEN-1:0] QuotE, RemE;
 	 logic [`XLEN*2-1:0] ProdE; 
diff --git a/wally-pipelined/src/privileged/csr.sv b/wally-pipelined/src/privileged/csr.sv
index e88cb561d..213bcde33 100644
--- a/wally-pipelined/src/privileged/csr.sv
+++ b/wally-pipelined/src/privileged/csr.sv
@@ -65,12 +65,12 @@ module csr #(parameter
   input  logic [4:0]       SetFflagsM,
   output logic [2:0]       FRM_REGW, 
 //  output logic [11:0]     MIP_REGW, SIP_REGW, UIP_REGW, MIE_REGW, SIE_REGW, UIE_REGW,
-  output logic [`XLEN-1:0] CSRReadValM, CSRReadValW,
+  output logic [`XLEN-1:0] CSRReadValW,
   output logic             IllegalCSRAccessM
 );
 
   localparam NOP = 32'h13;
-  logic [`XLEN-1:0] CSRMReadValM, CSRSReadValM, CSRUReadValM, CSRNReadValM, CSRCReadValM;
+  logic [`XLEN-1:0] CSRMReadValM, CSRSReadValM, CSRUReadValM, CSRNReadValM, CSRCReadValM, CSRReadValM;
   logic [`XLEN-1:0] CSRSrcM, CSRRWM, CSRRSM, CSRRCM, CSRWriteValM;
  
   logic [`XLEN-1:0] MSTATUS_REGW, SSTATUS_REGW, USTATUS_REGW;
diff --git a/wally-pipelined/src/privileged/privileged.sv b/wally-pipelined/src/privileged/privileged.sv
index ab794a4ad..1275cd4b8 100644
--- a/wally-pipelined/src/privileged/privileged.sv
+++ b/wally-pipelined/src/privileged/privileged.sv
@@ -34,7 +34,7 @@ module privileged (
   input  logic [`XLEN-1:0] SrcAM,
   input  logic [`XLEN-1:0] PCF,PCD,PCE,PCM,
   input  logic [31:0]      InstrD, InstrE, InstrM, InstrW,
-  output logic [`XLEN-1:0] CSRReadValM, CSRReadValW,
+  output logic [`XLEN-1:0] CSRReadValW,
   output logic [`XLEN-1:0] PrivilegedNextPCM,
   output logic             RetM, TrapM, NonBusTrapM,
   output logic             ITLBFlushF, DTLBFlushM,
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index 1fd1408a4..b32770b9a 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -71,7 +71,7 @@ module wallypipelinedhart (
   logic [31:0] InstrD, InstrE, InstrM, InstrW;
   logic [`XLEN-1:0] PCD, PCE, PCM, PCLinkE, PCLinkW;
   logic [`XLEN-1:0] PCTargetE;
-  logic [`XLEN-1:0] CSRReadValM, MulDivResultM, CSRReadValW, MulDivResultW;
+  logic [`XLEN-1:0] CSRReadValW, MulDivResultW;
   logic [`XLEN-1:0] PrivilegedNextPCM;
   logic [1:0] MemRWM;
   logic InstrValidM, InstrValidW;
@@ -96,7 +96,7 @@ module wallypipelinedhart (
   logic [1:0] FMemRWM;
   logic       RegWriteD;
   logic [`XLEN-1:0] FWriteDataM;
-  logic       SquashSCM, SquashSCW;
+  logic       SquashSCW;
   logic       FStallD;
   logic       FWriteIntE, FWriteIntW, FWriteIntM;
   logic             FDivBusyE;
@@ -136,7 +136,7 @@ module wallypipelinedhart (
   logic [2:0]       Funct3M;
   logic [`XLEN-1:0] MemAdrM, WriteDataM;
   logic [`PA_BITS-1:0] MemPAdrM;
-  logic [`XLEN-1:0] ReadDataM, ReadDataW;
+  logic [`XLEN-1:0] ReadDataW;
   logic [`PA_BITS-1:0] InstrPAdrF;
   logic [`XLEN-1:0] InstrRData;
   logic             InstrReadF;
@@ -153,7 +153,8 @@ module wallypipelinedhart (
   logic[`XLEN-1:0] WriteDatatmpM;
 
   logic [4:0]       InstrClassM;
-     
+  
+           
   ifu ifu(.InstrInF(InstrRData), .*); // instruction fetch unit: PC, branch prediction, instruction cache
 
   ieu ieu(.*); // integer execution unit: integer register file, datapath and controller

From 7e3483b28373c1686c9e51c346d8cb2ab73ef92b Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Thu, 24 Jun 2021 18:39:18 -0400
Subject: [PATCH 08/20] FPU forwarding reworked pt.1

---
 wally-pipelined/src/fpu/fctrl.sv              |  20 +-
 wally-pipelined/src/fpu/fpu.sv                | 139 +++---
 wally-pipelined/src/fpu/fpuaddcvt1.sv         |  14 +-
 wally-pipelined/src/fpu/fpuclassify.sv        |  16 +-
 wally-pipelined/src/fpu/fpucmp1.sv            | 269 ++++++++++-
 wally-pipelined/src/fpu/fpucmp2.sv            | 422 +++++++++---------
 wally-pipelined/src/fpu/fpuhazard.sv          |  60 ++-
 wally-pipelined/src/fpu/fsgn.sv               |  16 +-
 wally-pipelined/src/hazard/hazard.sv          |   4 +-
 wally-pipelined/src/ieu/controller.sv         |   8 +-
 wally-pipelined/src/ieu/datapath.sv           |  28 +-
 wally-pipelined/src/ieu/forward.sv            |   8 +-
 wally-pipelined/src/ieu/ieu.sv                |   9 +-
 .../src/wally/wallypipelinedhart.sv           |  31 +-
 .../testbench/testbench-imperas.sv            |   2 +-
 15 files changed, 653 insertions(+), 393 deletions(-)

diff --git a/wally-pipelined/src/fpu/fctrl.sv b/wally-pipelined/src/fpu/fctrl.sv
index 5749d0db7..a9fcb564e 100755
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@@ -6,6 +6,7 @@ module fctrl (
   input  logic [2:0] Funct3D,
   input  logic [2:0] FRM_REGW,
   output logic       IllegalFPUInstrD,
+  output logic       IsFPD,
   output logic       FWriteEnD,
   output logic       FDivStartD,
   output logic [2:0] FResultSelD,
@@ -27,20 +28,19 @@ module fctrl (
   
   //write is enabled for all fp instruciton op codes
   //sans fp load
-  logic isFP, isFPLD;
   always_comb begin
 	//case statement is easier to modify
 	//in case of errors
 	case(OpD)
 		//fp instructions sans load
-		7'b1010011 : isFP = 1'b1;
-		7'b1000011 : isFP = 1'b1;
-		7'b1000111 : isFP = 1'b1;
-		7'b1001011 : isFP = 1'b1;
-		7'b1001111 : isFP = 1'b1;
-		7'b0100111 : isFP = 1'b1;
-		7'b0000111 : isFP = 1'b1;// KEP change 7'b1010011 to 7'b0000111
-		default    : isFP = 1'b0;
+		7'b1010011 : IsFPD = 1'b1;
+		7'b1000011 : IsFPD = 1'b1;
+		7'b1000111 : IsFPD = 1'b1;
+		7'b1001011 : IsFPD = 1'b1;
+		7'b1001111 : IsFPD = 1'b1;
+		7'b0100111 : IsFPD = 1'b1;
+		7'b0000111 : IsFPD = 1'b1;// KEP change 7'b1010011 to 7'b0000111
+		default    : IsFPD = 1'b0;
 	endcase
   end
   
@@ -218,5 +218,5 @@ module fctrl (
   //			is add/cvt       and  is to int  or is classify		 or     is cmp	       	and not max/min or is output ReadData1 and is mv
   assign FWriteIntD = ((FResultSelD == 3'b100)&Funct7D[3]) | (FResultSelD == 3'b101) | ((FResultSelD == 3'b001)&~Funct7D[2]) | ((FResultSelD == 3'b111)&OpD[6]);
   // 		      if not writting to int reg and not a store function and not move
-  assign FWriteEnD = ~FWriteIntD & ~OpD[5] & ~((FResultSelD == 3'b111)&OpD[6]) & isFP;
+  assign FWriteEnD = ~FWriteIntD & ~OpD[5] & ~((FResultSelD == 3'b111)&OpD[6]) & IsFPD;
 endmodule
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index e886c66e3..7f93d33a7 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -30,15 +30,15 @@ module fpu (
   input logic [2:0]        FRM_REGW,   // Rounding mode from CSR
   input logic [31:0]       InstrD,
   input logic [`XLEN-1:0]  ReadDataW,     // Read data from memory
-  input logic 		         RegWriteD,  // register write enable from ieu
   input logic [`XLEN-1:0]  SrcAE,      // Integer input being processed
   input logic [`XLEN-1:0]  SrcAM,      // Integer input being written into fpreg
   input logic 		         StallE, StallM, StallW,
   input logic 		         FlushE, FlushM, FlushW,
-  output logic [1:0] 	   FMemRWM,    // Read/write enable for memory {read, write}
+  output logic  	         IsFPD, IsFPE,    // Read/write enable for memory {read, write}
   output logic 		      FStallD,    // Stall the decode stage if Div/Sqrt instruction
   output logic 		      FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
-  output logic [`XLEN-1:0] FWriteDataM,      // Data to be written to memory
+  output logic [`XLEN-1:0] FWriteDataE,      // Data to be written to memory
+  output logic [`XLEN-1:0] FIntResM,     
   output logic 		      FDivBusyE,        // Is the divison/sqrt unit busy
   output logic 		      IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
   output logic [4:0] 	   SetFflagsM,       // FPU flags
@@ -51,24 +51,27 @@ module fpu (
    logic 		   FDivStartD, FDivStartE;                                  // Start division
    logic 		   FWriteIntD;                                              // Write to integer register
    logic 		   FOutputInput2D, FOutputInput2E;                          // Put Input2 in Input1 if a store instruction
-   logic [1:0] 	FMemRWD, FMemRWE;                                        // Read and write enable for memory
-   logic [1:0]    FForwardInput1D, FForwardInput1E;                        // Input1 forwarding mux control signal
-   logic [1:0] 	FForwardInput2D, FForwardInput2E;                        // Input2 forwarding mux control signal
-   logic 		   FForwardInput3D, FForwardInput3E;                        // Input3 forwarding mux control signal
-   logic 		   FInput2UsedD;                                            // Is input 2 used
-   logic 		   FInput3UsedD;                                            // Is input 3 used
+   logic [1:0] 	FMemRWD;                                        // Read and write enable for memory
+   logic [1:0]    ForwardXD, ForwardXE;                        // Input1 forwarding mux control signal
+   logic [1:0] 	ForwardYD, ForwardYE;                        // Input2 forwarding mux control signal
+   logic [1:0]		   ForwardZD, ForwardZE;                        // Input3 forwarding mux control signal
+   logic 		   SrcYUsedD;                                            // Is input 2 used
+   logic 		   SrcZUsedD;                                            // Is input 3 used
    logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
    logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM, FOpCtrlW;                  // Select which opperation to do in each component
    logic          SelLoadInputE, SelLoadInputM;                            // Select which adress to load when single precision
+   logic       FInput2UsedD, FInput3UsedD;                                   
+   logic [4:0] 	Adr1E, Adr2E, Adr3E;
    
    // regfile signals
    logic [4:0]    RdE, RdM, RdW;                                           // what adress to write to    // ***Can take from ieu insted of pipelining
    logic [63:0] 	FWDM;                                                    // Write data for FP register
    logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
    logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
-   logic [63:0] 	FInput1E, FInput1M, FInput1W, FInput1tmpE;                         // Input 1 to the various units (after forwarding)
-   logic [63:0] 	FInput2E, FInput2M;                                      // Input 2 to the various units (after forwarding)
-   logic [63:0] 	FInput3E, FInput3M;                                      // Input 3 to the various units (after forwarding)
+   logic [63:0] 	SrcXE, SrcXM, SrcXW;                         // Input 1 to the various units (after forwarding)
+   logic [`XLEN-1:0]   SrcXMAligned;
+   logic [63:0] 	SrcYE, SrcYM, SrcYW;                                      // Input 2 to the various units (after forwarding)
+   logic [63:0] 	SrcZE, SrcZM;                                      // Input 3 to the various units (after forwarding)
    logic [63:0] 	FLoadResultW, FLoadStoreResultM, FLoadStoreResultW;      // Result for load, store, and move to int-reg instructions
    
    // div/sqrt signals
@@ -123,19 +126,13 @@ module fpu (
    logic [4:0] 	FAddFlagsM, FAddFlagsW;
    
    // cmp signals 
-   logic [7:0] 	WE, WM;
-   logic [7:0] 	XE, XM;
-   logic 		   ANaNE, ANaNM;
-   logic 		   BNaNE, BNaNM;
-   logic 		   AzeroE, AzeroM;
-   logic 		   BzeroE, BzeroM;
-   logic 		   CmpInvalidM, CmpInvalidW;
-   logic [1:0] 	CmpFCCM, CmpFCCW; 
-   logic [63:0] 	FCmpResultM, FCmpResultW;
+   logic 		   CmpInvalidE, CmpInvalidM, CmpInvalidW;
+   logic [63:0] 	FCmpResultE, FCmpResultM, FCmpResultW;
    
    // fsgn signals
    logic [63:0] 	SgnResultE, SgnResultM, SgnResultW;
    logic [4:0] 	SgnFlagsE, SgnFlagsM, SgnFlagsW;
+   logic [63:0]   FResM;
    
    // instantiation of W stage regfile signals
    logic [63:0] 	AlignedSrcAM, ForwardSrcAM, SrcAW;
@@ -150,8 +147,6 @@ module fpu (
    
    //DECODE STAGE
    
-   // Hazard unit for FPU
-   fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
    
    // top-level controller for FPU
    fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
@@ -172,22 +167,45 @@ module fpu (
    //*****************
    // other  D/E pipe registers
    //*****************
-   flopenrc #(64) DEReg14(clk, reset, FlushE, ~StallE, FPUResult64W, FPUResult64E);
-   flopenrc #(28) CtrlRegE(clk, reset, FlushE, ~StallE, 
-                        {FWriteEnD, FResultSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FDivStartD, FForwardInput1D, FForwardInput2D, FForwardInput3D, FWriteIntD, FOutputInput2D, FMemRWD, InstrD[15]},
-                        {FWriteEnE, FResultSelE, FrmE, FmtE, RdE,          FOpCtrlE, FDivStartE, FForwardInput1E, FForwardInput2E, FForwardInput3E, FWriteIntE, FOutputInput2E, FMemRWE, SelLoadInputE});
-  
+   // flopenrc #(64) DEReg14(clk, reset, FlushE, ~StallE, FPUResult64W, FPUResult64E);
+   // flopenrc #(1) CtrlRegE1(clk, reset, FlushE, ~StallE, FWriteEnD, FWriteEnE);
+   // flopenrc #(3) CtrlRegE2(clk, reset, FlushE, ~StallE, FResultSelD, FResultSelE);
+   // flopenrc #(3) CtrlRegE3(clk, reset, FlushE, ~StallE, FrmD, FrmE);
+   // flopenrc #(1) CtrlRegE4(clk, reset, FlushE, ~StallE, FmtD, FmtE);
+   // flopenrc #(5) CtrlRegE5(clk, reset, FlushE, ~StallE, InstrD[11:7], RdE);
+   // flopenrc #(4) CtrlRegE6(clk, reset, FlushE, ~StallE, FOpCtrlD, FOpCtrlE);
+   flopenrc #(1) CtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
+   flopenrc #(15) CtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+                                                      {Adr1E,         Adr2E,         Adr3E});
+   // flopenrc #(1) CtrlRegE8(clk, reset, FlushE, ~StallE, FWriteIntD, FWriteIntE);
+   // flopenrc #(1) CtrlRegE9(clk, reset, FlushE, ~StallE, FOutputInput2D, FOutputInput2E);
+   // flopenrc #(2) CtrlRegE10(clk, reset, FlushE, ~StallE, FMemRWD, FMemRWE);
+   // flopenrc #(1) CtrlRegE11(clk, reset, FlushE, ~StallE, InstrD[15], SelLoadInputE);
+   flopenrc #(20) CtrlRegE(clk, reset, FlushE, ~StallE, 
+                        {FWriteEnD, FResultSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD, InstrD[15],    IsFPD},
+                        {FWriteEnE, FResultSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE, SelLoadInputE, IsFPE});
+
    //EXECUTION STAGE
    
    // input muxs for forwarding   
-   mux2  #(64)  SrcAMuxForward({SrcAM[31:0], 32'b0}, {SrcAM, {64-`XLEN{1'b0}}}, FmtM, ForwardSrcAM);
-   mux4  #(64)  FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, ForwardSrcAM, FForwardInput1E, FInput1tmpE);
-   mux3  #(64)  FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E);
-   mux2  #(64)  FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E);
-   mux2  #(64)  FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E);
+   // single vs double for SRCAM
+   // mux2  #(64)  SrcAMuxForward({SrcAM[31:0], 32'b0}, {SrcAM, {64-`XLEN{1'b0}}}, FmtM, ForwardSrcAM);
+   // //input 1 forwarding mux
+   // mux4  #(64)  SrcXEmux(FRD1E, FPUResult64W, FPUResult64E, ForwardSrcAM, ForwardXE, SrcXtmpE);
+   // mux3  #(64)  SrcYEmux(FRD2E, FPUResult64W, FPUResult64E, ForwardYE, SrcYE);
+   // mux2  #(64)  SrcZEmux(FRD3E, FPUResult64E, ForwardZE, SrcZE);
+   // mux2  #(64)  FOutputInput2mux(SrcXtmpE, SrcYE, FOutputInput2E, SrcXE);
+   
+   // Hazard unit for FPU
+   fpuhazard hazard(.*);
+
+   mux3  #(64)  fxemux(FRD1E, FPUResult64W, FResM, ForwardXE, SrcXE);
+   mux3  #(64)  fyemux(FRD2E, FPUResult64W, FResM, ForwardYE, SrcYE);
+   mux3  #(64)  fzemux(FRD3E, FPUResult64W, FResM, ForwardZE, SrcZE);
+
    
    // first of two-stage instance of floating-point fused multiply-add unit
-   fma1 fma1 (.X(FInput1E), .Y(FInput2E), .Z(FInput3E), .FOpCtrlE(FOpCtrlE[2:0]),.*);
+   fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE(FOpCtrlE[2:0]),.*);
    
    // first and only instance of floating-point divider
    logic fpdivClk;
@@ -198,10 +216,10 @@ module fpu (
 			.ECLK(fpdivClk));
    
    // capture the inputs for div/sqrt	 
-   flopenrc #(64) reg_input1 (.d(FInput1E), .q(DivInput1E),
+   flopenrc #(64) reg_input1 (.d(SrcXE), .q(DivInput1E),
                .en(~HoldInputs), .clear(FDivSqrtDoneE),
                .reset(reset),  .clk(clk));
-   flopenrc #(64) reg_input2 (.d(FInput2E), .q(DivInput2E),
+   flopenrc #(64) reg_input2 (.d(SrcYE), .q(DivInput2E),
                .en(~HoldInputs), .clear(FDivSqrtDoneE),
                .reset(reset),  .clk(clk));
 
@@ -211,20 +229,21 @@ module fpu (
    fpuaddcvt1 fpadd1 (.*);
    
    // first of two-stage instance of floating-point comparator
-   fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]);
+   fpucmp1 fpcmp1 (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpInvalidE, FCmpResultE);
    
    // first and only instance of floating-point sign converter
    fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*);
    
    // first and only instance of floating-point classify unit
    fpuclassify fpuclass (.*);
+   assign FWriteDataE = FmtE ? SrcYE[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcYE[63:32]};
    
    //*****************
    //fpregfile D/E pipe registers
    //*****************
-   flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FInput1E, FInput1M);
-   flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, FInput2E, FInput2M);
-   flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, FInput3E, FInput3M);
+   flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, SrcXE, SrcXM);
+   flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, SrcYE, SrcYM);
+   flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, SrcZE, SrcZM);
    
    //*****************
    // fma E/M pipe registers
@@ -276,12 +295,15 @@ module fpu (
    //*****************
    // fpcmp E/M pipe registers
    //*****************
-   flopenrc #(8) EMRegCmp1(clk, reset, FlushM, ~StallM, WE, WM); 
-   flopenrc #(8) EMRegCmp2(clk, reset, FlushM, ~StallM, XE, XM); 
-   flopenrc #(1) EMRegcmp3(clk, reset, FlushM, ~StallM, ANaNE, ANaNM); 
-   flopenrc #(1) EMRegCmp4(clk, reset, FlushM, ~StallM, BNaNE, BNaNM); 
-   flopenrc #(1) EMRegCmp5(clk, reset, FlushM, ~StallM, AzeroE, AzeroM); 
-   flopenrc #(1) EMRegCmp6(clk, reset, FlushM, ~StallM, BzeroE, BzeroM); 
+   // flopenrc #(8) EMRegCmp1(clk, reset, FlushM, ~StallM, WE, WM); 
+   // flopenrc #(8) EMRegCmp2(clk, reset, FlushM, ~StallM, XE, XM); 
+   // flopenrc #(1) EMRegcmp3(clk, reset, FlushM, ~StallM, ANaNE, ANaNM); 
+   // flopenrc #(1) EMRegCmp4(clk, reset, FlushM, ~StallM, BNaNE, BNaNM); 
+   // flopenrc #(1) EMRegCmp5(clk, reset, FlushM, ~StallM, AzeroE, AzeroM); 
+   // flopenrc #(1) EMRegCmp6(clk, reset, FlushM, ~StallM, BzeroE, BzeroM); 
+   flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpInvalidE, CmpInvalidM); 
+   // flopenrc #(2)  EMRegCmp2(clk, reset, FlushM, ~StallM, CmpFCCE, CmpFCCM); 
+   flopenrc #(64) EMRegCmp3(clk, reset, FlushM, ~StallM, FCmpResultE, FCmpResultM); 
    
    // put this in for the event we want to delay fsgn - will otherwise bypass
    //*****************
@@ -300,7 +322,7 @@ module fpu (
    flopenrc #(5) EMReg5(clk, reset, FlushM, ~StallM, RdE, RdM);
    flopenrc #(4) EMReg6(clk, reset, FlushM, ~StallM, FOpCtrlE, FOpCtrlM);
    flopenrc #(1) EMReg7(clk, reset, FlushM, ~StallM, FWriteIntE, FWriteIntM);
-   flopenrc #(2) EMReg8(clk, reset, FlushM, ~StallM, FMemRWE, FMemRWM);
+   // flopenrc #(2) EMReg8(clk, reset, FlushM, ~StallM, FMemRWE, FMemRWM);
    flopenrc #(1) EMReg9(clk, reset, FlushM, ~StallM, SelLoadInputE, SelLoadInputM);
    
    //*****************
@@ -310,32 +332,35 @@ module fpu (
    
    //BEGIN MEMORY STAGE
    
-   assign FWriteDataM = FmtM ? FInput1M[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FInput1M[63:32]};
+   mux2  #(64)  FResMux(AlignedSrcAM, SgnResultM, FResultSelM == 3'b011, FResM);
+   assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
+   mux3  #(`XLEN)  IntResMux(SrcXMAligned, FCmpResultM[`XLEN-1:0], ClassResultM[`XLEN-1:0], {FResultSelM == 3'b101, FResultSelM == 3'b001}, FIntResM);
+
    //adjecent adress values are sent to the FPU, select the correct one
    //    -imm is 80000 most of the time vs the error one which is 00000
    // mux3  #(64)  FLoadResultMux({HRDATA[31:0], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA[`AHBW-1:`AHBW-32], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA, {64-`AHBW{1'b0}}}, {FmtM, SelLoadInputM}, FLoadResultM);
-   // mux2  #(64)  FLoadStoreResultMux(FLoadResultM, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
+   // mux2  #(64)  FLoadStoreResultMux(FLoadResultM, SrcXM, |FOpCtrlM[2:1], FLoadStoreResultM);
    
-   fma2 fma2(.X(FInput1M), .Y(FInput2M), .Z(FInput3M), .FOpCtrlM(FOpCtrlM[2:0]), .*);
+   fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .*);
    
    // second instance of two-stage floating-point add/cvt unit
    fpuaddcvt2 fpadd2 (.*);
    
    // second instance of two-stage floating-point comparator
-   fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), 
-		   .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*);
+   // fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), 
+	// 	   .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(SrcXM), .op2(SrcYM), .*);
 
    // Align SrcA to MSB when single precicion
    mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
       
 
 
-
       
    //*****************
    //fpregfile M/W pipe registers
    //*****************
-   flopenrc #(64) MWFpReg1(clk, reset, FlushW, ~StallW, FInput1M, FInput1W);
+   flopenrc #(64) MWFpReg1(clk, reset, FlushW, ~StallW, SrcXM, SrcXW);
+   flopenrc #(64) MWFpReg2(clk, reset, FlushW, ~StallW, SrcYM, SrcYW);
    
    //*****************
    // fma M/W pipe registers
@@ -360,7 +385,7 @@ module fpu (
    // fpcmp M/W pipe registers
    //*****************
    flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpInvalidM, CmpInvalidW); 
-   flopenrc #(2) MWRegCmp2(clk, reset, FlushW, ~StallW, CmpFCCM, CmpFCCW); 
+   // flopenrc #(2) MWRegCmp2(clk, reset, FlushW, ~StallW, CmpFCCM, CmpFCCW); 
    flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, FCmpResultM, FCmpResultW); 
    
    //*****************
@@ -396,10 +421,10 @@ module fpu (
    
 
    // mux3  #(64)  FLoadResultMux({ReadD[31:0], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA[`AHBW-1:`AHBW-32], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA, {64-`AHBW{1'b0}}}, {FmtM, SelLoadInputM}, FLoadResultM);
-   // mux2  #(64)  FLoadStoreResultMux(FLoadResultM, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
+   // mux2  #(64)  FLoadStoreResultMux(FLoadResultM, SrcXM, |FOpCtrlM[2:1], FLoadStoreResultM);
    //***RV32D needs to give two bus transactions
     mux2  #(64)  FLoadResultMux({ReadDataW[31:0], {32{1'b0}}}, {ReadDataW, {64-`XLEN{1'b0}}}, FmtW, FLoadResultW);
-    mux2  #(64)  FLoadStoreResultMux(FLoadResultW, FInput1W, |FOpCtrlW[2:1], FLoadStoreResultW);
+    mux2  #(64)  FLoadStoreResultMux(FLoadResultW, SrcYW, |FOpCtrlW[2:1], FLoadStoreResultW);
 
 
 
diff --git a/wally-pipelined/src/fpu/fpuaddcvt1.sv b/wally-pipelined/src/fpu/fpuaddcvt1.sv
index febd47d1b..8f045dcdb 100755
--- a/wally-pipelined/src/fpu/fpuaddcvt1.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt1.sv
@@ -27,10 +27,10 @@
 //
 
 
-module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, FInput1E, FInput2E, FOpCtrlE, FmtE);
+module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, SrcXE, SrcYE, FOpCtrlE, FmtE);
 
-   input logic [63:0] FInput1E;		// 1st input operand (A)
-   input logic [63:0] FInput2E;		// 2nd input operand (B)
+   input logic [63:0] SrcXE;		// 1st input operand (A)
+   input logic [63:0] SrcYE;		// 2nd input operand (B)
    input logic [3:0]	FOpCtrlE;	// Function opcode
    input logic 	FmtE;   		// Result Precision (1 for double, 0 for single)
 
@@ -81,12 +81,12 @@ module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE,
    // and the sign of the first operand is set appropratiately based on
    // if the operation is absolute value or negation. 
 
-   convert_inputs conv1 (AddFloat1E, AddFloat2E, FInput1E, FInput2E, FOpCtrlE, P);
+   convert_inputs conv1 (AddFloat1E, AddFloat2E, SrcXE, SrcYE, FOpCtrlE, P);
 
    // Test for exceptions and return the "Invalid Operation" and
    // "Denormalized" Input Flags. The "AddSelInvE" is used in
    // the third pipeline stage to select the result. Also, AddOp1NormE
-   // and AddOp2NormE are one if FInput1E and FInput2E are not zero or denormalized.
+   // and AddOp2NormE are one if SrcXE and SrcYE are not zero or denormalized.
    // sub is one if the effective operation is subtaction. 
 
    exception exc1 (AddSelInvE, AddInvalidE, AddDenormInE, AddOp1NormE, AddOp2NormE, sub, 
@@ -159,8 +159,8 @@ module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE,
 
    // Place either the sign-extened 32-bit value or the original 64-bit value 
    // into IntValue (to be used for integer to floating point conversion)
-   assign IntValue [31:0] = FInput1E[31:0];
-   assign IntValue [63:32] = FOpCtrlE[0] ? {32{FInput1E[31]}} : FInput1E[63:32];
+   assign IntValue [31:0] = SrcXE[31:0];
+   assign IntValue [63:32] = FOpCtrlE[0] ? {32{SrcXE[31]}} : SrcXE[63:32];
 
    // If doing an integer to floating point conversion, mantissaA3 is set to 
    // IntVal and the prenomalized exponent is set to 1084. Otherwise, 
diff --git a/wally-pipelined/src/fpu/fpuclassify.sv b/wally-pipelined/src/fpu/fpuclassify.sv
index 1000bdf42..b320b2f07 100644
--- a/wally-pipelined/src/fpu/fpuclassify.sv
+++ b/wally-pipelined/src/fpu/fpuclassify.sv
@@ -1,7 +1,8 @@
+
 `include "wally-config.vh"
 
 module fpuclassify (
-    input  logic [63:0] FInput1E,
+    input  logic [63:0] SrcXE,
     input  logic        FmtE,           // 0-single 1-double
     output logic [63:0] ClassResultE
     );
@@ -13,9 +14,9 @@ module fpuclassify (
     logic ExpNotZero, ExpOnes, ManNotZero, ExpZero, ManZero, FirstBitMan;
    
     // single and double precision layouts
-    assign single = FInput1E[63:32];
-    assign double = FInput1E;
-    assign sign = FInput1E[63];
+    assign single = SrcXE[63:32];
+    assign double = SrcXE;
+    assign sign = SrcXE[63];
 
     // basic calculations for readabillity
     assign ExpNotZero = FmtE ? |double[62:52] : |single[30:23];
@@ -43,10 +44,7 @@ module fpuclassify (
     //  bit 7 - +infinity
     //  bit 8 - signaling NaN
     //  bit 9 - quiet NaN
-    assign ClassResultE = FmtE ? {{54{1'b0}}, FirstBitMan&NaN, ~FirstBitMan&NaN, ~sign&infinity, ~sign&normal, 
-                                    ~sign&subnormal, ~sign&zero, sign&zero, sign&subnormal, sign&normal, sign&infinity} : 
-				 {{22{1'b0}}, FirstBitMan&NaN, ~FirstBitMan&NaN, ~sign&infinity, ~sign&normal, 
-                                    ~sign&subnormal, ~sign&zero, sign&zero, sign&subnormal, sign&normal, sign&infinity, {32{1'b0}}};
-
+    assign ClassResultE = {{54{1'b0}}, FirstBitMan&NaN, ~FirstBitMan&NaN, ~sign&infinity, ~sign&normal, 
+                                    ~sign&subnormal, ~sign&zero, sign&zero, sign&subnormal, sign&normal, sign&infinity};
 
 endmodule
diff --git a/wally-pipelined/src/fpu/fpucmp1.sv b/wally-pipelined/src/fpu/fpucmp1.sv
index 1cf267f22..3a8245e63 100755
--- a/wally-pipelined/src/fpu/fpucmp1.sv
+++ b/wally-pipelined/src/fpu/fpucmp1.sv
@@ -1,3 +1,4 @@
+
 //
 // File name : fpcomp.v
 // Title     : Floating-Point Comparator
@@ -17,9 +18,9 @@
 //     and correct for sign bits
 //
 // This module takes 64-bits inputs op1 and op2, VSS, and VDD
-// signals, and a 2-bit signal Sel that indicates the type of 
+// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
 // operands being compared as indicated below.
-//	Sel	Description
+//	FOpCtrlE	Description
 //	 00	double precision numbers
 //	 01	single precision numbers
 //	 10	half precision numbers
@@ -37,24 +38,41 @@
 // It also produces an invalid operation flag, which is one
 // if either of the input operands is a signaling NaN per 754
 
-module fpucmp1 (w, x, ANaN, BNaN, Azero, Bzero, op1, op2, Sel);///***fix Sel to match spec
-   
-   input logic [63:0] op1; 
-   input logic [63:0] op2;
-   input logic [1:0]  Sel;
+`include "wally-config.vh"
+module fpucmp1 (   
+   input logic [63:0] op1, 
+   input logic [63:0] op2,
+   input logic [2:0]  FOpCtrlE,
+   input logic 	      FmtE,
 
-   output logic [7:0]	      w, x;
-   output logic	      ANaN, BNaN;
-   output logic	      Azero, Bzero;
+   
+   output logic       Invalid, 		 // Invalid Operation
+   // output logic [1:0] FCC,  		 // Condition Codes 
+   output logic [63:0] FCmpResultE);
+   // Perform magnitude comparison between the 63 least signficant bits
+   // of the input operands. Only LT and EQ are returned, since GT can
+   // be determined from these values. 
+   logic [1:0] FCC;  		 // Condition Codes 
+   logic [7:0]	      w, x;
+   logic	      ANaN, BNaN;
+   logic	      Azero, Bzero;
+   logic 	      LT;                // magnitude op1 < magnitude op2
+   logic 	      EQ;                // magnitude op1 = magnitude op2
+   
+   magcompare64b_1 magcomp1 (w, x, {~op1[63], op1[62:0]}, {~op2[63], op2[62:0]});
+
+   // Determine final values based on output of magnitude comparison, 
+   // sign bits, and special case testing. 
+   exception_cmp_1 exc1 (ANaN, BNaN, Azero, Bzero, op1, op2, FOpCtrlE);
    
    // Perform magnitude comparison between the 63 least signficant bits
    // of the input operands. Only LT and EQ are returned, since GT can
    // be determined from these values. 
-   magcompare64b_1 magcomp2 (w, x, {~op1[63], op1[62:0]}, {~op2[63], op2[62:0]});
+   magcompare64b_2 magcomp2 (LT, EQ, w, x);
 
    // Determine final values based on output of magnitude comparison, 
    // sign bits, and special case testing. 
-   exception_cmp_1 exc1 (ANaN, BNaN, Azero, Bzero, op1, op2, Sel);
+   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .FOpCtrlE(FOpCtrlE), .A(op1), .B(op2), .*);
 
 endmodule // fpcomp
 
@@ -178,9 +196,9 @@ module magcompare64b_1 (w, x,  A, B);
 endmodule // magcompare64b
 
 // This module takes 64-bits inputs A and B, two magnitude comparison
-// flags LT_mag and EQ_mag, and a 2-bit signal Sel that indicates the type of 
+// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
 // operands being compared as indicated below.
-//	Sel	Description
+//	FOpCtrlE	Description
 //	 00	double precision numbers
 //	 01	single precision numbers
 //	 10	half precision numbers
@@ -196,11 +214,11 @@ endmodule // magcompare64b
 // It also produces a invalid operation flag, which is one
 // if either of the input operands is a signaling NaN.
 
-module exception_cmp_1 (ANaN, BNaN, Azero, Bzero, A, B, Sel);
+module exception_cmp_1 (ANaN, BNaN, Azero, Bzero, A, B, FOpCtrlE);
 
    input logic [63:0] A;
    input logic [63:0] B;
-   input logic [1:0]  Sel;
+   input logic [2:0]  FOpCtrlE;
 
    logic 		      dp, sp, hp;
 
@@ -209,9 +227,9 @@ module exception_cmp_1 (ANaN, BNaN, Azero, Bzero, A, B, Sel);
    output logic               Azero;
    output logic               Bzero;
 
-   assign dp = !Sel[1]&!Sel[0];
-   assign sp = !Sel[1]&Sel[0];
-   assign hp = Sel[1]&!Sel[0];
+   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
+   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
+   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
 
    // Test if A or B is NaN.
    assign ANaN = (A[62]&A[61]&A[60]&A[59]&A[58]) & 
@@ -232,3 +250,216 @@ module exception_cmp_1 (ANaN, BNaN, Azero, Bzero, A, B, Sel);
    assign Bzero = (B[62:0] == 63'h0);
 
 endmodule // exception_cmp
+//
+// File name : fpcomp.v
+// Title     : Floating-Point Comparator
+// project   : FPU
+// Library   : fpcomp
+// Author(s) : James E. Stine
+// Purpose   : definition of main unit to floating-point comparator
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Floating Point Comparator (Algorithm)
+//
+// 1.) Performs sign-extension if the inputs are 32-bit integers.
+// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
+// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
+//     and correct for sign bits
+//
+// This module takes 64-bits inputs op1 and op2, VSS, and VDD
+// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	(unused)
+//
+// The comparator produces a 2-bit signal FCC, which
+// indicates the result of the comparison:
+//
+//     fcc 	decscription
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+//
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN per 754
+
+
+/*module magcompare2b (LT, GT, A, B);
+
+   input logic [1:0] A;
+   input logic [1:0] B;
+   
+   output logic     LT;
+   output logic     GT;
+
+   // Determine if A < B  using a minimized sum-of-products expression
+   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+   // Determine if A > B  using a minimized sum-of-products expression
+   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
+
+endmodule*/ // magcompare2b
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
+// this version actually incorporates don't cares into the equation to
+// simplify the optimization
+
+// module magcompare2c (LT, GT, A, B);
+
+//    input logic [1:0] A;
+//    input logic [1:0] B;
+   
+//    output logic      LT;
+//    output logic      GT;
+
+//    assign LT = B[1] | (!A[1]&B[0]);
+//    assign GT = A[1] | (!B[1]&A[0]);
+
+// endmodule // magcompare2b
+
+// This module compares two 64-bit values A and B. LT is '1' if A < B 
+// and EQ is '1'if A = B. LT and GT are both '0' if A > B.
+// This structure was modified so
+// that it only does a strict magnitdude comparison, and only
+// returns flags for less than (LT) and eqaual to (EQ). It uses a tree 
+// of 63 2-bit magnitude comparators, followed by one OR gates.
+//
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+module magcompare64b_2 (LT, EQ, w, x);
+
+   input logic [7:0]  w;
+   input logic [7:0]  x;
+   logic [3:0] 	      y;
+   logic [3:0] 	      z;
+   logic [1:0] 	      a;
+   logic [1:0] 	      b;   
+   logic 	      GT;
+   
+   output logic       LT;
+   output logic       EQ;
+   
+   magcompare2c mag39(y[0], z[0], x[1:0], w[1:0]);
+   magcompare2c mag3A(y[1], z[1], x[3:2], w[3:2]);
+   magcompare2c mag3B(y[2], z[2], x[5:4], w[5:4]);
+   magcompare2c mag3C(y[3], z[3], x[7:6], w[7:6]);
+   
+   magcompare2c mag3D(a[0], b[0], z[1:0], y[1:0]);
+   magcompare2c mag3E(a[1], b[1], z[3:2], y[3:2]);
+   
+   magcompare2c mag3F(LT, GT, b[1:0], a[1:0]);
+
+   assign EQ = ~(LT | GT);
+
+endmodule // magcompare64b
+
+// This module takes 64-bits inputs A and B, two magnitude comparison
+// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	bfloat precision numbers
+//
+// The comparator produces a 2-bit signal fcc, which
+// indicates the result of the comparison as follows:
+//     fcc 	decscription
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+// It also produces a invalid operation flag, which is one
+// if either of the input operands is a signaling NaN.
+
+module exception_cmp_2 (
+   input logic [63:0] A,
+   input logic [63:0] B,
+   input logic 	      FmtE,
+   input logic 	      LT_mag,
+   input logic 	      EQ_mag,
+   input logic [2:0]  FOpCtrlE,
+   
+   output logic       invalid,
+   output logic [1:0] fcc,
+   output logic [63:0] FCmpResultE,
+
+   input logic 	      Azero,
+   input logic 	      Bzero,   
+   input logic 	      ANaN,
+   input logic 	      BNaN);
+   
+   logic 	      dp;   
+   logic 	      sp;
+   logic 	      hp;   
+   logic 	      ASNaN;
+   logic 	      BSNaN;
+   logic 	      UO;
+   logic 	      GT;
+   logic 	      LT;
+   logic 	      EQ;
+   logic [62:0]       sixtythreezeros = 63'h0;
+
+   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
+   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
+   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
+
+   // Values are unordered if ((A is NaN) OR (B is NaN)) AND (a floating 
+   // point comparison is being performed. 
+   assign UO = (ANaN | BNaN);
+
+   // Test if A or B is a signaling NaN.
+   assign ASNaN = ANaN & (sp&~A[53] | dp&~A[50] | hp&~A[56]);
+   assign BSNaN = BNaN & (sp&~B[53] | dp&~B[50] | hp&~B[56]);
+
+   // If either A or B is a signaling NaN the "Invalid Operation"
+   // exception flag is set to one; otherwise it is zero.    
+   assign invalid = (ASNaN | BSNaN);
+
+   // A and B are equal if (their magnitudes are equal) AND ((their signs are
+   // equal) or (their magnitudes are zero AND they are floating point
+   // numbers)). Also, A and B are not equal if they are unordered.
+   assign EQ = (EQ_mag | (Azero&Bzero)) & (~UO);
+   
+   // A is less than B if (A is negative and B is posiive) OR
+   // (A and B are positive and the magnitude of A is less than
+   // the magnitude of B) or (A and B are negative integers and
+   // the magnitude of A is less than the magnitude of B) or
+   // (A and B are negative floating point numbers and
+   // the magnitude of A is greater than the magnitude of B).
+   // Also, A is not less than B if A and B are equal or unordered.
+   assign LT = ((~LT_mag & A[63] & B[63]) |
+		(LT_mag & ~(A[63] & B[63])))&~EQ&~UO;
+   
+   // A is greater than B when LT, EQ, and UO are are false.
+   assign GT = ~(LT | EQ | UO);
+
+   // Note: it may be possible to optimize the setting of fcc 
+   // a little more, but it is probably not worth the effort. 
+
+   // Set the bits of fcc based on LT, GT, EQ, and UO
+   assign fcc[0] = LT | UO;
+   assign fcc[1] = GT | UO;  
+
+   always_comb begin
+      case (FOpCtrlE[2:0])
+         3'b111: FCmpResultE = LT ? A : B;//min 
+         3'b101: FCmpResultE = GT ? A : B;//max
+         3'b010: FCmpResultE = {63'b0, EQ};//equal
+         3'b001: FCmpResultE = {63'b0, LT};//less than
+         3'b011: FCmpResultE = {63'b0, LT|EQ};//less than or equal
+         default: FCmpResultE = 64'b0;
+      endcase
+   end 
+
+endmodule // exception_cmp
diff --git a/wally-pipelined/src/fpu/fpucmp2.sv b/wally-pipelined/src/fpu/fpucmp2.sv
index 42a780ac1..ee14afb94 100755
--- a/wally-pipelined/src/fpu/fpucmp2.sv
+++ b/wally-pipelined/src/fpu/fpucmp2.sv
@@ -1,243 +1,243 @@
-//
-// File name : fpcomp.v
-// Title     : Floating-Point Comparator
-// project   : FPU
-// Library   : fpcomp
-// Author(s) : James E. Stine
-// Purpose   : definition of main unit to floating-point comparator
-// notes :   
-//
-// Copyright Oklahoma State University
-//
-// Floating Point Comparator (Algorithm)
-//
-// 1.) Performs sign-extension if the inputs are 32-bit integers.
-// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
-// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
-//     and correct for sign bits
-//
-// This module takes 64-bits inputs op1 and op2, VSS, and VDD
-// signals, and a 2-bit signal Sel that indicates the type of 
-// operands being compared as indicated below.
-//	Sel	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	(unused)
-//
-// The comparator produces a 2-bit signal FCC, which
-// indicates the result of the comparison:
-//
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-//
-// It also produces an invalid operation flag, which is one
-// if either of the input operands is a signaling NaN per 754
+// //
+// // File name : fpcomp.v
+// // Title     : Floating-Point Comparator
+// // project   : FPU
+// // Library   : fpcomp
+// // Author(s) : James E. Stine
+// // Purpose   : definition of main unit to floating-point comparator
+// // notes :   
+// //
+// // Copyright Oklahoma State University
+// //
+// // Floating Point Comparator (Algorithm)
+// //
+// // 1.) Performs sign-extension if the inputs are 32-bit integers.
+// // 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
+// // 3.) Check for special cases (+0=-0, unordered, and infinite values) 
+// //     and correct for sign bits
+// //
+// // This module takes 64-bits inputs op1 and op2, VSS, and VDD
+// // signals, and a 2-bit signal Sel that indicates the type of 
+// // operands being compared as indicated below.
+// //	Sel	Description
+// //	 00	double precision numbers
+// //	 01	single precision numbers
+// //	 10	half precision numbers
+// //	 11	(unused)
+// //
+// // The comparator produces a 2-bit signal FCC, which
+// // indicates the result of the comparison:
+// //
+// //     fcc 	decscription
+// //      00	A = B	
+// //      01	A < B	
+// //      10	A > B	
+// //      11	A and B	are unordered (i.e., A or B is NaN)
+// //
+// // It also produces an invalid operation flag, which is one
+// // if either of the input operands is a signaling NaN per 754
 
-module fpucmp2 (   
-   input logic [63:0] op1, 
-   input logic [63:0] op2,
-   input logic [1:0]  Sel,
-   input logic [7:0]  w, x,
-   input logic        ANaN, BNaN,
-   input logic        Azero, Bzero,
-   input logic [3:0]  FOpCtrlM,
-   input logic 	      FmtM,
+// module fpucmp2 (   
+//    input logic [63:0] op1, 
+//    input logic [63:0] op2,
+//    input logic [1:0]  Sel,
+//    input logic [7:0]  w, x,
+//    input logic        ANaN, BNaN,
+//    input logic        Azero, Bzero,
+//    input logic [3:0]  FOpCtrlM,
+//    input logic 	      FmtM,
    
-   output logic       Invalid, 		 // Invalid Operation
-   output logic [1:0] FCC,  		 // Condition Codes 
-   output logic [63:0] FCmpResultM);
+//    output logic       Invalid, 		 // Invalid Operation
+//    output logic [1:0] FCC,  		 // Condition Codes 
+//    output logic [63:0] FCmpResultM);
    
-   logic 	      LT;                // magnitude op1 < magnitude op2
-   logic 	      EQ;                // magnitude op1 = magnitude op2
+//    logic 	      LT;                // magnitude op1 < magnitude op2
+//    logic 	      EQ;                // magnitude op1 = magnitude op2
    
-   // Perform magnitude comparison between the 63 least signficant bits
-   // of the input operands. Only LT and EQ are returned, since GT can
-   // be determined from these values. 
-   magcompare64b_2 magcomp2 (LT, EQ, w, x);
+//    // Perform magnitude comparison between the 63 least signficant bits
+//    // of the input operands. Only LT and EQ are returned, since GT can
+//    // be determined from these values. 
+//    magcompare64b_2 magcomp2 (LT, EQ, w, x);
 
-   // Determine final values based on output of magnitude comparison, 
-   // sign bits, and special case testing. 
-   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .Sel(Sel), .A(op1), .B(op2), .*);
+//    // Determine final values based on output of magnitude comparison, 
+//    // sign bits, and special case testing. 
+//    exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .Sel(Sel), .A(op1), .B(op2), .*);
    
 
-endmodule // fpcomp
+// endmodule // fpcomp
 
-/*module magcompare2b (LT, GT, A, B);
-
-   input logic [1:0] A;
-   input logic [1:0] B;
-   
-   output logic     LT;
-   output logic     GT;
-
-   // Determine if A < B  using a minimized sum-of-products expression
-   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-   // Determine if A > B  using a minimized sum-of-products expression
-   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-endmodule*/ // magcompare2b
-
-// 2-bit magnitude comparator
-// This module compares two 2-bit values A and B. LT is '1' if A < B 
-// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
-// this version actually incorporates don't cares into the equation to
-// simplify the optimization
-
-// module magcompare2c (LT, GT, A, B);
+// /*module magcompare2b (LT, GT, A, B);
 
 //    input logic [1:0] A;
 //    input logic [1:0] B;
    
-//    output logic      LT;
-//    output logic      GT;
+//    output logic     LT;
+//    output logic     GT;
 
-//    assign LT = B[1] | (!A[1]&B[0]);
-//    assign GT = A[1] | (!B[1]&A[0]);
+//    // Determine if A < B  using a minimized sum-of-products expression
+//    assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+//    // Determine if A > B  using a minimized sum-of-products expression
+//    assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
 
-// endmodule // magcompare2b
+// endmodule*/ // magcompare2b
 
-// This module compares two 64-bit values A and B. LT is '1' if A < B 
-// and EQ is '1'if A = B. LT and GT are both '0' if A > B.
-// This structure was modified so
-// that it only does a strict magnitdude comparison, and only
-// returns flags for less than (LT) and eqaual to (EQ). It uses a tree 
-// of 63 2-bit magnitude comparators, followed by one OR gates.
-//
-// J. E. Stine and M. J. Schulte, "A combined two's complement and
-// floating-point comparator," 2005 IEEE International Symposium on
-// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
-// doi: 10.1109/ISCAS.2005.1464531
+// // 2-bit magnitude comparator
+// // This module compares two 2-bit values A and B. LT is '1' if A < B 
+// // and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
+// // this version actually incorporates don't cares into the equation to
+// // simplify the optimization
 
-module magcompare64b_2 (LT, EQ, w, x);
+// // module magcompare2c (LT, GT, A, B);
 
-   input logic [7:0]  w;
-   input logic [7:0]  x;
-   logic [3:0] 	      y;
-   logic [3:0] 	      z;
-   logic [1:0] 	      a;
-   logic [1:0] 	      b;   
-   logic 	      GT;
+// //    input logic [1:0] A;
+// //    input logic [1:0] B;
    
-   output logic       LT;
-   output logic       EQ;
+// //    output logic      LT;
+// //    output logic      GT;
+
+// //    assign LT = B[1] | (!A[1]&B[0]);
+// //    assign GT = A[1] | (!B[1]&A[0]);
+
+// // endmodule // magcompare2b
+
+// // This module compares two 64-bit values A and B. LT is '1' if A < B 
+// // and EQ is '1'if A = B. LT and GT are both '0' if A > B.
+// // This structure was modified so
+// // that it only does a strict magnitdude comparison, and only
+// // returns flags for less than (LT) and eqaual to (EQ). It uses a tree 
+// // of 63 2-bit magnitude comparators, followed by one OR gates.
+// //
+// // J. E. Stine and M. J. Schulte, "A combined two's complement and
+// // floating-point comparator," 2005 IEEE International Symposium on
+// // Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// // doi: 10.1109/ISCAS.2005.1464531
+
+// module magcompare64b_2 (LT, EQ, w, x);
+
+//    input logic [7:0]  w;
+//    input logic [7:0]  x;
+//    logic [3:0] 	      y;
+//    logic [3:0] 	      z;
+//    logic [1:0] 	      a;
+//    logic [1:0] 	      b;   
+//    logic 	      GT;
    
-   magcompare2c mag39(y[0], z[0], x[1:0], w[1:0]);
-   magcompare2c mag3A(y[1], z[1], x[3:2], w[3:2]);
-   magcompare2c mag3B(y[2], z[2], x[5:4], w[5:4]);
-   magcompare2c mag3C(y[3], z[3], x[7:6], w[7:6]);
+//    output logic       LT;
+//    output logic       EQ;
    
-   magcompare2c mag3D(a[0], b[0], z[1:0], y[1:0]);
-   magcompare2c mag3E(a[1], b[1], z[3:2], y[3:2]);
+//    magcompare2c mag39(y[0], z[0], x[1:0], w[1:0]);
+//    magcompare2c mag3A(y[1], z[1], x[3:2], w[3:2]);
+//    magcompare2c mag3B(y[2], z[2], x[5:4], w[5:4]);
+//    magcompare2c mag3C(y[3], z[3], x[7:6], w[7:6]);
    
-   magcompare2c mag3F(LT, GT, b[1:0], a[1:0]);
-
-   assign EQ = ~(LT | GT);
-
-endmodule // magcompare64b
-
-// This module takes 64-bits inputs A and B, two magnitude comparison
-// flags LT_mag and EQ_mag, and a 2-bit signal Sel that indicates the type of 
-// operands being compared as indicated below.
-//	Sel	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	bfloat precision numbers
-//
-// The comparator produces a 2-bit signal fcc, which
-// indicates the result of the comparison as follows:
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-// It also produces a invalid operation flag, which is one
-// if either of the input operands is a signaling NaN.
-
-module exception_cmp_2 (
-   input logic [63:0] A,
-   input logic [63:0] B,
-   input logic 	      FmtM,
-   input logic 	      LT_mag,
-   input logic 	      EQ_mag,
-   input logic [1:0]  Sel,
-   input logic [3:0]  FOpCtrlM,
+//    magcompare2c mag3D(a[0], b[0], z[1:0], y[1:0]);
+//    magcompare2c mag3E(a[1], b[1], z[3:2], y[3:2]);
    
-   output logic       invalid,
-   output logic [1:0] fcc,
-   output logic [63:0] FCmpResultM,
+//    magcompare2c mag3F(LT, GT, b[1:0], a[1:0]);
 
-   input logic 	      Azero,
-   input logic 	      Bzero,   
-   input logic 	      ANaN,
-   input logic 	      BNaN);
+//    assign EQ = ~(LT | GT);
+
+// endmodule // magcompare64b
+
+// // This module takes 64-bits inputs A and B, two magnitude comparison
+// // flags LT_mag and EQ_mag, and a 2-bit signal Sel that indicates the type of 
+// // operands being compared as indicated below.
+// //	Sel	Description
+// //	 00	double precision numbers
+// //	 01	single precision numbers
+// //	 10	half precision numbers
+// //	 11	bfloat precision numbers
+// //
+// // The comparator produces a 2-bit signal fcc, which
+// // indicates the result of the comparison as follows:
+// //     fcc 	decscription
+// //      00	A = B	
+// //      01	A < B	
+// //      10	A > B	
+// //      11	A and B	are unordered (i.e., A or B is NaN)
+// // It also produces a invalid operation flag, which is one
+// // if either of the input operands is a signaling NaN.
+
+// module exception_cmp_2 (
+//    input logic [63:0] A,
+//    input logic [63:0] B,
+//    input logic 	      FmtM,
+//    input logic 	      LT_mag,
+//    input logic 	      EQ_mag,
+//    input logic [1:0]  Sel,
+//    input logic [3:0]  FOpCtrlM,
    
-   logic 	      dp;   
-   logic 	      sp;
-   logic 	      hp;   
-   logic 	      ASNaN;
-   logic 	      BSNaN;
-   logic 	      UO;
-   logic 	      GT;
-   logic 	      LT;
-   logic 	      EQ;
-   logic [62:0]       sixtythreezeros = 63'h0;
+//    output logic       invalid,
+//    output logic [1:0] fcc,
+//    output logic [63:0] FCmpResultM,
 
-   assign dp = !Sel[1]&!Sel[0];
-   assign sp = !Sel[1]&Sel[0];
-   assign hp = Sel[1]&!Sel[0];
-
-   // Values are unordered if ((A is NaN) OR (B is NaN)) AND (a floating 
-   // point comparison is being performed. 
-   assign UO = (ANaN | BNaN);
-
-   // Test if A or B is a signaling NaN.
-   assign ASNaN = ANaN & (sp&~A[53] | dp&~A[50] | hp&~A[56]);
-   assign BSNaN = BNaN & (sp&~B[53] | dp&~B[50] | hp&~B[56]);
-
-   // If either A or B is a signaling NaN the "Invalid Operation"
-   // exception flag is set to one; otherwise it is zero.    
-   assign invalid = (ASNaN | BSNaN);
-
-   // A and B are equal if (their magnitudes are equal) AND ((their signs are
-   // equal) or (their magnitudes are zero AND they are floating point
-   // numbers)). Also, A and B are not equal if they are unordered.
-   assign EQ = (EQ_mag | (Azero&Bzero)) & (~UO);
+//    input logic 	      Azero,
+//    input logic 	      Bzero,   
+//    input logic 	      ANaN,
+//    input logic 	      BNaN);
    
-   // A is less than B if (A is negative and B is posiive) OR
-   // (A and B are positive and the magnitude of A is less than
-   // the magnitude of B) or (A and B are negative integers and
-   // the magnitude of A is less than the magnitude of B) or
-   // (A and B are negative floating point numbers and
-   // the magnitude of A is greater than the magnitude of B).
-   // Also, A is not less than B if A and B are equal or unordered.
-   assign LT = ((~LT_mag & A[63] & B[63]) |
-		(LT_mag & ~(A[63] & B[63])))&~EQ&~UO;
+//    logic 	      dp;   
+//    logic 	      sp;
+//    logic 	      hp;   
+//    logic 	      ASNaN;
+//    logic 	      BSNaN;
+//    logic 	      UO;
+//    logic 	      GT;
+//    logic 	      LT;
+//    logic 	      EQ;
+//    logic [62:0]       sixtythreezeros = 63'h0;
+
+//    assign dp = !Sel[1]&!Sel[0];
+//    assign sp = !Sel[1]&Sel[0];
+//    assign hp = Sel[1]&!Sel[0];
+
+//    // Values are unordered if ((A is NaN) OR (B is NaN)) AND (a floating 
+//    // point comparison is being performed. 
+//    assign UO = (ANaN | BNaN);
+
+//    // Test if A or B is a signaling NaN.
+//    assign ASNaN = ANaN & (sp&~A[53] | dp&~A[50] | hp&~A[56]);
+//    assign BSNaN = BNaN & (sp&~B[53] | dp&~B[50] | hp&~B[56]);
+
+//    // If either A or B is a signaling NaN the "Invalid Operation"
+//    // exception flag is set to one; otherwise it is zero.    
+//    assign invalid = (ASNaN | BSNaN);
+
+//    // A and B are equal if (their magnitudes are equal) AND ((their signs are
+//    // equal) or (their magnitudes are zero AND they are floating point
+//    // numbers)). Also, A and B are not equal if they are unordered.
+//    assign EQ = (EQ_mag | (Azero&Bzero)) & (~UO);
    
-   // A is greater than B when LT, EQ, and UO are are false.
-   assign GT = ~(LT | EQ | UO);
+//    // A is less than B if (A is negative and B is posiive) OR
+//    // (A and B are positive and the magnitude of A is less than
+//    // the magnitude of B) or (A and B are negative integers and
+//    // the magnitude of A is less than the magnitude of B) or
+//    // (A and B are negative floating point numbers and
+//    // the magnitude of A is greater than the magnitude of B).
+//    // Also, A is not less than B if A and B are equal or unordered.
+//    assign LT = ((~LT_mag & A[63] & B[63]) |
+// 		(LT_mag & ~(A[63] & B[63])))&~EQ&~UO;
+   
+//    // A is greater than B when LT, EQ, and UO are are false.
+//    assign GT = ~(LT | EQ | UO);
 
-   // Note: it may be possible to optimize the setting of fcc 
-   // a little more, but it is probably not worth the effort. 
+//    // Note: it may be possible to optimize the setting of fcc 
+//    // a little more, but it is probably not worth the effort. 
 
-   // Set the bits of fcc based on LT, GT, EQ, and UO
-   assign fcc[0] = LT | UO;
-   assign fcc[1] = GT | UO;  
+//    // Set the bits of fcc based on LT, GT, EQ, and UO
+//    assign fcc[0] = LT | UO;
+//    assign fcc[1] = GT | UO;  
 
-   always_comb begin
-      case (FOpCtrlM[2:0])
-         3'b111: FCmpResultM = LT ? A : B;//min 
-         3'b101: FCmpResultM = GT ? A : B;//max
-         3'b010: FCmpResultM = FmtM ? {63'b0, EQ} : {31'b0, EQ, 32'b0};//equal
-         3'b001: FCmpResultM = FmtM ? {63'b0, LT} : {31'b0, LT, 32'b0};//less than
-         3'b011: FCmpResultM = FmtM ? {63'b0, LT|EQ} : {31'b0, LT|EQ, 32'b0};//less than or equal
-         default: FCmpResultM = 64'b0;
-      endcase
-   end 
+//    always_comb begin
+//       case (FOpCtrlM[2:0])
+//          3'b111: FCmpResultM = LT ? A : B;//min 
+//          3'b101: FCmpResultM = GT ? A : B;//max
+//          3'b010: FCmpResultM = FmtM ? {63'b0, EQ} : {31'b0, EQ, 32'b0};//equal
+//          3'b001: FCmpResultM = FmtM ? {63'b0, LT} : {31'b0, LT, 32'b0};//less than
+//          3'b011: FCmpResultM = FmtM ? {63'b0, LT|EQ} : {31'b0, LT|EQ, 32'b0};//less than or equal
+//          default: FCmpResultM = 64'b0;
+//       endcase
+//    end 
 
 
-endmodule // exception_cmp
+// endmodule // exception_cmp
diff --git a/wally-pipelined/src/fpu/fpuhazard.sv b/wally-pipelined/src/fpu/fpuhazard.sv
index 959ef4763..03667d84f 100644
--- a/wally-pipelined/src/fpu/fpuhazard.sv
+++ b/wally-pipelined/src/fpu/fpuhazard.sv
@@ -26,47 +26,41 @@
 `include "wally-config.vh"
 
 module fpuhazard(
-    input logic [4:0] Adr1, Adr2, Adr3,
-    input logic FWriteEnE, FWriteEnM, FWriteEnW, 
-	  input logic [4:0] RdE, RdM, RdW,
-	  input logic FDivBusyE,
-	  input logic	RegWriteD,
-    input logic [2:0] FResultSelD, FResultSelE,
-    input logic IllegalFPUInstrD,
-    input logic FInput2UsedD, FInput3UsedD,
-  // Stall outputs
-	  output logic FStallD,
-    output logic [1:0] FForwardInput1D, FForwardInput2D, 
-    output logic FForwardInput3D
+    input logic [4:0] Adr1E, Adr2E, Adr3E,
+    input logic FWriteEnM, FWriteEnW, 
+	  input logic [4:0] RdM, RdW,
+    input logic [2:0] FResultSelM,
+    output logic FStallD,
+    output logic [1:0] ForwardXE, ForwardYE, ForwardZE
 );
 
 
   always_comb begin
     // set ReadData as default
-    FForwardInput1D = 2'b00; 
-    FForwardInput2D = 2'b00;
-    FForwardInput3D = 1'b0;
-    FStallD = FDivBusyE;
-    if (~IllegalFPUInstrD) begin
-//					if taking a value from int register
-      if ((Adr1 == RdE) & (FWriteEnE | ((FResultSelE == 3'b110) & RegWriteD))) 
-        if (FResultSelE == 3'b110) FForwardInput1D = 2'b11; // choose SrcAM
-        else FStallD = 1'b1;                           // otherwise stall
-      else if ((Adr1 == RdM) & FWriteEnM) FForwardInput1D = 2'b01; // choose FPUResultDirW
-      else if ((Adr1 == RdW) & FWriteEnW) FForwardInput1D = 2'b11; // choose FPUResultDirE
+    ForwardXE = 2'b00; // choose FRD1E
+    ForwardYE = 2'b00; // choose FRD2E
+    ForwardZE = 2'b00; // choose FRD3E
+    FStallD = 0;
+
+      if ((Adr1E == RdM) & FWriteEnM)
+      // if the result will be FResM
+        if(FResultSelM == 3'b110 | FResultSelM == 3'b011) ForwardXE = 2'b10; // choose FResM
+        else FStallD = 1;   // if the result won't be ready stall
+      else if ((Adr1E == RdW) & FWriteEnW) ForwardXE = 2'b01; // choose FPUResult64W
     
 
-      if(FInput2UsedD)
-        if      ((Adr2 == RdE) & FWriteEnE) FStallD = 1'b1;
-        else if ((Adr2 == RdM) & FWriteEnM) FForwardInput2D = 2'b01; // choose FPUResultDirW
-        else if ((Adr2 == RdW) & FWriteEnW) FForwardInput2D = 2'b10; // choose FPUResultDirE
+      if ((Adr2E == RdM) & FWriteEnM)
+      // if the result will be FResM
+        if(FResultSelM == 3'b110 | FResultSelM == 3'b011) ForwardYE = 2'b10; // choose FResM
+        else FStallD = 1;   // if the result won't be ready stall
+      else if ((Adr2E == RdW) & FWriteEnW) ForwardYE = 2'b01; // choose FPUResult64W
 
-
-      if(FInput3UsedD)
-        if      ((Adr3 == RdE) & FWriteEnE) FStallD = 1'b1;
-        else if ((Adr3 == RdM) & FWriteEnM) FStallD = 1'b1;
-        else if ((Adr3 == RdW) & FWriteEnW) FForwardInput3D = 1'b1; // choose FPUResultDirE
-    end
+ 
+      if ((Adr3E == RdM) & FWriteEnM)
+      // if the result will be FResM
+        if(FResultSelM == 3'b110 | FResultSelM == 3'b011) ForwardZE = 2'b10; // choose FResM
+        else FStallD = 1;   // if the result won't be ready stall
+      else if ((Adr3E == RdW) & FWriteEnW) ForwardZE = 2'b01; // choose FPUResult64W
 
   end 
 
diff --git a/wally-pipelined/src/fpu/fsgn.sv b/wally-pipelined/src/fpu/fsgn.sv
index 2850af86e..62d0e7d7c 100755
--- a/wally-pipelined/src/fpu/fsgn.sv
+++ b/wally-pipelined/src/fpu/fsgn.sv
@@ -1,8 +1,8 @@
 //performs the fsgnj/fsgnjn/fsgnjx RISCV instructions
 
-module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, FInput1E, FInput2E);
+module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SrcXE, SrcYE);
 
-	input  [63:0]  FInput1E, FInput2E;
+	input  [63:0]  SrcXE, SrcYE;
 	input  [1:0]   SgnOpCodeE;
 	output [63:0]  SgnResultE;
 	output [4:0]   SgnFlagsE;
@@ -11,18 +11,18 @@ module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, FInput1E, FInput2E);
 
 	//op code designation:
 	//
-	//00 - fsgnj - directly copy over sign value of FInput2E
-	//01 - fsgnjn - negate sign value of FInput2E
-	//10 - fsgnjx - XOR sign values of FInput1E & FInput2E
+	//00 - fsgnj - directly copy over sign value of SrcYE
+	//01 - fsgnjn - negate sign value of SrcYE
+	//10 - fsgnjx - XOR sign values of SrcXE & SrcYE
 	//
 	
-	assign SgnResultE[63] = SgnOpCodeE[1] ? (FInput1E[63] ^ FInput2E[63]) : (FInput2E[63] ^ SgnOpCodeE[0]);
-	assign SgnResultE[62:0] = FInput1E[62:0];
+	assign SgnResultE[63] = SgnOpCodeE[1] ? (SrcXE[63] ^ SrcYE[63]) : (SrcYE[63] ^ SgnOpCodeE[0]);
+	assign SgnResultE[62:0] = SrcXE[62:0];
 
 	//If the exponent is all ones, then the value is either Inf or NaN,
 	//both of which will produce a QNaN/SNaN value of some sort. This will 
 	//set the invalid flag high.
-	assign AonesExp = FInput1E[62]&FInput1E[61]&FInput1E[60]&FInput1E[59]&FInput1E[58]&FInput1E[57]&FInput1E[56]&FInput1E[55]&FInput1E[54]&FInput1E[53]&FInput1E[52];
+	assign AonesExp = SrcXE[62]&SrcXE[61]&SrcXE[60]&SrcXE[59]&SrcXE[58]&SrcXE[57]&SrcXE[56]&SrcXE[55]&SrcXE[54]&SrcXE[53]&SrcXE[52];
 
 	//the only flag that can occur during this operation is invalid
 	//due to changing sign on already existing NaN
diff --git a/wally-pipelined/src/hazard/hazard.sv b/wally-pipelined/src/hazard/hazard.sv
index 016d8e1ad..356574d0f 100644
--- a/wally-pipelined/src/hazard/hazard.sv
+++ b/wally-pipelined/src/hazard/hazard.sv
@@ -32,7 +32,7 @@ module hazard(
 	      input logic  BPPredWrongE, CSRWritePendingDEM, RetM, TrapM,
 	      input logic  LoadStallD, MulDivStallD, CSRRdStallD,
 	      input logic  DataStall, ICacheStallF,
-        input logic  FPUStallD,
+        input logic  FPUStallD, FStallD,
 	      input logic  DivBusyE,FDivBusyE,
   // Stall & flush outputs
 	      output logic StallF, StallD, StallE, StallM, StallW,
@@ -56,7 +56,7 @@ module hazard(
   // If any stages are stalled, the first stage that isn't stalled must flush.
 
   assign StallFCause = CSRWritePendingDEM && ~(TrapM || RetM || BPPredWrongE);
-  assign StallDCause = (LoadStallD || MulDivStallD || CSRRdStallD || FPUStallD) && ~(TrapM || RetM || BPPredWrongE);    // stall in decode if instruction is a load/mul/csr dependent on previous
+  assign StallDCause = (LoadStallD || MulDivStallD || CSRRdStallD || FPUStallD || FStallD) && ~(TrapM || RetM || BPPredWrongE);    // stall in decode if instruction is a load/mul/csr dependent on previous
   assign StallECause = DivBusyE || FDivBusyE;
   assign StallMCause = 0; 
   assign StallWCause = DataStall || ICacheStallF;
diff --git a/wally-pipelined/src/ieu/controller.sv b/wally-pipelined/src/ieu/controller.sv
index 09ded48ba..ab25401e7 100644
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@@ -45,15 +45,16 @@ module controller(
   output logic       MemReadE, CSRReadE, // for Hazard Unit
   output logic [2:0] Funct3E,
   output logic       MulDivE, W64E,
-  output logic       JumpE,		  
+  output logic       JumpE,	
+  output logic [1:0] MemRWE,	  
   // Memory stage control signals
   input  logic       StallM, FlushM,
   output logic [1:0] MemRWM,
   output logic       CSRReadM, CSRWriteM, PrivilegedM, 
   output logic [1:0] AtomicM,
   output logic [2:0] Funct3M,
-  output logic       RegWriteM,     // for Hazard Unit
   output logic [2:0] ResultSrcM,
+  output logic       RegWriteM,     // for Hazard Unit
   output logic       InstrValidM,
   // Writeback stage control signals
   input  logic       StallW, FlushW,
@@ -74,7 +75,7 @@ module controller(
   // pipelined control signals
   logic 	    RegWriteE;
   logic [2:0] ResultSrcD, ResultSrcE;
-  logic [1:0] MemRWD, MemRWE;
+  logic [1:0] MemRWD;
   logic		    JumpD;
   logic		    BranchD, BranchE;
   logic	[1:0] ALUOpD;
@@ -141,6 +142,7 @@ module controller(
                       ControlsD = `CTRLW'b1_000_00_00_011_0_00_0_0_1_0_0_1_00_0; // W-type Multiply/Divide
                     else
                       ControlsD = `CTRLW'b0_000_00_00_000_0_00_0_0_0_0_0_0_00_1; // non-implemented instruction
+        //7'b1010011:   ControlsD = `CTRLW'b0_000_00_00_101_0_00_0_0_0_0_0_0_00_1; // FP
         7'b1100011:   ControlsD = `CTRLW'b0_010_00_00_000_1_01_0_0_0_0_0_0_00_0; // beq
         7'b1100111:   ControlsD = `CTRLW'b1_000_00_00_000_0_00_1_1_0_0_0_0_00_0; // jalr
         7'b1101111:   ControlsD = `CTRLW'b1_011_00_00_000_0_00_1_0_0_0_0_0_00_0; // jal
diff --git a/wally-pipelined/src/ieu/datapath.sv b/wally-pipelined/src/ieu/datapath.sv
index 848ed89a5..635c12f24 100644
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@@ -37,6 +37,9 @@ module datapath (
   input  logic             ALUSrcAE, ALUSrcBE,
   input  logic             TargetSrcE, 
   input  logic             JumpE,
+  input  logic             IsFPE,
+  input  logic [1:0]       MemRWE,
+  input  logic [`XLEN-1:0] FWriteDataE,
   input  logic [`XLEN-1:0] PCE,
   input  logic [`XLEN-1:0] PCLinkE,
   output logic [2:0]       FlagsE,
@@ -44,16 +47,16 @@ module datapath (
   output logic [`XLEN-1:0] SrcAE, SrcBE,
   // Memory stage signals
   input  logic             StallM, FlushM,
-  input  logic [`XLEN-1:0] FWriteDataM,
   input  logic             SquashSCM,
+  input  logic             FWriteIntM,
   input  logic [2:0]       ResultSrcM,
+  input  logic [`XLEN-1:0] FIntResM,
   input  logic [`XLEN-1:0] CSRReadValM, ReadDataM, MulDivResultM, 
   output logic [`XLEN-1:0] SrcAM,
   output logic [`XLEN-1:0] WriteDataM, MemAdrM,
   // Writeback stage signals
   input  logic             StallW, FlushW,
   input  logic             FWriteIntW,
-  input  logic [`XLEN-1:0] FPUResultW,
   input  logic             RegWriteW, 
   input  logic             SquashSCW,
   input  logic [2:0]       ResultSrcW,
@@ -72,7 +75,7 @@ module datapath (
   logic [`XLEN-1:0] RD1E, RD2E;
   logic [`XLEN-1:0] ExtImmE;
 
-  logic [`XLEN-1:0] PreSrcAE, SrcAE2, SrcBE2;
+  logic [`XLEN-1:0] PreSrcAE, PreSrcBE, SrcAE2, SrcBE2;
 
   logic [`XLEN-1:0] ALUResultE;
   logic [`XLEN-1:0] WriteDataE;
@@ -92,8 +95,7 @@ module datapath (
   assign Rs2D      = InstrD[24:20];
   assign RdD       = InstrD[11:7];
 
-  //Mux for writting floating point
-  mux2  #(`XLEN)  writedatamux(ResultW, FPUResultW, FWriteIntW, WriteDataW);  
+  //Mux for writting floating point 
   
   regfile regf(clk, reset, {RegWriteW | FWriteIntW}, Rs1D, Rs2D, RdW, WriteDataW, RD1D, RD2D);
   extend ext(.InstrD(InstrD[31:7]), .*);
@@ -106,11 +108,12 @@ module datapath (
   flopenrc #(5)    Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
   flopenrc #(5)    RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);
 	
-  mux4  #(`XLEN)  faemux(RD1E, WriteDataW, ResultM, FWriteDataM, ForwardAE, PreSrcAE);
-  mux4  #(`XLEN)  fbemux(RD2E, WriteDataW, ResultM, FWriteDataM, ForwardBE, WriteDataE);
+  mux3  #(`XLEN)  faemux(RD1E, WriteDataW, ResultM, ForwardAE, PreSrcAE);
+  mux3  #(`XLEN)  fbemux(RD2E, WriteDataW, ResultM, ForwardBE, PreSrcBE);
+  mux2  #(`XLEN)  writedatamux(PreSrcBE, FWriteDataE, IsFPE, WriteDataE);
   mux2  #(`XLEN)  srcamux(PreSrcAE, PCE, ALUSrcAE, SrcAE);
   mux2  #(`XLEN)  srcamux2(SrcAE, PCLinkE, JumpE, SrcAE2);  
-  mux2  #(`XLEN)  srcbmux(WriteDataE, ExtImmE, ALUSrcBE, SrcBE);
+  mux2  #(`XLEN)  srcbmux(PreSrcBE, ExtImmE, ALUSrcBE, SrcBE);
   mux2  #(`XLEN)  srcbmux2(SrcBE, {`XLEN{1'b0}}, JumpE, SrcBE2); // *** May be able to remove this mux.
   alu   #(`XLEN)  alu(SrcAE2, SrcBE2, ALUControlE, ALUResultE, FlagsE);
   mux2  #(`XLEN)  targetsrcmux(PCE, SrcAE, TargetSrcE, TargetBaseE);
@@ -122,10 +125,11 @@ module datapath (
   assign MemAdrM = ALUResultM;
   flopenrc #(`XLEN) WriteDataMReg(clk, reset, FlushM, ~StallM, WriteDataE, WriteDataM);
   flopenrc #(5)    RdMEg(clk, reset, FlushM, ~StallM, RdE, RdM);
-  mux5  #(`XLEN) resultmuxM(ALUResultM, ReadDataM, CSRReadValM, MulDivResultM, SCResultM, ResultSrcM, ResultM);	
+  //mux6  #(`XLEN) resultmuxM(ALUResultM, ReadDataM, CSRReadValM, MulDivResultM, SCResultM, FIntResM, ResultSrcM, ResultM); //Wasn't doing anything	
+  mux2  #(`XLEN) resultmuxM(ALUResultM, FIntResM, FWriteIntM, ResultM); //Wasn't doing anything	
   
   // Writeback stage pipeline register and logic
-  flopenrc #(`XLEN) ALUResultWReg(clk, reset, FlushW, ~StallW, ALUResultM, ALUResultW);
+  flopenrc #(`XLEN) ResultWReg(clk, reset, FlushW, ~StallW, ResultM, ResultW);
   flopenrc #(5)    RdWEg(clk, reset, FlushW, ~StallW, RdM, RdW);
 
   // handle Store Conditional result if atomic extension supported
@@ -139,11 +143,11 @@ module datapath (
     end
   endgenerate
 
-  mux5  #(`XLEN) resultmuxW(ALUResultW, ReadDataW, CSRReadValW, MulDivResultW, SCResultW, ResultSrcW, ResultW);	
+  mux5  #(`XLEN) resultmuxW(ResultW, ReadDataW, CSRReadValW, MulDivResultW, SCResultW, ResultSrcW, WriteDataW);	
 /* -----\/----- EXCLUDED -----\/-----
   // This mux4:1 no longer needs to include PCLinkW.  This is set correctly in the execution stage.
   // *** need to look at how the decoder is coded to fix.
-  mux4  #(`XLEN) resultmux(ALUResultW, ReadDataW, PCLinkW, CSRReadValW, ResultSrcW, ResultW);	
+  mux4  #(`XLEN) resultmux(ALUResultW, ReadDataW, PCLinkW, CSRReadValW, ResultSrcW, WriteDataW);	
 >>>>>>> bp
  -----/\----- EXCLUDED -----/\----- */
  
diff --git a/wally-pipelined/src/ieu/forward.sv b/wally-pipelined/src/ieu/forward.sv
index 6729ed424..259d41f24 100644
--- a/wally-pipelined/src/ieu/forward.sv
+++ b/wally-pipelined/src/ieu/forward.sv
@@ -41,14 +41,12 @@ module forward(
     ForwardAE = 2'b00;
     ForwardBE = 2'b00;
     if (Rs1E != 5'b0)
-      if      ((Rs1E == RdM) & RegWriteM) ForwardAE = 2'b10;
+      if      ((Rs1E == RdM) & (RegWriteM|FWriteIntM)) ForwardAE = 2'b10;
       else if ((Rs1E == RdW) & (RegWriteW|FWriteIntW)) ForwardAE = 2'b01;
-      else if ((Rs1E == RdM) & FWriteIntM) ForwardAE = 2'b11;
  
     if (Rs2E != 5'b0)
-      if      ((Rs2E == RdM) & RegWriteM) ForwardBE = 2'b10;
+      if      ((Rs2E == RdM) & (RegWriteM|FWriteIntM)) ForwardBE = 2'b10;
       else if ((Rs2E == RdW) & (RegWriteW|FWriteIntW)) ForwardBE = 2'b01;
-      else if ((Rs2E == RdM) & FWriteIntM) ForwardBE = 2'b11;
   end
 
   // Stall on dependent operations that finish in Mem Stage and can't bypass in time
@@ -57,4 +55,4 @@ module forward(
    assign MulDivStallD = MulDivE & ((Rs1D == RdE) | (Rs2D == RdE)) | MulDivE | DivBusyE; // *** extend with stalls for divide
    assign CSRRdStallD = CSRReadE & ((Rs1D == RdE) | (Rs2D == RdE));
 
-endmodule
+endmodule
\ No newline at end of file
diff --git a/wally-pipelined/src/ieu/ieu.sv b/wally-pipelined/src/ieu/ieu.sv
index 73c619f62..b9198b0a6 100644
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@@ -35,7 +35,10 @@ module ieu (
   // Execute Stage interface
   input logic [`XLEN-1:0]  PCE, 
   input logic [`XLEN-1:0]  PCLinkE,
-  input logic 		   FWriteIntE, 
+  input logic 		         FWriteIntE, 
+  input logic              IsFPE,
+  //input  logic [1:0]       FMemRWE,
+  input  logic [`XLEN-1:0] FWriteDataE,
   output logic [`XLEN-1:0] PCTargetE,
   output logic 		   MulDivE, W64E,
   output logic [2:0] 	   Funct3E,
@@ -44,8 +47,8 @@ module ieu (
   input logic 		   DataMisalignedM,
   input logic 		   DataAccessFaultM,
   input logic	     	   FWriteIntM,
-  input  logic [`XLEN-1:0] FWriteDataM,
   input logic 		       SquashSCM,
+  input  logic [`XLEN-1:0] FIntResM,
   input  logic [`XLEN-1:0] CSRReadValM, ReadDataM, MulDivResultM, 
   output logic [1:0] 	   MemRWM,
   output logic [1:0] 	   AtomicM,
@@ -55,7 +58,6 @@ module ieu (
   // Writeback stage
   input logic [`XLEN-1:0]  CSRReadValW, ReadDataW, MulDivResultW,
   input logic             FWriteIntW,
-  input logic [`XLEN-1:0] FPUResultW,
   input logic 		   SquashSCW,
   // input  logic [`XLEN-1:0] PCLinkW,
   output logic 		   InstrValidM, InstrValidW,
@@ -83,6 +85,7 @@ module ieu (
   logic             RegWriteM, RegWriteW;
   logic             MemReadE, CSRReadE;
   logic             JumpE;
+  logic [1:0]       MemRWE;
            
   controller c(.*);
   datapath   dp(.*);             
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index 1fd1408a4..fb7d288dc 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -86,21 +86,26 @@ module wallypipelinedhart (
 
   logic        PCSrcE;
   logic        CSRWritePendingDEM;
-  logic        FPUStallD, LoadStallD, MulDivStallD, CSRRdStallD;
+  logic       LoadStallD, MulDivStallD, CSRRdStallD;
   logic       DivDoneE;
   logic       DivBusyE;
   logic       DivDoneW;
-  logic [4:0] SetFflagsM;
-  logic [2:0] FRM_REGW;
-  logic       FloatRegWriteW;
-  logic [1:0] FMemRWM;
   logic       RegWriteD;
-  logic [`XLEN-1:0] FWriteDataM;
   logic       SquashSCM, SquashSCW;
-  logic       FStallD;
-  logic       FWriteIntE, FWriteIntW, FWriteIntM;
-  logic             FDivBusyE;
-  logic             IllegalFPUInstrD, IllegalFPUInstrE;
+
+  // floating point unit signals
+  logic [2:0]        FRM_REGW;
+  logic [1:0] 	   FMemRWM, FMemRWE;
+  logic 		      FStallD;
+  logic 		      FWriteIntE, FWriteIntM, FWriteIntW;
+  logic [`XLEN-1:0] FWriteDataE;
+  logic [`XLEN-1:0] FIntResM;  
+  logic 		      FDivBusyE;
+  logic 		      IsFPD, IsFPE;
+  logic 		      IllegalFPUInstrD, IllegalFPUInstrE;
+  logic           FloatRegWriteW;
+  logic           FPUStallD;
+  logic [4:0] 	   SetFflagsM;
   logic [`XLEN-1:0] FPUResultW;
 
   // memory management unit signals
@@ -159,13 +164,13 @@ module wallypipelinedhart (
   ieu ieu(.*); // integer execution unit: integer register file, datapath and controller
 
   
-  mux2  #(`XLEN)  OutputInput2mux(WriteDataM, FWriteDataM, FMemRWM[0], WriteDatatmpM);
-  lsu lsu(.MemRWM(MemRWM|FMemRWM), .WriteDataM(WriteDatatmpM),.*); // data cache unit
+  // mux2  #(`XLEN)  OutputInput2mux(WriteDataM, FWriteDataM, FMemRWM[0], WriteDatatmpM);
+  lsu lsu(.*); // data cache unit
 
   ahblite ebu( 
     //.InstrReadF(1'b0),
     //.InstrRData(InstrF), // hook up InstrF later
-    .WriteDataM(WriteDatatmpM),
+    .WriteDataM(WriteDataM),
     .MemSizeM(Funct3M[1:0]), .UnsignedLoadM(Funct3M[2]),
     .Funct7M(InstrM[31:25]),
     .*);
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index 2b052dcdf..11b8e5620 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -539,8 +539,8 @@ string tests32f[] = '{
         if (`M_SUPPORTED) tests = {tests, tests64m};
         if (`A_SUPPORTED) tests = {tests, tests64a};
         if (`MEM_VIRTMEM) tests = {tests, tests64mmu};
-        if (`D_SUPPORTED) tests = {tests64d, tests};
         if (`F_SUPPORTED) tests = {tests64f, tests};
+        if (`D_SUPPORTED) tests = {tests64d, tests};
       end
       //tests = {tests64a, tests};
     end else begin // RV32

From 192171826b2ca2a6b08692e7e6ae5f00857aeff9 Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Fri, 25 Jun 2021 07:18:38 -0400
Subject: [PATCH 09/20] changed SC M-to-E fowarding to W-to-E forwarding to
 improve critical path

---
 wally-pipelined/src/ieu/controller.sv |  6 ++++--
 wally-pipelined/src/ieu/forward.sv    | 15 ++++++++-------
 wally-pipelined/src/ieu/ieu.sv        |  3 ++-
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/wally-pipelined/src/ieu/controller.sv b/wally-pipelined/src/ieu/controller.sv
index b27541d42..3654437fd 100644
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@@ -49,7 +49,8 @@ module controller(
   // Memory stage control signals
   input  logic       StallM, FlushM,
   output logic [1:0] MemRWM,
-  output logic       CSRReadM, CSRWriteM, PrivilegedM, 
+  output logic       CSRReadM, CSRWriteM, PrivilegedM,
+  output logic       SCE,
   output logic [1:0] AtomicM,
   output logic [2:0] Funct3M,
   output logic       RegWriteM,     // for Hazard Unit
@@ -202,7 +203,8 @@ module controller(
     
   assign PCSrcE = JumpE | BranchE & BranchTakenE;
 
-  assign MemReadE = MemRWE[1]; 
+  assign MemReadE = MemRWE[1];
+  assign SCE = (ResultSrcE == 3'b100);
   
   // Memory stage pipeline control register
   flopenrc #(15) controlregM(clk, reset, FlushM, ~StallM,
diff --git a/wally-pipelined/src/ieu/forward.sv b/wally-pipelined/src/ieu/forward.sv
index cdc6d2700..07c4daaf0 100644
--- a/wally-pipelined/src/ieu/forward.sv
+++ b/wally-pipelined/src/ieu/forward.sv
@@ -28,13 +28,14 @@
 module forward(
   // Detect hazards
   input logic [4:0]  Rs1D, Rs2D, Rs1E, Rs2E, RdE, RdM, RdW,
-  input logic 	     MemReadE, MulDivE, CSRReadE,
-  input logic 	     RegWriteM, RegWriteW,
-  input logic 	     DivDoneE, DivBusyE,
-  input logic	     FWriteIntE, FWriteIntM, FWriteIntW,
+  input logic        MemReadE, MulDivE, CSRReadE,
+  input logic        RegWriteM, RegWriteW,
+  input logic        DivDoneE, DivBusyE,
+  input logic	       FWriteIntE, FWriteIntM, FWriteIntW,
+  input logic        SCE,
   // Forwarding controls
   output logic [1:0] ForwardAE, ForwardBE,
-  output logic 	     FPUStallD, LoadStallD, MulDivStallD, CSRRdStallD
+  output logic       FPUStallD, LoadStallD, MulDivStallD, CSRRdStallD
 );
   
   always_comb begin
@@ -43,7 +44,7 @@ module forward(
     if (Rs1E != 5'b0)
       if      ((Rs1E == RdM) & RegWriteM) ForwardAE = 2'b10;
       else if ((Rs1E == RdW) & (RegWriteW|FWriteIntW)) ForwardAE = 2'b01;
-     else if ((Rs1E == RdM) & FWriteIntM) ForwardAE = 2'b11;
+      else if ((Rs1E == RdM) & FWriteIntM) ForwardAE = 2'b11;
  
     if (Rs2E != 5'b0)
       if      ((Rs2E == RdM) & RegWriteM) ForwardBE = 2'b10;
@@ -53,7 +54,7 @@ module forward(
 
   // Stall on dependent operations that finish in Mem Stage and can't bypass in time
    assign FPUStallD = FWriteIntE & ((Rs1D == RdE) | (Rs2D == RdE)); 
-   assign LoadStallD = MemReadE & ((Rs1D == RdE) | (Rs2D == RdE));  
+   assign LoadStallD = (MemReadE|SCE) & ((Rs1D == RdE) | (Rs2D == RdE));  
    assign MulDivStallD = MulDivE & ((Rs1D == RdE) | (Rs2D == RdE)) | MulDivE | DivBusyE; // *** extend with stalls for divide
    assign CSRRdStallD = CSRReadE & ((Rs1D == RdE) | (Rs2D == RdE));
 
diff --git a/wally-pipelined/src/ieu/ieu.sv b/wally-pipelined/src/ieu/ieu.sv
index 0bd9d598f..62dc371b9 100644
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@@ -73,7 +73,8 @@ module ieu (
   logic [4:0]  ALUControlE;
   logic        ALUSrcAE, ALUSrcBE;
   logic [2:0]  ResultSrcW;
-  logic       TargetSrcE;
+  logic        TargetSrcE;
+  logic        SCE;
 
   // forwarding signals
   logic [4:0]       Rs1D, Rs2D, Rs1E, Rs2E, RdE, RdM, RdW;

From 5b47da21bac8e3de9a508920724b4007430c06d0 Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Fri, 25 Jun 2021 08:15:19 -0400
Subject: [PATCH 10/20] made testbench-linux's PCDwrong be FlushD

---
 wally-pipelined/testbench/testbench-linux.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wally-pipelined/testbench/testbench-linux.sv b/wally-pipelined/testbench/testbench-linux.sv
index 15e0e3634..b87174b9b 100644
--- a/wally-pipelined/testbench/testbench-linux.sv
+++ b/wally-pipelined/testbench/testbench-linux.sv
@@ -57,7 +57,7 @@ module testbench();
   wallypipelinedsoc dut(.*);
 
   ///////////////////////////////////////////////////////////////////////////////
-  ////////////////////////   Signals & Shared Macros  //////////////////////////
+  ////////////////////////   Signals & Shared Macros  ///////////////////////////
   //////////////////////// AKA stuff that comes first ///////////////////////////
   ///////////////////////////////////////////////////////////////////////////////
   // Sorry if these have gotten decontextualized.
@@ -252,7 +252,7 @@ module testbench();
 
             // Check if PCD is going to be flushed due to a branch or jump
             if (`BPRED_ENABLED) begin
-              PCDwrong = dut.hart.ifu.bpred.bpred.BPPredWrongE;
+              PCDwrong = dut.hart.hzu.FlushD; //Old version: dut.hart.ifu.bpred.bpred.BPPredWrongE; <-- This old version failed to account for MRET.
             end else begin
               casex (lastInstrDExpected[31:0])
                 32'b00000000001000000000000001110011, // URET

From 13cf7c0934077100e04a962a9bcf0738457dc3c4 Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Fri, 25 Jun 2021 09:28:52 -0400
Subject: [PATCH 11/20] linux testbench now ignores HWRITE glitches caused by
 flush glitches

---
 wally-pipelined/regression/wave-dos/linux-waves.do | 1 +
 wally-pipelined/testbench/testbench-linux.sv       | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/wally-pipelined/regression/wave-dos/linux-waves.do b/wally-pipelined/regression/wave-dos/linux-waves.do
index b37276441..63623891c 100644
--- a/wally-pipelined/regression/wave-dos/linux-waves.do
+++ b/wally-pipelined/regression/wave-dos/linux-waves.do
@@ -4,6 +4,7 @@ view wave
 add wave -divider
 add wave /testbench/clk
 add wave /testbench/reset
+add wave -dec /testbench/instrs
 
 add wave -divider Stalls_and_Flushes
 add wave /testbench/dut/hart/StallF
diff --git a/wally-pipelined/testbench/testbench-linux.sv b/wally-pipelined/testbench/testbench-linux.sv
index b87174b9b..6676d1a7c 100644
--- a/wally-pipelined/testbench/testbench-linux.sv
+++ b/wally-pipelined/testbench/testbench-linux.sv
@@ -27,7 +27,7 @@
 
 module testbench();
   
-  parameter waveOnICount = 2514000; // # of instructions at which to turn on waves in graphical sim
+  parameter waveOnICount = 2657000; // # of instructions at which to turn on waves in graphical sim
   
 
   ///////////////////////////////////////////////////////////////////////////////
@@ -491,7 +491,7 @@ module testbench();
   //always @(HWDATA or HADDR or HSIZE or HWRITE) begin
   always @(negedge HWRITE) begin
     //#1;
-    if ($time != 0) begin
+    if (($time != 0) && ~dut.hart.hzu.FlushM) begin
       if($feof(data_file_memW)) begin
         $display("no more memW data to read");
         `ERROR

From 2ab29c74f263c784a640ae67c68ce165aa99186e Mon Sep 17 00:00:00 2001
From: Abe <castaa7@unlv.nevada.edu>
Date: Fri, 25 Jun 2021 16:27:23 -0400
Subject: [PATCH 12/20] Fixed Coremark Score output printing. Also made it so
 that the loop that sets the iteration count increments iterations by 1
 instead by increasing it by a factor of 10 each time (which was overkill for
 the timing that's needed to exit the loop)

---
 riscv-coremark/coremark/core_main.c | 45 +++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 6 deletions(-)

diff --git a/riscv-coremark/coremark/core_main.c b/riscv-coremark/coremark/core_main.c
index edd1ac467..a2c3ac679 100644
--- a/riscv-coremark/coremark/core_main.c
+++ b/riscv-coremark/coremark/core_main.c
@@ -211,26 +211,53 @@ MAIN_RETURN_TYPE main(int argc, char *argv[]) {
 			core_init_state(results[0].size,results[i].seed1,results[i].memblock[3]);
 		}
 	}
-	
+
+ /*int foreverLoop = 1;
+ secs_ret timing = 0;
+ int timingInt;
+ ee_printf("\nENTERING FOREVER WHILE LOOP\n");
+ while(foreverLoop == 1)
+ {
+	 start_time();
+	 //filler
+	 stop_time();
+	 timing += time_in_secs(get_time());
+	 timingInt = (int)timing;
+	 ee_printf("Timing is %d\n", timingInt);
+ }/*
+
 	/* automatically determine number of iterations if not set */
 	if (results[0].iterations==0) { 
 		secs_ret secs_passed=0;
 		ee_u32 divisor;
 		results[0].iterations=1;
+		int iterationInc = 0;
+		ee_printf("\n\nENTERING ITERATION WHILE LOOP\n");
 		while (secs_passed < (secs_ret)1) {
-			results[0].iterations*=10;
+			if(iterationInc != 0)
+			{
+			  results[0].iterations++;
+			}
+			ee_printf("iterations is %d\n", results[0].iterations);
 			start_time();
 			iterate(&results[0]);
 			stop_time();
-			secs_passed=time_in_secs(get_time());
+			secs_passed = time_in_secs(get_time());
+			int secs_passed_int = (int)secs_passed;
+			ee_printf("secs passed is %d\n", secs_passed_int);
+			iterationInc++;
 		}
+		ee_printf("LEAVING ITERATION WHILE LOOP!\n\n");
 		/* now we know it executes for at least 1 sec, set actual run time at about 10 secs */
 		divisor=(ee_u32)secs_passed;
+		ee_printf("divisor is %lu\n", divisor);
 		if (divisor==0) /* some machines cast float to int as 0 since this conversion is not defined by ANSI, but we know at least one second passed */
 			divisor=1;
 		results[0].iterations*=1+10/divisor;
+		ee_printf("iterations is %d\n", results[0].iterations);
 	}
 	/* perform actual benchmark */
+	ee_printf("Starting benchmark\n");
 	start_time();
 #if (MULTITHREAD>1)
 	if (default_num_contexts>MULTITHREAD) {
@@ -249,7 +276,8 @@ MAIN_RETURN_TYPE main(int argc, char *argv[]) {
 #endif
 	stop_time();
 	total_time=get_time();
-	ee_printf("ending benchmark");
+	ee_printf("total time is %u\n", total_time);
+	ee_printf("ending benchmark\n");
 	/* get a function of the input to report */
 	seedcrc=crc16(results[0].seed1,seedcrc);
 	seedcrc=crc16(results[0].seed2,seedcrc);
@@ -340,12 +368,17 @@ MAIN_RETURN_TYPE main(int argc, char *argv[]) {
 		for (i=0 ; i<default_num_contexts; i++) 
 			ee_printf("[%d]crcstate      : 0x%04x\n",i,results[i].crcstate);
 	for (i=0 ; i<default_num_contexts; i++) 
-		ee_printf("[%d]crcfinal      : 0x%04x\"n",i,results[i].crc);
+		ee_printf("[%d]crcfinal      : 0x%04x\n",i,results[i].crc);
 	if (total_errors==0) {
 		ee_printf("Correct operation validated. See README.md for run and reporting rules.\n");
 #if HAS_FLOAT
 		if (known_id==3) {
-			ee_printf("CoreMark 1.0 : %f / %s %s",default_num_contexts*results[0].iterations/time_in_secs(total_time),COMPILER_VERSION,COMPILER_FLAGS);
+			unsigned long long tmp = (unsigned long long) 1000.0*default_num_contexts*results[0].iterations/time_in_secs(total_time);
+			secs_ret totalmsecs = time_in_secs(total_time);
+			int totalmint = (int) totalmsecs;
+			ee_printf("ELAPSED S: %d\n", totalmint);
+
+			ee_printf("CoreMark 1.0 : %d / %s %s\n",tmp,COMPILER_VERSION,COMPILER_FLAGS);
 #if defined(MEM_LOCATION) && !defined(MEM_LOCATION_UNSPEC)
 			ee_printf(" / %s",MEM_LOCATION);
 #else

From 12eff2bc5f9726887e875db9bce43784ccb13e6f Mon Sep 17 00:00:00 2001
From: Abe <castaa7@unlv.nevada.edu>
Date: Fri, 25 Jun 2021 16:42:03 -0400
Subject: [PATCH 13/20] Updated timing functions to read from MTIME register,
 TICKS_PER_SEC set to 10000 so timer reads millisecs

---
 riscv-coremark/riscv64-baremetal/core_portme.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/riscv-coremark/riscv64-baremetal/core_portme.c b/riscv-coremark/riscv64-baremetal/core_portme.c
index 8f17cb8bd..dab428306 100755
--- a/riscv-coremark/riscv64-baremetal/core_portme.c
+++ b/riscv-coremark/riscv64-baremetal/core_portme.c
@@ -114,9 +114,10 @@ void portable_free(void *p) {
     #define read_csr(reg) ({ unsigned long __tmp; \
        asm volatile ("csrr %0, " #reg : "=r"(__tmp)); \
        __tmp; })
-    #define GETMYTIME(_t) (*_t=read_csr(cycle))
+    #define GETMYTIME(_t) (_t = *(volatile unsigned long long*)0x0200BFF8)
 	#define MYTIMEDIFF(fin,ini) ((fin)-(ini))
-	#define TIMER_RES_DIVIDER 1
+	// Changing TIMER_RES_DIVIDER to 1000000 sets EE_TICKS_PER_SEC to 1000 (now counting ticks per ms)
+	#define TIMER_RES_DIVIDER 10000
 	#define SAMPLE_TIME_IMPLEMENTATION 1
 #endif
 #define EE_TICKS_PER_SEC (NSECS_PER_SEC / TIMER_RES_DIVIDER)
@@ -132,7 +133,9 @@ static CORETIMETYPE start_time_val, stop_time_val;
 	or zeroing some system parameters - e.g. setting the cpu clocks cycles to 0.
 */
 void start_time(void) {
-	GETMYTIME(&start_time_val );
+	GETMYTIME(start_time_val);
+	ee_printf("Timer started\n");
+	ee_printf("  MTIME: %u\n", start_time_val);
 #if CALLGRIND_RUN
 	CALLGRIND_START_INSTRUMENTATION
 #endif
@@ -153,7 +156,9 @@ void stop_time(void) {
 #if MICA
     asm volatile("int3");/*1 */
 #endif
-	GETMYTIME(&stop_time_val );
+	GETMYTIME(stop_time_val);
+	ee_printf("Timer stopped\n");
+	ee_printf("  MTIME: %u\n", stop_time_val);
 }
 /* Function: get_time
 	Return an abstract "ticks" number that signifies time on the system.
@@ -166,6 +171,7 @@ void stop_time(void) {
 */
 CORE_TICKS get_time(void) {
 	CORE_TICKS elapsed=(CORE_TICKS)(MYTIMEDIFF(stop_time_val, start_time_val));
+	ee_printf("    Elapsed MTIME: %u\n", elapsed);
 	return elapsed;
 }
 /* Function: time_in_secs
@@ -176,13 +182,15 @@ CORE_TICKS get_time(void) {
 */
 secs_ret time_in_secs(CORE_TICKS ticks) {
 	secs_ret retval=((secs_ret)ticks) / (secs_ret)EE_TICKS_PER_SEC;
+	int retvalint = (int)retval;
+	ee_printf("  RETURN VALUE FROM TIME IN SECS FUNCTION: %d\n", retvalint);
 	return retval;
 }
 #else
 #error "Please implement timing functionality in core_portme.c"
 #endif /* SAMPLE_TIME_IMPLEMENTATION */
 
-ee_u32 default_num_contexts=MULTITHREAD;
+ee_u32 default_num_contexts = MULTITHREAD;
 
 /* Function: portable_init
 	Target specific initialization code

From 74833dc68c20aeed361330bbacdcda6a701a76b0 Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Sat, 26 Jun 2021 07:18:26 -0400
Subject: [PATCH 14/20] split intermediate GDB output file into smaller files
 for better debug experience

---
 .gitignore                                       |  1 +
 wally-pipelined/linux-testgen/logAllBuildroot.sh | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index fe21942d0..82c20503b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ testsBP/*/OBJ/*
 testsBP/*/*.a
 wally-pipelined/linux-testgen/linux-testvectors/*
 wally-pipelined/linux-testgen/nohup*
+wally-pipelined/linux-testgen/x*
 !wally-pipelined/linux-testgen/linux-testvectors/tvCopier.py
 !wally-pipelined/linux-testgen/linux-testvectors/tvLinker.sh
 wally-pipelined/regression/slack-notifier/slack-webhook-url.txt
diff --git a/wally-pipelined/linux-testgen/logAllBuildroot.sh b/wally-pipelined/linux-testgen/logAllBuildroot.sh
index d045ee98c..073fc5a9d 100755
--- a/wally-pipelined/linux-testgen/logAllBuildroot.sh
+++ b/wally-pipelined/linux-testgen/logAllBuildroot.sh
@@ -1,3 +1,9 @@
+# Oftentimes this script runs so long you'll go to sleep.
+# But you don't want the script to die when your computer goes to sleep.
+# So consider invoking this with nohup (i.e. "nohup ./logAllBuildroot.sh")
+# You can run "tail -f nohup.out" to see what would've
+# outputted to the terminal if you didn't use nohup
+
 # =========== Debug the Process ========== 
 # Uncomment this version for GDB/QEMU debugging
 # - Opens up GDB interactively
@@ -15,6 +21,12 @@
 # - Logs parse_qemu.py's simulated gdb output to qemu_in_gdb_format.txt
 #cat qemu_output.txt | ./parse_qemu.py >qemu_in_gdb_format.txt
 #cat qemu_output.txt | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot/"
+# Uncomment this version in case you just want to have qemu_in_gdb_format.txt around
+# It is often helpful for general debugging
+(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py >qemu_in_gdb_format.txt) & riscv64-unknown-elf-gdb -x gdbinit_qemulog
+
+# Split qemu_in_gdb_format.txt into chunks of 100,000 instructions for easier inspection
+#split -d -l 5600000 qemu_in_gdb_format.txt --verbose
 
 # Uncomment this version for parse_gdb_output.py debugging
 # - Uses qemu_in_gdb_format.txt
@@ -24,4 +36,4 @@
 # =========== Just Do the Thing ========== 
 # Uncomment this version for the whole thing 
 # - Logs info needed by buildroot testbench
-(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot_new/") & riscv64-unknown-elf-gdb -x gdbinit_qemulog
+#(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot_new/") & riscv64-unknown-elf-gdb -x gdbinit_qemulog

From 17afd9e5e8b7e65a868324d8ab4934be9c30fa17 Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Sat, 26 Jun 2021 07:19:51 -0400
Subject: [PATCH 15/20] temporarily disable PMP checking for EBU accesses.

---
 wally-pipelined/src/wally/wallypipelinedhart.sv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index e8064bcc7..a77c3ab01 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -171,6 +171,7 @@ module wallypipelinedhart (
   ahblite ebu( 
     //.InstrReadF(1'b0),
     //.InstrRData(InstrF), // hook up InstrF later
+    .ISquashBusAccessF(1'b0), // *** temporary hack to disable PMP instruction fetch checking
     .WriteDataM(WriteDataM),
     .MemSizeM(Funct3M[1:0]), .UnsignedLoadM(Funct3M[2]),
     .Funct7M(InstrM[31:25]),

From 751e606fb729a947170809c7b7aa5caed7c0728b Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Sat, 26 Jun 2021 08:30:58 -0400
Subject: [PATCH 16/20] trying out Noah and Kaveh's proposed hack for which
 CSRs to update for QEMU MMU bug

---
 wally-pipelined/linux-testgen/logAllBuildroot.sh |  4 ++--
 wally-pipelined/linux-testgen/parse_qemu.py      | 13 ++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/wally-pipelined/linux-testgen/logAllBuildroot.sh b/wally-pipelined/linux-testgen/logAllBuildroot.sh
index 073fc5a9d..740fa8c4b 100755
--- a/wally-pipelined/linux-testgen/logAllBuildroot.sh
+++ b/wally-pipelined/linux-testgen/logAllBuildroot.sh
@@ -23,7 +23,7 @@
 #cat qemu_output.txt | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot/"
 # Uncomment this version in case you just want to have qemu_in_gdb_format.txt around
 # It is often helpful for general debugging
-(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py >qemu_in_gdb_format.txt) & riscv64-unknown-elf-gdb -x gdbinit_qemulog
+#(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py >qemu_in_gdb_format.txt) & riscv64-unknown-elf-gdb -x gdbinit_qemulog
 
 # Split qemu_in_gdb_format.txt into chunks of 100,000 instructions for easier inspection
 #split -d -l 5600000 qemu_in_gdb_format.txt --verbose
@@ -36,4 +36,4 @@
 # =========== Just Do the Thing ========== 
 # Uncomment this version for the whole thing 
 # - Logs info needed by buildroot testbench
-#(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot_new/") & riscv64-unknown-elf-gdb -x gdbinit_qemulog
+(qemu-system-riscv64 -M virt -nographic -bios /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/fw_jump.elf -kernel /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/Image -append "root=/dev/vda ro" -initrd /courses/e190ax/qemu_sim/rv64_initrd/buildroot_experimental/output/images/rootfs.cpio -d nochain,cpu,in_asm -serial /dev/null -singlestep -s -S 2>&1 >/dev/null | ./parse_qemu.py | ./parse_gdb_output.py "/courses/e190ax/buildroot_boot_new/") & riscv64-unknown-elf-gdb -x gdbinit_qemulog
diff --git a/wally-pipelined/linux-testgen/parse_qemu.py b/wally-pipelined/linux-testgen/parse_qemu.py
index c7f31fb22..ac5d95f0b 100755
--- a/wally-pipelined/linux-testgen/parse_qemu.py
+++ b/wally-pipelined/linux-testgen/parse_qemu.py
@@ -40,13 +40,12 @@ def parseCSRs(l):
             val = int(l.split()[1],16)
             if inPageFault:
                 # Not sure if these CSRs should be updated or not during page fault.
-                #if l.startswith("mstatus") or l.startswith("mepc") or l.startswith("mcause") or l.startswith("mtval") or l.startswith("sepc") or l.startswith("scause") or l.startswith("stval"):
-                #    # We do update some CSRs
-                #    CSRs[csr] = val
-                #else:
-                #    # Others we preserve until changed later
-                #    pageFaultCSRs[csr] = val
-                pageFaultCSRs[csr] = val
+                if l.startswith("mstatus") or l.startswith("mepc") or l.startswith("mcause") or l.startswith("mtval") or l.startswith("sepc") or l.startswith("scause") or l.startswith("stval"):
+                    # We do update some CSRs
+                    CSRs[csr] = val
+                else:
+                    # Others we preserve until changed later
+                    pageFaultCSRs[csr] = val
             elif pageFaultCSRs and (csr in pageFaultCSRs):
                 if (val != pageFaultCSRs[csr]):
                     del pageFaultCSRs[csr]

From 0c2b7a1132ff01d48522c70be2ef33f74eea35d9 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Mon, 28 Jun 2021 18:53:58 -0400
Subject: [PATCH 17/20] FPU control signals changed and FMA works

---
 wally-pipelined/src/fpu/FMA/tbgen/tb.sv       |   5 +-
 wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh |   2 +-
 wally-pipelined/src/fpu/fctrl.sv              | 267 ++++-----
 wally-pipelined/src/fpu/fma1.sv               | 281 +++++-----
 wally-pipelined/src/fpu/fma2.sv               | 516 +++++++++---------
 wally-pipelined/src/fpu/fpu.sv                | 149 ++---
 wally-pipelined/src/fpu/fpuhazard.sv          |   6 +-
 wally-pipelined/src/ieu/datapath.sv           |   8 +-
 wally-pipelined/src/ieu/ieu.sv                |   3 +-
 .../src/wally/wallypipelinedhart.sv           |  23 +-
 10 files changed, 571 insertions(+), 689 deletions(-)

diff --git a/wally-pipelined/src/fpu/FMA/tbgen/tb.sv b/wally-pipelined/src/fpu/FMA/tbgen/tb.sv
index 4c93cd575..5a8e7a868 100644
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.sv
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.sv
@@ -45,8 +45,8 @@ assign FOpCtrlE = 3'b0;
 // down - 010
 // up - 011
 // nearest max mag - 100  
-assign FrmE = 3'b010;
-assign FmtE = 1'b1;
+assign FrmE = 3'b011;
+assign FmtE = 1'b0;
 
 
 assign	wnan = FmtE ? &FmaResultM[62:52] && |FmaResultM[51:0] : &FmaResultM[62:55] && |FmaResultM[54:32]; 
@@ -110,7 +110,6 @@ always @(posedge clk)
 		if(ans >= 64'h7FF8000000000000 && ans <= 64'h7FFfffffffffffff ) $display( "ans=qutNaN ");
 		if(ans >= 64'hFFF8000000000000 && ans <= 64'hFFFfffffffffffff ) $display( "ans=qutNaN ");
         errors = errors + 1;
-	  if (errors == 20)
 		$stop;
     end
     if((FmtE==1'b0)&(FmaFlagsM != flags[4:0] || (!wnan && (FmaResultM != ans)) || (wnan && ansnan && ~(((xnan && (FmaResultM[62:0] == {FInput1E[62:55],1'b1,FInput1E[53:0]})) || (ynan && (FmaResultM[62:0] == {FInput2E[62:55],1'b1,FInput2E[53:0]}))  || (znan && (FmaResultM[62:0] == {FInput3E[62:55],1'b1,FInput3E[53:0]})) || (FmaResultM[62:0] == ans[62:0]))) ))) begin
diff --git a/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh b/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
index dc9562b1a..5f12e143c 100755
--- a/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
@@ -1,3 +1,3 @@
-testfloat_gen f64_mulAdd -tininessafter -n 6133248 -rmin  -seed 113355 -level 1 > testFloat
+testfloat_gen f32_mulAdd -tininessafter -n 6133248 -rmax  -seed 113355 -level 1 > testFloat
 tr -d ' ' < testFloat > testFloatNoSpace
 
diff --git a/wally-pipelined/src/fpu/fctrl.sv b/wally-pipelined/src/fpu/fctrl.sv
index a9fcb564e..3be9b281a 100755
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@@ -6,176 +6,128 @@ module fctrl (
   input  logic [2:0] Funct3D,
   input  logic [2:0] FRM_REGW,
   output logic       IllegalFPUInstrD,
-  output logic       IsFPD,
   output logic       FWriteEnD,
   output logic       FDivStartD,
   output logic [2:0] FResultSelD,
   output logic [3:0] FOpCtrlD,
+  output logic [1:0] FResSelD,
+  output logic [1:0] FIntResSelD,
   output logic       FmtD,
   output logic [2:0] FrmD,
-  output logic [1:0] FMemRWD,
-  output logic       FOutputInput2D,
-  output logic       FInput2UsedD, FInput3UsedD,
   output logic       FWriteIntD);
 
-
-  logic IllegalFPUInstr1D, IllegalFPUInstr2D;
-  // *** fix rounding for dynamic rounding
+  `define FCTRLW 15
+  logic [`FCTRLW-1:0] ControlsD;
+  // FPU Instruction Decoder
+  always_comb
+    case(OpD)
+    // FWriteEn_FWriteInt_FResultSel_FOpCtrl_FResSel_FIntResSel_FDivStart_IllegalFPUInstr
+      7'b0000111: case(Funct3D)
+                    3'b010:  ControlsD = `FCTRLW'b1_0_000_0000_00_00_0_0; // flw
+                    3'b011:  ControlsD = `FCTRLW'b1_0_000_0001_00_00_0_0; // fld
+                    default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                  endcase
+      7'b0100111: case(Funct3D)
+                    3'b010:  ControlsD = `FCTRLW'b0_0_000_0010_00_00_0_0; // fsw
+                    3'b011:  ControlsD = `FCTRLW'b0_0_000_0011_00_00_0_0; // fsd
+                    default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                  endcase
+      7'b1000011:   ControlsD = `FCTRLW'b1_0_001_0000_00_00_0_0; // fmadd
+      7'b1000111:   ControlsD = `FCTRLW'b1_0_001_0001_00_00_0_0; // fmsub
+      7'b1001011:   ControlsD = `FCTRLW'b1_0_001_0010_00_00_0_0; // fnmsub
+      7'b1001111:   ControlsD = `FCTRLW'b1_0_001_0011_00_00_0_0; // fnmadd
+      7'b1010011: casez(Funct7D)
+                    7'b00000??: ControlsD = `FCTRLW'b1_0_010_0000_00_00_0_0; // fadd
+                    7'b00001??: ControlsD = `FCTRLW'b1_0_010_0001_00_00_0_0; // fsub
+                    7'b00010??: ControlsD = `FCTRLW'b1_0_001_0100_00_00_0_0; // fmul
+                    7'b00011??: ControlsD = `FCTRLW'b1_0_011_0000_00_00_1_0; // fdiv
+                    7'b01011??: ControlsD = `FCTRLW'b1_0_011_0001_00_00_1_0; // fsqrt
+                    7'b00100??: case(Funct3D)
+                                  3'b000:  ControlsD = `FCTRLW'b1_0_100_0000_01_00_0_0; // fsgnj
+                                  3'b001:  ControlsD = `FCTRLW'b1_0_100_0001_01_00_0_0; // fsgnjn
+                                  3'b010:  ControlsD = `FCTRLW'b1_0_100_0010_01_00_0_0; // fsgnjx
+                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                endcase
+                    7'b00101??: case(Funct3D)
+                                  3'b000:  ControlsD = `FCTRLW'b1_0_100_0111_10_00_0_0; // fmin
+                                  3'b001:  ControlsD = `FCTRLW'b1_0_100_0101_10_00_0_0; // fmax
+                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                endcase
+                    7'b10100??: case(Funct3D)
+                                  3'b010:  ControlsD = `FCTRLW'b0_1_100_0010_00_00_0_0; // feq
+                                  3'b001:  ControlsD = `FCTRLW'b0_1_100_0001_00_00_0_0; // flt
+                                  3'b000:  ControlsD = `FCTRLW'b0_1_100_0011_00_00_0_0; // fle
+                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                endcase
+                    7'b11100??: if (Funct3D == 3'b001)
+                                  ControlsD = `FCTRLW'b0_1_100_0000_00_10_0_0; // fclass
+                                else if (Funct3D[1:0] == 2'b00) ControlsD = `FCTRLW'b0_1_100_0100_00_01_0_0; // fmv.x.w
+                                else if (Funct3D[1:0] == 2'b01) ControlsD = `FCTRLW'b0_1_100_0101_00_01_0_0; // fmv.x.d
+                                else                            ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                    7'b1100000: case(Rs2D[0])
+                                  1'b0:    ControlsD = `FCTRLW'b0_1_010_0110_00_00_0_0; // fcvt.s.w
+                                  1'b1:    ControlsD = `FCTRLW'b0_1_010_0101_00_00_0_0; // fcvt.s.wu
+                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                endcase
+                    7'b1101000: case(Rs2D[0])
+                                  1'b0:    ControlsD = `FCTRLW'b1_1_010_0100_00_00_0_0; // fcvt.w.s
+                                  1'b1:    ControlsD = `FCTRLW'b1_1_010_0101_00_00_0_0; // fcvt.wu.s
+                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                endcase
+                    7'b1111000: ControlsD = `FCTRLW'b1_0_100_0000_00_00_0_0; // fmv.w.x
+                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_0010_00_00_0_0; // fcvt.s.d
+                    7'b1100001: case(Rs2D[0])
+                                  1'b0:    ControlsD = `FCTRLW'b0_1_010_1110_00_00_0_0; // fcvt.d.w
+                                  1'b1:    ControlsD = `FCTRLW'b0_1_010_1111_00_00_0_0; // fcvt.d.wu
+                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                endcase
+                    7'b1101001: case(Rs2D[0])
+                                  1'b0:    ControlsD = `FCTRLW'b1_0_010_1100_00_00_0_0; // fcvt.w.d
+                                  1'b1:    ControlsD = `FCTRLW'b1_0_010_1101_00_00_0_0; // fcvt.wu.d
+                                  default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                                endcase
+                    7'b1111001: ControlsD = `FCTRLW'b1_0_100_0001_00_00_0_0; // fmv.d.x
+                    7'b0100001: ControlsD = `FCTRLW'b1_0_010_1000_00_00_0_0; // fcvt.d.s
+                    default:    ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+                  endcase
+      default:      ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
+    endcase
+  // unswizzle control bits
+  assign {FWriteEnD, FWriteIntD, FResultSelD, FOpCtrlD, FResSelD, FIntResSelD, FDivStartD, IllegalFPUInstrD} = ControlsD;
+  
+  // if dynamic rounding, choose FRM_REGW
   assign FrmD = &Funct3D ? FRM_REGW : Funct3D;
 
-  //all subsequent logic is based on the table present
-  //in Section 5 of Wally Architecture Specification
-  
-  //write is enabled for all fp instruciton op codes
-  //sans fp load
-  always_comb begin
-	//case statement is easier to modify
-	//in case of errors
-	case(OpD)
-		//fp instructions sans load
-		7'b1010011 : IsFPD = 1'b1;
-		7'b1000011 : IsFPD = 1'b1;
-		7'b1000111 : IsFPD = 1'b1;
-		7'b1001011 : IsFPD = 1'b1;
-		7'b1001111 : IsFPD = 1'b1;
-		7'b0100111 : IsFPD = 1'b1;
-		7'b0000111 : IsFPD = 1'b1;// KEP change 7'b1010011 to 7'b0000111
-		default    : IsFPD = 1'b0;
-	endcase
-  end
-  
-
-  
-  //useful intermediary signals
-  //
-  //(mult only not supported in current datapath)
-  //set third FMA operand to zero in this case
-  //(or equivalent)
-
-  always_comb begin
-    //checks all but FMA/store/load
-    IllegalFPUInstr2D = 0;
-    FDivStartD = 1'b0;
-    if(OpD == 7'b1010011) begin
-      casez(Funct7D)
-        //compare	
-        7'b10100?? : FResultSelD = 3'b001;
-        //div/sqrt
-        7'b0?011?? : begin FResultSelD = 3'b000; FDivStartD = 1'b1; end
-        //add/sub
-        7'b0000??? : FResultSelD = 3'b100;
-        //mult
-        7'b00010?? : FResultSelD = 3'b010;
-        //convert (not precision)
-        7'b110?0?? : FResultSelD = 3'b100;
-        //convert (precision)
-        7'b010000? : FResultSelD = 3'b100;
-        //Min/Max
-        7'b00101?? : FResultSelD = 3'b001;
-        //sign injection
-        7'b00100?? : FResultSelD = 3'b011;
-        //classify //only if funct3 = 001 
-        7'b11100?? : if(Funct3D == 3'b001) FResultSelD = 3'b101;
-        //output ReadData1
-                    else if (Funct7D[1] == 0) FResultSelD = 3'b111;
-        //output SrcW
-        7'b111100? : FResultSelD = 3'b110;
-        default    : begin FResultSelD = 3'b0; IllegalFPUInstr2D = 1'b1; end
-      endcase
-    end
-    //FMA/store/load
-    else begin
-      case(OpD)
-        //4 FMA instructions
-        7'b1000011 : FResultSelD = 3'b010;
-        7'b1000111 : FResultSelD = 3'b010;
-        7'b1001011 : FResultSelD = 3'b010;
-        7'b1001111 : FResultSelD = 3'b010;
-        //store
-        7'b0100111 : FResultSelD = 3'b111;
-        //load
-        7'b0000111 : FResultSelD = 3'b111;
-        default    : begin FResultSelD = 3'b0; IllegalFPUInstr2D = 1'b1; end
-      endcase
-    end
-  end
-
-  assign FOutputInput2D = OpD == 7'b0100111;
-
-  assign FMemRWD[0] = FOutputInput2D;
-  assign FMemRWD[1] = OpD == 7'b0000111;
-
-
-
-  //register is chosen based on operation performed
-  //---- 
-  //write selection is chosen in the same way as 
-  //register selection
-  //
-
-  // reg/write sel logic and assignment
-  // 
-  // 3'b000 = div/sqrt
-  // 3'b001 = cmp
-  // 3'b010 = fma/mult
-  // 3'b011 = sgn inj
-  // 3'b100 = add/sub/cnvt
-  // 3'b101 = classify
-  // 3'b110 = output SrcAW
-  // 3'b111 = output ReadData1
-  //
-  //reg select
-  
-  //this value is used enough to be shorthand
-
-
-  //operation control for each fp operation
-  //has to be expanded over standard to account for
-  //integrated fpadd/cvt
-  //
-  //will integrate FMA opcodes into design later
-  //
-  //conversion instructions will
-  //also need to be added later as I find the opcode
-  //version I used for this repo
-
-  //let's do separate SOP for each type of operation
-//  assign FOpCtrlD[3] = 1'b0;
-//
-//
-
-
- 
-  always_comb begin
-    IllegalFPUInstr1D = 0;
-    FInput3UsedD = 0;
-    case (FResultSelD)
-      // div/sqrt
+  // Precision
+  //  0-single
+  //  1-double
+  assign FmtD = FResultSelD == 3'b000 ? Funct3D[0] : Funct7D[0];
+  // div/sqrt
       //  fdiv  = ???0
       //  fsqrt = ???1
-      3'b000 : begin FOpCtrlD = {3'b0, Funct7D[5]}; FInput2UsedD = ~Funct7D[5]; end
-      // cmp		
+
+  // cmp		
       //  fmin = ?111
       //  fmax = ?101
       //  feq  = ?010
       //  flt  = ?001
       //  fle  = ?011
       //		   {?,    is min or max, is eq or le, is lt or le}
-      3'b001 : begin FOpCtrlD = {1'b0, Funct7D[2], ~Funct3D[0], ~(|Funct3D[2:1])}; FInput2UsedD = 1'b1; end
-      //fma/mult	
+
+  //fma/mult	
       //  fmadd  = ?000
       //  fmsub  = ?001
       //  fnmsub = ?010	-(a*b)+c
       //  fnmadd = ?011 -(a*b)-c
       //  fmul   = ?100
       //		  {?, is mul, is negitive, is sub}
-      3'b010 : begin FOpCtrlD = {1'b0, OpD[4:2]}; FInput2UsedD = 1'b1; FInput3UsedD = ~OpD[4]; end
-      // sgn inj
+
+  // sgn inj
       //  fsgnj  = ??00
       //  fsgnjn = ??01
       //  fsgnjx = ??10
-      3'b011 : begin FOpCtrlD = {2'b0, Funct3D[1:0]}; FInput2UsedD = 1'b1; end
-      // add/sub/cnvt
+
+  // add/sub/cnvt
       //  fadd      = 0000
       //  fsub      = 0001
       //  fcvt.w.s  = 0100
@@ -188,35 +140,18 @@ module fctrl (
       //  fcvt.d.w  = 1110
       //  fcvt.d.wu = 1111
       //  fcvt.d.s  = 1000
-      //		   { is double and not add/sub, is to/from int, is to int or float to double,      is unsigned or sub
-      3'b100 : begin FOpCtrlD = {Funct7D[0]&Funct7D[5], Funct7D[6], Funct7D[3] | (~Funct7D[6]&Funct7D[5]&~Funct7D[0]), (Rs2D[0]&Funct7D[5])|(Funct7D[2]&~Funct7D[5])}; FInput2UsedD = ~Funct7D[5]; end
-      // classify	  {?, ?, ?, ?}
-      3'b101 : begin FOpCtrlD = 4'b0; FInput2UsedD = 1'b0; end
-      // output SrcAW
+      //		   { is double and not add/sub, is to/from int, is to int or float to double,      is unsigned or sub}
+
       //  fmv.w.x = ???0
       //  fmv.w.d = ???1
-      3'b110 : begin FOpCtrlD = {3'b0, Funct7D[0]}; FInput2UsedD = 1'b0; end
-      // output Input1
+
       //  flw       = ?000
       //  fld       = ?001 
-      //  fsw       = ?010 // output Input2
-      //  fsd       = ?011 // output Input2
+      //  fsw       = ?010
+      //  fsd       = ?011
       //  fmv.x.w  = ?100
       //  fmv.x.d  = ?101
       //		   {?, is mv, is store, is double or fmv}
-      3'b111 : begin FOpCtrlD = {1'b0, OpD[6:5], Funct3D[0] | (OpD[6]&Funct7D[0])}; FInput2UsedD = OpD[5]; end
-      default : begin FOpCtrlD = 4'b0; IllegalFPUInstr1D = 1'b1; FInput2UsedD = 1'b0; end
-    endcase
-  end
+    
 
-  //precision
-  assign FmtD = (~&FResultSelD & Funct7D[0]) | (&FResultSelD & FOpCtrlD[0]);
-
-  assign IllegalFPUInstrD = IllegalFPUInstr1D | IllegalFPUInstr2D;
-  //write to integer source if conv to int occurs
-  //AND of Funct7 for int results 
-  //			is add/cvt       and  is to int  or is classify		 or     is cmp	       	and not max/min or is output ReadData1 and is mv
-  assign FWriteIntD = ((FResultSelD == 3'b100)&Funct7D[3]) | (FResultSelD == 3'b101) | ((FResultSelD == 3'b001)&~Funct7D[2]) | ((FResultSelD == 3'b111)&OpD[6]);
-  // 		      if not writting to int reg and not a store function and not move
-  assign FWriteEnD = ~FWriteIntD & ~OpD[5] & ~((FResultSelD == 3'b111)&OpD[6]) & IsFPD;
 endmodule
diff --git a/wally-pipelined/src/fpu/fma1.sv b/wally-pipelined/src/fpu/fma1.sv
index ab9d2bb17..76f7316ba 100644
--- a/wally-pipelined/src/fpu/fma1.sv
+++ b/wally-pipelined/src/fpu/fma1.sv
@@ -1,111 +1,111 @@
 module fma1(
  
-	input logic 	[63:0]		X,	// X
-	input logic		[63:0]		Y,	// Y
-	input logic 	[63:0]		Z,	// Z
-	input logic 	[2:0]		FOpCtrlE,	// 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-	input logic 				FmtE,		// precision 1 = double 0 = single
-	output logic 	[105:0]		ProdManE,	// 1.X frac * 1.Y frac
-	output logic 	[161:0]		AlignedAddendE,	// Z aligned for addition
-	output logic 	[12:0]		ProdExpE,		// X exponent + Y exponent - bias
-	output logic 				AddendStickyE,	// sticky bit that is calculated during alignment
-	output logic 				KillProdE,		// set the product to zero before addition if the product is too small to matter
-	output logic				XZeroE, YZeroE, ZZeroE, // inputs are zero
-	output logic				XInfE, YInfE, ZInfE,	// inputs are infinity
-	output logic				XNaNE, YNaNE, ZNaNE);	// inputs are NaN
+    input logic     [63:0]      X,  // X
+    input logic     [63:0]      Y,  // Y
+    input logic     [63:0]      Z,  // Z
+    input logic     [2:0]       FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
+    input logic                 FmtE,       // precision 1 = double 0 = single
+    output logic    [105:0]     ProdManE,   // 1.X frac * 1.Y frac
+    output logic    [161:0]     AlignedAddendE, // Z aligned for addition
+    output logic    [12:0]      ProdExpE,       // X exponent + Y exponent - bias
+    output logic                AddendStickyE,  // sticky bit that is calculated during alignment
+    output logic                KillProdE,      // set the product to zero before addition if the product is too small to matter
+    output logic                XZeroE, YZeroE, ZZeroE, // inputs are zero
+    output logic                XInfE, YInfE, ZInfE,    // inputs are infinity
+    output logic                XNaNE, YNaNE, ZNaNE);   // inputs are NaN
 
-	logic [51:0] 	XFrac,YFrac,ZFrac;	// input fraction
-	logic [52:0] 	XMan,YMan,ZMan;		// input mantissa (with leading one)
-	logic [12:0] 	XExp,YExp,ZExp;		// input exponents
-	logic 		 	XSgn,YSgn,ZSgn;		// input signs
-	logic [12:0]	AlignCnt;			// how far to shift the addend to align with the product
-	logic [211:0] 	ZManShifted;				// output of the alignment shifter including sticky bit
-	logic [211:0] 	ZManPreShifted;		// input to the alignment shifter
-	logic			XDenorm, YDenorm, ZDenorm;	// inputs are denormal
-	logic [63:0]	Addend;	// value to add (Z or zero)
-	logic [12:0]	Bias;	// 1023 for double, 127 for single
-	logic 			XExpZero, YExpZero, ZExpZero; 	// input exponent zero
-	logic 			XFracZero, YFracZero, ZFracZero; // input fraction zero
-	logic 			XExpMax, YExpMax, ZExpMax; 	// input exponent all 1s
+    logic [51:0]    XFrac,YFrac,ZFrac;  // input fraction
+    logic [52:0]    XMan,YMan,ZMan;     // input mantissa (with leading one)
+    logic [12:0]    XExp,YExp,ZExp;     // input exponents
+    logic           XSgn,YSgn,ZSgn;     // input signs
+    logic [12:0]    AlignCnt;           // how far to shift the addend to align with the product
+    logic [213:0]   ZManShifted;                // output of the alignment shifter including sticky bit
+    logic [213:0]   ZManPreShifted;     // input to the alignment shifter
+    logic           XDenorm, YDenorm, ZDenorm;  // inputs are denormal
+    logic [63:0]    Addend; // value to add (Z or zero)
+    logic [12:0]    Bias;   // 1023 for double, 127 for single
+    logic           XExpZero, YExpZero, ZExpZero;   // input exponent zero
+    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
+    logic           XExpMax, YExpMax, ZExpMax;  // input exponent all 1s
 
-	///////////////////////////////////////////////////////////////////////////////
-	// split inputs into the sign bit, fraction, and exponent to handle single or double precision
-	// 		- single precision is in the top half of the inputs
-	///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    // split inputs into the sign bit, fraction, and exponent to handle single or double precision
+    //      - single precision is in the top half of the inputs
+    ///////////////////////////////////////////////////////////////////////////////
 
-	// Set addend to zero if FMUL instruction
-  	assign Addend = FOpCtrlE[2] ? 64'b0 : Z;
+    // Set addend to zero if FMUL instruction
+    assign Addend = FOpCtrlE[2] ? 64'b0 : Z;
 
-	assign XSgn = X[63];
-	assign YSgn = Y[63];
-	assign ZSgn = Addend[63];
+    assign XSgn = X[63];
+    assign YSgn = Y[63];
+    assign ZSgn = Addend[63];
 
-	assign XExp = FmtE ? {2'b0, X[62:52]} : {5'b0, X[62:55]};
-	assign YExp = FmtE ? {2'b0, Y[62:52]} : {5'b0, Y[62:55]};
-	assign ZExp = FmtE ? {2'b0, Addend[62:52]} : {5'b0, Addend[62:55]};
+    assign XExp = FmtE ? {2'b0, X[62:52]} : {5'b0, X[62:55]};
+    assign YExp = FmtE ? {2'b0, Y[62:52]} : {5'b0, Y[62:55]};
+    assign ZExp = FmtE ? {2'b0, Addend[62:52]} : {5'b0, Addend[62:55]};
 
-	assign XFrac = FmtE ? X[51:0] : {X[54:32], 29'b0};
-	assign YFrac = FmtE ? Y[51:0] : {Y[54:32], 29'b0};
-	assign ZFrac = FmtE ? Addend[51:0] : {Addend[54:32], 29'b0};
-	
-	assign XMan = {~XExpZero, XFrac};
-	assign YMan = {~YExpZero, YFrac};
-	assign ZMan = {~ZExpZero, ZFrac};
+    assign XFrac = FmtE ? X[51:0] : {X[54:32], 29'b0};
+    assign YFrac = FmtE ? Y[51:0] : {Y[54:32], 29'b0};
+    assign ZFrac = FmtE ? Addend[51:0] : {Addend[54:32], 29'b0};
+   
+    assign XMan = {~XExpZero, XFrac};
+    assign YMan = {~YExpZero, YFrac};
+    assign ZMan = {~ZExpZero, ZFrac};
 
-	assign Bias = FmtE ? 13'h3ff : 13'h7f;
+    assign Bias = FmtE ? 13'h3ff : 13'h7f;
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// determine if an input is a special value
-	///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    // determine if an input is a special value
+    ///////////////////////////////////////////////////////////////////////////////
 
-	assign XExpZero = ~|XExp;
-	assign YExpZero = ~|YExp;
-	assign ZExpZero = ~|ZExp;
-	
-	assign XFracZero = ~|XFrac;
-	assign YFracZero = ~|YFrac;
-	assign ZFracZero = ~|ZFrac;
+    assign XExpZero = ~|XExp;
+    assign YExpZero = ~|YExp;
+    assign ZExpZero = ~|ZExp;
+   
+    assign XFracZero = ~|XFrac;
+    assign YFracZero = ~|YFrac;
+    assign ZFracZero = ~|ZFrac;
 
-	assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
-	assign YExpMax = FmtE ? &YExp[10:0] : &YExp[7:0];
-	assign ZExpMax = FmtE ? &ZExp[10:0] : &ZExp[7:0];
-	
-	assign XNaNE = XExpMax & ~XFracZero;
-	assign YNaNE = YExpMax & ~YFracZero;
-	assign ZNaNE = ZExpMax & ~ZFracZero;
+    assign XExpMax = FmtE ? &XExp[10:0] : &XExp[7:0];
+    assign YExpMax = FmtE ? &YExp[10:0] : &YExp[7:0];
+    assign ZExpMax = FmtE ? &ZExp[10:0] : &ZExp[7:0];
+   
+    assign XNaNE = XExpMax & ~XFracZero;
+    assign YNaNE = YExpMax & ~YFracZero;
+    assign ZNaNE = ZExpMax & ~ZFracZero;
 
-	assign XDenorm = XExpZero & ~XFracZero; 
-	assign YDenorm = YExpZero & ~YFracZero; 
-	assign ZDenorm = ZExpZero & ~ZFracZero; 
+    assign XDenorm = XExpZero & ~XFracZero;
+    assign YDenorm = YExpZero & ~YFracZero;
+    assign ZDenorm = ZExpZero & ~ZFracZero;
 
-	assign XInfE = XExpMax & XFracZero; 
-	assign YInfE = YExpMax & YFracZero; 
-	assign ZInfE = ZExpMax & ZFracZero; 
+    assign XInfE = XExpMax & XFracZero;
+    assign YInfE = YExpMax & YFracZero;
+    assign ZInfE = ZExpMax & ZFracZero;
 
-	assign XZeroE = XExpZero & XFracZero;
-	assign YZeroE = YExpZero & YFracZero;
-	assign ZZeroE = ZExpZero & ZFracZero;
+    assign XZeroE = XExpZero & XFracZero;
+    assign YZeroE = YExpZero & YFracZero;
+    assign ZZeroE = ZExpZero & ZFracZero;
 
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// Calculate the product
-	//		- When multipliying two fp numbers, add the exponents
-	// 		- Subtract the bias (XExp + YExp has two biases, one from each exponent)
-	//		- Denormal numbers have an an exponent value of 1, however they are 
-	//		  represented with an exponent of 0. add one if there is a denormal number
-	///////////////////////////////////////////////////////////////////////////////
-	
-	// verilator lint_off WIDTH
-	assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 : 
-				 XExp + YExp - Bias + XDenorm + YDenorm;
+    ///////////////////////////////////////////////////////////////////////////////
+    // Calculate the product
+    //      - When multipliying two fp numbers, add the exponents
+    //      - Subtract the bias (XExp + YExp has two biases, one from each exponent)
+    //      - Denormal numbers have an an exponent value of 1, however they are
+    //        represented with an exponent of 0. add one if there is a denormal number
+    ///////////////////////////////////////////////////////////////////////////////
+   
+    // verilator lint_off WIDTH
+    assign ProdExpE = (XZeroE|YZeroE) ? 13'b0 :
+                 XExp + YExp - Bias + XDenorm + YDenorm;
 
-	// Calculate the product's mantissa
-	//		- Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
-	assign ProdManE =  XMan * YMan;
+    // Calculate the product's mantissa
+    //      - Add the assumed one. If the number is denormalized or zero, it does not have an assumed one.
+    assign ProdManE =  XMan * YMan;
 
 
 
@@ -114,72 +114,71 @@ module fma1(
 
 
 
-	
-	///////////////////////////////////////////////////////////////////////////////
-	// Alignment shifter
-	///////////////////////////////////////////////////////////////////////////////
+   
+    ///////////////////////////////////////////////////////////////////////////////
+    // Alignment shifter
+    ///////////////////////////////////////////////////////////////////////////////
 
-	// determine the shift count for alignment
-	//		- negitive means Z is larger, so shift Z left
-	//		- positive means the product is larger, so shift Z right
-	//		- Denormal numbers have an an exponent value of 1, however they are 
-	//		  represented with an exponent of 0. add one to the exponent if it is a denormal number
-	assign AlignCnt = ProdExpE - ZExp - ZDenorm;
-	// verilator lint_on WIDTH
+    // determine the shift count for alignment
+    //      - negitive means Z is larger, so shift Z left
+    //      - positive means the product is larger, so shift Z right
+    //      - Denormal numbers have an an exponent value of 1, however they are
+    //        represented with an exponent of 0. add one to the exponent if it is a denormal number
+    assign AlignCnt = ProdExpE - ZExp - ZDenorm;
+    // verilator lint_on WIDTH
 
 
-	// Defualt Addition without shifting
-	// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-	//						 |1'b0| addnend |
+    // Defualt Addition without shifting
+    //          |   55'b0    |  106'b(product)  | 2'b0 |
+    //                       |1'b0| addnend |
 
-	// the 1'b0 before the added is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
-	assign ZManPreShifted = {55'b0, ZMan, 104'b0};
-	always_comb 
-		begin
-			
-		// If the product is too small to effect the sum, kill the product
+    // the 1'b0 before the added is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
+    assign ZManPreShifted = {55'b0, ZMan, 106'b0};
+    always_comb
+        begin
+           
+        // If the product is too small to effect the sum, kill the product
 
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//	| addnend |
-		if ($signed(AlignCnt) <= $signed(-13'd56)) begin
-			KillProdE = 1;
-			ZManShifted = {107'b0, ZMan, 52'b0};
-			AddendStickyE = ~(XZeroE|YZeroE);
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //  | addnend |
+        if ($signed(AlignCnt) <= $signed(-13'd56)) begin
+            KillProdE = 1;
+            ZManShifted = ZManPreShifted;//{107'b0, ZMan, 54'b0};
+            AddendStickyE = ~(XZeroE|YZeroE);
 
-		// If the Addend is shifted left (negitive AlignCnt)
+        // If the Addend is shifted left (negitive AlignCnt)
 
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//					| addnend |
-		end else if($signed(AlignCnt) <= $signed(13'd0))  begin
-			KillProdE = 0;
-			ZManShifted = ZManPreShifted << -AlignCnt;
-			AddendStickyE = |(ZManShifted[49:0]);
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                  | addnend |
+        end else if($signed(AlignCnt) <= $signed(13'd0))  begin
+            KillProdE = 0;
+            ZManShifted = ZManPreShifted << -AlignCnt;
+            AddendStickyE = |(ZManShifted[51:0]);
 
-		// If the Addend is shifted right (positive AlignCnt)
+        // If the Addend is shifted right (positive AlignCnt)
 
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//									| addnend |
-		end else if ($signed(AlignCnt)<=$signed(13'd104))  begin
-			KillProdE = 0;
-			ZManShifted = ZManPreShifted >> AlignCnt;
-			AddendStickyE = |(ZManShifted[49:0]);
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                                  | addnend |
+        end else if ($signed(AlignCnt)<=$signed(13'd106))  begin
+            KillProdE = 0;
+            ZManShifted = ZManPreShifted >> AlignCnt;
+            AddendStickyE = |(ZManShifted[51:0]);
 
-		// If the addend is too small to effect the addition		
-		//		- The addend has to shift two past the end of the addend to be considered too small
-		//		- The 2 extra bits are needed for rounding
+        // If the addend is too small to effect the addition        
+        //      - The addend has to shift two past the end of the addend to be considered too small
+        //      - The 2 extra bits are needed for rounding
 
-		// 			| 	55'b0	 |	106'b(product)	| 2'b0 |
-		//														| addnend |
-		end else begin
-			KillProdE = 0;
-			ZManShifted = 0;
-			AddendStickyE = ~ZZeroE;
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                                                      | addnend |
+        end else begin
+            KillProdE = 0;
+            ZManShifted = 0;
+            AddendStickyE = ~ZZeroE;
 
-		end 
-	end
+        end
+    end
 
-	
-	assign AlignedAddendE = ZManShifted[211:50];
-
-endmodule
+   
+    assign AlignedAddendE = ZManShifted[213:52];
 
+endmodule
\ No newline at end of file
diff --git a/wally-pipelined/src/fpu/fma2.sv b/wally-pipelined/src/fpu/fma2.sv
index f9efe93e8..131f98394 100644
--- a/wally-pipelined/src/fpu/fma2.sv
+++ b/wally-pipelined/src/fpu/fma2.sv
@@ -1,127 +1,131 @@
+
+
 module fma2(
  
-	input logic 	[63:0]		X,	// X
-	input logic		[63:0]		Y,	// Y
-	input logic 	[63:0]		Z,	// Z
-	input logic 	[2:0] 		FrmM,		// rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
-	input logic 	[2:0]		FOpCtrlM,	// 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-	input logic 				FmtM,		// precision 1 = double 0 = single
-	input logic 	[105:0]		ProdManM,	// 1.X frac * 1.Y frac
-	input logic 	[161:0]		AlignedAddendM,	// Z aligned for addition
-	input logic 	[12:0]		ProdExpM,		// X exponent + Y exponent - bias
-	input logic 				AddendStickyM,	// sticky bit that is calculated during alignment
-	input logic 				KillProdM,		// set the product to zero before addition if the product is too small to matter
-	input logic					XZeroM, YZeroM, ZZeroM, // inputs are zero
-	input logic					XInfM, YInfM, ZInfM,	// inputs are infinity
-	input logic					XNaNM, YNaNM, ZNaNM,	// inputs are NaN
-	output logic	[63:0]		FmaResultM,		// FMA final result
-	output logic 	[4:0]		FmaFlagsM);		// FMA flags {invalid, divide by zero, overflow, underflow, inexact}
-	
+    input logic     [63:0]      X,  // X
+    input logic     [63:0]      Y,  // Y
+    input logic     [63:0]      Z,  // Z
+    input logic     [2:0]       FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic     [2:0]       FOpCtrlM,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
+    input logic                 FmtM,       // precision 1 = double 0 = single
+    input logic     [105:0]     ProdManM,   // 1.X frac * 1.Y frac
+    input logic     [161:0]     AlignedAddendM, // Z aligned for addition
+    input logic     [12:0]      ProdExpM,       // X exponent + Y exponent - bias
+    input logic                 AddendStickyM,  // sticky bit that is calculated during alignment
+    input logic                 KillProdM,      // set the product to zero before addition if the product is too small to matter
+    input logic                 XZeroM, YZeroM, ZZeroM, // inputs are zero
+    input logic                 XInfM, YInfM, ZInfM,    // inputs are infinity
+    input logic                 XNaNM, YNaNM, ZNaNM,    // inputs are NaN
+    output logic    [63:0]      FmaResultM,     // FMA final result
+    output logic    [4:0]       FmaFlagsM);     // FMA flags {invalid, divide by zero, overflow, underflow, inexact}
+   
 
 
-	logic [51:0] 	ResultFrac;	// Result fraction
-	logic [10:0] 	ResultExp;	// Result exponent
-	logic 		 	ResultSgn;	// Result sign
-	logic [10:0] 	ZExp;	// input exponent
-	logic 		 	XSgn, YSgn, ZSgn;	// input sign
-	logic 		 	PSgn;		// product sign
-	logic [105:0]	ProdMan2;	// product being added
-	logic [162:0]	AlignedAddend2;	// possibly inverted aligned Z
- 	logic [161:0]	Sum;		// positive sum
-	logic [162:0]	PreSum;		// possibly negitive sum 
-	logic [12:0]	SumExp;		// exponent of the normalized sum
-	logic [12:0]	SumExpTmp;	// exponent of the normalized sum not taking into account denormal or zero results
-	logic [12:0]	SumExpTmpMinus1;	// SumExpTmp-1
-	logic [12:0]	FullResultExp;		// ResultExp with bits to determine sign and overflow
-	logic [53:0]	NormSum;	// normalized sum
-	logic [161:0]	SumShifted; // sum shifted for normalization
-	logic [8:0]		NormCnt;	// output of the leading zero detector
-	logic 			NormSumSticky; // sticky bit calulated from the normalized sum
-	logic 			SumZero;	// is the sum zero
-	logic 			NegSum;		// is the sum negitive
-	logic 			InvZ;		// invert Z if there is a subtraction (-product + Z or product - Z)
-	logic			ResultDenorm;	// is the result denormalized
-	logic			Sticky;		// Sticky bit
-	logic 			Plus1, Minus1, CalcPlus1, CalcMinus1;	// do you add or subtract one for rounding
-	logic 			Invalid,Underflow,Overflow,Inexact;	// flags
-	logic [8:0]		DenormShift;	// right shift if the result is denormalized
-	logic 			SubBySmallNum;	// was there supposed to be a subtraction by a small number
-	logic [63:0]	Addend;		// value to add (Z or zero)
-	logic			ZeroSgn;		// the result's sign if the sum is zero
-	logic			ResultSgnTmp;	// the result's sign assuming the result is not zero
-	logic 			Guard, Round, LSBNormSum;	// bits needed to determine rounding
-	logic [12:0] 	MaxExp;		// maximum value of the exponent
-	logic [12:0] 	FracLen;	// length of the fraction
-	logic 			SigNaN;		// is an input a signaling NaN
-	logic 			UnderflowFlag; 	// Underflow singal used in FmaFlagsM (used to avoid a circular depencency)
-	logic [63:0] XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results
+    logic [51:0]    ResultFrac; // Result fraction
+    logic [10:0]    ResultExp;  // Result exponent
+    logic           ResultSgn;  // Result sign
+    logic [10:0]    ZExp;   // input exponent
+    logic           XSgn, YSgn, ZSgn;   // input sign
+    logic           PSgn;       // product sign
+    logic [105:0]   ProdMan2;   // product being added
+    logic [162:0]   AlignedAddend2; // possibly inverted aligned Z
+    logic [161:0]   Sum;        // positive sum
+    logic [162:0]   PreSum;     // possibly negitive sum
+    logic [12:0]    SumExp;     // exponent of the normalized sum
+    logic [12:0]    SumExpTmp;  // exponent of the normalized sum not taking into account denormal or zero results
+    logic [12:0]    SumExpTmpMinus1;    // SumExpTmp-1
+    logic [12:0]    FullResultExp;      // ResultExp with bits to determine sign and overflow
+    logic [54:0]    NormSum;    // normalized sum
+    logic [161:0]   SumShifted; // sum shifted for normalization
+    logic [8:0]     NormCnt;    // output of the leading zero detector
+    logic           NormSumSticky; // sticky bit calulated from the normalized sum
+    logic           SumZero;    // is the sum zero
+    logic           NegSum;     // is the sum negitive
+    logic           InvZ;       // invert Z if there is a subtraction (-product + Z or product - Z)
+    logic           ResultDenorm;   // is the result denormalized
+    logic           Sticky;     // Sticky bit
+    logic           Plus1, Minus1, CalcPlus1, CalcMinus1;   // do you add or subtract one for rounding
+    logic           UfPlus1, UfCalcPlus1;  // do you add one (for determining underflow flag)
+    logic           Invalid,Underflow,Overflow,Inexact; // flags
+    logic [8:0]     DenormShift;    // right shift if the result is denormalized
+    logic           SubBySmallNum;  // was there supposed to be a subtraction by a small number
+    logic [63:0]    Addend;     // value to add (Z or zero)
+    logic           ZeroSgn;        // the result's sign if the sum is zero
+    logic           ResultSgnTmp;   // the result's sign assuming the result is not zero
+    logic           Guard, Round, LSBNormSum;   // bits needed to determine rounding
+    logic           UfGuard, UfRound, UfLSBNormSum;   // bits needed to determine rounding for underflow flag
+    logic [12:0]    MaxExp;     // maximum value of the exponent
+    logic [12:0]    FracLen;    // length of the fraction
+    logic           SigNaN;     // is an input a signaling NaN
+    logic           UnderflowFlag;  // Underflow singal used in FmaFlagsM (used to avoid a circular depencency)
+    logic [63:0] XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results
 
-	
-	///////////////////////////////////////////////////////////////////////////////
-	// Select input fields
-	// The following logic duplicates fma1 because it's cheaper to recompute than provide registers
-	///////////////////////////////////////////////////////////////////////////////
+   
+    ///////////////////////////////////////////////////////////////////////////////
+    // Select input fields
+    // The following logic duplicates fma1 because it's cheaper to recompute than provide registers
+    ///////////////////////////////////////////////////////////////////////////////
 
-	// Set addend to zero if FMUL instruction
-  	assign Addend = FOpCtrlM[2] ? 64'b0 : Z;
+    // Set addend to zero if FMUL instruction
+    assign Addend = FOpCtrlM[2] ? 64'b0 : Z;
 
-	// split inputs into the sign bit, and exponent to handle single or double precision
-	// 		- single precision is in the top half of the inputs
-	assign XSgn = X[63];
-	assign YSgn = Y[63];
-	assign ZSgn = Addend[63]^FOpCtrlM[0]; //Negate Z if subtraction
+    // split inputs into the sign bit, and exponent to handle single or double precision
+    //      - single precision is in the top half of the inputs
+    assign XSgn = X[63];
+    assign YSgn = Y[63];
+    assign ZSgn = Addend[63]^FOpCtrlM[0]; //Negate Z if subtraction
 
-	assign ZExp = FmtM ? Addend[62:52] : {3'b0, Addend[62:55]};
+    assign ZExp = FmtM ? Addend[62:52] : {3'b0, Addend[62:55]};
 
 
 
 
-	// Calculate the product's sign
-	//		Negate product's sign if FNMADD or FNMSUB
-	assign PSgn = XSgn ^ YSgn ^ FOpCtrlM[1];
+    // Calculate the product's sign
+    //      Negate product's sign if FNMADD or FNMSUB
+    assign PSgn = XSgn ^ YSgn ^ FOpCtrlM[1];
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// Addition
-	///////////////////////////////////////////////////////////////////////////////
-	
-	// Negate Z  when doing one of the following opperations:
-	//		-prod +  Z
-	//		 prod -  Z 
-	assign InvZ = ZSgn ^ PSgn;
+    ///////////////////////////////////////////////////////////////////////////////
+    // Addition
+    ///////////////////////////////////////////////////////////////////////////////
+   
+    // Negate Z  when doing one of the following opperations:
+    //      -prod +  Z
+    //       prod -  Z
+    assign InvZ = ZSgn ^ PSgn;
 
-	// Choose an inverted or non-inverted addend - the one is added later
-	assign AlignedAddend2 = InvZ ? ~{1'b0, AlignedAddendM} : {1'b0, AlignedAddendM};
-	// Kill the product if the product is too small to effect the addition (determined in fma1.sv)
-	assign ProdMan2 = KillProdM ? 106'b0 : ProdManM;
+    // Choose an inverted or non-inverted addend - the one is added later
+    assign AlignedAddend2 = InvZ ? ~{1'b0, AlignedAddendM} : {1'b0, AlignedAddendM};
+    // Kill the product if the product is too small to effect the addition (determined in fma1.sv)
+    assign ProdMan2 = KillProdM ? 106'b0 : ProdManM;
 
-	// Do the addition
-	// 		- add one to negate if the added was inverted
-	//		- the 2 extra bits at the begining and end are needed for rounding
-	assign PreSum = AlignedAddend2 + {55'b0, ProdMan2, 2'b0} + {162'b0, InvZ};
-	 
-	// Is the sum negitive
-	assign NegSum = PreSum[162];
-	// If the sum is negitive, negate the sum.
-	assign Sum = NegSum ? -PreSum[161:0] : PreSum[161:0];
+    // Do the addition
+    //      - add one to negate if the added was inverted
+    //      - the 2 extra bits at the begining and end are needed for rounding
+    assign PreSum = AlignedAddend2 + {55'b0, ProdMan2, 2'b0} + {162'b0, InvZ};
+     
+    // Is the sum negitive
+    assign NegSum = PreSum[162];
+    // If the sum is negitive, negate the sum.
+    assign Sum = NegSum ? -PreSum[161:0] : PreSum[161:0];
 
 
 
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// Leading one detector
-	///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    // Leading one detector
+    ///////////////////////////////////////////////////////////////////////////////
 
-	//*** replace with non-behavoral code
-	logic [8:0]	i;
-	always_comb begin
-			i = 0;
-			while (~Sum[161-i] && $unsigned(i) <= $unsigned(9'd161)) i = i+1;  // search for leading one 
-			NormCnt = i+1;    // compute shift count
-	end
+    //*** replace with non-behavoral code
+    logic [8:0] i;
+    always_comb begin
+            i = 0;
+            while (~Sum[161-i] && $unsigned(i) <= $unsigned(9'd161)) i = i+1;  // search for leading one
+            NormCnt = i+1;    // compute shift count
+    end
 
 
 
@@ -133,112 +137,127 @@ module fma2(
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// Normalization
-	///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    // Normalization
+    ///////////////////////////////////////////////////////////////////////////////
 
-	// Determine if the sum is zero
-	assign SumZero = ~(|Sum);
+    // Determine if the sum is zero
+    assign SumZero = ~(|Sum);
 
-	// determine the length of the fraction based on precision
-	assign FracLen = FmtM ? 13'd52 : 13'd23;
+    // determine the length of the fraction based on precision
+    assign FracLen = FmtM ? 13'd52 : 13'd23;
 
-	// Determine if the result is denormal
-	assign SumExpTmp = KillProdM ? {2'b0, ZExp} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
-	assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-FracLen)) & ~SumZero;
+    // Determine if the result is denormal
+    assign SumExpTmp = KillProdM ? {2'b0, ZExp} : ProdExpM + -({4'b0, NormCnt} - 13'd56);
+    assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-FracLen)) & ~SumZero;
 
-	// Determine the shift needed for denormal results
-	assign SumExpTmpMinus1 = SumExpTmp-1;
-	assign DenormShift = ResultDenorm ? SumExpTmpMinus1[8:0] : 9'b0;
+    // Determine the shift needed for denormal results
+    assign SumExpTmpMinus1 = SumExpTmp-1;
+    assign DenormShift = ResultDenorm ? SumExpTmpMinus1[8:0] : 9'b0;
 
-	// Normalize the sum
-	assign SumShifted = SumZero ? 162'b0 : Sum << NormCnt+DenormShift; 
-	assign NormSum = SumShifted[161:108];
-	// Calculate the sticky bit
-	assign NormSumSticky = FmtM ? (|SumShifted[107:0]) : (|SumShifted[136:0]);
-	assign Sticky = AddendStickyM | NormSumSticky;
+    // Normalize the sum
+    assign SumShifted = SumZero ? 162'b0 : Sum << NormCnt+DenormShift;
+    assign NormSum = SumShifted[161:107];
+    // Calculate the sticky bit
+    assign NormSumSticky = FmtM ? (|SumShifted[107:0]) : (|SumShifted[136:0]);
+    assign Sticky = AddendStickyM | NormSumSticky;
 
-	// Determine sum's exponent
-	assign SumExp = SumZero ? 13'b0 : 
-				 ResultDenorm ? 13'b0 :
-				 SumExpTmp; 
+    // Determine sum's exponent
+    assign SumExp = SumZero ? 13'b0 :
+                 ResultDenorm ? 13'b0 :
+                 SumExpTmp;
 
 
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// Rounding
-	///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    // Rounding
+    ///////////////////////////////////////////////////////////////////////////////
 
-	// round to nearest even
-	//		{Guard, Round, Sticky}
-	//		0xx - do nothing
-	//		100 - tie - Plus1 if result is odd  (LSBNormSum = 1)
-	//			- don't add 1 if a small number was supposed to be subtracted
-	//		101 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
-	//		110/111 - Plus1
+    // round to nearest even
+    //      {Guard, Round, Sticky}
+    //      0xx - do nothing
+    //      100 - tie - Plus1 if result is odd  (LSBNormSum = 1)
+    //          - don't add 1 if a small number was supposed to be subtracted
+    //      101 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
+    //      110/111 - Plus1
 
-	// 	round to zero - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
+    //  round to zero - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
 
-	// 	round to -infinity 
-	//			- Plus1 if negative unless a small number was supposed to be subtracted from a result with guard and round bits of 0
-	//			- subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
+    //  round to -infinity
+    //          - Plus1 if negative unless a small number was supposed to be subtracted from a result with guard and round bits of 0
+    //          - subtract 1 if a small number was supposed to be subtracted from a positive result with guard and round bits of 0
 
-	// 	round to infinity 
-	//			- Plus1 if positive unless a small number was supposed to be subtracted from a result with guard and round bits of 0
-	//			- subtract 1 if a small number was supposed to be subtracted from a negative result with guard and round bits of 0
+    //  round to infinity
+    //          - Plus1 if positive unless a small number was supposed to be subtracted from a result with guard and round bits of 0
+    //          - subtract 1 if a small number was supposed to be subtracted from a negative result with guard and round bits of 0
 
-	//  round to nearest max magnitude
-	//		{Guard, Round, Sticky}
-	//		0xx - do nothing
-	//		100 - tie - Plus1
-	//			- don't add 1 if a small number was supposed to be subtracted
-	//		101 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
-	//		110/111 - Plus1
+    //  round to nearest max magnitude
+    //      {Guard, Round, Sticky}
+    //      0xx - do nothing
+    //      100 - tie - Plus1
+    //          - don't add 1 if a small number was supposed to be subtracted
+    //      101 - do nothing if a small number was supposed to subtracted (the sticky bit was set by the small number)
+    //      110/111 - Plus1
 
-	// determine guard, round, and least significant bit of the result
-	assign Guard = FmtM ? NormSum[1] : NormSum[30];
-	assign Round = FmtM ? NormSum[0] : NormSum[29];
-	assign LSBNormSum = FmtM ? NormSum[2] : NormSum[31];
+    // determine guard, round, and least significant bit of the result
+    assign Guard = FmtM ? NormSum[2] : NormSum[31];
+    assign Round = FmtM ? NormSum[1] : NormSum[30];
+    assign LSBNormSum = FmtM ? NormSum[3] : NormSum[32];
 
-	// Deterimine if a small number was supposed to be subtrated
-	assign SubBySmallNum = AddendStickyM&InvZ&~(NormSumSticky)&~ZZeroM;
+    // used to determine underflow flag
+    assign UfGuard = FmtM ? NormSum[1] : NormSum[30];
+    assign UfRound = FmtM ? NormSum[0] : NormSum[29];
+    assign UfLSBNormSum = FmtM ? NormSum[2] : NormSum[31];
 
-	always_comb begin
-		// Determine if you add 1
-		case (FrmM)
-			3'b000: CalcPlus1 = Guard & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky&LSBNormSum&~SubBySmallNum));//round to nearest even
-			3'b001: CalcPlus1 = 0;//round to zero
-			3'b010: CalcPlus1 = ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round down
-			3'b011: CalcPlus1 = ~ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round up
-			3'b100: CalcPlus1 = (Guard & (Round | (Sticky&~(~Round&SubBySmallNum)) | (~Round&~Sticky&~SubBySmallNum)));//round to nearest max magnitude
-			default: CalcPlus1 = 1'bx;
-		endcase
-		// Determine if you subtract 1
-		case (FrmM)
-			3'b000: CalcMinus1 = 0;//round to nearest even
-			3'b001: CalcMinus1 = SubBySmallNum & ~Guard & ~Round;//round to zero
-			3'b010: CalcMinus1 = ~ResultSgn & ~Guard & ~Round & SubBySmallNum;//round down
-			3'b011: CalcMinus1 = ResultSgn & ~Guard & ~Round & SubBySmallNum;//round up
-			3'b100: CalcMinus1 = 0;//round to nearest max magnitude
-			default: CalcMinus1 = 1'bx;
-		endcase
-	
-	end
+    // Deterimine if a small number was supposed to be subtrated
+    assign SubBySmallNum = AddendStickyM&InvZ&~(NormSumSticky)&~ZZeroM;
 
-	// If an answer is exact don't round
-    assign Plus1 = CalcPlus1 & (Sticky | Guard | Round);
-    assign Minus1 = CalcMinus1 & (Sticky | Guard | Round);
+    always_comb begin
+        // Determine if you add 1
+        case (FrmM)
+            3'b000: CalcPlus1 = Guard & (Round | ((Sticky|UfGuard)&~(~Round&SubBySmallNum)) | (~Round&~(Sticky|UfGuard)&LSBNormSum&~SubBySmallNum));//round to nearest even
+            3'b001: CalcPlus1 = 0;//round to zero
+            3'b010: CalcPlus1 = ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round down
+            3'b011: CalcPlus1 = ~ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round up
+            3'b100: CalcPlus1 = (Guard & (Round | ((Sticky|UfGuard)&~(~Round&SubBySmallNum)) | (~Round&~(Sticky|UfGuard)&~SubBySmallNum)));//round to nearest max magnitude
+            default: CalcPlus1 = 1'bx;
+        endcase
+        // Determine if you add 1 (for underflow flag)
+        case (FrmM)
+            3'b000: UfCalcPlus1 = UfGuard & (UfRound | (Sticky&~(~UfRound&SubBySmallNum)) | (~UfRound&~Sticky&UfLSBNormSum&~SubBySmallNum));//round to nearest even
+            3'b001: UfCalcPlus1 = 0;//round to zero
+            3'b010: UfCalcPlus1 = ResultSgn & ~(SubBySmallNum & ~UfGuard & ~UfRound);//round down
+            3'b011: UfCalcPlus1 = ~ResultSgn & ~(SubBySmallNum & ~UfGuard & ~UfRound);//round up
+            3'b100: UfCalcPlus1 = (UfGuard & (UfRound | (Sticky&~(~UfRound&SubBySmallNum)) | (~UfRound&~Sticky&~SubBySmallNum)));//round to nearest max magnitude
+            default: UfCalcPlus1 = 1'bx;
+        endcase
+        // Determine if you subtract 1
+        case (FrmM)
+            3'b000: CalcMinus1 = 0;//round to nearest even
+            3'b001: CalcMinus1 = SubBySmallNum & ~Guard & ~Round;//round to zero
+            3'b010: CalcMinus1 = ~ResultSgn & ~Guard & ~Round & SubBySmallNum;//round down
+            3'b011: CalcMinus1 = ResultSgn & ~Guard & ~Round & SubBySmallNum;//round up
+            3'b100: CalcMinus1 = 0;//round to nearest max magnitude
+            default: CalcMinus1 = 1'bx;
+        endcase
+   
+    end
 
-	// Compute rounded result 
-	logic [64:0] RoundAdd;
-	logic [51:0] NormSumTruncated;
-	assign RoundAdd = FmtM ? Minus1 ? {65{1'b1}} : {64'b0, Plus1} : 
-							 Minus1 ? {{36{1'b1}}, 29'b0} :	{35'b0, Plus1, 29'b0};
-	assign NormSumTruncated = FmtM ? NormSum[53:2] : {NormSum[53:31], 29'b0};
+    // If an answer is exact don't round
+    assign Plus1 = CalcPlus1 & (Sticky | UfGuard | Guard | Round);
+    assign UfPlus1 = UfCalcPlus1 & (Sticky | UfGuard | UfRound);
+    assign Minus1 = CalcMinus1 & (Sticky | UfGuard | Guard | Round);
 
-	assign {FullResultExp, ResultFrac} = {SumExp, NormSumTruncated} + RoundAdd;
+    // Compute rounded result
+    logic [64:0] RoundAdd;
+    logic [51:0] NormSumTruncated;
+    assign RoundAdd = FmtM ? Minus1 ? {65{1'b1}} : {64'b0, Plus1} :
+                             Minus1 ? {{36{1'b1}}, 29'b0} : {35'b0, Plus1, 29'b0};
+    assign NormSumTruncated = FmtM ? NormSum[54:3] : {NormSum[54:32], 29'b0};
+
+    assign {FullResultExp, ResultFrac} = {SumExp, NormSumTruncated} + RoundAdd;
     assign ResultExp = FullResultExp[10:0];
 
 
@@ -247,58 +266,57 @@ module fma2(
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// Sign calculation
-	///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    // Sign calculation
+    ///////////////////////////////////////////////////////////////////////////////
 
-	// Determine the sign if the sum is zero
-	//		if cancelation then 0 unless round to -infinity
-	//		otherwise psign
-	assign ZeroSgn = (PSgn^ZSgn)&~Underflow ? FrmM == 3'b010 : PSgn;
+    // Determine the sign if the sum is zero
+    //      if cancelation then 0 unless round to -infinity
+    //      otherwise psign
+    assign ZeroSgn = (PSgn^ZSgn)&~Underflow ? FrmM == 3'b010 : PSgn;
 
-	// is the result negitive
-	// 	if p - z is the Sum negitive
-	// 	if -p + z is the Sum positive
-	// 	if -p - z then the Sum is negitive
-	assign ResultSgnTmp = InvZ&(ZSgn)&NegSum | InvZ&PSgn&~NegSum | ((ZSgn)&PSgn);
-	assign ResultSgn = SumZero ? ZeroSgn : ResultSgnTmp;
+    // is the result negitive
+    //  if p - z is the Sum negitive
+    //  if -p + z is the Sum positive
+    //  if -p - z then the Sum is negitive
+    assign ResultSgnTmp = InvZ&(ZSgn)&NegSum | InvZ&PSgn&~NegSum | ((ZSgn)&PSgn);
+    assign ResultSgn = SumZero ? ZeroSgn : ResultSgnTmp;
  
 
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// Flags
-	///////////////////////////////////////////////////////////////////////////////
+    ///////////////////////////////////////////////////////////////////////////////
+    // Flags
+    ///////////////////////////////////////////////////////////////////////////////
 
 
 
-	// Set Invalid flag for following cases:
-	//   1) Inf - Inf (unless x or y is NaN)
-	//   2) 0 * Inf
-	//   3) any input is a signaling NaN
-	assign MaxExp = FmtM ? 13'd2047 : 13'd255;
-	assign SigNaN = FmtM ? (XNaNM&~X[51]) | (YNaNM&~Y[51]) | (ZNaNM&~Addend[51]) : 
-						   (XNaNM&~X[54]) | (YNaNM&~Y[54]) | (ZNaNM&~Addend[54]);
-	assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (PSgn ^ ZSgn) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
-	
-	// Set Overflow flag if the number is too big to be represented
-	//		- Don't set the overflow flag if an overflowed result isn't outputed
-	assign Overflow = FullResultExp >= MaxExp & ~FullResultExp[12]&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+    // Set Invalid flag for following cases:
+    //   1) any input is a signaling NaN
+    //   2) Inf - Inf (unless x or y is NaN)
+    //   3) 0 * Inf
+    assign MaxExp = FmtM ? 13'd2047 : 13'd255;
+    assign SigNaN = FmtM ? (XNaNM&~X[51]) | (YNaNM&~Y[51]) | (ZNaNM&~Addend[51]) :
+                           (XNaNM&~X[54]) | (YNaNM&~Y[54]) | (ZNaNM&~Addend[54]);
+    assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (PSgn ^ ZSgn) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
+   
+    // Set Overflow flag if the number is too big to be represented
+    //      - Don't set the overflow flag if an overflowed result isn't outputed
+    assign Overflow = FullResultExp >= MaxExp & ~FullResultExp[12]&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
 
-	// Set Underflow flag if the number is too small to be represented in normal numbers
-	//		- Don't set the underflow flag if the result is exact 
-	assign Underflow = (SumExp[12] | ((SumExp == 0) & (Round|Guard|Sticky)))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
-	//assign UnderflowFlag = (Underflow | (FullResultExp == 0)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM)&(Round|Guard|Sticky))  & ~(FullResultExp == 1);
-	assign UnderflowFlag = (Underflow | (FullResultExp == 0)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM)&(Round|Guard|Sticky))  & ~(FullResultExp == 1);
-	// Set Inexact flag if the result is diffrent from what would be outputed given infinite precision
-	//		- Don't set the underflow flag if an underflowed result isn't outputed
-	assign Inexact = (Sticky|Overflow|Guard|Round|Underflow)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+    // Set Underflow flag if the number is too small to be represented in normal numbers
+    //      - Don't set the underflow flag if the result is exact
+    assign Underflow = (SumExp[12] | ((SumExp == 0) & (Round|Guard|Sticky|UfGuard)))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+    assign UnderflowFlag = (FullResultExp[12] | ((FullResultExp == 0) | ((FullResultExp == 1) & (SumExp == 0) & ~(UfPlus1&UfLSBNormSum)))&(Round|Guard|Sticky))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+    // Set Inexact flag if the result is diffrent from what would be outputed given infinite precision
+    //      - Don't set the underflow flag if an underflowed result isn't outputed
+    assign Inexact = (Sticky|UfGuard|Overflow|Guard|Round|Underflow)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
 
-	// Combine flags 
-	//		- FMA can't set the Divide by zero flag
-	//		- Don't set the underflow flag if the result was rounded up to a normal number
-	assign FmaFlagsM = {Invalid, 1'b0, Overflow, UnderflowFlag, Inexact};
+    // Combine flags
+    //      - FMA can't set the Divide by zero flag
+    //      - Don't set the underflow flag if the result was rounded up to a normal number
+    assign FmaFlagsM = {Invalid, 1'b0, Overflow, UnderflowFlag, Inexact};
 
 
 
@@ -306,31 +324,31 @@ module fma2(
 
 
 
-	///////////////////////////////////////////////////////////////////////////////
-	// Select the result
-	///////////////////////////////////////////////////////////////////////////////
-	assign XNaNResult = FmtM ? {XSgn, X[62:52], 1'b1,X[50:0]} : {XSgn, X[62:55], 1'b1,X[53:0]};
-	assign YNaNResult = FmtM ? {YSgn, Y[62:52], 1'b1,Y[50:0]} : {YSgn, Y[62:55], 1'b1,Y[53:0]};
-	assign ZNaNResult = FmtM ? {ZSgn, Addend[62:52], 1'b1,Addend[50:0]} : {ZSgn, Addend[62:55], 1'b1,Addend[53:0]};
-	assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 11'h7fe, {52{1'b1}}} : 
-																														  {ResultSgn, 11'h7ff, 52'b0} : 
-									((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 8'hfe, {23{1'b1}}, 32'b0} :
-																														  {ResultSgn, 8'hff, 55'b0};
-	assign InvalidResult = FmtM ? {ResultSgn, 11'h7ff, 1'b1, 51'b0} : {ResultSgn, 8'hff, 1'b1, 54'b0};
-	assign KillProdResult = FmtM ?{ResultSgn, Addend[62:0] - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {ResultSgn, Addend[62:32] - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}, 32'b0};
-	assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))} : {{ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}, 32'b0};
-	assign FmaResultM = XNaNM ? XNaNResult : 
-						YNaNM ? YNaNResult : 
-						ZNaNM ? ZNaNResult :
-						Invalid ? InvalidResult : // has to be before inf
-						XInfM ? {PSgn, X[62:0]} :
-						YInfM ? {PSgn, Y[62:0]} :
-						ZInfM ? {ZSgn, Addend[62:0]} :
-						Overflow ? OverflowResult :	
-						KillProdM ? KillProdResult : // has to be after Underflow		
-						Underflow & ~ResultDenorm ? UnderflowResult :	
-						FmtM ? {ResultSgn, ResultExp, ResultFrac} : 
-							   {ResultSgn, ResultExp[7:0], ResultFrac, 3'b0};
+    ///////////////////////////////////////////////////////////////////////////////
+    // Select the result
+    ///////////////////////////////////////////////////////////////////////////////
+    assign XNaNResult = FmtM ? {XSgn, X[62:52], 1'b1,X[50:0]} : {XSgn, X[62:55], 1'b1,X[53:0]};
+    assign YNaNResult = FmtM ? {YSgn, Y[62:52], 1'b1,Y[50:0]} : {YSgn, Y[62:55], 1'b1,Y[53:0]};
+    assign ZNaNResult = FmtM ? {ZSgn, Addend[62:52], 1'b1,Addend[50:0]} : {ZSgn, Addend[62:55], 1'b1,Addend[53:0]};
+    assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 11'h7fe, {52{1'b1}}} :
+                                                                                                                          {ResultSgn, 11'h7ff, 52'b0} :
+                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, 8'hfe, {23{1'b1}}, 32'b0} :
+                                                                                                                          {ResultSgn, 8'hff, 55'b0};
+    assign InvalidResult = FmtM ? {ResultSgn, 11'h7ff, 1'b1, 51'b0} : {ResultSgn, 8'hff, 1'b1, 54'b0};
+    assign KillProdResult = FmtM ?{ResultSgn, Addend[62:0] - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {ResultSgn, Addend[62:32] - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}, 32'b0};
+    assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))} : {{ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}, 32'b0};
+    assign FmaResultM = XNaNM ? XNaNResult :
+                        YNaNM ? YNaNResult :
+                        ZNaNM ? ZNaNResult :
+                        Invalid ? InvalidResult : // has to be before inf
+                        XInfM ? {PSgn, X[62:0]} :
+                        YInfM ? {PSgn, Y[62:0]} :
+                        ZInfM ? {ZSgn, Addend[62:0]} :
+                        Overflow ? OverflowResult :
+                        KillProdM ? KillProdResult : // has to be after Underflow      
+                        Underflow & ~ResultDenorm ? UnderflowResult :  
+                        FmtM ? {ResultSgn, ResultExp, ResultFrac} :
+                               {ResultSgn, ResultExp[7:0], ResultFrac, 3'b0};
 
 
 
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index 7f93d33a7..5c15268ed 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -34,7 +34,6 @@ module fpu (
   input logic [`XLEN-1:0]  SrcAM,      // Integer input being written into fpreg
   input logic 		         StallE, StallM, StallW,
   input logic 		         FlushE, FlushM, FlushW,
-  output logic  	         IsFPD, IsFPE,    // Read/write enable for memory {read, write}
   output logic 		      FStallD,    // Stall the decode stage if Div/Sqrt instruction
   output logic 		      FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
   output logic [`XLEN-1:0] FWriteDataE,      // Data to be written to memory
@@ -59,8 +58,8 @@ module fpu (
    logic 		   SrcZUsedD;                                            // Is input 3 used
    logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
    logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM, FOpCtrlW;                  // Select which opperation to do in each component
-   logic          SelLoadInputE, SelLoadInputM;                            // Select which adress to load when single precision
-   logic       FInput2UsedD, FInput3UsedD;                                   
+   logic [1:0]         FResSelD, FResSelE, FResSelM;  
+   logic [1:0]         FIntResSelD, FIntResSelE, FIntResSelM;                                   
    logic [4:0] 	Adr1E, Adr2E, Adr3E;
    
    // regfile signals
@@ -132,7 +131,8 @@ module fpu (
    // fsgn signals
    logic [63:0] 	SgnResultE, SgnResultM, SgnResultW;
    logic [4:0] 	SgnFlagsE, SgnFlagsM, SgnFlagsW;
-   logic [63:0]   FResM;
+   logic [63:0]   FResM, FResW;
+   logic    FFlgM, FFlgW;
    
    // instantiation of W stage regfile signals
    logic [63:0] 	AlignedSrcAM, ForwardSrcAM, SrcAW;
@@ -167,38 +167,19 @@ module fpu (
    //*****************
    // other  D/E pipe registers
    //*****************
-   // flopenrc #(64) DEReg14(clk, reset, FlushE, ~StallE, FPUResult64W, FPUResult64E);
-   // flopenrc #(1) CtrlRegE1(clk, reset, FlushE, ~StallE, FWriteEnD, FWriteEnE);
-   // flopenrc #(3) CtrlRegE2(clk, reset, FlushE, ~StallE, FResultSelD, FResultSelE);
-   // flopenrc #(3) CtrlRegE3(clk, reset, FlushE, ~StallE, FrmD, FrmE);
-   // flopenrc #(1) CtrlRegE4(clk, reset, FlushE, ~StallE, FmtD, FmtE);
-   // flopenrc #(5) CtrlRegE5(clk, reset, FlushE, ~StallE, InstrD[11:7], RdE);
-   // flopenrc #(4) CtrlRegE6(clk, reset, FlushE, ~StallE, FOpCtrlD, FOpCtrlE);
    flopenrc #(1) CtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
    flopenrc #(15) CtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-                                                      {Adr1E,         Adr2E,         Adr3E});
-   // flopenrc #(1) CtrlRegE8(clk, reset, FlushE, ~StallE, FWriteIntD, FWriteIntE);
-   // flopenrc #(1) CtrlRegE9(clk, reset, FlushE, ~StallE, FOutputInput2D, FOutputInput2E);
-   // flopenrc #(2) CtrlRegE10(clk, reset, FlushE, ~StallE, FMemRWD, FMemRWE);
-   // flopenrc #(1) CtrlRegE11(clk, reset, FlushE, ~StallE, InstrD[15], SelLoadInputE);
-   flopenrc #(20) CtrlRegE(clk, reset, FlushE, ~StallE, 
-                        {FWriteEnD, FResultSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD, InstrD[15],    IsFPD},
-                        {FWriteEnE, FResultSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE, SelLoadInputE, IsFPE});
+                                                         {Adr1E,         Adr2E,         Adr3E});
+   flopenrc #(22) DECtrlReg(clk, reset, FlushE, ~StallE, 
+                        {FWriteEnD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD},
+                        {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE});
 
    //EXECUTION STAGE
    
-   // input muxs for forwarding   
-   // single vs double for SRCAM
-   // mux2  #(64)  SrcAMuxForward({SrcAM[31:0], 32'b0}, {SrcAM, {64-`XLEN{1'b0}}}, FmtM, ForwardSrcAM);
-   // //input 1 forwarding mux
-   // mux4  #(64)  SrcXEmux(FRD1E, FPUResult64W, FPUResult64E, ForwardSrcAM, ForwardXE, SrcXtmpE);
-   // mux3  #(64)  SrcYEmux(FRD2E, FPUResult64W, FPUResult64E, ForwardYE, SrcYE);
-   // mux2  #(64)  SrcZEmux(FRD3E, FPUResult64E, ForwardZE, SrcZE);
-   // mux2  #(64)  FOutputInput2mux(SrcXtmpE, SrcYE, FOutputInput2E, SrcXE);
-   
    // Hazard unit for FPU
    fpuhazard hazard(.*);
 
+   // forwarding muxs
    mux3  #(64)  fxemux(FRD1E, FPUResult64W, FResM, ForwardXE, SrcXE);
    mux3  #(64)  fyemux(FRD2E, FPUResult64W, FResM, ForwardYE, SrcYE);
    mux3  #(64)  fzemux(FRD3E, FPUResult64W, FResM, ForwardZE, SrcZE);
@@ -225,6 +206,8 @@ module fpu (
 
    fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .*);
    
+
+
    // first of two-stage instance of floating-point add/cvt unit
    fpuaddcvt1 fpadd1 (.*);
    
@@ -236,6 +219,8 @@ module fpu (
    
    // first and only instance of floating-point classify unit
    fpuclassify fpuclass (.*);
+
+   // output for store instructions
    assign FWriteDataE = FmtE ? SrcYE[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcYE[63:32]};
    
    //*****************
@@ -295,17 +280,9 @@ module fpu (
    //*****************
    // fpcmp E/M pipe registers
    //*****************
-   // flopenrc #(8) EMRegCmp1(clk, reset, FlushM, ~StallM, WE, WM); 
-   // flopenrc #(8) EMRegCmp2(clk, reset, FlushM, ~StallM, XE, XM); 
-   // flopenrc #(1) EMRegcmp3(clk, reset, FlushM, ~StallM, ANaNE, ANaNM); 
-   // flopenrc #(1) EMRegCmp4(clk, reset, FlushM, ~StallM, BNaNE, BNaNM); 
-   // flopenrc #(1) EMRegCmp5(clk, reset, FlushM, ~StallM, AzeroE, AzeroM); 
-   // flopenrc #(1) EMRegCmp6(clk, reset, FlushM, ~StallM, BzeroE, BzeroM); 
    flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpInvalidE, CmpInvalidM); 
-   // flopenrc #(2)  EMRegCmp2(clk, reset, FlushM, ~StallM, CmpFCCE, CmpFCCM); 
    flopenrc #(64) EMRegCmp3(clk, reset, FlushM, ~StallM, FCmpResultE, FCmpResultM); 
    
-   // put this in for the event we want to delay fsgn - will otherwise bypass
    //*****************
    // fpsgn E/M pipe registers
    //***************** 
@@ -315,15 +292,9 @@ module fpu (
    //*****************
    // other E/M pipe registers
    //*****************
-   flopenrc #(1) EMReg1(clk, reset, FlushM, ~StallM, FWriteEnE, FWriteEnM);
-   flopenrc #(3) EMReg2(clk, reset, FlushM, ~StallM, FResultSelE, FResultSelM);
-   flopenrc #(3) EMReg3(clk, reset, FlushM, ~StallM, FrmE, FrmM);
-   flopenrc #(1) EMReg4(clk, reset, FlushM, ~StallM, FmtE, FmtM);
-   flopenrc #(5) EMReg5(clk, reset, FlushM, ~StallM, RdE, RdM);
-   flopenrc #(4) EMReg6(clk, reset, FlushM, ~StallM, FOpCtrlE, FOpCtrlM);
-   flopenrc #(1) EMReg7(clk, reset, FlushM, ~StallM, FWriteIntE, FWriteIntM);
-   // flopenrc #(2) EMReg8(clk, reset, FlushM, ~StallM, FMemRWE, FMemRWM);
-   flopenrc #(1) EMReg9(clk, reset, FlushM, ~StallM, SelLoadInputE, SelLoadInputM);
+   flopenrc #(22) EMCtrlReg(clk, reset, FlushM, ~StallM,
+                        {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE, FOpCtrlE, FWriteIntE},
+                        {FWriteEnM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});
    
    //*****************
    // fpuclassify E/M pipe registers
@@ -332,24 +303,18 @@ module fpu (
    
    //BEGIN MEMORY STAGE
    
-   mux2  #(64)  FResMux(AlignedSrcAM, SgnResultM, FResultSelM == 3'b011, FResM);
-   assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
-   mux3  #(`XLEN)  IntResMux(SrcXMAligned, FCmpResultM[`XLEN-1:0], ClassResultM[`XLEN-1:0], {FResultSelM == 3'b101, FResultSelM == 3'b001}, FIntResM);
+   mux3  #(64)  FResMux(AlignedSrcAM, SgnResultM, FCmpResultM, FResSelM, FResM);
+   assign FFlgM = CmpInvalidM & FResSelM[1];
 
-   //adjecent adress values are sent to the FPU, select the correct one
-   //    -imm is 80000 most of the time vs the error one which is 00000
-   // mux3  #(64)  FLoadResultMux({HRDATA[31:0], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA[`AHBW-1:`AHBW-32], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA, {64-`AHBW{1'b0}}}, {FmtM, SelLoadInputM}, FLoadResultM);
-   // mux2  #(64)  FLoadStoreResultMux(FLoadResultM, SrcXM, |FOpCtrlM[2:1], FLoadStoreResultM);
-   
+   assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
+   mux3  #(`XLEN)  IntResMux(FCmpResultM[`XLEN-1:0], SrcXMAligned, ClassResultM[`XLEN-1:0], FIntResSelM, FIntResM);
+
+   // second instance of two-stage FMA unit
    fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .*);
    
    // second instance of two-stage floating-point add/cvt unit
    fpuaddcvt2 fpadd2 (.*);
    
-   // second instance of two-stage floating-point comparator
-   // fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), 
-	// 	   .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(SrcXM), .op2(SrcYM), .*);
-
    // Align SrcA to MSB when single precicion
    mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
       
@@ -397,19 +362,16 @@ module fpu (
    //*****************
    // other M/W pipe registers
    //*****************
-   flopenrc #(1) MWReg1(clk, reset, FlushW, ~StallW, FWriteEnM, FWriteEnW);
-   flopenrc #(3) MWReg2(clk, reset, FlushW, ~StallW, FResultSelM, FResultSelW);
-   flopenrc #(1) MWReg3(clk, reset, FlushW, ~StallW, FmtM, FmtW);
-   flopenrc #(5) MWReg4(clk, reset, FlushW, ~StallW, RdM, RdW);
-   flopenrc #(64) MWReg5(clk, reset, FlushW, ~StallW, AlignedSrcAM, SrcAW);
-   // flopenrc #(64) MWReg6(clk, reset, FlushW, ~StallW, FLoadStoreResultM, FLoadStoreResultW);
-   flopenrc #(1) MWReg7(clk, reset, FlushW, ~StallW, FWriteIntM, FWriteIntW);
-   flopenrc #(4) MWReg6(clk, reset, FlushW, ~StallW, FOpCtrlM, FOpCtrlW);
+   flopenrc #(11) MWCtrlReg(clk, reset, FlushW, ~StallW,
+                        {FWriteEnM, FResultSelM, RdM, FmtM, FWriteIntM},
+                        {FWriteEnW, FResultSelW, RdW, FmtW, FWriteIntW});
    
    //*****************
    // fpuclassify M/W pipe registers
    //***************** 
    flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, ClassResultM, ClassResultW);
+   flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
+   flopenrc #(1) MWRegClass1(clk, reset, FlushW, ~StallW, FFlgM, FFlgW);
    
 
 
@@ -418,14 +380,6 @@ module fpu (
   //#########################################
   // BEGIN WRITEBACK STAGE
   //#########################################
-   
-
-   // mux3  #(64)  FLoadResultMux({ReadD[31:0], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA[`AHBW-1:`AHBW-32], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA, {64-`AHBW{1'b0}}}, {FmtM, SelLoadInputM}, FLoadResultM);
-   // mux2  #(64)  FLoadStoreResultMux(FLoadResultM, SrcXM, |FOpCtrlM[2:1], FLoadStoreResultM);
-   //***RV32D needs to give two bus transactions
-    mux2  #(64)  FLoadResultMux({ReadDataW[31:0], {32{1'b0}}}, {ReadDataW, {64-`XLEN{1'b0}}}, FmtW, FLoadResultW);
-    mux2  #(64)  FLoadStoreResultMux(FLoadResultW, SrcYW, |FOpCtrlW[2:1], FLoadStoreResultW);
-
 
 
 
@@ -434,47 +388,26 @@ module fpu (
 
    always_comb begin
       case (FResultSelW)
-	// div/sqrt
-	3'b000 : FPUFlagsW = FDivFlagsW;
-	// cmp		
-	3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
-	//fma/mult
-	3'b010 : FPUFlagsW = FmaFlagsW;
-	// sgn inj
-	3'b011 : FPUFlagsW = SgnFlagsW;
-	// add/sub/cnvt
-	3'b100 : FPUFlagsW = FAddFlagsW;
-	// classify
-	3'b101 : FPUFlagsW = 5'b0;
-	// output SrcAW
-	3'b110 : FPUFlagsW = 5'b0;
-	// output FRD1
-	3'b111 : FPUFlagsW = 5'b0;
+	3'b000 : FPUFlagsW = 5'b0;
+	3'b001 : FPUFlagsW = FmaFlagsW;
+	3'b010 : FPUFlagsW = FAddFlagsW;
+	3'b011 : FPUFlagsW = FDivFlagsW;
+	3'b100 : FPUFlagsW = {4'b0,FFlgW};
 	default : FPUFlagsW = 5'bxxxxx;
       endcase
    end
-   
+
    always_comb begin
       case (FResultSelW)
-	// div/sqrt
-	3'b000 : FPUResult64W = FDivResultW;
-	// cmp		
-	3'b001 : FPUResult64W = FCmpResultW;
-	//fma/mult
-	3'b010 : FPUResult64W = FmaResultW;
-	// sgn inj
-	3'b011 : FPUResult64W = SgnResultW;
-	// add/sub/cnvt
-	3'b100 : FPUResult64W = FAddResultW;
-	// classify
-	3'b101 : FPUResult64W = ClassResultW;
-	// output SrcAW
-	3'b110 : FPUResult64W = SrcAW;
-	// Load/Store/Move to FP-register
-	3'b111 : FPUResult64W = FLoadStoreResultW;
-	default : FPUResult64W = {64{1'bx}};
+	3'b000 : FPUResult64W = FmtW ? {ReadDataW, {64-`XLEN{1'b0}}} : {ReadDataW[31:0], 32'b0};
+	3'b001 : FPUResult64W = FmaResultW;
+	3'b010 : FPUResult64W = FAddResultW;
+	3'b011 : FPUResult64W = FDivResultW;
+	3'b100 : FPUResult64W = FResW;
+	default : FPUResult64W = 64'bxxxxx;
       endcase
-   end // always_comb
+   end
+   
    
    // interface between XLEN size datapath and double-precision sized
    // floating-point results
diff --git a/wally-pipelined/src/fpu/fpuhazard.sv b/wally-pipelined/src/fpu/fpuhazard.sv
index 03667d84f..4d0895a77 100644
--- a/wally-pipelined/src/fpu/fpuhazard.sv
+++ b/wally-pipelined/src/fpu/fpuhazard.sv
@@ -44,21 +44,21 @@ module fpuhazard(
 
       if ((Adr1E == RdM) & FWriteEnM)
       // if the result will be FResM
-        if(FResultSelM == 3'b110 | FResultSelM == 3'b011) ForwardXE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) ForwardXE = 2'b10; // choose FResM
         else FStallD = 1;   // if the result won't be ready stall
       else if ((Adr1E == RdW) & FWriteEnW) ForwardXE = 2'b01; // choose FPUResult64W
     
 
       if ((Adr2E == RdM) & FWriteEnM)
       // if the result will be FResM
-        if(FResultSelM == 3'b110 | FResultSelM == 3'b011) ForwardYE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) ForwardYE = 2'b10; // choose FResM
         else FStallD = 1;   // if the result won't be ready stall
       else if ((Adr2E == RdW) & FWriteEnW) ForwardYE = 2'b01; // choose FPUResult64W
 
  
       if ((Adr3E == RdM) & FWriteEnM)
       // if the result will be FResM
-        if(FResultSelM == 3'b110 | FResultSelM == 3'b011) ForwardZE = 2'b10; // choose FResM
+        if(FResultSelM == 3'b100) ForwardZE = 2'b10; // choose FResM
         else FStallD = 1;   // if the result won't be ready stall
       else if ((Adr3E == RdW) & FWriteEnW) ForwardZE = 2'b01; // choose FPUResult64W
 
diff --git a/wally-pipelined/src/ieu/datapath.sv b/wally-pipelined/src/ieu/datapath.sv
index c3303f9ac..44a40045a 100644
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@@ -37,7 +37,7 @@ module datapath (
   input  logic             ALUSrcAE, ALUSrcBE,
   input  logic             TargetSrcE, 
   input  logic             JumpE,
-  input  logic             IsFPE,
+  input  logic             IllegalFPUInstrE,
   input  logic [1:0]       MemRWE,
   input  logic [`XLEN-1:0] FWriteDataE,
   input  logic [`XLEN-1:0] PCE,
@@ -105,9 +105,9 @@ module datapath (
   flopenrc #(5)    Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
   flopenrc #(5)    RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);
 	
-  mux3  #(`XLEN)  faemux(RD1E, WriteDataW, ALUResultM, ForwardAE, PreSrcAE);
-  mux3  #(`XLEN)  fbemux(RD2E, WriteDataW, ALUResultM, ForwardBE, PreSrcBE);
-  mux2  #(`XLEN)  writedatamux(PreSrcBE, FWriteDataE, IsFPE, WriteDataE);
+  mux3  #(`XLEN)  faemux(RD1E, WriteDataW, ResultM, ForwardAE, PreSrcAE);
+  mux3  #(`XLEN)  fbemux(RD2E, WriteDataW, ResultM, ForwardBE, PreSrcBE);
+  mux2  #(`XLEN)  writedatamux(PreSrcBE, FWriteDataE, ~IllegalFPUInstrE, WriteDataE);
   mux2  #(`XLEN)  srcamux(PreSrcAE, PCE, ALUSrcAE, SrcAE);
   mux2  #(`XLEN)  srcamux2(SrcAE, PCLinkE, JumpE, SrcAE2);  
   mux2  #(`XLEN)  srcbmux(PreSrcBE, ExtImmE, ALUSrcBE, SrcBE);
diff --git a/wally-pipelined/src/ieu/ieu.sv b/wally-pipelined/src/ieu/ieu.sv
index 2515f3230..50bf79e80 100644
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@@ -36,8 +36,7 @@ module ieu (
   input logic [`XLEN-1:0]  PCE, 
   input logic [`XLEN-1:0]  PCLinkE,
   input logic 		         FWriteIntE, 
-  input logic              IsFPE,
-  //input  logic [1:0]       FMemRWE,
+  input logic              IllegalFPUInstrE,
   input  logic [`XLEN-1:0] FWriteDataE,
   output logic [`XLEN-1:0] PCTargetE,
   output logic 		   MulDivE, W64E,
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index a77c3ab01..fe1f057ce 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -95,18 +95,17 @@ module wallypipelinedhart (
 
   // floating point unit signals
   logic [2:0]        FRM_REGW;
-  logic [1:0]        FMemRWM, FMemRWE;
-  logic              FStallD;
-  logic              FWriteIntE, FWriteIntM, FWriteIntW;
-  logic [`XLEN-1:0]  FWriteDataE;
-  logic [`XLEN-1:0]  FIntResM;  
-  logic              FDivBusyE;
-  logic              IsFPD, IsFPE;
-  logic              IllegalFPUInstrD, IllegalFPUInstrE;
-  logic              FloatRegWriteW;
-  logic              FPUStallD;
-  logic [4:0]        SetFflagsM;
-  logic [`XLEN-1:0]  FPUResultW;
+  logic [1:0] 	   FMemRWM, FMemRWE;
+  logic 		      FStallD;
+  logic 		      FWriteIntE, FWriteIntM, FWriteIntW;
+  logic [`XLEN-1:0] FWriteDataE;
+  logic [`XLEN-1:0] FIntResM;  
+  logic 		      FDivBusyE;
+  logic 		      IllegalFPUInstrD, IllegalFPUInstrE;
+  logic           FloatRegWriteW;
+  logic           FPUStallD;
+  logic [4:0] 	   SetFflagsM;
+  logic [`XLEN-1:0] FPUResultW;
 
   // memory management unit signals
   logic             ITLBWriteF, DTLBWriteM;

From 157b1b31bf7341eec198ccacab826e77742081ef Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Wed, 30 Jun 2021 19:24:59 -0500
Subject: [PATCH 18/20] Icache ITLB interlock fix.

---
 wally-pipelined/src/cache/ICacheCntrl.sv | 26 ++++++++++++++++++++----
 wally-pipelined/src/cache/icache.sv      | 21 +++++++++++--------
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/wally-pipelined/src/cache/ICacheCntrl.sv b/wally-pipelined/src/cache/ICacheCntrl.sv
index f290f0ad2..bc5c30b3b 100644
--- a/wally-pipelined/src/cache/ICacheCntrl.sv
+++ b/wally-pipelined/src/cache/ICacheCntrl.sv
@@ -40,8 +40,8 @@ module ICacheCntrl #(parameter BLOCKLEN = 256) (
     input logic [31:0] 		ICacheMemReadData,
     input logic 		ICacheMemReadValid,
     // The address at which we want to search the cache memory
-    output logic [`PA_BITS-1:0] 	PCTagF,
-    output logic [`PA_BITS-1:0]    PCNextIndexF,						     
+    output logic [`PA_BITS-1:0] PCTagF,
+    output logic [`PA_BITS-1:0] PCNextIndexF, 
     output logic 		ICacheReadEn,
     // Load data into the cache
     output logic 		ICacheMemWriteEnable,
@@ -56,13 +56,15 @@ module ICacheCntrl #(parameter BLOCKLEN = 256) (
 
     // Outputs to pipeline control stuff
     output logic 		ICacheStallF, EndFetchState,
+    input logic  ITLBMissF,
+    input logic  ITLBWriteF,
 
     // Signals to/from ahblite interface
     // A read containing the requested data
     input logic [`XLEN-1:0] 	InstrInF,
     input logic 		InstrAckF,
     // The read we request from main memory
-    output logic [`PA_BITS-1:0]	InstrPAdrF,
+    output logic [`PA_BITS-1:0] InstrPAdrF,
     output logic 		InstrReadF
 );
 
@@ -109,6 +111,10 @@ module ICacheCntrl #(parameter BLOCKLEN = 256) (
   
 
   localparam STATE_INVALIDATE = 18; // *** not sure if invalidate or evict? invalidate by cache block or address?
+  localparam STATE_TLB_MISS = 19;
+  localparam STATE_TLB_MISS_DONE = 20;
+  
+  
   
   localparam AHBByteLength = `XLEN / 8;
   localparam AHBOFFETWIDTH = $clog2(AHBByteLength);
@@ -209,7 +215,9 @@ module ICacheCntrl #(parameter BLOCKLEN = 256) (
       STATE_READY: begin
 	PCMux = 2'b00;
 	ICacheReadEn = 1'b1;
-	if (hit & ~spill) begin
+	if (ITLBMissF) begin
+	  NextState = STATE_TLB_MISS;
+	end else if (hit & ~spill) begin
 	  SavePC = 1'b1;
 	  ICacheStallF = 1'b0;
 	  NextState = STATE_READY;
@@ -363,6 +371,16 @@ module ICacheCntrl #(parameter BLOCKLEN = 256) (
 	ICacheStallF = 1'b0;	
 	NextState = STATE_READY;
       end
+      STATE_TLB_MISS: begin
+	if (ITLBWriteF) begin
+	  NextState = STATE_TLB_MISS_DONE;
+	end else begin
+	  NextState = STATE_TLB_MISS;
+	end
+      end
+      STATE_TLB_MISS_DONE : begin
+	NextState = STATE_READY;
+      end
       default: begin
 	PCMux = 2'b01;
 	NextState = STATE_READY;
diff --git a/wally-pipelined/src/cache/icache.sv b/wally-pipelined/src/cache/icache.sv
index abf828fc5..89b2ff9e7 100644
--- a/wally-pipelined/src/cache/icache.sv
+++ b/wally-pipelined/src/cache/icache.sv
@@ -28,24 +28,27 @@
 module icache
   (
    // Basic pipeline stuff
-   input logic 		    clk, reset,
-   input logic 		    StallF, StallD,
-   input logic 		    FlushD,
+   input logic 		       clk, reset,
+   input logic 		       StallF, StallD,
+   input logic 		       FlushD,
    input logic [`PA_BITS-1:0]  PCNextF,
    input logic [`PA_BITS-1:0]  PCPF, 
    // Data read in from the ebu unit
-   input logic [`XLEN-1:0]  InstrInF,
-   input logic 		    InstrAckF,
+   input logic [`XLEN-1:0]     InstrInF,
+   input logic 		       InstrAckF,
    // Read requested from the ebu unit
    output logic [`PA_BITS-1:0] InstrPAdrF,
-   output logic 	    InstrReadF,
+   output logic 	       InstrReadF,
    // High if the instruction currently in the fetch stage is compressed
-   output logic 	    CompressedF,
+   output logic 	       CompressedF,
    // High if the icache is requesting a stall
-   output logic 	    ICacheStallF,
+   output logic 	       ICacheStallF,
+   input logic 		       ITLBMissF,
+   input logic 		       ITLBWriteF,
+   
    // The raw (not decompressed) instruction that was requested
    // If this instruction is compressed, upper 16 bits may be the next 16 bits or may be zeros
-   output logic [31:0] 	    FinalInstrRawF
+   output logic [31:0] 	       FinalInstrRawF
    );
 
   // Configuration parameters

From ec21126474be1d38574ad22f72c9fe0dc811605f Mon Sep 17 00:00:00 2001
From: Teo Ene <tdene@rivendell.ecen.okstate.edu>
Date: Thu, 1 Jul 2021 13:32:42 -0500
Subject: [PATCH 19/20] Flow updated for 90nm

---
 .gitmodules                         |   3 -
 sky130/sky130_osu_sc_t12            |   1 -
 wally-pipelined/src/generic/lzd.sv~ | 195 ----------------------------
 3 files changed, 199 deletions(-)
 delete mode 160000 sky130/sky130_osu_sc_t12
 delete mode 100755 wally-pipelined/src/generic/lzd.sv~

diff --git a/.gitmodules b/.gitmodules
index 65e1e71c9..e69de29bb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "sky130/sky130_osu_sc_t12"]
-	path = sky130/sky130_osu_sc_t12
-	url = https://foss-eda-tools.googlesource.com/skywater-pdk/libs/sky130_osu_sc_t12/
diff --git a/sky130/sky130_osu_sc_t12 b/sky130/sky130_osu_sc_t12
deleted file mode 160000
index f60f2d039..000000000
--- a/sky130/sky130_osu_sc_t12
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit f60f2d0395053c4df362a97d7e2099721b6face6
diff --git a/wally-pipelined/src/generic/lzd.sv~ b/wally-pipelined/src/generic/lzd.sv~
deleted file mode 100755
index bfffe5e5b..000000000
--- a/wally-pipelined/src/generic/lzd.sv~
+++ /dev/null
@@ -1,195 +0,0 @@
-///////////////////////////////////////////
-// lzd.sv
-//
-// Written: James.Stine@okstate.edu 1 February 2021
-// Modified: 
-//
-// Purpose: Integer Divide instructions
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-`include "wally-config.vh"
-/* verilator lint_off DECLFILENAME */
-
-// Original idea came from  V. G. Oklobdzija, "An algorithmic and novel
-// design of a leading zero detector circuit: comparison with logic
-// synthesis," in IEEE Transactions on Very Large Scale Integration
-// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi:
-// 10.1109/92.273153.
-
-// Modified to be more hierarchical
-
-module lz2 (P, V, B);
-
-   input logic  [1:0] B;
-
-   output logic P;
-   output logic V;
-
-   assign V = B[0] | B[1];
-   assign P = B[0] & ~B[1];
-   
-endmodule // lz2
-
-module lzd_hier #(parameter WIDTH=8) 
-   (input logic [WIDTH-1:0]          B,
-    output logic [$clog2(WIDTH)-1:0] ZP,
-    output logic 		     ZV);
-
-   if (WIDTH == 128)
-     lz128 lzd127 (ZP, ZV, B);	      
-   else if (WIDTH == 64)
-     lz64 lzd64 (ZP, ZV, B);	   
-   else if (WIDTH == 32)
-     lz32 lzd32 (ZP, ZV, B);
-   else if (WIDTH == 16)
-     lz16 lzd16 (ZP, ZV, B);
-   else if (WIDTH == 8)
-     lz8 lzd8 (ZP, ZV, B);
-   else if (WIDTH == 4)
-     lz4 lzd4 (ZP, ZV, B);
-
-endmodule // lzd_hier
-
-module lz4 (ZP, ZV, B);
-
-   input logic [3:0]  B;
-
-   logic  	       ZPa;
-   logic  	       ZPb;
-   logic 	       ZVa;
-   logic 	       ZVb;   
-
-   output logic [1:0]  ZP;
-   output logic        ZV;
-
-   lz2 l1(ZPa, ZVa, B[1:0]);
-   lz2 l2(ZPb, ZVb, B[3:2]);
-
-   assign ZP[0:0] = ZVb ? ZPb : ZPa;
-   assign ZP[1]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule 
-
-module lz8 (ZP, ZV, B);
-
-   input logic [7:0]  B;
-
-   logic [1:0] 	       ZPa;
-   logic [1:0] 	       ZPb;
-   logic 	       ZVa;
-   logic 	       ZVb;   
-
-   output logic [2:0]  ZP;
-   output logic        ZV;
-
-   lz4 l1(ZPa, ZVa, B[3:0]);
-   lz4 l2(ZPb, ZVb, B[7:4]);
-
-   assign ZP[1:0] = ZVb ? ZPb : ZPa;
-   assign ZP[2]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule 
-
-module lz16 (ZP, ZV, B);
-
-   input logic [15:0]  B;
-
-   logic [2:0] 	       ZPa;
-   logic [2:0] 	       ZPb;
-   logic 	       ZVa;
-   logic 	       ZVb;   
-
-   output logic [3:0]  ZP;
-   output logic        ZV;
-
-   lz8 l1(ZPa, ZVa, B[7:0]);
-   lz8 l2(ZPb, ZVb, B[15:8]);
-
-   assign ZP[2:0] = ZVb ? ZPb : ZPa;
-   assign ZP[3]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule // lz16
-
-module lz32 (ZP, ZV, B);
-
-   input logic [31:0] B;
-
-   logic [3:0] 	      ZPa;
-   logic [3:0] 	      ZPb;
-   logic 	      ZVa;
-   logic 	      ZVb;
-   
-   output logic [4:0] ZP;
-   output logic       ZV;
-   
-   lz16 l1(ZPa, ZVa, B[15:0]);
-   lz16 l2(ZPb, ZVb, B[31:16]);
-   
-   assign ZP[3:0] = ZVb ? ZPb : ZPa;
-   assign ZP[4]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule // lz32
-
-module lz64 (ZP, ZV, B);
-
-   input logic [63:0]  B;
-   
-   logic [4:0] 	       ZPa;
-   logic [4:0] 	       ZPb;
-   logic 	       ZVa;
-   logic 	       ZVb;
-   
-   output logic [5:0]  ZP;
-   output logic        ZV;
-   
-   lz32 l1(ZPa, ZVa, B[31:0]);
-   lz32 l2(ZPb, ZVb, B[63:32]);
-   
-   assign ZP[4:0] = ZVb ? ZPb : ZPa;
-   assign ZP[5]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule // lz64
-
-module lz128 (ZP, ZV, B);
-
-   input logic [127:0]  B;
-   
-   logic [5:0] 	       ZPa;
-   logic [5:0] 	       ZPb;
-   logic 	       ZVa;
-   logic 	       ZVb;
-   
-   output logic [6:0]  ZP;
-   output logic        ZV;
-   
-   lz64 l1(ZPa, ZVa, B[64:0]);
-   lz64 l2(ZPb, ZVb, B[127:63]);
-   
-   assign ZP[5:0] = ZVb ? ZPb : ZPa;
-   assign ZP[6]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule // lz128
-
-/* verilator lint_on DECLFILENAME */