Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

2025-02-11 06:05:49 +00:00 · 2021-06-22 15:47:16 -05:00 · 2021-06-22 15:47:16 -05:00 · e7d8d0b337
commit e7d8d0b337
parent 03084a4128 9eb6eb40bf
15 changed files with 32224 additions and 214817 deletions
--- a/.gitignore
+++ b/.gitignore
@ -26,3 +26,5 @@ testsBP/*/*.a
 wally-pipelined/linux-testgen/linux-testvectors/*
 !wally-pipelined/linux-testgen/linux-testvectors/tvCopier.py
 !wally-pipelined/linux-testgen/linux-testvectors/tvLinker.sh
+wally-pipelined/regression/slack-notifier/slack-webhook-url.txt
+
--- a/wally-pipelined/regression/slack-notifier/slack-notifier.py
+++ b/wally-pipelined/regression/slack-notifier/slack-notifier.py
@ -0,0 +1,64 @
+#!/usr/bin/python3
+"""Post a Slack message when the simulation that ran this script stops.
+
+The webhook URL is read from slack-webhook-url.txt in this script's directory;
+if that file is missing, setup instructions are printed instead.
+"""
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime, timezone, timedelta
+
+URL_FILE = os.path.join(sys.path[0], 'slack-webhook-url.txt')
+
+
+def ps_field(field, pid):
+    """Return the `ps -o field` value for process `pid` (header dropped)."""
+    output = subprocess.check_output(['ps', '-o', field, '-p', str(pid)])
+    # Line 0 is the column header; line 1 holds the value for the process.
+    return output.decode(errors='replace').splitlines()[1].strip()
+
+
+def main():
+    if not os.path.isfile(URL_FILE):
+        print('==============================================================')
+        print('                             HOWDY!                           ')
+        print('slack-notifier.py can help let you know when your sim is done.')
+        print('To make it work, please supply your Slack bot webhook URL in:')
+        print(URL_FILE)
+        print('Ask Ben for the Tera Slack Notifier Tutorial for more details.')
+        print('==============================================================')
+        return
+
+    # Use a context manager so the file handle is always closed.
+    with open(URL_FILE, 'r') as urlFile:
+        url = urlFile.readline().strip()
+
+    # Walk up the process tree: parent of this script, then its parent.
+    parentPid = os.getppid()
+    grandparentPid = ps_field('ppid', parentPid)
+    # Get command name
+    cmdName = ps_field('cmd', grandparentPid)
+    # Get current time (fixed UTC-08:00 offset, no daylight-saving shift)
+    timezone_offset = -8.0  # Pacific Standard Time (UTC−08:00)
+    tzinfo = timezone(timedelta(hours=timezone_offset))
+    time = datetime.now(tzinfo).strftime('%I:%M %p')
+    # Send message
+    message = 'Command `'+cmdName+'` completed at '+time+' PST'
+    # json.dumps escapes quotes in the command line, and the argument list
+    # keeps the command name and URL from being interpreted by a shell.
+    payload = json.dumps({'text': message})
+    curlCmd = ['curl', '-X', 'POST', '-H', 'Content-type: application/json',
+               '--data', payload, url]
+    try:
+        subprocess.run(curlCmd, stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL)
+    except FileNotFoundError:
+        # Notification is best-effort: a missing curl must not raise here.
+        pass
+    print('Simulation stopped. Sending Slack message.')
+
+
+if __name__ == '__main__':
+    main()
--- a/wally-pipelined/regression/wally-buildroot-batch.do
+++ b/wally-pipelined/regression/wally-buildroot-batch.do
@ -36,4 +36,5 @@ vsim workopt -suppress 8852,12070

 run -all
 run -all
+exec ./slack-notifier/slack-notifier.py
 quit 
--- a/wally-pipelined/regression/wally-buildroot.do
+++ b/wally-pipelined/regression/wally-buildroot.do
@ -39,4 +39,5 @@ vsim workopt -suppress 8852,12070
 run -all
 do ./wave-dos/linux-waves.do
 run -all
+exec ./slack-notifier/slack-notifier.py
 ##quit
--- a/wally-pipelined/regression/wally-busybear-batch.do
+++ b/wally-pipelined/regression/wally-busybear-batch.do
@ -36,4 +36,5 @@ vopt work_busybear.testbench -o workopt_busybear
 vsim workopt_busybear -suppress 8852,12070

 run -all
+exec ./slack-notifier/slack-notifier.py
 quit
--- a/wally-pipelined/regression/wally-busybear.do
+++ b/wally-pipelined/regression/wally-busybear.do
@ -40,4 +40,5 @@ do ./wave-dos/linux-waves.do

 #-- Run the Simulation 
 run -all
+exec ./slack-notifier/slack-notifier.py
 ##quit
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.sv
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.sv
@ -110,7 +110,7 @@ always @(posedge clk)
 		if(ans >= 64'h7FF8000000000000 && ans <= 64'h7FFfffffffffffff ) $display( "ans=qutNaN ");
 		if(ans >= 64'hFFF8000000000000 && ans <= 64'hFFFfffffffffffff ) $display( "ans=qutNaN ");
        errors = errors + 1;
-	 // if (errors == 40)
+	  if (errors == 20)
 		$stop;
    end
    if((FmtE==1'b0)&(FmaFlagsM != flags[4:0] || (!wnan && (FmaResultM != ans)) || (wnan && ansnan && ~(((xnan && (FmaResultM[62:0] == {FInput1E[62:55],1'b1,FInput1E[53:0]})) || (ynan && (FmaResultM[62:0] == {FInput2E[62:55],1'b1,FInput2E[53:0]}))  || (znan && (FmaResultM[62:0] == {FInput3E[62:55],1'b1,FInput3E[53:0]})) || (FmaResultM[62:0] == ans[62:0]))) ))) begin
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/src/fpu/FMA/tbgen/test_gen.sh
@ -1,3 +1,3 @@
-testfloat_gen f64_mulAdd -tininessbefore -n 6133248 -rmin  -seed 113355 -level 1 > testFloat
+testfloat_gen f64_mulAdd -tininessafter -n 6133248 -rmin  -seed 113355 -level 1 > testFloat
 tr -d ' ' < testFloat > testFloatNoSpace

--- a/wally-pipelined/src/fpu/fma2.sv
+++ b/wally-pipelined/src/fpu/fma2.sv
@ -288,9 +288,9 @@ module fma2(

 	// Set Underflow flag if the number is too small to be represented in normal numbers
 	//		- Don't set the underflow flag if the result is exact 
-	assign Underflow = (SumExp[12] | ((SumExp == 0) & (Round|Guard|Sticky))    )&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
-	assign UnderflowFlag = Underflow | (FullResultExp == 0)&Minus1; // before rounding option
-	// assign UnderflowFlag = (Underflow | (FullResultExp == 0)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM)&(Round|Guard|Sticky))  & ~(FullResultExp == 1); //after rounding option
+	assign Underflow = (SumExp[12] | ((SumExp == 0) & (Round|Guard|Sticky)))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+	//assign UnderflowFlag = (Underflow | (FullResultExp == 0)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM)&(Round|Guard|Sticky))  & ~(FullResultExp == 1);
+	assign UnderflowFlag = (Underflow | (FullResultExp == 0)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM)&(Round|Guard|Sticky))  & ~(FullResultExp == 1);
 	// Set Inexact flag if the result is diffrent from what would be outputed given infinite precision
 	//		- Don't set the underflow flag if an underflowed result isn't outputed
 	assign Inexact = (Sticky|Overflow|Guard|Round|Underflow)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -25,129 +25,127 @@
 `include "wally-config.vh"

 module fpu (
-  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
-  input logic 		   reset,
-  //input  logic             clear,     // *** not being used anywhere
-  input logic 		   clk,
-  input logic [31:0] 	   InstrD,
-  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed
-  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg
-  input logic 		   StallE, StallM, StallW,
-  input logic 		   FlushE, FlushM, FlushW,
-  input logic [`AHBW-1:0]  HRDATA,
-  input logic 		   RegWriteD,
-  output logic [4:0] 	   SetFflagsM,
-  output logic [31:0] 	   FSROutW,
-  output logic [1:0] 	   FMemRWM,
-  output logic 		   FStallD,
-  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW,
-  output logic [`XLEN-1:0] FWriteDataM,
-  output logic 		   FDivBusyE,
-  output logic 		   IllegalFPUInstrD,
-  output logic [`XLEN-1:0] FPUResultW);
+  input logic [2:0]        FRM_REGW,   // Rounding mode from CSR
+  input logic 		         reset,
+  input logic 		         clk,
+  input logic [31:0]       InstrD,
+  input logic [`XLEN-1:0]  SrcAE,      // Integer input being processed
+  input logic [`XLEN-1:0]  SrcAM,      // Integer input being written into fpreg
+  input logic 		         StallE, StallM, StallW,
+  input logic 		         FlushE, FlushM, FlushW,
+  input logic [`XLEN-1:0]  ReadDataW,     // Read data from memory
+  input logic 		         RegWriteD,  // register write enable from ieu
+  output logic [4:0] 	   SetFflagsM, // FPU flags
+  output logic [1:0] 	   FMemRWM,    // Read/write enable for memory {read, write}
+  output logic 		      FStallD,    // Stall the decode stage if Div/Sqrt instruction
+  output logic 		      FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
+  output logic [`XLEN-1:0] FWriteDataM,      // Data to be written to memory
+  output logic 		      FDivBusyE,        // Is the division/sqrt unit busy
+  output logic 		      IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+  output logic [`XLEN-1:0] FPUResultW);      // FPU result

   // control logic signal instantiation
-   logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;             // FP register write enable
-   logic [2:0] 		   FrmD, FrmE, FrmM, FrmW;                                 // FP rounding mode
-   logic 		   FmtD, FmtE, FmtM, FmtW;                                 // FP precision 0-single 1-double
-   logic 		   FDivStartD, FDivStartE;                                 // Start division
-   logic 		   FWriteIntD;                                 // Write to integer register
-   logic 		   FOutputInput2D, FOutputInput2E;                         // Put Input2 in Input1 if a store instruction
-   logic [1:0] 		   FMemRWD, FMemRWE;                                       // Read and write enable for memory
-   logic [1:0] 		   FForwardInput1D, FForwardInput1E;                       // Input1 forwarding mux control signal
-   logic [1:0] 		   FForwardInput2D, FForwardInput2E;                       // Input2 forwarding mux control signal
-   logic 		   FForwardInput3D, FForwardInput3E;                       // Input3 forwarding mux control signal
-   logic 		   FInput2UsedD;                                           // Is input 2 used
-   logic 		   FInput3UsedD;                                           // Is input 3 used
-   logic [2:0] 		   FResultSelD, FResultSelE, FResultSelM, FResultSelW;     // Select FP result
-   logic [3:0] 		   FOpCtrlD, FOpCtrlE, FOpCtrlM;                           // Select which opperation to do in each component
-   logic          SelLoadInputE, SelLoadInputM;
+   logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;              // FP register write enable
+   logic [2:0] 	FrmD, FrmE, FrmM, FrmW;                                  // FP rounding mode
+   logic 		   FmtD, FmtE, FmtM, FmtW;                                  // FP precision 0-single 1-double
+   logic 		   FDivStartD, FDivStartE;                                  // Start division
+   logic 		   FWriteIntD;                                              // Write to integer register
+   logic 		   FOutputInput2D, FOutputInput2E;                          // Put Input2 in Input1 if a store instruction
+   logic [1:0] 	FMemRWD, FMemRWE;                                        // Read and write enable for memory
+   logic [1:0]    FForwardInput1D, FForwardInput1E;                        // Input1 forwarding mux control signal
+   logic [1:0] 	FForwardInput2D, FForwardInput2E;                        // Input2 forwarding mux control signal
+   logic 		   FForwardInput3D, FForwardInput3E;                        // Input3 forwarding mux control signal
+   logic 		   FInput2UsedD;                                            // Is input 2 used
+   logic 		   FInput3UsedD;                                            // Is input 3 used
+   logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
+   logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM, FOpCtrlW;                            // Select which operation to do in each component
+   logic          SelLoadInputE, SelLoadInputM;                            // Select which address to load when single precision
   
   // regfile signals //*** KEP lint warning -  changed `XLEN-1 to 63 
-   logic [4:0] 		   RdE, RdM, RdW; // ***Can take from ieu
-   logic [63:0] 	   FWDM;                                                   // Write data for FP register
-   logic [63:0] 	   FRD1D, FRD2D, FRD3D;                                    // Read Data from FP register
-   logic [63:0] 	   FRD1E, FRD2E, FRD3E;
-   logic [63:0] 	   FInput1E, FInput1M, FInput1tmpE;
-   logic [63:0] 	   FInput2E, FInput2M;
-   logic [63:0] 	   FInput3E, FInput3M;
-   logic [63:0] 	   FLoadResultM, FLoadStoreResultM, FLoadStoreResultW;                   // Result for load, store, and move to int-reg instructions
+   logic [4:0]    RdE, RdM, RdW;                                           // what address to write to    // ***Can take from ieu instead of pipelining
+   logic [63:0] 	FWDM;                                                    // Write data for FP register
+   logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
+   logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
+   logic [63:0] 	FInput1E, FInput1M, FInput1W, FInput1tmpE;                         // Input 1 to the various units (after forwarding)
+   logic [63:0] 	FInput2E, FInput2M;                                      // Input 2 to the various units (after forwarding)
+   logic [63:0] 	FInput3E, FInput3M;                                      // Input 3 to the various units (after forwarding)
+   logic [63:0] 	FLoadResultW, FLoadStoreResultM, FLoadStoreResultW;      // Result for load, store, and move to int-reg instructions
   
   // div/sqrt signals
   logic 		   DivDenormE, DivDenormM, DivDenormW;
   logic 		   DivOvEn, DivUnEn;
-   logic [63:0] 	   FDivResultE, FDivResultM, FDivResultW;
-   logic [4:0] 		   FDivFlagsE, FDivFlagsM, FDivFlagsW;
-   logic            FDivSqrtDoneE, FDivSqrtDoneM;
-   logic [63:0] 	 DivInput1E, DivInput2E;
-   logic HoldInputs;
+   logic [63:0] 	FDivResultE, FDivResultM, FDivResultW;
+   logic [4:0] 	FDivFlagsE, FDivFlagsM, FDivFlagsW;
+   logic          FDivSqrtDoneE, FDivSqrtDoneM;
+   logic [63:0] 	DivInput1E, DivInput2E;
+   logic          HoldInputs;                                              // keep forwarded inputs around during division
   
   // FMA signals
-	logic 	[105:0]		ProdManE, ProdManM;
-	logic 	[161:0]		AlignedAddendE,	AlignedAddendM;
-	logic 	[12:0]		ProdExpE, ProdExpM;
-	logic 				    AddendStickyE, AddendStickyM;
-	logic 				    KillProdE, KillProdM;
-	logic				      XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
-	logic				      XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
-	logic				      XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
-  logic [63:0]      FmaResultM, FmaResultW;
-  logic [4:0]       FmaFlagsM, FmaFlagsW;
+	logic [105:0]	ProdManE, ProdManM;
+	logic [161:0]	AlignedAddendE, AlignedAddendM;                       
+	logic [12:0]	ProdExpE, ProdExpM;
+	logic 			AddendStickyE, AddendStickyM;
+	logic 			KillProdE, KillProdM;
+	logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
+	logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
+	logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
+   logic [63:0]   FmaResultM, FmaResultW;
+   logic [4:0]    FmaFlagsM, FmaFlagsW;

   // add/cvt signals
-   logic [63:0] 	   AddSumE, AddSumTcE;
-   logic [3:0] 		   AddSelInvE;
-   logic [10:0] 	   AddExpPostSumE;
+   logic [63:0] 	AddSumE, AddSumTcE;
+   logic [3:0] 	AddSelInvE;
+   logic [10:0] 	AddExpPostSumE;
   logic 		   AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
   logic 		   AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
   logic 		   AddConvertE;
-   logic [63:0] 	   AddFloat1E, AddFloat2E;
-   logic [11:0] 	   AddExp1DenormE, AddExp2DenormE;
-   logic [10:0] 	   AddExponentE;
-   logic [2:0] 		   AddRmE;
-   logic [3:0] 		   AddOpTypeE;
+   logic [63:0] 	AddFloat1E, AddFloat2E;
+   logic [11:0] 	AddExp1DenormE, AddExp2DenormE;
+   logic [10:0] 	AddExponentE;
+   logic [2:0] 	AddRmE;
+   logic [3:0] 	AddOpTypeE;
   logic 		   AddPE, AddOvEnE, AddUnEnE;    
   logic 		   AddDenormM;
-   logic [63:0] 	   AddSumM, AddSumTcM;
-   logic [3:0] 		   AddSelInvM;
-   logic [10:0] 	   AddExpPostSumM;
+   logic [63:0] 	AddSumM, AddSumTcM;
+   logic [3:0] 	AddSelInvM;
+   logic [10:0] 	AddExpPostSumM;
   logic 		   AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
   logic 		   AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
   logic 		   AddConvertM, AddSignM;
-   logic [63:0] 	   AddFloat1M, AddFloat2M;
-   logic [11:0] 	   AddExp1DenormM, AddExp2DenormM;
-   logic [10:0] 	   AddExponentM;
-   logic [63:0] 	   AddOp1M, AddOp2M;
-   logic [2:0] 		   AddRmM;
-   logic [3:0] 		   AddOpTypeM;
+   logic [63:0] 	AddFloat1M, AddFloat2M;
+   logic [11:0] 	AddExp1DenormM, AddExp2DenormM;
+   logic [10:0] 	AddExponentM;
+   logic [63:0] 	AddOp1M, AddOp2M;
+   logic [2:0] 	AddRmM;
+   logic [3:0] 	AddOpTypeM;
   logic 		   AddPM, AddOvEnM, AddUnEnM;  
-   logic [63:0] 	   FAddResultM, FAddResultW;
-   logic [4:0] 		   FAddFlagsM, FAddFlagsW;
+   logic [63:0] 	FAddResultM, FAddResultW;
+   logic [4:0] 	FAddFlagsM, FAddFlagsW;
   
   // cmp signals 
-   logic [7:0] 		   WE, WM;
-   logic [7:0] 		   XE, XM;
+   logic [7:0] 	WE, WM;
+   logic [7:0] 	XE, XM;
   logic 		   ANaNE, ANaNM;
   logic 		   BNaNE, BNaNM;
   logic 		   AzeroE, AzeroM;
   logic 		   BzeroE, BzeroM;
   logic 		   CmpInvalidM, CmpInvalidW;
-   logic [1:0] 		   CmpFCCM, CmpFCCW; 
-   logic [63:0] 	   FCmpResultM, FCmpResultW;
+   logic [1:0] 	CmpFCCM, CmpFCCW; 
+   logic [63:0] 	FCmpResultM, FCmpResultW;
   
   // fsgn signals
-   logic [63:0] 	   SgnResultE, SgnResultM, SgnResultW;
-   logic [4:0] 		   SgnFlagsE, SgnFlagsM, SgnFlagsW;
+   logic [63:0] 	SgnResultE, SgnResultM, SgnResultW;
+   logic [4:0] 	SgnFlagsE, SgnFlagsM, SgnFlagsW;
   
   // instantiation of W stage regfile signals
-   logic [63:0] 	   AlignedSrcAM, ForwardSrcAM, SrcAW;
+   logic [63:0] 	AlignedSrcAM, ForwardSrcAM, SrcAW;
   
   // classify signals
-   logic [63:0] 	   ClassResultE, ClassResultM, ClassResultW;
+   logic [63:0] 	ClassResultE, ClassResultM, ClassResultW;
   
   // 64-bit FPU result   
-   logic [63:0] 	   FPUResult64W, FPUResult64E;                                           
-   logic [4:0] 		   FPUFlagsW;
+   logic [63:0] 	FPUResult64W, FPUResult64E;                                           
+   logic [4:0] 	FPUFlagsW;
   
   // pipeline control logic
   logic 		   PipeEnableDE;
@ -159,8 +157,8 @@ module fpu (
   
   // temporarily assign pipe clear and enable signals
   // to never flush & always be running
-   localparam PipeClear = 1'b0;
-   localparam PipeEnable = 1'b1;
+   localparam     PipeClear = 1'b0;
+   localparam     PipeEnable = 1'b1;
   always_comb begin      
      PipeEnableDE = ~StallE;
      PipeEnableEM = ~StallM;
@ -219,6 +217,7 @@ module fpu (
   mux2  #(64)  FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E);
   mux2  #(64)  FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E);
   
+   // first of two-stage instance of floating-point fused multiply-add unit
   fma1 fma1 (.X(FInput1E), .Y(FInput2E), .Z(FInput3E), .FOpCtrlE(FOpCtrlE[2:0]),.*);
   
   // first and only instance of floating-point divider
@ -275,13 +274,6 @@ module fpu (
  flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, XNaNE, XNaNM); 
  flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, YNaNE, YNaNM); 
  flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, ZNaNE, ZNaNM);  
-   
-   //*****************
-   // fpdiv E/M pipe registers
-   //*****************
-   // flopenrc #(64) EMRegDiv1(clk, reset, PipeClearEM, PipeEnableEM, FDivResultE, FDivResultM); 
-   // flopenrc #(5) EMRegDiv2(clk, reset, PipeClearEM, PipeEnableEM, FDivFlagsE, FDivFlagsM);
-   // flopenrc #(1) EMRegDiv3(clk, reset, PipeClearEM, PipeEnableEM, DivDenormE, DivDenormM); 

   //*****************
   // fpadd E/M pipe registers
@ -352,8 +344,8 @@ module fpu (
   assign FWriteDataM = FmtM ? FInput1M[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FInput1M[63:32]};
   //adjecent adress values are sent to the FPU, select the correct one
   //    -imm is 80000 most of the time vs the error one which is 00000
-   mux3  #(64)  FLoadResultMux({HRDATA[31:0], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA[`AHBW-1:`AHBW-32], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA, {64-`AHBW{1'b0}}}, {FmtM, SelLoadInputM}, FLoadResultM);
-   mux2  #(64)  FLoadStoreResultMux(FLoadResultM, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
+   // mux3  #(64)  FLoadResultMux({HRDATA[31:0], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA[`AHBW-1:`AHBW-32], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA, {64-`AHBW{1'b0}}}, {FmtM, SelLoadInputM}, FLoadResultM);
+   // mux2  #(64)  FLoadStoreResultMux(FLoadResultM, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
   
   fma2 fma2(.X(FInput1M), .Y(FInput2M), .Z(FInput3M), .FOpCtrlM(FOpCtrlM[2:0]), .*);
   
@ -364,8 +356,18 @@ module fpu (
   fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), 
 		   .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*);

+   // Align SrcA to MSB when single precision
   mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
      
+
+
+
+      
+   //*****************
+   //fpregfile M/W pipe registers
+   //*****************
+   flopenrc #(64) MWFpReg1(clk, reset, PipeClearMW, PipeEnableMW, FInput1M, FInput1W);
+   
   //*****************
   // fma M/W pipe registers
   //*****************
@ -406,18 +408,36 @@ module fpu (
   flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
   flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
   flopenrc #(64) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, AlignedSrcAM, SrcAW);
-   flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
+   // flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
   flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
+   flopenrc #(4) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FOpCtrlM, FOpCtrlW);
   
   //*****************
   // fpuclassify M/W pipe registers
   //***************** 
   flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW);
+   
+
+
+

  //#########################################
  // BEGIN WRITEBACK STAGE
  //#########################################
   
+
+   // mux3  #(64)  FLoadResultMux({ReadD[31:0], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA[`AHBW-1:`AHBW-32], {64-`AHBW+(`XLEN-32){1'b0}}}, {HRDATA, {64-`AHBW{1'b0}}}, {FmtM, SelLoadInputM}, FLoadResultM);
+   // mux2  #(64)  FLoadStoreResultMux(FLoadResultM, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
+   //***RV32D needs to give two bus transactions
+    mux2  #(64)  FLoadResultMux({ReadDataW[31:0], {32{1'b0}}}, {ReadDataW, {64-`XLEN{1'b0}}}, FmtW, FLoadResultW);
+    mux2  #(64)  FLoadStoreResultMux(FLoadResultW, FInput1W, |FOpCtrlW[2:1], FLoadStoreResultW);
+
+
+
+
+
+
+
   always_comb begin
      case (FResultSelW)
 	// div/sqrt
--- a/wally-pipelined/src/mmu/pagetablewalker.sv
+++ b/wally-pipelined/src/mmu/pagetablewalker.sv
@ -110,7 +110,7 @@ module pagetablewalker (
  assign PageTypeF = PageType;
  assign PageTypeM = PageType;

-  localparam LEVEL0 = 3'h0;
+localparam LEVEL0 = 3'h0;
  localparam LEVEL1 = 3'h1;
  // space left for more levels
  localparam LEAF = 3'h5;
@ -216,7 +216,7 @@ module pagetablewalker (
    end else begin
      localparam LEVEL2 = 3'h2;
      localparam LEVEL3 = 3'h3;
-
+      
      logic [8:0] VPN3, VPN2, VPN1, VPN0;

      logic TerapageMisaligned, GigapageMisaligned, BadTerapage, BadGigapage;
@ -225,49 +225,53 @@ module pagetablewalker (

      always_comb begin
        case (WalkerState)
-          IDLE:   if      (MMUTranslate)           NextWalkerState = LEVEL3;
-                  else                             NextWalkerState = IDLE;
-          LEVEL3: if      (SvMode != `SV48)         NextWalkerState = LEVEL2;
-                  // 3rd level used if SV48 is enabled.
-                  else begin
-                    if      (~MMUReady)              NextWalkerState = LEVEL3;
-                    // *** <FUTURE WORK> According to the architecture, we should
-                    // fault upon finding a superpage that is misaligned or has 0
-                    // access bit. The following commented line of code is
-                    // supposed to perform that check. However, it is untested.
-                    else if (ValidPTE && LeafPTE && ~BadTerapage) NextWalkerState = LEAF;
-                    // else if (ValidPTE && LeafPTE)    NextWalkerState = LEAF;  // *** Once the above line is properly tested, delete this line.
-                    else if (ValidPTE && ~LeafPTE)   NextWalkerState = LEVEL2;
-                    else                             NextWalkerState = FAULT;
-                  end
-          LEVEL2: if      (~MMUReady)              NextWalkerState = LEVEL2;
+          IDLE:   if      (MMUTranslate && SvMode == `SV48)     NextWalkerState = LEVEL3;
+                  else if (MMUTranslate && SvMode == `SV39)     NextWalkerState = LEVEL2;
+                  else                                          NextWalkerState = IDLE;
+
+          LEVEL3: if      (~MMUReady)                           NextWalkerState = LEVEL3;
+                  // *** <FUTURE WORK> According to the architecture, we should
+                  // fault upon finding a superpage that is misaligned or has 0
+                  // access bit. The following commented line of code is
+                  // supposed to perform that check. However, it is untested.
+                  else if (ValidPTE && LeafPTE && ~BadTerapage) NextWalkerState = LEAF;
+                  // else if (ValidPTE && LeafPTE)    NextWalkerState = LEAF;  // *** Once the above line is properly tested, delete this line.
+                  else if (ValidPTE && ~LeafPTE)                NextWalkerState = LEVEL2;
+                  else                                          NextWalkerState = FAULT;
+
+          LEVEL2: if      (~MMUReady)                           NextWalkerState = LEVEL2;
                  // *** <FUTURE WORK> According to the architecture, we should
                  // fault upon finding a superpage that is misaligned or has 0
                  // access bit. The following commented line of code is
                  // supposed to perform that check. However, it is untested.
                  else if (ValidPTE && LeafPTE && ~BadGigapage) NextWalkerState = LEAF;
                  // else if (ValidPTE && LeafPTE)    NextWalkerState = LEAF;  // *** Once the above line is properly tested, delete this line.
-                  else if (ValidPTE && ~LeafPTE)   NextWalkerState = LEVEL1;
-                  else                             NextWalkerState = FAULT;
-          LEVEL1: if      (~MMUReady)              NextWalkerState = LEVEL1;
+                  else if (ValidPTE && ~LeafPTE)                NextWalkerState = LEVEL1;
+                  else                                          NextWalkerState = FAULT;
+
+          LEVEL1: if      (~MMUReady)                           NextWalkerState = LEVEL1;
                  // *** <FUTURE WORK> According to the architecture, we should
                  // fault upon finding a superpage that is misaligned or has 0
                  // access bit. The following commented line of code is
                  // supposed to perform that check. However, it is untested.
                  else if (ValidPTE && LeafPTE && ~BadMegapage) NextWalkerState = LEAF;
                  // else if (ValidPTE && LeafPTE)    NextWalkerState = LEAF;  // *** Once the above line is properly tested, delete this line.
-                  else if (ValidPTE && ~LeafPTE)   NextWalkerState = LEVEL0;
-                  else                             NextWalkerState = FAULT;
-          LEVEL0: if      (~MMUReady)              NextWalkerState = LEVEL0;
-                  else if (ValidPTE && LeafPTE && ~AccessAlert)
-                                                   NextWalkerState = LEAF;
-                  else                             NextWalkerState = FAULT;
-          LEAF:   if      (MMUTranslate)           NextWalkerState = LEVEL3;
-                  else                             NextWalkerState = IDLE;
-          FAULT:  if      (MMUTranslate)           NextWalkerState = LEVEL3;
-                  else                             NextWalkerState = IDLE;
+                  else if (ValidPTE && ~LeafPTE)                NextWalkerState = LEVEL0;
+                  else                                          NextWalkerState = FAULT;
+
+          LEVEL0: if      (~MMUReady)                           NextWalkerState = LEVEL0;
+                  else if (ValidPTE && LeafPTE && ~AccessAlert) NextWalkerState = LEAF;
+                  else                                          NextWalkerState = FAULT;
+                  
+          LEAF:   if      (MMUTranslate && SvMode == `SV48)     NextWalkerState = LEVEL3;
+                  else if (MMUTranslate && SvMode == `SV39)     NextWalkerState = LEVEL2;
+                  else                                          NextWalkerState = IDLE;
+
+          FAULT:  if      (MMUTranslate && SvMode == `SV48)     NextWalkerState = LEVEL3;
+                  else if (MMUTranslate && SvMode == `SV39)     NextWalkerState = LEVEL2;
+                  else                                          NextWalkerState = IDLE;
          // Default case should never happen, but is included for linter.
-          default:                                 NextWalkerState = IDLE;
+          default:                                              NextWalkerState = IDLE;
        endcase
      end

--- a/wally-pipelined/src/mmu/physicalpagemask.sv
+++ b/wally-pipelined/src/mmu/physicalpagemask.sv
@ -7,6 +7,7 @@
 //
 // Purpose: Takes two page numbers and replaces segments of the first page
 //          number with segments from the second, based on the page type.
+//          NOTE: this DOES NOT include the 12 bit offset, which is the same no matter the translation mode or page type.
 // 
 // A component of the Wally configurable RISC-V project.
 // 
@ -36,26 +37,26 @@ module physicalpagemask (
 );

  localparam EXTRA_BITS = `PPN_BITS - `VPN_BITS;
-  logic [`PPN_BITS-1:0] ZeroExtendedVPN = {{EXTRA_BITS{1'b0}}, VPN}; // forces the VPN to be the same width as PPN.
-
-  logic [`PPN_BITS-1:0] OffsetMask;
+  logic [`PPN_BITS-1:0] ZeroExtendedVPN;
+  logic [`PPN_BITS-1:0] PageNumberMask;

+  assign ZeroExtendedVPN = {{EXTRA_BITS{1'b0}}, VPN}; // forces the VPN to be the same width as PPN.

  generate
    if (`XLEN == 32) begin
      always_comb 
        case (PageType[0])
          // *** the widths of these constansts are hardocded here to match `PPN_BITS in the wally-constants file.
-          0: OffsetMask = 22'h3FFFFF; // kilopage: 22 bits of PPN, 0 bits of VPN
-          1: OffsetMask = 22'h3FFC00; // megapage: 12 bits of PPN, 10 bits of VPN
+          0: PageNumberMask = 22'h3FFFFF; // kilopage: 22 bits of PPN, 0 bits of VPN
+          1: PageNumberMask = 22'h3FFC00; // megapage: 12 bits of PPN, 10 bits of VPN
        endcase
    end else begin
      always_comb 
        case (PageType[1:0])
-          0: OffsetMask = 44'hFFFFFFFFFFF; // kilopage: 44 bits of PPN, 0 bits of VPN
-          1: OffsetMask = 44'hFFFFFFFFE00; // megapage: 35 bits of PPN, 9 bits of VPN
-          2: OffsetMask = 44'hFFFFFFC0000; // gigapage: 26 bits of PPN, 18 bits of VPN
-          3: OffsetMask = 44'hFFFF8000000; // terapage: 17 bits of PPN, 27 bits of VPN
+          0: PageNumberMask = 44'hFFFFFFFFFFF; // kilopage: 44 bits of PPN, 0 bits of VPN
+          1: PageNumberMask = 44'hFFFFFFFFE00; // megapage: 35 bits of PPN, 9 bits of VPN
+          2: PageNumberMask = 44'hFFFFFFC0000; // gigapage: 26 bits of PPN, 18 bits of VPN
+          3: PageNumberMask = 44'hFFFF8000000; // terapage: 17 bits of PPN, 27 bits of VPN
          // *** make sure that this doesnt break when using sv39. In that case, all of these
          //     busses are the widths for sv48, but extra bits should be zeroed out by the mux
          //     in the tlb when it generates VPN from the full virtualadress.
@ -63,7 +64,7 @@ module physicalpagemask (
    end
  endgenerate

-  // merge low bits of the virtual address containing the offset with high bits of the PPN
-  assign MixedPageNumber = (ZeroExtendedVPN & ~OffsetMask) | (PPN & OffsetMask);
+  // merge low segments of VPN with high segments of PPN decided by the pagetype.
+  assign MixedPageNumber = (ZeroExtendedVPN & ~PageNumberMask) | (PPN & PageNumberMask);

 endmodule
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -99,7 +99,6 @@ module wallypipelinedhart (
  logic       SquashSCW;
  logic       FStallD;
  logic       FWriteIntE, FWriteIntW, FWriteIntM;
-  logic [31:0]      FSROutW;
  logic             FDivBusyE;
  logic             IllegalFPUInstrD, IllegalFPUInstrE;
  logic [`XLEN-1:0] FPUResultW;
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -59,15 +59,15 @@ module testbench();
 string tests32f[] = '{
    "rv32f/I-FADD-S-01", "2000",
    "rv32f/I-FCLASS-S-01", "2000",
-    "rv32f/I-FCVT-S-L-01", "2000",
-    "rv32f/I-FCVT-S-LU-01", "2000",
-    "rv32f/I-FCVT-S-W-01", "2000",
-    "rv32f/I-FCVT-S-WU-01", "2000",
-    "rv32f/I-FCVT-L-S-01", "2000",
-    "rv32f/I-FCVT-LU-S-01", "2000",
-    "rv32f/I-FCVT-W-S-01", "2000",
-    "rv32f/I-FCVT-WU-S-01", "2000",
-    "rv32f/I-FDIV-S-01", "2000",
+    // "rv32f/I-FCVT-S-L-01", "2000",
+    // "rv32f/I-FCVT-S-LU-01", "2000",
+    // "rv32f/I-FCVT-S-W-01", "2000",
+    // "rv32f/I-FCVT-S-WU-01", "2000",
+    // "rv32f/I-FCVT-L-S-01", "2000",
+    // "rv32f/I-FCVT-LU-S-01", "2000",
+    // "rv32f/I-FCVT-W-S-01", "2000",
+    // "rv32f/I-FCVT-WU-S-01", "2000",
+    // "rv32f/I-FDIV-S-01", "2000",
    "rv32f/I-FEQ-S-01", "2000",
    "rv32f/I-FLE-S-01", "2000",
    "rv32f/I-FLT-S-01", "2000",
@ -83,14 +83,14 @@ string tests32f[] = '{
    "rv32f/I-FSGNJ-S-01", "2000",
    "rv32f/I-FSGNJN-S-01", "2000",
    "rv32f/I-FSGNJX-S-01", "2000",
-    "rv32f/I-FSQRT-S-01", "2000",
+    // "rv32f/I-FSQRT-S-01", "2000",
    "rv32f/I-FSW-01", "2000",
-    "rv32f/I-FLW-01", "2000",
+    "rv32f/I-FLW-01", "2110",
    "rv32f/I-FSUB-S-01", "2000"
  };

  string tests64f[] = '{
-    // "rv64f/I-FLW-01", "2110",
+    "rv64f/I-FLW-01", "2110",
    "rv64f/I-FMV-W-X-01", "2000",
    "rv64f/I-FMV-X-W-01", "2000",
    "rv64f/I-FSW-01", "2000",