diff --git a/wally-pipelined/regression/wave-dos/peripheral-waves.do b/wally-pipelined/regression/wave-dos/peripheral-waves.do
index 1304b40c..a42bfbd4 100644
--- a/wally-pipelined/regression/wave-dos/peripheral-waves.do
+++ b/wally-pipelined/regression/wave-dos/peripheral-waves.do
@@ -9,7 +9,8 @@ add wave /testbench/clk
 add wave /testbench/reset
 add wave -divider
 
-add wave /testbench/dut/hart/DataStall
+#add wave /testbench/dut/hart/DataStall
+add wave /testbench/debug
 add wave /testbench/dut/hart/StallF
 add wave /testbench/dut/hart/StallD
 add wave /testbench/dut/hart/StallE
diff --git a/wally-pipelined/src/cache/ICacheCntrl.sv b/wally-pipelined/src/cache/ICacheCntrl.sv
index 748b3f5e..e7098d75 100644
--- a/wally-pipelined/src/cache/ICacheCntrl.sv
+++ b/wally-pipelined/src/cache/ICacheCntrl.sv
@@ -115,8 +115,8 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
   localparam STATE_INVALIDATE = 'h12; // *** not sure if invalidate or evict? invalidate by cache block or address?
   localparam STATE_TLB_MISS = 'h13;
   localparam STATE_TLB_MISS_DONE = 'h14;
-  
-  
+  localparam STATE_INSTR_PAGE_FAULT = 'h15;
+
   
   localparam AHBByteLength = `XLEN / 8;
   localparam AHBOFFETWIDTH = $clog2(AHBByteLength);
@@ -370,13 +370,20 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
         NextState = STATE_READY;
       end
       STATE_TLB_MISS: begin
-        if (ITLBWriteF | WalkerInstrPageFaultF) begin
+        if (WalkerInstrPageFaultF) begin
+          NextState = STATE_INSTR_PAGE_FAULT;
+          ICacheStallF = 1'b0;
+        end else if (ITLBWriteF) begin
           NextState = STATE_TLB_MISS_DONE;
         end else begin
           NextState = STATE_TLB_MISS;
         end
       end
-      STATE_TLB_MISS_DONE : begin
+      STATE_TLB_MISS_DONE: begin
+        NextState = STATE_READY;
+      end
+      STATE_INSTR_PAGE_FAULT: begin
+        ICacheStallF = 1'b0;
         NextState = STATE_READY;
       end
       default: begin
@@ -425,8 +432,8 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
   // store read data from memory interface before writing into SRAM.
   genvar 				i;
   generate
-    for (i = 0; i < WORDSPERLINE; i++) begin
-      flopenr #(`XLEN) flop(.clk(clk),
+    for (i = 0; i < WORDSPERLINE; i++) begin:storebuffer
+      flopenr #(`XLEN) sb(.clk(clk),
 			    .reset(reset), 
 			    .en(InstrAckF & (i == FetchCount)),
 			    .d(InstrInF),
diff --git a/wally-pipelined/src/cache/dmapped.sv b/wally-pipelined/src/cache/dmapped.sv
index f40da412..42669752 100644
--- a/wally-pipelined/src/cache/dmapped.sv
+++ b/wally-pipelined/src/cache/dmapped.sv
@@ -106,7 +106,7 @@ module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par
     assign DataWord = ReadLineTransformed[ReadOffset];
     genvar i;
     generate
-        for (i=0; i < LINESIZE/WORDSIZE; i++) begin
+        for (i=0; i < LINESIZE/WORDSIZE; i++) begin:readline
             assign ReadLineTransformed[i] = ReadLine[(i+1)*WORDSIZE-1:i*WORDSIZE];
         end
     endgenerate
@@ -214,7 +214,7 @@ module wtdirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par
     assign DataWord = ReadLineTransformed[ReadOffset];
     genvar i;
     generate
-        for (i=0; i < LINESIZE/WORDSIZE; i++) begin
+        for (i=0; i < LINESIZE/WORDSIZE; i++) begin:readline
             assign ReadLineTransformed[i] = ReadLine[(i+1)*WORDSIZE-1:i*WORDSIZE];
         end
     endgenerate
diff --git a/wally-pipelined/src/ebu/ahblite.sv b/wally-pipelined/src/ebu/ahblite.sv
index b0c6f033..4bd079e9 100644
--- a/wally-pipelined/src/ebu/ahblite.sv
+++ b/wally-pipelined/src/ebu/ahblite.sv
@@ -216,11 +216,9 @@ module ahblite (
   subwordread swr(.*);
 
   // Handle AMO instructions if applicable
-  generate 
+  generate
     if (`A_SUPPORTED) begin
       logic [`XLEN-1:0] AMOResult;
-//      amoalu amoalu(.a(HRDATA), .b(WriteDataM), .funct(Funct7M), .width(MemSizeM), 
-//                    .result(AMOResult));
       amoalu amoalu(.srca(HRDATAW), .srcb(WriteDataM), .funct(Funct7M), .width(MemSizeM), 
                     .result(AMOResult));
       mux2 #(`XLEN) wdmux(WriteDataM, AMOResult, AtomicMaskedM[1], WriteData);
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index ff29dfd7..59f5e439 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -43,90 +43,93 @@ module fpu (
   output logic [4:0] 	   SetFflagsM,       // FPU flags
   output logic [`XLEN-1:0] FPUResultW);      // FPU result
 // *** change FMA to do 16 - 32 - 64 - 128 FEXPBITS 
-   // control logic signal instantiation
-   logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;              // FP register write enable
-   logic [2:0] 	FrmD, FrmE, FrmM;                                  // FP rounding mode
-   logic 		   FmtD, FmtE, FmtM, FmtW;                                  // FP precision 0-single 1-double
-   logic 		   FDivStartD, FDivStartE;                                  // Start division
-   logic 		   FWriteIntD;                                              // Write to integer register
-   logic [1:0]    ForwardXE, ForwardYE, ForwardZE;                        // Input3 forwarding mux control signal
-   logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
-   logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM;                  // Select which opperation to do in each component
-   logic [1:0]    FResSelD, FResSelE, FResSelM;  
-   logic [1:0]    FIntResSelD, FIntResSelE, FIntResSelM;                                   
-   logic [4:0] 	Adr1E, Adr2E, Adr3E;
-   
-   // regfile signals
-   logic [4:0]    RdE, RdM, RdW;                                           // what adress to write to    // ***Can take from ieu insted of pipelining
-   logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
-   logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
-   logic [`XLEN-1:0]   SrcXMAligned;
-   logic [63:0] 	SrcXE, SrcXM;                         // Input 1 to the various units (after forwarding)
-   logic [63:0] 	SrcYE, SrcYM;                                      // Input 2 to the various units (after forwarding)
-   logic [63:0] 	SrcZE, SrcZM;                                      // Input 3 to the various units (after forwarding)
-   
-   // div/sqrt signals
-   logic [63:0] 	FDivResultM, FDivResultW;
-   logic [4:0]    FDivSqrtFlgM, FDivSqrtFlgW;
-   logic          FDivSqrtDoneE;
-   logic [63:0] 	DivInput1E, DivInput2E;
-   logic          HoldInputs;                                              // keep forwarded inputs arround durring division
-   
-   // FMA signals
-	logic [105:0]	ProdManE, ProdManM; ///*** put pipline stages in units
-	logic [161:0]	AlignedAddendE, AlignedAddendM;                       
-	logic [12:0]	ProdExpE, ProdExpM;
-	logic 			AddendStickyE, AddendStickyM;
-	logic 			KillProdE, KillProdM;
-	logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
-	logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
-	logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
-   logic [63:0]   FMAResM, FMAResW;
-   logic [4:0]    FMAFlgM, FMAFlgW;
 
-   // add/cvt signals
-   logic [63:0] 	AddSumE, AddSumM;
-   logic [63:0]   AddSumTcE, AddSumTcM;
-   logic [3:0] 	AddSelInvE, AddSelInvM;
-   logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
-   logic 		   AddCorrSignE, AddCorrSignM;
-   logic          AddOp1NormE, AddOp1NormM;
-   logic          AddOp2NormE, AddOp2NormM;
-   logic          AddOpANormE,  AddOpANormM;
-   logic          AddOpBNormE, AddOpBNormM;
-   logic          AddInvalidE, AddInvalidM;
-   logic 		   AddDenormInE, AddDenormInM;
-   logic          AddSwapE, AddSwapM;
-   logic          AddNormOvflowE, AddNormOvflowM; //***this isn't used in addcvt2
-   logic          AddSignAE, AddSignAM;
-   logic 		   AddConvertE, AddConvertM;
-   logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
-   logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
-   logic [10:0] 	AddExponentE, AddExponentM;
-   logic [63:0] 	FAddResM, FAddResW;
-   logic [4:0] 	FAddFlgM, FAddFlgW;  
-   
-   // cmp signals 
-   logic 		   CmpNVE, CmpNVM, CmpNVW;
-   logic [63:0] 	CmpResE, CmpResM, CmpResW;
-   
-   // fsgn signals
-   logic [63:0] 	SgnResE, SgnResM;
-   logic        	SgnNVE, SgnNVM, SgnNVW;
-   logic [63:0]   FResM, FResW;
-   logic          FFlgM, FFlgW;
-   
-   // instantiation of W stage regfile signals
-   logic [63:0] 	AlignedSrcAM;
-   
-   // classify signals
-   logic [63:0] 	ClassResE, ClassResM;
-   
-   // 64-bit FPU result   
-   logic [63:0] 	FPUResult64W;                                           
-   logic [4:0] 	FPUFlagsW;
-   
-   
+  generate
+     if (`F_SUPPORTED) begin 
+      // control logic signal instantiation
+      logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;              // FP register write enable
+      logic [2:0] 	FrmD, FrmE, FrmM;                                  // FP rounding mode
+      logic 		   FmtD, FmtE, FmtM, FmtW;                                  // FP precision 0-single 1-double
+      logic 		   FDivStartD, FDivStartE;                                  // Start division
+      logic 		   FWriteIntD;                                              // Write to integer register
+      logic [1:0]    ForwardXE, ForwardYE, ForwardZE;                        // Input3 forwarding mux control signal
+      logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
+      logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM;                  // Select which operation to do in each component
+      logic [1:0]    FResSelD, FResSelE, FResSelM;  
+      logic [1:0]    FIntResSelD, FIntResSelE, FIntResSelM;                                   
+      logic [4:0] 	Adr1E, Adr2E, Adr3E;
+      
+      // regfile signals
+      logic [4:0]    RdE, RdM, RdW;                                           // what address to write to    // ***Can take from ieu instead of pipelining
+      logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
+      logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
+      logic [`XLEN-1:0]   SrcXMAligned;
+      logic [63:0] 	SrcXE, SrcXM;                         // Input 1 to the various units (after forwarding)
+      logic [63:0] 	SrcYE, SrcYM;                                      // Input 2 to the various units (after forwarding)
+      logic [63:0] 	SrcZE, SrcZM;                                      // Input 3 to the various units (after forwarding)
+      
+      // div/sqrt signals
+      logic [63:0] 	FDivResultM, FDivResultW;
+      logic [4:0]    FDivSqrtFlgM, FDivSqrtFlgW;
+      logic          FDivSqrtDoneE;
+      logic [63:0] 	DivInput1E, DivInput2E;
+      logic          HoldInputs;                                              // keep forwarded inputs around during division
+      
+      // FMA signals
+      logic [105:0]	ProdManE, ProdManM; ///*** put pipeline stages in units
+      logic [161:0]	AlignedAddendE, AlignedAddendM;                       
+      logic [12:0]	ProdExpE, ProdExpM;
+      logic 			AddendStickyE, AddendStickyM;
+      logic 			KillProdE, KillProdM;
+      logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
+      logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
+      logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
+      logic [63:0]   FMAResM, FMAResW;
+      logic [4:0]    FMAFlgM, FMAFlgW;
+
+      // add/cvt signals
+      logic [63:0] 	AddSumE, AddSumM;
+      logic [63:0]   AddSumTcE, AddSumTcM;
+      logic [3:0] 	AddSelInvE, AddSelInvM;
+      logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
+      logic 		   AddCorrSignE, AddCorrSignM;
+      logic          AddOp1NormE, AddOp1NormM;
+      logic          AddOp2NormE, AddOp2NormM;
+      logic          AddOpANormE,  AddOpANormM;
+      logic          AddOpBNormE, AddOpBNormM;
+      logic          AddInvalidE, AddInvalidM;
+      logic 		   AddDenormInE, AddDenormInM;
+      logic          AddSwapE, AddSwapM;
+      logic          AddNormOvflowE, AddNormOvflowM; //***this isn't used in addcvt2
+      logic          AddSignAE, AddSignAM;
+      logic 		   AddConvertE, AddConvertM;
+      logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
+      logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
+      logic [10:0] 	AddExponentE, AddExponentM;
+      logic [63:0] 	FAddResM, FAddResW;
+      logic [4:0] 	FAddFlgM, FAddFlgW;  
+      
+      // cmp signals 
+      logic 		   CmpNVE, CmpNVM, CmpNVW;
+      logic [63:0] 	CmpResE, CmpResM, CmpResW;
+      
+      // fsgn signals
+      logic [63:0] 	SgnResE, SgnResM;
+      logic        	SgnNVE, SgnNVM, SgnNVW;
+      logic [63:0]   FResM, FResW;
+      logic          FFlgM, FFlgW;
+      
+      // instantiation of W stage regfile signals
+      logic [63:0] 	AlignedSrcAM;
+      
+      // classify signals
+      logic [63:0] 	ClassResE, ClassResM;
+      
+      // 64-bit FPU result   
+      logic [63:0] 	FPUResult64W;                                           
+      logic [4:0] 	FPUFlagsW;
+      
+      
 
 
 
@@ -134,189 +137,19 @@ module fpu (
 
 
 
-   //DECODE STAGE
-   
-   
-   // top-level controller for FPU
-   fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
-               .FRM_REGW, .IllegalFPUInstrD, .FWriteEnD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
-               .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
-   
-   // regfile instantiation
-   fregfile fregfile (clk, reset, FWriteEnW,
-			InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
-			FPUResult64W,
-			FRD1D, FRD2D, FRD3D);	
-   
-
-
-
-
-
-
-
-
-   //*****************
-   // D/E pipe registers
-   //*****************
-   flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
-   flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
-   flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-   flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
-   flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-                                                         {Adr1E,         Adr2E,         Adr3E});
-   flopenrc #(22) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-                        {FWriteEnD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD},
-                        {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE});
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-   //EXECUTION STAGE
-   
-   // Hazard unit for FPU
-   fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FWriteEnM, .FWriteEnW, .RdM, .RdW, .FResultSelM, .FStallD, 
-                     .ForwardXE, .ForwardYE, .ForwardZE);
-
-   // forwarding muxs
-   mux3  #(64)  fxemux(FRD1E, FPUResult64W, FResM, ForwardXE, SrcXE);
-   mux3  #(64)  fyemux(FRD2E, FPUResult64W, FResM, ForwardYE, SrcYE);
-   mux3  #(64)  fzemux(FRD3E, FPUResult64W, FResM, ForwardZE, SrcZE);
-
-   
-   // first of two-stage instance of floating-point fused multiply-add unit
-   fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .ProdManE, .AlignedAddendE,
-               .ProdExpE, .AddendStickyE, .KillProdE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
-               .XNaNE, .YNaNE, .ZNaNE );
-   
-   // first and only instance of floating-point divider
-   logic fpdivClk;
-   
-   clockgater fpdivclkg(.E(FDivStartE),
-			.SE(1'b0),
-			.CLK(clk),
-			.ECLK(fpdivClk));
-   
-   // capture the inputs for div/sqrt	 
-   flopenrc #(64) reg_input1 (.d(SrcXE), .q(DivInput1E),
-               .en(~HoldInputs), .clear(FDivSqrtDoneE),
-               .reset(reset),  .clk(clk));
-   flopenrc #(64) reg_input2 (.d(SrcYE), .q(DivInput2E),
-               .en(~HoldInputs), .clear(FDivSqrtDoneE),
-               .reset(reset),  .clk(clk));
-
-   fdivsqrt fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
-                     .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
-                     .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
-   
-
-
-   // first of two-stage instance of floating-point add/cvt unit
-   fpuaddcvt1 fpadd1 (.SrcXE, .SrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
-                     .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
-                     .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
-                     .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
-   
-   // first and only instance of floating-point comparator
-   fcmp fcmp (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpNVE, CmpResE);
-   
-   // first and only instance of floating-point sign converter
-   fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .SrcXE, .SrcYE, .SgnResE, .SgnNVE);
-   
-   // first and only instance of floating-point classify unit
-   fclassify fclassify (.SrcXE, .FmtE, .ClassResE);
-
-   // output for store instructions
-   assign FWriteDataE = FmtE ? SrcYE[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcYE[63:32]};
-   //***swap to mux
-
-
-
-
-
-
-
-
-
-
-   //*****************
-   // E/M pipe registers
-   //*****************
-   flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, SrcXE, SrcXM);
-   flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, SrcYE, SrcYM);
-   flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, SrcZE, SrcZM);
-   
-   flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
-   flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
-   flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
-   flopenrc #(11) EMRegFma4(clk, reset, FlushM, ~StallM, 
-                              {AddendStickyE, KillProdE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE},
-                              {AddendStickyM, KillProdM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM});
-
-   flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
-   flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
-   flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
-   flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
-   flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
-   flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
-   flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
-   flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
-   flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, 
-                           {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
-                           {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
-
-   flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
-   flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
-   
-   flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
-   flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
-   
-   flopenrc #(22) EMCtrlReg(clk, reset, FlushM, ~StallM,
-                        {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE, FOpCtrlE, FWriteIntE},
-                        {FWriteEnM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});
-
-   flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
-   
-
-
-
-
-
-
-
-   //BEGIN MEMORY STAGE
-   
-   mux3  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, FResSelM, FResM);
-   mux3  #(1)  FFlgMux(1'b0, SgnNVM, CmpNVM, FResSelM, FFlgM);
-
-   //***change to mux
-   assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
-   mux3  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXMAligned, ClassResM[`XLEN-1:0], FIntResSelM, FIntResM);
-
-   // second instance of two-stage FMA unit
-   fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .FrmM, .FmtM, 
-            .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
-            .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
-            .FMAResM, .FMAFlgM);
-   
-   // second instance of two-stage floating-point add/cvt unit
-   fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
-                     .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
-                     .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
-                     .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
-   
-   // Align SrcA to MSB when single precicion
-   mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
+      //DECODE STAGE
+      
+      
+      // top-level controller for FPU
+      fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
+                  .FRM_REGW, .IllegalFPUInstrD, .FWriteEnD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+                  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
+      
+      // regfile instantiation
+      fregfile fregfile (clk, reset, FWriteEnW,
+            InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
+            FPUResult64W,
+            FRD1D, FRD2D, FRD3D);	
       
 
 
@@ -326,77 +159,260 @@ module fpu (
 
 
 
+      //*****************
+      // D/E pipe registers
+      //*****************
+      flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
+      flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
+      flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
+      flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
+      flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+                                                            {Adr1E,         Adr2E,         Adr3E});
+      flopenrc #(22) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+                           {FWriteEnD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD},
+                           {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE});
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+      //EXECUTION STAGE
+      
+      // Hazard unit for FPU
+      fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FWriteEnM, .FWriteEnW, .RdM, .RdW, .FResultSelM, .FStallD, 
+                        .ForwardXE, .ForwardYE, .ForwardZE);
+
+      // forwarding muxs
+      mux3  #(64)  fxemux(FRD1E, FPUResult64W, FResM, ForwardXE, SrcXE);
+      mux3  #(64)  fyemux(FRD2E, FPUResult64W, FResM, ForwardYE, SrcYE);
+      mux3  #(64)  fzemux(FRD3E, FPUResult64W, FResM, ForwardZE, SrcZE);
+
+      
+      // first of two-stage instance of floating-point fused multiply-add unit
+      fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .ProdManE, .AlignedAddendE,
+                  .ProdExpE, .AddendStickyE, .KillProdE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
+                  .XNaNE, .YNaNE, .ZNaNE );
+      
+      // first and only instance of floating-point divider
+      logic fpdivClk;
+      
+      clockgater fpdivclkg(.E(FDivStartE),
+            .SE(1'b0),
+            .CLK(clk),
+            .ECLK(fpdivClk));
+      
+      // capture the inputs for div/sqrt	 
+      flopenrc #(64) reg_input1 (.d(SrcXE), .q(DivInput1E),
+                  .en(~HoldInputs), .clear(FDivSqrtDoneE),
+                  .reset(reset),  .clk(clk));
+      flopenrc #(64) reg_input2 (.d(SrcYE), .q(DivInput2E),
+                  .en(~HoldInputs), .clear(FDivSqrtDoneE),
+                  .reset(reset),  .clk(clk));
+
+      fdivsqrt fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
+                        .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
+                        .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
+      
+
+
+      // first of two-stage instance of floating-point add/cvt unit
+      fpuaddcvt1 fpadd1 (.SrcXE, .SrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
+                        .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
+                        .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
+                        .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
+      
+      // first and only instance of floating-point comparator
+      fcmp fcmp (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpNVE, CmpResE);
+      
+      // first and only instance of floating-point sign converter
+      fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .SrcXE, .SrcYE, .SgnResE, .SgnNVE);
+      
+      // first and only instance of floating-point classify unit
+      fclassify fclassify (.SrcXE, .FmtE, .ClassResE);
+
+      // output for store instructions
+      assign FWriteDataE = FmtE ? SrcYE[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcYE[63:32]};
+      //***swap to mux
+
+
+
+
+
+
+
+
+
+
+      //*****************
+      // E/M pipe registers
+      //*****************
+      flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, SrcXE, SrcXM);
+      flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, SrcYE, SrcYM);
+      flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, SrcZE, SrcZM);
+      
+      flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
+      flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
+      flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
+      flopenrc #(11) EMRegFma4(clk, reset, FlushM, ~StallM, 
+                                 {AddendStickyE, KillProdE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE},
+                                 {AddendStickyM, KillProdM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM});
+
+      flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
+      flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
+      flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
+      flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
+      flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
+      flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
+      flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
+      flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
+      flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, 
+                              {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
+                              {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
+
+      flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
+      flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
+      
+      flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
+      flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
+      
+      flopenrc #(22) EMCtrlReg(clk, reset, FlushM, ~StallM,
+                           {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE, FOpCtrlE, FWriteIntE},
+                           {FWriteEnM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});
+
+      flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
+      
+
+
+
+
+
+
+
+      //BEGIN MEMORY STAGE
+      
+      mux3  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, FResSelM, FResM);
+      mux3  #(1)  FFlgMux(1'b0, SgnNVM, CmpNVM, FResSelM, FFlgM);
+
+      //***change to mux
+      assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
+      mux3  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXMAligned, ClassResM[`XLEN-1:0], FIntResSelM, FIntResM);
+
+      // second instance of two-stage FMA unit
+      fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .FrmM, .FmtM, 
+               .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
+               .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
+               .FMAResM, .FMAFlgM);
+      
+      // second instance of two-stage floating-point add/cvt unit
+      fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
+                        .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
+                        .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
+                        .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
+      
+      // Align SrcA to MSB when single precision
+      mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
          
-   //*****************
-   // M/W pipe registers
-   //*****************
-   flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-   flopenrc #(5) MWRegFma2(clk, reset, FlushW, ~StallW, FMAFlgM, FMAFlgW); 
-   
-   flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
-   flopenrc #(5) MWRegDiv2(clk, reset, FlushW, ~StallW, FDivSqrtFlgM, FDivSqrtFlgW);
-   
-   flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
-   flopenrc #(5) MWRegAdd2(clk, reset, FlushW, ~StallW, FAddFlgM, FAddFlgW); 
-   
-   flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpNVM, CmpNVW); 
-   flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);
-
-   flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
-   flopenrc #(1) MWRegClass1(clk, reset, FlushW, ~StallW, FFlgM, FFlgW);
-   
-   flopenrc #(11) MWCtrlReg(clk, reset, FlushW, ~StallW,
-                        {FWriteEnM, FResultSelM, RdM, FmtM, FWriteIntM},
-                        {FWriteEnW, FResultSelW, RdW, FmtW, FWriteIntW});
-   
-   
-
-
-
-
-  //#########################################
-  // BEGIN WRITEBACK STAGE
-  //#########################################
 
 
 
 
 
-//***turn into muxs
-   always_comb begin
-      case (FResultSelW)
-	3'b000 : FPUFlagsW = 5'b0;
-	3'b001 : FPUFlagsW = FMAFlgW;
-	3'b010 : FPUFlagsW = FAddFlgW;
-	3'b011 : FPUFlagsW = FDivSqrtFlgW;
-	3'b100 : FPUFlagsW = {4'b0,FFlgW};
-	default : FPUFlagsW = 5'bxxxxx;
-      endcase
-   end
-
-   always_comb begin
-      case (FResultSelW)
-	3'b000 : FPUResult64W = FmtW ? {ReadDataW, {64-`XLEN{1'b0}}} : {ReadDataW[31:0], 32'b0};
-	3'b001 : FPUResult64W = FMAResW;
-	3'b010 : FPUResult64W = FAddResW;
-	3'b011 : FPUResult64W = FDivResultW;
-	3'b100 : FPUResult64W = FResW;
-	default : FPUResult64W = 64'bxxxxx;
-      endcase
-   end
-   
-   
-   // interface between XLEN size datapath and double-precision sized
-   // floating-point results
-   //
-   // define offsets for LSB zero extension or truncation
-   always_comb begin      
-      // zero extension 
-//***turn into mux
-      FPUResultW = FmtW ? FPUResult64W[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FPUResult64W[63:32]};
-      //*** put into mem stage
-      SetFflagsM = FPUFlagsW;      
+
+
+
+            
+      //*****************
+      // M/W pipe registers
+      //*****************
+      flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
+      flopenrc #(5) MWRegFma2(clk, reset, FlushW, ~StallW, FMAFlgM, FMAFlgW); 
+      
+      flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
+      flopenrc #(5) MWRegDiv2(clk, reset, FlushW, ~StallW, FDivSqrtFlgM, FDivSqrtFlgW);
+      
+      flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
+      flopenrc #(5) MWRegAdd2(clk, reset, FlushW, ~StallW, FAddFlgM, FAddFlgW); 
+      
+      flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpNVM, CmpNVW); 
+      flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);
+
+      flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
+      flopenrc #(1) MWRegClass1(clk, reset, FlushW, ~StallW, FFlgM, FFlgW);
+      
+      flopenrc #(11) MWCtrlReg(clk, reset, FlushW, ~StallW,
+                           {FWriteEnM, FResultSelM, RdM, FmtM, FWriteIntM},
+                           {FWriteEnW, FResultSelW, RdW, FmtW, FWriteIntW});
+      
+      
+
+
+
+
+   //#########################################
+   // BEGIN WRITEBACK STAGE
+   //#########################################
+
+
+
+
+
+   //***turn into muxs
+      always_comb begin
+         case (FResultSelW)
+      3'b000 : FPUFlagsW = 5'b0;
+      3'b001 : FPUFlagsW = FMAFlgW;
+      3'b010 : FPUFlagsW = FAddFlgW;
+      3'b011 : FPUFlagsW = FDivSqrtFlgW;
+      3'b100 : FPUFlagsW = {4'b0,FFlgW};
+      default : FPUFlagsW = 5'bxxxxx;
+         endcase
+      end
+
+      always_comb begin
+         case (FResultSelW)
+      3'b000 : FPUResult64W = FmtW ? {ReadDataW, {64-`XLEN{1'b0}}} : {ReadDataW[31:0], 32'b0};
+      3'b001 : FPUResult64W = FMAResW;
+      3'b010 : FPUResult64W = FAddResW;
+      3'b011 : FPUResult64W = FDivResultW;
+      3'b100 : FPUResult64W = FResW;
+      default : FPUResult64W = 64'bxxxxx;
+         endcase
+      end
+      
+      
+      // interface between XLEN size datapath and double-precision sized
+      // floating-point results
+      //
+      // define offsets for LSB zero extension or truncation
+      always_comb begin      
+         // zero extension 
+   //***turn into mux
+         FPUResultW = FmtW ? FPUResult64W[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FPUResult64W[63:32]};
+         //*** put into mem stage
+         SetFflagsM = FPUFlagsW;      
+      end
+   end else begin // no F_SUPPORTED; tie outputs low
+     assign FStallD = 0;
+     assign FWriteIntE = 0; 
+     assign FWriteIntM = 0;
+     assign FWriteIntW = 0;
+     assign FWriteDataE = 0;
+     assign FIntResM = 0;
+     assign FDivBusyE = 0;
+     assign IllegalFPUInstrD = 1;
+     assign SetFflagsM = 0;
+     assign FPUResultW = 0;
    end
+  endgenerate 
   
 endmodule // fpu
 
diff --git a/wally-pipelined/src/generic/shift.sv b/wally-pipelined/src/generic/shift.sv
index 88152588..70e1076d 100755
--- a/wally-pipelined/src/generic/shift.sv
+++ b/wally-pipelined/src/generic/shift.sv
@@ -38,13 +38,12 @@ module shift_right #(parameter WIDTH=8)
 
    assign stage[0] = A;   
    generate
-      for (i=0;i<$clog2(WIDTH);i=i+1)
-	begin : genbit
-	   mux2 #(WIDTH) mux_inst (stage[i], 
+      for (i=0;i<$clog2(WIDTH);i=i+1) begin : genbit
+	      mux2 #(WIDTH) mux_inst (stage[i], 
 				   {{(WIDTH/(2**(i+1))){1'b0}}, stage[i][WIDTH-1:WIDTH/(2**(i+1))]}, 
 				   Shift[$clog2(WIDTH)-i-1], 
 				   stage[i+1]);
-	end
+	   end
    endgenerate
    assign Z = stage[$clog2(WIDTH)];   
 
@@ -60,13 +59,12 @@ module shift_left #(parameter WIDTH=8)
    
    assign stage[0] = A;   
    generate
-      for (i=0;i<$clog2(WIDTH);i=i+1)
-	begin : genbit
-	   mux2 #(WIDTH) mux_inst (stage[i], 
+      for (i=0;i<$clog2(WIDTH);i=i+1) begin : genbit
+	     mux2 #(WIDTH) mux_inst (stage[i], 
 				   {stage[i][WIDTH-1-WIDTH/(2**(i+1)):0], {(WIDTH/(2**(i+1))){1'b0}}}, 
 				   Shift[$clog2(WIDTH)-i-1], 
 				   stage[i+1]);
-	end
+	   end
    endgenerate
    assign Z = stage[$clog2(WIDTH)];   
 
diff --git a/wally-pipelined/src/ieu/alu.sv b/wally-pipelined/src/ieu/alu.sv
index 102fbbed..ac2c06dd 100644
--- a/wally-pipelined/src/ieu/alu.sv
+++ b/wally-pipelined/src/ieu/alu.sv
@@ -42,7 +42,7 @@ module alu #(parameter WIDTH=32) (
   assign {carry, presum} = a + condinvb + {{(WIDTH-1){1'b0}},alucontrol[3]};
   
   // support W-type RV64I ADDW/SUBW/ADDIW that sign-extend 32-bit result to 64 bits
-  generate 
+  generate
     if (WIDTH==64)
       assign sum = w64 ? {{32{presum[31]}}, presum[31:0]} : presum;
     else
diff --git a/wally-pipelined/src/ieu/datapath.sv b/wally-pipelined/src/ieu/datapath.sv
index 44a40045..f041fce6 100644
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@@ -129,7 +129,7 @@ module datapath (
   flopenrc #(5)    RdWEg(clk, reset, FlushW, ~StallW, RdM, RdW);
 
   // handle Store Conditional result if atomic extension supported
-  generate 
+  generate
     if (`A_SUPPORTED)
       assign SCResultW = SquashSCW ? {{(`XLEN-1){1'b0}}, 1'b1} : {{(`XLEN-1){1'b0}}, 1'b0};
     else 
diff --git a/wally-pipelined/src/ifu/SRAM2P1R1W.sv b/wally-pipelined/src/ifu/SRAM2P1R1W.sv
index d71f8bc4..046aacc6 100644
--- a/wally-pipelined/src/ifu/SRAM2P1R1W.sv
+++ b/wally-pipelined/src/ifu/SRAM2P1R1W.sv
@@ -97,11 +97,11 @@ module SRAM2P1R1W
   
   // write port
   generate
-    for (index = 0; index < Width; index = index + 1) begin    
+    for (index = 0; index < Width; index = index + 1) begin:mem
       always_ff @ (posedge clk) begin
-	if (WEN1Q & BitWEN1[index]) begin
-	  memory[WA1Q][index] <= WD1Q[index];
-	end
+	      if (WEN1Q & BitWEN1[index]) begin
+	        memory[WA1Q][index] <= WD1Q[index];
+	      end
       end
     end
   endgenerate
diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv
index 4fcefe85..24952edf 100644
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@@ -188,7 +188,7 @@ module ifu (
   flopenl #(`XLEN) pcreg(clk, reset, ~StallF & ~ICacheStallF, PCNextF, `RESET_VECTOR, PCF);
 
   // branch and jump predictor
-  generate 
+  generate
     if (`BPRED_ENABLED == 1) begin : bpred
       // I am making the port connection explicit for now as I want to see them and they will be changing.
       bpred bpred(.*,
diff --git a/wally-pipelined/src/ifu/localHistoryPredictor.sv b/wally-pipelined/src/ifu/localHistoryPredictor.sv
index 8aaa85c0..6c5c9478 100644
--- a/wally-pipelined/src/ifu/localHistoryPredictor.sv
+++ b/wally-pipelined/src/ifu/localHistoryPredictor.sv
@@ -67,7 +67,7 @@ module localHistoryPredictor
 
   genvar 		   index;
   generate
-    for (index = 0; index < 2**m; index = index +1) begin
+    for (index = 0; index < 2**m; index = index +1) begin:localhist
       
       flopenr #(k) LocalHistoryRegister(.clk(clk),
 					.reset(reset),
diff --git a/wally-pipelined/src/lsu/dcache.sv b/wally-pipelined/src/lsu/dcache.sv
index fec70ef4..e8dfeb5c 100644
--- a/wally-pipelined/src/lsu/dcache.sv
+++ b/wally-pipelined/src/lsu/dcache.sv
@@ -151,7 +151,7 @@ module dcachecontroller #(parameter LINESIZE = 256) (
 
     genvar i;
     generate
-        for (i=0; i < WORDSPERLINE; i++) begin
+        for (i=0; i < WORDSPERLINE; i++) begin:sb
             flopenr #(`XLEN) flop(clk, reset, FetchState & (i == FetchWordNum), ReadDataW, DCacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]);
         end
     endgenerate
diff --git a/wally-pipelined/src/lsu/lsu.sv b/wally-pipelined/src/lsu/lsu.sv
index 8d4df6ec..a2bcf52b 100644
--- a/wally-pipelined/src/lsu/lsu.sv
+++ b/wally-pipelined/src/lsu/lsu.sv
@@ -64,7 +64,7 @@ module lsu (
   output logic [1:0] 	      AtomicMaskedM,
   input logic 		      MemAckW, // from ahb
   input logic [`XLEN-1:0]     HRDATAW, // from ahb
-  output logic [2:0] 	      Funct3MfromLSU,
+  output logic [2:0] 	      SizeFromLSU,
   output logic 		      StallWfromLSU,
 
 
@@ -132,7 +132,7 @@ module lsu (
   logic 	    MMUTranslate;
   logic 	    HPTWRead;
   logic [1:0] 	    MemRWMtoLSU;
-  logic [2:0] 	    Funct3MtoLSU;
+  logic [2:0] 	    SizeToLSU;
   logic [1:0] 	    AtomicMtoLSU;
   logic [`XLEN-1:0] MemAdrMtoLSU;
   logic [`XLEN-1:0] WriteDataMtoLSU;
@@ -204,7 +204,7 @@ module lsu (
 		 // LSU
 		 .DisableTranslation(DisableTranslation),
 		 .MemRWMtoLSU(MemRWMtoLSU),
-		 .Funct3MtoLSU(Funct3MtoLSU),
+		 .SizeToLSU(SizeToLSU),
 		 .AtomicMtoLSU(AtomicMtoLSU),
 		 .MemAdrMtoLSU(MemAdrMtoLSU),          
 		 .WriteDataMtoLSU(WriteDataMtoLSU),   // *** ??????????????
@@ -220,7 +220,7 @@ module lsu (
   mmu #(.TLB_ENTRIES(`DTLB_ENTRIES), .IMMU(0))
   dmmu(.TLBAccessType(MemRWMtoLSU),
        .VirtualAddress(MemAdrMtoLSU),
-       .Size(Funct3MtoLSU[1:0]),
+       .Size(SizeToLSU[1:0]),
        .PTEWriteVal(PageTableEntryM),
        .PageTypeWriteVal(PageTypeM),
        .TLBWrite(DTLBWriteM),
@@ -244,7 +244,7 @@ module lsu (
 
   // Determine if an Unaligned access is taking place
   always_comb
-    case(Funct3MtoLSU[1:0]) 
+    case(SizeToLSU[1:0]) 
       2'b00:  DataMisalignedMfromLSU = 0;                       // lb, sb, lbu
       2'b01:  DataMisalignedMfromLSU = MemAdrMtoLSU[0];              // lh, sh, lhu
       2'b10:  DataMisalignedMfromLSU = MemAdrMtoLSU[1] | MemAdrMtoLSU[0]; // lw, sw, flw, fsw, lwu
@@ -400,7 +400,7 @@ module lsu (
   end // always_comb
 
   // *** for now just pass through size
-  assign Funct3MfromLSU = Funct3MtoLSU;
+  assign SizeFromLSU = SizeToLSU;
   assign StallWfromLSU = StallWtoLSU;
   
 
diff --git a/wally-pipelined/src/lsu/lsuArb.sv b/wally-pipelined/src/lsu/lsuArb.sv
index 3f57cabb..23e88970 100644
--- a/wally-pipelined/src/lsu/lsuArb.sv
+++ b/wally-pipelined/src/lsu/lsuArb.sv
@@ -54,7 +54,7 @@ module lsuArb
    // to LSU   
    output logic 	    DisableTranslation, 
    output logic [1:0] 	    MemRWMtoLSU,
-   output logic [2:0] 	    Funct3MtoLSU,
+   output logic [2:0] 	    SizeToLSU,
    output logic [1:0] 	    AtomicMtoLSU,
    output logic [`XLEN-1:0] MemAdrMtoLSU,
    output logic [`XLEN-1:0] WriteDataMtoLSU,
@@ -87,6 +87,7 @@ module lsuArb
   statetype CurrState, NextState;
   logic 		    SelPTW;
   logic 		    HPTWStallD;
+  logic [2:0] PTWSize;
   
 
   flopenl #(.TYPE(statetype)) StateReg(.clk(clk),
@@ -138,12 +139,9 @@ module lsuArb
   assign MemRWMtoLSU = SelPTW ? {HPTWRead, 1'b0} : MemRWM;
   
   generate
-    if (`XLEN == 32) begin
-      assign Funct3MtoLSU = SelPTW ? 3'b010 : Funct3M;
-    end else begin
-      assign Funct3MtoLSU = SelPTW ? 3'b011 : Funct3M;
-    end
+    assign PTWSize = (`XLEN==32 ? 3'b010 : 3'b011); // 32 or 64-bit access from hptw
   endgenerate
+  mux2 #(3) sizemux(Funct3M, PTWSize, SelPTW, SizeToLSU);
 
   assign AtomicMtoLSU = SelPTW ? 2'b00 : AtomicM;
   assign MemAdrMtoLSU = SelPTW ? HPTWPAdr : MemAdrM;
diff --git a/wally-pipelined/src/mmu/pmpadrdec.sv b/wally-pipelined/src/mmu/pmpadrdec.sv
index 50d399ae..0a14d832 100644
--- a/wally-pipelined/src/mmu/pmpadrdec.sv
+++ b/wally-pipelined/src/mmu/pmpadrdec.sv
@@ -76,8 +76,9 @@ module pmpadrdec (
   generate
     assign Mask[1:0] = 2'b11;
     assign Mask[2] = (AdrMode == NAPOT); // mask has 0s in upper bis for NA4 region
-    for (i=3; i < `PA_BITS; i=i+1) 
+    for (i=3; i < `PA_BITS; i=i+1) begin:mask
       assign Mask[i] = Mask[i-1] & PMPAdr[i-3]; // NAPOT mask: 1's indicate bits to ignore
+    end
    endgenerate
   // verilator lint_on UNOPTFLAT
 
diff --git a/wally-pipelined/src/mmu/pmpchecker.sv b/wally-pipelined/src/mmu/pmpchecker.sv
index ee4b261d..9c7f11da 100644
--- a/wally-pipelined/src/mmu/pmpchecker.sv
+++ b/wally-pipelined/src/mmu/pmpchecker.sv
@@ -63,12 +63,6 @@ module pmpchecker (
   // verilator lint_on UNOPTFLAT
   logic [`PMP_ENTRIES-1:0]   PAgePMPAdr;  // for TOR PMP matching, PhysicalAddress > PMPAdr[i]
   genvar i,j;
- /*
-  generate // extract 8-bit chunks from PMPCFG array
-    for (j=0; j<`PMP_ENTRIES; j = j+8)
-      assign {PMPCfg[j+7], PMPCfg[j+6], PMPCfg[j+5], PMPCfg[j+4],
-              PMPCfg[j+3], PMPCfg[j+2], PMPCfg[j+1], PMPCfg[j]} = PMPCFG_ARRAY_REGW[j/8];
-  endgenerate */
 
   pmpadrdec pmpadrdecs[`PMP_ENTRIES-1:0](
     .PhysicalAddress, 
@@ -80,7 +74,6 @@ module pmpchecker (
     .NoLowerMatchOut(NoLowerMatch),
     .Match, .Active, .L, .X, .W, .R);
 
-
   // Only enforce PMP checking for S and U modes when at least one PMP is active or in Machine mode when L bit is set in selected region
   assign EnforcePMP = (PrivilegeModeW == `M_MODE) ? |L : |Active; 
 
diff --git a/wally-pipelined/src/mmu/tlb.sv b/wally-pipelined/src/mmu/tlb.sv
index 75021265..34400647 100644
--- a/wally-pipelined/src/mmu/tlb.sv
+++ b/wally-pipelined/src/mmu/tlb.sv
@@ -111,6 +111,7 @@ module tlb #(parameter TLB_ENTRIES = 8,
   logic [1:0]            HitPageType;
   logic                  CAMHit;
   logic [`ASID_BITS-1:0] ASID;
+  logic                  DAFault;
 
   // Grab the sv mode from SATP and determine whether translation should occur
   assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS];
@@ -165,7 +166,9 @@ module tlb #(parameter TLB_ENTRIES = 8,
       // only execute non-user mode pages.
       assign ImproperPrivilege = ((EffectivePrivilegeMode == `U_MODE) && ~PTE_U) ||
         ((EffectivePrivilegeMode == `S_MODE) && PTE_U);
-      assign TLBPageFault = Translate && TLBHit && (ImproperPrivilege || ~PTE_X);
+      // fault for software handling if access bit is off
+      assign DAFault = ~PTE_A;
+      assign TLBPageFault = Translate && TLBHit && (ImproperPrivilege || ~PTE_X || DAFault);
     end else begin
       logic ImproperPrivilege, InvalidRead, InvalidWrite;
 
@@ -180,7 +183,9 @@ module tlb #(parameter TLB_ENTRIES = 8,
       // Check for write error. Writes are invalid when the page's write bit is
       // low.
       assign InvalidWrite = WriteAccess && ~PTE_W;
-      assign TLBPageFault = Translate && TLBHit && (ImproperPrivilege || InvalidRead || InvalidWrite);
+      // Fault for software handling if access bit is off or writing a page with dirty bit off
+      assign DAFault = ~PTE_A | WriteAccess & ~PTE_D; 
+      assign TLBPageFault = Translate && TLBHit && (ImproperPrivilege || InvalidRead || InvalidWrite || DAFault);
     end
   endgenerate
 
diff --git a/wally-pipelined/src/mmu/tlbpriority.sv b/wally-pipelined/src/mmu/tlbpriority.sv
index a061f622..5096cae6 100644
--- a/wally-pipelined/src/mmu/tlbpriority.sv
+++ b/wally-pipelined/src/mmu/tlbpriority.sv
@@ -41,8 +41,9 @@ module tlbpriority #(parameter ENTRIES = 8) (
   genvar i;
   generate
     assign nolower[0] = 1;
-    for (i=1; i<ENTRIES; i++) 
+    for (i=1; i<ENTRIES; i++) begin:therm
       assign nolower[i] = nolower[i-1] & ~a[i-1];
+    end
   endgenerate
   // verilator lint_on UNOPTFLAT
   assign y = a & nolower;
diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index 70767dcc..30ea394f 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -299,10 +299,9 @@ module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c,
    logic [WIDTH:0] 					  carry_temp;   
    genvar 						  i;
    generate
-      for (i=0;i<WIDTH;i=i+1)
-	begin : genbit
-	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
-	end
+      for (i=0;i<WIDTH;i=i+1) begin : genbit
+	    fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
+	  end
    endgenerate
    assign carry = {carry_temp[WIDTH-1:1], 1'b0};     
 
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index e10b0c55..7288229c 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -138,7 +138,9 @@ module muldiv (
 	 flopenrc #(`XLEN) MulDivResultWReg(clk, reset, FlushW, ~StallW, MulDivResultM, MulDivResultW);	 
 
       end else begin // no M instructions supported
-	 assign MulDivResultW = 0; 
+	 	assign MulDivResultW = 0; 
+		assign DivBusyE = 0;
+		assign DivDoneE = 0;
       end
    endgenerate
 
diff --git a/wally-pipelined/src/privileged/csrc.sv b/wally-pipelined/src/privileged/csrc.sv
index c762ea8c..9e47eece 100644
--- a/wally-pipelined/src/privileged/csrc.sv
+++ b/wally-pipelined/src/privileged/csrc.sv
@@ -87,7 +87,7 @@ module csrc #(parameter
     output logic             IllegalCSRCAccessM
   );
 
-  generate 
+  generate
     if (`ZCOUNTERS_SUPPORTED) begin
       //  logic [63:0] TIME_REGW, TIMECMP_REGW;
       logic [63:0] CYCLE_REGW, INSTRET_REGW;
diff --git a/wally-pipelined/src/privileged/csri.sv b/wally-pipelined/src/privileged/csri.sv
index e7a43e50..21796fa6 100644
--- a/wally-pipelined/src/privileged/csri.sv
+++ b/wally-pipelined/src/privileged/csri.sv
@@ -70,7 +70,7 @@ module csri #(parameter
   // MEIP, MTIP, MSIP are read-only
   // SEIP, STIP, SSIP is writable in MIP if S mode exists
   // SSIP is writable in SIP if S mode exists
-  generate 
+  generate
     if (`S_SUPPORTED) begin
       assign MIP_WRITE_MASK = 12'h222; // SEIP, STIP, SSIP are writable in MIP (20210108-draft 3.1.9)
       assign SIP_WRITE_MASK = 12'h002; // SSIP is writable in SIP (privileged 20210108-draft 4.1.3)
diff --git a/wally-pipelined/src/privileged/csrn.sv b/wally-pipelined/src/privileged/csrn.sv
index e82ff59f..16d5df8a 100644
--- a/wally-pipelined/src/privileged/csrn.sv
+++ b/wally-pipelined/src/privileged/csrn.sv
@@ -49,7 +49,7 @@ module csrn #(parameter
   );
 
   // User mode CSRs below only needed when user mode traps are supported
-  generate  
+  generate
     if (`N_SUPPORTED) begin
       logic WriteUTVECM;
       logic WriteUSCRATCHM, WriteUEPCM;
diff --git a/wally-pipelined/src/privileged/csrs.sv b/wally-pipelined/src/privileged/csrs.sv
index 0afe7091..ca64b053 100644
--- a/wally-pipelined/src/privileged/csrs.sv
+++ b/wally-pipelined/src/privileged/csrs.sv
@@ -66,7 +66,7 @@ module csrs #(parameter
   //logic [`XLEN-1:0] SEDELEG_MASK = ~(zero | 3'b111 << 9); // sedeleg[11:9] hardwired to zero per Privileged Spec 3.1.8
 
   // Supervisor mode CSRs sometimes supported
-  generate  
+  generate
     if (`S_SUPPORTED) begin
       logic WriteSTVECM;
       logic WriteSSCRATCHM, WriteSEPCM;
diff --git a/wally-pipelined/src/privileged/csru.sv b/wally-pipelined/src/privileged/csru.sv
index 2e48731d..08e682bf 100644
--- a/wally-pipelined/src/privileged/csru.sv
+++ b/wally-pipelined/src/privileged/csru.sv
@@ -43,7 +43,7 @@ module csru #(parameter
   );
 
   // Floating Point CSRs in User Mode only needed if Floating Point is supported
-  generate  
+  generate
     if (`F_SUPPORTED | `D_SUPPORTED) begin
       logic [4:0] FFLAGS_REGW;
       logic WriteFFLAGSM, WriteFRMM; //, WriteFCSRM;
diff --git a/wally-pipelined/src/uncore/gpio.sv b/wally-pipelined/src/uncore/gpio.sv
index 49b96e1e..0100c9c5 100644
--- a/wally-pipelined/src/uncore/gpio.sv
+++ b/wally-pipelined/src/uncore/gpio.sv
@@ -151,7 +151,7 @@ module gpio (
   end
 
   // chip i/o
-  generate 
+  generate
     if (`GPIO_LOOPBACK_TEST) // connect OUT to IN for loopback testing
       assign input0d = GPIOPinsOut & input_en & output_en;
     else
diff --git a/wally-pipelined/src/uncore/plic.sv b/wally-pipelined/src/uncore/plic.sv
index dc50eb4f..ef7ecdd5 100644
--- a/wally-pipelined/src/uncore/plic.sv
+++ b/wally-pipelined/src/uncore/plic.sv
@@ -164,17 +164,13 @@ module plic (
   flopr #(N) intPendingFlop(HCLK,~HRESETn,nextIntPending,intPending);
 
   // pending array - indexed by priority_lvl x source_ID
-  genvar i;
+  genvar i, j;
   generate
-    for (i=1; i<=N; i=i+1) begin
-      // *** make sure that this synthesizes into N decoders, not 7*N 3-bit equality comparators (right?)
-      assign pendingArray[7][i] = (intPriority[i]==7) & intEn[i] & intPending[i];
-      assign pendingArray[6][i] = (intPriority[i]==6) & intEn[i] & intPending[i];
-      assign pendingArray[5][i] = (intPriority[i]==5) & intEn[i] & intPending[i];
-      assign pendingArray[4][i] = (intPriority[i]==4) & intEn[i] & intPending[i];
-      assign pendingArray[3][i] = (intPriority[i]==3) & intEn[i] & intPending[i];
-      assign pendingArray[2][i] = (intPriority[i]==2) & intEn[i] & intPending[i];
-      assign pendingArray[1][i] = (intPriority[i]==1) & intEn[i] & intPending[i];
+    for (j=1; j<=7; j++) begin: pending
+      for (i=1; i<=N; i=i+1) begin: pendingbit
+        // *** make sure that this synthesizes into N decoders, not 7*N 3-bit equality comparators (right?)
+        assign pendingArray[j][i] = (intPriority[i]==j) & intEn[i] & intPending[i];
+      end
     end
   endgenerate
   // pending array, except grouped by priority
@@ -184,7 +180,9 @@ module plic (
                                  |pendingArray[4],
                                  |pendingArray[3],
                                  |pendingArray[2],
-                                 |pendingArray[1]};
+                                 |pendingArray[1]}; 
+  //assign pendingPGrouped = pendingArray.or;
+
   // pendingPGrouped, except only topmost priority is active
   assign pendingMaxP[7:1] = {pendingPGrouped[7],
                              pendingPGrouped[6] & ~|pendingPGrouped[7],
@@ -202,24 +200,24 @@ module plic (
                                     | ({N{pendingMaxP[2]}} & pendingArray[2])
                                     | ({N{pendingMaxP[1]}} & pendingArray[1]);
   // find the lowest ID amongst active interrupts at the highest priority
-  integer j;
-  // *** verify that this synthesizes to a reasonable priority encoder and that j doesn't actually exist in hardware
+  int k;
+  // *** verify that this synthesizes to a reasonable priority encoder and that k doesn't actually exist in hardware
   always_comb begin
     intClaim = 6'b0;
-    for(j=N; j>0; j=j-1) begin
-      if(pendingRequestsAtMaxP[j]) intClaim = j[5:0];
+    for(k=N; k>0; k=k-1) begin
+      if(pendingRequestsAtMaxP[k]) intClaim = k[5:0];
     end
   end
   
   // create threshold mask
-  always_comb begin
-    threshMask[7] = ~(7==intThreshold);
-    threshMask[6] = ~(6==intThreshold) & threshMask[7];
-    threshMask[5] = ~(5==intThreshold) & threshMask[6];
-    threshMask[4] = ~(4==intThreshold) & threshMask[5];
-    threshMask[3] = ~(3==intThreshold) & threshMask[4];
-    threshMask[2] = ~(2==intThreshold) & threshMask[3];
-    threshMask[1] = ~(1==intThreshold) & threshMask[2];
+   always_comb begin
+    threshMask[7] = (intThreshold != 7);
+    threshMask[6] = (intThreshold != 6) & threshMask[7];
+    threshMask[5] = (intThreshold != 5) & threshMask[6];
+    threshMask[4] = (intThreshold != 4) & threshMask[5];
+    threshMask[3] = (intThreshold != 3) & threshMask[4];
+    threshMask[2] = (intThreshold != 2) & threshMask[3];
+    threshMask[1] = (intThreshold != 1) & threshMask[2];
   end
   // is the max priority > threshold?
   // *** would it be any better to first priority encode maxPriority into binary and then ">" with threshold?
diff --git a/wally-pipelined/src/uncore/uartPC16550D.sv b/wally-pipelined/src/uncore/uartPC16550D.sv
index 62c8ea00..badc4197 100644
--- a/wally-pipelined/src/uncore/uartPC16550D.sv
+++ b/wally-pipelined/src/uncore/uartPC16550D.sv
@@ -291,7 +291,7 @@ module uartPC16550D(
   // although rxfullbit looks like a combinational loop, in one bit rxfifotail == i and breaks the loop
   generate
     genvar i;
-    for (i=0; i<16; i++) begin
+    for (i=0; i<16; i++) begin:rx
       assign RXerrbit[i] = |rxfifo[i][10:8]; // are any of the error conditions set?
       if (i > 0)
         assign rxfullbit[i] = ((rxfifohead==i) | rxfullbit[i-1]) & (rxfifotail != i);
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index 1c44565f..f18d5af4 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -159,7 +159,7 @@ module wallypipelinedhart
 
   // IEU vs HPTW arbitration signals to send to LSU
   logic [1:0] 		    MemRWMtoLSU;
-  logic [2:0] 		    Funct3MtoLSU;
+  logic [2:0] 		    SizeToLSU;
   logic [1:0] 		    AtomicMtoLSU;
   logic [`XLEN-1:0] 	    MemAdrMtoLSU;
   logic [`XLEN-1:0] 	    WriteDataMtoLSU;
@@ -169,7 +169,7 @@ module wallypipelinedhart
   logic 		    DataMisalignedMfromLSU;
   logic 		    StallWtoLSU;
   logic 		    StallWfromLSU;  
-  logic [2:0] 		    Funct3MfromLSU;
+  logic [2:0] 		    SizeFromLSU;
 
   
   ifu ifu(.InstrInF(InstrRData),
@@ -207,7 +207,7 @@ module wallypipelinedhart
 	  .AtomicMaskedM(AtomicMaskedM),
 	  .MemAckW(MemAckW),
 	  .HRDATAW(HRDATAW),
-	  .Funct3MfromLSU(Funct3MfromLSU),           // stays the same
+	  .SizeFromLSU(SizeFromLSU),           // stays the same
 	  .StallWfromLSU(StallWfromLSU),             // stays the same
 	  .DSquashBusAccessM(DSquashBusAccessM),     // probalby removed after dcache implemenation?
 	  // currently not connected (but will need to be used for lsu talking to ahb.
@@ -261,7 +261,7 @@ module wallypipelinedhart
 	       //.InstrRData(InstrF), // hook up InstrF later
 	       .ISquashBusAccessF(1'b0), // *** temporary hack to disable PMP instruction fetch checking
 	       .WriteDataM(WriteDataM),
-	       .MemSizeM(Funct3MfromLSU[1:0]), .UnsignedLoadM(Funct3MfromLSU[2]),
+	       .MemSizeM(SizeFromLSU[1:0]), .UnsignedLoadM(SizeFromLSU[2]),
 	       .Funct7M(InstrM[31:25]),
 	       .HRDATAW(HRDATAW),
 	       .StallW(StallWfromLSU),
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index 8c3e28c3..2cf37c17 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -514,6 +514,9 @@ string tests32f[] = '{
   logic             HMASTLOCK;
   logic             HCLK, HRESETn;
   logic [`XLEN-1:0] PCW;
+
+  logic [`XLEN-1:0] debug;
+  assign debug = dut.uncore.dtim.RAM[536872960];
   
   flopenr #(`XLEN) PCWReg(clk, reset, ~dut.hart.ieu.dp.StallW, dut.hart.ifu.PCM, PCW);
   flopenr  #(32)   InstrWReg(clk, reset, ~dut.hart.ieu.dp.StallW,  dut.hart.ifu.InstrM, InstrW);
@@ -656,10 +659,7 @@ string tests32f[] = '{
         // Check errors
         errors = (i == SIGNATURESIZE+1); // error if file is empty
         i = 0;
-        if (`XLEN == 32)
-          testadr = (`TIM_BASE+tests[test+1].atohex())/4;
-        else
-          testadr = (`TIM_BASE+tests[test+1].atohex())/8;
+        testadr = (`TIM_BASE+tests[test+1].atohex())/(`XLEN/8);
         /* verilator lint_off INFINITELOOP */
         while (signature[i] !== 'bx) begin
           //$display("signature[%h] = %h", i, signature[i]);
@@ -669,14 +669,16 @@ string tests32f[] = '{
               // kind of hacky test for garbage right now
               errors = errors+1;
               $display("  Error on test %s result %d: adr = %h sim = %h, signature = %h", 
-                    tests[test], i, (testadr+i)*`XLEN/8, dut.uncore.dtim.RAM[testadr+i], signature[i]);
+                    tests[test], i, (testadr+i)*(`XLEN/8), dut.uncore.dtim.RAM[testadr+i], signature[i]);
               $stop;//***debug
             end
           end
           i = i + 1;
         end
         /* verilator lint_on INFINITELOOP */
-        if (errors == 0) $display("%s succeeded.  Brilliant!!!", tests[test]);
+        if (errors == 0) begin
+          $display("%s succeeded.  Brilliant!!!", tests[test]);
+        end
         else begin
           $display("%s failed with %d errors. :(", tests[test], errors);
           totalerrors = totalerrors+1;