diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do
index b44c2b04a..daecfc921 100644
--- a/wally-pipelined/regression/wave.do
+++ b/wally-pipelined/regression/wave.do
@@ -7,11 +7,12 @@ add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/PCE
 add wave -noupdate -expand -group {Execution Stage} /testbench/InstrEName
 add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/InstrE
 add wave -noupdate -expand -group {Memory Stage} /testbench/dut/hart/priv/trap/InstrValidM
-add wave -noupdate -expand -group {Memory Stage} /testbench/PCtextM
 add wave -noupdate -expand -group {Memory Stage} /testbench/dut/hart/PCM
 add wave -noupdate -expand -group {Memory Stage} /testbench/InstrMName
 add wave -noupdate -expand -group {Memory Stage} /testbench/dut/hart/InstrM
 add wave -noupdate -expand -group {Memory Stage} /testbench/dut/hart/lsu/MemAdrM
+add wave -noupdate /testbench/dut/hart/ieu/dp/ResultM
+add wave -noupdate /testbench/dut/hart/ieu/dp/ResultW
 add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrMisalignedFaultM
 add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrAccessFaultM
 add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/IllegalInstrFaultM
@@ -127,18 +128,18 @@ add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart
 add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/CSRReadValW
 add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultSrcW
 add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultW
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/a
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/b
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/alucontrol
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/result
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/flags
-add wave -noupdate -group alu -divider internals
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/overflow
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/carry
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/zero
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/neg
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/lt
-add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/ltu
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/a
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/b
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/alucontrol
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/result
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/flags
+add wave -noupdate -expand -group alu -divider internals
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/overflow
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/carry
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/zero
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/neg
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/lt
+add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/ltu
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1D
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs2D
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1E
@@ -314,8 +315,6 @@ add wave -noupdate -expand -group lsu -expand -group dcache -expand -group {CPU
 add wave -noupdate -expand -group lsu -expand -group dcache -expand -group {CPU side} /testbench/dut/hart/lsu/dcache/AtomicM
 add wave -noupdate -expand -group lsu -expand -group dcache -expand -group {CPU side} /testbench/dut/hart/lsu/dcache/CacheableM
 add wave -noupdate -expand -group lsu -expand -group dcache -expand -group {CPU side} /testbench/dut/hart/lsu/dcache/WriteDataM
-add wave -noupdate -expand -group lsu -expand -group dcache -expand -group {CPU side} /testbench/dut/hart/lsu/dcache/ReadDataW
-add wave -noupdate -expand -group lsu -expand -group dcache -expand -group {CPU side} /testbench/dut/hart/lsu/dcache/StallW
 add wave -noupdate -expand -group lsu -expand -group dcache -expand -group {CPU side} /testbench/dut/hart/lsu/dcache/DCacheStall
 add wave -noupdate -expand -group lsu -expand -group dcache -group status /testbench/dut/hart/lsu/dcache/WayHit
 add wave -noupdate -expand -group lsu -expand -group dcache -group status -color {Medium Orchid} /testbench/dut/hart/lsu/dcache/CacheHit
@@ -365,7 +364,7 @@ add wave -noupdate -expand -group itlb /testbench/dut/hart/ifu/ITLBMissF
 add wave -noupdate -expand -group itlb /testbench/dut/hart/ifu/immu/PhysicalAddress
 add wave -noupdate /testbench/dut/hart/lsu/hptw/genblk1/PRegEn
 TreeUpdate [SetDefaultTree]
-WaveRestoreCursors {{Walk read is wrong} {26824 ns} 1} {{page table setup} {8167 ns} 1} {{eviction at wrong adr} {10128 ns} 1} {{Cursor 6} {41795656 ns} 0}
+WaveRestoreCursors {{Walk read is wrong} {26824 ns} 1} {{page table setup} {8167 ns} 1} {{eviction at wrong adr} {10128 ns} 1} {{Cursor 6} {2898 ns} 0}
 quietly wave cursor active 4
 configure wave -namecolwidth 250
 configure wave -valuecolwidth 297
@@ -381,4 +380,4 @@ configure wave -griddelta 40
 configure wave -timeline 0
 configure wave -timelineunits ns
 update
-WaveRestoreZoom {41795482 ns} {41795818 ns}
+WaveRestoreZoom {2835 ns} {2995 ns}
diff --git a/wally-pipelined/src/cache/ICacheCntrl.sv b/wally-pipelined/src/cache/ICacheCntrl.sv
index 35851defd..3881e9cb6 100644
--- a/wally-pipelined/src/cache/ICacheCntrl.sv
+++ b/wally-pipelined/src/cache/ICacheCntrl.sv
@@ -115,7 +115,6 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
   localparam STATE_INVALIDATE = 'h12; // *** not sure if invalidate or evict? invalidate by cache block or address?
   localparam STATE_TLB_MISS = 'h13;
   localparam STATE_TLB_MISS_DONE = 'h14;
-  localparam STATE_INSTR_PAGE_FAULT = 'h15;
 
   
   localparam AHBByteLength = `XLEN / 8;
@@ -369,7 +368,7 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
       end
       STATE_TLB_MISS: begin
         if (WalkerInstrPageFaultF) begin
-          NextState = STATE_INSTR_PAGE_FAULT;
+          NextState = STATE_READY;
           ICacheStallF = 1'b0;
         end else if (ITLBWriteF) begin
           NextState = STATE_TLB_MISS_DONE;
@@ -380,10 +379,6 @@ module ICacheCntrl #(parameter BLOCKLEN = 256)
       STATE_TLB_MISS_DONE: begin
         NextState = STATE_READY;
       end
-      STATE_INSTR_PAGE_FAULT: begin
-        ICacheStallF = 1'b0;
-        NextState = STATE_READY;
-      end
       default: begin
         PCMux = 2'b01;
         NextState = STATE_READY;
diff --git a/wally-pipelined/src/cache/dcache.sv b/wally-pipelined/src/cache/dcache.sv
index 0da202e57..b1edcfa8e 100644
--- a/wally-pipelined/src/cache/dcache.sv
+++ b/wally-pipelined/src/cache/dcache.sv
@@ -43,7 +43,7 @@ module dcache
    input logic [11:0] 	       VAdr, // when hptw writes dtlb we use this address to index SRAM.
 
    input logic [`XLEN-1:0]     WriteDataM,
-   output logic [`XLEN-1:0]    ReadDataM, 
+   output logic [`XLEN-1:0]    ReadDataM,
    output logic 	       DCacheStall,
    output logic 	       CommittedM,
    output logic 	       DCacheMiss,
@@ -60,6 +60,7 @@ module dcache
    // from ptw
    input logic 		       SelPTW,
    input logic 		       WalkerPageFaultM, 
+   output logic [`XLEN-1:0]    LSUData, 
    // ahb side
    output logic [`PA_BITS-1:0] AHBPAdr, // to ahb
    output logic 	       AHBRead,
@@ -147,6 +148,11 @@ module dcache
   logic SelEvict;
 
   logic LRUWriteEn;
+
+  logic CaptureDataM;
+  logic [`XLEN-1:0] SavedReadDataM;
+  logic 	    SelSavedReadDataM;
+  
   
   typedef enum {STATE_READY,
 
@@ -331,7 +337,24 @@ module dcache
   subwordread subwordread(.HRDATA(ReadDataWordMuxM),
 			  .HADDRD(MemPAdrM[2:0]),
 			  .HSIZED({Funct3M[2], 1'b0, Funct3M[1:0]}),
-			  .HRDATAMasked(ReadDataM));
+			  .HRDATAMasked(LSUData));
+
+  assign CaptureDataM = ~SelPTW & MemRWM[1];
+  
+  flopen #(`XLEN) 
+  SavedReadDataReg(.clk,
+		   .en(CaptureDataM),
+		   .d(LSUData),
+		   .q(SavedReadDataM));
+
+
+  mux2 #(`XLEN)
+  ReadDataMMux(.d0(LSUData),
+	       .d1(SavedReadDataM),
+	       .s(SelSavedReadDataM),
+	       .y(ReadDataM));
+		   
+  
 
   // This is a confusing point.
   // The final read data should be updated only if the CPU's StallWtoDCache is low
@@ -457,6 +480,7 @@ module dcache
     DCacheAccess = 1'b0;
     DCacheMiss = 1'b0;
     LRUWriteEn = 1'b0;
+    SelSavedReadDataM = 1'b0;
 
     case (CurrState)
       STATE_READY: begin
@@ -659,6 +683,9 @@ module dcache
 
 	if (ITLBWriteF | WalkerInstrPageFaultF) begin
 	  NextState = STATE_READY;
+	  // Hack: select the saved read data (SavedReadDataM) rather than the live LSUData
+	  // when the CPU was stalled for an ITLB miss with a simultaneous load.
+	  SelSavedReadDataM = 1'b1;
 	end
 
 	// return to ready if page table walk completed.
diff --git a/wally-pipelined/src/lsu/lsu.sv b/wally-pipelined/src/lsu/lsu.sv
index 2a46559dd..7afb24995 100644
--- a/wally-pipelined/src/lsu/lsu.sv
+++ b/wally-pipelined/src/lsu/lsu.sv
@@ -148,8 +148,9 @@ module lsu
   logic 		       PendingInterruptMtoDCache;
   logic 		       FlushWtoDCache;
   logic 		       WalkerPageFaultM;
-  
-  
+
+  logic [`XLEN-1:0] 	       LSUData;
+    
   hptw hptw(
 	    .clk(clk),
 	    .reset(reset),
@@ -163,7 +164,7 @@ module lsu
 	    .PageType,
 	    .ITLBWriteF(ITLBWriteF),
 	    .DTLBWriteM(DTLBWriteM),
-	    .HPTWReadPTE(ReadDataM),
+	    .HPTWReadPTE(LSUData),
 	    .HPTWStall(HPTWStall),
             .TranslationPAdr,			  
 	    .HPTWRead(HPTWRead),
@@ -303,6 +304,7 @@ module lsu
 		.VAdr(MemAdrM[11:0]),		
 		.WriteDataM(WriteDataM),
 		.ReadDataM(ReadDataM),
+		.LSUData(LSUData),		
 		.DCacheStall(DCacheStall),
 		.CommittedM(CommittedMfromDCache),
 		.DCacheMiss,