diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do
index 470cc599a..36401fd9e 100644
--- a/wally-pipelined/regression/wave.do
+++ b/wally-pipelined/regression/wave.do
@@ -26,7 +26,6 @@ add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/CSR
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/RetM
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/TrapM
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/LoadStallD
-add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/InstrStall
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/DataStall
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/MulDivStallD
 add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/hzu/FlushF
@@ -39,11 +38,6 @@ add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbe
 add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallE
 add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallM
 add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallW
-add wave -noupdate /testbench/dut/hart/hzu/StallFCause_Q
-add wave -noupdate /testbench/dut/hart/hzu/StallDCause_Q
-add wave -noupdate /testbench/dut/hart/hzu/StallECause_Q
-add wave -noupdate /testbench/dut/hart/hzu/StallMCause_Q
-add wave -noupdate /testbench/dut/hart/hzu/StallWCause_Q
 add wave -noupdate -group Bpred -expand -group direction -divider Update
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/Predictor/DirPredictor/UpdatePC
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/Predictor/DirPredictor/UpdateEN
@@ -80,36 +74,36 @@ add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/c/RegWriteD
 add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/RdD
 add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/Rs1D
 add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/Rs2D
-add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rf
-add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a1
-add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a2
-add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a3
-add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rd1
-add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rd2
-add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/we3
-add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/wd3
-add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ALUResultW
-add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ReadDataW
-add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/CSRReadValW
-add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultSrcW
-add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultW
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/a
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/b
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/alucontrol
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/result
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/flags
-add wave -noupdate -expand -group alu -divider internals
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/overflow
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/carry
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/zero
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/neg
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/lt
-add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/ltu
+add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/rf
+add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/a1
+add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/a2
+add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/a3
+add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/rd1
+add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/rd2
+add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/we3
+add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/wd3
+add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ALUResultW
+add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ReadDataW
+add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/CSRReadValW
+add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultSrcW
+add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultW
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/a
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/b
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/alucontrol
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/result
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/flags
+add wave -noupdate -group alu -divider internals
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/overflow
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/carry
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/zero
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/neg
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/lt
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/ltu
 add wave -noupdate /testbench/InstrFName
-add wave -noupdate -expand -group dcache /testbench/dut/hart/MemAdrM
-add wave -noupdate -expand -group dcache /testbench/dut/hart/MemPAdrM
-add wave -noupdate -expand -group dcache /testbench/dut/hart/WriteDataM
-add wave -noupdate -expand -group dcache /testbench/dut/hart/dmem/MemRWM
+add wave -noupdate -group dcache /testbench/dut/hart/MemAdrM
+add wave -noupdate -group dcache /testbench/dut/hart/MemPAdrM
+add wave -noupdate -group dcache /testbench/dut/hart/WriteDataM
+add wave -noupdate -group dcache /testbench/dut/hart/dmem/MemRWM
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1D
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs2D
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1E
@@ -148,32 +142,64 @@ add wave -noupdate -group {function radix debug} /testbench/functionRadix/functi
 add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/FunctionAddr
 add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/ProgramAddrIndex
 add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/FunctionName
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/InstrD
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/SrcAE
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/SrcBE
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/Funct3E
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/MulDivE
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/W64E
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/StallM
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/StallW
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/FlushM
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/FlushW
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/MulDivResultW
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/genblk1/div/start
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/DivDoneE
-add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/DivBusyE
-add wave -noupdate /testbench/dut/hart/mdu/genblk1/gclk
-add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/fsm1/CURRENT_STATE
-add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/N
-add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/D
-add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/Q
-add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/rem0
-add wave -noupdate /testbench/dut/hart/MulDivResultW
-add wave -noupdate /testbench/dut/hart/mdu/genblk1/PrelimResultE
-add wave -noupdate /testbench/dut/hart/mdu/Funct3E
-add wave -noupdate /testbench/dut/hart/mdu/genblk1/QuotE
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/InstrD
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/SrcAE
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/SrcBE
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/Funct3E
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/MulDivE
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/W64E
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/StallM
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/StallW
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/FlushM
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/FlushW
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/MulDivResultW
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/genblk1/div/start
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/DivDoneE
+add wave -noupdate -group muldiv /testbench/dut/hart/mdu/DivBusyE
+add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/fsm1/CURRENT_STATE
+add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/N
+add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/D
+add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/Q
+add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/rem0
+add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/AHBByteLength
+add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/AHBOFFETWIDTH
+add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/BlockByteLength
+add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/OFFSETWIDTH
+add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/WORDSPERLINE
+add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LOGWPL
+add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LINESIZE
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/CurrState
+add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF
+add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCountFlag
+add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCount
+add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrReadF
+add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrAckF
+add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteEnable
+add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteData
+add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWritePAdr
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPF
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF
+add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/FinalInstrRawF
+add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/AlignedInstrRawD
+add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/InstrRawD
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/hit
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/spill
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/spillSave
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/UnalignedSelect
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCMux
+add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPFinalF
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataValidBit
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataValid
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/ReadTag
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataTag
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteEnable
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteLine
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WritePAdr
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteSet
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteTag
+add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/StoredData
 TreeUpdate [SetDefaultTree]
-WaveRestoreCursors {{Cursor 2} {128433 ns} 0}
+WaveRestoreCursors {{Cursor 2} {237 ns} 0}
 quietly wave cursor active 1
 configure wave -namecolwidth 250
 configure wave -valuecolwidth 229
@@ -189,4 +215,4 @@ configure wave -griddelta 40
 configure wave -timeline 0
 configure wave -timelineunits ns
 update
-WaveRestoreZoom {128007 ns} {128663 ns}
+WaveRestoreZoom {96 ns} {400 ns}
diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv
index f6440fcfb..573e885a6 100644
--- a/wally-pipelined/src/ifu/icache.sv
+++ b/wally-pipelined/src/ifu/icache.sv
@@ -129,57 +129,141 @@ module icachecontroller #(parameter LINESIZE = 256) (
     output logic             InstrReadF
 );
 
-    // Happy path signals
-    logic [31:0]    AlignedInstrRawF, AlignedInstrRawD;
-    logic           FlushDLastCycleN;
-    logic           PCPMisalignedF;
-    const logic [31:0] NOP = 32'h13;
-    logic [`XLEN-1:0] PCPF;
-    // Misaligned signals
-    logic [`XLEN:0] MisalignedInstrRawF;
-    logic           MisalignedStall;
-    // Cache fault signals
-    logic           FaultStall;
+  // FSM states
+  localparam STATE_READY = 0;
+  localparam STATE_HIT_SPILL = 1; // spill, block 0 hit
+  localparam STATE_HIT_SPILL_MISS_FETCH_WDV = 2; // block 1 miss, issue read to AHB and wait data.
+  localparam STATE_HIT_SPILL_MISS_FETCH_DONE = 3; // write data into SRAM/LUT
+  localparam STATE_HIT_SPILL_MERGE = 4;   // Read block 0 of CPU access, should be able to optimize into STATE_HIT_SPILL.
 
+  localparam STATE_MISS_FETCH_WDV = 5; // aligned miss, issue read to AHB and wait for data.
+  localparam STATE_MISS_FETCH_DONE = 6; // write data into SRAM/LUT
+  localparam STATE_MISS_READ = 7; // read block 1 from SRAM/LUT  
+
+  localparam STATE_MISS_SPILL_FETCH_WDV = 8; // spill, miss on block 0, issue read to AHB and wait
+  localparam STATE_MISS_SPILL_FETCH_DONE = 9; // write data into SRAM/LUT
+  localparam STATE_MISS_SPILL_READ1 = 10; // read block 0 from SRAM/LUT
+  localparam STATE_MISS_SPILL_2 = 11; // return to ready if hit or do second block update.
+  localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 12; // miss on block 1, issue read to AHB and wait
+  localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 13; // write data to SRAM/LUT
+  localparam STATE_MISS_SPILL_MERGE = 14; // read block 0 of CPU access,
+
+  localparam STATE_INVALIDATE = 15; // *** not sure if invalidate or evict? invalidate by cache block or address?
+  
+  localparam AHBByteLength = `XLEN / 8;
+  localparam AHBOFFETWIDTH = $clog2(AHBByteLength);
+  
+  
+  localparam BlockByteLength = LINESIZE / 8;
+  localparam OFFSETWIDTH = $clog2(BlockByteLength);
+  
+  localparam WORDSPERLINE = LINESIZE/`XLEN;
+  localparam LOGWPL = $clog2(WORDSPERLINE);
+
+  logic [3:0] 		     CurrState, NextState;
+  logic 		     hit, spill;
+  logic 		     SavePC;
+  logic [1:0] 		     PCMux;
+  logic 		     CntReset;
+  logic 		     PreCntEn, CntEn;
+  logic 		     spillSave;
+  logic 		     UnalignedSelect;
+  logic 		     FetchCountFlag;
+  localparam FetchCountThreshold = WORDSPERLINE - 1;
+  
+  logic [LOGWPL:0] 	     FetchCount, NextFetchCount;
+
+  logic [`XLEN-1:0] 	     PCPreFinalF, PCPFinalF, PCSpillF;
+  logic [`XLEN-1:OFFSETWIDTH] PCPTrunkF;
+
+  
+  logic [31:0] 		     FinalInstrRawF;
+
+  logic [15:0] 		     SpillDataBlock0;
+
+
+
+    // Happy path signals
+  logic [31:0] 		     AlignedInstrRawD;
+  
+    //logic [31:0]    AlignedInstrRawF, AlignedInstrRawD;
+    //logic           FlushDLastCycleN;
+    //logic           PCPMisalignedF;
+  const logic [31:0] 	     NOP = 32'h13;
+  logic [`XLEN-1:0] 	     PCPF;
+
+  logic 		     reset_q;
+  
+    // Misaligned signals
+    //logic [`XLEN:0] MisalignedInstrRawF;
+    //logic           MisalignedStall;
+    // Cache fault signals
+    //logic           FaultStall;
+
+
+  flopenr #(`XLEN) PCPFFlop(clk, reset, SavePC, {UpperPCNextPF, LowerPCNextF}, PCPF);
+  // on spill we want to get the first 2 bytes of the next cache block.
+  // the spill only occurs if the PCPF mod BlockByteLength == -2.  Therefore we can
+  // simply add 2 to land on the next cache block.
+  assign PCSpillF = PCPF + 2'b10;
+
+  // now we have to select between these three PCs
+  assign PCPreFinalF = PCMux[0] ? PCPF : {UpperPCNextPF, LowerPCNextF};
+  assign PCPFinalF = PCMux[1] ? PCSpillF : PCPreFinalF;
+  
+  
+  
+  // truncate the offset from PCPF for memory address generation
+  assign PCPTrunkF = PCPFinalF[`XLEN-1:OFFSETWIDTH];
+  
     // Detect if the instruction is compressed
-    assign CompressedF = AlignedInstrRawF[1:0] != 2'b11;
+  assign CompressedF = FinalInstrRawF[1:0] != 2'b11;
 
 
     // Handle happy path (data in cache, reads aligned)
+/* -----\/----- EXCLUDED -----\/-----
 
     generate
         if (`XLEN == 32) begin
             assign AlignedInstrRawF = PCPF[1] ? MisalignedInstrRawF : ICacheMemReadData;
-            assign PCPMisalignedF = PCPF[1] && ~CompressedF;
+            //assign PCPMisalignedF = PCPF[1] && ~CompressedF;
         end else begin
             assign AlignedInstrRawF = PCPF[2]
                 ? (PCPF[1] ? MisalignedInstrRawF : ICacheMemReadData[63:32])
                 : (PCPF[1] ? ICacheMemReadData[47:16] : ICacheMemReadData[31:0]);
-            assign PCPMisalignedF = PCPF[2] && PCPF[1] && ~CompressedF;
+            //assign PCPMisalignedF = PCPF[2] && PCPF[1] && ~CompressedF;
         end
     endgenerate
+ -----/\----- EXCLUDED -----/\----- */
 
-    flopenr #(32) AlignedInstrRawDFlop(clk, reset, ~StallD, AlignedInstrRawF, AlignedInstrRawD);
-    flopr   #(1)  FlushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCycleN | ~StallF), FlushDLastCycleN);
-    flopenr #(`XLEN) PCPFFlop(clk, reset, ~StallF, {UpperPCNextPF, LowerPCNextF}, PCPF);
-    mux2    #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCycleN, InstrRawD);
+    //flopenr #(32) AlignedInstrRawDFlop(clk, reset, ~StallD, AlignedInstrRawF, AlignedInstrRawD);
+    //flopr   #(1)  FlushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCycleN | ~StallF), FlushDLastCycleN);
+
+    //mux2    #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCycleN, InstrRawD);
 
     // Stall for faults or misaligned reads
+/* -----\/----- EXCLUDED -----\/-----
     always_comb begin
         assign ICacheStallF = FaultStall | MisalignedStall;
     end
+ -----/\----- EXCLUDED -----/\----- */
 
 
     // Handle misaligned, noncompressed reads
 
+/* -----\/----- EXCLUDED -----\/-----
     logic           MisalignedState, NextMisalignedState;
     logic [15:0]    MisalignedHalfInstrF;
     logic [15:0]    UpperHalfWord;
+ -----/\----- EXCLUDED -----/\----- */
 
+/* -----\/----- EXCLUDED -----\/-----
     flopenr #(16) MisalignedHalfInstrFlop(clk, reset, ~FaultStall & (PCPMisalignedF & MisalignedState), AlignedInstrRawF[15:0], MisalignedHalfInstrF);
     flopenr #(1)  MisalignedStateFlop(clk, reset, ~FaultStall, NextMisalignedState, MisalignedState);
+ -----/\----- EXCLUDED -----/\----- */
 
     // When doing a misaligned read, swizzle the bits correctly
+/* -----\/----- EXCLUDED -----\/-----
     generate
         if (`XLEN == 32) begin
             assign UpperHalfWord = ICacheMemReadData[31:16];
@@ -194,14 +278,18 @@ module icachecontroller #(parameter LINESIZE = 256) (
             assign MisalignedInstrRawF = {ICacheMemReadData[15:0], MisalignedHalfInstrF};
         end
     end
+ -----/\----- EXCLUDED -----/\----- */
 
     // Manage internal state and stall when necessary
+/* -----\/----- EXCLUDED -----\/-----
     always_comb begin
         assign MisalignedStall = PCPMisalignedF & MisalignedState;
         assign NextMisalignedState = ~PCPMisalignedF | ~MisalignedState;
     end
+ -----/\----- EXCLUDED -----/\----- */
 
     // Pick the correct address to read
+/* -----\/----- EXCLUDED -----\/-----
     generate
         if (`XLEN == 32) begin
             assign ICacheMemReadLowerAdr = {LowerPCNextF[11:2] + (PCPMisalignedF & ~MisalignedState), 2'b00};
@@ -209,16 +297,15 @@ module icachecontroller #(parameter LINESIZE = 256) (
             assign ICacheMemReadLowerAdr = {LowerPCNextF[11:3] + (PCPMisalignedF & ~MisalignedState), 3'b00};
         end
     endgenerate
+ -----/\----- EXCLUDED -----/\----- */
     // TODO Handle reading instructions that cross page boundaries
-    assign ICacheMemReadUpperPAdr = UpperPCNextPF;
+    //assign ICacheMemReadUpperPAdr = UpperPCNextPF;
 
 
     // Handle cache faults
 
-    localparam integer WORDSPERLINE = LINESIZE/`XLEN;
-    localparam integer LOGWPL = $clog2(WORDSPERLINE);
-    localparam integer OFFSETWIDTH = $clog2(LINESIZE/8);
 
+/* -----\/----- EXCLUDED -----\/-----
     logic               FetchState, BeginFetchState;
     logic [LOGWPL:0]    FetchWordNum, NextFetchWordNum;
     logic [`XLEN-1:0]   LineAlignedPCPF;
@@ -226,12 +313,6 @@ module icachecontroller #(parameter LINESIZE = 256) (
     flopr #(1) FetchStateFlop(clk, reset, BeginFetchState | (FetchState & ~EndFetchState), FetchState);
     flopr #(LOGWPL+1) FetchWordNumFlop(clk, reset, NextFetchWordNum, FetchWordNum);
 
-    genvar i;
-    generate
-        for (i=0; i < WORDSPERLINE; i++) begin
-            flopenr #(`XLEN) flop(clk, reset, FetchState & (i == FetchWordNum), InstrInF, ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]);
-        end
-    endgenerate
 
     // Enter the fetch state when we hit a cache fault
     always_comb begin
@@ -242,10 +323,10 @@ module icachecontroller #(parameter LINESIZE = 256) (
 
     // Machinery to request the correct addresses from main memory
     always_comb begin
-        InstrReadF = FetchState & ~EndFetchState & ~ICacheMemWriteEnable;
-        LineAlignedPCPF = {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}};
-        InstrPAdrF = LineAlignedPCPF + FetchWordNum*(`XLEN/8);
-        NextFetchWordNum = FetchState ? FetchWordNum+InstrAckF : {LOGWPL+1{1'b0}}; 
+        InstrReadF = FetchState & ~EndFetchState & ~ICacheMemWriteEnable; // next stage logic
+        LineAlignedPCPF = {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; // the fetch address for abh?
+        InstrPAdrF = LineAlignedPCPF + FetchWordNum*(`XLEN/8); // ?
+        NextFetchWordNum = FetchState ? FetchWordNum+InstrAckF : {LOGWPL+1{1'b0}}; // convert to enable
     end
 
     // Write to cache memory when we have the line here
@@ -258,4 +339,255 @@ module icachecontroller #(parameter LINESIZE = 256) (
     always_comb begin
         FaultStall = FetchState | ~ICacheMemReadValid;
     end
+ -----/\----- EXCLUDED -----/\----- */
+
+  // The FSM is always running; it is never stalled.
+  flopr #(4) stateReg(.clk(clk),
+		      .reset(reset),
+		      .d(NextState),
+		      .q(CurrState));
+  // spill: PCPF sits on the last halfword of a cache block (every block-offset bit above bit 0 is set), so a 32-bit fetch crosses into the next block.
+  assign spill = &PCPF[OFFSETWIDTH-1:1]; // derived from OFFSETWIDTH so it tracks LINESIZE instead of hard-coding a 64-byte block
+  assign hit = ICacheMemReadValid; // note ICacheMemReadValid is hit.
+  assign FetchCountFlag = FetchCount == FetchCountThreshold; // FetchCount has reached WORDSPERLINE-1, the last word of the block
+  
+  // Next state logic
+  always_comb begin
+      UnalignedSelect = 1'b0;
+      CntReset = 1'b0;
+      PreCntEn = 1'b0;
+      InstrReadF = 1'b0;
+      ICacheMemWriteEnable = 1'b0;
+      spillSave = 1'b0;
+      PCMux = 2'b00;
+    
+    case (CurrState)
+      
+      STATE_READY: begin
+	PCMux = 2'b00;
+	if (hit & ~spill) begin
+	  NextState = STATE_READY;
+	end else if (hit & spill) begin
+	  spillSave = 1'b1;
+	  NextState = STATE_HIT_SPILL;
+	end else if (~hit & ~spill) begin
+	  CntReset = 1'b1;
+	  NextState = STATE_MISS_FETCH_WDV;
+	end else if (~hit & spill)	begin
+	  CntReset = 1'b1;
+	  NextState = STATE_MISS_SPILL_FETCH_WDV;
+	end else begin
+          NextState = STATE_READY;
+	end
+      end
+
+      STATE_HIT_SPILL: begin // branch 1, hit spill and 2, miss spill hit
+	PCMux = 2'b10; // address the cache with PCSpillF (PCPF + 2)
+	UnalignedSelect = 1'b1;
+	if (hit) begin
+	  NextState = STATE_READY;
+	end else begin // begin/end keeps both statements in the miss branch; without it NextState below always overrode STATE_READY
+	  CntReset = 1'b1; // restart the fetch word counter before requesting the missing block
+	  NextState = STATE_HIT_SPILL_MISS_FETCH_WDV;
+	end
+      end
+      STATE_HIT_SPILL_MISS_FETCH_WDV: begin
+	PCMux = 2'b10;
+	InstrReadF = 1'b1;
+	PreCntEn = 1'b1;
+	if (FetchCountFlag & InstrAckF) begin
+	  NextState = STATE_HIT_SPILL_MISS_FETCH_DONE;
+	end else begin
+	  NextState = STATE_HIT_SPILL_MISS_FETCH_WDV;
+	end
+      end
+      STATE_HIT_SPILL_MISS_FETCH_DONE: begin
+	PCMux = 2'b10;
+	ICacheMemWriteEnable = 1'b1;
+        NextState = STATE_HIT_SPILL_MERGE;
+      end
+      STATE_HIT_SPILL_MERGE: begin
+	PCMux = 2'b10;
+	UnalignedSelect = 1'b1;
+        NextState = STATE_READY;
+      end
+
+      // branch 3 miss no spill
+      STATE_MISS_FETCH_WDV: begin
+	PCMux = 2'b01;
+	InstrReadF = 1'b1;
+	PreCntEn = 1'b1;
+	if (FetchCountFlag & InstrAckF) begin
+	  NextState = STATE_MISS_FETCH_DONE;	  
+	end else begin
+	  NextState = STATE_MISS_FETCH_WDV;
+	end
+      end
+      STATE_MISS_FETCH_DONE: begin
+	PCMux = 2'b01;
+	ICacheMemWriteEnable = 1'b1;
+        NextState = STATE_MISS_READ;
+      end
+      STATE_MISS_READ: begin
+	PCMux = 2'b01;
+	NextState = STATE_READY;
+      end
+
+      // branch 4 miss spill hit, and 5 miss spill miss
+      STATE_MISS_SPILL_FETCH_WDV: begin
+	PCMux = 2'b01;
+	PreCntEn = 1'b1;
+	InstrReadF = 1'b1;	
+	if (FetchCountFlag & InstrAckF) begin 
+	  NextState = STATE_MISS_SPILL_FETCH_DONE;
+	end else begin
+	  NextState = STATE_MISS_SPILL_FETCH_WDV;
+	end
+      end
+      STATE_MISS_SPILL_FETCH_DONE: begin
+	PCMux = 2'b01;	
+	ICacheMemWriteEnable = 1'b1;
+	NextState = STATE_MISS_SPILL_READ1;
+      end
+      STATE_MISS_SPILL_READ1: begin // will always be a hit as we just wrote that cache block.
+	PCMux = 2'b10;	 // select PCSpillF; there is a 1 cycle delay after setting the address before the data arrives.
+	spillSave = 1'b1; /// enables SpillInstrReg to capture ICacheMemReadData[15:0]. *** Could pipeline these to make it clearer in the fsm.
+	NextState = STATE_MISS_SPILL_2; // STATE_MISS_SPILL_2 checks hit/miss for the second block
+      end
+      STATE_MISS_SPILL_2: begin
+	PCMux = 2'b10;
+	UnalignedSelect = 1'b1;	
+	if (~hit) begin
+	  CntReset = 1'b1;
+	  NextState = STATE_MISS_SPILL_MISS_FETCH_WDV;
+	end else begin
+	  NextState = STATE_READY;
+	end
+      end
+      STATE_MISS_SPILL_MISS_FETCH_WDV: begin
+	PCMux = 2'b10;
+	PreCntEn = 1'b1;
+	InstrReadF = 1'b1;	
+	if (FetchCountFlag & InstrAckF) begin
+	  NextState = STATE_MISS_SPILL_MISS_FETCH_DONE;	  
+	end else begin
+	  NextState = STATE_MISS_SPILL_MISS_FETCH_WDV;
+	end
+      end
+      STATE_MISS_SPILL_MISS_FETCH_DONE: begin
+	PCMux = 2'b10;
+	ICacheMemWriteEnable = 1'b1;
+	NextState = STATE_MISS_SPILL_MERGE;
+      end
+      STATE_MISS_SPILL_MERGE: begin
+	PCMux = 2'b10;
+	UnalignedSelect = 1'b1;
+        NextState = STATE_READY;
+      end
+      default: begin
+	PCMux = 2'b01;
+	NextState = STATE_READY;
+      end
+      // *** add in error handling and invalidate/evict
+    endcase
+  end
+
+  // fsm outputs
+  // stall CPU any time we are not in the ready state.  any other state means the
+  // cache is either requesting data from the memory interface or handling a
+  // spill over two cycles.
+  assign ICacheStallF = (CurrState != STATE_READY) | reset_q ? 1'b1 : 1'b0;
+  // save the PC anytime we are in the ready state. The saved value will be used as the PC may not be stable.
+  assign SavePC = CurrState == STATE_READY ? 1'b1 : 1'b0;
+  assign CntEn = PreCntEn & InstrAckF;
+
+  // to compute the fetch address we need to add the bit shifted
+  // counter output to the address.
+
+  flopenr #(LOGWPL+1) 
+  FetchCountReg(.clk(clk),
+		.reset(reset | CntReset),
+		.en(CntEn),
+		.d(NextFetchCount),
+		.q(FetchCount));
+
+  assign NextFetchCount = FetchCount + 1'b1;
+  
+  // Build the byte address of the word currently being fetched over the XLEN-wide AHB interface.
+  // PCPTrunkF is the PC with the OFFSETWIDTH block-offset bits removed.  Appending LOGWPL zeros
+  // turns it into the index of the first XLEN-bit word of the block, and adding FetchCount
+  // selects the word being requested.  Appending the remaining OFFSETWIDTH-LOGWPL zeros
+  // (the byte offset within one XLEN-bit word) converts the word index back to a full-width byte address.
+  assign InstrPAdrF = {{PCPTrunkF, {{LOGWPL}{1'b0}}} + FetchCount, {{OFFSETWIDTH-LOGWPL}{1'b0}}};
+
+
+  // store read data from memory interface before writing into SRAM: one XLEN-bit register per word of the cache line.
+  genvar i;
+  generate
+    for (i = 0; i < WORDSPERLINE; i++) begin // one register per word in the line (LINESIZE/XLEN), not per byte of a bus word
+      flopenr #(`XLEN) flop(.clk(clk),
+			    .reset(reset),
+			    .en(InstrAckF & (i == FetchCount)), // capture only into the slot indexed by the fetch counter
+			    .d(InstrInF),
+			    .q(ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN])); // word i of the line to be written
+    end
+  endgenerate
+
+  // what address is used to write the SRAM?
+  
+
+  // spills require storing the first cache block so it can merged
+  // with the second
+  // can optimize size, for now just make it the size of the data
+  // leaving the cache memory. 
+  flopenr #(16) SpillInstrReg(.clk(clk),
+			      .en(spillSave),
+			      .reset(reset),
+			      .d(ICacheMemReadData[15:0]),
+			      .q(SpillDataBlock0));
+
+  // Pick the 32-bit instruction out of the cache read data using the halfword-select bits of PCPreFinalF (the PC before the spill mux).
+  generate
+    if( `XLEN == 32) begin
+      logic [1:1] PCPreFinalF_q; // NOTE(review): this registered copy is never read here; the select below uses PCPreFinalF directly -- confirm intent
+      flop #(1) PCFReg(.clk(clk),
+		       .d(PCPreFinalF[1]),
+		       .q(PCPreFinalF_q[1]));
+      assign FinalInstrRawF = PCPreFinalF[1] ? {SpillDataBlock0, ICacheMemReadData[31:16]} : ICacheMemReadData; // PC[1] set: saved spill halfword concatenated with the upper read halfword
+    end else begin
+      logic [2:1] PCPreFinalF_q; // NOTE(review): registered copy is never read here; the mux select uses PCPreFinalF[2:1] directly -- confirm intent
+      flop #(2) PCFReg(.clk(clk),
+		       .d(PCPreFinalF[2:1]),
+		       .q(PCPreFinalF_q[2:1]));
+      mux4 #(32) AlignmentMux(.d0(ICacheMemReadData[31:0]),
+			      .d1(ICacheMemReadData[47:16]),
+			      .d2(ICacheMemReadData[63:32]),
+			      .d3({SpillDataBlock0, ICacheMemReadData[63:48]}), // saved spill halfword concatenated with the top read halfword
+			      .s(PCPreFinalF[2:1]),
+			      .y(FinalInstrRawF));
+    end
+  endgenerate
+
+  // There is a frustrating issue on the first access.
+  // The cache will not contain any valid data but will contain x's on
+  // reset. This makes FinalInstrRawF invalid.  On the first cycle out of
+  // reset this register will pickup this x and it will propagate throughout
+  // the cpu causing simulation failure, most likely a trap for invalid instruction.
+  // Reset must be held 1 cycle longer to prevent this issue. additionally the
+  // reset should be to a NOP rather than 0.
+
+  // register reset
+  flop #(1) resetReg (.clk(clk),
+		      .d(reset),
+		      .q(reset_q));
+  
+  flopenl #(32) AlignedInstrRawDFlop(clk, reset | reset_q, ~StallD, FinalInstrRawF, NOP, AlignedInstrRawD);
+  mux2    #(32) InstrRawDMux(AlignedInstrRawD, NOP, FlushD, InstrRawD);
+  
+  assign {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr} = PCPFinalF;
+
+  assign ICacheMemWritePAdr = PCPFinalF;
+
+  
+  
 endmodule
diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv
index 5a2d1b420..58b144f5c 100644
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@@ -77,6 +77,8 @@ module ifu (
   logic [31:0]      nop = 32'h00000013; // instruction for NOP
   // *** send this to the trap unit
   logic             ITLBPageFaultF;
+  logic 	    reset_q; // *** look at this later.
+  
 
   tlb #(3) itlb(.TLBAccess(1'b1), .VirtualAddress(PCF),
                 .PageTableEntryWrite(PageTableEntryF), .PageTypeWrite(PageTypeF),
@@ -87,7 +89,7 @@ module ifu (
 
   // branch predictor signals
   logic 	   SelBPPredF;
-  logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F;
+  logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F, PCNext2F;
   logic [3:0] 	    InstrClassD, InstrClassE;
   
 
@@ -98,10 +100,10 @@ module ifu (
 
   // jarred 2021-03-14 Add instrution cache block to remove rd2
   assign PCNextPF = PCNextF; // Temporary workaround until iTLB is live
-  icache ic(
+  icache icache(
     .*,
-    .UpperPCPF(PCPF[`XLEN-1:12]),
-    .LowerPCF(PCF[11:0])
+    .UpperPCNextPF(PCNextPF[`XLEN-1:12]),
+    .LowerPCNextF(PCNextPF[11:0])
   );
 
   assign PrivilegedChangePCM = RetM | TrapM;
@@ -120,7 +122,17 @@ module ifu (
   mux2 #(`XLEN) pcmux2(.d0(PCNext1F),
 		       .d1(PrivilegedNextPCM),
 		       .s(PrivilegedChangePCM),
+		       .y(PCNext2F));
+
+  mux2 #(`XLEN) pcmux3(.d0(PCNext2F),
+		       .d1(`RESET_VECTOR), // next PC is forced to the reset vector while reset_q is set
+		       .s(reset_q),
 		       .y(UnalignedPCNextF));
+
+  flop #(1) resetReg (.clk(clk), // reset_q is reset passed through this register (presumably a one-cycle delay -- confirm against flop)
+		      .d(reset),
+		      .q(reset_q));
+  
   
   assign  PCNextF = {UnalignedPCNextF[`XLEN-1:1], 1'b0}; // hart-SPEC p. 21 about 16-bit alignment
   flopenl #(`XLEN) pcreg(clk, reset, ~StallF & ~ICacheStallF, PCNextF, `RESET_VECTOR, PCF);
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index 2d11bcc85..bd51596d1 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -447,7 +447,7 @@ module testbench();
 
   // Track names of instructions
   instrTrackerTB it(clk, reset, dut.hart.ieu.dp.FlushE,
-                dut.hart.ifu.ic.controller.AlignedInstrRawF,
+                dut.hart.ifu.icache.controller.FinalInstrRawF,
                 dut.hart.ifu.InstrD, dut.hart.ifu.InstrE,
                 dut.hart.ifu.InstrM,  dut.hart.ifu.InstrW,
                 InstrFName, InstrDName, InstrEName, InstrMName, InstrWName);