/////////////////////////////////////////// // icache.sv // // Written: ross1728@gmail.com June 04, 2021 // Modified: // // Purpose: I Cache controller // // A component of the Wally configurable RISC-V project. // // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University // // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software // is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
///////////////////////////////////////////

`include "wally-config.vh"

// ICacheCntrl
//
// Finite-state controller for the instruction cache.  It sequences reads of
// the cache memory, fetches of a missing block over the AHB-lite interface
// (one XLEN word per InstrAckF), the write of the fetched block into the
// cache memory, and the two-step read needed when a 32-bit instruction
// "spills" across a cache-block boundary.
//
// Parameter:
//   BLOCKLEN - cache block length in bits.  All offset widths, the fetch
//              counter and the spill detection are derived from it.
//
// NOTE(review): the output EndFetchState is declared but is not driven
// anywhere in this module.  StallD and FlushD are accepted but not read.
module ICacheCntrl #(parameter BLOCKLEN = 256)
  (
   // Inputs from pipeline
   input  logic                 clk, reset,
   input  logic                 StallF, StallD,
   input  logic                 FlushD,

   // Input the address to read
   // The upper bits of the physical pc
   input  logic [`PA_BITS-1:0]  PCNextF,
   input  logic [`PA_BITS-1:0]  PCPF,
   // Signals to/from cache memory
   // The read coming out of it
   input  logic [31:0]          ICacheMemReadData,
   input  logic                 ICacheMemReadValid,
   // The address at which we want to search the cache memory
   output logic [`PA_BITS-1:0]  PCTagF,
   output logic [`PA_BITS-1:0]  PCNextIndexF,
   output logic                 ICacheReadEn,
   // Load data into the cache
   output logic                 ICacheMemWriteEnable,
   output logic [BLOCKLEN-1:0]  ICacheMemWriteData,

   // Outputs to rest of ifu
   // High if the instruction in the fetch stage is compressed
   output logic                 CompressedF,
   // The instruction that was requested
   // If this instruction is compressed, upper 16 bits may be the next 16 bits or may be zeros
   output logic [31:0]          FinalInstrRawF,

   // Outputs to pipeline control stuff
   output logic                 ICacheStallF, EndFetchState,
   input  logic                 ITLBMissF,
   input  logic                 ITLBWriteF,
   input  logic                 WalkerInstrPageFaultF,

   // Signals to/from ahblite interface
   // A read containing the requested data
   input  logic [`XLEN-1:0]     InstrInF,
   input  logic                 InstrAckF,
   // The read we request from main memory
   output logic [`PA_BITS-1:0]  InstrPAdrF,
   output logic                 InstrReadF
   );

  // FSM states
  localparam STATE_READY = 'h0;
  localparam STATE_HIT_SPILL = 'h1; // spill, block 0 hit
  localparam STATE_HIT_SPILL_MISS_FETCH_WDV = 'h2; // block 1 miss, issue read to AHB and wait data.
  localparam STATE_HIT_SPILL_MISS_FETCH_DONE = 'h3; // write data into SRAM/LUT
  localparam STATE_HIT_SPILL_MERGE = 'h4; // Read block 0 of CPU access, should be able to optimize into STATE_HIT_SPILL.

  // A challenge is that the spill signal gets us out of the ready state and moves us to
  // 1 of the 2 spill branches.  However the original fsm design had us return to
  // the ready state when the spill + hits/misses were fully resolved.  The problem
  // is the spill signal is based on PCPF, so when we return to READY to check if the
  // cache has a hit it still expresses spill.  We can fix this in 1 of two ways.
  // 1. We can add 1 extra state at the end of each spill branch to return the instruction
  //    to the CPU, advancing the CPU and icache to the next instruction.
  // 2. We can assert a signal which is delayed 1 cycle to suppress the spill when we get
  //    to the READY state.
  // The first option is more robust and increases the number of states by 2.  The
  // second option seems like it should work, but I worry there is a hidden interaction
  // between CPU stalling and that register.
  // Picking option 1.

  localparam STATE_HIT_SPILL_FINAL = 'h5; // this state replicates STATE_READY's replay of the
  // spill access but does not consider spill.  It also does not do another operation.

  localparam STATE_MISS_FETCH_WDV = 'h6; // aligned miss, issue read to AHB and wait for data.
  localparam STATE_MISS_FETCH_DONE = 'h7; // write data into SRAM/LUT
  localparam STATE_MISS_READ = 'h8; // read block 1 from SRAM/LUT

  localparam STATE_MISS_SPILL_FETCH_WDV = 'h9; // spill, miss on block 0, issue read to AHB and wait
  localparam STATE_MISS_SPILL_FETCH_DONE = 'ha; // write data into SRAM/LUT
  localparam STATE_MISS_SPILL_READ1 = 'hb; // read block 0 from SRAM/LUT
  localparam STATE_MISS_SPILL_2 = 'hc; // return to ready if hit or do second block update.
  localparam STATE_MISS_SPILL_2_START = 'hd; // return to ready if hit or do second block update.
  localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 'he; // miss on block 1, issue read to AHB and wait
  localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 'hf; // write data to SRAM/LUT
  localparam STATE_MISS_SPILL_MERGE = 'h10; // read block 0 of CPU access,

  localparam STATE_MISS_SPILL_FINAL = 'h11; // this state replicates STATE_READY's replay of the
  // spill access but does not consider spill.  It also does not do another operation.

  localparam STATE_INVALIDATE = 'h12; // *** not sure if invalidate or evict? invalidate by cache block or address?
  localparam STATE_TLB_MISS = 'h13;
  localparam STATE_TLB_MISS_DONE = 'h14;

  localparam STATE_INSTR_PAGE_FAULT = 'h15;

  // Geometry derived from XLEN (AHB word width) and BLOCKLEN (cache block width).
  localparam AHBByteLength = `XLEN / 8;
  localparam AHBOFFETWIDTH = $clog2(AHBByteLength);

  localparam BlockByteLength = BLOCKLEN / 8;
  localparam OFFSETWIDTH = $clog2(BlockByteLength);

  localparam WORDSPERLINE = BLOCKLEN/`XLEN;
  localparam LOGWPL = $clog2(WORDSPERLINE);

  localparam integer PA_WIDTH = `PA_BITS - 2;

  logic [4:0]                     CurrState, NextState;
  logic                           hit, spill;
  // NOTE(review): SavePC and UnalignedSelect are assigned by the FSM below
  // but are not read anywhere else in this module.
  logic                           SavePC;
  logic [1:0]                     PCMux;
  logic                           CntReset;
  logic                           PreCntEn, CntEn;
  logic                           spillSave;
  logic                           UnalignedSelect;
  logic                           FetchCountFlag;
  localparam FetchCountThreshold = WORDSPERLINE - 1;

  logic [LOGWPL-1:0]              FetchCount, NextFetchCount;

  logic [`PA_BITS-1:0]            PCPreFinalF, PCPSpillF;
  logic [`PA_BITS-1:OFFSETWIDTH]  PCPTrunkF;

  logic [15:0]                    SpillDataBlock0;

  localparam [31:0] NOP = 32'h13;

  logic                           reset_q;
  logic [1:0]                     PCMux_q;

  // Misaligned signals
  //logic [`XLEN:0] MisalignedInstrRawF;
  //logic           MisalignedStall;
  // Cache fault signals
  //logic           FaultStall;

  // On spill we want to get the first 2 bytes of the next cache block.
  // The spill only occurs if PCPF mod BlockByteLength == -2.  Therefore we can
  // simply add 2 to land on the next cache block.
  assign PCPSpillF = PCPF + {{{PA_WIDTH}{1'b0}}, 2'b10}; // *** modelsim does not allow the use of PA_BITS for literal width.

  // Now we have to select between these three PCs.
  // Parentheses make the (already effective) precedence explicit: the OR is
  // the select condition of the conditional operator.
  assign PCPreFinalF = (PCMux[0] | StallF) ? PCPF : PCNextF; // *** don't like the stallf, but it is necessary
  assign PCNextIndexF = PCMux[1] ? PCPSpillF : PCPreFinalF;

  // This mux needs to be delayed 1 cycle as it occurs 1 pipeline stage later.
  // *** read enable may not be necessary.
  flopenr #(2) PCMuxReg(.clk(clk),
                        .reset(reset),
                        .en(ICacheReadEn),
                        .d(PCMux),
                        .q(PCMux_q));

  assign PCTagF = PCMux_q[1] ? PCPSpillF : PCPF;

  // Truncate the offset from PCPF for memory address generation
  assign PCPTrunkF = PCTagF[`PA_BITS-1:OFFSETWIDTH];

  // Detect if the instruction is compressed
  assign CompressedF = FinalInstrRawF[1:0] != 2'b11;

  // The FSM is always running, do not stall.
  flopr #(5) stateReg(.clk(clk),
                      .reset(reset),
                      .d(NextState),
                      .q(CurrState));

  // A spill happens when the fetch starts at the last halfword of a cache
  // block, i.e. every block-offset bit above bit 0 is set.  Deriving the bit
  // range from OFFSETWIDTH keeps this correct for any BLOCKLEN; for the
  // default BLOCKLEN = 256 it is exactly PCPF[4:1] == 4'b1111.
  assign spill = &PCPF[OFFSETWIDTH-1:1];
  assign hit = ICacheMemReadValid; // note ICacheMemReadValid is hit.
  // High while the fetch counter points at the last word of the block.
  assign FetchCountFlag = (FetchCount == FetchCountThreshold[LOGWPL-1:0]);

  // Next state logic.
  // Defaults are assigned first; each state overrides only what it needs.
  // ICacheStallF defaults to 1 (stall) and is dropped only in the states
  // that explicitly clear it.
  always_comb begin
    UnalignedSelect = 1'b0;
    CntReset = 1'b0;
    PreCntEn = 1'b0;
    //InstrReadF = 1'b0;
    ICacheMemWriteEnable = 1'b0;
    spillSave = 1'b0;
    PCMux = 2'b00;
    ICacheReadEn = 1'b0;
    SavePC = 1'b0;
    ICacheStallF = 1'b1;

    case (CurrState)
      STATE_READY: begin
        PCMux = 2'b00;
        ICacheReadEn = 1'b1;
        if (ITLBMissF) begin
          NextState = STATE_TLB_MISS;
        end else if (hit & ~spill) begin
          SavePC = 1'b1;
          ICacheStallF = 1'b0;
          NextState = STATE_READY;
        end else if (hit & spill) begin
          spillSave = 1'b1;
          PCMux = 2'b10;
          NextState = STATE_HIT_SPILL;
        end else if (~hit & ~spill) begin
          CntReset = 1'b1;
          NextState = STATE_MISS_FETCH_WDV;
        end else if (~hit & spill) begin
          CntReset = 1'b1;
          PCMux = 2'b01;
          NextState = STATE_MISS_SPILL_FETCH_WDV;
        end else begin
          NextState = STATE_READY;
        end
      end

      // branch 1, hit spill and 2, miss spill hit
      STATE_HIT_SPILL: begin
        PCMux = 2'b10;
        UnalignedSelect = 1'b1;
        ICacheReadEn = 1'b1;
        if (hit) begin
          NextState = STATE_HIT_SPILL_FINAL;
        end else begin
          CntReset = 1'b1;
          NextState = STATE_HIT_SPILL_MISS_FETCH_WDV;
        end
      end
      STATE_HIT_SPILL_MISS_FETCH_WDV: begin
        PCMux = 2'b10;
        //InstrReadF = 1'b1;
        PreCntEn = 1'b1;
        if (FetchCountFlag & InstrAckF) begin
          NextState = STATE_HIT_SPILL_MISS_FETCH_DONE;
        end else begin
          NextState = STATE_HIT_SPILL_MISS_FETCH_WDV;
        end
      end
      STATE_HIT_SPILL_MISS_FETCH_DONE: begin
        PCMux = 2'b10;
        ICacheMemWriteEnable = 1'b1;
        NextState = STATE_HIT_SPILL_MERGE;
      end
      STATE_HIT_SPILL_MERGE: begin
        PCMux = 2'b10;
        UnalignedSelect = 1'b1;
        ICacheReadEn = 1'b1;
        NextState = STATE_HIT_SPILL_FINAL;
      end
      STATE_HIT_SPILL_FINAL: begin
        ICacheReadEn = 1'b1;
        PCMux = 2'b00;
        UnalignedSelect = 1'b1;
        SavePC = 1'b1;
        NextState = STATE_READY;
        ICacheStallF = 1'b0;
      end

      // branch 3 miss no spill
      STATE_MISS_FETCH_WDV: begin
        PCMux = 2'b01;
        //InstrReadF = 1'b1;
        PreCntEn = 1'b1;
        if (FetchCountFlag & InstrAckF) begin
          NextState = STATE_MISS_FETCH_DONE;
        end else begin
          NextState = STATE_MISS_FETCH_WDV;
        end
      end
      STATE_MISS_FETCH_DONE: begin
        PCMux = 2'b01;
        ICacheMemWriteEnable = 1'b1;
        NextState = STATE_MISS_READ;
      end
      STATE_MISS_READ: begin
        PCMux = 2'b01;
        ICacheReadEn = 1'b1;
        NextState = STATE_READY;
      end

      // branch 4 miss spill hit, and 5 miss spill miss
      STATE_MISS_SPILL_FETCH_WDV: begin
        PCMux = 2'b01;
        PreCntEn = 1'b1;
        //InstrReadF = 1'b1;
        if (FetchCountFlag & InstrAckF) begin
          NextState = STATE_MISS_SPILL_FETCH_DONE;
        end else begin
          NextState = STATE_MISS_SPILL_FETCH_WDV;
        end
      end
      STATE_MISS_SPILL_FETCH_DONE: begin
        PCMux = 2'b01;
        ICacheMemWriteEnable = 1'b1;
        NextState = STATE_MISS_SPILL_READ1;
      end
      STATE_MISS_SPILL_READ1: begin // always be a hit as we just wrote that cache block.
        PCMux = 2'b01; // there is a 1 cycle delay after setting the address before the data arrives.
        ICacheReadEn = 1'b1;
        NextState = STATE_MISS_SPILL_2;
      end
      STATE_MISS_SPILL_2: begin
        PCMux = 2'b10;
        UnalignedSelect = 1'b1;
        spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm.
        ICacheReadEn = 1'b1;
        NextState = STATE_MISS_SPILL_2_START;
      end
      STATE_MISS_SPILL_2_START: begin
        if (~hit) begin
          CntReset = 1'b1;
          NextState = STATE_MISS_SPILL_MISS_FETCH_WDV;
        end else begin
          NextState = STATE_READY;
          ICacheReadEn = 1'b1;
          PCMux = 2'b00;
          UnalignedSelect = 1'b1;
          SavePC = 1'b1;
          ICacheStallF = 1'b0;
        end
      end
      STATE_MISS_SPILL_MISS_FETCH_WDV: begin
        PCMux = 2'b10;
        PreCntEn = 1'b1;
        //InstrReadF = 1'b1;
        if (FetchCountFlag & InstrAckF) begin
          NextState = STATE_MISS_SPILL_MISS_FETCH_DONE;
        end else begin
          NextState = STATE_MISS_SPILL_MISS_FETCH_WDV;
        end
      end
      STATE_MISS_SPILL_MISS_FETCH_DONE: begin
        PCMux = 2'b10;
        ICacheMemWriteEnable = 1'b1;
        NextState = STATE_MISS_SPILL_MERGE;
      end
      STATE_MISS_SPILL_MERGE: begin
        PCMux = 2'b10;
        UnalignedSelect = 1'b1;
        ICacheReadEn = 1'b1;
        NextState = STATE_MISS_SPILL_FINAL;
      end
      STATE_MISS_SPILL_FINAL: begin
        ICacheReadEn = 1'b1;
        PCMux = 2'b00;
        UnalignedSelect = 1'b1;
        SavePC = 1'b1;
        ICacheStallF = 1'b0;
        NextState = STATE_READY;
      end

      // TLB miss handling: wait here until the walker reports a page fault
      // or the ITLB is written.
      STATE_TLB_MISS: begin
        if (WalkerInstrPageFaultF) begin
          NextState = STATE_INSTR_PAGE_FAULT;
          ICacheStallF = 1'b0;
        end else if (ITLBWriteF) begin
          NextState = STATE_TLB_MISS_DONE;
        end else begin
          NextState = STATE_TLB_MISS;
        end
      end
      STATE_TLB_MISS_DONE: begin
        NextState = STATE_READY;
      end
      STATE_INSTR_PAGE_FAULT: begin
        ICacheStallF = 1'b0;
        NextState = STATE_READY;
      end
      // Any other encoding (including STATE_INVALIDATE, which has no case
      // item of its own) returns to READY.
      default: begin
        PCMux = 2'b01;
        NextState = STATE_READY;
      end
      // *** add in error handling and invalidate/evict
    endcase
  end

  // The fetch counter advances only on an acknowledged bus read.
  assign CntEn = PreCntEn & InstrAckF;

  // Request a bus read while in any of the four "fetch, wait data valid" states.
  assign InstrReadF = (CurrState == STATE_HIT_SPILL_MISS_FETCH_WDV) ||
                      (CurrState == STATE_MISS_FETCH_WDV) ||
                      (CurrState == STATE_MISS_SPILL_FETCH_WDV) ||
                      (CurrState == STATE_MISS_SPILL_MISS_FETCH_WDV);

  // To compute the fetch address we need to add the bit-shifted
  // counter output to the address.
  flopenr #(LOGWPL)
  FetchCountReg(.clk(clk),
                .reset(reset | CntReset),
                .en(CntEn),
                .d(NextFetchCount),
                .q(FetchCount));

  assign NextFetchCount = FetchCount + 1'b1;

  // This part is confusing.
  // *** Ross Thompson reduce the complexity.
  // We need to remove the offset bits (PCPTrunkF).  Because the AHB interface is XLEN wide
  // we need to address on that number of bits, so the PC is extended to the right by AHBByteLength with zeros.
  // The fetch count is already aligned to AHBByteLength, but we need to extend back to the full address width with
  // more zeros after the addition.  This will be the number of offset bits less the AHBByteLength.
  logic [`PA_BITS-1:OFFSETWIDTH-LOGWPL] PCPTrunkExtF, InstrPAdrTrunkF ;

  assign PCPTrunkExtF = {PCPTrunkF, {{LOGWPL}{1'b0}}};
  // verilator lint_off WIDTH
  assign InstrPAdrTrunkF = PCPTrunkExtF + FetchCount;
  // verilator lint_on WIDTH

  //assign InstrPAdrF = {{PCPTrunkF, {{LOGWPL}{1'b0}}} + FetchCount, {{OFFSETWIDTH-LOGWPL}{1'b0}}};
  assign InstrPAdrF = {InstrPAdrTrunkF, {{OFFSETWIDTH-LOGWPL}{1'b0}}};

  // Store read data from the memory interface before writing into SRAM.
  // Word i of the block is captured when the bus acknowledges and the fetch
  // counter equals i.
  genvar i;
  generate
    for (i = 0; i < WORDSPERLINE; i++) begin:storebuffer
      flopenr #(`XLEN) sb(.clk(clk),
                          .reset(reset),
                          .en(InstrAckF & (i == FetchCount)),
                          .d(InstrInF),
                          .q(ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]));
    end
  endgenerate

  // What address is used to write the SRAM?

  // Spills require storing the first cache block so it can be merged
  // with the second.
  // Can optimize size, for now just make it the size of the data
  // leaving the cache memory.
  flopenr #(16) SpillInstrReg(.clk(clk),
                              .en(spillSave),
                              .reset(reset),
                              .d(ICacheMemReadData[15:0]),
                              .q(SpillDataBlock0));

  // Use the not quite final PC to do the final selection.
  // NOTE(review): PCPreFinalF_q is registered here but not read in this module.
  logic [1:1] PCPreFinalF_q;
  flopenr #(1) PCFReg(.clk(clk),
                      .reset(reset),
                      .en(~StallF),
                      .d(PCPreFinalF[1]),
                      .q(PCPreFinalF_q[1]));

  // On a spill the instruction is the saved low halfword of the first read
  // (lower 16 bits) merged with the low halfword of the current read (upper
  // 16 bits); otherwise the read data is passed through unchanged.
  assign FinalInstrRawF = spill ? {ICacheMemReadData[15:0], SpillDataBlock0} : ICacheMemReadData;

  // There is a frustrating issue on the first access.
  // The cache will not contain any valid data but will contain x's on
  // reset.  This makes FinalInstrRawF invalid.  On the first cycle out of
  // reset this register will pick up this x and it will propagate throughout
  // the cpu causing simulation failure, most likely a trap for invalid instruction.
  // Reset must be held 1 cycle longer to prevent this issue.  Additionally the
  // reset should be to a NOP rather than 0.

  // register reset
  // NOTE(review): reset_q and the NOP localparam are not used elsewhere in this module.
  flop #(1) resetReg (.clk(clk),
                      .d(reset),
                      .q(reset_q));

endmodule