cvw/wally-pipelined/src/ifu/icache.sv

510 lines
17 KiB
Systemverilog
Raw Normal View History

///////////////////////////////////////////
// icache.sv
//
// Written: jaallen@g.hmc.edu 2021-03-02
// Modified:
//
// Purpose: Cache instructions for the ifu so it can access memory less often, saving cycles
//
// A component of the Wally configurable RISC-V project.
//
// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
///////////////////////////////////////////
`include "wally-config.vh"
module icache(
// Basic pipeline stuff
input logic clk, reset,
input logic StallF, StallD,
input logic FlushD,
input logic [`XLEN-1:0] PCNextF,
input logic [`XLEN-1:0] PCPF,
// Data read in from the ebu unit
input logic [`XLEN-1:0] InstrInF,
input logic InstrAckF,
// Read requested from the ebu unit
output logic [`XLEN-1:0] InstrPAdrF,
output logic InstrReadF,
// High if the instruction currently in the fetch stage is compressed
output logic CompressedF,
// High if the icache is requesting a stall
output logic ICacheStallF,
// The raw (not decompressed) instruction that was requested
// If this instruction is compressed, upper 16 bits may be the next 16 bits or may be zeros
output logic [31:0] InstrRawD
);
// Configuration parameters
// TODO Move these to a config file
localparam integer ICACHELINESIZE = 256;
localparam integer ICACHENUMLINES = 512;
// Input signals to cache memory
logic FlushMem;
logic ICacheMemWriteEnable;
logic [ICACHELINESIZE-1:0] ICacheMemWriteData;
2021-04-14 23:03:33 +00:00
logic EndFetchState;
2021-05-03 19:36:09 +00:00
logic [`XLEN-1:0] PCTagF, PCNextIndexF;
// Output signals from cache memory
2021-04-26 11:43:16 +00:00
logic [31:0] ICacheMemReadData;
logic ICacheMemReadValid;
logic ICacheReadEn;
rodirectmappedmemre #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES), .WORDSIZE(`XLEN))
cachemem(
.*,
2021-04-15 03:14:59 +00:00
// Stall it if the pipeline is stalled, unless we're stalling it and we're ending our stall
.flush(FlushMem),
.WriteEnable(ICacheMemWriteEnable),
.WriteLine(ICacheMemWriteData),
.DataWord(ICacheMemReadData),
.DataValid(ICacheMemReadValid)
);
icachecontroller #(.LINESIZE(ICACHELINESIZE)) controller(.*);
2021-04-13 05:06:57 +00:00
// For now, assume no writes to executable memory
2021-03-24 20:56:44 +00:00
assign FlushMem = 1'b0;
endmodule
module icachecontroller #(parameter LINESIZE = 256) (
// Inputs from pipeline
input logic clk, reset,
input logic StallF, StallD,
input logic FlushD,
// Input the address to read
// The upper bits of the physical pc
input logic [`XLEN-1:0] PCNextF,
2021-05-03 19:36:09 +00:00
input logic [`XLEN-1:0] PCPF,
// Signals to/from cache memory
// The read coming out of it
2021-04-21 21:47:05 +00:00
input logic [31:0] ICacheMemReadData,
input logic ICacheMemReadValid,
// The address at which we want to search the cache memory
2021-05-03 19:36:09 +00:00
output logic [`XLEN-1:0] PCTagF,
output logic [`XLEN-1:0] PCNextIndexF,
2021-04-21 21:47:05 +00:00
output logic ICacheReadEn,
// Load data into the cache
output logic ICacheMemWriteEnable,
output logic [LINESIZE-1:0] ICacheMemWriteData,
// Outputs to rest of ifu
// High if the instruction in the fetch stage is compressed
output logic CompressedF,
// The instruction that was requested
// If this instruction is compressed, upper 16 bits may be the next 16 bits or may be zeros
output logic [31:0] InstrRawD,
// Outputs to pipeline control stuff
output logic ICacheStallF, EndFetchState,
// Signals to/from ahblite interface
// A read containing the requested data
input logic [`XLEN-1:0] InstrInF,
input logic InstrAckF,
// The read we request from main memory
output logic [`XLEN-1:0] InstrPAdrF,
output logic InstrReadF
);
// FSM states
localparam STATE_READY = 0;
localparam STATE_HIT_SPILL = 1; // spill, block 0 hit
localparam STATE_HIT_SPILL_MISS_FETCH_WDV = 2; // block 1 miss, issue read to AHB and wait data.
localparam STATE_HIT_SPILL_MISS_FETCH_DONE = 3; // write data into SRAM/LUT
localparam STATE_HIT_SPILL_MERGE = 4; // Read block 0 of CPU access, should be able to optimize into STATE_HIT_SPILL.
// a challenge is the spill signal gets us out of the ready state and moves us to
// 1 of the 2 spill branches. However the original fsm design had us return to
// the ready state when the spill + hits/misses were fully resolved. The problem
// is the spill signal is based on PCPF so when we return to READY to check if the
// cache has a hit it still expresses spill. We can fix in 1 of two ways.
// 1. we can add 1 extra state at the end of each spill branch to returns the instruction
// to the CPU advancing the CPU and icache to the next instruction.
// 2. We can assert a signal which is delayed 1 cycle to suppress the spill when we get
// to the READY state.
// The first first option is more robust and increases the number of states by 2. The
// second option is seams like it should work, but I worry there is a hidden interaction
// between CPU stalling and that register.
// Picking option 1.
localparam STATE_HIT_SPILL_FINAL = 5; // this state replicates STATE_READY's replay of the
// spill access but does nto consider spill. It also does not do another operation.
localparam STATE_MISS_FETCH_WDV = 6; // aligned miss, issue read to AHB and wait for data.
localparam STATE_MISS_FETCH_DONE = 7; // write data into SRAM/LUT
localparam STATE_MISS_READ = 8; // read block 1 from SRAM/LUT
localparam STATE_MISS_SPILL_FETCH_WDV = 9; // spill, miss on block 0, issue read to AHB and wait
localparam STATE_MISS_SPILL_FETCH_DONE = 10; // write data into SRAM/LUT
localparam STATE_MISS_SPILL_READ1 = 11; // read block 0 from SRAM/LUT
localparam STATE_MISS_SPILL_2 = 12; // return to ready if hit or do second block update.
localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 13; // miss on block 1, issue read to AHB and wait
localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 14; // write data to SRAM/LUT
localparam STATE_MISS_SPILL_MERGE = 15; // read block 0 of CPU access,
localparam STATE_MISS_SPILL_FINAL = 16; // this state replicates STATE_READY's replay of the
// spill access but does nto consider spill. It also does not do another operation.
localparam STATE_INVALIDATE = 17; // *** not sure if invalidate or evict? invalidate by cache block or address?
localparam AHBByteLength = `XLEN / 8;
localparam AHBOFFETWIDTH = $clog2(AHBByteLength);
localparam BlockByteLength = LINESIZE / 8;
localparam OFFSETWIDTH = $clog2(BlockByteLength);
localparam WORDSPERLINE = LINESIZE/`XLEN;
localparam LOGWPL = $clog2(WORDSPERLINE);
logic [4:0] CurrState, NextState;
logic hit, spill;
logic SavePC;
logic [1:0] PCMux;
logic CntReset;
logic PreCntEn, CntEn;
logic spillSave;
logic UnalignedSelect;
logic FetchCountFlag;
localparam FetchCountThreshold = WORDSPERLINE - 1;
logic [LOGWPL:0] FetchCount, NextFetchCount;
logic [`XLEN-1:0] PCPreFinalF, PCPFinalF, PCSpillF;
logic [`XLEN-1:OFFSETWIDTH] PCPTrunkF;
logic [31:0] FinalInstrRawF;
logic [15:0] SpillDataBlock0;
logic FlushDLastCyclen;
// Happy path signals
logic [31:0] AlignedInstrRawD;
//logic [31:0] AlignedInstrRawF, AlignedInstrRawD;
//logic FlushDLastCycleN;
//logic PCPMisalignedF;
localparam [31:0] NOP = 32'h13;
//logic [`XLEN-1:0] PCPF;
logic reset_q;
2021-05-03 19:36:09 +00:00
logic [1:0] PCMux_q;
// Misaligned signals
//logic [`XLEN:0] MisalignedInstrRawF;
//logic MisalignedStall;
// Cache fault signals
//logic FaultStall;
// on spill we want to get the first 2 bytes of the next cache block.
// the spill only occurs if the PCPF mod BlockByteLength == -2. Therefore we can
// simply add 2 to land on the next cache block.
assign PCSpillF = PCPF + 2'b10;
// now we have to select between these three PCs
assign PCPreFinalF = PCMux[0] | StallF ? PCPF : PCNextF; // *** don't like the stallf, but it is necessary
assign PCPFinalF = PCMux[1] ? PCSpillF : PCPreFinalF;
2021-05-03 19:36:09 +00:00
// this mux needs to be delayed 1 cycle as it occurs 1 pipeline stage later.
// *** read enable may not be necessary.
flopenr #(2) PCMuxReg(.clk(clk),
.reset(reset),
.en(ICacheReadEn),
.d(PCMux),
.q(PCMux_q));
2021-05-03 19:36:09 +00:00
assign PCTagF = PCMux_q[1] ? PCSpillF : PCPF;
assign PCNextIndexF = PCPFinalF;
// truncate the offset from PCPF for memory address generation
assign PCPTrunkF = PCTagF[`XLEN-1:OFFSETWIDTH];
2021-03-25 18:43:10 +00:00
// Detect if the instruction is compressed
assign CompressedF = FinalInstrRawF[1:0] != 2'b11;
2021-04-14 23:03:33 +00:00
// the FSM is always runing, do not stall.
flopr #(5) stateReg(.clk(clk),
.reset(reset),
.d(NextState),
.q(CurrState));
assign spill = PCPF[4:1] == 4'b1111 ? 1'b1 : 1'b0;
assign hit = ICacheMemReadValid; // note ICacheMemReadValid is hit.
assign FetchCountFlag = FetchCount == FetchCountThreshold;
// Next state logic
always_comb begin
UnalignedSelect = 1'b0;
CntReset = 1'b0;
PreCntEn = 1'b0;
//InstrReadF = 1'b0;
ICacheMemWriteEnable = 1'b0;
spillSave = 1'b0;
PCMux = 2'b00;
ICacheReadEn = 1'b0;
SavePC = 1'b0;
ICacheStallF = 1'b1;
case (CurrState)
STATE_READY: begin
PCMux = 2'b00;
ICacheReadEn = 1'b1;
if (hit & ~spill) begin
SavePC = 1'b1;
ICacheStallF = 1'b0;
NextState = STATE_READY;
end else if (hit & spill) begin
spillSave = 1'b1;
PCMux = 2'b10;
NextState = STATE_HIT_SPILL;
end else if (~hit & ~spill) begin
CntReset = 1'b1;
NextState = STATE_MISS_FETCH_WDV;
end else if (~hit & spill) begin
CntReset = 1'b1;
PCMux = 2'b01;
NextState = STATE_MISS_SPILL_FETCH_WDV;
end else begin
NextState = STATE_READY;
end
end
// branch 1, hit spill and 2, miss spill hit
STATE_HIT_SPILL: begin
PCMux = 2'b10;
UnalignedSelect = 1'b1;
ICacheReadEn = 1'b1;
if (hit) begin
NextState = STATE_HIT_SPILL_FINAL;
end else begin
CntReset = 1'b1;
NextState = STATE_HIT_SPILL_MISS_FETCH_WDV;
end
end
STATE_HIT_SPILL_MISS_FETCH_WDV: begin
PCMux = 2'b10;
//InstrReadF = 1'b1;
PreCntEn = 1'b1;
if (FetchCountFlag & InstrAckF) begin
NextState = STATE_HIT_SPILL_MISS_FETCH_DONE;
end else begin
NextState = STATE_HIT_SPILL_MISS_FETCH_WDV;
end
end
STATE_HIT_SPILL_MISS_FETCH_DONE: begin
PCMux = 2'b10;
ICacheMemWriteEnable = 1'b1;
NextState = STATE_HIT_SPILL_MERGE;
end
STATE_HIT_SPILL_MERGE: begin
PCMux = 2'b10;
UnalignedSelect = 1'b1;
ICacheReadEn = 1'b1;
NextState = STATE_HIT_SPILL_FINAL;
end
STATE_HIT_SPILL_FINAL: begin
ICacheReadEn = 1'b1;
PCMux = 2'b00;
UnalignedSelect = 1'b1;
SavePC = 1'b1;
NextState = STATE_READY;
ICacheStallF = 1'b0;
end
// branch 3 miss no spill
STATE_MISS_FETCH_WDV: begin
PCMux = 2'b01;
//InstrReadF = 1'b1;
PreCntEn = 1'b1;
if (FetchCountFlag & InstrAckF) begin
NextState = STATE_MISS_FETCH_DONE;
end else begin
NextState = STATE_MISS_FETCH_WDV;
end
end
STATE_MISS_FETCH_DONE: begin
PCMux = 2'b01;
ICacheMemWriteEnable = 1'b1;
NextState = STATE_MISS_READ;
end
STATE_MISS_READ: begin
PCMux = 2'b01;
ICacheReadEn = 1'b1;
NextState = STATE_READY;
end
// branch 4 miss spill hit, and 5 miss spill miss
STATE_MISS_SPILL_FETCH_WDV: begin
PCMux = 2'b01;
PreCntEn = 1'b1;
//InstrReadF = 1'b1;
if (FetchCountFlag & InstrAckF) begin
NextState = STATE_MISS_SPILL_FETCH_DONE;
end else begin
NextState = STATE_MISS_SPILL_FETCH_WDV;
end
end
STATE_MISS_SPILL_FETCH_DONE: begin
PCMux = 2'b01;
ICacheMemWriteEnable = 1'b1;
NextState = STATE_MISS_SPILL_READ1;
end
STATE_MISS_SPILL_READ1: begin // always be a hit as we just wrote that cache block.
PCMux = 2'b01; // there is a 1 cycle delay after setting the address before the date arrives.
ICacheReadEn = 1'b1;
NextState = STATE_MISS_SPILL_2;
end
STATE_MISS_SPILL_2: begin
PCMux = 2'b10;
UnalignedSelect = 1'b1;
spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm.
if (~hit) begin
CntReset = 1'b1;
NextState = STATE_MISS_SPILL_MISS_FETCH_WDV;
end else begin
NextState = STATE_MISS_SPILL_FINAL;
end
end
STATE_MISS_SPILL_MISS_FETCH_WDV: begin
PCMux = 2'b10;
PreCntEn = 1'b1;
//InstrReadF = 1'b1;
if (FetchCountFlag & InstrAckF) begin
NextState = STATE_MISS_SPILL_MISS_FETCH_DONE;
end else begin
NextState = STATE_MISS_SPILL_MISS_FETCH_WDV;
end
end
STATE_MISS_SPILL_MISS_FETCH_DONE: begin
PCMux = 2'b10;
ICacheMemWriteEnable = 1'b1;
NextState = STATE_MISS_SPILL_MERGE;
end
STATE_MISS_SPILL_MERGE: begin
PCMux = 2'b10;
UnalignedSelect = 1'b1;
ICacheReadEn = 1'b1;
NextState = STATE_MISS_SPILL_FINAL;
end
STATE_MISS_SPILL_FINAL: begin
ICacheReadEn = 1'b1;
PCMux = 2'b00;
UnalignedSelect = 1'b1;
SavePC = 1'b1;
ICacheStallF = 1'b0;
NextState = STATE_READY;
end
default: begin
PCMux = 2'b01;
NextState = STATE_READY;
end
// *** add in error handling and invalidate/evict
endcase
end
assign CntEn = PreCntEn & InstrAckF;
assign InstrReadF = (CurrState == STATE_HIT_SPILL_MISS_FETCH_WDV) ||
(CurrState == STATE_MISS_FETCH_WDV) ||
(CurrState == STATE_MISS_SPILL_FETCH_WDV) ||
(CurrState == STATE_MISS_SPILL_MISS_FETCH_WDV);
// to compute the fetch address we need to add the bit shifted
// counter output to the address.
flopenr #(LOGWPL+1)
FetchCountReg(.clk(clk),
.reset(reset | CntReset),
.en(CntEn),
.d(NextFetchCount),
.q(FetchCount));
assign NextFetchCount = FetchCount + 1'b1;
// This part is confusing.
// we need to remove the offset bits (PCPTrunkF). Because the AHB interface is XLEN wide
// we need to address on that number of bits so the PC is extended to the right by AHBByteLength with zeros.
// fetch count is already aligned to AHBByteLength, but we need to extend back to the full address width with
// more zeros after the addition. This will be the number of offset bits less the AHBByteLength.
assign InstrPAdrF = {{PCPTrunkF, {{LOGWPL}{1'b0}}} + FetchCount, {{OFFSETWIDTH-LOGWPL}{1'b0}}};
// store read data from memory interface before writing into SRAM.
genvar i;
generate
for (i = 0; i < WORDSPERLINE; i++) begin
flopenr #(`XLEN) flop(.clk(clk),
.reset(reset),
.en(InstrAckF & (i == FetchCount)),
.d(InstrInF),
.q(ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]));
end
endgenerate
// what address is used to write the SRAM?
// spills require storing the first cache block so it can merged
// with the second
// can optimize size, for now just make it the size of the data
// leaving the cache memory.
flopenr #(16) SpillInstrReg(.clk(clk),
.en(spillSave),
.reset(reset),
.d(ICacheMemReadData[15:0]),
.q(SpillDataBlock0));
// use the not quite final PC to do the final selection.
2021-04-21 21:47:05 +00:00
logic [1:1] PCPreFinalF_q;
flopenr #(1) PCFReg(.clk(clk),
.reset(reset),
.en(~StallF),
.d(PCPreFinalF[1]),
.q(PCPreFinalF_q[1]));
assign FinalInstrRawF = spill ? {ICacheMemReadData[15:0], SpillDataBlock0} : ICacheMemReadData;
// There is a frustrating issue on the first access.
// The cache will not contain any valid data but will contain x's on
// reset. This makes FinalInstrRawF invalid. On the first cycle out of
// reset this register will pickup this x and it will propagate throughout
// the cpu causing simulation failure, most likely a trap for invalid instruction.
// Reset must be held 1 cycle longer to prevent this issue. additionally the
// reset should be to a NOP rather than 0.
// register reset
flop #(1) resetReg (.clk(clk),
.d(reset),
.q(reset_q));
flopenl #(32) AlignedInstrRawDFlop(clk, reset | reset_q, ~StallD, FinalInstrRawF, NOP, AlignedInstrRawD);
// cannot have this mux as it creates a combo loop.
// This flop doesn't stall if StallF is high because we should output a nop
// when FlushD happens, even if the pipeline is also stalled.
flopr #(1) flushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCyclen | ~StallF), FlushDLastCyclen);
mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCyclen, InstrRawD);
//assign InstrRawD = AlignedInstrRawD;
endmodule