cvw/wally-pipelined/src/cache/dcache.sv
Ross Thompson 939ff663a5 Forgot to include a few files in the last few commits.
Also reorganized the dcache by read cpu path, write cpu path, and bus interface path.
Changed i/o names on subwordread to match signals in dcache.
2021-08-25 22:30:05 -05:00

340 lines
11 KiB
Systemverilog

///////////////////////////////////////////
// dcache (data cache)
//
// Written: ross1728@gmail.com July 07, 2021
// Implements the L1 data cache
//
// Purpose: Storage for data and meta data.
//
// A component of the Wally configurable RISC-V project.
//
// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
///////////////////////////////////////////
`include "wally-config.vh"
module dcache
(input logic clk,
input logic reset,
input logic StallM,
input logic StallWtoDCache,
input logic FlushM,
input logic FlushW,
// cpu side
input logic [1:0] MemRWM,
input logic [2:0] Funct3M,
input logic [6:0] Funct7M,
input logic [1:0] AtomicM,
input logic [11:0] MemAdrE, // virtual address, but we only use the lower 12 bits.
input logic [`PA_BITS-1:0] MemPAdrM, // physical address
input logic [11:0] VAdr, // when hptw writes dtlb we use this address to index SRAM.
input logic [`XLEN-1:0] WriteDataM,
output logic [`XLEN-1:0] ReadDataM,
output logic DCacheStall,
output logic CommittedM,
output logic DCacheMiss,
output logic DCacheAccess,
// inputs from TLB and PMA/P
input logic ExceptionM,
input logic PendingInterruptM,
input logic DTLBMissM,
input logic ITLBMissF,
input logic CacheableM,
input logic DTLBWriteM,
input logic ITLBWriteF,
input logic WalkerInstrPageFaultF,
// from ptw
input logic SelPTW,
input logic WalkerPageFaultM,
output logic MemAfterIWalkDone,
// ahb side
output logic [`PA_BITS-1:0] AHBPAdr, // to ahb
output logic AHBRead,
output logic AHBWrite,
input logic AHBAck, // from ahb
input logic [`XLEN-1:0] HRDATA, // from ahb
output logic [`XLEN-1:0] HWDATA // to ahb
);
/* localparam integer BLOCKLEN = 256;
localparam integer NUMLINES = 64;
localparam integer NUMWAYS = 4;
localparam integer NUMREPL_BITS = 3;*/
localparam integer BLOCKLEN = `DCACHE_BLOCKLENINBITS;
localparam integer NUMLINES = `DCACHE_WAYSIZEINBYTES*8/BLOCKLEN;
localparam integer NUMWAYS = `DCACHE_NUMWAYS;
localparam integer NUMREPL_BITS = `DCACHE_REPLBITS; // *** not used
localparam integer BLOCKBYTELEN = BLOCKLEN/8;
localparam integer OFFSETLEN = $clog2(BLOCKBYTELEN);
localparam integer INDEXLEN = $clog2(NUMLINES);
localparam integer TAGLEN = `PA_BITS - OFFSETLEN - INDEXLEN;
localparam integer WORDSPERLINE = BLOCKLEN/`XLEN;
localparam integer LOGWPL = $clog2(WORDSPERLINE);
localparam integer LOGXLENBYTES = $clog2(`XLEN/8);
logic [1:0] SelAdrM;
logic [INDEXLEN-1:0] RAdr;
logic [BLOCKLEN-1:0] SRAMWriteData;
logic [BLOCKLEN-1:0] DCacheMemWriteData;
logic SetValid, ClearValid;
logic SetDirty, ClearDirty;
logic [BLOCKLEN-1:0] ReadDataBlockWayMaskedM [NUMWAYS-1:0];
logic [NUMWAYS-1:0] WayHit;
logic CacheHit;
logic [BLOCKLEN-1:0] ReadDataBlockM;
logic [`XLEN-1:0] ReadDataBlockSetsM [(WORDSPERLINE)-1:0];
logic [`XLEN-1:0] ReadDataWordM, ReadDataWordMuxM;
logic [`XLEN-1:0] FinalWriteDataM, FinalAMOWriteDataM;
logic [LOGWPL-1:0] FetchCount, NextFetchCount;
logic [WORDSPERLINE-1:0] SRAMWordEnable;
logic SRAMWordWriteEnableM;
logic SRAMBlockWriteEnableM;
logic [NUMWAYS-1:0] SRAMBlockWayWriteEnableM;
logic SRAMWriteEnable;
logic [NUMWAYS-1:0] SRAMWayWriteEnable;
logic [NUMWAYS-1:0] VictimWay;
logic [NUMWAYS-1:0] VictimDirtyWay;
logic [BLOCKLEN-1:0] VictimReadDataBlockM;
logic VictimDirty;
logic SelUncached;
logic [2**LOGWPL-1:0] MemPAdrDecodedW;
logic [`PA_BITS-1:0] BasePAdrM;
logic [OFFSETLEN-1:0] BasePAdrOffsetM;
logic [`PA_BITS-1:0] BasePAdrMaskedM;
logic [TAGLEN-1:0] VictimTagWay [NUMWAYS-1:0];
logic [TAGLEN-1:0] VictimTag;
logic AnyCPUReqM;
logic FetchCountFlag;
logic PreCntEn;
logic CntEn;
logic CntReset;
logic SelEvict;
logic LRUWriteEn;
// Read Path CPU (IEU) side
mux3 #(INDEXLEN)
AdrSelMux(.d0(MemAdrE[INDEXLEN+OFFSETLEN-1:OFFSETLEN]),
.d1(VAdr[INDEXLEN+OFFSETLEN-1:OFFSETLEN]),
.d2(MemPAdrM[INDEXLEN+OFFSETLEN-1:OFFSETLEN]),
.s(SelAdrM),
.y(RAdr));
cacheway #(.NUMLINES(NUMLINES), .BLOCKLEN(BLOCKLEN), .TAGLEN(TAGLEN), .OFFSETLEN(OFFSETLEN), .INDEXLEN(INDEXLEN))
MemWay[NUMWAYS-1:0](.clk,
.reset,
.RAdr,
.MemPAdrM(MemPAdrM[`PA_BITS-1:0]),
.WriteEnable(SRAMWayWriteEnable),
.WriteWordEnable(SRAMWordEnable),
.TagWriteEnable(SRAMBlockWayWriteEnableM),
.WriteData(SRAMWriteData),
.SetValid,
.ClearValid,
.SetDirty,
.ClearDirty,
.SelEvict,
.VictimWay,
.ReadDataBlockWayMaskedM,
.WayHit,
.VictimDirtyWay,
.VictimTagWay);
generate
if(NUMWAYS > 1) begin
cachereplacementpolicy #(NUMWAYS, INDEXLEN, OFFSETLEN, NUMLINES)
cachereplacementpolicy(.clk, .reset,
.WayHit,
.VictimWay,
.MemPAdrM(MemPAdrM[INDEXLEN+OFFSETLEN-1:OFFSETLEN]),
.RAdr,
.LRUWriteEn);
end else begin
assign VictimWay = 1'b1; // one hot.
end
endgenerate
assign CacheHit = | WayHit;
assign VictimDirty = | VictimDirtyWay;
// ReadDataBlockWayMaskedM is a 2d array of cache block len by number of ways.
// Need to OR together each way in a bitwise manner.
// Final part of the AO Mux. First is the AND in the cacheway.
or_rows #(NUMWAYS, BLOCKLEN) ReadDataAOMux(.a(ReadDataBlockWayMaskedM), .y(ReadDataBlockM));
or_rows #(NUMWAYS, TAGLEN) VictimTagAOMux(.a(VictimTagWay), .y(VictimTag));
// Convert the Read data bus ReadDataSelectWay into sets of XLEN so we can
// easily build a variable input mux.
// *** consider using a limited range shift to do this final muxing.
genvar index;
generate
for (index = 0; index < WORDSPERLINE; index++) begin
assign ReadDataBlockSetsM[index] = ReadDataBlockM[((index+1)*`XLEN)-1: (index*`XLEN)];
end
endgenerate
// variable input mux
assign ReadDataWordM = ReadDataBlockSetsM[MemPAdrM[$clog2(WORDSPERLINE+`XLEN/8) : $clog2(`XLEN/8)]];
mux2 #(`XLEN) UnCachedDataMux(.d0(ReadDataWordM),
.d1(DCacheMemWriteData[`XLEN-1:0]),
.s(SelUncached),
.y(ReadDataWordMuxM));
// finally swr
subwordread subwordread(.ReadDataWordMuxM,
.MemPAdrM(MemPAdrM[2:0]),
.Funct3M,
.ReadDataM);
// Write Path CPU (IEU) side
onehotdecoder #(LOGWPL)
adrdec(.bin(MemPAdrM[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]),
.decoded(MemPAdrDecodedW));
assign SRAMWordEnable = SRAMBlockWriteEnableM ? '1 : MemPAdrDecodedW;
assign SRAMBlockWayWriteEnableM = SRAMBlockWriteEnableM ? VictimWay : '0;
mux2 #(NUMWAYS) WriteEnableMux(.d0(SRAMWordWriteEnableM ? WayHit : '0),
.d1(SRAMBlockWayWriteEnableM),
.s(SRAMBlockWriteEnableM),
.y(SRAMWayWriteEnable));
generate
if (`A_SUPPORTED) begin
logic [`XLEN-1:0] AMOResult;
amoalu amoalu(.srca(ReadDataM), .srcb(WriteDataM), .funct(Funct7M), .width(Funct3M[1:0]),
.result(AMOResult));
mux2 #(`XLEN) wdmux(WriteDataM, AMOResult, AtomicM[1], FinalAMOWriteDataM);
end else
assign FinalAMOWriteDataM = WriteDataM;
endgenerate
subwordwrite subwordwrite(.HRDATA(ReadDataWordM),
.HADDRD(MemPAdrM[2:0]),
.HSIZED({Funct3M[2], 1'b0, Funct3M[1:0]}),
.HWDATAIN(FinalAMOWriteDataM),
.HWDATA(FinalWriteDataM));
mux2 #(BLOCKLEN) WriteDataMux(.d0({WORDSPERLINE{FinalWriteDataM}}),
.d1(DCacheMemWriteData),
.s(SRAMBlockWriteEnableM),
.y(SRAMWriteData));
// Bus Side logic
// register the fetch data from the next level of memory.
// This register should be necessary for timing. There is no register in the uncore or
// ahblite controller between the memories and this cache.
generate
for (index = 0; index < WORDSPERLINE; index++) begin:fetchbuffer
flopen #(`XLEN) fb(.clk(clk),
.en(AHBAck & AHBRead & (index == FetchCount)),
.d(HRDATA),
.q(DCacheMemWriteData[(index+1)*`XLEN-1:index*`XLEN]));
end
endgenerate
mux2 #(`PA_BITS) BaseAdrMux(.d0(MemPAdrM),
.d1({VictimTag, MemPAdrM[INDEXLEN+OFFSETLEN-1:OFFSETLEN], {{OFFSETLEN}{1'b0}}}),
.s(SelEvict),
.y(BasePAdrM));
// if not cacheable the offset bits needs to be sent to the EBU.
// if cacheable the offset bits are discarded. $ FSM will fetch the whole block.
assign BasePAdrOffsetM = CacheableM ? {{OFFSETLEN}{1'b0}} : BasePAdrM[OFFSETLEN-1:0];
assign BasePAdrMaskedM = {BasePAdrM[`PA_BITS-1:OFFSETLEN], BasePAdrOffsetM};
assign AHBPAdr = ({{`PA_BITS-LOGWPL{1'b0}}, FetchCount} << $clog2(`XLEN/8)) + BasePAdrMaskedM;
assign HWDATA = CacheableM ? ReadDataBlockSetsM[FetchCount] : WriteDataM;
localparam FetchCountThreshold = WORDSPERLINE - 1;
assign FetchCountFlag = (FetchCount == FetchCountThreshold[LOGWPL-1:0]);
flopenr #(LOGWPL)
FetchCountReg(.clk(clk),
.reset(reset | CntReset),
.en(CntEn),
.d(NextFetchCount),
.q(FetchCount));
assign NextFetchCount = FetchCount + 1'b1;
assign SRAMWriteEnable = SRAMBlockWriteEnableM | SRAMWordWriteEnableM;
// controller
dcachefsm dcachefsm(.clk,
.reset,
.MemRWM,
.AtomicM,
.ExceptionM,
.PendingInterruptM,
.StallWtoDCache,
.DTLBMissM,
.ITLBMissF,
.CacheableM,
.DTLBWriteM,
.ITLBWriteF,
.WalkerInstrPageFaultF,
.SelPTW,
.WalkerPageFaultM,
.AHBAck, // from ahb
.CacheHit,
.FetchCountFlag,
.VictimDirty,
.DCacheStall,
.CommittedM,
.DCacheMiss,
.DCacheAccess,
.MemAfterIWalkDone,
.AHBRead,
.AHBWrite,
.SelAdrM,
.CntEn,
.SetValid,
.ClearValid,
.SetDirty,
.ClearDirty,
.SRAMWordWriteEnableM,
.SRAMBlockWriteEnableM,
.CntReset,
.SelUncached,
.SelEvict,
.LRUWriteEn);
endmodule // dcache