/////////////////////////////////////////// // lsu.sv // // Written: David_Harris@hmc.edu 9 January 2021 // Modified: // // Purpose: Load/Store Unit // Top level of the memory-stage hart logic // Contains data cache, DTLB, subword read/write datapath, interface to external bus // // A component of the Wally configurable RISC-V project. // // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University // // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software // is furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
///////////////////////////////////////////

`include "wally-config.vh"

// lsu: memory-stage load/store unit.
//
// Structure of this module, top to bottom:
//   1. E->M address register.
//   2. Optional (`MEM_VIRTMEM) hardware page table walker (hptw) plus the
//      interlock FSM that stalls the CPU during a walk and muxes the walker's
//      request onto the LSU datapath in place of the CPU's request.
//   3. Optional (`ZICSR_SUPPORTED) data MMU and misalignment detection.
//   4. Optional (`A_SUPPORTED) lr/sc handling and AMO ALU.
//   5. Data cache, subword read/write datapath.
//   6. Bus-side FSM that sequences cache line fetches/writebacks and
//      uncached single-word accesses on the external bus.
module lsu (
  input logic                 clk, reset,
  input logic                 StallM, FlushM, StallW, FlushW,
  output logic                LSUStall,
  // Memory Stage

  // connected to cpu (controls)
  input logic [1:0]           MemRWM,
  input logic [2:0]           Funct3M,
  input logic [6:0]           Funct7M,
  input logic [1:0]           AtomicM,
  input logic                 ExceptionM,
  input logic                 PendingInterruptM,
  input logic                 FlushDCacheM,
  output logic                CommittedM,
  output logic                SquashSCW,
  output logic                DCacheMiss,
  output logic                DCacheAccess,

  // address and write data
  input logic [`XLEN-1:0]     IEUAdrE,
  (* mark_debug = "true" *)output logic [`XLEN-1:0]    IEUAdrM,
  input logic [`XLEN-1:0]     WriteDataM,
  output logic [`XLEN-1:0]    ReadDataM,

  // cpu privilege
  input logic [1:0]           PrivilegeModeW,
  input logic                 DTLBFlushM,

  // faults
  output logic                DTLBLoadPageFaultM, DTLBStorePageFaultM,
  output logic                LoadMisalignedFaultM, LoadAccessFaultM,
  // cpu hazard unit (trap)
  output logic                StoreMisalignedFaultM, StoreAccessFaultM,

  // connect to ahb
  (* mark_debug = "true" *)   output logic [`PA_BITS-1:0] LsuBusAdr,
  output logic                LsuBusRead,
  output logic                LsuBusWrite,
  input logic                 LsuBusAck,
  (* mark_debug = "true" *)   input logic [`XLEN-1:0] LsuBusHRDATA,
  output logic [`XLEN-1:0]    LsuBusHWDATA,
  output logic [2:0]          LsuBusSize,

  // mmu management

  // page table walker
  input logic [`XLEN-1:0]     SATP_REGW, // from csr
  input logic                 STATUS_MXR, STATUS_SUM, STATUS_MPRV,
  input logic [1:0]           STATUS_MPP,

  input logic [`XLEN-1:0]     PCF,
  input logic                 ITLBMissF,
  output logic [`XLEN-1:0]    PTE,
  output logic [1:0]          PageType,
  output logic                ITLBWriteF,

  input var logic [7:0]       PMPCFG_ARRAY_REGW[`PMP_ENTRIES-1:0],
  input var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW[`PMP_ENTRIES-1:0] // *** this one especially has a large note attached to it in pmpchecker.
  );

  logic                       DTLBPageFaultM;

  logic [`PA_BITS-1:0]        LsuPAdrM;  // from mmu to dcache
  logic [`XLEN+1:0]           IEUAdrExtM;
  logic                       DTLBMissM;
  logic                       DTLBWriteM;

  logic [1:0]                 LsuRWM;
  logic [1:0]                 PreLsuRWM;
  logic [2:0]                 LsuFunct3M;
  logic [1:0]                 LsuAtomicM;
  logic [`PA_BITS-1:0]        PreLsuPAdrM, LocalLsuBusAdr;
  logic [11:0]                LsuAdrE, DCacheAdrE;
  logic                       CPUBusy;
  logic                       MemReadM;
  logic                       DCacheStall;

  logic                       CacheableM;
  logic                       SelHPTW;

  logic                       BusStall;

  logic                       InterlockStall;
  logic                       IgnoreRequest;
  logic                       BusCommittedM, DCacheCommittedM;

  // Register the execute-stage address into the memory stage.
  flopenrc #(`XLEN) AddressMReg(clk, reset, FlushM, ~StallM, IEUAdrE, IEUAdrM);
  // Zero-extend by two bits so the address is at least PA_BITS wide.
  assign IEUAdrExtM = {2'b00, IEUAdrM};

  /////////////////////////////////////////////////////////////////////////////
  // Hardware page table walker and CPU/HPTW interlock
  /////////////////////////////////////////////////////////////////////////////
  generate
    if(`MEM_VIRTMEM) begin : MEM_VIRTMEM
      logic                AnyCPUReqM;
      logic [`PA_BITS-1:0] HPTWAdr;
      logic                HPTWRead;
      logic [2:0]          HPTWSize;
      logic                SelReplayCPURequest;

      typedef enum {STATE_T0_READY,
                    STATE_T0_REPLAY,
                    STATE_T3_DTLB_MISS,
                    STATE_T4_ITLB_MISS,
                    STATE_T5_ITLB_MISS,
                    STATE_T7_DITLB_MISS} statetype;

      statetype InterlockCurrState, InterlockNextState;

      assign AnyCPUReqM = (|MemRWM) | (|AtomicM);

      always_ff @(posedge clk)
        if (reset) InterlockCurrState <= #1 STATE_T0_READY;
        else       InterlockCurrState <= #1 InterlockNextState;

      // Next-state logic. From READY the FSM picks a miss state based on
      // which TLB missed and whether the CPU has a memory request pending;
      // states that had a CPU request pass through REPLAY before READY.
      always_comb begin
        case(InterlockCurrState)
          STATE_T0_READY:      if(~ITLBMissF & DTLBMissM & AnyCPUReqM)       InterlockNextState = STATE_T3_DTLB_MISS;
                               else if(ITLBMissF & ~DTLBMissM & ~AnyCPUReqM) InterlockNextState = STATE_T4_ITLB_MISS;
                               else if(ITLBMissF & ~DTLBMissM & AnyCPUReqM)  InterlockNextState = STATE_T5_ITLB_MISS;
                               else if(ITLBMissF & DTLBMissM & AnyCPUReqM)   InterlockNextState = STATE_T7_DITLB_MISS;
                               else                                          InterlockNextState = STATE_T0_READY;
          STATE_T0_REPLAY:     if(DCacheStall)                               InterlockNextState = STATE_T0_REPLAY;
                               else                                          InterlockNextState = STATE_T0_READY;
          STATE_T3_DTLB_MISS:  if(DTLBWriteM)                                InterlockNextState = STATE_T0_REPLAY;
                               else                                          InterlockNextState = STATE_T3_DTLB_MISS;
          STATE_T4_ITLB_MISS:  if(ITLBWriteF)                                InterlockNextState = STATE_T0_READY;
                               else                                          InterlockNextState = STATE_T4_ITLB_MISS;
          STATE_T5_ITLB_MISS:  if(ITLBWriteF)                                InterlockNextState = STATE_T0_REPLAY;
                               else                                          InterlockNextState = STATE_T5_ITLB_MISS;
          STATE_T7_DITLB_MISS: if(DTLBWriteM)                                InterlockNextState = STATE_T5_ITLB_MISS;
                               else                                          InterlockNextState = STATE_T7_DITLB_MISS;
          default:                                                           InterlockNextState = STATE_T0_READY;
        endcase
      end // always_comb

      // signal to CPU it needs to wait on HPTW.
      /* -----\/----- EXCLUDED -----\/-----
      // this code has a problem with imperas64mmu as it reads in an invalid uninitialized instruction.  InterlockStall becomes x and it propagates
      // everywhere.  The case statement below implements the same logic but any x on the inputs will resolve to 0.
      assign InterlockStall = (InterlockCurrState == STATE_T0_READY & (DTLBMissM | ITLBMissF)) |
                              (InterlockCurrState == STATE_T3_DTLB_MISS) | (InterlockCurrState == STATE_T4_ITLB_MISS) |
                              (InterlockCurrState == STATE_T5_ITLB_MISS) | (InterlockCurrState == STATE_T7_DITLB_MISS);
       -----/\----- EXCLUDED -----/\----- */

      always_comb begin
        InterlockStall = 1'b0;
        case(InterlockCurrState)
          STATE_T0_READY:      if(DTLBMissM | ITLBMissF) InterlockStall = 1'b1;
          STATE_T3_DTLB_MISS:  InterlockStall = 1'b1;
          STATE_T4_ITLB_MISS:  InterlockStall = 1'b1;
          STATE_T5_ITLB_MISS:  InterlockStall = 1'b1;
          STATE_T7_DITLB_MISS: InterlockStall = 1'b1;
          default:             InterlockStall = 1'b0;
        endcase
      end

      // When replaying CPU memory request after PTW select the IEUAdrM for correct address.
      assign SelReplayCPURequest = (InterlockNextState == STATE_T0_REPLAY);
      // The walker owns the LSU datapath in every miss state.
      assign SelHPTW = (InterlockCurrState == STATE_T3_DTLB_MISS) | (InterlockCurrState == STATE_T4_ITLB_MISS) |
                       (InterlockCurrState == STATE_T5_ITLB_MISS) | (InterlockCurrState == STATE_T7_DITLB_MISS);
      assign IgnoreRequest = (InterlockCurrState == STATE_T0_READY & (ITLBMissF | DTLBMissM | ExceptionM | PendingInterruptM)) |
                             ((InterlockCurrState == STATE_T0_REPLAY) & (ExceptionM | PendingInterruptM));

      // *** add generate to conditionally create hptw, lsuArb, and mmu
      // based on `MEM_VIRTMEM
      hptw hptw(.clk, .reset, .SATP_REGW, .PCF, .IEUAdrM,
                .ITLBMissF(ITLBMissF & ~PendingInterruptM),
                .DTLBMissM(DTLBMissM & ~PendingInterruptM),
                .MemRWM, .PTE, .PageType, .ITLBWriteF, .DTLBWriteM,
                .HPTWReadPTE(ReadDataM),
                .DCacheStall, .HPTWAdr, .HPTWRead, .HPTWSize, .AnyCPUReqM);

      // arbiter between IEU and hptw

      // multiplex the outputs to LSU
      mux2 #(2)        rwmux(MemRWM, {HPTWRead, 1'b0}, SelHPTW, PreLsuRWM);
      mux2 #(3)        sizemux(Funct3M, HPTWSize, SelHPTW, LsuFunct3M);
      mux2 #(2)        atomicmux(AtomicM, 2'b00, SelHPTW, LsuAtomicM);
      mux2 #(12)       adremux(IEUAdrE[11:0], HPTWAdr[11:0], SelHPTW, LsuAdrE);
      mux2 #(`PA_BITS) lsupadrmux(IEUAdrExtM[`PA_BITS-1:0], HPTWAdr, SelHPTW, PreLsuPAdrM);

      assign CPUBusy = StallW & ~SelHPTW;

      // always block interrupts when using the hardware page table walker.

      // this is for the d cache SRAM.
      // turns out because we cannot pipeline hptw requests we don't need this register
      //flop #(`PA_BITS) HPTWAdrMReg(clk, HPTWAdr, HPTWAdrM);   // delay HPTWAdrM by a cycle

      //assign PreLsuRWM = SelHPTW ? {HPTWRead, 1'b0} : MemRWM;
      //assign LsuAdrE = SelHPTW ? HPTWAdr[11:0] : IEUAdrE[11:0];
      //assign LsuAtomicM = SelHPTW ? 2'b00 : AtomicM;
      //assign PreLsuPAdrM = SelHPTW ? HPTWAdr : IEUAdrExtM[`PA_BITS-1:0];

      // Specify which type of page fault is occurring
      // *** `MEM_VIRTMEM
      assign DTLBLoadPageFaultM  = DTLBPageFaultM & PreLsuRWM[1];
      assign DTLBStorePageFaultM = DTLBPageFaultM & PreLsuRWM[0];

      assign DCacheAdrE = SelReplayCPURequest ? IEUAdrM[11:0] : LsuAdrE;

    end // if (`MEM_VIRTMEM)
    else begin
      // No virtual memory: the CPU request drives the LSU datapath directly
      // and all page-table-walker outputs are tied off.
      assign InterlockStall = 1'b0;
      assign DCacheAdrE = LsuAdrE;
      assign SelHPTW = 1'b0;
      assign IgnoreRequest = 1'b0;

      assign PTE = '0;
      assign PageType = '0;
      assign DTLBWriteM = 1'b0;
      assign ITLBWriteF = 1'b0;

      assign PreLsuRWM = MemRWM;
      assign LsuFunct3M = Funct3M;
      assign LsuAtomicM = AtomicM;
      assign LsuAdrE = IEUAdrE[11:0];
      // Explicit slice (same bits the implicit truncation selected) so the
      // width matches PreLsuPAdrM, as in the lsupadrmux input above.
      assign PreLsuPAdrM = IEUAdrExtM[`PA_BITS-1:0];
      assign CPUBusy = StallW;

      assign DTLBLoadPageFaultM = 1'b0;
      assign DTLBStorePageFaultM = 1'b0;
    end
  endgenerate

  // **** look into this confusing signal.
  assign CommittedM = SelHPTW | DCacheCommittedM | BusCommittedM;

  /////////////////////////////////////////////////////////////////////////////
  // Data MMU and misalignment detection
  /////////////////////////////////////////////////////////////////////////////
  generate
    if(`ZICSR_SUPPORTED == 1) begin : dmmu
      logic DataMisalignedM;

      mmu #(.TLB_ENTRIES(`DTLB_ENTRIES), .IMMU(0))
      dmmu(.clk, .reset, .SATP_REGW, .STATUS_MXR, .STATUS_SUM, .STATUS_MPRV, .STATUS_MPP,
           .PrivilegeModeW, .DisableTranslation(SelHPTW),
           .PAdr(PreLsuPAdrM),
           .VAdr(IEUAdrM),
           .Size(LsuFunct3M[1:0]),
           .PTE,
           .PageTypeWriteVal(PageType),
           .TLBWrite(DTLBWriteM),
           .TLBFlush(DTLBFlushM),
           .PhysicalAddress(LsuPAdrM),
           .TLBMiss(DTLBMissM),
           .Cacheable(CacheableM),
           .Idempotent(),
           .AtomicAllowed(),
           .TLBPageFault(DTLBPageFaultM),
           .InstrAccessFaultF(),
           .LoadAccessFaultM,
           .StoreAccessFaultM,
           .AtomicAccessM(1'b0),
           .ExecuteAccessF(1'b0), /// atomicaccessm is probably a bug
           .WriteAccessM(PreLsuRWM[0]),
           .ReadAccessM(PreLsuRWM[1]),
           .PMPCFG_ARRAY_REGW,
           .PMPADDR_ARRAY_REGW
           ); // *** the pma/pmp instruction access faults don't really matter here. is it possible to parameterize which outputs exist?

      // Determine if an Unaligned access is taking place
      // hptw guarantees alignment, only check inputs from IEU.
      always_comb
        case(Funct3M[1:0])
          2'b00: DataMisalignedM = 0;                       // lb, sb, lbu
          2'b01: DataMisalignedM = IEUAdrM[0];              // lh, sh, lhu
          2'b10: DataMisalignedM = IEUAdrM[1] | IEUAdrM[0]; // lw, sw, flw, fsw, lwu
          2'b11: DataMisalignedM = |IEUAdrM[2:0];           // ld, sd, fld, fsd
        endcase

      // If the CPU's (not HPTW's) request is misaligned.
      assign LoadMisalignedFaultM  = DataMisalignedM & MemRWM[1];
      assign StoreMisalignedFaultM = DataMisalignedM & MemRWM[0];

    end else begin
      // No MMU: physical address passes straight through and every fault
      // output of this module that the MMU branch would drive is tied low.
      assign LsuPAdrM = PreLsuPAdrM;
      assign DTLBMissM = 0;
      assign CacheableM = 1;
      assign DTLBPageFaultM = 0;
      assign LoadAccessFaultM = 0;
      // Previously StoreMisalignedFaultM was assigned twice here and
      // StoreAccessFaultM was left undriven.
      assign StoreAccessFaultM = 0;
      assign LoadMisalignedFaultM = 0;
      assign StoreMisalignedFaultM = 0;
    end
  endgenerate

  assign LSUStall = DCacheStall | InterlockStall | BusStall;

  /////////////////////////////////////////////////////////////////////////////
  // Load-reserved / store-conditional
  /////////////////////////////////////////////////////////////////////////////
  // Move generate from lrsc to outside this module.
  // use PreLsu as prefix for lrsc
  generate
    if (`A_SUPPORTED) begin
      assign MemReadM = PreLsuRWM[1] & ~(IgnoreRequest) & ~DTLBMissM;
      lrsc lrsc(.clk, .reset, .FlushW, .CPUBusy, .MemReadM, .PreLsuRWM, .LsuAtomicM, .LsuPAdrM,
                .SquashSCW, .LsuRWM);
    end else begin
      assign SquashSCW = 0;
      assign LsuRWM = PreLsuRWM;
    end
  endgenerate

  // conditional
  // 1. ram // controlled by `MEM_DTIM
  // 2. cache `MEM_DCACHE
  // 3. wire pass-through

  localparam integer WORDSPERLINE = `DCACHE_BLOCKLENINBITS/`XLEN;
  localparam integer LOGWPL = $clog2(WORDSPERLINE);
  localparam integer BLOCKLEN = `DCACHE_BLOCKLENINBITS;
  localparam integer WordCountThreshold = WORDSPERLINE - 1;
  localparam integer BLOCKBYTELEN = BLOCKLEN/8;
  localparam integer OFFSETLEN = $clog2(BLOCKBYTELEN);

  // temp
  logic                              WordCountFlag;
  logic [`XLEN-1:0]                  FinalAMOWriteDataM, FinalWriteDataM;
  (* mark_debug = "true" *) logic [`XLEN-1:0] PreLsuBusHWDATA;
  logic                              SelFlush;
  logic [`XLEN-1:0]                  ReadDataWordM;
  logic [`DCACHE_BLOCKLENINBITS-1:0] DCacheMemWriteData;

  // keep
  logic [`XLEN-1:0]                  ReadDataWordMuxM;
  logic [LOGWPL-1:0]                 WordCount, NextWordCount;
  logic [`PA_BITS-1:0]               BasePAdrMaskedM;
  logic [OFFSETLEN-1:0]              BasePAdrOffsetM;
  logic                              CntEn, PreCntEn;
  logic                              CntReset;
  logic [`PA_BITS-1:0]               DCacheBusAdr;
  logic [`XLEN-1:0]                  ReadDataBlockSetsM [(`DCACHE_BLOCKLENINBITS/`XLEN)-1:0];
  logic                              DCacheWriteLine;
  logic                              DCacheFetchLine;
  logic                              DCacheBusAck;
  logic                              UnCachedLsuBusRead;
  logic                              UnCachedLsuBusWrite;
  logic                              SelUncachedAdr;

  /////////////////////////////////////////////////////////////////////////////
  // Data cache and subword datapath
  /////////////////////////////////////////////////////////////////////////////
  dcache dcache(.clk, .reset, .CPUBusy,
                .MemRWM(LsuRWM),
                .Funct3M(LsuFunct3M),
                .Funct7M, .FlushDCacheM,
                .AtomicM(LsuAtomicM),
                .MemAdrE(DCacheAdrE),
                .LsuPAdrM,
                .FinalWriteDataM, .ReadDataWordM, .DCacheStall,
                .DCacheMiss, .DCacheAccess,
                .IgnoreRequest, .CacheableM(CacheableM),
                .DCacheCommittedM,
                .DCacheBusAdr,
                .ReadDataBlockSetsM,
                .SelFlush,
                .DCacheMemWriteData,
                .DCacheFetchLine,
                .DCacheWriteLine,
                .DCacheBusAck
                );

  // Uncached reads take their data from word 0 of the fetch buffer
  // instead of from the cache.
  mux2 #(`XLEN) UnCachedDataMux(.d0(ReadDataWordM),
                                .d1(DCacheMemWriteData[`XLEN-1:0]),
                                .s(SelUncachedAdr),
                                .y(ReadDataWordMuxM));

  // finally swr
  subwordread subwordread(.ReadDataWordMuxM,
                          .LsuPAdrM(LsuPAdrM[2:0]),
                          .Funct3M(LsuFunct3M),
                          .ReadDataM);

  generate
    if (`A_SUPPORTED) begin : amo
      logic [`XLEN-1:0] AMOResult;
      amoalu amoalu(.srca(ReadDataM), .srcb(WriteDataM), .funct(Funct7M), .width(LsuFunct3M[1:0]),
                    .result(AMOResult));
      mux2 #(`XLEN) wdmux(WriteDataM, AMOResult, LsuAtomicM[1], FinalAMOWriteDataM);
    end else
      assign FinalAMOWriteDataM = WriteDataM;
  endgenerate

  subwordwrite subwordwrite(.HRDATA(ReadDataWordM),
                            .HADDRD(LsuPAdrM[2:0]),
                            .HSIZED({LsuFunct3M[2], 1'b0, LsuFunct3M[1:0]}),
                            .HWDATAIN(FinalAMOWriteDataM),
                            .HWDATA(FinalWriteDataM));

  // Uncached accesses use the instruction's own size; cached line transfers
  // always move full XLEN-wide words.
  generate
    if (`XLEN == 32) assign LsuBusSize = SelUncachedAdr ? LsuFunct3M : 3'b010;
    else             assign LsuBusSize = SelUncachedAdr ? LsuFunct3M : 3'b011;
  endgenerate

  /////////////////////////////////////////////////////////////////////////////
  // Bus Side logic
  /////////////////////////////////////////////////////////////////////////////
  // register the fetch data from the next level of memory.
  // This register should be necessary for timing.  There is no register in the uncore or
  // ahblite controller between the memories and this cache.

  genvar index;
  generate
    for (index = 0; index < WORDSPERLINE; index++) begin:fetchbuffer
      // One XLEN-wide register per word of the line; only the register
      // selected by WordCount captures the incoming read data.
      flopen #(`XLEN) fb(.clk(clk),
                         .en(LsuBusAck & LsuBusRead & (index == WordCount)),
                         .d(LsuBusHRDATA),
                         .q(DCacheMemWriteData[(index+1)*`XLEN-1:index*`XLEN]));
    end
  endgenerate

  assign LocalLsuBusAdr = SelUncachedAdr ? LsuPAdrM : DCacheBusAdr;
  // Bus address = base address + WordCount scaled to bytes.
  assign LsuBusAdr = ({{`PA_BITS-LOGWPL{1'b0}}, WordCount} << $clog2(`XLEN/8)) + LocalLsuBusAdr;

  assign PreLsuBusHWDATA = ReadDataBlockSetsM[WordCount];
  assign LsuBusHWDATA = SelUncachedAdr ? WriteDataM : PreLsuBusHWDATA; // *** why is this not FinalWriteDataM? which does not work.

  // Word counter for line transfers: held in reset while the bus FSM is
  // READY, advances on each acknowledged beat of a FETCH or WRITE.
  assign WordCountFlag = (WordCount == WordCountThreshold[LOGWPL-1:0]);
  assign CntEn = PreCntEn & LsuBusAck;

  flopenr #(LOGWPL) WordCountReg(.clk(clk),
                                 .reset(reset | CntReset),
                                 .en(CntEn),
                                 .d(NextWordCount),
                                 .q(WordCount));

  assign NextWordCount = WordCount + 1'b1;

  typedef enum {STATE_BUS_READY,
                STATE_BUS_FETCH,
                STATE_BUS_WRITE,
                STATE_BUS_UNCACHED_WRITE,
                STATE_BUS_UNCACHED_WRITE_DONE,
                STATE_BUS_UNCACHED_READ,
                STATE_BUS_UNCACHED_READ_DONE,
                STATE_BUS_CPU_BUSY} busstatetype;

  (* mark_debug = "true" *) busstatetype BusCurrState, BusNextState;

  always_ff @(posedge clk)
    if (reset) BusCurrState <= #1 STATE_BUS_READY;
    else       BusCurrState <= #1 BusNextState;

  // Bus FSM next-state logic. The default assignment covers the READY case
  // when no request is pending.
  always_comb begin
    BusNextState = STATE_BUS_READY;
    case(BusCurrState)
      STATE_BUS_READY:               if(IgnoreRequest)                BusNextState = STATE_BUS_READY;
                                     else if(LsuRWM[0] & ~CacheableM) BusNextState = STATE_BUS_UNCACHED_WRITE;
                                     else if(LsuRWM[1] & ~CacheableM) BusNextState = STATE_BUS_UNCACHED_READ;
                                     else if(DCacheFetchLine)         BusNextState = STATE_BUS_FETCH;
                                     else if(DCacheWriteLine)         BusNextState = STATE_BUS_WRITE;
      STATE_BUS_UNCACHED_WRITE:      if(LsuBusAck)                    BusNextState = STATE_BUS_UNCACHED_WRITE_DONE;
                                     else                             BusNextState = STATE_BUS_UNCACHED_WRITE;
      STATE_BUS_UNCACHED_READ:       if(LsuBusAck)                    BusNextState = STATE_BUS_UNCACHED_READ_DONE;
                                     else                             BusNextState = STATE_BUS_UNCACHED_READ;
      STATE_BUS_UNCACHED_WRITE_DONE: if(CPUBusy)                      BusNextState = STATE_BUS_CPU_BUSY;
                                     else                             BusNextState = STATE_BUS_READY;
      STATE_BUS_UNCACHED_READ_DONE:  if(CPUBusy)                      BusNextState = STATE_BUS_CPU_BUSY;
                                     else                             BusNextState = STATE_BUS_READY;
      STATE_BUS_CPU_BUSY:            if(CPUBusy)                      BusNextState = STATE_BUS_CPU_BUSY;
                                     else                             BusNextState = STATE_BUS_READY;
      STATE_BUS_FETCH:               if (WordCountFlag & LsuBusAck)   BusNextState = STATE_BUS_READY;
                                     else                             BusNextState = STATE_BUS_FETCH;
      STATE_BUS_WRITE:               if(WordCountFlag & LsuBusAck)    BusNextState = STATE_BUS_READY;
                                     else                             BusNextState = STATE_BUS_WRITE;
    endcase
  end

  // Bus FSM outputs.
  assign CntReset = BusCurrState == STATE_BUS_READY;
  assign BusStall = (BusCurrState == STATE_BUS_READY & ~IgnoreRequest & ((~CacheableM & (|LsuRWM)) | DCacheFetchLine | DCacheWriteLine)) |
                    (BusCurrState == STATE_BUS_UNCACHED_WRITE) |
                    (BusCurrState == STATE_BUS_UNCACHED_READ) |
                    (BusCurrState == STATE_BUS_FETCH) |
                    (BusCurrState == STATE_BUS_WRITE);
  assign PreCntEn = BusCurrState == STATE_BUS_FETCH | BusCurrState == STATE_BUS_WRITE;
  assign UnCachedLsuBusWrite = (BusCurrState == STATE_BUS_READY & ~CacheableM & (LsuRWM[0])) |
                               (BusCurrState == STATE_BUS_UNCACHED_WRITE);
  assign LsuBusWrite = UnCachedLsuBusWrite | (BusCurrState == STATE_BUS_WRITE);

  assign UnCachedLsuBusRead = (BusCurrState == STATE_BUS_READY & ~CacheableM & (|LsuRWM[1])) |
                              (BusCurrState == STATE_BUS_UNCACHED_READ);
  assign LsuBusRead = UnCachedLsuBusRead | (BusCurrState == STATE_BUS_FETCH);

  // Tell the cache the line transfer is complete on the last acknowledged beat.
  assign DCacheBusAck = (BusCurrState == STATE_BUS_FETCH & WordCountFlag & LsuBusAck) |
                        (BusCurrState == STATE_BUS_WRITE & WordCountFlag & LsuBusAck);
  assign BusCommittedM = BusCurrState != STATE_BUS_READY;
  assign SelUncachedAdr = (BusCurrState == STATE_BUS_READY & (|LsuRWM & ~CacheableM)) |
                          (BusCurrState == STATE_BUS_UNCACHED_READ |
                           BusCurrState == STATE_BUS_UNCACHED_READ_DONE |
                           BusCurrState == STATE_BUS_UNCACHED_WRITE |
                           BusCurrState == STATE_BUS_UNCACHED_WRITE_DONE);

endmodule