From 97d31cec217f86ce622148f6aeed8183c4fd8a67 Mon Sep 17 00:00:00 2001 From: David Harris Date: Thu, 3 Feb 2022 17:50:23 +0000 Subject: [PATCH] sram1rw cleanup --- pipelined/src/cache/cacheway.sv | 4 +- .../cache/dcache_ptw_interaction_README.txt | 90 ------------------- pipelined/src/cache/sram1rw.sv | 35 ++++---- 3 files changed, 20 insertions(+), 109 deletions(-) delete mode 100644 pipelined/src/cache/dcache_ptw_interaction_README.txt diff --git a/pipelined/src/cache/cacheway.sv b/pipelined/src/cache/cacheway.sv index a8af5645..0c623b8d 100644 --- a/pipelined/src/cache/cacheway.sv +++ b/pipelined/src/cache/cacheway.sv @@ -76,7 +76,7 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26, ///////////////////////////////////////////////////////////////////////////////////////////// sram1rw #(.DEPTH(NUMLINES), .WIDTH(TAGLEN)) CacheTagMem(.clk(clk), - .Addr(RAdr), .ReadData(ReadTag), + .Adr(RAdr), .ReadData(ReadTag), .WriteData(PAdr[`PA_BITS-1:OFFSETLEN+INDEXLEN]), .WriteEnable(TagWriteEnable)); // AND portion of distributed tag multiplexer @@ -91,7 +91,7 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26, // *** Potential optimization: if byte write enables are available, could remove subwordwrites genvar words; for(words = 0; words < LINELEN/`XLEN; words++) begin: word - sram1rw #(.DEPTH(NUMLINES), .WIDTH(`XLEN)) CacheDataMem(.clk(clk), .Addr(RAdr), + sram1rw #(.DEPTH(NUMLINES), .WIDTH(`XLEN)) CacheDataMem(.clk(clk), .Adr(RAdr), .ReadData(ReadDataLine[(words+1)*`XLEN-1:words*`XLEN] ), .WriteData(WriteData[(words+1)*`XLEN-1:words*`XLEN]), .WriteEnable(WriteEnable & WriteWordEnable[words])); diff --git a/pipelined/src/cache/dcache_ptw_interaction_README.txt b/pipelined/src/cache/dcache_ptw_interaction_README.txt deleted file mode 100644 index 47e2af5d..00000000 --- a/pipelined/src/cache/dcache_ptw_interaction_README.txt +++ /dev/null @@ -1,90 +0,0 @@ -Interactions between the dcache and hardware page table 
walker are complex. -In particular the complications arise when a fault occurs concurrently with a memory operation. - -At the beginning of every memory operation there are 8 combinations of three signals; -ITLB miss, DTLB miss, and a memory operation. By looking at each combination we -can understand exactly the correct sequence of operations and if the operation -should continue. - -It is important to note ITLB misses and faults DO NOT flush a memory operation -in the memory stage. This is the core reason for the complexity. - -| Type | ITLB miss | DTLB miss | mem op | | -|-------+-----------+-----------+--------+--------------| -| 0 | 0 | 0 | 0 | | -| 1 | 0 | 0 | 1 | | -| 2 | 0 | 1 | 0 | Not possible | -| 3 | 0 | 1 | 1 | | -| 4 | 1 | 0 | 0 | | -| 5 | 1 | 0 | 1 | | -| 6 | 1 | 1 | 0 | Not possible | -| 7 | 1 | 1 | 1 | | - - -The above table classifies the operations into 8 categories. -2 of the 8 are not possible because a DTLB miss implies a memory operation. -Each (I/D)TLB miss results in either a write to the corresponding TLB or a TLB fault. -To complicate things it is possible to have concurrent ITLB and DTLB misses, which -both can result in either a write or a fault. The table below shows the possible -scenarios and the sequence of operations. - - -| Type | action 1 | action 2 | action 3 | keep stall? | -|------+------------------+-----------------+-----------------+-------------| -| 1 | D$ handles memop | | | Yes | -| 3a | DTLB Write | D$ finish memop | | Yes | -| 3b | DTLB Fault | Abort memop | | No | -| 4a | ITLB write | | | No | -| 4b | ITLB Fault | | | No | -| 5a | ITLB Write | D$ finish memop | | Yes | -| 5b | ITLB Fault | D$ finish memop | | Yes | -| 7a | DTLB Write | ITLB write | D$ finish memop | Yes | -| 7b | DTLB Write | ITLB Fault | D$ finish memop | Yes | -| 7c | DTLB Fault | Abort all | | No | - -Type 1 is a memory operation which either hits in the DTLB or is a physical address. The -Dcache handles the operation. 
- -Type 3a is a memory operation with a DTLB miss. The Dcache enters a special set of states -designed to handle the page table walker (HPTW). Secondly the HPTW takes control over the -LSU via a set of multiplexors in the LSU Arbiter, driving the Dcache with addresses of the -page table. Internally to the HPTW an FSM checks each node of the Page Table and eventually -signals either a TLB write or a TLB Fault. In Type 3a the DTLB is written with the leaf -page table entry and returns control of the Dcache back to the IEU. Now the Dcache finishes -the memory operation using the physical address provided by the TLB. Note it is crucial -the dcache replay the memory access into the cache's SRAM memory. As the HPTW sends its -requests through the Dcache the original memory operation's SRAM lookup will be lost. - -Type 3b is similar to the 3a type in that it starts with the same conditions; however, -at the end of the page table walk a fault is detected. Rather than update the TLB the CPU -and the dcache need to be informed about the fault and abort the memory operation. Unlike -Type 3a the dcache returns directly to STATE_READY and lowers the stall. - -Type 4a is the simplest form of TLB miss as it is an ITLB miss with no memory operation. -The Dcache switches into the special set of page table states and the HPTW takes control -of the Dcache. Like with Type 3a the HPTW sends data requests through the Dcache and eventually -reads a leaf page table entry (PTE). At this time the HPTW writes the PTE to the ITLB and -removes the stall as there is no memory operation to do. - -Type 4b is also an ITLB miss. As with 4a the Dcache switches into page table walker mode and reads -until it finds a leaf or in this case a fault. The fault is detected and the Dcache switches back -to normal mode. - -Type 5a is a Type 4a with a concurrent memory operation. The Dcache first switches to walker mode. - -Other traps. -A new problem has emerged. 
What happens when an interrupt occurs during a page table walk? -The dcache has an output called CommittedM which tells the CPU if the memory operation is -committed into the memory system. It would be wrong to pin the interrupt to a memory operation -when it is already or partially committed to the memory system. Instead the next instruction -has to be pinned to the interrupt. The complexity occurs with the ITLB miss; types 4, 5 and 7. - -Type 4: The ITLB misses and starts using the dcache to fetch the page table. There is no memory -operation. Depending on where in the walk the operations could be aborted. If the tlb is not yet -updated then the walk could be aborted. However if the TLB is updated then the interrupt must be -delayed until the next instruction. - -What is the meaning of CommittedM? -This signal informs the CPU if a memory operation is not started or if it is between started -and done. Once a memory op is started it should not be interrupted. This is used to prevent the -CPU from generating an interrupt after the operation is partially or completely done. diff --git a/pipelined/src/cache/sram1rw.sv b/pipelined/src/cache/sram1rw.sv index 263e21e0..b17aa20d 100644 --- a/pipelined/src/cache/sram1rw.sv +++ b/pipelined/src/cache/sram1rw.sv @@ -34,25 +34,21 @@ // WIDTH is number of bits in one "word" of the memory, DEPTH is number of such words module sram1rw #(parameter DEPTH=128, WIDTH=256) ( - input logic clk, - // port 1 is read only - input logic [$clog2(DEPTH)-1:0] Addr, - output logic [WIDTH-1:0] ReadData, - - // port 2 is write only - input logic [WIDTH-1:0] WriteData, - input logic WriteEnable -); + input logic clk, + input logic [$clog2(DEPTH)-1:0] Adr, + input logic [WIDTH-1:0] WriteData, + input logic WriteEnable, + output logic [WIDTH-1:0] ReadData); - logic [DEPTH-1:0][WIDTH-1:0] StoredData; // *** inconsistency in packed vs. 
unpacked - logic [$clog2(DEPTH)-1:0] AddrD; - logic [WIDTH-1:0] WriteDataD; - logic WriteEnableD; - + logic [WIDTH-1:0] StoredData[DEPTH-1:0]; + logic [$clog2(DEPTH)-1:0] AddrD; + logic [WIDTH-1:0] WriteDataD; + logic WriteEnableD; + //*** model as single port always_ff @(posedge clk) begin - AddrD <= Addr; - WriteDataD <= WriteData; /// ****** this is not right. there should not need to be a delay. + AddrD <= Adr; + WriteDataD <= WriteData; /// ****** this is not right. there should not need to be a delay. Implement alternative cache stall to avoid this. Eliminates a bunch of delay flops elsewhere WriteEnableD <= WriteEnable; if (WriteEnableD) begin StoredData[AddrD] <= #1 WriteDataD; @@ -60,7 +56,12 @@ module sram1rw #(parameter DEPTH=128, WIDTH=256) ( end assign ReadData = StoredData[AddrD]; - +/* + always_ff @(posedge clk) begin + ReadData <= RAM[Adr]; + if (WriteEnable) RAM[Adr] <= WriteData; + end + */ endmodule