From 99424fb9831b324576b080b289041a6fb9d877ad Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 20 Apr 2021 21:19:53 -0500 Subject: [PATCH] Progress on icache. Fixed some issues aligning the PC with instruction. Still broken. --- wally-pipelined/regression/wave.do | 19 ++++-- wally-pipelined/src/cache/dmapped.sv | 99 ++++++++++++++++++++++++++++ wally-pipelined/src/ifu/icache.sv | 85 ++++++++++++++---------- 3 files changed, 163 insertions(+), 40 deletions(-) diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do index 36401fd9..eeb8a0ba 100644 --- a/wally-pipelined/regression/wave.do +++ b/wally-pipelined/regression/wave.do @@ -122,6 +122,7 @@ add wave -noupdate -group {alu execution stage} /testbench/dut/hart/ieu/dp/ALURe add wave -noupdate -group {alu execution stage} /testbench/dut/hart/ieu/dp/SrcAE add wave -noupdate -group {alu execution stage} /testbench/dut/hart/ieu/dp/SrcBE add wave -noupdate /testbench/dut/hart/ieu/dp/ALUResultM +add wave -noupdate -expand -group PCS /testbench/dut/hart/ifu/PCNextF add wave -noupdate -expand -group PCS /testbench/dut/hart/PCF add wave -noupdate -expand -group PCS /testbench/dut/hart/ifu/PCD add wave -noupdate -expand -group PCS /testbench/dut/hart/PCE @@ -169,11 +170,12 @@ add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/cont add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LOGWPL add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LINESIZE add wave -noupdate /testbench/dut/hart/ifu/icache/controller/CurrState -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCountFlag add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCount +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF add 
wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrReadF add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrAckF +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrInF add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteEnable add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteData add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWritePAdr @@ -198,9 +200,18 @@ add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WritePAdr add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteSet add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteTag add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/StoredData +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadAddr +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadData +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/ReadPAdr +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/ICacheMemReadData +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/genblk2/PCPreFinalF_q +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/ICacheStallF +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/SavePC TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {237 ns} 0} -quietly wave cursor active 1 +WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {284 ns} 0} +quietly wave cursor active 2 configure wave -namecolwidth 250 configure wave -valuecolwidth 229 configure wave -justifyvalue left @@ -215,4 +226,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ns update 
-WaveRestoreZoom {96 ns} {400 ns}
+WaveRestoreZoom {139 ns} {443 ns}
diff --git a/wally-pipelined/src/cache/dmapped.sv b/wally-pipelined/src/cache/dmapped.sv
index fb6ce4c5..4f1cc2d3 100644
--- a/wally-pipelined/src/cache/dmapped.sv
+++ b/wally-pipelined/src/cache/dmapped.sv
@@ -125,6 +125,105 @@ module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par
     assign DataValid = DataValidBit && (DataTag == ReadTag);
 endmodule
 
+// Direct-mapped cache memory with a read enable; the icache instantiates it in place of rodirectmappedmem.
+// The set index comes from the current read address; word offset and tag compare use the address captured under re.
+module rodirectmappedmemre #(parameter NUMLINES=512, parameter LINESIZE = 256, parameter WORDSIZE = `XLEN) (
+    // Pipeline stuff
+    input  logic                clk,
+    input  logic                reset,
+    // Read enable: gates the capture of the read address into OldReadPAdr
+    input  logic                re,
+    // If flush is high, invalidate the entire cache
+    input  logic                flush,
+    // Select which address to read (split into bits [XLEN-1:12] and bits [11:0])
+    input  logic [`XLEN-1:12]   ReadUpperPAdr,
+    input  logic [11:0]         ReadLowerAdr,
+    // Write new data to the cache
+    input  logic                WriteEnable,
+    input  logic [LINESIZE-1:0] WriteLine,
+    input  logic [`XLEN-1:0]    WritePAdr,
+    // Output the word, as well as if it is valid
+    output logic [WORDSIZE-1:0] DataWord,
+    output logic                DataValid
+);
+
+    // Address field widths: | tag | set | word offset within line | byte within word |
+    localparam integer WORDWIDTH = $clog2(WORDSIZE/8);
+    localparam integer OFFSETWIDTH = $clog2(LINESIZE/WORDSIZE);
+    localparam integer SETWIDTH = $clog2(NUMLINES);
+    localparam integer TAGWIDTH = `XLEN - OFFSETWIDTH - SETWIDTH - WORDWIDTH;
+
+    localparam integer OFFSETBEGIN = WORDWIDTH;
+    localparam integer OFFSETEND = OFFSETBEGIN+OFFSETWIDTH-1;
+    localparam integer SETBEGIN = OFFSETEND+1;
+    localparam integer SETEND = SETBEGIN + SETWIDTH - 1;
+    localparam integer TAGBEGIN = SETEND + 1;
+    localparam integer TAGEND = TAGBEGIN + TAGWIDTH - 1;
+
+    // Machinery to read from and write to the correct addresses in memory
+    logic [`XLEN-1:0]       ReadPAdr;
+    logic [`XLEN-1:0]       OldReadPAdr;
+    logic [OFFSETWIDTH-1:0] ReadOffset, WriteOffset;
+    logic [SETWIDTH-1:0]    ReadSet, WriteSet;
+    logic [TAGWIDTH-1:0]    ReadTag, WriteTag;
+    logic [LINESIZE-1:0]    ReadLine;
+    logic [LINESIZE/WORDSIZE-1:0][WORDSIZE-1:0] ReadLineTransformed;
+
+    // Machinery to check if a given read is valid and is the desired value
+    logic [TAGWIDTH-1:0]    DataTag;
+    logic [NUMLINES-1:0]    ValidOut;
+    logic                   DataValidBit;
+
+    // Hold the last enabled read address (flopenr ports presumably clk, reset, enable, d, q -- confirm)
+    flopenr #(`XLEN) ReadPAdrFlop(clk, reset, re, ReadPAdr, OldReadPAdr);
+
+    // Assign the read and write addresses in cache memory
+    always_comb begin
+        ReadPAdr = {ReadUpperPAdr, ReadLowerAdr};
+        ReadSet = ReadPAdr[SETEND:SETBEGIN];
+        ReadOffset = OldReadPAdr[OFFSETEND:OFFSETBEGIN];
+        ReadTag = OldReadPAdr[TAGEND:TAGBEGIN];
+
+        WriteOffset = WritePAdr[OFFSETEND:OFFSETBEGIN];
+        WriteSet = WritePAdr[SETEND:SETBEGIN];
+        WriteTag = WritePAdr[TAGEND:TAGBEGIN];
+    end
+
+    // Depth is number of bits in one "word" of the memory, width is number of such words
+    Sram1Read1Write #(.DEPTH(LINESIZE), .WIDTH(NUMLINES)) cachemem (
+        .*,
+        .ReadAddr(ReadSet),
+        .ReadData(ReadLine),
+        .WriteAddr(WriteSet),
+        .WriteData(WriteLine)
+    );
+    Sram1Read1Write #(.DEPTH(TAGWIDTH), .WIDTH(NUMLINES)) cachetags (
+        .*,
+        .ReadAddr(ReadSet),
+        .ReadData(DataTag),
+        .WriteAddr(WriteSet),
+        .WriteData(WriteTag)
+    );
+
+    // Pick the right bits coming out the read line
+    assign DataWord = ReadLineTransformed[ReadOffset];
+    genvar i;
+    generate
+        for (i=0; i < LINESIZE/WORDSIZE; i++) begin
+            assign ReadLineTransformed[i] = ReadLine[(i+1)*WORDSIZE-1:i*WORDSIZE];
+        end
+    endgenerate
+
+    // Valid bits: only reset is asynchronous; flush and writes are sampled on clk, and flush beats a write
+    always_ff @(posedge clk, posedge reset) begin
+        if (reset)            ValidOut <= {NUMLINES{1'b0}};
+        else if (flush)       ValidOut <= {NUMLINES{1'b0}};
+        else if (WriteEnable) ValidOut[WriteSet] <= 1'b1;
+    end
+    always_ff @(posedge clk) DataValidBit <= ValidOut[ReadSet]; // no reset value, so kept out of the async-reset block
+    assign DataValid = DataValidBit && (DataTag == ReadTag);
+endmodule
+
 // Write-through direct-mapped memory
 module wtdirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, parameter WORDSIZE = `XLEN) (
     // Pipeline stuff
diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv
index
573e885a..8c16b3a9 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -65,11 +65,14 @@ module icache( // Output signals from cache memory logic [`XLEN-1:0] ICacheMemReadData; logic ICacheMemReadValid; + logic ICacheReadEn; + - rodirectmappedmem #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES), .WORDSIZE(`XLEN)) cachemem( + rodirectmappedmemre #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES), .WORDSIZE(`XLEN)) + cachemem( .*, // Stall it if the pipeline is stalled, unless we're stalling it and we're ending our stall - .stall(StallF && (~ICacheStallF || ~EndFetchState)), + .re(ICacheReadEn), .flush(FlushMem), .ReadUpperPAdr(ICacheMemReadUpperPAdr), .ReadLowerAdr(ICacheMemReadLowerAdr), @@ -88,45 +91,46 @@ endmodule module icachecontroller #(parameter LINESIZE = 256) ( // Inputs from pipeline - input logic clk, reset, - input logic StallF, StallD, - input logic FlushD, + input logic clk, reset, + input logic StallF, StallD, + input logic FlushD, // Input the address to read // The upper bits of the physical pc - input logic [`XLEN-1:12] UpperPCNextPF, + input logic [`XLEN-1:12] UpperPCNextPF, // The lower bits of the virtual pc - input logic [11:0] LowerPCNextF, + input logic [11:0] LowerPCNextF, // Signals to/from cache memory // The read coming out of it - input logic [`XLEN-1:0] ICacheMemReadData, - input logic ICacheMemReadValid, + input logic [`XLEN-1:0] ICacheMemReadData, + input logic ICacheMemReadValid, // The address at which we want to search the cache memory - output logic [`XLEN-1:12] ICacheMemReadUpperPAdr, - output logic [11:0] ICacheMemReadLowerAdr, + output logic [`XLEN-1:12] ICacheMemReadUpperPAdr, + output logic [11:0] ICacheMemReadLowerAdr, + output logic ICacheReadEn, // Load data into the cache - output logic ICacheMemWriteEnable, + output logic ICacheMemWriteEnable, output logic [LINESIZE-1:0] ICacheMemWriteData, - output logic [`XLEN-1:0] ICacheMemWritePAdr, + output logic [`XLEN-1:0] 
ICacheMemWritePAdr, // Outputs to rest of ifu // High if the instruction in the fetch stage is compressed - output logic CompressedF, + output logic CompressedF, // The instruction that was requested // If this instruction is compressed, upper 16 bits may be the next 16 bits or may be zeros - output logic [31:0] InstrRawD, + output logic [31:0] InstrRawD, // Outputs to pipeline control stuff - output logic ICacheStallF, EndFetchState, + output logic ICacheStallF, EndFetchState, // Signals to/from ahblite interface // A read containing the requested data - input logic [`XLEN-1:0] InstrInF, - input logic InstrAckF, + input logic [`XLEN-1:0] InstrInF, + input logic InstrAckF, // The read we request from main memory - output logic [`XLEN-1:0] InstrPAdrF, - output logic InstrReadF + output logic [`XLEN-1:0] InstrPAdrF, + output logic InstrReadF ); // FSM states @@ -173,7 +177,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( logic [LOGWPL:0] FetchCount, NextFetchCount; - logic [`XLEN-1:0] PCPreFinalF, PCPFinalF, PCSpillF; + logic [`XLEN-1:0] PCPreFinalF, PCPFinalF, PCSpillF, PCNextPF; logic [`XLEN-1:OFFSETWIDTH] PCPTrunkF; @@ -200,15 +204,16 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Cache fault signals //logic FaultStall; - - flopenr #(`XLEN) PCPFFlop(clk, reset, SavePC, {UpperPCNextPF, LowerPCNextF}, PCPF); + assign PCNextPF = {UpperPCNextPF, LowerPCNextF}; + + flopenl #(`XLEN) PCPFFlop(clk, reset, SavePC, PCPFinalF, `RESET_VECTOR, PCPF); // on spill we want to get the first 2 bytes of the next cache block. // the spill only occurs if the PCPF mod BlockByteLength == -2. Therefore we can // simply add 2 to land on the next cache block. assign PCSpillF = PCPF + 2'b10; // now we have to select between these three PCs - assign PCPreFinalF = PCMux[0] ? PCPF : {UpperPCNextPF, LowerPCNextF}; + assign PCPreFinalF = PCMux[0] ? PCPF : PCNextPF; assign PCPFinalF = PCMux[1] ? 
PCSpillF : PCPreFinalF; @@ -353,18 +358,20 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Next state logic always_comb begin - UnalignedSelect = 1'b0; - CntReset = 1'b0; - PreCntEn = 1'b0; - InstrReadF = 1'b0; - ICacheMemWriteEnable = 1'b0; - spillSave = 1'b0; - PCMux = 2'b00; + UnalignedSelect = 1'b0; + CntReset = 1'b0; + PreCntEn = 1'b0; + InstrReadF = 1'b0; + ICacheMemWriteEnable = 1'b0; + spillSave = 1'b0; + PCMux = 2'b00; + ICacheReadEn = 1'b0; case (CurrState) STATE_READY: begin PCMux = 2'b00; + ICacheReadEn = 1'b1; if (hit & ~spill) begin NextState = STATE_READY; end else if (hit & spill) begin @@ -384,7 +391,8 @@ module icachecontroller #(parameter LINESIZE = 256) ( // branch 1, hit spill and 2, miss spill hit STATE_HIT_SPILL: begin PCMux = 2'b10; - UnalignedSelect = 1'b1; + UnalignedSelect = 1'b1; + ICacheReadEn = 1'b1; if (hit) begin NextState = STATE_READY; end else @@ -409,6 +417,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( STATE_HIT_SPILL_MERGE: begin PCMux = 2'b10; UnalignedSelect = 1'b1; + ICacheReadEn = 1'b1; NextState = STATE_READY; end @@ -430,6 +439,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( end STATE_MISS_READ: begin PCMux = 2'b01; + ICacheReadEn = 1'b1; NextState = STATE_READY; end @@ -452,6 +462,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( STATE_MISS_SPILL_READ1: begin // always be a hit as we just wrote that cache block. PCMux = 2'b10; // there is a 1 cycle delay after setting the address before the date arrives. spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm. 
+ ICacheReadEn = 1'b1; NextState = STATE_MISS_SPILL_2; end STATE_MISS_SPILL_2: begin @@ -482,6 +493,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( STATE_MISS_SPILL_MERGE: begin PCMux = 2'b10; UnalignedSelect = 1'b1; + ICacheReadEn = 1'b1; NextState = STATE_READY; end default: begin @@ -496,9 +508,9 @@ module icachecontroller #(parameter LINESIZE = 256) ( // stall CPU any time we are not in the ready state. any other state means the // cache is either requesting data from the memory interface or handling a // spill over two cycles. - assign ICacheStallF = (CurrState != STATE_READY) | reset_q ? 1'b1 : 1'b0; + assign ICacheStallF = ((CurrState != STATE_READY) & hit) | reset_q ? 1'b1 : 1'b0; // save the PC anytime we are in the ready state. The saved value will be used as the PC may not be stable. - assign SavePC = CurrState == STATE_READY ? 1'b1 : 1'b0; + assign SavePC = (CurrState == STATE_READY) & hit ? 1'b1 : 1'b0; assign CntEn = PreCntEn & InstrAckF; // to compute the fetch address we need to add the bit shifted @@ -518,6 +530,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( // we need to address on that number of bits so the PC is extended to the right by AHBByteLength with zeros. // fetch count is already aligned to AHBByteLength, but we need to extend back to the full address width with // more zeros after the addition. This will be the number of offset bits less the AHBByteLength. + // *** now a bug need to mux between PCPF and PCPF+2 assign InstrPAdrF = {{PCPTrunkF, {{LOGWPL}{1'b0}}} + FetchCount, {{OFFSETWIDTH-LOGWPL}{1'b0}}}; @@ -553,7 +566,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( flop #(1) PCFReg(.clk(clk), .d(PCPreFinalF[1]), .q(PCPreFinalF_q[1])); - assign FinalInstrRawF = PCPreFinalF[1] ? {SpillDataBlock0, ICacheMemReadData[31:16]} : ICacheMemReadData; + assign FinalInstrRawF = PCPreFinalF_q[1] ? 
{SpillDataBlock0, ICacheMemReadData[31:16]} : ICacheMemReadData; end else begin logic [2:1] PCPreFinalF_q; flop #(2) PCFReg(.clk(clk), @@ -563,7 +576,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( .d1(ICacheMemReadData[47:16]), .d2(ICacheMemReadData[63:32]), .d3({SpillDataBlock0, ICacheMemReadData[63:48]}), - .s(PCPreFinalF[2:1]), + .s(PCPreFinalF_q[2:1]), .y(FinalInstrRawF)); end endgenerate