From ebd6b931c6a64b7c8b6636d9a7601d1067d85bbf Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Wed, 24 Mar 2021 13:39:45 -0400 Subject: [PATCH 01/24] Fix bug in cache line --- wally-pipelined/src/cache/line.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wally-pipelined/src/cache/line.sv b/wally-pipelined/src/cache/line.sv index e498d073..6fe848e5 100644 --- a/wally-pipelined/src/cache/line.sv +++ b/wally-pipelined/src/cache/line.sv @@ -62,7 +62,7 @@ module rocacheline #(parameter LINESIZE = 256, parameter TAGSIZE = 32, parameter always_comb begin - assign DataWord = DataLinesOut[WordSelect[OFFSETSIZE-1:$clog2(WORDSIZE)]]; + assign DataWord = DataLinesOut[WordSelect[OFFSETSIZE-1:$clog2(WORDSIZE/8)]]; end endmodule From ad0d77e9e1ca92449c13b7dd9bc4d3575e43258a Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Wed, 24 Mar 2021 13:40:08 -0400 Subject: [PATCH 02/24] Begin rewrite of icache module to use a direct-mapped scheme --- wally-pipelined/src/ifu/icache.sv | 147 ++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 4208c355..17c8bf1e 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -48,6 +48,153 @@ module icache( output logic [31:0] InstrRawD ); + // Configuration parameters + // TODO Move these to a config file + localparam integer ICACHELINESIZE = 256; + localparam integer ICACHENUMLINES = 512; + + // Input signals to cache memory + logic FlushMem; + logic [`XLEN-1:12] ICacheMemReadUpperPAdr; + logic [11:0] ICacheMemReadLowerAdr; + logic ICacheMemWriteEnable; + logic [ICACHELINESIZE-1:0] ICacheMemWriteData; + logic [`XLEN-1:0] ICacheMemWritePAdr; + // Output signals from cache memory + logic [`XLEN-1:0] ICacheMemReadData; + logic ICacheMemReadValid; + + rodirectmappedmem #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES)) cachemem( + .*, + .flush(FlushMem), + 
.ReadUpperPAdr(ICacheMemReadUpperPAdr), + .ReadLowerAdr(ICacheMemReadLowerAdr), + .WriteEnable(ICacheMemWriteEnable), + .WriteLine(ICacheMemWriteData), + .WritePAdr(ICacheMemWritePAdr), + .DataWord(ICacheMemReadData), + .DataValid(ICacheMemReadValid) + ); + + icachecontroller #(.LINESIZE(ICACHELINESIZE)) controller(.*); +endmodule + +module icachecontroller #(parameter LINESIZE = 256) ( + // Inputs from pipeline + input logic clk, reset, + input logic StallF, StallD, + input logic FlushD, + + // Input the address to read + // The upper bits of the physical pc + input logic [`XLEN-1:12] UpperPCPF, + // The lower bits of the virtual pc + input logic [11:0] LowerPCF, + + // Signals to/from cache memory + // The read coming out of it + input logic [`XLEN-1:0] ICacheMemReadData, + input logic ICacheMemReadValid, + // The address at which we want to search the cache memory + output logic [`XLEN-1:12] ICacheMemReadUpperPAdr, + output logic [11:0] ICacheMemReadLowerAdr, + // Load data into the cache + output logic ICacheMemWriteEnable, + output logic [LINESIZE-1:0] ICacheMemWriteData, + output logic [`XLEN-1:0] ICacheMemWritePAdr, + + // Outputs to rest of ifu + // High if the instruction in the fetch stage is compressed + output logic CompressedF, + // The instruction that was requested + // If this instruction is compressed, upper 16 bits may be the next 16 bits or may be zeros + output logic [31:0] InstrRawD, + + // Outputs to pipeline control stuff + output logic ICacheStallF, + + // Signals to/from ahblite interface + // A read containing the requested data + input logic [`XLEN-1:0] InstrInF, + // The read we request from main memory + output logic [`XLEN-1:0] InstrPAdrF, + output logic InstrReadF +); + + logic [31:0] AlignedInstrRawF, AlignedInstrRawD; + logic FlushDLastCycle; + const logic [31:0] NOP = 32'h13; + + // TODO allow compressed instructions + // (start with noncompressed only to get something working) + assign CompressedF = 1'b0; + + // Handle happy path 
(data in cache, reads aligned) + always_comb begin + assign ICacheMemReadLowerAdr = LowerPCF; + assign ICacheMemReadUpperPAdr = UpperPCPF; + end + + generate + if (`XLEN == 32) begin + assign AlignedInstrRawF = ICacheMemReadData; + end else begin + assign AlignedInstrRawF = LowerPCF[2] ? ICacheMemReadData[63:32] : ICacheMemReadData[31:0]; + end + endgenerate + + flopenr #(32) AlignedInstrRawDFlop(clk, reset, ~StallD, AlignedInstrRawF, AlignedInstrRawD); + flopr #(1) FlushDLastCycleFlop(clk, reset, FlushD | (FlushDLastCycle & StallF), FlushDLastCycle); + mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, FlushDLastCycle, InstrRawD); + + // Handle cache faults + + localparam integer WORDSPERLINE = LINESIZE/`XLEN; + localparam integer OFFSETWIDTH = $clog2(LINESIZE/8); + + logic FetchState; + logic [$clog2(WORDSPERLINE)-1:0] FetchWordNum; + logic [`XLEN-1:0] LineAlignedPCPF; + + flopr #(1) FetchStateFlop(clk, reset, 1'b0, FetchState); + flopr #($clog2(WORDSPERLINE)) FetchWordNumFlop(clk, reset, {$clog2(WORDSPERLINE){1'b0}}, FetchWordNum); + + genvar i; + generate + for (i=0; i < WORDSPERLINE; i++) begin + flopenr #(32) flop(clk, reset, FetchState & (i == FetchWordNum), InstrInF, ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]); + end + endgenerate + + always_comb begin + assign InstrReadF = FetchState; + assign LineAlignedPCPF = {UpperPCPF, LowerPCF[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; + assign InstrPAdrF = LineAlignedPCPF + i*`XLEN; + end +endmodule + +module oldicache( + // Basic pipeline stuff + input logic clk, reset, + input logic StallF, StallD, + input logic FlushD, + // Upper bits of physical address for PC + input logic [`XLEN-1:12] UpperPCPF, + // Lower 12 bits of virtual PC address, since it's faster this way + input logic [11:0] LowerPCF, + // Data read in from the ebu unit + input logic [`XLEN-1:0] InstrInF, + // Read requested from the ebu unit + output logic [`XLEN-1:0] InstrPAdrF, + output logic InstrReadF, + // High if the instruction currently in the fetch 
stage is compressed + output logic CompressedF, + // High if the icache is requesting a stall + output logic ICacheStallF, + // The raw (not decompressed) instruction that was requested + // If the next instruction is compressed, the upper 16 bits may be anything + output logic [31:0] InstrRawD +); logic DelayF, DelaySideF, FlushDLastCyclen, DelayD; logic [1:0] InstrDMuxChoice; logic [15:0] MisalignedHalfInstrF, MisalignedHalfInstrD; From ba95557c449f9566e05ac2de7243b736792cf4d5 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Wed, 24 Mar 2021 13:58:43 -0400 Subject: [PATCH 03/24] More progress on icache controller --- wally-pipelined/src/ifu/icache.sv | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 17c8bf1e..c0f04286 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -150,14 +150,15 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Handle cache faults localparam integer WORDSPERLINE = LINESIZE/`XLEN; + localparam integer LOGWPL = $clog2(WORDSPERLINE); localparam integer OFFSETWIDTH = $clog2(LINESIZE/8); - logic FetchState; - logic [$clog2(WORDSPERLINE)-1:0] FetchWordNum; + logic FetchState, EndFetchState, BeginFetchState; + logic [LOGWPL:0] FetchWordNum, NextFetchWordNum; logic [`XLEN-1:0] LineAlignedPCPF; - flopr #(1) FetchStateFlop(clk, reset, 1'b0, FetchState); - flopr #($clog2(WORDSPERLINE)) FetchWordNumFlop(clk, reset, {$clog2(WORDSPERLINE){1'b0}}, FetchWordNum); + flopr #(1) FetchStateFlop(clk, reset, BeginFetchState | (FetchState & ~EndFetchState), FetchState); + flopr #(LOGWPL+1) FetchWordNumFlop(clk, reset, NextFetchWordNum, FetchWordNum); genvar i; generate @@ -166,10 +167,23 @@ module icachecontroller #(parameter LINESIZE = 256) ( end endgenerate + // Machinery to request the correct addresses from main memory always_comb begin assign InstrReadF = FetchState; assign LineAlignedPCPF = 
{UpperPCPF, LowerPCF[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; - assign InstrPAdrF = LineAlignedPCPF + i*`XLEN; + assign InstrPAdrF = LineAlignedPCPF + FetchWordNum*`XLEN; + assign NextFetchWordNum = FetchState ? FetchWordNum+1 : {LOGWPL+1{1'b0}}; + end + + // Write to cache memory when we have the line here + always_comb begin + assign BeginFetchState = 1'b0; + assign EndFetchState = FetchWordNum == {1'b1, {LOGWPL{1'b0}}}; + end + + // Stall the pipeline while loading a new line from memory + always_comb begin + assign ICacheStallF = FetchState | ~ICacheMemReadValid; end endmodule From 602271ff7b91a6eda837f07196b571ceff887f32 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Wed, 24 Mar 2021 16:56:44 -0400 Subject: [PATCH 04/24] rv64i linear control flow now working --- wally-pipelined/regression/wally-pipelined.do | 2 +- .../regression/wave-dos/ahb-waves.do | 8 -- .../regression/wave-dos/cache-waves.do | 82 +++++++++++++++++++ .../regression/wave-dos/default-waves.do | 5 -- wally-pipelined/src/cache/line.sv | 4 +- wally-pipelined/src/ebu/ahblite.sv | 2 + wally-pipelined/src/hazard/hazard.sv | 4 +- wally-pipelined/src/ifu/icache.sv | 22 +++-- wally-pipelined/src/ifu/ifu.sv | 1 + .../src/wally/wallypipelinedhart.sv | 2 +- .../testbench/testbench-imperas.sv | 3 +- 11 files changed, 109 insertions(+), 26 deletions(-) create mode 100644 wally-pipelined/regression/wave-dos/cache-waves.do diff --git a/wally-pipelined/regression/wally-pipelined.do b/wally-pipelined/regression/wally-pipelined.do index a5041005..5f1b1406 100644 --- a/wally-pipelined/regression/wally-pipelined.do +++ b/wally-pipelined/regression/wally-pipelined.do @@ -42,7 +42,7 @@ vsim workopt view wave -- display input and output signals as hexidecimal values -do ./wave-dos/ahb-waves.do +do ./wave-dos/cache-waves.do -- Set Wave Output Items TreeUpdate [SetDefaultTree] diff --git a/wally-pipelined/regression/wave-dos/ahb-waves.do b/wally-pipelined/regression/wave-dos/ahb-waves.do index f043d779..c3a38563 
100644 --- a/wally-pipelined/regression/wave-dos/ahb-waves.do +++ b/wally-pipelined/regression/wave-dos/ahb-waves.do @@ -19,16 +19,8 @@ add wave -divider add wave -hex /testbench/dut/hart/ifu/PCF add wave -hex /testbench/dut/hart/ifu/PCD add wave -hex /testbench/dut/hart/ifu/InstrD - add wave /testbench/InstrDName add wave -hex /testbench/dut/hart/ifu/ic/InstrRawD -add wave -hex /testbench/dut/hart/ifu/ic/AlignedInstrD -add wave -divider -add wave -hex /testbench/dut/hart/ifu/ic/InstrPAdrF -add wave /testbench/dut/hart/ifu/ic/DelayF -add wave /testbench/dut/hart/ifu/ic/DelaySideF -add wave /testbench/dut/hart/ifu/ic/DelayD -add wave -hex /testbench/dut/hart/ifu/ic/MisalignedHalfInstrD add wave -divider add wave -hex /testbench/dut/hart/ifu/PCE diff --git a/wally-pipelined/regression/wave-dos/cache-waves.do b/wally-pipelined/regression/wave-dos/cache-waves.do new file mode 100644 index 00000000..e39d40a0 --- /dev/null +++ b/wally-pipelined/regression/wave-dos/cache-waves.do @@ -0,0 +1,82 @@ +add wave /testbench/clk +add wave /testbench/reset +add wave -divider + +#add wave /testbench/dut/hart/ebu/IReadF +add wave /testbench/dut/hart/DataStall +add wave /testbench/dut/hart/InstrStall +add wave /testbench/dut/hart/StallF +add wave /testbench/dut/hart/StallD +add wave /testbench/dut/hart/StallE +add wave /testbench/dut/hart/StallM +add wave /testbench/dut/hart/StallW +add wave /testbench/dut/hart/FlushD +add wave /testbench/dut/hart/FlushE +add wave /testbench/dut/hart/FlushM +add wave /testbench/dut/hart/FlushW + +add wave -divider +add wave -hex /testbench/dut/hart/ifu/PCF +add wave -hex /testbench/dut/hart/ifu/PCD +add wave -hex /testbench/dut/hart/ifu/InstrD + +add wave /testbench/InstrDName +add wave -hex /testbench/dut/hart/ifu/ic/InstrRawD +add wave -hex /testbench/dut/hart/ifu/ic/controller/AlignedInstrRawD +add wave -divider +add wave -hex /testbench/dut/hart/ifu/ic/controller/FetchState +add wave -hex /testbench/dut/hart/ifu/ic/controller/FetchWordNum +add 
wave -hex /testbench/dut/hart/ifu/ic/controller/ICacheMemWriteEnable +add wave -hex /testbench/dut/hart/ifu/ic/InstrPAdrF +add wave -hex /testbench/dut/hart/ifu/ic/InstrAckF +add wave -hex /testbench/dut/hart/ifu/ic/controller/ICacheMemWriteData +add wave -hex /testbench/dut/hart/ifu/ic/controller/ICacheMemWritePAdr +add wave -divider + +add wave -hex /testbench/dut/hart/ifu/PCE +add wave -hex /testbench/dut/hart/ifu/InstrE +add wave /testbench/InstrEName +add wave -hex /testbench/dut/hart/ieu/dp/SrcAE +add wave -hex /testbench/dut/hart/ieu/dp/SrcBE +add wave -hex /testbench/dut/hart/ieu/dp/ALUResultE +#add wave /testbench/dut/hart/ieu/dp/PCSrcE +add wave -divider + +add wave -hex /testbench/dut/hart/ifu/PCM +add wave -hex /testbench/dut/hart/ifu/InstrM +add wave /testbench/InstrMName +add wave /testbench/dut/uncore/dtim/memwrite +add wave -hex /testbench/dut/uncore/HADDR +add wave -hex /testbench/dut/uncore/HWDATA +add wave -divider + +add wave -hex /testbench/dut/hart/ebu/MemReadM +add wave -hex /testbench/dut/hart/ebu/InstrReadF +add wave -hex /testbench/dut/hart/ebu/BusState +add wave -hex /testbench/dut/hart/ebu/NextBusState +add wave -hex /testbench/dut/hart/ebu/HADDR +add wave -hex /testbench/dut/hart/ebu/HREADY +add wave -hex /testbench/dut/hart/ebu/HTRANS +add wave -hex /testbench/dut/hart/ebu/HRDATA +add wave -hex /testbench/dut/hart/ebu/HWRITE +add wave -hex /testbench/dut/hart/ebu/HWDATA +add wave -hex /testbench/dut/hart/ebu/CaptureDataM +add wave -hex /testbench/dut/hart/ebu/InstrStall +add wave -divider + +add wave -hex /testbench/dut/uncore/dtim/* +add wave -divider + +add wave -hex /testbench/dut/hart/ifu/PCW +add wave -hex /testbench/dut/hart/ifu/InstrW +add wave /testbench/InstrWName +add wave /testbench/dut/hart/ieu/dp/RegWriteW +add wave -hex /testbench/dut/hart/ebu/ReadDataW +add wave -hex /testbench/dut/hart/ieu/dp/ResultW +add wave -hex /testbench/dut/hart/ieu/dp/RdW +add wave -divider + +add wave -hex /testbench/dut/uncore/dtim/* +add wave 
-divider + +add wave -hex -r /testbench/* diff --git a/wally-pipelined/regression/wave-dos/default-waves.do b/wally-pipelined/regression/wave-dos/default-waves.do index 4b645651..ef4e30c6 100644 --- a/wally-pipelined/regression/wave-dos/default-waves.do +++ b/wally-pipelined/regression/wave-dos/default-waves.do @@ -23,11 +23,6 @@ add wave -hex /testbench/dut/hart/ifu/PCD add wave -hex /testbench/dut/hart/ifu/InstrD add wave /testbench/InstrDName add wave -hex /testbench/dut/hart/ifu/ic/InstrRawD -add wave -hex /testbench/dut/hart/ifu/ic/AlignedInstrD -add wave /testbench/dut/hart/ifu/ic/DelayF -add wave /testbench/dut/hart/ifu/ic/DelaySideF -add wave /testbench/dut/hart/ifu/ic/DelayD -add wave -hex /testbench/dut/hart/ifu/ic/MisalignedHalfInstrD add wave -divider add wave -hex /testbench/dut/hart/ifu/PCE add wave -hex /testbench/dut/hart/ifu/InstrE diff --git a/wally-pipelined/src/cache/line.sv b/wally-pipelined/src/cache/line.sv index 6fe848e5..d90cd206 100644 --- a/wally-pipelined/src/cache/line.sv +++ b/wally-pipelined/src/cache/line.sv @@ -55,8 +55,8 @@ module rocacheline #(parameter LINESIZE = 256, parameter TAGSIZE = 32, parameter genvar i; generate for (i=0; i < NUMWORDS; i++) begin - assign DataLinesIn[i] = WriteData[NUMWORDS*i+WORDSIZE-1:NUMWORDS*i]; - flopenr #(LINESIZE) LineFlop(clk, reset, WriteEnable, DataLinesIn[i], DataLinesOut[i]); + assign DataLinesIn[i] = WriteData[WORDSIZE*(i+1)-1:WORDSIZE*i]; + flopenr #(WORDSIZE) LineFlop(clk, reset, WriteEnable, DataLinesIn[i], DataLinesOut[i]); end endgenerate diff --git a/wally-pipelined/src/ebu/ahblite.sv b/wally-pipelined/src/ebu/ahblite.sv index 90ef018b..58a28747 100644 --- a/wally-pipelined/src/ebu/ahblite.sv +++ b/wally-pipelined/src/ebu/ahblite.sv @@ -41,6 +41,7 @@ module ahblite ( input logic [`XLEN-1:0] InstrPAdrF, // *** rename these to match block diagram input logic InstrReadF, output logic [`XLEN-1:0] InstrRData, + output logic InstrAckF, // Signals from Data Cache input logic [`XLEN-1:0] 
MemPAdrM, input logic MemReadM, MemWriteM, @@ -171,6 +172,7 @@ module ahblite ( assign #1 MMUReady = (NextBusState == MMUIDLE); assign InstrRData = HRDATA; + assign InstrAckF = (BusState == INSTRREAD) && (NextBusState != INSTRREAD) || (BusState == INSTRREADC) && (NextBusState != INSTRREADC); assign MMUReadPTE = HRDATA; assign ReadDataM = HRDATAMasked; // changed from W to M dh 2/7/2021 assign CaptureDataM = ((BusState == MEMREAD) && (NextBusState != MEMREAD)) || diff --git a/wally-pipelined/src/hazard/hazard.sv b/wally-pipelined/src/hazard/hazard.sv index 9542020d..ecd3c366 100644 --- a/wally-pipelined/src/hazard/hazard.sv +++ b/wally-pipelined/src/hazard/hazard.sv @@ -53,12 +53,12 @@ module hazard( assign BranchFlushDE = BPPredWrongE | RetM | TrapM; - assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE); + assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE) | ICacheStallF; assign StallDCause = (LoadStallD | MulDivStallD | CSRRdStallD) & ~(BranchFlushDE); // stall in decode if instruction is a load/mul/csr dependent on previous // assign StallDCause = LoadStallD | MulDivStallD | CSRRdStallD; // stall in decode if instruction is a load/mul/csr dependent on previous assign StallECause = 0; assign StallMCause = 0; - assign StallWCause = DataStall | InstrStall; + assign StallWCause = DataStall; // Each stage stalls if the next stage is stalled or there is a cause to stall this stage. 
assign StallF = StallD | StallFCause; diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index c0f04286..df608a39 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -36,6 +36,7 @@ module icache( input logic [11:0] LowerPCF, // Data read in from the ebu unit input logic [`XLEN-1:0] InstrInF, + input logic InstrAckF, // Read requested from the ebu unit output logic [`XLEN-1:0] InstrPAdrF, output logic InstrReadF, @@ -77,6 +78,8 @@ module icache( ); icachecontroller #(.LINESIZE(ICACHELINESIZE)) controller(.*); + + assign FlushMem = 1'b0; endmodule module icachecontroller #(parameter LINESIZE = 256) ( @@ -116,6 +119,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Signals to/from ahblite interface // A read containing the requested data input logic [`XLEN-1:0] InstrInF, + input logic InstrAckF, // The read we request from main memory output logic [`XLEN-1:0] InstrPAdrF, output logic InstrReadF @@ -163,22 +167,28 @@ module icachecontroller #(parameter LINESIZE = 256) ( genvar i; generate for (i=0; i < WORDSPERLINE; i++) begin - flopenr #(32) flop(clk, reset, FetchState & (i == FetchWordNum), InstrInF, ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]); + flopenr #(`XLEN) flop(clk, reset, FetchState & (i == FetchWordNum), InstrInF, ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]); end endgenerate + // Enter the fetch state when we hit a cache fault + always_comb begin + assign BeginFetchState = ~ICacheMemReadValid & ~FetchState; + end + // Machinery to request the correct addresses from main memory always_comb begin - assign InstrReadF = FetchState; + assign InstrReadF = FetchState & ~EndFetchState; assign LineAlignedPCPF = {UpperPCPF, LowerPCF[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; - assign InstrPAdrF = LineAlignedPCPF + FetchWordNum*`XLEN; - assign NextFetchWordNum = FetchState ? 
FetchWordNum+1 : {LOGWPL+1{1'b0}}; + assign InstrPAdrF = LineAlignedPCPF + FetchWordNum*(`XLEN/8); + assign NextFetchWordNum = FetchState ? FetchWordNum+InstrAckF : {LOGWPL+1{1'b0}}; end // Write to cache memory when we have the line here always_comb begin - assign BeginFetchState = 1'b0; - assign EndFetchState = FetchWordNum == {1'b1, {LOGWPL{1'b0}}}; + assign EndFetchState = FetchWordNum == {1'b1, {LOGWPL{1'b0}}} & FetchState; + assign ICacheMemWritePAdr = LineAlignedPCPF; + assign ICacheMemWriteEnable = EndFetchState; end // Stall the pipeline while loading a new line from memory diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv index c68786e5..747a2b49 100644 --- a/wally-pipelined/src/ifu/ifu.sv +++ b/wally-pipelined/src/ifu/ifu.sv @@ -32,6 +32,7 @@ module ifu ( input logic FlushF, FlushD, FlushE, FlushM, FlushW, // Fetch input logic [`XLEN-1:0] InstrInF, + input logic InstrAckF, output logic [`XLEN-1:0] PCF, output logic [`XLEN-1:0] InstrPAdrF, output logic InstrReadF, diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv index c858befd..49214b0d 100644 --- a/wally-pipelined/src/wally/wallypipelinedhart.sv +++ b/wally-pipelined/src/wally/wallypipelinedhart.sv @@ -112,7 +112,7 @@ module wallypipelinedhart ( logic [`XLEN-1:0] InstrRData; logic InstrReadF; logic DataStall, InstrStall; - logic InstrAckD, MemAckW; + logic InstrAckF, MemAckW; logic BPPredWrongE, BPPredWrongM; logic [3:0] InstrClassM; diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index 8b128b17..37d9883e 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -370,7 +370,8 @@ string tests32i[] = { // Track names of instructions instrTrackerTB it(clk, reset, dut.hart.ieu.dp.FlushE, - dut.hart.ifu.ic.InstrF, dut.hart.ifu.InstrD, dut.hart.ifu.InstrE, + dut.hart.ifu.ic.controller.AlignedInstrRawF, + 
dut.hart.ifu.InstrD, dut.hart.ifu.InstrE, dut.hart.ifu.InstrM, dut.hart.ifu.InstrW, InstrFName, InstrDName, InstrEName, InstrMName, InstrWName); From 128278ea2799da37487e9e231dd29e9c6aa27898 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Wed, 24 Mar 2021 17:23:00 -0400 Subject: [PATCH 05/24] Working for all of rv64i now, but not compressed instructions --- wally-pipelined/src/hazard/hazard.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wally-pipelined/src/hazard/hazard.sv b/wally-pipelined/src/hazard/hazard.sv index ecd3c366..3768f0fc 100644 --- a/wally-pipelined/src/hazard/hazard.sv +++ b/wally-pipelined/src/hazard/hazard.sv @@ -53,12 +53,12 @@ module hazard( assign BranchFlushDE = BPPredWrongE | RetM | TrapM; - assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE) | ICacheStallF; + assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE); assign StallDCause = (LoadStallD | MulDivStallD | CSRRdStallD) & ~(BranchFlushDE); // stall in decode if instruction is a load/mul/csr dependent on previous // assign StallDCause = LoadStallD | MulDivStallD | CSRRdStallD; // stall in decode if instruction is a load/mul/csr dependent on previous assign StallECause = 0; assign StallMCause = 0; - assign StallWCause = DataStall; + assign StallWCause = DataStall | ICacheStallF; // Each stage stalls if the next stage is stalled or there is a cause to stall this stage. 
assign StallF = StallD | StallFCause; From ce6f102fc5b9f73126cfba3a1f558118f19c6874 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 25 Mar 2021 00:46:51 -0400 Subject: [PATCH 06/24] Clean up some stuff --- wally-pipelined/src/ebu/ahblite.sv | 5 ++--- wally-pipelined/src/hazard/hazard.sv | 2 +- wally-pipelined/src/wally/wallypipelinedhart.sv | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/wally-pipelined/src/ebu/ahblite.sv b/wally-pipelined/src/ebu/ahblite.sv index 58a28747..c0aa27db 100644 --- a/wally-pipelined/src/ebu/ahblite.sv +++ b/wally-pipelined/src/ebu/ahblite.sv @@ -71,7 +71,7 @@ module ahblite ( output logic [3:0] HSIZED, output logic HWRITED, // Stalls - output logic InstrStall,/*InstrUpdate, */DataStall + output logic /*InstrUpdate, */DataStall // *** add a chip-level ready signal as part of handshake ); @@ -135,8 +135,7 @@ module ahblite ( // stall signals assign #2 DataStall = (NextBusState == MEMREAD) || (NextBusState == MEMWRITE) || - (NextBusState == ATOMICREAD) || (NextBusState == ATOMICWRITE) || - (NextBusState == MMUTRANSLATE) || (NextBusState == MMUIDLE); + (NextBusState == ATOMICREAD) || (NextBusState == ATOMICWRITE); // *** Could get finer grained stalling if we distinguish between MMU // instruction address translation and data address translation assign #1 InstrStall = (NextBusState == INSTRREAD) || (NextBusState == INSTRREADC) || diff --git a/wally-pipelined/src/hazard/hazard.sv b/wally-pipelined/src/hazard/hazard.sv index 3768f0fc..c225a4e8 100644 --- a/wally-pipelined/src/hazard/hazard.sv +++ b/wally-pipelined/src/hazard/hazard.sv @@ -29,7 +29,7 @@ module hazard( // Detect hazards input logic BPPredWrongE, CSRWritePendingDEM, RetM, TrapM, input logic LoadStallD, MulDivStallD, CSRRdStallD, - input logic InstrStall, DataStall, ICacheStallF, + input logic DataStall, ICacheStallF, // Stall & flush outputs output logic StallF, StallD, StallE, StallM, StallW, output logic FlushF, FlushD, FlushE, FlushM, FlushW diff 
--git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv index 49214b0d..0c079ba1 100644 --- a/wally-pipelined/src/wally/wallypipelinedhart.sv +++ b/wally-pipelined/src/wally/wallypipelinedhart.sv @@ -111,7 +111,7 @@ module wallypipelinedhart ( logic [`XLEN-1:0] InstrPAdrF; logic [`XLEN-1:0] InstrRData; logic InstrReadF; - logic DataStall, InstrStall; + logic DataStall; logic InstrAckF, MemAckW; logic BPPredWrongE, BPPredWrongM; From 0290568a5227b4fba3cf20aae9b4981b99b0b4f3 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 25 Mar 2021 13:18:30 -0400 Subject: [PATCH 07/24] Make cache output NOP after a reset --- wally-pipelined/regression/wave-dos/ahb-waves.do | 2 +- wally-pipelined/regression/wave-dos/cache-waves.do | 2 +- wally-pipelined/regression/wave-dos/default-waves.do | 2 +- wally-pipelined/src/ifu/icache.sv | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/wally-pipelined/regression/wave-dos/ahb-waves.do b/wally-pipelined/regression/wave-dos/ahb-waves.do index c3a38563..c542f584 100644 --- a/wally-pipelined/regression/wave-dos/ahb-waves.do +++ b/wally-pipelined/regression/wave-dos/ahb-waves.do @@ -4,7 +4,7 @@ add wave -divider #add wave /testbench/dut/hart/ebu/IReadF add wave /testbench/dut/hart/DataStall -add wave /testbench/dut/hart/InstrStall +add wave /testbench/dut/hart/ICacheStallF add wave /testbench/dut/hart/StallF add wave /testbench/dut/hart/StallD add wave /testbench/dut/hart/StallE diff --git a/wally-pipelined/regression/wave-dos/cache-waves.do b/wally-pipelined/regression/wave-dos/cache-waves.do index e39d40a0..bdd88a13 100644 --- a/wally-pipelined/regression/wave-dos/cache-waves.do +++ b/wally-pipelined/regression/wave-dos/cache-waves.do @@ -4,7 +4,7 @@ add wave -divider #add wave /testbench/dut/hart/ebu/IReadF add wave /testbench/dut/hart/DataStall -add wave /testbench/dut/hart/InstrStall +add wave /testbench/dut/hart/ICacheStallF add wave /testbench/dut/hart/StallF 
add wave /testbench/dut/hart/StallD add wave /testbench/dut/hart/StallE diff --git a/wally-pipelined/regression/wave-dos/default-waves.do b/wally-pipelined/regression/wave-dos/default-waves.do index ef4e30c6..3f81cfad 100644 --- a/wally-pipelined/regression/wave-dos/default-waves.do +++ b/wally-pipelined/regression/wave-dos/default-waves.do @@ -6,7 +6,7 @@ add wave /testbench/reset add wave -divider #add wave /testbench/dut/hart/ebu/IReadF add wave /testbench/dut/hart/DataStall -add wave /testbench/dut/hart/InstrStall +add wave /testbench/dut/hart/ICacheStallF add wave /testbench/dut/hart/StallF add wave /testbench/dut/hart/StallD add wave /testbench/dut/hart/StallE diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index df608a39..631a9bd6 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -126,7 +126,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( ); logic [31:0] AlignedInstrRawF, AlignedInstrRawD; - logic FlushDLastCycle; + logic FlushDLastCycleN; const logic [31:0] NOP = 32'h13; // TODO allow compressed instructions @@ -148,8 +148,8 @@ module icachecontroller #(parameter LINESIZE = 256) ( endgenerate flopenr #(32) AlignedInstrRawDFlop(clk, reset, ~StallD, AlignedInstrRawF, AlignedInstrRawD); - flopr #(1) FlushDLastCycleFlop(clk, reset, FlushD | (FlushDLastCycle & StallF), FlushDLastCycle); - mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, FlushDLastCycle, InstrRawD); + flopr #(1) FlushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCycleN | ~StallF), FlushDLastCycleN); + mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCycleN, InstrRawD); // Handle cache faults From 3b4f0141f4a7782ef3fd4c4d580d43f8cfa9d88f Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 25 Mar 2021 14:43:10 -0400 Subject: [PATCH 08/24] Begin work on compressed instructions --- .../regression/wave-dos/ahb-waves.do | 1 - .../regression/wave-dos/cache-waves.do | 1 - wally-pipelined/src/ebu/ahblite.sv | 
4 - wally-pipelined/src/ifu/icache.sv | 77 ++++++++++++++++--- .../testbench/testbench-imperas.sv | 2 +- 5 files changed, 69 insertions(+), 16 deletions(-) diff --git a/wally-pipelined/regression/wave-dos/ahb-waves.do b/wally-pipelined/regression/wave-dos/ahb-waves.do index c542f584..263693d7 100644 --- a/wally-pipelined/regression/wave-dos/ahb-waves.do +++ b/wally-pipelined/regression/wave-dos/ahb-waves.do @@ -51,7 +51,6 @@ add wave -hex /testbench/dut/hart/ebu/HRDATA add wave -hex /testbench/dut/hart/ebu/HWRITE add wave -hex /testbench/dut/hart/ebu/HWDATA add wave -hex /testbench/dut/hart/ebu/CaptureDataM -add wave -hex /testbench/dut/hart/ebu/InstrStall add wave -divider add wave -hex /testbench/dut/uncore/dtim/* diff --git a/wally-pipelined/regression/wave-dos/cache-waves.do b/wally-pipelined/regression/wave-dos/cache-waves.do index bdd88a13..20c7061b 100644 --- a/wally-pipelined/regression/wave-dos/cache-waves.do +++ b/wally-pipelined/regression/wave-dos/cache-waves.do @@ -61,7 +61,6 @@ add wave -hex /testbench/dut/hart/ebu/HRDATA add wave -hex /testbench/dut/hart/ebu/HWRITE add wave -hex /testbench/dut/hart/ebu/HWDATA add wave -hex /testbench/dut/hart/ebu/CaptureDataM -add wave -hex /testbench/dut/hart/ebu/InstrStall add wave -divider add wave -hex /testbench/dut/uncore/dtim/* diff --git a/wally-pipelined/src/ebu/ahblite.sv b/wally-pipelined/src/ebu/ahblite.sv index c0aa27db..73df76a3 100644 --- a/wally-pipelined/src/ebu/ahblite.sv +++ b/wally-pipelined/src/ebu/ahblite.sv @@ -136,10 +136,6 @@ module ahblite ( // stall signals assign #2 DataStall = (NextBusState == MEMREAD) || (NextBusState == MEMWRITE) || (NextBusState == ATOMICREAD) || (NextBusState == ATOMICWRITE); - // *** Could get finer grained stalling if we distinguish between MMU - // instruction address translation and data address translation - assign #1 InstrStall = (NextBusState == INSTRREAD) || (NextBusState == INSTRREADC) || - (NextBusState == MMUTRANSLATE) || (NextBusState == MMUIDLE); // bus 
outputs assign #1 GrantData = (NextBusState == MEMREAD) || (NextBusState == MEMWRITE) || diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 631a9bd6..09fb84ae 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -127,11 +127,11 @@ module icachecontroller #(parameter LINESIZE = 256) ( logic [31:0] AlignedInstrRawF, AlignedInstrRawD; logic FlushDLastCycleN; + logic PCPMisalignedF; const logic [31:0] NOP = 32'h13; - // TODO allow compressed instructions - // (start with noncompressed only to get something working) - assign CompressedF = 1'b0; + // Detect if the instruction is compressed + assign CompressedF = AlignedInstrRawF[1:0] != 2'b11; // Handle happy path (data in cache, reads aligned) always_comb begin @@ -141,9 +141,13 @@ module icachecontroller #(parameter LINESIZE = 256) ( generate if (`XLEN == 32) begin - assign AlignedInstrRawF = ICacheMemReadData; + assign AlignedInstrRawF = LowerPCF[1] ? {16'b0, ICacheMemReadData[31:16]} : ICacheMemReadData; + assign PCPMisalignedF = LowerPCF[1] && ~CompressedF; end else begin - assign AlignedInstrRawF = LowerPCF[2] ? ICacheMemReadData[63:32] : ICacheMemReadData[31:0]; + assign AlignedInstrRawF = LowerPCF[2] + ? (LowerPCF[1] ? MisalignedInstrRawF : ICacheMemReadData[63:32]) + : (LowerPCF[1] ? 
ICacheMemReadData[47:16] : ICacheMemReadData[31:0]); + assign PCPMisalignedF = LowerPCF[2] && LowerPCF[1] && ~CompressedF; end endgenerate @@ -151,15 +155,70 @@ module icachecontroller #(parameter LINESIZE = 256) ( flopr #(1) FlushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCycleN | ~StallF), FlushDLastCycleN); mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCycleN, InstrRawD); + // Stall for faults or misaligned reads + always_comb begin + assign ICacheStallF = FaultStall | MisalignedStall; + end + + // Handle misaligned, noncompressed reads + logic MisalignedState, NextMisalignedState; + logic MisalignedStall; + logic [15:0] MisalignedHalfInstrF; + logic [`XLEN:0] MisalignedInstrRawF; + + always_comb begin + assign MisalignedInstrRawF = {16'b0, ICacheMemReadData[63:48]}; + end + + flopenr #(16) MisalignedHalfInstrFlop(clk, reset, ~FaultStall & (PCPMisalignedF & MisalignedState), AlignedInstrRawF[15:0], MisalignedHalfInstrF); + flopenr #(1) MisalignedStateFlop(clk, reset, ~FaultStall, NextMisalignedState, MisalignedState); + + always_comb begin + assign MisalignedStall = PCPMisalignedF & MisalignedState; + assign NextMisalignedState = ~PCPMisalignedF | ~MisalignedState; + end + + // Pick the correct address to read + always_comb begin + if (~PCPMisalignedF) begin + assign ICacheMemReadUpperPAdr = UpperPCPF; + generate + if (`XLEN == 32) + assign ICacheMemReadLowerAdr = {LowerPCF[31:2], 2'b00}; + else + assign ICacheMemReadLowerAdr = {LowerPCF[31:3], 2'b000}; + endgenerate + end else begin + if (MisalignedState) begin + assign ICacheMemReadUpperPAdr = UpperPCPF; + generate + if (`XLEN == 32) + assign ICacheMemReadLowerAdr = {LowerPCF[31:2]+1, 2'b00}; + else + assign ICacheMemReadLowerAdr = {LowerPCF[31:3]+1, 2'b000}; + endgenerate + end else begin + assign ICacheMemReadUpperPAdr = UpperPCPF; + generate + if (`XLEN == 32) + assign ICacheMemReadLowerAdr = {LowerPCF[31:2], 2'b00}; + else + assign ICacheMemReadLowerAdr = {LowerPCF[31:3], 2'b000}; + 
endgenerate + end + end + end + // Handle cache faults localparam integer WORDSPERLINE = LINESIZE/`XLEN; localparam integer LOGWPL = $clog2(WORDSPERLINE); localparam integer OFFSETWIDTH = $clog2(LINESIZE/8); - logic FetchState, EndFetchState, BeginFetchState; - logic [LOGWPL:0] FetchWordNum, NextFetchWordNum; - logic [`XLEN-1:0] LineAlignedPCPF; + logic FetchState, EndFetchState, BeginFetchState; + logic FaultStall; + logic [LOGWPL:0] FetchWordNum, NextFetchWordNum; + logic [`XLEN-1:0] LineAlignedPCPF; flopr #(1) FetchStateFlop(clk, reset, BeginFetchState | (FetchState & ~EndFetchState), FetchState); flopr #(LOGWPL+1) FetchWordNumFlop(clk, reset, NextFetchWordNum, FetchWordNum); @@ -193,7 +252,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Stall the pipeline while loading a new line from memory always_comb begin - assign ICacheStallF = FetchState | ~ICacheMemReadValid; + assign FaultStall = FetchState | ~ICacheMemReadValid; end endmodule diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index 37d9883e..b94c1b62 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -340,7 +340,7 @@ string tests32i[] = { tests = testsBP64; end else begin tests = {tests64i}; - if (`C_SUPPORTED) tests = {tests, tests64ic}; + if (`C_SUPPORTED) tests = {tests64ic, tests}; else tests = {tests, tests64iNOc}; if (`M_SUPPORTED) tests = {tests, tests64m}; if (`A_SUPPORTED) tests = {tests, tests64a}; From 5f4feb0ff12dae150ac3c782c7ed1d7304c96920 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 25 Mar 2021 15:42:17 -0400 Subject: [PATCH 09/24] Works for misaligned instructions not on line boundaries --- wally-pipelined/src/ifu/icache.sv | 79 +++++++++++++++---------------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 09fb84ae..85ec4cd3 100644 --- 
a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -45,7 +45,7 @@ module icache( // High if the icache is requesting a stall output logic ICacheStallF, // The raw (not decompressed) instruction that was requested - // If the next instruction is compressed, the upper 16 bits may be anything + // If this instruction is compressed, upper 16 bits may be the next 16 bits or may be zeros output logic [31:0] InstrRawD ); @@ -125,23 +125,25 @@ module icachecontroller #(parameter LINESIZE = 256) ( output logic InstrReadF ); + // Happy path signals logic [31:0] AlignedInstrRawF, AlignedInstrRawD; logic FlushDLastCycleN; logic PCPMisalignedF; const logic [31:0] NOP = 32'h13; + // Misaligned signals + logic [`XLEN:0] MisalignedInstrRawF; + logic MisalignedStall; + // Cache fault signals + logic FaultStall; // Detect if the instruction is compressed assign CompressedF = AlignedInstrRawF[1:0] != 2'b11; // Handle happy path (data in cache, reads aligned) - always_comb begin - assign ICacheMemReadLowerAdr = LowerPCF; - assign ICacheMemReadUpperPAdr = UpperPCPF; - end generate if (`XLEN == 32) begin - assign AlignedInstrRawF = LowerPCF[1] ? {16'b0, ICacheMemReadData[31:16]} : ICacheMemReadData; + assign AlignedInstrRawF = LowerPCF[1] ? 
MisalignedInstrRawF : ICacheMemReadData; assign PCPMisalignedF = LowerPCF[1] && ~CompressedF; end else begin assign AlignedInstrRawF = LowerPCF[2] @@ -160,54 +162,48 @@ module icachecontroller #(parameter LINESIZE = 256) ( assign ICacheStallF = FaultStall | MisalignedStall; end - // Handle misaligned, noncompressed reads - logic MisalignedState, NextMisalignedState; - logic MisalignedStall; - logic [15:0] MisalignedHalfInstrF; - logic [`XLEN:0] MisalignedInstrRawF; - always_comb begin - assign MisalignedInstrRawF = {16'b0, ICacheMemReadData[63:48]}; - end + // Handle misaligned, noncompressed reads + + logic MisalignedState, NextMisalignedState; + logic [15:0] MisalignedHalfInstrF; + logic [15:0] UpperHalfWord; flopenr #(16) MisalignedHalfInstrFlop(clk, reset, ~FaultStall & (PCPMisalignedF & MisalignedState), AlignedInstrRawF[15:0], MisalignedHalfInstrF); flopenr #(1) MisalignedStateFlop(clk, reset, ~FaultStall, NextMisalignedState, MisalignedState); + // When doing a misaligned read, swizzle the bits correctly + generate + if (`XLEN == 32) begin + assign UpperHalfWord = ICacheMemReadData[31:16]; + end else begin + assign UpperHalfWord = ICacheMemReadData[63:48]; + end + endgenerate + always_comb begin + if (MisalignedState) begin + assign MisalignedInstrRawF = {16'b0, UpperHalfWord}; + end else begin + assign MisalignedInstrRawF = {ICacheMemReadData[15:0], MisalignedHalfInstrF}; + end + end + + // Manage internal state and stall when necessary always_comb begin assign MisalignedStall = PCPMisalignedF & MisalignedState; assign NextMisalignedState = ~PCPMisalignedF | ~MisalignedState; end // Pick the correct address to read - always_comb begin - if (~PCPMisalignedF) begin - assign ICacheMemReadUpperPAdr = UpperPCPF; - generate - if (`XLEN == 32) - assign ICacheMemReadLowerAdr = {LowerPCF[31:2], 2'b00}; - else - assign ICacheMemReadLowerAdr = {LowerPCF[31:3], 2'b000}; - endgenerate + generate + if (`XLEN == 32) begin + assign ICacheMemReadLowerAdr = {LowerPCF[11:2] + 
(PCPMisalignedF & ~MisalignedState), 2'b00}; end else begin - if (MisalignedState) begin - assign ICacheMemReadUpperPAdr = UpperPCPF; - generate - if (`XLEN == 32) - assign ICacheMemReadLowerAdr = {LowerPCF[31:2]+1, 2'b00}; - else - assign ICacheMemReadLowerAdr = {LowerPCF[31:3]+1, 2'b000}; - endgenerate - end else begin - assign ICacheMemReadUpperPAdr = UpperPCPF; - generate - if (`XLEN == 32) - assign ICacheMemReadLowerAdr = {LowerPCF[31:2], 2'b00}; - else - assign ICacheMemReadLowerAdr = {LowerPCF[31:3], 2'b000}; - endgenerate - end + assign ICacheMemReadLowerAdr = {LowerPCF[11:3] + (PCPMisalignedF & ~MisalignedState), 3'b00}; end - end + endgenerate + assign ICacheMemReadUpperPAdr = UpperPCPF; + // Handle cache faults @@ -216,7 +212,6 @@ module icachecontroller #(parameter LINESIZE = 256) ( localparam integer OFFSETWIDTH = $clog2(LINESIZE/8); logic FetchState, EndFetchState, BeginFetchState; - logic FaultStall; logic [LOGWPL:0] FetchWordNum, NextFetchWordNum; logic [`XLEN-1:0] LineAlignedPCPF; From 32829bf7a1679811364f607ea7ed89a87869db0d Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 25 Mar 2021 15:46:35 -0400 Subject: [PATCH 10/24] Remove old icache --- wally-pipelined/src/ifu/icache.sv | 111 ------------------------------ 1 file changed, 111 deletions(-) diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 85ec4cd3..046126d3 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -250,114 +250,3 @@ module icachecontroller #(parameter LINESIZE = 256) ( assign FaultStall = FetchState | ~ICacheMemReadValid; end endmodule - -module oldicache( - // Basic pipeline stuff - input logic clk, reset, - input logic StallF, StallD, - input logic FlushD, - // Upper bits of physical address for PC - input logic [`XLEN-1:12] UpperPCPF, - // Lower 12 bits of virtual PC address, since it's faster this way - input logic [11:0] LowerPCF, - // Data read in from the ebu unit - input logic [`XLEN-1:0] 
InstrInF, - // Read requested from the ebu unit - output logic [`XLEN-1:0] InstrPAdrF, - output logic InstrReadF, - // High if the instruction currently in the fetch stage is compressed - output logic CompressedF, - // High if the icache is requesting a stall - output logic ICacheStallF, - // The raw (not decompressed) instruction that was requested - // If the next instruction is compressed, the upper 16 bits may be anything - output logic [31:0] InstrRawD -); - logic DelayF, DelaySideF, FlushDLastCyclen, DelayD; - logic [1:0] InstrDMuxChoice; - logic [15:0] MisalignedHalfInstrF, MisalignedHalfInstrD; - logic [31:0] InstrF, AlignedInstrD; - // Buffer the last read, for ease of accessing it again - logic LastReadDataValidF; - logic [`XLEN-1:0] LastReadDataF, LastReadAdrF, InDataF; - - // instruction for NOP - logic [31:0] nop = 32'h00000013; - - // Temporary change to bridge the new interface to old behaviors - logic [`XLEN-1:0] PCPF; - assign PCPF = {UpperPCPF, LowerPCF}; - - // This flop doesn't stall if StallF is high because we should output a nop - // when FlushD happens, even if the pipeline is also stalled. 
- flopr #(1) flushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCyclen | ~StallF), FlushDLastCyclen); - - flopenr #(1) delayDFlop(clk, reset, ~StallF, DelayF & ~CompressedF, DelayD); - flopenrc#(1) delayStateFlop(clk, reset, FlushD, ~StallF, DelayF & ~DelaySideF, DelaySideF); - // This flop stores the first half of a misaligned instruction while waiting for the other half - flopenr #(16) halfInstrFlop(clk, reset, DelayF & ~StallF, MisalignedHalfInstrF, MisalignedHalfInstrD); - - // This flop is here to simulate pulling data out of the cache, which is edge-triggered - flopenr #(32) instrFlop(clk, reset, ~StallF, InstrF, AlignedInstrD); - - // These flops cache the previous read, to accelerate things - flopenr #(`XLEN) lastReadDataFlop(clk, reset, InstrReadF & ~StallF, InstrInF, LastReadDataF); - flopenr #(1) lastReadDataVFlop(clk, reset, InstrReadF & ~StallF, 1'b1, LastReadDataValidF); - flopenr #(`XLEN) lastReadAdrFlop(clk, reset, InstrReadF & ~StallF, InstrPAdrF, LastReadAdrF); - - // Decide which address needs to be fetched and sent out over InstrPAdrF - // If the requested address fits inside one read from memory, we fetch that - // address, adjusted to the bit width. Otherwise, we request the lower word - // and then the upper word, in that order. - generate - if (`XLEN == 32) begin - assign InstrPAdrF = PCPF[1] ? ((DelaySideF & ~CompressedF) ? {PCPF[31:2], 2'b00} : {PCPF[31:2], 2'b00}) : PCPF; - end else begin - assign InstrPAdrF = PCPF[2] ? (PCPF[1] ? ((DelaySideF & ~CompressedF) ? 
{PCPF[63:3]+1, 3'b000} : {PCPF[63:3], 3'b000}) : {PCPF[63:3], 3'b000}) : {PCPF[63:3], 3'b000}; - end - endgenerate - - // Read from memory if we don't have the address we want - always_comb if (LastReadDataValidF & (InstrPAdrF == LastReadAdrF)) begin - assign InstrReadF = 0; - end else begin - assign InstrReadF = 1; - end - - // Pick from the memory input or from the previous read, as appropriate - mux2 #(`XLEN) inDataMux(LastReadDataF, InstrInF, InstrReadF, InDataF); - - // If the instruction fits in one memory read, then we put the right bits - // into InstrF. Otherwise, we activate DelayF to signal the rest of the - // machinery to swizzle bits. - generate - if (`XLEN == 32) begin - assign InstrF = PCPF[1] ? {16'b0, InDataF[31:16]} : InDataF; - assign DelayF = PCPF[1]; - assign MisalignedHalfInstrF = InDataF[31:16]; - end else begin - assign InstrF = PCPF[2] ? (PCPF[1] ? {16'b0, InDataF[63:48]} : InDataF[63:32]) : (PCPF[1] ? InDataF[47:16] : InDataF[31:0]); - assign DelayF = PCPF[1] && PCPF[2]; - assign MisalignedHalfInstrF = InDataF[63:48]; - end - endgenerate - // We will likely need to stall later, but stalls are handled by the rest of the pipeline for now - assign ICacheStallF = 0; - - // Detect if the instruction is compressed - assign CompressedF = InstrF[1:0] != 2'b11; - - // Pick the correct output, depending on whether we have to assemble this - // instruction from two reads or not. - // Output the requested instruction (we don't need to worry if the read is - // incomplete, since the pipeline stalls for us when it isn't), or a NOP for - // the cycle when the first of two reads comes in. 
- always_comb if (~FlushDLastCyclen) begin - assign InstrDMuxChoice = 2'b10; - end else if (DelayD & (MisalignedHalfInstrD[1:0] != 2'b11)) begin - assign InstrDMuxChoice = 2'b11; - end else begin - assign InstrDMuxChoice = {1'b0, DelayD}; - end - mux4 #(32) instrDMux (AlignedInstrD, {InstrInF[15:0], MisalignedHalfInstrD}, nop, {16'b0, MisalignedHalfInstrD}, InstrDMuxChoice, InstrRawD); -endmodule From 39bf2347bc6044496a0a4e978fff505ce0259cb7 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 25 Mar 2021 18:47:23 -0400 Subject: [PATCH 11/24] Fix error when reading an instruction that crosses a line boundary --- wally-pipelined/regression/wave-dos/cache-waves.do | 3 +++ wally-pipelined/src/ifu/icache.sv | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/wally-pipelined/regression/wave-dos/cache-waves.do b/wally-pipelined/regression/wave-dos/cache-waves.do index 20c7061b..c7b32e1e 100644 --- a/wally-pipelined/regression/wave-dos/cache-waves.do +++ b/wally-pipelined/regression/wave-dos/cache-waves.do @@ -31,8 +31,11 @@ add wave -hex /testbench/dut/hart/ifu/ic/InstrPAdrF add wave -hex /testbench/dut/hart/ifu/ic/InstrAckF add wave -hex /testbench/dut/hart/ifu/ic/controller/ICacheMemWriteData add wave -hex /testbench/dut/hart/ifu/ic/controller/ICacheMemWritePAdr +add wave -hex /testbench/dut/hart/ifu/ic/controller/MisalignedState +add wave -hex /testbench/dut/hart/ifu/ic/controller/MisalignedHalfInstrF add wave -divider + add wave -hex /testbench/dut/hart/ifu/PCE add wave -hex /testbench/dut/hart/ifu/InstrE add wave /testbench/InstrEName diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 046126d3..0d953760 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -233,7 +233,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Machinery to request the correct addresses from main memory always_comb begin assign InstrReadF = FetchState & ~EndFetchState; - assign 
LineAlignedPCPF = {UpperPCPF, LowerPCF[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; + assign LineAlignedPCPF = {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; assign InstrPAdrF = LineAlignedPCPF + FetchWordNum*(`XLEN/8); assign NextFetchWordNum = FetchState ? FetchWordNum+InstrAckF : {LOGWPL+1{1'b0}}; end From 5afb2552519af4b74954d7941796352cf2515ba1 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 1 Apr 2021 13:55:21 -0400 Subject: [PATCH 12/24] Begin changes to direct-mapped cache --- wally-pipelined/src/cache/dmapped.sv | 100 +++++++++++++++++---------- 1 file changed, 63 insertions(+), 37 deletions(-) diff --git a/wally-pipelined/src/cache/dmapped.sv b/wally-pipelined/src/cache/dmapped.sv index 9a51737a..9138089e 100644 --- a/wally-pipelined/src/cache/dmapped.sv +++ b/wally-pipelined/src/cache/dmapped.sv @@ -26,7 +26,7 @@ `include "wally-config.vh" -module rodirectmappedmem #(parameter LINESIZE = 256, parameter NUMLINES = 512, parameter WORDSIZE = `XLEN) ( +module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, parameter WORDSIZE = `XLEN) ( // Pipeline stuff input logic clk, input logic reset, @@ -44,50 +44,76 @@ module rodirectmappedmem #(parameter LINESIZE = 256, parameter NUMLINES = 512, p output logic DataValid ); - localparam integer SETWIDTH = $clog2(NUMLINES); - localparam integer OFFSETWIDTH = $clog2(LINESIZE/8); - localparam integer TAGWIDTH = `XLEN-SETWIDTH-OFFSETWIDTH; + // Various compile-time constants + localparam integer WORDWIDTH = $clog2(WORDSIZE); + localparam integer LINEWIDTH = $clog2(LINESIZE/8); + localparam integer OFFSETWIDTH = $clog2(LINESIZE) - WORDWIDTH; + localparam integer SETWIDTH = $clog2(NUMLINES); + localparam integer TAGWIDTH = $clog2(`XLEN) - $clog2(LINESIZE) - SETWIDTH; - logic [NUMLINES-1:0][WORDSIZE-1:0] LineOutputs; - logic [NUMLINES-1:0] ValidOutputs; - logic [NUMLINES-1:0][TAGWIDTH-1:0] TagOutputs; - logic [OFFSETWIDTH-1:0] WordSelect; - logic [`XLEN-1:0] 
ReadPAdr; - logic [SETWIDTH-1:0] ReadSet, WriteSet; - logic [TAGWIDTH-1:0] ReadTag, WriteTag; + // Machinery to read from and write to the correct addresses in memory + logic [`XLEN-1:0] ReadPAdr; + logic [OFFSETWIDTH-1:0] ReadOffset, WriteOffset; + logic [SETWIDTH-1:0] ReadSet, WriteSet; + logic [TAGWIDTH-1:0] ReadTag, WriteTag; - // Swizzle bits to get the offset, set, and tag out of the read and write addresses + // Machinery to check if a given read is valid and is the desired value + logic [TAGWIDTH-1:0] DataTag; + logic [NUMLINES-1:0] ValidOut, NextValidOut; + + // Assign the read and write addresses in cache memory always_comb begin - // Read address - assign WordSelect = ReadLowerAdr[OFFSETWIDTH-1:0]; + assign ReadOffset = ReadLowerAdr[WORDWIDTH+OFFSETWIDTH-1:WORDWIDTH]; assign ReadPAdr = {ReadUpperPAdr, ReadLowerAdr}; - assign ReadSet = ReadPAdr[SETWIDTH+OFFSETWIDTH-1:OFFSETWIDTH]; - assign ReadTag = ReadPAdr[`XLEN-1:SETWIDTH+OFFSETWIDTH]; - // Write address - assign WriteSet = WritePAdr[SETWIDTH+OFFSETWIDTH-1:OFFSETWIDTH]; - assign WriteTag = WritePAdr[`XLEN-1:SETWIDTH+OFFSETWIDTH]; + assign ReadSet = ReadPAdr[LINEWIDTH+SETWIDTH-1:LINEWIDTH]; + assign ReadTag = ReadPAdr[`XLEN-1:LINEWIDTH+SETWIDTH]; + + assign WriteOffset = WritePAdr[WORDWIDTH+OFFSETWIDTH-1:WORDWIDTH]; + assign WriteSet = WritePAdr[LINEWIDTH+SETWIDTH-1:LINEWIDTH]; + assign WriteTag = WritePAdr[`XLEN-1:LINEWIDTH+SETWIDTH]; end - genvar i; - generate - for (i=0; i < NUMLINES; i++) begin - rocacheline #(LINESIZE, TAGWIDTH, WORDSIZE) lines ( - .*, - .WriteEnable(WriteEnable & (WriteSet == i)), - .WriteData(WriteLine), - .WriteTag(WriteTag), - .DataWord(LineOutputs[i]), - .DataTag(TagOutputs[i]), - .DataValid(ValidOutputs[i]) - ); - end - endgenerate + SRAM2P1R1W #(.Depth(OFFSETWIDTH), .Width(WORDSIZE)) cachemem ( + .*, + .RA1(ReadOffset), + .RD1(DataWord), + .REN1(1'b1), + .WA1(WriteOffset), + .WD1(WriteSet), + .WEN1(WriteEnable), + .BitWEN1(0) + ); - // Get the data and valid out of the lines 
+ SRAM2P1R1W #(.Depth(OFFSETWIDTH), .Width(TAGWIDTH)) cachetags ( + .*, + .RA1(ReadOffset), + .RD1(DataTag), + .REN1(1'b1), + .WA1(WriteOffset), + .WD1(WriteTag), + .WEN1(WriteEnable), + .BitWEN1(0) + ); + + // Correctly handle the valid bits always_comb begin - assign DataWord = LineOutputs[ReadSet]; - assign DataValid = ValidOutputs[ReadSet] & (TagOutputs[ReadSet] == ReadTag); + if (WriteEnable) begin + assign NextValidOut = {NextValidOut[NUMLINES-1:WriteSet+1], 1'b1, NextValidOut[WriteSet-1:0]}; + end else begin + assign NextValidOut = ValidOut; + end + end + always_ff @(posedge clk, reset, flush) begin + if (reset || flush) begin + ValidOut <= {NUMLINES{1'b0}}; + end else begin + ValidOut <= NextValidOut; + end + end + + // Determine if the line coming out is valid and matches the desired data + always_comb begin + assign DataValid = ValidOut[ReadSet] && (DataTag == ReadTag); end endmodule - From fc8b8ad7aa746b2776d5ff8582b35a336c553e89 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Tue, 13 Apr 2021 01:06:57 -0400 Subject: [PATCH 13/24] A few more cache fixes --- wally-pipelined/src/cache/cache-sram.sv | 22 +++++ wally-pipelined/src/cache/dmapped.sv | 92 ++++++++++--------- wally-pipelined/src/ifu/icache.sv | 33 ++++--- wally-pipelined/src/ifu/ifu.sv | 8 +- .../testbench/testbench-imperas.sv | 2 +- 5 files changed, 95 insertions(+), 62 deletions(-) create mode 100644 wally-pipelined/src/cache/cache-sram.sv diff --git a/wally-pipelined/src/cache/cache-sram.sv b/wally-pipelined/src/cache/cache-sram.sv new file mode 100644 index 00000000..0ba0efa5 --- /dev/null +++ b/wally-pipelined/src/cache/cache-sram.sv @@ -0,0 +1,22 @@ +// Depth is number of bits in one "word" of the memory, width is number of such words +module Sram1Read1Write #(parameter DEPTH=128, WIDTH=256) ( + input logic clk, + // port 1 is read only + input logic [$clog2(WIDTH)-1:0] ReadAddr, + output logic [DEPTH-1:0] ReadData, + + // port 2 is write only + input logic [$clog2(WIDTH)-1:0] WriteAddr, 
+ input logic [DEPTH-1:0] WriteData, + input logic WriteEnable +); + + logic [WIDTH-1:0][DEPTH-1:0] StoredData; + + always_ff @(posedge clk) begin + ReadData <= StoredData[ReadAddr]; + if (WriteEnable) begin + StoredData[WriteAddr] <= WriteData; + end + end +endmodule diff --git a/wally-pipelined/src/cache/dmapped.sv b/wally-pipelined/src/cache/dmapped.sv index 9138089e..346355bd 100644 --- a/wally-pipelined/src/cache/dmapped.sv +++ b/wally-pipelined/src/cache/dmapped.sv @@ -30,6 +30,7 @@ module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par // Pipeline stuff input logic clk, input logic reset, + input logic stall, // If flush is high, invalidate the entire cache input logic flush, // Select which address to read (broken for efficiency's sake) @@ -45,75 +46,80 @@ module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par ); // Various compile-time constants - localparam integer WORDWIDTH = $clog2(WORDSIZE); - localparam integer LINEWIDTH = $clog2(LINESIZE/8); - localparam integer OFFSETWIDTH = $clog2(LINESIZE) - WORDWIDTH; + localparam integer WORDWIDTH = $clog2(WORDSIZE/8); + localparam integer OFFSETWIDTH = $clog2(LINESIZE/WORDSIZE); localparam integer SETWIDTH = $clog2(NUMLINES); - localparam integer TAGWIDTH = $clog2(`XLEN) - $clog2(LINESIZE) - SETWIDTH; + localparam integer TAGWIDTH = `XLEN - OFFSETWIDTH - SETWIDTH - WORDWIDTH; + + localparam integer OFFSETBEGIN = WORDWIDTH; + localparam integer OFFSETEND = OFFSETBEGIN+OFFSETWIDTH-1; + localparam integer SETBEGIN = OFFSETEND+1; + localparam integer SETEND = SETBEGIN + SETWIDTH - 1; + localparam integer TAGBEGIN = SETEND + 1; + localparam integer TAGEND = TAGBEGIN + TAGWIDTH - 1; // Machinery to read from and write to the correct addresses in memory logic [`XLEN-1:0] ReadPAdr; + logic [`XLEN-1:0] OldReadPAdr; logic [OFFSETWIDTH-1:0] ReadOffset, WriteOffset; logic [SETWIDTH-1:0] ReadSet, WriteSet; logic [TAGWIDTH-1:0] ReadTag, WriteTag; + logic [LINESIZE-1:0] 
ReadLine; + logic [LINESIZE/WORDSIZE-1:0][WORDSIZE-1:0] ReadLineTransformed; // Machinery to check if a given read is valid and is the desired value logic [TAGWIDTH-1:0] DataTag; - logic [NUMLINES-1:0] ValidOut, NextValidOut; + logic [NUMLINES-1:0] ValidOut; + + flopenr #(`XLEN) ReadPAdrFlop(clk, reset, ~stall, ReadPAdr, OldReadPAdr); // Assign the read and write addresses in cache memory always_comb begin - assign ReadOffset = ReadLowerAdr[WORDWIDTH+OFFSETWIDTH-1:WORDWIDTH]; + assign ReadOffset = OldReadPAdr[OFFSETEND:OFFSETBEGIN]; assign ReadPAdr = {ReadUpperPAdr, ReadLowerAdr}; - assign ReadSet = ReadPAdr[LINEWIDTH+SETWIDTH-1:LINEWIDTH]; - assign ReadTag = ReadPAdr[`XLEN-1:LINEWIDTH+SETWIDTH]; + assign ReadSet = ReadPAdr[SETEND:SETBEGIN]; + assign ReadTag = OldReadPAdr[TAGEND:TAGBEGIN]; - assign WriteOffset = WritePAdr[WORDWIDTH+OFFSETWIDTH-1:WORDWIDTH]; - assign WriteSet = WritePAdr[LINEWIDTH+SETWIDTH-1:LINEWIDTH]; - assign WriteTag = WritePAdr[`XLEN-1:LINEWIDTH+SETWIDTH]; + assign WriteOffset = WritePAdr[OFFSETEND:OFFSETBEGIN]; + assign WriteSet = WritePAdr[SETEND:SETBEGIN]; + assign WriteTag = WritePAdr[TAGEND:TAGBEGIN]; end - SRAM2P1R1W #(.Depth(OFFSETWIDTH), .Width(WORDSIZE)) cachemem ( + // Depth is number of bits in one "word" of the memory, width is number of such words + Sram1Read1Write #(.DEPTH(LINESIZE), .WIDTH(NUMLINES)) cachemem ( .*, - .RA1(ReadOffset), - .RD1(DataWord), - .REN1(1'b1), - .WA1(WriteOffset), - .WD1(WriteSet), - .WEN1(WriteEnable), - .BitWEN1(0) + .ReadAddr(ReadSet), + .ReadData(ReadLine), + .WriteAddr(WriteSet), + .WriteData(WriteLine) + ); + Sram1Read1Write #(.DEPTH(TAGWIDTH), .WIDTH(NUMLINES)) cachetags ( + .*, + .ReadAddr(ReadSet), + .ReadData(DataTag), + .WriteAddr(WriteSet), + .WriteData(WriteTag) ); - SRAM2P1R1W #(.Depth(OFFSETWIDTH), .Width(TAGWIDTH)) cachetags ( - .*, - .RA1(ReadOffset), - .RD1(DataTag), - .REN1(1'b1), - .WA1(WriteOffset), - .WD1(WriteTag), - .WEN1(WriteEnable), - .BitWEN1(0) - ); + // Pick the right bits 
coming out the read line + assign DataWord = ReadLineTransformed[ReadOffset]; + genvar i; + generate + for (i=0; i < LINESIZE/WORDSIZE; i++) begin + assign ReadLineTransformed[i] = ReadLine[(i+1)*WORDSIZE-1:i*WORDSIZE]; + end + endgenerate // Correctly handle the valid bits - always_comb begin - if (WriteEnable) begin - assign NextValidOut = {NextValidOut[NUMLINES-1:WriteSet+1], 1'b1, NextValidOut[WriteSet-1:0]}; - end else begin - assign NextValidOut = ValidOut; - end - end - always_ff @(posedge clk, reset, flush) begin + always_ff @(posedge clk, posedge reset) begin if (reset || flush) begin ValidOut <= {NUMLINES{1'b0}}; end else begin - ValidOut <= NextValidOut; + if (WriteEnable) begin + ValidOut[WriteSet] <= 1; + end end - end - - // Determine if the line coming out is valid and matches the desired data - always_comb begin - assign DataValid = ValidOut[ReadSet] && (DataTag == ReadTag); + DataValid <= ValidOut[ReadSet] && (DataTag == ReadTag); end endmodule diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 0d953760..e42edb49 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -31,9 +31,9 @@ module icache( input logic StallF, StallD, input logic FlushD, // Upper bits of physical address for PC - input logic [`XLEN-1:12] UpperPCPF, + input logic [`XLEN-1:12] UpperPCNextPF, // Lower 12 bits of virtual PC address, since it's faster this way - input logic [11:0] LowerPCF, + input logic [11:0] LowerPCNextF, // Data read in from the ebu unit input logic [`XLEN-1:0] InstrInF, input logic InstrAckF, @@ -65,8 +65,9 @@ module icache( logic [`XLEN-1:0] ICacheMemReadData; logic ICacheMemReadValid; - rodirectmappedmem #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES)) cachemem( + rodirectmappedmem #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES), .WORDSIZE(`XLEN)) cachemem( .*, + .stall(StallF && (~ICacheStallF || ~InstrAckF)), .flush(FlushMem), .ReadUpperPAdr(ICacheMemReadUpperPAdr), 
.ReadLowerAdr(ICacheMemReadLowerAdr), @@ -79,6 +80,7 @@ module icache( icachecontroller #(.LINESIZE(ICACHELINESIZE)) controller(.*); + // For now, assume no writes to executable memory assign FlushMem = 1'b0; endmodule @@ -90,9 +92,9 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Input the address to read // The upper bits of the physical pc - input logic [`XLEN-1:12] UpperPCPF, + input logic [`XLEN-1:12] UpperPCNextPF, // The lower bits of the virtual pc - input logic [11:0] LowerPCF, + input logic [11:0] LowerPCNextF, // Signals to/from cache memory // The read coming out of it @@ -130,6 +132,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( logic FlushDLastCycleN; logic PCPMisalignedF; const logic [31:0] NOP = 32'h13; + logic [`XLEN-1:0] PCPF; // Misaligned signals logic [`XLEN:0] MisalignedInstrRawF; logic MisalignedStall; @@ -143,18 +146,19 @@ module icachecontroller #(parameter LINESIZE = 256) ( generate if (`XLEN == 32) begin - assign AlignedInstrRawF = LowerPCF[1] ? MisalignedInstrRawF : ICacheMemReadData; - assign PCPMisalignedF = LowerPCF[1] && ~CompressedF; + assign AlignedInstrRawF = PCPF[1] ? MisalignedInstrRawF : ICacheMemReadData; + assign PCPMisalignedF = PCPF[1] && ~CompressedF; end else begin - assign AlignedInstrRawF = LowerPCF[2] - ? (LowerPCF[1] ? MisalignedInstrRawF : ICacheMemReadData[63:32]) - : (LowerPCF[1] ? ICacheMemReadData[47:16] : ICacheMemReadData[31:0]); - assign PCPMisalignedF = LowerPCF[2] && LowerPCF[1] && ~CompressedF; + assign AlignedInstrRawF = PCPF[2] + ? (PCPF[1] ? MisalignedInstrRawF : ICacheMemReadData[63:32]) + : (PCPF[1] ? 
ICacheMemReadData[47:16] : ICacheMemReadData[31:0]); + assign PCPMisalignedF = PCPF[2] && PCPF[1] && ~CompressedF; end endgenerate flopenr #(32) AlignedInstrRawDFlop(clk, reset, ~StallD, AlignedInstrRawF, AlignedInstrRawD); flopr #(1) FlushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCycleN | ~StallF), FlushDLastCycleN); + flopenr #(`XLEN) PCPFFlop(clk, reset, ~StallF, {UpperPCNextPF, LowerPCNextF}, PCPF); mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCycleN, InstrRawD); // Stall for faults or misaligned reads @@ -197,12 +201,13 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Pick the correct address to read generate if (`XLEN == 32) begin - assign ICacheMemReadLowerAdr = {LowerPCF[11:2] + (PCPMisalignedF & ~MisalignedState), 2'b00}; + assign ICacheMemReadLowerAdr = {LowerPCNextF[11:2] + (PCPMisalignedF & ~MisalignedState), 2'b00}; end else begin - assign ICacheMemReadLowerAdr = {LowerPCF[11:3] + (PCPMisalignedF & ~MisalignedState), 3'b00}; + assign ICacheMemReadLowerAdr = {LowerPCNextF[11:3] + (PCPMisalignedF & ~MisalignedState), 3'b00}; end endgenerate - assign ICacheMemReadUpperPAdr = UpperPCPF; + // TODO Handle reading instructions that cross page boundaries + assign ICacheMemReadUpperPAdr = UpperPCNextPF; // Handle cache faults diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv index 3347d80c..2ddd6706 100644 --- a/wally-pipelined/src/ifu/ifu.sv +++ b/wally-pipelined/src/ifu/ifu.sv @@ -71,7 +71,7 @@ module ifu ( logic misaligned, BranchMisalignedFaultE, BranchMisalignedFaultM, TrapMisalignedFaultM; logic PrivilegedChangePCM; logic IllegalCompInstrD; - logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCD, PCW, PCLinkD, PCLinkM, PCPF; + logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCD, PCW, PCLinkD, PCLinkM, PCNextPF; logic CompressedF; logic [31:0] InstrRawD, InstrE, InstrW; logic [31:0] nop = 32'h00000013; // instruction for NOP @@ -98,12 +98,12 @@ module ifu ( // assign InstrReadF = 1; // *** & ICacheMissF; add 
later // jarred 2021-03-14 Add instrution cache block to remove rd2 - assign PCPF = PCF; // Temporary workaround until iTLB is live + assign PCNextPF = PCNextF; // Temporary workaround until iTLB is live icache ic( .*, .InstrPAdrF(ICacheInstrPAdrF), - .UpperPCPF(PCPF[`XLEN-1:12]), - .LowerPCF(PCF[11:0]) + .UpperPCNextPF(PCNextPF[`XLEN-1:12]), + .LowerPCNextF(PCNextF[11:0]) ); // Prioritize the iTLB for reads if it wants one mux2 #(`XLEN) instrPAdrMux(ICacheInstrPAdrF, ITLBInstrPAdrF, ITLBMissF, InstrPAdrF); diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index 09c63d56..fbd4a829 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -393,7 +393,7 @@ module testbench(); // if (`F_SUPPORTED) tests = {tests64f, tests}; // if (`D_SUPPORTED) tests = {tests64d, tests}; if (`A_SUPPORTED) tests = {tests, tests64a}; - if (`MEM_VIRTMEM) tests = {tests64mmu, tests}; + if (`MEM_VIRTMEM) tests = {tests, tests64mmu}; end //tests = {tests64a, tests}; // tests = {tests, tests64p}; From 892dfd5a9bff95f018476e397dfd4130d198dd6a Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Wed, 14 Apr 2021 19:03:33 -0400 Subject: [PATCH 14/24] More icache bugfixes --- wally-pipelined/src/cache/dmapped.sv | 5 +++-- wally-pipelined/src/ifu/icache.sv | 27 +++++++++++++++------------ 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/wally-pipelined/src/cache/dmapped.sv b/wally-pipelined/src/cache/dmapped.sv index 634f29a9..f97cfa2d 100644 --- a/wally-pipelined/src/cache/dmapped.sv +++ b/wally-pipelined/src/cache/dmapped.sv @@ -70,6 +70,7 @@ module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par // Machinery to check if a given read is valid and is the desired value logic [TAGWIDTH-1:0] DataTag; logic [NUMLINES-1:0] ValidOut; + logic DataValidBit; flopenr #(`XLEN) ReadPAdrFlop(clk, reset, ~stall, ReadPAdr, OldReadPAdr); @@ -119,7 +120,7 @@ 
module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par ValidOut[WriteSet] <= 1; end end - DataValid <= ValidOut[ReadSet] && (DataTag == ReadTag); + DataValidBit <= ValidOut[ReadSet]; end - + assign DataValid = DataValidBit && (DataTag == ReadTag); endmodule diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index e42edb49..31d16b8a 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -61,13 +61,14 @@ module icache( logic ICacheMemWriteEnable; logic [ICACHELINESIZE-1:0] ICacheMemWriteData; logic [`XLEN-1:0] ICacheMemWritePAdr; + logic EndFetchState; // Output signals from cache memory logic [`XLEN-1:0] ICacheMemReadData; logic ICacheMemReadValid; rodirectmappedmem #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES), .WORDSIZE(`XLEN)) cachemem( .*, - .stall(StallF && (~ICacheStallF || ~InstrAckF)), + .stall(StallF && (~ICacheStallF || ~EndFetchState)), .flush(FlushMem), .ReadUpperPAdr(ICacheMemReadUpperPAdr), .ReadLowerAdr(ICacheMemReadLowerAdr), @@ -116,7 +117,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( output logic [31:0] InstrRawD, // Outputs to pipeline control stuff - output logic ICacheStallF, + output logic ICacheStallF, EndFetchState, // Signals to/from ahblite interface // A read containing the requested data @@ -142,6 +143,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Detect if the instruction is compressed assign CompressedF = AlignedInstrRawF[1:0] != 2'b11; + // Handle happy path (data in cache, reads aligned) generate @@ -216,7 +218,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( localparam integer LOGWPL = $clog2(WORDSPERLINE); localparam integer OFFSETWIDTH = $clog2(LINESIZE/8); - logic FetchState, EndFetchState, BeginFetchState; + logic FetchState, BeginFetchState; logic [LOGWPL:0] FetchWordNum, NextFetchWordNum; logic [`XLEN-1:0] LineAlignedPCPF; @@ -232,26 +234,27 @@ module icachecontroller #(parameter LINESIZE 
= 256) ( // Enter the fetch state when we hit a cache fault always_comb begin - assign BeginFetchState = ~ICacheMemReadValid & ~FetchState; + BeginFetchState = ~ICacheMemReadValid & ~FetchState & (FetchWordNum == 0); end + // Exit the fetch state once the cache line has been loaded + flopr #(1) EndFetchStateFlop(clk, reset, ICacheMemWriteEnable, EndFetchState); // Machinery to request the correct addresses from main memory always_comb begin - assign InstrReadF = FetchState & ~EndFetchState; - assign LineAlignedPCPF = {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; - assign InstrPAdrF = LineAlignedPCPF + FetchWordNum*(`XLEN/8); - assign NextFetchWordNum = FetchState ? FetchWordNum+InstrAckF : {LOGWPL+1{1'b0}}; + InstrReadF = FetchState & ~EndFetchState & ~ICacheMemWriteEnable; + LineAlignedPCPF = {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; + InstrPAdrF = LineAlignedPCPF + FetchWordNum*(`XLEN/8); + NextFetchWordNum = FetchState ? 
FetchWordNum+InstrAckF : {LOGWPL+1{1'b0}}; end // Write to cache memory when we have the line here always_comb begin - assign EndFetchState = FetchWordNum == {1'b1, {LOGWPL{1'b0}}} & FetchState; - assign ICacheMemWritePAdr = LineAlignedPCPF; - assign ICacheMemWriteEnable = EndFetchState; + ICacheMemWritePAdr = LineAlignedPCPF; + ICacheMemWriteEnable = FetchWordNum == {1'b1, {LOGWPL{1'b0}}} & FetchState & ~EndFetchState; end // Stall the pipeline while loading a new line from memory always_comb begin - assign FaultStall = FetchState | ~ICacheMemReadValid; + FaultStall = FetchState | ~ICacheMemReadValid; end endmodule From 3717699ad93533f8b8005d6003b91806d66022ac Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Wed, 14 Apr 2021 23:14:59 -0400 Subject: [PATCH 15/24] Add a comment to explain a detail --- wally-pipelined/src/ifu/icache.sv | 1 + 1 file changed, 1 insertion(+) diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 31d16b8a..f6440fcf 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -68,6 +68,7 @@ module icache( rodirectmappedmem #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES), .WORDSIZE(`XLEN)) cachemem( .*, + // Stall it if the pipeline is stalled, unless we're stalling it and we're ending our stall .stall(StallF && (~ICacheStallF || ~EndFetchState)), .flush(FlushMem), .ReadUpperPAdr(ICacheMemReadUpperPAdr), From 7854d838c70e024a91614fe105f1ce3e126f74e7 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 15 Apr 2021 21:13:40 -0400 Subject: [PATCH 16/24] Enable linting of blocks not yet in the hierarchy --- wally-pipelined/lint-wally | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/wally-pipelined/lint-wally b/wally-pipelined/lint-wally index 9d5a20ba..791435ac 100755 --- a/wally-pipelined/lint-wally +++ b/wally-pipelined/lint-wally @@ -1,11 +1,25 @@ # check for warnings in Verilog code # The verilator lint tool is faster and 
better than Modelsim so it is best to run this first. -echo "rv64ic linting..." -verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv64ic src/*/*.sv -echo "rv32ic linting..." -verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv32ic src/*/*.sv -#verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv64ic src/*/*.sv src/*/div/*.sv +if [ -n "$1" ]; then + echo "rv64ic linting..." + if verilator --lint-only --top-module "$1" -Iconfig/rv64ic src/*/*.sv; then + echo "rv32ic linting..." + verilator --lint-only --top-module "$1" -Iconfig/rv32ic src/*/*.sv + else + echo "Skipping rv32ic because rv64ic had errors or warnings" + exit 1 + fi +else + echo "rv64ic linting..." + if verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv64ic src/*/*.sv; then + echo "rv32ic linting..." + verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv32ic src/*/*.sv + else + echo "Skipping rv32ic because rv64ic had errors or warnings" + exit 1 + fi +fi # --lint-only just runs lint rather than trying to compile and simulate # -I points to the include directory where files such as `include wally-config.vh are found From aef57cab500b8afb47a23139838f657899ee3b20 Mon Sep 17 00:00:00 2001 From: Jarred Allen Date: Thu, 15 Apr 2021 21:13:56 -0400 Subject: [PATCH 17/24] dcache lints --- wally-pipelined/src/cache/dmapped.sv | 111 +++++++++++++++- wally-pipelined/src/dmem/dcache.sv | 184 +++++++++++++++++++++++++++ 2 files changed, 293 insertions(+), 2 deletions(-) create mode 100644 wally-pipelined/src/dmem/dcache.sv diff --git a/wally-pipelined/src/cache/dmapped.sv b/wally-pipelined/src/cache/dmapped.sv index f97cfa2d..fb6ce4c5 100644 --- a/wally-pipelined/src/cache/dmapped.sv +++ b/wally-pipelined/src/cache/dmapped.sv @@ -4,8 +4,7 @@ // Written: jaallen@g.hmc.edu 2021-03-23 // Modified: // -// Purpose: An implementation of a direct-mapped cache memory -// This cache is read-only, so "write"s to the memory are loading new data +// Purpose: An 
implementation of a direct-mapped cache memory, with read-only and write-through versions // // A component of the Wally configurable RISC-V project. // @@ -26,6 +25,7 @@ `include "wally-config.vh" +// Read-only direct-mapped memory module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, parameter WORDSIZE = `XLEN) ( // Pipeline stuff input logic clk, @@ -124,3 +124,110 @@ module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par end assign DataValid = DataValidBit && (DataTag == ReadTag); endmodule + +// Write-through direct-mapped memory +module wtdirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, parameter WORDSIZE = `XLEN) ( + // Pipeline stuff + input logic clk, + input logic reset, + input logic stall, + // If flush is high, invalidate the entire cache + input logic flush, + // Select which address to read (broken for efficiency's sake) + input logic [`XLEN-1:12] ReadUpperPAdr, + input logic [11:0] ReadLowerAdr, + // Load new data into the cache (from main memory) + input logic LoadEnable, + input logic [LINESIZE-1:0] LoadLine, + input logic [`XLEN-1:0] LoadPAdr, + // Write data to the cache (like from a store instruction) + input logic WriteEnable, + input logic [WORDSIZE-1:0] WriteWord, + input logic [`XLEN-1:0] WritePAdr, + input logic [1:0] WriteSize, // Specify size of the write (non-written bits should be preserved) + // Output the word, as well as if it is valid + output logic [WORDSIZE-1:0] DataWord, + output logic DataValid +); + + // Various compile-time constants + localparam integer WORDWIDTH = $clog2(WORDSIZE/8); + localparam integer OFFSETWIDTH = $clog2(LINESIZE/WORDSIZE); + localparam integer SETWIDTH = $clog2(NUMLINES); + localparam integer TAGWIDTH = `XLEN - OFFSETWIDTH - SETWIDTH - WORDWIDTH; + + localparam integer OFFSETBEGIN = WORDWIDTH; + localparam integer OFFSETEND = OFFSETBEGIN+OFFSETWIDTH-1; + localparam integer SETBEGIN = OFFSETEND+1; + localparam integer SETEND = SETBEGIN 
+ SETWIDTH - 1; + localparam integer TAGBEGIN = SETEND + 1; + localparam integer TAGEND = TAGBEGIN + TAGWIDTH - 1; + + // Machinery to read from and write to the correct addresses in memory + logic [`XLEN-1:0] ReadPAdr; + logic [`XLEN-1:0] OldReadPAdr; + logic [OFFSETWIDTH-1:0] ReadOffset, LoadOffset; + logic [SETWIDTH-1:0] ReadSet, LoadSet; + logic [TAGWIDTH-1:0] ReadTag, LoadTag; + logic [LINESIZE-1:0] ReadLine; + logic [LINESIZE/WORDSIZE-1:0][WORDSIZE-1:0] ReadLineTransformed; + + // Machinery to check if a given read is valid and is the desired value + logic [TAGWIDTH-1:0] DataTag; + logic [NUMLINES-1:0] ValidOut; + logic DataValidBit; + + flopenr #(`XLEN) ReadPAdrFlop(clk, reset, ~stall, ReadPAdr, OldReadPAdr); + + // Assign the read and write addresses in cache memory + always_comb begin + ReadOffset = OldReadPAdr[OFFSETEND:OFFSETBEGIN]; + ReadPAdr = {ReadUpperPAdr, ReadLowerAdr}; + ReadSet = ReadPAdr[SETEND:SETBEGIN]; + ReadTag = OldReadPAdr[TAGEND:TAGBEGIN]; + + LoadOffset = LoadPAdr[OFFSETEND:OFFSETBEGIN]; + LoadSet = LoadPAdr[SETEND:SETBEGIN]; + LoadTag = LoadPAdr[TAGEND:TAGBEGIN]; + end + + // Depth is number of bits in one "word" of the memory, width is number of such words + Sram1Read1Write #(.DEPTH(LINESIZE), .WIDTH(NUMLINES)) cachemem ( + .*, + .ReadAddr(ReadSet), + .ReadData(ReadLine), + .WriteAddr(LoadSet), + .WriteData(LoadLine), + .WriteEnable(LoadEnable) + ); + Sram1Read1Write #(.DEPTH(TAGWIDTH), .WIDTH(NUMLINES)) cachetags ( + .*, + .ReadAddr(ReadSet), + .ReadData(DataTag), + .WriteAddr(LoadSet), + .WriteData(LoadTag), + .WriteEnable(LoadEnable) + ); + + // Pick the right bits coming out the read line + assign DataWord = ReadLineTransformed[ReadOffset]; + genvar i; + generate + for (i=0; i < LINESIZE/WORDSIZE; i++) begin + assign ReadLineTransformed[i] = ReadLine[(i+1)*WORDSIZE-1:i*WORDSIZE]; + end + endgenerate + + // Correctly handle the valid bits + always_ff @(posedge clk, posedge reset) begin + if (reset || flush) begin + ValidOut <= 
{NUMLINES{1'b0}}; + end else begin + if (LoadEnable) begin + ValidOut[LoadSet] <= 1; + end + end + DataValidBit <= ValidOut[ReadSet]; + end + assign DataValid = DataValidBit && (DataTag == ReadTag); +endmodule diff --git a/wally-pipelined/src/dmem/dcache.sv b/wally-pipelined/src/dmem/dcache.sv new file mode 100644 index 00000000..243c6975 --- /dev/null +++ b/wally-pipelined/src/dmem/dcache.sv @@ -0,0 +1,184 @@ +/////////////////////////////////////////// +// dcache.sv +// +// Written: jaallen@g.hmc.edu 2021-04-15 +// Modified: +// +// Purpose: Cache memory for the dmem so it can access memory less often, saving cycles +// +// A component of the Wally configurable RISC-V project. +// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+/////////////////////////////////////////// + +`include "wally-config.vh" + +module dcache( + // Basic pipeline stuff + input logic clk, reset, + input logic StallW, + input logic FlushW, + // Upper bits of physical address + input logic [`XLEN-1:12] UpperPAdrM, + // Lower 12 bits of virtual address, since it's faster this way + input logic [11:0] LowerVAdrM, + // Write to the dcache + input logic [`XLEN-1:0] DCacheWriteDataM, + input logic DCacheReadM, DCacheWriteM, + // Data read in from the ebu unit + input logic [`XLEN-1:0] ReadDataW, + input logic MemAckW, + // Access requested from the ebu unit + output logic [`XLEN-1:0] MemPAdrM, + output logic MemReadM, MemWriteM, + // High if the dcache is requesting a stall + output logic DCacheStallW, + // The data that was requested from the cache + output logic [`XLEN-1:0] DCacheReadW +); + + // Configuration parameters + // TODO Move these to a config file + localparam integer DCACHELINESIZE = 256; + localparam integer DCACHENUMLINES = 512; + + // Input signals to cache memory + logic FlushMem; + logic [`XLEN-1:12] DCacheMemUpperPAdr; + logic [11:0] DCacheMemLowerAdr; + logic DCacheMemWriteEnable; + logic [DCACHELINESIZE-1:0] DCacheMemWriteData; + logic [`XLEN-1:0] DCacheMemWritePAdr; + logic EndFetchState; + // Output signals from cache memory + logic [`XLEN-1:0] DCacheMemReadData; + logic DCacheMemReadValid; + + wtdirectmappedmem #(.LINESIZE(DCACHELINESIZE), .NUMLINES(DCACHENUMLINES), .WORDSIZE(`XLEN)) cachemem( + .*, + // Stall it if the pipeline is stalled, unless we're stalling it and we're ending our stall + .stall(StallW), + .flush(FlushMem), + .ReadUpperPAdr(DCacheMemUpperPAdr), + .ReadLowerAdr(DCacheMemLowerAdr), + .LoadEnable(DCacheMemWriteEnable), + .LoadLine(DCacheMemWriteData), + .LoadPAdr(DCacheMemWritePAdr), + .DataWord(DCacheMemReadData), + .DataValid(DCacheMemReadValid), + .WriteEnable(0), + .WriteWord(0), + .WritePAdr(0), + .WriteSize(2'b10) + ); + + dcachecontroller #(.LINESIZE(DCACHELINESIZE)) 
controller(.*); + + // For now, assume no writes to executable memory + assign FlushMem = 1'b0; +endmodule + +module dcachecontroller #(parameter LINESIZE = 256) ( + // Inputs from pipeline + input logic clk, reset, + input logic StallW, + input logic FlushW, + + // Input the address to read + // The upper bits of the physical pc + input logic [`XLEN-1:12] DCacheMemUpperPAdr, + // The lower bits of the virtual pc + input logic [11:0] DCacheMemLowerAdr, + + // Signals to/from cache memory + // The read coming out of it + input logic [`XLEN-1:0] DCacheMemReadData, + input logic DCacheMemReadValid, + // Load data into the cache + output logic DCacheMemWriteEnable, + output logic [LINESIZE-1:0] DCacheMemWriteData, + output logic [`XLEN-1:0] DCacheMemWritePAdr, + + // The read that was requested + output logic [31:0] DCacheReadW, + + // Outputs to pipeline control stuff + output logic DCacheStallW, EndFetchState, + + // Signals to/from ahblite interface + // A read containing the requested data + input logic [`XLEN-1:0] ReadDataW, + input logic MemAckW, + // The read we request from main memory + output logic [`XLEN-1:0] MemPAdrM, + output logic MemReadM, MemWriteM +); + + // Cache fault signals + logic FaultStall; + + // Handle happy path (data in cache) + + always_comb begin + DCacheReadW = DCacheMemReadData; + end + + + // Handle cache faults + + localparam integer WORDSPERLINE = LINESIZE/`XLEN; + localparam integer LOGWPL = $clog2(WORDSPERLINE); + localparam integer OFFSETWIDTH = $clog2(LINESIZE/8); + + logic FetchState, BeginFetchState; + logic [LOGWPL:0] FetchWordNum, NextFetchWordNum; + logic [`XLEN-1:0] LineAlignedPCPF; + + flopr #(1) FetchStateFlop(clk, reset, BeginFetchState | (FetchState & ~EndFetchState), FetchState); + flopr #(LOGWPL+1) FetchWordNumFlop(clk, reset, NextFetchWordNum, FetchWordNum); + + genvar i; + generate + for (i=0; i < WORDSPERLINE; i++) begin + flopenr #(`XLEN) flop(clk, reset, FetchState & (i == FetchWordNum), ReadDataW, 
DCacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]); + end + endgenerate + + // Enter the fetch state when we hit a cache fault + always_comb begin + BeginFetchState = ~DCacheMemReadValid & ~FetchState & (FetchWordNum == 0); + end + // Exit the fetch state once the cache line has been loaded + flopr #(1) EndFetchStateFlop(clk, reset, DCacheMemWriteEnable, EndFetchState); + + // Machinery to request the correct addresses from main memory + always_comb begin + MemReadM = FetchState & ~EndFetchState & ~DCacheMemWriteEnable; + LineAlignedPCPF = {DCacheMemUpperPAdr, DCacheMemLowerAdr[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; + MemPAdrM = LineAlignedPCPF + FetchWordNum*(`XLEN/8); + NextFetchWordNum = FetchState ? FetchWordNum+MemAckW : {LOGWPL+1{1'b0}}; + end + + // Write to cache memory when we have the line here + always_comb begin + DCacheMemWritePAdr = LineAlignedPCPF; + DCacheMemWriteEnable = FetchWordNum == {1'b1, {LOGWPL{1'b0}}} & FetchState & ~EndFetchState; + end + + // Stall the pipeline while loading a new line from memory + always_comb begin + DCacheStallW = FetchState | ~DCacheMemReadValid; + end +endmodule From 251ece20fec50df6c7cee2bfc61e44ae9bb4626a Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 20 Apr 2021 19:55:49 -0500 Subject: [PATCH 18/24] Broken icache. Design is done. Time to debug. 
--- wally-pipelined/regression/wave.do | 148 ++++--- wally-pipelined/src/ifu/icache.sv | 396 ++++++++++++++++-- wally-pipelined/src/ifu/ifu.sv | 20 +- .../testbench/testbench-imperas.sv | 2 +- 4 files changed, 468 insertions(+), 98 deletions(-) diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do index 470cc599..36401fd9 100644 --- a/wally-pipelined/regression/wave.do +++ b/wally-pipelined/regression/wave.do @@ -26,7 +26,6 @@ add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/CSR add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/RetM add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/TrapM add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/LoadStallD -add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/InstrStall add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/DataStall add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/MulDivStallD add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/hzu/FlushF @@ -39,11 +38,6 @@ add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbe add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallE add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallM add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallW -add wave -noupdate /testbench/dut/hart/hzu/StallFCause_Q -add wave -noupdate /testbench/dut/hart/hzu/StallDCause_Q -add wave -noupdate /testbench/dut/hart/hzu/StallECause_Q -add wave -noupdate /testbench/dut/hart/hzu/StallMCause_Q -add wave -noupdate /testbench/dut/hart/hzu/StallWCause_Q add wave -noupdate -group Bpred -expand -group direction -divider Update add wave -noupdate -group Bpred -expand -group direction 
/testbench/dut/hart/ifu/bpred/Predictor/DirPredictor/UpdatePC add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/Predictor/DirPredictor/UpdateEN @@ -80,36 +74,36 @@ add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/c/RegWriteD add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/RdD add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/Rs1D add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/Rs2D -add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rf -add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a1 -add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a2 -add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a3 -add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rd1 -add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rd2 -add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/we3 -add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/wd3 -add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ALUResultW -add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ReadDataW -add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/CSRReadValW -add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultSrcW -add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultW -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/a -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/b -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/alucontrol -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/result -add wave -noupdate -expand -group 
alu /testbench/dut/hart/ieu/dp/alu/flags -add wave -noupdate -expand -group alu -divider internals -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/overflow -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/carry -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/zero -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/neg -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/lt -add wave -noupdate -expand -group alu /testbench/dut/hart/ieu/dp/alu/ltu +add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/rf +add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/a1 +add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/a2 +add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/a3 +add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/rd1 +add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/rd2 +add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/we3 +add wave -noupdate -group RegFile /testbench/dut/hart/ieu/dp/regf/wd3 +add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ALUResultW +add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ReadDataW +add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/CSRReadValW +add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultSrcW +add wave -noupdate -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultW +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/a +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/b +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/alucontrol +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/result +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/flags +add wave -noupdate -group alu -divider internals 
+add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/overflow +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/carry +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/zero +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/neg +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/lt +add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/ltu add wave -noupdate /testbench/InstrFName -add wave -noupdate -expand -group dcache /testbench/dut/hart/MemAdrM -add wave -noupdate -expand -group dcache /testbench/dut/hart/MemPAdrM -add wave -noupdate -expand -group dcache /testbench/dut/hart/WriteDataM -add wave -noupdate -expand -group dcache /testbench/dut/hart/dmem/MemRWM +add wave -noupdate -group dcache /testbench/dut/hart/MemAdrM +add wave -noupdate -group dcache /testbench/dut/hart/MemPAdrM +add wave -noupdate -group dcache /testbench/dut/hart/WriteDataM +add wave -noupdate -group dcache /testbench/dut/hart/dmem/MemRWM add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1D add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs2D add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1E @@ -148,32 +142,64 @@ add wave -noupdate -group {function radix debug} /testbench/functionRadix/functi add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/FunctionAddr add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/ProgramAddrIndex add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/FunctionName -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/InstrD -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/SrcAE -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/SrcBE -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/Funct3E -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/MulDivE -add wave -noupdate -expand -group 
muldiv /testbench/dut/hart/mdu/W64E -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/StallM -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/StallW -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/FlushM -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/FlushW -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/MulDivResultW -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/genblk1/div/start -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/DivDoneE -add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/DivBusyE -add wave -noupdate /testbench/dut/hart/mdu/genblk1/gclk -add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/fsm1/CURRENT_STATE -add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/N -add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/D -add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/Q -add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/rem0 -add wave -noupdate /testbench/dut/hart/MulDivResultW -add wave -noupdate /testbench/dut/hart/mdu/genblk1/PrelimResultE -add wave -noupdate /testbench/dut/hart/mdu/Funct3E -add wave -noupdate /testbench/dut/hart/mdu/genblk1/QuotE +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/InstrD +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/SrcAE +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/SrcBE +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/Funct3E +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/MulDivE +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/W64E +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/StallM +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/StallW +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/FlushM +add wave -noupdate -group muldiv 
/testbench/dut/hart/mdu/FlushW +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/MulDivResultW +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/genblk1/div/start +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/DivDoneE +add wave -noupdate -group muldiv /testbench/dut/hart/mdu/DivBusyE +add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/fsm1/CURRENT_STATE +add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/N +add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/D +add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/Q +add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/rem0 +add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/AHBByteLength +add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/AHBOFFETWIDTH +add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/BlockByteLength +add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/OFFSETWIDTH +add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/WORDSPERLINE +add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LOGWPL +add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LINESIZE +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/CurrState +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCountFlag +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCount +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrReadF +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrAckF +add wave -noupdate -expand -group memory 
/testbench/dut/hart/ifu/icache/controller/ICacheMemWriteEnable +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteData +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWritePAdr +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPF +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF +add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/FinalInstrRawF +add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/AlignedInstrRawD +add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/InstrRawD +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/hit +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/spill +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/spillSave +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/UnalignedSelect +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCMux +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPFinalF +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataValidBit +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataValid +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/ReadTag +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataTag +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteEnable +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteLine +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WritePAdr +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteSet +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteTag +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/StoredData TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {128433 ns} 0} +WaveRestoreCursors {{Cursor 2} {237 ns} 0} quietly wave 
cursor active 1 configure wave -namecolwidth 250 configure wave -valuecolwidth 229 @@ -189,4 +215,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ns update -WaveRestoreZoom {128007 ns} {128663 ns} +WaveRestoreZoom {96 ns} {400 ns} diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index f6440fcf..573e885a 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -129,57 +129,141 @@ module icachecontroller #(parameter LINESIZE = 256) ( output logic InstrReadF ); - // Happy path signals - logic [31:0] AlignedInstrRawF, AlignedInstrRawD; - logic FlushDLastCycleN; - logic PCPMisalignedF; - const logic [31:0] NOP = 32'h13; - logic [`XLEN-1:0] PCPF; - // Misaligned signals - logic [`XLEN:0] MisalignedInstrRawF; - logic MisalignedStall; - // Cache fault signals - logic FaultStall; + // FSM states + localparam STATE_READY = 0; + localparam STATE_HIT_SPILL = 1; // spill, block 0 hit + localparam STATE_HIT_SPILL_MISS_FETCH_WDV = 2; // block 1 miss, issue read to AHB and wait data. + localparam STATE_HIT_SPILL_MISS_FETCH_DONE = 3; // write data into SRAM/LUT + localparam STATE_HIT_SPILL_MERGE = 4; // Read block 0 of CPU access, should be able to optimize into STATE_HIT_SPILL. + localparam STATE_MISS_FETCH_WDV = 5; // aligned miss, issue read to AHB and wait for data. + localparam STATE_MISS_FETCH_DONE = 6; // write data into SRAM/LUT + localparam STATE_MISS_READ = 7; // read block 1 from SRAM/LUT + + localparam STATE_MISS_SPILL_FETCH_WDV = 8; // spill, miss on block 0, issue read to AHB and wait + localparam STATE_MISS_SPILL_FETCH_DONE = 9; // write data into SRAM/LUT + localparam STATE_MISS_SPILL_READ1 = 10; // read block 0 from SRAM/LUT + localparam STATE_MISS_SPILL_2 = 11; // return to ready if hit or do second block update. 
+ localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 12; // miss on block 1, issue read to AHB and wait + localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 13; // write data to SRAM/LUT + localparam STATE_MISS_SPILL_MERGE = 14; // read block 0 of CPU access, + + localparam STATE_INVALIDATE = 15; // *** not sure if invalidate or evict? invalidate by cache block or address? + + localparam AHBByteLength = `XLEN / 8; + localparam AHBOFFETWIDTH = $clog2(AHBByteLength); + + + localparam BlockByteLength = LINESIZE / 8; + localparam OFFSETWIDTH = $clog2(BlockByteLength); + + localparam WORDSPERLINE = LINESIZE/`XLEN; + localparam LOGWPL = $clog2(WORDSPERLINE); + + logic [3:0] CurrState, NextState; + logic hit, spill; + logic SavePC; + logic [1:0] PCMux; + logic CntReset; + logic PreCntEn, CntEn; + logic spillSave; + logic UnalignedSelect; + logic FetchCountFlag; + localparam FetchCountThreshold = WORDSPERLINE - 1; + + logic [LOGWPL:0] FetchCount, NextFetchCount; + + logic [`XLEN-1:0] PCPreFinalF, PCPFinalF, PCSpillF; + logic [`XLEN-1:OFFSETWIDTH] PCPTrunkF; + + + logic [31:0] FinalInstrRawF; + + logic [15:0] SpillDataBlock0; + + + + // Happy path signals + logic [31:0] AlignedInstrRawD; + + //logic [31:0] AlignedInstrRawF, AlignedInstrRawD; + //logic FlushDLastCycleN; + //logic PCPMisalignedF; + const logic [31:0] NOP = 32'h13; + logic [`XLEN-1:0] PCPF; + + logic reset_q; + + // Misaligned signals + //logic [`XLEN:0] MisalignedInstrRawF; + //logic MisalignedStall; + // Cache fault signals + //logic FaultStall; + + + flopenr #(`XLEN) PCPFFlop(clk, reset, SavePC, {UpperPCNextPF, LowerPCNextF}, PCPF); + // on spill we want to get the first 2 bytes of the next cache block. + // the spill only occurs if the PCPF mod BlockByteLength == -2. Therefore we can + // simply add 2 to land on the next cache block. + assign PCSpillF = PCPF + 2'b10; + + // now we have to select between these three PCs + assign PCPreFinalF = PCMux[0] ? 
PCPF : {UpperPCNextPF, LowerPCNextF}; + assign PCPFinalF = PCMux[1] ? PCSpillF : PCPreFinalF; + + + + // truncate the offset from PCPF for memory address generation + assign PCPTrunkF = PCPFinalF[`XLEN-1:OFFSETWIDTH]; + // Detect if the instruction is compressed - assign CompressedF = AlignedInstrRawF[1:0] != 2'b11; + assign CompressedF = FinalInstrRawF[1:0] != 2'b11; // Handle happy path (data in cache, reads aligned) +/* -----\/----- EXCLUDED -----\/----- generate if (`XLEN == 32) begin assign AlignedInstrRawF = PCPF[1] ? MisalignedInstrRawF : ICacheMemReadData; - assign PCPMisalignedF = PCPF[1] && ~CompressedF; + //assign PCPMisalignedF = PCPF[1] && ~CompressedF; end else begin assign AlignedInstrRawF = PCPF[2] ? (PCPF[1] ? MisalignedInstrRawF : ICacheMemReadData[63:32]) : (PCPF[1] ? ICacheMemReadData[47:16] : ICacheMemReadData[31:0]); - assign PCPMisalignedF = PCPF[2] && PCPF[1] && ~CompressedF; + //assign PCPMisalignedF = PCPF[2] && PCPF[1] && ~CompressedF; end endgenerate + -----/\----- EXCLUDED -----/\----- */ - flopenr #(32) AlignedInstrRawDFlop(clk, reset, ~StallD, AlignedInstrRawF, AlignedInstrRawD); - flopr #(1) FlushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCycleN | ~StallF), FlushDLastCycleN); - flopenr #(`XLEN) PCPFFlop(clk, reset, ~StallF, {UpperPCNextPF, LowerPCNextF}, PCPF); - mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCycleN, InstrRawD); + //flopenr #(32) AlignedInstrRawDFlop(clk, reset, ~StallD, AlignedInstrRawF, AlignedInstrRawD); + //flopr #(1) FlushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCycleN | ~StallF), FlushDLastCycleN); + + //mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCycleN, InstrRawD); // Stall for faults or misaligned reads +/* -----\/----- EXCLUDED -----\/----- always_comb begin assign ICacheStallF = FaultStall | MisalignedStall; end + -----/\----- EXCLUDED -----/\----- */ // Handle misaligned, noncompressed reads +/* -----\/----- EXCLUDED -----\/----- logic MisalignedState, 
NextMisalignedState; logic [15:0] MisalignedHalfInstrF; logic [15:0] UpperHalfWord; + -----/\----- EXCLUDED -----/\----- */ +/* -----\/----- EXCLUDED -----\/----- flopenr #(16) MisalignedHalfInstrFlop(clk, reset, ~FaultStall & (PCPMisalignedF & MisalignedState), AlignedInstrRawF[15:0], MisalignedHalfInstrF); flopenr #(1) MisalignedStateFlop(clk, reset, ~FaultStall, NextMisalignedState, MisalignedState); + -----/\----- EXCLUDED -----/\----- */ // When doing a misaligned read, swizzle the bits correctly +/* -----\/----- EXCLUDED -----\/----- generate if (`XLEN == 32) begin assign UpperHalfWord = ICacheMemReadData[31:16]; @@ -194,14 +278,18 @@ module icachecontroller #(parameter LINESIZE = 256) ( assign MisalignedInstrRawF = {ICacheMemReadData[15:0], MisalignedHalfInstrF}; end end + -----/\----- EXCLUDED -----/\----- */ // Manage internal state and stall when necessary +/* -----\/----- EXCLUDED -----\/----- always_comb begin assign MisalignedStall = PCPMisalignedF & MisalignedState; assign NextMisalignedState = ~PCPMisalignedF | ~MisalignedState; end + -----/\----- EXCLUDED -----/\----- */ // Pick the correct address to read +/* -----\/----- EXCLUDED -----\/----- generate if (`XLEN == 32) begin assign ICacheMemReadLowerAdr = {LowerPCNextF[11:2] + (PCPMisalignedF & ~MisalignedState), 2'b00}; @@ -209,16 +297,15 @@ module icachecontroller #(parameter LINESIZE = 256) ( assign ICacheMemReadLowerAdr = {LowerPCNextF[11:3] + (PCPMisalignedF & ~MisalignedState), 3'b00}; end endgenerate + -----/\----- EXCLUDED -----/\----- */ // TODO Handle reading instructions that cross page boundaries - assign ICacheMemReadUpperPAdr = UpperPCNextPF; + //assign ICacheMemReadUpperPAdr = UpperPCNextPF; // Handle cache faults - localparam integer WORDSPERLINE = LINESIZE/`XLEN; - localparam integer LOGWPL = $clog2(WORDSPERLINE); - localparam integer OFFSETWIDTH = $clog2(LINESIZE/8); +/* -----\/----- EXCLUDED -----\/----- logic FetchState, BeginFetchState; logic [LOGWPL:0] FetchWordNum, 
NextFetchWordNum; logic [`XLEN-1:0] LineAlignedPCPF; @@ -226,12 +313,6 @@ module icachecontroller #(parameter LINESIZE = 256) ( flopr #(1) FetchStateFlop(clk, reset, BeginFetchState | (FetchState & ~EndFetchState), FetchState); flopr #(LOGWPL+1) FetchWordNumFlop(clk, reset, NextFetchWordNum, FetchWordNum); - genvar i; - generate - for (i=0; i < WORDSPERLINE; i++) begin - flopenr #(`XLEN) flop(clk, reset, FetchState & (i == FetchWordNum), InstrInF, ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]); - end - endgenerate // Enter the fetch state when we hit a cache fault always_comb begin @@ -242,10 +323,10 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Machinery to request the correct addresses from main memory always_comb begin - InstrReadF = FetchState & ~EndFetchState & ~ICacheMemWriteEnable; - LineAlignedPCPF = {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; - InstrPAdrF = LineAlignedPCPF + FetchWordNum*(`XLEN/8); - NextFetchWordNum = FetchState ? FetchWordNum+InstrAckF : {LOGWPL+1{1'b0}}; + InstrReadF = FetchState & ~EndFetchState & ~ICacheMemWriteEnable; // next stage logic + LineAlignedPCPF = {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr[11:OFFSETWIDTH], {OFFSETWIDTH{1'b0}}}; // the fetch address for abh? + InstrPAdrF = LineAlignedPCPF + FetchWordNum*(`XLEN/8); // ? + NextFetchWordNum = FetchState ? FetchWordNum+InstrAckF : {LOGWPL+1{1'b0}}; // convert to enable end // Write to cache memory when we have the line here @@ -258,4 +339,255 @@ module icachecontroller #(parameter LINESIZE = 256) ( always_comb begin FaultStall = FetchState | ~ICacheMemReadValid; end + -----/\----- EXCLUDED -----/\----- */ + + // the FSM is always runing, do not stall. + flopr #(4) stateReg(.clk(clk), + .reset(reset), + .d(NextState), + .q(CurrState)); + + assign spill = PCPF[5:1] == 5'b1_1111 ? 1'b1 : 1'b0; + assign hit = ICacheMemReadValid; // note ICacheMemReadValid is hit. 
+  assign FetchCountFlag = FetchCount == FetchCountThreshold;
+
+  // Next state and FSM output logic: defaults first, each state overrides what it needs.
+  always_comb begin
+    UnalignedSelect = 1'b0;
+    CntReset = 1'b0;
+    PreCntEn = 1'b0;
+    InstrReadF = 1'b0;
+    ICacheMemWriteEnable = 1'b0;
+    spillSave = 1'b0;
+    PCMux = 2'b00;
+
+    case (CurrState)
+
+      STATE_READY: begin
+        PCMux = 2'b00;
+        if (hit & ~spill) begin
+          NextState = STATE_READY;
+        end else if (hit & spill) begin
+          spillSave = 1'b1;
+          NextState = STATE_HIT_SPILL;
+        end else if (~hit & ~spill) begin
+          CntReset = 1'b1;
+          NextState = STATE_MISS_FETCH_WDV;
+        end else if (~hit & spill) begin
+          CntReset = 1'b1;
+          NextState = STATE_MISS_SPILL_FETCH_WDV;
+        end else begin
+          NextState = STATE_READY;
+        end
+      end
+
+      // branch 1, hit spill and 2, miss spill hit
+      STATE_HIT_SPILL: begin
+        PCMux = 2'b10;
+        UnalignedSelect = 1'b1;
+        if (hit) NextState = STATE_READY;
+        else begin // miss on the spill read: clear the fetch counter and go fetch the line
+          CntReset = 1'b1;
+          NextState = STATE_HIT_SPILL_MISS_FETCH_WDV;
+        end
+      end
+      STATE_HIT_SPILL_MISS_FETCH_WDV: begin
+        PCMux = 2'b10;
+        InstrReadF = 1'b1;
+        PreCntEn = 1'b1;
+        if (FetchCountFlag & InstrAckF) begin
+          NextState = STATE_HIT_SPILL_MISS_FETCH_DONE;
+        end else begin
+          NextState = STATE_HIT_SPILL_MISS_FETCH_WDV;
+        end
+      end
+      STATE_HIT_SPILL_MISS_FETCH_DONE: begin
+        PCMux = 2'b10;
+        ICacheMemWriteEnable = 1'b1;
+        NextState = STATE_HIT_SPILL_MERGE;
+      end
+      STATE_HIT_SPILL_MERGE: begin
+        PCMux = 2'b10;
+        UnalignedSelect = 1'b1;
+        NextState = STATE_READY;
+      end
+
+      // branch 3 miss no spill
+      STATE_MISS_FETCH_WDV: begin
+        PCMux = 2'b01;
+        InstrReadF = 1'b1;
+        PreCntEn = 1'b1;
+        if (FetchCountFlag & InstrAckF) begin
+          NextState = STATE_MISS_FETCH_DONE;
+        end else begin
+          NextState = STATE_MISS_FETCH_WDV;
+        end
+      end
+      STATE_MISS_FETCH_DONE: begin
+        PCMux = 2'b01;
+        ICacheMemWriteEnable = 1'b1;
+        NextState = STATE_MISS_READ;
+      end
+      STATE_MISS_READ: begin
+        PCMux = 2'b01;
+        NextState = STATE_READY;
+      end
+
+      // branch 4 miss spill hit, and 5 miss spill miss
+      STATE_MISS_SPILL_FETCH_WDV: begin
+        PCMux = 2'b01;
+
PreCntEn = 1'b1; + InstrReadF = 1'b1; + if (FetchCountFlag & InstrAckF) begin + NextState = STATE_MISS_SPILL_FETCH_DONE; + end else begin + NextState = STATE_MISS_SPILL_FETCH_WDV; + end + end + STATE_MISS_SPILL_FETCH_DONE: begin + PCMux = 2'b01; + ICacheMemWriteEnable = 1'b1; + NextState = STATE_MISS_SPILL_READ1; + end + STATE_MISS_SPILL_READ1: begin // always be a hit as we just wrote that cache block. + PCMux = 2'b10; // there is a 1 cycle delay after setting the address before the date arrives. + spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm. + NextState = STATE_MISS_SPILL_2; + end + STATE_MISS_SPILL_2: begin + PCMux = 2'b10; + UnalignedSelect = 1'b1; + if (~hit) begin + CntReset = 1'b1; + NextState = STATE_MISS_SPILL_MISS_FETCH_WDV; + end else begin + NextState = STATE_READY; + end + end + STATE_MISS_SPILL_MISS_FETCH_WDV: begin + PCMux = 2'b10; + PreCntEn = 1'b1; + InstrReadF = 1'b1; + if (FetchCountFlag & InstrAckF) begin + NextState = STATE_MISS_SPILL_MISS_FETCH_DONE; + end else begin + NextState = STATE_MISS_SPILL_MISS_FETCH_WDV; + end + end + STATE_MISS_SPILL_MISS_FETCH_DONE: begin + PCMux = 2'b10; + ICacheMemWriteEnable = 1'b1; + NextState = STATE_MISS_SPILL_MERGE; + end + STATE_MISS_SPILL_MERGE: begin + PCMux = 2'b10; + UnalignedSelect = 1'b1; + NextState = STATE_READY; + end + default: begin + PCMux = 2'b01; + NextState = STATE_READY; + end + // *** add in error handling and invalidate/evict + endcase + end + + // fsm outputs + // stall CPU any time we are not in the ready state. any other state means the + // cache is either requesting data from the memory interface or handling a + // spill over two cycles. + assign ICacheStallF = (CurrState != STATE_READY) | reset_q ? 1'b1 : 1'b0; + // save the PC anytime we are in the ready state. The saved value will be used as the PC may not be stable. + assign SavePC = CurrState == STATE_READY ? 
1'b1 : 1'b0;
+  assign CntEn = PreCntEn & InstrAckF;
+
+  // to compute the fetch address we need to add the bit shifted
+  // counter output to the address.
+
+  flopenr #(LOGWPL+1)
+  FetchCountReg(.clk(clk),
+                .reset(reset | CntReset),
+                .en(CntEn),
+                .d(NextFetchCount),
+                .q(FetchCount));
+
+  assign NextFetchCount = FetchCount + 1'b1;
+
+  // This part is confusing.
+  // we need to remove the offset bits (PCPTrunkF). Because the AHB interface is XLEN wide
+  // we need to address on that number of bits so the PC is extended to the right by AHBByteLength with zeros.
+  // fetch count is already aligned to AHBByteLength, but we need to extend back to the full address width with
+  // more zeros after the addition. This will be the number of offset bits less the AHBByteLength.
+  assign InstrPAdrF = {{PCPTrunkF, {{LOGWPL}{1'b0}}} + FetchCount, {{OFFSETWIDTH-LOGWPL}{1'b0}}};
+
+
+  // store read data from memory interface before writing into SRAM, one register per XLEN-bit word of the line.
+  genvar i;
+  generate
+    for (i = 0; i < WORDSPERLINE; i++) begin // ICacheMemWriteData holds LINESIZE/`XLEN words, selected by FetchCount
+      flopenr #(`XLEN) flop(.clk(clk),
+                            .reset(reset),
+                            .en(InstrAckF & (i == FetchCount)),
+                            .d(InstrInF),
+                            .q(ICacheMemWriteData[(i+1)*`XLEN-1:i*`XLEN]));
+    end
+  endgenerate
+
+  // what address is used to write the SRAM?
+
+
+  // spills require storing the first cache block so it can be merged
+  // with the second
+  // can optimize size, for now just make it the size of the data
+  // leaving the cache memory.
+  flopenr #(16) SpillInstrReg(.clk(clk),
+                              .en(spillSave),
+                              .reset(reset),
+                              .d(ICacheMemReadData[15:0]),
+                              .q(SpillDataBlock0));
+
+  // use the not quite final PC to do the final selection.
+  generate
+    if( `XLEN == 32) begin
+      logic [1:1] PCPreFinalF_q;
+      flop #(1) PCFReg(.clk(clk),
+                       .d(PCPreFinalF[1]),
+                       .q(PCPreFinalF_q[1]));
+      assign FinalInstrRawF = PCPreFinalF[1] ?
{SpillDataBlock0, ICacheMemReadData[31:16]} : ICacheMemReadData; + end else begin + logic [2:1] PCPreFinalF_q; + flop #(2) PCFReg(.clk(clk), + .d(PCPreFinalF[2:1]), + .q(PCPreFinalF_q[2:1])); + mux4 #(32) AlignmentMux(.d0(ICacheMemReadData[31:0]), + .d1(ICacheMemReadData[47:16]), + .d2(ICacheMemReadData[63:32]), + .d3({SpillDataBlock0, ICacheMemReadData[63:48]}), + .s(PCPreFinalF[2:1]), + .y(FinalInstrRawF)); + end + endgenerate + + // There is a frustrating issue on the first access. + // The cache will not contain any valid data but will contain x's on + // reset. This makes FinalInstrRawF invalid. On the first cycle out of + // reset this register will pickup this x and it will propagate throughout + // the cpu causing simulation failure, most likely a trap for invalid instruction. + // Reset must be held 1 cycle longer to prevent this issue. additionally the + // reset should be to a NOP rather than 0. + + // register reset + flop #(1) resetReg (.clk(clk), + .d(reset), + .q(reset_q)); + + flopenl #(32) AlignedInstrRawDFlop(clk, reset | reset_q, ~StallD, FinalInstrRawF, NOP, AlignedInstrRawD); + mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, FlushD, InstrRawD); + + assign {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr} = PCPFinalF; + + assign ICacheMemWritePAdr = PCPFinalF; + + + endmodule diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv index 5a2d1b42..58b144f5 100644 --- a/wally-pipelined/src/ifu/ifu.sv +++ b/wally-pipelined/src/ifu/ifu.sv @@ -77,6 +77,8 @@ module ifu ( logic [31:0] nop = 32'h00000013; // instruction for NOP // *** send this to the trap unit logic ITLBPageFaultF; + logic reset_q; // *** look at this later. 
+ tlb #(3) itlb(.TLBAccess(1'b1), .VirtualAddress(PCF), .PageTableEntryWrite(PageTableEntryF), .PageTypeWrite(PageTypeF), @@ -87,7 +89,7 @@ module ifu ( // branch predictor signals logic SelBPPredF; - logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F; + logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F, PCNext2F; logic [3:0] InstrClassD, InstrClassE; @@ -98,10 +100,10 @@ module ifu ( // jarred 2021-03-14 Add instrution cache block to remove rd2 assign PCNextPF = PCNextF; // Temporary workaround until iTLB is live - icache ic( + icache icache( .*, - .UpperPCPF(PCPF[`XLEN-1:12]), - .LowerPCF(PCF[11:0]) + .UpperPCNextPF(PCNextPF[`XLEN-1:12]), + .LowerPCNextF(PCNextPF[11:0]) ); assign PrivilegedChangePCM = RetM | TrapM; @@ -120,7 +122,17 @@ module ifu ( mux2 #(`XLEN) pcmux2(.d0(PCNext1F), .d1(PrivilegedNextPCM), .s(PrivilegedChangePCM), + .y(PCNext2F)); + + mux2 #(`XLEN) pcmux3(.d0(PCNext2F), + .d1(`RESET_VECTOR), + .s(reset_q), .y(UnalignedPCNextF)); + + flop #(1) resetReg (.clk(clk), + .d(reset), + .q(reset_q)); + assign PCNextF = {UnalignedPCNextF[`XLEN-1:1], 1'b0}; // hart-SPEC p. 
21 about 16-bit alignment flopenl #(`XLEN) pcreg(clk, reset, ~StallF & ~ICacheStallF, PCNextF, `RESET_VECTOR, PCF); diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv index 2d11bcc8..bd51596d 100644 --- a/wally-pipelined/testbench/testbench-imperas.sv +++ b/wally-pipelined/testbench/testbench-imperas.sv @@ -447,7 +447,7 @@ module testbench(); // Track names of instructions instrTrackerTB it(clk, reset, dut.hart.ieu.dp.FlushE, - dut.hart.ifu.ic.controller.AlignedInstrRawF, + dut.hart.ifu.icache.controller.FinalInstrRawF, dut.hart.ifu.InstrD, dut.hart.ifu.InstrE, dut.hart.ifu.InstrM, dut.hart.ifu.InstrW, InstrFName, InstrDName, InstrEName, InstrMName, InstrWName); From 99424fb9831b324576b080b289041a6fb9d877ad Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 20 Apr 2021 21:19:53 -0500 Subject: [PATCH 19/24] Progress on icache. Fixed some issues aligning the PC with instruction. Still broken. --- wally-pipelined/regression/wave.do | 19 ++++-- wally-pipelined/src/cache/dmapped.sv | 99 ++++++++++++++++++++++++++++ wally-pipelined/src/ifu/icache.sv | 85 ++++++++++++++---------- 3 files changed, 163 insertions(+), 40 deletions(-) diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do index 36401fd9..eeb8a0ba 100644 --- a/wally-pipelined/regression/wave.do +++ b/wally-pipelined/regression/wave.do @@ -122,6 +122,7 @@ add wave -noupdate -group {alu execution stage} /testbench/dut/hart/ieu/dp/ALURe add wave -noupdate -group {alu execution stage} /testbench/dut/hart/ieu/dp/SrcAE add wave -noupdate -group {alu execution stage} /testbench/dut/hart/ieu/dp/SrcBE add wave -noupdate /testbench/dut/hart/ieu/dp/ALUResultM +add wave -noupdate -expand -group PCS /testbench/dut/hart/ifu/PCNextF add wave -noupdate -expand -group PCS /testbench/dut/hart/PCF add wave -noupdate -expand -group PCS /testbench/dut/hart/ifu/PCD add wave -noupdate -expand -group PCS /testbench/dut/hart/PCE @@ -169,11 
+170,12 @@ add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/cont add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LOGWPL add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LINESIZE add wave -noupdate /testbench/dut/hart/ifu/icache/controller/CurrState -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCountFlag add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCount +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrReadF add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrAckF +add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrInF add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteEnable add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteData add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWritePAdr @@ -198,9 +200,18 @@ add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WritePAdr add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteSet add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteTag add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/StoredData +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadAddr +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadData +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/ReadPAdr +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr +add wave -noupdate 
/testbench/dut/hart/ifu/icache/controller/ICacheMemReadData +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/genblk2/PCPreFinalF_q +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/ICacheStallF +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/SavePC TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {237 ns} 0} -quietly wave cursor active 1 +WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {284 ns} 0} +quietly wave cursor active 2 configure wave -namecolwidth 250 configure wave -valuecolwidth 229 configure wave -justifyvalue left @@ -215,4 +226,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ns update -WaveRestoreZoom {96 ns} {400 ns} +WaveRestoreZoom {139 ns} {443 ns} diff --git a/wally-pipelined/src/cache/dmapped.sv b/wally-pipelined/src/cache/dmapped.sv index fb6ce4c5..4f1cc2d3 100644 --- a/wally-pipelined/src/cache/dmapped.sv +++ b/wally-pipelined/src/cache/dmapped.sv @@ -125,6 +125,105 @@ module rodirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, par assign DataValid = DataValidBit && (DataTag == ReadTag); endmodule +module rodirectmappedmemre #(parameter NUMLINES=512, parameter LINESIZE = 256, parameter WORDSIZE = `XLEN) ( + // Pipeline stuff + input logic clk, + input logic reset, + input logic re, + // If flush is high, invalidate the entire cache + input logic flush, + // Select which address to read (broken for efficiency's sake) + input logic [`XLEN-1:12] ReadUpperPAdr, + input logic [11:0] ReadLowerAdr, + // Write new data to the cache + input logic WriteEnable, + input logic [LINESIZE-1:0] WriteLine, + input logic [`XLEN-1:0] WritePAdr, + // Output the word, as well as if it is valid + output logic [WORDSIZE-1:0] DataWord, + output logic DataValid +); + + // Various compile-time constants + localparam integer WORDWIDTH = $clog2(WORDSIZE/8); + localparam integer 
OFFSETWIDTH = $clog2(LINESIZE/WORDSIZE); + localparam integer SETWIDTH = $clog2(NUMLINES); + localparam integer TAGWIDTH = `XLEN - OFFSETWIDTH - SETWIDTH - WORDWIDTH; + + localparam integer OFFSETBEGIN = WORDWIDTH; + localparam integer OFFSETEND = OFFSETBEGIN+OFFSETWIDTH-1; + localparam integer SETBEGIN = OFFSETEND+1; + localparam integer SETEND = SETBEGIN + SETWIDTH - 1; + localparam integer TAGBEGIN = SETEND + 1; + localparam integer TAGEND = TAGBEGIN + TAGWIDTH - 1; + + // Machinery to read from and write to the correct addresses in memory + logic [`XLEN-1:0] ReadPAdr; + logic [`XLEN-1:0] OldReadPAdr; + logic [OFFSETWIDTH-1:0] ReadOffset, WriteOffset; + logic [SETWIDTH-1:0] ReadSet, WriteSet; + logic [TAGWIDTH-1:0] ReadTag, WriteTag; + logic [LINESIZE-1:0] ReadLine; + logic [LINESIZE/WORDSIZE-1:0][WORDSIZE-1:0] ReadLineTransformed; + + // Machinery to check if a given read is valid and is the desired value + logic [TAGWIDTH-1:0] DataTag; + logic [NUMLINES-1:0] ValidOut; + logic DataValidBit; + + flopenr #(`XLEN) ReadPAdrFlop(clk, reset, re, ReadPAdr, OldReadPAdr); + + // Assign the read and write addresses in cache memory + always_comb begin + ReadOffset = OldReadPAdr[OFFSETEND:OFFSETBEGIN]; + ReadPAdr = {ReadUpperPAdr, ReadLowerAdr}; + ReadSet = ReadPAdr[SETEND:SETBEGIN]; + ReadTag = OldReadPAdr[TAGEND:TAGBEGIN]; + + WriteOffset = WritePAdr[OFFSETEND:OFFSETBEGIN]; + WriteSet = WritePAdr[SETEND:SETBEGIN]; + WriteTag = WritePAdr[TAGEND:TAGBEGIN]; + end + + // Depth is number of bits in one "word" of the memory, width is number of such words + Sram1Read1Write #(.DEPTH(LINESIZE), .WIDTH(NUMLINES)) cachemem ( + .*, + .ReadAddr(ReadSet), + .ReadData(ReadLine), + .WriteAddr(WriteSet), + .WriteData(WriteLine) + ); + Sram1Read1Write #(.DEPTH(TAGWIDTH), .WIDTH(NUMLINES)) cachetags ( + .*, + .ReadAddr(ReadSet), + .ReadData(DataTag), + .WriteAddr(WriteSet), + .WriteData(WriteTag) + ); + + // Pick the right bits coming out the read line + assign DataWord = 
ReadLineTransformed[ReadOffset]; + genvar i; + generate + for (i=0; i < LINESIZE/WORDSIZE; i++) begin + assign ReadLineTransformed[i] = ReadLine[(i+1)*WORDSIZE-1:i*WORDSIZE]; + end + endgenerate + + // Correctly handle the valid bits + always_ff @(posedge clk, posedge reset) begin + if (reset || flush) begin + ValidOut <= {NUMLINES{1'b0}}; + end else begin + if (WriteEnable) begin + ValidOut[WriteSet] <= 1; + end + end + DataValidBit <= ValidOut[ReadSet]; + end + assign DataValid = DataValidBit && (DataTag == ReadTag); +endmodule + // Write-through direct-mapped memory module wtdirectmappedmem #(parameter NUMLINES=512, parameter LINESIZE = 256, parameter WORDSIZE = `XLEN) ( // Pipeline stuff diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 573e885a..8c16b3a9 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -65,11 +65,14 @@ module icache( // Output signals from cache memory logic [`XLEN-1:0] ICacheMemReadData; logic ICacheMemReadValid; + logic ICacheReadEn; + - rodirectmappedmem #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES), .WORDSIZE(`XLEN)) cachemem( + rodirectmappedmemre #(.LINESIZE(ICACHELINESIZE), .NUMLINES(ICACHENUMLINES), .WORDSIZE(`XLEN)) + cachemem( .*, // Stall it if the pipeline is stalled, unless we're stalling it and we're ending our stall - .stall(StallF && (~ICacheStallF || ~EndFetchState)), + .re(ICacheReadEn), .flush(FlushMem), .ReadUpperPAdr(ICacheMemReadUpperPAdr), .ReadLowerAdr(ICacheMemReadLowerAdr), @@ -88,45 +91,46 @@ endmodule module icachecontroller #(parameter LINESIZE = 256) ( // Inputs from pipeline - input logic clk, reset, - input logic StallF, StallD, - input logic FlushD, + input logic clk, reset, + input logic StallF, StallD, + input logic FlushD, // Input the address to read // The upper bits of the physical pc - input logic [`XLEN-1:12] UpperPCNextPF, + input logic [`XLEN-1:12] UpperPCNextPF, // The lower bits of the virtual pc - input logic 
[11:0] LowerPCNextF, + input logic [11:0] LowerPCNextF, // Signals to/from cache memory // The read coming out of it - input logic [`XLEN-1:0] ICacheMemReadData, - input logic ICacheMemReadValid, + input logic [`XLEN-1:0] ICacheMemReadData, + input logic ICacheMemReadValid, // The address at which we want to search the cache memory - output logic [`XLEN-1:12] ICacheMemReadUpperPAdr, - output logic [11:0] ICacheMemReadLowerAdr, + output logic [`XLEN-1:12] ICacheMemReadUpperPAdr, + output logic [11:0] ICacheMemReadLowerAdr, + output logic ICacheReadEn, // Load data into the cache - output logic ICacheMemWriteEnable, + output logic ICacheMemWriteEnable, output logic [LINESIZE-1:0] ICacheMemWriteData, - output logic [`XLEN-1:0] ICacheMemWritePAdr, + output logic [`XLEN-1:0] ICacheMemWritePAdr, // Outputs to rest of ifu // High if the instruction in the fetch stage is compressed - output logic CompressedF, + output logic CompressedF, // The instruction that was requested // If this instruction is compressed, upper 16 bits may be the next 16 bits or may be zeros - output logic [31:0] InstrRawD, + output logic [31:0] InstrRawD, // Outputs to pipeline control stuff - output logic ICacheStallF, EndFetchState, + output logic ICacheStallF, EndFetchState, // Signals to/from ahblite interface // A read containing the requested data - input logic [`XLEN-1:0] InstrInF, - input logic InstrAckF, + input logic [`XLEN-1:0] InstrInF, + input logic InstrAckF, // The read we request from main memory - output logic [`XLEN-1:0] InstrPAdrF, - output logic InstrReadF + output logic [`XLEN-1:0] InstrPAdrF, + output logic InstrReadF ); // FSM states @@ -173,7 +177,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( logic [LOGWPL:0] FetchCount, NextFetchCount; - logic [`XLEN-1:0] PCPreFinalF, PCPFinalF, PCSpillF; + logic [`XLEN-1:0] PCPreFinalF, PCPFinalF, PCSpillF, PCNextPF; logic [`XLEN-1:OFFSETWIDTH] PCPTrunkF; @@ -200,15 +204,16 @@ module icachecontroller #(parameter LINESIZE = 256) 
( // Cache fault signals //logic FaultStall; - - flopenr #(`XLEN) PCPFFlop(clk, reset, SavePC, {UpperPCNextPF, LowerPCNextF}, PCPF); + assign PCNextPF = {UpperPCNextPF, LowerPCNextF}; + + flopenl #(`XLEN) PCPFFlop(clk, reset, SavePC, PCPFinalF, `RESET_VECTOR, PCPF); // on spill we want to get the first 2 bytes of the next cache block. // the spill only occurs if the PCPF mod BlockByteLength == -2. Therefore we can // simply add 2 to land on the next cache block. assign PCSpillF = PCPF + 2'b10; // now we have to select between these three PCs - assign PCPreFinalF = PCMux[0] ? PCPF : {UpperPCNextPF, LowerPCNextF}; + assign PCPreFinalF = PCMux[0] ? PCPF : PCNextPF; assign PCPFinalF = PCMux[1] ? PCSpillF : PCPreFinalF; @@ -353,18 +358,20 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Next state logic always_comb begin - UnalignedSelect = 1'b0; - CntReset = 1'b0; - PreCntEn = 1'b0; - InstrReadF = 1'b0; - ICacheMemWriteEnable = 1'b0; - spillSave = 1'b0; - PCMux = 2'b00; + UnalignedSelect = 1'b0; + CntReset = 1'b0; + PreCntEn = 1'b0; + InstrReadF = 1'b0; + ICacheMemWriteEnable = 1'b0; + spillSave = 1'b0; + PCMux = 2'b00; + ICacheReadEn = 1'b0; case (CurrState) STATE_READY: begin PCMux = 2'b00; + ICacheReadEn = 1'b1; if (hit & ~spill) begin NextState = STATE_READY; end else if (hit & spill) begin @@ -384,7 +391,8 @@ module icachecontroller #(parameter LINESIZE = 256) ( // branch 1, hit spill and 2, miss spill hit STATE_HIT_SPILL: begin PCMux = 2'b10; - UnalignedSelect = 1'b1; + UnalignedSelect = 1'b1; + ICacheReadEn = 1'b1; if (hit) begin NextState = STATE_READY; end else @@ -409,6 +417,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( STATE_HIT_SPILL_MERGE: begin PCMux = 2'b10; UnalignedSelect = 1'b1; + ICacheReadEn = 1'b1; NextState = STATE_READY; end @@ -430,6 +439,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( end STATE_MISS_READ: begin PCMux = 2'b01; + ICacheReadEn = 1'b1; NextState = STATE_READY; end @@ -452,6 +462,7 @@ module 
icachecontroller #(parameter LINESIZE = 256) ( STATE_MISS_SPILL_READ1: begin // always be a hit as we just wrote that cache block. PCMux = 2'b10; // there is a 1 cycle delay after setting the address before the date arrives. spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm. + ICacheReadEn = 1'b1; NextState = STATE_MISS_SPILL_2; end STATE_MISS_SPILL_2: begin @@ -482,6 +493,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( STATE_MISS_SPILL_MERGE: begin PCMux = 2'b10; UnalignedSelect = 1'b1; + ICacheReadEn = 1'b1; NextState = STATE_READY; end default: begin @@ -496,9 +508,9 @@ module icachecontroller #(parameter LINESIZE = 256) ( // stall CPU any time we are not in the ready state. any other state means the // cache is either requesting data from the memory interface or handling a // spill over two cycles. - assign ICacheStallF = (CurrState != STATE_READY) | reset_q ? 1'b1 : 1'b0; + assign ICacheStallF = ((CurrState != STATE_READY) & hit) | reset_q ? 1'b1 : 1'b0; // save the PC anytime we are in the ready state. The saved value will be used as the PC may not be stable. - assign SavePC = CurrState == STATE_READY ? 1'b1 : 1'b0; + assign SavePC = (CurrState == STATE_READY) & hit ? 1'b1 : 1'b0; assign CntEn = PreCntEn & InstrAckF; // to compute the fetch address we need to add the bit shifted @@ -518,6 +530,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( // we need to address on that number of bits so the PC is extended to the right by AHBByteLength with zeros. // fetch count is already aligned to AHBByteLength, but we need to extend back to the full address width with // more zeros after the addition. This will be the number of offset bits less the AHBByteLength. 
+ // *** now a bug need to mux between PCPF and PCPF+2 assign InstrPAdrF = {{PCPTrunkF, {{LOGWPL}{1'b0}}} + FetchCount, {{OFFSETWIDTH-LOGWPL}{1'b0}}}; @@ -553,7 +566,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( flop #(1) PCFReg(.clk(clk), .d(PCPreFinalF[1]), .q(PCPreFinalF_q[1])); - assign FinalInstrRawF = PCPreFinalF[1] ? {SpillDataBlock0, ICacheMemReadData[31:16]} : ICacheMemReadData; + assign FinalInstrRawF = PCPreFinalF_q[1] ? {SpillDataBlock0, ICacheMemReadData[31:16]} : ICacheMemReadData; end else begin logic [2:1] PCPreFinalF_q; flop #(2) PCFReg(.clk(clk), @@ -563,7 +576,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( .d1(ICacheMemReadData[47:16]), .d2(ICacheMemReadData[63:32]), .d3({SpillDataBlock0, ICacheMemReadData[63:48]}), - .s(PCPreFinalF[2:1]), + .s(PCPreFinalF_q[2:1]), .y(FinalInstrRawF)); end endgenerate From f3093ac612fcbb08c8030a638e6b3ae3e781efc4 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Tue, 20 Apr 2021 22:06:12 -0500 Subject: [PATCH 20/24] Why was the linter messed up? There are a number of combo loops which need fixing outside the icache. They may be fixed in main. We get to instruction address 50 now! --- wally-pipelined/lint-wally | 24 +++++------------------- wally-pipelined/regression/wave.do | 4 ++-- wally-pipelined/src/ifu/icache.sv | 25 ++++++++++++++++++------- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/wally-pipelined/lint-wally b/wally-pipelined/lint-wally index 791435ac..9d5a20ba 100755 --- a/wally-pipelined/lint-wally +++ b/wally-pipelined/lint-wally @@ -1,25 +1,11 @@ # check for warnings in Verilog code # The verilator lint tool is faster and better than Modelsim so it is best to run this first. -if [ -n "$1" ]; then - echo "rv64ic linting..." - if verilator --lint-only --top-module "$1" -Iconfig/rv64ic src/*/*.sv; then - echo "rv32ic linting..." 
- verilator --lint-only --top-module "$1" -Iconfig/rv32ic src/*/*.sv - else - echo "Skipping rv32ic because rv64ic had errors or warnings" - exit 1 - fi -else - echo "rv64ic linting..." - if verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv64ic src/*/*.sv; then - echo "rv32ic linting..." - verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv32ic src/*/*.sv - else - echo "Skipping rv32ic because rv64ic had errors or warnings" - exit 1 - fi -fi +echo "rv64ic linting..." +verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv64ic src/*/*.sv +echo "rv32ic linting..." +verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv32ic src/*/*.sv +#verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv64ic src/*/*.sv src/*/div/*.sv # --lint-only just runs lint rather than trying to compile and simulate # -I points to the include directory where files such as `include wally-config.vh are found diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do index eeb8a0ba..5174a350 100644 --- a/wally-pipelined/regression/wave.do +++ b/wally-pipelined/regression/wave.do @@ -210,7 +210,7 @@ add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF add wave -noupdate /testbench/dut/hart/ifu/icache/controller/ICacheStallF add wave -noupdate /testbench/dut/hart/ifu/icache/controller/SavePC TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {284 ns} 0} +WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {566 ns} 0} quietly wave cursor active 2 configure wave -namecolwidth 250 configure wave -valuecolwidth 229 @@ -226,4 +226,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ns update -WaveRestoreZoom {139 ns} {443 ns} +WaveRestoreZoom {458 ns} {674 ns} diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 8c16b3a9..09dab7a8 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ 
b/wally-pipelined/src/ifu/icache.sv @@ -361,7 +361,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( UnalignedSelect = 1'b0; CntReset = 1'b0; PreCntEn = 1'b0; - InstrReadF = 1'b0; + //InstrReadF = 1'b0; ICacheMemWriteEnable = 1'b0; spillSave = 1'b0; PCMux = 2'b00; @@ -401,7 +401,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( end STATE_HIT_SPILL_MISS_FETCH_WDV: begin PCMux = 2'b10; - InstrReadF = 1'b1; + //InstrReadF = 1'b1; PreCntEn = 1'b1; if (FetchCountFlag & InstrAckF) begin NextState = STATE_HIT_SPILL_MISS_FETCH_DONE; @@ -424,7 +424,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( // branch 3 miss no spill STATE_MISS_FETCH_WDV: begin PCMux = 2'b01; - InstrReadF = 1'b1; + //InstrReadF = 1'b1; PreCntEn = 1'b1; if (FetchCountFlag & InstrAckF) begin NextState = STATE_MISS_FETCH_DONE; @@ -447,7 +447,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( STATE_MISS_SPILL_FETCH_WDV: begin PCMux = 2'b01; PreCntEn = 1'b1; - InstrReadF = 1'b1; + //InstrReadF = 1'b1; if (FetchCountFlag & InstrAckF) begin NextState = STATE_MISS_SPILL_FETCH_DONE; end else begin @@ -478,7 +478,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( STATE_MISS_SPILL_MISS_FETCH_WDV: begin PCMux = 2'b10; PreCntEn = 1'b1; - InstrReadF = 1'b1; + //InstrReadF = 1'b1; if (FetchCountFlag & InstrAckF) begin NextState = STATE_MISS_SPILL_MISS_FETCH_DONE; end else begin @@ -508,11 +508,16 @@ module icachecontroller #(parameter LINESIZE = 256) ( // stall CPU any time we are not in the ready state. any other state means the // cache is either requesting data from the memory interface or handling a // spill over two cycles. - assign ICacheStallF = ((CurrState != STATE_READY) & hit) | reset_q ? 1'b1 : 1'b0; + assign ICacheStallF = ((CurrState != STATE_READY) | ~hit) | reset_q ? 1'b1 : 1'b0; // save the PC anytime we are in the ready state. The saved value will be used as the PC may not be stable. assign SavePC = (CurrState == STATE_READY) & hit ? 
1'b1 : 1'b0; assign CntEn = PreCntEn & InstrAckF; + assign InstrReadF = (CurrState == STATE_HIT_SPILL_MISS_FETCH_WDV) || + (CurrState == STATE_MISS_FETCH_WDV) || + (CurrState == STATE_MISS_SPILL_FETCH_WDV) || + (CurrState == STATE_MISS_SPILL_MISS_FETCH_WDV); + // to compute the fetch address we need to add the bit shifted // counter output to the address. @@ -595,7 +600,13 @@ module icachecontroller #(parameter LINESIZE = 256) ( .q(reset_q)); flopenl #(32) AlignedInstrRawDFlop(clk, reset | reset_q, ~StallD, FinalInstrRawF, NOP, AlignedInstrRawD); - mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, FlushD, InstrRawD); + // cannot have this mux as it creates a combo loop. + // This flop doesn't stall if StallF is high because we should output a nop + // when FlushD happens, even if the pipeline is also stalled. + flopr #(1) flushDLastCycleFlop(clk, reset, ~FlushD & (FlushDLastCyclen | ~StallF), FlushDLastCyclen); + mux2 #(32) InstrRawDMux(AlignedInstrRawD, NOP, ~FlushDLastCyclen, InstrRawD); + //assign InstrRawD = AlignedInstrRawD; + assign {ICacheMemReadUpperPAdr, ICacheMemReadLowerAdr} = PCPFinalF; From 532c8771bafb20f720d51fdc9869c97876690050 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Wed, 21 Apr 2021 08:39:54 -0500 Subject: [PATCH 21/24] major progress. It's running the icache in imperas tests now. Compressed does not work yet.
--- wally-pipelined/regression/wave.do | 86 +++++++++++++++--------------- wally-pipelined/src/ifu/icache.sv | 20 ++++--- 2 files changed, 55 insertions(+), 51 deletions(-) diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do index 5174a350..816c566c 100644 --- a/wally-pipelined/regression/wave.do +++ b/wally-pipelined/regression/wave.do @@ -162,55 +162,55 @@ add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/N add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/D add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/Q add wave -noupdate -group divider /testbench/dut/hart/mdu/genblk1/div/rem0 -add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/AHBByteLength -add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/AHBOFFETWIDTH -add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/BlockByteLength -add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/OFFSETWIDTH -add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/WORDSPERLINE -add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LOGWPL -add wave -noupdate -expand -group parameters /testbench/dut/hart/ifu/icache/controller/LINESIZE -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/CurrState -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCountFlag -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCount -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrReadF -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrAckF -add wave -noupdate -expand -group memory 
/testbench/dut/hart/ifu/icache/controller/InstrInF -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteEnable -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteData -add wave -noupdate -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWritePAdr +add wave -noupdate -expand -group icache -color Orange /testbench/dut/hart/ifu/icache/controller/CurrState +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/hit +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/spill +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/ICacheStallF +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/SavePC +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/spillSave +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/UnalignedSelect +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/PCMux +add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/AHBByteLength +add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/AHBOFFETWIDTH +add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/BlockByteLength +add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/OFFSETWIDTH +add wave -noupdate 
-expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/WORDSPERLINE +add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/LOGWPL +add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/LINESIZE +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/FetchCountFlag +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/FetchCount +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/InstrReadF +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/InstrAckF +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/InstrInF +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteEnable +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteData +add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWritePAdr +add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataValidBit +add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataValid +add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/ReadTag +add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataTag +add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadAddr +add wave -noupdate -expand -group icache -group 
memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadData +add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/ReadPAdr +add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteEnable +add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteLine +add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WritePAdr +add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteSet +add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteTag +add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/cachetags/StoredData +add wave -noupdate -expand -group icache -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/FinalInstrRawF +add wave -noupdate -expand -group icache -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/AlignedInstrRawD +add wave -noupdate -expand -group icache -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/InstrRawD add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPF add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF -add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/FinalInstrRawF -add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/AlignedInstrRawD -add wave -noupdate -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/InstrRawD -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/hit -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/spill -add 
wave -noupdate /testbench/dut/hart/ifu/icache/controller/spillSave -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/UnalignedSelect -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCMux add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPFinalF -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataValidBit -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataValid -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/ReadTag -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/DataTag -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteEnable -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteLine -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WritePAdr -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteSet -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/WriteTag -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/StoredData -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadAddr -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadData -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/ReadPAdr -add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr add wave -noupdate /testbench/dut/hart/ifu/icache/controller/ICacheMemReadData add wave -noupdate /testbench/dut/hart/ifu/icache/controller/genblk2/PCPreFinalF_q add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/ICacheStallF -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/SavePC TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {566 ns} 0} +WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {1598 ns} 0} quietly wave cursor active 2 configure wave -namecolwidth 250 configure wave -valuecolwidth 229 @@ -226,4 +226,4 @@ configure wave -griddelta 40 configure wave 
-timeline 0 configure wave -timelineunits ns update -WaveRestoreZoom {458 ns} {674 ns} +WaveRestoreZoom {1559 ns} {1783 ns} diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index 09dab7a8..f836afb6 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -206,14 +206,14 @@ module icachecontroller #(parameter LINESIZE = 256) ( assign PCNextPF = {UpperPCNextPF, LowerPCNextF}; - flopenl #(`XLEN) PCPFFlop(clk, reset, SavePC, PCPFinalF, `RESET_VECTOR, PCPF); + flopenl #(`XLEN) PCPFFlop(clk, reset, SavePC & ~StallF, PCPFinalF, `RESET_VECTOR, PCPF); // on spill we want to get the first 2 bytes of the next cache block. // the spill only occurs if the PCPF mod BlockByteLength == -2. Therefore we can // simply add 2 to land on the next cache block. assign PCSpillF = PCPF + 2'b10; // now we have to select between these three PCs - assign PCPreFinalF = PCMux[0] ? PCPF : PCNextPF; + assign PCPreFinalF = PCMux[0] | StallF ? PCPF : PCNextPF; // *** don't like the stallf assign PCPFinalF = PCMux[1] ? PCSpillF : PCPreFinalF; @@ -568,15 +568,19 @@ module icachecontroller #(parameter LINESIZE = 256) ( generate if( `XLEN == 32) begin logic [1:1] PCPreFinalF_q; - flop #(1) PCFReg(.clk(clk), - .d(PCPreFinalF[1]), - .q(PCPreFinalF_q[1])); + flopenr #(1) PCFReg(.clk(clk), + .reset(reset), + .en(~StallF), + .d(PCPreFinalF[1]), + .q(PCPreFinalF_q[1])); assign FinalInstrRawF = PCPreFinalF_q[1] ? 
{SpillDataBlock0, ICacheMemReadData[31:16]} : ICacheMemReadData; end else begin logic [2:1] PCPreFinalF_q; - flop #(2) PCFReg(.clk(clk), - .d(PCPreFinalF[2:1]), - .q(PCPreFinalF_q[2:1])); + flopenr #(2) PCFReg(.clk(clk), + .reset(reset), + .en(~StallF), + .d(PCPreFinalF[2:1]), + .q(PCPreFinalF_q[2:1])); mux4 #(32) AlignmentMux(.d0(ICacheMemReadData[31:0]), .d1(ICacheMemReadData[47:16]), .d2(ICacheMemReadData[63:32]), From 7b3735fc25f0d51ef977d0e9caf5c017cf5e550f Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Wed, 21 Apr 2021 16:47:05 -0500 Subject: [PATCH 22/24] Fixed for the instruction spills. --- wally-pipelined/regression/wave.do | 57 +++++++++++++++------------- wally-pipelined/src/cache/dmapped.sv | 28 +++++++++++++- wally-pipelined/src/ifu/icache.sv | 35 +++++------------ 3 files changed, 66 insertions(+), 54 deletions(-) diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do index 816c566c..bdbb9ec2 100644 --- a/wally-pipelined/regression/wave.do +++ b/wally-pipelined/regression/wave.do @@ -1,4 +1,5 @@ onerror {resume} +quietly virtual function -install /testbench/dut/hart/ifu/icache/cachemem -env /testbench/dut/hart/ifu/icache/cachemem { &{/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[4], /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[3], /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[2], /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[1], /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[0] }} offset quietly WaveActivateNextPane {} 0 add wave -noupdate /testbench/clk add wave -noupdate /testbench/reset @@ -8,19 +9,19 @@ add wave -noupdate -expand -group {Execution Stage} /testbench/functionRadix/fun add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/PCE add wave -noupdate -expand -group {Execution Stage} /testbench/InstrEName add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/InstrE -add wave -noupdate -expand -group HDU -group traps 
/testbench/dut/hart/priv/trap/InstrMisalignedFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrAccessFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/IllegalInstrFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/BreakpointFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadMisalignedFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StoreMisalignedFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadAccessFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StoreAccessFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/EcallFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrPageFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadPageFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StorePageFaultM -add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InterruptM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrMisalignedFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrAccessFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/IllegalInstrFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/BreakpointFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/LoadMisalignedFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/StoreMisalignedFaultM +add wave -noupdate -expand -group HDU -expand -group traps 
/testbench/dut/hart/priv/trap/LoadAccessFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/StoreAccessFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/EcallFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrPageFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/LoadPageFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/StorePageFaultM +add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InterruptM add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/BPPredWrongE add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/CSRWritePendingDEM add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/RetM @@ -59,15 +60,15 @@ add wave -noupdate -group Bpred /testbench/dut/hart/ifu/bpred/BPPredWrongE add wave -noupdate -expand -group {instruction pipeline} /testbench/dut/hart/ifu/InstrD add wave -noupdate -expand -group {instruction pipeline} /testbench/dut/hart/ifu/InstrE add wave -noupdate -expand -group {instruction pipeline} /testbench/dut/hart/ifu/InstrM -add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNextF -add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCF -add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCPlus2or4F -add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/BPPredPCF -add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNext0F -add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNext1F -add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/SelBPPredF -add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/BPPredWrongE -add wave -noupdate -group {PCNext Generation} 
/testbench/dut/hart/ifu/PrivilegedChangePCM +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/PCNextF +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/PCF +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/PCPlus2or4F +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/BPPredPCF +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/PCNext0F +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/PCNext1F +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/SelBPPredF +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/BPPredWrongE +add wave -noupdate -expand -group {PCNext Generation} /testbench/dut/hart/ifu/PrivilegedChangePCM add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ifu/InstrD add wave -noupdate -group {Decode Stage} /testbench/InstrDName add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/c/RegWriteD @@ -207,13 +208,17 @@ add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPF add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPFinalF add wave -noupdate /testbench/dut/hart/ifu/icache/controller/ICacheMemReadData -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/genblk2/PCPreFinalF_q add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/ReadLine +add wave -noupdate -radix hexadecimal -childformat {{{/testbench/dut/hart/ifu/icache/cachemem/ReadOffset[1]} -radix hexadecimal} {{/testbench/dut/hart/ifu/icache/cachemem/ReadOffset[0]} -radix hexadecimal}} -subitemconfig {{/testbench/dut/hart/ifu/icache/cachemem/ReadOffset[1]} {-height 16 -radix hexadecimal} {/testbench/dut/hart/ifu/icache/cachemem/ReadOffset[0]} {-height 16 -radix 
hexadecimal}} /testbench/dut/hart/ifu/icache/cachemem/ReadOffset +add wave -noupdate -label {read offset} -radix unsigned -childformat {{(4) -radix unsigned} {(3) -radix unsigned} {(2) -radix unsigned} {(1) -radix unsigned} {(0) -radix unsigned}} -subitemconfig {{/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[4]} {-radix unsigned} {/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[3]} {-radix unsigned} {/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[2]} {-radix unsigned} {/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[1]} {-radix unsigned} {/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[0]} {-radix unsigned}} /testbench/dut/hart/ifu/icache/cachemem/offset +add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr +add wave -noupdate /testbench/dut/hart/ifu/CompressedF TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {1598 ns} 0} +WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {9098514 ns} 0} quietly wave cursor active 2 configure wave -namecolwidth 250 -configure wave -valuecolwidth 229 +configure wave -valuecolwidth 513 configure wave -justifyvalue left configure wave -signalnamewidth 1 configure wave -snapdistance 10 @@ -226,4 +231,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ns update -WaveRestoreZoom {1559 ns} {1783 ns} +WaveRestoreZoom {9098483 ns} {9098569 ns} diff --git a/wally-pipelined/src/cache/dmapped.sv b/wally-pipelined/src/cache/dmapped.sv index 4f1cc2d3..34864d39 100644 --- a/wally-pipelined/src/cache/dmapped.sv +++ b/wally-pipelined/src/cache/dmapped.sv @@ -140,7 +140,7 @@ module rodirectmappedmemre #(parameter NUMLINES=512, parameter LINESIZE = 256, p input logic [LINESIZE-1:0] WriteLine, input logic [`XLEN-1:0] WritePAdr, // Output the word, as well as if it is valid - output logic [WORDSIZE-1:0] DataWord, + output logic [31:0] DataWord, // *** was WORDSIZE-1 output logic DataValid ); @@ -202,7 +202,31 @@ module 
rodirectmappedmemre #(parameter NUMLINES=512, parameter LINESIZE = 256, p ); // Pick the right bits coming out the read line - assign DataWord = ReadLineTransformed[ReadOffset]; + //assign DataWord = ReadLineTransformed[ReadOffset]; + //logic [31:0] tempRD; + always_comb begin + case (OldReadPAdr[4:1]) + 0: DataWord = ReadLine[31:0]; + 1: DataWord = ReadLine[47:16]; + 2: DataWord = ReadLine[63:32]; + 3: DataWord = ReadLine[79:48]; + + 4: DataWord = ReadLine[95:64]; + 5: DataWord = ReadLine[111:80]; + 6: DataWord = ReadLine[127:96]; + 7: DataWord = ReadLine[143:112]; + + 8: DataWord = ReadLine[159:128]; + 9: DataWord = ReadLine[175:144]; + 10: DataWord = ReadLine[191:160]; + 11: DataWord = ReadLine[207:176]; + + 12: DataWord = ReadLine[223:192]; + 13: DataWord = ReadLine[239:208]; + 14: DataWord = ReadLine[255:224]; + 15: DataWord = {16'b0, ReadLine[255:240]}; + endcase + end genvar i; generate for (i=0; i < LINESIZE/WORDSIZE; i++) begin diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index f836afb6..b14ae516 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -103,12 +103,12 @@ module icachecontroller #(parameter LINESIZE = 256) ( // Signals to/from cache memory // The read coming out of it - input logic [`XLEN-1:0] ICacheMemReadData, + input logic [31:0] ICacheMemReadData, input logic ICacheMemReadValid, // The address at which we want to search the cache memory output logic [`XLEN-1:12] ICacheMemReadUpperPAdr, output logic [11:0] ICacheMemReadLowerAdr, - output logic ICacheReadEn, + output logic ICacheReadEn, // Load data into the cache output logic ICacheMemWriteEnable, output logic [LINESIZE-1:0] ICacheMemWriteData, @@ -565,30 +565,13 @@ module icachecontroller #(parameter LINESIZE = 256) ( .q(SpillDataBlock0)); // use the not quite final PC to do the final selection. 
- generate - if( `XLEN == 32) begin - logic [1:1] PCPreFinalF_q; - flopenr #(1) PCFReg(.clk(clk), - .reset(reset), - .en(~StallF), - .d(PCPreFinalF[1]), - .q(PCPreFinalF_q[1])); - assign FinalInstrRawF = PCPreFinalF_q[1] ? {SpillDataBlock0, ICacheMemReadData[31:16]} : ICacheMemReadData; - end else begin - logic [2:1] PCPreFinalF_q; - flopenr #(2) PCFReg(.clk(clk), - .reset(reset), - .en(~StallF), - .d(PCPreFinalF[2:1]), - .q(PCPreFinalF_q[2:1])); - mux4 #(32) AlignmentMux(.d0(ICacheMemReadData[31:0]), - .d1(ICacheMemReadData[47:16]), - .d2(ICacheMemReadData[63:32]), - .d3({SpillDataBlock0, ICacheMemReadData[63:48]}), - .s(PCPreFinalF_q[2:1]), - .y(FinalInstrRawF)); - end - endgenerate + logic [1:1] PCPreFinalF_q; + flopenr #(1) PCFReg(.clk(clk), + .reset(reset), + .en(~StallF), + .d(PCPreFinalF[1]), + .q(PCPreFinalF_q[1])); + assign FinalInstrRawF = PCPreFinalF_q[1] ? {ICacheMemReadData[31:16], SpillDataBlock0} : ICacheMemReadData; // There is a frustrating issue on the first access. // The cache will not contain any valid data but will contain x's on From d8ab7a5de2fc20e7bd839ff67135546b0c8fcb03 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Thu, 22 Apr 2021 10:20:36 -0500 Subject: [PATCH 23/24] Partially working icache. The current issue is a StallF is required to halt the icache from getting an updated PCF. However if the dmemory is the reason for a stall it is possible for the icache stall to hold the d memory request continuously causing d memory to repeatedly read from memory. This keeps StallF high and the icache FSM is never allowed to complete. 
--- wally-pipelined/regression/wave.do | 101 +++++++++++++++-------------- wally-pipelined/src/ifu/icache.sv | 90 ++++++++++++++++++------- wally-pipelined/src/ifu/ifu.sv | 15 ++++- 3 files changed, 133 insertions(+), 73 deletions(-) diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do index bdbb9ec2..280042de 100644 --- a/wally-pipelined/regression/wave.do +++ b/wally-pipelined/regression/wave.do @@ -9,26 +9,26 @@ add wave -noupdate -expand -group {Execution Stage} /testbench/functionRadix/fun add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/PCE add wave -noupdate -expand -group {Execution Stage} /testbench/InstrEName add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/InstrE -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrMisalignedFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrAccessFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/IllegalInstrFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/BreakpointFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/LoadMisalignedFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/StoreMisalignedFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/LoadAccessFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/StoreAccessFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/EcallFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrPageFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/LoadPageFaultM -add wave -noupdate -expand -group HDU 
-expand -group traps /testbench/dut/hart/priv/trap/StorePageFaultM -add wave -noupdate -expand -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InterruptM -add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/BPPredWrongE -add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/CSRWritePendingDEM -add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/RetM -add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/TrapM -add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/LoadStallD -add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/DataStall -add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/MulDivStallD +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrMisalignedFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrAccessFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/IllegalInstrFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/BreakpointFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadMisalignedFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StoreMisalignedFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadAccessFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StoreAccessFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/EcallFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrPageFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadPageFaultM +add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StorePageFaultM +add wave 
-noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InterruptM +add wave -noupdate -expand -group HDU -expand -group hazards /testbench/dut/hart/hzu/BPPredWrongE +add wave -noupdate -expand -group HDU -expand -group hazards /testbench/dut/hart/hzu/CSRWritePendingDEM +add wave -noupdate -expand -group HDU -expand -group hazards /testbench/dut/hart/hzu/RetM +add wave -noupdate -expand -group HDU -expand -group hazards /testbench/dut/hart/hzu/TrapM +add wave -noupdate -expand -group HDU -expand -group hazards /testbench/dut/hart/hzu/LoadStallD +add wave -noupdate -expand -group HDU -expand -group hazards /testbench/dut/hart/hzu/DataStall +add wave -noupdate -expand -group HDU -expand -group hazards /testbench/dut/hart/MulDivStallD add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/hzu/FlushF add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushD add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushE @@ -172,6 +172,8 @@ add wave -noupdate -expand -group icache -expand -group {fsm out and control} /t add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/spillSave add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/UnalignedSelect add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/PCMux +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/spillSave +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/CntReset add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/AHBByteLength add wave -noupdate -expand -group icache -group parameters 
/testbench/dut/hart/ifu/icache/controller/AHBOFFETWIDTH add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/BlockByteLength @@ -179,34 +181,35 @@ add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/i add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/WORDSPERLINE add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/LOGWPL add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/LINESIZE -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/FetchCountFlag -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/FetchCount -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/InstrReadF -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/InstrAckF -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/InstrInF -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteEnable -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteData -add wave -noupdate -expand -group icache -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWritePAdr -add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataValidBit -add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataValid -add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/ReadTag -add wave 
-noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataTag -add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadAddr -add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadData -add wave -noupdate -expand -group icache -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/ReadPAdr -add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteEnable -add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteLine -add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WritePAdr -add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteSet -add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteTag -add wave -noupdate -expand -group icache -group memory -expand -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/cachetags/StoredData +add wave -noupdate -expand -group icache -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCountFlag +add wave -noupdate -expand -group icache -expand -group memory /testbench/dut/hart/ifu/icache/controller/FetchCount +add wave -noupdate -expand -group icache -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrPAdrF +add wave -noupdate -expand -group icache -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrReadF +add wave -noupdate -expand -group icache -expand -group memory /testbench/dut/hart/ifu/icache/controller/InstrAckF +add wave -noupdate -expand -group icache -expand -group memory 
/testbench/dut/hart/ifu/icache/controller/InstrInF +add wave -noupdate -expand -group icache -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteEnable +add wave -noupdate -expand -group icache -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWriteData +add wave -noupdate -expand -group icache -expand -group memory /testbench/dut/hart/ifu/icache/controller/ICacheMemWritePAdr +add wave -noupdate -expand -group icache -expand -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataValidBit +add wave -noupdate -expand -group icache -expand -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataValid +add wave -noupdate -expand -group icache -expand -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/ReadTag +add wave -noupdate -expand -group icache -expand -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/DataTag +add wave -noupdate -expand -group icache -expand -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadAddr +add wave -noupdate -expand -group icache -expand -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/cachetags/ReadData +add wave -noupdate -expand -group icache -expand -group memory -group {tag read} /testbench/dut/hart/ifu/icache/cachemem/ReadPAdr +add wave -noupdate -expand -group icache -expand -group memory -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteEnable +add wave -noupdate -expand -group icache -expand -group memory -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteLine +add wave -noupdate -expand -group icache -expand -group memory -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WritePAdr +add wave -noupdate -expand -group icache -expand -group memory -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/WriteSet +add wave -noupdate -expand -group icache -expand -group memory -group {tag write} 
/testbench/dut/hart/ifu/icache/cachemem/WriteTag +add wave -noupdate -expand -group icache -expand -group memory -group {tag write} /testbench/dut/hart/ifu/icache/cachemem/cachetags/StoredData add wave -noupdate -expand -group icache -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/FinalInstrRawF add wave -noupdate -expand -group icache -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/AlignedInstrRawD add wave -noupdate -expand -group icache -expand -group {instr to cpu} /testbench/dut/hart/ifu/icache/controller/InstrRawD -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPF -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF -add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPFinalF +add wave -noupdate -expand -group pc /testbench/dut/hart/ifu/icache/controller/PCNextPF +add wave -noupdate -expand -group pc /testbench/dut/hart/ifu/icache/controller/PCPF +add wave -noupdate -expand -group pc /testbench/dut/hart/ifu/icache/controller/PCPreFinalF +add wave -noupdate -expand -group pc /testbench/dut/hart/ifu/icache/controller/PCPFinalF add wave -noupdate /testbench/dut/hart/ifu/icache/controller/ICacheMemReadData add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/ReadLine @@ -214,9 +217,11 @@ add wave -noupdate -radix hexadecimal -childformat {{{/testbench/dut/hart/ifu/ic add wave -noupdate -label {read offset} -radix unsigned -childformat {{(4) -radix unsigned} {(3) -radix unsigned} {(2) -radix unsigned} {(1) -radix unsigned} {(0) -radix unsigned}} -subitemconfig {{/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[4]} {-radix unsigned} {/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[3]} {-radix unsigned} {/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[2]} {-radix unsigned} {/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[1]} {-radix unsigned} 
{/testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr[0]} {-radix unsigned}} /testbench/dut/hart/ifu/icache/cachemem/offset add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr add wave -noupdate /testbench/dut/hart/ifu/CompressedF +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/SpillDataBlock0 +add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF_q TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {44 ns} 0} {{Cursor 2} {9098514 ns} 0} -quietly wave cursor active 2 +WaveRestoreCursors {{Cursor 2} {9808584 ns} 0} {{Cursor 3} {9808065 ns} 0} {{Cursor 4} {535 ns} 0} +quietly wave cursor active 1 configure wave -namecolwidth 250 configure wave -valuecolwidth 513 configure wave -justifyvalue left @@ -231,4 +236,4 @@ configure wave -griddelta 40 configure wave -timeline 0 configure wave -timelineunits ns update -WaveRestoreZoom {9098483 ns} {9098569 ns} +WaveRestoreZoom {9808255 ns} {9808913 ns} diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv index b14ae516..6f0437e2 100644 --- a/wally-pipelined/src/ifu/icache.sv +++ b/wally-pipelined/src/ifu/icache.sv @@ -140,19 +140,41 @@ module icachecontroller #(parameter LINESIZE = 256) ( localparam STATE_HIT_SPILL_MISS_FETCH_DONE = 3; // write data into SRAM/LUT localparam STATE_HIT_SPILL_MERGE = 4; // Read block 0 of CPU access, should be able to optimize into STATE_HIT_SPILL. - localparam STATE_MISS_FETCH_WDV = 5; // aligned miss, issue read to AHB and wait for data. - localparam STATE_MISS_FETCH_DONE = 6; // write data into SRAM/LUT - localparam STATE_MISS_READ = 7; // read block 1 from SRAM/LUT + // a challenge is the spill signal gets us out of the ready state and moves us to + // 1 of the 2 spill branches. However the original fsm design had us return to + // the ready state when the spill + hits/misses were fully resolved. 
The problem + // is the spill signal is based on PCPF so when we return to READY to check if the + // cache has a hit it still expresses spill. We can fix in 1 of two ways. + // 1. we can add 1 extra state at the end of each spill branch that returns the instruction + // to the CPU advancing the CPU and icache to the next instruction. + // 2. We can assert a signal which is delayed 1 cycle to suppress the spill when we get + // to the READY state. + // The first option is more robust and increases the number of states by 2. The + // second option seems like it should work, but I worry there is a hidden interaction + // between CPU stalling and that register. + // Picking option 1. - localparam STATE_MISS_SPILL_FETCH_WDV = 8; // spill, miss on block 0, issue read to AHB and wait - localparam STATE_MISS_SPILL_FETCH_DONE = 9; // write data into SRAM/LUT - localparam STATE_MISS_SPILL_READ1 = 10; // read block 0 from SRAM/LUT - localparam STATE_MISS_SPILL_2 = 11; // return to ready if hit or do second block update. - localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 12; // miss on block 1, issue read to AHB and wait - localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 13; // write data to SRAM/LUT - localparam STATE_MISS_SPILL_MERGE = 14; // read block 0 of CPU access, + localparam STATE_HIT_SPILL_FINAL = 5; // this state replicates STATE_READY's replay of the + // spill access but does not consider spill. It also does not do another operation. + - localparam STATE_INVALIDATE = 15; // *** not sure if invalidate or evict? invalidate by cache block or address? + localparam STATE_MISS_FETCH_WDV = 6; // aligned miss, issue read to AHB and wait for data.
+ localparam STATE_MISS_FETCH_DONE = 7; // write data into SRAM/LUT + localparam STATE_MISS_READ = 8; // read block 1 from SRAM/LUT + + localparam STATE_MISS_SPILL_FETCH_WDV = 9; // spill, miss on block 0, issue read to AHB and wait + localparam STATE_MISS_SPILL_FETCH_DONE = 10; // write data into SRAM/LUT + localparam STATE_MISS_SPILL_READ1 = 11; // read block 0 from SRAM/LUT + localparam STATE_MISS_SPILL_2 = 12; // return to ready if hit or do second block update. + localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 13; // miss on block 1, issue read to AHB and wait + localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 14; // write data to SRAM/LUT + localparam STATE_MISS_SPILL_MERGE = 15; // read block 0 of CPU access, + + localparam STATE_MISS_SPILL_FINAL = 16; // this state replicates STATE_READY's replay of the + // spill access but does not consider spill. It also does not do another operation. + + + localparam STATE_INVALIDATE = 17; // *** not sure if invalidate or evict? invalidate by cache block or address? localparam AHBByteLength = `XLEN / 8; localparam AHBOFFETWIDTH = $clog2(AHBByteLength); @@ -164,7 +186,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( localparam WORDSPERLINE = LINESIZE/`XLEN; localparam LOGWPL = $clog2(WORDSPERLINE); - logic [3:0] CurrState, NextState; + logic [4:0] CurrState, NextState; logic hit, spill; logic SavePC; logic [1:0] PCMux; @@ -213,7 +235,8 @@ module icachecontroller #(parameter LINESIZE = 256) ( assign PCSpillF = PCPF + 2'b10; // now we have to select between these three PCs - assign PCPreFinalF = PCMux[0] | StallF ? PCPF : PCNextPF; // *** don't like the stallf + assign PCPreFinalF = PCMux[0] | StallF ? PCPF : PCNextPF; // *** don't like the stallf + //assign PCPreFinalF = PCMux[0] ? PCPF : PCNextPF; // *** don't like the stallf assign PCPFinalF = PCMux[1] ?
PCSpillF : PCPreFinalF; @@ -347,12 +370,12 @@ module icachecontroller #(parameter LINESIZE = 256) ( -----/\----- EXCLUDED -----/\----- */ // the FSM is always runing, do not stall. - flopr #(4) stateReg(.clk(clk), + flopr #(5) stateReg(.clk(clk), .reset(reset), .d(NextState), .q(CurrState)); - assign spill = PCPF[5:1] == 5'b1_1111 ? 1'b1 : 1'b0; + assign spill = PCPF[4:1] == 4'b1111 ? 1'b1 : 1'b0; assign hit = ICacheMemReadValid; // note ICacheMemReadValid is hit. assign FetchCountFlag = FetchCount == FetchCountThreshold; @@ -366,6 +389,8 @@ module icachecontroller #(parameter LINESIZE = 256) ( spillSave = 1'b0; PCMux = 2'b00; ICacheReadEn = 1'b0; + SavePC = 1'b0; + ICacheStallF = 1'b1; case (CurrState) @@ -373,15 +398,19 @@ module icachecontroller #(parameter LINESIZE = 256) ( PCMux = 2'b00; ICacheReadEn = 1'b1; if (hit & ~spill) begin + SavePC = 1'b1; + ICacheStallF = 1'b0; NextState = STATE_READY; end else if (hit & spill) begin spillSave = 1'b1; + PCMux = 2'b10; NextState = STATE_HIT_SPILL; end else if (~hit & ~spill) begin CntReset = 1'b1; NextState = STATE_MISS_FETCH_WDV; end else if (~hit & spill) begin CntReset = 1'b1; + PCMux = 2'b10; NextState = STATE_MISS_SPILL_FETCH_WDV; end else begin NextState = STATE_READY; @@ -394,7 +423,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( UnalignedSelect = 1'b1; ICacheReadEn = 1'b1; if (hit) begin - NextState = STATE_READY; + NextState = STATE_HIT_SPILL_FINAL; end else CntReset = 1'b1; NextState = STATE_HIT_SPILL_MISS_FETCH_WDV; @@ -418,7 +447,15 @@ module icachecontroller #(parameter LINESIZE = 256) ( PCMux = 2'b10; UnalignedSelect = 1'b1; ICacheReadEn = 1'b1; - NextState = STATE_READY; + NextState = STATE_HIT_SPILL_FINAL; + end + STATE_HIT_SPILL_FINAL: begin + ICacheReadEn = 1'b1; + PCMux = 2'b00; + UnalignedSelect = 1'b1; + SavePC = 1'b1; + NextState = STATE_READY; + ICacheStallF = 1'b0; end // branch 3 miss no spill @@ -472,7 +509,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( CntReset = 1'b1; 
NextState = STATE_MISS_SPILL_MISS_FETCH_WDV; end else begin - NextState = STATE_READY; + NextState = STATE_MISS_SPILL_FINAL; end end STATE_MISS_SPILL_MISS_FETCH_WDV: begin @@ -494,7 +531,15 @@ module icachecontroller #(parameter LINESIZE = 256) ( PCMux = 2'b10; UnalignedSelect = 1'b1; ICacheReadEn = 1'b1; - NextState = STATE_READY; + NextState = STATE_MISS_SPILL_FINAL; + end + STATE_MISS_SPILL_FINAL: begin + ICacheReadEn = 1'b1; + PCMux = 2'b00; + UnalignedSelect = 1'b1; + SavePC = 1'b1; + ICacheStallF = 1'b0; + NextState = STATE_READY; end default: begin PCMux = 2'b01; @@ -508,9 +553,10 @@ module icachecontroller #(parameter LINESIZE = 256) ( // stall CPU any time we are not in the ready state. any other state means the // cache is either requesting data from the memory interface or handling a // spill over two cycles. - assign ICacheStallF = ((CurrState != STATE_READY) | ~hit) | reset_q ? 1'b1 : 1'b0; + // *** BUG this logic will need to change + //assign ICacheStallF = ((CurrState != STATE_READY) | ~hit | spill) | reset_q ? 1'b1 : 1'b0; // save the PC anytime we are in the ready state. The saved value will be used as the PC may not be stable. - assign SavePC = (CurrState == STATE_READY) & hit ? 1'b1 : 1'b0; + //assign SavePC = ((CurrState == STATE_READY) & hit) & ~spill ? 1'b1 : 1'b0; assign CntEn = PreCntEn & InstrAckF; assign InstrReadF = (CurrState == STATE_HIT_SPILL_MISS_FETCH_WDV) || @@ -571,7 +617,7 @@ module icachecontroller #(parameter LINESIZE = 256) ( .en(~StallF), .d(PCPreFinalF[1]), .q(PCPreFinalF_q[1])); - assign FinalInstrRawF = PCPreFinalF_q[1] ? {ICacheMemReadData[31:16], SpillDataBlock0} : ICacheMemReadData; + assign FinalInstrRawF = spill ? {ICacheMemReadData[15:0], SpillDataBlock0} : ICacheMemReadData; // There is a frustrating issue on the first access. 
// The cache will not contain any valid data but will contain x's on diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv index 58b144f5..5d728764 100644 --- a/wally-pipelined/src/ifu/ifu.sv +++ b/wally-pipelined/src/ifu/ifu.sv @@ -89,7 +89,7 @@ module ifu ( // branch predictor signals logic SelBPPredF; - logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F, PCNext2F; + logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F, PCNext2F, PCNext3F; logic [3:0] InstrClassD, InstrClassE; @@ -124,11 +124,20 @@ module ifu ( .s(PrivilegedChangePCM), .y(PCNext2F)); + // *** try to remove this in the future as it can add a long path. + // StallF may arrive late. +/* -----\/----- EXCLUDED -----\/----- mux2 #(`XLEN) pcmux3(.d0(PCNext2F), + .d1(PCF), + .s(StallF), + .y(PCNext3F)); + -----/\----- EXCLUDED -----/\----- */ + + mux2 #(`XLEN) pcmux4(.d0(PCNext2F), .d1(`RESET_VECTOR), .s(reset_q), - .y(UnalignedPCNextF)); - + .y(UnalignedPCNextF)); + flop #(1) resetReg (.clk(clk), .d(reset), .q(reset_q)); From c42399bdb5b5bc99f41f9638f8aa056a43e12e54 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Thu, 22 Apr 2021 15:22:56 -0500 Subject: [PATCH 24/24] Yes. The hack to not repeat the d memory operation fixed this issue. 
--- wally-pipelined/regression/wave.do | 25 ++++++++++++++++++-- wally-pipelined/src/dmem/dmem.sv | 37 ++++++++++++++++++++++++++++-- wally-pipelined/src/ebu/ahblite.sv | 4 +++- 3 files changed, 61 insertions(+), 5 deletions(-) diff --git a/wally-pipelined/regression/wave.do b/wally-pipelined/regression/wave.do index 280042de..0dbdf5de 100644 --- a/wally-pipelined/regression/wave.do +++ b/wally-pipelined/regression/wave.do @@ -101,6 +101,9 @@ add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/neg add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/lt add wave -noupdate -group alu /testbench/dut/hart/ieu/dp/alu/ltu add wave -noupdate /testbench/InstrFName +add wave -noupdate -expand -group {dcache memory} /testbench/dut/hart/dmem/MemReadM +add wave -noupdate -expand -group {dcache memory} /testbench/dut/hart/dmem/MemWriteM +add wave -noupdate -expand -group {dcache memory} /testbench/dut/hart/dmem/MemAckW add wave -noupdate -group dcache /testbench/dut/hart/MemAdrM add wave -noupdate -group dcache /testbench/dut/hart/MemPAdrM add wave -noupdate -group dcache /testbench/dut/hart/WriteDataM @@ -174,6 +177,8 @@ add wave -noupdate -expand -group icache -expand -group {fsm out and control} /t add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/PCMux add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/spillSave add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/CntReset +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/PreCntEn +add wave -noupdate -expand -group icache -expand -group {fsm out and control} /testbench/dut/hart/ifu/icache/controller/CntEn add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/AHBByteLength add wave 
-noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/AHBOFFETWIDTH add wave -noupdate -expand -group icache -group parameters /testbench/dut/hart/ifu/icache/controller/BlockByteLength @@ -219,8 +224,24 @@ add wave -noupdate /testbench/dut/hart/ifu/icache/cachemem/OldReadPAdr add wave -noupdate /testbench/dut/hart/ifu/CompressedF add wave -noupdate /testbench/dut/hart/ifu/icache/controller/SpillDataBlock0 add wave -noupdate /testbench/dut/hart/ifu/icache/controller/PCPreFinalF_q +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/BusState +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HCLK +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HRDATA +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HREADY +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HRESP +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HADDR +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HWDATA +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HWRITE +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HSIZE +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HBURST +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HPROT +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HTRANS +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HMASTLOCK +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HADDRD +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HSIZED +add wave -noupdate -expand -group AHB /testbench/dut/hart/ebu/HWRITED TreeUpdate [SetDefaultTree] -WaveRestoreCursors {{Cursor 2} {9808584 ns} 0} {{Cursor 3} {9808065 ns} 0} {{Cursor 4} {535 ns} 0} +WaveRestoreCursors {{Cursor 2} {9808206 ns} 0} {{Cursor 3} {9807791 ns} 0} {{Cursor 4} {85 ns} 0} quietly wave cursor active 1 configure wave -namecolwidth 250 configure wave -valuecolwidth 513 @@ -236,4 +257,4 @@ configure wave 
-griddelta 40 configure wave -timeline 0 configure wave -timelineunits ns update -WaveRestoreZoom {9808255 ns} {9808913 ns} +WaveRestoreZoom {9807926 ns} {9808486 ns} diff --git a/wally-pipelined/src/dmem/dmem.sv b/wally-pipelined/src/dmem/dmem.sv index 75559c3f..8836972b 100644 --- a/wally-pipelined/src/dmem/dmem.sv +++ b/wally-pipelined/src/dmem/dmem.sv @@ -63,6 +63,14 @@ module dmem ( // *** needs to be sent to trap unit logic DTLBPageFaultM; + logic [1:0] CurrState, NextState; + + localparam STATE_READY = 0; + localparam STATE_FETCH = 1; + localparam STATE_STALLED = 2; + + + tlb #(3) dtlb(.TLBAccess(MemAccessM), .VirtualAddress(MemAdrM), .PageTableEntryWrite(PageTableEntryM), .PageTypeWrite(PageTypeM), .TLBWrite(DTLBWriteM), .TLBFlush(DTLBFlushM), @@ -81,8 +89,8 @@ module dmem ( // Squash unaligned data accesses and failed store conditionals // *** this is also the place to squash if the cache is hit - assign MemReadM = MemRWM[1] & ~DataMisalignedM; - assign MemWriteM = MemRWM[0] & ~DataMisalignedM && ~SquashSCM; + assign MemReadM = MemRWM[1] & ~DataMisalignedM & CurrState != STATE_STALLED; + assign MemWriteM = MemRWM[0] & ~DataMisalignedM && ~SquashSCM & CurrState != STATE_STALLED; assign MemAccessM = |MemRWM; // Determine if address is valid @@ -119,5 +127,30 @@ module dmem ( // Data stall //assign DataStall = 0; + // Ross Thompson April 22, 2021 + // for now we need to handle the issue where the data memory interface repeatedly + // requests data from memory rather than issuing a single request.
+ + + flopr #(2) stateReg(.clk(clk), + .reset(reset), + .d(NextState), + .q(CurrState)); + + always_comb begin + case (CurrState) + STATE_READY: if (MemAccessM & ~DataMisalignedM) NextState = STATE_FETCH; + else NextState = STATE_READY; + STATE_FETCH: if (MemAckW & ~StallW) NextState = STATE_READY; + else if (MemAckW & StallW) NextState = STATE_STALLED; + else NextState = STATE_FETCH; + STATE_STALLED: if (~StallW) NextState = STATE_READY; + else NextState = STATE_STALLED; + default: NextState = STATE_READY; + endcase // case (CurrState) + end + + + endmodule diff --git a/wally-pipelined/src/ebu/ahblite.sv b/wally-pipelined/src/ebu/ahblite.sv index e71a94f9..c2121714 100644 --- a/wally-pipelined/src/ebu/ahblite.sv +++ b/wally-pipelined/src/ebu/ahblite.sv @@ -75,7 +75,8 @@ module ahblite ( output logic [3:0] HSIZED, output logic HWRITED, // Stalls - output logic /*InstrUpdate, */DataStall + output logic /*InstrUpdate, */DataStall, + output logic MemAckW // *** add a chip-level ready signal as part of handshake ); @@ -175,6 +176,7 @@ module ahblite ( assign InstrRData = HRDATA; assign InstrAckF = (BusState == INSTRREAD) && (NextBusState != INSTRREAD) || (BusState == INSTRREADC) && (NextBusState != INSTRREADC); + assign MemAckW = (BusState == MEMREAD) && (NextBusState != MEMREAD) || (BusState == MEMWRITE) && (NextBusState != MEMWRITE); assign MMUReadPTE = HRDATA; assign ReadDataM = HRDATAMasked; // changed from W to M dh 2/7/2021 assign CaptureDataM = ((BusState == MEMREAD) && (NextBusState != MEMREAD)) ||