Merge pull request #560 from ross144/main

Removed unnecessary spill on uncompressed instruction when aligned to end of cache line or uncached access.  Improves Coremark from 2.97 to 2.99.
This commit is contained in:
David Harris 2024-01-10 11:50:39 -08:00 committed by GitHub
commit c62c351aa7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 21 additions and 20 deletions

View File

@ -48,13 +48,12 @@ module ahbinterface #(
input logic [XLEN-1:0] WriteData, // IEU write data for a store
output logic BusStall, // Bus is busy with an in flight memory operation
output logic BusCommitted, // Bus is busy with an in flight memory operation and it is not safe to take an interrupt
output logic [(LSU ? XLEN : 32)-1:0] FetchBuffer // Register to hold HRDATA after arriving from the bus
output logic [XLEN-1:0] FetchBuffer // Register to hold HRDATA after arriving from the bus
);
logic CaptureEn;
localparam LEN = (LSU ? XLEN : 32); // 32 bits for IFU, XLEN for LSU
flopen #(LEN) fb(.clk(HCLK), .en(CaptureEn), .d(HRDATA[LEN-1:0]), .q(FetchBuffer));
flopen #(XLEN) fb(.clk(HCLK), .en(CaptureEn), .d(HRDATA), .q(FetchBuffer));
if(LSU) begin
// delay HWDATA by 1 cycle per spec; assumes AHBW = XLEN

View File

@ -99,6 +99,7 @@ module ifu import cvw::*; #(parameter cvw_t P) (
);
localparam [31:0] nop = 32'h00000013; // instruction for NOP
localparam LINELEN = P.ICACHE_SUPPORTED ? P.ICACHE_LINELENINBITS : P.XLEN;
logic [P.XLEN-1:0] PCNextF; // Next PCF, selected from Branch predictor, Privilege, or PC+2/4
logic [P.XLEN-1:0] PC1NextF; // Branch predictor next PCF
@ -136,6 +137,8 @@ module ifu import cvw::*; #(parameter cvw_t P) (
logic CacheCommittedF; // I$ memory operation started, delay interrupts
logic SelIROM; // PMA indicates instruction address is in the IROM
logic [15:0] InstrRawE, InstrRawM;
logic [LINELEN-1:0] FetchBuffer;
logic [31:0] ShiftUncachedInstr;
assign PCFExt = {2'b00, PCSpillF};
@ -225,9 +228,7 @@ module ifu import cvw::*; #(parameter cvw_t P) (
localparam LOGBWPL = P.ICACHE_SUPPORTED ? $clog2(WORDSPERLINE) : 1;
if(P.ICACHE_SUPPORTED) begin : icache
localparam LINELEN = P.ICACHE_SUPPORTED ? P.ICACHE_LINELENINBITS : P.XLEN;
localparam LLENPOVERAHBW = P.LLEN / P.AHBW; // Number of AHB beats in a LLEN word. AHBW cannot be larger than LLEN. (implementation limitation)
logic [LINELEN-1:0] FetchBuffer;
logic [P.PA_BITS-1:0] ICacheBusAdr;
logic ICacheBusAck;
logic [1:0] CacheBusRW, BusRW, CacheRWF;
@ -264,16 +265,10 @@ module ifu import cvw::*; #(parameter cvw_t P) (
.BusRW, .Stall(GatedStallD),
.BusStall, .BusCommitted(BusCommittedF));
logic [31:0] ShiftUncachedInstr;
if(P.XLEN == 64) mux4 #(32) UncachedShiftInstrMux(FetchBuffer[32-1:0], FetchBuffer[48-1:16], FetchBuffer[64-1:32], {16'b0, FetchBuffer[64-1:48]},
PCSpillF[2:1], ShiftUncachedInstr);
else mux2 #(32) UncachedShiftInstrMux(FetchBuffer[32-1:0], {16'b0, FetchBuffer[32-1:16]}, PCSpillF[1], ShiftUncachedInstr);
mux3 #(32) UnCachedDataMux(.d0(ICacheInstrF), .d1(ShiftUncachedInstr), .d2(IROMInstrF),
.s({SelIROM, ~CacheableF}), .y(InstrRawF[31:0]));
end else begin : passthrough
assign IFUHADDR = PCPF;
logic [31:0] FetchBuffer;
logic [1:0] BusRW;
assign BusRW = ~ITLBMissF & ~SelIROM ? IFURWF : '0;
assign IFUHSIZE = 3'b010;
@ -284,8 +279,8 @@ module ifu import cvw::*; #(parameter cvw_t P) (
.Stall(GatedStallD), .BusStall, .BusCommitted(BusCommittedF), .FetchBuffer(FetchBuffer));
assign CacheCommittedF = '0;
if(P.IROM_SUPPORTED) mux2 #(32) UnCachedDataMux2(FetchBuffer, IROMInstrF, SelIROM, InstrRawF);
else assign InstrRawF = FetchBuffer;
if(P.IROM_SUPPORTED) mux2 #(32) UnCachedDataMux2(ShiftUncachedInstr, IROMInstrF, SelIROM, InstrRawF);
else assign InstrRawF = ShiftUncachedInstr;
assign IFUHBURST = 3'b0;
assign {ICacheMiss, ICacheAccess, ICacheStallF} = '0;
end
@ -294,6 +289,11 @@ module ifu import cvw::*; #(parameter cvw_t P) (
assign {ICacheStallF, ICacheMiss, ICacheAccess} = '0;
assign InstrRawF = IROMInstrF;
end
// mux between the alignments of uncached reads.
if(P.XLEN == 64) mux4 #(32) UncachedShiftInstrMux(FetchBuffer[32-1:0], FetchBuffer[48-1:16], FetchBuffer[64-1:32], {16'b0, FetchBuffer[64-1:48]},
PCSpillF[2:1], ShiftUncachedInstr);
else mux2 #(32) UncachedShiftInstrMux(FetchBuffer[32-1:0], {16'b0, FetchBuffer[32-1:16]}, PCSpillF[1], ShiftUncachedInstr);
assign IFUCacheBusStallF = ICacheStallF | BusStall;
assign IFUStallF = IFUCacheBusStallF | SelSpillNextF;

View File

@ -57,6 +57,7 @@ module spill import cvw::*; #(parameter cvw_t P) (
logic SelSpillF;
logic SpillSaveF;
logic [15:0] InstrFirstHalfF;
logic EarlyCompressedF;
////////////////////////////////////////////////////////////////////////////////////////////////////
// PC logic
@ -77,14 +78,14 @@ module spill import cvw::*; #(parameter cvw_t P) (
////////////////////////////////////////////////////////////////////////////////////////////////////
if (P.ICACHE_SUPPORTED) begin
logic SpillCachedF, SpillUncachedF;
logic SpillCachedF, SpillUncachedF;
assign SpillCachedF = &PCF[$clog2(P.ICACHE_LINELENINBITS/32)+1:1];
assign SpillUncachedF = PCF[1]; // *** try to optimize this based on whether the next instruction is 16 bits and by fetching 64 bits in RV64
assign SpillF = CacheableF ? SpillCachedF : SpillUncachedF;
assign SpillUncachedF = PCF[1];
assign SpillF = (CacheableF ? SpillCachedF : SpillUncachedF);
end else
assign SpillF = PCF[1]; // *** might relax - only spill if next instruction is uncompressed
assign SpillF = PCF[1];
// Don't take the spill if there is a stall, TLB miss, or hardware update to the D/A bits
assign TakeSpillF = SpillF & ~IFUCacheBusStallF & ~(ITLBMissF | (P.SVADU_SUPPORTED & InstrUpdateDAF));
assign TakeSpillF = SpillF & ~EarlyCompressedF & ~IFUCacheBusStallF & ~(ITLBMissF | (P.SVADU_SUPPORTED & InstrUpdateDAF));
always_ff @(posedge clk)
if (reset | FlushD) CurrState <= #1 STATE_READY;
@ -112,11 +113,12 @@ module spill import cvw::*; #(parameter cvw_t P) (
flopenr #(16) SpillInstrReg(clk, reset, SpillSaveF, InstrRawF[15:0], InstrFirstHalfF);
// merge together
mux2 #(32) postspillmux(InstrRawF, {InstrRawF[15:0], InstrFirstHalfF}, SpillF, PostSpillInstrRawF);
mux2 #(32) postspillmux(InstrRawF, {InstrRawF[15:0], InstrFirstHalfF}, SelSpillF, PostSpillInstrRawF);
// Need to use always comb to avoid pessimistic x propagation if PostSpillInstrRawF is x
always_comb
if (PostSpillInstrRawF[1:0] != 2'b11) CompressedF = 1'b1;
else CompressedF = 1'b0;
assign EarlyCompressedF = ~(&InstrRawF[1:0]);
endmodule