Merge pull request #560 from ross144/main

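Removed the unnecessary spill for a compressed instruction located at the end of a cache line or fetched by an uncached access; a spill is now taken only when the instruction at that position is uncompressed and therefore actually straddles the boundary. Improves Coremark from 2.97 to 2.99.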
This commit is contained in:
David Harris 2024-01-10 11:50:39 -08:00 committed by GitHub
commit c62c351aa7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 21 additions and 20 deletions

View File

@ -48,13 +48,12 @@ module ahbinterface #(
input logic [XLEN-1:0] WriteData, // IEU write data for a store input logic [XLEN-1:0] WriteData, // IEU write data for a store
output logic BusStall, // Bus is busy with an in flight memory operation output logic BusStall, // Bus is busy with an in flight memory operation
output logic BusCommitted, // Bus is busy with an in flight memory operation and it is not safe to take an interrupt output logic BusCommitted, // Bus is busy with an in flight memory operation and it is not safe to take an interrupt
output logic [(LSU ? XLEN : 32)-1:0] FetchBuffer // Register to hold HRDATA after arriving from the bus output logic [XLEN-1:0] FetchBuffer // Register to hold HRDATA after arriving from the bus
); );
logic CaptureEn; logic CaptureEn;
localparam LEN = (LSU ? XLEN : 32); // 32 bits for IFU, XLEN for LSU
flopen #(LEN) fb(.clk(HCLK), .en(CaptureEn), .d(HRDATA[LEN-1:0]), .q(FetchBuffer)); flopen #(XLEN) fb(.clk(HCLK), .en(CaptureEn), .d(HRDATA), .q(FetchBuffer));
if(LSU) begin if(LSU) begin
// delay HWDATA by 1 cycle per spec; assumes AHBW = XLEN // delay HWDATA by 1 cycle per spec; assumes AHBW = XLEN

View File

@ -99,6 +99,7 @@ module ifu import cvw::*; #(parameter cvw_t P) (
); );
localparam [31:0] nop = 32'h00000013; // instruction for NOP localparam [31:0] nop = 32'h00000013; // instruction for NOP
localparam LINELEN = P.ICACHE_SUPPORTED ? P.ICACHE_LINELENINBITS : P.XLEN;
logic [P.XLEN-1:0] PCNextF; // Next PCF, selected from Branch predictor, Privilege, or PC+2/4 logic [P.XLEN-1:0] PCNextF; // Next PCF, selected from Branch predictor, Privilege, or PC+2/4
logic [P.XLEN-1:0] PC1NextF; // Branch predictor next PCF logic [P.XLEN-1:0] PC1NextF; // Branch predictor next PCF
@ -136,6 +137,8 @@ module ifu import cvw::*; #(parameter cvw_t P) (
logic CacheCommittedF; // I$ memory operation started, delay interrupts logic CacheCommittedF; // I$ memory operation started, delay interrupts
logic SelIROM; // PMA indicates instruction address is in the IROM logic SelIROM; // PMA indicates instruction address is in the IROM
logic [15:0] InstrRawE, InstrRawM; logic [15:0] InstrRawE, InstrRawM;
logic [LINELEN-1:0] FetchBuffer;
logic [31:0] ShiftUncachedInstr;
assign PCFExt = {2'b00, PCSpillF}; assign PCFExt = {2'b00, PCSpillF};
@ -225,9 +228,7 @@ module ifu import cvw::*; #(parameter cvw_t P) (
localparam LOGBWPL = P.ICACHE_SUPPORTED ? $clog2(WORDSPERLINE) : 1; localparam LOGBWPL = P.ICACHE_SUPPORTED ? $clog2(WORDSPERLINE) : 1;
if(P.ICACHE_SUPPORTED) begin : icache if(P.ICACHE_SUPPORTED) begin : icache
localparam LINELEN = P.ICACHE_SUPPORTED ? P.ICACHE_LINELENINBITS : P.XLEN;
localparam LLENPOVERAHBW = P.LLEN / P.AHBW; // Number of AHB beats in a LLEN word. AHBW cannot be larger than LLEN. (implementation limitation) localparam LLENPOVERAHBW = P.LLEN / P.AHBW; // Number of AHB beats in a LLEN word. AHBW cannot be larger than LLEN. (implementation limitation)
logic [LINELEN-1:0] FetchBuffer;
logic [P.PA_BITS-1:0] ICacheBusAdr; logic [P.PA_BITS-1:0] ICacheBusAdr;
logic ICacheBusAck; logic ICacheBusAck;
logic [1:0] CacheBusRW, BusRW, CacheRWF; logic [1:0] CacheBusRW, BusRW, CacheRWF;
@ -264,16 +265,10 @@ module ifu import cvw::*; #(parameter cvw_t P) (
.BusRW, .Stall(GatedStallD), .BusRW, .Stall(GatedStallD),
.BusStall, .BusCommitted(BusCommittedF)); .BusStall, .BusCommitted(BusCommittedF));
logic [31:0] ShiftUncachedInstr;
if(P.XLEN == 64) mux4 #(32) UncachedShiftInstrMux(FetchBuffer[32-1:0], FetchBuffer[48-1:16], FetchBuffer[64-1:32], {16'b0, FetchBuffer[64-1:48]},
PCSpillF[2:1], ShiftUncachedInstr);
else mux2 #(32) UncachedShiftInstrMux(FetchBuffer[32-1:0], {16'b0, FetchBuffer[32-1:16]}, PCSpillF[1], ShiftUncachedInstr);
mux3 #(32) UnCachedDataMux(.d0(ICacheInstrF), .d1(ShiftUncachedInstr), .d2(IROMInstrF), mux3 #(32) UnCachedDataMux(.d0(ICacheInstrF), .d1(ShiftUncachedInstr), .d2(IROMInstrF),
.s({SelIROM, ~CacheableF}), .y(InstrRawF[31:0])); .s({SelIROM, ~CacheableF}), .y(InstrRawF[31:0]));
end else begin : passthrough end else begin : passthrough
assign IFUHADDR = PCPF; assign IFUHADDR = PCPF;
logic [31:0] FetchBuffer;
logic [1:0] BusRW; logic [1:0] BusRW;
assign BusRW = ~ITLBMissF & ~SelIROM ? IFURWF : '0; assign BusRW = ~ITLBMissF & ~SelIROM ? IFURWF : '0;
assign IFUHSIZE = 3'b010; assign IFUHSIZE = 3'b010;
@ -284,8 +279,8 @@ module ifu import cvw::*; #(parameter cvw_t P) (
.Stall(GatedStallD), .BusStall, .BusCommitted(BusCommittedF), .FetchBuffer(FetchBuffer)); .Stall(GatedStallD), .BusStall, .BusCommitted(BusCommittedF), .FetchBuffer(FetchBuffer));
assign CacheCommittedF = '0; assign CacheCommittedF = '0;
if(P.IROM_SUPPORTED) mux2 #(32) UnCachedDataMux2(FetchBuffer, IROMInstrF, SelIROM, InstrRawF); if(P.IROM_SUPPORTED) mux2 #(32) UnCachedDataMux2(ShiftUncachedInstr, IROMInstrF, SelIROM, InstrRawF);
else assign InstrRawF = FetchBuffer; else assign InstrRawF = ShiftUncachedInstr;
assign IFUHBURST = 3'b0; assign IFUHBURST = 3'b0;
assign {ICacheMiss, ICacheAccess, ICacheStallF} = '0; assign {ICacheMiss, ICacheAccess, ICacheStallF} = '0;
end end
@ -295,6 +290,11 @@ module ifu import cvw::*; #(parameter cvw_t P) (
assign InstrRawF = IROMInstrF; assign InstrRawF = IROMInstrF;
end end
// mux between the alignments of uncached reads.
if(P.XLEN == 64) mux4 #(32) UncachedShiftInstrMux(FetchBuffer[32-1:0], FetchBuffer[48-1:16], FetchBuffer[64-1:32], {16'b0, FetchBuffer[64-1:48]},
PCSpillF[2:1], ShiftUncachedInstr);
else mux2 #(32) UncachedShiftInstrMux(FetchBuffer[32-1:0], {16'b0, FetchBuffer[32-1:16]}, PCSpillF[1], ShiftUncachedInstr);
assign IFUCacheBusStallF = ICacheStallF | BusStall; assign IFUCacheBusStallF = ICacheStallF | BusStall;
assign IFUStallF = IFUCacheBusStallF | SelSpillNextF; assign IFUStallF = IFUCacheBusStallF | SelSpillNextF;
assign GatedStallD = StallD & ~SelSpillNextF; assign GatedStallD = StallD & ~SelSpillNextF;

View File

@ -57,6 +57,7 @@ module spill import cvw::*; #(parameter cvw_t P) (
logic SelSpillF; logic SelSpillF;
logic SpillSaveF; logic SpillSaveF;
logic [15:0] InstrFirstHalfF; logic [15:0] InstrFirstHalfF;
logic EarlyCompressedF;
//////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////
// PC logic // PC logic
@ -79,12 +80,12 @@ module spill import cvw::*; #(parameter cvw_t P) (
if (P.ICACHE_SUPPORTED) begin if (P.ICACHE_SUPPORTED) begin
logic SpillCachedF, SpillUncachedF; logic SpillCachedF, SpillUncachedF;
assign SpillCachedF = &PCF[$clog2(P.ICACHE_LINELENINBITS/32)+1:1]; assign SpillCachedF = &PCF[$clog2(P.ICACHE_LINELENINBITS/32)+1:1];
assign SpillUncachedF = PCF[1]; // *** try to optimize this based on whether the next instruction is 16 bits and by fetching 64 bits in RV64 assign SpillUncachedF = PCF[1];
assign SpillF = CacheableF ? SpillCachedF : SpillUncachedF; assign SpillF = (CacheableF ? SpillCachedF : SpillUncachedF);
end else end else
assign SpillF = PCF[1]; // *** might relax - only spill if next instruction is uncompressed assign SpillF = PCF[1];
// Don't take the spill if there is a stall, TLB miss, or hardware update to the D/A bits // Don't take the spill if there is a stall, TLB miss, or hardware update to the D/A bits
assign TakeSpillF = SpillF & ~IFUCacheBusStallF & ~(ITLBMissF | (P.SVADU_SUPPORTED & InstrUpdateDAF)); assign TakeSpillF = SpillF & ~EarlyCompressedF & ~IFUCacheBusStallF & ~(ITLBMissF | (P.SVADU_SUPPORTED & InstrUpdateDAF));
always_ff @(posedge clk) always_ff @(posedge clk)
if (reset | FlushD) CurrState <= #1 STATE_READY; if (reset | FlushD) CurrState <= #1 STATE_READY;
@ -112,11 +113,12 @@ module spill import cvw::*; #(parameter cvw_t P) (
flopenr #(16) SpillInstrReg(clk, reset, SpillSaveF, InstrRawF[15:0], InstrFirstHalfF); flopenr #(16) SpillInstrReg(clk, reset, SpillSaveF, InstrRawF[15:0], InstrFirstHalfF);
// merge together // merge together
mux2 #(32) postspillmux(InstrRawF, {InstrRawF[15:0], InstrFirstHalfF}, SpillF, PostSpillInstrRawF); mux2 #(32) postspillmux(InstrRawF, {InstrRawF[15:0], InstrFirstHalfF}, SelSpillF, PostSpillInstrRawF);
// Need to use always comb to avoid pessimistic x propagation if PostSpillInstrRawF is x // Need to use always comb to avoid pessimistic x propagation if PostSpillInstrRawF is x
always_comb always_comb
if (PostSpillInstrRawF[1:0] != 2'b11) CompressedF = 1'b1; if (PostSpillInstrRawF[1:0] != 2'b11) CompressedF = 1'b1;
else CompressedF = 1'b0; else CompressedF = 1'b0;
assign EarlyCompressedF = ~(&InstrRawF[1:0]);
endmodule endmodule