From 23c4ba27771a96615c85964fd1c407ba3dd15a10 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Wed, 26 Jan 2022 17:37:04 -0600 Subject: [PATCH] 1. Modified the cache so it can handle the reset delay internally. This removes the mux from the IFU. 2. Removed the write address delay from simpleram.sv 3. Fixed rv32tim and rv32ic mode to handle misalignment correctly. 4. Added imperas32i and imperas32c to rv32ic and rv32tim modes. --- pipelined/regression/regression-wally | 4 +- pipelined/src/cache/cachefsm.sv | 64 ++++++++++++++----------- pipelined/src/generic/flop/simpleram.sv | 7 +-- pipelined/src/ifu/ifu.sv | 62 +++++++----------------- pipelined/src/lsu/busfsm.sv | 4 +- pipelined/src/lsu/lsu.sv | 7 +-- 6 files changed, 62 insertions(+), 86 deletions(-) diff --git a/pipelined/regression/regression-wally b/pipelined/regression/regression-wally index a8f4f4dd4..391ee90ec 100755 --- a/pipelined/regression/regression-wally +++ b/pipelined/regression/regression-wally @@ -68,7 +68,7 @@ for test in tests32gc: grepstr="All tests ran without failures") configs.append(tc) -tests32ic = ["arch32i", "arch32c"] +tests32ic = ["arch32i", "arch32c", "imperas32i", "imperas32c"] for test in tests32ic: tc = TestCase( name=test, @@ -77,7 +77,7 @@ for test in tests32ic: grepstr="All tests ran without failures") configs.append(tc) -tests32tim = ["arch32i", "arch32c"] +tests32tim = ["arch32i", "arch32c", "imperas32i", "imperas32c"] for test in tests32tim: tc = TestCase( name=test, diff --git a/pipelined/src/cache/cachefsm.sv b/pipelined/src/cache/cachefsm.sv index 4997febec..7d4d2f77d 100644 --- a/pipelined/src/cache/cachefsm.sv +++ b/pipelined/src/cache/cachefsm.sv @@ -79,6 +79,8 @@ module cachefsm ); logic AnyCPUReqM; + logic [1:0] PreSelAdr; + logic resetDelay; typedef enum {STATE_READY, @@ -107,6 +109,12 @@ module cachefsm assign CacheAccess = AnyCPUReqM & CurrState == STATE_READY; assign CacheMiss = CacheAccess & ~CacheHit; + // special case on reset. 
When the fsm first exists reset the + // PCNextF will no longer be pointing to the correct address. + // But PCF will be the reset vector. + flop #(1) resetDelayReg(.clk, .d(reset), .q(resetDelay)); + assign SelAdr = resetDelay ? 2'b01 : PreSelAdr; + always_ff @(posedge clk) if (reset) CurrState <= #1 STATE_READY; else CurrState <= #1 NextState; @@ -114,7 +122,7 @@ module cachefsm // next state logic and some state ouputs. always_comb begin CacheStall = 1'b0; - SelAdr = 2'b00; + PreSelAdr = 2'b00; SetValid = 1'b0; ClearValid = 1'b0; SetDirty = 1'b0; @@ -137,7 +145,7 @@ module cachefsm STATE_READY: begin CacheStall = 1'b0; - SelAdr = 2'b00; + PreSelAdr = 2'b00; SRAMWordWriteEnable = 1'b0; SetDirty = 1'b0; LRUWriteEn = 1'b0; @@ -150,7 +158,7 @@ module cachefsm // PTW ready the CPU will stall. // The page table walker asserts it's control 1 cycle // after the TLBs miss. - SelAdr = 2'b01; + PreSelAdr = 2'b01; NextState = STATE_READY; end @@ -164,12 +172,12 @@ module cachefsm // amo hit else if(Atomic[1] & (&RW) & CacheHit) begin - SelAdr = 2'b01; + PreSelAdr = 2'b01; CacheStall = 1'b0; if(CPUBusy) begin NextState = STATE_CPU_BUSY_FINISH_AMO; - SelAdr = 2'b01; + PreSelAdr = 2'b01; end else begin SRAMWordWriteEnable = 1'b1; @@ -185,7 +193,7 @@ module cachefsm if(CPUBusy) begin NextState = STATE_CPU_BUSY; - SelAdr = 2'b01; + PreSelAdr = 2'b01; end else begin NextState = STATE_READY; @@ -193,7 +201,7 @@ module cachefsm end // write hit valid cached else if (RW[0] & CacheHit) begin - SelAdr = 2'b01; + PreSelAdr = 2'b01; CacheStall = 1'b0; SRAMWordWriteEnable = 1'b1; SetDirty = 1'b1; @@ -201,7 +209,7 @@ module cachefsm if(CPUBusy) begin NextState = STATE_CPU_BUSY; - SelAdr = 2'b01; + PreSelAdr = 2'b01; end else begin NextState = STATE_READY; @@ -218,7 +226,7 @@ module cachefsm STATE_MISS_FETCH_WDV: begin CacheStall = 1'b1; - SelAdr = 2'b01; + PreSelAdr = 2'b01; if (CacheBusAck) begin NextState = STATE_MISS_FETCH_DONE; @@ -229,7 +237,7 @@ module cachefsm 
STATE_MISS_FETCH_DONE: begin CacheStall = 1'b1; - SelAdr = 2'b01; + PreSelAdr = 2'b01; if(VictimDirty) begin NextState = STATE_MISS_EVICT_DIRTY; CacheWriteLine = 1'b1; @@ -242,14 +250,14 @@ module cachefsm SRAMLineWriteEnable = 1'b1; CacheStall = 1'b1; NextState = STATE_MISS_READ_WORD; - SelAdr = 2'b01; + PreSelAdr = 2'b01; SetValid = 1'b1; ClearDirty = 1'b1; //LRUWriteEn = 1'b1; // DO not update LRU on SRAM fetch update. Wait for subsequent read/write end STATE_MISS_READ_WORD: begin - SelAdr = 2'b01; + PreSelAdr = 2'b01; CacheStall = 1'b1; if (RW[0] & ~Atomic[1]) begin // handles stores and amo write. NextState = STATE_MISS_WRITE_WORD; @@ -261,12 +269,12 @@ module cachefsm end STATE_MISS_READ_WORD_DELAY: begin - //SelAdr = 2'b01; + //PreSelAdr = 2'b01; SRAMWordWriteEnable = 1'b0; SetDirty = 1'b0; LRUWriteEn = 1'b0; if(&RW & Atomic[1]) begin // amo write - SelAdr = 2'b01; + PreSelAdr = 2'b01; if(CPUBusy) begin NextState = STATE_CPU_BUSY_FINISH_AMO; end @@ -280,7 +288,7 @@ module cachefsm LRUWriteEn = 1'b1; if(CPUBusy) begin NextState = STATE_CPU_BUSY; - SelAdr = 2'b01; + PreSelAdr = 2'b01; end else begin NextState = STATE_READY; @@ -291,11 +299,11 @@ module cachefsm STATE_MISS_WRITE_WORD: begin SRAMWordWriteEnable = 1'b1; SetDirty = 1'b1; - SelAdr = 2'b01; + PreSelAdr = 2'b01; LRUWriteEn = 1'b1; if(CPUBusy) begin NextState = STATE_CPU_BUSY; - SelAdr = 2'b01; + PreSelAdr = 2'b01; end else begin NextState = STATE_READY; @@ -304,7 +312,7 @@ module cachefsm STATE_MISS_EVICT_DIRTY: begin CacheStall = 1'b1; - SelAdr = 2'b01; + PreSelAdr = 2'b01; SelEvict = 1'b1; if(CacheBusAck) begin NextState = STATE_MISS_WRITE_CACHE_LINE; @@ -315,10 +323,10 @@ module cachefsm STATE_CPU_BUSY: begin - SelAdr = 2'b00; + PreSelAdr = 2'b00; if(CPUBusy) begin NextState = STATE_CPU_BUSY; - SelAdr = 2'b01; + PreSelAdr = 2'b01; end else begin NextState = STATE_READY; @@ -326,7 +334,7 @@ module cachefsm end STATE_CPU_BUSY_FINISH_AMO: begin - SelAdr = 2'b01; + PreSelAdr = 2'b01; 
SRAMWordWriteEnable = 1'b0; SetDirty = 1'b0; LRUWriteEn = 1'b0; @@ -345,13 +353,13 @@ module cachefsm // intialize flush counters SelFlush = 1'b1; CacheStall = 1'b1; - SelAdr = 2'b10; + PreSelAdr = 2'b10; NextState = STATE_FLUSH_CHECK; end STATE_FLUSH_CHECK: begin CacheStall = 1'b1; - SelAdr = 2'b10; + PreSelAdr = 2'b10; SelFlush = 1'b1; if(VictimDirty) begin NextState = STATE_FLUSH_WRITE_BACK; @@ -360,7 +368,7 @@ module cachefsm end else if (FlushAdrFlag & FlushWayFlag) begin NextState = STATE_READY; CacheStall = 1'b0; - SelAdr = 2'b00; + PreSelAdr = 2'b00; FlushWayCntEn = 1'b0; end else if(FlushWayFlag) begin NextState = STATE_FLUSH_INCR; @@ -375,7 +383,7 @@ module cachefsm STATE_FLUSH_INCR: begin CacheStall = 1'b1; - SelAdr = 2'b10; + PreSelAdr = 2'b10; SelFlush = 1'b1; FlushWayCntRst = 1'b1; NextState = STATE_FLUSH_CHECK; @@ -383,7 +391,7 @@ module cachefsm STATE_FLUSH_WRITE_BACK: begin CacheStall = 1'b1; - SelAdr = 2'b10; + PreSelAdr = 2'b10; SelFlush = 1'b1; if(CacheBusAck) begin NextState = STATE_FLUSH_CLEAR_DIRTY; @@ -397,12 +405,12 @@ module cachefsm ClearDirty = 1'b1; VDWriteEnable = 1'b1; SelFlush = 1'b1; - SelAdr = 2'b10; + PreSelAdr = 2'b10; FlushWayCntEn = 1'b0; if(FlushAdrFlag & FlushWayFlag) begin NextState = STATE_READY; CacheStall = 1'b0; - SelAdr = 2'b00; + PreSelAdr = 2'b00; end else if (FlushWayFlag) begin NextState = STATE_FLUSH_INCR; FlushAdrCntEn = 1'b1; diff --git a/pipelined/src/generic/flop/simpleram.sv b/pipelined/src/generic/flop/simpleram.sv index 74fb7cd20..43b873567 100644 --- a/pipelined/src/generic/flop/simpleram.sv +++ b/pipelined/src/generic/flop/simpleram.sv @@ -39,20 +39,17 @@ module simpleram #(parameter BASE=0, RANGE = 65535) ( ); logic [`XLEN-1:0] RAM[BASE>>(1+`XLEN/32):(RANGE+BASE)>>1+(`XLEN/32)]; - logic [31:0] ad; - - flop #(32) areg(clk, a, ad); // *** redesign external interface so this delay isn't needed /* verilator lint_off WIDTH */ if (`XLEN == 64) begin:ramrw always_ff @(posedge clk) begin rd <= RAM[a[31:3]]; - if 
(we) RAM[ad[31:3]] <= #1 wd; + if (we) RAM[a[31:3]] <= #1 wd; end end else begin always_ff @(posedge clk) begin:ramrw rd <= RAM[a[31:2]]; - if (we) RAM[ad[31:2]] <= #1 wd; + if (we) RAM[a[31:2]] <= #1 wd; end end /* verilator lint_on WIDTH */ diff --git a/pipelined/src/ifu/ifu.sv b/pipelined/src/ifu/ifu.sv index 4b73d6852..94e6ae2d2 100644 --- a/pipelined/src/ifu/ifu.sv +++ b/pipelined/src/ifu/ifu.sv @@ -232,12 +232,10 @@ module ifu ( if (`MEM_IROM) begin : irom logic [`XLEN-1:0] FinalInstrRawF_FIXME; - // *** adjust interface so write address doesn't need delaying - // *** modify to be a ROM rather than RAM simpleram #( .BASE(`RAM_BASE), .RANGE(`RAM_RANGE)) ram ( .clk, - .a(CPUBusy ? PCPF[31:0] : PCNextFMux[31:0]), // mux is also inside $, have to replay address if CPU is stalled. + .a(CPUBusy | reset ? PCPF[31:0] : PCNextFMux[31:0]), // mux is also inside $, have to replay address if CPU is stalled. .we(1'b0), .wd(0), .rd(FinalInstrRawF_FIXME)); assign FinalInstrRawF = FinalInstrRawF_FIXME[31:0]; @@ -328,50 +326,23 @@ module ifu ( assign PrivilegedChangePCM = RetM | TrapM; - mux2 #(`XLEN) pcmux0(.d0(PCPlus2or4F), - .d1(BPPredPCF), - .s(SelBPPredF), - .y(PCNext0F)); - - mux2 #(`XLEN) pcmux1(.d0(PCNext0F), - .d1(PCCorrectE), - .s(BPPredWrongE), - .y(PCNext1F)); - - // December 20, 2021 Ross Thompson, If instructions in ID and IF are already invalid we don't pick PCE on icache invalidate. - // this only happens because of branch class miss prediction. The Fence instruction was incorrectly predicted as a branch - // this means on the previous cycle the BPPredWrongE updated PCNextF to the correct fall through address. - // to fix we need to select the correct address PCF as the next PCNextF. Unforunately we must still flush the instruction in IF - // as we are deliberately invalidating the icache. This address has to be refetched by the icache. 
- mux2 #(`XLEN) pcmux2(.d0(PCNext1F), - .d1(PCBPWrongInvalidate), - .s(InvalidateICacheM), - .y(PCNext2F)); - - mux2 #(`XLEN) pcmux3(.d0(PCNext2F), - .d1(PrivilegedNextPCM), - .s(PrivilegedChangePCM), - //.y(UnalignedPCNextF)); - .y(PCNext3F)); - + mux2 #(`XLEN) pcmux0(.d0(PCPlus2or4F), .d1(BPPredPCF), .s(SelBPPredF), .y(PCNext0F)); + mux2 #(`XLEN) pcmux1(.d0(PCNext0F), .d1(PCCorrectE), .s(BPPredWrongE), .y(PCNext1F)); + // The true correct target is IEUAdrE if PCSrcE is 1 else it is the fall through PCLinkE. + mux2 #(`XLEN) pccorrectemux(.d0(PCLinkE), .d1(IEUAdrE), .s(PCSrcE), .y(PCCorrectE)); + mux2 #(`XLEN) pcmux2(.d0(PCNext1F), .d1(PCBPWrongInvalidate), .s(InvalidateICacheM), .y(PCNext2F)); + // Mux only required on instruction class miss prediction. + mux2 #(`XLEN) pcmuxBPWrongInvalidateFlush(.d0(PCE), .d1(PCF), .s(BPPredWrongM), .y(PCBPWrongInvalidate)); + mux2 #(`XLEN) pcmux3(.d0(PCNext2F), .d1(PrivilegedNextPCM), .s(PrivilegedChangePCM), .y(PCNext3F)); // This mux is required as PCNextF needs to be the valid reset vector during reset. // Reseting PCF does not accomplish this as PCNextF will be +2/4 more than PCF. - mux2 #(`XLEN) pcmux4(.d0(PCNext3F), - .d1(`RESET_VECTOR), - .s(`MEM_IROM ? reset : reset_q), - .y(UnalignedPCNextF)); - - flop #(1) resetReg (.clk(clk), .d(reset),.q(reset_q)); // delay reset - - flopenrc #(1) BPPredWrongMReg(.clk, .reset, .en(~StallM), .clear(FlushM), - .d(BPPredWrongE), .q(BPPredWrongM)); - - mux2 #(`XLEN) pcmuxBPWrongInvalidateFlush(.d0(PCE), .d1(PCF), - .s(BPPredWrongM), // & InvalidateICacheM *** check with linux. - .y(PCBPWrongInvalidate)); - // The true correct target is IEUAdrE if PCSrcE is 1 else it is the fall through PCLinkE. - assign PCCorrectE = PCSrcE ? IEUAdrE : PCLinkE; + //mux2 #(`XLEN) pcmux4(.d0(PCNext3F), .d1(`RESET_VECTOR), .s(`MEM_IROM ? 
reset : reset_q), .y(UnalignedPCNextF)); + // mux2 #(`XLEN) pcmux4(.d0(PCNext3F), .d1(`RESET_VECTOR), .s(reset), .y(UnalignedPCNextF)); // ******* probably can get rid of by making reset SelAdr = 01 + assign UnalignedPCNextF = PCNext3F; + + flopenrc #(1) BPPredWrongMReg(.clk, .reset, .en(~StallM), .clear(FlushM), .d(BPPredWrongE), .q(BPPredWrongM)); + assign PCNextF = {UnalignedPCNextF[`XLEN-1:1], 1'b0}; // hart-SPEC p. 21 about 16-bit alignment flopenl #(`XLEN) pcreg(clk, reset, ~StallF, PCNextF, `RESET_VECTOR, PCF); @@ -404,7 +375,7 @@ module ifu ( end else begin : bpred assign BPPredPCF = '0; - assign BPPredWrongM = PCSrcE; + assign BPPredWrongE = PCSrcE; assign {SelBPPredF, BPPredDirWrongM, BTBPredPCWrongM, RASPredPCWrongM, BPPredClassNonCFIWrongM} = '0; end @@ -428,7 +399,6 @@ module ifu ( // *** combine these with others in better way, including M, F - // Misaligned PC logic // instruction address misalignment is generated by the target of control flow instructions, not // the fetch itself. 
diff --git a/pipelined/src/lsu/busfsm.sv b/pipelined/src/lsu/busfsm.sv index a9e43b559..ae85fa870 100644 --- a/pipelined/src/lsu/busfsm.sv +++ b/pipelined/src/lsu/busfsm.sv @@ -127,11 +127,11 @@ module busfsm #(parameter integer WordCountThreshold, (BusCurrState == STATE_BUS_FETCH) | (BusCurrState == STATE_BUS_WRITE); assign PreCntEn = BusCurrState == STATE_BUS_FETCH | BusCurrState == STATE_BUS_WRITE; - assign UnCachedLSUBusWrite = (BusCurrState == STATE_BUS_READY & UnCachedAccess & (LSURWM[0])) | + assign UnCachedLSUBusWrite = (BusCurrState == STATE_BUS_READY & UnCachedAccess & (LSURWM[0] & ~IgnoreRequest)) | (BusCurrState == STATE_BUS_UNCACHED_WRITE); assign LSUBusWrite = UnCachedLSUBusWrite | (BusCurrState == STATE_BUS_WRITE); - assign UnCachedLSUBusRead = (BusCurrState == STATE_BUS_READY & UnCachedAccess & (|LSURWM[1])) | + assign UnCachedLSUBusRead = (BusCurrState == STATE_BUS_READY & UnCachedAccess & (|LSURWM[1] & ~IgnoreRequest)) | (BusCurrState == STATE_BUS_UNCACHED_READ); assign LSUBusRead = UnCachedLSUBusRead | (BusCurrState == STATE_BUS_FETCH) | (BusCurrState == STATE_BUS_READY & DCacheFetchLine); diff --git a/pipelined/src/lsu/lsu.sv b/pipelined/src/lsu/lsu.sv index 93cf5e849..2ef616156 100644 --- a/pipelined/src/lsu/lsu.sv +++ b/pipelined/src/lsu/lsu.sv @@ -153,7 +153,8 @@ module lsu ( assign DTLBStorePageFaultM = DTLBPageFaultM & PreLSURWM[0]; end // if (`MEM_VIRTMEM) else begin - assign {InterlockStall, SelHPTW, IgnoreRequest, PTE, PageType, DTLBWriteM, ITLBWriteF} = '0; + assign {InterlockStall, SelHPTW, PTE, PageType, DTLBWriteM, ITLBWriteF} = '0; + assign IgnoreRequest = TrapM; assign {DTLBLoadPageFaultM, DTLBStorePageFaultM} = '0; assign CPUBusy = StallW; assign LSUAdrE = PreLSUAdrE; assign LSUFunct3M = Funct3M; assign LSUFunct7M = Funct7M; assign LSUAtomicM = AtomicM; @@ -248,8 +249,8 @@ module lsu ( // *** adjust interface so write address doesn't need delaying; switch to standard RAM? 
simpleram #(.BASE(`RAM_BASE), .RANGE(`RAM_RANGE)) ram ( .clk, - .a(CPUBusy ? IEUAdrM[31:0] : IEUAdrE[31:0]), - .we(LSURWM[0]), + .a(CPUBusy | LSURWM[0] ? IEUAdrM[31:0] : IEUAdrE[31:0]), + .we(LSURWM[0] & ~TrapM), // have to ignore write if Trap. .wd(FinalWriteDataM), .rd(ReadDataWordM)); // since we have a local memory the bus connections are all disabled.