diff --git a/benchmarks/coremark/riscv64-baremetal/syscalls.c b/benchmarks/coremark/riscv64-baremetal/syscalls.c index 34615b6e4..29cd5f24a 100644 --- a/benchmarks/coremark/riscv64-baremetal/syscalls.c +++ b/benchmarks/coremark/riscv64-baremetal/syscalls.c @@ -84,6 +84,11 @@ void setStats(int enable) READ_CTR(mhpmcounter10); READ_CTR(mhpmcounter11); READ_CTR(mhpmcounter12); + READ_CTR(mhpmcounter13); + READ_CTR(mhpmcounter14); + READ_CTR(mhpmcounter15); + READ_CTR(mhpmcounter16); + READ_CTR(mhpmcounter17); #undef READ_CTR } @@ -167,18 +172,21 @@ void _init(int cid, int nc) counters[12] = read_csr(mhpmcounter12) - counters[12]; counters[13] = read_csr(mhpmcounter13) - counters[13]; counters[14] = read_csr(mhpmcounter14) - counters[14]; + counters[15] = read_csr(mhpmcounter15) - counters[15]; + counters[16] = read_csr(mhpmcounter16) - counters[16]; + counters[17] = read_csr(mhpmcounter17) - counters[17]; - ee_printf("Load Stalls %d\n", counters[3]); - ee_printf("D-Cache Accesses %d\n", counters[11]); - ee_printf("D-Cache Misses %d\n", counters[12]); - ee_printf("I-Cache Accesses %d\n", counters[13]); - ee_printf("I-Cache Misses %d\n", counters[14]); - ee_printf("Branches %d\n", counters[5]); - ee_printf("Branches Miss Predictions %d\n", counters[4]); - ee_printf("BTB Misses %d\n", counters[6]); - ee_printf("Jump, JAL, JALR %d\n", counters[7]); - ee_printf("RAS Wrong %d\n", counters[8]); - ee_printf("Returns %d\n", counters[9]); + ee_printf("Load Stalls %d\n", counters[11]); + ee_printf("D-Cache Accesses %d\n", counters[13]); + ee_printf("D-Cache Misses %d\n", counters[14]); + ee_printf("I-Cache Accesses %d\n", counters[16]); + ee_printf("I-Cache Misses %d\n", counters[17]); + ee_printf("Branches %d\n", counters[3]); + ee_printf("Branches Miss Predictions %d\n", counters[7]); + ee_printf("BTB Misses %d\n", counters[8]); + ee_printf("Jump and JR %d\n", counters[4]); + ee_printf("RAS Wrong %d\n", counters[9]); + ee_printf("Returns %d\n", counters[5]); ee_printf("BP Class Wrong %d\n", counters[10]); ee_printf("Done printing performance counters\n"); diff --git a/bin/parseHPMC.py b/bin/parseHPMC.py index 5b5e0d98b..9e08f2c7a 100755 --- a/bin/parseHPMC.py +++ b/bin/parseHPMC.py @@ -279,12 +279,13 @@ if(sys.argv[1] == '-b'): dct[PredType] = (currSize, currPercent) print(dct) fig, axes = plt.subplots() - marker={'twobit' : '^', 'gshare' : 'o', 'global' : 's', 'gshareBasic' : '*', 'globalBasic' : 'x', 'btb': 'x', 'twobitCModel' : 'x', 'gshareCModel' : '*'} - colors={'twobit' : 'black', 'gshare' : 'blue', 'global' : 'dodgerblue', 'gshareBasic' : 'turquoise', 'globalBasic' : 'lightsteelblue', 'btb' : 'blue', 'twobitCModel' : 'gray', 'gshareCModel' : 'dodgerblue'} + marker={'twobit' : '^', 'gshare' : 'o', 'global' : 's', 'gshareBasic' : '*', 'globalBasic' : 'x', 'btb': 'x', 'twobitCModel' : 'x', 'gshareCModel' : '*', 'tenlocal' : '.', 'eightlocal' : ',', 'fourlocal' : 'x', 'tenlocalahead' : '.', 'eightlocalahead' : ',', 'fourlocalahead' : 'x', 'tenlocalrepair' : 'x'} + colors={'twobit' : 'black', 'gshare' : 'blue', 'global' : 'dodgerblue', 'gshareBasic' : 'turquoise', 'globalBasic' : 'lightsteelblue', 'btb' : 'blue', 'twobitCModel' : 'gray', 'gshareCModel' : 'dodgerblue', 'tenlocal' : 'lightblue', 'eightlocal' : 'lightblue', 'fourlocal' : 'lightblue', 'tenlocalahead' : 'lightblue', 'eightlocalahead' : 'lightblue', 'fourlocalahead' : 'lightblue', 'tenlocalrepair' : 'lightblue'} for cat in dct: (x, y) = dct[cat] x=[int(2**int(v)) for v in x] - print(x, y) + #print(x, y) + print(cat) axes.plot(x,y, color=colors[cat]) axes.scatter(x,y, label=cat, marker=marker[cat], color=colors[cat]) #plt.scatter(x, y, label=cat) diff --git a/config/buildroot/wally-config.vh b/config/buildroot/wally-config.vh index 236574288..0957dd003 100644 --- a/config/buildroot/wally-config.vh +++ b/config/buildroot/wally-config.vh @@ -132,6 +132,7 @@ `define BPRED_SUPPORTED 1 `define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT `define BPRED_SIZE 10 +`define BPRED_NUM_LHR 6 `define BTB_SIZE 10 diff --git a/config/fpga/wally-config.vh b/config/fpga/wally-config.vh index 1f7447f4d..9230c148e 100644 --- a/config/fpga/wally-config.vh +++ b/config/fpga/wally-config.vh @@ -141,6 +141,7 @@ `define BPRED_SUPPORTED 1 `define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT `define BPRED_SIZE 12 +`define BPRED_NUM_LHR 6 `define BTB_SIZE 10 diff --git a/config/rv32e/wally-config.vh b/config/rv32e/wally-config.vh index aee0e5410..700117e3d 100644 --- a/config/rv32e/wally-config.vh +++ b/config/rv32e/wally-config.vh @@ -136,6 +136,7 @@ `define BPRED_SUPPORTED 0 `define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT `define BPRED_SIZE 10 +`define BPRED_NUM_LHR 6 `define BTB_SIZE 10 `define SVADU_SUPPORTED 0 diff --git a/config/rv32gc/wally-config.vh b/config/rv32gc/wally-config.vh index 37eff79ea..4eb71c54e 100644 --- a/config/rv32gc/wally-config.vh +++ b/config/rv32gc/wally-config.vh @@ -133,8 +133,9 @@ `define PLIC_UART_ID 10 `define BPRED_SUPPORTED 1 -`define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT +`define BPRED_TYPE "BP_GSHARE" // "BP_LOCAL_REPAIR" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT `define BPRED_SIZE 16 +`define BPRED_NUM_LHR 8 `define BTB_SIZE 10 `define SVADU_SUPPORTED 1 diff --git a/config/rv32i/wally-config.vh b/config/rv32i/wally-config.vh index d75d0c462..585499169 100644 --- a/config/rv32i/wally-config.vh +++ b/config/rv32i/wally-config.vh @@ -136,6 +136,7 @@ `define BPRED_SUPPORTED 0 `define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT `define BPRED_SIZE 10 +`define BPRED_NUM_LHR 6 `define BTB_SIZE 10 `define SVADU_SUPPORTED 0 diff --git a/config/rv32imc/wally-config.vh b/config/rv32imc/wally-config.vh index 42442d46e..093d92bd7 100644 --- a/config/rv32imc/wally-config.vh +++ b/config/rv32imc/wally-config.vh @@ -135,6 +135,7 @@ `define BPRED_SUPPORTED 0 `define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT `define BPRED_SIZE 10 +`define BPRED_NUM_LHR 6 `define BTB_SIZE 10 `define SVADU_SUPPORTED 0 diff --git a/config/rv64fpquad/wally-config.vh b/config/rv64fpquad/wally-config.vh index 34d7628e0..45725645f 100644 --- a/config/rv64fpquad/wally-config.vh +++ b/config/rv64fpquad/wally-config.vh @@ -138,6 +138,7 @@ `define BPRED_SUPPORTED 1 `define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT `define BPRED_SIZE 10 +`define BPRED_NUM_LHR 6 `define BTB_SIZE 10 `define SVADU_SUPPORTED 0 diff --git a/config/rv64gc/wally-config.vh b/config/rv64gc/wally-config.vh index b44351ef2..c933d87ae 100644 --- a/config/rv64gc/wally-config.vh +++ b/config/rv64gc/wally-config.vh @@ -136,8 +136,10 @@ `define PLIC_UART_ID 10 `define BPRED_SUPPORTED 1 -`define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT -`define BPRED_SIZE 10 +//`define BPRED_TYPE "BP_GLOBAL_BASIC" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT +`define BPRED_TYPE "BP_GSHARE" // "BP_LOCAL_REPAIR" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT +`define BPRED_SIZE 6 +`define BPRED_NUM_LHR 4 `define BTB_SIZE 10 `define SVADU_SUPPORTED 1 diff --git a/config/rv64i/wally-config.vh b/config/rv64i/wally-config.vh index 34c37f73a..61ac725db 100644 --- a/config/rv64i/wally-config.vh +++ b/config/rv64i/wally-config.vh @@ -138,6 +138,7 @@ `define BPRED_SUPPORTED 0 `define BPRED_TYPE "BP_GSHARE" // BP_GSHARE_BASIC, BP_GLOBAL, BP_GLOBAL_BASIC, BP_TWOBIT `define BPRED_SIZE 10 +`define BPRED_NUM_LHR 6 `define BTB_SIZE 10 `define SVADU_SUPPORTED 0 diff --git a/sim/bpred-sim.py b/sim/bpred-sim.py index 60574e371..4f1757cb8 100755 --- a/sim/bpred-sim.py +++ b/sim/bpred-sim.py @@ -46,18 +46,33 @@ configs = [ ) ] +# bpdSize = [6, 8, 10, 12, 14, 16] +# bpdType = ['twobit', 'gshare', 'global', 'gshare_basic', 'global_basic', 'local_basic'] +# for CurrBPType in bpdType: +# for CurrBPSize in bpdSize: +# name = CurrBPType+str(CurrBPSize) +# configOptions = "+define+INSTR_CLASS_PRED=0 +define+BPRED_TYPE=\"BP_" + CurrBPType.upper() + "\" +define+BPRED_SIZE=" + str(CurrBPSize) +# tc = TestCase( +# name=name, +# variant="rv32gc", +# cmd="vsim > {} -c < {} -c < {} -c < 2) mux2 #(1) LRUMuxes[NUMWAYS-3:0](CurrLRU[NUMWAYS-3:0], ~WayExpanded[NUMWAYS-3:0], LRUUpdate[NUMWAYS-3:0], NextLRU[NUMWAYS-3:0]); // Compute next victim way. for(node = NUMWAYS-2; node >= NUMWAYS/2; node--) begin diff --git a/src/ifu/bpred/bpred.sv b/src/ifu/bpred/bpred.sv index 881150b3f..0656789f2 100644 --- a/src/ifu/bpred/bpred.sv +++ b/src/ifu/bpred/bpred.sv @@ -127,19 +127,21 @@ module bpred ( .PCNextF, .PCM, .BPDirPredF, .BPDirPredWrongE, .BranchE, .BranchM, .PCSrcE); - end else if (`BPRED_TYPE == "BPLOCALPAg") begin:Predictor - // *** Fix me -/* -----\/----- EXCLUDED -----\/----- - localHistoryPredictor DirPredictor(.clk, - .reset, .StallF, .StallE, - .LookUpPC(PCNextF), - .Prediction(BPDirPredF), - // update - .UpdatePC(PCE), - .UpdateEN(InstrClassE[0] & ~StallE), - .PCSrcE, - .UpdatePrediction(InstrClassE[0])); - -----/\----- EXCLUDED -----/\----- */ + end else if (`BPRED_TYPE == "BP_LOCAL_BASIC") begin:Predictor + localbpbasic #(`BPRED_NUM_LHR, `BPRED_SIZE) DirPredictor(.clk, .reset, + .StallF, .StallD, .StallE, .StallM, .StallW, .FlushD, .FlushE, .FlushM, .FlushW, + .PCNextF, .PCM, .BPDirPredF, .BPDirPredWrongE, + .BranchE, .BranchM, .PCSrcE); + end else if (`BPRED_TYPE == "BP_LOCAL_AHEAD") begin:Predictor + localaheadbp #(`BPRED_NUM_LHR, `BPRED_SIZE) DirPredictor(.clk, .reset, + .StallF, .StallD, .StallE, .StallM, .StallW, .FlushD, .FlushE, .FlushM, .FlushW, + .PCNextF, .PCM, .BPDirPredD(BPDirPredF), .BPDirPredWrongE, + .BranchE, .BranchM, .PCSrcE); + end else if (`BPRED_TYPE == "BP_LOCAL_REPAIR") begin:Predictor + localrepairbp #(`BPRED_NUM_LHR, `BPRED_SIZE) DirPredictor(.clk, .reset, + .StallF, .StallD, .StallE, .StallM, .StallW, .FlushD, .FlushE, .FlushM, .FlushW, + .PCNextF, .PCE, .PCM, .BPDirPredD(BPDirPredF), .BPDirPredWrongE, + .BranchD, .BranchE, .BranchM, .PCSrcE); end // Part 2 Branch target address prediction diff --git a/src/ifu/bpred/localHistoryPredictor.sv b/src/ifu/bpred/localHistoryPredictor.sv deleted file mode 100644 index cde1fa7d3..000000000 --- a/src/ifu/bpred/localHistoryPredictor.sv +++ /dev/null @@ -1,130 +0,0 @@ -/////////////////////////////////////////// -// locallHistoryPredictor.sv -// -// Written: Shreya Sanghai -// Email: ssanghai@hmc.edu -// Created: March 16, 2021 -// Modified: -// -// Purpose: Global History Branch predictor with parameterized global history register -// -// A component of the CORE-V-WALLY configurable RISC-V project. -// -// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University -// -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file -// except in compliance with the License, or, at your option, the Apache License version 2.0. You -// may obtain a copy of the License at -// -// https://solderpad.org/licenses/SHL-2.1/ -// -// Unless required by applicable law or agreed to in writing, any work distributed under the -// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -// either express or implied. See the License for the specific language governing permissions -// and limitations under the License. -//////////////////////////////////////////////////////////////////////////////////////////////// - -`include "wally-config.vh" - -module localHistoryPredictor #(parameter m = 6, // 2^m = number of local history branches - k = 10) ( // number of past branches stored - input logic clk, - input logic reset, - input logic StallF, StallE, - input logic [`XLEN-1:0] LookUpPC, - output logic [1:0] Prediction, - // update - input logic [`XLEN-1:0] UpdatePC, - input logic UpdateEN, PCSrcE, - input logic [1:0] UpdatePrediction -); - - logic [2**m-1:0][k-1:0] LHRNextF; - logic [k-1:0] LHRF, ForwardLHRNext, LHRFNext; - logic [m-1:0] LookUpPCIndex, UpdatePCIndex; - logic [1:0] PredictionMemory; - logic DoForwarding, DoForwardingF, DoForwardingPHT, DoForwardingPHTF; - logic [1:0] UpdatePredictionF; - - assign LHRFNext = {PCSrcE, LHRF[k-1:1]}; - assign UpdatePCIndex = {UpdatePC[m+1] ^ UpdatePC[1], UpdatePC[m:2]}; - assign LookUpPCIndex = {LookUpPC[m+1] ^ LookUpPC[1], LookUpPC[m:2]}; - - // INCASE we do ahead pipelining - // ram2p1r1wb #(m,k) LHR(.clk(clk)), - // .reset(reset), - // .RA1(LookUpPCIndex), // need hashing function to get correct PC address - // .RD1(LHRF), - // .REN1(~StallF), - // .WA1(UpdatePCIndex), - // .WD1(LHRENExt), - // .WEN1(UpdateEN), - // .BitWEN1(2'b11)); - - genvar index; - for (index = 0; index < 2**m; index = index +1) begin:localhist - flopenr #(k) LocalHistoryRegister(.clk, .reset, .en(UpdateEN & (index == UpdatePCIndex)), - .d(LHRFNext), .q(LHRNextF[index])); - end - - // need to forward when updating to the same address as reading. - // first we compare to see if the update and lookup addreses are the same - assign DoForwarding = LookUpPCIndex == UpdatePCIndex; - assign ForwardLHRNext = DoForwarding ? LHRFNext :LHRNextF[LookUpPCIndex]; - - // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT - // LHR referes to the address that the past k branches points to in the prediction stage - // LHRE refers to the address that the past k branches points to in the exectution stage - ram2p1r1wb #(k, 2) PHT(.clk(clk), - .reset(reset), - .ra1(ForwardLHRNext), - .rd1(PredictionMemory), - .ren1(~StallF), - .wa2(LHRFNext), - .wd2(UpdatePrediction), - .wen2(UpdateEN), - .bwe2(2'b11)); - - - - assign DoForwardingPHT = LHRFNext == ForwardLHRNext; - - // register the update value and the forwarding signal into the Fetch stage - // TODO: add stall logic *** - flopr #(1) DoForwardingReg(.clk(clk), - .reset(reset), - .d(DoForwardingPHT), - .q(DoForwardingPHTF)); - - flopr #(2) UpdatePredictionReg(.clk(clk), - .reset(reset), - .d(UpdatePrediction), - .q(UpdatePredictionF)); - - assign Prediction = DoForwardingPHTF ? UpdatePredictionF : PredictionMemory; - - //pipeline for LHR - flopenrc #(k) LHRFReg(.clk(clk), - .reset(reset), - .en(~StallF), - .clear(1'b0), - .d(ForwardLHRNext), - .q(LHRF)); - /* - flopenrc #(k) LHRDReg(.clk(clk), - .reset(reset), - .en(~StallD), - .clear(FlushD), - .d(LHRF), - .q(LHRD)); - - flopenrc #(k) LHREReg(.clk(clk), - .reset(reset), - .en(~StallE), - .clear(FlushE), - .d(LHRD), - .q(LHRE)); - */ -endmodule diff --git a/src/ifu/bpred/localaheadbp.sv b/src/ifu/bpred/localaheadbp.sv new file mode 100644 index 000000000..1af589e16 --- /dev/null +++ b/src/ifu/bpred/localaheadbp.sv @@ -0,0 +1,115 @@ +/////////////////////////////////////////// +// localaheadbp +// +// Written: Ross Thompson +// Email: ross1728@gmail.com +// Created: 16 March 2021 +// +// Purpose: local history branch predictor with ahead pipelining and SRAM memories. +// +// A component of the CORE-V-WALLY configurable RISC-V project. +// +// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University +// +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file +// except in compliance with the License, or, at your option, the Apache License version 2.0. You +// may obtain a copy of the License at +// +// https://solderpad.org/licenses/SHL-2.1/ +// +// Unless required by applicable law or agreed to in writing, any work distributed under the +// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +//////////////////////////////////////////////////////////////////////////////////////////////// + +`include "wally-config.vh" + +module localaheadbp #(parameter m = 6, // 2^m = number of local history branches + parameter k = 10) ( // number of past branches stored + input logic clk, + input logic reset, + input logic StallF, StallD, StallE, StallM, StallW, + input logic FlushD, FlushE, FlushM, FlushW, + output logic [1:0] BPDirPredD, + output logic BPDirPredWrongE, + // update + input logic [`XLEN-1:0] PCNextF, PCM, + input logic BranchE, BranchM, PCSrcE +); + + logic [k-1:0] IndexNextF, IndexM; + //logic [1:0] BPDirPredD, BPDirPredE; + logic [1:0] BPDirPredE; + logic [1:0] BPDirPredM; + logic [1:0] NewBPDirPredE, NewBPDirPredM, NewBPDirPredW; + + logic [k-1:0] LHRF, LHRD, LHRE, LHRM, LHRW, LHRNextF; + logic [k-1:0] LHRNextW; + logic PCSrcM; + logic [2**m-1:0][k-1:0] LHRArray; + logic [m-1:0] IndexLHRNextF, IndexLHRM; + logic [`XLEN-1:0] PCW; + + + logic UpdateM; + + //assign IndexNextF = LHR; + assign IndexM = LHRW; + + ram2p1r1wbe #(2**k, 2) PHT(.clk(clk), + .ce1(~StallD), .ce2(~StallW & ~FlushW), + .ra1(LHRF), + .rd1(BPDirPredD), + .wa2(IndexM), + .wd2(NewBPDirPredW), + .we2(BranchM), + .bwe2(1'b1)); + + //flopenrc #(2) PredictionRegD(clk, reset, FlushD, ~StallD, BPDirPredF, BPDirPredD); + flopenrc #(2) PredictionRegE(clk, reset, FlushE, ~StallE, BPDirPredD, BPDirPredE); + flopenrc #(2) PredictionRegM(clk, reset, FlushM, ~StallM, BPDirPredE, BPDirPredM); + + satCounter2 BPDirUpdateE(.BrDir(PCSrcE), .OldState(BPDirPredM), .NewState(NewBPDirPredM)); + //flopenrc #(2) NewPredictionRegM(clk, reset, FlushM, ~StallM, NewBPDirPredE, NewBPDirPredM); + flopenrc #(2) NewPredictionRegW(clk, reset, FlushW, ~StallW, NewBPDirPredM, NewBPDirPredW); + + assign BPDirPredWrongE = PCSrcE != BPDirPredM[1] & BranchE; + + // This is the main difference between global and local history basic implementations. In global, + // the ghr wraps back into itself directly without + // being pipelined. I.E. GHR is not read in F and then pipelined to M where it is updated. Instead + // GHR is both read and update in M. GHR is still pipelined so that the PHT is updated with the correct + // GHR. Local history in contrast must pipeline the specific history register read during F and then update + // that same one in M. This implementation does not forward if a branch matches in the D, E, or M stages. + assign LHRNextW = BranchM ? {PCSrcM, LHRW[k-1:1]} : LHRW; + + // this is local history + //genvar index; + //assign UpdateM = BranchM & ~StallW & ~FlushW; + assign IndexLHRM = {PCW[m+1] ^ PCW[1], PCW[m:2]}; + assign IndexLHRNextF = {PCNextF[m+1] ^ PCNextF[1], PCNextF[m:2]}; + + ram2p1r1wbe #(2**m, k) BHT(.clk(clk), + .ce1(~StallF), .ce2(~StallW & ~FlushW), + .ra1(IndexLHRNextF), + .rd1(LHRF), + .wa2(IndexLHRM), + .wd2(LHRNextW), + .we2(BranchM), + .bwe2('1)); + + flopenrc #(1) PCSrcMReg(clk, reset, FlushM, ~StallM, PCSrcE, PCSrcM); + + //flopenrc #(k) LHRFReg(clk, reset, FlushD, ~StallF, LHRNextF, LHRF); + //assign LHRF = LHRNextF; + flopenrc #(k) LHRDReg(clk, reset, FlushD, ~StallD, LHRF, LHRD); + flopenrc #(k) LHREReg(clk, reset, FlushE, ~StallE, LHRD, LHRE); + flopenrc #(k) LHRMReg(clk, reset, FlushM, ~StallM, LHRE, LHRM); + flopenrc #(k) LHRWReg(clk, reset, FlushW, ~StallW, LHRM, LHRW); + + flopenr #(`XLEN) PCWReg(clk, reset, ~StallW, PCM, PCW); + +endmodule diff --git a/src/ifu/bpred/localbpbasic.sv b/src/ifu/bpred/localbpbasic.sv new file mode 100644 index 000000000..071d890aa --- /dev/null +++ b/src/ifu/bpred/localbpbasic.sv @@ -0,0 +1,106 @@ +/////////////////////////////////////////// +// localbpbasic +// +// Written: Ross Thompson +// Email: ross1728@gmail.com +// Created: 16 March 2021 +// +// Purpose: Local history branch predictor. Basic implementation without any repair and flop memories. + +// +// A component of the CORE-V-WALLY configurable RISC-V project. +// +// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University +// +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file +// except in compliance with the License, or, at your option, the Apache License version 2.0. You +// may obtain a copy of the License at +// +// https://solderpad.org/licenses/SHL-2.1/ +// +// Unless required by applicable law or agreed to in writing, any work distributed under the +// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +//////////////////////////////////////////////////////////////////////////////////////////////// + +`include "wally-config.vh" + +module localbpbasic #(parameter m = 6, // 2^m = number of local history branches + parameter k = 10) ( // number of past branches stored + input logic clk, + input logic reset, + input logic StallF, StallD, StallE, StallM, StallW, + input logic FlushD, FlushE, FlushM, FlushW, + output logic [1:0] BPDirPredF, + output logic BPDirPredWrongE, + // update + input logic [`XLEN-1:0] PCNextF, PCM, + input logic BranchE, BranchM, PCSrcE +); + + logic [k-1:0] IndexNextF, IndexM; + logic [1:0] BPDirPredD, BPDirPredE; + logic [1:0] NewBPDirPredE, NewBPDirPredM; + + logic [k-1:0] LHRF, LHRD, LHRE, LHRM, LHR; + logic [k-1:0] LHRNextW; + logic PCSrcM; + logic [2**m-1:0][k-1:0] LHRArray; + logic [m-1:0] IndexLHRNextF, IndexLHRM; + + logic UpdateM; + + assign IndexNextF = LHR; + assign IndexM = LHRM; + + ram2p1r1wbe #(2**k, 2) PHT(.clk(clk), + .ce1(~StallF), .ce2(~StallW & ~FlushW), + .ra1(IndexNextF), + .rd1(BPDirPredF), + .wa2(IndexM), + .wd2(NewBPDirPredM), + .we2(BranchM), + .bwe2(1'b1)); + + flopenrc #(2) PredictionRegD(clk, reset, FlushD, ~StallD, BPDirPredF, BPDirPredD); + flopenrc #(2) PredictionRegE(clk, reset, FlushE, ~StallE, BPDirPredD, BPDirPredE); + + satCounter2 BPDirUpdateE(.BrDir(PCSrcE), .OldState(BPDirPredE), .NewState(NewBPDirPredE)); + flopenrc #(2) NewPredictionRegM(clk, reset, FlushM, ~StallM, NewBPDirPredE, NewBPDirPredM); + + assign BPDirPredWrongE = PCSrcE != BPDirPredE[1] & BranchE; + + // This is the main difference between global and local history basic implementations. In global, + // the ghr wraps back into itself directly without + // being pipelined. I.E. GHR is not read in F and then pipelined to M where it is updated. Instead + // GHR is both read and update in M. GHR is still pipelined so that the PHT is updated with the correct + // GHR. Local history in contrast must pipeline the specific history register read during F and then update + // that same one in M. This implementation does not forward if a branch matches in the D, E, or M stages. + assign LHRNextW = BranchM ? {PCSrcM, LHRM[k-1:1]} : LHRM; + + // this is local history + genvar index; + assign UpdateM = BranchM & ~StallW & ~FlushW; + assign IndexLHRM = {PCM[m+1] ^ PCM[1], PCM[m:2]}; + for (index = 0; index < 2**m; index = index +1) begin:localhist + flopenr #(k) LocalHistoryRegister(.clk, .reset, .en(UpdateM & (index == IndexLHRM)), + .d(LHRNextW), .q(LHRArray[index])); + end + assign IndexLHRNextF = {PCNextF[m+1] ^ PCNextF[1], PCNextF[m:2]}; + assign LHR = LHRArray[IndexLHRNextF]; + + // this is global history + //flopenr #(k) LHRReg(clk, reset, ~StallM & ~FlushM & BranchM, LHRNextW, LHR); + + flopenrc #(1) PCSrcMReg(clk, reset, FlushM, ~StallM, PCSrcE, PCSrcM); + + flopenrc #(k) LHRFReg(clk, reset, FlushD, ~StallF, LHR, LHRF); + flopenrc #(k) LHRDReg(clk, reset, FlushD, ~StallD, LHRF, LHRD); + flopenrc #(k) LHREReg(clk, reset, FlushE, ~StallE, LHRD, LHRE); + flopenrc #(k) LHRMReg(clk, reset, FlushM, ~StallM, LHRE, LHRM); + + +endmodule diff --git a/src/ifu/bpred/localrepairbp.sv b/src/ifu/bpred/localrepairbp.sv new file mode 100644 index 000000000..47cd2758c --- /dev/null +++ b/src/ifu/bpred/localrepairbp.sv @@ -0,0 +1,136 @@ +/////////////////////////////////////////// +// localrepairbp +// +// Written: Ross Thompson +// Email: ross1728@gmail.com +// Created: 15 April 2023 +// +// Purpose: Local history branch predictor with speculation and repair using CBH. +// +// A component of the CORE-V-WALLY configurable RISC-V project. +// +// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University +// +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file +// except in compliance with the License, or, at your option, the Apache License version 2.0. You +// may obtain a copy of the License at +// +// https://solderpad.org/licenses/SHL-2.1/ +// +// Unless required by applicable law or agreed to in writing, any work distributed under the +// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +//////////////////////////////////////////////////////////////////////////////////////////////// + +`include "wally-config.vh" + +module localrepairbp #(parameter m = 6, // 2^m = number of local history branches + parameter k = 10) ( // number of past branches stored + input logic clk, + input logic reset, + input logic StallF, StallD, StallE, StallM, StallW, + input logic FlushD, FlushE, FlushM, FlushW, + output logic [1:0] BPDirPredD, + output logic BPDirPredWrongE, + // update + input logic [`XLEN-1:0] PCNextF, PCE, PCM, + input logic BranchD, BranchE, BranchM, PCSrcE +); + + //logic [1:0] BPDirPredD, BPDirPredE; + logic [1:0] BPDirPredE; + logic [1:0] BPDirPredM; + logic [1:0] NewBPDirPredE, NewBPDirPredM, NewBPDirPredW; + + logic [k-1:0] LHRF, LHRD, LHRE, LHRM, LHRW, LHRNextF; + logic [k-1:0] LHRNextW; + logic PCSrcM; + logic [2**m-1:0][k-1:0] LHRArray; + logic [m-1:0] IndexLHRNextF, IndexLHRM; + logic [`XLEN-1:0] PCW; + + logic [k-1:0] LHRCommittedF, LHRSpeculativeF; + logic [m-1:0] IndexLHRD; + logic [k-1:0] LHRNextE; + logic SpeculativeFlushedF; + + + ram2p1r1wbe #(2**k, 2) PHT(.clk(clk), + .ce1(~StallD), .ce2(~StallW & ~FlushW), + .ra1(LHRF), + .rd1(BPDirPredD), + .wa2(LHRW), + .wd2(NewBPDirPredW), + .we2(BranchM), + .bwe2(1'b1)); + + //flopenrc #(2) PredictionRegD(clk, reset, FlushD, ~StallD, BPDirPredF, BPDirPredD); + flopenrc #(2) PredictionRegE(clk, reset, FlushE, ~StallE, BPDirPredD, BPDirPredE); + flopenrc #(2) PredictionRegM(clk, reset, FlushM, ~StallM, BPDirPredE, BPDirPredM); + + satCounter2 BPDirUpdateE(.BrDir(PCSrcE), .OldState(BPDirPredM), .NewState(NewBPDirPredM)); + //flopenrc #(2) NewPredictionRegM(clk, reset, FlushM, ~StallM, NewBPDirPredE, NewBPDirPredM); + flopenrc #(2) NewPredictionRegW(clk, reset, FlushW, ~StallW, NewBPDirPredM, NewBPDirPredW); + + assign BPDirPredWrongE = PCSrcE != BPDirPredM[1] & BranchE; + + // This is the main difference between global and local history basic implementations. In global, + // the ghr wraps back into itself directly without + // being pipelined. I.E. GHR is not read in F and then pipelined to M where it is updated. Instead + // GHR is both read and update in M. GHR is still pipelined so that the PHT is updated with the correct + // GHR. Local history in contrast must pipeline the specific history register read during F and then update + // that same one in M. This implementation does not forward if a branch matches in the D, E, or M stages. + assign LHRNextW = BranchM ? {PCSrcM, LHRW[k-1:1]} : LHRW; + + // this is local history + assign IndexLHRM = {PCW[m+1] ^ PCW[1], PCW[m:2]}; + assign IndexLHRNextF = {PCNextF[m+1] ^ PCNextF[1], PCNextF[m:2]}; + + ram2p1r1wbe #(2**m, k) BHT(.clk(clk), + .ce1(~StallF), .ce2(~StallW & ~FlushW), + .ra1(IndexLHRNextF), + .rd1(LHRCommittedF), + .wa2(IndexLHRM), + .wd2(LHRNextW), + .we2(BranchM), + .bwe2('1)); + + assign IndexLHRD = {PCE[m+1] ^ PCE[1], PCE[m:2]}; + assign LHRNextE = BranchD ? {BPDirPredD[1], LHRE[k-1:1]} : LHRE; + // *** replace with a small CAM + ram2p1r1wbe #(2**m, k) SHB(.clk(clk), + .ce1(~StallF), .ce2(~StallE & ~FlushE), + .ra1(IndexLHRNextF), + .rd1(LHRSpeculativeF), + .wa2(IndexLHRD), + .wd2(LHRNextE), + .we2(BranchD), + .bwe2('1)); + // **** replace with small CAM + logic [2**m-1:0] FlushedBits; + always_ff @(posedge clk) begin // Valid bit array, + SpeculativeFlushedF <= #1 FlushedBits[IndexLHRNextF]; + if (reset | FlushD) FlushedBits <= #1 '1; + if(BranchD & ~StallE & ~FlushE) begin + FlushedBits[IndexLHRD] <= #1 '0; + end + end + + //assign SpeculativeFlushedF = '1; + mux2 #(k) LHRMux(LHRSpeculativeF, LHRCommittedF, SpeculativeFlushedF, LHRF); + + flopenrc #(1) PCSrcMReg(clk, reset, FlushM, ~StallM, PCSrcE, PCSrcM); + + //flopenrc #(k) LHRFReg(clk, reset, FlushD, ~StallF, LHRNextF, LHRF); + //assign LHRF = LHRNextF; + flopenrc #(k) LHRDReg(clk, reset, FlushD, ~StallD, LHRF, LHRD); + flopenrc #(k) LHREReg(clk, reset, FlushE, ~StallE, LHRD, LHRE); + flopenrc #(k) LHRMReg(clk, reset, FlushM, ~StallM, LHRE, LHRM); + flopenrc #(k) LHRWReg(clk, reset, FlushW, ~StallW, LHRM, LHRW); + + flopenr #(`XLEN) PCWReg(clk, reset, ~StallW, PCM, PCW); + +endmodule diff --git a/src/privileged/csrc.sv b/src/privileged/csrc.sv index 297dcd0fb..8c8842c6c 100644 --- a/src/privileged/csrc.sv +++ b/src/privileged/csrc.sv @@ -104,8 +104,8 @@ module csrc #(parameter assign CounterEvent[8] = BTAWrongM & InstrValidNotFlushedM; // branch predictor wrong target assign CounterEvent[9] = RASPredPCWrongM & InstrValidNotFlushedM; // return address stack wrong address assign CounterEvent[10] = IClassWrongM & InstrValidNotFlushedM; // instruction class predictor wrong - assign CounterEvent[11] = LoadStallM & InstrValidNotFlushedM; // Load Stalls. don't want to suppress on flush as this only happens if flushed. - assign CounterEvent[12] = StoreStallM & InstrValidNotFlushedM; // Store Stall + assign CounterEvent[11] = LoadStallM; // Load Stalls. don't want to suppress on flush as this only happens if flushed. + assign CounterEvent[12] = StoreStallM; // Store Stall assign CounterEvent[13] = DCacheAccess & InstrValidNotFlushedM; // data cache access assign CounterEvent[14] = DCacheMiss; // data cache miss. Miss asserted 1 cycle at start of cache miss assign CounterEvent[15] = DCacheStallM; // d cache miss cycles diff --git a/testbench/testbench.sv b/testbench/testbench.sv index e5bc92944..154e93edf 100644 --- a/testbench/testbench.sv +++ b/testbench/testbench.sv @@ -28,7 +28,7 @@ `include "wally-config.vh" `include "tests.vh" -`define PrintHPMCounters 0 +`define PrintHPMCounters 1 `define BPRED_LOGGER 0 `define I_CACHE_ADDR_LOGGER 0 `define D_CACHE_ADDR_LOGGER 0 @@ -540,15 +540,27 @@ module testbench; if (`BPRED_SUPPORTED) begin integer adrindex; - // initialize branch predictor on reset - always @(posedge reset) begin - for(adrindex = 0; adrindex < 2**`BTB_SIZE; adrindex++) begin - dut.core.ifu.bpred.bpred.TargetPredictor.memory.mem[adrindex] = 0; + // local history only + if (`BPRED_TYPE == "BP_LOCAL_AHEAD" | `BPRED_TYPE == "BP_LOCAL_REPAIR") begin + always @(*) begin + if(reset) begin + for(adrindex = 0; adrindex < 2**`BPRED_NUM_LHR; adrindex++) begin + dut.core.ifu.bpred.bpred.Predictor.DirPredictor.BHT.mem[adrindex] = 0; + end + end + end + end + + always @(*) begin + if(reset) begin + for(adrindex = 0; adrindex < 2**`BTB_SIZE; adrindex++) begin + force dut.core.ifu.bpred.bpred.TargetPredictor.memory.mem[adrindex] = 0; end for(adrindex = 0; adrindex < 2**`BPRED_SIZE; adrindex++) begin dut.core.ifu.bpred.bpred.Predictor.DirPredictor.PHT.mem[adrindex] = 0; end - end + end + end end diff --git a/testbench/tests.vh b/testbench/tests.vh index 822705fae..54167f73c 100644 --- a/testbench/tests.vh +++ b/testbench/tests.vh @@ -2077,8 +2077,8 @@ string arch64zbs[] = '{ string custom[] = '{ `CUSTOM, - "debug", "simple", + "debug", "cacheTest" }; string testsBP64[] = '{