From 52d95d415fd4bb2fa49c120f50517172870a5b35 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Thu, 4 Mar 2021 09:23:35 -0600 Subject: [PATCH] Converted to using the BTB to predict the instruction class. --- wally-pipelined/src/ifu/BTBPredictor.sv | 22 ++++++------ wally-pipelined/src/ifu/bpred.sv | 47 +++++++++++-------------- wally-pipelined/src/ifu/ifu.sv | 20 +++++++++-- 3 files changed, 50 insertions(+), 39 deletions(-) diff --git a/wally-pipelined/src/ifu/BTBPredictor.sv b/wally-pipelined/src/ifu/BTBPredictor.sv index 041f2b64..4a78353f 100644 --- a/wally-pipelined/src/ifu/BTBPredictor.sv +++ b/wally-pipelined/src/ifu/BTBPredictor.sv @@ -35,11 +35,13 @@ module BTBPredictor input logic reset, input logic [`XLEN-1:0] LookUpPC, output logic [`XLEN-1:0] TargetPC, + output logic [3:0] InstrClass, output logic Valid, // update input logic UpdateEN, input logic [`XLEN-1:0] UpdatePC, - input logic [`XLEN-1:0] UpdateTarget + input logic [`XLEN-1:0] UpdateTarget, + input logic [3:0] UpdateInstrClass ); localparam TotalDepth = 2 ** Depth; @@ -82,15 +84,15 @@ module BTBPredictor // and other indirection branch data. // Another optimization may be using a PC relative address. 
- SRAM2P1R1W #(Depth, `XLEN) memory(.clk(clk), - .reset(reset), - .RA1(LookUpPCIndex), - .RD1(TargetPC), - .REN1(1'b1), - .WA1(UpdatePCIndex), - .WD1(UpdateTarget), - .WEN1(UpdateEN), - .BitWEN1({`XLEN{1'b1}})); + SRAM2P1R1W #(Depth, `XLEN+4) memory(.clk(clk), + .reset(reset), + .RA1(LookUpPCIndex), + .RD1({{InstrClass, TargetPC}}), + .REN1(1'b1), + .WA1(UpdatePCIndex), + .WD1({UpdateInstrClass, UpdateTarget}), + .WEN1(UpdateEN), + .BitWEN1({(`XLEN+4){1'b1}})); // enable all XLEN+4 bits so InstrClass is written together with the target endmodule diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv index e6ed30b3..613120c0 100644 --- a/wally-pipelined/src/ifu/bpred.sv +++ b/wally-pipelined/src/ifu/bpred.sv @@ -36,9 +36,6 @@ module bpred input logic [`XLEN-1:0] PCNextF, // *** forgot to include this one on the I/O list output logic [`XLEN-1:0] BPPredPCF, output logic SelBPPredF, - input logic [31:0] InstrF, // we are going to use the opcode to indicate what type instruction this is. - // if this is too slow we will have to predict the type of instruction. - // Execute state // Update Predictor input logic [`XLEN-1:0] PCE, // The address of the currently executing instruction // 1 hot encoding @@ -50,6 +47,7 @@ module bpred input logic [`XLEN-1:0] PCTargetE, // The branch destination if the branch is taken. input logic [`XLEN-1:0] PCD, // The address the branch predictor took. input logic [`XLEN-1:0] PCLinkE, // The address following the branch instruction. (AKA Fall through address) + input logic [3:0] InstrClassE, // Report branch prediction status output logic BPPredWrongE ); @@ -57,7 +55,7 @@ module bpred logic BTBValidF; logic [1:0] BPPredF, BPPredD, BPPredE, UpdateBPPredE; - logic [3:0] InstrClassD, InstrClassF, InstrClassE; + logic [3:0] BPInstrClassF, BPInstrClassD, BPInstrClassE; logic [`XLEN-1:0] BTBPredPCF, RASPCF; logic TargetWrongE; logic FallThroughWrongE; @@ -65,17 +63,8 @@ module bpred logic PredictionPCWrongE; logic [`XLEN-1:0] CorrectPCE; - // Part 1 decode the instruction class.
- // *** for now I'm skiping the compressed instructions - assign InstrClassF[3] = InstrF[6:0] == 7'h67 && InstrF[19:15] == 5'h01; // return - // This is probably too much logic. - // *** This also encourages me to switch to predicting the class. - assign InstrClassF[2] = InstrF[6:0] == 7'h67 && InstrF[19:15] != 5'h01; // jump register, but not return - assign InstrClassF[1] = InstrF[6:0] == 7'h6F; // jump - assign InstrClassF[0] = InstrF[6:0] == 7'h63; // branch - - // Part 2 branch direction prediction + // Part 1 branch direction prediction twoBitPredictor DirPredictor(.clk(clk), .reset(reset), @@ -91,40 +80,42 @@ module bpred // 2) Any information which is necessary for the predictor to built it's next state. // For a 2 bit table this is the prediction count. - assign SelBPPredF = ((InstrClassF[0] & BPPredF[1] & BTBValidF) | - InstrClassF[3] | - (InstrClassF[2] & BTBValidF) | - InstrClassF[1] & BTBValidF) ; + assign SelBPPredF = ((BPInstrClassF[0] & BPPredF[1] & BTBValidF) | + BPInstrClassF[3] | + (BPInstrClassF[2] & BTBValidF) | + BPInstrClassF[1] & BTBValidF) ; - // Part 3 Branch target address prediction + // Part 2 Branch target address prediction // *** For now the BTB will house the direct and indirect targets BTBPredictor TargetPredictor(.clk(clk), .reset(reset), .LookUpPC(PCNextF), .TargetPC(BTBPredPCF), + .InstrClass(BPInstrClassF), .Valid(BTBValidF), // update - .UpdateEN(InstrClassE[2] | InstrClassE[1] | InstrClassE[0]), + .UpdateEN(|InstrClassE), // update on every class, returns included: the BTB is now the only fetch-stage source of BPInstrClassF[3] .UpdatePC(PCE), - .UpdateTarget(PCTargetE)); + .UpdateTarget(PCTargetE), + .UpdateInstrClass(InstrClassE)); // need to forward when updating to the same address as reading. //assign CorrectPCE = PCSrcE ? PCTargetE : PCLinkE; //assign TargetPC = (PCE == PCNextF) ? CorrectPCE : BTBPredPCF; - // Part 4 RAS + // Part 3 RAS // *** need to add the logic to restore RAS on flushes. We will use incr for this.
RASPredictor RASPredictor(.clk(clk), .reset(reset), - .pop(InstrClassF[3]), + .pop(BPInstrClassF[3]), .popPC(RASPCF), .push(InstrClassE[3]), .incr(1'b0), .pushPC(PCLinkE)); - assign BPPredPCF = InstrClassF[3] ? RASPCF : BTBPredPCF; + assign BPPredPCF = BPInstrClassF[3] ? RASPCF : BTBPredPCF; @@ -150,15 +141,17 @@ module bpred .reset(reset), .en(~StallF), .clear(FlushF), - .d(InstrClassF), - .q(InstrClassD)); + .d(BPInstrClassF), + .q(BPInstrClassD)); flopenrc #(4) InstrClassRegE(.clk(clk), .reset(reset), .en(~StallD), .clear(FlushD), - .d(InstrClassD), - .q(InstrClassE)); + .d(BPInstrClassD), + .q(BPInstrClassE)); + + // Check the prediction makes execution. assign TargetWrongE = PCTargetE != PCD; diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv index 317a1da2..ccd481f8 100644 --- a/wally-pipelined/src/ifu/ifu.sv +++ b/wally-pipelined/src/ifu/ifu.sv @@ -67,6 +67,8 @@ module ifu ( // branch predictor signals logic SelBPPredF; logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F; + logic [3:0] InstrClassD, InstrClassE; + // *** put memory interface on here, InstrF becomes output @@ -109,13 +111,12 @@ module ifu ( .PCNextF(PCNextF), .BPPredPCF(BPPredPCF), .SelBPPredF(SelBPPredF), - .InstrF(InstrF), // *** this is flushed internally. The logic is redundant with some out here. - // Also I believe this port will be removed. .PCE(PCE), .PCSrcE(PCSrcE), .PCTargetE(PCTargetE), .PCD(PCD), .PCLinkE(PCLinkE), + .InstrClassE(InstrClassE), .BPPredWrongE(BPPredWrongE)); // The true correct target is PCTargetE if PCSrcE is 1 else it is the fall through PCLinkE. assign PCCorrectE = PCSrcE ? PCTargetE : PCLinkE; @@ -142,6 +143,14 @@ module ifu ( assign IllegalIEUInstrFaultD = IllegalBaseInstrFaultD | IllegalCompInstrD; // illegal if bad 32 or 16-bit instr // *** combine these with others in better way, including M, F + + // the branch predictor needs a compact decoding of the instruction class. 
+ // *** consider adding in the alternate return address x5 for returns. + assign InstrClassD[3] = InstrD[6:0] == 7'h67 && InstrD[19:15] == 5'h01; // return + assign InstrClassD[2] = InstrD[6:0] == 7'h67 && InstrD[19:15] != 5'h01; // jump register, but not return + assign InstrClassD[1] = InstrD[6:0] == 7'h6F; // jump + assign InstrClassD[0] = InstrD[6:0] == 7'h63; // branch + // Misaligned PC logic generate @@ -164,6 +173,13 @@ module ifu ( flopr #(`XLEN) PCMReg(clk, reset, PCE, PCM); flopr #(`XLEN) PCWReg(clk, reset, PCM, PCW); // *** probably not needed; delete later + flopenrc #(4) InstrClassRegE(.clk(clk), + .reset(reset), + .en(~StallD), + .clear(FlushD), + .d(InstrClassD), + .q(InstrClassE)); + // seems like there should be a lower-cost way of doing this PC+2 or PC+4 for JAL. // either have ALU compute PC+2/4 and feed into ALUResult input of ResultMux or // have dedicated adder in Mem stage based on PCM + 2 or 4