forked from Github_Repos/cvw
Converted to using the BTB to predict the instruction class.
This commit is contained in:
parent
7592a0dacb
commit
52d95d415f
@ -35,11 +35,13 @@ module BTBPredictor
|
||||
input logic reset,
|
||||
input logic [`XLEN-1:0] LookUpPC,
|
||||
output logic [`XLEN-1:0] TargetPC,
|
||||
output logic [3:0] InstrClass,
|
||||
output logic Valid,
|
||||
// update
|
||||
input logic UpdateEN,
|
||||
input logic [`XLEN-1:0] UpdatePC,
|
||||
input logic [`XLEN-1:0] UpdateTarget
|
||||
input logic [`XLEN-1:0] UpdateTarget,
|
||||
input logic [3:0] UpdateInstrClass
|
||||
);
|
||||
|
||||
localparam TotalDepth = 2 ** Depth;
|
||||
@ -82,15 +84,15 @@ module BTBPredictor
|
||||
// and other indirection branch data.
|
||||
// Another optimization may be using a PC relative address.
|
||||
|
||||
SRAM2P1R1W #(Depth, `XLEN) memory(.clk(clk),
|
||||
.reset(reset),
|
||||
.RA1(LookUpPCIndex),
|
||||
.RD1(TargetPC),
|
||||
.REN1(1'b1),
|
||||
.WA1(UpdatePCIndex),
|
||||
.WD1(UpdateTarget),
|
||||
.WEN1(UpdateEN),
|
||||
.BitWEN1({`XLEN{1'b1}}));
|
||||
SRAM2P1R1W #(Depth, `XLEN+4) memory(.clk(clk),
|
||||
.reset(reset),
|
||||
.RA1(LookUpPCIndex),
|
||||
.RD1({{InstrClass, TargetPC}}),
|
||||
.REN1(1'b1),
|
||||
.WA1(UpdatePCIndex),
|
||||
.WD1({UpdateInstrClass, UpdateTarget}),
|
||||
.WEN1(UpdateEN),
|
||||
.BitWEN1({`XLEN{1'b1}}));
|
||||
|
||||
|
||||
endmodule
|
||||
|
@ -36,9 +36,6 @@ module bpred
|
||||
input logic [`XLEN-1:0] PCNextF, // *** forgot to include this one on the I/O list
|
||||
output logic [`XLEN-1:0] BPPredPCF,
|
||||
output logic SelBPPredF,
|
||||
input logic [31:0] InstrF, // we are going to use the opcode to indicate what type instruction this is.
|
||||
// if this is too slow we will have to predict the type of instruction.
|
||||
// Execute state
|
||||
// Update Predictor
|
||||
input logic [`XLEN-1:0] PCE, // The address of the currently executing instruction
|
||||
// 1 hot encoding
|
||||
@ -50,6 +47,7 @@ module bpred
|
||||
input logic [`XLEN-1:0] PCTargetE, // The branch destination if the branch is taken.
|
||||
input logic [`XLEN-1:0] PCD, // The address the branch predictor took.
|
||||
input logic [`XLEN-1:0] PCLinkE, // The address following the branch instruction. (AKA Fall through address)
|
||||
input logic [3:0] InstrClassE,
|
||||
// Report branch prediction status
|
||||
output logic BPPredWrongE
|
||||
);
|
||||
@ -57,7 +55,7 @@ module bpred
|
||||
logic BTBValidF;
|
||||
logic [1:0] BPPredF, BPPredD, BPPredE, UpdateBPPredE;
|
||||
|
||||
logic [3:0] InstrClassD, InstrClassF, InstrClassE;
|
||||
logic [3:0] BPInstrClassF, BPInstrClassD, BPInstrClassE;
|
||||
logic [`XLEN-1:0] BTBPredPCF, RASPCF;
|
||||
logic TargetWrongE;
|
||||
logic FallThroughWrongE;
|
||||
@ -65,17 +63,8 @@ module bpred
|
||||
logic PredictionPCWrongE;
|
||||
logic [`XLEN-1:0] CorrectPCE;
|
||||
|
||||
// Part 1 decode the instruction class.
|
||||
// *** for now I'm skiping the compressed instructions
|
||||
assign InstrClassF[3] = InstrF[6:0] == 7'h67 && InstrF[19:15] == 5'h01; // return
|
||||
// This is probably too much logic.
|
||||
// *** This also encourages me to switch to predicting the class.
|
||||
|
||||
assign InstrClassF[2] = InstrF[6:0] == 7'h67 && InstrF[19:15] != 5'h01; // jump register, but not return
|
||||
assign InstrClassF[1] = InstrF[6:0] == 7'h6F; // jump
|
||||
assign InstrClassF[0] = InstrF[6:0] == 7'h63; // branch
|
||||
|
||||
// Part 2 branch direction prediction
|
||||
// Part 1 branch direction prediction
|
||||
|
||||
twoBitPredictor DirPredictor(.clk(clk),
|
||||
.reset(reset),
|
||||
@ -91,40 +80,42 @@ module bpred
|
||||
// 2) Any information which is necessary for the predictor to built it's next state.
|
||||
// For a 2 bit table this is the prediction count.
|
||||
|
||||
assign SelBPPredF = ((InstrClassF[0] & BPPredF[1] & BTBValidF) |
|
||||
InstrClassF[3] |
|
||||
(InstrClassF[2] & BTBValidF) |
|
||||
InstrClassF[1] & BTBValidF) ;
|
||||
assign SelBPPredF = ((BPInstrClassF[0] & BPPredF[1] & BTBValidF) |
|
||||
BPInstrClassF[3] |
|
||||
(BPInstrClassF[2] & BTBValidF) |
|
||||
BPInstrClassF[1] & BTBValidF) ;
|
||||
|
||||
|
||||
// Part 3 Branch target address prediction
|
||||
// Part 2 Branch target address prediction
|
||||
// *** For now the BTB will house the direct and indirect targets
|
||||
|
||||
BTBPredictor TargetPredictor(.clk(clk),
|
||||
.reset(reset),
|
||||
.LookUpPC(PCNextF),
|
||||
.TargetPC(BTBPredPCF),
|
||||
.InstrClass(BPInstrClassF),
|
||||
.Valid(BTBValidF),
|
||||
// update
|
||||
.UpdateEN(InstrClassE[2] | InstrClassE[1] | InstrClassE[0]),
|
||||
.UpdatePC(PCE),
|
||||
.UpdateTarget(PCTargetE));
|
||||
.UpdateTarget(PCTargetE),
|
||||
.UpdateInstrClass(InstrClassE));
|
||||
|
||||
// need to forward when updating to the same address as reading.
|
||||
//assign CorrectPCE = PCSrcE ? PCTargetE : PCLinkE;
|
||||
//assign TargetPC = (PCE == PCNextF) ? CorrectPCE : BTBPredPCF;
|
||||
|
||||
// Part 4 RAS
|
||||
// Part 3 RAS
|
||||
// *** need to add the logic to restore RAS on flushes. We will use incr for this.
|
||||
RASPredictor RASPredictor(.clk(clk),
|
||||
.reset(reset),
|
||||
.pop(InstrClassF[3]),
|
||||
.pop(BPInstrClassF[3]),
|
||||
.popPC(RASPCF),
|
||||
.push(InstrClassE[3]),
|
||||
.incr(1'b0),
|
||||
.pushPC(PCLinkE));
|
||||
|
||||
assign BPPredPCF = InstrClassF[3] ? RASPCF : BTBPredPCF;
|
||||
assign BPPredPCF = BPInstrClassF[3] ? RASPCF : BTBPredPCF;
|
||||
|
||||
|
||||
|
||||
@ -150,15 +141,17 @@ module bpred
|
||||
.reset(reset),
|
||||
.en(~StallF),
|
||||
.clear(FlushF),
|
||||
.d(InstrClassF),
|
||||
.q(InstrClassD));
|
||||
.d(BPInstrClassF),
|
||||
.q(BPInstrClassD));
|
||||
|
||||
flopenrc #(4) InstrClassRegE(.clk(clk),
|
||||
.reset(reset),
|
||||
.en(~StallD),
|
||||
.clear(FlushD),
|
||||
.d(InstrClassD),
|
||||
.q(InstrClassE));
|
||||
.d(BPInstrClassD),
|
||||
.q(BPInstrClassE));
|
||||
|
||||
|
||||
|
||||
// Check the prediction makes execution.
|
||||
assign TargetWrongE = PCTargetE != PCD;
|
||||
|
@ -67,6 +67,8 @@ module ifu (
|
||||
// branch predictor signals
|
||||
logic SelBPPredF;
|
||||
logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F;
|
||||
logic [3:0] InstrClassD, InstrClassE;
|
||||
|
||||
|
||||
|
||||
// *** put memory interface on here, InstrF becomes output
|
||||
@ -109,13 +111,12 @@ module ifu (
|
||||
.PCNextF(PCNextF),
|
||||
.BPPredPCF(BPPredPCF),
|
||||
.SelBPPredF(SelBPPredF),
|
||||
.InstrF(InstrF), // *** this is flushed internally. The logic is redundant with some out here.
|
||||
// Also I believe this port will be removed.
|
||||
.PCE(PCE),
|
||||
.PCSrcE(PCSrcE),
|
||||
.PCTargetE(PCTargetE),
|
||||
.PCD(PCD),
|
||||
.PCLinkE(PCLinkE),
|
||||
.InstrClassE(InstrClassE),
|
||||
.BPPredWrongE(BPPredWrongE));
|
||||
// The true correct target is PCTargetE if PCSrcE is 1 else it is the fall through PCLinkE.
|
||||
assign PCCorrectE = PCSrcE ? PCTargetE : PCLinkE;
|
||||
@ -142,6 +143,14 @@ module ifu (
|
||||
assign IllegalIEUInstrFaultD = IllegalBaseInstrFaultD | IllegalCompInstrD; // illegal if bad 32 or 16-bit instr
|
||||
// *** combine these with others in better way, including M, F
|
||||
|
||||
|
||||
// the branch predictor needs a compact decoding of the instruction class.
|
||||
// *** consider adding in the alternate return address x5 for returns.
|
||||
assign InstrClassD[3] = InstrD[6:0] == 7'h67 && InstrD[19:15] == 5'h01; // return
|
||||
assign InstrClassD[2] = InstrD[6:0] == 7'h67 && InstrD[19:15] != 5'h01; // jump register, but not return
|
||||
assign InstrClassD[1] = InstrD[6:0] == 7'h6F; // jump
|
||||
assign InstrClassD[0] = InstrD[6:0] == 7'h63; // branch
|
||||
|
||||
// Misaligned PC logic
|
||||
|
||||
generate
|
||||
@ -164,6 +173,13 @@ module ifu (
|
||||
flopr #(`XLEN) PCMReg(clk, reset, PCE, PCM);
|
||||
flopr #(`XLEN) PCWReg(clk, reset, PCM, PCW); // *** probably not needed; delete later
|
||||
|
||||
flopenrc #(4) InstrClassRegE(.clk(clk),
|
||||
.reset(reset),
|
||||
.en(~StallD),
|
||||
.clear(FlushD),
|
||||
.d(InstrClassD),
|
||||
.q(InstrClassE));
|
||||
|
||||
// seems like there should be a lower-cost way of doing this PC+2 or PC+4 for JAL.
|
||||
// either have ALU compute PC+2/4 and feed into ALUResult input of ResultMux or
|
||||
// have dedicated adder in Mem stage based on PCM + 2 or 4
|
||||
|
Loading…
Reference in New Issue
Block a user