Converted to using the BTB to predict the instruction class.

2021-03-04 09:23:35 -06:00 · 2021-03-04 09:23:35 -06:00 · 52d95d415f
commit 52d95d415f
parent 7592a0dacb
3 changed files with 50 additions and 39 deletions
--- a/wally-pipelined/src/ifu/BTBPredictor.sv
+++ b/wally-pipelined/src/ifu/BTBPredictor.sv
@ -35,11 +35,13 @@ module BTBPredictor
   input logic 		    reset,
   input logic [`XLEN-1:0]  LookUpPC,
   output logic [`XLEN-1:0] TargetPC,
+   output logic [3:0] 	    InstrClass,
   output logic 	    Valid,
   // update
   input logic 		    UpdateEN,
   input logic [`XLEN-1:0]  UpdatePC,
-   input logic [`XLEN-1:0]  UpdateTarget
+   input logic [`XLEN-1:0]  UpdateTarget,
+   input logic [3:0] 	    UpdateInstrClass
   );

  localparam TotalDepth = 2 ** Depth;
@ -82,15 +84,15 @@ module BTBPredictor
  // and other indirection branch data.
  // Another optimization may be using a PC relative address.

-  SRAM2P1R1W #(Depth, `XLEN) memory(.clk(clk),
-				    .reset(reset),
-				    .RA1(LookUpPCIndex),
-				    .RD1(TargetPC),
-				    .REN1(1'b1),
-				    .WA1(UpdatePCIndex),
-				    .WD1(UpdateTarget),
-				    .WEN1(UpdateEN),
-				    .BitWEN1({`XLEN{1'b1}}));
+  SRAM2P1R1W #(Depth, `XLEN+4) memory(.clk(clk),                  // entry = {4-bit instr class, XLEN-bit target}
+				      .reset(reset),
+				      .RA1(LookUpPCIndex),
+				      .RD1({InstrClass, TargetPC}),                // unpacked in the same order as WD1
+				      .REN1(1'b1),                                 // read port always enabled
+				      .WA1(UpdatePCIndex),
+				      .WD1({UpdateInstrClass, UpdateTarget}),      // class in the upper 4 bits, target below
+				      .WEN1(UpdateEN),
+				      .BitWEN1({(`XLEN+4){1'b1}}));                // mask spans the full XLEN+4 entry so the class bits are written too; NOTE(review): assumes BitWEN1 is a per-bit mask of entry width - confirm against SRAM2P1R1W


 endmodule
--- a/wally-pipelined/src/ifu/bpred.sv
+++ b/wally-pipelined/src/ifu/bpred.sv
@ -36,9 +36,6 @@ module bpred
   input logic [`XLEN-1:0]  PCNextF, // *** forgot to include this one on the I/O list
   output logic [`XLEN-1:0] BPPredPCF,
   output logic 	    SelBPPredF,
-   input logic [31:0] 	    InstrF, // we are going to use the opcode to indicate what type instruction this is.
-   // if this is too slow we will have to predict the type of instruction.
-   // Execute state
   // Update Predictor
   input logic [`XLEN-1:0]  PCE, // The address of the currently executing instruction
   // 1 hot encoding
@ -50,6 +47,7 @@ module bpred
   input logic [`XLEN-1:0]  PCTargetE, // The branch destination if the branch is taken.
   input logic [`XLEN-1:0]  PCD, // The address the branch predictor took.
   input logic [`XLEN-1:0]  PCLinkE, // The address following the branch instruction. (AKA Fall through address)
+   input logic [3:0] 	    InstrClassE,
   // Report branch prediction status
   output logic 	    BPPredWrongE
   );
@ -57,7 +55,7 @@ module bpred
  logic 		    BTBValidF;
  logic [1:0] 		    BPPredF, BPPredD, BPPredE, UpdateBPPredE;

-  logic [3:0] 		    InstrClassD, InstrClassF, InstrClassE;
+  logic [3:0] 		    BPInstrClassF, BPInstrClassD, BPInstrClassE;
  logic [`XLEN-1:0] 	    BTBPredPCF, RASPCF;
  logic 		    TargetWrongE;
  logic 		    FallThroughWrongE;
@ -65,17 +63,8 @@ module bpred
  logic 		    PredictionPCWrongE;
  logic [`XLEN-1:0] 	    CorrectPCE;

-  // Part 1 decode the instruction class.
-  // *** for now I'm skiping the compressed instructions
-  assign InstrClassF[3] = InstrF[6:0] == 7'h67 && InstrF[19:15] == 5'h01; // return
-  // This is probably too much logic. 
-  // *** This also encourages me to switch to predicting the class.

-  assign InstrClassF[2] = InstrF[6:0] == 7'h67 && InstrF[19:15] != 5'h01; // jump register, but not return
-  assign InstrClassF[1] = InstrF[6:0] == 7'h6F; // jump
-  assign InstrClassF[0] = InstrF[6:0] == 7'h63; // branch
-  
-  // Part 2 branch direction prediction
+  // Part 1 branch direction prediction

  twoBitPredictor DirPredictor(.clk(clk),
 			       .reset(reset),
@ -91,40 +80,42 @@ module bpred
  // 2) Any information which is necessary for the predictor to built it's next state.
  // For a 2 bit table this is the prediction count.

-  assign SelBPPredF = ((InstrClassF[0] & BPPredF[1] & BTBValidF) | 
-		       InstrClassF[3] |
-		       (InstrClassF[2] & BTBValidF) | 
-		       InstrClassF[1] & BTBValidF) ;
+  assign SelBPPredF = ((BPInstrClassF[0] & BPPredF[1] & BTBValidF) | 
+		       BPInstrClassF[3] |
+		       (BPInstrClassF[2] & BTBValidF) | 
+		       BPInstrClassF[1] & BTBValidF) ;


-  // Part 3 Branch target address prediction
+  // Part 2 Branch target address prediction
  // *** For now the BTB will house the direct and indirect targets

  BTBPredictor TargetPredictor(.clk(clk),
 			       .reset(reset),
 			       .LookUpPC(PCNextF),
 			       .TargetPC(BTBPredPCF),
+			       .InstrClass(BPInstrClassF),
 			       .Valid(BTBValidF),
 			       // update
 			       .UpdateEN(InstrClassE[2] | InstrClassE[1] | InstrClassE[0]),
 			       .UpdatePC(PCE),
-			       .UpdateTarget(PCTargetE));
+			       .UpdateTarget(PCTargetE),
+			       .UpdateInstrClass(InstrClassE));

  // need to forward when updating to the same address as reading.
  //assign CorrectPCE = PCSrcE ? PCTargetE : PCLinkE;
  //assign TargetPC = (PCE == PCNextF) ? CorrectPCE : BTBPredPCF;

-  // Part 4 RAS
+  // Part 3 RAS
  // *** need to add the logic to restore RAS on flushes.  We will use incr for this.
  RASPredictor RASPredictor(.clk(clk),
 			    .reset(reset),
-			    .pop(InstrClassF[3]),
+			    .pop(BPInstrClassF[3]),
 			    .popPC(RASPCF),
 			    .push(InstrClassE[3]),
 			    .incr(1'b0),
 			    .pushPC(PCLinkE));

-  assign BPPredPCF = InstrClassF[3] ? RASPCF : BTBPredPCF;
+  assign BPPredPCF = BPInstrClassF[3] ? RASPCF : BTBPredPCF;
  
  

@ -150,15 +141,17 @@ module bpred
 			       .reset(reset),
 			       .en(~StallF),
 			       .clear(FlushF),
-			       .d(InstrClassF),
-			       .q(InstrClassD));
+			       .d(BPInstrClassF),
+			       .q(BPInstrClassD));

  flopenrc #(4) InstrClassRegE(.clk(clk),
 			       .reset(reset),
 			       .en(~StallD),
 			       .clear(FlushD),
-			       .d(InstrClassD),
-			       .q(InstrClassE));
+			       .d(BPInstrClassD),
+			       .q(BPInstrClassE));
+
+  

  // Check the prediction makes execution.
  assign TargetWrongE = PCTargetE != PCD;
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@ -67,6 +67,8 @@ module ifu (
  // branch predictor signals
  logic 	   SelBPPredF;
  logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F;
+  logic [3:0] 	    InstrClassD, InstrClassE;
+  
  

  // *** put memory interface on here, InstrF becomes output
@ -109,13 +111,12 @@ module ifu (
 	      .PCNextF(PCNextF),
 	      .BPPredPCF(BPPredPCF),
 	      .SelBPPredF(SelBPPredF),
-	      .InstrF(InstrF), // *** this is flushed internally. The logic is redundant with some out here.
-	      // Also I believe this port will be removed.
 	      .PCE(PCE),
 	      .PCSrcE(PCSrcE),
 	      .PCTargetE(PCTargetE),
 	      .PCD(PCD),
 	      .PCLinkE(PCLinkE),
+	      .InstrClassE(InstrClassE),
 	      .BPPredWrongE(BPPredWrongE));
  // The true correct target is PCTargetE if PCSrcE is 1 else it is the fall through PCLinkE.
  assign PCCorrectE =  PCSrcE ? PCTargetE : PCLinkE;
@ -142,6 +143,14 @@ module ifu (
  assign IllegalIEUInstrFaultD = IllegalBaseInstrFaultD | IllegalCompInstrD; // illegal if bad 32 or 16-bit instr
  // *** combine these with others in better way, including M, F

+
+  // the branch predictor needs a compact decoding of the instruction class.
+  // *** consider adding in the alternate return address x5 for returns.
+  assign InstrClassD[3] = InstrD[6:0] == 7'h67 && InstrD[19:15] == 5'h01; // return
+  assign InstrClassD[2] = InstrD[6:0] == 7'h67 && InstrD[19:15] != 5'h01; // jump register, but not return
+  assign InstrClassD[1] = InstrD[6:0] == 7'h6F; // jump
+  assign InstrClassD[0] = InstrD[6:0] == 7'h63; // branch
+
  // Misaligned PC logic

  generate
@ -164,6 +173,13 @@ module ifu (
  flopr #(`XLEN) PCMReg(clk, reset, PCE, PCM);
  flopr #(`XLEN) PCWReg(clk, reset, PCM, PCW); // *** probably not needed; delete later

+  flopenrc #(4) InstrClassRegE(.clk(clk),
+			       .reset(reset),
+			       .en(~StallD),
+			       .clear(FlushD),
+			       .d(InstrClassD),
+			       .q(InstrClassE));
+
  // seems like there should be a lower-cost way of doing this PC+2 or PC+4 for JAL.  
  // either have ALU compute PC+2/4 and feed into ALUResult input of ResultMux or
  // have dedicated adder in Mem stage based on PCM + 2 or 4