///////////////////////////////////////////
// ifu.sv
//
// Written: David_Harris@hmc.edu 9 January 2021
// Modified:
//
// Purpose: Instrunction Fetch Unit
//           PC, branch prediction, instruction cache
// 
// A component of the Wally configurable RISC-V project.
// 
// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
// is furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
///////////////////////////////////////////

`include "wally-config.vh"

module ifu (
  input  logic             clk, reset,
  input  logic             StallF, StallD, StallE, StallM, StallW,
  input  logic             FlushF, FlushD, FlushE, FlushM, FlushW,
  // Fetch
  input  logic [`XLEN-1:0] InstrInF,
  input  logic             InstrAckF,
  output logic [`XLEN-1:0] PCF, 
  output logic [`XLEN-1:0] InstrPAdrF,
  output logic             InstrReadF,
  output logic             ICacheStallF,
  // Decode  
  // Execute
  output logic [`XLEN-1:0] PCLinkE,
  input logic 		   PCSrcE, 
  input logic [`XLEN-1:0]  PCTargetE,
  output logic [`XLEN-1:0] PCE,
  output logic 		   BPPredWrongE, 
  // Mem
  input logic 		   RetM, TrapM, 
  input logic [`XLEN-1:0]  PrivilegedNextPCM, 
  output logic [31:0] 	   InstrD, InstrM,
  output logic [`XLEN-1:0] PCM, 
  output logic [3:0] InstrClassM,
  output logic BPPredWrongM,
  // Writeback
  // output logic [`XLEN-1:0] PCLinkW,
  // Faults
  input  logic             IllegalBaseInstrFaultD,
  output logic             IllegalIEUInstrFaultD,
  output logic             InstrMisalignedFaultM,
  output logic [`XLEN-1:0] InstrMisalignedAdrM,
  // TLB management
  input logic  [1:0]       PrivilegeModeW,
  input logic  [`XLEN-1:0] PageTableEntryF,
  input logic  [1:0]       PageTypeF,
  input logic  [`XLEN-1:0] SATP_REGW,
  input logic              ITLBWriteF, ITLBFlushF,
  output logic             ITLBMissF, ITLBHitF
);

  logic [`XLEN-1:0] UnalignedPCNextF, PCNextF;
  logic             misaligned, BranchMisalignedFaultE, BranchMisalignedFaultM, TrapMisalignedFaultM;
  logic             PrivilegedChangePCM;
  logic             IllegalCompInstrD;
  logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCD, PCW, PCLinkD, PCLinkM, PCNextPF;
  logic             CompressedF;
  logic [31:0]      InstrRawD, InstrE, InstrW;
  logic [31:0]      nop = 32'h00000013; // instruction for NOP
  // *** send this to the trap unit
  logic             ITLBPageFaultF;
  logic 	    reset_q; // *** look at this later.
  

  tlb #(3) itlb(.TLBAccess(1'b1), .VirtualAddress(PCF),
                .PageTableEntryWrite(PageTableEntryF), .PageTypeWrite(PageTypeF),
                .TLBWrite(ITLBWriteF), .TLBFlush(ITLBFlushF),
                .PhysicalAddress(PCPF), .TLBMiss(ITLBMissF),
                .TLBHit(ITLBHitF), .TLBPageFault(ITLBPageFaultF),
                .*);

  // branch predictor signals
  logic 	   SelBPPredF;
  logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F, PCNext2F;
  logic [3:0] 	    InstrClassD, InstrClassE;
  

  // *** put memory interface on here, InstrF becomes output
  //assign InstrPAdrF = PCF; // *** no MMU
  //assign InstrReadF = ~StallD; // *** & ICacheMissF; add later
  // assign InstrReadF = 1; // *** & ICacheMissF; add later

  // jarred 2021-03-14 Add instrution cache block to remove rd2
  assign PCNextPF = PCNextF; // Temporary workaround until iTLB is live
  icache icache(
    .*,
    .UpperPCNextPF(PCNextPF[`XLEN-1:12]),
    .LowerPCNextF(PCNextPF[11:0])
  );

  assign PrivilegedChangePCM = RetM | TrapM;

  //mux3    #(`XLEN) pcmux(PCPlus2or4F, PCCorrectE, PrivilegedNextPCM, {PrivilegedChangePCM, BPPredWrongE}, UnalignedPCNextF);
  mux2 #(`XLEN) pcmux0(.d0(PCPlus2or4F),
		       .d1(BPPredPCF),
		       .s(SelBPPredF),
		       .y(PCNext0F));

  mux2 #(`XLEN) pcmux1(.d0(PCNext0F),
		       .d1(PCCorrectE),
		       .s(BPPredWrongE),
		       .y(PCNext1F));

  mux2 #(`XLEN) pcmux2(.d0(PCNext1F),
		       .d1(PrivilegedNextPCM),
		       .s(PrivilegedChangePCM),
		       .y(PCNext2F));

  mux2 #(`XLEN) pcmux3(.d0(PCNext2F),
		       .d1(`RESET_VECTOR),
		       .s(reset_q),
		       .y(UnalignedPCNextF));

  flop #(1) resetReg (.clk(clk),
		      .d(reset),
		      .q(reset_q));
  
  
  assign  PCNextF = {UnalignedPCNextF[`XLEN-1:1], 1'b0}; // hart-SPEC p. 21 about 16-bit alignment
  flopenl #(`XLEN) pcreg(clk, reset, ~StallF & ~ICacheStallF, PCNextF, `RESET_VECTOR, PCF);

  // branch and jump predictor
  // I am making the port connection explicit for now as I want to see them and they will be changing.
  bpred bpred(.clk(clk),
	      .reset(reset),
	      .StallF(StallF),
	      .StallD(StallD),
	      .StallE(1'b0),   // *** may need this eventually
	      .FlushF(FlushF),
	      .FlushD(FlushD),
	      .FlushE(FlushE),
	      .PCNextF(PCNextF),
	      .BPPredPCF(BPPredPCF),
	      .SelBPPredF(SelBPPredF),
	      .PCE(PCE),
	      .PCSrcE(PCSrcE),
	      .PCTargetE(PCTargetE),
	      .PCD(PCD),
	      .PCLinkE(PCLinkE),
	      .InstrClassE(InstrClassE),
	      .BPPredWrongE(BPPredWrongE));
  // The true correct target is PCTargetE if PCSrcE is 1 else it is the fall through PCLinkE.
  assign PCCorrectE =  PCSrcE ? PCTargetE : PCLinkE;

  // pcadder
  // add 2 or 4 to the PC, based on whether the instruction is 16 bits or 32
  assign PCPlusUpperF = PCF[`XLEN-1:2] + 1; // add 4 to PC
  // choose PC+2 or PC+4
  always_comb
    if (CompressedF) // add 2
      if (PCF[1]) PCPlus2or4F = {PCPlusUpperF, 2'b00}; 
      else        PCPlus2or4F = {PCF[`XLEN-1:2], 2'b10};
    else          PCPlus2or4F = {PCPlusUpperF, PCF[1:0]}; // add 4

  // Decode stage pipeline register and logic
  flopenrc #(`XLEN) PCDReg(clk, reset, FlushD, ~StallD, PCF, PCD);
   
  // expand 16-bit compressed instructions to 32 bits
  decompress decomp(.*);
  assign IllegalIEUInstrFaultD = IllegalBaseInstrFaultD | IllegalCompInstrD; // illegal if bad 32 or 16-bit instr
  // *** combine these with others in better way, including M, F


  // the branch predictor needs a compact decoding of the instruction class.
  // *** consider adding in the alternate return address x5 for returns.
  assign InstrClassD[3] = InstrD[6:0] == 7'h67 && InstrD[19:15] == 5'h01; // return
  assign InstrClassD[2] = InstrD[6:0] == 7'h67 && InstrD[19:15] != 5'h01; // jump register, but not return
  assign InstrClassD[1] = InstrD[6:0] == 7'h6F; // jump
  assign InstrClassD[0] = InstrD[6:0] == 7'h63; // branch

  // Misaligned PC logic

  generate
    if (`C_SUPPORTED) // C supports compressed instructions on halfword boundaries
      assign misaligned = PCNextF[0];
    else // instructions must be on word boundaries
      assign misaligned = |PCNextF[1:0];
  endgenerate

  // pipeline misaligned faults to M stage
  assign BranchMisalignedFaultE = misaligned & PCSrcE; // E-stage (Branch/Jump) misaligned
  flopenr #(1) InstrMisalginedReg(clk, reset, ~StallM, BranchMisalignedFaultE, BranchMisalignedFaultM);
  flopenr #(`XLEN) InstrMisalignedAdrReg(clk, reset, ~StallM, PCNextF, InstrMisalignedAdrM);
  assign TrapMisalignedFaultM = misaligned & PrivilegedChangePCM;
  assign InstrMisalignedFaultM = BranchMisalignedFaultM; // | TrapMisalignedFaultM; *** put this back in without causing a cyclic path
  
  flopenr  #(32)   InstrEReg(clk, reset, ~StallE, FlushE ? nop : InstrD, InstrE);
  flopenr  #(32)   InstrMReg(clk, reset, ~StallM, FlushM ? nop : InstrE, InstrM);
  // flopenr  #(32)   InstrWReg(clk, reset, ~StallW, FlushW ? nop : InstrM, InstrW); // just for testbench, delete later
  flopenr #(`XLEN) PCEReg(clk, reset, ~StallE, PCD, PCE);
  flopenr #(`XLEN) PCMReg(clk, reset, ~StallM, PCE, PCM);
  // flopenr #(`XLEN) PCWReg(clk, reset, ~StallW, PCM, PCW); // *** probably not needed; delete later

  flopenrc #(4) InstrClassRegE(.clk(clk),
			       .reset(reset),
			       .en(~StallE),
			       .clear(FlushE),
			       .d(InstrClassD),
			       .q(InstrClassE));

  flopenrc #(4) InstrClassRegM(.clk(clk),
			       .reset(reset),
			       .en(~StallM),
			       .clear(FlushM),
			       .d(InstrClassE),
			       .q(InstrClassM));

  flopenrc #(1) BPPredWrongRegM(.clk(clk),
			       .reset(reset),
			       .en(~StallM),
			       .clear(FlushM),
			       .d(BPPredWrongE),
			       .q(BPPredWrongM));

  // seems like there should be a lower-cost way of doing this PC+2 or PC+4 for JAL.  
  // either have ALU compute PC+2/4 and feed into ALUResult input of ResultMux or
  // have dedicated adder in Mem stage based on PCM + 2 or 4
  // *** redo this 
  flopenr #(`XLEN) PCPDReg(clk, reset, ~StallD, PCPlus2or4F, PCLinkD);
  flopenr #(`XLEN) PCPEReg(clk, reset, ~StallE, PCLinkD, PCLinkE);
  // flopenr #(`XLEN) PCPMReg(clk, reset, ~StallM, PCLinkE, PCLinkM);
  // /flopenr #(`XLEN) PCPWReg(clk, reset, ~StallW, PCLinkM, PCLinkW);

endmodule