Moved forwarding logic into controller

2025-02-11 06:05:49 +00:00 · 2023-12-26 21:17:01 -08:00 · 2023-12-26 21:17:01 -08:00 · 2c2f692f3a
commit 2c2f692f3a
parent 8196ab20be
11 changed files with 113 additions and 153 deletions
--- a/src/hazard/hazard.sv
+++ b/src/hazard/hazard.sv
@ -28,9 +28,9 @@

 module hazard import cvw::*;  #(parameter cvw_t P) ( 
  input  logic  BPWrongE, CSRWriteFenceM, RetM, TrapM,   
-  input  logic  LoadStallD, StoreStallD, MDUStallD, CSRRdStallD,
+  input  logic  StructuralStallD,
  input  logic  LSUStallM, IFUStallF,
-  input  logic  FCvtIntStallD, FPUStallD,
+  input  logic  FPUStallD,
  input  logic  DivBusyE, FDivBusyE,
  input  logic  wfiM, IntPendingM,
  // Stall & flush outputs
@ -82,7 +82,7 @@ module hazard import cvw::*;  #(parameter cvw_t P) (
  //    The IFU stalls the entire pipeline rather than just Fetch to avoid complications with instructions later in the pipeline causing Exceptions
  //    A trap could be asserted at the start of a IFU/LSU stall, and should flush the memory operation
  assign StallFCause = '0;
-  assign StallDCause = (LoadStallD | StoreStallD | MDUStallD | CSRRdStallD | FCvtIntStallD | FPUStallD) & ~FlushDCause;
+  assign StallDCause = (StructuralStallD | FPUStallD) & ~FlushDCause;
  assign StallECause = (DivBusyE | FDivBusyE) & ~FlushECause; 
  assign StallMCause = WFIStallM & ~FlushMCause;
  // Need to gate IFUStallF when the equivalent FlushFCause = FlushDCause = 1.
--- a/src/ieu/alu.sv
+++ b/src/ieu/alu.sv
@ -27,8 +27,8 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////

-module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
-  input  logic [WIDTH-1:0] A, B,        // Operands
+module alu import cvw::*; #(parameter cvw_t P) (
+  input  logic [P.XLEN-1:0] A, B,        // Operands
  input  logic             W64,         // W64-type instruction
  input  logic             SubArith,    // Subtraction or arithmetic shift
  input  logic [2:0]       ALUSelect,   // ALU mux select signal
@ -37,14 +37,14 @@ module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
  input  logic [2:0]       Funct3,      // For BMU decoding
  input  logic [2:0]       BALUControl, // ALU Control signals for B instructions in Execute Stage
  input  logic             BMUActiveE,  // Bit manipulation instruction being executed
-  output logic [WIDTH-1:0] ALUResult,   // ALU result
-  output logic [WIDTH-1:0] Sum);        // Sum of operands
+  output logic [P.XLEN-1:0] ALUResult,   // ALU result
+  output logic [P.XLEN-1:0] Sum);        // Sum of operands

  // CondInvB = ~B when subtracting, B otherwise. Shift = shift result. SLT/U = result of a slt/u instruction.
  // FullResult = ALU result before adjusting for a RV64 w-suffix instruction.
-  logic [WIDTH-1:0] CondMaskInvB, Shift, FullResult, PreALUResult;                // Intermediate Signals 
-  logic [WIDTH-1:0] CondMaskB;                                                    // Result of B mask select mux
-  logic [WIDTH-1:0] CondShiftA;                                                   // Result of A shifted select mux
+  logic [P.XLEN-1:0] CondMaskInvB, Shift, FullResult, PreALUResult;                // Intermediate Signals 
+  logic [P.XLEN-1:0] CondMaskB;                                                    // Result of B mask select mux
+  logic [P.XLEN-1:0] CondShiftA;                                                   // Result of A shifted select mux
  logic             Carry, Neg;                                                   // Flags: carry out, negative
  logic             LT, LTU;                                                      // Less than, Less than unsigned
  logic             Asign, Bsign;                                                 // Sign bits of A, B
@ -53,7 +53,7 @@ module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
  // CondMaskB is B for add/sub, or a masked version of B for certain bit manipulation instructions
  // CondShiftA is A for add/sub or a shifted version of A for shift-and-add BMU instructions
  assign CondMaskInvB = SubArith ? ~CondMaskB : CondMaskB;
-  assign {Carry, Sum} = CondShiftA + CondMaskInvB + {{(WIDTH-1){1'b0}}, SubArith};
+  assign {Carry, Sum} = CondShiftA + CondMaskInvB + {{(P.XLEN-1){1'b0}}, SubArith};
  
  // Shifts (configurable for rotation)
  shifter #(P) sh(.A, .Amt(B[P.LOG_XLEN-1:0]), .Right(Funct3[2]), .W64, .SubArith, .Y(Shift), .Rotate(BALUControl[2]));
@ -62,9 +62,9 @@ module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
  // Overflow occurs when the numbers being subtracted have the opposite sign 
  // and the result has the opposite sign of A.
  // LT is simplified from Overflow = Asign & Bsign & Asign & Neg; LT = Neg ^ Overflow
-  assign Neg  = Sum[WIDTH-1];
-  assign Asign = A[WIDTH-1];
-  assign Bsign = B[WIDTH-1];
+  assign Neg  = Sum[P.XLEN-1];
+  assign Asign = A[P.XLEN-1];
+  assign Bsign = B[P.XLEN-1];
  assign LT = Asign & ~Bsign | Asign & Neg | ~Bsign & Neg; 
  assign LTU = ~Carry;
 
@ -73,21 +73,22 @@ module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
    case (ALUSelect)                                
      3'b000: FullResult = Sum;                           // add or sub (including address generation)
      3'b001: FullResult = Shift;                         // sll, sra, or srl
-      3'b010: FullResult = {{(WIDTH-1){1'b0}}, LT};       // slt
-      3'b011: FullResult = {{(WIDTH-1){1'b0}}, LTU};      // sltu
+      3'b010: FullResult = {{(P.XLEN-1){1'b0}}, LT};       // slt
+      3'b011: FullResult = {{(P.XLEN-1){1'b0}}, LTU};      // sltu
      3'b100: FullResult = A ^ CondMaskInvB;              // xor, xnor, binv
-      3'b101: FullResult = (P.ZBS_SUPPORTED | P.ZBB_SUPPORTED) ? {{(WIDTH-1){1'b0}},{|(A & CondMaskB)}} : Shift; // bext (or IEU shift when BMU not supported)
+      3'b101: FullResult = (P.ZBS_SUPPORTED | P.ZBB_SUPPORTED) ? {{(P.XLEN-1){1'b0}},{|(A & CondMaskB)}} : Shift; // bext (or IEU shift when BMU not supported)
      3'b110: FullResult = A | CondMaskInvB;              // or, orn, bset
      3'b111: FullResult = A & CondMaskInvB;              // and, bclr
    endcase

  // Support RV64I W-type addw/subw/addiw/shifts that discard upper 32 bits and sign-extend 32-bit result to 64 bits
-  if (WIDTH == 64)  assign PreALUResult = W64 ? {{32{FullResult[31]}}, FullResult[31:0]} : FullResult;
+  if (P.XLEN == 64)  assign PreALUResult = W64 ? {{32{FullResult[31]}}, FullResult[31:0]} : FullResult;
  else              assign PreALUResult = FullResult;

  // Final Result B instruction select mux
  if (P.ZBC_SUPPORTED | P.ZBS_SUPPORTED | P.ZBA_SUPPORTED | P.ZBB_SUPPORTED) begin : bitmanipalu
-    bitmanipalu #(P, WIDTH) balu(.A, .B, .W64, .BSelect, .ZBBSelect, .BMUActiveE,
+    bitmanipalu #(P) balu(
+      .A, .B, .W64, .BSelect, .ZBBSelect, .BMUActiveE,
      .Funct3, .LT,.LTU, .BALUControl, .PreALUResult, .FullResult,
      .CondMaskB, .CondShiftA, .ALUResult);
  end else begin
--- a/src/ieu/bmu/bitmanipalu.sv
+++ b/src/ieu/bmu/bitmanipalu.sv
@ -27,9 +27,8 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////

-module bitmanipalu import cvw::*; #(parameter cvw_t P, 
-                     parameter WIDTH=32) (
-  input  logic [WIDTH-1:0] A, B,                    // Operands
+module bitmanipalu import cvw::*; #(parameter cvw_t P) (
+  input  logic [P.XLEN-1:0] A, B,                    // Operands
  input  logic             W64,                     // W64-type instruction
  input  logic [1:0]       BSelect,                 // Binary encoding of if it's a ZBA_ZBB_ZBC_ZBS instruction
  input  logic [2:0]       ZBBSelect,               // ZBB mux select signal
@ -38,37 +37,37 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P,
  input  logic             LTU,                     // less than unsigned flag
  input  logic [2:0]       BALUControl,             // ALU Control signals for B instructions in Execute Stage
  input  logic             BMUActiveE,              // Bit manipulation instruction being executed
-  input  logic [WIDTH-1:0] PreALUResult, FullResult,// PreALUResult, FullResult signals
-  output logic [WIDTH-1:0] CondMaskB,               // B is conditionally masked for ZBS instructions
-  output logic [WIDTH-1:0] CondShiftA,              // A is conditionally shifted for ShAdd instructions
-  output logic [WIDTH-1:0] ALUResult);              // Result
+  input  logic [P.XLEN-1:0] PreALUResult, FullResult,// PreALUResult, FullResult signals
+  output logic [P.XLEN-1:0] CondMaskB,               // B is conditionally masked for ZBS instructions
+  output logic [P.XLEN-1:0] CondShiftA,              // A is conditionally shifted for ShAdd instructions
+  output logic [P.XLEN-1:0] ALUResult);              // Result

-  logic [WIDTH-1:0] ZBBResult, ZBCResult;           // ZBB, ZBC Result
-  logic [WIDTH-1:0] MaskB;                          // BitMask of B
-  logic [WIDTH-1:0] RevA;                           // Bit-reversed A
+  logic [P.XLEN-1:0] ZBBResult, ZBCResult;           // ZBB, ZBC Result
+  logic [P.XLEN-1:0] MaskB;                          // BitMask of B
+  logic [P.XLEN-1:0] RevA;                           // Bit-reversed A
  logic             Rotate;                         // Indicates if it is Rotate instruction
  logic             Mask;                           // Indicates if it is ZBS instruction
  logic             PreShift;                       // Inidicates if it is sh1add, sh2add, sh3add instruction
  logic [1:0]       PreShiftAmt;                    // Amount to Pre-Shift A 
-  logic [WIDTH-1:0] CondZextA;                      // A Conditional Extend Intermediary Signal
-  logic [WIDTH-1:0] ABMU, BBMU;                     // Gated data inputs to reduce BMU activity
+  logic [P.XLEN-1:0] CondZextA;                      // A Conditional Extend Intermediary Signal
+  logic [P.XLEN-1:0] ABMU, BBMU;                     // Gated data inputs to reduce BMU activity

  // gate data inputs to BMU to only operate when BMU is active
-  assign ABMU = A & {WIDTH{BMUActiveE}};
-  assign BBMU = B & {WIDTH{BMUActiveE}};
+  assign ABMU = A & {P.XLEN{BMUActiveE}};
+  assign BBMU = B & {P.XLEN{BMUActiveE}};

  // Extract control signals from bitmanip ALUControl.
  assign {Mask, PreShift} = BALUControl[1:0];

  // Mask Generation Mux
  if (P.ZBS_SUPPORTED) begin: zbsdec
-    decoder #($clog2(WIDTH)) maskgen(BBMU[$clog2(WIDTH)-1:0], MaskB);
-    mux2 #(WIDTH) maskmux(B, MaskB, Mask, CondMaskB);
+    decoder #($clog2(P.XLEN)) maskgen(BBMU[$clog2(P.XLEN)-1:0], MaskB);
+    mux2 #(P.XLEN) maskmux(B, MaskB, Mask, CondMaskB);
  end else assign CondMaskB = B;
 
  // 0-3 bit Pre-Shift Mux
  if (P.ZBA_SUPPORTED) begin: zbapreshift
-    if (WIDTH == 64) begin
+    if (P.XLEN == 64) begin
      mux2 #(64) zextmux(A, {{32{1'b0}}, A[31:0]}, W64, CondZextA); 
    end else assign CondZextA = A;
    assign PreShiftAmt = Funct3[2:1] & {2{PreShift}};
@ -80,17 +79,17 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P,

  // Bit reverse needed for some ZBB, ZBC instructions
  if (P.ZBC_SUPPORTED | P.ZBB_SUPPORTED) begin: bitreverse
-    bitreverse #(WIDTH) brA(.A(ABMU), .RevA);
+    bitreverse #(P.XLEN) brA(.A(ABMU), .RevA);
  end

  // ZBC Unit
  if (P.ZBC_SUPPORTED) begin: zbc
-    zbc #(WIDTH) ZBC(.A(ABMU), .RevA, .B(BBMU), .Funct3, .ZBCResult);
+    zbc #(P.XLEN) ZBC(.A(ABMU), .RevA, .B(BBMU), .Funct3, .ZBCResult);
  end else assign ZBCResult = 0;

  // ZBB Unit
  if (P.ZBB_SUPPORTED) begin: zbb
-    zbb #(WIDTH) ZBB(.A(ABMU), .RevA, .B(BBMU), .W64, .LT, .LTU, .BUnsigned(Funct3[0]), .ZBBSelect, .ZBBResult);
+    zbb #(P.XLEN) ZBB(.A(ABMU), .RevA, .B(BBMU), .W64, .LT, .LTU, .BUnsigned(Funct3[0]), .ZBBSelect, .ZBBResult);
  end else assign ZBBResult = 0;

  // Result Select Mux
--- a/src/ieu/controller.sv
+++ b/src/ieu/controller.sv
@ -39,10 +39,14 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  output logic        IllegalBaseInstrD,       // Illegal I-type instruction, or illegal RV32 access to upper 16 registers
  output logic        JumpD,                   // Jump instruction
  output logic        BranchD,                 // Branch instruction
-   // Execute stage control signals             
+  output logic        StructuralStallD,        // Structural stalls detected by controller
+  output logic        LoadStallD,              // Structural stalls for load, sent to performance counters
+  output logic [4:0]  Rs1D, Rs2D,              // Register sources to read in Decode or Execute stage
+  // Execute stage control signals             
  input  logic        StallE, FlushE,          // Stall, flush Execute stage
  input  logic [1:0]  FlagsE,                  // Comparison flags ({eq, lt})
  input  logic        FWriteIntE,              // Write integer register, coming from FPU controller
+  input  logic        FCvtIntE,                              // FPU convert float to int
  output logic        PCSrcE,                  // Select signal to choose next PC (for datapath and Hazard unit)
  output logic        ALUSrcAE, ALUSrcBE,      // ALU operands
  output logic        ALUResultSrcE,           // Selects result to pass on to Memory stage
@ -65,7 +69,7 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  output logic [3:0]  CMOpM,                   // 1: cbo.inval; 2: cbo.flush; 4: cbo.clean; 8: cbo.zero
  output logic        IFUPrefetchE,            // instruction prefetch
  output logic        LSUPrefetchM,            // data prefetch
-
+  output logic [1:0]  ForwardAE, ForwardBE,    // Select signals for forwarding multiplexers
  // Memory stage control signals
  input  logic        StallM, FlushM,          // Stall, flush Memory stage
  output logic [1:0]  MemRWE,                  // Mem read/write: MemRWM[1] = 1 for read, MemRWM[0] = 1 for write 
@ -83,14 +87,16 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  output logic [2:0]  ResultSrcW,              // Select source of result to write back to register file
  // Stall during CSRs
  output logic        CSRWriteFenceM,          // CSR write or fence instruction; needs to flush the following instructions
-  output logic        StoreStallD              // Store (memory write) causes stall
+  output logic [4:0]  RdE, RdM,                // Pipelined destination registers
+  // Forwarding controls
+  output logic [4:0]  RdW                      // Register destinations in Execute, Memory, or Writeback stage
 );

-
+  logic [4:0] Rs1E, Rs2E;                      // pipelined register sources
  logic [6:0] OpD;                             // Opcode in Decode stage
  logic [2:0] Funct3D;                         // Funct3 field in Decode stage
  logic [6:0] Funct7D;                         // Funct7 field in Decode stage
-  logic [4:0] Rs1D, Rs2D, RdD;                 // Rs1/2 source register / dest reg in Decode stage
+  logic [4:0] RdD;                             // Rs1/2 source register / dest reg in Decode stage

  `define CTRLW 24

@ -146,6 +152,9 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  logic [3:0]  CMOpD, CMOpE;                   // which CMO instruction 1: cbo.inval; 2: cbo.flush; 4: cbo.clean; 8: cbo.zero
  logic        IFUPrefetchD;                   // instruction prefetch
  logic        LSUPrefetchD, LSUPrefetchE;     // data prefetch
+  logic        AMOStallD, CMOStallD;           // Structural hazards from atomic and cache management ops
+  logic        MatchDE;                        // Match between a source register in Decode stage and destination register in Execute stage
+  logic        FCvtIntStallD, MDUStallD, CSRRdStallD; // Stall due to conversion, load, multiply/divide, CSR read 

  // Extract fields
  assign OpD     = InstrD[6:0];
@ -394,6 +403,9 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  flopenrc #(35) controlregE(clk, reset, FlushE, ~StallE,
                           {ALUSelectD, RegWriteD, ResultSrcD, MemRWD, JumpD, BranchD, ALUSrcAD, ALUSrcBD, ALUResultSrcD, CSRReadD, CSRWriteD, PrivilegedD, Funct3D, W64D, SubArithD, MDUD, AtomicD, InvalidateICacheD, FlushDCacheD, FenceD, CMOpD, IFUPrefetchD, LSUPrefetchD, InstrValidD},
                           {ALUSelectE, IEURegWriteE, ResultSrcE, MemRWE, JumpE, BranchE, ALUSrcAE, ALUSrcBE, ALUResultSrcE, CSRReadE, CSRWriteE, PrivilegedE, Funct3E, W64E, SubArithE, MDUE, AtomicE, InvalidateICacheE, FlushDCacheE, FenceE, CMOpE, IFUPrefetchE, LSUPrefetchE, InstrValidE});
+  flopenrc #(5)      Rs1EReg(clk, reset, FlushE, ~StallE, Rs1D, Rs1E);
+  flopenrc #(5)      Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
+  flopenrc #(5)      RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);

  // Branch Logic
  //  The comparator handles both signed and unsigned branches using BranchSignedE
@ -415,22 +427,45 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  flopenrc #(25) controlregM(clk, reset, FlushM, ~StallM,
                         {RegWriteE, ResultSrcE, MemRWE, CSRReadE, CSRWriteE, PrivilegedE, Funct3E, FWriteIntE, AtomicE, InvalidateICacheE, FlushDCacheE, FenceE, InstrValidE, IntDivE, CMOpE, LSUPrefetchE},
                         {RegWriteM, ResultSrcM, MemRWM, CSRReadM, CSRWriteM, PrivilegedM, Funct3M, FWriteIntM, AtomicM, InvalidateICacheM, FlushDCacheM, FenceM, InstrValidM, IntDivM, CMOpM, LSUPrefetchM});
-  
+  flopenrc #(5)      RdMReg(clk, reset, FlushM, ~StallM, RdE, RdM);  
+
  // Writeback stage pipeline control register
  flopenrc #(5) controlregW(clk, reset, FlushW, ~StallW,
                         {RegWriteM, ResultSrcM, IntDivM},
                         {RegWriteW, ResultSrcW, IntDivW});  
+  flopenrc #(5)      RdWReg(clk, reset, FlushW, ~StallW, RdM, RdW);

  // Flush F, D, and E stages on a CSR write or Fence.I or SFence.VMA
  assign CSRWriteFenceM = CSRWriteM | FenceM;

+  // Forwarding logic
+  always_comb begin
+    ForwardAE = 2'b00;
+    ForwardBE = 2'b00;
+    if (Rs1E != 5'b0)
+      if      ((Rs1E == RdM) & RegWriteM) ForwardAE = 2'b10;
+      else if ((Rs1E == RdW) & RegWriteW) ForwardAE = 2'b01;
+ 
+    if (Rs2E != 5'b0)
+      if      ((Rs2E == RdM) & RegWriteM) ForwardBE = 2'b10;
+      else if ((Rs2E == RdW) & RegWriteW) ForwardBE = 2'b01;
+  end
+
+  // Stall on dependent operations that finish in Mem Stage and can't bypass in time
+  assign MatchDE = ((Rs1D == RdE) | (Rs2D == RdE)) & (RdE != 5'b0); // Decode-stage instruction source depends on result from execute stage instruction
+  assign FCvtIntStallD = FCvtIntE & MatchDE; // FPU to Integer transfers have single-cycle latency except fcvt
+  assign LoadStallD = (MemReadE|SCE) & MatchDE;  
+  assign MDUStallD = MDUE & MatchDE; // Int mult/div is at least two cycle latency, even when coming from the FDIV
+  assign CSRRdStallD = CSRReadE & MatchDE;
+
  // the synchronous DTIM cannot read immediately after write
  // a cache cannot read or write immediately after a write
-  // atomic operations are also detected as MemRWD[1]
+  // atomic operations are also detected as MemRWD[1] ***check; seems like & MemRWE
  // *** RT: Remove this after updating the cache.
  // *** RT: Check that atomic after atomic works correctly.
-  //assign StoreStallD = ((|CMOpE)) & ((|CMOpD));
-  logic AMOHazard;
-  assign AMOHazard = &MemRWE & MemRWD[1];
-  assign StoreStallD = ((|CMOpE) & (|CMOpD)) | AMOHazard;
+  assign AMOStallD = &MemRWE & MemRWD[1]; // Read after atomic operation causes structural hazard
+  assign CMOStallD = (|CMOpE) & (|CMOpD); // CMO op after CMO op causes structural hazard ***explain, why doesn't interact with read/write
+
+  // Structural hazard causes stall if any of these events occur
+  assign StructuralStallD = LoadStallD | MDUStallD | CSRRdStallD | FCvtIntStallD | AMOStallD | CMOStallD;
 endmodule
--- a/src/ieu/datapath.sv
+++ b/src/ieu/datapath.sv
@ -32,6 +32,7 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
  // Decode stage signals
  input  logic [2:0]        ImmSrcD,                 // Selects type of immediate extension
  input  logic [31:0]       InstrD,                  // Instruction in Decode stage
+  input  logic [4:0]        Rs1D, Rs2D,              // Source registers
  // Execute stage signals
  input  logic [P.XLEN-1:0] PCE,                     // PC in Execute stage  
  input  logic [P.XLEN-1:0] PCLinkE,                 // PC + 4 (of instruction in Execute stage)
@ -68,9 +69,8 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.XLEN-1:0] CSRReadValW,             // CSR read result
  input  logic [P.XLEN-1:0] MDUResultW,              // MDU (Multiply/divide unit) result
  input  logic [P.XLEN-1:0] FIntDivResultW,          // FPU's integer divide result
+  input  logic [4:0]        RdW                      // Destination register
   // Hazard Unit signals 
-  output logic [4:0]        Rs1D, Rs2D, Rs1E, Rs2E,  // Register sources to read in Decode or Execute stage
-  output logic [4:0]        RdE, RdM, RdW            // Register destinations in Execute, Memory, or Writeback stage
 );

  // Fetch stage signals
@ -94,9 +94,6 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
  logic [P.XLEN-1:0] MulDivResultW;                  // Multiply always comes from MDU.  Divide could come from MDU or FPU (when using fdivsqrt for integer division)

  // Decode stage
-  assign Rs1D      = InstrD[19:15];
-  assign Rs2D      = InstrD[24:20];
-  assign RdD       = InstrD[11:7];
  regfile #(P.XLEN, P.E_SUPPORTED) regf(clk, reset, RegWriteW, Rs1D, Rs2D, RdW, ResultW, R1D, R2D);
  extend #(P)        ext(.InstrD(InstrD[31:7]), .ImmSrcD, .ImmExtD);
 
@ -104,28 +101,23 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
  flopenrc #(P.XLEN) RD1EReg(clk, reset, FlushE, ~StallE, R1D, R1E);
  flopenrc #(P.XLEN) RD2EReg(clk, reset, FlushE, ~StallE, R2D, R2E);
  flopenrc #(P.XLEN) ImmExtEReg(clk, reset, FlushE, ~StallE, ImmExtD, ImmExtE);
-  flopenrc #(5)      Rs1EReg(clk, reset, FlushE, ~StallE, Rs1D, Rs1E);
-  flopenrc #(5)      Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
-  flopenrc #(5)      RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);
  
  mux3  #(P.XLEN)  faemux(R1E, ResultW, IFResultM, ForwardAE, ForwardedSrcAE);
  mux3  #(P.XLEN)  fbemux(R2E, ResultW, IFResultM, ForwardBE, ForwardedSrcBE);
  comparator #(P.XLEN) comp(ForwardedSrcAE, ForwardedSrcBE, BranchSignedE, FlagsE);
  mux2  #(P.XLEN)  srcamux(ForwardedSrcAE, PCE, ALUSrcAE, SrcAE);
  mux2  #(P.XLEN)  srcbmux(ForwardedSrcBE, ImmExtE, ALUSrcBE, SrcBE);
-  alu   #(P, P.XLEN)  alu(SrcAE, SrcBE, W64E, SubArithE, ALUSelectE, BSelectE, ZBBSelectE, Funct3E, BALUControlE, BMUActiveE, ALUResultE, IEUAdrE);
+  alu   #(P)       alu(SrcAE, SrcBE, W64E, SubArithE, ALUSelectE, BSelectE, ZBBSelectE, Funct3E, BALUControlE, BMUActiveE, ALUResultE, IEUAdrE);
  mux2  #(P.XLEN)  altresultmux(ImmExtE, PCLinkE, JumpE, AltResultE);
  mux2  #(P.XLEN)  ieuresultmux(ALUResultE, AltResultE, ALUResultSrcE, IEUResultE);

  // Memory stage pipeline register
  flopenrc #(P.XLEN) SrcAMReg(clk, reset, FlushM, ~StallM, SrcAE, SrcAM);
  flopenrc #(P.XLEN) IEUResultMReg(clk, reset, FlushM, ~StallM, IEUResultE, IEUResultM);
-  flopenrc #(5)      RdMReg(clk, reset, FlushM, ~StallM, RdE, RdM);  
  flopenrc #(P.XLEN) WriteDataMReg(clk, reset, FlushM, ~StallM, ForwardedSrcBE, WriteDataM); 
  
  // Writeback stage pipeline register and logic
  flopenrc #(P.XLEN) IFResultWReg(clk, reset, FlushW, ~StallW, IFResultM, IFResultW);
-  flopenrc #(5)      RdWReg(clk, reset, FlushW, ~StallW, RdM, RdW);

  // floating point inputs: FIntResM comes from fclass, fcmp, fmv; FCvtIntResW comes from fcvt
  if (P.F_SUPPORTED) begin:fpmux
--- a/src/ieu/forward.sv
+++ b/src/ieu/forward.sv
@ -1,62 +0,0 @@
-///////////////////////////////////////////
-// forward.sv
-//
-// Written: David_Harris@hmc.edu, Sarah.Harris@unlv.edu
-// Created: 9 January 2021
-// Modified: 
-//
-// Purpose: Determine datapath forwarding
-// 
-// Documentation: RISC-V System on Chip Design Chapter 4 (Section 4.2.2.3)
-//
-// A component of the CORE-V-WALLY configurable RISC-V project.
-// 
-// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
-//
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
-//
-// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
-// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
-// may obtain a copy of the License at
-//
-// https://solderpad.org/licenses/SHL-2.1/
-//
-// Unless required by applicable law or agreed to in writing, any work distributed under the 
-// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
-// either express or implied. See the License for the specific language governing permissions 
-// and limitations under the License.
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-module forward(
-  // Detect hazards
-  input  logic [4:0]  Rs1D, Rs2D, Rs1E, Rs2E, RdE, RdM, RdW, // Source and destination registers
-  input  logic        MemReadE, MDUE, CSRReadE,              // Execute stage instruction is a load (MemReadE), divide (MDUE), or CSR read (CSRReadE)
-  input  logic        RegWriteM, RegWriteW,                  // Instruction in Memory or Writeback stage writes register file
-  input  logic        FCvtIntE,                              // FPU convert float to int
-  input  logic        SCE,                                   // Store Conditional instruction
-  // Forwarding controls
-  output logic [1:0]  ForwardAE, ForwardBE,                  // Select signals for forwarding multiplexers
-  output logic        FCvtIntStallD, LoadStallD, MDUStallD, CSRRdStallD // Stall due to conversion, load, multiply/divide, CSR read
-);
-
-  logic MatchDE;                                             // Match between a source register in Decode stage and destination register in Execute stage
-  
-  always_comb begin
-    ForwardAE = 2'b00;
-    ForwardBE = 2'b00;
-    if (Rs1E != 5'b0)
-      if      ((Rs1E == RdM) & RegWriteM) ForwardAE = 2'b10;
-      else if ((Rs1E == RdW) & RegWriteW) ForwardAE = 2'b01;
- 
-    if (Rs2E != 5'b0)
-      if      ((Rs2E == RdM) & RegWriteM) ForwardBE = 2'b10;
-      else if ((Rs2E == RdW) & RegWriteW) ForwardBE = 2'b01;
-  end
-
-  // Stall on dependent operations that finish in Mem Stage and can't bypass in time
-  assign MatchDE = ((Rs1D == RdE) | (Rs2D == RdE)) & (RdE != 5'b0); // Decode-stage instruction source depends on result from execute stage instruction
-  assign FCvtIntStallD = FCvtIntE & MatchDE; // FPU to Integer transfers have single-cycle latency except fcvt
-  assign LoadStallD = (MemReadE|SCE) & MatchDE;  
-  assign MDUStallD = MDUE & MatchDE; // Int mult/div is at least two cycle latency, even when coming from the FDIV
-  assign CSRRdStallD = CSRReadE & MatchDE;
-endmodule
--- a/src/ieu/ieu.sv
+++ b/src/ieu/ieu.sv
@ -73,8 +73,8 @@ module ieu import cvw::*;  #(parameter cvw_t P) (
  // Hazard unit signals
  input  logic              StallD, StallE, StallM, StallW,  // Stall signals from hazard unit
  input  logic              FlushD, FlushE, FlushM, FlushW,  // Flush signals
-  output logic              FCvtIntStallD, LoadStallD,       // Stall causes from IEU to hazard unit
-  output logic              MDUStallD, CSRRdStallD, StoreStallD,
+  output logic              StructuralStallD,                // IEU detects structural hazard in Decode stage
+  output logic              LoadStallD,                      // Structural stalls for load, sent to performance counters
  output logic              CSRReadM, CSRWriteM, PrivilegedM,// CSR read, CSR write, is privileged instruction
  output logic              CSRWriteFenceM                   // CSR write or fence instruction needs to flush subsequent instructions
 );
@ -94,7 +94,7 @@ module ieu import cvw::*;  #(parameter cvw_t P) (
  logic       SubArithE;                                     // Subtraction or arithmetic shift

  // Forwarding signals
-  logic [4:0] Rs1D, Rs2D, Rs1E, Rs2E;                        // Source and destination registers
+  logic [4:0] Rs1D, Rs2D;                                    // Source registers
  logic [1:0] ForwardAE, ForwardBE;                          // Select signals for forwarding multiplexers
  logic       RegWriteM, RegWriteW;                          // Register will be written in Memory, Writeback stages
  logic       MemReadE, CSRReadE;                            // Load, CSRRead instruction
@ -104,25 +104,23 @@ module ieu import cvw::*;  #(parameter cvw_t P) (
           
  controller #(P) c(
    .clk, .reset, .StallD, .FlushD, .InstrD, .STATUS_FS, .ENVCFG_CBE, .ImmSrcD,
-    .IllegalIEUFPUInstrD, .IllegalBaseInstrD, .StallE, .FlushE, .FlagsE, .FWriteIntE,
+    .IllegalIEUFPUInstrD, .IllegalBaseInstrD, 
+    .StructuralStallD, .LoadStallD, .Rs1D, .Rs2D, 
+    .StallE, .FlushE, .FlagsE, .FWriteIntE,
    .PCSrcE, .ALUSrcAE, .ALUSrcBE, .ALUResultSrcE, .ALUSelectE, .MemReadE, .CSRReadE, 
    .Funct3E, .IntDivE, .MDUE, .W64E, .SubArithE, .BranchD, .BranchE, .JumpD, .JumpE, .SCE, 
-    .BranchSignedE, .BSelectE, .ZBBSelectE, .BALUControlE, .BMUActiveE, .MDUActiveE, .CMOpM, .IFUPrefetchE, .LSUPrefetchM,
+    .BranchSignedE, .BSelectE, .ZBBSelectE, .BALUControlE, .BMUActiveE, .MDUActiveE, 
+    .FCvtIntE, .ForwardAE, .ForwardBE, .CMOpM, .IFUPrefetchE, .LSUPrefetchM,
    .StallM, .FlushM, .MemRWE, .MemRWM, .CSRReadM, .CSRWriteM, .PrivilegedM, .AtomicM, .Funct3M,
    .RegWriteM, .FlushDCacheM, .InstrValidM, .InstrValidE, .InstrValidD, .FWriteIntM,
-    .StallW, .FlushW, .RegWriteW, .IntDivW, .ResultSrcW, .CSRWriteFenceM, .InvalidateICacheM, .StoreStallD);
+    .StallW, .FlushW, .RegWriteW, .IntDivW, .ResultSrcW, .CSRWriteFenceM, .InvalidateICacheM,
+    .RdW, .RdE, .RdM);

  datapath #(P) dp(
-    .clk, .reset, .ImmSrcD, .InstrD, .StallE, .FlushE, .ForwardAE, .ForwardBE, .W64E, .SubArithE,
+    .clk, .reset, .ImmSrcD, .InstrD, .Rs1D, .Rs2D, .StallE, .FlushE, .ForwardAE, .ForwardBE, .W64E, .SubArithE,
    .Funct3E, .ALUSrcAE, .ALUSrcBE, .ALUResultSrcE, .ALUSelectE, .JumpE, .BranchSignedE, 
    .PCE, .PCLinkE, .FlagsE, .IEUAdrE, .ForwardedSrcAE, .ForwardedSrcBE, .BSelectE, .ZBBSelectE, .BALUControlE, .BMUActiveE,
    .StallM, .FlushM, .FWriteIntM, .FIntResM, .SrcAM, .WriteDataM, .FCvtIntW,
    .StallW, .FlushW, .RegWriteW, .IntDivW, .SquashSCW, .ResultSrcW, .ReadDataW, .FCvtIntResW,
-    .CSRReadValW, .MDUResultW, .FIntDivResultW, .Rs1D, .Rs2D, .Rs1E, .Rs2E, .RdE, .RdM, .RdW);             
-  
-  forward    fw(
-    .Rs1D, .Rs2D, .Rs1E, .Rs2E, .RdE, .RdM, .RdW,
-    .MemReadE, .MDUE, .CSRReadE, .RegWriteM, .RegWriteW,
-    .FCvtIntE, .SCE, .ForwardAE, .ForwardBE,
-    .FCvtIntStallD, .LoadStallD, .MDUStallD, .CSRRdStallD);
+    .CSRReadValW, .MDUResultW, .FIntDivResultW, .RdW);             
 endmodule
--- a/src/privileged/csr.sv
+++ b/src/privileged/csr.sv
@ -54,7 +54,6 @@ module csr import cvw::*;  #(parameter cvw_t P) (
  input  logic                     SelHPTW,                   // hardware page table walker active, so base endianness on supervisor mode
  // inputs for performance counters
  input  logic                     LoadStallD,
-  input  logic                     StoreStallD,
  input  logic                     ICacheStallF,
  input  logic                     DCacheStallM,
  input  logic                     BPDirPredWrongM,
@ -275,7 +274,7 @@ module csr import cvw::*;  #(parameter cvw_t P) (
  
  if (P.ZICNTR_SUPPORTED) begin:counters
    csrc #(P) counters(.clk, .reset, .StallE, .StallM, .FlushM,
-      .InstrValidNotFlushedM, .LoadStallD, .StoreStallD, .CSRWriteM, .CSRMWriteM,
+      .InstrValidNotFlushedM, .LoadStallD, .CSRWriteM, .CSRMWriteM,
      .BPDirPredWrongM, .BTAWrongM, .RASPredPCWrongM, .IClassWrongM, .BPWrongM,
      .InstrClassM, .DCacheMiss, .DCacheAccess, .ICacheMiss, .ICacheAccess, .sfencevmaM,
      .InterruptM, .ExceptionM, .InvalidateICacheM, .ICacheStallF, .DCacheStallM, .DivBusyE, .FDivBusyE,
--- a/src/privileged/csrc.sv
+++ b/src/privileged/csrc.sv
@ -32,7 +32,7 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
  input  logic              clk, reset,
  input  logic              StallE, StallM, 
  input  logic              FlushM, 
-  input  logic              InstrValidNotFlushedM, LoadStallD, StoreStallD, 
+  input  logic              InstrValidNotFlushedM, LoadStallD, 
  input  logic              CSRMWriteM, CSRWriteM,
  input  logic              BPDirPredWrongM,
  input  logic              BTAWrongM,
@ -75,7 +75,6 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
  logic [P.XLEN-1:0]       HPMCOUNTER_REGW[P.COUNTERS-1:0];
  logic [P.XLEN-1:0]       HPMCOUNTERH_REGW[P.COUNTERS-1:0];
  logic                    LoadStallE, LoadStallM;
-  logic                    StoreStallE, StoreStallM;
  logic [P.COUNTERS-1:0]   WriteHPMCOUNTERM;
  logic [P.COUNTERS-1:0]   CounterEvent;
  logic [63:0]             HPMCOUNTERPlusM[P.COUNTERS-1:0];
@ -83,8 +82,8 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
  genvar                   i;

  // Interface signals
-  flopenrc #(2) LoadStallEReg(.clk, .reset, .clear(1'b0), .en(~StallE), .d({StoreStallD, LoadStallD}), .q({StoreStallE, LoadStallE}));  // don't flush the load stall during a load stall.
-  flopenrc #(2) LoadStallMReg(.clk, .reset, .clear(FlushM), .en(~StallM), .d({StoreStallE, LoadStallE}), .q({StoreStallM, LoadStallM}));  
+  flopenrc #(1) LoadStallEReg(.clk, .reset, .clear(1'b0), .en(~StallE), .d(LoadStallD), .q(LoadStallE));  // don't flush the load stall during a load stall.
+  flopenrc #(1) LoadStallMReg(.clk, .reset, .clear(FlushM), .en(~StallM), .d(LoadStallE), .q(LoadStallM));  
  
  // Determine when to increment each counter
  assign CounterEvent[0]    = 1'b1;                                                      // MCYCLE always increments
@ -100,7 +99,7 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
    assign CounterEvent[9]  = RASPredPCWrongM & InstrValidNotFlushedM;                   // return address stack wrong address
    assign CounterEvent[10] = IClassWrongM & InstrValidNotFlushedM;                      // instruction class predictor wrong
    assign CounterEvent[11] = LoadStallM;                                                // Load Stalls. don't want to suppress on flush as this only happens if flushed.
-    assign CounterEvent[12] = StoreStallM;                                               //  Store Stall
+    assign CounterEvent[12] = 0;                                                         // depricated Store Stall
    assign CounterEvent[13] = DCacheAccess;                                              // data cache access
    assign CounterEvent[14] = DCacheMiss;                                                // data cache miss. Miss asserted 1 cycle at start of cache miss
    assign CounterEvent[15] = DCacheStallM;                                              // d cache miss cycles
--- a/src/privileged/privileged.sv
+++ b/src/privileged/privileged.sv
@ -45,7 +45,6 @@ module privileged import cvw::*;  #(parameter cvw_t P) (
  // processor events for performance counter logging                      
  input  logic              FRegWriteM,                                     // instruction will write floating-point registers
  input  logic              LoadStallD,                                     // load instruction is stalling
-  input  logic              StoreStallD,                                    // store instruction is stalling
  input  logic              ICacheStallF,                                   // I cache stalled
  input  logic              DCacheStallM,                                   // D cache stalled
  input  logic              BPDirPredWrongM,                                // branch predictor guessed wrong direction
@ -133,7 +132,7 @@ module privileged import cvw::*;  #(parameter cvw_t P) (
    .InstrM, .InstrOrigM, .PCM, .SrcAM, .IEUAdrM, 
    .CSRReadM, .CSRWriteM, .TrapM, .mretM, .sretM, .InterruptM,
    .MTimerInt, .MExtInt, .SExtInt, .MSwInt,
-    .MTIME_CLINT, .InstrValidM, .FRegWriteM, .LoadStallD, .StoreStallD,
+    .MTIME_CLINT, .InstrValidM, .FRegWriteM, .LoadStallD, 
    .BPDirPredWrongM, .BTAWrongM, .RASPredPCWrongM, .BPWrongM,
    .sfencevmaM, .ExceptionM, .InvalidateICacheM, .ICacheStallF, .DCacheStallM, .DivBusyE, .FDivBusyE,
    .IClassWrongM, .InstrClassM, .DCacheMiss, .DCacheAccess, .ICacheMiss, .ICacheAccess,
--- a/src/wally/wallypipelinedcore.sv
+++ b/src/wally/wallypipelinedcore.sv
@ -75,7 +75,8 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
  logic                          PCSrcE;
  logic                          CSRWriteFenceM;
  logic                          DivBusyE;
-  logic                          LoadStallD, StoreStallD, MDUStallD, CSRRdStallD;
+  logic                          StructuralStallD;
+  logic                          LoadStallD;
  logic                          SquashSCW;
  logic                          MDUActiveE;                      // Mul/Div instruction being executed
  logic                          ENVCFG_ADUE;                     // HPTW A/D Update enable
@ -95,7 +96,6 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
  logic                          FCvtIntW; 
  logic                          FDivBusyE;
  logic                          FRegWriteM;
-  logic                          FCvtIntStallD;
  logic                          FpLoadStoreM;
  logic [4:0]                    SetFflagsM;
  logic [P.XLEN-1:0]             FIntDivResultW;
@ -211,8 +211,8 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
     .InstrValidM, .InstrValidE, .InstrValidD, .FCvtIntResW, .FCvtIntW,
     // hazards
     .StallD, .StallE, .StallM, .StallW, .FlushD, .FlushE, .FlushM, .FlushW,
-     .FCvtIntStallD, .LoadStallD, .MDUStallD, .CSRRdStallD, .PCSrcE,
-     .CSRReadM, .CSRWriteM, .PrivilegedM, .CSRWriteFenceM, .InvalidateICacheM, .StoreStallD); 
+     .StructuralStallD, .LoadStallD, .PCSrcE,
+     .CSRReadM, .CSRWriteM, .PrivilegedM, .CSRWriteFenceM, .InvalidateICacheM); 

  lsu #(P) lsu(
    .clk, .reset, .StallM, .FlushM, .StallW, .FlushW,
@ -266,9 +266,9 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
  // global stall and flush control  
  hazard #(P) hzu(
    .BPWrongE, .CSRWriteFenceM, .RetM, .TrapM,
-    .LoadStallD, .StoreStallD, .MDUStallD, .CSRRdStallD,
+    .StructuralStallD,
    .LSUStallM, .IFUStallF,
-    .FCvtIntStallD, .FPUStallD,
+    .FPUStallD,
    .DivBusyE, .FDivBusyE,
    .wfiM, .IntPendingM,
    // Stall & flush outputs
@ -284,7 +284,7 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
      .InstrM, .InstrOrigM, .CSRReadValW, .EPCM, .TrapVectorM,
      .RetM, .TrapM, .sfencevmaM, .InvalidateICacheM, .DCacheStallM, .ICacheStallF,
      .InstrValidM, .CommittedM, .CommittedF,
-      .FRegWriteM, .LoadStallD, .StoreStallD,
+      .FRegWriteM, .LoadStallD,
      .BPDirPredWrongM, .BTAWrongM, .BPWrongM,
      .RASPredPCWrongM, .IClassWrongM, .DivBusyE, .FDivBusyE,
      .InstrClassM, .DCacheMiss, .DCacheAccess, .ICacheMiss, .ICacheAccess, .PrivilegedM,