diff --git a/src/hazard/hazard.sv b/src/hazard/hazard.sv
index 0bea0a5d0..3728ceb17 100644
--- a/src/hazard/hazard.sv
+++ b/src/hazard/hazard.sv
@@ -28,9 +28,9 @@
 
 module hazard import cvw::*;  #(parameter cvw_t P) ( 
   input  logic  BPWrongE, CSRWriteFenceM, RetM, TrapM,   
-  input  logic  LoadStallD, StoreStallD, MDUStallD, CSRRdStallD,
+  input  logic  StructuralStallD,
   input  logic  LSUStallM, IFUStallF,
-  input  logic  FCvtIntStallD, FPUStallD,
+  input  logic  FPUStallD,
   input  logic  DivBusyE, FDivBusyE,
   input  logic  wfiM, IntPendingM,
   // Stall & flush outputs
@@ -82,7 +82,7 @@ module hazard import cvw::*;  #(parameter cvw_t P) (
   //    The IFU stalls the entire pipeline rather than just Fetch to avoid complications with instructions later in the pipeline causing Exceptions
   //    A trap could be asserted at the start of a IFU/LSU stall, and should flush the memory operation
   assign StallFCause = '0;
-  assign StallDCause = (LoadStallD | StoreStallD | MDUStallD | CSRRdStallD | FCvtIntStallD | FPUStallD) & ~FlushDCause;
+  assign StallDCause = (StructuralStallD | FPUStallD) & ~FlushDCause;
   assign StallECause = (DivBusyE | FDivBusyE) & ~FlushECause; 
   assign StallMCause = WFIStallM & ~FlushMCause;
   // Need to gate IFUStallF when the equivalent FlushFCause = FlushDCause = 1.
diff --git a/src/ieu/alu.sv b/src/ieu/alu.sv
index f4618bc97..b8a0933dc 100644
--- a/src/ieu/alu.sv
+++ b/src/ieu/alu.sv
@@ -27,8 +27,8 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
-  input  logic [WIDTH-1:0] A, B,        // Operands
+module alu import cvw::*; #(parameter cvw_t P) (
+  input  logic [P.XLEN-1:0] A, B,        // Operands
   input  logic             W64,         // W64-type instruction
   input  logic             SubArith,    // Subtraction or arithmetic shift
   input  logic [2:0]       ALUSelect,   // ALU mux select signal
@@ -37,14 +37,14 @@ module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
   input  logic [2:0]       Funct3,      // For BMU decoding
   input  logic [2:0]       BALUControl, // ALU Control signals for B instructions in Execute Stage
   input  logic             BMUActiveE,  // Bit manipulation instruction being executed
-  output logic [WIDTH-1:0] ALUResult,   // ALU result
-  output logic [WIDTH-1:0] Sum);        // Sum of operands
+  output logic [P.XLEN-1:0] ALUResult,   // ALU result
+  output logic [P.XLEN-1:0] Sum);        // Sum of operands
 
   // CondInvB = ~B when subtracting, B otherwise. Shift = shift result. SLT/U = result of a slt/u instruction.
   // FullResult = ALU result before adjusting for a RV64 w-suffix instruction.
-  logic [WIDTH-1:0] CondMaskInvB, Shift, FullResult, PreALUResult;                // Intermediate Signals 
-  logic [WIDTH-1:0] CondMaskB;                                                    // Result of B mask select mux
-  logic [WIDTH-1:0] CondShiftA;                                                   // Result of A shifted select mux
+  logic [P.XLEN-1:0] CondMaskInvB, Shift, FullResult, PreALUResult;                // Intermediate Signals 
+  logic [P.XLEN-1:0] CondMaskB;                                                    // Result of B mask select mux
+  logic [P.XLEN-1:0] CondShiftA;                                                   // Result of A shifted select mux
   logic             Carry, Neg;                                                   // Flags: carry out, negative
   logic             LT, LTU;                                                      // Less than, Less than unsigned
   logic             Asign, Bsign;                                                 // Sign bits of A, B
@@ -53,7 +53,7 @@ module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
   // CondMaskB is B for add/sub, or a masked version of B for certain bit manipulation instructions
   // CondShiftA is A for add/sub or a shifted version of A for shift-and-add BMU instructions
   assign CondMaskInvB = SubArith ? ~CondMaskB : CondMaskB;
-  assign {Carry, Sum} = CondShiftA + CondMaskInvB + {{(WIDTH-1){1'b0}}, SubArith};
+  assign {Carry, Sum} = CondShiftA + CondMaskInvB + {{(P.XLEN-1){1'b0}}, SubArith};
   
   // Shifts (configurable for rotation)
   shifter #(P) sh(.A, .Amt(B[P.LOG_XLEN-1:0]), .Right(Funct3[2]), .W64, .SubArith, .Y(Shift), .Rotate(BALUControl[2]));
@@ -62,9 +62,9 @@ module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
   // Overflow occurs when the numbers being subtracted have the opposite sign 
   // and the result has the opposite sign of A.
   // LT is simplified from Overflow = Asign & Bsign & Asign & Neg; LT = Neg ^ Overflow
-  assign Neg  = Sum[WIDTH-1];
-  assign Asign = A[WIDTH-1];
-  assign Bsign = B[WIDTH-1];
+  assign Neg  = Sum[P.XLEN-1];
+  assign Asign = A[P.XLEN-1];
+  assign Bsign = B[P.XLEN-1];
   assign LT = Asign & ~Bsign | Asign & Neg | ~Bsign & Neg; 
   assign LTU = ~Carry;
  
@@ -73,21 +73,22 @@ module alu import cvw::*; #(parameter cvw_t P, parameter WIDTH) (
     case (ALUSelect)                                
       3'b000: FullResult = Sum;                           // add or sub (including address generation)
       3'b001: FullResult = Shift;                         // sll, sra, or srl
-      3'b010: FullResult = {{(WIDTH-1){1'b0}}, LT};       // slt
-      3'b011: FullResult = {{(WIDTH-1){1'b0}}, LTU};      // sltu
+      3'b010: FullResult = {{(P.XLEN-1){1'b0}}, LT};       // slt
+      3'b011: FullResult = {{(P.XLEN-1){1'b0}}, LTU};      // sltu
       3'b100: FullResult = A ^ CondMaskInvB;              // xor, xnor, binv
-      3'b101: FullResult = (P.ZBS_SUPPORTED | P.ZBB_SUPPORTED) ? {{(WIDTH-1){1'b0}},{|(A & CondMaskB)}} : Shift; // bext (or IEU shift when BMU not supported)
+      3'b101: FullResult = (P.ZBS_SUPPORTED | P.ZBB_SUPPORTED) ? {{(P.XLEN-1){1'b0}},{|(A & CondMaskB)}} : Shift; // bext (or IEU shift when BMU not supported)
       3'b110: FullResult = A | CondMaskInvB;              // or, orn, bset
       3'b111: FullResult = A & CondMaskInvB;              // and, bclr
     endcase
 
   // Support RV64I W-type addw/subw/addiw/shifts that discard upper 32 bits and sign-extend 32-bit result to 64 bits
-  if (WIDTH == 64)  assign PreALUResult = W64 ? {{32{FullResult[31]}}, FullResult[31:0]} : FullResult;
+  if (P.XLEN == 64)  assign PreALUResult = W64 ? {{32{FullResult[31]}}, FullResult[31:0]} : FullResult;
   else              assign PreALUResult = FullResult;
 
   // Final Result B instruction select mux
   if (P.ZBC_SUPPORTED | P.ZBS_SUPPORTED | P.ZBA_SUPPORTED | P.ZBB_SUPPORTED) begin : bitmanipalu
-    bitmanipalu #(P, WIDTH) balu(.A, .B, .W64, .BSelect, .ZBBSelect, .BMUActiveE,
+    bitmanipalu #(P) balu(
+      .A, .B, .W64, .BSelect, .ZBBSelect, .BMUActiveE,
       .Funct3, .LT,.LTU, .BALUControl, .PreALUResult, .FullResult,
       .CondMaskB, .CondShiftA, .ALUResult);
   end else begin
diff --git a/src/ieu/bmu/bitmanipalu.sv b/src/ieu/bmu/bitmanipalu.sv
index 706ebdb21..44c66f795 100644
--- a/src/ieu/bmu/bitmanipalu.sv
+++ b/src/ieu/bmu/bitmanipalu.sv
@@ -27,9 +27,8 @@
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
-module bitmanipalu import cvw::*; #(parameter cvw_t P, 
-                     parameter WIDTH=32) (
-  input  logic [WIDTH-1:0] A, B,                    // Operands
+module bitmanipalu import cvw::*; #(parameter cvw_t P) (
+  input  logic [P.XLEN-1:0] A, B,                    // Operands
   input  logic             W64,                     // W64-type instruction
   input  logic [1:0]       BSelect,                 // Binary encoding of if it's a ZBA_ZBB_ZBC_ZBS instruction
   input  logic [2:0]       ZBBSelect,               // ZBB mux select signal
@@ -38,37 +37,37 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P,
   input  logic             LTU,                     // less than unsigned flag
   input  logic [2:0]       BALUControl,             // ALU Control signals for B instructions in Execute Stage
   input  logic             BMUActiveE,              // Bit manipulation instruction being executed
-  input  logic [WIDTH-1:0] PreALUResult, FullResult,// PreALUResult, FullResult signals
-  output logic [WIDTH-1:0] CondMaskB,               // B is conditionally masked for ZBS instructions
-  output logic [WIDTH-1:0] CondShiftA,              // A is conditionally shifted for ShAdd instructions
-  output logic [WIDTH-1:0] ALUResult);              // Result
+  input  logic [P.XLEN-1:0] PreALUResult, FullResult,// PreALUResult, FullResult signals
+  output logic [P.XLEN-1:0] CondMaskB,               // B is conditionally masked for ZBS instructions
+  output logic [P.XLEN-1:0] CondShiftA,              // A is conditionally shifted for ShAdd instructions
+  output logic [P.XLEN-1:0] ALUResult);              // Result
 
-  logic [WIDTH-1:0] ZBBResult, ZBCResult;           // ZBB, ZBC Result
-  logic [WIDTH-1:0] MaskB;                          // BitMask of B
-  logic [WIDTH-1:0] RevA;                           // Bit-reversed A
+  logic [P.XLEN-1:0] ZBBResult, ZBCResult;           // ZBB, ZBC Result
+  logic [P.XLEN-1:0] MaskB;                          // BitMask of B
+  logic [P.XLEN-1:0] RevA;                           // Bit-reversed A
   logic             Rotate;                         // Indicates if it is Rotate instruction
   logic             Mask;                           // Indicates if it is ZBS instruction
   logic             PreShift;                       // Inidicates if it is sh1add, sh2add, sh3add instruction
   logic [1:0]       PreShiftAmt;                    // Amount to Pre-Shift A 
-  logic [WIDTH-1:0] CondZextA;                      // A Conditional Extend Intermediary Signal
-  logic [WIDTH-1:0] ABMU, BBMU;                     // Gated data inputs to reduce BMU activity
+  logic [P.XLEN-1:0] CondZextA;                      // A Conditional Extend Intermediary Signal
+  logic [P.XLEN-1:0] ABMU, BBMU;                     // Gated data inputs to reduce BMU activity
 
   // gate data inputs to BMU to only operate when BMU is active
-  assign ABMU = A & {WIDTH{BMUActiveE}};
-  assign BBMU = B & {WIDTH{BMUActiveE}};
+  assign ABMU = A & {P.XLEN{BMUActiveE}};
+  assign BBMU = B & {P.XLEN{BMUActiveE}};
 
   // Extract control signals from bitmanip ALUControl.
   assign {Mask, PreShift} = BALUControl[1:0];
 
   // Mask Generation Mux
   if (P.ZBS_SUPPORTED) begin: zbsdec
-    decoder #($clog2(WIDTH)) maskgen(BBMU[$clog2(WIDTH)-1:0], MaskB);
-    mux2 #(WIDTH) maskmux(B, MaskB, Mask, CondMaskB);
+    decoder #($clog2(P.XLEN)) maskgen(BBMU[$clog2(P.XLEN)-1:0], MaskB);
+    mux2 #(P.XLEN) maskmux(B, MaskB, Mask, CondMaskB);
   end else assign CondMaskB = B;
  
   // 0-3 bit Pre-Shift Mux
   if (P.ZBA_SUPPORTED) begin: zbapreshift
-    if (WIDTH == 64) begin
+    if (P.XLEN == 64) begin
       mux2 #(64) zextmux(A, {{32{1'b0}}, A[31:0]}, W64, CondZextA); 
     end else assign CondZextA = A;
     assign PreShiftAmt = Funct3[2:1] & {2{PreShift}};
@@ -80,17 +79,17 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P,
 
   // Bit reverse needed for some ZBB, ZBC instructions
   if (P.ZBC_SUPPORTED | P.ZBB_SUPPORTED) begin: bitreverse
-    bitreverse #(WIDTH) brA(.A(ABMU), .RevA);
+    bitreverse #(P.XLEN) brA(.A(ABMU), .RevA);
   end
 
   // ZBC Unit
   if (P.ZBC_SUPPORTED) begin: zbc
-    zbc #(WIDTH) ZBC(.A(ABMU), .RevA, .B(BBMU), .Funct3, .ZBCResult);
+    zbc #(P.XLEN) ZBC(.A(ABMU), .RevA, .B(BBMU), .Funct3, .ZBCResult);
   end else assign ZBCResult = 0;
 
   // ZBB Unit
   if (P.ZBB_SUPPORTED) begin: zbb
-    zbb #(WIDTH) ZBB(.A(ABMU), .RevA, .B(BBMU), .W64, .LT, .LTU, .BUnsigned(Funct3[0]), .ZBBSelect, .ZBBResult);
+    zbb #(P.XLEN) ZBB(.A(ABMU), .RevA, .B(BBMU), .W64, .LT, .LTU, .BUnsigned(Funct3[0]), .ZBBSelect, .ZBBResult);
   end else assign ZBBResult = 0;
 
   // Result Select Mux
diff --git a/src/ieu/controller.sv b/src/ieu/controller.sv
index 4fc3ac9e7..1285ab4cc 100644
--- a/src/ieu/controller.sv
+++ b/src/ieu/controller.sv
@@ -39,10 +39,14 @@ module controller import cvw::*;  #(parameter cvw_t P) (
   output logic        IllegalBaseInstrD,       // Illegal I-type instruction, or illegal RV32 access to upper 16 registers
   output logic        JumpD,                   // Jump instruction
   output logic        BranchD,                 // Branch instruction
-   // Execute stage control signals             
+  output logic        StructuralStallD,        // Structural stalls detected by controller
+  output logic        LoadStallD,              // Load dependency stall in Decode stage, also sent to performance counters
+  output logic [4:0]  Rs1D, Rs2D,              // Register sources to read in Decode or Execute stage
+  // Execute stage control signals             
   input  logic        StallE, FlushE,          // Stall, flush Execute stage
   input  logic [1:0]  FlagsE,                  // Comparison flags ({eq, lt})
   input  logic        FWriteIntE,              // Write integer register, coming from FPU controller
+  input  logic        FCvtIntE,                // FPU float-to-int conversion in Execute stage
   output logic        PCSrcE,                  // Select signal to choose next PC (for datapath and Hazard unit)
   output logic        ALUSrcAE, ALUSrcBE,      // ALU operands
   output logic        ALUResultSrcE,           // Selects result to pass on to Memory stage
@@ -65,7 +69,7 @@ module controller import cvw::*;  #(parameter cvw_t P) (
   output logic [3:0]  CMOpM,                   // 1: cbo.inval; 2: cbo.flush; 4: cbo.clean; 8: cbo.zero
   output logic        IFUPrefetchE,            // instruction prefetch
   output logic        LSUPrefetchM,            // data prefetch
-
+  output logic [1:0]  ForwardAE, ForwardBE,    // Select signals for forwarding multiplexers
   // Memory stage control signals
   input  logic        StallM, FlushM,          // Stall, flush Memory stage
   output logic [1:0]  MemRWE,                  // Mem read/write: MemRWM[1] = 1 for read, MemRWM[0] = 1 for write 
@@ -83,14 +87,16 @@ module controller import cvw::*;  #(parameter cvw_t P) (
   output logic [2:0]  ResultSrcW,              // Select source of result to write back to register file
   // Stall during CSRs
   output logic        CSRWriteFenceM,          // CSR write or fence instruction; needs to flush the following instructions
-  output logic        StoreStallD              // Store (memory write) causes stall
+  output logic [4:0]  RdE, RdM,                // Pipelined destination registers
+  // Forwarding controls
+  output logic [4:0]  RdW                      // Destination register in Writeback stage
 );
 
-
+  logic [4:0] Rs1E, Rs2E;                      // pipelined register sources
   logic [6:0] OpD;                             // Opcode in Decode stage
   logic [2:0] Funct3D;                         // Funct3 field in Decode stage
   logic [6:0] Funct7D;                         // Funct7 field in Decode stage
-  logic [4:0] Rs1D, Rs2D, RdD;                 // Rs1/2 source register / dest reg in Decode stage
+  logic [4:0] RdD;                             // Destination register in Decode stage
 
   `define CTRLW 24
 
@@ -146,6 +152,9 @@ module controller import cvw::*;  #(parameter cvw_t P) (
   logic [3:0]  CMOpD, CMOpE;                   // which CMO instruction 1: cbo.inval; 2: cbo.flush; 4: cbo.clean; 8: cbo.zero
   logic        IFUPrefetchD;                   // instruction prefetch
   logic        LSUPrefetchD, LSUPrefetchE;     // data prefetch
+  logic        AMOStallD, CMOStallD;           // Structural hazards from atomic and cache management ops
+  logic        MatchDE;                        // Match between a source register in Decode stage and destination register in Execute stage
+  logic        FCvtIntStallD, MDUStallD, CSRRdStallD; // Stall due to FP-to-int conversion, multiply/divide, CSR read
 
   // Extract fields
   assign OpD     = InstrD[6:0];
@@ -394,6 +403,9 @@ module controller import cvw::*;  #(parameter cvw_t P) (
   flopenrc #(35) controlregE(clk, reset, FlushE, ~StallE,
                            {ALUSelectD, RegWriteD, ResultSrcD, MemRWD, JumpD, BranchD, ALUSrcAD, ALUSrcBD, ALUResultSrcD, CSRReadD, CSRWriteD, PrivilegedD, Funct3D, W64D, SubArithD, MDUD, AtomicD, InvalidateICacheD, FlushDCacheD, FenceD, CMOpD, IFUPrefetchD, LSUPrefetchD, InstrValidD},
                            {ALUSelectE, IEURegWriteE, ResultSrcE, MemRWE, JumpE, BranchE, ALUSrcAE, ALUSrcBE, ALUResultSrcE, CSRReadE, CSRWriteE, PrivilegedE, Funct3E, W64E, SubArithE, MDUE, AtomicE, InvalidateICacheE, FlushDCacheE, FenceE, CMOpE, IFUPrefetchE, LSUPrefetchE, InstrValidE});
+  flopenrc #(5)      Rs1EReg(clk, reset, FlushE, ~StallE, Rs1D, Rs1E);
+  flopenrc #(5)      Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
+  flopenrc #(5)      RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);
 
   // Branch Logic
   //  The comparator handles both signed and unsigned branches using BranchSignedE
@@ -415,22 +427,45 @@ module controller import cvw::*;  #(parameter cvw_t P) (
   flopenrc #(25) controlregM(clk, reset, FlushM, ~StallM,
                          {RegWriteE, ResultSrcE, MemRWE, CSRReadE, CSRWriteE, PrivilegedE, Funct3E, FWriteIntE, AtomicE, InvalidateICacheE, FlushDCacheE, FenceE, InstrValidE, IntDivE, CMOpE, LSUPrefetchE},
                          {RegWriteM, ResultSrcM, MemRWM, CSRReadM, CSRWriteM, PrivilegedM, Funct3M, FWriteIntM, AtomicM, InvalidateICacheM, FlushDCacheM, FenceM, InstrValidM, IntDivM, CMOpM, LSUPrefetchM});
-  
+  flopenrc #(5)      RdMReg(clk, reset, FlushM, ~StallM, RdE, RdM);  
+
   // Writeback stage pipeline control register
   flopenrc #(5) controlregW(clk, reset, FlushW, ~StallW,
                          {RegWriteM, ResultSrcM, IntDivM},
                          {RegWriteW, ResultSrcW, IntDivW});  
+  flopenrc #(5)      RdWReg(clk, reset, FlushW, ~StallW, RdM, RdW);
 
   // Flush F, D, and E stages on a CSR write or Fence.I or SFence.VMA
   assign CSRWriteFenceM = CSRWriteM | FenceM;
 
+  // Forwarding logic
+  always_comb begin
+    ForwardAE = 2'b00;
+    ForwardBE = 2'b00;
+    if (Rs1E != 5'b0)
+      if      ((Rs1E == RdM) & RegWriteM) ForwardAE = 2'b10;
+      else if ((Rs1E == RdW) & RegWriteW) ForwardAE = 2'b01;
+ 
+    if (Rs2E != 5'b0)
+      if      ((Rs2E == RdM) & RegWriteM) ForwardBE = 2'b10;
+      else if ((Rs2E == RdW) & RegWriteW) ForwardBE = 2'b01;
+  end
+
+  // Stall on dependent operations that finish in Mem Stage and can't bypass in time
+  assign MatchDE = ((Rs1D == RdE) | (Rs2D == RdE)) & (RdE != 5'b0); // Decode-stage instruction source depends on result from execute stage instruction
+  assign FCvtIntStallD = FCvtIntE & MatchDE; // FPU to Integer transfers have single-cycle latency except fcvt
+  assign LoadStallD = (MemReadE|SCE) & MatchDE;  
+  assign MDUStallD = MDUE & MatchDE; // Int mult/div is at least two cycle latency, even when coming from the FDIV
+  assign CSRRdStallD = CSRReadE & MatchDE;
+
   // the synchronous DTIM cannot read immediately after write
   // a cache cannot read or write immediately after a write
-  // atomic operations are also detected as MemRWD[1]
+  // atomic operations are also detected as MemRWD[1] ***check: the stall below additionally requires &MemRWE
   // *** RT: Remove this after updating the cache.
   // *** RT: Check that atomic after atomic works correctly.
-  //assign StoreStallD = ((|CMOpE)) & ((|CMOpD));
-  logic AMOHazard;
-  assign AMOHazard = &MemRWE & MemRWD[1];
-  assign StoreStallD = ((|CMOpE) & (|CMOpD)) | AMOHazard;
+  assign AMOStallD = &MemRWE & MemRWD[1]; // Read after atomic operation causes structural hazard
+  assign CMOStallD = (|CMOpE) & (|CMOpD); // CMO op after CMO op causes structural hazard ***explain why this doesn't interact with read/write
+
+  // Decode stage stalls if any of these register-dependence (load, MDU, CSR read, fcvt) or structural (AMO, CMO) hazards occur
+  assign StructuralStallD = LoadStallD | MDUStallD | CSRRdStallD | FCvtIntStallD | AMOStallD | CMOStallD;
 endmodule
diff --git a/src/ieu/datapath.sv b/src/ieu/datapath.sv
index 8c366a2ef..126410238 100644
--- a/src/ieu/datapath.sv
+++ b/src/ieu/datapath.sv
@@ -32,6 +32,7 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
   // Decode stage signals
   input  logic [2:0]        ImmSrcD,                 // Selects type of immediate extension
   input  logic [31:0]       InstrD,                  // Instruction in Decode stage
+  input  logic [4:0]        Rs1D, Rs2D,              // Source registers
   // Execute stage signals
   input  logic [P.XLEN-1:0] PCE,                     // PC in Execute stage  
   input  logic [P.XLEN-1:0] PCLinkE,                 // PC + 4 (of instruction in Execute stage)
@@ -68,9 +69,8 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.XLEN-1:0] CSRReadValW,             // CSR read result
   input  logic [P.XLEN-1:0] MDUResultW,              // MDU (Multiply/divide unit) result
   input  logic [P.XLEN-1:0] FIntDivResultW,          // FPU's integer divide result
+  input  logic [4:0]        RdW                      // Destination register
    // Hazard Unit signals 
-  output logic [4:0]        Rs1D, Rs2D, Rs1E, Rs2E,  // Register sources to read in Decode or Execute stage
-  output logic [4:0]        RdE, RdM, RdW            // Register destinations in Execute, Memory, or Writeback stage
 );
 
   // Fetch stage signals
@@ -94,9 +94,6 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
   logic [P.XLEN-1:0] MulDivResultW;                  // Multiply always comes from MDU.  Divide could come from MDU or FPU (when using fdivsqrt for integer division)
 
   // Decode stage
-  assign Rs1D      = InstrD[19:15];
-  assign Rs2D      = InstrD[24:20];
-  assign RdD       = InstrD[11:7];
   regfile #(P.XLEN, P.E_SUPPORTED) regf(clk, reset, RegWriteW, Rs1D, Rs2D, RdW, ResultW, R1D, R2D);
   extend #(P)        ext(.InstrD(InstrD[31:7]), .ImmSrcD, .ImmExtD);
  
@@ -104,28 +101,23 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
   flopenrc #(P.XLEN) RD1EReg(clk, reset, FlushE, ~StallE, R1D, R1E);
   flopenrc #(P.XLEN) RD2EReg(clk, reset, FlushE, ~StallE, R2D, R2E);
   flopenrc #(P.XLEN) ImmExtEReg(clk, reset, FlushE, ~StallE, ImmExtD, ImmExtE);
-  flopenrc #(5)      Rs1EReg(clk, reset, FlushE, ~StallE, Rs1D, Rs1E);
-  flopenrc #(5)      Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
-  flopenrc #(5)      RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);
   
   mux3  #(P.XLEN)  faemux(R1E, ResultW, IFResultM, ForwardAE, ForwardedSrcAE);
   mux3  #(P.XLEN)  fbemux(R2E, ResultW, IFResultM, ForwardBE, ForwardedSrcBE);
   comparator #(P.XLEN) comp(ForwardedSrcAE, ForwardedSrcBE, BranchSignedE, FlagsE);
   mux2  #(P.XLEN)  srcamux(ForwardedSrcAE, PCE, ALUSrcAE, SrcAE);
   mux2  #(P.XLEN)  srcbmux(ForwardedSrcBE, ImmExtE, ALUSrcBE, SrcBE);
-  alu   #(P, P.XLEN)  alu(SrcAE, SrcBE, W64E, SubArithE, ALUSelectE, BSelectE, ZBBSelectE, Funct3E, BALUControlE, BMUActiveE, ALUResultE, IEUAdrE);
+  alu   #(P)       alu(SrcAE, SrcBE, W64E, SubArithE, ALUSelectE, BSelectE, ZBBSelectE, Funct3E, BALUControlE, BMUActiveE, ALUResultE, IEUAdrE);
   mux2  #(P.XLEN)  altresultmux(ImmExtE, PCLinkE, JumpE, AltResultE);
   mux2  #(P.XLEN)  ieuresultmux(ALUResultE, AltResultE, ALUResultSrcE, IEUResultE);
 
   // Memory stage pipeline register
   flopenrc #(P.XLEN) SrcAMReg(clk, reset, FlushM, ~StallM, SrcAE, SrcAM);
   flopenrc #(P.XLEN) IEUResultMReg(clk, reset, FlushM, ~StallM, IEUResultE, IEUResultM);
-  flopenrc #(5)      RdMReg(clk, reset, FlushM, ~StallM, RdE, RdM);  
   flopenrc #(P.XLEN) WriteDataMReg(clk, reset, FlushM, ~StallM, ForwardedSrcBE, WriteDataM); 
   
   // Writeback stage pipeline register and logic
   flopenrc #(P.XLEN) IFResultWReg(clk, reset, FlushW, ~StallW, IFResultM, IFResultW);
-  flopenrc #(5)      RdWReg(clk, reset, FlushW, ~StallW, RdM, RdW);
 
   // floating point inputs: FIntResM comes from fclass, fcmp, fmv; FCvtIntResW comes from fcvt
   if (P.F_SUPPORTED) begin:fpmux
diff --git a/src/ieu/forward.sv b/src/ieu/forward.sv
deleted file mode 100644
index ef3cd4b4b..000000000
--- a/src/ieu/forward.sv
+++ /dev/null
@@ -1,62 +0,0 @@
-///////////////////////////////////////////
-// forward.sv
-//
-// Written: David_Harris@hmc.edu, Sarah.Harris@unlv.edu
-// Created: 9 January 2021
-// Modified: 
-//
-// Purpose: Determine datapath forwarding
-// 
-// Documentation: RISC-V System on Chip Design Chapter 4 (Section 4.2.2.3)
-//
-// A component of the CORE-V-WALLY configurable RISC-V project.
-// 
-// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
-//
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
-//
-// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
-// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
-// may obtain a copy of the License at
-//
-// https://solderpad.org/licenses/SHL-2.1/
-//
-// Unless required by applicable law or agreed to in writing, any work distributed under the 
-// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
-// either express or implied. See the License for the specific language governing permissions 
-// and limitations under the License.
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-module forward(
-  // Detect hazards
-  input  logic [4:0]  Rs1D, Rs2D, Rs1E, Rs2E, RdE, RdM, RdW, // Source and destination registers
-  input  logic        MemReadE, MDUE, CSRReadE,              // Execute stage instruction is a load (MemReadE), divide (MDUE), or CSR read (CSRReadE)
-  input  logic        RegWriteM, RegWriteW,                  // Instruction in Memory or Writeback stage writes register file
-  input  logic        FCvtIntE,                              // FPU convert float to int
-  input  logic        SCE,                                   // Store Conditional instruction
-  // Forwarding controls
-  output logic [1:0]  ForwardAE, ForwardBE,                  // Select signals for forwarding multiplexers
-  output logic        FCvtIntStallD, LoadStallD, MDUStallD, CSRRdStallD // Stall due to conversion, load, multiply/divide, CSR read
-);
-
-  logic MatchDE;                                             // Match between a source register in Decode stage and destination register in Execute stage
-  
-  always_comb begin
-    ForwardAE = 2'b00;
-    ForwardBE = 2'b00;
-    if (Rs1E != 5'b0)
-      if      ((Rs1E == RdM) & RegWriteM) ForwardAE = 2'b10;
-      else if ((Rs1E == RdW) & RegWriteW) ForwardAE = 2'b01;
- 
-    if (Rs2E != 5'b0)
-      if      ((Rs2E == RdM) & RegWriteM) ForwardBE = 2'b10;
-      else if ((Rs2E == RdW) & RegWriteW) ForwardBE = 2'b01;
-  end
-
-  // Stall on dependent operations that finish in Mem Stage and can't bypass in time
-  assign MatchDE = ((Rs1D == RdE) | (Rs2D == RdE)) & (RdE != 5'b0); // Decode-stage instruction source depends on result from execute stage instruction
-  assign FCvtIntStallD = FCvtIntE & MatchDE; // FPU to Integer transfers have single-cycle latency except fcvt
-  assign LoadStallD = (MemReadE|SCE) & MatchDE;  
-  assign MDUStallD = MDUE & MatchDE; // Int mult/div is at least two cycle latency, even when coming from the FDIV
-  assign CSRRdStallD = CSRReadE & MatchDE;
-endmodule
diff --git a/src/ieu/ieu.sv b/src/ieu/ieu.sv
index 55cfdf854..b8b3c200d 100644
--- a/src/ieu/ieu.sv
+++ b/src/ieu/ieu.sv
@@ -73,8 +73,8 @@ module ieu import cvw::*;  #(parameter cvw_t P) (
   // Hazard unit signals
   input  logic              StallD, StallE, StallM, StallW,  // Stall signals from hazard unit
   input  logic              FlushD, FlushE, FlushM, FlushW,  // Flush signals
-  output logic              FCvtIntStallD, LoadStallD,       // Stall causes from IEU to hazard unit
-  output logic              MDUStallD, CSRRdStallD, StoreStallD,
+  output logic              StructuralStallD,                // IEU detects structural hazard in Decode stage
+  output logic              LoadStallD,                      // Load dependency stall in Decode stage, also sent to performance counters
   output logic              CSRReadM, CSRWriteM, PrivilegedM,// CSR read, CSR write, is privileged instruction
   output logic              CSRWriteFenceM                   // CSR write or fence instruction needs to flush subsequent instructions
 );
@@ -94,7 +94,7 @@ module ieu import cvw::*;  #(parameter cvw_t P) (
   logic       SubArithE;                                     // Subtraction or arithmetic shift
 
   // Forwarding signals
-  logic [4:0] Rs1D, Rs2D, Rs1E, Rs2E;                        // Source and destination registers
+  logic [4:0] Rs1D, Rs2D;                                    // Source registers
   logic [1:0] ForwardAE, ForwardBE;                          // Select signals for forwarding multiplexers
   logic       RegWriteM, RegWriteW;                          // Register will be written in Memory, Writeback stages
   logic       MemReadE, CSRReadE;                            // Load, CSRRead instruction
@@ -104,25 +104,23 @@ module ieu import cvw::*;  #(parameter cvw_t P) (
            
   controller #(P) c(
     .clk, .reset, .StallD, .FlushD, .InstrD, .STATUS_FS, .ENVCFG_CBE, .ImmSrcD,
-    .IllegalIEUFPUInstrD, .IllegalBaseInstrD, .StallE, .FlushE, .FlagsE, .FWriteIntE,
+    .IllegalIEUFPUInstrD, .IllegalBaseInstrD, 
+    .StructuralStallD, .LoadStallD, .Rs1D, .Rs2D, 
+    .StallE, .FlushE, .FlagsE, .FWriteIntE,
     .PCSrcE, .ALUSrcAE, .ALUSrcBE, .ALUResultSrcE, .ALUSelectE, .MemReadE, .CSRReadE, 
     .Funct3E, .IntDivE, .MDUE, .W64E, .SubArithE, .BranchD, .BranchE, .JumpD, .JumpE, .SCE, 
-    .BranchSignedE, .BSelectE, .ZBBSelectE, .BALUControlE, .BMUActiveE, .MDUActiveE, .CMOpM, .IFUPrefetchE, .LSUPrefetchM,
+    .BranchSignedE, .BSelectE, .ZBBSelectE, .BALUControlE, .BMUActiveE, .MDUActiveE, 
+    .FCvtIntE, .ForwardAE, .ForwardBE, .CMOpM, .IFUPrefetchE, .LSUPrefetchM,
     .StallM, .FlushM, .MemRWE, .MemRWM, .CSRReadM, .CSRWriteM, .PrivilegedM, .AtomicM, .Funct3M,
     .RegWriteM, .FlushDCacheM, .InstrValidM, .InstrValidE, .InstrValidD, .FWriteIntM,
-    .StallW, .FlushW, .RegWriteW, .IntDivW, .ResultSrcW, .CSRWriteFenceM, .InvalidateICacheM, .StoreStallD);
+    .StallW, .FlushW, .RegWriteW, .IntDivW, .ResultSrcW, .CSRWriteFenceM, .InvalidateICacheM,
+    .RdW, .RdE, .RdM);
 
   datapath #(P) dp(
-    .clk, .reset, .ImmSrcD, .InstrD, .StallE, .FlushE, .ForwardAE, .ForwardBE, .W64E, .SubArithE,
+    .clk, .reset, .ImmSrcD, .InstrD, .Rs1D, .Rs2D, .StallE, .FlushE, .ForwardAE, .ForwardBE, .W64E, .SubArithE,
     .Funct3E, .ALUSrcAE, .ALUSrcBE, .ALUResultSrcE, .ALUSelectE, .JumpE, .BranchSignedE, 
     .PCE, .PCLinkE, .FlagsE, .IEUAdrE, .ForwardedSrcAE, .ForwardedSrcBE, .BSelectE, .ZBBSelectE, .BALUControlE, .BMUActiveE,
     .StallM, .FlushM, .FWriteIntM, .FIntResM, .SrcAM, .WriteDataM, .FCvtIntW,
     .StallW, .FlushW, .RegWriteW, .IntDivW, .SquashSCW, .ResultSrcW, .ReadDataW, .FCvtIntResW,
-    .CSRReadValW, .MDUResultW, .FIntDivResultW, .Rs1D, .Rs2D, .Rs1E, .Rs2E, .RdE, .RdM, .RdW);             
-  
-  forward    fw(
-    .Rs1D, .Rs2D, .Rs1E, .Rs2E, .RdE, .RdM, .RdW,
-    .MemReadE, .MDUE, .CSRReadE, .RegWriteM, .RegWriteW,
-    .FCvtIntE, .SCE, .ForwardAE, .ForwardBE,
-    .FCvtIntStallD, .LoadStallD, .MDUStallD, .CSRRdStallD);
+    .CSRReadValW, .MDUResultW, .FIntDivResultW, .RdW);             
 endmodule
diff --git a/src/privileged/csr.sv b/src/privileged/csr.sv
index edb27155c..e43712d81 100644
--- a/src/privileged/csr.sv
+++ b/src/privileged/csr.sv
@@ -54,7 +54,6 @@ module csr import cvw::*;  #(parameter cvw_t P) (
   input  logic                     SelHPTW,                   // hardware page table walker active, so base endianness on supervisor mode
   // inputs for performance counters
   input  logic                     LoadStallD,
-  input  logic                     StoreStallD,
   input  logic                     ICacheStallF,
   input  logic                     DCacheStallM,
   input  logic                     BPDirPredWrongM,
@@ -275,7 +274,7 @@ module csr import cvw::*;  #(parameter cvw_t P) (
   
   if (P.ZICNTR_SUPPORTED) begin:counters
     csrc #(P) counters(.clk, .reset, .StallE, .StallM, .FlushM,
-      .InstrValidNotFlushedM, .LoadStallD, .StoreStallD, .CSRWriteM, .CSRMWriteM,
+      .InstrValidNotFlushedM, .LoadStallD, .CSRWriteM, .CSRMWriteM,
       .BPDirPredWrongM, .BTAWrongM, .RASPredPCWrongM, .IClassWrongM, .BPWrongM,
       .InstrClassM, .DCacheMiss, .DCacheAccess, .ICacheMiss, .ICacheAccess, .sfencevmaM,
       .InterruptM, .ExceptionM, .InvalidateICacheM, .ICacheStallF, .DCacheStallM, .DivBusyE, .FDivBusyE,
diff --git a/src/privileged/csrc.sv b/src/privileged/csrc.sv
index 2944b1a66..c3dbf1f6b 100644
--- a/src/privileged/csrc.sv
+++ b/src/privileged/csrc.sv
@@ -32,7 +32,7 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
   input  logic              clk, reset,
   input  logic              StallE, StallM, 
   input  logic              FlushM, 
-  input  logic              InstrValidNotFlushedM, LoadStallD, StoreStallD, 
+  input  logic              InstrValidNotFlushedM, LoadStallD, 
   input  logic              CSRMWriteM, CSRWriteM,
   input  logic              BPDirPredWrongM,
   input  logic              BTAWrongM,
@@ -75,7 +75,6 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
   logic [P.XLEN-1:0]       HPMCOUNTER_REGW[P.COUNTERS-1:0];
   logic [P.XLEN-1:0]       HPMCOUNTERH_REGW[P.COUNTERS-1:0];
   logic                    LoadStallE, LoadStallM;
-  logic                    StoreStallE, StoreStallM;
   logic [P.COUNTERS-1:0]   WriteHPMCOUNTERM;
   logic [P.COUNTERS-1:0]   CounterEvent;
   logic [63:0]             HPMCOUNTERPlusM[P.COUNTERS-1:0];
@@ -83,8 +82,8 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
   genvar                   i;
 
   // Interface signals
-  flopenrc #(2) LoadStallEReg(.clk, .reset, .clear(1'b0), .en(~StallE), .d({StoreStallD, LoadStallD}), .q({StoreStallE, LoadStallE}));  // don't flush the load stall during a load stall.
-  flopenrc #(2) LoadStallMReg(.clk, .reset, .clear(FlushM), .en(~StallM), .d({StoreStallE, LoadStallE}), .q({StoreStallM, LoadStallM}));  
+  flopenrc #(1) LoadStallEReg(.clk, .reset, .clear(1'b0), .en(~StallE), .d(LoadStallD), .q(LoadStallE));  // don't flush the load stall during a load stall.
+  flopenrc #(1) LoadStallMReg(.clk, .reset, .clear(FlushM), .en(~StallM), .d(LoadStallE), .q(LoadStallM));  
   
   // Determine when to increment each counter
   assign CounterEvent[0]    = 1'b1;                                                      // MCYCLE always increments
@@ -100,7 +99,7 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
     assign CounterEvent[9]  = RASPredPCWrongM & InstrValidNotFlushedM;                   // return address stack wrong address
     assign CounterEvent[10] = IClassWrongM & InstrValidNotFlushedM;                      // instruction class predictor wrong
     assign CounterEvent[11] = LoadStallM;                                                // Load Stalls. don't want to suppress on flush as this only happens if flushed.
-    assign CounterEvent[12] = StoreStallM;                                               //  Store Stall
+    assign CounterEvent[12] = 0;                                                         // deprecated Store Stall
     assign CounterEvent[13] = DCacheAccess;                                              // data cache access
     assign CounterEvent[14] = DCacheMiss;                                                // data cache miss. Miss asserted 1 cycle at start of cache miss
     assign CounterEvent[15] = DCacheStallM;                                              // d cache miss cycles
diff --git a/src/privileged/privileged.sv b/src/privileged/privileged.sv
index 4c27df006..5692fc9f8 100644
--- a/src/privileged/privileged.sv
+++ b/src/privileged/privileged.sv
@@ -45,7 +45,6 @@ module privileged import cvw::*;  #(parameter cvw_t P) (
   // processor events for performance counter logging                      
   input  logic              FRegWriteM,                                     // instruction will write floating-point registers
   input  logic              LoadStallD,                                     // load instruction is stalling
-  input  logic              StoreStallD,                                    // store instruction is stalling
   input  logic              ICacheStallF,                                   // I cache stalled
   input  logic              DCacheStallM,                                   // D cache stalled
   input  logic              BPDirPredWrongM,                                // branch predictor guessed wrong direction
@@ -133,7 +132,7 @@ module privileged import cvw::*;  #(parameter cvw_t P) (
     .InstrM, .InstrOrigM, .PCM, .SrcAM, .IEUAdrM, 
     .CSRReadM, .CSRWriteM, .TrapM, .mretM, .sretM, .InterruptM,
     .MTimerInt, .MExtInt, .SExtInt, .MSwInt,
-    .MTIME_CLINT, .InstrValidM, .FRegWriteM, .LoadStallD, .StoreStallD,
+    .MTIME_CLINT, .InstrValidM, .FRegWriteM, .LoadStallD, 
     .BPDirPredWrongM, .BTAWrongM, .RASPredPCWrongM, .BPWrongM,
     .sfencevmaM, .ExceptionM, .InvalidateICacheM, .ICacheStallF, .DCacheStallM, .DivBusyE, .FDivBusyE,
     .IClassWrongM, .InstrClassM, .DCacheMiss, .DCacheAccess, .ICacheMiss, .ICacheAccess,
diff --git a/src/wally/wallypipelinedcore.sv b/src/wally/wallypipelinedcore.sv
index f861a08bc..78cec0665 100644
--- a/src/wally/wallypipelinedcore.sv
+++ b/src/wally/wallypipelinedcore.sv
@@ -75,7 +75,8 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
   logic                          PCSrcE;
   logic                          CSRWriteFenceM;
   logic                          DivBusyE;
-  logic                          LoadStallD, StoreStallD, MDUStallD, CSRRdStallD;
+  logic                          StructuralStallD;
+  logic                          LoadStallD;
   logic                          SquashSCW;
   logic                          MDUActiveE;                      // Mul/Div instruction being executed
   logic                          ENVCFG_ADUE;                     // HPTW A/D Update enable
@@ -95,7 +96,6 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
   logic                          FCvtIntW; 
   logic                          FDivBusyE;
   logic                          FRegWriteM;
-  logic                          FCvtIntStallD;
   logic                          FpLoadStoreM;
   logic [4:0]                    SetFflagsM;
   logic [P.XLEN-1:0]             FIntDivResultW;
@@ -211,8 +211,8 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
      .InstrValidM, .InstrValidE, .InstrValidD, .FCvtIntResW, .FCvtIntW,
      // hazards
      .StallD, .StallE, .StallM, .StallW, .FlushD, .FlushE, .FlushM, .FlushW,
-     .FCvtIntStallD, .LoadStallD, .MDUStallD, .CSRRdStallD, .PCSrcE,
-     .CSRReadM, .CSRWriteM, .PrivilegedM, .CSRWriteFenceM, .InvalidateICacheM, .StoreStallD); 
+     .StructuralStallD, .LoadStallD, .PCSrcE,
+     .CSRReadM, .CSRWriteM, .PrivilegedM, .CSRWriteFenceM, .InvalidateICacheM); 
 
   lsu #(P) lsu(
     .clk, .reset, .StallM, .FlushM, .StallW, .FlushW,
@@ -266,9 +266,9 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
   // global stall and flush control  
   hazard #(P) hzu(
     .BPWrongE, .CSRWriteFenceM, .RetM, .TrapM,
-    .LoadStallD, .StoreStallD, .MDUStallD, .CSRRdStallD,
+    .StructuralStallD,
     .LSUStallM, .IFUStallF,
-    .FCvtIntStallD, .FPUStallD,
+    .FPUStallD,
     .DivBusyE, .FDivBusyE,
     .wfiM, .IntPendingM,
     // Stall & flush outputs
@@ -284,7 +284,7 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
       .InstrM, .InstrOrigM, .CSRReadValW, .EPCM, .TrapVectorM,
       .RetM, .TrapM, .sfencevmaM, .InvalidateICacheM, .DCacheStallM, .ICacheStallF,
       .InstrValidM, .CommittedM, .CommittedF,
-      .FRegWriteM, .LoadStallD, .StoreStallD,
+      .FRegWriteM, .LoadStallD,
       .BPDirPredWrongM, .BTAWrongM, .BPWrongM,
       .RASPredPCWrongM, .IClassWrongM, .DivBusyE, .FDivBusyE,
       .InstrClassM, .DCacheMiss, .DCacheAccess, .ICacheMiss, .ICacheAccess, .PrivilegedM,