diff --git a/src/lsu/align.sv b/src/lsu/align.sv
index 8cae76a02..897f0d181 100644
--- a/src/lsu/align.sv
+++ b/src/lsu/align.sv
@@ -36,7 +36,7 @@ module align import cvw::*;  #(parameter cvw_t P) (
   input logic [P.XLEN-1:0]  IEUAdrM,               // 2 byte aligned PC in Fetch stage
   input logic [P.XLEN-1:0]  IEUAdrE,           // The next IEUAdrM
   input logic [2:0]         Funct3M,           // Size of memory operation
-  input logic [31:0]        ReadDataWordMuxM,  // Instruction from the IROM, I$, or bus. Used to check if the instruction if compressed
+  input logic [P.LLEN*2-1:0]ReadDataWordMuxM,  // Instruction from the IROM, I$, or bus. Used to check if the instruction is compressed
   input logic               LSUStallM,         // I$ or bus are stalled. Transition to second fetch of spill after the first is fetched
   input logic               DTLBMissM,         // ITLB miss, ignore memory request
   input logic               DataUpdateDAM,     // ITLB miss, ignore memory request
@@ -44,7 +44,7 @@ module align import cvw::*;  #(parameter cvw_t P) (
   output logic [P.XLEN-1:0] IEUAdrSpillE,      // The next PCF for one of the two memory addresses of the spill
   output logic [P.XLEN-1:0] IEUAdrSpillM,      // IEUAdrM for one of the two memory addresses of the spill
   output logic              SelSpillE,     // During the transition between the two spill operations, the IFU should stall the pipeline
-  output logic [31:0]       ReadDataWordSpillM)// The final 32 bit instruction after merging the two spilled fetches into 1 instruction
+  output logic [P.LLEN-1:0] ReadDataWordSpillM);// The final P.LLEN-bit read data after merging the two spilled accesses into one word
 
   // Spill threshold occurs when all the cache offset PC bits are 1 (except [0]).  Without a cache this is just PCF[1]
   typedef enum logic [1:0]  {STATE_READY, STATE_SPILL} statetype;
@@ -52,15 +52,17 @@ module align import cvw::*;  #(parameter cvw_t P) (
   statetype          CurrState, NextState;
   logic              TakeSpillM, TakeSpillE;
   logic              SpillM;
-  logic              SelSpillF;
-  logic              SpillSaveF;
-  logic [LLEN-8:0]   ReadDataWordFirstHalfM;
+  logic              SelSpillM;
+  logic              SpillSaveM;
+  logic [P.LLEN-1:0]   ReadDataWordFirstHalfM;
+  logic              MisalignedM;
+  logic [P.LLEN*2-1:0] ReadDataWordSpillAllM;
 
   ////////////////////////////////////////////////////////////////////////////////////////////////////
   // PC logic 
   ////////////////////////////////////////////////////////////////////////////////////////////////////
   
-  localparam LLENINBYTES = LLEN/8;
+  localparam LLENINBYTES = P.LLEN/8;
   logic              IEUAdrIncrementM;
   assign IEUAdrIncrementM = IEUAdrM + LLENINBYTES;
   mux2 #(P.XLEN) pcplus2mux(.d0({IEUAdrM[P.XLEN-1:2], 2'b10}), .d1(IEUAdrIncrementM), .s(TakeSpillM), .y(IEUAdrSpillM));
@@ -110,18 +112,30 @@ module align import cvw::*;  #(parameter cvw_t P) (
   assign SpillSaveM = (CurrState == STATE_READY) & TakeSpillM & ~FlushM;
 
   ////////////////////////////////////////////////////////////////////////////////////////////////////
-  // Merge spilled instruction
+  // Merge spilled data
   ////////////////////////////////////////////////////////////////////////////////////////////////////
 
   // save the first 2 bytes
-  flopenr #(P.LLEN-8) SpillDataReg(clk, reset, SpillSaveM, ReadDataWordMuxM[LLEN-1:8], ReadDataWordFirstHalfM);
+  flopenr #(P.LLEN) SpillDataReg(clk, reset, SpillSaveM, ReadDataWordMuxM[P.LLEN-1:0], ReadDataWordFirstHalfM);
 
   // merge together
-  mux2 #(32) postspillmux(InstrRawF, {InstrRawF[15:0], InstrFirstHalfF}, SpillF, PostSpillInstrRawF);
+  mux2 #(2*P.LLEN) postspillmux(ReadDataWordMuxM, {ReadDataWordMuxM[P.LLEN-1:0], ReadDataWordFirstHalfM}, SpillM, ReadDataWordSpillAllM);
 
-  // Need to use always comb to avoid pessimistic x propagation if PostSpillInstrRawF is x
-  always_comb
-  if (PostSpillInstrRawF[1:0] != 2'b11) CompressedF = 1'b1;
-  else CompressedF = 1'b0;
+  // align by shifting
+  // *** optimize by merging with halfSpill, WordSpill, etc
+  logic HalfMisalignedM, WordMisalignedM;
+  assign HalfMisalignedM = Funct3M[1:0] == 2'b01 & ByteOffsetM[0] != 1'b0;
+  assign WordMisalignedM = Funct3M[1:0] == 2'b10 & ByteOffsetM[1:0] != 2'b00;
+  if(P.LLEN == 64) begin
+    logic DoubleMisalignedM;
+    assign DoubleMisalignedM = Funct3M[1:0] == 2'b11 & ByteOffsetM[2:0] != 3'b00;
+    assign MisalignedM = HalfMisalignedM | WordMisalignedM | DoubleMisalignedM;
+  end else begin
+    assign MisalignedM = HalfMisalignedM | WordMisalignedM;
+  end
 
+  // shifter (4:1 mux for 32 bit, 8:1 mux for 64 bit)
+  // 8 * is for shifting by bytes not bits
+  assign ReadDataWordSpillM = ReadDataWordSpillAllM >> (MisalignedM ? 8 * ByteOffsetM : '0);
+  
 endmodule
diff --git a/src/lsu/subwordread-variant1.sv b/src/lsu/subwordread-variant1.sv
deleted file mode 100644
index c0cfe247b..000000000
--- a/src/lsu/subwordread-variant1.sv
+++ /dev/null
@@ -1,249 +0,0 @@
-///////////////////////////////////////////
-// subwordread.sv
-//
-// Written: David_Harris@hmc.edu 
-// Created: 9 January 2021
-// Modified: 18 January 2023 
-//
-// Purpose: Extract subwords and sign extend for reads
-// 
-// Documentation: RISC-V System on Chip Design Chapter 4 (Figure 4.9)
-//
-// A component of the CORE-V-WALLY configurable RISC-V project.
-// 
-// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
-//
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
-//
-// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
-// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
-// may obtain a copy of the License at
-//
-// https://solderpad.org/licenses/SHL-2.1/
-//
-// Unless required by applicable law or agreed to in writing, any work distributed under the 
-// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
-// either express or implied. See the License for the specific language governing permissions 
-// and limitations under the License.
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-module subwordreadVar1 #(parameter LLEN) 
-  (
-   input logic [LLEN-1:0]          ReadDataWordMuxM,
-   input logic [$clog(LLEN/8)-1:0] PAdrM,
-   input logic [2:0]               Funct3M,
-   input logic                     FpLoadStoreM, 
-   input logic                     BigEndianM, 
-   output logic [LLEN/2-1:0]       ReadDataM
-);
-
-  localparam OFFSET_LEN = $clog(LLEN/8);
-  localparam HLEN = LLEN/2;
-  logic [7:0]               ByteM; 
-  logic [15:0]              HalfwordM;
-  logic [OFFSET_LEN-1:0]    PAdrSwap;
-  // Funct3M[2] is the unsigned bit. mask upper bits.
-  // Funct3M[1:0] is the size of the memory access.
-  assign PAdrSwap = PAdrM ^ {OFFSET_LEN{BigEndianM}};
-
-  if (LLEN == 128) begin:swrmux
-    // ByteMe mux
-    always_comb
-    case(PAdrSwap[3:0])
-      4'b0000: ByteM = ReadDataWordMuxM[7:0];
-      4'b0001: ByteM = ReadDataWordMuxM[15:8];
-      4'b0010: ByteM = ReadDataWordMuxM[23:16];
-      4'b0011: ByteM = ReadDataWordMuxM[31:24];
-      4'b0100: ByteM = ReadDataWordMuxM[39:32];
-      4'b0101: ByteM = ReadDataWordMuxM[47:40];
-      4'b0110: ByteM = ReadDataWordMuxM[55:48];
-      4'b0111: ByteM = ReadDataWordMuxM[63:56];
-      4'b1000: ByteM = ReadDataWordMuxM[71:64];      
-      4'b1001: ByteM = ReadDataWordMuxM[79:72];      
-      4'b1010: ByteM = ReadDataWordMuxM[87:80];      
-      4'b1011: ByteM = ReadDataWordMuxM[95:88];      
-      4'b1100: ByteM = ReadDataWordMuxM[103:96];      
-      4'b1101: ByteM = ReadDataWordMuxM[111:104];      
-      4'b1110: ByteM = ReadDataWordMuxM[119:112];      
-      4'b1111: ByteM = ReadDataWordMuxM[127:120];      
-    endcase
-  
-    // halfword mux
-    always_comb
-    case(PAdrSwap[3:0])
-      4'b0000: HalfwordM = ReadDataWordMuxM[15:0];
-      4'b0001: HalfwordM = ReadDataWordMuxM[23:8];
-      4'b0010: HalfwordM = ReadDataWordMuxM[31:16];
-      4'b0011: HalfwordM = ReadDataWordMuxM[39:24];
-      4'b0100: HalfwordM = ReadDataWordMuxM[47:32];
-      4'b0101: HalfwordM = ReadDataWordMuxM[55:40];
-      4'b0110: HalfwordM = ReadDataWordMuxM[63:48];
-      4'b0111: HalfwordM = ReadDataWordMuxM[71:56];
-      4'b1000: HalfwordM = ReadDataWordMuxM[79:64];
-      4'b1001: HalfwordM = ReadDataWordMuxM[87:72];
-      4'b1010: HalfwordM = ReadDataWordMuxM[95:80];
-      4'b1011: HalfwordM = ReadDataWordMuxM[103:88];
-      4'b1100: HalfwordM = ReadDataWordMuxM[111:96];
-      4'b1101: HalfwordM = ReadDataWordMuxM[119:104];
-      4'b1110: HalfwordM = ReadDataWordMuxM[127:112];
-      //4'b1111: HalfwordM = {ReadDataWordMuxM[7:0], ReadDataWordMuxM[127:120]}; // *** might be ok to zero extend rather than wrap around
-      4'b1111: HalfwordM = {8'b0, ReadDataWordMuxM[127:120]}; // *** might be ok to zero extend rather than wrap around
-    endcase
-    
-    logic [31:0] WordM;
-    
-    always_comb
-      case(PAdrSwap[3:0])
-        4'b0000: WordM = ReadDataWordMuxM[31:0];
-        4'b0001: WordM = ReadDataWordMuxM[39:8];
-        4'b0010: WordM = ReadDataWordMuxM[47:16];
-        4'b0011: WordM = ReadDataWordMuxM[55:24];
-        4'b0100: WordM = ReadDataWordMuxM[63:32];
-        4'b0101: WordM = ReadDataWordMuxM[71:40];
-        4'b0111: WordM = ReadDataWordMuxM[79:48];
-        4'b1000: WordM = ReadDataWordMuxM[87:56];
-        4'b1001: WordM = ReadDataWordMuxM[95:64];
-        4'b1010: WordM = ReadDataWordMuxM[103:72];
-        4'b1011: WordM = ReadDataWordMuxM[111:80];
-        4'b1011: WordM = ReadDataWordMuxM[119:88];
-        4'b1100: WordM = ReadDataWordMuxM[127:96];
-        4'b1101: WordM = {8'b0, ReadDataWordMuxM[127:104]};
-        4'b1110: WordM = {16'b0, ReadDataWordMuxM[127:112]};
-        4'b1111: WordM = {24'b0, ReadDataWordMuxM[127:120]};
-      endcase
-
-    logic [63:0] DblWordM;
-    always_comb
-      case(PAdrSwap[3:0])
-        4'b0000: DblWordMM = ReadDataWordMuxM[63:0];
-        4'b0001: DblWordMM = ReadDataWordMuxM[71:8];
-        4'b0010: DblWordMM = ReadDataWordMuxM[79:16];
-        4'b0011: DblWordMM = ReadDataWordMuxM[87:24];
-        4'b0100: DblWordMM = ReadDataWordMuxM[95:32];
-        4'b0101: DblWordMM = ReadDataWordMuxM[103:40];
-        4'b0111: DblWordMM = ReadDataWordMuxM[111:48];
-        4'b1000: DblWordMM = ReadDataWordMuxM[119:56];
-        4'b1001: DblWordMM = ReadDataWordMuxM[127:64];
-        4'b1010: DblWordMM = {8'b0, ReadDataWordMuxM[103:72]};
-        4'b1011: DblWordMM = {16'b0, ReadDataWordMuxM[111:80]};
-        4'b1011: DblWordMM = {24'b0, ReadDataWordMuxM[119:88]};
-        4'b1100: DblWordMM = {32'b0, ReadDataWordMuxM[127:96]};
-        4'b1101: DblWordMM = {40'b0, ReadDataWordMuxM[127:104]};
-        4'b1110: DblWordMM = {48'b0, ReadDataWordMuxM[127:112]};
-        4'b1111: DblWordMM = {56'b0, ReadDataWordMuxM[127:120]};
-      endcase
-
-    // sign extension/ NaN boxing
-    always_comb
-    case(Funct3M)
-      3'b000:  ReadDataM = {{HLEN-8{ByteM[7]}}, ByteM};                              // lb
-      3'b001:  ReadDataM = {{HLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
-      3'b010:  ReadDataM = {{HLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
-      3'b011:  ReadDataM = {{HLEN-64{DblWordM[63]|FpLoadStoreM}}, DblWordM[63:0]};   // ld/fld
-      3'b100:  ReadDataM = {{HLEN-8{1'b0}}, ByteM[7:0]};                             // lbu
-    //3'b100:  ReadDataM = FpLoadStoreM ? ReadDataWordMuxM : {{HLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq   - only needed when LLEN=128
-      3'b101:  ReadDataM = {{HLEN-16{1'b0}}, HalfwordM[15:0]};                       // lhu
-      3'b110:  ReadDataM = {{HLEN-32{1'b0}}, WordM[31:0]};                           // lwu
-      default: ReadDataM = ReadDataWordMuxM[HLEN-1:0];                                         // Shouldn't happen
-    endcase
-
-  end else if (LLEN == 64) begin:swrmux
-    // ByteMe mux
-    always_comb
-    case(PAdrSwap[2:0])
-      3'b000: ByteM = ReadDataWordMuxM[7:0];
-      3'b001: ByteM = ReadDataWordMuxM[15:8];
-      3'b010: ByteM = ReadDataWordMuxM[23:16];
-      3'b011: ByteM = ReadDataWordMuxM[31:24];
-      3'b100: ByteM = ReadDataWordMuxM[39:32];
-      3'b101: ByteM = ReadDataWordMuxM[47:40];
-      3'b110: ByteM = ReadDataWordMuxM[55:48];
-      3'b111: ByteM = ReadDataWordMuxM[63:56];
-    endcase
-  
-    // halfword mux
-    always_comb
-    case(PAdrSwap[2:0])
-      3'b000: HalfwordM = ReadDataWordMuxM[15:0];
-      3'b001: HalfwordM = ReadDataWordMuxM[23:8];
-      3'b010: HalfwordM = ReadDataWordMuxM[31:16];
-      3'b011: HalfwordM = ReadDataWordMuxM[39:24];
-      3'b100: HalfwordM = ReadDataWordMuxM[47:32];
-      3'b011: HalfwordM = ReadDataWordMuxM[55:40];
-      3'b110: HalfwordM = ReadDataWordMuxM[63:48];
-      3'b011: HalfwordM = {8'b0, ReadDataWordMuxM[63:56]};
-    endcase
-    
-    logic [31:0] WordM;
-    
-    always_comb
-      case(PAdrSwap[2:0])
-        3'b000: WordM = ReadDataWordMuxM[31:0];
-        3'b001: WordM = ReadDataWordMuxM[39:8];
-        3'b010: WordM = ReadDataWordMuxM[47:16];
-        3'b011: WordM = ReadDataWordMuxM[55:24];
-        3'b100: WordM = ReadDataWordMuxM[63:32];
-        3'b101: WordM = {8'b0, ReadDataWordMuxM[63:40]};
-        3'b110: WordM = {16'b0, ReadDataWordMuxM[63:48]};
-        3'b111: WordM = {24'b0, ReadDataWordMuxM[63:56]};
-      endcase
-
-    logic [63:0] DblWordM;
-    always_comb
-      case(PAdrSwap[2:0])
-        3'b000: DblWordMM = ReadDataWordMuxM[63:0];
-        3'b001: DblWordMM = {8'b0, ReadDataWordMuxM[63:8]};
-        3'b010: DblWordMM = {16'b0, ReadDataWordMuxM[63:16]};
-        3'b011: DblWordMM = {24'b0, ReadDataWordMuxM[63:24]};
-        3'b100: DblWordMM = {32'b0, ReadDataWordMuxM[63:32]};
-        3'b101: DblWordMM = {40'b0, ReadDataWordMuxM[63:40]};
-        3'b110: DblWordMM = {48'b0, ReadDataWordMuxM[63:48]};
-        3'b111: DblWordMM = {56'b0, ReadDataWordMuxM[63:56]};
-      endcase
-
-    // sign extension/ NaN boxing
-    always_comb
-    case(Funct3M)
-      3'b000:  ReadDataM = {{HLEN-8{ByteM[7]}}, ByteM};                              // lb
-      3'b001:  ReadDataM = {{HLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
-      3'b010:  ReadDataM = {{HLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
-      3'b011:  ReadDataM = {{HLEN-64{DblWordM[63]|FpLoadStoreM}}, DblWordM[63:0]};   // ld/fld
-      3'b100:  ReadDataM = {{HLEN-8{1'b0}}, ByteM[7:0]};                             // lbu
-    //3'b100:  ReadDataM = FpLoadStoreM ? ReadDataWordMuxM : {{HLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq   - only needed when LLEN=128
-      3'b101:  ReadDataM = {{HLEN-16{1'b0}}, HalfwordM[15:0]};                       // lhu
-      3'b110:  ReadDataM = {{HLEN-32{1'b0}}, WordM[31:0]};                           // lwu
-      default: ReadDataM = ReadDataWordMuxM;                                         // Shouldn't happen
-    endcase
-
-  end else begin:swrmux // 32-bit
-    // byte mux
-    always_comb
-    case(PAdrSwap[1:0])
-      2'b00: ByteM = ReadDataWordMuxM[7:0];
-      2'b01: ByteM = ReadDataWordMuxM[15:8];
-      2'b10: ByteM = ReadDataWordMuxM[23:16];
-      2'b11: ByteM = ReadDataWordMuxM[31:24];
-    endcase
-  
-    // halfword mux
-    always_comb
-    case(PAdrSwap[1:0])
-      2'b00: HalfwordM = ReadDataWordMuxM[15:0];
-      2'b01: HalfwordM = ReadDataWordMuxM[23:8];
-      2'b10: HalfwordM = ReadDataWordMuxM[31:16];
-      2'b11: HalfwordM = {8'b0, ReadDataWordMuxM[31:24]};
-    endcase
-
-    // sign extension
-    always_comb
-    case(Funct3M)
-      3'b000:  ReadDataM = {{HLEN-8{ByteM[7]}}, ByteM};                                            // lb
-      3'b001:  ReadDataM = {{HLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]};               // lh/flh
-      3'b010:  ReadDataM = {{HLEN-32{ReadDataWordMuxM[31]|FpLoadStoreM}}, ReadDataWordMuxM[31:0]}; // lw/flw
-      3'b011:  ReadDataM = ReadDataWordMuxM;                                                        // fld
-      3'b100:  ReadDataM = {{HLEN-8{1'b0}}, ByteM[7:0]};                                           // lbu
-      3'b101:  ReadDataM = {{HLEN-16{1'b0}}, HalfwordM[15:0]};                                     // lhu
-      default: ReadDataM = ReadDataWordMuxM;                                                        // Shouldn't happen
-    endcase
-  end
-endmodule
diff --git a/src/wally/wallypipelinedcore.sv b/src/wally/wallypipelinedcore.sv
index 5df543903..00b348660 100644
--- a/src/wally/wallypipelinedcore.sv
+++ b/src/wally/wallypipelinedcore.sv
@@ -264,7 +264,7 @@ module wallypipelinedcore import cvw::*; #(parameter cvw_t P) (
   end
 
   // global stall and flush control  
-  hazard  hzu(.clk, .reset,
+  hazard  hzu(
     .BPWrongE, .CSRWriteFenceM, .RetM, .TrapM,
     .LoadStallD, .StoreStallD, .MDUStallD, .CSRRdStallD,
     .LSUStallM, .IFUStallF,