From 12763b72977d9769b0b52a418c53560cb25b4659 Mon Sep 17 00:00:00 2001
From: Rose Thompson <ross1728@gmail.com>
Date: Thu, 26 Oct 2023 10:47:00 -0500
Subject: [PATCH] begin implementation of Zicclsm.

---
 src/lsu/align.sv       | 121 ++++++++++++++++++++++++++
 src/lsu/subwordread.sv | 193 +++++++++++++++++++++++++++++++++--------
 2 files changed, 280 insertions(+), 34 deletions(-)
 create mode 100644 src/lsu/align.sv

diff --git a/src/lsu/align.sv b/src/lsu/align.sv
new file mode 100644
index 000000000..b3e810ee2
--- /dev/null
+++ b/src/lsu/align.sv
@@ -0,0 +1,121 @@
+///////////////////////////////////////////
+// align.sv
+//
+// Written: Rose Thompson ross1728@gmail.com
+// Created: 26 October 2023
+// Modified: 26 October 2023
+//
+// Purpose: This module implements native alignment support for the Zicclsm extension.
+//          It is similar to the IFU's spill module and could probably be merged with it
+//          with some effort.
+//
+// Documentation: RISC-V System on Chip Design Chapter 11 (Figure 11.5)
+// 
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module align import cvw::*;  #(parameter cvw_t P) (
+  input logic               clk,
+  input logic               reset,
+  input logic               StallM, FlushM,
+  input logic [P.XLEN-1:0]  IEUAdrM,               // 2 byte aligned PC in Fetch stage
+  input logic [P.XLEN-1:0]  IEUAdrE,           // The next IEUAdrM
+  input logic [31:0]        ReadDataWordMuxM,  // Instruction from the IROM, I$, or bus. Used to check if the instruction if compressed
+  input logic               LSUStallM,         // I$ or bus are stalled. Transition to second fetch of spill after the first is fetched
+  input logic               DTLBMissM,         // ITLB miss, ignore memory request
+
+  output logic [P.XLEN-1:0] IEUAdrSpillE,      // The next PCF for one of the two memory addresses of the spill
+  output logic [P.XLEN-1:0] IEUAdrSpillM,      // IEUAdrM for one of the two memory addresses of the spill
+  output logic              SelSpillE,     // During the transition between the two spill operations, the IFU should stall the pipeline
+  output logic [31:0]       ReadDataWordSpillM); // The final 32 bit instruction after merging the two spilled fetches into 1 instruction
+  // NOTE(review): the port comments above still describe the IFU spill module (PC, I$, ITLB, IFU); confirm and reword for the LSU
+  // Spill threshold occurs when all the cache offset PC bits are 1 (except [0]).  Without a cache this is just PCF[1]
+  typedef enum logic [1:0]  {STATE_READY, STATE_SPILL} statetype;
+
+  statetype          CurrState, NextState;
+  logic              TakeSpillM, TakeSpillE;
+  logic              SpillF;
+  logic              SelSpillF;
+  logic              SpillSaveF;
+  logic [15:0]       InstrFirstHalfF;
+
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // PC logic 
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // IEUAdrIncrementM is IEUAdrM advanced by one LLEN-wide word (P.LLEN/8 bytes); it must carry a full P.XLEN-bit address
+  localparam LLENINBYTES = P.LLEN/8;
+  logic [P.XLEN-1:0] IEUAdrIncrementM;
+  assign IEUAdrIncrementM = IEUAdrM + LLENINBYTES;
+  mux2 #(P.XLEN) pcplus2mux(.d0({IEUAdrM[P.XLEN-1:2], 2'b10}), .d1(IEUAdrIncrementM), .s(TakeSpillM), .y(IEUAdrSpillM));
+  mux2 #(P.XLEN) pcnextspillmux(.d0(IEUAdrE), .d1(IEUAdrIncrementM), .s(TakeSpillE), .y(IEUAdrSpillE));
+  // NOTE(review): TakeSpillM and TakeSpillE select the muxes above but are never driven in this module; only TakeSpillF is assigned below
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Detect spill
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // spill detection in lsu is more complex than ifu, depends on 3 factors
+  // 1) operation size
+  // 2) offset
+  // 3) access location within the cacheline or is the access is uncached.
+  // first consider uncached operations
+  // accesses are always aligned to the natural size of the bus (XLEN or AHBW)
+
+  if (P.ICACHE_SUPPORTED) begin
+    logic  SpillCachedF, SpillUncachedF;
+    assign SpillCachedF = &IEUAdrM[$clog2(P.ICACHE_LINELENINBITS/32)+1:1];
+    assign SpillUncachedF = IEUAdrM[1]; // *** try to optimize this based on whether the next instruction is 16 bits and by fetching 64 bits in RV64
+    assign SpillF = CacheableF ? SpillCachedF : SpillUncachedF;
+  end else
+    assign SpillF = IEUAdrM[1]; // *** might relax - only spill if next instruction is uncompressed
+  // Don't take the spill if there is a stall, TLB miss, or hardware update to the D/A bits
+  assign TakeSpillF = SpillF & ~IFUCacheBusStallF & ~(ITLBMissF | (P.SVADU_SUPPORTED & InstrUpdateDAF));
+  // NOTE(review): CacheableF, IFUCacheBusStallF, ITLBMissF, InstrUpdateDAF, InstrRawF, PostSpillInstrRawF and CompressedF are not declared in this module
+  always_ff @(posedge clk)
+    if (reset | FlushM)    CurrState <= #1 STATE_READY;
+    else CurrState <= #1 NextState;
+
+  always_comb begin
+    case (CurrState)
+      STATE_READY: if (TakeSpillF)                NextState = STATE_SPILL;
+                   else                           NextState = STATE_READY;
+      STATE_SPILL: if(StallM)                     NextState = STATE_SPILL;
+                   else                           NextState = STATE_READY;
+      default:                                    NextState = STATE_READY;
+    endcase
+  end
+
+  assign SelSpillF = (CurrState == STATE_SPILL);
+  assign SelSpillNextF = (CurrState == STATE_READY & TakeSpillF) | (CurrState == STATE_SPILL & IFUCacheBusStallF);
+  assign SpillSaveF = (CurrState == STATE_READY) & TakeSpillF & ~FlushM;
+  // NOTE(review): outputs SelSpillE and ReadDataWordSpillM are not assigned anywhere in this module yet
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+  // Merge spilled instruction
+  ////////////////////////////////////////////////////////////////////////////////////////////////////
+
+  // save the first 2 bytes
+  flopenr #(16) SpillInstrReg(clk, reset, SpillSaveF, InstrRawF[15:0], InstrFirstHalfF);
+
+  // merge together
+  mux2 #(32) postspillmux(InstrRawF, {InstrRawF[15:0], InstrFirstHalfF}, SpillF, PostSpillInstrRawF);
+
+  // Need to use always comb to avoid pessimistic x propagation if PostSpillInstrRawF is x
+  always_comb
+  if (PostSpillInstrRawF[1:0] != 2'b11) CompressedF = 1'b1;
+  else CompressedF = 1'b0;
+
+endmodule
diff --git a/src/lsu/subwordread.sv b/src/lsu/subwordread.sv
index e5666eb84..ae3e3c78b 100644
--- a/src/lsu/subwordread.sv
+++ b/src/lsu/subwordread.sv
@@ -29,22 +29,125 @@
 
 module subwordread #(parameter LLEN) 
   (
-   input logic [LLEN-1:0]   ReadDataWordMuxM,
-   input logic [2:0]        PAdrM,
-   input logic [2:0]        Funct3M,
-   input logic              FpLoadStoreM, 
-   input logic              BigEndianM, 
-   output logic [LLEN-1:0]  ReadDataM
+   input logic [LLEN-1:0]          ReadDataWordMuxM,  // LLEN-bit read word the subword is extracted from
+   input logic [$clog2(LLEN/8)-1:0] PAdrM,            // byte offset within the LLEN-bit word (one bit pattern per byte)
+   input logic [2:0]               Funct3M,           // [2] = unsigned, [1:0] = access size
+   input logic                     FpLoadStoreM,      // ORed into the extension bits of halfword/word/doubleword loads
+   input logic                     BigEndianM,        // inverts every bit of the byte offset
+   output logic [LLEN/2-1:0]       ReadDataM          // extracted and extended subword, LLEN/2 bits wide
 );
 
+  localparam OFFSET_LEN = $clog2(LLEN/8);  // number of byte-offset bits; $clog2 is the ceiling-log2 system function
+  localparam HLEN = LLEN/2;                // width of ReadDataM
   logic [7:0]               ByteM; 
   logic [15:0]              HalfwordM;
-  logic [2:0]               PAdrSwap;
+  logic [OFFSET_LEN-1:0]    PAdrSwap;
   // Funct3M[2] is the unsigned bit. mask upper bits.
   // Funct3M[1:0] is the size of the memory access.
-  assign PAdrSwap = PAdrM ^ {3{BigEndianM}};
+  assign PAdrSwap = PAdrM ^ {OFFSET_LEN{BigEndianM}};
 
-  if (LLEN == 64) begin:swrmux
+  if (LLEN == 128) begin:swrmux
+    // Byte mux: the 4-bit swapped byte offset picks one of the 16 bytes of the 128-bit read word
+    always_comb
+    case(PAdrSwap[3:0])
+      4'd0:  ByteM = ReadDataWordMuxM[0 +: 8];
+      4'd1:  ByteM = ReadDataWordMuxM[8 +: 8];
+      4'd2:  ByteM = ReadDataWordMuxM[16 +: 8];
+      4'd3:  ByteM = ReadDataWordMuxM[24 +: 8];
+      4'd4:  ByteM = ReadDataWordMuxM[32 +: 8];
+      4'd5:  ByteM = ReadDataWordMuxM[40 +: 8];
+      4'd6:  ByteM = ReadDataWordMuxM[48 +: 8];
+      4'd7:  ByteM = ReadDataWordMuxM[56 +: 8];
+      4'd8:  ByteM = ReadDataWordMuxM[64 +: 8];
+      4'd9:  ByteM = ReadDataWordMuxM[72 +: 8];
+      4'd10: ByteM = ReadDataWordMuxM[80 +: 8];
+      4'd11: ByteM = ReadDataWordMuxM[88 +: 8];
+      4'd12: ByteM = ReadDataWordMuxM[96 +: 8];
+      4'd13: ByteM = ReadDataWordMuxM[104 +: 8];
+      4'd14: ByteM = ReadDataWordMuxM[112 +: 8];
+      4'd15: ByteM = ReadDataWordMuxM[120 +: 8];
+    endcase
+  
+    // Halfword mux: 16-bit window starting at byte PAdrSwap of the 128-bit read word
+    always_comb
+    case(PAdrSwap[3:0])
+      4'd0:  HalfwordM = ReadDataWordMuxM[0 +: 16];
+      4'd1:  HalfwordM = ReadDataWordMuxM[8 +: 16];
+      4'd2:  HalfwordM = ReadDataWordMuxM[16 +: 16];
+      4'd3:  HalfwordM = ReadDataWordMuxM[24 +: 16];
+      4'd4:  HalfwordM = ReadDataWordMuxM[32 +: 16];
+      4'd5:  HalfwordM = ReadDataWordMuxM[40 +: 16];
+      4'd6:  HalfwordM = ReadDataWordMuxM[48 +: 16];
+      4'd7:  HalfwordM = ReadDataWordMuxM[56 +: 16];
+      4'd8:  HalfwordM = ReadDataWordMuxM[64 +: 16];
+      4'd9:  HalfwordM = ReadDataWordMuxM[72 +: 16];
+      4'd10: HalfwordM = ReadDataWordMuxM[80 +: 16];
+      4'd11: HalfwordM = ReadDataWordMuxM[88 +: 16];
+      4'd12: HalfwordM = ReadDataWordMuxM[96 +: 16];
+      4'd13: HalfwordM = ReadDataWordMuxM[104 +: 16];
+      4'd14: HalfwordM = ReadDataWordMuxM[112 +: 16];
+      // wrap-around alternative, not used: 4'd15: HalfwordM = {ReadDataWordMuxM[7:0], ReadDataWordMuxM[127:120]};
+      4'd15: HalfwordM = {8'b0, ReadDataWordMuxM[120 +: 8]}; // *** might be ok to zero extend rather than wrap around
+    endcase
+    
+    logic [31:0] WordM;
+    // Word mux: offset k selects bits [8k+31:8k]; offsets 13-15 run past bit 127 and zero-fill the missing upper bytes
+    always_comb
+      case(PAdrSwap[3:0])
+        4'b0000: WordM = ReadDataWordMuxM[31:0];
+        4'b0001: WordM = ReadDataWordMuxM[39:8];
+        4'b0010: WordM = ReadDataWordMuxM[47:16];
+        4'b0011: WordM = ReadDataWordMuxM[55:24];
+        4'b0100: WordM = ReadDataWordMuxM[63:32];
+        4'b0101: WordM = ReadDataWordMuxM[71:40];
+        4'b0110: WordM = ReadDataWordMuxM[79:48];
+        4'b0111: WordM = ReadDataWordMuxM[87:56];
+        4'b1000: WordM = ReadDataWordMuxM[95:64];
+        4'b1001: WordM = ReadDataWordMuxM[103:72];
+        4'b1010: WordM = ReadDataWordMuxM[111:80];
+        4'b1011: WordM = ReadDataWordMuxM[119:88];
+        4'b1100: WordM = ReadDataWordMuxM[127:96];
+        4'b1101: WordM = {8'b0, ReadDataWordMuxM[127:104]};
+        4'b1110: WordM = {16'b0, ReadDataWordMuxM[127:112]};
+        4'b1111: WordM = {24'b0, ReadDataWordMuxM[127:120]};
+      endcase
+
+    logic [63:0] DblWordM;  // 64-bit window starting at byte PAdrSwap of the 128-bit read word
+    always_comb             // offset k selects bits [8k+63:8k]; offsets 9-15 run past bit 127 and zero-fill the missing upper bytes
+      case(PAdrSwap[3:0])
+        4'b0000: DblWordM = ReadDataWordMuxM[63:0];
+        4'b0001: DblWordM = ReadDataWordMuxM[71:8];
+        4'b0010: DblWordM = ReadDataWordMuxM[79:16];
+        4'b0011: DblWordM = ReadDataWordMuxM[87:24];
+        4'b0100: DblWordM = ReadDataWordMuxM[95:32];
+        4'b0101: DblWordM = ReadDataWordMuxM[103:40];
+        4'b0110: DblWordM = ReadDataWordMuxM[111:48];
+        4'b0111: DblWordM = ReadDataWordMuxM[119:56];
+        4'b1000: DblWordM = ReadDataWordMuxM[127:64];
+        4'b1001: DblWordM = {8'b0, ReadDataWordMuxM[127:72]};
+        4'b1010: DblWordM = {16'b0, ReadDataWordMuxM[127:80]};
+        4'b1011: DblWordM = {24'b0, ReadDataWordMuxM[127:88]};
+        4'b1100: DblWordM = {32'b0, ReadDataWordMuxM[127:96]};
+        4'b1101: DblWordM = {40'b0, ReadDataWordMuxM[127:104]};
+        4'b1110: DblWordM = {48'b0, ReadDataWordMuxM[127:112]};
+        4'b1111: DblWordM = {56'b0, ReadDataWordMuxM[127:120]};
+      endcase
+
+    // Extend the selected subword to HLEN bits: Funct3M[2]=0 replicates the MSB (ORed with FpLoadStoreM for halfword/word/doubleword, i.e. NaN boxing), Funct3M[2]=1 zero-fills
+    always_comb
+    case(Funct3M)
+      3'b000:  ReadDataM = {{HLEN-8{ByteM[7]}}, ByteM};                              // lb: fill with the byte's MSB
+      3'b001:  ReadDataM = {{HLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
+      3'b010:  ReadDataM = {{HLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
+      3'b011:  ReadDataM = {{HLEN-64{DblWordM[63]|FpLoadStoreM}}, DblWordM[63:0]};   // ld/fld: HLEN-64 is 0 in this LLEN == 128 branch, so no fill bits are added
+      3'b100:  ReadDataM = {{HLEN-8{1'b0}}, ByteM[7:0]};                             // lbu
+    //3'b100:  ReadDataM = FpLoadStoreM ? ReadDataWordMuxM : {{HLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq   - only needed when LLEN=128
+      3'b101:  ReadDataM = {{HLEN-16{1'b0}}, HalfwordM[15:0]};                       // lhu
+      3'b110:  ReadDataM = {{HLEN-32{1'b0}}, WordM[31:0]};                           // lwu
+      default: ReadDataM = ReadDataWordMuxM[HLEN-1:0];                                         // Shouldn't happen: passes the low HLEN bits through
+    endcase
+
+  end else if (LLEN == 64) begin:swrmux
     // ByteMe mux
     always_comb
     case(PAdrSwap[2:0])
@@ -60,35 +163,55 @@ module subwordread #(parameter LLEN)
   
     // halfword mux
     always_comb
-    case(PAdrSwap[2:1])
-      2'b00: HalfwordM = ReadDataWordMuxM[15:0];
-      2'b01: HalfwordM = ReadDataWordMuxM[31:16];
-      2'b10: HalfwordM = ReadDataWordMuxM[47:32];
-      2'b11: HalfwordM = ReadDataWordMuxM[63:48];
+    case(PAdrSwap[2:0])  // offset k selects bits [8k+15:8k] of the 64-bit read word
+      3'b000: HalfwordM = ReadDataWordMuxM[15:0];
+      3'b001: HalfwordM = ReadDataWordMuxM[23:8];
+      3'b010: HalfwordM = ReadDataWordMuxM[31:16];
+      3'b011: HalfwordM = ReadDataWordMuxM[39:24];
+      3'b100: HalfwordM = ReadDataWordMuxM[47:32];
+      3'b101: HalfwordM = ReadDataWordMuxM[55:40];
+      3'b110: HalfwordM = ReadDataWordMuxM[63:48];
+      3'b111: HalfwordM = {8'b0, ReadDataWordMuxM[63:56]};  // window runs past bit 63: upper byte reads as zero
     endcase
     
     logic [31:0] WordM;
     
     always_comb
-      case(PAdrSwap[2])
-        1'b0: WordM = ReadDataWordMuxM[31:0];
-        1'b1: WordM = ReadDataWordMuxM[63:32];
+      case(PAdrSwap[2:0])  // 32-bit window starting at byte PAdrSwap; bytes beyond bit 63 read as zero
+        3'd0: WordM = ReadDataWordMuxM[0 +: 32];
+        3'd1: WordM = ReadDataWordMuxM[8 +: 32];
+        3'd2: WordM = ReadDataWordMuxM[16 +: 32];
+        3'd3: WordM = ReadDataWordMuxM[24 +: 32];
+        3'd4: WordM = ReadDataWordMuxM[32 +: 32];
+        3'd5: WordM = {8'b0, ReadDataWordMuxM[40 +: 24]};
+        3'd6: WordM = {16'b0, ReadDataWordMuxM[48 +: 16]};
+        3'd7: WordM = {24'b0, ReadDataWordMuxM[56 +: 8]};
       endcase
 
     logic [63:0] DblWordM;
-    assign DblWordM = ReadDataWordMuxM[63:0];
+    always_comb  // 64-bit window starting at byte PAdrSwap; bytes beyond bit 63 read as zero
+      case(PAdrSwap[2:0])
+        3'b000: DblWordM = ReadDataWordMuxM[63:0];
+        3'b001: DblWordM = {8'b0, ReadDataWordMuxM[63:8]};
+        3'b010: DblWordM = {16'b0, ReadDataWordMuxM[63:16]};
+        3'b011: DblWordM = {24'b0, ReadDataWordMuxM[63:24]};
+        3'b100: DblWordM = {32'b0, ReadDataWordMuxM[63:32]};
+        3'b101: DblWordM = {40'b0, ReadDataWordMuxM[63:40]};
+        3'b110: DblWordM = {48'b0, ReadDataWordMuxM[63:48]};
+        3'b111: DblWordM = {56'b0, ReadDataWordMuxM[63:56]};
+      endcase
 
     // sign extension/ NaN boxing
     always_comb
     case(Funct3M)
-      3'b000:  ReadDataM = {{LLEN-8{ByteM[7]}}, ByteM};                              // lb
-      3'b001:  ReadDataM = {{LLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
-      3'b010:  ReadDataM = {{LLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
-      3'b011:  ReadDataM = {{LLEN-64{DblWordM[63]|FpLoadStoreM}}, DblWordM[63:0]};   // ld/fld
-      3'b100:  ReadDataM = {{LLEN-8{1'b0}}, ByteM[7:0]};                             // lbu
-    //3'b100:  ReadDataM = FpLoadStoreM ? ReadDataWordMuxM : {{LLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq   - only needed when LLEN=128
-      3'b101:  ReadDataM = {{LLEN-16{1'b0}}, HalfwordM[15:0]};                       // lhu
-      3'b110:  ReadDataM = {{LLEN-32{1'b0}}, WordM[31:0]};                           // lwu
+      3'b000:  ReadDataM = {{HLEN-8{ByteM[7]}}, ByteM};                              // lb
+      3'b001:  ReadDataM = {{HLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
+      3'b010:  ReadDataM = {{HLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
+      3'b011:  ReadDataM = DblWordM[HLEN-1:0];   // ld/fld: HLEN (LLEN/2) is below 64 in this LLEN == 64 branch, so {HLEN-64{...}} would be a negative replication; keep the low HLEN bits. NOTE(review): confirm intended result
+      3'b100:  ReadDataM = {{HLEN-8{1'b0}}, ByteM[7:0]};                             // lbu
+    //3'b100:  ReadDataM = FpLoadStoreM ? ReadDataWordMuxM : {{HLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq   - only needed when LLEN=128
+      3'b101:  ReadDataM = {{HLEN-16{1'b0}}, HalfwordM[15:0]};                       // lhu
+      3'b110:  ReadDataM = {{HLEN-32{1'b0}}, WordM[31:0]};                           // lwu
       default: ReadDataM = ReadDataWordMuxM;                                         // Shouldn't happen
     endcase
 
@@ -104,20 +227,22 @@ module subwordread #(parameter LLEN)
   
     // halfword mux
     always_comb
-    case(PAdrSwap[1])
-      1'b0: HalfwordM = ReadDataWordMuxM[15:0];
-      1'b1: HalfwordM = ReadDataWordMuxM[31:16];
+    case(PAdrSwap[1:0])  // 16-bit window starting at byte PAdrSwap; the byte beyond bit 31 reads as zero
+      2'd0: HalfwordM = ReadDataWordMuxM[0 +: 16];
+      2'd1: HalfwordM = ReadDataWordMuxM[8 +: 16];
+      2'd2: HalfwordM = ReadDataWordMuxM[16 +: 16];
+      2'd3: HalfwordM = {8'b0, ReadDataWordMuxM[24 +: 8]};
     endcase
 
     // sign extension
     always_comb
     case(Funct3M)
-      3'b000:  ReadDataM = {{LLEN-8{ByteM[7]}}, ByteM};                                            // lb
-      3'b001:  ReadDataM = {{LLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]};               // lh/flh
-      3'b010:  ReadDataM = {{LLEN-32{ReadDataWordMuxM[31]|FpLoadStoreM}}, ReadDataWordMuxM[31:0]}; // lw/flw
+      3'b000:  ReadDataM = {{HLEN-8{ByteM[7]}}, ByteM};                                            // lb: fill the upper HLEN-8 bits with the byte's MSB
+      3'b001:  ReadDataM = {{HLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]};               // lh/flh: fill bits are the halfword MSB ORed with FpLoadStoreM
+      3'b010:  ReadDataM = {{HLEN-32{ReadDataWordMuxM[31]|FpLoadStoreM}}, ReadDataWordMuxM[31:0]}; // lw/flw  NOTE(review): HLEN is LLEN/2, so HLEN-32 is negative whenever LLEN < 64 - confirm which LLEN values reach this branch
       3'b011:  ReadDataM = ReadDataWordMuxM;                                                        // fld
-      3'b100:  ReadDataM = {{LLEN-8{1'b0}}, ByteM[7:0]};                                           // lbu
-      3'b101:  ReadDataM = {{LLEN-16{1'b0}}, HalfwordM[15:0]};                                     // lhu
+      3'b100:  ReadDataM = {{HLEN-8{1'b0}}, ByteM[7:0]};                                           // lbu
+      3'b101:  ReadDataM = {{HLEN-16{1'b0}}, HalfwordM[15:0]};                                     // lhu
       default: ReadDataM = ReadDataWordMuxM;                                                        // Shouldn't happen
     endcase
   end