Optimized the align logic for loads.

This commit is contained in:
Rose Thompson 2024-02-14 12:14:19 -06:00
parent ea14162c40
commit 1fd678b433
4 changed files with 331 additions and 14 deletions

View File

@ -53,7 +53,7 @@ module align import cvw::*; #(parameter cvw_t P) (
output logic [P.XLEN-1:0] IEUAdrSpillM, // IEUAdrM for one of the two memory addresses of the spill
output logic SelSpillE, // During the transition between the two spill operations, the IFU should stall the pipeline
output logic SelStoreDelay, //*** this is bad. really don't like moving this outside
output logic [P.LLEN-1:0] DCacheReadDataWordSpillM, // The final 32 bit instruction after merging the two spilled fetches into 1 instruction
output logic [P.LLEN*2-1:0] ReadDataWordSpillAllM,
output logic SpillStallM);
localparam LLENINBYTES = P.LLEN/8;
@ -67,8 +67,6 @@ module align import cvw::*; #(parameter cvw_t P) (
logic SpillSaveM;
logic [P.LLEN-1:0] ReadDataWordFirstHalfM;
logic MisalignedM;
logic [P.LLEN*2-1:0] ReadDataWordSpillAllM;
logic [P.LLEN*2-1:0] ReadDataWordSpillShiftedM;
logic [P.XLEN-1:0] IEUAdrIncrementM;
@ -148,8 +146,6 @@ module align import cvw::*; #(parameter cvw_t P) (
// shifter (4:1 mux for 32 bit, 8:1 mux for 64 bit)
// 8 * is for shifting by bytes not bits
assign ShiftAmount = SelHPTW ? '0 : {AccessByteOffsetM, 3'b0}; // AND gate
assign ReadDataWordSpillShiftedM = ReadDataWordSpillAllM >> ShiftAmount;
assign DCacheReadDataWordSpillM = ReadDataWordSpillShiftedM[P.LLEN-1:0];
// write path. Also has the 8:1 shifter muxing for the byteoffset
// then it also has the mux to select when a spill occurs

114
src/lsu/endianswapdouble.sv Normal file
View File

@ -0,0 +1,114 @@
///////////////////////////////////////////
// endianswapdouble.sv
//
// Written: David_Harris@hmc.edu
// Created: 7 May 2022
// Modified: 18 January 2023
//
// Purpose: Swap byte order for Big-Endian accesses
//
// Documentation: RISC-V System on Chip Design Chapter 5 (Figure 5.9)
//
// A component of the CORE-V-WALLY configurable RISC-V project.
// https://github.com/openhwgroup/cvw
//
// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
//
// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
//
// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file
// except in compliance with the License, or, at your option, the Apache License version 2.0. You
// may obtain a copy of the License at
//
// https://solderpad.org/licenses/SHL-2.1/
//
// Unless required by applicable law or agreed to in writing, any work distributed under the
// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
////////////////////////////////////////////////////////////////////////////////////////////////
// endianswapdouble
//
// Optionally reverses the byte order of a LEN-bit value.
//
// Parameters:
//   LEN        - width of a and y in bits (32, 64, 128 and 256 were the widths
//                handled by the original hand-written tables)
// Ports:
//   BigEndianM - when 1, y is a with its bytes in reverse order;
//                when 0, y = a
//   a          - input data
//   y          - output data
//
// The byte reversal is generated from LEN rather than written out per width.
// The previous 256-bit table had two mistyped part-selects (a[142:136] and
// a[152:144]) that corrupted bytes 17 and 18 of the swapped result.
module endianswapdouble #(parameter LEN) (
  input  logic           BigEndianM,
  input  logic [LEN-1:0] a,
  output logic [LEN-1:0] y
);

  localparam NUMBYTES = LEN/8;               // number of whole bytes in the word

  always_comb begin
    y = a;                                   // little-endian: pass through unchanged
    if (BigEndianM)                          // swap endianness
      for (int i = 0; i < NUMBYTES; i++)
        y[8*i +: 8] = a[8*(NUMBYTES-1-i) +: 8]; // byte i of y comes from the mirrored byte of a
  end
endmodule

View File

@ -128,9 +128,8 @@ module lsu import cvw::*; #(parameter cvw_t P) (
logic [MLEN-1:0] LSUWriteDataSpillM; // Final write data
logic [MLEN/8-1:0] ByteMaskSpillM; // Selects which bytes within a word to write
/* verilator lint_on WIDTHEXPAND */
logic [P.LLEN-1:0] DCacheReadDataWordSpillM; // D$ read data
logic [P.LLEN-1:0] ReadDataWordMuxM; // DTIM or D$ read data
logic [P.LLEN-1:0] LittleEndianReadDataWordM; // Endian-swapped read data
logic [MLEN-1:0] ReadDataWordMuxM; // DTIM or D$ read data
logic [MLEN-1:0] LittleEndianReadDataWordM; // Endian-swapped read data
logic [P.LLEN-1:0] ReadDataWordM; // Read data before subword selection
logic [P.LLEN-1:0] ReadDataM; // Final read data
@ -155,6 +154,7 @@ module lsu import cvw::*; #(parameter cvw_t P) (
logic SelDTIM; // Select DTIM rather than bus or D$
logic [P.XLEN-1:0] WriteDataZM;
logic LSULoadPageFaultM, LSUStoreAmoPageFaultM;
logic [MLEN-1:0] ReadDataWordSpillAllM;
/////////////////////////////////////////////////////////////////////////////////////////////
// Pipeline for IEUAdr E to M
@ -168,7 +168,7 @@ module lsu import cvw::*; #(parameter cvw_t P) (
.MemRWM,
.DCacheReadDataWordM, .CacheBusHPWTStall, .SelHPTW,
.ByteMaskM, .ByteMaskExtendedM, .LSUWriteDataM, .ByteMaskSpillM, .LSUWriteDataSpillM,
.IEUAdrSpillE, .IEUAdrSpillM, .SelSpillE, .DCacheReadDataWordSpillM, .SpillStallM,
.IEUAdrSpillE, .IEUAdrSpillM, .SelSpillE, .ReadDataWordSpillAllM, .SpillStallM,
.SelStoreDelay);
assign IEUAdrExtM = {2'b00, IEUAdrSpillM};
assign IEUAdrExtE = {2'b00, IEUAdrSpillE};
@ -176,7 +176,7 @@ module lsu import cvw::*; #(parameter cvw_t P) (
assign IEUAdrExtM = {2'b00, IEUAdrM};
assign IEUAdrExtE = {2'b00, IEUAdrE};
assign SelSpillE = '0;
assign DCacheReadDataWordSpillM = DCacheReadDataWordM;
assign ReadDataWordSpillAllM = DCacheReadDataWordM;
assign ByteMaskSpillM = ByteMaskM;
assign LSUWriteDataSpillM = LSUWriteDataM;
assign MemRWSpillM = MemRWM;
@ -298,6 +298,7 @@ module lsu import cvw::*; #(parameter cvw_t P) (
localparam AHBWLOGBWPL = $clog2(BEATSPERLINE); // Log2 of ^
localparam LINELEN = P.DCACHE_LINELENINBITS; // Number of bits in cacheline
localparam LLENPOVERAHBW = P.LLEN / P.AHBW; // Number of AHB beats in a LLEN word. AHBW cannot be larger than LLEN. (implementation limitation)
localparam MLENPOVERAHBW = MLEN / P.AHBW; // Number of AHB beats in a LLEN word. AHBW cannot be larger than LLEN. (implementation limitation)
localparam CACHEWORDLEN = P.ZICCLSM_SUPPORTED ? 2*P.LLEN : P.LLEN; // Width of the cache's input and output data buses. Misaligned doubles width for fast access
logic [LINELEN-1:0] FetchBuffer; // Temporary buffer to hold partially fetched cacheline
@ -361,9 +362,14 @@ module lsu import cvw::*; #(parameter cvw_t P) (
// Uncache bus access may be smaller width than LLEN. Duplicate LLENPOVERAHBW times.
// *** DTIMReadDataWordM should be increased to LLEN.
// pma should generate exception for LLEN read to periph.
/* -----\/----- EXCLUDED -----\/-----
mux3 #(P.LLEN) UnCachedDataMux(.d0(DCacheReadDataWordSpillM), .d1({LLENPOVERAHBW{FetchBuffer[P.XLEN-1:0]}}),
.d2({{P.LLEN-P.XLEN{1'b0}}, DTIMReadDataWordM[P.XLEN-1:0]}),
.s({SelDTIM, ~(CacheableOrFlushCacheM)}), .y(ReadDataWordMuxM));
-----/\----- EXCLUDED -----/\----- */
mux3 #(MLEN) UnCachedDataMux(.d0(ReadDataWordSpillAllM), .d1({MLENPOVERAHBW{FetchBuffer[P.XLEN-1:0]}}),
.d2({{(MLEN-P.XLEN){1'b0}}, DTIMReadDataWordM[P.XLEN-1:0]}),
.s({SelDTIM, ~(CacheableOrFlushCacheM)}), .y(ReadDataWordMuxM));
end else begin : passthrough // No Cache, use simple ahbinterface instad of ahbcacheinterface
logic [1:0] BusRW; // Non-DTIM memory access, ignore cacheableM
logic [P.XLEN-1:0] FetchBuffer;
@ -416,9 +422,14 @@ module lsu import cvw::*; #(parameter cvw_t P) (
/////////////////////////////////////////////////////////////////////////////////////////////
// Subword Accesses
/////////////////////////////////////////////////////////////////////////////////////////////
subwordread #(P.LLEN) subwordread(.ReadDataWordMuxM(LittleEndianReadDataWordM), .PAdrM(PAdrM[2:0]), .BigEndianM,
.FpLoadStoreM, .Funct3M(LSUFunct3M), .ReadDataM);
if(MISALIGN_SUPPORT) begin
subwordreaddouble #(P.LLEN) subwordread(.ReadDataWordMuxM(LittleEndianReadDataWordM), .PAdrM(PAdrM[2:0]), .BigEndianM,
.FpLoadStoreM, .Funct3M(LSUFunct3M), .ReadDataM);
end else begin
subwordread #(P.LLEN) subwordread(.ReadDataWordMuxM(LittleEndianReadDataWordM), .PAdrM(PAdrM[2:0]), .BigEndianM,
.FpLoadStoreM, .Funct3M(LSUFunct3M), .ReadDataM);
end
subwordwrite #(P.LLEN) subwordwrite(.LSUFunct3M, .IMAFWriteDataM, .LittleEndianWriteDataM);
// Compute byte masks
@ -438,7 +449,7 @@ module lsu import cvw::*; #(parameter cvw_t P) (
if (P.BIGENDIAN_SUPPORTED) begin:endian
endianswap #(P.LLEN) storeswap(.BigEndianM, .a(LittleEndianWriteDataM), .y(LSUWriteDataM));
endianswap #(P.LLEN) loadswap(.BigEndianM, .a(ReadDataWordMuxM), .y(LittleEndianReadDataWordM));
endianswapdouble #(MLEN) loadswap(.BigEndianM, .a(ReadDataWordMuxM), .y(LittleEndianReadDataWordM));
end else begin
assign LSUWriteDataM = LittleEndianWriteDataM;
assign LittleEndianReadDataWordM = ReadDataWordMuxM;

View File

@ -0,0 +1,196 @@
///////////////////////////////////////////
// subwordreaddouble.sv
//
// Written: David_Harris@hmc.edu
// Created: 9 January 2021
// Modified: 18 January 2023
//
// Purpose: Extract subwords and sign extend for reads
//
// Documentation: RISC-V System on Chip Design Chapter 4 (Figure 4.9)
//
// A component of the CORE-V-WALLY configurable RISC-V project.
// https://github.com/openhwgroup/cvw
//
// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
//
// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
//
// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file
// except in compliance with the License, or, at your option, the Apache License version 2.0. You
// may obtain a copy of the License at
//
// https://solderpad.org/licenses/SHL-2.1/
//
// Unless required by applicable law or agreed to in writing, any work distributed under the
// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
// either express or implied. See the License for the specific language governing permissions
// and limitations under the License.
////////////////////////////////////////////////////////////////////////////////////////////////
// subwordreaddouble
//
// Extracts a byte/halfword/word/doubleword starting at an arbitrary byte offset
// from a 2*LLEN-bit read-data window, then sign-extends, zero-extends, or
// NaN-boxes it to LLEN bits.
//
// Parameters:
//   LLEN             - width of the load result in bits; branches exist for 128, 64,
//                      and a fall-through branch written for 32
// Ports:
//   ReadDataWordMuxM - 2*LLEN-bit read data window
//   PAdrM            - byte offset of the access within the window
//   Funct3M          - Funct3M[1:0] is the access size, Funct3M[2] is the unsigned bit
//   FpLoadStoreM     - floating-point access; upper bits are filled with 1s (NaN boxing)
//   BigEndianM       - when 1, the offset is recomputed from the other end of the window
//   ReadDataM        - extended load result
module subwordreaddouble #(parameter LLEN)
  (
   input  logic [LLEN*2-1:0] ReadDataWordMuxM,
   input  logic [2:0]        PAdrM,
   input  logic [2:0]        Funct3M,
   input  logic              FpLoadStoreM,
   input  logic              BigEndianM,
   output logic [LLEN-1:0]   ReadDataM
);

  logic [7:0]               ByteM;
  logic [15:0]              HalfwordM;
  logic [4:0]               PAdrSwap;       // byte offset actually used to index the window
  logic [4:0]               BigEndianPAdr;  // byte offset when BigEndianM is set
  logic [4:0]               LengthM;        // access length in bytes

  // Funct3M[2] is the unsigned bit. mask upper bits.
  // Funct3M[1:0] is the size of the memory access.
  assign PAdrSwap = BigEndianM ? BigEndianPAdr : {2'b0, PAdrM};

  // LLEN/4 is the number of bytes in the 2*LLEN-bit window, so this is the
  // offset of the access measured from the opposite end of the window.
  /* verilator lint_off WIDTHEXPAND */
  /* verilator lint_off WIDTHTRUNC */
  assign BigEndianPAdr = (LLEN/4) - PAdrM - LengthM;
  /* verilator lint_on WIDTHTRUNC */
  /* verilator lint_on WIDTHEXPAND */

  // Access length in bytes. Funct3M[2] is only kept for floating-point accesses,
  // so integer unsigned loads decode to the same length as their signed forms.
  always_comb
    case(Funct3M & {FpLoadStoreM, 2'b11})
      3'b000:  LengthM = 5'd1;
      3'b001:  LengthM = 5'd2;
      3'b010:  LengthM = 5'd4;
      3'b011:  LengthM = 5'd8;
      3'b100:  LengthM = 5'd16;
      default: LengthM = 5'd8;
    endcase

  if (LLEN == 128) begin:swrmux
    logic [31:0]  WordM;
    logic [63:0]  DblWordM;
    logic [127:0] QdWordM;   // 128 bits wide to match the 128-bit slices selected below

    // byte-granular shifter: select the 128-bit window starting at byte PAdrSwap,
    // zero-filling above the top of the 256-bit input
    always_comb
      case(PAdrSwap)
        5'b00000: QdWordM = ReadDataWordMuxM[127:0];
        5'b00001: QdWordM = ReadDataWordMuxM[135:8];
        5'b00010: QdWordM = ReadDataWordMuxM[143:16];
        5'b00011: QdWordM = ReadDataWordMuxM[151:24];
        5'b00100: QdWordM = ReadDataWordMuxM[159:32];
        5'b00101: QdWordM = ReadDataWordMuxM[167:40];
        5'b00110: QdWordM = ReadDataWordMuxM[175:48];
        5'b00111: QdWordM = ReadDataWordMuxM[183:56];
        5'b01000: QdWordM = ReadDataWordMuxM[191:64];
        5'b01001: QdWordM = ReadDataWordMuxM[199:72];
        5'b01010: QdWordM = ReadDataWordMuxM[207:80];
        5'b01011: QdWordM = ReadDataWordMuxM[215:88];
        5'b01100: QdWordM = ReadDataWordMuxM[223:96];
        5'b01101: QdWordM = ReadDataWordMuxM[231:104];
        5'b01110: QdWordM = ReadDataWordMuxM[239:112];
        5'b01111: QdWordM = ReadDataWordMuxM[247:120];
        5'b10000: QdWordM = ReadDataWordMuxM[255:128];
        5'b10001: QdWordM = {8'b0, ReadDataWordMuxM[255:136]};
        5'b10010: QdWordM = {16'b0, ReadDataWordMuxM[255:144]};
        5'b10011: QdWordM = {24'b0, ReadDataWordMuxM[255:152]};
        5'b10100: QdWordM = {32'b0, ReadDataWordMuxM[255:160]};
        5'b10101: QdWordM = {40'b0, ReadDataWordMuxM[255:168]};
        5'b10110: QdWordM = {48'b0, ReadDataWordMuxM[255:176]};
        5'b10111: QdWordM = {56'b0, ReadDataWordMuxM[255:184]};
        5'b11000: QdWordM = {64'b0, ReadDataWordMuxM[255:192]};
        5'b11001: QdWordM = {72'b0, ReadDataWordMuxM[255:200]};
        5'b11010: QdWordM = {80'b0, ReadDataWordMuxM[255:208]};
        5'b11011: QdWordM = {88'b0, ReadDataWordMuxM[255:216]};
        5'b11100: QdWordM = {96'b0, ReadDataWordMuxM[255:224]};
        5'b11101: QdWordM = {104'b0, ReadDataWordMuxM[255:232]};
        5'b11110: QdWordM = {112'b0, ReadDataWordMuxM[255:240]};
        5'b11111: QdWordM = {120'b0, ReadDataWordMuxM[255:248]};
      endcase

    assign ByteM     = QdWordM[7:0];
    assign HalfwordM = QdWordM[15:0];
    assign WordM     = QdWordM[31:0];
    assign DblWordM  = QdWordM[63:0];

    // sign extension/ NaN boxing
    always_comb
      case(Funct3M)
        3'b000:  ReadDataM = {{LLEN-8{ByteM[7]}}, ByteM};                              // lb
        3'b001:  ReadDataM = {{LLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
        3'b010:  ReadDataM = {{LLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
        3'b011:  ReadDataM = {{LLEN-64{DblWordM[63]|FpLoadStoreM}}, DblWordM[63:0]};   // ld/fld
        3'b100:  ReadDataM = {{LLEN-8{1'b0}}, ByteM[7:0]};                             // lbu
        //3'b100:  ReadDataM = FpLoadStoreM ? ReadDataWordMuxM : {{LLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq - only needed when LLEN=128
        3'b101:  ReadDataM = {{LLEN-16{1'b0}}, HalfwordM[15:0]};                       // lhu
        3'b110:  ReadDataM = {{LLEN-32{1'b0}}, WordM[31:0]};                           // lwu
        default: ReadDataM = {{LLEN-8{ByteM[7]}}, ByteM};                              // Shouldn't happen
      endcase

  end else if (LLEN == 64) begin:swrmux
    logic [31:0] WordM;
    logic [63:0] DblWordM;

    // byte-granular shifter: select the 64-bit window starting at byte PAdrSwap[3:0],
    // zero-filling above the top of the 128-bit input
    always_comb
      case(PAdrSwap[3:0])
        4'b0000: DblWordM = ReadDataWordMuxM[63:0];
        4'b0001: DblWordM = ReadDataWordMuxM[71:8];
        4'b0010: DblWordM = ReadDataWordMuxM[79:16];
        4'b0011: DblWordM = ReadDataWordMuxM[87:24];
        4'b0100: DblWordM = ReadDataWordMuxM[95:32];
        4'b0101: DblWordM = ReadDataWordMuxM[103:40];
        4'b0110: DblWordM = ReadDataWordMuxM[111:48];
        4'b0111: DblWordM = ReadDataWordMuxM[119:56];
        4'b1000: DblWordM = ReadDataWordMuxM[127:64];
        4'b1001: DblWordM = {8'b0, ReadDataWordMuxM[127:72]};
        4'b1010: DblWordM = {16'b0, ReadDataWordMuxM[127:80]};
        4'b1011: DblWordM = {24'b0, ReadDataWordMuxM[127:88]};
        4'b1100: DblWordM = {32'b0, ReadDataWordMuxM[127:96]};
        4'b1101: DblWordM = {40'b0, ReadDataWordMuxM[127:104]};
        4'b1110: DblWordM = {48'b0, ReadDataWordMuxM[127:112]};
        4'b1111: DblWordM = {56'b0, ReadDataWordMuxM[127:120]};
      endcase

    assign ByteM     = DblWordM[7:0];
    assign HalfwordM = DblWordM[15:0];
    assign WordM     = DblWordM[31:0];

    // sign extension/ NaN boxing
    always_comb
      case(Funct3M)
        3'b000:  ReadDataM = {{LLEN-8{ByteM[7]}}, ByteM};                              // lb
        3'b001:  ReadDataM = {{LLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
        3'b010:  ReadDataM = {{LLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
        3'b011:  ReadDataM = {{LLEN-64{DblWordM[63]|FpLoadStoreM}}, DblWordM[63:0]};   // ld/fld
        3'b100:  ReadDataM = {{LLEN-8{1'b0}}, ByteM[7:0]};                             // lbu
        //3'b100:  ReadDataM = FpLoadStoreM ? ReadDataWordMuxM : {{LLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq - only needed when LLEN=128
        3'b101:  ReadDataM = {{LLEN-16{1'b0}}, HalfwordM[15:0]};                       // lhu
        3'b110:  ReadDataM = {{LLEN-32{1'b0}}, WordM[31:0]};                           // lwu
        default: ReadDataM = {{LLEN-8{ByteM[7]}}, ByteM};                              // Shouldn't happen
      endcase

  end else begin:swrmux // 32-bit
    logic [31:0] WordM;

    // byte-granular shifter: select the 32-bit window starting at byte PAdrSwap[2:0],
    // zero-filling above the top of the 64-bit input
    always_comb
      case(PAdrSwap[2:0])
        3'b000: WordM = ReadDataWordMuxM[31:0];
        3'b001: WordM = ReadDataWordMuxM[39:8];
        3'b010: WordM = ReadDataWordMuxM[47:16];
        3'b011: WordM = ReadDataWordMuxM[55:24];
        3'b100: WordM = ReadDataWordMuxM[63:32];
        3'b101: WordM = {8'b0, ReadDataWordMuxM[63:40]};
        3'b110: WordM = {16'b0, ReadDataWordMuxM[63:48]};
        3'b111: WordM = {24'b0, ReadDataWordMuxM[63:56]};
      endcase

    assign ByteM     = WordM[7:0];
    assign HalfwordM = WordM[15:0];

    // sign extension
    always_comb
      case(Funct3M)
        3'b000:  ReadDataM = {{LLEN-8{ByteM[7]}}, ByteM};                              // lb
        3'b001:  ReadDataM = {{LLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
        // Word loads must come from the shifted WordM so that a nonzero byte offset
        // is honored, as in the 64- and 128-bit branches.
        3'b010:  ReadDataM = {{LLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
        3'b011:  ReadDataM = ReadDataWordMuxM[LLEN-1:0];                               // fld
        3'b100:  ReadDataM = {{LLEN-8{1'b0}}, ByteM[7:0]};                             // lbu
        3'b101:  ReadDataM = {{LLEN-16{1'b0}}, HalfwordM[15:0]};                       // lhu
        default: ReadDataM = ReadDataWordMuxM[LLEN-1:0];                               // Shouldn't happen
      endcase
  end
endmodule