diff --git a/config/buildroot/config.vh b/config/buildroot/config.vh index bb6a0f6d2..7d68affc0 100644 --- a/config/buildroot/config.vh +++ b/config/buildroot/config.vh @@ -46,6 +46,7 @@ localparam SSTC_SUPPORTED = 1; localparam ZICBOM_SUPPORTED = 1; localparam ZICBOZ_SUPPORTED = 1; localparam ZICBOP_SUPPORTED = 1; +localparam ZICCLSM_SUPPORTED = 0; localparam SVPBMT_SUPPORTED = 1; localparam SVNAPOT_SUPPORTED = 1; localparam SVINVAL_SUPPORTED = 1; diff --git a/config/fpga/config.vh b/config/fpga/config.vh index d390453f4..9ed009439 100644 --- a/config/fpga/config.vh +++ b/config/fpga/config.vh @@ -48,6 +48,7 @@ localparam SSTC_SUPPORTED = 1; localparam ZICBOM_SUPPORTED = 1; localparam ZICBOZ_SUPPORTED = 1; localparam ZICBOP_SUPPORTED = 1; +localparam ZICCLSM_SUPPORTED = 0; localparam SVPBMT_SUPPORTED = 1; localparam SVNAPOT_SUPPORTED = 1; localparam SVINVAL_SUPPORTED = 1; diff --git a/config/rv32e/config.vh b/config/rv32e/config.vh index 6e1f53280..98d44eb18 100644 --- a/config/rv32e/config.vh +++ b/config/rv32e/config.vh @@ -47,6 +47,7 @@ localparam SSTC_SUPPORTED = 0; localparam ZICBOM_SUPPORTED = 0; localparam ZICBOZ_SUPPORTED = 0; localparam ZICBOP_SUPPORTED = 0; +localparam ZICCLSM_SUPPORTED = 0; localparam SVPBMT_SUPPORTED = 0; localparam SVNAPOT_SUPPORTED = 0; localparam SVINVAL_SUPPORTED = 0; diff --git a/config/rv32gc/config.vh b/config/rv32gc/config.vh index f9137bc4b..a0aacb38f 100644 --- a/config/rv32gc/config.vh +++ b/config/rv32gc/config.vh @@ -48,6 +48,7 @@ localparam SSTC_SUPPORTED = 1; localparam ZICBOM_SUPPORTED = 1; localparam ZICBOZ_SUPPORTED = 1; localparam ZICBOP_SUPPORTED = 0; +localparam ZICCLSM_SUPPORTED = 0; localparam SVPBMT_SUPPORTED = 0; localparam SVNAPOT_SUPPORTED = 0; localparam SVINVAL_SUPPORTED = 1; diff --git a/config/rv32i/config.vh b/config/rv32i/config.vh index aa7186761..9ae992e4a 100644 --- a/config/rv32i/config.vh +++ b/config/rv32i/config.vh @@ -47,6 +47,7 @@ localparam SSTC_SUPPORTED = 0; localparam ZICBOM_SUPPORTED = 0; 
localparam ZICBOZ_SUPPORTED = 0; localparam ZICBOP_SUPPORTED = 0; +localparam ZICCLSM_SUPPORTED = 0; localparam SVPBMT_SUPPORTED = 0; localparam SVNAPOT_SUPPORTED = 0; localparam SVINVAL_SUPPORTED = 0; diff --git a/config/rv32imc/config.vh b/config/rv32imc/config.vh index 76f78c4c4..ec5bc0e15 100644 --- a/config/rv32imc/config.vh +++ b/config/rv32imc/config.vh @@ -46,6 +46,7 @@ localparam SSTC_SUPPORTED = 0; localparam ZICBOM_SUPPORTED = 0; localparam ZICBOZ_SUPPORTED = 0; localparam ZICBOP_SUPPORTED = 0; +localparam ZICCLSM_SUPPORTED = 0; localparam SVPBMT_SUPPORTED = 0; localparam SVNAPOT_SUPPORTED = 0; localparam SVINVAL_SUPPORTED = 0; diff --git a/config/rv64fpquad/config.vh b/config/rv64fpquad/config.vh index d3a2227fe..0fffba91e 100644 --- a/config/rv64fpquad/config.vh +++ b/config/rv64fpquad/config.vh @@ -47,6 +47,7 @@ localparam SSTC_SUPPORTED = 0; localparam ZICBOM_SUPPORTED = 0; localparam ZICBOZ_SUPPORTED = 0; localparam ZICBOP_SUPPORTED = 0; +localparam ZICCLSM_SUPPORTED = 0; localparam SVPBMT_SUPPORTED = 0; localparam SVNAPOT_SUPPORTED = 0; localparam SVINVAL_SUPPORTED = 1; diff --git a/config/rv64gc/config.vh b/config/rv64gc/config.vh index 8decf60d5..af2402b4f 100644 --- a/config/rv64gc/config.vh +++ b/config/rv64gc/config.vh @@ -47,6 +47,7 @@ localparam SSTC_SUPPORTED = 1; localparam ZICBOM_SUPPORTED = 1; localparam ZICBOZ_SUPPORTED = 1; localparam ZICBOP_SUPPORTED = 1; +localparam ZICCLSM_SUPPORTED = 1; localparam SVPBMT_SUPPORTED = 1; localparam SVNAPOT_SUPPORTED = 1; localparam SVINVAL_SUPPORTED = 1; diff --git a/config/rv64i/config.vh b/config/rv64i/config.vh index e547dca6f..028d47c91 100644 --- a/config/rv64i/config.vh +++ b/config/rv64i/config.vh @@ -47,6 +47,7 @@ localparam SSTC_SUPPORTED = 0; localparam ZICBOM_SUPPORTED = 0; localparam ZICBOZ_SUPPORTED = 0; localparam ZICBOP_SUPPORTED = 0; +localparam ZICCLSM_SUPPORTED = 0; localparam SVPBMT_SUPPORTED = 0; localparam SVNAPOT_SUPPORTED = 0; localparam SVINVAL_SUPPORTED = 0; diff --git 
a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh index d04b35e56..dfb41ef9f 100644 --- a/config/shared/parameter-defs.vh +++ b/config/shared/parameter-defs.vh @@ -24,6 +24,7 @@ localparam cvw_t P = '{ ZICBOM_SUPPORTED : ZICBOM_SUPPORTED, ZICBOZ_SUPPORTED : ZICBOZ_SUPPORTED, ZICBOP_SUPPORTED : ZICBOP_SUPPORTED, + ZICCLSM_SUPPORTED : ZICCLSM_SUPPORTED, SVPBMT_SUPPORTED : SVPBMT_SUPPORTED, SVNAPOT_SUPPORTED : SVNAPOT_SUPPORTED, SVINVAL_SUPPORTED : SVINVAL_SUPPORTED, diff --git a/src/cache/cache.sv b/src/cache/cache.sv index 1714544ec..23fd6163e 100644 --- a/src/cache/cache.sv +++ b/src/cache/cache.sv @@ -175,10 +175,16 @@ module cache import cvw::*; #(parameter cvw_t P, logic [LINELEN/8-1:0] DemuxedByteMask, FetchBufferByteSel; // Adjust byte mask from word to cache line - onehotdecoder #(LOGCWPL) adrdec(.bin(PAdr[LOGCWPL+LOGLLENBYTES-1:LOGLLENBYTES]), .decoded(MemPAdrDecoded)); - for(index = 0; index < 2**LOGCWPL; index++) begin - assign DemuxedByteMask[(index+1)*(WORDLEN/8)-1:index*(WORDLEN/8)] = MemPAdrDecoded[index] ? ByteMask : '0; - end + + localparam CACHEMUXINVERALPERLINE = LINELEN/MUXINTERVAL;// Number of words in cache line + localparam LOGMIPL = $clog2(CACHEMUXINVERALPERLINE);// Log2 of ^ + + logic [LINELEN/8-1:0] BlankByteMask; + assign BlankByteMask[WORDLEN/8-1:0] = ByteMask; + assign BlankByteMask[LINELEN/8-1:WORDLEN/8] = '0; + + assign DemuxedByteMask = BlankByteMask << ((MUXINTERVAL/8) * WordOffsetAddr); + assign FetchBufferByteSel = SetValid & ~SetDirty ? '1 : ~DemuxedByteMask; // If load miss set all muxes to 1. 
// Merge write data into fetched cache line for store miss diff --git a/src/cvw.sv b/src/cvw.sv index 4cbf67b28..198042913 100644 --- a/src/cvw.sv +++ b/src/cvw.sv @@ -59,6 +59,7 @@ typedef struct packed { logic ZICBOM_SUPPORTED; logic ZICBOZ_SUPPORTED; logic ZICBOP_SUPPORTED; + logic ZICCLSM_SUPPORTED; logic SVPBMT_SUPPORTED; logic SVNAPOT_SUPPORTED; logic SVINVAL_SUPPORTED; diff --git a/src/ebu/ahbcacheinterface.sv b/src/ebu/ahbcacheinterface.sv index 9c2ff3a89..054022106 100644 --- a/src/ebu/ahbcacheinterface.sv +++ b/src/ebu/ahbcacheinterface.sv @@ -113,7 +113,7 @@ module ahbcacheinterface #( // *** bummer need a second byte mask for bus as it is AHBW rather than LLEN. // probably can merge by muxing PAdrM's LLEN/8-1 index bit based on HTRANS being != 0. - swbytemask #(AHBW) busswbytemask(.Size(HSIZE), .Adr(HADDR[$clog2(AHBW/8)-1:0]), .ByteMask(BusByteMaskM)); + swbytemask #(AHBW) busswbytemask(.Size(HSIZE), .Adr(HADDR[$clog2(AHBW/8)-1:0]), .ByteMask(BusByteMaskM), .ByteMaskExtended()); flopen #(AHBW/8) HWSTRBReg(HCLK, HREADY, BusByteMaskM[AHBW/8-1:0], HWSTRB); diff --git a/src/lsu/align.sv b/src/lsu/align.sv new file mode 100644 index 000000000..a54474b07 --- /dev/null +++ b/src/lsu/align.sv @@ -0,0 +1,195 @@ +/////////////////////////////////////////// +// align.sv +// +// Written: Rose Thompson ross1728@gmail.com +// Created: 26 October 2023 +// Modified: 26 October 2023 +// +// Purpose: This module implements native alignment support for the Zicclsm extension +// It is similar to the IFU's spill module and probably could be merged together with +// some effort. +// +// Documentation: RISC-V System on Chip Design Chapter 11 (Figure 11.5) +// +// A component of the CORE-V-WALLY configurable RISC-V project. 
+// +// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University +// +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file +// except in compliance with the License, or, at your option, the Apache License version 2.0. You +// may obtain a copy of the License at +// +// https://solderpad.org/licenses/SHL-2.1/ +// +// Unless required by applicable law or agreed to in writing, any work distributed under the +// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, +// either express or implied. See the License for the specific language governing permissions +// and limitations under the License. +//////////////////////////////////////////////////////////////////////////////////////////////// + +module align import cvw::*; #(parameter cvw_t P) ( + input logic clk, + input logic reset, + input logic StallM, FlushM, + input logic [P.XLEN-1:0] IEUAdrM, // Memory stage address of the memory operation + input logic [P.XLEN-1:0] IEUAdrE, // The next IEUAdrM + input logic [2:0] Funct3M, // Size of memory operation + input logic [1:0] MemRWM, + input logic CacheableM, + input logic [P.LLEN*2-1:0]DCacheReadDataWordM, // D$ read data; 2*LLEN wide because the cache word width is doubled for misaligned access + input logic CacheBusHPWTStall, // Cache, bus, or HPTW is requesting a stall. 
Transition to the second access of the spill after the first completes + input logic DTLBMissM, // DTLB miss, ignore memory request + input logic DataUpdateDAM, // DTLB hit needs to update dirty or access bits, ignore memory request + + input logic [(P.LLEN-1)/8:0] ByteMaskM, + input logic [(P.LLEN-1)/8:0] ByteMaskExtendedM, + input logic [P.LLEN-1:0] LSUWriteDataM, + + output logic [(P.LLEN*2-1)/8:0] ByteMaskSpillM, + output logic [P.LLEN*2-1:0] LSUWriteDataSpillM, + + output logic [P.XLEN-1:0] IEUAdrSpillE, // The next IEUAdrM for one of the two memory addresses of the spill + output logic [P.XLEN-1:0] IEUAdrSpillM, // IEUAdrM for one of the two memory addresses of the spill + output logic SelSpillE, // During the transition between the two spill operations, the LSU should stall the pipeline + output logic [1:0] MemRWSpillM, + output logic SelStoreDelay, //*** this is bad. really don't like moving this outside + output logic [P.LLEN-1:0] DCacheReadDataWordSpillM, // The final LLEN-bit read data after merging the two spilled accesses and shifting into alignment + output logic SpillStallM); + + // Spill threshold occurs when all the cache offset PC bits are 1 (except [0]). 
Without a cache this is just PCF[1] + typedef enum logic [1:0] {STATE_READY, STATE_SPILL, STATE_STORE_DELAY} statetype; + + statetype CurrState, NextState; + logic TakeSpillM; + logic SpillM; + logic SelSpillM; + logic SpillSaveM; + logic [P.LLEN-1:0] ReadDataWordFirstHalfM; + logic MisalignedM; + logic [P.LLEN*2-1:0] ReadDataWordSpillAllM; + logic [P.LLEN*2-1:0] ReadDataWordSpillShiftedM; + + localparam LLENINBYTES = P.LLEN/8; + logic [P.XLEN-1:0] IEUAdrIncrementM; + + logic [(P.LLEN-1)*2/8:0] ByteMaskSaveM; + logic [(P.LLEN-1)*2/8:0] ByteMaskMuxM; + logic SaveByteMask; + + /* verilator lint_off WIDTHEXPAND */ + assign IEUAdrIncrementM = IEUAdrM + LLENINBYTES; + /* verilator lint_on WIDTHEXPAND */ + mux2 #(P.XLEN) ieuadrspillemux(.d0(IEUAdrE), .d1(IEUAdrIncrementM), .s(SelSpillE), .y(IEUAdrSpillE)); + mux2 #(P.XLEN) ieuadrspillmmux(.d0(IEUAdrM), .d1(IEUAdrIncrementM), .s(SelSpillM), .y(IEUAdrSpillM)); + + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Detect spill + //////////////////////////////////////////////////////////////////////////////////////////////////// + + // spill detection in lsu is more complex than ifu, depends on 3 factors + // 1) operation size + // 2) offset + // 3) access location within the cacheline + localparam OFFSET_BIT_POS = $clog2(P.DCACHE_LINELENINBITS/8); + logic [OFFSET_BIT_POS-1:$clog2(LLENINBYTES)] WordOffsetM; + logic [$clog2(LLENINBYTES)-1:0] ByteOffsetM; + logic HalfSpillM, WordSpillM; + logic [$clog2(LLENINBYTES)-1:0] AccessByteOffsetM; + + assign {WordOffsetM, ByteOffsetM} = IEUAdrM[OFFSET_BIT_POS-1:0]; + + always_comb begin + case (Funct3M[1:0]) + 2'b00: AccessByteOffsetM = '0; // byte access + 2'b01: AccessByteOffsetM = {2'b00, ByteOffsetM[0]}; // half access + 2'b10: AccessByteOffsetM = {1'b0, ByteOffsetM[1:0]}; // word access + 2'b11: AccessByteOffsetM = ByteOffsetM; // double access + default: AccessByteOffsetM = ByteOffsetM; + endcase + end + + assign HalfSpillM 
= (IEUAdrM[OFFSET_BIT_POS-1:1] == '1) & (ByteOffsetM[0] != '0) & Funct3M[1:0] == 2'b01; + assign WordSpillM = (IEUAdrM[OFFSET_BIT_POS-1:2] == '1) & (ByteOffsetM[1:0] != '0) & Funct3M[1:0] == 2'b10; + if(P.LLEN == 64) begin + logic DoubleSpillM; + assign DoubleSpillM = (IEUAdrM[OFFSET_BIT_POS-1:3] == '1) & (ByteOffsetM[2:0] != '0) & Funct3M[1:0] == 2'b11; + assign SpillM = (|MemRWM) & CacheableM & (HalfSpillM | WordSpillM | DoubleSpillM); + end else begin + assign SpillM = (|MemRWM) & CacheableM & (HalfSpillM | WordSpillM); + end + + // Don't take the spill if there is a stall, TLB miss, or hardware update to the D/A bits + assign TakeSpillM = SpillM & ~CacheBusHPWTStall & ~(DTLBMissM | (P.SVADU_SUPPORTED & DataUpdateDAM)); + + always_ff @(posedge clk) + if (reset | FlushM) CurrState <= #1 STATE_READY; + else CurrState <= #1 NextState; + + always_comb begin + case (CurrState) + STATE_READY: if (TakeSpillM & ~MemRWM[0]) NextState = STATE_SPILL; + else if(TakeSpillM & MemRWM[0])NextState = STATE_STORE_DELAY; + else NextState = STATE_READY; + STATE_SPILL: if(StallM) NextState = STATE_SPILL; + else NextState = STATE_READY; + STATE_STORE_DELAY: NextState = STATE_SPILL; + default: NextState = STATE_READY; + endcase + end + + assign SelSpillM = (CurrState == STATE_SPILL | CurrState == STATE_STORE_DELAY); + assign SelSpillE = (CurrState == STATE_READY & TakeSpillM) | (CurrState == STATE_SPILL & CacheBusHPWTStall) | (CurrState == STATE_STORE_DELAY); + assign SaveByteMask = (CurrState == STATE_READY & TakeSpillM); + assign SpillSaveM = (CurrState == STATE_READY) & TakeSpillM & ~FlushM; + assign SelStoreDelay = (CurrState == STATE_STORE_DELAY); // *** Can this be merged into the PreLSURWM logic? 
+ assign SpillStallM = SelSpillE | CurrState == STATE_STORE_DELAY; + mux2 #(2) memrwmux(MemRWM, 2'b00, SelStoreDelay, MemRWSpillM); + + //////////////////////////////////////////////////////////////////////////////////////////////////// + // Merge spilled data + //////////////////////////////////////////////////////////////////////////////////////////////////// + + // save the low LLEN bits read by the first access of the spill + flopenr #(P.LLEN) SpillDataReg(clk, reset, SpillSaveM, DCacheReadDataWordM[P.LLEN-1:0], ReadDataWordFirstHalfM); + + // merge together: saved first access in the low half, second access in the high half + mux2 #(2*P.LLEN) postspillmux(DCacheReadDataWordM, {DCacheReadDataWordM[P.LLEN-1:0], ReadDataWordFirstHalfM}, SpillM, ReadDataWordSpillAllM); + + // align by shifting + // *** optimize by merging with halfSpill, WordSpill, etc + logic HalfMisalignedM, WordMisalignedM; + assign HalfMisalignedM = Funct3M[1:0] == 2'b01 & ByteOffsetM[0] != 1'b0; + assign WordMisalignedM = Funct3M[1:0] == 2'b10 & ByteOffsetM[1:0] != 2'b00; + if(P.LLEN == 64) begin + logic DoubleMisalignedM; + assign DoubleMisalignedM = Funct3M[1:0] == 2'b11 & ByteOffsetM[2:0] != 3'b00; + assign MisalignedM = HalfMisalignedM | WordMisalignedM | DoubleMisalignedM; + end else begin + assign MisalignedM = HalfMisalignedM | WordMisalignedM; + end + + // shifter (4:1 mux for 32 bit, 8:1 mux for 64 bit) + // 8 * is for shifting by bytes not bits + assign ReadDataWordSpillShiftedM = ReadDataWordSpillAllM >> (MisalignedM ? 8 * AccessByteOffsetM : '0); + assign DCacheReadDataWordSpillM = ReadDataWordSpillShiftedM[P.LLEN-1:0]; + + // write path. Also has the 8:1 shifter muxing for the byteoffset + // then it also has the mux to select when a spill occurs + logic [P.LLEN*2-1:0] LSUWriteDataShiftedM; + logic [P.LLEN*3-1:0] LSUWriteDataShiftedExtM; // *** RT: Find a better way. I'm extending in both directions so we don't shift in zeros. The cache expects the writedata to not have any zero data, but instead replicated data. 
+ + assign LSUWriteDataShiftedExtM = {LSUWriteDataM, LSUWriteDataM, LSUWriteDataM} << (MisalignedM ? 8 * AccessByteOffsetM : '0); + assign LSUWriteDataShiftedM = LSUWriteDataShiftedExtM[P.LLEN*3-1:P.LLEN]; + assign LSUWriteDataSpillM = LSUWriteDataShiftedM; + //mux2 #(2*P.LLEN) writedataspillmux(LSUWriteDataShiftedM, {LSUWriteDataShiftedM[P.LLEN*2-1:P.LLEN], LSUWriteDataShiftedM[P.LLEN*2-1:P.LLEN]}, SelSpillM, LSUWriteDataSpillM); + + logic [P.LLEN*2/8-1:0] ByteMaskShiftedM; + assign ByteMaskShiftedM = ByteMaskMuxM; + mux3 #(2*P.LLEN/8) bytemaskspillmux(ByteMaskShiftedM, {{{P.LLEN/8}{1'b0}}, ByteMaskM}, + {{{P.LLEN/8}{1'b0}}, ByteMaskMuxM[P.LLEN*2/8-1:P.LLEN/8]}, {SelSpillM, SelSpillE}, ByteMaskSpillM); + + flopenr #(P.LLEN*2/8) bytemaskreg(clk, reset, SaveByteMask, {ByteMaskExtendedM, ByteMaskM}, ByteMaskSaveM); + mux2 #(P.LLEN*2/8) bytemasksavemux({ByteMaskExtendedM, ByteMaskM}, ByteMaskSaveM, SelSpillM, ByteMaskMuxM); +endmodule diff --git a/src/lsu/lsu.sv b/src/lsu/lsu.sv index 191599f12..ba7d8e119 100644 --- a/src/lsu/lsu.sv +++ b/src/lsu/lsu.sv @@ -92,6 +92,7 @@ module lsu import cvw::*; #(parameter cvw_t P) ( input var logic [7:0] PMPCFG_ARRAY_REGW[P.PMP_ENTRIES-1:0], // PMP configuration from privileged unit input var logic [P.PA_BITS-3:0] PMPADDR_ARRAY_REGW[P.PMP_ENTRIES-1:0] // PMP address from privileged unit ); + localparam MISALIGN_SUPPORT = P.ZICCLSM_SUPPORTED & P.DCACHE_SUPPORTED; logic [P.XLEN+1:0] IEUAdrExtM; // Memory stage address zero-extended to PA_BITS or XLEN whichever is longer logic [P.XLEN+1:0] IEUAdrExtE; // Execution stage address zero-extended to PA_BITS or XLEN whichever is longer @@ -108,13 +109,20 @@ module lsu import cvw::*; #(parameter cvw_t P) ( logic BusStall; // Bus interface busy with multicycle operation logic HPTWStall; // HPTW busy with multicycle operation + logic CacheBusHPWTStall; // Cache, bus, or hptw is requesting a stall + logic SelSpillE; // Align logic detected a spill and needs to stall logic CacheableM; // PMA 
indicates memory address is cacheable logic BusCommittedM; // Bus memory operation in flight, delay interrupts logic DCacheCommittedM; // D$ memory operation started, delay interrupts logic [P.LLEN-1:0] DTIMReadDataWordM; // DTIM read data - logic [P.LLEN-1:0] DCacheReadDataWordM; // D$ read data + /* verilator lint_off WIDTHEXPAND */ + logic [(MISALIGN_SUPPORT+1)*P.LLEN-1:0] DCacheReadDataWordM; // D$ read data + logic [(MISALIGN_SUPPORT+1)*P.LLEN-1:0] LSUWriteDataSpillM; // Final write data + logic [((MISALIGN_SUPPORT+1)*P.LLEN-1)/8:0] ByteMaskSpillM; // Selects which bytes within a word to write + /* verilator lint_on WIDTHEXPAND */ + logic [P.LLEN-1:0] DCacheReadDataWordSpillM; // D$ read data logic [P.LLEN-1:0] ReadDataWordMuxM; // DTIM or D$ read data logic [P.LLEN-1:0] LittleEndianReadDataWordM; // Endian-swapped read data logic [P.LLEN-1:0] ReadDataWordM; // Read data before subword selection @@ -126,7 +134,11 @@ module lsu import cvw::*; #(parameter cvw_t P) ( logic [P.LLEN-1:0] LittleEndianWriteDataM; // Ending-swapped write data logic [P.LLEN-1:0] LSUWriteDataM; // Final write data logic [(P.LLEN-1)/8:0] ByteMaskM; // Selects which bytes within a word to write - + logic [(P.LLEN-1)/8:0] ByteMaskExtendedM; // Selects which bytes within a word to write + logic [1:0] MemRWSpillM; + logic SpillStallM; + logic SelStoreDelay; + logic DTLBMissM; // DTLB miss causes HPTW walk logic DTLBWriteM; // Writes PTE and PageType to DTLB logic DataUpdateDAM; // DTLB hit needs to update dirty or access bits @@ -142,8 +154,26 @@ module lsu import cvw::*; #(parameter cvw_t P) ( ///////////////////////////////////////////////////////////////////////////////////////////// flopenrc #(P.XLEN) AddressMReg(clk, reset, FlushM, ~StallM, IEUAdrE, IEUAdrM); - assign IEUAdrExtM = {2'b00, IEUAdrM}; - assign IEUAdrExtE = {2'b00, IEUAdrE}; + if(MISALIGN_SUPPORT) begin : ziccslm_align + logic [P.XLEN-1:0] IEUAdrSpillE, IEUAdrSpillM; + align #(P) align(.clk, .reset, .StallM, .FlushM, 
.IEUAdrE, .IEUAdrM, .Funct3M, + .MemRWM, .CacheableM, + .DCacheReadDataWordM, .CacheBusHPWTStall, .DTLBMissM, .DataUpdateDAM, + .ByteMaskM, .ByteMaskExtendedM, .LSUWriteDataM, .ByteMaskSpillM, .LSUWriteDataSpillM, + .IEUAdrSpillE, .IEUAdrSpillM, .SelSpillE, .MemRWSpillM, .DCacheReadDataWordSpillM, .SpillStallM, + .SelStoreDelay); + assign IEUAdrExtM = {2'b00, IEUAdrSpillM}; + assign IEUAdrExtE = {2'b00, IEUAdrSpillE}; + end else begin : no_ziccslm_align + assign IEUAdrExtM = {2'b00, IEUAdrM}; + assign IEUAdrExtE = {2'b00, IEUAdrE}; + assign SelSpillE = '0; + assign DCacheReadDataWordSpillM = DCacheReadDataWordM; + assign ByteMaskSpillM = ByteMaskM; + assign LSUWriteDataSpillM = LSUWriteDataM; + assign MemRWSpillM = MemRWM; + assign {SpillStallM, SelStoreDelay} = '0; + end ///////////////////////////////////////////////////////////////////////////////////////////// // HPTW (only needed if VM supported) @@ -180,7 +210,8 @@ module lsu import cvw::*; #(parameter cvw_t P) ( // the trap module. assign CommittedM = SelHPTW | DCacheCommittedM | BusCommittedM; assign GatedStallW = StallW & ~SelHPTW; - assign LSUStallM = DCacheStallM | HPTWStall | BusStall; + assign CacheBusHPWTStall = DCacheStallM | HPTWStall | BusStall; + assign LSUStallM = CacheBusHPWTStall | SpillStallM; ///////////////////////////////////////////////////////////////////////////////////////////// // MMU and misalignment fault logic required if privileged unit exists @@ -234,9 +265,10 @@ module lsu import cvw::*; #(parameter cvw_t P) ( assign DTIMMemRWM = SelDTIM & ~IgnoreRequestTLB ? LSURWM : '0; // **** fix ReadDataWordM to be LLEN. ByteMask is wrong length. // **** create config to support DTIM with floating point. 
+ // Add support for cboz dtim #(P) dtim(.clk, .ce(~GatedStallW), .MemRWM(DTIMMemRWM), .DTIMAdr, .FlushW, .WriteDataM(LSUWriteDataM), - .ReadDataWordM(DTIMReadDataWordM[P.LLEN-1:0]), .ByteMaskM(ByteMaskM[P.LLEN/8-1:0])); + .ReadDataWordM(DTIMReadDataWordM[P.LLEN-1:0]), .ByteMaskM(ByteMaskM)); end else begin end if (P.BUS_SUPPORTED) begin : bus @@ -247,6 +279,7 @@ module lsu import cvw::*; #(parameter cvw_t P) ( localparam AHBWLOGBWPL = $clog2(BEATSPERLINE); // Log2 of ^ localparam LINELEN = P.DCACHE_LINELENINBITS; // Number of bits in cacheline localparam LLENPOVERAHBW = P.LLEN / P.AHBW; // Number of AHB beats in a LLEN word. AHBW cannot be larger than LLEN. (implementation limitation) + localparam CACHEWORDLEN = P.ZICCLSM_SUPPORTED ? 2*P.LLEN : P.LLEN; // Width of the cache's input and output data buses. Misaligned doubles width for fast access logic [LINELEN-1:0] FetchBuffer; // Temporary buffer to hold partially fetched cacheline logic [P.PA_BITS-1:0] DCacheBusAdr; // Cacheline address to fetch or writeback. @@ -268,14 +301,12 @@ module lsu import cvw::*; #(parameter cvw_t P) ( assign CacheAtomicM = CacheableM & ~SelDTIM ? 
LSUAtomicM : '0; assign FlushDCache = FlushDCacheM & ~(SelHPTW); - // *** need RT to add support for CMOpM and LSUPrefetchM (DH 7/2/23) - // *** prefetch can just act as a read operation cache #(.P(P), .PA_BITS(P.PA_BITS), .XLEN(P.XLEN), .LINELEN(P.DCACHE_LINELENINBITS), .NUMLINES(P.DCACHE_WAYSIZEINBYTES*8/LINELEN), - .NUMWAYS(P.DCACHE_NUMWAYS), .LOGBWPL(LLENLOGBWPL), .WORDLEN(P.LLEN), .MUXINTERVAL(P.LLEN), .READ_ONLY_CACHE(0)) dcache( - .clk, .reset, .Stall(GatedStallW), .SelBusBeat, .FlushStage(FlushW | IgnoreRequestTLB), .CacheRW(CacheRWM), .CacheAtomic(CacheAtomicM), - .FlushCache(FlushDCache), .NextSet(IEUAdrE[11:0]), .PAdr(PAdrM), - .ByteMask(ByteMaskM), .BeatCount(BeatCount[AHBWLOGBWPL-1:AHBWLOGBWPL-LLENLOGBWPL]), - .CacheWriteData(LSUWriteDataM), .SelHPTW, + .NUMWAYS(P.DCACHE_NUMWAYS), .LOGBWPL(LLENLOGBWPL), .WORDLEN(CACHEWORDLEN), .MUXINTERVAL(P.LLEN), .READ_ONLY_CACHE(0)) dcache( + .clk, .reset, .Stall(GatedStallW & ~SelSpillE), .SelBusBeat, .FlushStage(FlushW | IgnoreRequestTLB), .CacheRW(SelStoreDelay ? 
2'b00 : CacheRWM), .CacheAtomic(CacheAtomicM), + .FlushCache(FlushDCache), .NextSet(IEUAdrExtE[11:0]), .PAdr(PAdrM), + .ByteMask(ByteMaskSpillM), .BeatCount(BeatCount[AHBWLOGBWPL-1:AHBWLOGBWPL-LLENLOGBWPL]), + .CacheWriteData(LSUWriteDataSpillM), .SelHPTW, .CacheStall, .CacheMiss(DCacheMiss), .CacheAccess(DCacheAccess), .CacheCommitted(DCacheCommittedM), .CacheBusAdr(DCacheBusAdr), .ReadDataWord(DCacheReadDataWordM), @@ -285,11 +316,12 @@ module lsu import cvw::*; #(parameter cvw_t P) ( assign DCacheStallM = CacheStall & ~IgnoreRequestTLB; assign CacheBusRW = CacheBusRWTemp; + // *** add support for cboz ahbcacheinterface #(.AHBW(P.AHBW), .LLEN(P.LLEN), .PA_BITS(P.PA_BITS), .BEATSPERLINE(BEATSPERLINE), .AHBWLOGBWPL(AHBWLOGBWPL), .LINELEN(LINELEN), .LLENPOVERAHBW(LLENPOVERAHBW), .READ_ONLY_CACHE(0)) ahbcacheinterface( .HCLK(clk), .HRESETn(~reset), .Flush(FlushW | IgnoreRequestTLB), .HRDATA, .HWDATA(LSUHWDATA), .HWSTRB(LSUHWSTRB), .HSIZE(LSUHSIZE), .HBURST(LSUHBURST), .HTRANS(LSUHTRANS), .HWRITE(LSUHWRITE), .HREADY(LSUHREADY), - .BeatCount, .SelBusBeat, .CacheReadDataWordM(DCacheReadDataWordM), .WriteDataM(LSUWriteDataM), + .BeatCount, .SelBusBeat, .CacheReadDataWordM(DCacheReadDataWordM[P.LLEN-1:0]), .WriteDataM(LSUWriteDataM), .Funct3(LSUFunct3M), .HADDR(LSUHADDR), .CacheBusAdr(DCacheBusAdr), .CacheBusRW, .CacheableOrFlushCacheM, .CacheBusAck(DCacheBusAck), .FetchBuffer, .PAdr(PAdrM), .Cacheable(CacheableOrFlushCacheM), .BusRW, .Stall(GatedStallW), @@ -299,7 +331,7 @@ module lsu import cvw::*; #(parameter cvw_t P) ( // Uncache bus access may be smaller width than LLEN. Duplicate LLENPOVERAHBW times. // *** DTIMReadDataWordM should be increased to LLEN. // pma should generate exception for LLEN read to periph. 
- mux3 #(P.LLEN) UnCachedDataMux(.d0(DCacheReadDataWordM), .d1({LLENPOVERAHBW{FetchBuffer[P.XLEN-1:0]}}), + mux3 #(P.LLEN) UnCachedDataMux(.d0(DCacheReadDataWordSpillM), .d1({LLENPOVERAHBW{FetchBuffer[P.XLEN-1:0]}}), .d2({{P.LLEN-P.XLEN{1'b0}}, DTIMReadDataWordM[P.XLEN-1:0]}), .s({SelDTIM, ~(CacheableOrFlushCacheM)}), .y(ReadDataWordMuxM)); end else begin : passthrough // No Cache, use simple ahbinterface instad of ahbcacheinterface @@ -312,7 +344,7 @@ module lsu import cvw::*; #(parameter cvw_t P) ( ahbinterface #(P.XLEN, 1) ahbinterface(.HCLK(clk), .HRESETn(~reset), .Flush(FlushW), .HREADY(LSUHREADY), .HRDATA(HRDATA), .HTRANS(LSUHTRANS), .HWRITE(LSUHWRITE), .HWDATA(LSUHWDATA), - .HWSTRB(LSUHWSTRB), .BusRW, .ByteMask(ByteMaskM[P.XLEN/8-1:0]), .WriteData(LSUWriteDataM[P.XLEN-1:0]), + .HWSTRB(LSUHWSTRB), .BusRW, .ByteMask(ByteMaskM), .WriteData(LSUWriteDataM[P.XLEN-1:0]), .Stall(GatedStallW), .BusStall, .BusCommitted(BusCommittedM), .FetchBuffer(FetchBuffer)); // Mux between the 2 sources of read data, 0: Bus, 1: DTIM @@ -354,7 +386,7 @@ module lsu import cvw::*; #(parameter cvw_t P) ( subwordwrite #(P.LLEN) subwordwrite(.LSUFunct3M, .IMAFWriteDataM, .LittleEndianWriteDataM); // Compute byte masks - swbytemask #(P.LLEN) swbytemask(.Size(LSUFunct3M), .Adr(PAdrM[$clog2(P.LLEN/8)-1:0]), .ByteMask(ByteMaskM)); + swbytemask #(P.LLEN, P.ZICCLSM_SUPPORTED) swbytemask(.Size(LSUFunct3M), .Adr(PAdrM[$clog2(P.LLEN/8)-1:0]), .ByteMask(ByteMaskM), .ByteMaskExtended(ByteMaskExtendedM)); ///////////////////////////////////////////////////////////////////////////////////////////// // MW Pipeline Register diff --git a/src/lsu/swbytemask.sv b/src/lsu/swbytemask.sv index d8c4ed167..5737bdc9b 100644 --- a/src/lsu/swbytemask.sv +++ b/src/lsu/swbytemask.sv @@ -27,13 +27,22 @@ // and limitations under the License. 
//////////////////////////////////////////////////////////////////////////////////////////////// -module swbytemask #(parameter WORDLEN)( +module swbytemask #(parameter WORDLEN, EXTEND = 0)( input logic [2:0] Size, input logic [$clog2(WORDLEN/8)-1:0] Adr, - output logic [WORDLEN/8-1:0] ByteMask + output logic [WORDLEN/8-1:0] ByteMask, + output logic [WORDLEN/8-1:0] ByteMaskExtended ); - - assign ByteMask =(('d2**('d2**Size))-'d1) << Adr; // 'd2 means 2, but stops Design Compiler from complaining about signed to unsigned conversion + if(EXTEND) begin + logic [WORDLEN*2/8-1:0] ExtendedByteMask; + // 'd2 means 2, but stops Design Compiler from complaining about signed to unsigned conversion + assign ExtendedByteMask = (('d2**('d2**Size))-'d1) << Adr; + assign ByteMask = ExtendedByteMask[WORDLEN/8-1:0]; + assign ByteMaskExtended = ExtendedByteMask[WORDLEN*2/8-1:WORDLEN/8]; + end else begin + assign ByteMask = (('d2**('d2**Size))-'d1) << Adr; + assign ByteMaskExtended = '0; + end /* Equivalent to the following diff --git a/src/mmu/mmu.sv b/src/mmu/mmu.sv index 32fed853d..a497b6da7 100644 --- a/src/mmu/mmu.sv +++ b/src/mmu/mmu.sv @@ -138,8 +138,8 @@ module mmu import cvw::*; #(parameter cvw_t P, 2'b10: DataMisalignedM = VAdr[1] | VAdr[0]; // lw, sw, flw, fsw, lwu 2'b11: DataMisalignedM = |VAdr[2:0]; // ld, sd, fld, fsd endcase - assign LoadMisalignedFaultM = DataMisalignedM & ReadNoAmoAccessM; - assign StoreAmoMisalignedFaultM = DataMisalignedM & WriteAccessM; + assign LoadMisalignedFaultM = DataMisalignedM & ReadNoAmoAccessM & ~(P.ZICCLSM_SUPPORTED & Cacheable); + assign StoreAmoMisalignedFaultM = DataMisalignedM & WriteAccessM & ~(P.ZICCLSM_SUPPORTED & Cacheable); // Specify which type of page fault is occurring assign InstrPageFaultF = TLBPageFault & ExecuteAccessF; diff --git a/testbench/tests.vh b/testbench/tests.vh index 0a1607a16..8ebc98730 100644 --- a/testbench/tests.vh +++ b/testbench/tests.vh @@ -1971,6 +1971,7 @@ string arch64zbs[] = '{ string wally64priv[] = 
'{ `WALLYTEST, "rv64i_m/privilege/src/WALLY-minfo-01.S", + "rv64i_m/privilege/src/WALLY-misaligned-access-01.S", "rv64i_m/privilege/src/WALLY-csr-permission-s-01.S", "rv64i_m/privilege/src/WALLY-cboz-01.S", "rv64i_m/privilege/src/WALLY-cbom-01.S", diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/Makefrag b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/Makefrag index 5758ecc33..19bb5bd01 100644 --- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/Makefrag +++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/Makefrag @@ -28,11 +28,11 @@ # Description: Makefrag for RV64I architectural tests rv64i_sc_tests = \ - WALLY-ADD \ + WALLY-ADD \ WALLY-SUB \ WALLY-SLT \ - WALLY-SLTU \ - WALLY-XOR + WALLY-SLTU \ + WALLY-XOR \ rv64i_tests = $(addsuffix .elf, $(rv64i_sc_tests)) diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/Makefrag b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/Makefrag index 2e3521920..a640e0c06 100644 --- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/Makefrag +++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/Makefrag @@ -60,6 +60,7 @@ target_tests_nosim = \ WALLY-wfi-01 \ WALLY-cbom-01 \ WALLY-cboz-01 \ + WALLY-misaligned-access-01 \ # unclear why status-fp-enabled and wfi aren't simulating ok diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-misaligned-access-01.reference_output b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-misaligned-access-01.reference_output new file mode 100644 index 000000000..209eb4cf4 --- /dev/null +++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-misaligned-access-01.reference_output @@ -0,0 +1,560 @@ +03020100 # ByteDstData +07060504 +0b0a0908 +0f0e0d0c +13021110 +17161514 +1b1a1918 +1f1e1d1c +23222120 +27262524 +2b2a2928 +2f2e2d2c +33023130 +37363534 +3b3a3938 +3f3e3d3c +43424140 +47464544 
+4b4a4948 +4f4e4d4c +53025150 +57565554 +5b5a5958 +5f5e5d5c +63626160 +67666564 +6b6a6968 +6f6e6d6c +73027170 +77767574 +7b7a7978 +7f7e7d7c +03020100 # Half0DstData +07060504 +0b0a0908 +0f0e0d0c +13021110 +17161514 +1b1a1918 +1f1e1d1c +23222120 +27262524 +2b2a2928 +2f2e2d2c +33023130 +37363534 +3b3a3938 +3f3e3d3c +43424140 +47464544 +4b4a4948 +4f4e4d4c +53025150 +57565554 +5b5a5958 +5f5e5d5c +63626160 +67666564 +6b6a6968 +6f6e6d6c +73027170 +77767574 +7b7a7978 +7f7e7d7c +020100ef # Half1DstData +06050403 +0a090807 +0e0d0c0b +0211100f +16151413 +1a191817 +1e1d1c1b +2221201f +26252423 +2a292827 +2e2d2c2b +0231302f +36353433 +3a393837 +3e3d3c3b +4241403f +46454443 +4a494847 +4e4d4c4b +0251504f +56555453 +5a595857 +5e5d5c5b +6261605f +66656463 +6a696867 +6e6d6c6b +0271706f +76757473 +7a797877 +7e7d7c7b +deadbe7f +deadbeef +03020100 # Word0DstData +07060504 +0b0a0908 +0f0e0d0c +13021110 +17161514 +1b1a1918 +1f1e1d1c +23222120 +27262524 +2b2a2928 +2f2e2d2c +33023130 +37363534 +3b3a3938 +3f3e3d3c +43424140 +47464544 +4b4a4948 +4f4e4d4c +53025150 +57565554 +5b5a5958 +5f5e5d5c +63626160 +67666564 +6b6a6968 +6f6e6d6c +73027170 +77767574 +7b7a7978 +7f7e7d7c +020100ef # Word1DstData +06050403 +0a090807 +0e0d0c0b +0211100f +16151413 +1a191817 +1e1d1c1b +2221201f +26252423 +2a292827 +2e2d2c2b +0231302f +36353433 +3a393837 +3e3d3c3b +4241403f +46454443 +4a494847 +4e4d4c4b +0251504f +56555453 +5a595857 +5e5d5c5b +6261605f +66656463 +6a696867 +6e6d6c6b +0271706f +76757473 +7a797877 +7e7d7c7b +deadbe7f +deadbeef +0100beef # Word2DstData +05040302 +09080706 +0d0c0b0a +11100f0e +15141302 +19181716 +1d1c1b1a +21201f1e +25242322 +29282726 +2d2c2b2a +31302f2e +35343302 +39383736 +3d3c3b3a +41403f3e +45444342 +49484746 +4d4c4b4a +51504f4e +55545302 +59585756 +5d5c5b5a +61605f5e +65646362 +69686766 +6d6c6b6a +71706f6e +75747302 +79787776 +7d7c7b7a +dead7f7e +deadbeef +00adbeef # Word3DstData +04030201 +08070605 +0c0b0a09 +100f0e0d +14130211 +18171615 +1c1b1a19 +201f1e1d +24232221 +28272625 
+2c2b2a29 +302f2e2d +34330231 +38373635 +3c3b3a39 +403f3e3d +44434241 +48474645 +4c4b4a49 +504f4e4d +54530251 +58575655 +5c5b5a59 +605f5e5d +64636261 +68676665 +6c6b6a69 +706f6e6d +74730271 +78777675 +7c7b7a79 +de7f7e7d +deadbeef +03020100 # Double0DstData +07060504 +0b0a0908 +0f0e0d0c +13021110 +17161514 +1b1a1918 +1f1e1d1c +23222120 +27262524 +2b2a2928 +2f2e2d2c +33023130 +37363534 +3b3a3938 +3f3e3d3c +43424140 +47464544 +4b4a4948 +4f4e4d4c +53025150 +57565554 +5b5a5958 +5f5e5d5c +63626160 +67666564 +6b6a6968 +6f6e6d6c +73027170 +77767574 +7b7a7978 +7f7e7d7c +020100ef # Double1DstData +06050403 +0a090807 +0e0d0c0b +0211100f +16151413 +1a191817 +1e1d1c1b +2221201f +26252423 +2a292827 +2e2d2c2b +0231302f +36353433 +3a393837 +3e3d3c3b +4241403f +46454443 +4a494847 +4e4d4c4b +0251504f +56555453 +5a595857 +5e5d5c5b +6261605f +66656463 +6a696867 +6e6d6c6b +0271706f +76757473 +7a797877 +7e7d7c7b +deadbe7f +deadbeef +0100beef # Double2DstData +05040302 +09080706 +0d0c0b0a +11100f0e +15141302 +19181716 +1d1c1b1a +21201f1e +25242322 +29282726 +2d2c2b2a +31302f2e +35343302 +39383736 +3d3c3b3a +41403f3e +45444342 +49484746 +4d4c4b4a +51504f4e +55545302 +59585756 +5d5c5b5a +61605f5e +65646362 +69686766 +6d6c6b6a +71706f6e +75747302 +79787776 +7d7c7b7a +dead7f7e +deadbeef +00adbeef # Double3DstData +04030201 +08070605 +0c0b0a09 +100f0e0d +14130211 +18171615 +1c1b1a19 +201f1e1d +24232221 +28272625 +2c2b2a29 +302f2e2d +34330231 +38373635 +3c3b3a39 +403f3e3d +44434241 +48474645 +4c4b4a49 +504f4e4d +54530251 +58575655 +5c5b5a59 +605f5e5d +64636261 +68676665 +6c6b6a69 +706f6e6d +74730271 +78777675 +7c7b7a79 +de7f7e7d +deadbeef +deadbeef # Double4DstData +03020100 +07060504 +0b0a0908 +0f0e0d0c +13021110 +17161514 +1b1a1918 +1f1e1d1c +23222120 +27262524 +2b2a2928 +2f2e2d2c +33023130 +37363534 +3b3a3938 +3f3e3d3c +43424140 +47464544 +4b4a4948 +4f4e4d4c +53025150 +57565554 +5b5a5958 +5f5e5d5c +63626160 +67666564 +6b6a6968 +6f6e6d6c +73027170 +77767574 +7b7a7978 +7f7e7d7c +deadbeef 
+deadbeef # Double5DstData +020100ef +06050403 +0a090807 +0e0d0c0b +0211100f +16151413 +1a191817 +1e1d1c1b +2221201f +26252423 +2a292827 +2e2d2c2b +0231302f +36353433 +3a393837 +3e3d3c3b +4241403f +46454443 +4a494847 +4e4d4c4b +0251504f +56555453 +5a595857 +5e5d5c5b +6261605f +66656463 +6a696867 +6e6d6c6b +0271706f +76757473 +7a797877 +7e7d7c7b +deadbe7f +deadbeef # Double6DstData +0100beef +05040302 +09080706 +0d0c0b0a +11100f0e +15141302 +19181716 +1d1c1b1a +21201f1e +25242322 +29282726 +2d2c2b2a +31302f2e +35343302 +39383736 +3d3c3b3a +41403f3e +45444342 +49484746 +4d4c4b4a +51504f4e +55545302 +59585756 +5d5c5b5a +61605f5e +65646362 +69686766 +6d6c6b6a +71706f6e +75747302 +79787776 +7d7c7b7a +dead7f7e +deadbeef # Double7DstData +00adbeef +04030201 +08070605 +0c0b0a09 +100f0e0d +14130211 +18171615 +1c1b1a19 +201f1e1d +24232221 +28272625 +2c2b2a29 +302f2e2d +34330231 +38373635 +3c3b3a39 +403f3e3d +44434241 +48474645 +4c4b4a49 +504f4e4d +54530251 +58575655 +5c5b5a59 +605f5e5d +64636261 +68676665 +6c6b6a69 +706f6e6d +74730271 +78777675 +7c7b7a79 +de7f7e7d +ffffffff #signature +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +ffffffff +000000ff +00000000 diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-01.reference_output b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-01.reference_output index 557341ad4..54b60a227 100644 --- 
a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-01.reference_output +++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-01.reference_output @@ -1,3 +1,4 @@ + FFFFFFFF # stimecmp low bits 00000000 # stimecmp high bits 00000000 # menvcfg low bits @@ -24,7 +25,7 @@ FFFFFFFF # stimecmp low bits 00000000 00000004 # mcause from load address misaligned 00000000 -80000411 # mtval of misaligned address (0x80000409) +02000001 # mtval of misaligned address 00000000 00001880 # masked out mstatus.MPP = 11, mstatus.MPIE = 1, and mstatus.MIE = 0 00000000 @@ -36,7 +37,7 @@ FFFFFFFF # stimecmp low bits 00000000 00000006 # mcause from store misaligned 00000000 -80000429 # mtval of address with misaligned store instr (0x80000421) +02000001 # mtval of misaligned address 00000000 00001880 # masked out mstatus.MPP = 11, mstatus.MPIE = 1, and mstatus.MIE = 0 00000000 @@ -136,7 +137,7 @@ FFFFFFFF # stimecmp low bits 00000000 00000004 # mcause from load address misaligned 00000000 -80000411 # mtval of misaligned address (0x80000409) +02000001 # mtval of misaligned address 00000000 00001880 # masked out mstatus.MPP = 11, mstatus.MPIE = 1, and mstatus.MIE = 0 00000000 @@ -148,7 +149,7 @@ FFFFFFFF # stimecmp low bits 00000000 00000006 # mcause from store misaligned 00000000 -80000429 # mtval of address with misaligned store instr (0x80000421) +02000001 # mtval of misaligned address 00000000 00001880 # masked out mstatus.MPP = 11, mstatus.MPIE = 1, and mstatus.MIE = 0 00000000 diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-s-01.reference_output b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-s-01.reference_output index bca764a76..3e1af9a61 100644 --- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-s-01.reference_output +++ 
b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-s-01.reference_output @@ -26,7 +26,7 @@ 00000000 00000004 # scause from load address misaligned 00000000 -80000411 # stval of misaligned address (0x80000409) +02000001 # stval of misaligned address 00000000 00000800 # masked out mstatus.mpp = 1, mstatus.MPIE = 0, and mstatus.MIE = 0 00000000 @@ -38,7 +38,7 @@ 00000000 00000006 # scause from store misaligned 00000000 -80000429 # stval of address with misaligned store instr (0x80000421) +02000001 # stval of misaligned address 00000000 00000800 # masked out mstatus.mpp = 1, mstatus.MPIE = 0, and mstatus.MIE = 0 00000000 @@ -128,7 +128,7 @@ 00000000 00000004 # scause from load address misaligned 00000000 -80000411 # stval of misaligned address (0x80000409) +02000001 # stval of misaligned address 00000000 00000120 # masked out sstatus.SPP = 1, sstatus.SPIE = 1, and sstatus.SIE = 0 00000000 @@ -140,7 +140,7 @@ 00000000 00000006 # scause from store misaligned 00000000 -80000429 # stval of address with misaligned store instr (0x80000421) +02000001 # stval of misaligned address 00000000 00000120 # masked out sstatus.SPP = 1, sstatus.SPIE = 1, and sstatus.SIE = 0 00000000 diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-u-01.reference_output b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-u-01.reference_output index 1670f68d7..359c8364c 100644 --- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-u-01.reference_output +++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/references/WALLY-trap-u-01.reference_output @@ -26,7 +26,7 @@ 00000000 00000004 # scause from load address misaligned 00000000 -80000411 # stval of misaligned address (0x80000409) +02000001 # stval of misaligned address 00000000 00000000 # masked out mstatus.mpp = 0, mstatus.MPIE = 0, and mstatus.MIE = 0 00000000 @@ -38,7 +38,7 @@
00000000 00000006 # scause from store misaligned 00000000 -80000429 # stval of address with misaligned store instr (0x80000421) +02000001 # stval of misaligned address 00000000 00000000 # masked out mstatus.mpp = 0, mstatus.MPIE = 0, and mstatus.MIE = 0 00000000 @@ -122,7 +122,7 @@ 00000000 00000004 # scause from load address misaligned 00000000 -80000411 # stval of misaligned address (0x80000409) +02000001 # stval of misaligned address 00000000 00000020 # masked out sstatus.SPP = 0, sstatus.SPIE = 1, and sstatus.SIE = 0 00000000 @@ -134,7 +134,7 @@ 00000000 00000006 # scause from store misaligned 00000000 -80000429 # stval of address with misaligned store instr (0x80000421) +02000001 # stval of misaligned address 00000000 00000020 # masked out sstatus.SPP = 0, sstatus.SPIE = 1, and sstatus.SIE = 0 00000000 diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h index 9e1dcb264..67d76c6ab 100644 --- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h +++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h @@ -98,7 +98,8 @@ cause_breakpnt: ret cause_load_addr_misaligned: - auipc t3, 0 // get current PC, which is aligned + li t3, 0x02000000 // base address of clint, because with zicclsm misaligned cached access won't trap + //auipc t3, 0 // get current PC, which is aligned addi t3, t3, 1 lw t4, 0(t3) // load from a misaligned address ret @@ -108,7 +109,8 @@ cause_load_acc: ret cause_store_addr_misaligned: - auipc t3, 0 // get current PC, which is aligned + li t3, 0x02000000 // base address of clint, because with zicclsm misaligned cached access won't trap + //auipc t3, 0 // get current PC, which is aligned addi t3, t3, 1 sw t4, 0(t3) // store to a misaligned address ret diff --git
a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-misaligned-access-01.S b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-misaligned-access-01.S new file mode 100644 index 000000000..3ff89a237 --- /dev/null +++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-misaligned-access-01.S @@ -0,0 +1,752 @@ +/////////////////////////////////////////// +// ../wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-misaligned-access-01.S +// David_Harris@hmc.edu & Katherine Parry +// Created 2022-06-17 22:58:09.916813// +// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software +// is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT +// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+/////////////////////////////////////////// + +#include "model_test.h" +#include "arch_test.h" +RVTEST_ISA("RV64I") + +.section .text.init +.globl rvtest_entry_point +rvtest_entry_point: +RVMODEL_BOOT +RVTEST_CODE_BEGIN + +RVTEST_CASE(0,"//check ISA:=regex(.*64.*);check ISA:=regex(.*I.*);def TEST_CASE_1=True;def NO_SAIL=True;",ld) + + # This test checks that misaligned loads and stores work correctly, including across D$ line spills. + # The general approach is to + # 1. load a region of memory equal to two cache lines using load doubles, and copy it to a new + # region using stores of bytes, halves, words, or doubles. Each is repeated for all possible + # misaligned offsets. Bytes are always aligned, halves use offsets 0 and 1, words 0, 1, 2, and 3, and + # doubles 0 through 7. Then the new region is compared against the reference region. Because + # of the misalignment the last few bytes will not be written so they will be some portion of deadbeef. + # The comparison is done using the same byte, half, word, and double misaligned approach. + + la a3, signature # does not get overwritten by any functions + +TEST_BYTE: + # byte copy region. always naturally aligned + la a0, SourceData + la a1, ByteDstData + li a2, 16 + jal ra, memcpy8_1 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, ByteDstData + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_HALF0: + la a0, SourceData + la a1, Half0DstData + li a2, 16 + jal ra, memcpy8_2 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Half0DstData + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_HALF1: + la a0, SourceData + la a1, Half1DstData+1 + li a2, 16 + jal ra, memcpy8_2 + + # check if the values are correct for all sizes and offsets of misaligned loads.
+ la a0, SourceData + la a1, Half1DstData+1 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_WORD0: + la a0, SourceData + la a1, Word0DstData + li a2, 16 + jal ra, memcpy8_4 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Word0DstData + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_WORD1: + la a0, SourceData + la a1, Word1DstData+1 + li a2, 16 + jal ra, memcpy8_4 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Word1DstData+1 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_WORD2: + la a0, SourceData + la a1, Word2DstData+2 + li a2, 16 + jal ra, memcpy8_4 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Word2DstData+2 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_WORD3: + la a0, SourceData + la a1, Word3DstData+3 + li a2, 16 + jal ra, memcpy8_4 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Word3DstData+3 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_DOUBLE0: + la a0, SourceData + la a1, Double0DstData + li a2, 16 + jal ra, memcpy8_8 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Double0DstData + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_DOUBLE1: + la a0, SourceData + la a1, Double1DstData+1 + li a2, 16 + jal ra, memcpy8_8 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Double1DstData+1 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_DOUBLE2: + la a0, SourceData + la a1, Double2DstData+2 + li a2, 16 + jal ra, memcpy8_8 + + # check if the values are correct for all sizes and offsets of misaligned loads.
+ la a0, SourceData + la a1, Double2DstData+2 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_DOUBLE3: + la a0, SourceData + la a1, Double3DstData+3 + li a2, 16 + jal ra, memcpy8_8 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Double3DstData+3 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_DOUBLE4: + la a0, SourceData + la a1, Double4DstData+4 + li a2, 16 + jal ra, memcpy8_8 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Double4DstData+4 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_DOUBLE5: + la a0, SourceData + la a1, Double5DstData+5 + li a2, 16 + jal ra, memcpy8_8 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Double5DstData+5 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_DOUBLE6: + la a0, SourceData + la a1, Double6DstData+6 + li a2, 16 + jal ra, memcpy8_8 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Double6DstData+6 + li a2, 16 + jal ra, CheckAllWriteSignature + +TEST_DOUBLE7: + la a0, SourceData + la a1, Double7DstData+7 + li a2, 16 + jal ra, memcpy8_8 + + # check if the values are correct for all sizes and offsets of misaligned loads. + la a0, SourceData + la a1, Double7DstData+7 + li a2, 16 + jal ra, CheckAllWriteSignature + +RVMODEL_HALT + +.type CheckAll, @function +# a0 is the SourceData (golden), a1 is the data to be checked.
+# a2 is the number of doubles +# a3 is the signature pointer +# returns a0 as the OR of the byte/half/word memcmp results and the last double memcmp result (each is -1 for no mismatch, else the mismatching index), +# returns a3 as incremented signature pointer +CheckAllWriteSignature: + mv s0, a0 + mv s1, a1 + mv s2, a2 + mv s3, a3 + # there is no stack so I'm saving ra into s5 + mv s5, ra + + # check values byte by byte + mv a0, s0 # SourceData + mv a1, s1 # ie: ByteDstData + slli a2, s2, 3 # * 8 + jal ra, memcmp1 + sb a0, 0(s3) + mv s4, a0 + + # check values half by half + mv a0, s0 # SourceData + mv a1, s1 # ie: ByteDstData + slli a2, s2, 2 # * 4 + jal ra, memcmp2 + sb a0, 1(s3) + or s4, s4, a0 + + # check values half by half + addi a0, s0, 1 # SourceData+1 + addi a1, s1, 1 # ie: ByteDstData+1 + slli a2, s2, 2 # * 4 -1 + addi a2, a2, -1 + jal ra, memcmp2 + sb a0, 2(s3) + or s4, s4, a0 + + # check values word by word + addi a0, s0, 0 # SourceData + mv a1, s1 # ie: ByteDstData + slli a2, s2, 1 # * 2 + jal ra, memcmp4 + sb a0, 3(s3) + or s4, s4, a0 + + # check values word by word + addi a0, s0, 1 # SourceData+1 + addi a1, s1, 1 # ie: ByteDstData+1 + slli a2, s2, 1 # * 2 -1 + addi a2, a2, -1 + jal ra, memcmp4 + sb a0, 4(s3) + or s4, s4, a0 + + # check values word by word + addi a0, s0, 2 # SourceData+2 + addi a1, s1, 2 # ie: ByteDstData+2 + slli a2, s2, 1 # * 2 -1 + addi a2, a2, -1 + jal ra, memcmp4 + sb a0, 5(s3) + or s4, s4, a0 + + # check values word by word + addi a0, s0, 3 # SourceData+3 + addi a1, s1, 3 # ie: ByteDstData+3 + slli a2, s2, 1 # * 2 -1 + addi a2, a2, -1 + jal ra, memcmp4 + sb a0, 6(s3) + or s4, s4, a0 + + # check values double by double + mv a0, s0 # SourceData + mv a1, s1 # ie: ByteDstData + slli a2, s2, 0 # * 1 + jal ra, memcmp8 + sb a0, 7(s3) + + # check values double by double + addi a0, s0, 1 # SourceData+1 + addi a1, s1, 1 # ie: ByteDstData+1 + slli a2, s2, 0 # * 1 -1 + addi a2, a2, -1 + jal ra, memcmp8 + sb a0, 8(s3) + + # check values double by double + addi a0, s0, 2 # SourceData+2 + addi a1, s1, 2 # ie: ByteDstData+2 + slli a2, s2, 0 #
* 1 -1
+    addi a2, a2, -1
+    jal ra, memcmp8
+    sb a0, 9(s3)
+
+    # check values double by double
+    addi a0, s0, 3 # SourceData+3
+    addi a1, s1, 3 # ie: ByteDstData+3
+    slli a2, s2, 0 # * 1 -1
+    addi a2, a2, -1
+    jal ra, memcmp8
+    sb a0, 10(s3)
+
+    # check values double by double
+    addi a0, s0, 4 # SourceData+4
+    addi a1, s1, 4 # ie: ByteDstData+4
+    slli a2, s2, 0 # * 1 -1
+    addi a2, a2, -1
+    jal ra, memcmp8
+    sb a0, 11(s3)
+
+    # check values double by double
+    addi a0, s0, 5 # SourceData+5
+    addi a1, s1, 5 # ie: ByteDstData+5
+    slli a2, s2, 0 # * 1 -1
+    addi a2, a2, -1
+    jal ra, memcmp8
+    sb a0, 12(s3)
+
+    # check values double by double
+    addi a0, s0, 6 # SourceData+6
+    addi a1, s1, 6 # ie: ByteDstData+6
+    slli a2, s2, 0 # * 1 -1
+    addi a2, a2, -1
+    jal ra, memcmp8
+    sb a0, 13(s3)
+
+    # check values double by double
+    addi a0, s0, 7 # SourceData+7
+    addi a1, s1, 7 # ie: ByteDstData+7
+    slli a2, s2, 0 # * 1 -1
+    addi a2, a2, -1
+    jal ra, memcmp8
+    sb a0, 14(s3)
+
+    # Step s3 past the result bytes stored above (highest offset used is 14)
+    # and copy the new value into a3.
+    addi s3, s3, 15
+    mv a3, s3
+    or a0, s4, a0 # a0 = s4 | result of the last memcmp8 (s4 is set outside this view)
+    mv ra, s5 # the jal calls above overwrote ra; presumably s5 holds the saved return address - confirm
+    ret
+
+
+.type memcmp1, @function
+# memcmp1: compare two buffers one byte at a time (lbu).
+# returns which index mismatch, -1 if none
+# Result is returned in a0.  Clobbers t0-t4.
+# The element count is tested at the bottom of the loop, so element 0 is
+# always compared, even when a2 is 0.
+memcmp1:
+    # a0 is the source1
+    # a1 is the source2
+    # a2 is the number of 1 byte words
+    mv t0, a0 # t0 = cursor into source1
+    mv t1, a1 # t1 = cursor into source2
+    li t2, 0 # t2 = index of the element being compared
+memcmp1_loop:
+    lbu t3, 0(t0)
+    lbu t4, 0(t1)
+    bne t3, t4, memcmp1_ne # first difference found: report its index
+    addi t0, t0, 1
+    addi t1, t1, 1
+    addi t2, t2, 1
+    blt t2, a2, memcmp1_loop # signed compare of index against count
+    li a0, -1 # every element matched
+    ret
+memcmp1_ne:
+    mv a0, t2
+    ret
+
+.type memcmp2, @function
+# memcmp2: compare two buffers one halfword (2 bytes, lhu) at a time.
+# returns which index mismatch, -1 if none
+# Result is returned in a0 and counts halfwords, not bytes.  Clobbers t0-t4.
+# The element count is tested at the bottom of the loop, so element 0 is
+# always compared, even when a2 is 0.
+memcmp2:
+    # a0 is the source1
+    # a1 is the source2
+    # a2 is the number of 2 byte words
+    mv t0, a0 # t0 = cursor into source1
+    mv t1, a1 # t1 = cursor into source2
+    li t2, 0 # t2 = index of the element being compared
+memcmp2_loop:
+    lhu t3, 0(t0)
+    lhu t4, 0(t1)
+    bne t3, t4, memcmp2_ne # first difference found: report its index
+    addi t0, t0, 2
+    addi t1, t1, 2
+    addi t2, t2, 1
+    blt t2, a2, memcmp2_loop # signed compare of index against count
+    li a0, -1 # every element matched
+    ret
+memcmp2_ne:
+    mv a0, t2
+    ret
+
+.type memcmp4, @function
+# memcmp4: compare two buffers one word (4 bytes, lwu) at a time.
+# returns which index mismatch, -1 if none
+# Result is returned in a0 and counts words, not bytes.  Clobbers t0-t4.
+# The element count is tested at the bottom of the loop, so element 0 is
+# always compared, even when a2 is 0.
+memcmp4:
+    # a0 is the source1
+    # a1 is the source2
+    # a2 is the number of 4 byte words
+    mv t0, a0 # t0 = cursor into source1
+    mv t1, a1 # t1 = cursor into source2
+    li t2, 0 # t2 = index of the element being compared
+memcmp4_loop:
+    lwu t3, 0(t0)
+    lwu t4, 0(t1)
+    bne t3, t4, memcmp4_ne # first difference found: report its index
+    addi t0, t0, 4
+    addi t1, t1, 4
+    addi t2, t2, 1
+    blt t2, a2, memcmp4_loop # signed compare of index against count
+    li a0, -1 # every element matched
+    ret
+memcmp4_ne:
+    mv a0, t2
+    ret
+
+.type memcmp8, @function
+# memcmp8: compare two buffers one doubleword (8 bytes, ld) at a time.
+# returns which index mismatch, -1 if none
+# Result is returned in a0 and counts doublewords, not bytes.  Clobbers t0-t4.
+# The element count is tested at the bottom of the loop, so element 0 is
+# always compared, even when a2 is 0.
+# NOTE(review): the calls visible earlier in this file pass s0+3 .. s0+7 and
+# s1+3 .. s1+7 (commented as SourceData+N / ByteDstData+N), so these ld
+# accesses are presumably not 8-byte aligned on purpose - confirm.
+memcmp8:
+    # a0 is the source1
+    # a1 is the source2
+    # a2 is the number of 8 byte words
+    mv t0, a0 # t0 = cursor into source1
+    mv t1, a1 # t1 = cursor into source2
+    li t2, 0 # t2 = index of the element being compared
+memcmp8_loop:
+    ld t3, 0(t0)
+    ld t4, 0(t1)
+    bne t3, t4, memcmp8_ne # first difference found: report its index
+    addi t0, t0, 8
+    addi t1, t1, 8
+    addi t2, t2, 1
+    blt t2, a2, memcmp8_loop # signed compare of index against count
+    li a0, -1 # every element matched
+    ret
+memcmp8_ne:
+    mv a0, t2
+    ret
+
+
+RVTEST_CODE_END
+
+.type memcpy8_1, @function
+# load 8 bytes using load double then store using 8 sb
+# Byte k of the loaded value (bits 8k+7:8k) is written to dst offset k.
+# a0, a1 and a2 are left unchanged; clobbers t0-t4.
+# The count is tested at the bottom of the loop, so one doubleword is always
+# copied, even when a2 is 0.
+memcpy8_1:
+    # a0 is the source
+    # a1 is the dst
+    # a2 is the number of 8 byte words
+    mv t0, a0 # t0 = source cursor
+    mv t1, a1 # t1 = destination cursor
+    li t2, 0 # t2 = doublewords copied so far
+memcpy8_1_loop:
+    ld t3, 0(t0) # t3 = the 8 source bytes
+    andi t4, t3, 0xff # byte 0
+    sb t4, 0(t1)
+    srli t4, t3, 8 # byte 1
+    andi t4, t4, 0xff
+    sb t4, 1(t1)
+
+    srli t4, t3, 16 # byte 2
+    andi t4, t4, 0xff
+    sb t4, 2(t1)
+
+    srli t4, t3, 24 # byte 3
+    andi t4, t4, 0xff
+    sb t4, 3(t1)
+
+    srli t4, t3, 32 # byte 4
+    andi t4, t4, 0xff
+    sb t4, 4(t1)
+
+    srli t4, t3, 40 # byte 5
+    andi t4, t4, 0xff
+    sb t4, 5(t1)
+
+    srli t4, t3, 48 # byte 6
+    andi t4, t4, 0xff
+    sb t4, 6(t1)
+
+    srli t4, t3, 56 # byte 7
+    andi t4, t4, 0xff
+    sb t4, 7(t1)
+
+    addi t0, t0, 8
+    addi t1, t1, 8
+    addi t2, t2, 1
+    blt t2, a2, memcpy8_1_loop
+    ret
+
+.type memcpy8_2, @function
+# load 8 bytes using load double then store using 4 sh
+# Halfword k of the loaded value (bits 16k+15:16k) is written to dst offset 2k.
+# a0, a1 and a2 are left unchanged; clobbers t0-t5.
+# The count is tested at the bottom of the loop, so one doubleword is always
+# copied, even when a2 is 0.
+memcpy8_2:
+    # a0 is the source
+    # a1 is the dst
+    # a2 is the number of 8 byte words
+    mv t0, a0 # t0 = source cursor
+    mv t1, a1 # t1 = destination cursor
+    li t2, 0 # t2 = doublewords copied so far
+
+    # 16 bit mask
+    lui t4, 0xf # t4 = 0xf000
+    li t3, 0xfff
+    or t5, t4, t3 # t5 = 0xffff
+
+memcpy8_2_loop:
+    ld t3, 0(t0) # t3 = the 8 source bytes
+    and t4, t3, t5 # halfword 0
+    sh t4, 0(t1)
+
+    srli t4, t3, 16 # halfword 1
+    and t4, t4, t5
+    sh t4, 2(t1)
+
+    srli t4, t3, 32 # halfword 2
+    and t4, t4, t5
+    sh t4, 4(t1)
+
+    srli t4, t3, 48 # halfword 3
+    and t4, t4, t5
+    sh t4, 6(t1)
+
+
+    addi t0, t0, 8
+    addi t1, t1, 8
+    addi t2, t2, 1
+    blt t2, a2, memcpy8_2_loop
+    ret
+
+.type memcpy8_4, @function
+# load 8 bytes using load double then store using 2 sw
+# The low word of the loaded value goes to dst offset 0, the high word to
+# dst offset 4.
+# a0, a1 and a2 are left unchanged; clobbers t0-t5.
+# The count is tested at the bottom of the loop, so one doubleword is always
+# copied, even when a2 is 0.
+memcpy8_4:
+    # a0 is the source
+    # a1 is the dst
+    # a2 is the number of 8 byte words
+    mv t0, a0 # t0 = source cursor
+    mv t1, a1 # t1 = destination cursor
+    li t2, 0 # t2 = doublewords copied so far
+
+    # 32 bit mask
+    addi t4, x0, -1 # t4 = all ones
+    srli t5, t4, 32 # t5 = 0x00000000ffffffff with 64-bit registers (this code uses ld/sd)
+
+memcpy8_4_loop:
+    ld t3, 0(t0) # t3 = the 8 source bytes
+    and t4, t3, t5 # low word
+    sw t4, 0(t1)
+
+    srli t4, t3, 32 # high word
+    and t4, t4, t5
+    sw t4, 4(t1)
+
+    addi t0, t0, 8
+    addi t1, t1, 8
+    addi t2, t2, 1
+    blt t2, a2, memcpy8_4_loop
+    ret
+
+.type memcpy8_8, @function
+# load 8 bytes using load double then store using 1 sd
+# a0, a1 and a2 are left unchanged; clobbers t0-t3.
+# The count is tested at the bottom of the loop, so one doubleword is always
+# copied, even when a2 is 0.
+memcpy8_8:
+    # a0 is the source
+    # a1 is the dst
+    # a2 is the number of 8 byte words
+    mv t0, a0 # t0 = source cursor
+    mv t1, a1 # t1 = destination cursor
+    li t2, 0 # t2 = doublewords copied so far
+
+memcpy8_8_loop:
+    ld t3, 0(t0)
+    sd t3, 0(t1)
+
+    addi t0, t0, 8
+    addi t1, t1, 8
+    addi t2, t2, 1
+    blt t2, a2, memcpy8_8_loop
+    ret
+
+
+RVTEST_DATA_BEGIN
+.align 3
+rvtest_data:
+# SourceData: 16 doublewords whose bytes mostly equal their own offset
+# (0x00 .. 0x7f in little-endian layout), followed by one 0xdeadbeefdeadbeef
+# doubleword.
+# NOTE(review): the bytes at offsets 0x12, 0x32, 0x52 and 0x72 are 0x02 rather
+# than their offset.  The ResponseNByteOffsetData tables carry the same 0x02
+# values, so changing one table means regenerating the others.
+SourceData:
+.8byte 0x0706050403020100, 0x0f0e0d0c0b0a0908, 0x1716151413021110, 0x1f1e1d1c1b1a1918
+.8byte 0x2726252423222120, 0x2f2e2d2c2b2a2928, 0x3736353433023130, 0x3f3e3d3c3b3a3938
+.8byte 0x4746454443424140, 0x4f4e4d4c4b4a4948, 0x5756555453025150, 0x5f5e5d5c5b5a5958
+.8byte 0x6766656463626160, 0x6f6e6d6c6b6a6968, 0x7776757473027170, 0x7f7e7d7c7b7a7978
+.8byte 0xdeadbeefdeadbeef
+
+# Response1ByteOffsetData: the first 127 bytes match SourceData starting at
+# byte offset 1.
+# NOTE(review): the last byte is 0xde, whereas SourceData's byte at offset
+# 0x80 is 0xef in little-endian layout; how this table is consumed is not
+# visible here - confirm the tail is intended.
+Response1ByteOffsetData:
+.8byte 0x0807060504030201, 0x100f0e0d0c0b0a09, 0x1817161514130211, 0x201f1e1d1c1b1a19
+.8byte 0x2827262524232221, 0x302f2e2d2c2b2a29, 0x3837363534330231, 0x403f3e3d3c3b3a39
+.8byte 0x4847464544434241, 0x504f4e4d4c4b4a49, 0x5857565554530251, 0x605f5e5d5c5b5a59
+.8byte 0x6867666564636261, 0x706f6e6d6c6b6a69, 0x7877767574730271, 0xde7f7e7d7c7b7a79
+
+# Response2ByteOffsetData: the first 126 bytes match SourceData starting at
+# byte offset 2.
+# NOTE(review): the last two bytes are 0xad, 0xde, whereas SourceData's bytes
+# at offsets 0x80-0x81 are 0xef, 0xbe in little-endian layout - confirm the
+# tail is intended.
+Response2ByteOffsetData:
+.8byte 0x0908070605040302, 0x11100f0e0d0c0b0a, 0x1918171615141302, 0x21201f1e1d1c1b1a
+.8byte 0x2928272625242322, 0x31302f2e2d2c2b2a, 0x3938373635343302, 0x41403f3e3d3c3b3a
+.8byte 0x4948474645444342, 0x51504f4e4d4c4b4a, 0x5958575655545302, 0x61605f5e5d5c5b5a
+.8byte 0x6968676665646362, 0x71706f6e6d6c6b6a, 0x7978777675747302, 0xdead7f7e7d7c7b7a
+
+Response3ByteOffsetData: +.8byte 0x0a09080706050403, 0x0211100f0e0d0c0b, 0x1a19181716151413, 0x2221201f1e1d1c1b +.8byte 0x2a29282726252423, 0x0231302f2e2d2c2b, 0x3a39383736353433, 0x4241403f3e3d3c3b +.8byte 0x4a49484746454443, 0x0251504f4e4d4c4b, 0x5a59585756555453, 0x6261605f5e5d5c5b +.8byte 0x6a69686766656463, 0x0271706f6e6d6c6b, 0x7a79787776757473, 0xdeadbe7f7e7d7c7b + +Response4ByteOffsetData: +.8byte 0x0b0a090807060504, 0x130211100f0e0d0c, 0x1b1a191817161514, 0x232221201f1e1d1c +.8byte 0x2b2a292827262524, 0x330231302f2e2d2c, 0x3b3a393837363534, 0x434241403f3e3d3c +.8byte 0x4b4a494847464544, 0x530251504f4e4d4c, 0x5b5a595857565554, 0x636261605f5e5d5c +.8byte 0x6b6a696867666564, 0x730271706f6e6d6c, 0x7b7a797877767574, 0xdeadbeef7f7e7d7c + +Response5ByteOffsetData: +.8byte 0x0c0b0a0908070605, 0x14130211100f0e0d, 0x1c1b1a1918171615, 0x24232221201f1e1d +.8byte 0x2c2b2a2928272625, 0x34330231302f2e2d, 0x3c3b3a3938373635, 0x44434241403f3e3d +.8byte 0x4c4b4a4948474645, 0x54530251504f4e4d, 0x5c5b5a5958575655, 0x64636261605f5e5d +.8byte 0x6c6b6a6968676665, 0x74730271706f6e6d, 0x7c7b7a7978777675, 0xdeadbeefde7f7e7d + +Response6ByteOffsetData: +.8byte 0x0d0c0b0a09080706, 0x1514130211100f0e, 0x1d1c1b1a19181716, 0x2524232221201f1e +.8byte 0x2d2c2b2a29282726, 0x3534330231302f2e, 0x3d3c3b3a39383736, 0x4544434241403f3e +.8byte 0x4d4c4b4a49484746, 0x5554530251504f4e, 0x5d5c5b5a59585756, 0x6564636261605f5e +.8byte 0x6d6c6b6a69686766, 0x7574730271706f6e, 0x7d7c7b7a79787776, 0xdeadbeefdead7f7e + +Response7ByteOffsetData: +.8byte 0x0e0d0c0b0a090807, 0x161514130211100f, 0x1e1d1c1b1a191817, 0x262524232221201f +.8byte 0x2e2d2c2b2a292827, 0x363534330231302f, 0x3e3d3c3b3a393837, 0x464544434241403f +.8byte 0x4e4d4c4b4a494847, 0x565554530251504f, 0x5e5d5c5b5a595857, 0x666564636261605f +.8byte 0x6e6d6c6b6a696867, 0x767574730271706f, 0x7e7d7c7b7a797877, 0xdeadbeefdeadbe7f + +RVTEST_DATA_END + +RVMODEL_DATA_BEGIN + +ByteDstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 
0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef + +Half0DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef + +Half1DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Word0DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef + +Word1DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Word2DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 
0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Word3DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Double0DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef + +Double1DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Double2DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Double3DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 
0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Double4DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Double5DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Double6DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +Double7DstData: +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef, 0xdeadbeefdeadbeef +.8byte 0xdeadbeefdeadbeef +signature: + .fill 225, 1, 0x00 + +RVMODEL_DATA_END +// ../wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SLT.S +// David_Harris@hmc.edu & Katherine Parry