From c77afcb7e6c9d7ce9dad46d1f3c9d967d0254060 Mon Sep 17 00:00:00 2001 From: David Harris Date: Mon, 19 Feb 2024 22:28:55 -0800 Subject: [PATCH 1/4] Removed floprc with synchronous reset and synchronous clear --- src/generic/flop/floprc.sv | 38 -------------------------------------- src/privileged/privdec.sv | 4 ++-- 2 files changed, 2 insertions(+), 40 deletions(-) delete mode 100644 src/generic/flop/floprc.sv diff --git a/src/generic/flop/floprc.sv b/src/generic/flop/floprc.sv deleted file mode 100644 index 59f2e2862..000000000 --- a/src/generic/flop/floprc.sv +++ /dev/null @@ -1,38 +0,0 @@ -/////////////////////////////////////////// -// floprc.sv -// -// Written: David_Harris@hmc.edu 9 January 2021 -// Modified: -// -// Purpose: D flip-flop with synchronous reset and clear -// -// A component of the CORE-V-WALLY configurable RISC-V project. -// https://github.com/openhwgroup/cvw -// -// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University -// -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -// -// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file -// except in compliance with the License, or, at your option, the Apache License version 2.0. You -// may obtain a copy of the License at -// -// https://solderpad.org/licenses/SHL-2.1/ -// -// Unless required by applicable law or agreed to in writing, any work distributed under the -// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, -// either express or implied. See the License for the specific language governing permissions -// and limitations under the License. 
-//////////////////////////////////////////////////////////////////////////////////////////////// - -module floprc #(parameter WIDTH = 8) ( - input logic clk, - input logic reset, - input logic clear, - input logic [WIDTH-1:0] d, - output logic [WIDTH-1:0] q); - - always_ff @(posedge clk) - if (reset | clear ) q <= #1 0; - else q <= #1 d; -endmodule diff --git a/src/privileged/privdec.sv b/src/privileged/privdec.sv index bc9f9235f..23c0c2f15 100644 --- a/src/privileged/privdec.sv +++ b/src/privileged/privdec.sv @@ -80,8 +80,8 @@ module privdec import cvw::*; #(parameter cvw_t P) ( if (P.U_SUPPORTED) begin:wfi logic [P.WFI_TIMEOUT_BIT:0] WFICount, WFICountPlus1; - assign WFICountPlus1 = WFICount + 1; - floprc #(P.WFI_TIMEOUT_BIT+1) wficountreg(clk, reset, ~wfiM, WFICountPlus1, WFICount); // count while in WFI + assign WFICountPlus1 = wfiM ? WFICount + 1 : '0; // increment while wfiM is high; force to 0 otherwise, as the removed floprc did with clear = ~wfiM + flopr #(P.WFI_TIMEOUT_BIT+1) wficountreg(clk, reset, WFICountPlus1, WFICount); // count while in WFI // coverage off -item e 1 -fecexprrow 1 // WFI Timout trap will not occur when STATUS_TW is low while in supervisor mode, so the system gets stuck waiting for an interrupt and triggers a watchdog timeout. assign WFITimeoutM = ((STATUS_TW & PrivilegeModeW != P.M_MODE) | (P.S_SUPPORTED & PrivilegeModeW == P.U_MODE)) & WFICount[P.WFI_TIMEOUT_BIT]; From 90e89ced1db83c769e9bbdd7db3462e36d6e7725 Mon Sep 17 00:00:00 2001 From: David Harris Date: Mon, 26 Feb 2024 04:20:08 -0800 Subject: [PATCH 2/4] Fixes for synthesis. 
HPTW change will break x detection --- bin/wally-tool-chain-install.sh | 12 ++++++------ src/cache/cacheLRU.sv | 10 +++++----- src/mmu/hptw.sv | 4 +++- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/bin/wally-tool-chain-install.sh b/bin/wally-tool-chain-install.sh index 6e7e4c8e9..74157bffa 100755 --- a/bin/wally-tool-chain-install.sh +++ b/bin/wally-tool-chain-install.sh @@ -69,9 +69,6 @@ fi cd $RISCV git clone https://github.com/riscv/riscv-gnu-toolchain cd riscv-gnu-toolchain -# Temporarily use the following commands until gcc-13 is part of riscv-gnu-toolchain (issue #1249) -#git clone https://github.com/gcc-mirror/gcc -b releases/gcc-13 gcc-13 -#./configure --prefix=/opt/riscv --with-multilib-generator="rv32e-ilp32e--;rv32i-ilp32--;rv32im-ilp32--;rv32iac-ilp32--;rv32imac-ilp32--;rv32imafc-ilp32f--;rv32imafdc-ilp32d--;rv64i-lp64--;rv64ic-lp64--;rv64iac-lp64--;rv64imac-lp64--;rv64imafdc-lp64d--;rv64im-lp64--;" --with-gcc-src=`pwd`/gcc-13 ./configure --prefix=${RISCV} --with-multilib-generator="rv32e-ilp32e--;rv32i-ilp32--;rv32im-ilp32--;rv32iac-ilp32--;rv32imac-ilp32--;rv32imafc-ilp32f--;rv32imafdc-ilp32d--;rv64i-lp64--;rv64ic-lp64--;rv64iac-lp64--;rv64imac-lp64--;rv64imafdc-lp64d--;rv64im-lp64--;" make -j ${NUM_THREADS} @@ -111,14 +108,15 @@ cd riscv-isa-sim/build make -j ${NUM_THREADS} make install cd ../arch_test_target/spike/device -sed -i 's/--isa=rv32ic/--isa=rv32iac/' rv32i_m/privilege/Makefile.include -sed -i 's/--isa=rv64ic/--isa=rv64iac/' rv64i_m/privilege/Makefile.include +# dh 2/5/24: these should be obsolete +#sed -i 's/--isa=rv32ic/--isa=rv32iac/' rv32i_m/privilege/Makefile.include +#sed -i 's/--isa=rv64ic/--isa=rv64iac/' rv64i_m/privilege/Makefile.include # Wally needs Verilator 5.021 or later. 
# Verilator needs to be built from scratch to get the latest version # apt-get install verilator installs version 4.028 as of 6/8/23 sudo apt-get install -y perl g++ ccache help2man libgoogle-perftools-dev numactl perl-doc zlib1g -sudo apt-get install -y libfl2 libfl-dev # Ubuntu only (ignore if gives error) +sudo apt-get install -y perl g++ ccache help2man libgoogle-perftools-dev numactl perl-doc zlib1g cd $RISCV git clone https://github.com/verilator/verilator # Only first time # unsetenv VERILATOR_ROOT # For csh; ignore error if on bash @@ -173,6 +171,8 @@ sudo make install cd $RISCV opam init -y --disable-sandboxing +opam update +opam upgrade -y # -y answers opam's confirmation prompt so the unattended install does not stall opam switch create 5.1.0 opam install sail -y diff --git a/src/cache/cacheLRU.sv b/src/cache/cacheLRU.sv index e795dd765..1b803f5d3 100644 --- a/src/cache/cacheLRU.sv +++ b/src/cache/cacheLRU.sv @@ -143,16 +143,16 @@ module cacheLRU // This is a two port memory. // Every cycle must read from CacheSetData and each load/store must write the new LRU. 
always_ff @(posedge clk) begin - if (reset) for (int set = 0; set < NUMLINES; set++) LRUMemory[set] = '0; // exclusion-tag: initialize + if (reset | (InvalidateCache & ~FlushStage)) for (int set = 0; set < NUMLINES; set++) LRUMemory[set] = '0; if(CacheEn) begin if(ClearValid & ~FlushStage) - LRUMemory[PAdr] <= '0; + LRUMemory[PAdr] = '0; else if(LRUWriteEn) - LRUMemory[PAdr] <= NextLRU; + LRUMemory[PAdr] = NextLRU; if(LRUWriteEn & (PAdr == CacheSetTag)) - CurrLRU <= #1 NextLRU; + CurrLRU = NextLRU; else - CurrLRU <= #1 LRUMemory[CacheSetTag]; + CurrLRU = LRUMemory[CacheSetTag]; end end diff --git a/src/mmu/hptw.sv b/src/mmu/hptw.sv index 0823dc7e0..4e292ba3d 100644 --- a/src/mmu/hptw.sv +++ b/src/mmu/hptw.sv @@ -148,6 +148,7 @@ module hptw import cvw::*; #(parameter cvw_t P) ( flopenr #(1) TLBMissMReg(clk, reset, StartWalk, DTLBMissOrUpdateDAM, DTLBWalk); // when walk begins, record whether it was for DTLB (or record 0 for ITLB) assign PRegEn = HPTWRW[1] & ~DCacheBusStallM | UpdatePTE; flopenr #(P.XLEN) PTEReg(clk, reset, PRegEn, NextPTE, PTE); // Capture page table entry from data cache + assert property(@(posedge clk) ~PRegEn | reset | NextPTE[0] !== 1'bx); // report writing an x PTE from an uninitialized page table // Assign PTE descriptors common across all XLEN values // For non-leaf PTEs, D, A, U bits are reserved and ignored. They do not cause faults while walking the page table @@ -173,7 +174,8 @@ module hptw import cvw::*; #(parameter cvw_t P) ( logic [P.XLEN-1:0] AccessedPTE; assign AccessedPTE = {PTE[P.XLEN-1:8], (SetDirty | PTE[7]), 1'b1, PTE[5:0]}; // set accessed bit, conditionally set dirty bit - assign ReadDataNoXM = (ReadDataM[0] === 'x) ? '0 : ReadDataM; // If the PTE.V bit is x because it was read from uninitialized memory set to 0 to avoid x propagation and hanging the simulation. + //assign ReadDataNoXM = (ReadDataM[0] === 'x) ? 
'0 : ReadDataM; // If the PTE.V bit is x because it was read from uninitialized memory set to 0 to avoid x propagation and hanging the simulation. + assign ReadDataNoXM = ReadDataM; // *** temporary fix for synthesis; === and x in line above are not synthesizable. mux2 #(P.XLEN) NextPTEMux(ReadDataNoXM, AccessedPTE, UpdatePTE, NextPTE); // NextPTE = ReadDataNoXM when ADUE = 0 because UpdatePTE = 0 flopenr #(P.PA_BITS) HPTWAdrWriteReg(clk, reset, SaveHPTWAdr, HPTWReadAdr, HPTWWriteAdr); From 1a0097f6e76bf4f862da355586422cb2e825a5bf Mon Sep 17 00:00:00 2001 From: David Harris Date: Mon, 4 Mar 2024 16:40:49 -0800 Subject: [PATCH 3/4] Further fdivsqrt simplification after starting Sqrt at iteration 0 --- src/fpu/fdivsqrt/fdivsqrtiter.sv | 17 ++++++----------- src/fpu/fdivsqrt/fdivsqrtstage4.sv | 6 +++--- src/fpu/fdivsqrt/fdivsqrtuslc4.sv | 13 +++++++------ src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv | 9 ++++----- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv index 29b6d4fe6..4bfcebcd1 100644 --- a/src/fpu/fdivsqrt/fdivsqrtiter.sv +++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv @@ -44,7 +44,7 @@ module fdivsqrtiter import cvw::*; #(parameter cvw_t P) ( logic [P.DIVb+3:0] WCNext[P.DIVCOPIES-1:0]; // Q4.DIVb logic [P.DIVb+3:0] WS[P.DIVCOPIES:0]; // Q4.DIVb logic [P.DIVb+3:0] WC[P.DIVCOPIES:0]; // Q4.DIVb - logic [P.DIVb:0] U[P.DIVCOPIES:0]; // U1.DIVb + logic [P.DIVb:0] U[P.DIVCOPIES:0]; // U1.DIVb // *** probably Q not U. See Table 16.26 notes logic [P.DIVb:0] UM[P.DIVCOPIES:0]; // U1.DIVb logic [P.DIVb:0] UNext[P.DIVCOPIES-1:0]; // U1.DIVb logic [P.DIVb:0] UMNext[P.DIVCOPIES-1:0]; // U1.DIVb @@ -71,7 +71,7 @@ module fdivsqrtiter import cvw::*; #(parameter cvw_t P) ( flopen #(P.DIVb+4) wcreg(clk, FDivBusyE, WCN, WC[0]); // UOTFC Result U and UM registers/initialization mux - // Initialize U to 1.0 and UM to 0 for square root; U to 0 and UM to -1 otherwise + // Initialize U to 0 = 0.0000... 
and UM to -1 = 1.00000... (in Q1.Divb) assign initU ={(P.DIVb+1){1'b0}}; assign initUM = {{1'b1}, {(P.DIVb){1'b0}}}; mux2 #(P.DIVb+1) Umux(UNext[P.DIVCOPIES-1], initU, IFDivStartE, UMux); @@ -79,15 +79,10 @@ module fdivsqrtiter import cvw::*; #(parameter cvw_t P) ( flopen #(P.DIVb+1) UReg(clk, FDivBusyE, UMux, U[0]); flopen #(P.DIVb+1) UMReg(clk, FDivBusyE, UMMux, UM[0]); - // C register/initialization mux - logic [1:0] initCUpper; - if(P.RADIX == 4) begin - assign initCUpper = 2'b00; - end else begin - assign initCUpper = 2'b10; - end - - assign initC = {initCUpper, {P.DIVb{1'b0}}}; + // C register/initialization mux: C = -R: + // C = -4 = 00.000000... (in Q2.DIVb) for radix 4, C = -2 = 10.000000... for radix2 + if(P.RADIX == 4) assign initC = '0; + else assign initC = {2'b10, {{P.DIVb{1'b0}}}}; mux2 #(P.DIVb+2) cmux(C[P.DIVCOPIES], initC, IFDivStartE, NextC); flopen #(P.DIVb+2) creg(clk, FDivBusyE, NextC, C[0]); diff --git a/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/src/fpu/fdivsqrt/fdivsqrtstage4.sv index 4323ee35c..856273a5e 100644 --- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv +++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv @@ -48,16 +48,16 @@ module fdivsqrtstage4 import cvw::*; #(parameter cvw_t P) ( logic [7:0] WCmsbs, WSmsbs; // U4.4 logic CarryIn; logic [P.DIVb+3:0] WSA, WCA; // Q4.DIVb - logic j0,j1; + logic j0, j1; // step j = 0 or step j = 1 // Digit Selection logic assign j0 = ~C[P.DIVb+1]; // first step of R digit selection: C = 00...0 - assign j1 = C[P.DIVb] ^ C[P.DIVb-1]; // second step of R digit selection: C = 1100...0 + assign j1 = C[P.DIVb] & ~C[P.DIVb-1]; // second step of R digit selection: C = 1100...0; *** could simplify to ~C[P.DIVb-1] because j=0 case takes priority assign Smsbs = U[P.DIVb:P.DIVb-4]; // U1.4 most significant bits of square root assign Dmsbs = D[P.DIVb-1:P.DIVb-3]; // U0.3 most significant fractional bits of divisor after leading 1 assign WCmsbs = WC[P.DIVb+3:P.DIVb-4]; // Q4.4 most significant bits of residual assign WSmsbs = 
WS[P.DIVb+3:P.DIVb-4]; // Q4.4 most significant bits of residual - fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j1, .j0, .udigit); + fdivsqrtuslc4cmp uslc4(.Dmsbs, .Smsbs, .WSmsbs, .WCmsbs, .SqrtE, .j0, .j1, .udigit); assign un = 1'b0; // unused for radix 4 // F generation logic diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv index 63ea5aae2..610b79395 100644 --- a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv +++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv @@ -31,7 +31,7 @@ module fdivsqrtuslc4 ( input logic [2:0] Dmsbs, // U0.3 fractional bits after implicit leading 1 input logic [4:0] Smsbs, // U1.4 leading bits of square root approximation input logic [7:0] WSmsbs, WCmsbs, // Q4.4 redundant residual most significant bits - input logic Sqrt, j1, + input logic Sqrt, j0, j1, output logic [3:0] udigit // {2, 1, -1, -2} digit is 0 if none are hot ); logic [7:0] PreWmsbs; // Q4.4 nonredundant residual msbs @@ -102,11 +102,12 @@ module fdivsqrtuslc4 ( // Select A always_comb if (Sqrt) begin - if (j1) A = 3'b101; // on first sqrt iteration A = .101 - else if (Smsbs == 5'b10000) A = 3'b111; // if S = 1.0, use A = .111 - else A = Smsbs[2:0]; // otherwise use A = 2S (in U0.3 format) - end else A = Dmsbs; // division Unless A = D (IN U0.3 format, dropping leading 1) + if (j1) A = 3'b101; // on first sqrt iteration A = .101 + else if (Smsbs[4] == 1) A = 3'b111; // if S = 1.0000, use A = .111 + else A = Smsbs[2:0]; // otherwise use A = 2S (in U0.3 format) + end else A = Dmsbs; // division A = D (IN U0.3 format, dropping leading 1) // Select quotient digit from lookup table based on A and W - assign udigit = USel4[{A,Wmsbs}]; + // On step j = 0 for square root, always select u_0 = 1 + assign udigit = (Sqrt & j0) ? 
4'b0100 : USel4[{A,Wmsbs}]; endmodule diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv index 7812248a9..fef26668c 100644 --- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv +++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv @@ -32,7 +32,7 @@ module fdivsqrtuslc4cmp ( input logic [4:0] Smsbs, // U1.4 leading bits of square root approximation input logic [7:0] WSmsbs, WCmsbs, // Q4.4 residual most significant bits input logic SqrtE, - input logic j0,j1, // are we on first (j0) or second step (j1) of digit selection + input logic j0, j1, // are we on first (j0) or second step (j1) of digit selection output logic [3:0] udigit // {2, 1, -1, -2} digit is 0 if none are hot ); logic [6:0] Wmsbs; @@ -71,23 +71,22 @@ module fdivsqrtuslc4cmp ( // handles special case when j = 0 or j = 1 for sqrt assign mkj2 = 20; // when j = 1 use mk2[101] when j = 0 use anything bigger than 7. - assign mkj1 = j1 ? 8 : 0; // when j = 1 use mk1[101] = 8 and when j = 0 use 0 so we choose u_0 = 1 + assign mkj1 = j0 ? 0 : 8; // when j = 1 use mk1[101] = 8 and when j = 0 use 0 so we choose u_0 = 1 assign sqrtspecial = SqrtE & (j1 | j0); // Choose A for current operation always_comb if (SqrtE) begin - if (Smsbs[4]) A = 3'b111; // *** can we get rid of SMSBs case? + if (Smsbs[4]) A = 3'b111; // for S = 1.0000 *** can we optimize away this case? else A = Smsbs[2:0]; end else A = Dmsbs; - // Choose selection constants based on a assign mk2 = sqrtspecial ? mkj2 : mks2[A]; assign mk1 = sqrtspecial ? mkj1 : mks1[A]; assign mk0 = -mk1; - assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table *** can we hide? + assign mkm1 = (A == 3'b000) ? 
-13 : -mk2; // asymmetry in table *** can we hide from critical path // Compare residual W to selection constants to choose digit always_comb From e8e0538f6c444c3aefba2f53235c8bcd7ec75e81 Mon Sep 17 00:00:00 2001 From: Rose Thompson Date: Tue, 5 Mar 2024 10:33:47 -0600 Subject: [PATCH 4/4] Changed to non-blocking in cacheLRU and removed clearing LRU bits on flush. --- src/cache/cacheLRU.sv | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/cache/cacheLRU.sv b/src/cache/cacheLRU.sv index 1b803f5d3..26bdca887 100644 --- a/src/cache/cacheLRU.sv +++ b/src/cache/cacheLRU.sv @@ -143,16 +143,14 @@ module cacheLRU // This is a two port memory. // Every cycle must read from CacheSetData and each load/store must write the new LRU. always_ff @(posedge clk) begin - if (reset | (InvalidateCache & ~FlushStage)) for (int set = 0; set < NUMLINES; set++) LRUMemory[set] = '0; + if (reset | (InvalidateCache & ~FlushStage)) for (int set = 0; set < NUMLINES; set++) LRUMemory[set] <= '0; if(CacheEn) begin - if(ClearValid & ~FlushStage) - LRUMemory[PAdr] = '0; - else if(LRUWriteEn) - LRUMemory[PAdr] = NextLRU; + if(LRUWriteEn) + LRUMemory[PAdr] <= NextLRU; if(LRUWriteEn & (PAdr == CacheSetTag)) - CurrLRU = NextLRU; + CurrLRU <= NextLRU; else - CurrLRU = LRUMemory[CacheSetTag]; + CurrLRU <= LRUMemory[CacheSetTag]; end end