Merged with merge conflict

2025-02-11 06:05:49 +00:00 · 2024-04-17 10:47:28 -07:00 · 2024-04-17 10:47:28 -07:00 · 2b0cf90a99
commit 2b0cf90a99
parent d39f1ebefc 9a29668d1e
12 changed files with 84 additions and 74 deletions
--- a/benchmarks/coremark/Makefile
+++ b/benchmarks/coremark/Makefile
@ -5,14 +5,13 @@
 PORT_DIR = $(CURDIR)/riscv64-baremetal
 cmbase= $(WALLY)/addins/coremark
 work_dir= $(WALLY)/benchmarks/coremark/work
-XLEN ?=64
+XLEN ?=32
 sources=$(cmbase)/core_main.c $(cmbase)/core_list_join.c $(cmbase)/coremark.h  \
 	$(cmbase)/core_matrix.c $(cmbase)/core_state.c $(cmbase)/core_util.c \
 	$(PORT_DIR)/core_portme.h $(PORT_DIR)/core_portme.c $(PORT_DIR)/core_portme.mak \
 	$(PORT_DIR)/crt.S $(PORT_DIR)/encoding.h $(PORT_DIR)/util.h $(PORT_DIR)/syscalls.c
 ABI := $(if $(findstring "64","$(XLEN)"),lp64,ilp32)
-#ARCH := rv$(XLEN)gc_zba_zbb_zbc
-ARCH := rv$(XLEN)im_zicsr_zba_zbb_zbc
+ARCH := rv$(XLEN)im_zicsr_zba_zbb_zbs
 #ARCH := rv$(XLEN)gc
 #ARCH := rv$(XLEN)imc_zicsr
 #ARCH := rv$(XLEN)im_zicsr
@ -29,7 +28,6 @@ all: $(work_dir)/coremark.bare.riscv.elf.memfile

 run:
 	time wsim rv$(XLEN)gc coremark 2>&1 | tee $(work_dir)/coremark.sim.log
-	#(cd ../../sim && (time vsim -c -do "do wally-batch.do rv$(XLEN)gc coremark" 2>&1 | tee $(work_dir)/coremark.sim.log))

 $(work_dir)/coremark.bare.riscv.elf.memfile: $(work_dir)/coremark.bare.riscv
 	riscv64-unknown-elf-objdump -D $< > $<.elf.objdump
--- a/benchmarks/coremark/coremark_sweep.py
+++ b/benchmarks/coremark/coremark_sweep.py
@ -34,18 +34,18 @@ import re
 import csv
 # list of architectures to run. 
 arch_list = [
-    "rv32gc_zba_zbb_zbc",
+    "rv32i_zicsr",
+    "rv32im_zicsr",
+    "rv32imc_zicsr",
    "rv32im_zicsr_zba_zbb_zbc",
    "rv32gc",
-    "rv32imc_zicsr",
-    "rv32im_zicsr",
-    "rv32i_zicsr",
-    "rv64gc_zba_zbb_zbc",
+    "rv32gc_zba_zbb_zbc",
+    "rv64i_zicsr",
+    "rv64im_zicsr",
+    "rv64imc_zicsr",
    "rv64im_zicsr_zba_zbb_zbc",
    "rv64gc",
-    "rv64imc_zicsr",
-    "rv64im_zicsr",
-    "rv64i_zicsr"
+    "rv64gc_zba_zbb_zbc"
 ]
 str="32" 
 # Define regular expressions to match the desired fields
--- a/benchmarks/coremark/riscv64-baremetal/core_portme.h
+++ b/benchmarks/coremark/riscv64-baremetal/core_portme.h
@ -109,11 +109,11 @@ typedef unsigned short ee_u16;
 typedef signed int ee_s32;
 typedef double ee_f32;
 typedef unsigned char ee_u8;
-//typedef unsigned int ee_u32;
-typedef signed int ee_u32; // replaced with signed to improve performance per https://github.com/sifive/benchmark-coremark/blob/master/linux64/core_portme.h#L102
 #if (XLEN==64) 
+	typedef signed int ee_u32; // replaced with signed to improve performance by avoiding zero extension in RV64 per https://github.com/sifive/benchmark-coremark/blob/master/linux64/core_portme.h#L102
 	typedef unsigned long long ee_ptr_int;
 #else
+	typedef unsigned int ee_u32;
 	typedef ee_u32 ee_ptr_int;
 #endif
 typedef size_t ee_size_t;
--- a/benchmarks/coremark/riscv64-baremetal/core_portme.mak
+++ b/benchmarks/coremark/riscv64-baremetal/core_portme.mak
@ -107,7 +107,7 @@ port_prebuild: $(PGO_STAGE)

 .PHONY: build_pgo_gcc
 build_pgo_gcc:
-	$(MAKE) PGO=gen XCFLAGS="$(XCFLAGS) -fprofile-generate -DTOTAL_DATA_SIZE=1200" ITERATIONS=10 gen_pgo_data REBUILD=1
+	$(MAKE) PGO=gen XCFLAGS="$(XCFLAGS) -fprofile-generate -DTOTAL_DATA_SIZE=1200" gen_pgo_data REBUILD=1

 # Target: port_postbuild
 # Generate any files that are needed after actual build end.
--- a/benchmarks/embench/Makefile
+++ b/benchmarks/embench/Makefile
@ -38,26 +38,27 @@ build_speedopt_size:
 build_sizeopt_size:
 	$(embench_dir)/build_all.py --builddir=bd_sizeopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S -march=$(ARCH)" --cflags="-Os -msave-restore -march=$(ARCH)" --dummy-libs="libgcc libm libc crt0"

-# builds dependencies, then launches modelsim and finally runs python wrapper script to present results
-sim: modelsim_build_memfile modelsim_run speed
+# builds dependencies, then launches sim and finally runs python wrapper script to present results
+sim: sim_build_memfile sim_run speed

-# launches modelsim to simulate tests on wally
-modelsim_run:
-	mkdir -p ../../sim/wkdir
-	(cd ../../sim/ && wsim rv32gc embench)
-	cd ../../benchmarks/embench/
+# launches sim to simulate tests on wally
+sim_run:
+	wsim rv32gc embench
+	#mkdir -p ../../sim/wkdir
+	#(cd ../../sim/ && wsim rv32gc embench)
+	#cd ../../benchmarks/embench/

 # builds the objdump based on the compiled c elf files
 objdump:
 	find $(embench_dir)/bd_*_speed/ -type f -name "*.elf" | while read f; do riscv64-unknown-elf-objdump -S -D "$$f" > "$$f.objdump"; done

 # build memfiles, objdump.lab and objdump.addr files
-modelsim_build_memfile: objdump
+sim_build_memfile: objdump
 	find $(embench_dir)/bd_*_speed/ -type f -name "*.elf" | while read f; do riscv64-unknown-elf-elf2hex --bit-width 32 --input "$$f" --output "$$f.memfile"; done
 	find $(embench_dir)/bd_*_speed/ -type f -name "*.elf.objdump" | while read f; do extractFunctionRadix.sh $$f; done

 # builds the tests for speed, runs them on spike and then launches python script to present results
-# note that the speed python script benchmark_speed.py can get confused if there's both a .output file created from spike and modelsim
+# note that the speed python script benchmark_speed.py can get confused if there's both a .output file created from spike and questa
 # you'll need to manually remove one of the two .output files, or run make clean
 spike: buildspeed spike_run speed

--- a/bin/lint-wally
+++ b/bin/lint-wally
@ -11,7 +11,7 @@ GREEN='\033[0;32m'
 NC='\033[0m' # No Color
 fails=0

-if [ "$1" == "-nightly" ]; then
+if [ "$1" == "--nightly" ]; then
    configs=(rv32e rv64gc rv32gc rv32imc rv32i rv64i) # fdqh_rv64gc
    derivconfigs=`ls $WALLY/config/deriv`
    for entry in $derivconfigs
--- a/bin/regression-wally
+++ b/bin/regression-wally
@ -217,7 +217,7 @@ def addTests(tests, sim):
            gs = test[3]
        else:
            gs = "All tests ran without failures"
-        cmdPrefix="wsim --sim " + sim + " " + config 
+        cmdPrefix="wsim --sim " + sim + " " + coverStr + " " + config
        for t in suites:
            sim_log = sim_logdir + config + "_" + t + ".log"
            if (len(test) >= 5):
@ -401,7 +401,7 @@ def main():
    #   Presently don't run buildroot because it has a different config and can't be merged with the rv64gc coverage.
    #   Also it is slow to run.   
    #    configs.append(getBuildrootTC(boot=False))
-        os.system('rm -f cov/*.ucdb')
+        os.system('rm -f questa/cov/*.ucdb')
    elif '--nightly' in sys.argv:
        TIMEOUT_DUR = 60*1440 # 1 day
        #configs.append(getBuildrootTC(boot=False))
@ -427,7 +427,7 @@ def main():

    # Coverage report
    if coverage:
-       os.system('make coverage')
+       os.system('make QuestaCoverage')
    # Count the number of failures
    if num_fail:
        print(f"{bcolors.FAIL}Regression failed with %s failed configurations{bcolors.ENDC}" % num_fail)
--- a/bin/wally-tool-chain-install.sh
+++ b/bin/wally-tool-chain-install.sh
@ -176,6 +176,10 @@ git clone https://github.com/riscv/sail-riscv.git
 cd sail-riscv
 # For now, use checkout that is stable for Wally
 #git checkout 72b2516d10d472ac77482fd959a9401ce3487f60  # not new enough for Zicboz?
+export OPAMCLI=2.0  # Sail is not compatible with opam 2.1 as of 4/16/24
+# It is faster to just build c_emulator/riscv_sim_RV* than to build all of Sail
+#make -j ${NUM_THREADS}
+#ARCH=RV32 make -j ${NUM_THREADS}
 make -j ${NUM_THREADS} c_emulator/riscv_sim_RV64
 ARCH=RV32 make -j ${NUM_THREADS} c_emulator/riscv_sim_RV32
 sudo ln -sf $RISCV/sail-riscv/c_emulator/riscv_sim_RV64 /usr/bin/riscv_sim_RV64
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@ -99,7 +99,6 @@ localparam RK          = LOGR*DIVCOPIES;                            // r*k bits

 // intermediate division parameters not directly used in fdivsqrt hardware
 localparam FPDIVMINb   = NF + 2; // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit to allow sqrt being shifted right
-//localparam FPDIVMINb   = NF + 2 + (RADIX == 2); // minimum length of fractional part: Nf result bits + guard and round bits + 1 extra bit for preshifting radix2 square root right, if radix4 doesn't use a right shift.  This version saves one cycle on double-precision with R=4,k=4.  However, it doesn't work yet because C is too short, so k is incorrectly calculated as a 1 in the lsb after the last step.
 localparam DIVMINb     = ((FPDIVMINb<XLEN) & IDIV_ON_FPU) ? XLEN : FPDIVMINb; // minimum fractional bits b = max(XLEN, FPDIVMINb)
 localparam RESBITS     = DIVMINb + LOGR; // number of bits in a result: r integer + b fractional

@ -111,12 +110,18 @@ localparam DIVBLEN     = $clog2(DIVb+1);                            // enough bi

 // largest length in IEU/FPU
 localparam BASECVTLEN = `max(XLEN, NF); // convert length excluding Zfa fcvtmod.w.d
-localparam CVTLEN = ZFA_SUPPORTED ? `max(BASECVTLEN, 32'd84) : BASECVTLEN; // fcvtmod.w.d needs at least 32+52 because a double with 52 fractional bits might be into upper bits of 32 bit word
+localparam CVTLEN = (ZFA_SUPPORTED & D_SUPPORTED) ? `max(BASECVTLEN, 32'd84) : BASECVTLEN; // fcvtmod.w.d needs at least 32+52 because a double with 52 fractional bits might be into upper bits of 32 bit word
 localparam LLEN = `max($unsigned(FLEN), $unsigned(XLEN));
 localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));
+
+// NORMSHIFTSZ is the number of bits out of the normalization shifter
+// RV32F: max(32+23+1, 2(23)+4, 3(23)+6) = 3*23+6 = 75
+// RV64F: max(64+23+1, 64 + 23 + 2, 3*23+6) = 89
+// RV64D: max(84+52+1, 64+52+2, 3*52+6) = 162
 localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (3*NF+6));
-localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));
-localparam CORRSHIFTSZ = `max((NORMSHIFTSZ-2), (DIVMINb + 1 + NF));
+
+localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));                  // log_2(NORMSHIFTSZ)
+localparam CORRSHIFTSZ = NORMSHIFTSZ-2;                             // Drop leading 2 integer bits


 // Disable spurious Verilator warnings
--- a/sim/Makefile
+++ b/sim/Makefile
@ -17,28 +17,28 @@ all: riscoftests memfiles coveragetests deriv

 wally-riscv-arch-test: wallyriscoftests memfiles

-coverage: cov/rv64gc_arch64i.ucdb 
+QuestaCoverage: questa/cov/rv64gc_arch64i.ucdb 
 	#iter-elf.bash --cover --search ../tests/coverage
-	vcover merge -out cov/cov.ucdb cov/rv64gc_arch64i.ucdb cov/rv64gc*.ucdb -logfile cov/log
-#	vcover merge -out cov/cov.ucdb cov/rv64gc_arch64i.ucdb cov/rv64gc*.ucdb cov/buildroot_buildroot.ucdb riscv.ucdb -logfile cov/log
-	vcover report -details cov/cov.ucdb > cov/rv64gc_coverage_details.rpt
-	vcover report cov/cov.ucdb -details -instance=/core/ebu. > cov/rv64gc_coverage_ebu.rpt
-	vcover report cov/cov.ucdb -details -instance=/core/priv. > cov/rv64gc_coverage_priv.rpt
-	vcover report cov/cov.ucdb -details -instance=/core/ifu. > cov/rv64gc_coverage_ifu.rpt
-	vcover report cov/cov.ucdb -details -instance=/core/lsu. > cov/rv64gc_coverage_lsu.rpt
-	vcover report cov/cov.ucdb -details -instance=/core/fpu. > cov/rv64gc_coverage_fpu.rpt
-	vcover report cov/cov.ucdb -details -instance=/core/ieu. > cov/rv64gc_coverage_ieu.rpt
-	vcover report cov/cov.ucdb -below 100 -details -instance=/core/ebu. > cov/rv64gc_uncovered_ebu.rpt
-	vcover report cov/cov.ucdb -below 100 -details -instance=/core/priv. > cov/rv64gc_uncovered_priv.rpt
-	vcover report cov/cov.ucdb -below 100 -details -instance=/core/ifu. > cov/rv64gc_uncovered_ifu.rpt
-	vcover report cov/cov.ucdb -below 100 -details -instance=/core/lsu. > cov/rv64gc_uncovered_lsu.rpt
-	vcover report cov/cov.ucdb -below 100 -details -instance=/core/fpu. > cov/rv64gc_uncovered_fpu.rpt
-	vcover report cov/cov.ucdb -below 100 -details -instance=/core/ieu. > cov/rv64gc_uncovered_ieu.rpt
-	vcover report -hierarchical cov/cov.ucdb > cov/rv64gc_coverage_hierarchical.rpt
-	vcover report -below 100 -hierarchical cov/cov.ucdb > cov/rv64gc_uncovered_hierarchical.rpt
-#	vcover report -below 100 cov/cov.ucdb > cov/rv64gc_coverage.rpt
-#	vcover report -recursive cov/cov.ucdb > cov/rv64gc_recursive.rpt
-	vcover report -details -threshH 100 -html cov/cov.ucdb
+	vcover merge -out questa/cov/cov.ucdb questa/cov/rv64gc_arch64i.ucdb questa/cov/rv64gc*.ucdb -logfile questa/cov/log
+#	vcover merge -out questa/cov/cov.ucdb questa/cov/rv64gc_arch64i.ucdb questa/cov/rv64gc*.ucdb questa/cov/buildroot_buildroot.ucdb riscv.ucdb -logfile questa/cov/log
+	vcover report -details questa/cov/cov.ucdb > questa/cov/rv64gc_coverage_details.rpt
+	vcover report questa/cov/cov.ucdb -details -instance=/core/ebu. > questa/cov/rv64gc_coverage_ebu.rpt
+	vcover report questa/cov/cov.ucdb -details -instance=/core/priv. > questa/cov/rv64gc_coverage_priv.rpt
+	vcover report questa/cov/cov.ucdb -details -instance=/core/ifu. > questa/cov/rv64gc_coverage_ifu.rpt
+	vcover report questa/cov/cov.ucdb -details -instance=/core/lsu. > questa/cov/rv64gc_coverage_lsu.rpt
+	vcover report questa/cov/cov.ucdb -details -instance=/core/fpu. > questa/cov/rv64gc_coverage_fpu.rpt
+	vcover report questa/cov/cov.ucdb -details -instance=/core/ieu. > questa/cov/rv64gc_coverage_ieu.rpt
+	vcover report questa/cov/cov.ucdb -below 100 -details -instance=/core/ebu. > questa/cov/rv64gc_uncovered_ebu.rpt
+	vcover report questa/cov/cov.ucdb -below 100 -details -instance=/core/priv. > questa/cov/rv64gc_uncovered_priv.rpt
+	vcover report questa/cov/cov.ucdb -below 100 -details -instance=/core/ifu. > questa/cov/rv64gc_uncovered_ifu.rpt
+	vcover report questa/cov/cov.ucdb -below 100 -details -instance=/core/lsu. > questa/cov/rv64gc_uncovered_lsu.rpt
+	vcover report questa/cov/cov.ucdb -below 100 -details -instance=/core/fpu. > questa/cov/rv64gc_uncovered_fpu.rpt
+	vcover report questa/cov/cov.ucdb -below 100 -details -instance=/core/ieu. > questa/cov/rv64gc_uncovered_ieu.rpt
+	vcover report -hierarchical questa/cov/cov.ucdb > questa/cov/rv64gc_coverage_hierarchical.rpt
+	vcover report -below 100 -hierarchical questa/cov/cov.ucdb > questa/cov/rv64gc_uncovered_hierarchical.rpt
+#	vcover report -below 100 questa/cov/cov.ucdb > questa/cov/rv64gc_coverage.rpt
+#	vcover report -recursive questa/cov/cov.ucdb > questa/cov/rv64gc_recursive.rpt
+	vcover report -details -threshH 100 -html questa/cov/cov.ucdb

 allclean: clean all

--- a/src/fpu/postproc/shiftcorrection.sv
+++ b/src/fpu/postproc/shiftcorrection.sv
@ -28,7 +28,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
-  input logic  [P.NORMSHIFTSZ-1:0] Shifted,                // the shifted sum before LZA correction
+  input logic  [P.NORMSHIFTSZ-1:0] Shifted,                // normalization shifter output
  // divsqrt
  input logic                      DivOp,                  // is it a divsqrt operation
  input logic                      DivResSubnorm,          // is the divsqrt result subnormal
@ -41,37 +41,39 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  input logic                      FmaSZero,
  // output
  output logic [P.NE+1:0]          FmaMe,                  // exponent of the normalized sum
-  output logic [P.CORRSHIFTSZ-1:0] Mf,                     // the shifted sum before LZA correction
+  output logic [P.CORRSHIFTSZ-1:0] Mf,                     // the shifted sum after correction
  output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
 );

-  logic [P.CORRSHIFTSZ-1:0]        CorrSumShifted;         // the shifted sum after LZA correction
-  logic [P.CORRSHIFTSZ-1:0]        CorrQm0, CorrQm1;       // portions of Shifted to select for CorrQmShifted
-  logic [P.CORRSHIFTSZ-1:0]        CorrQmShifted;          // the shifted divsqrt result after one bit shift
+  logic [P.CORRSHIFTSZ-1:0]        CorrShifted;         // the shifted FMA sum or divsqrt result after the one-bit correction shift
  logic                            ResSubnorm;             // is the result Subnormal
  logic                            LZAPlus1;               // add one or two to the sum's exponent due to LZA correction
  logic                            LeftShiftQm;            // should the divsqrt result be shifted one to the left
+  logic                            RightShift;             // shift right by 1

-  // LZA correction
-  assign LZAPlus1 = Shifted[P.NORMSHIFTSZ-1];
-
+  // *** 4/16/24 this code is a mess and needs cleaning and explaining
+  // define bit widths
+  // seems to shift by 0, 1, or 2.  right and left shift is confusing
+  
+  // FMA LZA correction
  // correct the shifting error caused by the LZA
  //  - the only possible mantissa for a plus two is all zeroes 
-  //  - a one has to propigate all the way through a sum. so we can leave the bottom statement alone
-  mux2 #(P.NORMSHIFTSZ-2) lzacorrmux(Shifted[P.NORMSHIFTSZ-3:0], Shifted[P.NORMSHIFTSZ-2:1], LZAPlus1, CorrSumShifted);
+  //  - a one has to propagate all the way through a sum. so we can leave the bottom statement alone
+  assign LZAPlus1 = Shifted[P.NORMSHIFTSZ-1];

-  // correct the shifting of the divsqrt caused by producing a result in (2, .5] range
+  // correct the shifting of the divsqrt caused by producing a result in (0.5, 2) range
  // condition: if the msb is 1 or the exponent was one, but the shifted quotient was < 1 (Subnorm)
  assign LeftShiftQm = (LZAPlus1|(DivUe==1&~LZAPlus1));
-  assign CorrQm0     = Shifted[P.NORMSHIFTSZ-3:P.NORMSHIFTSZ-P.CORRSHIFTSZ-2];
-  assign CorrQm1     = Shifted[P.NORMSHIFTSZ-2:P.NORMSHIFTSZ-P.CORRSHIFTSZ-1];
-  mux2 #(P.CORRSHIFTSZ) divcorrmux(CorrQm0, CorrQm1, LeftShiftQm, CorrQmShifted);
  
+  assign RightShift = FmaOp ? LZAPlus1 : LeftShiftQm;
+
+  // one bit right shift for FMA or division
+  mux2 #(P.NORMSHIFTSZ-2) corrmux(Shifted[P.NORMSHIFTSZ-3:0], Shifted[P.NORMSHIFTSZ-2:1], RightShift, CorrShifted);
+
  // if the result of the divider was calculated to be subnormal, then the result was correctly normalized, so select the top shifted bits
  always_comb
-    if(FmaOp)                       Mf = {CorrSumShifted};
-    else if (DivOp&~DivResSubnorm)  Mf = CorrQmShifted;
-    else                            Mf = Shifted[P.NORMSHIFTSZ-1:P.NORMSHIFTSZ-P.CORRSHIFTSZ];
+    if (FmaOp | DivOp & !DivResSubnorm) Mf = CorrShifted;
+    else                                Mf = Shifted[P.NORMSHIFTSZ-1:2]; 
    
  // Determine sum's exponent
  //  main exponent issues: 
@ -86,7 +88,7 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  // recalculate if the result is subnormal after LZA correction
  assign ResSubnorm = FmaPreResultSubnorm&~Shifted[P.NORMSHIFTSZ-2]&~Shifted[P.NORMSHIFTSZ-1];

-  // the quotent is in the range [.5,2) if there is no early termination
+  // the quotient is in the range (.5,2) if there is no early termination
  // if the quotient < 1 and not Subnormal then subtract 1 to account for the normalization shift
  assign Ue = (DivResSubnorm & DivSubnormShiftPos) ? 0 : DivUe - {(P.NE+1)'(0), ~LZAPlus1};
 endmodule
--- a/src/privileged/csrc.sv
+++ b/src/privileged/csrc.sv
@ -104,13 +104,13 @@ module csrc  import cvw::*;  #(parameter cvw_t P) (
    assign CounterEvent[9]  = RASPredPCWrongM & InstrValidNotFlushedM;                   // return address stack wrong address
    assign CounterEvent[10] = IClassWrongM & InstrValidNotFlushedM;                      // instruction class predictor wrong
    assign CounterEvent[11] = LoadStallM;                                                // Load Stalls. don't want to suppress on flush as this only happens if flushed.
-    assign CounterEvent[12] = StoreStallM;                                               // depricated Store Stall
+    assign CounterEvent[12] = StoreStallM;                                               // Store Stall
    assign CounterEvent[13] = DCacheAccess;                                              // data cache access
    assign CounterEvent[14] = DCacheMiss;                                                // data cache miss. Miss asserted 1 cycle at start of cache miss
-    assign CounterEvent[15] = DCacheStallM;                                              // d cache miss cycles
+    assign CounterEvent[15] = DCacheStallM;                                              // D$ miss cycles
    assign CounterEvent[16] = ICacheAccess;                                              // instruction cache access
    assign CounterEvent[17] = ICacheMiss;                                                // instruction cache miss. Miss asserted 1 cycle at start of cache miss
-    assign CounterEvent[18] = ICacheStallF;                                              // i cache miss cycles
+    assign CounterEvent[18] = ICacheStallF;                                              // I$ miss cycles
    assign CounterEvent[19] = CSRWriteM & InstrValidNotFlushedM;                         // CSR writes
    assign CounterEvent[20] = InvalidateICacheM & InstrValidNotFlushedM;                 // fence.i
    assign CounterEvent[21] = sfencevmaM & InstrValidNotFlushedM;                        // sfence.vma