Updated riscv64-unknown-elf-gcc location so that it can be easily accessed

2021-07-20 14:18:13 -04:00 · 2021-07-20 14:18:13 -04:00 · 89dc9ba6e4
commit 89dc9ba6e4
parent 69c6a7d2cc 6b72b1f859
31 changed files with 721 additions and 628 deletions
--- a/wally-pipelined/config/buildroot/wally-config.vh
+++ b/wally-pipelined/config/buildroot/wally-config.vh
@@ -43,15 +43,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1 // Domenico Ottolia 4/15: Support for vectored interrupts in _tvec csrs. Just implemented in src/privileged/trap.sv around line 75. Pretty sure this should be 1.

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Legal number of PMP entries are 0, 16, or 64
 `define PMP_ENTRIES 16

--- a/wally-pipelined/config/busybear/wally-config.vh
+++ b/wally-pipelined/config/busybear/wally-config.vh
@@ -44,15 +44,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1 // Domenico Ottolia 4/15: Support for vectored interrupts in _tvec csrs. Just implemented in src/privileged/trap.sv around line 75. Pretty sure this should be 1.

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Legal number of PMP entries are 0, 16, or 64
 `define PMP_ENTRIES 16

--- a/wally-pipelined/config/coremark-64i/wally-config.vh
+++ b/wally-pipelined/config/coremark-64i/wally-config.vh
@@ -1,84 +0,0 @@
-//////////////////////////////////////////
-// wally-config.vh
-//
-// Written: David_Harris@hmc.edu 4 January 2021
-// Modified: 
-//
-// Purpose: Specify which features are configured
-//          Macros to determine which modes are supported based on MISA
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-// include shared configuration
-`include "wally-shared.vh"
-
-// RV32 or RV64: XLEN = 32 or 64
-`define XLEN 64
-
-//`define MISA (32'h00000104)
-`define MISA (32'h00000104 | 1<<5 | 1<<18 | 1 << 20)
-`define ZCSR_SUPPORTED 1
-`define ZCOUNTERS_SUPPORTED 1
-
-// Microarchitectural Features
-`define UARCH_PIPELINED 1
-`define UARCH_SUPERSCALR 0
-`define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
-`define MEM_DTIM 1
-`define MEM_ICACHE 0
-`define MEM_VIRTMEM 0
-
-// Address space
-`define RESET_VECTOR 64'h0000000000001000
-
-// Bus Interface width
-`define AHBW 64
-
-// Peripheral Addresses
-// Peripheral memory space extends from BASE to BASE+RANGE
-// Range should be a thermometer code with 0's in the upper bits and 1s in the lower bits
-
-`define BOOTTIM_SUPPORTED 1'b1
-`define BOOTTIM_BASE   56'h00001000 
-`define BOOTTIM_RANGE  56'h00000FFF
-`define TIM_SUPPORTED 1'b1
-`define TIM_BASE       56'h80000000
-`define TIM_RANGE      56'h07FFFFFF
-`define CLINT_SUPPORTED 1'b1
-`define CLINT_BASE  56'h02000000
-`define CLINT_RANGE 56'h0000FFFF
-`define GPIO_SUPPORTED 1'b1
-`define GPIO_BASE   56'h10012000
-`define GPIO_RANGE  56'h000000FF
-`define UART_SUPPORTED 1'b1
-`define UART_BASE   56'h10000000
-`define UART_RANGE  56'h00000007
-`define PLIC_SUPPORTED 1'b1
-`define PLIC_BASE   56'h0C000000
-`define PLIC_RANGE  56'h03FFFFFF
-
-// Test modes
-
-// Tie GPIO outputs back to inputs
-`define GPIO_LOOPBACK_TEST 0
-
-
-// Hardware configuration
-`define UART_PRESCALE 1
-
--- a/wally-pipelined/config/coremark/wally-config.vh
+++ b/wally-pipelined/config/coremark/wally-config.vh
@@ -43,15 +43,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 0
 `define VECTORED_INTERRUPTS_SUPPORTED 1

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Address space
 `define RESET_VECTOR 64'h00000000000100b0

--- a/wally-pipelined/config/coremark_bare/wally-config.vh
+++ b/wally-pipelined/config/coremark_bare/wally-config.vh
@@ -44,15 +44,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Legal number of PMP entries are 0, 16, or 64
 `define PMP_ENTRIES 64

--- a/wally-pipelined/config/rv32ic/wally-config.vh
+++ b/wally-pipelined/config/rv32ic/wally-config.vh
@@ -42,15 +42,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Legal number of PMP entries are 0, 16, or 64
 `define PMP_ENTRIES 16

--- a/wally-pipelined/config/rv32icfd/wally-config.vh
+++ b/wally-pipelined/config/rv32icfd/wally-config.vh
@@ -42,15 +42,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Legal number of PMP entries are 0, 16, or 64
 `define PMP_ENTRIES 16

--- a/wally-pipelined/config/rv64BP/wally-config.vh
+++ b/wally-pipelined/config/rv64BP/wally-config.vh
@@ -44,15 +44,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Address space
 `define RESET_VECTOR 64'h0000000000000000

--- a/wally-pipelined/config/rv64ic/wally-config.vh
+++ b/wally-pipelined/config/rv64ic/wally-config.vh
@@ -43,15 +43,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Legal number of PMP entries are 0, 16, or 64
 `define PMP_ENTRIES 64

@@ -73,7 +84,7 @@
 `define BOOTTIM_RANGE  56'h00000FFF
 `define TIM_SUPPORTED 1'b1
 `define TIM_BASE       56'h80000000
-`define TIM_RANGE      56'h07FFFFFF
+`define TIM_RANGE      56'h7FFFFFFF
 `define CLINT_SUPPORTED 1'b1
 `define CLINT_BASE  56'h02000000
 `define CLINT_RANGE 56'h0000FFFF
--- a/wally-pipelined/config/rv64icfd/wally-config.vh
+++ b/wally-pipelined/config/rv64icfd/wally-config.vh
@@ -43,15 +43,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Legal number of PMP entries are 0, 16, or 64
 `define PMP_ENTRIES 16

--- a/wally-pipelined/config/rv64imc/wally-config.vh
+++ b/wally-pipelined/config/rv64imc/wally-config.vh
@@ -42,15 +42,26 @@
 `define UARCH_PIPELINED 1
 `define UARCH_SUPERSCALR 0
 `define UARCH_SINGLECYCLE 0
-`define MEM_DCACHE 0
+`define MEM_DCACHE 1
 `define MEM_DTIM 1
-`define MEM_ICACHE 0
+`define MEM_ICACHE 1
 `define MEM_VIRTMEM 1
 `define VECTORED_INTERRUPTS_SUPPORTED 1

+// TLB configuration.  Entries should be a power of 2
 `define ITLB_ENTRIES 32
 `define DTLB_ENTRIES 32

+// Cache configuration.  Sizes should be a power of two
+// typical configuration 4 ways, 4096 bytes per way, 256 bit or more blocks
+`define DCACHE_NUMWAYS 4
+`define DCACHE_WAYSIZEINBYTES 2048
+`define DCACHE_BLOCKLENINBITS 256
+`define DCACHE_REPLBITS 3
+`define ICACHE_NUMWAYS 1
+`define ICACHE_WAYSIZEINBYTES 4096
+`define ICACHE_BLOCKLENINBITS 256
+
 // Address space
 `define RESET_VECTOR 64'h0000000080000000

--- a/wally-pipelined/linux-testgen/testvector-generation/debugBuildroot.sh
+++ b/wally-pipelined/linux-testgen/testvector-generation/debugBuildroot.sh
@@ -14,7 +14,7 @@ outDir="../linux-testvectors"
 # Uncomment this version for QEMU debugging of kernel
 #  - good for poking around VM if it boots up
 #  - good for running QEMU commands (press "Ctrl-A" then "c" to open QEMU command prompt)
-$customQemu -M virt -nographic -bios $imageDir/fw_jump.elf -kernel $imageDir/Image -append "root=/dev/vda ro" -initrd $imageDir/rootfs.cpio 
+#$customQemu -M virt -nographic -bios $imageDir/fw_jump.elf -kernel $imageDir/Image -append "root=/dev/vda ro" -initrd $imageDir/rootfs.cpio 
 # Uncomment this version for GDB debugging of kernel
 #  - attempts to load in symbols from "vmlinux"
 #  - good for looking at backtraces when Linux gets stuck for some reason 
@@ -30,9 +30,9 @@ $customQemu -M virt -nographic -bios $imageDir/fw_jump.elf -kernel $imageDir/Ima
 # - Makes qemu_in_gdb_format.txt
 # - Splits qemu_in_gdb_format.txt into chunks of 100,000 instrs
 #cat $intermedDir/qemu_output.txt | ./parse_qemu.py >$intermedDir/qemu_in_gdb_format.txt
-#cd $intermedDir
-#split -d -l 5600000 ./qemu_in_gdb_format.txt --verbose
-#cd ../../testvector-generation
+cd $intermedDir
+split -d -l 5000000 ./qemu_in_gdb_format.txt --verbose
+cd ../../testvector-generation

 # Uncomment this version for parse_gdb_output.py debugging
 # - Uses qemu_in_gdb_format.txt
--- a/wally-pipelined/linux-testgen/testvector-generation/gdbinit_debug
+++ b/wally-pipelined/linux-testgen/testvector-generation/gdbinit_debug
@@ -6,3 +6,5 @@ c
 file ../buildroot-image-output/vmlinux
 b plic_init
 c
+b do_idle
+c
--- a/wally-pipelined/regression/regression-wally.py
+++ b/wally-pipelined/regression/regression-wally.py
@@ -23,11 +23,11 @@ TestCase = namedtuple("TestCase", ['name', 'cmd', 'grepstr'])

 # edit this list to add more test cases
 configs = [
-    TestCase(
-        name="busybear",
-        cmd="vsim -do wally-busybear-batch.do -c > {}",
-        grepstr="loaded 100000 instructions"
-    ),
+    #TestCase(
+    #    name="busybear",
+    #    cmd="vsim -do wally-busybear-batch.do -c > {}",
+    #    grepstr="loaded 100000 instructions"
+    #),
    TestCase(
        name="buildroot",
        cmd="vsim -do wally-buildroot-batch.do -c > {}",
--- a/wally-pipelined/src/cache/dcache.sv
+++ b/wally-pipelined/src/cache/dcache.sv
@@ -46,6 +46,8 @@ module dcache
   output logic [`XLEN-1:0]    ReadDataM, 
   output logic 	       DCacheStall,
   output logic 	       CommittedM,
+   output logic 	       DCacheMiss,
+   output logic 	       DCacheAccess,

   // inputs from TLB and PMA/P
   input logic 		       ExceptionM,
@@ -53,7 +55,7 @@ module dcache
   input logic 		       DTLBMissM,
   input logic 		       CacheableM,
   input logic 		       DTLBWriteM,
-   input logic 		       ITLBWriteF,   
+   input logic 		       ITLBWriteF, 
   // from ptw
   input logic 		       SelPTW,
   input logic 		       WalkerPageFaultM, 
@@ -66,10 +68,14 @@ module dcache
   output logic [`XLEN-1:0]    HWDATA // to ahb
   );

-  localparam integer	       BLOCKLEN = 256;
+/*  localparam integer	       BLOCKLEN = 256;
  localparam integer	       NUMLINES = 64;
  localparam integer	       NUMWAYS = 4;
-  localparam integer	       NUMREPL_BITS = 3;
+  localparam integer	       NUMREPL_BITS = 3;*/
+  localparam integer	       BLOCKLEN = `DCACHE_BLOCKLENINBITS;
+  localparam integer	       NUMLINES = `DCACHE_WAYSIZEINBYTES*8/BLOCKLEN;
+  localparam integer	       NUMWAYS = `DCACHE_NUMWAYS;
+  localparam integer	       NUMREPL_BITS = `DCACHE_REPLBITS;

  localparam integer	       BLOCKBYTELEN = BLOCKLEN/8;
  localparam integer	       OFFSETLEN = $clog2(BLOCKBYTELEN);
@@ -416,7 +422,7 @@ module dcache
    if (reset)    CurrState <= #1 STATE_READY;
    else CurrState <= #1 NextState;

-
+  
  // next state logic and some state ouputs.
  always_comb begin
    DCacheStall = 1'b0;
@@ -437,6 +443,8 @@ module dcache
    CommittedM = 1'b0;        
    SelUncached = 1'b0;
    SelEvict = 1'b0;
+    DCacheAccess = 1'b0;
+    DCacheMiss = 1'b0;

    case (CurrState)
      STATE_READY: begin
@@ -472,7 +480,8 @@ module dcache
 	// read hit valid cached
 	else if(MemRWM[1] & CacheableM & ~(ExceptionM | PendingInterruptM) & CacheHit & ~DTLBMissM) begin
 	  DCacheStall = 1'b0;
-
+	  DCacheAccess = 1'b1;
+	  
 	  if(StallW) begin
 	    NextState = STATE_CPU_BUSY;
            SelAdrM = 1'b1;
@@ -485,6 +494,7 @@ module dcache
 	  DCacheStall = 1'b0;
 	  SRAMWordWriteEnableM = 1'b1;
 	  SetDirtyM = 1'b1;
+	  DCacheStall = 1'b1;
 	  
 	  if(StallW) begin 
 	    NextState = STATE_CPU_BUSY;
@@ -497,6 +507,8 @@ module dcache
 	  NextState = STATE_MISS_FETCH_WDV;
 	  CntReset = 1'b1;
 	  DCacheStall = 1'b1;
+	  DCacheAccess = 1'b1;
+	  DCacheMiss = 1'b1;
 	end
 	// uncached write
 	else if(MemRWM[0] & ~CacheableM & ~(ExceptionM | PendingInterruptM) & ~DTLBMissM) begin
--- a/wally-pipelined/src/cache/icache.sv
+++ b/wally-pipelined/src/cache/icache.sv
@@ -53,9 +53,8 @@ module icache
   );

  // Configuration parameters
-  // TODO Move these to a config file
-  localparam integer 	    BLOCKLEN = 256;
-  localparam integer 	    NUMLINES = 512;
+  localparam integer 	    BLOCKLEN = `ICACHE_BLOCKLENINBITS;
+  localparam integer 	    NUMLINES = `ICACHE_WAYSIZEINBYTES*8/`ICACHE_BLOCKLENINBITS;

  // Input signals to cache memory
  logic 		    FlushMem;
--- a/wally-pipelined/src/fpu/fma.sv
+++ b/wally-pipelined/src/fpu/fma.sv
@@ -89,15 +89,15 @@ module fma1(
    input logic     [2:0]       FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
    input logic                 FmtE,       // precision 1 = double 0 = single
    output logic    [2*`NF+1:0]     ProdManE,   // 1.X frac * 1.Y frac in U(2.2Nf) format
-    output logic    [3*`NF+5:0]     AlignedAddendE, // Z aligned for addition in *** format
+    output logic    [3*`NF+5:0]     AlignedAddendE, // Z aligned for addition in U(NF+5.2NF+1)
    output logic    [`NE+1:0]      ProdExpE,       // X exponent + Y exponent - bias in B(NE+2.0) format; adds 2 bits to allow for size of number and negative sign
    output logic                AddendStickyE,  // sticky bit that is calculated during alignment
    output logic                KillProdE      // set the product to zero before addition if the product is too small to matter
    );

    logic [`NE+1:0]    AlignCnt;           // how far to shift the addend to align with the product in Q(NE+2.0) format *** is this enough bits?
-    logic [4*`NF+5:0]   ZManShifted;                // output of the alignment shifter including sticky bit
-    logic [4*`NF+5:0]   ZManPreShifted;     // input to the alignment shifter
+    logic [4*`NF+5:0]   ZManShifted;                // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
+    logic [4*`NF+5:0]   ZManPreShifted;     // input to the alignment shifter U(NF+5.3NF+1)
    
    ///////////////////////////////////////////////////////////////////////////////
    // Calculate the product
@@ -132,7 +132,7 @@ module fma1(
    //                       |1'b0| addnend |

    // the 1'b0 before the added is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
-    assign ZManPreShifted = {55'b0, {ZAssumed1E, ZFracE}, 106'b0};
+    assign ZManPreShifted = {(`NF+3)'(0), {ZAssumed1E, ZFracE}, /*106*/(2*`NF+2)'(0)};
    always_comb
        begin
           
@@ -140,7 +140,7 @@ module fma1(

        //          |   54'b0    |  106'b(product)  | 2'b0 |
        //  | addnend |
-        if ($signed(AlignCnt) <= $signed(-13'd56)) begin
+        if ($signed(AlignCnt) <= /*$signed(-13'd56)*/-(`NF+4)) begin
            KillProdE = 1;
            ZManShifted = ZManPreShifted;//{107'b0, {~ZAssumed1E, ZFrac}, 54'b0};
            AddendStickyE = ~(XZeroE|YZeroE);
@@ -149,7 +149,7 @@ module fma1(

        //          |   54'b0    |  106'b(product)  | 2'b0 |
        //                  | addnend |
-        end else if($signed(AlignCnt) <= $signed(13'd0))  begin
+        end else if($signed(AlignCnt) <= 0)  begin
            KillProdE = 0;
            ZManShifted = ZManPreShifted << -AlignCnt;
            AddendStickyE = |(ZManShifted[51:0]);
@@ -158,7 +158,7 @@ module fma1(

        //          |   54'b0    |  106'b(product)  | 2'b0 |
        //                                  | addnend |
-        end else if ($signed(AlignCnt)<=$signed(13'd106))  begin
+        end else if ($signed(AlignCnt)<=(2*`NF+2))  begin
            KillProdE = 0;
            ZManShifted = ZManPreShifted >> AlignCnt;
            AddendStickyE = |(ZManShifted[51:0]);
@@ -176,7 +176,7 @@ module fma1(

        end
    end
-    assign AlignedAddendE = ZManShifted[213:52];
+    assign AlignedAddendE = ZManShifted[(4*`NF+5):`NF];
 endmodule


--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -25,23 +25,23 @@
 `include "wally-config.vh"

 module fpu (
-  input logic 		         clk,
-  input logic 		         reset,
-  input logic [2:0]        FRM_REGW,   // Rounding mode from CSR
-  input logic [31:0]       InstrD,
-  input logic [`XLEN-1:0]  ReadDataW,     // Read data from memory
-  input logic [`XLEN-1:0]  SrcAE,      // Integer input being processed
-  input logic [`XLEN-1:0]  SrcAM,      // Integer input being written into fpreg
-  input logic 		         StallE, StallM, StallW,
-  input logic 		         FlushE, FlushM, FlushW,
-  input logic [4:0]        RdE, RdM, RdW, 
-  output logic          FRegWriteM,
-  output logic 		      FStallD,    // Stall the decode stage
-  output logic 		      FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
-  output logic [`XLEN-1:0] FWriteDataE,      // Data to be written to memory
-  output logic [`XLEN-1:0] FIntResM,     
-  output logic 		      FDivBusyE,        // Is the divison/sqrt unit busy
-  output logic 		      IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+  input logic 		   clk,
+  input logic 		   reset,
+  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
+  input logic [31:0] 	   InstrD,
+  input logic [`XLEN-1:0]  ReadDataW, // Read data from memory
+  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed
+  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg
+  input logic 		   StallE, StallM, StallW,
+  input logic 		   FlushE, FlushM, FlushW,
+  input logic [4:0] 	   RdE, RdM, RdW, 
+  output logic 		   FRegWriteM,
+  output logic 		   FStallD, // Stall the decode stage
+  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
+  output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
+  output logic [`XLEN-1:0] FIntResM, 
+  output logic 		   FDivBusyE, // Is the division/sqrt unit busy
+  output logic 		   IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
  output logic [4:0] 	   SetFflagsM);      // FPU result
 // *** change FMA to do 16 - 32 - 64 - 128 FEXPBITS 
 // *** folder at same level of src for tests fpu tests
@@ -50,254 +50,256 @@ module fpu (
  generate
     if (`F_SUPPORTED | `D_SUPPORTED) begin 
      // control logic signal instantiation
-      logic 		   FRegWriteD, FRegWriteE, FRegWriteW;              // FP register write enable
-      logic [2:0] 	FrmD, FrmE, FrmM;                                  // FP rounding mode
-      logic 		   FmtD, FmtE, FmtM, FmtW;                                  // FP precision 0-single 1-double
-      logic 		   FDivStartD, FDivStartE;                                  // Start division
-      logic 		   FWriteIntD;                                              // Write to integer register
-      logic [1:0]    FForwardXE, FForwardYE, FForwardZE;                        // Input3 forwarding mux control signal
-      logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
-      logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM;                  // Select which opperation to do in each component
-      logic [1:0]    FResSelD, FResSelE, FResSelM;  
-      logic [1:0]    FIntResSelD, FIntResSelE, FIntResSelM;                                   
-      logic [4:0] 	Adr1E, Adr2E, Adr3E;
-      
-      // regfile signals
-      logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
-      logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
-      logic [`XLEN-1:0]   FSrcXMAligned;
-      logic [63:0] 	FSrcXE, FSrcXM;                         // Input 1 to the various units (after forwarding)
-      logic [63:0] 	FSrcYE;                                      // Input 2 to the various units (after forwarding)
-      logic [63:0] 	FSrcZE;                                      // Input 3 to the various units (after forwarding)
-      
-      // unpacking signals
-      logic XSgnE, YSgnE, ZSgnE;
-      logic [10:0] XExpE, YExpE, ZExpE;
-      logic [51:0] XFracE, YFracE, ZFracE;
-      logic        XAssumed1E, YAssumed1E, ZAssumed1E;
-      logic XNaNE, YNaNE, ZNaNE;
-      logic XSNaNE, YSNaNE, ZSNaNE;
-      logic XDenormE, YDenormE, ZDenormE;
-      logic XZeroE, YZeroE, ZZeroE;
-      logic [10:0] BiasE;
-      logic XInfE, YInfE, ZInfE;
-      logic XExpMaxE;
-      logic XNormE;
-
-      logic XSgnM, YSgnM, ZSgnM;
-      logic [10:0] XExpM, YExpM, ZExpM;
-      logic [51:0] XFracM, YFracM, ZFracM;
-      logic XNaNM, YNaNM, ZNaNM;
-      logic XSNaNM, YSNaNM, ZSNaNM;
-      logic XZeroM, YZeroM, ZZeroM;
-      logic XInfM, YInfM, ZInfM;
-      
-      // div/sqrt signals
-      logic [63:0] 	FDivResultM, FDivResultW;
-      logic [4:0]    FDivSqrtFlgM, FDivSqrtFlgW;
-      logic          FDivSqrtDoneE;
-      logic [63:0] 	DivInput1E, DivInput2E;
-      logic          HoldInputs;                                              // keep forwarded inputs arround durring division
-      
-      //fpu signals
-      logic [63:0]   FMAResM, FMAResW;
-      logic [4:0]    FMAFlgM, FMAFlgW;
-
-
-      logic [63:0]   ReadResW;
-
-      // add/cvt signals
-      logic [63:0] 	FAddResM, FAddResW;
-      logic [4:0] 	FAddFlgM, FAddFlgW;  
-      logic [63:0] 	CvtResE, CvtResM;
-      logic [4:0] 	CvtFlgE, CvtFlgM;  
-      
-      // cmp signals 
-      logic 		   CmpNVE, CmpNVM, CmpNVW;
-      logic [63:0] 	CmpResE, CmpResM, CmpResW;
-      
-      // fsgn signals
-      logic [63:0] 	SgnResE, SgnResM;
-      logic        	SgnNVE, SgnNVM, SgnNVW;
-      logic [63:0]   FResM, FResW;
-      logic [4:0]         FFlgM, FFlgW;
-      
-      // instantiation of W stage regfile signals
-      logic [63:0] 	AlignedSrcAM;
-      
-      // classify signals
-      logic [63:0] 	ClassResE, ClassResM;
-      
-      // 64-bit FPU result   
-      logic [63:0] 	FPUResultW;                                           
-      logic [4:0] 	FPUFlagsW;
-      
-
-      //DECODE STAGE
-      
-      // top-level controller for FPU
-      fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
-                  .FRM_REGW, .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
-                  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
-      
-      // regfile instantiation
-      fregfile fregfile (clk, reset, FRegWriteW,
-            InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
-            FPUResultW,
-            FRD1D, FRD2D, FRD3D);	
-
-      //*****************
-      // D/E pipe registers
-      //*****************
-      flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
-      flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
-      flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-      flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
-      flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-                                                            {Adr1E,         Adr2E,         Adr3E});
-      flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-                           {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD},
-                           {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE});
-
-
-      //EXECUTION STAGE
-      
-      // Hazard unit for FPU
-      fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, .FStallD, 
+	logic 		   FRegWriteD, FRegWriteE, FRegWriteW;                 // FP register write enable
+	logic [2:0] 	   FrmD, FrmE, FrmM;                                   // FP rounding mode
+	logic 		   FmtD, FmtE, FmtM, FmtW;                             // FP precision 0-single 1-double
+	logic 		   FDivStartD, FDivStartE;                             // Start division
+	logic 		   FWriteIntD;                                         // Write to integer register
+	logic [1:0] 	   FForwardXE, FForwardYE, FForwardZE;                 // Input3 forwarding mux control signal
+	logic [2:0] 	   FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select FP result
+	logic [3:0] 	   FOpCtrlD, FOpCtrlE, FOpCtrlM;                       // Select which operation to do in each component
+	logic [1:0] 	   FResSelD, FResSelE, FResSelM;  
+	logic [1:0] 	   FIntResSelD, FIntResSelE, FIntResSelM;                                   
+	logic [4:0] 	   Adr1E, Adr2E, Adr3E;
+	
+	// regfile signals
+	logic [63:0] 	   FRD1D, FRD2D, FRD3D;                                // Read Data from FP register - decode stage
+	logic [63:0] 	   FRD1E, FRD2E, FRD3E;                                // Read Data from FP register - execute stage
+	logic [`XLEN-1:0]  FSrcXMAligned;
+	logic [63:0] 	   FSrcXE, FSrcXM;                                     // Input 1 to the various units (after forwarding)
+	logic [63:0] 	   FSrcYE;                                             // Input 2 to the various units (after forwarding)
+	logic [63:0] 	   FSrcZE;                                             // Input 3 to the various units (after forwarding)
+	
+	// unpacking signals
+	logic 		   XSgnE, YSgnE, ZSgnE;
+	logic [10:0] 	   XExpE, YExpE, ZExpE;
+	logic [51:0] 	   XFracE, YFracE, ZFracE;
+	logic 		   XAssumed1E, YAssumed1E, ZAssumed1E;
+	logic 		   XNaNE, YNaNE, ZNaNE;
+	logic 		   XSNaNE, YSNaNE, ZSNaNE;
+	logic 		   XDenormE, YDenormE, ZDenormE;
+	logic 		   XZeroE, YZeroE, ZZeroE;
+	logic [10:0] 	   BiasE;
+	logic 		   XInfE, YInfE, ZInfE;
+	logic 		   XExpMaxE;
+	logic 		   XNormE;
+	
+	logic 		   XSgnM, YSgnM, ZSgnM;
+	logic [10:0] 	   XExpM, YExpM, ZExpM;
+	logic [51:0] 	   XFracM, YFracM, ZFracM;
+	logic 		   XNaNM, YNaNM, ZNaNM;
+	logic 		   XSNaNM, YSNaNM, ZSNaNM;
+	logic 		   XZeroM, YZeroM, ZZeroM;
+	logic 		   XInfM, YInfM, ZInfM;
+	
+	// div/sqrt signals
+	logic [63:0] 	   FDivResultM, FDivResultW;                                // fpdiv result and its M/W pipe register copy
+	logic [4:0] 	   FDivSqrtFlgM, FDivSqrtFlgW;
+	logic 		   FDivSqrtDoneE;                                           // fpdiv done output; also clears the input capture registers
+	logic [63:0] 	   DivInput1E, DivInput2E;                                  // FSrcXE/FSrcYE captured for fpdiv
+	logic 		   HoldInputs;                                              // keep forwarded inputs around during division
+	
+	//fpu signals
+	logic [63:0] 	   FMAResM, FMAResW;
+	logic [4:0] 	   FMAFlgM, FMAFlgW;
+	
+	logic [63:0] 	   ReadResW;
+	
+	// add/cvt signals
+	logic [63:0] 	   FAddResM, FAddResW;
+	logic [4:0] 	   FAddFlgM, FAddFlgW;  
+	logic [63:0] 	   CvtResE, CvtResM;
+	logic [4:0] 	   CvtFlgE, CvtFlgM;  
+	
+	// cmp signals 
+	logic 		   CmpNVE, CmpNVM, CmpNVW;
+	logic [63:0] 	   CmpResE, CmpResM, CmpResW;
+	
+	// fsgn signals
+	logic [63:0] 	   SgnResE, SgnResM;
+	logic 		   SgnNVE, SgnNVM, SgnNVW;
+	logic [63:0] 	   FResM, FResW;
+	logic [4:0] 	   FFlgM, FFlgW;
+	
+	// instantiation of W stage regfile signals
+	logic [63:0] 	   AlignedSrcAM;
+	
+	// classify signals
+	logic [63:0] 	   ClassResE, ClassResM;
+	
+	// 64-bit FPU result   
+	logic [63:0] 	   FPUResultW;                                           
+	logic [4:0] 	   FPUFlagsW;
+	
+	//DECODE STAGE
+	
+	// top-level controller for FPU
+	fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
+                     .FRM_REGW, .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+                     .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
+	
+	// regfile instantiation
+	fregfile fregfile (clk, reset, FRegWriteW,
+			   InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
+			   FPUResultW,
+			   FRD1D, FRD2D, FRD3D);	
+	
+	//*****************
+	// D/E pipe registers
+	//*****************
+	flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
+	flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
+	flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
+	flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
+	flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+                                   {Adr1E,         Adr2E,         Adr3E});
+	flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+				  {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD},
+				  {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE});
+	
+	//EXECUTION STAGE
+	
+	// Hazard unit for FPU
+	fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, .FStallD, 
                        .FForwardXE, .FForwardYE, .FForwardZE);
-
-      // forwarding muxs
-      mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
-      mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FSrcYE);
-      mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FSrcZE);
-
-      unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XFracE, .YFracE, .ZFracE, .XAssumed1E, .YAssumed1E, .ZAssumed1E, .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
+	
+	// forwarding muxes
+	mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
+	mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FSrcYE);
+	mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FSrcZE);
+	
+	unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), 
+			    .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .XSgnE, .YSgnE, 
+			    .ZSgnE, .XExpE, .YExpE, .ZExpE, .XFracE, .YFracE, .ZFracE, 
+			    .XAssumed1E, .YAssumed1E, .ZAssumed1E, .XNaNE, .YNaNE, .ZNaNE, 
+			    .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
+			    .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
      // first of two-stage instance of floating-point fused multiply-add unit
-      fma fma (.clk, .reset, .FlushM, .StallM, 
-               .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XFracE, .YFracE, .ZFracE, .XAssumed1E, .YAssumed1E, .ZAssumed1E, .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
-               .XSgnM, .YSgnM, .ZSgnM, .XExpM, .YExpM, .ZExpM, .XFracM, .YFracM, .ZFracM, .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
-              //  .FSrcXE, .FSrcYE, .FSrcZE, .FSrcXM, .FSrcYM, .FSrcZM, 
-               .FOpCtrlE(FOpCtrlE[2:0]), .FOpCtrlM(FOpCtrlM[2:0]), 
-               .FmtE, .FmtM, .FrmM, .FMAFlgM, .FMAResM);
-      
-      // first and only instance of floating-point divider
-      logic fpdivClk;
-      
-      clockgater fpdivclkg(.E(FDivStartE),
-            .SE(1'b0),
-            .CLK(clk),
-            .ECLK(fpdivClk));
-      
-      // capture the inputs for div/sqrt	 
-      flopenrc #(64) reg_input1 (.d(FSrcXE), .q(DivInput1E),
-                  .en(1'b1), .clear(FDivSqrtDoneE),
-                  .reset(reset),  .clk(HoldInputs));
-      flopenrc #(64) reg_input2 (.d(FSrcYE), .q(DivInput2E),
-                  .en(1'b1), .clear(FDivSqrtDoneE),
-                  .reset(reset),  .clk(HoldInputs));
-      //*** add round to nearest ties to max magnitude
-      fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .done(FDivSqrtDoneE), .rm(FrmE[1:0]),	.op_type(FOpCtrlE[0]), .P(~FmtE), .FDivBusyE, .HoldInputs, 
-                      .OvEn(1'b1), .UnEn(1'b1),	.start(FDivStartE), .reset, .clk(~clk), .AS_Result(FDivResultM), .Flags(FDivSqrtFlgM));
+	fma fma (.clk, .reset, .FlushM, .StallM,
+		 // execute-stage unpacked operands, one field per line
+		 .XSgnE, .YSgnE, .ZSgnE,
+		 .XExpE, .YExpE, .ZExpE,
+		 .XFracE, .YFracE, .ZFracE,
+		 .XAssumed1E, .YAssumed1E, .ZAssumed1E,
+		 .XDenormE, .YDenormE, .ZDenormE,
+		 .XZeroE, .YZeroE, .ZZeroE,
+		 .BiasE,
+		 // memory-stage copies of the operands and their special-value flags
+		 .XSgnM, .YSgnM, .ZSgnM,
+		 .XExpM, .YExpM, .ZExpM,
+		 .XFracM, .YFracM, .ZFracM,
+		 .XNaNM, .YNaNM, .ZNaNM,
+		 .XZeroM, .YZeroM, .ZZeroM,
+		 .XInfM, .YInfM, .ZInfM,
+		 .XSNaNM, .YSNaNM, .ZSNaNM,
+		 //  .FSrcXE, .FSrcYE, .FSrcZE, .FSrcXM, .FSrcYM, .FSrcZM,
+		 // only the low three bits of the operation control are passed to the FMA
+		 .FOpCtrlE(FOpCtrlE[2:0]), .FOpCtrlM(FOpCtrlM[2:0]),
+		 .FmtE, .FmtM, .FrmM,
+		 .FMAFlgM, .FMAResM);
+	
+	// first and only instance of floating-point divider
+	logic 		   fpdivClk;
+	
+	clockgater fpdivclkg(.E(FDivStartE),
+			     .SE(1'b0),
+			     .CLK(clk),
+			     .ECLK(fpdivClk));
+	
+	// capture the inputs for div/sqrt	 
+	flopenrc #(64) reg_input1 (.d(FSrcXE), .q(DivInput1E),
+				   .en(1'b1), .clear(FDivSqrtDoneE),
+				   .reset(reset),  .clk(HoldInputs));
+	flopenrc #(64) reg_input2 (.d(FSrcYE), .q(DivInput2E),
+				   .en(1'b1), .clear(FDivSqrtDoneE),
+				   .reset(reset),  .clk(HoldInputs));
+	//*** add round to nearest ties to max magnitude
+	fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .done(FDivSqrtDoneE), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
+			.P(~FmtE), .FDivBusyE, .HoldInputs, 
+			.OvEn(1'b1), .UnEn(1'b1),
+			.start(FDivStartE), .reset, .clk(~clk), .AS_Result(FDivResultM), .Flags(FDivSqrtFlgM));
+	
        // .DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
        //                 .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
        //                 .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
-      // assign FDivBusyE = 0;
-      // first of two-stage instance of floating-point add/cvt unit
-      faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM,
-                        .FSrcXE, .FSrcYE, .FOpCtrlE, .FAddResM, .FAddFlgM);
-      
-      // first and only instance of floating-point comparator
-      fcmp fcmp (.op1({XSgnE,XExpE,XFracE}), .op2({YSgnE,YExpE,YFracE}), .FSrcXE, .FSrcYE, .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .Invalid(CmpNVE), .CmpResE, .XNaNE, .YNaNE, .XZeroE, .YZeroE);
-      
-      // first and only instance of floating-point sign converter
-      fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .XExpE, .XFracE, .FmtE, .SgnResE, .SgnNVE, .XExpMaxE);
-      
-      // first and only instance of floating-point classify unit
-      fclassify fclassify (.XSgnE, .XFracE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, .XSNaNE, .ClassResE);
-
-
-      fcvt fcvt (.XSgnE, .XExpE, .XFracE, .XAssumed1E, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE, .CvtResE, .CvtFlgE);
-
-      // output for store instructions
-      assign FWriteDataE = FSrcYE[`XLEN-1:0];
-
-      //*****************
-      // E/M pipe registers
-      //*****************
-      flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
-      // flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, FSrcYE, FSrcYM);
-      // flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, FSrcZE, FSrcZM);
-      flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XFracE}, {XSgnM,XExpM,XFracM});
-      flopenrc #(64) EMFpReg5(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YFracE}, {YSgnM,YExpM,YFracM});
-      flopenrc #(64) EMFpReg6(clk, reset, FlushM, ~StallM, {ZSgnE,ZExpE,ZFracE}, {ZSgnM,ZExpM,ZFracM});
-      flopenrc #(12) EMFpReg7(clk, reset, FlushM, ~StallM, 
-                          {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
-                          {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});
-
-      
-     
-      flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
-      flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
-      
-      flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
-      flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
-      
-      flopenrc #(64) EMRegCvt1(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
-      flopenrc #(5) EMRegCvt2(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
-      
-      flopenrc #(17) EMCtrlReg(clk, reset, FlushM, ~StallM,
-                           {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
-                           {FRegWriteM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
-
-      flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
-
-      //BEGIN MEMORY STAGE
-      mux4  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, CvtResM, FResSelM, FResM);
-      mux4  #(5)  FFlgMux(5'b0, {4'b0, SgnNVM}, {4'b0, CmpNVM}, CvtFlgM, FResSelM, FFlgM);
-
-      // mux2  #(`XLEN)  FSrcXAlignedMux({{`XLEN-32{1'b0}}, FSrcXM[63:32]}, FSrcXM[63:64-`XLEN], FmtM, FSrcXMAligned);
-      mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], FSrcXM[`XLEN-1:0], ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);
-      
-      // Align SrcA to MSB when single precicion
-      mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAM[31:0]}, {{64-`XLEN{1'b1}}, SrcAM}, FmtM, AlignedSrcAM);
-      mux5  #(5)  FPUFlgMux(5'b0, FMAFlgM, FAddFlgM, FDivSqrtFlgM, FFlgM, FResultSelW, SetFflagsM);
-
-      //*****************
-      // M/W pipe registers
-      //*****************
-      flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-      
-      flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
-      
-      flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
-      
-      flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);
-
-      flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
-      
-      flopenrc #(6) MWCtrlReg(clk, reset, FlushW, ~StallW,
-                           {FRegWriteM, FResultSelM, FmtM, FWriteIntM},
-                           {FRegWriteW, FResultSelW, FmtW, FWriteIntW});
-      
-   //#########################################
-   // BEGIN WRITEBACK STAGE
-   //#########################################
-
-      mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
-      mux5  #(64)  FPUResultMux(ReadResW, FMAResW, FAddResW, FDivResultW, FResW, FResultSelW, FPUResultW);
-      
-
-   end else begin // no F_SUPPORTED; tie outputs low
-     assign FStallD = 0;
-     assign FWriteIntE = 0; 
-     assign FWriteIntM = 0;
-     assign FWriteIntW = 0;
-     assign FWriteDataE = 0;
-     assign FIntResM = 0;
-     assign FDivBusyE = 0;
-     assign IllegalFPUInstrD = 1;
-     assign SetFflagsM = 0;
-   end
+	// assign FDivBusyE = 0;
+	
+	// first of two-stage instance of floating-point add/cvt unit
+	faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM,
+                         .FSrcXE, .FSrcYE, .FOpCtrlE, .FAddResM, .FAddFlgM);
+	
+	// first and only instance of floating-point comparator
+	fcmp fcmp (.op1({XSgnE,XExpE,XFracE}), .op2({YSgnE,YExpE,YFracE}), .FSrcXE, 
+		   .FSrcYE, .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, 
+		   .Invalid(CmpNVE), .CmpResE, .XNaNE, .YNaNE, .XZeroE, .YZeroE);
+	
+	// first and only instance of floating-point sign converter
+	fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .XExpE, .XFracE, .FmtE, .SgnResE, .SgnNVE, .XExpMaxE);
+	
+	// first and only instance of floating-point classify unit
+	fclassify fclassify (.XSgnE, .XFracE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, .XSNaNE, .ClassResE);
+	
+	fcvt fcvt (.XSgnE, .XExpE, .XFracE, .XAssumed1E, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE, .CvtResE, .CvtFlgE);
+	
+	// output for store instructions
+	assign FWriteDataE = FSrcYE[`XLEN-1:0];
+	
+	//*****************
+	// E/M pipe registers
+	//*****************
+	flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
+	// flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, FSrcYE, FSrcYM);
+	// flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, FSrcZE, FSrcZM);
+	flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XFracE}, {XSgnM,XExpM,XFracM});
+	flopenrc #(64) EMFpReg5(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YFracE}, {YSgnM,YExpM,YFracM});
+	flopenrc #(64) EMFpReg6(clk, reset, FlushM, ~StallM, {ZSgnE,ZExpE,ZFracE}, {ZSgnM,ZExpM,ZFracM});
+	flopenrc #(12) EMFpReg7(clk, reset, FlushM, ~StallM, 
+				{XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
+				{XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});
+	
+	flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
+	flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
+	
+	flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
+	flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
+	
+	flopenrc #(64) EMRegCvt1(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
+	flopenrc #(5) EMRegCvt2(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
+	
+	flopenrc #(17) EMCtrlReg(clk, reset, FlushM, ~StallM,
+				 {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
+				 {FRegWriteM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
+	
+	flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
+	
+	//BEGIN MEMORY STAGE
+	mux4  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, CvtResM, FResSelM, FResM);
+	mux4  #(5)  FFlgMux(5'b0, {4'b0, SgnNVM}, {4'b0, CmpNVM}, CvtFlgM, FResSelM, FFlgM);
+	
+	// mux2  #(`XLEN)  FSrcXAlignedMux({{`XLEN-32{1'b0}}, FSrcXM[63:32]}, FSrcXM[63:64-`XLEN], FmtM, FSrcXMAligned);
+	mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], FSrcXM[`XLEN-1:0], ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);
+	
+	// Widen SrcA to 64 bits by filling the upper bits with 1s; the first mux input keeps only SrcAM[31:0] (single precision)
+	mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAM[31:0]}, {{64-`XLEN{1'b1}}, SrcAM}, FmtM, AlignedSrcAM);
+	mux5  #(5)  FPUFlgMux(5'b0, FMAFlgM, FAddFlgM, FDivSqrtFlgM, FFlgM, FResultSelM, SetFflagsM); // all flag inputs and SetFflagsM are M-stage, so select with FResultSelM rather than the W-stage copy
+	
+	//*****************
+	// M/W pipe registers
+	//*****************
+	flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
+	flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
+	flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
+	flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);
+	flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
+	flopenrc #(6) MWCtrlReg(clk, reset, FlushW, ~StallW,
+				{FRegWriteM, FResultSelM, FmtM, FWriteIntM},
+				{FRegWriteW, FResultSelW, FmtW, FWriteIntW});
+	
+	//#########################################
+	// BEGIN WRITEBACK STAGE
+	//#########################################
+	mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
+	mux5  #(64)  FPUResultMux(ReadResW, FMAResW, FAddResW, FDivResultW, FResW, FResultSelW, FPUResultW);
+	
+	
+     end else begin // no F_SUPPORTED; tie outputs low
+	assign FStallD = 0;
+	assign FWriteIntE = 0; 
+	assign FWriteIntM = 0;
+	assign FWriteIntW = 0;
+	assign FWriteDataE = 0;
+	assign FIntResM = 0;
+	assign FDivBusyE = 0;
+	assign IllegalFPUInstrD = 1;
+	assign SetFflagsM = 0;
+     end
  endgenerate 
-  
+   
 endmodule // fpu
--- a/wally-pipelined/src/fpu/fsm.sv
+++ b/wally-pipelined/src/fpu/fsm.sv
@ -6,7 +6,7 @@ module fsm (done, load_rega, load_regb, load_regc,
   input 	clk;
   input 	reset;
   input 	start;
-//    input 	error;
+   //    input 	error;
   input  	op_type;
   //***can use divbusy insted of holdinputs
   output       done;      
@ -113,8 +113,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S1:
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -129,8 +129,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S2: // iteration 1
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -145,8 +145,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S3:
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -161,8 +161,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S4: // iteration 2
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -177,8 +177,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S5:
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -193,8 +193,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S6: // iteration 3
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -209,8 +209,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S7:
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -225,8 +225,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S8: // q,qm,qp
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -241,8 +241,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S9:  // rem
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -257,8 +257,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S10:  // done
 	    begin
 	       done = 1'b1;
-		   divBusy = 1'b0;
-		   holdInputs = 1'b0;
+	       divBusy = 1'b0;
+	       holdInputs = 1'b0;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -273,8 +273,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S13:  // start of sqrt path
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -289,8 +289,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S14:  
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -305,8 +305,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S15:  // iteration 1
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -321,8 +321,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S16:  
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -337,8 +337,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S17:  
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -353,8 +353,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S18:  // iteration 2
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -369,8 +369,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S19:  
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -385,8 +385,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S20:  
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -401,8 +401,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S21:  // iteration 3
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -417,8 +417,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S22:  
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -433,8 +433,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S23:  
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -449,8 +449,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S24: // q,qm,qp
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -465,8 +465,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  S25:  // rem
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b1;
-		   holdInputs = 1'b1;
+	       divBusy = 1'b1;
+	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -476,13 +476,13 @@ module fsm (done, load_rega, load_regb, load_regc,
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b110;
 	       sel_muxr = 1'b1;
-	       NEXT_STATE = S26;
-	    end 	  
+	       NEXT_STATE = S27; // NOTE(review): was S26 (the "done" state that follows) before this change; confirm S27 is intended and still reaches done
+	    end 
 	  S26:  // done
 	    begin
 	       done = 1'b1;
-		   divBusy = 1'b0;
-		   holdInputs = 1'b0;
+	       divBusy = 1'b0;
+	       holdInputs = 1'b0;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -497,8 +497,8 @@ module fsm (done, load_rega, load_regb, load_regc,
 	  default: 
 	    begin
 	       done = 1'b0;
-		   divBusy = 1'b0;
-		   holdInputs = 1'b0;
+	       divBusy = 1'b0;
+	       holdInputs = 1'b0;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
--- a/wally-pipelined/src/lsu/lsu.sv
+++ b/wally-pipelined/src/lsu/lsu.sv
@ -45,6 +45,8 @@ module lsu
   output logic 	       CommittedM, 
   output logic 	       SquashSCW,
   output logic 	       DataMisalignedM,
+   output logic 	       DCacheMiss,
+   output logic 	       DCacheAccess,

   // address and write data
   input logic [`XLEN-1:0]     MemAdrM,
@ -315,6 +317,8 @@ module lsu
 		.ReadDataM(HPTWReadPTE),
 		.DCacheStall(DCacheStall),
 		.CommittedM(CommittedMfromDCache),
+		.DCacheMiss,
+		.DCacheAccess,		
 		.ExceptionM(ExceptionM),
 		.PendingInterruptM(PendingInterruptMtoDCache),
 		.DTLBMissM(DTLBMissM),
--- a/wally-pipelined/src/mmu/pmpadrdec.sv
+++ b/wally-pipelined/src/mmu/pmpadrdec.sv
@ -34,9 +34,10 @@ module pmpadrdec (
  input  logic [7:0]       PMPCfg,
  input  logic [`XLEN-1:0] PMPAdr,
  input  logic             PAgePMPAdrIn,
-  input  logic             NoLowerMatchIn,
+//  input  logic             NoLowerMatchIn,
+  input  logic             FirstMatch,
  output logic             PAgePMPAdrOut,
-  output logic             NoLowerMatchOut,
+//  output logic             NoLowerMatchOut,
  output logic             Match, Active, 
  output logic             L, X, W, R
 );
@ -47,7 +48,7 @@ module pmpadrdec (

  logic TORMatch, NAMatch;
  logic PAltPMPAdr;
-  logic FirstMatch;
+//  logic FirstMatch;
  logic [`PA_BITS-1:0] CurrentAdrFull;
  logic [1:0] AdrMode;

@ -69,16 +70,30 @@ module pmpadrdec (

  // verilator lint_off UNOPTFLAT
  logic [`PA_BITS-1:0] Mask;
-  genvar i;
+  //genvar i;
  
  // create a mask of which bits to ignore
-  generate
-    assign Mask[1:0] = 2'b11;
-    assign Mask[2] = (AdrMode == NAPOT); // mask has 0s in upper bis for NA4 region
-    for (i=3; i < `PA_BITS; i=i+1) begin:mask
-      assign Mask[i] = Mask[i-1] & PMPAdr[i-3]; // NAPOT mask: 1's indicate bits to ignore
-    end
-   endgenerate
+  // generate
+  //   assign Mask[1:0] = 2'b11;
+  //   assign Mask[2] = (AdrMode == NAPOT); // mask has 0s in upper bis for NA4 region
+  //   for (i=3; i < `PA_BITS; i=i+1) begin:mask
+  //     assign Mask[i] = Mask[i-1] & PMPAdr[i-3]; // NAPOT mask: 1's indicate bits to ignore
+  //   end
+  // endgenerate
+  prioritycircuit #(.ENTRIES(`PA_BITS-2), .FINAL_OP("NONE")) maskgen(.a(~PMPAdr[`PA_BITS-3:0]), .FirstPin(AdrMode==NAPOT), .y(Mask[`PA_BITS-1:2]));
+  assign Mask[1:0] = 2'b11;
+
+  // *** possible experiments:
+  /* PA < PMP addr could be in its own module, 
+        preserving hierarchy so we can know if this is the culprit on the critical path
+        Should take logarithmic time, so more like 6 levels than 40 should be expected
+
+    update mask generation
+        Should be concurrent with the subtraction/comparison
+        if one is the critical path, the other shouldn't be which makes us think the mask generation is the culprit.
+
+    Hopefully just use the priority circuit here
+    */
  // verilator lint_on UNOPTFLAT

  assign NAMatch = &((PhysicalAddress ~^ CurrentAdrFull) | Mask);
@ -87,8 +102,6 @@ module pmpadrdec (
                 (AdrMode == NA4 || AdrMode == NAPOT) ? NAMatch :
                 0;

-  assign FirstMatch =      NoLowerMatchIn & Match;
-  assign NoLowerMatchOut = NoLowerMatchIn & ~Match;
  assign L = PMPCfg[7] & FirstMatch;
  assign X = PMPCfg[2] & FirstMatch;
  assign W = PMPCfg[1] & FirstMatch;
--- a/wally-pipelined/src/mmu/pmpchecker.sv
+++ b/wally-pipelined/src/mmu/pmpchecker.sv
@ -55,12 +55,9 @@ module pmpchecker (
  // Bit i is high when the address falls in PMP region i
  logic                    EnforcePMP;
  logic [7:0]              PMPCfg[`PMP_ENTRIES-1:0];
-  logic [`PMP_ENTRIES-1:0] Match;      // PMP Entry matches
+  logic [`PMP_ENTRIES-1:0] Match, FirstMatch;      // PMP Entry matches
  logic [`PMP_ENTRIES-1:0] Active;     // PMP register i is non-null
  logic [`PMP_ENTRIES-1:0] L, X, W, R; // PMP matches and has flag set
-  // verilator lint_off UNOPTFLAT
-  logic [`PMP_ENTRIES-1:0]   NoLowerMatch; // None of the lower PMP entries match
-  // verilator lint_on UNOPTFLAT
  logic [`PMP_ENTRIES-1:0]   PAgePMPAdr;  // for TOR PMP matching, PhysicalAddress > PMPAdr[i]
  genvar i,j;

@ -70,9 +67,9 @@ module pmpchecker (
    .PMPAdr(PMPADDR_ARRAY_REGW),
    .PAgePMPAdrIn({PAgePMPAdr[`PMP_ENTRIES-2:0], 1'b1}),
    .PAgePMPAdrOut(PAgePMPAdr),
-    .NoLowerMatchIn({NoLowerMatch[`PMP_ENTRIES-2:0], 1'b1}),
-    .NoLowerMatchOut(NoLowerMatch),
-    .Match, .Active, .L, .X, .W, .R);
+    .FirstMatch, .Match, .Active, .L, .X, .W, .R);
+
+  prioritycircuit #(.ENTRIES(`PMP_ENTRIES), .FINAL_OP("AND")) pmppriority(.a(Match), .FirstPin(1'b1), .y(FirstMatch)); // Take the ripple gates/signals out of the pmpadrdec and into another unit.

  // Only enforce PMP checking for S and U modes when at least one PMP is active or in Machine mode when L bit is set in selected region
  assign EnforcePMP = (PrivilegeModeW == `M_MODE) ? |L : |Active; 
--- a/wally-pipelined/src/mmu/prioritycircuit.sv
+++ b/wally-pipelined/src/mmu/prioritycircuit.sv
@ -1,5 +1,5 @@
 ///////////////////////////////////////////
-// tlbpriority.sv
+// prioritycircuit.sv
 //
 // Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021
 // Modified: Teo Ene 15 Apr 2021:
@ -30,8 +30,10 @@

 `include "wally-config.vh"

-module tlbpriority #(parameter ENTRIES = 8) (
+module prioritycircuit #(parameter ENTRIES = 8,
+                         parameter FINAL_OP = "AND") (
  input  logic  [ENTRIES-1:0] a,
+  input  logic                FirstPin,
  output logic  [ENTRIES-1:0] y
 );
  // verilator lint_off UNOPTFLAT
@ -40,11 +42,19 @@ module tlbpriority #(parameter ENTRIES = 8) (
  // generate thermometer code mask
  genvar i;
  generate
-    assign nolower[0] = 1;
+    assign nolower[0] = FirstPin;
    for (i=1; i<ENTRIES; i++) begin:therm
      assign nolower[i] = nolower[i-1] & ~a[i-1];
    end
  endgenerate
  // verilator lint_on UNOPTFLAT
-  assign y = a & nolower;
+  
+  // Output stage chosen at elaboration time by the FINAL_OP parameter.
+  // NOTE(review): any FINAL_OP other than "AND" or "NONE" leaves y undriven; consider adding a final else branch.
+  generate
+    if (FINAL_OP=="AND") begin
+      assign y = a & nolower;  // y[i] is set only if a[i] is set, FirstPin is set, and no lower bit of a is set
+    end else if (FINAL_OP=="NONE") begin
+      assign y = nolower;      // expose the raw chain: y[i] is set while FirstPin is set and all lower bits of a are clear
+    end // *** So far these are the only two operations I need to do at the end, but feel free to add more as needed.
+  endgenerate
+  // assign y = a & nolower;
 endmodule
--- a/wally-pipelined/src/mmu/tlblru.sv
+++ b/wally-pipelined/src/mmu/tlblru.sv
@ -39,7 +39,7 @@ module tlblru #(parameter TLB_ENTRIES = 8) (
  logic                AllUsed;  // High if the next access causes all RU bits to be 1

  // Find the first line not recently used
-  tlbpriority #(TLB_ENTRIES) nru(~RUBits, WriteLines);
+  prioritycircuit #(.ENTRIES(TLB_ENTRIES), .FINAL_OP("AND")) nru(.a(~RUBits), .FirstPin(1'b1), .y(WriteLines));

  // Track recently used lines, updating on a CAM Hit or TLB write
  assign WriteEnables = WriteLines & {(TLB_ENTRIES){TLBWrite}};
--- a/wally-pipelined/src/privileged/csr.sv
+++ b/wally-pipelined/src/privileged/csr.sv
@ -46,6 +46,8 @@ module csr #(parameter
  input  logic 		   RASPredPCWrongM,
  input  logic 		   BPPredClassNonCFIWrongM,
  input  logic [4:0]       InstrClassM,
+  input  logic             DCacheMiss,
+  input  logic             DCacheAccess,
  input  logic [1:0]       NextPrivilegeModeM, PrivilegeModeW,
  input  logic [`XLEN-1:0] CauseM, NextFaultMtvalM,
  input  logic             BreakpointFaultM, EcallFaultM,
--- a/wally-pipelined/src/privileged/csrc.sv
+++ b/wally-pipelined/src/privileged/csrc.sv
@ -78,6 +78,8 @@ module csrc #(parameter
    input  logic             RASPredPCWrongM,
    input  logic             BPPredClassNonCFIWrongM,
    input  logic [4:0]       InstrClassM,
+    input  logic             DCacheMiss,
+    input  logic             DCacheAccess,
    input  logic [11:0]      CSRAdrM,
    input  logic [1:0]       PrivilegeModeW,
    input  logic [`XLEN-1:0] CSRWriteValM,
@ -143,7 +145,9 @@ module csrc #(parameter
        assign CounterEvent[8] = RASPredPCWrongM & ~StallM;
        assign CounterEvent[9] = InstrClassM[3] & ~StallM;
        assign CounterEvent[10] = BPPredClassNonCFIWrongM & ~StallM;
-        assign CounterEvent[`COUNTERS-1:11] = 0; // eventually give these sources, including FP instructions, I$/D$ misses, branches and mispredictions
+        assign CounterEvent[11] = DCacheAccess & ~StallM;
+        assign CounterEvent[12] = DCacheMiss & ~StallM;      
+        assign CounterEvent[`COUNTERS-1:13] = 0; // eventually give these sources, including FP instructions, I$/D$ misses, branches and mispredictions

        for (i = 3; i < `COUNTERS; i = i+1) begin
            assign WriteHPMCOUNTERM[i] = CSRMWriteM && (CSRAdrM == MHPMCOUNTERBASE + i);
@ -509,4 +513,4 @@ module csrc #(parameter
        end // end for else
    endgenerate
 endmodule
-*/
+*/
--- a/wally-pipelined/src/privileged/privileged.sv
+++ b/wally-pipelined/src/privileged/privileged.sv
@ -45,6 +45,8 @@ module privileged (
  input  logic 		   RASPredPCWrongM,
  input  logic 		   BPPredClassNonCFIWrongM,
  input  logic [4:0]       InstrClassM,
+  input  logic             DCacheMiss,
+  input  logic             DCacheAccess,
  input  logic             PrivilegedM,
  input  logic             ITLBInstrPageFaultF, DTLBLoadPageFaultM, DTLBStorePageFaultM,
  input  logic             WalkerInstrPageFaultF, WalkerLoadPageFaultM, WalkerStorePageFaultM,
--- a/wally-pipelined/src/uncore/clint.sv
+++ b/wally-pipelined/src/uncore/clint.sv
@ -82,7 +82,7 @@ module clint (
      always_ff @(posedge HCLK or negedge HRESETn) 
        if (~HRESETn) begin
          MSIP <= 0;
-          MTIMECMP <= (`XLEN)'(-1);
+          MTIMECMP <= (64)'(0);
          // MTIMECMP is reset together with MSIP
        end else if (memwrite) begin
          if (entryd == 16'h0000) MSIP <= HWDATA[0];
@ -112,7 +112,7 @@ module clint (
      always_ff @(posedge HCLK or negedge HRESETn) 
        if (~HRESETn) begin
          MSIP <= 0;
-          MTIMECMP <= (`XLEN)'(-1);
+          MTIMECMP <= (64)'(0);
          // MTIMECMP is reset together with MSIP
        end else if (memwrite) begin
          if (entryd == 16'h0000) MSIP <= HWDATA[0];
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -164,6 +164,8 @@ module wallypipelinedhart
  
  logic 		    ExceptionM;
  logic 		    PendingInterruptM;
+  logic 		    DCacheMiss;
+  logic 		    DCacheAccess;

  
  ifu ifu(.InstrInF(InstrRData),
@ -185,7 +187,9 @@ module wallypipelinedhart
 	  .AtomicM(AtomicM),    
 	  .ExceptionM(ExceptionM),
 	  .PendingInterruptM(PendingInterruptM),		
-	  .CommittedM(CommittedM),          
+	  .CommittedM(CommittedM),
+	  .DCacheMiss,
+          .DCacheAccess,
 	  .SquashSCW(SquashSCW),            
 	  .DataMisalignedM(DataMisalignedM),
 	  .MemAdrE(MemAdrE),
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -737,12 +737,26 @@ endmodule
 module riscvassertions();
  // Legal number of PMP entries are 0, 16, or 64
  initial begin
-    assert (`PMP_ENTRIES == 0 || `PMP_ENTRIES==16 || `PMP_ENTRIES==64) else $error("Illegal number of PMP entries");
+    assert (`PMP_ENTRIES == 0 || `PMP_ENTRIES==16 || `PMP_ENTRIES==64) else $error("Illegal number of PMP entries: PMP_ENTRIES must be 0, 16, or 64");
    assert (`F_SUPPORTED || ~`D_SUPPORTED) else $error("Can't support double without supporting float");
    assert (`XLEN == 64 || ~`D_SUPPORTED) else $error("Wally does not yet support D extensions on RV32");
+    assert (`DCACHE_WAYSIZEINBYTES <= 4096 || `MEM_DCACHE == 0 || `MEM_VIRTMEM == 0) else $error("DCACHE_WAYSIZEINBYTES cannot exceed 4 KiB when caches and vitual memory is enabled (to prevent aliasing)");
+    assert (`DCACHE_BLOCKLENINBITS >= 128 || `MEM_DCACHE == 0) else $error("DCACHE_BLOCKLENINBITS must be at least 128 when caches are enabled");
+    assert (`DCACHE_BLOCKLENINBITS < `DCACHE_WAYSIZEINBYTES*8) else $error("DCACHE_BLOCKLENINBITS must be smaller than way size");
+    assert (`ICACHE_WAYSIZEINBYTES <= 4096 || `MEM_ICACHE == 0 || `MEM_VIRTMEM == 0) else $error("ICACHE_WAYSIZEINBYTES cannot exceed 4 KiB when caches and vitual memory is enabled (to prevent aliasing)");
+    assert (`ICACHE_BLOCKLENINBITS >= 32 || `MEM_ICACHE == 0) else $error("ICACHE_BLOCKLENINBITS must be at least 32 when caches are enabled");
+    assert (`ICACHE_BLOCKLENINBITS < `ICACHE_WAYSIZEINBYTES*8) else $error("ICACHE_BLOCKLENINBITS must be smaller than way size");
+    assert (2**$clog2(`DCACHE_BLOCKLENINBITS) == `DCACHE_BLOCKLENINBITS) else $error("DCACHE_BLOCKLENINBITS must be a power of 2");
+    assert (2**$clog2(`DCACHE_WAYSIZEINBYTES) == `DCACHE_WAYSIZEINBYTES) else $error("DCACHE_WAYSIZEINBYTES must be a power of 2");
+    assert (2**$clog2(`ICACHE_BLOCKLENINBITS) == `ICACHE_BLOCKLENINBITS) else $error("ICACHE_BLOCKLENINBITS must be a power of 2");
+    assert (2**$clog2(`ICACHE_WAYSIZEINBYTES) == `ICACHE_WAYSIZEINBYTES) else $error("ICACHE_WAYSIZEINBYTES must be a power of 2");
+    assert (`ICACHE_NUMWAYS == 1 || `MEM_ICACHE == 0) else $error("Multiple Instruction Cache ways not yet implemented");
+    assert (2**$clog2(`ITLB_ENTRIES) == `ITLB_ENTRIES) else $error("ITLB_ENTRIES must be a power of 2");
+    assert (2**$clog2(`DTLB_ENTRIES) == `DTLB_ENTRIES) else $error("DTLB_ENTRIES must be a power of 2");
  end
 endmodule

+
 /* verilator lint_on STMTDLY */
 /* verilator lint_on WIDTH */

--- a/wally-pipelined/testbench/testbench-linux.sv
+++ b/wally-pipelined/testbench/testbench-linux.sv
@ -27,7 +27,7 @@

 module testbench();
  
-  parameter waveOnICount = `BUSYBEAR*140000 + `BUILDROOT*0459700; // # of instructions at which to turn on waves in graphical sim
+  parameter waveOnICount = `BUSYBEAR*140000 + `BUILDROOT*0900000; // # of instructions at which to turn on waves in graphical sim
  parameter stopICount   = `BUSYBEAR*143898 + `BUILDROOT*0000000; // # instructions at which to halt sim completely (set to 0 to let it run as far as it can)  

  ///////////////////////////////////////////////////////////////////////////////
@ -103,6 +103,7 @@ module testbench();
  logic [99:0] StartCSRexpected[63:0];
  string StartCSRname[99:0];
  integer data_file_csr, scan_file_csr;
+  logic IllegalInstrFaultd;
  
  // -----------
  // Error Macro
@ -153,21 +154,22 @@ module testbench();
      clk <= 1; # 5; clk <= 0; # 5;
    end

+  // -------------------
+  // Additional Hardware
+  // -------------------
+  always @(posedge clk) // registered copy of the M-stage illegal-instruction fault, sampled each rising edge
+    IllegalInstrFaultd <= dut.hart.priv.IllegalInstrFaultM; // nonblocking: avoids a race with processes that read this on the same edge
+
  // -------------------------------------
  // Special warnings for important faults
  // -------------------------------------
  always @(dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW) begin
    if (dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW == 2 && instrs > 1) begin
-      $display("!!!!!! illegal instruction !!!!!!!!!!");
-      $display("(as a reminder, MCAUSE and MEPC are set by this)");
-      $display("at %0t ps, PCM %x, instr %0d, dut.hart.lsu.dcache.MemPAdrM %x", $time, dut.hart.ifu.PCM, instrs, dut.hart.lsu.dcache.MemPAdrM);
-      `ERROR
+      // This is sometimes okay if the source code intentionally causes it.
+      $display("Warning: illegal instruction exception at %0t ps, InstrNum %0d, PCM %x, InstrM %s", $time, instrs, dut.hart.ifu.PCM, PCtextM);
    end
    if (dut.hart.priv.csr.genblk1.csrm.MCAUSE_REGW == 5 && instrs != 0) begin
-      $display("!!!!!! illegal (physical) memory access !!!!!!!!!!");
-      $display("(as a reminder, MCAUSE and MEPC are set by this)");
-      $display("at %0t ps, PCM %x, instr %0d, dut.hart.lsu.dcache.MemPAdrM %x", $time, dut.hart.ifu.PCM, instrs, dut.hart.lsu.dcache.MemPAdrM);
-      `ERROR
+      $display("Warning: illegal physical memory access exception at %0t ps, InstrNum %0d, PCM %x, InstrM %s", $time, instrs, dut.hart.ifu.PCM, PCtextM);
    end
  end

@ -185,8 +187,14 @@ module testbench();
      // Hack to compensate for QEMU's incorrect MSTATUS
      end else if (PCtextW.substr(0,3) == "csrr" && PCtextW.substr(10,16) == "mstatus") begin
        force dut.hart.ieu.dp.regf.wd3 = dut.hart.ieu.dp.WriteDataW & ~64'ha00000000;
-      end else
-        release dut.hart.ieu.dp.regf.wd3;
+      end else release dut.hart.ieu.dp.regf.wd3;
+      // Hack to compensate for QEMU's correct but different MTVAL (according to spec, storing the faulting instr is an optional feature)
+      if (PCtextW.substr(0,3) == "csrr" && PCtextW.substr(10,14) == "mtval") begin
+        force dut.hart.ieu.dp.WriteDataW = 0;
+      // Hack to compensate for QEMU's correct but different mhpmcounter's (these too are optional)
+      end else if (PCtextW.substr(0,3) == "csrr" && PCtextW.substr(10,20) == "mhpmcounter") begin
+        force dut.hart.ieu.dp.WriteDataW = 0;
+      end else release dut.hart.ieu.dp.WriteDataW;
    end
  end

@ -194,120 +202,95 @@ module testbench();
  // Big Chunky Block
  // ----------------
  always @(reset or dut.hart.ifu.InstrRawD or dut.hart.ifu.PCD) begin// or negedge dut.hart.ifu.StallE) begin // Why do we care about StallE? Everything seems to run fine without it.
-    if(~dut.hart.lsu.dcache.MemRWM) begin // *** Should this need to consider dut.hart.lsu.dcache.MemRWM?
-      #2;
-      // If PCD/InstrD aren't garbage
-      if (~reset && dut.hart.ifu.InstrRawD[15:0] !== {16{1'bx}} && dut.hart.ifu.PCD !== 64'h0) begin // && ~dut.hart.ifu.StallE) begin
-        // If Wally's PCD has updated
-        if (dut.hart.ifu.PCD !== lastPCD) begin
-          lastInstrDExpected = InstrDExpected;
-          lastPC <= dut.hart.ifu.PCD;
-          lastPC2 <= lastPC;
-          // If PCD isn't going to be flushed
-          if (~PCDwrong || lastPC == PCDexpected) begin
+    #2;
+    // If PCD/InstrD aren't garbage
+    if (~reset && dut.hart.ifu.InstrRawD[15:0] !== {16{1'bx}} && dut.hart.ifu.PCD !== 64'h0) begin // && ~dut.hart.ifu.StallE) begin
+      // If Wally's PCD has updated
+      if (dut.hart.ifu.PCD !== lastPCD) begin
+        lastInstrDExpected = InstrDExpected;
+        lastPC <= dut.hart.ifu.PCD;
+        lastPC2 <= lastPC;
+        // If PCD isn't going to be flushed
+        if (~PCDwrong || lastPC == PCDexpected) begin
+          // Stop if we've reached the end
+          if($feof(data_file_PCF)) begin
+            $display("no more PC data to read... CONGRATULATIONS!!!");
+            `ERROR
+          end

-            // Stop if we've reached the end
-            if($feof(data_file_PCF)) begin
-              $display("no more PC data to read... CONGRATULATIONS!!!");
-              `ERROR
-            end
+          // Increment PC
+          `SCAN_PC(data_file_PCF, scan_file_PCF, PCtextF, PCtextF2, InstrFExpected, PCFexpected);
+          `SCAN_PC(data_file_PCD, scan_file_PCD, PCtextD, PCtextD2, InstrDExpected, PCDexpected);

-            // Increment PC
-            `SCAN_PC(data_file_PCF, scan_file_PCF, PCtextF, PCtextF2, InstrFExpected, PCFexpected);
-            `SCAN_PC(data_file_PCD, scan_file_PCD, PCtextD, PCtextD2, InstrDExpected, PCDexpected);
-
-            // NOP out certain instructions
-            if(dut.hart.ifu.PCD===PCDexpected) begin
-              if((dut.hart.ifu.PCD == 32'h80001dc6) || // for now, NOP out any stores to PLIC
-                 (dut.hart.ifu.PCD == 32'h80001de0) ||
-                 (dut.hart.ifu.PCD == 32'h80001de2)) begin
-                $display("warning: NOPing out %s at PCD=%0x, instr %0d, time %0t", PCtextD, dut.hart.ifu.PCD, instrs, $time);
-                force InstrDExpected = 32'b0010011;
-                force dut.hart.ifu.InstrRawD = 32'b0010011;
-                while (clk != 0) #1;
-                while (clk != 1) #1;                
-                release dut.hart.ifu.InstrRawD;
-                release InstrDExpected;
-                warningCount += 1;
-                forcedInstr = 1;
-              end else begin
-                forcedInstr = 0;
-              end
-            end
-
-            // Increment instruction count
-            if (instrs <= 10 || (instrs <= 100 && instrs % 10 == 0) ||
-               (instrs <= 1000 && instrs % 100 == 0) || (instrs <= 10000 && instrs % 1000 == 0) ||
-               (instrs <= 100000 && instrs % 10000 == 0) || (instrs % 100000 == 0)) begin
-              $display("loaded %0d instructions", instrs);
-            end
-            instrs += 1;
-            
-            // Stop before bugs so "do" file can turn on waves
-            if (instrs == waveOnICount) begin
-              $display("turning on waves at %0d instructions", instrs);
-              $stop;
-            end else if (instrs == stopICount && stopICount != 0) begin
-              $display("Ending sim at %0d instructions (set stopICount to 0 to let the sim go on)", instrs);
-              $stop;
-            end
-
-            // Check if PCD is going to be flushed due to a branch or jump
-            if (`BPRED_ENABLED) begin
-              PCDwrong = dut.hart.hzu.FlushD; //Old version: dut.hart.ifu.bpred.bpred.BPPredWrongE; <-- This old version failed to account for MRET.
-            end else begin
-              casex (lastInstrDExpected[31:0])
-                32'b00000000001000000000000001110011, // URET
-                32'b00010000001000000000000001110011, // SRET
-                32'b00110000001000000000000001110011, // MRET
-                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1101111, // JAL
-                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1100111, // JALR
-                32'bXXXXXXXXXXXXXXXXXXXXXXXXX1100011, // B
-                32'bXXXXXXXXXXXXXXXX110XXXXXXXXXXX01, // C.BEQZ
-                32'bXXXXXXXXXXXXXXXX111XXXXXXXXXXX01, // C.BNEZ
-                32'bXXXXXXXXXXXXXXXX101XXXXXXXXXXX01: // C.J
-                  PCDwrong = 1;
-                32'bXXXXXXXXXXXXXXXX1001000000000010, // C.EBREAK:
-                32'bXXXXXXXXXXXXXXXXX000XXXXX1110011: // Something that's not CSRR*
-                  PCDwrong = 0; // tbh don't really know what should happen here
-                32'b000110000000XXXXXXXXXXXXX1110011, // CSR* SATP, *
-                32'bXXXXXXXXXXXXXXXX1000XXXXX0000010, // C.JR
-                32'bXXXXXXXXXXXXXXXX1001XXXXX0000010: // C.JALR //this is RV64 only so no C.JAL
-                  PCDwrong = 1;
-                default:
-                  PCDwrong = 0;
-              endcase
-            end
-
-            // Check PCD, InstrD
-            if (~PCDwrong && ~(dut.hart.ifu.PCD === PCDexpected)) begin
-              $display("%0t ps, instr %0d: PC does not equal PC expected: %x, %x", $time, instrs, dut.hart.ifu.PCD, PCDexpected);
-              `ERROR
-            end
-            InstrMask = InstrDExpected[1:0] == 2'b11 ? 32'hFFFFFFFF : 32'h0000FFFF;
-            if ((~forcedInstr) && (~PCDwrong) && ((InstrMask & dut.hart.ifu.InstrRawD) !== (InstrMask & InstrDExpected))) begin
-              $display("%0t ps, PCD %x, instr %0d: InstrD %x %s does not equal InstrDExpected %x %s", $time, dut.hart.ifu.PCD, instrs, dut.hart.ifu.InstrRawD, InstrDName, InstrDExpected, PCtextD);
-              `ERROR
-            end
-
-            // Repeated instruction means QEMU had an interrupt which we need to spoof
-            if (PCFexpected == PCDexpected) begin
-              $display("Note at %0t ps, PCM %x %s, instr %0d: spoofing an interrupt", $time, dut.hart.ifu.PCM, PCtextM, instrs);
-              // Increment file pointers past the repeated instruction.
-              `SCAN_PC(data_file_PCF, scan_file_PCF, PCtextF, PCtextF2, InstrFExpected, PCFexpected);
-              `SCAN_PC(data_file_PCD, scan_file_PCD, PCtextD, PCtextD2, InstrDExpected, PCDexpected);
-              scan_file_memR = $fscanf(data_file_memR, "%x\n", readAdrExpected);
-              scan_file_memR = $fscanf(data_file_memR, "%x\n", readDataExpected);
-              // Next force a timer interrupt (*** this may later need generalizing)
-              force dut.uncore.genblk1.clint.MTIME = dut.uncore.genblk1.clint.MTIMECMP + 1;
+          // NOP out certain instructions
+          if(dut.hart.ifu.PCD===PCDexpected) begin
+            if((dut.hart.ifu.PCD == 32'h80001dc6) || // for now, NOP out any stores to PLIC
+                (dut.hart.ifu.PCD == 32'h80001de0) ||
+                (dut.hart.ifu.PCD == 32'h80001de2)) begin
+              $display("warning: NOPing out %s at PCD=%0x, instr %0d, time %0t", PCtextD, dut.hart.ifu.PCD, instrs, $time);
+              force InstrDExpected = 32'b0010011;
+              force dut.hart.ifu.InstrRawD = 32'b0010011;
              while (clk != 0) #1;
-              while (clk != 1) #1;
-              release dut.uncore.genblk1.clint.MTIME;
+              while (clk != 1) #1;                
+              release dut.hart.ifu.InstrRawD;
+              release InstrDExpected;
+              warningCount += 1;
+              forcedInstr = 1;
+            end else begin
+              forcedInstr = 0;
            end
          end
+
+          // Increment instruction count
+          if (instrs <= 10 || (instrs <= 100 && instrs % 10 == 0) ||
+              (instrs <= 1000 && instrs % 100 == 0) || (instrs <= 10000 && instrs % 1000 == 0) ||
+              (instrs <= 100000 && instrs % 10000 == 0) || (instrs % 100000 == 0)) begin
+            $display("loaded %0d instructions", instrs);
+          end
+          instrs += 1;
+          
+          // Stop before bugs so "do" file can turn on waves
+          if (instrs == waveOnICount) begin
+            $display("turning on waves at %0d instructions", instrs);
+            $stop;
+          end else if (instrs == stopICount && stopICount != 0) begin
+            $display("Ending sim at %0d instructions (set stopICount to 0 to let the sim go on)", instrs);
+            $stop;
+          end
+
+          // Check if PCD is going to be flushed due to a branch or jump
+          if (`BPRED_ENABLED) begin
+            PCDwrong = dut.hart.hzu.FlushD || (PCtextE.substr(0,3) == "mret"); //Old version: dut.hart.ifu.bpred.bpred.BPPredWrongE; <-- This old version failed to account for MRET.
+          end
+
+          // Check PCD, InstrD
+          if (~PCDwrong && ~(dut.hart.ifu.PCD === PCDexpected)) begin
+            $display("%0t ps, instr %0d: PC does not equal PC expected: %x, %x", $time, instrs, dut.hart.ifu.PCD, PCDexpected);
+            `ERROR
+          end
+          InstrMask = InstrDExpected[1:0] == 2'b11 ? 32'hFFFFFFFF : 32'h0000FFFF;
+          if ((~forcedInstr) && (~PCDwrong) && ((InstrMask & dut.hart.ifu.InstrRawD) !== (InstrMask & InstrDExpected))) begin
+            $display("%0t ps, PCD %x, instr %0d: InstrD %x %s does not equal InstrDExpected %x %s", $time, dut.hart.ifu.PCD, instrs, dut.hart.ifu.InstrRawD, InstrDName, InstrDExpected, PCtextD);
+            `ERROR
+          end
+
+          // Repeated instruction means QEMU had an interrupt which we need to spoof
+          if (PCFexpected == PCDexpected) begin
+            $display("Note at %0t ps, PCM %x %s, instr %0d: spoofing an interrupt", $time, dut.hart.ifu.PCM, PCtextM, instrs);
+            // Increment file pointers past the repeated instruction.
+            `SCAN_PC(data_file_PCF, scan_file_PCF, PCtextF, PCtextF2, InstrFExpected, PCFexpected);
+            `SCAN_PC(data_file_PCD, scan_file_PCD, PCtextD, PCtextD2, InstrDExpected, PCDexpected);
+            scan_file_memR = $fscanf(data_file_memR, "%x\n", readAdrExpected);
+            scan_file_memR = $fscanf(data_file_memR, "%x\n", readDataExpected);
+            // Next force a timer interrupt (*** this may later need generalizing)
+            force dut.uncore.genblk1.clint.MTIME = dut.uncore.genblk1.clint.MTIMECMP + 1;
+            while (clk != 0) #1;
+            while (clk != 1) #1;
+            release dut.uncore.genblk1.clint.MTIME;
+          end
        end
-        lastPCD = dut.hart.ifu.PCD;
      end
+      lastPCD = dut.hart.ifu.PCD;
    end
  end

@ -360,9 +343,8 @@ module testbench();
      end
      `SCAN_PC(data_file_PCM, scan_file_PCM, trashString, trashString, InstrMExpected, PCMexpected);
      `SCAN_PC(data_file_PCW, scan_file_PCW, trashString, trashString, InstrWExpected, PCWexpected);
-      // If repeated instr
+      // If the instruction is repeated, skip over it (a repeat indicates an interrupt)
      if (PCMexpected == PCWexpected) begin
-        // Increment file pointers past the repeated instruction.
        `SCAN_PC(data_file_PCM, scan_file_PCM, trashString, trashString, InstrMExpected, PCMexpected);
        `SCAN_PC(data_file_PCW, scan_file_PCW, trashString, trashString, InstrWExpected, PCWexpected);
      end
@ -371,6 +353,11 @@ module testbench();
        `ERROR
      end
    end
+    // Skip over faulting instructions because they do not make it to the W stage.
+    if (IllegalInstrFaultd) begin
+      `SCAN_PC(data_file_PCM, scan_file_PCM, trashString, trashString, InstrMExpected, PCMexpected);
+      `SCAN_PC(data_file_PCW, scan_file_PCW, trashString, trashString, InstrWExpected, PCWexpected);
+    end
  end
  

@ -453,8 +440,7 @@ module testbench();
  // Read Checker
  // ------------
  always @(negedge clk) begin
-    //if (dut.hart.MemRWM[1] && ~dut.hart.StallM && ~dut.hart.FlushM && dut.hart.ieu.InstrValidM) begin <-- This doesn't work because ReadDataM can be used for other things (namely page table walking) while the pipeline is stalled, leaving it in a different state when the pipeline unstalls
-    if (dut.hart.MemRWM[1] && dut.hart.lsu.dcache.ReadDataWEn) begin // <-- ReadDataWEn is a good indicator that the pipeline is using the current contents of ReadDataM
+    if (dut.hart.MemRWM[1] && ~dut.hart.StallM && ~dut.hart.FlushM && dut.hart.ieu.InstrValidM) begin
      if($feof(data_file_memR)) begin
        $display("no more memR data to read");
        `ERROR
@ -530,45 +516,61 @@ module testbench();
  // --------------
  // Checker Macros
  // --------------
-  string MSTATUSstring = "MSTATUS"; // string variables seem to compare more reliably than string literals (they gave me a lot of hassle), but *** there's probably a better way to do this
+  // String variables seem to compare more reliably than string literals (they gave me a lot of hassle),
+  // but *** there's probably a better way to do this.
+  // You can't just use the "__name" variables though because you need to declare variables before using them.
+  string MSTATUSstring = "MSTATUS";
+  string MIPstring = "MIP";
+  string MEPCstring = "MEPC";
+  string MCAUSEstring = "MCAUSE";
+  string MTVALstring = "MTVAL";
  string SEPCstring = "SEPC";
  string SCAUSEstring = "SCAUSE";
  string SSTATUSstring = "SSTATUS";
+
+  logic [63:0] expectedCSR;
+  string expectedCSRname;
  `define CHECK_CSR2(CSR, PATH) \
-    logic [63:0] expected``CSR``; \
-    string CSR; \
    string ``CSR``name = `"CSR`"; \
-    string expected``CSR``name; \
    always @(``PATH``.``CSR``_REGW) begin \
-      if ($time > 1 && (`BUILDROOT != 1 || ``CSR``name != SSTATUSstring)) begin \
-        // This is some feeble hackery designed to control the order in which CSRs are checked \
-        // when multiple change at the same time. \
-        if (``CSR``name == SEPCstring) #1; \
-        if (``CSR``name == SCAUSEstring) #2; \
-        if (``CSR``name == SSTATUSstring) #3; \
-        scan_file_csr = $fscanf(data_file_csr, "%s\n", expected``CSR``name); \
-        scan_file_csr = $fscanf(data_file_csr, "%x\n", expected``CSR``); \
-        if(expected``CSR``name.icompare(``CSR``name)) begin \
-          $display("%0t ps, PCM %x %s, instr %0d: %s changed, expected %s", $time, dut.hart.ifu.PCM, PCtextM, instrs, `"CSR`", expected``CSR``name); \
+      if (instrs == 0 && ~reset) begin \
+        for(integer j=0; j<totalCSR; j++) begin \
+          if(!StartCSRname[j].icompare(``CSR``name)) begin \
+            if(``PATH``.``CSR``_REGW != StartCSRexpected[j]) begin \
+              $display("%0t ps, PCM %x %s, instr %0d: %s does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, StartCSRname[j], ``PATH``.``CSR``_REGW, StartCSRexpected[j]); \
+              `ERROR \
+            end \
+          end \
        end \
-        if (``CSR``name == MSTATUSstring) begin \
-          if (``PATH``.``CSR``_REGW != ((``expected``CSR) | 64'ha00000000)) begin \
-            $display("%0t ps, PCM %x %s, instr %0d: %s (should be MSTATUS) does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, expected``CSR``name, ``PATH``.``CSR``_REGW, (``expected``CSR) | 64'ha00000000); \
-            `ERROR \
-          end \
-        end else \
-          if (``PATH``.``CSR``_REGW != ``expected``CSR[$bits(``PATH``.``CSR``_REGW)-1:0]) begin \
-            $display("%0t ps, PCM %x %s, instr %0d: %s does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, expected``CSR``name, ``PATH``.``CSR``_REGW, ``expected``CSR); \
-            `ERROR \
-          end \
+        $display("CSRs' intital states look good"); \
      end else begin \
-        if (!(`BUILDROOT == 1 && ``CSR``name == MSTATUSstring)) begin \
-          for(integer j=0; j<totalCSR; j++) begin \
-            if(!StartCSRname[j].icompare(``CSR``name)) begin \
-              if(``PATH``.``CSR``_REGW != StartCSRexpected[j]) begin \
-                $display("%0t ps, PCM %x %s, instr %0d: %s does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, StartCSRname[j], ``PATH``.``CSR``_REGW, StartCSRexpected[j]); \
-                `ERROR \
-              end \
+        // MIP is not checked because QEMU bodges it (MTIP in particular), and even if QEMU reported it correctly, the timing would still be off \
+        // MTVAL is not checked on illegal instr faults because QEMU chooses not to implement the behavior where MTVAL is written with the faulting instruction \
+        if  (~reset && ``CSR``name != MIPstring && ~(IllegalInstrFaultd && ``CSR``name == MTVALstring)) begin \
+          // This is some feeble hackery designed to control the order in which CSRs are checked \
+          // when multiple change at the same time. \
+          // *** it would be better for each CSR to have its own testvector file \
+          // so as to avoid this awkward ordering problem. \
+          if (``CSR``name == MEPCstring) #1; \
+          if (``CSR``name == MCAUSEstring) #2; \
+          if (``CSR``name == MTVALstring) #3; \
+          if (``CSR``name == SEPCstring) #1; \
+          if (``CSR``name == SCAUSEstring) #2; \
+          if (``CSR``name == SSTATUSstring) #3; \
+          scan_file_csr = $fscanf(data_file_csr, "%s\n", expectedCSRname); \
+          scan_file_csr = $fscanf(data_file_csr, "%x\n", expectedCSR); \
+          if(expectedCSRname.icompare(``CSR``name)) begin \
+            $display("%0t ps, PCM %x %s, instr %0d: %s changed, expected %s", $time, dut.hart.ifu.PCM, PCtextM, instrs, `"CSR`", expectedCSRname); \
+          end \
+          if (``CSR``name == MSTATUSstring) begin \
+            if (``PATH``.``CSR``_REGW != ((expectedCSR) | 64'ha00000000)) begin \
+              $display("%0t ps, PCM %x %s, instr %0d: %s (should be MSTATUS) does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, expectedCSRname, ``PATH``.``CSR``_REGW, expectedCSR | 64'ha00000000); \
+              `ERROR \
+            end \
+          end else begin \
+            if (``PATH``.``CSR``_REGW != expectedCSR[$bits(``PATH``.``CSR``_REGW)-1:0]) begin \
+              $display("%0t ps, PCM %x %s, instr %0d: %s does not equal %s expected: %x, %x", $time, dut.hart.ifu.PCM, PCtextM, instrs, ``CSR``name, expectedCSRname, ``PATH``.``CSR``_REGW, expectedCSR); \
+              `ERROR \
            end \
          end \
        end \