Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally into main

2021-06-02 10:03:23 -04:00 · 2021-06-02 10:03:23 -04:00 · a683dd7fde
commit a683dd7fde
parent 2c77a13c08 5187574e8a
39 changed files with 1807 additions and 1490 deletions
--- a/testsBP/crt0/Makefile
+++ b/testsBP/crt0/Makefile
@ -4,12 +4,12 @@ ROOT		:= ..
 LIBRARY_DIRS	:= 
 LIBRARY_FILES	:=

-MARCH           :=-march=rv64ic
-MABI            :=-mabi=lp64
+MARCH           :=-march=rv64imfdc
+MABI            :=-mabi=lp64d
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles

-AFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -W
-CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64  -mcmodel=medany 
+AFLAGS =$(MARCH) $(MABI) -W
+CFLAGS =$(MARCH) $(MABI) -mcmodel=medany  -O2
 AS=riscv64-unknown-elf-as
 CC=riscv64-unknown-elf-gcc
 AR=riscv64-unknown-elf-ar
@ -19,7 +19,7 @@ all: libcrt0.a
 %.o: %.s
 	${AS} ${AFLAGS} -c $< -o $@

-libcrt0.a: start.o
+libcrt0.a: start.o pcnt_driver.o pre_main.o
 	${AR} -r $@ $^

 clean:
--- a/testsBP/crt0/start.s
+++ b/testsBP/crt0/start.s
@ -43,11 +43,10 @@ _start:



-	# set the stack pointer to the top of memory
-	# 0x8000_0000 + 64K - 8 bytes
-	li sp, 0x007FFFF8
+	# set the stack pointer to the top of memory - 8 bytes (pointer size)
+	li sp, 0x07FFFFF8

-	jal ra, main
+	jal ra, pre_main
 	jal ra, _halt

 .section .text
--- a/testsBP/mibench_qsort/Makefile
+++ b/testsBP/mibench_qsort/Makefile
@ -8,7 +8,7 @@ MARCH           :=-march=rv64ic
 MABI            :=-mabi=lp64
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map

-CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align
+CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align -O2

 CC=riscv64-unknown-elf-gcc
 DA=riscv64-unknown-elf-objdump -d
--- a/testsBP/sieve/Makefile
+++ b/testsBP/sieve/Makefile
@ -8,7 +8,7 @@ MARCH           :=-march=rv64ic
 MABI            :=-mabi=lp64
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map

-CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align
+CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align -O2

 CC=riscv64-unknown-elf-gcc
 DA=riscv64-unknown-elf-objdump -d
--- a/testsBP/sieve/sieve.c
+++ b/testsBP/sieve/sieve.c
@ -66,21 +66,21 @@ int main () {
    
  ans = sieve ();
  //gettimeofday(&after , NULL);
-  if (ans != 1899)
-    printf ("Sieve result wrong, ans = %d, expected 1899", ans);
+  /* /\* /\\* if (ans != 1899) *\\/ *\/ */
+  /* /\* /\\*   printf ("Sieve result wrong, ans = %d, expected 1899", ans); *\\/ *\/ */

-  //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) );
+  /* /\* //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); *\/ */


-  printf("Round 2\n");
-  //gettimeofday(&before , NULL);
+  /* /\* printf("Round 2\n"); *\/ */
+  /* //gettimeofday(&before , NULL); */
    
-  ans = sieve ();
-  //gettimeofday(&after , NULL);
-  if (ans != 1899)
-    printf ("Sieve result wrong, ans = %d, expected 1899", ans);
+  /* ans = sieve (); */
+  /* //gettimeofday(&after , NULL); */
+  /* if (ans != 1899) */
+  /*   printf ("Sieve result wrong, ans = %d, expected 1899", ans); */

-  //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); 
+  /* //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) );  */
  
  return 0;

--- a/testsBP/simple/Makefile
+++ b/testsBP/simple/Makefile
@ -8,7 +8,7 @@ MARCH           :=-march=rv64ic
 MABI            :=-mabi=lp64
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map

-CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align
+CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align -O2
 CC=riscv64-unknown-elf-gcc
 DA=riscv64-unknown-elf-objdump -d

--- a/testsBP/simple/header.h
+++ b/testsBP/simple/header.h
@ -5,4 +5,8 @@ int fail();
 int simple_csrbr_test();
 int lbu_test();
 int icache_spill_test();
+void global_hist_0_space_test();
+void global_hist_1_space_test();
+void global_hist_2_space_test();
+void global_hist_3_space_test();
 #endif
--- a/testsBP/simple/main.c
+++ b/testsBP/simple/main.c
@ -2,6 +2,10 @@

 int main(){
  //int res = icache_spill_test();
+  global_hist_3_space_test();  
+  global_hist_2_space_test();
+  global_hist_1_space_test();
+  global_hist_0_space_test();    
  int res = 1;
  if (res < 0) {
    fail();
--- a/wally-pipelined/config/buildroot/wally-constants.vh
+++ b/wally-pipelined/config/buildroot/wally-constants.vh
@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accommodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants necessary for different memory virtualization modes.
+//              These are specific to sv48, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
--- a/wally-pipelined/config/busybear/wally-constants.vh
+++ b/wally-pipelined/config/busybear/wally-constants.vh
@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accommodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants necessary for different memory virtualization modes.
+//              These are specific to sv48, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
--- a/wally-pipelined/config/coremark/wally-constants.vh
+++ b/wally-pipelined/config/coremark/wally-constants.vh
@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accommodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants necessary for different memory virtualization modes.
+//              These are specific to sv48, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
--- a/wally-pipelined/config/coremark_bare/wally-constants.vh
+++ b/wally-pipelined/config/coremark_bare/wally-constants.vh
@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accommodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants necessary for different memory virtualization modes.
+//              These are specific to sv48, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
--- a/wally-pipelined/config/rv32ic/wally-constants.vh
+++ b/wally-pipelined/config/rv32ic/wally-constants.vh
@ -2,7 +2,10 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 31 May 2021
+//              added svmode constants. These aren't strictly necessary since we're just checking one bit,
+//              but they're here to stay consistent and to make sure we don't wind up in
+//              a "NO_TRANSLATE undefined" situation.
 //
 // Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
 //          These macros should not be changed, except in the event of an
@ -31,3 +34,10 @@
 `define PPN_BITS 22
 `define PPN_HIGH_SEGMENT_BITS 12
 `define PA_BITS  34
+`define SVMODE_BITS 1
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8 // These two are only here to stop
+`define SV48 9 // the verilator from yelling at me
--- a/wally-pipelined/config/rv64BP/wally-config.vh
+++ b/wally-pipelined/config/rv64BP/wally-config.vh
@ -32,7 +32,7 @@
 `define XLEN 64

 //`define MISA (32'h00000105)
-`define MISA (32'h00000104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12 | 1 << 0)
+`define MISA (32'h00000104 | 1 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
 `define A_SUPPORTED ((`MISA >> 0) % 2 == 1)
 `define C_SUPPORTED ((`MISA >> 2) % 2 == 1)
 `define D_SUPPORTED ((`MISA >> 3) % 2 == 1)
@ -107,8 +107,9 @@
 /* verilator lint_off ASSIGNDLY */
 /* verilator lint_off PINCONNECTEMPTY */

-`define TWO_BIT_PRELOAD "../config/rv64icfd/twoBitPredictor.txt"
-`define BTB_PRELOAD "../config/rv64icfd/BTBPredictor.txt"
+`define TWO_BIT_PRELOAD "../config/rv64BP/twoBitPredictor.txt"
+`define BTB_PRELOAD "../config/rv64BP/BTBPredictor.txt"
 `define BPRED_ENABLED 1
-`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE
+//`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE
+`define BPTYPE "BPGSHARE" // BPTWOBIT or "BPGLOBAL"  or BPLOCALPAg or BPGSHARE
 `define TESTSBP 1
--- a/wally-pipelined/config/rv64BP/wally-constants.vh
+++ b/wally-pipelined/config/rv64BP/wally-constants.vh
@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accommodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants necessary for different memory virtualization modes.
+//              These are specific to sv48, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
--- a/wally-pipelined/config/rv64ic/wally-config.vh
+++ b/wally-pipelined/config/rv64ic/wally-config.vh
@ -31,7 +31,7 @@
 `define XLEN 64

 // MISA RISC-V configuration per specification
-`define MISA (32'h00000104 | 0 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
+`define MISA (32'h00000104 | 0 << 5 | 0 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
 `define A_SUPPORTED ((`MISA >> 0) % 2 == 1)
 `define C_SUPPORTED ((`MISA >> 2) % 2 == 1)
 `define D_SUPPORTED ((`MISA >> 3) % 2 == 1)
--- a/wally-pipelined/config/rv64ic/wally-constants.vh
+++ b/wally-pipelined/config/rv64ic/wally-constants.vh
@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accommodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants necessary for different memory virtualization modes.
+//              These are specific to sv48, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
--- a/wally-pipelined/config/rv64icfd/wally-constants.vh
+++ b/wally-pipelined/config/rv64icfd/wally-constants.vh
@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accommodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants necessary for different memory virtualization modes.
+//              These are specific to sv48, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
--- a/wally-pipelined/config/rv64imc/wally-constants.vh
+++ b/wally-pipelined/config/rv64imc/wally-constants.vh
@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accommodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants necessary for different memory virtualization modes.
+//              These are specific to sv48, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -25,528 +25,455 @@
 `include "wally-config.vh"

 module fpu (
-  input  logic [2:0]       FRM_REGW,    // Rounding mode from CSR
-  input  logic             reset,
+  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
+  input logic 		   reset,
  //input  logic             clear,     // *** not being used anywhere
-  input  logic             clk,
-  input  logic [31:0]      InstrD,
-  input  logic [`XLEN-1:0] SrcAE,       // Integer input being processed
-  input  logic [`XLEN-1:0] SrcAM,       // Integer input being written into fpreg
-  input  logic 		         StallE, StallM, StallW,
-  input  logic             FlushE, FlushM, FlushW,
-  input  logic [`AHBW-1:0] HRDATA,
-  input  logic             RegWriteD,
-  output logic [4:0]       SetFflagsM,
-  output logic [31:0]      FSROutW,
-  output logic [1:0]       FMemRWM,
-	output logic             FStallD,
-  output logic             FWriteIntE, FWriteIntM, FWriteIntW,
+  input logic 		   clk,
+  input logic [31:0] 	   InstrD,
+  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed
+  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg
+  input logic 		   StallE, StallM, StallW,
+  input logic 		   FlushE, FlushM, FlushW,
+  input logic [`AHBW-1:0]  HRDATA,
+  input logic 		   RegWriteD,
+  output logic [4:0] 	   SetFflagsM,
+  output logic [31:0] 	   FSROutW,
+  output logic [1:0] 	   FMemRWM,
+  output logic 		   FStallD,
+  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW,
  output logic [`XLEN-1:0] FWriteDataM,
-  output logic             FDivSqrtDoneM,
-  output logic             IllegalFPUInstrD,
+  output logic 		   FDivSqrtDoneM,
+  output logic 		   IllegalFPUInstrD,
  output logic [`XLEN-1:0] FPUResultW);

+   // control logic signal instantiation
+   logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;             // FP register write enable
+   logic [2:0] 		   FrmD, FrmE, FrmM, FrmW;                                 // FP rounding mode
+   logic 		   FmtD, FmtE, FmtM, FmtW;                                 // FP precision 0-single 1-double
+   logic 		   FDivStartD, FDivStartE;                                 // Start division
+   logic 		   FWriteIntD;                                 // Write to integer register
+   logic 		   FOutputInput2D, FOutputInput2E;                         // Put Input2 in Input1 if a store instruction
+   logic [1:0] 		   FMemRWD, FMemRWE;                                       // Read and write enable for memory
+   logic [1:0] 		   FForwardInput1D, FForwardInput1E;                       // Input1 forwarding mux control signal
+   logic [1:0] 		   FForwardInput2D, FForwardInput2E;                       // Input2 forwarding mux control signal
+   logic 		   FForwardInput3D, FForwardInput3E;                       // Input3 forwarding mux control signal
+   logic 		   FInput2UsedD;                                           // Is input 2 used
+   logic 		   FInput3UsedD;                                           // Is input 3 used
+   logic [2:0] 		   FResultSelD, FResultSelE, FResultSelM, FResultSelW;     // Select FP result
+   logic [3:0] 		   FOpCtrlD, FOpCtrlE, FOpCtrlM;                           // Select which opperation to do in each component
   
+   // regfile signals
+   logic [4:0] 		   RdE, RdM, RdW; // ***Can take from ieu
+   logic [`XLEN-1:0] 	   FWDM;                                                   // Write data for FP register
+   logic [`XLEN-1:0] 	   FRD1D, FRD2D, FRD3D;                                    // Read Data from FP register
+   logic [`XLEN-1:0] 	   FRD1E, FRD2E, FRD3E;
+   logic [`XLEN-1:0] 	   FInput1E, FInput1M, FInput1tmpE;
+   logic [`XLEN-1:0] 	   FInput2E, FInput2M;
+   logic [`XLEN-1:0] 	   FInput3E, FInput3M;
+   logic [`XLEN-1:0] 	   FLoadStoreResultM, FLoadStoreResultW;                   // Result for load, store, and move to int-reg instructions
   
+   // div/sqrt signals
+   logic 		   DivDenormM, DivDenormW;
+   logic 		   DivOvEn, DivUnEn;
+   logic 		   DivBusyM;
+   logic [63:0] 	   FDivResultM, FDivResultW;
+   logic [4:0] 		   FDivFlagsM, FDivFlagsW;
   
+   // FMA signals
+   logic [12:0] 	   aligncntE, aligncntM; 
+   logic [105:0] 	   rE, rM; 
+   logic [105:0] 	   sE, sM; 
+   logic [163:0] 	   tE, tM;	
+   logic [8:0] 		   normcntE, normcntM; 
+   logic [12:0] 	   aeE, aeM; 
+   logic 		   bsE, bsM;
+   logic 		   killprodE, killprodM; 
+   logic 		   prodofE, prodofM; 
+   logic 		   xzeroE, xzeroM;
+   logic 		   yzeroE, yzeroM;
+   logic 		   zzeroE, zzeroM;
+   logic 		   xdenormE, xdenormM;
+   logic 		   ydenormE, ydenormM;
+   logic 		   zdenormE, zdenormM;
+   logic 		   xinfE, xinfM;
+   logic 		   yinfE, yinfM;
+   logic 		   zinfE, zinfM;
+   logic 		   xnanE, xnanM;
+   logic 		   ynanE, ynanM;
+   logic 		   znanE, znanM;
+   logic 		   nanE, nanM;
+   logic [8:0] 		   sumshiftE, sumshiftM;
+   logic 		   sumshiftzeroE, sumshiftzeroM;
+   logic 		   prodinfE, prodinfM;
+   logic [63:0] 	   FmaResultM, FmaResultW;
+   logic [4:0] 		   FmaFlagsM, FmaFlagsW;
   
-  //control logic signal instantiation
-  logic             FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;             // FP register write enable
-  logic [2:0]       FrmD, FrmE, FrmM, FrmW;                                 // FP rounding mode
-  logic             FmtD, FmtE, FmtM, FmtW;                                 // FP precision 0-single 1-double
-  logic             FDivStartD, FDivStartE;                                 // Start division
-  logic             FWriteIntD;                                 // Write to integer register
-  logic             FOutputInput2D, FOutputInput2E;                         // Put Input2 in Input1 if a store instruction
-  logic [1:0]       FMemRWD, FMemRWE;                                       // Read and write enable for memory
-  logic [1:0]       FForwardInput1D, FForwardInput1E;                       // Input1 forwarding mux control signal
-  logic [1:0]       FForwardInput2D, FForwardInput2E;                       // Input2 forwarding mux control signal
-  logic             FForwardInput3D, FForwardInput3E;                       // Input3 forwarding mux control signal
-  logic             FInput2UsedD;                                           // Is input 2 used
-  logic             FInput3UsedD;                                           // Is input 3 used
-  logic [2:0]       FResultSelD, FResultSelE, FResultSelM, FResultSelW;     // Select FP result
-  logic [3:0]       FOpCtrlD, FOpCtrlE, FOpCtrlM;                           // Select which opperation to do in each component
+   // add/cvt signals
+   logic [63:0] 	   AddSumE, AddSumTcE;
+   logic [3:0] 		   AddSelInvE;
+   logic [10:0] 	   AddExpPostSumE;
+   logic 		   AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
+   logic 		   AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
+   logic 		   AddConvertE;
+   logic [63:0] 	   AddFloat1E, AddFloat2E;
+   logic [11:0] 	   AddExp1DenormE, AddExp2DenormE;
+   logic [10:0] 	   AddExponentE;
+   logic [2:0] 		   AddRmE;
+   logic [3:0] 		   AddOpTypeE;
+   logic 		   AddPE, AddOvEnE, AddUnEnE;    
+   logic 		   AddDenormM;
+   logic [63:0] 	   AddSumM, AddSumTcM;
+   logic [3:0] 		   AddSelInvM;
+   logic [10:0] 	   AddExpPostSumM;
+   logic 		   AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
+   logic 		   AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
+   logic 		   AddConvertM, AddSignM;
+   logic [63:0] 	   AddFloat1M, AddFloat2M;
+   logic [11:0] 	   AddExp1DenormM, AddExp2DenormM;
+   logic [10:0] 	   AddExponentM;
+   logic [63:0] 	   AddOp1M, AddOp2M;
+   logic [2:0] 		   AddRmM;
+   logic [3:0] 		   AddOpTypeM;
+   logic 		   AddPM, AddOvEnM, AddUnEnM;  
+   logic [63:0] 	   FAddResultM, FAddResultW;
+   logic [4:0] 		   FAddFlagsM, FAddFlagsW;
   
-  // regfile signals
-  logic [4:0]       RdE, RdM, RdW; // ***Can take from ieu
-  logic [`XLEN-1:0] FWDM;                                                   // Write data for FP register
-  logic [`XLEN-1:0] FRD1D, FRD2D, FRD3D;                                    // Read Data from FP register
-  logic [`XLEN-1:0] FRD1E, FRD2E, FRD3E;
-  logic [`XLEN-1:0] FInput1E, FInput1M, FInput1tmpE;
-  logic [`XLEN-1:0] FInput2E, FInput2M;
-  logic [`XLEN-1:0] FInput3E, FInput3M;
-  logic [`XLEN-1:0] FLoadStoreResultM, FLoadStoreResultW;                   // Result for load, store, and move to int-reg instructions
+   // cmp signals 
+   logic [7:0] 		   WE, WM;
+   logic [7:0] 		   XE, XM;
+   logic 		   ANaNE, ANaNM;
+   logic 		   BNaNE, BNaNM;
+   logic 		   AzeroE, AzeroM;
+   logic 		   BzeroE, BzeroM;
+   logic 		   CmpInvalidM, CmpInvalidW;
+   logic [1:0] 		   CmpFCCM, CmpFCCW; 
+   logic [63:0] 	   FCmpResultM, FCmpResultW;
   
-  // div/sqrt signals
-  logic             DivDenormM, DivDenormW;
-  logic             DivOvEn, DivUnEn;
-  logic             DivBusyM;
-  logic [63:0]      FDivResultM, FDivResultW;
-  logic [4:0]       FDivFlagsM, FDivFlagsW;
+   // fsgn signals
+   logic [63:0] 	   SgnResultE, SgnResultM, SgnResultW;
+   logic [4:0] 		   SgnFlagsE, SgnFlagsM, SgnFlagsW;
   
-  // FMA signals
-  logic [12:0]		  aligncntE, aligncntM; 
-  logic [105:0]		  rE, rM; 
-  logic [105:0]		  sE, sM; 
-  logic [163:0]		  tE, tM;	
-  logic [8:0]		    normcntE, normcntM; 
-  logic [12:0]		  aeE, aeM; 
-  logic 		        bsE, bsM;
-  logic 		        killprodE, killprodM; 
-  logic 		        prodofE, prodofM; 
-  logic			        xzeroE, xzeroM;
-  logic			        yzeroE, yzeroM;
-  logic			        zzeroE, zzeroM;
-  logic			        xdenormE, xdenormM;
-  logic			        ydenormE, ydenormM;
-  logic			        zdenormE, zdenormM;
-  logic			        xinfE, xinfM;
-  logic			        yinfE, yinfM;
-  logic			        zinfE, zinfM;
-  logic			        xnanE, xnanM;
-  logic			        ynanE, ynanM;
-  logic			        znanE, znanM;
-  logic			        nanE, nanM;
-  logic	[8:0]		    sumshiftE, sumshiftM;
-  logic			        sumshiftzeroE, sumshiftzeroM;
-  logic             prodinfE, prodinfM;
-  logic [63:0]      FmaResultM, FmaResultW;
-  logic [4:0]       FmaFlagsM, FmaFlagsW;
+   // instantiation of W stage regfile signals
+   logic [`XLEN-1:0] 	   SrcAW;
   
-  // add/cvt signals
-  logic [63:0]      AddSumE, AddSumTcE;
-  logic [3:0]       AddSelInvE;
-  logic [10:0]      AddExpPostSumE;
-  logic             AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
-  logic             AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
-  logic             AddConvertE;
-  logic [63:0]      AddFloat1E, AddFloat2E;
-  logic [11:0]      AddExp1DenormE, AddExp2DenormE;
-  logic [10:0]      AddExponentE;
-  logic [2:0]       AddRmE;
-  logic [3:0]       AddOpTypeE;
-  logic             AddPE, AddOvEnE, AddUnEnE;    
-  logic             AddDenormM;
-  logic [63:0]      AddSumM, AddSumTcM;
-  logic [3:0]       AddSelInvM;
-  logic [10:0]      AddExpPostSumM;
-  logic             AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
-  logic             AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
-  logic             AddConvertM, AddSignM;
-  logic [63:0]      AddFloat1M, AddFloat2M;
-  logic [11:0]      AddExp1DenormM, AddExp2DenormM;
-  logic [10:0]      AddExponentM;
-  logic [63:0]      AddOp1M, AddOp2M;
-  logic [2:0]       AddRmM;
-  logic [3:0]       AddOpTypeM;
-  logic             AddPM, AddOvEnM, AddUnEnM;  
-  logic [63:0]      FAddResultM, FAddResultW;
-  logic [4:0]       FAddFlagsM, FAddFlagsW;
+   // classify signals
+   logic [63:0] 	   ClassResultE, ClassResultM, ClassResultW;
   
-  //cmp signals 
-  logic [7:0]       WE, WM;
-  logic [7:0]       XE, XM;
-  logic             ANaNE, ANaNM;
-  logic             BNaNE, BNaNM;
-  logic             AzeroE, AzeroM;
-  logic             BzeroE, BzeroM;
-  logic             CmpInvalidM, CmpInvalidW;
-  logic [1:0]       CmpFCCM, CmpFCCW; 
-  logic [63:0]      FCmpResultM, FCmpResultW;
+   // 64-bit FPU result   
+   logic [63:0] 	   FPUResult64W, FPUResult64E;                                           
+   logic [4:0] 		   FPUFlagsW;
   
-  // fsgn signals
-  logic [63:0]      SgnResultE, SgnResultM, SgnResultW;
-  logic [4:0]       SgnFlagsE, SgnFlagsM, SgnFlagsW;
+   // pipeline control logic
+   logic 		   PipeEnableDE;
+   logic 		   PipeEnableEM;
+   logic 		   PipeEnableMW;
+   logic 		   PipeClearDE;
+   logic 		   PipeClearEM;
+   logic 		   PipeClearMW;
   
-  //instantiation of W stage regfile signals
-  logic [`XLEN-1:0] SrcAW;
+   // Controls for the registers between stages: each bank is enabled while the
+   // downstream stage is not stalled (~StallX) and cleared when it is flushed (FlushX).
+   localparam PipeClear = 1'b0;  // NOTE(review): constant "never clear" value; no use visible in this section
+   localparam PipeEnable = 1'b1; // NOTE(review): constant "always enabled" value; no use visible in this section
+   always_comb begin      
+      PipeEnableDE = ~StallE;
+      PipeEnableEM = ~StallM;
+      PipeEnableMW = ~StallW;
+      PipeClearDE = FlushE;
+      PipeClearEM = FlushM;
+      PipeClearMW = FlushW;      
+   end
   
-  // classify signals
-  logic [63:0]      ClassResultE, ClassResultM, ClassResultW;
+   //DECODE STAGE
   
-  // other
-  logic [63:0]      FPUResult64W, FPUResult64E;                                           // 64-bit FPU result
-  logic [4:0]       FPUFlagsW;
+   // Hazard unit for FPU
+   fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
   
-  // pipeline control logic
-  logic	                   PipeEnableDE;
-  logic	                   PipeEnableEM;
-  logic	                   PipeEnableMW;
-  logic                    PipeClearDE;
-  logic                    PipeClearEM;
-  logic                    PipeClearMW;
+   // top-level controller for FPU
+   fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
   
-  //temporarily assign pipe clear and enable signals
-  //to never flush & always be running
-  localparam PipeClear = 1'b0;
-  localparam PipeEnable = 1'b1;
-  always_comb begin
-
-	  PipeEnableDE = ~StallE;
-	  PipeEnableEM = ~StallM;
-	  PipeEnableMW = ~StallW;
-	  PipeClearDE = FlushE;
-	  PipeClearEM = FlushM;
-	  PipeClearMW = FlushW;
-
-  end
-
- 
-
-
-
-
-
-
-
-
-
-
-
-  //DECODE STAGE
-
-  //Hazard unit for FPU
-  fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
-
-  //top-level controller for FPU
-  fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
-
-
-  //regfile instantiation
+   // regfile instantiation
   FPregfile fpregfile (clk, reset, FWriteEnW,
 			InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
 			FPUResult64W,
 			FRD1D, FRD2D, FRD3D);	
   
-
-
-
-
-
-
-
-
-  //*****************
-  //fpregfile D/E pipe registers
-  //*****************
-  flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E);
-  flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E);
-  flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E);
-
-  //*****************
-  //other  D/E pipe registers
-  //*****************
-  flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE);
-  flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
-  flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
-  flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
-  flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
-  flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE);
-  flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE);
-  flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput1D, FForwardInput1E);
-  flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E);
-  flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E);
-  flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E);
-  flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
-  flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E);
-  flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
-
-
-
-
-
-
-
-
-
-
-
-
-
-  //EXECUTION STAGE
-
-
-
-  // input muxs for forwarding
-  mux4  #(64)  FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE);
-  mux3  #(64)  FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E);
-  mux2  #(64)  FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E);
-  mux2  #(64)  FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E);
-
-  fma1 fma1 (.*);
-
-  //first and only instance of floating-point divider
-  fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .*);
-
-  //first of two-stage instance of floating-point add/cvt unit
-  fpuaddcvt1 fpadd1 (.*);
-
-  //first of two-stage instance of floating-point comparator
-  fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]);
-
-  //first and only instance of floating-point sign converter
-  fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*);
-
-  //first and only instance of floating-point classify unit
-  fpuclassify fpuclass (.*);
-
-  
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  //*****************
-  //fpregfile D/E pipe registers
-  //*****************
-  flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M);
-  flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M);
-  flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M);
-
-  //*****************
-  //fma E/M pipe registers
-  //*****************  
-  flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
-  flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
-  flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
-  flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
-  flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
-  flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
-  flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
-  flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
-  flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
-  flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
-  flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
-  flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
-  flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
-  flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
-  flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
-  flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
-  flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
-  flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
-  flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
-  flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
-  flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
-  flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
-  flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
-  flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
-  flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
-
-  //*****************
-  //fpadd E/M pipe registers
-  //*****************
-  flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
-  flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
-  flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
-  flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
-  flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
-  flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
-  flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
-  flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
-  flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
-  flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
-  flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
-  flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
-  flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
-  flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
-  flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); 
-  flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
-  flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
-  flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
-  flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
-  flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
-  flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
-  flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
-  flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
-  flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
-  flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
-
-  //*****************
-  //fpcmp E/M pipe registers
-  //*****************
-  flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
-  flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
-  flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
-  flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
-  flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
-  flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
-
-  //put this in for the event we want to delay fsgn - will otherwise bypass
-  //*****************
-  //fpsgn E/M pipe registers
-  //***************** 
-  flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
-  flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
-
-  //*****************
-  //other E/M pipe registers
-  //*****************
-  flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM);
-  flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
-  flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
-  flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
-  flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
-  flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM);
-  flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
-  flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
-
-  //*****************
-  //fpuclassify E/M pipe registers
-  //***************** 
-  flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM);
-
-
-
-
-
-
-
-
-  //BEGIN MEMORY STAGE
-
-  assign FWriteDataM = FInput1M;
-
-  mux2  #(64)  FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
-
-  fma2 fma2(.*);
-
-  //second instance of two-stage floating-point add/cvt unit
-  fpuaddcvt2 fpadd2 (.*);
-
-  //second instance of two-stage floating-point comparator
-  fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*);
-
-
-
-
-
-
-
-
-
-
-  
-  //*****************
-  //fma M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
-  flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
-
-  //*****************
-  //fpdiv M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); 
-  flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW);
-  flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
-
-  //*****************
-  //fpadd M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); 
-  flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); 
-
-  //*****************
-  //fpcmp M/W pipe registers
-  //*****************
-  flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
-  flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
-  flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); 
-
-  //*****************
-  //fpsgn M/W pipe registers
-  //***************** 
-  flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
-  flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
-
-  //*****************
-  //other M/W pipe registers
-  //*****************
-  flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW);
-  flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
-  flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
-  flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
-  flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
-  flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
-  flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
-
-
-  //*****************
-  //fpuclassify M/W pipe registers
-  //***************** 
-  flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW);
-
-
-
-
-
-
-
+   //*****************
+   // fpregfile D/E pipe registers
+   //*****************
+   flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E);
+   flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E);
+   flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E);
+   
+   //*****************
+   // other  D/E pipe registers
+   //*****************
+   flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE);
+   flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
+   flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
+   flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
+   flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
+   flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE);
+   flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE);
+   flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput1D, FForwardInput1E);
+   flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E);
+   flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E);
+   flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E);
+   flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
+   flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E);
+   flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
+   
+   //EXECUTION STAGE
+   
+   // input muxs for forwarding
+   mux4  #(64)  FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE);
+   mux3  #(64)  FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E);
+   mux2  #(64)  FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E);
+   mux2  #(64)  FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E);
+   
+   fma1 fma1 (.*);
+   
+   // first and only instance of floating-point divider; its clock is gated so
+   // it only toggles while FDivStartE or DivBusyM is asserted
+   logic fpdivClk;
+   clockgater fpdivclkg(.E(FDivStartE),
+			.SE(DivBusyM),
+			.CLK(clk),
+			.ECLK(fpdivClk));
+   // explicit .clk overrides the implicit match; .* connects every remaining port by name
+   fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .*);
+   
+   // first of two-stage instance of floating-point add/cvt unit
+   fpuaddcvt1 fpadd1 (.*);
+   
+   // first of two-stage instance of floating-point comparator
+   fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]);
+   
+   // first and only instance of floating-point sign converter
+   fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*);
+   
+   // first and only instance of floating-point classify unit
+   fpuclassify fpuclass (.*);
+   
+   //*****************
+   // fpregfile E/M pipe registers
+   //*****************
+   flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M);
+   flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M);
+   flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M);
+   
+   //*****************
+   // fma E/M pipe registers
+   //*****************  
+   flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
+   flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
+   flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
+   flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
+   flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
+   flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
+   flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
+   flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
+   flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
+   flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
+   flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
+   flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
+   flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
+   flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
+   flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
+   flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
+   flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
+   flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
+   flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
+   flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
+   flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
+   flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
+   flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
+   flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
+   flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
+   
+   //*****************
+   // fpadd E/M pipe registers
+   //*****************
+   flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
+   flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
+   flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
+   flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
+   flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
+   flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
+   flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
+   flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
+   flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
+   flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
+   flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
+   flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
+   flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
+   flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
+   flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); 
+   flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
+   flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
+   flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
+   flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
+   flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
+   flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
+   flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
+   flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
+   flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
+   flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
+   
+   //*****************
+   // fpcmp E/M pipe registers
+   //*****************
+   flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
+   flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
+   flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
+   flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
+   flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
+   flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
+   
+   // put this in for the event we want to delay fsgn - will otherwise bypass
+   //*****************
+   // fpsgn E/M pipe registers
+   //***************** 
+   flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
+   flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
+   
+   //*****************
+   // other E/M pipe registers
+   //*****************
+   flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM);
+   flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
+   flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
+   flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
+   flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
+   flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM);
+   flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
+   flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
+   
+   //*****************
+   // fpuclassify E/M pipe registers
+   //***************** 
+   flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM);
+   
+   //BEGIN MEMORY STAGE
+   
+   assign FWriteDataM = FInput1M;
+   
+   mux2  #(64)  FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
+   
+   fma2 fma2(.*);
+   
+   // second instance of two-stage floating-point add/cvt unit
+   fpuaddcvt2 fpadd2 (.*);
+   
+   // second instance of two-stage floating-point comparator
+   fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), 
+		   .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*);
+   
+   //*****************
+   // fma M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
+   flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
+   
+   //*****************
+   // fpdiv M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); 
+   flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW);
+   flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
+   
+   //*****************
+   // fpadd M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); 
+   flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); 
+   
+   //*****************
+   // fpcmp M/W pipe registers
+   //*****************
+   flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
+   flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
+   flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); 
+   
+   //*****************
+   // fpsgn M/W pipe registers
+   //***************** 
+   flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
+   flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
+   
+   //*****************
+   // other M/W pipe registers
+   //*****************
+   flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW);
+   flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
+   flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
+   flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
+   flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
+   flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
+   flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
+   
+   //*****************
+   // fpuclassify M/W pipe registers
+   //***************** 
+   flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW);

  //#########################################
-  //BEGIN WRITEBACK STAGE
+  // BEGIN WRITEBACK STAGE
  //#########################################
   
-  always_comb begin
-	case (FResultSelW)
-		// div/sqrt
-		3'b000 : FPUFlagsW = FDivFlagsW;
-		// cmp		
-		3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
-		//fma/mult
-		3'b010 : FPUFlagsW = FmaFlagsW;
-		// sgn inj
-		3'b011 : FPUFlagsW = SgnFlagsW;
-		// add/sub/cnvt
-		3'b100 : FPUFlagsW = FAddFlagsW;
-		// classify
-		3'b101 : FPUFlagsW = 5'b0;
-		// output SrcAW
-		3'b110 : FPUFlagsW = 5'b0;
-		// output FRD1
-		3'b111 : FPUFlagsW = 5'b0;
-		default : FPUFlagsW = 5'bxxxxx;
-	endcase
-  end
+   always_comb begin // choose the flag vector of the unit selected by FResultSelW
+      case (FResultSelW)
+	// div/sqrt
+	3'b000 : FPUFlagsW = FDivFlagsW;
+	// cmp: only the invalid flag is reported, placed in the MSB of the 5-bit vector
+	3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
+	// fma/mult
+	3'b010 : FPUFlagsW = FmaFlagsW;
+	// sgn inj
+	3'b011 : FPUFlagsW = SgnFlagsW;
+	// add/sub/cnvt
+	3'b100 : FPUFlagsW = FAddFlagsW;
+	// classify: no flags
+	3'b101 : FPUFlagsW = 5'b0;
+	// output SrcAW: no flags
+	3'b110 : FPUFlagsW = 5'b0;
+	// load/store/move to FP-register (FLoadStoreResultW in the result mux): no flags
+	3'b111 : FPUFlagsW = 5'b0;
+	default : FPUFlagsW = 5'bxxxxx; // all 8 encodings are listed; only reached when FResultSelW has X/Z bits
+      endcase
+   end
   
+   always_comb begin
+      case (FResultSelW)
+	// div/sqrt
+	3'b000 : FPUResult64W = FDivResultW;
+	// cmp		
+	3'b001 : FPUResult64W = FCmpResultW;
+	//fma/mult
+	3'b010 : FPUResult64W = FmaResultW;
+	// sgn inj
+	3'b011 : FPUResult64W = SgnResultW;
+	// add/sub/cnvt
+	3'b100 : FPUResult64W = FAddResultW;
+	// classify
+	3'b101 : FPUResult64W = ClassResultW;
+	// output SrcAW
+	3'b110 : FPUResult64W = SrcAW;
+	// Load/Store/Move to FP-register
+	3'b111 : FPUResult64W = FLoadStoreResultW;
+	default : FPUResult64W = {64{1'bx}};
+      endcase
+   end // always_comb
   
-  always_comb begin
-	case (FResultSelW)
-		// div/sqrt
-		3'b000 : FPUResult64W = FDivResultW;
-		// cmp		
-		3'b001 : FPUResult64W = FCmpResultW;
-		//fma/mult
-		3'b010 : FPUResult64W = FmaResultW;
-		// sgn inj
-		3'b011 : FPUResult64W = SgnResultW;
-		// add/sub/cnvt
-		3'b100 : FPUResult64W = FAddResultW;
-		// classify
-		3'b101 : FPUResult64W = ClassResultW;
-		// output SrcAW
-		3'b110 : FPUResult64W = SrcAW;
-		// Load/Store/Move to FP-register
-		3'b111 : FPUResult64W = FLoadStoreResultW;
-		default : FPUResult64W = {64{1'bx}};
-	endcase
-  end
-  //interface between XLEN size datapath and double-precision sized
-  //floating-point results
-  //
-  //define offsets for LSB zero extension or truncation
-  always_comb begin
-           
-  //zero extension 
+   // interface between XLEN size datapath and double-precision sized
+   // floating-point results
+   //
+   // select the XLEN most-significant bits of the 64-bit result
+   always_comb begin      
+      // bits [63:64-XLEN]: the whole result when XLEN is 64, the upper half when XLEN is 32
      FPUResultW = FPUResult64W[63:64-`XLEN];
      SetFflagsM = FPUFlagsW;      
+   end
+  
+endmodule // fpu

-  end  
-endmodule
--- a/wally-pipelined/src/generic/clockgater.sv
+++ b/wally-pipelined/src/generic/clockgater.sv
@ -0,0 +1,46 @@
+///////////////////////////////////////////
+// clockgater.sv
+//
+// Written: Ross Thompson 9 January 2021
+// Modified: 
+//
+// Purpose: Clock gater model. Must use standard cell for synthesis.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// Behavioral model of a clock-gating cell.
+//   E    : clock enable
+//   SE   : second enable, ORed with E (presumably the scan/test enable -- confirm)
+//   CLK  : ungated input clock
+//   ECLK : gated clock, CLK ANDed with the latched enable
+module clockgater
+  (input logic 	E,
+   input logic 	SE,
+   input logic 	CLK,
+   output logic ECLK);
+
+  // VERY IMPORTANT.
+  // This part functionally models a clock gater, but does not necessarily meet the timing constraints a real standard cell would.
+  // Do not use this in synthesis!
+
+  logic 	enable_q;
+
+  // Enable latch: transparent while CLK is low, holds its value while CLK is
+  // high, so the enable feeding the AND gate cannot change during the high
+  // phase of CLK.  An explicit always_latch is used instead of an event
+  // control on ~CLK, which triggers on both clock edges and is not a latch.
+  always_latch begin
+    if (~CLK) begin
+      enable_q <= E | SE;
+    end
+  end
+
+  assign ECLK = enable_q & CLK;
+
+endmodule
--- a/wally-pipelined/src/generic/lzd.sv
+++ b/wally-pipelined/src/generic/lzd.sv
@ -0,0 +1,195 @@
+///////////////////////////////////////////
+// lzd.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Leading zero detectors for the integer divide instructions
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+
+// Original idea came from  V. G. Oklobdzija, "An algorithmic and novel
+// design of a leading zero detector circuit: comparison with logic
+// synthesis," in IEEE Transactions on Very Large Scale Integration
+// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi:
+// 10.1109/92.273153.
+
+// Modified to be more hierarchical
+
+// 2-bit leading-zero detector cell (leaf of the hierarchy).
+//   B : input pair, B[1] is the most significant bit
+//   V : 1 when at least one bit of B is set
+//   P : 1 only for B = 2'b01, i.e. exactly one leading zero
+module lzd2
+   (output logic       P,
+    output logic       V,
+    input  logic [1:0] B);
+
+   assign P = ~B[1] & B[0];
+   assign V = |B;
+
+endmodule // lzd2
+
+// Width dispatcher: instantiates the fixed-width leading-zero detector that
+// matches WIDTH.  Only WIDTH = 4, 8, 16, 32, 64 and 128 are handled; for any
+// other value nothing is instantiated and ZP/ZV are left undriven.
+//   B  : vector to scan
+//   ZP : position output of the selected detector
+//   ZV : valid output of the selected detector
+module lzd_hier #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]          B,
+    output logic [$clog2(WIDTH)-1:0] ZP,
+    output logic 		     ZV);
+
+   generate
+      case (WIDTH)
+	128: lzd128 lz127 (ZP, ZV, B);
+	64:  lzd64 lz64 (ZP, ZV, B);
+	32:  lzd32 lz32 (ZP, ZV, B);
+	16:  lzd16 lz16 (ZP, ZV, B);
+	8:   lzd8 lz8 (ZP, ZV, B);
+	4:   lzd4 lz4 (ZP, ZV, B);
+      endcase
+   endgenerate
+
+endmodule // lzd_hier
+
+// 4-bit leading-zero detector built from two lzd2 cells.
+//   B  : vector to scan, B[3] is the most significant bit
+//   ZP : number of leading zeros in B; only meaningful when ZV is 1
+//   ZV : 1 when at least one bit of B is set
+module lzd4 (ZP, ZV, B);
+
+   input logic [3:0]  B;
+
+   logic  	       ZPa;   // count from the lower half
+   logic  	       ZPb;   // count from the upper half
+   logic 	       ZVa;   // lower half contains a 1
+   logic 	       ZVb;   // upper half contains a 1
+
+   output logic [1:0]  ZP;
+   output logic        ZV;
+
+   // The 2-bit cell in this file is named lzd2 (there is no lz2 here).
+   lzd2 l1(ZPa, ZVa, B[1:0]);
+   lzd2 l2(ZPb, ZVb, B[3:2]);
+
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and ZP[1] adds the 2 zeros of the empty upper half.
+   assign ZP[0]   = ZVb ? ZPb : ZPa;
+   assign ZP[1]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd4
+
+// 8-bit leading-zero detector built from two lzd4 blocks.
+//   B  : vector to scan, B[7] is the most significant bit
+//   ZP : number of leading zeros in B; only meaningful when ZV is 1
+//   ZV : 1 when at least one bit of B is set
+module lzd8 (ZP, ZV, B);
+
+   input logic [7:0]  B;
+
+   logic [1:0] 	       ZPa;   // count from the lower half
+   logic [1:0] 	       ZPb;   // count from the upper half
+   logic 	       ZVa;   // lower half contains a 1
+   logic 	       ZVb;   // upper half contains a 1
+
+   output logic [2:0]  ZP;
+   output logic        ZV;
+
+   // The 4-bit block in this file is named lzd4 (there is no lz4 here).
+   lzd4 l1(ZPa, ZVa, B[3:0]);
+   lzd4 l2(ZPb, ZVb, B[7:4]);
+
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and ZP[2] adds the 4 zeros of the empty upper half.
+   assign ZP[1:0] = ZVb ? ZPb : ZPa;
+   assign ZP[2]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd8
+
+// 16-bit leading-zero detector built from two lzd8 blocks.
+//   B  : vector to scan, B[15] is the most significant bit
+//   ZP : number of leading zeros in B; only meaningful when ZV is 1
+//   ZV : 1 when at least one bit of B is set
+module lzd16 (ZP, ZV, B);
+
+   input logic [15:0]  B;
+
+   logic [2:0] 	       ZPa;   // count from the lower half
+   logic [2:0] 	       ZPb;   // count from the upper half
+   logic 	       ZVa;   // lower half contains a 1
+   logic 	       ZVb;   // upper half contains a 1
+
+   output logic [3:0]  ZP;
+   output logic        ZV;
+
+   // The 8-bit block in this file is named lzd8 (there is no lz8 here).
+   lzd8 l1(ZPa, ZVa, B[7:0]);
+   lzd8 l2(ZPb, ZVb, B[15:8]);
+
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and ZP[3] adds the 8 zeros of the empty upper half.
+   assign ZP[2:0] = ZVb ? ZPb : ZPa;
+   assign ZP[3]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd16
+
+// 32-bit leading-zero detector built from two lzd16 blocks.
+//   B  : vector to scan, B[31] is the most significant bit
+//   ZP : number of leading zeros in B; only meaningful when ZV is 1
+//   ZV : 1 when at least one bit of B is set
+module lzd32 (ZP, ZV, B);
+
+   input logic [31:0] B;
+
+   logic [3:0] 	      ZPa;   // count from the lower half
+   logic [3:0] 	      ZPb;   // count from the upper half
+   logic 	      ZVa;   // lower half contains a 1
+   logic 	      ZVb;   // upper half contains a 1
+   
+   output logic [4:0] ZP;
+   output logic       ZV;
+   
+   // The 16-bit block in this file is named lzd16 (there is no lz16 here).
+   lzd16 l1(ZPa, ZVa, B[15:0]);
+   lzd16 l2(ZPb, ZVb, B[31:16]);
+   
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and ZP[4] adds the 16 zeros of the empty upper half.
+   assign ZP[3:0] = ZVb ? ZPb : ZPa;
+   assign ZP[4]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd32
+
+// 64-bit leading-zero detector built from two lzd32 blocks.
+//   B  : vector to scan, B[63] is the most significant bit
+//   ZP : number of leading zeros in B; only meaningful when ZV is 1
+//   ZV : 1 when at least one bit of B is set
+module lzd64 (ZP, ZV, B);
+
+   input logic [63:0]  B;
+   
+   logic [4:0] 	       ZPa;   // count from the lower half
+   logic [4:0] 	       ZPb;   // count from the upper half
+   logic 	       ZVa;   // lower half contains a 1
+   logic 	       ZVb;   // upper half contains a 1
+   
+   output logic [5:0]  ZP;
+   output logic        ZV;
+   
+   // The 32-bit block in this file is named lzd32 (there is no lz32 here).
+   lzd32 l1(ZPa, ZVa, B[31:0]);
+   lzd32 l2(ZPb, ZVb, B[63:32]);
+   
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and ZP[5] adds the 32 zeros of the empty upper half.
+   assign ZP[4:0] = ZVb ? ZPb : ZPa;
+   assign ZP[5]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd64
+
+// 128-bit leading-zero detector built from two lzd64 blocks.
+//   B  : vector to scan, B[127] is the most significant bit
+//   ZP : number of leading zeros in B; only meaningful when ZV is 1
+//   ZV : 1 when at least one bit of B is set
+module lzd128 (ZP, ZV, B);
+
+   input logic [127:0]  B;
+   
+   logic [5:0] 	       ZPa;   // count from the lower half
+   logic [5:0] 	       ZPb;   // count from the upper half
+   logic 	       ZVa;   // lower half contains a 1
+   logic 	       ZVb;   // upper half contains a 1
+   
+   output logic [6:0]  ZP;
+   output logic        ZV;
+   
+   // The 64-bit block in this file is named lzd64 (there is no lz64 here).
+   // Each half is exactly 64 bits wide: [63:0] and [127:64].  Overlapping
+   // 65-bit slices would be truncated at the 64-bit port and drop B[127].
+   lzd64 l1(ZPa, ZVa, B[63:0]);
+   lzd64 l2(ZPb, ZVb, B[127:64]);
+   
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and ZP[6] adds the 64 zeros of the empty upper half.
+   assign ZP[5:0] = ZVb ? ZPb : ZPa;
+   assign ZP[6]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd128
+
+/* verilator lint_on DECLFILENAME */
--- a/wally-pipelined/src/generic/lzd.sv~
+++ b/wally-pipelined/src/generic/lzd.sv~
@ -0,0 +1,195 @@
+///////////////////////////////////////////
+// lzd.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Leading zero detectors for the integer divide instructions
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+
+// Original idea came from  V. G. Oklobdzija, "An algorithmic and novel
+// design of a leading zero detector circuit: comparison with logic
+// synthesis," in IEEE Transactions on Very Large Scale Integration
+// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi:
+// 10.1109/92.273153.
+
+// Modified to be more hierarchical
+
+// 2-bit leading-zero detector cell (leaf of the hierarchy).
+//   B : input pair, B[1] is the most significant bit
+//   V : 1 when at least one bit of B is set
+//   P : 1 only for B = 2'b01, i.e. exactly one leading zero
+module lz2
+   (output logic       P,
+    output logic       V,
+    input  logic [1:0] B);
+
+   assign P = ~B[1] & B[0];
+   assign V = |B;
+
+endmodule // lz2
+
+// Width dispatcher: instantiates the fixed-width leading-zero detector that
+// matches WIDTH.  Only WIDTH = 4, 8, 16, 32, 64 and 128 are handled; for any
+// other value nothing is instantiated and ZP/ZV are left undriven.
+//   B  : vector to scan
+//   ZP : position output of the selected detector
+//   ZV : valid output of the selected detector
+module lzd_hier #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]          B,
+    output logic [$clog2(WIDTH)-1:0] ZP,
+    output logic 		     ZV);
+
+   generate
+      case (WIDTH)
+	128: lz128 lzd127 (ZP, ZV, B);
+	64:  lz64 lzd64 (ZP, ZV, B);
+	32:  lz32 lzd32 (ZP, ZV, B);
+	16:  lz16 lzd16 (ZP, ZV, B);
+	8:   lz8 lzd8 (ZP, ZV, B);
+	4:   lz4 lzd4 (ZP, ZV, B);
+      endcase
+   endgenerate
+
+endmodule // lzd_hier
+
+// 4-bit leading-zero detector composed of two lz2 cells.
+module lz4
+   (output logic [1:0] ZP,   // leading-zero count of B; only meaningful when ZV is 1
+    output logic       ZV,   // 1 when at least one bit of B is set
+    input  logic [3:0] B);   // vector to scan, B[3] is the most significant bit
+
+   logic loCnt, hiCnt;   // counts reported by the lower / upper half
+   logic loAny, hiAny;   // "contains a 1" flags of the lower / upper half
+
+   lz2 l1(loCnt, loAny, B[1:0]);
+   lz2 l2(hiCnt, hiAny, B[3:2]);
+
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and the top bit of ZP adds the 2 zeros of the upper half.
+   assign ZP = {~hiAny, hiAny ? hiCnt : loCnt};
+   assign ZV = hiAny | loAny;
+
+endmodule // lz4
+
+// 8-bit leading-zero detector composed of two lz4 blocks.
+module lz8
+   (output logic [2:0] ZP,   // leading-zero count of B; only meaningful when ZV is 1
+    output logic       ZV,   // 1 when at least one bit of B is set
+    input  logic [7:0] B);   // vector to scan, B[7] is the most significant bit
+
+   logic [1:0] loCnt, hiCnt;   // counts reported by the lower / upper half
+   logic       loAny, hiAny;   // "contains a 1" flags of the lower / upper half
+
+   lz4 l1(loCnt, loAny, B[3:0]);
+   lz4 l2(hiCnt, hiAny, B[7:4]);
+
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and the top bit of ZP adds the 4 zeros of the upper half.
+   assign ZP = {~hiAny, hiAny ? hiCnt : loCnt};
+   assign ZV = hiAny | loAny;
+
+endmodule // lz8
+
+// 16-bit leading-zero detector composed of two lz8 blocks.
+module lz16
+   (output logic [3:0]  ZP,   // leading-zero count of B; only meaningful when ZV is 1
+    output logic        ZV,   // 1 when at least one bit of B is set
+    input  logic [15:0] B);   // vector to scan, B[15] is the most significant bit
+
+   logic [2:0] loCnt, hiCnt;   // counts reported by the lower / upper half
+   logic       loAny, hiAny;   // "contains a 1" flags of the lower / upper half
+
+   lz8 l1(loCnt, loAny, B[7:0]);
+   lz8 l2(hiCnt, hiAny, B[15:8]);
+
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and the top bit of ZP adds the 8 zeros of the upper half.
+   assign ZP = {~hiAny, hiAny ? hiCnt : loCnt};
+   assign ZV = hiAny | loAny;
+
+endmodule // lz16
+
+// 32-bit leading-zero detector composed of two lz16 blocks.
+module lz32
+   (output logic [4:0]  ZP,   // leading-zero count of B; only meaningful when ZV is 1
+    output logic        ZV,   // 1 when at least one bit of B is set
+    input  logic [31:0] B);   // vector to scan, B[31] is the most significant bit
+
+   logic [3:0] loCnt, hiCnt;   // counts reported by the lower / upper half
+   logic       loAny, hiAny;   // "contains a 1" flags of the lower / upper half
+
+   lz16 l1(loCnt, loAny, B[15:0]);
+   lz16 l2(hiCnt, hiAny, B[31:16]);
+
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and the top bit of ZP adds the 16 zeros of the upper half.
+   assign ZP = {~hiAny, hiAny ? hiCnt : loCnt};
+   assign ZV = hiAny | loAny;
+
+endmodule // lz32
+
+// 64-bit leading-zero detector composed of two lz32 blocks.
+module lz64
+   (output logic [5:0]  ZP,   // leading-zero count of B; only meaningful when ZV is 1
+    output logic        ZV,   // 1 when at least one bit of B is set
+    input  logic [63:0] B);   // vector to scan, B[63] is the most significant bit
+
+   logic [4:0] loCnt, hiCnt;   // counts reported by the lower / upper half
+   logic       loAny, hiAny;   // "contains a 1" flags of the lower / upper half
+
+   lz32 l1(loCnt, loAny, B[31:0]);
+   lz32 l2(hiCnt, hiAny, B[63:32]);
+
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and the top bit of ZP adds the 32 zeros of the upper half.
+   assign ZP = {~hiAny, hiAny ? hiCnt : loCnt};
+   assign ZV = hiAny | loAny;
+
+endmodule // lz64
+
+// 128-bit leading-zero detector built from two lz64 blocks.
+//   B  : vector to scan, B[127] is the most significant bit
+//   ZP : number of leading zeros in B; only meaningful when ZV is 1
+//   ZV : 1 when at least one bit of B is set
+module lz128 (ZP, ZV, B);
+
+   input logic [127:0]  B;
+   
+   logic [5:0] 	       ZPa;   // count from the lower half
+   logic [5:0] 	       ZPb;   // count from the upper half
+   logic 	       ZVa;   // lower half contains a 1
+   logic 	       ZVb;   // upper half contains a 1
+   
+   output logic [6:0]  ZP;
+   output logic        ZV;
+   
+   // Each half is exactly 64 bits wide: [63:0] and [127:64].  Overlapping
+   // 65-bit slices would be truncated at the 64-bit port and drop B[127].
+   lz64 l1(ZPa, ZVa, B[63:0]);
+   lz64 l2(ZPb, ZVb, B[127:64]);
+   
+   // A 1 in the upper half decides the count; otherwise the lower half's
+   // count is used and ZP[6] adds the 64 zeros of the empty upper half.
+   assign ZP[5:0] = ZVb ? ZPb : ZPa;
+   assign ZP[6]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz128
+
+/* verilator lint_on DECLFILENAME */
--- a/wally-pipelined/src/generic/shift.sv
+++ b/wally-pipelined/src/generic/shift.sv
@ -0,0 +1,76 @@
+///////////////////////////////////////////
+// shift.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Logical left/right shifters for the integer divide instructions
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+/* verilator lint_off UNOPTFLAT */
+
+// Right shifter with zero fill, built as $clog2(WIDTH) cascaded mux2 stages.
+//   A     : value to shift
+//   Shift : shift amount
+//   Z     : output of the last stage
+// Stage i chooses between its input and that input moved right by
+// WIDTH/2**(i+1) positions, controlled by Shift[$clog2(WIDTH)-i-1]
+// (assumes mux2 ports are (d0, d1, select, y) -- confirm against mux2).
+// The per-stage shift distances are written for a power-of-two WIDTH.
+module shift_right #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]         A,
+    input logic [$clog2(WIDTH)-1:0] Shift,
+    output logic [WIDTH-1:0] 	    Z);
+   
+   // stage[0] is the unshifted input, stage[i+1] the output of mux stage i
+   logic [WIDTH-1:0] 		    stage [$clog2(WIDTH):0];
+   genvar 			    i;
+
+   assign stage[0] = A;   
+   generate
+      for (i=0;i<$clog2(WIDTH);i=i+1)
+	begin : genbit
+	   // second data input: stage[i] moved right, upper bits filled with zeros
+	   mux2 #(WIDTH) mux_inst (stage[i], 
+				   {{(WIDTH/(2**(i+1))){1'b0}}, stage[i][WIDTH-1:WIDTH/(2**(i+1))]}, 
+				   Shift[$clog2(WIDTH)-i-1], 
+				   stage[i+1]);
+	end
+   endgenerate
+   assign Z = stage[$clog2(WIDTH)];   
+
+endmodule // shift_right
+
+// Left shifter with zero fill, built as $clog2(WIDTH) cascaded mux2 stages.
+//   A     : value to shift
+//   Shift : shift amount
+//   Z     : output of the last stage
+// Stage n chooses between its input and that input moved left by
+// WIDTH/2**(n+1) positions, controlled by Shift[$clog2(WIDTH)-n-1]
+// (assumes mux2 ports are (d0, d1, select, y) -- confirm against mux2).
+module shift_left #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]         A,
+    input logic [$clog2(WIDTH)-1:0] Shift,
+    output logic [WIDTH-1:0] 	    Z);
+   
+   localparam STAGES = $clog2(WIDTH);
+
+   // level[0] is the unshifted input, level[n+1] the output of mux stage n
+   logic [WIDTH-1:0] 		    level [STAGES:0];
+   genvar 			    n;
+   
+   assign level[0] = A;   
+   assign Z = level[STAGES];   
+
+   generate
+      for (n=0; n<STAGES; n=n+1)
+	begin : genbit
+	   // distance moved by this stage
+	   localparam DIST = WIDTH/(2**(n+1));
+
+	   mux2 #(WIDTH) mux_inst (level[n], 
+				   {level[n][WIDTH-1-DIST:0], {DIST{1'b0}}}, 
+				   Shift[STAGES-n-1], 
+				   level[n+1]);
+	end
+   endgenerate
+
+endmodule // shift_left
+
+/* verilator lint_on DECLFILENAME */
+/* verilator lint_on UNOPTFLAT */
--- a/wally-pipelined/src/ifu/bpred.sv
+++ b/wally-pipelined/src/ifu/bpred.sv
@ -30,7 +30,8 @@

 module bpred 
  (input logic clk, reset,
-   input logic 		    StallF, StallD, StallE, FlushF, FlushD, FlushE,
+   input logic 		    StallF, StallD, StallE, StallM, StallW, 
+   input logic 		    FlushF, FlushD, FlushE, FlushM, FlushW,
   // Fetch stage
   // the prediction
   input logic [`XLEN-1:0]  PCNextF, // *** forgot to include this one on the I/O list
@ -88,25 +89,29 @@ module bpred
      globalHistoryPredictor DirPredictor(.clk(clk),
 					  .reset(reset),
 					  .*, // Stalls and flushes
-					  .LookUpPC(PCNextF),
-					  .Prediction(BPPredF),
+					  .PCNextF(PCNextF),
+					  .BPPredF(BPPredF),
 					  // update
-					  .UpdatePC(PCE),
-					  .UpdateEN(InstrClassE[0] & ~StallE),
+					  .InstrClassE(InstrClassE),
+					  .BPInstrClassE(BPInstrClassE),
+					  .BPPredDirWrongE(BPPredDirWrongE),
+					  .PCE(PCE),
 					  .PCSrcE(PCSrcE),
-					  .UpdatePrediction(UpdateBPPredE));
+					  .UpdateBPPredE(UpdateBPPredE));
    end else if (`BPTYPE == "BPGSHARE") begin:Predictor

      gsharePredictor DirPredictor(.clk(clk),
-				   .reset(reset),
-				   .*, // Stalls and flushes
-				   .LookUpPC(PCNextF),
-				   .Prediction(BPPredF),
-				   // update
-				   .UpdatePC(PCE),
-				   .UpdateEN(InstrClassE[0] & ~StallE),
-				   .PCSrcE(PCSrcE),
-				   .UpdatePrediction(UpdateBPPredE));
+					  .reset(reset),
+					  .*, // Stalls and flushes
+					  .PCNextF(PCNextF),
+					  .BPPredF(BPPredF),
+					  // update
+					  .InstrClassE(InstrClassE),
+					  .BPInstrClassE(BPInstrClassE),
+					  .BPPredDirWrongE(BPPredDirWrongE),
+					  .PCE(PCE),
+					  .PCSrcE(PCSrcE),
+					  .UpdateBPPredE(UpdateBPPredE));
    end 
    else if (`BPTYPE == "BPLOCALPAg") begin:Predictor

@ -190,14 +195,14 @@ module bpred
  flopenrc #(2) BPPredRegD(.clk(clk),
 			   .reset(reset),
 			   .en(~StallD),
-			   .clear(FlushD),
+			   .clear(1'b0),
 			   .d(BPPredF),
 			   .q(BPPredD));

  flopenrc #(2) BPPredRegE(.clk(clk),
 			   .reset(reset),
 			   .en(~StallE),
-			   .clear(FlushE),
+			   .clear(1'b0),
 			   .d(BPPredD),
 			   .q(BPPredE));

--- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv
+++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
@ -32,76 +32,89 @@ module globalHistoryPredictor
    )
  (input logic clk,
   input logic 		   reset,
-   input logic 		    StallF, StallD, StallE, FlushF, FlushD, FlushE,
-   input logic [`XLEN-1:0] LookUpPC,
-   output logic [1:0] 	   Prediction,
+   input logic 		   StallF, StallD, StallE, FlushF, FlushD, FlushE,
+   input logic [`XLEN-1:0] PCNextF,
+   output logic [1:0] 	   BPPredF,
   // update
-   input logic [`XLEN-1:0] UpdatePC,
-   input logic 		   UpdateEN, PCSrcE, 
-   input logic [1:0] 	   UpdatePrediction
+   input logic [4:0] 	   InstrClassE,
+   input logic [4:0] 	   BPInstrClassE,
+   input logic [4:0] 	   BPInstrClassD,
+   input logic [4:0] 	   BPInstrClassF, 
+   input logic 		   BPPredDirWrongE,
+
+   input logic [`XLEN-1:0] PCE,
+   input logic 		   PCSrcE,
+   input logic [1:0] 	   UpdateBPPredE
  
   );
-   logic [k-1:0] GHRF, GHRFNext;
-   assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; 
+  logic [k+1:0] 	   GHR, GHRNext;
+  logic [k-1:0] 	   PHTUpdateAdr, PHTUpdateAdr0, PHTUpdateAdr1;
+  logic 		   PHTUpdateEN;
+  logic 		   BPClassWrongNonCFI;
+  logic 		   BPClassWrongCFI;
+  logic 		   BPClassRightNonCFI;

-    flopenr #(k) GlobalHistoryRegister(.clk(clk),
-            .reset(reset),
-            .en(UpdateEN),
-            .d(GHRFNext),
-            .q(GHRF));
+  logic [6:0] 		   GHRMuxSel;
+  logic 		   GHRUpdateEN;
+  logic [k-1:0] 	   GHRLookup;
+
+  assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0];
+  assign BPClassWrongNonCFI = BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassRightBPWrong = BPInstrClassE[0] & InstrClassE[0] & BPPredDirWrongE;
+  assign BPClassRightBPRight = BPInstrClassE[0] & InstrClassE[0] & ~BPPredDirWrongE;
  
  
+  // GHR update selection, 1 hot encoded.
+  assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight);
+  assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]);
+  assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0];
+  assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0];
+  assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight));
+  assign GHRUpdateEN = (| GHRMuxSel[5:1] & ~StallE) | GHRMuxSel[6] & ~StallF;

-  logic [1:0] 		   PredictionMemory;
-  logic 		   DoForwarding, DoForwardingF;
-  logic [1:0] 		   UpdatePredictionF;
+  // hoping this created a AND-OR mux.
+  always_comb begin
+    case (GHRMuxSel) 
+      7'b000_0001: GHRNext = GHR[k-1+2:0];  // no change
+      7'b000_0010: GHRNext = {GHR[k-2+2:0], PCSrcE}; // branch update
+      7'b000_0100: GHRNext = {1'b0, GHR[k+1:1]}; // repair 1
+      7'b000_1000: GHRNext = {GHR[k-1+2:1], PCSrcE}; // branch update with mis prediction correction
+      7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2
+      7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1
+      7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update
+      default: GHRNext = GHR[k-1+2:0];
+    endcase
+  end

+  flopenr #(k+2) GlobalHistoryRegister(.clk(clk),
+				       .reset(reset),
+				       .en((GHRUpdateEN)),
+				       .d(GHRNext),
+				       .q(GHR));
+
+  // if actively updating the GHR at the time of prediction we want to use
+  // GHRNext as the lookup rather than GHR.
+
+  assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0];
+  assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1];  
+  assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0;
+  assign PHTUpdateEN = InstrClassE[0] & ~StallE;
+
+  assign GHRLookup = |GHRMuxSel[6:1] ? GHRNext[k-1:0] : GHR[k-1:0];
  
  // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
-  // GHR referes to the address that the past k branches points to in the prediction stage 
-  // GHRE refers to the address that the past k branches points to in the exectution stage
-    SRAM2P1R1W #(k, 2) PHT(.clk(clk),
-				.reset(reset),
-				.RA1(GHRF),
-				.RD1(PredictionMemory),
-				.REN1(~StallF),
-				.WA1(GHRFNext),
-				.WD1(UpdatePrediction),
-				.WEN1(UpdateEN),
-				.BitWEN1(2'b11));
+  SRAM2P1R1W #(k, 2) PHT(.clk(clk),
+			 .reset(reset),
+			 //.RA1(GHR[k-1:0]),
+			 .RA1(GHRLookup),
+			 .RD1(BPPredF),
+			 .REN1(~StallF),
+			 .WA1(PHTUpdateAdr),
+			 .WD1(UpdateBPPredE),
+			 .WEN1(PHTUpdateEN),
+			 .BitWEN1(2'b11));

-
-  // need to forward when updating to the same address as reading.
-  // first we compare to see if the update and lookup addreses are the same
-  assign DoForwarding = GHRF == GHRFNext;
-
-  // register the update value and the forwarding signal into the Fetch stage
-  // TODO: add stall logic ***
-  flopr #(1) DoForwardingReg(.clk(clk),
-			     .reset(reset),
-			     .d(DoForwarding),
-			     .q(DoForwardingF));
-  
-  flopr #(2) UpdatePredictionReg(.clk(clk),
-				 .reset(reset),
-				 .d(UpdatePrediction),
-				 .q(UpdatePredictionF));
-
-  assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory;
-  
-  //pipeline for GHR
-  /*flopenrc #(k) GHRDReg(.clk(clk),
-      .reset(reset),
-      .en(~StallD),
-      .clear(FlushD),
-      .d(GHRF),
-      .q(GHRD));
-
-  flopenrc #(k) GHREReg(.clk(clk),
-        .reset(reset),
-        .en(~StallE),
-        .clear(FlushE),
-        .d(GHRD),
-        .q(GHRE));
-*/
 endmodule
--- a/wally-pipelined/src/ifu/gshare.sv
+++ b/wally-pipelined/src/ifu/gshare.sv
@ -1,128 +0,0 @@
-///////////////////////////////////////////
-// gshare.sv
-//
-// Written: Shreya Sanghai
-// Email: ssanghai@hmc.edu
-// Created: March 16, 2021
-// Modified: 
-//
-// Purpose: Gshare predictor with parameterized global history register
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module gsharePredictor
-  #(parameter int k = 10
-    )
-  (input logic clk,
-   input logic 		   reset,
-   input logic 		   StallF, StallD, StallE, FlushF, FlushD, FlushE,
-   input logic [`XLEN-1:0] LookUpPC,
-   output logic [1:0] 	   Prediction,
-   // update
-   input logic [`XLEN-1:0] UpdatePC,
-   input logic 		   UpdateEN, PCSrcE,
-   input logic [1:0] 	   UpdatePrediction
-  
-   );
-
-  logic [k-1:0] 	   GHRF, GHRFNext;
-  //logic [k-1:0] 	   LookUpPCIndexD, LookUpPCIndexE;
-  logic [k-1:0] 	   LookUpPCIndex, UpdatePCIndex;
-  logic [1:0] 		   PredictionMemory;
-  logic 		   DoForwarding, DoForwardingF;
-  logic [1:0] 		   UpdatePredictionF;
-
-  assign GHRFNext = {PCSrcE, GHRF[k-1:1]};
-  
-  flopenr #(k) GlobalHistoryRegister(.clk(clk),
-				     .reset(reset),
-				     .en(UpdateEN),
-				     .d(GHRFNext),
-				     .q(GHRF));
-
-
-  // for gshare xor the PC with the GHR 
-  assign UpdatePCIndex = GHRFNext ^ UpdatePC[k:1];
-  assign LookUpPCIndex = GHRF ^ LookUpPC[k:1];  
-  // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
-  // GHR referes to the address that the past k branches points to in the prediction stage 
-  // GHRE refers to the address that the past k branches points to in the exectution stage
-  SRAM2P1R1W #(k, 2) PHT(.clk(clk),
-			 .reset(reset),
-			 .RA1(LookUpPCIndex),
-			 .RD1(PredictionMemory),
-			 .REN1(~StallF),
-			 .WA1(UpdatePCIndex),
-			 .WD1(UpdatePrediction),
-			 .WEN1(UpdateEN),
-			 .BitWEN1(2'b11));
-
-
-  // need to forward when updating to the same address as reading.
-  // first we compare to see if the update and lookup addreses are the same
-  assign DoForwarding = LookUpPCIndex == UpdatePCIndex;
-
-  // register the update value and the forwarding signal into the Fetch stage
-  // TODO: add stall logic ***
-  flopr #(1) DoForwardingReg(.clk(clk),
-			     .reset(reset),
-			     .d(DoForwarding),
-			     .q(DoForwardingF));
-  
-  flopr #(2) UpdatePredictionReg(.clk(clk),
-				 .reset(reset),
-				 .d(UpdatePrediction),
-				 .q(UpdatePredictionF));
-
-  assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory;
-  
-  //pipeline for GHR
-/* -----\/----- EXCLUDED -----\/-----
-  flopenrc #(k) LookUpDReg(.clk(clk),
-			   .reset(reset),
-			   .en(~StallD),
-			   .clear(FlushD),
-			   .d(LookUpPCIndex),
-			   .q(LookUpPCIndexD));
-
-  flopenrc #(k) LookUpEReg(.clk(clk),
-			   .reset(reset),
-			   .en(~StallE),
-			   .clear(FlushE),
-			   .d(LookUpPCIndexD),
-			   .q(LookUpPCIndexE));
- -----/\----- EXCLUDED -----/\----- */
-
-/*  flopenrc #(k) GHRRegD(.clk(clk),
-			.reset(reset),
-			.en(~StallD),
-			.clear(FlushD),
-			.d(GHRF),
-			.q(GHRD));
-
-  flopenrc #(k) GHRRegE(.clk(clk),
-			.reset(reset),
-			.en(~StallE),
-			.clear(FlushE),
-			.d(GHRD),
-			.q(GHRE));
-  
-*/
-endmodule
--- a/wally-pipelined/src/ifu/gsharePredictor.sv
+++ b/wally-pipelined/src/ifu/gsharePredictor.sv
@ -0,0 +1,120 @@
+///////////////////////////////////////////
+// gsharePredictor.sv
+//
+// Written: Shreya Sanghai
+// Email: ssanghai@hmc.edu
+// Created: March 16, 2021
+// Modified: 
+//
+// Purpose: Gshare predictor with parameterized global history register
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// gshare direction predictor.
+// A (k+2)-bit global history register (GHR) is kept; a k-bit history value
+// XORed with PC bits [k:1] addresses the pattern history table (PHT) for
+// both the lookup and the update.
+//   PCNextF         : PC used for the lookup (bits [k:1] only)
+//   BPPredF         : 2-bit value read from the PHT
+//   InstrClassE     : instruction class in E; only bit 0 is used here
+//   BPInstrClassE/D/F : predicted class per stage; only bit 0 is used here
+//                     (bit 0 presumably flags a branch/CFI, judging by the
+//                     signal names -- confirm)
+//   BPPredDirWrongE : the direction prediction of the instruction in E was wrong
+//   PCE             : PC used for the update (bits [k:1] only)
+//   PCSrcE          : bit shifted into the GHR on a branch update
+//   UpdateBPPredE   : 2-bit value written to the PHT
+// StallD, FlushF, FlushD and FlushE are not referenced in this module body.
+module gsharePredictor
+  #(parameter int k = 10
+    )
+  (input logic clk,
+   input logic 		   reset,
+   input logic 		   StallF, StallD, StallE, FlushF, FlushD, FlushE,
+   input logic [`XLEN-1:0] PCNextF,
+   output logic [1:0] 	   BPPredF,
+   // update
+   input logic [4:0] 	   InstrClassE,
+   input logic [4:0] 	   BPInstrClassE,
+   input logic [4:0] 	   BPInstrClassD,
+   input logic [4:0] 	   BPInstrClassF, 
+   input logic 		   BPPredDirWrongE,
+
+   input logic [`XLEN-1:0] PCE,
+   input logic 		   PCSrcE,
+   input logic [1:0] 	   UpdateBPPredE
+  
+   );
+  logic [k+1:0] 	   GHR, GHRNext;
+  logic [k-1:0] 	   PHTUpdateAdr, PHTUpdateAdr0, PHTUpdateAdr1;
+  logic 		   PHTUpdateEN;
+  logic 		   BPClassWrongNonCFI;
+  logic 		   BPClassWrongCFI;
+  logic 		   BPClassRightNonCFI;
+  // Declared explicitly; previously these two were only created as implicit nets.
+  logic 		   BPClassRightBPWrong;
+  logic 		   BPClassRightBPRight;
+
+  logic [6:0] 		   GHRMuxSel;
+  logic 		   GHRUpdateEN;
+  logic [k-1:0] 	   GHRLookup;
+
+  // Compare bit 0 of the predicted class in E with bit 0 of the actual class in E.
+  assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0];
+  assign BPClassWrongNonCFI = BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassRightBPWrong = BPInstrClassE[0] & InstrClassE[0] & BPPredDirWrongE;
+  assign BPClassRightBPRight = BPInstrClassE[0] & InstrClassE[0] & ~BPPredDirWrongE;
+  
+  
+  // GHR update selection, 1 hot encoded.
+  assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight);
+  assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]);
+  assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0];
+  assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0];
+  assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight));
+  // Selections 1-5 are gated by ~StallE, selection 6 by ~StallF.
+  assign GHRUpdateEN = (| GHRMuxSel[5:1] & ~StallE) | GHRMuxSel[6] & ~StallF;
+
+  // hoping this created a AND-OR mux.
+  // Any select pattern that is not one of the listed one-hot values keeps GHR unchanged.
+  always_comb begin
+    case (GHRMuxSel) 
+      7'b000_0001: GHRNext = GHR[k-1+2:0];  // no change
+      7'b000_0010: GHRNext = {GHR[k-2+2:0], PCSrcE}; // branch update
+      7'b000_0100: GHRNext = {1'b0, GHR[k+1:1]}; // repair 1
+      7'b000_1000: GHRNext = {GHR[k-1+2:1], PCSrcE}; // branch update with mis prediction correction
+      7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2
+      7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1
+      7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update
+      default: GHRNext = GHR[k-1+2:0];
+    endcase
+  end
+
+  flopenr #(k+2) GlobalHistoryRegister(.clk(clk),
+				       .reset(reset),
+				       .en((GHRUpdateEN)),
+				       .d(GHRNext),
+				       .q(GHR));
+
+  // if actively updating the GHR at the time of prediction we want to use
+  // GHRNext as the lookup rather than GHR.
+
+  // The update address is a k-bit window of GHR whose offset (0, 1 or 2 bits)
+  // is chosen by InstrClassE[0] and BPInstrClassD[0].
+  assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0];
+  assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1];  
+  assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0;
+  assign PHTUpdateEN = InstrClassE[0] & ~StallE;
+
+  assign GHRLookup = |GHRMuxSel[6:1] ? GHRNext[k-1:0] : GHR[k-1:0];
+  
+  // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
+  // NOTE(review): GHRNext can depend on BPPredF[1] while the read address
+  // depends on GHRNext; this is only loop-free if RD1 is registered inside
+  // SRAM2P1R1W -- confirm.
+  SRAM2P1R1W #(k, 2) PHT(.clk(clk),
+			 .reset(reset),
+			 //.RA1(GHR[k-1:0]),
+			 .RA1(GHRLookup ^ PCNextF[k:1]),
+			 .RD1(BPPredF),
+			 .REN1(~StallF),
+			 .WA1(PHTUpdateAdr ^ PCE[k:1]),
+			 .WD1(UpdateBPPredE),
+			 .WEN1(PHTUpdateEN),
+			 .BitWEN1(2'b11));
+
+endmodule // gsharePredictor
--- a/wally-pipelined/src/ifu/icache.sv
+++ b/wally-pipelined/src/ifu/icache.sv
@ -154,15 +154,16 @@ module icachecontroller #(parameter LINESIZE = 256) (
  localparam STATE_MISS_SPILL_FETCH_DONE = 10; // write data into SRAM/LUT
  localparam STATE_MISS_SPILL_READ1 = 11; // read block 0 from SRAM/LUT
  localparam STATE_MISS_SPILL_2 = 12; // return to ready if hit or do second block update.
-  localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 13; // miss on block 1, issue read to AHB and wait
-  localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 14; // write data to SRAM/LUT
-  localparam STATE_MISS_SPILL_MERGE = 15; // read block 0 of CPU access,
+  localparam STATE_MISS_SPILL_2_START = 13; // return to ready if hit or do second block update.  
+  localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 14; // miss on block 1, issue read to AHB and wait
+  localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 15; // write data to SRAM/LUT
+  localparam STATE_MISS_SPILL_MERGE = 16; // read block 0 of CPU access,

-  localparam STATE_MISS_SPILL_FINAL = 16; // this state replicates STATE_READY's replay of the
+  localparam STATE_MISS_SPILL_FINAL = 17; // this state replicates STATE_READY's replay of the
  // spill access but does nto consider spill.  It also does not do another operation.
  

-  localparam STATE_INVALIDATE = 17; // *** not sure if invalidate or evict? invalidate by cache block or address?
+  localparam STATE_INVALIDATE = 18; // *** not sure if invalidate or evict? invalidate by cache block or address?
  
  localparam AHBByteLength = `XLEN / 8;
  localparam AHBOFFETWIDTH = $clog2(AHBByteLength);
@ -380,11 +381,20 @@ module icachecontroller #(parameter LINESIZE = 256) (
 	PCMux = 2'b10;
 	UnalignedSelect = 1'b1;
 	spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm.
+	ICacheReadEn = 1'b1;
+	NextState = STATE_MISS_SPILL_2_START;
+      end
+      STATE_MISS_SPILL_2_START: begin
 	if (~hit) begin
 	  CntReset = 1'b1;
 	  NextState = STATE_MISS_SPILL_MISS_FETCH_WDV;
 	end else begin
-	  NextState = STATE_MISS_SPILL_FINAL;
+	  NextState = STATE_READY;
+	  ICacheReadEn = 1'b1;
+	  PCMux = 2'b00;
+	  UnalignedSelect = 1'b1;
+	  SavePC = 1'b1;
+	  ICacheStallF = 1'b0;	
 	end
      end
      STATE_MISS_SPILL_MISS_FETCH_WDV: begin
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@ -154,14 +154,7 @@ module ifu (
  generate 
    if (`BPRED_ENABLED == 1) begin : bpred
      // I am making the port connection explicit for now as I want to see them and they will be changing.
-      bpred bpred(.clk(clk),
-		  .reset(reset),
-		  .StallF(StallF),
-		  .StallD(StallD),
-		  .StallE(StallE),
-		  .FlushF(FlushF),
-		  .FlushD(FlushD),
-		  .FlushE(FlushE),
+      bpred bpred(.*,
 		  .PCNextF(PCNextF),
 		  .BPPredPCF(BPPredPCF),
 		  .SelBPPredF(SelBPPredF),
--- a/wally-pipelined/src/mmu/cam_line.sv
+++ b/wally-pipelined/src/mmu/cam_line.sv
@ -2,7 +2,9 @@
 // cam_line.sv
 //
 // Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//            Implemented SV48 on top of SV39. This included adding SvMode input signal and the wally constants
+//            Mostly this was done to make the PageNumberMixer work.
 //
 // Purpose: CAM line for the translation lookaside buffer (TLB)
 //          Determines whether a virtual address matches the stored key.
@ -24,12 +26,17 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

+`include "wally-constants.vh"
+
 module cam_line #(parameter KEY_BITS = 20,
                  parameter HIGH_SEGMENT_BITS = 10) (
  input                 clk, reset,

+  // input to check which SvMode is running
+  input [`SVMODE_BITS-1:0] SvMode,
+  
  // The requested page number to compare against the key
-  input  [KEY_BITS-1:0] VirtualPageNumber,
+  input [KEY_BITS-1:0]  VirtualPageNumber,

  // Signals to write a new entry to this line
  input                 CAMLineWrite,
@ -38,10 +45,11 @@ module cam_line #(parameter KEY_BITS = 20,
  // Flush this line (set valid to 0)
  input                 TLBFlush,

-  // This entry is a key for a giga, mega, or kilopage.
+  // This entry is a key for a tera, giga, mega, or kilopage.
  // PageType == 2'b00 --> kilopage
  // PageType == 2'b01 --> megapage
-  // PageType == 2'b11 --> gigapage
+  // PageType == 2'b10 --> gigapage
+  // PageType == 2'b11 --> terapage
  output [1:0]          PageType,  // *** should this be the stored version or the always updated one?
  output                Match
 );
@ -67,9 +75,9 @@ module cam_line #(parameter KEY_BITS = 20,
  flopenr #(KEY_BITS) keyflop(clk, reset, CAMLineWrite, VirtualPageNumber, Key);

  // Calculate the actual query key based on the input key and the page type.
-  // For example, a megapage in sv39 only cares about VPN2 and VPN1, so VPN0
+  // For example, a megapage in SV39 only cares about VPN2 and VPN1, so VPN0
  // should automatically match.
-  page_number_mixer #(KEY_BITS, HIGH_SEGMENT_BITS) mixer(VirtualPageNumber, Key, PageType, VirtualPageNumberQuery);
+  page_number_mixer #(KEY_BITS, HIGH_SEGMENT_BITS) mixer(VirtualPageNumber, Key, PageType, SvMode, VirtualPageNumberQuery);

  assign Match = ({1'b1, VirtualPageNumberQuery} == {Valid, Key});

--- a/wally-pipelined/src/mmu/page_number_mixer.sv
+++ b/wally-pipelined/src/mmu/page_number_mixer.sv
@ -2,7 +2,11 @@
 // page_number_mixer.sv
 //
 // Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//              Implemented SV48 on top of SV39. This included adding a 3rd Segment to each of the pagenumbers,
+//              Ensuring that the BITS and HIGH_SEGMENT_BITS inputs were correct everywhere this module gets instantiated,
+//              Adding several muxes to decide the bit selection to turn pagenumbers into segments based on SV mode,
+//              Adding support for terapage/newgigapage encoding.
 //
 // Purpose: Takes two page numbers and replaces segments of the first page
 //          number with segments from the second, based on the page type.
@ -25,22 +29,29 @@
 ///////////////////////////////////////////

 `include "wally-config.vh"
+`include "wally-constants.vh"

 module page_number_mixer #(parameter BITS = 20,
                           parameter HIGH_SEGMENT_BITS = 10) (
-    input  [BITS-1:0] PageNumber,
-    input  [BITS-1:0] MixPageNumber,
-    input  [1:0]      PageType,
-    output [BITS-1:0] PageNumberCombined
+    input  [BITS-1:0]         PageNumber,
+    input  [BITS-1:0]         MixPageNumber,
+    input  [1:0]              PageType,
+    input  [`SVMODE_BITS-1:0] SvMode,
+
+    output [BITS-1:0]         PageNumberCombined
 );

+  // The upper segment might have a different width than the lower segments.
+  // For example, an SV39 PTE has 26 bits for PPN2 and 9 bits for the other
+  // segments. This is outside the 'if XLEN' b/c the constant is already configured
+  // to the correct value for the XLEN in the relevant wally-constants.vh file.
+  localparam LOW_SEGMENT_BITS = `VPN_SEGMENT_BITS;
+  // *** each time this module is implemented, low segment bits is either
+  // `VPN_SEGMENT_BITS or `PPN_LOW_SEGMENT_BITS (if it existed)
+  // in every mode so far, these are the same, so it's left as it is above. 
+
  generate
-    // *** Just checking XLEN is not enough to support sv39 AND sv48.
    if (`XLEN == 32) begin
-      // The upper segment might have a different width than the lower segments.
-      // For example, an sv39 PTE has 26 bits for PPN2 and 9 bits for the other
-      // segments.
-      localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS);

      logic [HIGH_SEGMENT_BITS-1:0] Segment1, MixSegment1, Segment1Combined;
      logic [LOW_SEGMENT_BITS-1:0]  Segment0, MixSegment0, Segment0Combined;
@ -58,28 +69,60 @@ module page_number_mixer #(parameter BITS = 20,
      // Reswizzle segments of the combined page number
      assign PageNumberCombined = {Segment1Combined, Segment0Combined};
    end else begin
-      // The upper segment might have a different width than the lower segments.
-      // For example, an sv39 PTE has 26 bits for PPN2 and 9 bits for the other
-      // segments.
-      localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS) / 2;

-      logic [HIGH_SEGMENT_BITS-1:0] Segment2, MixSegment2, Segment2Combined;
+      // After segment 0 and 1 of the page number, the width of each segment is dependent on the SvMode.
+      // For this reason, each segment bus is the width of its widest value across each mode
+      // when a smaller value needs to be loaded in to a wider bus, it's loaded in the least significant bits
+      // and left padded with zeros. MAKE SURE that if a value is being padded with zeros here,
+      // that it's padded with zeros everywhere else in the MMU and beyond to avoid false misses in the TLB.
+      logic [HIGH_SEGMENT_BITS-1:0] Segment3, MixSegment3, Segment3Combined;
+      logic [HIGH_SEGMENT_BITS + LOW_SEGMENT_BITS-1:0]  Segment2, MixSegment2, Segment2Combined;
      logic [LOW_SEGMENT_BITS-1:0]  Segment1, MixSegment1, Segment1Combined;
      logic [LOW_SEGMENT_BITS-1:0]  Segment0, MixSegment0, Segment0Combined;
      
+
      // Unswizzle segments of the input page number
-      assign {Segment2, Segment1, Segment0} = PageNumber;
-      assign {MixSegment2, MixSegment1, MixSegment0} = MixPageNumber;
+      // *** these muxes assume that only Sv48 and SV39 are implemented in rv64. for future SV57 and up,
+      //      there will have to be more muxes to select which value each segment gets.
+      //      as a cool reminder: BITS is the width of the page number, virt or phys, coming into this module
+      //      while high segment bits is the width of the highest segment of that page number.
+      //      Note for future work: this module has to work with both VPNs and PPNs and due to their differing 
+      //         widths and the fact that the ppn has one longer segment at the top makes the muxes below very confusing.
+      //      Potentially very annoying thing for future workers: the number of bits in a ppn is always 44 (for SV39 and SV48)
+      //         but in SV57 and above, this might be a new longer length. In that case these selectors will most likely
+      //         become even more complicated and confusing.
+      assign Segment3 = (SvMode == `SV48) ? 
+                        PageNumber[BITS-1:3*LOW_SEGMENT_BITS] : // take the top segment or not
+                        {HIGH_SEGMENT_BITS{1'b0}}; // for virtual page numbers in SV39, both options should be zeros.
+      assign Segment2 = (SvMode == `SV48) ? 
+                        {{HIGH_SEGMENT_BITS{1'b0}}, PageNumber[3*LOW_SEGMENT_BITS-1:2*LOW_SEGMENT_BITS]} : // just take another low segment left padded with zeros.
+                        PageNumber[BITS-1:2*LOW_SEGMENT_BITS]; // otherwise take the rest of the PageNumber
+      assign Segment1 = PageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS];
+      assign Segment0 = PageNumber[LOW_SEGMENT_BITS-1:0];
+
+
+      assign MixSegment3 = (SvMode == `SV48) ? 
+                        MixPageNumber[BITS-1:3*LOW_SEGMENT_BITS] : // take the top segment or not
+                        {HIGH_SEGMENT_BITS{1'b0}}; // for virtual page numbers in SV39, both options should be zeros.
+      assign MixSegment2 = (SvMode == `SV48) ? 
+                        {{HIGH_SEGMENT_BITS{1'b0}}, MixPageNumber[3*LOW_SEGMENT_BITS-1:2*LOW_SEGMENT_BITS]} : // just take another low segment left padded with zeros.
+                        MixPageNumber[BITS-1:2*LOW_SEGMENT_BITS]; // otherwise take the rest of the PageNumber
+      assign MixSegment1 = MixPageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS];
+      assign MixSegment0 = MixPageNumber[LOW_SEGMENT_BITS-1:0];
+

      // Pass through the high segment
-      assign Segment2Combined = Segment2;
+      assign Segment3Combined = Segment3;

-      // Either pass through or zero out segments 1 and 0 based on the page type
-      mux2 #(LOW_SEGMENT_BITS) segment1mux(Segment1, MixSegment1, PageType[1], Segment1Combined);
-      mux2 #(LOW_SEGMENT_BITS) segment0mux(Segment0, MixSegment0, PageType[0], Segment0Combined);
+      // Either pass through or zero out lower segments based on the page type
+      assign Segment2Combined = (PageType[1] && PageType[0]) ? MixSegment2 : Segment2; // terapage (page == 11)
+      assign Segment1Combined = (PageType[1]) ? MixSegment1 : Segment1; // gigapage and higher (page == 10 or 11)
+      assign Segment0Combined = (PageType[1] || PageType[0]) ? MixSegment0 : Segment0; // megapage and higher (page == 01 or 10 or 11)

      // Reswizzle segments of the combined page number
-      assign PageNumberCombined = {Segment2Combined, Segment1Combined, Segment0Combined};
+      assign PageNumberCombined = (SvMode == `SV48) ? 
+                                  {Segment3Combined, Segment2Combined[LOW_SEGMENT_BITS-1:0], Segment1Combined, Segment0Combined} :
+                                  {Segment2Combined, Segment1Combined, Segment0Combined};
    end
  endgenerate
 endmodule
--- a/wally-pipelined/src/mmu/pagetablewalker.sv
+++ b/wally-pipelined/src/mmu/pagetablewalker.sv
@ -2,7 +2,10 @@
 // pagetablewalker.sv
 //
 // Written: tfleming@hmc.edu 2 March 2021
-// Modified: 
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//            implemented SV48 on top of SV39. This included, adding a level of the FSM for the extra page number segment
+//            adding support for terapage encoding, and for setting the TranslationPAdr using the new level,
+//            adding the internal SvMode signal
 //
 // Purpose: Page Table Walker
 //          Part of the Memory Management Unit (MMU)
@ -70,6 +73,7 @@ module pagetablewalker (
  logic [`XLEN-1:0]     SavedPTE, CurrentPTE;
  logic [`PA_BITS-1:0]  TranslationPAdr;
  logic [`PPN_BITS-1:0] CurrentPPN;
+  logic [`SVMODE_BITS-1:0]  SvMode;
  logic                 MemStore;

  // PTE Control Bits
@ -82,6 +86,8 @@ module pagetablewalker (
  logic [`XLEN-1:0] PageTableEntry;
  logic [1:0] PageType;

+  assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS];
+
  assign BasePageTablePPN = SATP_REGW[`PPN_BITS-1:0];

  assign MemStore = MemRWM[0];
@ -105,11 +111,12 @@ module pagetablewalker (
  assign PageTypeF = PageType;
  assign PageTypeM = PageType;

-  localparam IDLE = 3'h0;
+  localparam LEVEL0 = 3'h0;
  localparam LEVEL1 = 3'h1;
-  localparam LEVEL0 = 3'h2;
-  localparam LEAF = 3'h3;
-  localparam FAULT = 3'h4;
+  // space left for more levels
+  localparam LEAF = 3'h5;
+  localparam IDLE = 3'h6;
+  localparam FAULT = 3'h7;

  logic [2:0] WalkerState, NextWalkerState;

@ -208,18 +215,32 @@ module pagetablewalker (
      assign MMUPAdr = TranslationPAdr[31:0];

    end else begin
-      localparam LEVEL2 = 3'h5;
+      localparam LEVEL2 = 3'h2;
+      localparam LEVEL3 = 3'h3;

-      logic [8:0] VPN2, VPN1, VPN0;
+      logic [8:0] VPN3, VPN2, VPN1, VPN0;

-      logic GigapageMisaligned, BadGigapage;
+      logic TerapageMisaligned, GigapageMisaligned, BadTerapage, BadGigapage;

      flopenl #(3) mmureg(HCLK, ~HRESETn, 1'b1, NextWalkerState, IDLE, WalkerState);

      always_comb begin
        case (WalkerState)
-          IDLE:   if      (MMUTranslate)           NextWalkerState = LEVEL2;
+          IDLE:   if      (MMUTranslate)           NextWalkerState = LEVEL3;
                  else                             NextWalkerState = IDLE;
+          LEVEL3: if      (SvMode != `SV48)         NextWalkerState = LEVEL2;
+                  // 3rd level used if SV48 is enabled.
+                  else begin
+                    if      (~MMUReady)              NextWalkerState = LEVEL3;
+                    // *** <FUTURE WORK> According to the architecture, we should
+                    // fault upon finding a superpage that is misaligned or has 0
+                    // access bit. The following commented line of code is
+                    // supposed to perform that check. However, it is untested.
+                    else if (ValidPTE && LeafPTE && ~BadTerapage) NextWalkerState = LEAF;
+                    // else if (ValidPTE && LeafPTE)    NextWalkerState = LEAF;  // *** Once the above line is properly tested, delete this line.
+                    else if (ValidPTE && ~LeafPTE)   NextWalkerState = LEVEL2;
+                    else                             NextWalkerState = FAULT;
+                  end
          LEVEL2: if      (~MMUReady)              NextWalkerState = LEVEL2;
                  // *** <FUTURE WORK> According to the architecture, we should
                  // fault upon finding a superpage that is misaligned or has 0
@ -242,24 +263,29 @@ module pagetablewalker (
                  else if (ValidPTE && LeafPTE && ~AccessAlert)
                                                   NextWalkerState = LEAF;
                  else                             NextWalkerState = FAULT;
-          LEAF:   if      (MMUTranslate)           NextWalkerState = LEVEL2;
+          LEAF:   if      (MMUTranslate)           NextWalkerState = LEVEL3;
                  else                             NextWalkerState = IDLE;
-          FAULT:  if      (MMUTranslate)           NextWalkerState = LEVEL2;
+          FAULT:  if      (MMUTranslate)           NextWalkerState = LEVEL3;
                  else                             NextWalkerState = IDLE;
          // Default case should never happen, but is included for linter.
          default:                                 NextWalkerState = IDLE;
        endcase
      end

+      // A terapage is a level 3 leaf page. This page must have zero PPN[2],
+      // zero PPN[1], and zero PPN[0]
+      assign TerapageMisaligned = |(CurrentPPN[26:0]);
      // A gigapage is a Level 2 leaf page. This page must have zero PPN[1] and
      // zero PPN[0]
      assign GigapageMisaligned = |(CurrentPPN[17:0]);
      // A megapage is a Level 1 leaf page. This page must have zero PPN[0].
      assign MegapageMisaligned = |(CurrentPPN[8:0]);

+      assign BadTerapage = TerapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme
      assign BadGigapage = GigapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme
      assign BadMegapage = MegapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme

+      assign VPN3 = TranslationVAdr[47:39];
      assign VPN2 = TranslationVAdr[38:30];
      assign VPN1 = TranslationVAdr[29:21];
      assign VPN0 = TranslationVAdr[20:12];
@ -282,8 +308,13 @@ module pagetablewalker (
          IDLE: begin
            MMUStall = '0;
          end
+          LEVEL3: begin
+            TranslationPAdr = {BasePageTablePPN, VPN3, 3'b000};
+            // *** this is a huge breaking point. if we're going through level3 every time, even when sv48 is off,
+            // what should translationPAdr be when level3 is just off?
+          end
          LEVEL2: begin
-            TranslationPAdr = {BasePageTablePPN, VPN2, 3'b000};
+            TranslationPAdr = {(SvMode == `SV48) ? CurrentPPN : BasePageTablePPN, VPN2, 3'b000};
          end
          LEVEL1: begin
            TranslationPAdr = {CurrentPPN, VPN1, 3'b000};
@ -295,8 +326,9 @@ module pagetablewalker (
            // Keep physical address alive to prevent HADDR dropping to 0
            TranslationPAdr = {CurrentPPN, VPN0, 3'b000};
            PageTableEntry = CurrentPTE;
-            PageType = (WalkerState == LEVEL2) ? 2'b11 : 
-                                ((WalkerState == LEVEL1) ? 2'b01 : 2'b00);
+            PageType = (WalkerState == LEVEL3) ? 2'b11 :
+                                ((WalkerState == LEVEL2) ? 2'b10 : 
+                                ((WalkerState == LEVEL1) ? 2'b01 : 2'b00));
            DTLBWriteM = DTLBMissM;
            ITLBWriteF = ~DTLBMissM;  // Prefer data over instructions
          end
--- a/wally-pipelined/src/mmu/priority_encoder.sv
+++ b/wally-pipelined/src/mmu/priority_encoder.sv
@ -4,7 +4,11 @@
 // Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021
 // Based on implementation from https://www.allaboutcircuits.com/ip-cores/communication-controller/priority-encoder/
 // *** Give proper LGPL attribution for above source
-// Modified:
+// Modified: Teo Ene 15 Apr 2021:
+//              Temporarily removed parameterized priority encoder for non-parameterized one
+//              To get synthesis working quickly
+//           Kmacsaigoren@hmc.edu 28 May 2021:
+//              Added working version of parameterized priority encoder. 
 //
 // Purpose: One-hot encoding to binary encoder
 //
@ -27,51 +31,33 @@

 `include "wally-config.vh"

-// Teo Ene 04/15:
-// Temporarily removed paramterized priority encoder for non-parameterized one
-// To get synthesis working quickly
 module priority_encoder #(parameter BINARY_BITS = 3) (
-  input  logic  [7:0] one_hot,
-  output logic  [2:0] binary
+  input  logic  [2**BINARY_BITS - 1:0] one_hot,
+  output logic  [BINARY_BITS - 1:0] binary
 );

-  // localparam ONE_HOT_BITS = 2**BINARY_BITS;
-
-  /*
-  genvar i, j;
-  generate
-    for (i = 0; i < ONE_HOT_BITS; i++) begin
-      for (j = 0; j < BINARY_BITS; j++) begin
-        if (i[j]) begin
-          assign binary[j] = one_hot[i];
-        end
-      end
-    end
-  endgenerate
-  */
-
-  /*
-  logic [BINARY_BITS-1:0] binary_comb;
-
+  integer i;
  always_comb begin
-    binary_comb = 0;
-    for (int i = 0; i < ONE_HOT_BITS; i++)
-      if (one_hot[i]) binary_comb = i;
+    binary = 0;
+    for (i = 0; i < 2**BINARY_BITS; i++) begin
+      if (one_hot[i]) binary = i; // prioritizes the most significant bit
+    end
  end
+  // *** triple check synthesizability here

-  assign binary = binary_comb;
+  // Ideally this mimics the following:
+  /*
+  always_comb begin
+    casex (one_hot)
+      1xx ... x: binary = 2**BINARY_BITS - 1;
+      01x ... x: binary = 2**BINARY_BITS - 2;
+      001 ... x: binary = 2**BINARY_BITS - 3;
+      
+      {...}
+
+      00 ... 1xx: binary = 2;
+      00 ... 01x: binary = 1;
+      00 ... 001: binary = 0;
+  end
  */
-  always_comb
-    case (one_hot)
-      8'h1:     binary=3'h0;
-      8'h2:     binary=3'h1;
-      8'h4:     binary=3'h2;
-      8'h8:     binary=3'h3;
-      8'h10:    binary=3'h4;
-      8'h20:    binary=3'h5;
-      8'h40:    binary=3'h6;
-      8'h80:    binary=3'h7;
-      default:  binary=3'h0; //should never happen
-    endcase
-
 endmodule
--- a/wally-pipelined/src/mmu/tlb.sv
+++ b/wally-pipelined/src/mmu/tlb.sv
@ -2,7 +2,9 @@
 // tlb.sv
 //
 // Written: jtorrey@hmc.edu 16 February 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//            Implemented SV48 on top of SV39. This included adding the SvMode signal,
+//            and using it to decide the translate signal and get the virtual page number
 //
 // Purpose: Translation lookaside buffer
 //          Cache of virtural-to-physical address translations
@ -25,7 +27,7 @@
 ///////////////////////////////////////////

 /**
- * sv32 specs
+ * SV32 specs
 * ----------
 * Virtual address [31:0] (32 bits)
 *    [________________________________]
@ -85,14 +87,11 @@ module tlb #(parameter ENTRY_BITS = 3,
  output             TLBPageFault
 );

-  logic SvMode;
  logic Translate;
  logic TLBAccess, ReadAccess, WriteAccess;

-  // *** If we want to support multiple virtual memory modes (ie sv39 AND sv48),
-  // we could have some muxes that control which parameters are current.
-  // Although then some of the signals are not big enough. But that's a problem
-  // for much later.
+  // Store current virtual memory mode (SV32, SV39, SV48, etc.)
+  logic [`SVMODE_BITS-1:0] SvMode;

  // Index (currently random) to write the next TLB entry
  logic [ENTRY_BITS-1:0] WriteIndex;
@ -116,17 +115,24 @@ module tlb #(parameter ENTRY_BITS = 3,
  // Whether the virtual address has a match in the CAM
  logic                  CAMHit;

-  // Grab the sv bit from SATP
+  // Grab the sv mode from SATP
+  assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS];
+
+  // The bus width is always the largest it could be for that XLEN. For example, the vpn bus is 36 bits wide in rv64,
+  // even though the value it carries could be 27 bits (SV39) or 36 bits (SV48) wide. When the value of VPN is
+  // narrower than the bus, the extra bits are used as padded zeros on the left of the full value.
  generate
    if (`XLEN == 32) begin
-      assign SvMode = SATP_REGW[31];  // *** change to an enum somehow?
+      assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12];
    end else begin
-      assign SvMode = SATP_REGW[63]; // currently just a boolean whether translation enabled
+      assign VirtualPageNumber = (SvMode == `SV48) ?
+                                 VirtualAddress[`VPN_BITS+11:12] :
+                                 {{`VPN_SEGMENT_BITS{1'b0}}, VirtualAddress[3*`VPN_SEGMENT_BITS+11:12]};
    end
  endgenerate

  // Whether translation should occur
-  assign Translate = SvMode & (PrivilegeModeW != `M_MODE);
+  assign Translate = (SvMode != `NO_TRANSLATE) & (PrivilegeModeW != `M_MODE);

  // Determine how the TLB is currently being used
  // Note that we use ReadAccess for both loads and instruction fetches
@ -134,7 +140,7 @@ module tlb #(parameter ENTRY_BITS = 3,
  assign WriteAccess = TLBAccessType[0];
  assign TLBAccess = ReadAccess || WriteAccess;

-  assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12];
+  
  assign PageOffset        = VirtualAddress[11:0];

  // TLB entries are evicted according to the LRU algorithm
@ -191,6 +197,7 @@ module tlb #(parameter ENTRY_BITS = 3,
    physical_mixer(PhysicalPageNumber, 
      {{EXTRA_PHYSICAL_BITS{1'b0}}, VirtualPageNumber},
      HitPageType,
+      SvMode,
      PhysicalPageNumberMixed);

  // Provide physical address only on TLBHits to cause catastrophic errors if
--- a/wally-pipelined/src/mmu/tlb_cam.sv
+++ b/wally-pipelined/src/mmu/tlb_cam.sv
@ -2,7 +2,9 @@
 // tlb_cam.sv
 //
 // Written: jtorrey@hmc.edu 16 February 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//            Implemented SV48 on top of SV39. This included adding the SvMode signal input and wally constants
+//            Mostly this was to make the cam_lines work.
 //
 // Purpose: Stores virtual page numbers with cached translations.
 //          Determines whether a given virtual page number is in the TLB.
@ -24,18 +26,21 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////

+`include "wally-constants.vh"
+
 module tlb_cam #(parameter ENTRY_BITS = 3,
                 parameter KEY_BITS   = 20,
                 parameter HIGH_SEGMENT_BITS = 10) (
-  input                    clk, reset,
-  input  [KEY_BITS-1:0]    VirtualPageNumber,
-  input  [1:0]             PageTypeWrite,
-  input  [ENTRY_BITS-1:0]  WriteIndex,
-  input                    TLBWrite,
-  input                    TLBFlush,
-  output [ENTRY_BITS-1:0]  VPNIndex,
-  output [1:0]             HitPageType,
-  output                   CAMHit
+  input                     clk, reset,
+  input  [KEY_BITS-1:0]     VirtualPageNumber,
+  input  [1:0]              PageTypeWrite,
+  input  [ENTRY_BITS-1:0]   WriteIndex,
+  input  [`SVMODE_BITS-1:0] SvMode,
+  input                     TLBWrite,
+  input                     TLBFlush,
+  output [ENTRY_BITS-1:0]   VPNIndex,
+  output [1:0]              HitPageType,
+  output                    CAMHit
 );

  localparam NENTRIES = 2**ENTRY_BITS;
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@ -47,13 +47,13 @@ module muldiv (
 	 logic [`XLEN-1:0] MulDivResultE, MulDivResultM;
 	 logic [`XLEN-1:0] PrelimResultE;
 	 logic [`XLEN-1:0] QuotE, RemE;
-	 //logic [`XLEN-1:0] Q, R;	 
 	 logic [`XLEN*2-1:0] ProdE; 

 	 logic 		     enable_q;	 
 	 logic [2:0] 	     Funct3E_Q;
 	 logic 		     div0error;
 	 logic [`XLEN-1:0]   N, D;
+	 logic [`XLEN-1:0]   Num0, Den0;	 

 	 logic 		     gclk;
 	 logic 		     DivStartE;
@ -70,15 +70,25 @@ module muldiv (
 	 end
 	 assign gclk = enable_q & clk;

+	 // Handle sign extension for W-type instructions
+	 if (`XLEN == 64) begin // RV64 has W-type instructions
+            assign Num0 = W64E ? {{32{SrcAE[31]&signedDivide}}, SrcAE[31:0]} : SrcAE;
+            assign Den0 = W64E ? {{32{SrcBE[31]&signedDivide}}, SrcBE[31:0]} : SrcBE;
+	 end else begin // RV32 has no W-type instructions
+            assign Num0 = SrcAE;
+            assign Den0 = SrcBE;	    
+	 end	    
+
 	 // capture the Numerator/Denominator	 
-	 flopenrc #(`XLEN) reg_num (.d(SrcAE), .q(N),
+	 flopenrc #(`XLEN) reg_num (.d(Num0), .q(N),
 				    .en(startDivideE), .clear(DivDoneE),
 				    .reset(reset),  .clk(~gclk));
-	 flopenrc #(`XLEN) reg_den (.d(SrcBE), .q(D),
+	 flopenrc #(`XLEN) reg_den (.d(Den0), .q(D),
 				    .en(startDivideE), .clear(DivDoneE),
 				    .reset(reset),  .clk(~gclk));
+	 
 	 assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]);	 
-	 div div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide);
+	 intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide);

 	 // Added for debugging of start signal for divide
 	 assign startDivideE = MulDivE&DivStartE&~DivBusyE;
@ -93,7 +103,6 @@ module muldiv (
 	 
 	 // Select result
 	 always_comb
-	   //           case (DivDoneE ? Funct3E_Q : Funct3E)
           case (Funct3E)	   
             3'b000: PrelimResultE = ProdE[`XLEN-1:0];
             3'b001: PrelimResultE = ProdE[`XLEN*2-1:`XLEN];
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -166,12 +166,12 @@ string tests32f[] = '{
    "rv64m/I-MULW-01", "3000",
    "rv64m/I-DIV-01", "3000",
    "rv64m/I-DIVU-01", "3000",
-    //"rv64m/I-DIVUW-01", "3000",
-    //"rv64m/I-DIVW-01", "3000",
+    "rv64m/I-DIVUW-01", "3000",
+    "rv64m/I-DIVW-01", "3000",
    "rv64m/I-REM-01", "3000",
-    "rv64m/I-REMU-01", "3000"
-    //"rv64m/I-REMUW-01", "3000",
-    //"rv64m/I-REMW-01", "3000"
+    "rv64m/I-REMU-01", "3000",
+    "rv64m/I-REMUW-01", "3000",
+    "rv64m/I-REMW-01", "3000"
  };

  string tests64ic[] = '{
@ -320,11 +320,11 @@ string tests32f[] = '{
    "rv32m/I-MUL-01", "2000",
    "rv32m/I-MULH-01", "2000",
    "rv32m/I-MULHSU-01", "2000",
-    "rv32m/I-MULHU-01", "2000"
-    //"rv32m/I-DIV-01", "2000",
-    //"rv32m/I-DIVU-01", "2000",
-    //"rv32m/I-REM-01", "2000",
-    //"rv32m/I-REMU-01", "2000"
+    "rv32m/I-MULHU-01", "2000",
+    "rv32m/I-DIV-01", "2000",
+    "rv32m/I-DIVU-01", "2000",
+    "rv32m/I-REM-01", "2000",
+    "rv32m/I-REMU-01", "2000"
  };

  string tests32ic[] = '{
@ -439,8 +439,11 @@ string tests32f[] = '{

  string testsBP64[] = '{
    "rv64BP/simple", "10000",
+    "rv64BP/mmm", "1000000",
+    "rv64BP/linpack_bench", "1000000",
+    "rv64BP/sieve", "1000000",
    "rv64BP/qsort", "1000000",
-    "rv64BP/sieve", "1000000"
+    "rv64BP/dhrystone", "1000000"
  };

  string tests64p[] = '{