From fec40a1b75f8a929760baa22ee8becabbb84850a Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Tue, 25 May 2021 14:26:22 -0500
Subject: [PATCH 01/19] fixed bug with icache miss spill fsm branch.

---
 .../src/ifu/globalHistoryPredictor.sv         | 78 +++++--------------
 wally-pipelined/src/ifu/icache.sv             | 22 ++++--
 2 files changed, 34 insertions(+), 66 deletions(-)

diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
index 087458df..b2357ecc 100644
--- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv
+++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
@@ -32,76 +32,34 @@ module globalHistoryPredictor
     )
   (input logic clk,
    input logic 		   reset,
-   input logic 		    StallF, StallD, StallE, FlushF, FlushD, FlushE,
+   input logic 		   StallF, StallD, StallE, FlushF, FlushD, FlushE,
    input logic [`XLEN-1:0] LookUpPC,
    output logic [1:0] 	   Prediction,
    // update
    input logic [`XLEN-1:0] UpdatePC,
    input logic 		   UpdateEN, PCSrcE, 
    input logic [1:0] 	   UpdatePrediction
-   
+  
    );
-   logic [k-1:0] GHRF, GHRFNext;
-   assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; 
+  logic [k-1:0] 	   GHRF, GHRFNext;
+  assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; 
 
-    flopenr #(k) GlobalHistoryRegister(.clk(clk),
-            .reset(reset),
-            .en(UpdateEN),
-            .d(GHRFNext),
-            .q(GHRF));
-
-
-
-  logic [1:0] 		   PredictionMemory;
-  logic 		   DoForwarding, DoForwardingF;
-  logic [1:0] 		   UpdatePredictionF;
- 
+  flopenr #(k) GlobalHistoryRegister(.clk(clk),
+				     .reset(reset),
+				     .en(UpdateEN),
+				     .d(GHRFNext),
+				     .q(GHRF));
 
   // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
-  // GHR referes to the address that the past k branches points to in the prediction stage 
-  // GHRE refers to the address that the past k branches points to in the exectution stage
-    SRAM2P1R1W #(k, 2) PHT(.clk(clk),
-				.reset(reset),
-				.RA1(GHRF),
-				.RD1(PredictionMemory),
-				.REN1(~StallF),
-				.WA1(GHRFNext),
-				.WD1(UpdatePrediction),
-				.WEN1(UpdateEN),
-				.BitWEN1(2'b11));
+  SRAM2P1R1W #(k, 2) PHT(.clk(clk),
+			 .reset(reset),
+			 .RA1(GHRF),
+			 .RD1(Prediction),
+			 .REN1(~StallF),
+			 .WA1(GHRF),
+			 .WD1(UpdatePrediction),
+			 .WEN1(UpdateEN),
+			 .BitWEN1(2'b11));
 
 
-  // need to forward when updating to the same address as reading.
-  // first we compare to see if the update and lookup addreses are the same
-  assign DoForwarding = GHRF == GHRFNext;
-
-  // register the update value and the forwarding signal into the Fetch stage
-  // TODO: add stall logic ***
-  flopr #(1) DoForwardingReg(.clk(clk),
-			     .reset(reset),
-			     .d(DoForwarding),
-			     .q(DoForwardingF));
-  
-  flopr #(2) UpdatePredictionReg(.clk(clk),
-				 .reset(reset),
-				 .d(UpdatePrediction),
-				 .q(UpdatePredictionF));
-
-  assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory;
-  
-  //pipeline for GHR
-  /*flopenrc #(k) GHRDReg(.clk(clk),
-      .reset(reset),
-      .en(~StallD),
-      .clear(FlushD),
-      .d(GHRF),
-      .q(GHRD));
-
-  flopenrc #(k) GHREReg(.clk(clk),
-        .reset(reset),
-        .en(~StallE),
-        .clear(FlushE),
-        .d(GHRD),
-        .q(GHRE));
-*/
 endmodule
diff --git a/wally-pipelined/src/ifu/icache.sv b/wally-pipelined/src/ifu/icache.sv
index 9e30a083..4f51edd7 100644
--- a/wally-pipelined/src/ifu/icache.sv
+++ b/wally-pipelined/src/ifu/icache.sv
@@ -154,15 +154,16 @@ module icachecontroller #(parameter LINESIZE = 256) (
   localparam STATE_MISS_SPILL_FETCH_DONE = 10; // write data into SRAM/LUT
   localparam STATE_MISS_SPILL_READ1 = 11; // read block 0 from SRAM/LUT
   localparam STATE_MISS_SPILL_2 = 12; // return to ready if hit or do second block update.
-  localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 13; // miss on block 1, issue read to AHB and wait
-  localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 14; // write data to SRAM/LUT
-  localparam STATE_MISS_SPILL_MERGE = 15; // read block 0 of CPU access,
+  localparam STATE_MISS_SPILL_2_START = 13; // return to ready if hit or do second block update.  
+  localparam STATE_MISS_SPILL_MISS_FETCH_WDV = 14; // miss on block 1, issue read to AHB and wait
+  localparam STATE_MISS_SPILL_MISS_FETCH_DONE = 15; // write data to SRAM/LUT
+  localparam STATE_MISS_SPILL_MERGE = 16; // read block 0 of CPU access,
 
-  localparam STATE_MISS_SPILL_FINAL = 16; // this state replicates STATE_READY's replay of the
+  localparam STATE_MISS_SPILL_FINAL = 17; // this state replicates STATE_READY's replay of the
   // spill access but does nto consider spill.  It also does not do another operation.
   
 
-  localparam STATE_INVALIDATE = 17; // *** not sure if invalidate or evict? invalidate by cache block or address?
+  localparam STATE_INVALIDATE = 18; // *** not sure if invalidate or evict? invalidate by cache block or address?
   
   localparam AHBByteLength = `XLEN / 8;
   localparam AHBOFFETWIDTH = $clog2(AHBByteLength);
@@ -380,11 +381,20 @@ module icachecontroller #(parameter LINESIZE = 256) (
 	PCMux = 2'b10;
 	UnalignedSelect = 1'b1;
 	spillSave = 1'b1; /// *** Could pipeline these to make it clearer in the fsm.
+	ICacheReadEn = 1'b1;
+	NextState = STATE_MISS_SPILL_2_START;
+      end
+      STATE_MISS_SPILL_2_START: begin
 	if (~hit) begin
 	  CntReset = 1'b1;
 	  NextState = STATE_MISS_SPILL_MISS_FETCH_WDV;
 	end else begin
-	  NextState = STATE_MISS_SPILL_FINAL;
+	  NextState = STATE_READY;
+	  ICacheReadEn = 1'b1;
+	  PCMux = 2'b00;
+	  UnalignedSelect = 1'b1;
+	  SavePC = 1'b1;
+	  ICacheStallF = 1'b0;	
 	end
       end
       STATE_MISS_SPILL_MISS_FETCH_WDV: begin

From 7e84c3f51481d788f0714ce6f128a4eff881290b Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Thu, 27 May 2021 11:48:29 -0500
Subject: [PATCH 02/19] Updated benchmarking code.

---
 testsBP/crt0/Makefile          |  4 ++--
 testsBP/crt0/start.s           |  7 +++----
 testsBP/mibench_qsort/Makefile |  2 +-
 testsBP/sieve/Makefile         |  2 +-
 testsBP/sieve/sieve.c          | 20 ++++++++++----------
 testsBP/simple/Makefile        |  2 +-
 testsBP/simple/header.h        |  1 +
 testsBP/simple/main.c          |  1 +
 8 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/testsBP/crt0/Makefile b/testsBP/crt0/Makefile
index ab47384f..b42e86cb 100644
--- a/testsBP/crt0/Makefile
+++ b/testsBP/crt0/Makefile
@@ -9,7 +9,7 @@ MABI            :=-mabi=lp64
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles
 
 AFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -W
-CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64  -mcmodel=medany 
+CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64  -mcmodel=medany  -O2
 AS=riscv64-unknown-elf-as
 CC=riscv64-unknown-elf-gcc
 AR=riscv64-unknown-elf-ar
@@ -19,7 +19,7 @@ all: libcrt0.a
 %.o: %.s
 	${AS} ${AFLAGS} -c $< -o $@
 
-libcrt0.a: start.o
+libcrt0.a: start.o pcnt_driver.o pre_main.o
 	${AR} -r $@ $^
 
 clean:
diff --git a/testsBP/crt0/start.s b/testsBP/crt0/start.s
index 19a240d8..731a61e3 100644
--- a/testsBP/crt0/start.s
+++ b/testsBP/crt0/start.s
@@ -43,11 +43,10 @@ _start:
 
 
 
-	# set the stack pointer to the top of memory
-	# 0x8000_0000 + 64K - 8 bytes
-	li sp, 0x007FFFF8
+	# set the stack pointer to the top of memory - 8 bytes (pointer size)
+	li sp, 0x07FFFFF8
 
-	jal ra, main
+	jal ra, pre_main
 	jal ra, _halt
 
 .section .text
diff --git a/testsBP/mibench_qsort/Makefile b/testsBP/mibench_qsort/Makefile
index f4d36839..b1cf7b67 100644
--- a/testsBP/mibench_qsort/Makefile
+++ b/testsBP/mibench_qsort/Makefile
@@ -8,7 +8,7 @@ MARCH           :=-march=rv64ic
 MABI            :=-mabi=lp64
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map
 
-CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align
+CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align -O2
 
 CC=riscv64-unknown-elf-gcc
 DA=riscv64-unknown-elf-objdump -d
diff --git a/testsBP/sieve/Makefile b/testsBP/sieve/Makefile
index 1d38d123..9c884f48 100644
--- a/testsBP/sieve/Makefile
+++ b/testsBP/sieve/Makefile
@@ -8,7 +8,7 @@ MARCH           :=-march=rv64ic
 MABI            :=-mabi=lp64
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map
 
-CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align
+CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align -O2
 
 CC=riscv64-unknown-elf-gcc
 DA=riscv64-unknown-elf-objdump -d
diff --git a/testsBP/sieve/sieve.c b/testsBP/sieve/sieve.c
index e8207404..f7d36d95 100644
--- a/testsBP/sieve/sieve.c
+++ b/testsBP/sieve/sieve.c
@@ -66,21 +66,21 @@ int main () {
     
   ans = sieve ();
   //gettimeofday(&after , NULL);
-  if (ans != 1899)
-    printf ("Sieve result wrong, ans = %d, expected 1899", ans);
+  /* /\* /\\* if (ans != 1899) *\\/ *\/ */
+  /* /\* /\\*   printf ("Sieve result wrong, ans = %d, expected 1899", ans); *\\/ *\/ */
 
-  //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) );
+  /* /\* //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); *\/ */
 
 
-  printf("Round 2\n");
-  //gettimeofday(&before , NULL);
+  /* /\* printf("Round 2\n"); *\/ */
+  /* //gettimeofday(&before , NULL); */
     
-  ans = sieve ();
-  //gettimeofday(&after , NULL);
-  if (ans != 1899)
-    printf ("Sieve result wrong, ans = %d, expected 1899", ans);
+  /* ans = sieve (); */
+  /* //gettimeofday(&after , NULL); */
+  /* if (ans != 1899) */
+  /*   printf ("Sieve result wrong, ans = %d, expected 1899", ans); */
 
-  //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) ); 
+  /* //printf("Total time elapsed : %.0lf us\n" , time_diff(before , after) );  */
   
   return 0;
 
diff --git a/testsBP/simple/Makefile b/testsBP/simple/Makefile
index 450aacaa..4447f284 100644
--- a/testsBP/simple/Makefile
+++ b/testsBP/simple/Makefile
@@ -8,7 +8,7 @@ MARCH           :=-march=rv64ic
 MABI            :=-mabi=lp64
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles -Wl,-Map=$(TARGET).map
 
-CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align
+CFLAGS =$(MARCH) $(MABI) -Wa,-alhs -Wa,-L -mcmodel=medany  -mstrict-align -O2
 CC=riscv64-unknown-elf-gcc
 DA=riscv64-unknown-elf-objdump -d
 
diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h
index bfe014a4..6def656f 100644
--- a/testsBP/simple/header.h
+++ b/testsBP/simple/header.h
@@ -5,4 +5,5 @@ int fail();
 int simple_csrbr_test();
 int lbu_test();
 int icache_spill_test();
+void global_hist_test();
 #endif
diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c
index 0d14fcfb..036a351d 100644
--- a/testsBP/simple/main.c
+++ b/testsBP/simple/main.c
@@ -2,6 +2,7 @@
 
 int main(){
   //int res = icache_spill_test();
+  global_hist_test();
   int res = 1;
   if (res < 0) {
     fail();

From 8a035104ac47678fc1de4fc1110511c5334233ae Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Thu, 27 May 2021 23:06:28 -0500
Subject: [PATCH 03/19] It's a bit sloppy, but the global history predictor is
 working correctly now. There were two major bugs with the predictor. First
 the update mechanism was completely wrong.  The PHT is updated with the GHR
 that was used to lookup the prediction.  PHT[GHR] = Sat2(PHT[GHR], branch
 outcome). Second the GHR needs to be updated speculatively as the branch is
 predicted.  This is important so that back to back branches' GHRs are not the
 same.  The must be different to avoid aliasing.  Speculation of the GHR
 update allows them to be different.  On mis prediction the GHR must be
 reverted. This implementation is a bit sloppy with names and now the GHR
 recovery is performed.  Updates to follow.

---
 wally-pipelined/config/rv64BP/wally-config.vh |  3 +-
 wally-pipelined/src/ifu/bpred.sv              |  9 ++--
 .../src/ifu/globalHistoryPredictor.sv         | 47 +++++++++++++++++--
 wally-pipelined/src/ifu/ifu.sv                |  9 +---
 4 files changed, 51 insertions(+), 17 deletions(-)

diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh
index 17a8c284..fd482bfd 100644
--- a/wally-pipelined/config/rv64BP/wally-config.vh
+++ b/wally-pipelined/config/rv64BP/wally-config.vh
@@ -110,5 +110,6 @@
 `define TWO_BIT_PRELOAD "../config/rv64icfd/twoBitPredictor.txt"
 `define BTB_PRELOAD "../config/rv64icfd/BTBPredictor.txt"
 `define BPRED_ENABLED 1
-`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE
+//`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE
+`define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE"  or BPLOCALPAg or BPGSHARE
 `define TESTSBP 1
diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv
index de0f8143..c5b4dde4 100644
--- a/wally-pipelined/src/ifu/bpred.sv
+++ b/wally-pipelined/src/ifu/bpred.sv
@@ -30,7 +30,8 @@
 
 module bpred 
   (input logic clk, reset,
-   input logic 		    StallF, StallD, StallE, FlushF, FlushD, FlushE,
+   input logic 		    StallF, StallD, StallE, StallM, StallW, 
+   input logic 		    FlushF, FlushD, FlushE, FlushM, FlushW,
    // Fetch stage
    // the prediction
    input logic [`XLEN-1:0]  PCNextF, // *** forgot to include this one on the I/O list
@@ -93,6 +94,8 @@ module bpred
 					  // update
 					  .UpdatePC(PCE),
 					  .UpdateEN(InstrClassE[0] & ~StallE),
+					  .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF),
+					  .BPPredDirWrongE(BPPredDirWrongE),
 					  .PCSrcE(PCSrcE),
 					  .UpdatePrediction(UpdateBPPredE));
     end else if (`BPTYPE == "BPGSHARE") begin:Predictor
@@ -190,14 +193,14 @@ module bpred
   flopenrc #(2) BPPredRegD(.clk(clk),
 			   .reset(reset),
 			   .en(~StallD),
-			   .clear(FlushD),
+			   .clear(1'b0),
 			   .d(BPPredF),
 			   .q(BPPredD));
 
   flopenrc #(2) BPPredRegE(.clk(clk),
 			   .reset(reset),
 			   .en(~StallE),
-			   .clear(FlushE),
+			   .clear(1'b0),
 			   .d(BPPredD),
 			   .q(BPPredE));
 
diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
index b2357ecc..fadbf004 100644
--- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv
+++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
@@ -37,29 +37,66 @@ module globalHistoryPredictor
    output logic [1:0] 	   Prediction,
    // update
    input logic [`XLEN-1:0] UpdatePC,
-   input logic 		   UpdateEN, PCSrcE, 
+   input logic 		   UpdateEN, PCSrcE,
+   input logic SpeculativeUpdateEn, BPPredDirWrongE,
    input logic [1:0] 	   UpdatePrediction
   
    );
-  logic [k-1:0] 	   GHRF, GHRFNext;
-  assign GHRFNext = {PCSrcE, GHRF[k-1:1]}; 
+  logic [k-1:0] 	   GHRF, GHRFNext, GHRD, GHRE, GHRLookup;
+
+  logic 		   FlushedD, FlushedE;
+  
+
+  // if the prediction is wrong we need to restore the ghr.
+  assign GHRFNext = BPPredDirWrongE ? {PCSrcE, GHRE[k-1:1]} : 
+		    {Prediction[1], GHRF[k-1:1]};
 
   flopenr #(k) GlobalHistoryRegister(.clk(clk),
 				     .reset(reset),
-				     .en(UpdateEN),
+				     .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)),
 				     .d(GHRFNext),
 				     .q(GHRF));
 
+  // if actively updating the GHR at the time of prediction we want to us
+  // GHRFNext as the lookup rather than GHRF.
+
+  assign GHRLookup = UpdateEN ? GHRFNext : GHRF;
+
   // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
   SRAM2P1R1W #(k, 2) PHT(.clk(clk),
 			 .reset(reset),
 			 .RA1(GHRF),
 			 .RD1(Prediction),
 			 .REN1(~StallF),
-			 .WA1(GHRF),
+			 .WA1(GHRE),
 			 .WD1(UpdatePrediction),
 			 .WEN1(UpdateEN),
 			 .BitWEN1(2'b11));
 
+  flopenr #(k) GlobalHistoryRegisterD(.clk(clk),
+				     .reset(reset),
+				     .en(~StallD & ~FlushedE),
+				     .d(GHRF),
+				     .q(GHRD));
+
+  flopenr #(k) GlobalHistoryRegisterE(.clk(clk),
+				     .reset(reset),
+				     .en(~StallE & ~ FlushedE),
+				     .d(GHRD),
+				     .q(GHRE));
+
+
+  flopenr #(1) flushedDReg(.clk(clk),
+			   .reset(reset),
+			   .en(~StallD),
+			   .d(FlushD),
+			   .q(FlushedD));
+
+  flopenr #(1) flushedEReg(.clk(clk),
+			   .reset(reset),
+			   .en(~StallE),
+			   .d(FlushE | FlushedD),
+			   .q(FlushedE));
+    
 
 endmodule
diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv
index 994288bd..0922f787 100644
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@@ -153,14 +153,7 @@ module ifu (
   generate 
     if (`BPRED_ENABLED == 1) begin : bpred
       // I am making the port connection explicit for now as I want to see them and they will be changing.
-      bpred bpred(.clk(clk),
-		  .reset(reset),
-		  .StallF(StallF),
-		  .StallD(StallD),
-		  .StallE(StallE),
-		  .FlushF(FlushF),
-		  .FlushD(FlushD),
-		  .FlushE(FlushE),
+      bpred bpred(.*,
 		  .PCNextF(PCNextF),
 		  .BPPredPCF(BPPredPCF),
 		  .SelBPPredF(SelBPPredF),

From 690815ca51d0ca325c710068f9a0824538f9d4b0 Mon Sep 17 00:00:00 2001
From: Kip Macsai-Goren <kipmacsaigoren@github.com>
Date: Fri, 28 May 2021 18:09:28 -0400
Subject: [PATCH 04/19] made priority encoder parameterizable

---
 wally-pipelined/src/mmu/priority_encoder.sv | 68 ++++++++-------------
 1 file changed, 27 insertions(+), 41 deletions(-)

diff --git a/wally-pipelined/src/mmu/priority_encoder.sv b/wally-pipelined/src/mmu/priority_encoder.sv
index e4a62ce1..dade2e83 100644
--- a/wally-pipelined/src/mmu/priority_encoder.sv
+++ b/wally-pipelined/src/mmu/priority_encoder.sv
@@ -4,7 +4,11 @@
 // Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021
 // Based on implementation from https://www.allaboutcircuits.com/ip-cores/communication-controller/priority-encoder/
 // *** Give proper LGPL attribution for above source
-// Modified:
+// Modified: Teo Ene 15 Apr 2021:
+//              Temporarily removed paramterized priority encoder for non-parameterized one
+//              To get synthesis working quickly
+//           Kmacsaigoren@hmc.edu 28 May 2021:
+//              Added working version of parameterized priority encoder. 
 //
 // Purpose: One-hot encoding to binary encoder
 //
@@ -27,51 +31,33 @@
 
 `include "wally-config.vh"
 
-// Teo Ene 04/15:
-// Temporarily removed paramterized priority encoder for non-parameterized one
-// To get synthesis working quickly
 module priority_encoder #(parameter BINARY_BITS = 3) (
-  input  logic  [7:0] one_hot,
-  output logic  [2:0] binary
+  input  logic  [2**BINARY_BITS - 1:0] one_hot,
+  output logic  [BINARY_BITS - 1:0] binary
 );
 
-  // localparam ONE_HOT_BITS = 2**BINARY_BITS;
-
-  /*
-  genvar i, j;
-  generate
-    for (i = 0; i < ONE_HOT_BITS; i++) begin
-      for (j = 0; j < BINARY_BITS; j++) begin
-        if (i[j]) begin
-          assign binary[j] = one_hot[i];
-        end
-      end
-    end
-  endgenerate
-  */
-
-  /*
-  logic [BINARY_BITS-1:0] binary_comb;
-
+  integer i;
   always_comb begin
-    binary_comb = 0;
-    for (int i = 0; i < ONE_HOT_BITS; i++)
-      if (one_hot[i]) binary_comb = i;
+    binary = 0;
+    for (i = 0; i < 2**BINARY_BITS; i++) begin
+      if (one_hot[i]) binary = i; // prioritizes the most significant bit
+    end
   end
+  // *** triple check synthesizability here
 
-  assign binary = binary_comb;
+  // Ideally this mimics the following:
+  /*
+  always_comb begin
+    casex (one_hot)
+      1xx ... x: binary = BINARY_BITS - 1;
+      01x ... x: binary = BINARY_BITS - 2;
+      001 ... x: binary = BINARY_BITS - 3;
+      
+      {...}
+
+      00 ... 1xx: binary = 2;
+      00 ... 01x: binary = 1;
+      00 ... 001: binary = 0;
+  end
   */
-  always_comb
-    case (one_hot)
-      8'h1:     binary=3'h0;
-      8'h2:     binary=3'h1;
-      8'h4:     binary=3'h2;
-      8'h8:     binary=3'h3;
-      8'h10:    binary=3'h4;
-      8'h20:    binary=3'h5;
-      8'h40:    binary=3'h6;
-      8'h80:    binary=3'h7;
-      default:  binary=3'h0; //should never happen
-    endcase
-
 endmodule

From 12c34c25f3f122c90b8a99f6fdb4590f721fe0d2 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 31 May 2021 08:36:19 -0400
Subject: [PATCH 05/19] Modify elements of generics for LZD and shifter wrote
 for integer divider.

---
 wally-pipelined/src/generic/lzd.sv   | 195 +++++++++++++++++++++++++++
 wally-pipelined/src/generic/lzd.sv~  | 195 +++++++++++++++++++++++++++
 wally-pipelined/src/generic/shift.sv |  76 +++++++++++
 wally-pipelined/src/muldiv/div.sv    | 146 +-------------------
 4 files changed, 471 insertions(+), 141 deletions(-)
 create mode 100755 wally-pipelined/src/generic/lzd.sv
 create mode 100755 wally-pipelined/src/generic/lzd.sv~
 create mode 100755 wally-pipelined/src/generic/shift.sv

diff --git a/wally-pipelined/src/generic/lzd.sv b/wally-pipelined/src/generic/lzd.sv
new file mode 100755
index 00000000..98642c15
--- /dev/null
+++ b/wally-pipelined/src/generic/lzd.sv
@@ -0,0 +1,195 @@
+///////////////////////////////////////////
+// lzd.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Integer Divide instructions
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+
+// Original idea came from  V. G. Oklobdzija, "An algorithmic and novel
+// design of a leading zero detector circuit: comparison with logic
+// synthesis," in IEEE Transactions on Very Large Scale Integration
+// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi:
+// 10.1109/92.273153.
+
+// Modified to be more hierarchical
+
+module lzd2 (P, V, B);
+
+   input logic  [1:0] B;
+
+   output logic P;
+   output logic V;
+
+   assign V = B[0] | B[1];
+   assign P = B[0] & ~B[1];
+   
+endmodule // lz2
+
+module lzd_hier #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]          B,
+    output logic [$clog2(WIDTH)-1:0] ZP,
+    output logic 		     ZV);
+
+   if (WIDTH == 128)
+     lzd128 lz127 (ZP, ZV, B);	      
+   else if (WIDTH == 64)
+     lzd64 lz64 (ZP, ZV, B);	   
+   else if (WIDTH == 32)
+     lzd32 lz32 (ZP, ZV, B);
+   else if (WIDTH == 16)
+     lzd16 lz16 (ZP, ZV, B);
+   else if (WIDTH == 8)
+     lzd8 lz8 (ZP, ZV, B);
+   else if (WIDTH == 4)
+     lzd4 lz4 (ZP, ZV, B);
+
+endmodule // lzd_hier
+
+module lzd4 (ZP, ZV, B);
+
+   input logic [3:0]  B;
+
+   logic  	       ZPa;
+   logic  	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [1:0]  ZP;
+   output logic        ZV;
+
+   lz2 l1(ZPa, ZVa, B[1:0]);
+   lz2 l2(ZPb, ZVb, B[3:2]);
+
+   assign ZP[0:0] = ZVb ? ZPb : ZPa;
+   assign ZP[1]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd4
+
+module lzd8 (ZP, ZV, B);
+
+   input logic [7:0]  B;
+
+   logic [1:0] 	       ZPa;
+   logic [1:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [2:0]  ZP;
+   output logic        ZV;
+
+   lz4 l1(ZPa, ZVa, B[3:0]);
+   lz4 l2(ZPb, ZVb, B[7:4]);
+
+   assign ZP[1:0] = ZVb ? ZPb : ZPa;
+   assign ZP[2]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd8
+
+module lzd16 (ZP, ZV, B);
+
+   input logic [15:0]  B;
+
+   logic [2:0] 	       ZPa;
+   logic [2:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [3:0]  ZP;
+   output logic        ZV;
+
+   lz8 l1(ZPa, ZVa, B[7:0]);
+   lz8 l2(ZPb, ZVb, B[15:8]);
+
+   assign ZP[2:0] = ZVb ? ZPb : ZPa;
+   assign ZP[3]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd16
+
+module lzd32 (ZP, ZV, B);
+
+   input logic [31:0] B;
+
+   logic [3:0] 	      ZPa;
+   logic [3:0] 	      ZPb;
+   logic 	      ZVa;
+   logic 	      ZVb;
+   
+   output logic [4:0] ZP;
+   output logic       ZV;
+   
+   lz16 l1(ZPa, ZVa, B[15:0]);
+   lz16 l2(ZPb, ZVb, B[31:16]);
+   
+   assign ZP[3:0] = ZVb ? ZPb : ZPa;
+   assign ZP[4]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd32
+
+module lzd64 (ZP, ZV, B);
+
+   input logic [63:0]  B;
+   
+   logic [4:0] 	       ZPa;
+   logic [4:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [5:0]  ZP;
+   output logic        ZV;
+   
+   lz32 l1(ZPa, ZVa, B[31:0]);
+   lz32 l2(ZPb, ZVb, B[63:32]);
+   
+   assign ZP[4:0] = ZVb ? ZPb : ZPa;
+   assign ZP[5]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd64
+
+module lzd128 (ZP, ZV, B);
+
+   input logic [127:0]  B;
+   
+   logic [5:0] 	       ZPa;
+   logic [5:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [6:0]  ZP;
+   output logic        ZV;
+   
+   lz64 l1(ZPa, ZVa, B[64:0]);
+   lz64 l2(ZPb, ZVb, B[127:63]);
+   
+   assign ZP[5:0] = ZVb ? ZPb : ZPa;
+   assign ZP[6]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd128
+
+/* verilator lint_on DECLFILENAME */
diff --git a/wally-pipelined/src/generic/lzd.sv~ b/wally-pipelined/src/generic/lzd.sv~
new file mode 100755
index 00000000..bfffe5e5
--- /dev/null
+++ b/wally-pipelined/src/generic/lzd.sv~
@@ -0,0 +1,195 @@
+///////////////////////////////////////////
+// lzd.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Integer Divide instructions
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+
+// Original idea came from  V. G. Oklobdzija, "An algorithmic and novel
+// design of a leading zero detector circuit: comparison with logic
+// synthesis," in IEEE Transactions on Very Large Scale Integration
+// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi:
+// 10.1109/92.273153.
+
+// Modified to be more hierarchical
+
+module lz2 (P, V, B);
+
+   input logic  [1:0] B;
+
+   output logic P;
+   output logic V;
+
+   assign V = B[0] | B[1];
+   assign P = B[0] & ~B[1];
+   
+endmodule // lz2
+
+module lzd_hier #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]          B,
+    output logic [$clog2(WIDTH)-1:0] ZP,
+    output logic 		     ZV);
+
+   if (WIDTH == 128)
+     lz128 lzd127 (ZP, ZV, B);	      
+   else if (WIDTH == 64)
+     lz64 lzd64 (ZP, ZV, B);	   
+   else if (WIDTH == 32)
+     lz32 lzd32 (ZP, ZV, B);
+   else if (WIDTH == 16)
+     lz16 lzd16 (ZP, ZV, B);
+   else if (WIDTH == 8)
+     lz8 lzd8 (ZP, ZV, B);
+   else if (WIDTH == 4)
+     lz4 lzd4 (ZP, ZV, B);
+
+endmodule // lzd_hier
+
+module lz4 (ZP, ZV, B);
+
+   input logic [3:0]  B;
+
+   logic  	       ZPa;
+   logic  	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [1:0]  ZP;
+   output logic        ZV;
+
+   lz2 l1(ZPa, ZVa, B[1:0]);
+   lz2 l2(ZPb, ZVb, B[3:2]);
+
+   assign ZP[0:0] = ZVb ? ZPb : ZPa;
+   assign ZP[1]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule 
+
+module lz8 (ZP, ZV, B);
+
+   input logic [7:0]  B;
+
+   logic [1:0] 	       ZPa;
+   logic [1:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [2:0]  ZP;
+   output logic        ZV;
+
+   lz4 l1(ZPa, ZVa, B[3:0]);
+   lz4 l2(ZPb, ZVb, B[7:4]);
+
+   assign ZP[1:0] = ZVb ? ZPb : ZPa;
+   assign ZP[2]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule 
+
+module lz16 (ZP, ZV, B);
+
+   input logic [15:0]  B;
+
+   logic [2:0] 	       ZPa;
+   logic [2:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [3:0]  ZP;
+   output logic        ZV;
+
+   lz8 l1(ZPa, ZVa, B[7:0]);
+   lz8 l2(ZPb, ZVb, B[15:8]);
+
+   assign ZP[2:0] = ZVb ? ZPb : ZPa;
+   assign ZP[3]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz16
+
+module lz32 (ZP, ZV, B);
+
+   input logic [31:0] B;
+
+   logic [3:0] 	      ZPa;
+   logic [3:0] 	      ZPb;
+   logic 	      ZVa;
+   logic 	      ZVb;
+   
+   output logic [4:0] ZP;
+   output logic       ZV;
+   
+   lz16 l1(ZPa, ZVa, B[15:0]);
+   lz16 l2(ZPb, ZVb, B[31:16]);
+   
+   assign ZP[3:0] = ZVb ? ZPb : ZPa;
+   assign ZP[4]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz32
+
+module lz64 (ZP, ZV, B);
+
+   input logic [63:0]  B;
+   
+   logic [4:0] 	       ZPa;
+   logic [4:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [5:0]  ZP;
+   output logic        ZV;
+   
+   lz32 l1(ZPa, ZVa, B[31:0]);
+   lz32 l2(ZPb, ZVb, B[63:32]);
+   
+   assign ZP[4:0] = ZVb ? ZPb : ZPa;
+   assign ZP[5]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz64
+
+module lz128 (ZP, ZV, B);
+
+   input logic [127:0]  B;
+   
+   logic [5:0] 	       ZPa;
+   logic [5:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [6:0]  ZP;
+   output logic        ZV;
+   
+   lz64 l1(ZPa, ZVa, B[64:0]);
+   lz64 l2(ZPb, ZVb, B[127:63]);
+   
+   assign ZP[5:0] = ZVb ? ZPb : ZPa;
+   assign ZP[6]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz128
+
+/* verilator lint_on DECLFILENAME */
diff --git a/wally-pipelined/src/generic/shift.sv b/wally-pipelined/src/generic/shift.sv
new file mode 100755
index 00000000..88152588
--- /dev/null
+++ b/wally-pipelined/src/generic/shift.sv
@@ -0,0 +1,76 @@
+///////////////////////////////////////////
+// shifters.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Integer Divide instructions
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+/* verilator lint_off UNOPTFLAT */
+
+module shift_right #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]         A,
+    input logic [$clog2(WIDTH)-1:0] Shift,
+    output logic [WIDTH-1:0] 	    Z);
+   
+   logic [WIDTH-1:0] 		    stage [$clog2(WIDTH):0];
+   logic 			    sign;   
+   genvar 			    i;
+
+   assign stage[0] = A;   
+   generate
+      for (i=0;i<$clog2(WIDTH);i=i+1)
+	begin : genbit
+	   mux2 #(WIDTH) mux_inst (stage[i], 
+				   {{(WIDTH/(2**(i+1))){1'b0}}, stage[i][WIDTH-1:WIDTH/(2**(i+1))]}, 
+				   Shift[$clog2(WIDTH)-i-1], 
+				   stage[i+1]);
+	end
+   endgenerate
+   assign Z = stage[$clog2(WIDTH)];   
+
+endmodule // shift_right
+
+module shift_left #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]         A,
+    input logic [$clog2(WIDTH)-1:0] Shift,
+    output logic [WIDTH-1:0] 	    Z);
+   
+   logic [WIDTH-1:0] 		    stage [$clog2(WIDTH):0];
+   genvar 			    i;
+   
+   assign stage[0] = A;   
+   generate
+      for (i=0;i<$clog2(WIDTH);i=i+1)
+	begin : genbit
+	   mux2 #(WIDTH) mux_inst (stage[i], 
+				   {stage[i][WIDTH-1-WIDTH/(2**(i+1)):0], {(WIDTH/(2**(i+1))){1'b0}}}, 
+				   Shift[$clog2(WIDTH)-i-1], 
+				   stage[i+1]);
+	end
+   endgenerate
+   assign Z = stage[$clog2(WIDTH)];   
+
+endmodule // shift_left
+
+/* verilator lint_on DECLFILENAME */
+/* verilator lint_on UNOPTFLAT */
diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index db830ca3..4266ae61 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -78,11 +78,7 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    assign D_NegOne = &D;
 
    // Divider goes the distance to 37 cycles
-   // (thanks the evil divisor for D = 0x1) 
-   // but could theoretically be stopped when
-   // divdone is asserted.  The enable signal
-   // turns off register storage thus invalidating
-   // any future cycles.
+   // (thanks to the evil divisor for D = 0x1) 
    
    // Shift D, if needed (for integer)
    // needed to allow qst to be in range for integer
@@ -93,8 +89,8 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    // exception is given to FSM to tell the operation to 
    // quit gracefully.
 
-   lz64 p1 (P, V, twoD);
-   shifter_l64 p2 (op2, twoD, P);
+   lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD));
+   shift_left #(64) p2 (twoD, P, op2);   
    assign op1 = twoN;
    assign div0 = ~V;
 
@@ -141,9 +137,8 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    assign Q = Qd2[63:0];
    assign Rem5 = Rd2[64:1];  
    
-   // Adjust remainder by m (no need to adjust by
-   // n ln(r)
-   shifter_r64 p4 (rem0, Rem5, RemShift);
+   // Adjust remainder by m 
+   shift_right #(64) p4 (Rem5, RemShift, rem0);   
 
    // Adjust Q/Rem for Signed
    assign tcQ = (SignN ^ SignD) & S;
@@ -368,8 +363,6 @@ module qst4 (input logic [6:0] s, input logic [2:0] d,
    
 endmodule // qst4
 
-// LZD
-
 module lz2 (P, V, B0, B1);
 
    input logic  B0;
@@ -497,7 +490,6 @@ module lz64 (ZP, ZV, B);
 endmodule // lz64
 
 // FSM Control for Integer Divider
-
 module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 	      start, error, NumIter, clk, reset);
 
@@ -1505,134 +1497,6 @@ module magcompare8 (LT, EQ, A, B);
 
 endmodule // magcompare8
 
-module shifter_l64 (Z, A, Shift);
-
-   input logic [63:0]  A;
-   input logic [5:0]   Shift;
-   
-   logic [63:0]        stage1;
-   logic [63:0]        stage2;
-   logic [63:0]        stage3;
-   logic [63:0]        stage4;
-   logic [63:0]        stage5;   
-   
-   output logic [63:0] Z;      
-   
-   mux2 #(64) mx01(A,      {A[31:0], 32'h0}, Shift[5], stage1);   
-   mux2 #(64) mx02(stage1, {stage1[47:0], 16'h0}, Shift[4], stage2);
-   mux2 #(64) mx03(stage2, {stage2[55:0], 8'h0}, Shift[3], stage3);
-   mux2 #(64) mx04(stage3, {stage3[59:0], 4'h0}, Shift[2], stage4);
-   mux2 #(64) mx05(stage4, {stage4[61:0], 2'h0}, Shift[1], stage5);
-   mux2 #(64) mx06(stage5, {stage5[62:0], 1'h0}, Shift[0], Z);
-
-endmodule // shifter_l64
-
-module shifter_r64 (Z, A, Shift);
-
-   input logic [63:0]  A;
-   input logic [5:0]   Shift;
-   
-   logic [63:0]        stage1;
-   logic [63:0]        stage2;
-   logic [63:0]        stage3;
-   logic [63:0]        stage4;
-   logic [63:0]        stage5;   		  
-   
-   output logic [63:0] Z;
-   
-   mux2 #(64) mx01(A, {32'h0, A[63:32]}, Shift[5], stage1);		  
-   mux2 #(64) mx02(stage1, {16'h0, stage1[63:16]}, Shift[4], stage2);
-   mux2 #(64) mx03(stage2, {8'h0, stage2[63:8]}, Shift[3], stage3);
-   mux2 #(64) mx04(stage3, {4'h0, stage3[63:4]}, Shift[2], stage4);
-   mux2 #(64) mx05(stage4, {2'h0, stage4[63:2]}, Shift[1], stage5);
-   mux2 #(64) mx06(stage5, {1'h0, stage5[63:1]},  Shift[0], Z);
-   
-endmodule // shifter_r64
-
-module shifter_l32 (Z, A, Shift);
-
-   input logic [31:0]  A;
-   input logic [4:0]   Shift;
-   
-   logic [31:0]        stage1;
-   logic [31:0]        stage2;
-   logic [31:0]        stage3;
-   logic [31:0]        stage4;
-   
-   output logic [31:0] Z;      
-
-   mux2 #(32) mx01(A,      {A[15:0], 16'h0},    Shift[4], stage1);
-   mux2 #(32) mx02(stage1, {stage1[23:0], 8'h0}, Shift[3], stage2);
-   mux2 #(32) mx03(stage2, {stage2[27:0], 4'h0},  Shift[2], stage3);
-   mux2 #(32) mx04(stage3, {stage3[29:0], 2'h0},   Shift[1], stage4);
-   mux2 #(32) mx05(stage4, {stage4[30:0], 1'h0},    Shift[0], Z);
-
-endmodule // shifter_l32
-
-module shifter_r32 (Z, A, Shift);
-
-   input logic [31:0]  A;
-   input logic [4:0]   Shift;
-   
-   logic [31:0]        stage1;
-   logic [31:0]        stage2;
-   logic [31:0]        stage3;
-   logic [31:0]        stage4;
-   
-   output logic [31:0] Z;
-   
-   mux2 #(32) mx01(A,      {16'h0, A[31:16]},   Shift[4], stage1);
-   mux2 #(32) mx02(stage1, {8'h0, stage1[31:8]}, Shift[3], stage2);
-   mux2 #(32) mx03(stage2, {4'h0, stage2[31:4]},  Shift[2], stage3);
-   mux2 #(32) mx04(stage3, {2'h0, stage3[31:2]},   Shift[1], stage4);
-   mux2 #(32) mx05(stage4, {1'h0, stage4[31:1]},    Shift[0], Z);
-   
-endmodule // shifter_r32
-
-module shift_right #(parameter WIDTH=8) 
-   (input logic [`XLEN-1:0]         A,
-    input logic [$clog2(`XLEN)-1:0] Shift,
-    output logic [`XLEN-1:0] 	    Z);
-   
-   logic [`XLEN-1:0] 							 stage [$clog2(`XLEN):0];
-   genvar 								 i;
-   
-   assign stage[0] = A;   
-   generate
-      for (i=0;i<$clog2(`XLEN);i=i+1)
-	begin : genbit
-	   mux2 #(`XLEN) mux_inst (stage[i], 
-				   {{(`XLEN/(2**(i+1))){1'b0}}, stage[i][`XLEN-1:`XLEN/(2**(i+1))]}, 
-				   Shift[$clog2(`XLEN)-i-1], 
-				   stage[i+1]);
-	end
-   endgenerate
-   assign Z = stage[$clog2(`XLEN)];   
-
-endmodule // shift_right
-
-module shift_left #(parameter WIDTH=8) 
-   (input logic [`XLEN-1:0]         A,
-    input logic [$clog2(`XLEN)-1:0] Shift,
-    output logic [`XLEN-1:0] 	    Z);
-   
-   logic [`XLEN-1:0] 							stage [$clog2(`XLEN):0];
-   genvar 								i;
-   
-   assign stage[0] = A;   
-   generate
-      for (i=0;i<$clog2(`XLEN);i=i+1)
-	begin : genbit
-	   mux2 #(`XLEN) mux_inst (stage[i], 
-				   {stage[i][`XLEN-1-`XLEN/(2**(i+1)):0], {(`XLEN/(2**(i+1))){1'b0}}}, 
-				   Shift[$clog2(`XLEN)-i-1], 
-				   stage[i+1]);
-	end
-   endgenerate
-   assign Z = stage[$clog2(`XLEN)];   
-
-endmodule // shift_right
-
 module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
 
    input logic [63:0] Q;

From 9954d16fc91017dae8df34f0b60f6ab188242708 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 31 May 2021 09:12:21 -0400
Subject: [PATCH 06/19] Add enhancements to integer divider including:   -
 better comments   - optimize FSM to end earlier   - passes for 32-bit or
 64-bit depending on parameter to intdiv

Left div.bak in just in case have to revert back to original for now.
---
 wally-pipelined/src/muldiv/div.bak   | 1560 ++++++++++++++++++++++++++
 wally-pipelined/src/muldiv/div.sv    |  614 ++++------
 wally-pipelined/src/muldiv/muldiv.sv |    3 +-
 3 files changed, 1773 insertions(+), 404 deletions(-)
 create mode 100755 wally-pipelined/src/muldiv/div.bak

diff --git a/wally-pipelined/src/muldiv/div.bak b/wally-pipelined/src/muldiv/div.bak
new file mode 100755
index 00000000..4266ae61
--- /dev/null
+++ b/wally-pipelined/src/muldiv/div.bak
@@ -0,0 +1,1560 @@
+///////////////////////////////////////////
+// mul.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Integer Divide instructions
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// *** <Thomas Fleming> I added these verilator controls to clean up the
+// lint output. The linter warnings should be fixed, but now the output is at
+// least readable.
+/* verilator lint_off COMBDLY */
+/* verilator lint_off IMPLICIT */
+
+`include "wally-config.vh"
+
+module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
+
+   input logic [63:0]  N, D;
+   input logic 	       clk;
+   input logic 	       reset;
+   input logic 	       start;
+   input logic 	       S;   
+   
+   output logic [63:0] Qf;
+   output logic [63:0] remf;
+   output logic        div0;
+   output logic        done;
+   output logic        divBusy;   
+
+   logic 	       divdone;   
+   logic 	       enable;
+   logic 	       state0;
+   logic 	       V;   
+   logic [7:0] 	       Num;
+   logic [5:0] 	       P, NumIter, RemShift;
+   logic [63:0]        op1, op2, op1shift, Rem5;
+   logic [64:0]        Qd, Rd, Qd2, Rd2;
+   logic [63:0]        Q, rem0;
+   logic [3:0] 	       quotient;
+   logic 	       otfzero; 
+   logic 	       shiftResult;
+   logic 	       enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp;
+
+   logic [63:0]        twoD;
+   logic [63:0]        twoN;
+   logic 	       SignD;
+   logic 	       SignN;
+   logic [63:0]        QT, remT;
+   logic 	       D_NegOne;
+   logic 	       Max_N;
+
+   // Check if negative (two's complement)
+   //   If so, convert to positive
+   adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD);
+   adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN);   
+   assign SignD = D[63];
+   assign SignN = N[63];   
+   // Max N and D = -1 (Overflow)
+   assign Max_N = (~|N[62:0]) & N[63];
+   assign D_NegOne = &D;
+
+   // Divider goes the distance to 37 cycles
+   // (thanks to the evil divisor for D = 0x1) 
+   
+   // Shift D, if needed (for integer)
+   // needed to allow qst to be in range for integer
+   // division [1,2) and allow integer divide to work.
+   //
+   // The V or valid bit can be used to determine if D
+   // is 0 and thus a divide by 0 exception.  This div0
+   // exception is given to FSM to tell the operation to 
+   // quit gracefully.
+
+   lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD));
+   shift_left #(64) p2 (twoD, P, op2);   
+   assign op1 = twoN;
+   assign div0 = ~V;
+
+   // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0)
+   // v = 2 since \rho < 1 (add 4 to make sure its a ceil)
+   adder #(8) cpa3 ({2'b0, P}, 
+		    {5'h0, shiftResult, ~shiftResult, 1'b0}, 
+		    Num);      
+   
+   // Determine whether need to add just Q/Rem
+   assign shiftResult = P[0];   
+   // div by 2 (ceil)
+   assign NumIter = Num[6:1];   
+   assign RemShift = P;
+
+   // FSM to control integer divider
+   //   assume inputs are postive edge and
+   //   datapath (divider) is negative edge
+   fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv,
+	       start, div0, NumIter, ~clk, reset);
+
+   flopr #(1) rega (~clk, reset, donev, done);
+   flopr #(1) regb (~clk, reset, divdonev, divdone);
+   flopr #(1) regc (~clk, reset, otfzerov, otfzero);
+   flopr #(1) regd (~clk, reset, enablev, enable);
+   flopr #(1) rege (~clk, reset, state0v, state0);
+   flopr #(1) regf (~clk, reset, divBusyv, divBusy);      
+   
+   // To obtain a correct remainder the last bit of the
+   // quotient has to be aligned with a radix-r boundary.
+   // Since the quotient is in the range 1/2 < q < 2 (one
+   // integer bit and m fractional bits), this is achieved by
+   // shifting N right by v+s so that (m+v+s) mod k = 0.  And,
+   // the quotient has to be aligned to the integer position.
+
+   divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, 
+		  enable, otfzero, shiftResult);
+
+   // Storage registers to hold contents stable
+   flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2);
+   flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2);         
+
+   // Probably not needed - just assigns results
+   assign Q = Qd2[63:0];
+   assign Rem5 = Rd2[64:1];  
+   
+   // Adjust remainder by m 
+   shift_right #(64) p4 (Rem5, RemShift, rem0);   
+
+   // Adjust Q/Rem for Signed
+   assign tcQ = (SignN ^ SignD) & S;
+   assign tcR = SignN & S;
+   // Signed Divide
+   // - When N and D are negative: Remainder is negative (undergoes a two's complement).
+   // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement).
+   // - When D is negative: Quotient is negative (undergoes a two's complement).
+   adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT);
+   adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT);         
+
+   // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec)
+   exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf);
+
+endmodule // int32div
+
+module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, 
+		   enable, otfzero, shiftResult); 
+
+   input logic [63:0]   op1, op2;
+   input logic 		clk, state0;
+   input logic 		reset;
+   input logic 		enable;
+   input logic 		otfzero;
+   input logic 		shiftResult;   
+   
+   output logic [64:0] 	rem0;
+   output logic [64:0] 	Q;
+   output logic [3:0] 	quotient;   
+
+   logic [67:0] 	Sum, Carry;   
+   logic [64:0] 	Qstar;   
+   logic [64:0] 	QMstar;   
+   logic [7:0] 		qtotal;   
+   logic [67:0] 	SumN, CarryN, SumN2, CarryN2;
+   logic [67:0] 	divi1, divi2, divi1c, divi2c, dive1;
+   logic [67:0] 	mdivi_temp, mdivi;   
+   logic 		zero;
+   logic [1:0] 		qsel;
+   logic [1:0] 		Qin, QMin;
+   logic 		CshiftQ, CshiftQM;
+   logic [67:0] 	rem1, rem2, rem3;
+   logic [67:0] 	SumR, CarryR;
+   logic [64:0] 	Qt;   
+
+   // Create one's complement values of Divisor (for q*D)
+   assign divi1 = {3'h0, op2, 1'b0};
+   assign divi2 = {2'h0, op2, 2'b0};
+   assign divi1c = ~divi1;
+   assign divi2c = ~divi2;
+   // Shift x1 if not mod k
+   mux2 #(68) mx1 ({3'b000, op1, 1'b0},  {4'h0, op1}, shiftResult, dive1);   
+
+   // I I I . F F F F F ... (Robertson Criteria - \rho * qmax * D)
+   mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN);
+   mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN);
+   // Simplify QST
+   adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal);   
+   // q = {+2, +1, -1, -2} else q = 0
+   qst4 pd1 (qtotal[7:1], divi1[63:61], quotient);
+   assign ulp = quotient[2]|quotient[3];
+   assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]);
+   // Map to binary encoding
+   assign qsel[1] = quotient[3]|quotient[2];
+   assign qsel[0] = quotient[3]|quotient[1];   
+   mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp);
+   mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi);
+   csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry);
+   // regs : save CSA
+   flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2);
+   flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2);
+   // OTF
+   ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM);   
+   otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, 
+		   otfzero, enable, Qstar, QMstar);
+
+   // Correction and generation of Remainder
+   adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1);
+   // Add back +D as correction
+   csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR);
+   adder #(68) cpa3 (SumR, CarryR, rem2);   
+   // Choose remainder (Rem or Rem+D)
+   mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3);
+   // Choose correct Q or QM
+   mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt);
+   // Final results
+   assign rem0 = rem3[64:0];
+   assign Q = Qt;   
+   
+endmodule // divide4x64
+
+module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM);
+
+   input logic [3:0] quot;
+
+   output logic [1:0] Qin;
+   output logic [1:0] QMin;
+   output logic       CshiftQ;
+   output logic       CshiftQM;
+
+   // Load/Store Control for OTF
+   assign Qin[1] = (quot[1]) | (quot[3]) | (quot[0]);
+   assign Qin[0] = (quot[1]) | (quot[2]);
+   assign QMin[1] = (quot[1]) | (!quot[3]&!quot[2]&!quot[1]&!quot[0]);
+   assign QMin[0] = (quot[3]) | (quot[0]) | 
+		    (!quot[3]&!quot[2]&!quot[1]&!quot[0]);
+   assign CshiftQ = (quot[1]) | (quot[0]);
+   assign CshiftQM = (quot[3]) | (quot[2]);   
+
+endmodule 
+
+// On-the-fly Conversion per Ercegovac/Lang
+
+module otf #(parameter WIDTH=8) 
+   (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q);
+   
+   input logic [1:0]        Qin, QMin;
+   input logic 		    CshiftQ, CshiftQM;   
+   input logic 		    clk;
+   input logic 	            reset;
+   input logic 		    enable;   
+
+   output logic [WIDTH-1:0] R2Q;
+   output logic [WIDTH-1:0] R1Q;   
+
+   logic [WIDTH-1:0] 	    Qstar, QMstar;      
+   logic [WIDTH-1:0] 	    M1Q, M2Q;
+   
+   // QM
+   mux2 #(WIDTH)  m1 (QMstar, Qstar, CshiftQM, M1Q);
+   flopenr #(WIDTH) r1 (clk, reset, enable, {M1Q[WIDTH-3:0], QMin}, R1Q);
+   // Q
+   mux2 #(WIDTH)  m2 (Qstar, QMstar, CshiftQ, M2Q);
+   flopenr #(WIDTH) r2 (clk, reset, enable, {M2Q[WIDTH-3:0], Qin}, R2Q);
+   
+   assign Qstar = R2Q;
+   assign QMstar = R1Q;
+
+endmodule // otf8
+
+module adder #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b,
+				   output logic [WIDTH-1:0] y);
+
+   assign y = a + b;
+
+endmodule // adder
+
+module fa (input logic a, b, c, output logic sum, carry);
+
+   assign sum = a^b^c;
+   assign carry = a&b|a&c|b&c;   
+
+endmodule // fa
+
+module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c,
+				 output logic [WIDTH-1:0] sum, carry);
+
+   logic [WIDTH:0] 					  carry_temp;   
+   genvar 						  i;
+   generate
+      for (i=0;i<WIDTH;i=i+1)
+	begin : genbit
+	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
+	end
+   endgenerate
+   //assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     // trmimmed excess bit dh 5/3/21
+   assign carry = {carry_temp[WIDTH-1:1], 1'b0};     
+
+endmodule // adder
+
+module eqcmp #(parameter WIDTH = 8)
+   (input  logic [WIDTH-1:0] a, b,
+    output logic y);
+   
+   assign y = (a == b);
+   
+endmodule // eqcmp
+
+module qst4 (input logic [6:0] s, input logic [2:0] d,
+	     output logic [3:0] q);
+   
+   
+   assign q[3] = (!s[6]&s[5]) | (!d[2]&!s[6]&s[4]) | (!s[6]&s[4]&s[3]) | 
+		 (!d[1]&!s[6]&s[4]&s[2]) | (!d[0]&!s[6]&s[4]&s[2]) | 
+		 (!d[1]&!d[0]&!s[6]&s[4]&s[1]) | 
+		 (!d[2]&!d[1]&!d[0]&!s[6]&s[3]&s[2]) | 
+		 (!d[2]&!d[1]&!s[6]&s[3]&s[2]&s[1]) | 
+		 (!d[2]&!d[0]&!s[6]&s[3]&s[2]&s[1]&s[0]);
+   
+   assign q[2] = (d[2]&!s[6]&!s[5]&!s[4]&s[3]) | 
+		 (!s[6]&!s[5]&!s[4]&s[3]&!s[2]) | 
+		 (!d[2]&!s[6]&!s[5]&!s[4]&!s[3]&s[2]) | 
+		 (d[2]&d[1]&d[0]&!s[6]&!s[5]&s[4]&!s[3]) | 
+		 (d[2]&d[1]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]) | 
+		 (d[2]&d[0]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]) | 
+		 (d[2]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]&!s[1]) | 
+		 (!d[2]&d[1]&d[0]&!s[6]&!s[5]&!s[4]&s[2]) | 
+		 (!d[1]&!s[6]&!s[5]&!s[4]&!s[3]&s[2]&s[1]) | 
+		 (!d[2]&d[1]&!s[6]&!s[5]&!s[4]&s[2]&!s[1]) | 
+		 (!d[2]&d[0]&!s[6]&!s[5]&!s[4]&s[2]&!s[1]) | 
+		 (!d[2]&d[1]&!s[6]&!s[5]&!s[4]&s[2]&!s[0]);
+   
+   assign q[1] = (d[2]&s[6]&s[5]&s[4]&!s[3]) | 
+		 (d[1]&s[6]&s[5]&s[4]&!s[3]) | (s[6]&s[5]&s[4]&!s[3]&s[2]) | 
+		 (d[2]&s[6]&s[5]&!s[4]&s[3]&s[2]) | 
+		 (d[0]&s[6]&s[5]&s[4]&!s[3]&s[1]) | 
+		 (d[2]&d[1]&d[0]&s[6]&s[5]&!s[4]&s[3]) | 
+		 (d[2]&d[1]&s[6]&s[5]&!s[4]&s[3]&s[1]) | 
+		 (!d[2]&s[6]&s[5]&s[4]&s[3]&!s[2]&!s[1]) | 
+		 (!d[2]&!d[1]&!d[0]&s[6]&s[5]&s[4]&s[3]&!s[2]) | 
+		 (d[1]&d[0]&s[6]&s[5]&!s[4]&s[3]&s[2]&s[1]) | 
+		 (!d[2]&d[0]&s[6]&s[5]&s[4]&!s[2]&!s[1]&s[0]) | 
+		 (!d[2]&!d[1]&!d[0]&s[6]&s[5]&s[4]&!s[2]&s[1]&s[0]);
+   
+   assign q[0] = (s[6]&!s[5]) | (s[6]&!s[4]&!s[3]) | 
+		 (!d[2]&!d[1]&s[6]&!s[4]) | (!d[2]&!d[0]&s[6]&!s[4]) | 
+		 (!d[2]&s[6]&!s[4]&!s[2]) | (!d[1]&s[6]&!s[4]&!s[2]) | 
+		 (!d[2]&s[6]&!s[4]&!s[1]) | (!d[0]&s[6]&!s[4]&!s[2]&!s[1]) | 
+		 (!d[2]&!d[1]&!d[0]&s[6]&!s[3]&!s[2]&!s[1]) | 
+		 (!d[2]&!d[1]&!d[0]&s[6]&!s[3]&!s[2]&!s[0]) | 
+		 (!d[2]&!d[1]&s[6]&!s[3]&!s[2]&!s[1]&!s[0]);
+   
+endmodule // qst4
+
+module lz2 (P, V, B0, B1);
+
+   input logic  B0;
+   input logic 	B1;
+
+   output logic P;
+   output logic V;
+
+   assign V = B0 | B1;
+   assign P = B0 & ~B1;
+   
+endmodule // lz2
+
+module lz4 (ZP, ZV, B0, B1, V0, V1);
+   
+   input logic        B0;
+   input logic        B1;
+   input logic        V0;
+   input logic        V1;
+   
+   output logic [1:0] ZP;
+   output logic       ZV;
+   
+   assign ZP[0] = V0 ? B0 : B1;
+   assign ZP[1] = ~V0;
+   assign ZV = V0 | V1;
+
+endmodule // lz4
+
+module lz8 (ZP, ZV, B);
+   
+   input logic [7:0]  B;
+
+   logic 	      s1p0;
+   logic 	      s1v0;
+   logic 	      s1p1;
+   logic 	      s1v1;
+   logic 	      s2p0;
+   logic 	      s2v0;
+   logic 	      s2p1;
+   logic 	      s2v1;
+   logic [1:0] 	      ZPa;
+   logic [1:0] 	      ZPb;
+   logic 	      ZVa;
+   logic 	      ZVb;
+   
+   output logic [2:0] ZP;
+   output logic       ZV;
+   
+   lz2 l1(s1p0, s1v0, B[2], B[3]);
+   lz2 l2(s1p1, s1v1, B[0], B[1]);
+   lz4 l3(ZPa, ZVa, s1p0, s1p1, s1v0, s1v1);
+
+   lz2 l4(s2p0, s2v0, B[6], B[7]);
+   lz2 l5(s2p1, s2v1, B[4], B[5]);
+   lz4 l6(ZPb, ZVb, s2p0, s2p1, s2v0, s2v1);
+
+   assign ZP[1:0] = ZVb ? ZPb : ZPa;
+   assign ZP[2]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz8
+
+module lz16 (ZP, ZV, B);
+
+   input logic [15:0]  B;
+
+   logic [2:0] 	       ZPa;
+   logic [2:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [3:0]  ZP;
+   output logic        ZV;
+
+   lz8 l1(ZPa, ZVa, B[7:0]);
+   lz8 l2(ZPb, ZVb, B[15:8]);
+
+   assign ZP[2:0] = ZVb ? ZPb : ZPa;
+   assign ZP[3]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz16
+
+module lz32 (ZP, ZV, B);
+
+   input logic [31:0] B;
+
+   logic [3:0] 	      ZPa;
+   logic [3:0] 	      ZPb;
+   logic 	      ZVa;
+   logic 	      ZVb;
+   
+   output logic [4:0] ZP;
+   output logic       ZV;
+   
+   lz16 l1(ZPa, ZVa, B[15:0]);
+   lz16 l2(ZPb, ZVb, B[31:16]);
+   
+   assign ZP[3:0] = ZVb ? ZPb : ZPa;
+   assign ZP[4]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz32
+
+module lz64 (ZP, ZV, B);
+
+   input logic [63:0]  B;
+   
+   logic [4:0] 	       ZPa;
+   logic [4:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [5:0]  ZP;
+   output logic        ZV;
+   
+   lz32 l1(ZPa, ZVa, B[31:0]);
+   lz32 l2(ZPb, ZVb, B[63:32]);
+   
+   assign ZP[4:0] = ZVb ? ZPb : ZPa;
+   assign ZP[5]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz64
+
+// FSM Control for Integer Divider
+module fsm64 (en, state0, done, divdone, otfzero, divBusy,
+	      start, error, NumIter, clk, reset);
+
+   input logic [5:0]  NumIter;   
+   input logic 	      clk;
+   input logic 	      reset;
+   input logic 	      start;
+   input logic 	      error;   
+   
+   output logic       done;      
+   output logic       en;
+   output logic       state0;
+   output logic       divdone;
+   output logic       otfzero;
+   output logic       divBusy;   
+   
+   logic 	      LT, EQ;
+   logic 	      Divide0;   
+   logic [5:0] 	      CURRENT_STATE;
+   logic [5:0] 	      NEXT_STATE;   
+   
+   parameter [5:0] 
+     S0=6'd0, S1=6'd1, S2=6'd2,
+     S3=6'd3, S4=6'd4, S5=6'd5,
+     S6=6'd6, S7=6'd7, S8=6'd8,
+     S9=6'd9, S10=6'd10, S11=6'd11,
+     S12=6'd12, S13=6'd13, S14=6'd14,
+     S15=6'd15, S16=6'd16, S17=6'd17,
+     S18=6'd18, S19=6'd19, S20=6'd20,
+     S21=6'd21, S22=6'd22, S23=6'd23,
+     S24=6'd24, S25=6'd25, S26=6'd26,
+     S27=6'd27, S28=6'd28, S29=6'd29,
+     S30=6'd30, S31=6'd31, S32=6'd32,
+     S33=6'd33, S34=6'd34, S35=6'd35,
+     S36=6'd36, Done=6'd37;      
+   
+   always @(posedge clk)
+     begin
+	if(reset==1'b1)
+	  CURRENT_STATE<=S0;
+	else
+	  CURRENT_STATE<=NEXT_STATE;
+     end
+
+   // Going to cheat and hard code number of states 
+   // needed into FSM instead of using a counter
+   // FIXME: could counter be better
+
+   // Cheated and made 8 - let synthesis do its magic
+   magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {2'h0, NumIter});
+
+   always @(CURRENT_STATE or start)
+     begin
+ 	case(CURRENT_STATE)
+	  S0:
+	    begin
+	       if (start==1'b0)
+		 begin
+		    otfzero = 1'b1;   
+		    en = 1'b0;
+		    divBusy = 1'b0;		    
+		    state0 = 1'b0;
+		    divdone = 1'b0;		    
+		    done = 1'b0;
+		    NEXT_STATE <= S0;
+		 end 
+	       else 
+		 begin
+		    otfzero = 1'b0;	       		    
+		    en = 1'b1;
+		    divBusy = 1'b1;		    		    
+		    state0 = 1'b1;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		    
+		    done = 1'b0;
+		    divdone = 1'b0;		 		 
+		    NEXT_STATE <= S1;
+		 end 
+	    end	    
+	  S1:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S2;
+		 end
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S2;
+		 end		    
+	    end // case: S1	  
+	  S2:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S3;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S3;
+		 end		    	       	       
+	    end // case: S2
+	  S3:
+	    begin	       
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S4;
+		 end 
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S4;
+		 end		    	       
+	    end // case: S3
+	  S4:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S5;
+		 end 	       	    
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S5;
+		 end		       	       
+	    end // case: S4
+	  S5:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S6;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S6;
+		 end		    	       	       	       
+	    end // case: S5
+	  S6:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S7;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S7;
+		 end		    	       	       
+	    end // case: S6
+	  S7:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S8;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S8;
+		 end		    	       	       
+	    end // case: S7
+	  S8:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S9;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S9;
+		 end		    	       	       
+	    end // case: S8
+	  S9:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S10;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S10;
+		 end		    	       	       
+	    end // case: S9
+	  S10:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S11;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S11;
+		 end		    	       	       
+	    end // case: S10
+	  S11:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S12;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S12;
+		 end		    	       	       
+	    end // case: S11
+	  S12:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S13;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S13;
+		 end		    	       	       
+	    end // case: S12
+	  S13:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S14;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S14;
+		 end		    	       	       
+	    end // case: S13
+	  S14:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S15;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S15;
+		 end		    	       	       
+	    end // case: S14
+	  S15:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S16;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S16;
+		 end		    	       	       
+	    end // case: S15
+	  S16:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S17;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S17;
+		 end		    	       	       
+	    end // case: S16
+	  S17:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S18;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S18;
+		 end		    	       	       
+	    end // case: S17
+	  S18:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S19;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S19;
+		 end		    	       	       
+	    end // case: S18
+	  S19:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S20;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S20;
+		 end		    	       	       
+	    end // case: S19
+	  S20:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S21;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S21;
+		 end		    	       	       
+	    end // case: S20
+	  S21:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S22;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S22;
+		 end		    	       	       
+	    end // case: S21
+	  S22:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;
+		    NEXT_STATE <= S23;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S23;
+		 end		    	       	       
+	    end // case: S22
+	  S23:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S24;		    
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S24;
+		 end		    	       	       
+	    end // case: S23 
+	  S24:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S25;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S25;
+		 end		    	       	       
+	    end // case: S24
+	  S25:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S26;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S26;
+		 end		    	       	       
+	    end // case: S25
+	  S26:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S27;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S27;
+		 end		    	       	       
+	    end // case: S26
+	  S27:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S28;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S28;
+		 end		    	       	       
+	    end // case: S27
+	  S28:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S29;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S29;
+		 end		    	       	       
+	    end // case: S28
+	  S29:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S30;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S30;
+		 end		    	       	       
+	    end // case: S29
+	  S30:
+	    begin
+	       otfzero = 1'b0;
+     	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S31;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S31;
+		 end		    	       	       
+	    end // case: S30
+	  S31:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S32;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S32;
+		 end		    	       	       
+	    end // case: S31  
+	  S32:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S33;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S33;
+		 end		    	       	       
+	    end // case: S32
+	  S33:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S34;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S34;
+		 end		    	       	       
+	    end // case: S33
+	  S34:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S35;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S35;
+		 end		    	       	       
+	    end // case: S34  	  
+	  S35:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S36;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S36;
+		 end		    	       	       
+	    end // case: S35	  
+	  S36:
+	    begin
+	       otfzero = 1'b1;
+	       divBusy = 1'b1;	       
+	       state0 = 1'b0;
+	       done = 1'b1;
+	       if (EQ)
+		 begin
+		    divdone = 1'b1;
+		    en = 1'b1;
+		 end
+	       else
+		 begin
+		    divdone = 1'b0;
+		    en = 1'b0;
+		 end
+	       NEXT_STATE <= S0;
+	    end // case: S36
+	  default: 
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       en = 1'b0;
+	       state0 = 1'b0;
+	       done = 1'b0;
+	       divdone = 1'b0;
+	       NEXT_STATE <= S0;
+	    end
+	endcase // case(CURRENT_STATE)	
+     end // always @ (CURRENT_STATE or X)   
+
+endmodule // fsm64
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1'if A > B. LT and GT are both '0' if A = B.
+
+module magcompare2b (LT, GT, A, B);
+
+   input logic [1:0] A;
+   input logic [1:0] B;
+   
+   output logic      LT;
+   output logic      GT;
+   
+   // Determine if A < B  using a minimized sum-of-products expression
+   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+   // Determine if A > B  using a minimized sum-of-products expression
+   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
+
+endmodule // magcompare2b
+
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+module magcompare8 (LT, EQ, A, B);
+
+   input logic [7:0]  A;
+   input logic [7:0]  B;
+   
+   logic [3:0] 	      s;
+   logic [3:0] 	      t;
+   logic [1:0] 	      u;
+   logic [1:0] 	      v;
+   logic 	      GT;
+   //wire 	LT;   
+   
+   output logic       EQ;
+   output logic       LT;   
+   
+   magcompare2b mag1 (s[0], t[0], A[1:0], B[1:0]);
+   magcompare2b mag2 (s[1], t[1], A[3:2], B[3:2]);
+   magcompare2b mag3 (s[2], t[2], A[5:4], B[5:4]);
+   magcompare2b mag4 (s[3], t[3], A[7:6], B[7:6]);
+   
+   magcompare2b mag5 (u[0], v[0], t[1:0], s[1:0]);
+   magcompare2b mag6 (u[1], v[1], t[3:2], s[3:2]);
+
+   magcompare2b mag7 (LT, GT, v[1:0], u[1:0]);
+   
+   assign EQ = ~(GT | LT);   
+
+endmodule // magcompare8
+
+module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
+
+   input logic [63:0] Q;
+   input logic [63:0] rem;
+   input logic [63:0] op1;      
+   input logic 	      S;
+   input logic 	      div0;
+   input logic 	      Max_N;
+   input logic 	      D_NegOne;
+   
+   output logic [63:0] Qf;
+   output logic [63:0] remf;
+
+   // Needs to be optimized
+   always_comb
+     case ({div0, S, Max_N, D_NegOne})
+       4'b0000 : Qf = Q;
+       4'b0001 : Qf = Q;
+       4'b0010 : Qf = Q;              
+       4'b0011 : Qf = Q;              
+       4'b0100 : Qf = Q;
+       4'b0101 : Qf = Q;
+       4'b0110 : Qf = Q;       
+       4'b0111 : Qf = {1'b1, 31'h0};
+       4'b1000 : Qf = {64{1'b1}};
+       4'b1001 : Qf = {64{1'b1}};
+       4'b1010 : Qf = {64{1'b1}};
+       4'b1011 : Qf = {64{1'b1}};              
+       4'b1100 : Qf = {64{1'b1}};
+       4'b1101 : Qf = {64{1'b1}};       
+       4'b1110 : Qf = {64{1'b1}};       
+       4'b1111 : Qf = {64{1'b1}};              
+       default: Qf = Q;       
+     endcase 
+
+   always_comb
+     case ({div0, S, Max_N, D_NegOne})
+       4'b0000 : remf = rem;
+       4'b0001 : remf = rem;
+       4'b0010 : remf = rem;
+       4'b0011 : remf = rem;
+       4'b0100 : remf = rem;
+       4'b0101 : remf = rem;
+       4'b0110 : remf = rem;
+       4'b0111 : remf = 64'h0;     
+       4'b1000 : remf = op1;
+       4'b1001 : remf = op1;
+       4'b1010 : remf = op1;
+       4'b1011 : remf = op1;       
+       4'b1100 : remf = op1;
+       4'b1101 : remf = op1;
+       4'b1110 : remf = op1;       
+       4'b1111 : remf = op1;              
+       default: remf = rem;
+     endcase 
+
+endmodule // exception_int
+
+/* verilator lint_on COMBDLY */
+/* verilator lint_on IMPLICIT */
+
diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index 4266ae61..107b002f 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -1,5 +1,5 @@
 ///////////////////////////////////////////
-// mul.sv
+// divide4x64.sv
 //
 // Written: James.Stine@okstate.edu 1 February 2021
 // Modified: 
@@ -29,54 +29,53 @@
 /* verilator lint_off COMBDLY */
 /* verilator lint_off IMPLICIT */
 
-`include "wally-config.vh"
+module intdiv #(parameter WIDTH=64) 
+   (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
 
-module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
-
-   input logic [63:0]  N, D;
-   input logic 	       clk;
-   input logic 	       reset;
-   input logic 	       start;
-   input logic 	       S;   
+   input logic [WIDTH-1:0]   N, D;
+   input logic 		     clk;
+   input logic 		     reset;
+   input logic 		     start;
+   input logic 		     S;   
+   
+   output logic [WIDTH-1:0]  Qf;
+   output logic [WIDTH-1:0]  remf;
+   output logic 	     div0;
+   output logic 	     done;
+   output logic 	     divBusy;   
+   
+   logic 		     enable;
+   logic 		     state0;
+   logic 		     V;   
+   logic [$clog2(WIDTH):0]   Num;
+   logic [$clog2(WIDTH)-1:0] P, NumIter, RemShift;
+   logic [WIDTH-1:0] 	     op1, op2, op1shift, Rem5;
+   logic [WIDTH:0] 	     Qd, Rd, Qd2, Rd2;
+   logic [WIDTH-1:0] 	     Q, rem0;
+   logic [3:0] 		     quotient;
+   logic 		     otfzero; 
+   logic 		     shiftResult;
+   logic 		     enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp;   
+   
+   logic [WIDTH-1:0] 	     twoD;
+   logic [WIDTH-1:0] 	     twoN;
+   logic 		     SignD;
+   logic 		     SignN;
+   logic [WIDTH-1:0] 	     QT, remT;
+   logic 		     D_NegOne;
+   logic 		     Max_N;      
    
-   output logic [63:0] Qf;
-   output logic [63:0] remf;
-   output logic        div0;
-   output logic        done;
-   output logic        divBusy;   
-
-   logic 	       divdone;   
-   logic 	       enable;
-   logic 	       state0;
-   logic 	       V;   
-   logic [7:0] 	       Num;
-   logic [5:0] 	       P, NumIter, RemShift;
-   logic [63:0]        op1, op2, op1shift, Rem5;
-   logic [64:0]        Qd, Rd, Qd2, Rd2;
-   logic [63:0]        Q, rem0;
-   logic [3:0] 	       quotient;
-   logic 	       otfzero; 
-   logic 	       shiftResult;
-   logic 	       enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp;
-
-   logic [63:0]        twoD;
-   logic [63:0]        twoN;
-   logic 	       SignD;
-   logic 	       SignN;
-   logic [63:0]        QT, remT;
-   logic 	       D_NegOne;
-   logic 	       Max_N;
 
    // Check if negative (two's complement)
    //   If so, convert to positive
-   adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD);
-   adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN);   
-   assign SignD = D[63];
-   assign SignN = N[63];   
+   adder #(WIDTH) cpa1 ((D ^ {WIDTH{D[WIDTH-1]&S}}), {{WIDTH-1{1'b0}}, D[WIDTH-1]&S}, twoD);
+   adder #(WIDTH) cpa2 ((N ^ {WIDTH{N[WIDTH-1]&S}}), {{WIDTH-1{1'b0}}, N[WIDTH-1]&S}, twoN);   
+   assign SignD = D[WIDTH-1];
+   assign SignN = N[WIDTH-1];   
    // Max N and D = -1 (Overflow)
-   assign Max_N = (~|N[62:0]) & N[63];
+   assign Max_N = (~|N[WIDTH-2:0]) & N[WIDTH-1];
    assign D_NegOne = &D;
-
+   
    // Divider goes the distance to 37 cycles
    // (thanks to the evil divisor for D = 0x1) 
    
@@ -89,31 +88,31 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    // exception is given to FSM to tell the operation to 
    // quit gracefully.
 
-   lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD));
-   shift_left #(64) p2 (twoD, P, op2);   
-   assign op1 = twoN;
+   lzd_hier #(WIDTH) p1 (.ZP(P), .ZV(V), .B(twoD));
+   shift_left #(WIDTH) p2 (twoD, P, op2);
+   assign op1 = twoN;   
    assign div0 = ~V;
 
-   // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0)
+   // #iter: N = m+v+s = m+2+s (mod k = 0)
    // v = 2 since \rho < 1 (add 4 to make sure its a ceil)
-   adder #(8) cpa3 ({2'b0, P}, 
-		    {5'h0, shiftResult, ~shiftResult, 1'b0}, 
-		    Num);      
+   // k = 2 (r = 2^k)
+   adder #($clog2(WIDTH)+1) cpa3 ({1'b0, P}, 
+				  {{$clog2(WIDTH)+1-3{1'b0}}, shiftResult, ~shiftResult, 1'b0}, 
+				  Num);      
    
    // Determine whether need to add just Q/Rem
    assign shiftResult = P[0];   
    // div by 2 (ceil)
-   assign NumIter = Num[6:1];   
+   assign NumIter = Num[$clog2(WIDTH):1];   
    assign RemShift = P;
 
    // FSM to control integer divider
    //   assume inputs are postive edge and
    //   datapath (divider) is negative edge
-   fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv,
-	       start, div0, NumIter, ~clk, reset);
+   fsm64 #($clog2(WIDTH)) fsm1 (enablev, state0v, donev, otfzerov, divBusyv,
+				start, div0, NumIter, ~clk, reset);
 
    flopr #(1) rega (~clk, reset, donev, done);
-   flopr #(1) regb (~clk, reset, divdonev, divdone);
    flopr #(1) regc (~clk, reset, otfzerov, otfzero);
    flopr #(1) regd (~clk, reset, enablev, enable);
    flopr #(1) rege (~clk, reset, state0v, state0);
@@ -125,64 +124,66 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    // integer bit and m fractional bits), this is achieved by
    // shifting N right by v+s so that (m+v+s) mod k = 0.  And,
    // the quotient has to be aligned to the integer position.
-
-   divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, 
-		  enable, otfzero, shiftResult);
+   divide4 #(WIDTH) p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, 
+			enable, otfzero, shiftResult);
 
    // Storage registers to hold contents stable
-   flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2);
-   flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2);         
+   flopenr #(WIDTH+1) reg3 (clk, reset, enable, Rd, Rd2);
+   flopenr #(WIDTH+1) reg4 (clk, reset, enable, Qd, Qd2);         
 
    // Probably not needed - just assigns results
-   assign Q = Qd2[63:0];
-   assign Rem5 = Rd2[64:1];  
+   assign Q = Qd2[WIDTH-1:0];
+   assign Rem5 = Rd2[WIDTH:1];  
    
-   // Adjust remainder by m 
-   shift_right #(64) p4 (Rem5, RemShift, rem0);   
+   // Adjust remainder by m (no need to adjust by
+   shift_right #(WIDTH) p4 (Rem5, RemShift, rem0);
 
    // Adjust Q/Rem for Signed
    assign tcQ = (SignN ^ SignD) & S;
    assign tcR = SignN & S;
-   // Signed Divide
+
+   // When Dividend (N) and/or Divisor (D) are negative (first bit is '1'):
    // - When N and D are negative: Remainder is negative (undergoes a two's complement).
    // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement).
    // - When D is negative: Quotient is negative (undergoes a two's complement).
-   adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT);
-   adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT);         
+   adder #(WIDTH) cpa4 ((rem0 ^ {WIDTH{tcR}}), {{WIDTH-1{1'b0}}, tcR}, remT);
+   adder #(WIDTH) cpa5 ((Q ^ {WIDTH{tcQ}}), {{WIDTH-1{1'b0}}, tcQ}, QT);         
 
    // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec)
-   exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf);
-
+   exception_int #(WIDTH) exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf);
+   
 endmodule // int32div
 
-module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, 
-		   enable, otfzero, shiftResult); 
+// Division by Recurrence (r=4)
+module divide4 #(parameter WIDTH=64) 
+   (Q, rem0, quotient, op1, op2, clk, reset, state0, 
+    enable, otfzero, shiftResult); 
 
-   input logic [63:0]   op1, op2;
-   input logic 		clk, state0;
-   input logic 		reset;
-   input logic 		enable;
-   input logic 		otfzero;
-   input logic 		shiftResult;   
+   input logic [WIDTH-1:0]   op1, op2;
+   input logic 		     clk, state0;
+   input logic 		     reset;
+   input logic 		     enable;
+   input logic 		     otfzero;
+   input logic 		     shiftResult;   
    
-   output logic [64:0] 	rem0;
-   output logic [64:0] 	Q;
-   output logic [3:0] 	quotient;   
+   output logic [WIDTH:0]    rem0;
+   output logic [WIDTH:0]    Q;
+   output logic [3:0] 	     quotient;   
 
-   logic [67:0] 	Sum, Carry;   
-   logic [64:0] 	Qstar;   
-   logic [64:0] 	QMstar;   
-   logic [7:0] 		qtotal;   
-   logic [67:0] 	SumN, CarryN, SumN2, CarryN2;
-   logic [67:0] 	divi1, divi2, divi1c, divi2c, dive1;
-   logic [67:0] 	mdivi_temp, mdivi;   
-   logic 		zero;
-   logic [1:0] 		qsel;
-   logic [1:0] 		Qin, QMin;
-   logic 		CshiftQ, CshiftQM;
-   logic [67:0] 	rem1, rem2, rem3;
-   logic [67:0] 	SumR, CarryR;
-   logic [64:0] 	Qt;   
+   logic [WIDTH+3:0] 	     Sum, Carry;   
+   logic [WIDTH:0] 	     Qstar;   
+   logic [WIDTH:0] 	     QMstar;   
+   logic [7:0] 		     qtotal;   
+   logic [WIDTH+3:0] 	     SumN, CarryN, SumN2, CarryN2;
+   logic [WIDTH+3:0] 	     divi1, divi2, divi1c, divi2c, dive1;
+   logic [WIDTH+3:0] 	     mdivi_temp, mdivi;   
+   logic 		     zero;
+   logic [1:0] 		     qsel;
+   logic [1:0] 		     Qin, QMin;
+   logic 		     CshiftQ, CshiftQM;
+   logic [WIDTH+3:0] 	     rem1, rem2, rem3;
+   logic [WIDTH+3:0] 	     SumR, CarryR;
+   logic [WIDTH:0] 	     Qt;   
 
    // Create one's complement values of Divisor (for q*D)
    assign divi1 = {3'h0, op2, 1'b0};
@@ -190,42 +191,42 @@ module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0,
    assign divi1c = ~divi1;
    assign divi2c = ~divi2;
    // Shift x1 if not mod k
-   mux2 #(68) mx1 ({3'b000, op1, 1'b0},  {4'h0, op1}, shiftResult, dive1);   
+   mux2 #(WIDTH+4) mx1 ({3'b000, op1, 1'b0},  {4'h0, op1}, shiftResult, dive1);   
 
    // I I I . F F F F F ... (Robertson Criteria - \rho * qmax * D)
-   mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN);
-   mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN);
+   mux2 #(WIDTH+4) mx2 ({CarryN2[WIDTH+1:0], 2'h0}, {WIDTH+4{1'b0}}, state0, CarryN);
+   mux2 #(WIDTH+4) mx3 ({SumN2[WIDTH+1:0], 2'h0}, dive1, state0, SumN);
    // Simplify QST
-   adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal);   
+   adder #(8) cpa1 (SumN[WIDTH+3:WIDTH-4], CarryN[WIDTH+3:WIDTH-4], qtotal);   
    // q = {+2, +1, -1, -2} else q = 0
-   qst4 pd1 (qtotal[7:1], divi1[63:61], quotient);
+   qst4 pd1 (qtotal[7:1], divi1[WIDTH-1:WIDTH-3], quotient);
    assign ulp = quotient[2]|quotient[3];
    assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]);
    // Map to binary encoding
    assign qsel[1] = quotient[3]|quotient[2];
    assign qsel[0] = quotient[3]|quotient[1];   
-   mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp);
-   mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi);
-   csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry);
+   mux4 #(WIDTH+4) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp);
+   mux2 #(WIDTH+4) mx5 (mdivi_temp, {WIDTH+4{1'b0}}, zero, mdivi);
+   csa #(WIDTH+4) csa1 (mdivi, SumN, {CarryN[WIDTH+3:1], ulp}, Sum, Carry);
    // regs : save CSA
-   flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2);
-   flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2);
+   flopenr #(WIDTH+4) reg1 (clk, reset, enable, Sum, SumN2);
+   flopenr #(WIDTH+4) reg2 (clk, reset, enable, Carry, CarryN2);
    // OTF
    ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM);   
-   otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, 
-		   otfzero, enable, Qstar, QMstar);
+   otf #(WIDTH+1) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, 
+			otfzero, enable, Qstar, QMstar);
 
    // Correction and generation of Remainder
-   adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1);
+   adder #(WIDTH+4) cpa2 (SumN2[WIDTH+3:0], CarryN2[WIDTH+3:0], rem1);
    // Add back +D as correction
-   csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR);
-   adder #(68) cpa3 (SumR, CarryR, rem2);   
+   csa #(WIDTH+4) csa2 (CarryN2[WIDTH+3:0], SumN2[WIDTH+3:0], divi1, SumR, CarryR);
+   adder #(WIDTH+4) cpa3 (SumR, CarryR, rem2);   
    // Choose remainder (Rem or Rem+D)
-   mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3);
+   mux2 #(WIDTH+4) mx6 (rem1, rem2, rem1[WIDTH+3], rem3);
    // Choose correct Q or QM
-   mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt);
+   mux2 #(WIDTH+1) mx7 (Qstar, QMstar, rem1[WIDTH+3], Qt);
    // Final results
-   assign rem0 = rem3[64:0];
+   assign rem0 = rem3[WIDTH:0];
    assign Q = Qt;   
    
 endmodule // divide4x64
@@ -304,10 +305,9 @@ module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c,
 	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
 	end
    endgenerate
-   //assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     // trmimmed excess bit dh 5/3/21
-   assign carry = {carry_temp[WIDTH-1:1], 1'b0};     
+   assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     
 
-endmodule // adder
+endmodule // csa
 
 module eqcmp #(parameter WIDTH = 8)
    (input  logic [WIDTH-1:0] a, b,
@@ -490,26 +490,24 @@ module lz64 (ZP, ZV, B);
 endmodule // lz64
 
 // FSM Control for Integer Divider
-module fsm64 (en, state0, done, divdone, otfzero, divBusy,
-	      start, error, NumIter, clk, reset);
+module fsm64 #(parameter WIDTH=6)
+  (en, state0, done, otfzero, divBusy, start, error, NumIter, clk, reset);
 
-   input logic [5:0]  NumIter;   
-   input logic 	      clk;
-   input logic 	      reset;
-   input logic 	      start;
-   input logic 	      error;   
+   input logic [WIDTH-1:0]  NumIter;   
+   input logic 		    clk;
+   input logic 		    reset;
+   input logic 		    start;
+   input logic 		    error;   
    
-   output logic       done;      
-   output logic       en;
-   output logic       state0;
-   output logic       divdone;
-   output logic       otfzero;
-   output logic       divBusy;   
+   output logic 	    done;      
+   output logic 	    en;
+   output logic 	    state0;
+   output logic 	    otfzero;
+   output logic 	    divBusy;   
    
-   logic 	      LT, EQ;
-   logic 	      Divide0;   
-   logic [5:0] 	      CURRENT_STATE;
-   logic [5:0] 	      NEXT_STATE;   
+   logic 		    LT, EQ;
+   logic [5:0] 		    CURRENT_STATE;
+   logic [5:0] 		    NEXT_STATE;   
    
    parameter [5:0] 
      S0=6'd0, S1=6'd1, S2=6'd2,
@@ -534,12 +532,8 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 	  CURRENT_STATE<=NEXT_STATE;
      end
 
-   // Going to cheat and hard code number of states 
-   // needed into FSM instead of using a counter
-   // FIXME: could counter be better
-
    // Cheated and made 8 - let synthesis do its magic
-   magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {2'h0, NumIter});
+   magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {{8-WIDTH{1'b0}}, NumIter});
 
    always @(CURRENT_STATE or start)
      begin
@@ -552,7 +546,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    divBusy = 1'b0;		    
 		    state0 = 1'b0;
-		    divdone = 1'b0;		    
 		    done = 1'b0;
 		    NEXT_STATE <= S0;
 		 end 
@@ -560,30 +553,21 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		 begin
 		    otfzero = 1'b0;	       		    
 		    en = 1'b1;
-		    divBusy = 1'b1;		    		    
+		    divBusy = 1'b1;		    
 		    state0 = 1'b1;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		    
 		    done = 1'b0;
-		    divdone = 1'b0;		 		 
 		    NEXT_STATE <= S1;
 		 end 
 	    end	    
 	  S1:
 	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
+	       otfzero = 1'b0;	   
+	       divBusy = 1'b1;
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S2;
 		 end
 	       else
@@ -591,8 +575,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S2;
+		    NEXT_STATE <= S36;
 		 end		    
 	    end // case: S1	  
 	  S2:
@@ -604,10 +587,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S3;
 		 end // if (LT|EQ)
 	       else
@@ -615,8 +594,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S3;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S2
 	  S3:
@@ -628,10 +606,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S4;
 		 end 
 	       else
@@ -639,8 +613,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S4;
+		    NEXT_STATE <= S36;
 		 end		    	       
 	    end // case: S3
 	  S4:
@@ -652,10 +625,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S5;
 		 end 	       	    
 	       else
@@ -663,8 +632,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S5;
+		    NEXT_STATE <= S36;
 		 end		       	       
 	    end // case: S4
 	  S5:
@@ -676,10 +644,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S6;
 		 end // if (LT|EQ)
 	       else
@@ -687,8 +651,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S6;
+		    NEXT_STATE <= S36;
 		 end		    	       	       	       
 	    end // case: S5
 	  S6:
@@ -700,10 +663,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S7;
 		 end // if (LT|EQ)
 	       else
@@ -711,8 +670,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S7;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S6
 	  S7:
@@ -724,10 +682,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S8;
 		 end // if (LT|EQ)
 	       else
@@ -735,8 +689,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S8;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S7
 	  S8:
@@ -748,10 +701,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S9;
 		 end // if (LT|EQ)
 	       else
@@ -759,8 +708,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S9;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S8
 	  S9:
@@ -772,10 +720,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S10;
 		 end // if (LT|EQ)
 	       else
@@ -783,8 +727,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S10;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S9
 	  S10:
@@ -796,10 +739,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S11;
 		 end // if (LT|EQ)
 	       else
@@ -807,8 +746,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S11;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S10
 	  S11:
@@ -820,10 +758,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S12;
 		 end // if (LT|EQ)
 	       else
@@ -831,8 +765,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S12;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S11
 	  S12:
@@ -844,10 +777,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S13;
 		 end // if (LT|EQ)
 	       else
@@ -855,8 +784,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S13;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S12
 	  S13:
@@ -868,10 +796,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S14;
 		 end // if (LT|EQ)
 	       else
@@ -879,23 +803,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S14;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S13
 	  S14:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S15;
 		 end // if (LT|EQ)
 	       else
@@ -903,23 +822,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S15;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S14
 	  S15:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S16;
 		 end // if (LT|EQ)
 	       else
@@ -927,23 +841,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S16;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S15
 	  S16:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S17;
 		 end // if (LT|EQ)
 	       else
@@ -951,23 +860,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S17;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S16
 	  S17:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S18;
 		 end // if (LT|EQ)
 	       else
@@ -975,23 +879,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S18;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S17
 	  S18:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S19;
 		 end // if (LT|EQ)
 	       else
@@ -999,23 +898,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S19;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S18
 	  S19:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S20;
 		 end // if (LT|EQ)
 	       else
@@ -1023,23 +917,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S20;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S19
 	  S20:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S21;
 		 end // if (LT|EQ)
 	       else
@@ -1047,23 +936,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S21;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S20
 	  S21:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S22;
 		 end // if (LT|EQ)
 	       else
@@ -1071,23 +955,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S22;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S21
 	  S22:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;
 		    NEXT_STATE <= S23;
 		 end // if (LT|EQ)
 	       else
@@ -1095,23 +974,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S23;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S22
 	  S23:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S24;		    
 		 end // if (LT|EQ)
 	       else
@@ -1119,23 +993,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S24;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S23 
 	  S24:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S25;
 		 end // if (LT|EQ)
 	       else
@@ -1143,23 +1012,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S25;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S24
 	  S25:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S26;
 		 end // if (LT|EQ)
 	       else
@@ -1167,23 +1031,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S26;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S25
 	  S26:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S27;
 		 end // if (LT|EQ)
 	       else
@@ -1191,23 +1050,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S27;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S26
 	  S27:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S28;
 		 end // if (LT|EQ)
 	       else
@@ -1215,23 +1069,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S28;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S27
 	  S28:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S29;
 		 end // if (LT|EQ)
 	       else
@@ -1239,23 +1088,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S29;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S28
 	  S29:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S30;
 		 end // if (LT|EQ)
 	       else
@@ -1263,23 +1107,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S30;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S29
 	  S30:
 	    begin
 	       otfzero = 1'b0;
-     	       divBusy = 1'b1;	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S31;
 		 end // if (LT|EQ)
 	       else
@@ -1287,8 +1126,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S31;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S30
 	  S31:
@@ -1300,10 +1138,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S32;
 		 end // if (LT|EQ)
 	       else
@@ -1311,8 +1145,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S32;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S31  
 	  S32:
@@ -1324,10 +1157,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S33;
 		 end // if (LT|EQ)
 	       else
@@ -1335,8 +1164,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S33;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S32
 	  S33:
@@ -1348,10 +1176,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S34;
 		 end // if (LT|EQ)
 	       else
@@ -1359,23 +1183,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S34;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S33
 	  S34:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
+	       divBusy = 1'b1;
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S35;
 		 end // if (LT|EQ)
 	       else
@@ -1383,8 +1202,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S35;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S34  	  
 	  S35:
@@ -1396,10 +1214,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S36;
 		 end // if (LT|EQ)
 	       else
@@ -1407,7 +1221,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
 		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S35	  
@@ -1419,12 +1232,10 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 	       done = 1'b1;
 	       if (EQ)
 		 begin
-		    divdone = 1'b1;
 		    en = 1'b1;
 		 end
 	       else
 		 begin
-		    divdone = 1'b0;
 		    en = 1'b0;
 		 end
 	       NEXT_STATE <= S0;
@@ -1432,11 +1243,10 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 	  default: 
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
+	       divBusy = 1'b0;	       
 	       en = 1'b0;
 	       state0 = 1'b0;
 	       done = 1'b0;
-	       divdone = 1'b0;
 	       NEXT_STATE <= S0;
 	    end
 	endcase // case(CURRENT_STATE)	
@@ -1497,38 +1307,39 @@ module magcompare8 (LT, EQ, A, B);
 
 endmodule // magcompare8
 
-module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
+// RISC-V Exception Logic for Divide by 0 and Overflow (Signed Integer Divide)
+module exception_int #(parameter WIDTH=8) 
+   (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
 
-   input logic [63:0] Q;
-   input logic [63:0] rem;
-   input logic [63:0] op1;      
-   input logic 	      S;
-   input logic 	      div0;
-   input logic 	      Max_N;
-   input logic 	      D_NegOne;
+   input logic [WIDTH-1:0] Q;
+   input logic [WIDTH-1:0] rem;
+   input logic [WIDTH-1:0] op1;      
+   input logic 		   S;
+   input logic 		   div0;
+   input logic 		   Max_N;
+   input logic 		   D_NegOne;
    
-   output logic [63:0] Qf;
-   output logic [63:0] remf;
+   output logic [WIDTH-1:0] Qf;
+   output logic [WIDTH-1:0] remf;
 
-   // Needs to be optimized
    always_comb
      case ({div0, S, Max_N, D_NegOne})
        4'b0000 : Qf = Q;
        4'b0001 : Qf = Q;
-       4'b0010 : Qf = Q;              
-       4'b0011 : Qf = Q;              
+       4'b0010 : Qf = Q;       
+       4'b0011 : Qf = Q;
        4'b0100 : Qf = Q;
-       4'b0101 : Qf = Q;
+       4'b0101 : Qf = Q;       
        4'b0110 : Qf = Q;       
-       4'b0111 : Qf = {1'b1, 31'h0};
-       4'b1000 : Qf = {64{1'b1}};
-       4'b1001 : Qf = {64{1'b1}};
-       4'b1010 : Qf = {64{1'b1}};
-       4'b1011 : Qf = {64{1'b1}};              
-       4'b1100 : Qf = {64{1'b1}};
-       4'b1101 : Qf = {64{1'b1}};       
-       4'b1110 : Qf = {64{1'b1}};       
-       4'b1111 : Qf = {64{1'b1}};              
+       4'b0111 : Qf = {1'b1, {WIDTH-1{1'h0}}};       
+       4'b1000 : Qf = {WIDTH{1'b1}};
+       4'b1001 : Qf = {WIDTH{1'b1}};
+       4'b1010 : Qf = {WIDTH{1'b1}};
+       4'b1011 : Qf = {WIDTH{1'b1}};       
+       4'b1100 : Qf = {WIDTH{1'b1}};
+       4'b1101 : Qf = {WIDTH{1'b1}};
+       4'b1110 : Qf = {WIDTH{1'b1}};
+       4'b1111 : Qf = {WIDTH{1'b1}};       
        default: Qf = Q;       
      endcase 
 
@@ -1536,18 +1347,18 @@ module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
      case ({div0, S, Max_N, D_NegOne})
        4'b0000 : remf = rem;
        4'b0001 : remf = rem;
-       4'b0010 : remf = rem;
+       4'b0010 : remf = rem;       
        4'b0011 : remf = rem;
        4'b0100 : remf = rem;
        4'b0101 : remf = rem;
        4'b0110 : remf = rem;
-       4'b0111 : remf = 64'h0;     
+       4'b0111 : remf = {WIDTH{1'h0}};
        4'b1000 : remf = op1;
        4'b1001 : remf = op1;
        4'b1010 : remf = op1;
        4'b1011 : remf = op1;       
        4'b1100 : remf = op1;
-       4'b1101 : remf = op1;
+       4'b1101 : remf = op1;       
        4'b1110 : remf = op1;       
        4'b1111 : remf = op1;              
        default: remf = rem;
@@ -1557,4 +1368,3 @@ endmodule // exception_int
 
 /* verilator lint_on COMBDLY */
 /* verilator lint_on IMPLICIT */
-
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index 17c4aac5..f4096fd1 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -78,7 +78,7 @@ module muldiv (
 				    .en(startDivideE), .clear(DivDoneE),
 				    .reset(reset),  .clk(~gclk));	 
 	 assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]);	 
-	 div div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide);
+	 intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide);
 
 	 // Added for debugging of start signal for divide
 	 assign startDivideE = MulDivE&DivStartE&~DivBusyE;
@@ -93,7 +93,6 @@ module muldiv (
 	 
 	 // Select result
 	 always_comb
-	   //           case (DivDoneE ? Funct3E_Q : Funct3E)
            case (Funct3E)	   
              3'b000: PrelimResultE = ProdE[`XLEN-1:0];
              3'b001: PrelimResultE = ProdE[`XLEN*2-1:`XLEN];

From 46a232b862249262e91fd0241c48f7b662bac599 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 31 May 2021 09:16:30 -0400
Subject: [PATCH 07/19] Cosmetic changes on integer divider

---
 wally-pipelined/src/muldiv/div.sv    | 7 ++++---
 wally-pipelined/src/muldiv/muldiv.sv | 1 -
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index 107b002f..8b4e0463 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -55,7 +55,7 @@ module intdiv #(parameter WIDTH=64)
    logic [3:0] 		     quotient;
    logic 		     otfzero; 
    logic 		     shiftResult;
-   logic 		     enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp;   
+   logic 		     enablev, state0v, donev, oftzerov, divBusyv, ulp;   
    
    logic [WIDTH-1:0] 	     twoD;
    logic [WIDTH-1:0] 	     twoN;
@@ -231,6 +231,7 @@ module divide4 #(parameter WIDTH=64)
    
 endmodule // divide4x64
 
+// Load/Control for OTFC
 module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM);
 
    input logic [3:0] quot;
@@ -251,8 +252,7 @@ module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM);
 
 endmodule 
 
-// On-the-fly Conversion per Ercegovac/Lang
-
+// On-the-fly Conversion (OTFC)
 module otf #(parameter WIDTH=8) 
    (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q);
    
@@ -317,6 +317,7 @@ module eqcmp #(parameter WIDTH = 8)
    
 endmodule // eqcmp
 
+// QST for r=4
 module qst4 (input logic [6:0] s, input logic [2:0] d,
 	     output logic [3:0] q);
    
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index f4096fd1..ccabe341 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -47,7 +47,6 @@ module muldiv (
 	 logic [`XLEN-1:0] MulDivResultE, MulDivResultM;
 	 logic [`XLEN-1:0] PrelimResultE;
 	 logic [`XLEN-1:0] QuotE, RemE;
-	 //logic [`XLEN-1:0] Q, R;	 
 	 logic [`XLEN*2-1:0] ProdE; 
 
 	 logic 		     enable_q;	 

From f6c88666cfc8dbeebfd34db85b5282636c361690 Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Mon, 31 May 2021 16:11:12 -0500
Subject: [PATCH 08/19] may have fixed the global branch history predictor. The
 solution required a completed rewrite and understanding of how the GHR needs
 to be speculatively updated and repaired.

---
 testsBP/simple/header.h                       |  4 +-
 testsBP/simple/main.c                         |  4 +-
 wally-pipelined/src/ifu/bpred.sv              | 11 ++-
 .../src/ifu/globalHistoryPredictor.sv         | 98 +++++++++++++++----
 wally-pipelined/src/ifu/gshare.sv             | 41 +++++++-
 5 files changed, 128 insertions(+), 30 deletions(-)

diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h
index 6def656f..f3a62da3 100644
--- a/testsBP/simple/header.h
+++ b/testsBP/simple/header.h
@@ -5,5 +5,7 @@ int fail();
 int simple_csrbr_test();
 int lbu_test();
 int icache_spill_test();
-void global_hist_test();
+void global_hist_1_space_test();
+void global_hist_2_space_test();
+void global_hist_3_space_test();
 #endif
diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c
index 036a351d..7bf6b475 100644
--- a/testsBP/simple/main.c
+++ b/testsBP/simple/main.c
@@ -2,7 +2,9 @@
 
 int main(){
   //int res = icache_spill_test();
-  global_hist_test();
+  global_hist_3_space_test();  
+  global_hist_2_space_test();
+  global_hist_1_space_test();  
   int res = 1;
   if (res < 0) {
     fail();
diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv
index c5b4dde4..9beaa959 100644
--- a/wally-pipelined/src/ifu/bpred.sv
+++ b/wally-pipelined/src/ifu/bpred.sv
@@ -90,12 +90,13 @@ module bpred
 					  .reset(reset),
 					  .*, // Stalls and flushes
 					  .LookUpPC(PCNextF),
-					  .Prediction(BPPredF),
+					  .BPPredF(BPPredF),
 					  // update
-					  .UpdatePC(PCE),
-					  .UpdateEN(InstrClassE[0] & ~StallE),
-					  .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF),
+					  .BPPredD(BPPredD),
+					  .InstrClassE(InstrClassE),
+					  .BPInstrClassE(BPInstrClassE),
 					  .BPPredDirWrongE(BPPredDirWrongE),
+					  .UpdatePC(PCE),
 					  .PCSrcE(PCSrcE),
 					  .UpdatePrediction(UpdateBPPredE));
     end else if (`BPTYPE == "BPGSHARE") begin:Predictor
@@ -108,6 +109,8 @@ module bpred
 				   // update
 				   .UpdatePC(PCE),
 				   .UpdateEN(InstrClassE[0] & ~StallE),
+				   .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF),
+				   .BPPredDirWrongE(BPPredDirWrongE),
 				   .PCSrcE(PCSrcE),
 				   .UpdatePrediction(UpdateBPPredE));
     end 
diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
index fadbf004..b2ac1991 100644
--- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv
+++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
@@ -34,49 +34,108 @@ module globalHistoryPredictor
    input logic 		   reset,
    input logic 		   StallF, StallD, StallE, FlushF, FlushD, FlushE,
    input logic [`XLEN-1:0] LookUpPC,
-   output logic [1:0] 	   Prediction,
+   output logic [1:0] 	   BPPredF,
    // update
+   input logic [1:0] 	   BPPredD,
+   input logic [4:0] 	   InstrClassE,
+   input logic [4:0] 	   BPInstrClassE,
+   input logic [4:0] 	   BPInstrClassD,
+   input logic [4:0] 	   BPInstrClassF, 
+   input logic 		   BPPredDirWrongE,
+
    input logic [`XLEN-1:0] UpdatePC,
-   input logic 		   UpdateEN, PCSrcE,
-   input logic SpeculativeUpdateEn, BPPredDirWrongE,
+   input logic 		   PCSrcE,
    input logic [1:0] 	   UpdatePrediction
   
    );
-  logic [k-1:0] 	   GHRF, GHRFNext, GHRD, GHRE, GHRLookup;
+  logic [k+1:0] 	   GHR, GHRNext;
+  logic [k-1:0] 	   PHTUpdateAdr, PHTUpdateAdr0, PHTUpdateAdr1;
+  logic 		   PHTUpdateEN;
+  logic 		   BPClassWrongNonCFI;
+  logic 		   BPClassWrongCFI;
+  logic 		   BPClassRightNonCFI;
+  
+		   
+/* -----\/----- EXCLUDED -----\/-----
+  logic [k-1:0] GHRD, GHRE, GHRLookup;
 
   logic 		   FlushedD, FlushedE;
+ -----/\----- EXCLUDED -----/\----- */
+
+
+  logic [6:0] 		   GHRMuxSel;
+  logic 		   GHRUpdateEN;
+
+  assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0];
+  assign BPClassWrongNonCFI = BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassRightBPWrong = BPInstrClassE[0] & InstrClassE[0] & BPPredDirWrongE;
+  assign BPClassRightBPRight = BPInstrClassE[0] & InstrClassE[0] & ~BPPredDirWrongE;
   
+  
+  // GHR update selection, 1 hot encoded.
+  assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight);
 
-  // if the prediction is wrong we need to restore the ghr.
-  assign GHRFNext = BPPredDirWrongE ? {PCSrcE, GHRE[k-1:1]} : 
-		    {Prediction[1], GHRF[k-1:1]};
+  assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]);
 
-  flopenr #(k) GlobalHistoryRegister(.clk(clk),
-				     .reset(reset),
-				     .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)),
-				     .d(GHRFNext),
-				     .q(GHRF));
+
+  assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0];
+
+
+
+  assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0];
+  assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0];
+  assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight));
+  assign GHRUpdateEN = (| GHRMuxSel[5:1] & ~StallE) | GHRMuxSel[6] & ~StallF;
+
+  // hoping this created a AND-OR mux.
+  always_comb begin
+    case (GHRMuxSel) 
+      7'b000_0001: GHRNext = GHR[k-1+2:0];  // no change
+      7'b000_0010: GHRNext = {GHR[k-2+2:0], PCSrcE}; // branch update
+      7'b000_0100: GHRNext = {1'b0, GHR[k+1:1]}; // repair 1
+      7'b000_1000: GHRNext = {GHR[k-1+2:1], PCSrcE}; // branch update with mis prediction correction
+      7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2
+      7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1
+      7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update
+      //7'b100_0000: GHRNext = {k+1{1'bx}}; // speculative update
+      default: GHRNext = GHR[k-1+2:0];
+    endcase
+  end
+
+  flopenr #(k+2) GlobalHistoryRegister(.clk(clk),
+				       .reset(reset),
+				       .en((GHRUpdateEN)),
+				       .d(GHRNext),
+				       .q(GHR));
 
   // if actively updating the GHR at the time of prediction we want to us
-  // GHRFNext as the lookup rather than GHRF.
+  // GHRNext as the lookup rather than GHR.
 
-  assign GHRLookup = UpdateEN ? GHRFNext : GHRF;
+  //assign GHRLookup = GHRUpdateEN ? GHRNext : GHR;
 
+  assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0];
+  assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1];  
+  assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0;
+  assign PHTUpdateEN = InstrClassE[0] & ~StallE;
+  
   // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
   SRAM2P1R1W #(k, 2) PHT(.clk(clk),
 			 .reset(reset),
-			 .RA1(GHRF),
-			 .RD1(Prediction),
+			 .RA1(GHR[k-1:0]),
+			 .RD1(BPPredF),
 			 .REN1(~StallF),
-			 .WA1(GHRE),
+			 .WA1(PHTUpdateAdr),
 			 .WD1(UpdatePrediction),
-			 .WEN1(UpdateEN),
+			 .WEN1(PHTUpdateEN),
 			 .BitWEN1(2'b11));
 
+/* -----\/----- EXCLUDED -----\/-----
   flopenr #(k) GlobalHistoryRegisterD(.clk(clk),
 				     .reset(reset),
 				     .en(~StallD & ~FlushedE),
-				     .d(GHRF),
+				     .d(GHR),
 				     .q(GHRD));
 
   flopenr #(k) GlobalHistoryRegisterE(.clk(clk),
@@ -97,6 +156,7 @@ module globalHistoryPredictor
 			   .en(~StallE),
 			   .d(FlushE | FlushedD),
 			   .q(FlushedE));
+ -----/\----- EXCLUDED -----/\----- */
     
 
 endmodule
diff --git a/wally-pipelined/src/ifu/gshare.sv b/wally-pipelined/src/ifu/gshare.sv
index 4d31e519..3cc73be8 100644
--- a/wally-pipelined/src/ifu/gshare.sv
+++ b/wally-pipelined/src/ifu/gshare.sv
@@ -38,28 +38,32 @@ module gsharePredictor
    // update
    input logic [`XLEN-1:0] UpdatePC,
    input logic 		   UpdateEN, PCSrcE,
+   input logic 		   SpeculativeUpdateEn, BPPredDirWrongE,
    input logic [1:0] 	   UpdatePrediction
   
    );
 
-  logic [k-1:0] 	   GHRF, GHRFNext;
+  logic [k-1:0] 	   GHRF, GHRFNext, GHRD, GHRE;
   //logic [k-1:0] 	   LookUpPCIndexD, LookUpPCIndexE;
   logic [k-1:0] 	   LookUpPCIndex, UpdatePCIndex;
   logic [1:0] 		   PredictionMemory;
   logic 		   DoForwarding, DoForwardingF;
   logic [1:0] 		   UpdatePredictionF;
+  logic 		   FlushedD, FlushedE;
 
-  assign GHRFNext = {PCSrcE, GHRF[k-1:1]};
+  // if the prediction is wrong we need to restore the ghr.
+  assign GHRFNext = BPPredDirWrongE ? {PCSrcE, GHRE[k-1:1]} : 
+		    {Prediction[1], GHRF[k-1:1]};
   
   flopenr #(k) GlobalHistoryRegister(.clk(clk),
 				     .reset(reset),
-				     .en(UpdateEN),
+				     .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)),
 				     .d(GHRFNext),
 				     .q(GHRF));
 
 
   // for gshare xor the PC with the GHR 
-  assign UpdatePCIndex = GHRFNext ^ UpdatePC[k:1];
+  assign UpdatePCIndex = GHRE ^ UpdatePC[k:1];
   assign LookUpPCIndex = GHRF ^ LookUpPC[k:1];  
   // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
   // GHR referes to the address that the past k branches points to in the prediction stage 
@@ -67,7 +71,7 @@ module gsharePredictor
   SRAM2P1R1W #(k, 2) PHT(.clk(clk),
 			 .reset(reset),
 			 .RA1(LookUpPCIndex),
-			 .RD1(PredictionMemory),
+			 .RD1(Prediction),
 			 .REN1(~StallF),
 			 .WA1(UpdatePCIndex),
 			 .WD1(UpdatePrediction),
@@ -75,6 +79,32 @@ module gsharePredictor
 			 .BitWEN1(2'b11));
 
 
+  flopenr #(k) GlobalHistoryRegisterD(.clk(clk),
+				     .reset(reset),
+				     .en(~StallD & ~FlushedE),
+				     .d(GHRF),
+				     .q(GHRD));
+
+  flopenr #(k) GlobalHistoryRegisterE(.clk(clk),
+				     .reset(reset),
+				     .en(~StallE & ~ FlushedE),
+				     .d(GHRD),
+				     .q(GHRE));
+
+
+  flopenr #(1) flushedDReg(.clk(clk),
+			   .reset(reset),
+			   .en(~StallD),
+			   .d(FlushD),
+			   .q(FlushedD));
+
+  flopenr #(1) flushedEReg(.clk(clk),
+			   .reset(reset),
+			   .en(~StallE),
+			   .d(FlushE | FlushedD),
+			   .q(FlushedE));
+
+/* -----\/----- EXCLUDED -----\/-----
   // need to forward when updating to the same address as reading.
   // first we compare to see if the update and lookup addreses are the same
   assign DoForwarding = LookUpPCIndex == UpdatePCIndex;
@@ -92,6 +122,7 @@ module gsharePredictor
 				 .q(UpdatePredictionF));
 
   assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory;
+ -----/\----- EXCLUDED -----/\----- */
   
   //pipeline for GHR
 /* -----\/----- EXCLUDED -----\/-----

From ddbdd0d5a27d485537994516a0671225c3cb7219 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 31 May 2021 23:27:42 -0400
Subject: [PATCH 09/19] Modify muldiv.sv to handle W instructions for 64-bits

---
 wally-pipelined/src/muldiv/div.sv              |  1 -
 wally-pipelined/src/muldiv/muldiv.sv           | 17 ++++++++++++++---
 wally-pipelined/testbench/testbench-imperas.sv | 10 +++++-----
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index 8b4e0463..10af5eee 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -87,7 +87,6 @@ module intdiv #(parameter WIDTH=64)
    // is 0 and thus a divide by 0 exception.  This div0
    // exception is given to FSM to tell the operation to 
    // quit gracefully.
-
    lzd_hier #(WIDTH) p1 (.ZP(P), .ZV(V), .B(twoD));
    shift_left #(WIDTH) p2 (twoD, P, op2);
    assign op1 = twoN;   
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index ccabe341..0c26a5df 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -53,6 +53,7 @@ module muldiv (
 	 logic [2:0] 	     Funct3E_Q;
 	 logic 		     div0error;
 	 logic [`XLEN-1:0]   N, D;
+	 logic [`XLEN-1:0]   Num0, Den0;	 
 
 	 logic 		     gclk;
 	 logic 		     DivStartE;
@@ -69,13 +70,23 @@ module muldiv (
 	 end
 	 assign gclk = enable_q & clk;
 
+	 // Handle sign extension for W-type instructions
+	 if (`XLEN == 64) begin // RV64 has W-type instructions
+            assign Num0 = W64E ? {{32{SrcAE[31]&signedDivide}}, SrcAE[31:0]} : SrcAE;
+            assign Den0 = W64E ? {{32{SrcBE[31]&signedDivide}}, SrcBE[31:0]} : SrcBE;
+	 end else begin // RV32 has no W-type instructions
+            assign Num0 = SrcAE;
+            assign Den0 = SrcAE;	    
+	 end	    
+
 	 // capture the Numerator/Denominator	 
-	 flopenrc #(`XLEN) reg_num (.d(SrcAE), .q(N),
+	 flopenrc #(`XLEN) reg_num (.d(Num0), .q(N),
 				    .en(startDivideE), .clear(DivDoneE),
 				    .reset(reset),  .clk(~gclk));
-	 flopenrc #(`XLEN) reg_den (.d(SrcBE), .q(D),
+	 flopenrc #(`XLEN) reg_den (.d(Den0), .q(D),
 				    .en(startDivideE), .clear(DivDoneE),
-				    .reset(reset),  .clk(~gclk));	 
+				    .reset(reset),  .clk(~gclk));
+	 
 	 assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]);	 
 	 intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide);
 
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index ea693900..6d8f1049 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -166,12 +166,12 @@ string tests32f[] = '{
     "rv64m/I-MULW-01", "3000",
     "rv64m/I-DIV-01", "3000",
     "rv64m/I-DIVU-01", "3000",
-    //"rv64m/I-DIVUW-01", "3000",
-    //"rv64m/I-DIVW-01", "3000",
+    "rv64m/I-DIVUW-01", "3000",
+    "rv64m/I-DIVW-01", "3000",
     "rv64m/I-REM-01", "3000",
-    "rv64m/I-REMU-01", "3000"
-    //"rv64m/I-REMUW-01", "3000",
-    //"rv64m/I-REMW-01", "3000"
+    "rv64m/I-REMU-01", "3000",
+    "rv64m/I-REMUW-01", "3000",
+    "rv64m/I-REMW-01", "3000"
   };
 
   string tests64ic[] = '{

From 857f59ab5c51e146d1cdf121443297f7ac079246 Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Tue, 1 Jun 2021 10:57:43 -0500
Subject: [PATCH 10/19] Now have global history working correctly.

---
 testsBP/crt0/Makefile                         |   8 +-
 testsBP/simple/header.h                       |   1 +
 testsBP/simple/main.c                         |   3 +-
 wally-pipelined/config/rv64BP/wally-config.vh |   6 +-
 wally-pipelined/src/ifu/bpred.sv              |  29 ++--
 .../src/ifu/globalHistoryPredictor.sv         |  62 ++-----
 wally-pipelined/src/ifu/gshare.sv             | 159 ------------------
 .../testbench/testbench-imperas.sv            |   5 +-
 8 files changed, 38 insertions(+), 235 deletions(-)
 delete mode 100644 wally-pipelined/src/ifu/gshare.sv

diff --git a/testsBP/crt0/Makefile b/testsBP/crt0/Makefile
index b42e86cb..2af43a40 100644
--- a/testsBP/crt0/Makefile
+++ b/testsBP/crt0/Makefile
@@ -4,12 +4,12 @@ ROOT		:= ..
 LIBRARY_DIRS	:= 
 LIBRARY_FILES	:=
 
-MARCH           :=-march=rv64ic
-MABI            :=-mabi=lp64
+MARCH           :=-march=rv64imfdc
+MABI            :=-mabi=lp64d
 LINK_FLAGS      :=$(MARCH) $(MABI) -nostartfiles
 
-AFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64 -W
-CFLAGS =$(MARCH) $(MABI) -march=rv64ic -mabi=lp64  -mcmodel=medany  -O2
+AFLAGS =$(MARCH) $(MABI) -W
+CFLAGS =$(MARCH) $(MABI) -mcmodel=medany  -O2
 AS=riscv64-unknown-elf-as
 CC=riscv64-unknown-elf-gcc
 AR=riscv64-unknown-elf-ar
diff --git a/testsBP/simple/header.h b/testsBP/simple/header.h
index f3a62da3..aab8973f 100644
--- a/testsBP/simple/header.h
+++ b/testsBP/simple/header.h
@@ -5,6 +5,7 @@ int fail();
 int simple_csrbr_test();
 int lbu_test();
 int icache_spill_test();
+void global_hist_0_space_test();
 void global_hist_1_space_test();
 void global_hist_2_space_test();
 void global_hist_3_space_test();
diff --git a/testsBP/simple/main.c b/testsBP/simple/main.c
index 7bf6b475..564b474e 100644
--- a/testsBP/simple/main.c
+++ b/testsBP/simple/main.c
@@ -4,7 +4,8 @@ int main(){
   //int res = icache_spill_test();
   global_hist_3_space_test();  
   global_hist_2_space_test();
-  global_hist_1_space_test();  
+  global_hist_1_space_test();
+  global_hist_0_space_test();    
   int res = 1;
   if (res < 0) {
     fail();
diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh
index fd482bfd..a9dbb1bd 100644
--- a/wally-pipelined/config/rv64BP/wally-config.vh
+++ b/wally-pipelined/config/rv64BP/wally-config.vh
@@ -32,7 +32,7 @@
 `define XLEN 64
 
 //`define MISA (32'h00000105)
-`define MISA (32'h00000104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12 | 1 << 0)
+`define MISA (32'h00000104 | 1 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
 `define A_SUPPORTED ((`MISA >> 0) % 2 == 1)
 `define C_SUPPORTED ((`MISA >> 2) % 2 == 1)
 `define D_SUPPORTED ((`MISA >> 3) % 2 == 1)
@@ -107,8 +107,8 @@
 /* verilator lint_off ASSIGNDLY */
 /* verilator lint_off PINCONNECTEMPTY */
 
-`define TWO_BIT_PRELOAD "../config/rv64icfd/twoBitPredictor.txt"
-`define BTB_PRELOAD "../config/rv64icfd/BTBPredictor.txt"
+`define TWO_BIT_PRELOAD "../config/rv64BP/twoBitPredictor.txt"
+`define BTB_PRELOAD "../config/rv64BP/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 //`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE
 `define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE"  or BPLOCALPAg or BPGSHARE
diff --git a/wally-pipelined/src/ifu/bpred.sv b/wally-pipelined/src/ifu/bpred.sv
index 9beaa959..92471c57 100644
--- a/wally-pipelined/src/ifu/bpred.sv
+++ b/wally-pipelined/src/ifu/bpred.sv
@@ -89,30 +89,29 @@ module bpred
       globalHistoryPredictor DirPredictor(.clk(clk),
 					  .reset(reset),
 					  .*, // Stalls and flushes
-					  .LookUpPC(PCNextF),
+					  .PCNextF(PCNextF),
 					  .BPPredF(BPPredF),
 					  // update
-					  .BPPredD(BPPredD),
 					  .InstrClassE(InstrClassE),
 					  .BPInstrClassE(BPInstrClassE),
 					  .BPPredDirWrongE(BPPredDirWrongE),
-					  .UpdatePC(PCE),
+					  .PCE(PCE),
 					  .PCSrcE(PCSrcE),
-					  .UpdatePrediction(UpdateBPPredE));
+					  .UpdateBPPredE(UpdateBPPredE));
     end else if (`BPTYPE == "BPGSHARE") begin:Predictor
 
       gsharePredictor DirPredictor(.clk(clk),
-				   .reset(reset),
-				   .*, // Stalls and flushes
-				   .LookUpPC(PCNextF),
-				   .Prediction(BPPredF),
-				   // update
-				   .UpdatePC(PCE),
-				   .UpdateEN(InstrClassE[0] & ~StallE),
-				   .SpeculativeUpdateEn(BPInstrClassF[0] & ~StallF),
-				   .BPPredDirWrongE(BPPredDirWrongE),
-				   .PCSrcE(PCSrcE),
-				   .UpdatePrediction(UpdateBPPredE));
+					  .reset(reset),
+					  .*, // Stalls and flushes
+					  .PCNextF(PCNextF),
+					  .BPPredF(BPPredF),
+					  // update
+					  .InstrClassE(InstrClassE),
+					  .BPInstrClassE(BPInstrClassE),
+					  .BPPredDirWrongE(BPPredDirWrongE),
+					  .PCE(PCE),
+					  .PCSrcE(PCSrcE),
+					  .UpdateBPPredE(UpdateBPPredE));
     end 
     else if (`BPTYPE == "BPLOCALPAg") begin:Predictor
 
diff --git a/wally-pipelined/src/ifu/globalHistoryPredictor.sv b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
index b2ac1991..516de633 100644
--- a/wally-pipelined/src/ifu/globalHistoryPredictor.sv
+++ b/wally-pipelined/src/ifu/globalHistoryPredictor.sv
@@ -33,19 +33,18 @@ module globalHistoryPredictor
   (input logic clk,
    input logic 		   reset,
    input logic 		   StallF, StallD, StallE, FlushF, FlushD, FlushE,
-   input logic [`XLEN-1:0] LookUpPC,
+   input logic [`XLEN-1:0] PCNextF,
    output logic [1:0] 	   BPPredF,
    // update
-   input logic [1:0] 	   BPPredD,
    input logic [4:0] 	   InstrClassE,
    input logic [4:0] 	   BPInstrClassE,
    input logic [4:0] 	   BPInstrClassD,
    input logic [4:0] 	   BPInstrClassF, 
    input logic 		   BPPredDirWrongE,
 
-   input logic [`XLEN-1:0] UpdatePC,
+   input logic [`XLEN-1:0] PCE,
    input logic 		   PCSrcE,
-   input logic [1:0] 	   UpdatePrediction
+   input logic [1:0] 	   UpdateBPPredE
   
    );
   logic [k+1:0] 	   GHR, GHRNext;
@@ -54,17 +53,10 @@ module globalHistoryPredictor
   logic 		   BPClassWrongNonCFI;
   logic 		   BPClassWrongCFI;
   logic 		   BPClassRightNonCFI;
-  
-		   
-/* -----\/----- EXCLUDED -----\/-----
-  logic [k-1:0] GHRD, GHRE, GHRLookup;
-
-  logic 		   FlushedD, FlushedE;
- -----/\----- EXCLUDED -----/\----- */
-
 
   logic [6:0] 		   GHRMuxSel;
   logic 		   GHRUpdateEN;
+  logic [k-1:0] 	   GHRLookup;
 
   assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0];
   assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0];
@@ -75,15 +67,9 @@ module globalHistoryPredictor
   
   // GHR update selection, 1 hot encoded.
   assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight);
-
   assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0];
-  assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]);
-
-
   assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0];
-
-
-
+  assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]);
   assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0];
   assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0];
   assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight));
@@ -99,7 +85,6 @@ module globalHistoryPredictor
       7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2
       7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1
       7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update
-      //7'b100_0000: GHRNext = {k+1{1'bx}}; // speculative update
       default: GHRNext = GHR[k-1+2:0];
     endcase
   end
@@ -113,50 +98,23 @@ module globalHistoryPredictor
   // if actively updating the GHR at the time of prediction we want to us
   // GHRNext as the lookup rather than GHR.
 
-  //assign GHRLookup = GHRUpdateEN ? GHRNext : GHR;
-
   assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0];
   assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1];  
   assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0;
   assign PHTUpdateEN = InstrClassE[0] & ~StallE;
+
+  assign GHRLookup = |GHRMuxSel[6:1] ? GHRNext[k-1:0] : GHR[k-1:0];
   
   // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
   SRAM2P1R1W #(k, 2) PHT(.clk(clk),
 			 .reset(reset),
-			 .RA1(GHR[k-1:0]),
+			 //.RA1(GHR[k-1:0]),
+			 .RA1(GHRLookup),
 			 .RD1(BPPredF),
 			 .REN1(~StallF),
 			 .WA1(PHTUpdateAdr),
-			 .WD1(UpdatePrediction),
+			 .WD1(UpdateBPPredE),
 			 .WEN1(PHTUpdateEN),
 			 .BitWEN1(2'b11));
 
-/* -----\/----- EXCLUDED -----\/-----
-  flopenr #(k) GlobalHistoryRegisterD(.clk(clk),
-				     .reset(reset),
-				     .en(~StallD & ~FlushedE),
-				     .d(GHR),
-				     .q(GHRD));
-
-  flopenr #(k) GlobalHistoryRegisterE(.clk(clk),
-				     .reset(reset),
-				     .en(~StallE & ~ FlushedE),
-				     .d(GHRD),
-				     .q(GHRE));
-
-
-  flopenr #(1) flushedDReg(.clk(clk),
-			   .reset(reset),
-			   .en(~StallD),
-			   .d(FlushD),
-			   .q(FlushedD));
-
-  flopenr #(1) flushedEReg(.clk(clk),
-			   .reset(reset),
-			   .en(~StallE),
-			   .d(FlushE | FlushedD),
-			   .q(FlushedE));
- -----/\----- EXCLUDED -----/\----- */
-    
-
 endmodule
diff --git a/wally-pipelined/src/ifu/gshare.sv b/wally-pipelined/src/ifu/gshare.sv
deleted file mode 100644
index 3cc73be8..00000000
--- a/wally-pipelined/src/ifu/gshare.sv
+++ /dev/null
@@ -1,159 +0,0 @@
-///////////////////////////////////////////
-// gshare.sv
-//
-// Written: Shreya Sanghai
-// Email: ssanghai@hmc.edu
-// Created: March 16, 2021
-// Modified: 
-//
-// Purpose: Gshare predictor with parameterized global history register
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module gsharePredictor
-  #(parameter int k = 10
-    )
-  (input logic clk,
-   input logic 		   reset,
-   input logic 		   StallF, StallD, StallE, FlushF, FlushD, FlushE,
-   input logic [`XLEN-1:0] LookUpPC,
-   output logic [1:0] 	   Prediction,
-   // update
-   input logic [`XLEN-1:0] UpdatePC,
-   input logic 		   UpdateEN, PCSrcE,
-   input logic 		   SpeculativeUpdateEn, BPPredDirWrongE,
-   input logic [1:0] 	   UpdatePrediction
-  
-   );
-
-  logic [k-1:0] 	   GHRF, GHRFNext, GHRD, GHRE;
-  //logic [k-1:0] 	   LookUpPCIndexD, LookUpPCIndexE;
-  logic [k-1:0] 	   LookUpPCIndex, UpdatePCIndex;
-  logic [1:0] 		   PredictionMemory;
-  logic 		   DoForwarding, DoForwardingF;
-  logic [1:0] 		   UpdatePredictionF;
-  logic 		   FlushedD, FlushedE;
-
-  // if the prediction is wrong we need to restore the ghr.
-  assign GHRFNext = BPPredDirWrongE ? {PCSrcE, GHRE[k-1:1]} : 
-		    {Prediction[1], GHRF[k-1:1]};
-  
-  flopenr #(k) GlobalHistoryRegister(.clk(clk),
-				     .reset(reset),
-				     .en((UpdateEN & BPPredDirWrongE) | (SpeculativeUpdateEn)),
-				     .d(GHRFNext),
-				     .q(GHRF));
-
-
-  // for gshare xor the PC with the GHR 
-  assign UpdatePCIndex = GHRE ^ UpdatePC[k:1];
-  assign LookUpPCIndex = GHRF ^ LookUpPC[k:1];  
-  // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
-  // GHR referes to the address that the past k branches points to in the prediction stage 
-  // GHRE refers to the address that the past k branches points to in the exectution stage
-  SRAM2P1R1W #(k, 2) PHT(.clk(clk),
-			 .reset(reset),
-			 .RA1(LookUpPCIndex),
-			 .RD1(Prediction),
-			 .REN1(~StallF),
-			 .WA1(UpdatePCIndex),
-			 .WD1(UpdatePrediction),
-			 .WEN1(UpdateEN),
-			 .BitWEN1(2'b11));
-
-
-  flopenr #(k) GlobalHistoryRegisterD(.clk(clk),
-				     .reset(reset),
-				     .en(~StallD & ~FlushedE),
-				     .d(GHRF),
-				     .q(GHRD));
-
-  flopenr #(k) GlobalHistoryRegisterE(.clk(clk),
-				     .reset(reset),
-				     .en(~StallE & ~ FlushedE),
-				     .d(GHRD),
-				     .q(GHRE));
-
-
-  flopenr #(1) flushedDReg(.clk(clk),
-			   .reset(reset),
-			   .en(~StallD),
-			   .d(FlushD),
-			   .q(FlushedD));
-
-  flopenr #(1) flushedEReg(.clk(clk),
-			   .reset(reset),
-			   .en(~StallE),
-			   .d(FlushE | FlushedD),
-			   .q(FlushedE));
-
-/* -----\/----- EXCLUDED -----\/-----
-  // need to forward when updating to the same address as reading.
-  // first we compare to see if the update and lookup addreses are the same
-  assign DoForwarding = LookUpPCIndex == UpdatePCIndex;
-
-  // register the update value and the forwarding signal into the Fetch stage
-  // TODO: add stall logic ***
-  flopr #(1) DoForwardingReg(.clk(clk),
-			     .reset(reset),
-			     .d(DoForwarding),
-			     .q(DoForwardingF));
-  
-  flopr #(2) UpdatePredictionReg(.clk(clk),
-				 .reset(reset),
-				 .d(UpdatePrediction),
-				 .q(UpdatePredictionF));
-
-  assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory;
- -----/\----- EXCLUDED -----/\----- */
-  
-  //pipeline for GHR
-/* -----\/----- EXCLUDED -----\/-----
-  flopenrc #(k) LookUpDReg(.clk(clk),
-			   .reset(reset),
-			   .en(~StallD),
-			   .clear(FlushD),
-			   .d(LookUpPCIndex),
-			   .q(LookUpPCIndexD));
-
-  flopenrc #(k) LookUpEReg(.clk(clk),
-			   .reset(reset),
-			   .en(~StallE),
-			   .clear(FlushE),
-			   .d(LookUpPCIndexD),
-			   .q(LookUpPCIndexE));
- -----/\----- EXCLUDED -----/\----- */
-
-/*  flopenrc #(k) GHRRegD(.clk(clk),
-			.reset(reset),
-			.en(~StallD),
-			.clear(FlushD),
-			.d(GHRF),
-			.q(GHRD));
-
-  flopenrc #(k) GHRRegE(.clk(clk),
-			.reset(reset),
-			.en(~StallE),
-			.clear(FlushE),
-			.d(GHRD),
-			.q(GHRE));
-  
-*/
-endmodule
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index ddee23a1..bb8ffbd4 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -438,8 +438,11 @@ string tests32f[] = '{
 
   string testsBP64[] = '{
     "rv64BP/simple", "10000",
+    "rv64BP/mmm", "1000000",
+    "rv64BP/linpack_bench", "1000000",
+    "rv64BP/sieve", "1000000",
     "rv64BP/qsort", "1000000",
-    "rv64BP/sieve", "1000000"
+    "rv64BP/dhrystone", "1000000"
   };
 
   string tests64p[] = '{

From ab509614bb36a1db60b8017f0df1521bf9688858 Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Tue, 1 Jun 2021 12:14:58 -0500
Subject: [PATCH 11/19] Changed to bp config to use gshare.

---
 wally-pipelined/config/rv64BP/wally-config.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wally-pipelined/config/rv64BP/wally-config.vh b/wally-pipelined/config/rv64BP/wally-config.vh
index a9dbb1bd..f85e0c22 100644
--- a/wally-pipelined/config/rv64BP/wally-config.vh
+++ b/wally-pipelined/config/rv64BP/wally-config.vh
@@ -111,5 +111,5 @@
 `define BTB_PRELOAD "../config/rv64BP/BTBPredictor.txt"
 `define BPRED_ENABLED 1
 //`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE
-`define BPTYPE "BPGLOBAL" // BPTWOBIT or "BPGSHARE"  or BPLOCALPAg or BPGSHARE
+`define BPTYPE "BPGSHARE" // BPTWOBIT or "BPGLOBAL"  or BPLOCALPAg or BPGSHARE
 `define TESTSBP 1

From 997c13a5217f0d32a40dc77dcc6a2653368bb397 Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Tue, 1 Jun 2021 12:41:48 -0500
Subject: [PATCH 12/19] Forgot to include the new gshare predictor file.

---
 wally-pipelined/src/ifu/gsharePredictor.sv | 120 +++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 wally-pipelined/src/ifu/gsharePredictor.sv

diff --git a/wally-pipelined/src/ifu/gsharePredictor.sv b/wally-pipelined/src/ifu/gsharePredictor.sv
new file mode 100644
index 00000000..b4a60827
--- /dev/null
+++ b/wally-pipelined/src/ifu/gsharePredictor.sv
@@ -0,0 +1,120 @@
+///////////////////////////////////////////
+// globalHistoryPredictor.sv
+//
+// Written: Shreya Sanghai
+// Email: ssanghai@hmc.edu
+// Created: March 16, 2021
+// Modified: 
+//
+// Purpose: Gshare predictor with parameterized global history register
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module gsharePredictor
+  #(parameter int k = 10
+    )
+  (input logic clk,
+   input logic 		   reset,
+   input logic 		   StallF, StallD, StallE, FlushF, FlushD, FlushE,
+   input logic [`XLEN-1:0] PCNextF,
+   output logic [1:0] 	   BPPredF,
+   // update
+   input logic [4:0] 	   InstrClassE,
+   input logic [4:0] 	   BPInstrClassE,
+   input logic [4:0] 	   BPInstrClassD,
+   input logic [4:0] 	   BPInstrClassF, 
+   input logic 		   BPPredDirWrongE,
+
+   input logic [`XLEN-1:0] PCE,
+   input logic 		   PCSrcE,
+   input logic [1:0] 	   UpdateBPPredE
+  
+   );
+  logic [k+1:0] 	   GHR, GHRNext;
+  logic [k-1:0] 	   PHTUpdateAdr, PHTUpdateAdr0, PHTUpdateAdr1;
+  logic 		   PHTUpdateEN;
+  logic 		   BPClassWrongNonCFI;
+  logic 		   BPClassWrongCFI;
+  logic 		   BPClassRightNonCFI;
+
+  logic [6:0] 		   GHRMuxSel;
+  logic 		   GHRUpdateEN;
+  logic [k-1:0] 	   GHRLookup;
+
+  assign BPClassRightNonCFI = ~BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassWrongCFI = ~BPInstrClassE[0] & InstrClassE[0];
+  assign BPClassWrongNonCFI = BPInstrClassE[0] & ~InstrClassE[0];
+  assign BPClassRightBPWrong = BPInstrClassE[0] & InstrClassE[0] & BPPredDirWrongE;
+  assign BPClassRightBPRight = BPInstrClassE[0] & InstrClassE[0] & ~BPPredDirWrongE;
+  
+  
+  // GHR update selection, 1 hot encoded.
+  assign GHRMuxSel[0] = ~BPInstrClassF[0] & (BPClassRightNonCFI | BPClassRightBPRight);
+  assign GHRMuxSel[1] = BPClassWrongCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[2] = BPClassWrongNonCFI & ~BPInstrClassD[0];
+  assign GHRMuxSel[3] = (BPClassRightBPWrong & ~BPInstrClassD[0]) | (BPClassWrongCFI & BPInstrClassD[0]);
+  assign GHRMuxSel[4] = BPClassWrongNonCFI & BPInstrClassD[0];
+  assign GHRMuxSel[5] = InstrClassE[0] & BPClassRightBPWrong & BPInstrClassD[0];
+  assign GHRMuxSel[6] = BPInstrClassF[0] & (BPClassRightNonCFI | (InstrClassE[0] & BPClassRightBPRight));
+  assign GHRUpdateEN = (| GHRMuxSel[5:1] & ~StallE) | GHRMuxSel[6] & ~StallF;
+
+  // hoping this created a AND-OR mux.
+  always_comb begin
+    case (GHRMuxSel) 
+      7'b000_0001: GHRNext = GHR[k-1+2:0];  // no change
+      7'b000_0010: GHRNext = {GHR[k-2+2:0], PCSrcE}; // branch update
+      7'b000_0100: GHRNext = {1'b0, GHR[k+1:1]}; // repair 1
+      7'b000_1000: GHRNext = {GHR[k-1+2:1], PCSrcE}; // branch update with mis prediction correction
+      7'b001_0000: GHRNext = {2'b00, GHR[k+1:2]}; // repair 2
+      7'b010_0000: GHRNext = {1'b0, GHR[k+1:2], PCSrcE}; // branch update + repair 1
+      7'b100_0000: GHRNext = {GHR[k-2+2:0], BPPredF[1]}; // speculative update
+      default: GHRNext = GHR[k-1+2:0];
+    endcase
+  end
+
+  flopenr #(k+2) GlobalHistoryRegister(.clk(clk),
+				       .reset(reset),
+				       .en((GHRUpdateEN)),
+				       .d(GHRNext),
+				       .q(GHR));
+
+  // if actively updating the GHR at the time of prediction we want to us
+  // GHRNext as the lookup rather than GHR.
+
+  assign PHTUpdateAdr0 = InstrClassE[0] ? GHR[k:1] : GHR[k-1:0];
+  assign PHTUpdateAdr1 = InstrClassE[0] ? GHR[k+1:2] : GHR[k:1];  
+  assign PHTUpdateAdr = BPInstrClassD[0] ? PHTUpdateAdr1 : PHTUpdateAdr0;
+  assign PHTUpdateEN = InstrClassE[0] & ~StallE;
+
+  assign GHRLookup = |GHRMuxSel[6:1] ? GHRNext[k-1:0] : GHR[k-1:0];
+  
+  // Make Prediction by reading the correct address in the PHT and also update the new address in the PHT 
+  SRAM2P1R1W #(k, 2) PHT(.clk(clk),
+			 .reset(reset),
+			 //.RA1(GHR[k-1:0]),
+			 .RA1(GHRLookup ^ PCNextF[k:1]),
+			 .RD1(BPPredF),
+			 .REN1(~StallF),
+			 .WA1(PHTUpdateAdr ^ PCE[k:1]),
+			 .WD1(UpdateBPPredE),
+			 .WEN1(PHTUpdateEN),
+			 .BitWEN1(2'b11));
+
+endmodule // gsharePredictor

From fe22fd2db8cc6f66ee5021ae4095aa55b1cc80ed Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Tue, 1 Jun 2021 13:46:21 -0500
Subject: [PATCH 13/19] added clock gater to floating point divider to speed up
 simulation time.

---
 wally-pipelined/src/fpu/fpu.sv            |  9 ++++-
 wally-pipelined/src/generic/clockgater.sv | 46 +++++++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 wally-pipelined/src/generic/clockgater.sv

diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index c876b313..8362dbe3 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -275,7 +275,14 @@ module fpu (
   fma1 fma1 (.*);
 
   //first and only instance of floating-point divider
-  fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .*);
+  logic fpdivClk;
+  
+  clockgater fpdivclkg(.E(FDivStartE),
+		       .SE(DivBusyM),
+		       .CLK(clk),
+		       .ECLK(fpdivClk));
+  
+  fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk));
 
   //first of two-stage instance of floating-point add/cvt unit
   fpuaddcvt1 fpadd1 (.*);
diff --git a/wally-pipelined/src/generic/clockgater.sv b/wally-pipelined/src/generic/clockgater.sv
new file mode 100644
index 00000000..dc51829d
--- /dev/null
+++ b/wally-pipelined/src/generic/clockgater.sv
@@ -0,0 +1,46 @@
+///////////////////////////////////////////
+// clockgater.sv
+//
+// Written: Ross Thompson 9 January 2021
+// Modified: 
+//
+// Purpose: Clock gater model. Must use standard cell for synthesis.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module clockgater
+  (input logic 	E,
+   input logic 	SE,
+   input logic 	CLK,
+   output logic ECLK);
+
+  // VERY IMPORTANT.
+  // This part functionally models a clock gater, but does not necessarily meet the timing constrains a real standard cell would.
+  // Do not use this in synthesis!
+
+  logic 	enable_q;
+  
+
+  always @(E or SE) begin
+    enable_q <= E | SE;
+  end
+  assign ECLK = enable_q & CLK;
+
+endmodule

From 2eeb12c6741659ece6bc5e17d13ccbd04dbfc6c5 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Tue, 1 Jun 2021 15:31:07 -0400
Subject: [PATCH 14/19] Updates to muldiv.sv for 32-bit div/rem

---
 wally-pipelined/config/rv64ic/wally-config.vh  |  2 +-
 wally-pipelined/src/muldiv/muldiv.sv           |  2 +-
 wally-pipelined/testbench/testbench-imperas.sv | 10 +++++-----
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/wally-pipelined/config/rv64ic/wally-config.vh b/wally-pipelined/config/rv64ic/wally-config.vh
index 259e41ae..12d254ba 100644
--- a/wally-pipelined/config/rv64ic/wally-config.vh
+++ b/wally-pipelined/config/rv64ic/wally-config.vh
@@ -31,7 +31,7 @@
 `define XLEN 64
 
 // MISA RISC-V configuration per specification
-`define MISA (32'h00000104 | 0 << 5 | 1 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
+`define MISA (32'h00000104 | 0 << 5 | 0 << 3 | 1 << 18 | 1 << 20 | 1 << 12 | 1 << 0)
 `define A_SUPPORTED ((`MISA >> 0) % 2 == 1)
 `define C_SUPPORTED ((`MISA >> 2) % 2 == 1)
 `define D_SUPPORTED ((`MISA >> 3) % 2 == 1)
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index 0c26a5df..e10b0c55 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -76,7 +76,7 @@ module muldiv (
             assign Den0 = W64E ? {{32{SrcBE[31]&signedDivide}}, SrcBE[31:0]} : SrcBE;
 	 end else begin // RV32 has no W-type instructions
             assign Num0 = SrcAE;
-            assign Den0 = SrcAE;	    
+            assign Den0 = SrcBE;	    
 	 end	    
 
 	 // capture the Numerator/Denominator	 
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index c60aa40d..dabc6d12 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -320,11 +320,11 @@ string tests32f[] = '{
     "rv32m/I-MUL-01", "2000",
     "rv32m/I-MULH-01", "2000",
     "rv32m/I-MULHSU-01", "2000",
-    "rv32m/I-MULHU-01", "2000"
-    //"rv32m/I-DIV-01", "2000",
-    //"rv32m/I-DIVU-01", "2000",
-    //"rv32m/I-REM-01", "2000",
-    //"rv32m/I-REMU-01", "2000"
+    "rv32m/I-MULHU-01", "2000",
+    "rv32m/I-DIV-01", "2000",
+    "rv32m/I-DIVU-01", "2000",
+    "rv32m/I-REM-01", "2000",
+    "rv32m/I-REMU-01", "2000"
   };
 
   string tests32ic[] = '{

From 564d7c4adb0d62d3626e23e250aafa4b3db93bd4 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Tue, 1 Jun 2021 15:45:32 -0400
Subject: [PATCH 15/19] Minor cosmetic update to fpu.sv

---
 wally-pipelined/src/fpu/fpu.sv | 958 +++++++++++++++------------------
 1 file changed, 439 insertions(+), 519 deletions(-)

diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index 8362dbe3..e303f205 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -25,535 +25,455 @@
 `include "wally-config.vh"
 
 module fpu (
-  input  logic [2:0]       FRM_REGW,    // Rounding mode from CSR
-  input  logic             reset,
+  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
+  input logic 		   reset,
   //input  logic             clear,     // *** not being used anywhere
-  input  logic             clk,
-  input  logic [31:0]      InstrD,
-  input  logic [`XLEN-1:0] SrcAE,       // Integer input being processed
-  input  logic [`XLEN-1:0] SrcAM,       // Integer input being written into fpreg
-  input  logic 		         StallE, StallM, StallW,
-  input  logic             FlushE, FlushM, FlushW,
-  input  logic [`AHBW-1:0] HRDATA,
-  input  logic             RegWriteD,
-  output logic [4:0]       SetFflagsM,
-  output logic [31:0]      FSROutW,
-  output logic [1:0]       FMemRWM,
-	output logic             FStallD,
-  output logic             FWriteIntE, FWriteIntM, FWriteIntW,
+  input logic 		   clk,
+  input logic [31:0] 	   InstrD,
+  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed
+  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg
+  input logic 		   StallE, StallM, StallW,
+  input logic 		   FlushE, FlushM, FlushW,
+  input logic [`AHBW-1:0]  HRDATA,
+  input logic 		   RegWriteD,
+  output logic [4:0] 	   SetFflagsM,
+  output logic [31:0] 	   FSROutW,
+  output logic [1:0] 	   FMemRWM,
+  output logic 		   FStallD,
+  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW,
   output logic [`XLEN-1:0] FWriteDataM,
-  output logic             FDivSqrtDoneM,
-  output logic             IllegalFPUInstrD,
+  output logic 		   FDivSqrtDoneM,
+  output logic 		   IllegalFPUInstrD,
   output logic [`XLEN-1:0] FPUResultW);
 
-
-
-
-
-  //control logic signal instantiation
-  logic             FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;             // FP register write enable
-  logic [2:0]       FrmD, FrmE, FrmM, FrmW;                                 // FP rounding mode
-  logic             FmtD, FmtE, FmtM, FmtW;                                 // FP precision 0-single 1-double
-  logic             FDivStartD, FDivStartE;                                 // Start division
-  logic             FWriteIntD;                                 // Write to integer register
-  logic             FOutputInput2D, FOutputInput2E;                         // Put Input2 in Input1 if a store instruction
-  logic [1:0]       FMemRWD, FMemRWE;                                       // Read and write enable for memory
-  logic [1:0]       FForwardInput1D, FForwardInput1E;                       // Input1 forwarding mux control signal
-  logic [1:0]       FForwardInput2D, FForwardInput2E;                       // Input2 forwarding mux control signal
-  logic             FForwardInput3D, FForwardInput3E;                       // Input3 forwarding mux control signal
-  logic             FInput2UsedD;                                           // Is input 2 used
-  logic             FInput3UsedD;                                           // Is input 3 used
-  logic [2:0]       FResultSelD, FResultSelE, FResultSelM, FResultSelW;     // Select FP result
-  logic [3:0]       FOpCtrlD, FOpCtrlE, FOpCtrlM;                           // Select which opperation to do in each component
-  
-  // regfile signals
-  logic [4:0]       RdE, RdM, RdW; // ***Can take from ieu
-  logic [`XLEN-1:0] FWDM;                                                   // Write data for FP register
-  logic [`XLEN-1:0] FRD1D, FRD2D, FRD3D;                                    // Read Data from FP register
-  logic [`XLEN-1:0] FRD1E, FRD2E, FRD3E;
-  logic [`XLEN-1:0] FInput1E, FInput1M, FInput1tmpE;
-  logic [`XLEN-1:0] FInput2E, FInput2M;
-  logic [`XLEN-1:0] FInput3E, FInput3M;
-  logic [`XLEN-1:0] FLoadStoreResultM, FLoadStoreResultW;                   // Result for load, store, and move to int-reg instructions
-
-  // div/sqrt signals
-  logic             DivDenormM, DivDenormW;
-  logic             DivOvEn, DivUnEn;
-  logic             DivBusyM;
-  logic [63:0]      FDivResultM, FDivResultW;
-  logic [4:0]       FDivFlagsM, FDivFlagsW;
-
-  // FMA signals
-  logic [12:0]		  aligncntE, aligncntM; 
-  logic [105:0]		  rE, rM; 
-  logic [105:0]		  sE, sM; 
-  logic [163:0]		  tE, tM;	
-  logic [8:0]		    normcntE, normcntM; 
-  logic [12:0]		  aeE, aeM; 
-  logic 		        bsE, bsM;
-  logic 		        killprodE, killprodM; 
-  logic 		        prodofE, prodofM; 
-  logic			        xzeroE, xzeroM;
-  logic			        yzeroE, yzeroM;
-  logic			        zzeroE, zzeroM;
-  logic			        xdenormE, xdenormM;
-  logic			        ydenormE, ydenormM;
-  logic			        zdenormE, zdenormM;
-  logic			        xinfE, xinfM;
-  logic			        yinfE, yinfM;
-  logic			        zinfE, zinfM;
-  logic			        xnanE, xnanM;
-  logic			        ynanE, ynanM;
-  logic			        znanE, znanM;
-  logic			        nanE, nanM;
-  logic	[8:0]		    sumshiftE, sumshiftM;
-  logic			        sumshiftzeroE, sumshiftzeroM;
-  logic             prodinfE, prodinfM;
-  logic [63:0]      FmaResultM, FmaResultW;
-  logic [4:0]       FmaFlagsM, FmaFlagsW;
-  
-  // add/cvt signals
-  logic [63:0]      AddSumE, AddSumTcE;
-  logic [3:0]       AddSelInvE;
-  logic [10:0]      AddExpPostSumE;
-  logic             AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
-  logic             AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
-  logic             AddConvertE;
-  logic [63:0]      AddFloat1E, AddFloat2E;
-  logic [11:0]      AddExp1DenormE, AddExp2DenormE;
-  logic [10:0]      AddExponentE;
-  logic [2:0]       AddRmE;
-  logic [3:0]       AddOpTypeE;
-  logic             AddPE, AddOvEnE, AddUnEnE;    
-  logic             AddDenormM;
-  logic [63:0]      AddSumM, AddSumTcM;
-  logic [3:0]       AddSelInvM;
-  logic [10:0]      AddExpPostSumM;
-  logic             AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
-  logic             AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
-  logic             AddConvertM, AddSignM;
-  logic [63:0]      AddFloat1M, AddFloat2M;
-  logic [11:0]      AddExp1DenormM, AddExp2DenormM;
-  logic [10:0]      AddExponentM;
-  logic [63:0]      AddOp1M, AddOp2M;
-  logic [2:0]       AddRmM;
-  logic [3:0]       AddOpTypeM;
-  logic             AddPM, AddOvEnM, AddUnEnM;  
-  logic [63:0]      FAddResultM, FAddResultW;
-  logic [4:0]       FAddFlagsM, FAddFlagsW;
-
-  //cmp signals 
-  logic [7:0]       WE, WM;
-  logic [7:0]       XE, XM;
-  logic             ANaNE, ANaNM;
-  logic             BNaNE, BNaNM;
-  logic             AzeroE, AzeroM;
-  logic             BzeroE, BzeroM;
-  logic             CmpInvalidM, CmpInvalidW;
-  logic [1:0]       CmpFCCM, CmpFCCW; 
-  logic [63:0]      FCmpResultM, FCmpResultW;
-
-  // fsgn signals
-  logic [63:0]      SgnResultE, SgnResultM, SgnResultW;
-  logic [4:0]       SgnFlagsE, SgnFlagsM, SgnFlagsW;
-
-  //instantiation of W stage regfile signals
-  logic [`XLEN-1:0] SrcAW;
-
-  // classify signals
-  logic [63:0]      ClassResultE, ClassResultM, ClassResultW;
-
-  // other
-  logic [63:0]      FPUResult64W, FPUResult64E;                                           // 64-bit FPU result
-  logic [4:0]       FPUFlagsW;
-
-  // pipeline control logic
-  logic	                   PipeEnableDE;
-  logic	                   PipeEnableEM;
-  logic	                   PipeEnableMW;
-  logic                    PipeClearDE;
-  logic                    PipeClearEM;
-  logic                    PipeClearMW;
-
-  //temporarily assign pipe clear and enable signals
-  //to never flush & always be running
-  localparam PipeClear = 1'b0;
-  localparam PipeEnable = 1'b1;
-  always_comb begin
-
-	  PipeEnableDE = ~StallE;
-	  PipeEnableEM = ~StallM;
-	  PipeEnableMW = ~StallW;
-	  PipeClearDE = FlushE;
-	  PipeClearEM = FlushM;
-	  PipeClearMW = FlushW;
-
-  end
-
- 
-
-
-
-
-
-
-
-
-
-
-
-  //DECODE STAGE
-
-  //Hazard unit for FPU
-  fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
-
-  //top-level controller for FPU
-  fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
-
-
-  //regfile instantiation
+   // control logic signal instantiation
+   logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;             // FP register write enable
+   logic [2:0] 		   FrmD, FrmE, FrmM, FrmW;                                 // FP rounding mode
+   logic 		   FmtD, FmtE, FmtM, FmtW;                                 // FP precision 0-single 1-double
+   logic 		   FDivStartD, FDivStartE;                                 // Start division
+   logic 		   FWriteIntD;                                 // Write to integer register
+   logic 		   FOutputInput2D, FOutputInput2E;                         // Put Input2 in Input1 if a store instruction
+   logic [1:0] 		   FMemRWD, FMemRWE;                                       // Read and write enable for memory
+   logic [1:0] 		   FForwardInput1D, FForwardInput1E;                       // Input1 forwarding mux control signal
+   logic [1:0] 		   FForwardInput2D, FForwardInput2E;                       // Input2 forwarding mux control signal
+   logic 		   FForwardInput3D, FForwardInput3E;                       // Input3 forwarding mux control signal
+   logic 		   FInput2UsedD;                                           // Is input 2 used
+   logic 		   FInput3UsedD;                                           // Is input 3 used
+   logic [2:0] 		   FResultSelD, FResultSelE, FResultSelM, FResultSelW;     // Select FP result
+   logic [3:0] 		   FOpCtrlD, FOpCtrlE, FOpCtrlM;                           // Select which opperation to do in each component
+   
+   // regfile signals
+   logic [4:0] 		   RdE, RdM, RdW; // ***Can take from ieu
+   logic [`XLEN-1:0] 	   FWDM;                                                   // Write data for FP register
+   logic [`XLEN-1:0] 	   FRD1D, FRD2D, FRD3D;                                    // Read Data from FP register
+   logic [`XLEN-1:0] 	   FRD1E, FRD2E, FRD3E;
+   logic [`XLEN-1:0] 	   FInput1E, FInput1M, FInput1tmpE;
+   logic [`XLEN-1:0] 	   FInput2E, FInput2M;
+   logic [`XLEN-1:0] 	   FInput3E, FInput3M;
+   logic [`XLEN-1:0] 	   FLoadStoreResultM, FLoadStoreResultW;                   // Result for load, store, and move to int-reg instructions
+   
+   // div/sqrt signals
+   logic 		   DivDenormM, DivDenormW;
+   logic 		   DivOvEn, DivUnEn;
+   logic 		   DivBusyM;
+   logic [63:0] 	   FDivResultM, FDivResultW;
+   logic [4:0] 		   FDivFlagsM, FDivFlagsW;
+   
+   // FMA signals
+   logic [12:0] 	   aligncntE, aligncntM; 
+   logic [105:0] 	   rE, rM; 
+   logic [105:0] 	   sE, sM; 
+   logic [163:0] 	   tE, tM;	
+   logic [8:0] 		   normcntE, normcntM; 
+   logic [12:0] 	   aeE, aeM; 
+   logic 		   bsE, bsM;
+   logic 		   killprodE, killprodM; 
+   logic 		   prodofE, prodofM; 
+   logic 		   xzeroE, xzeroM;
+   logic 		   yzeroE, yzeroM;
+   logic 		   zzeroE, zzeroM;
+   logic 		   xdenormE, xdenormM;
+   logic 		   ydenormE, ydenormM;
+   logic 		   zdenormE, zdenormM;
+   logic 		   xinfE, xinfM;
+   logic 		   yinfE, yinfM;
+   logic 		   zinfE, zinfM;
+   logic 		   xnanE, xnanM;
+   logic 		   ynanE, ynanM;
+   logic 		   znanE, znanM;
+   logic 		   nanE, nanM;
+   logic [8:0] 		   sumshiftE, sumshiftM;
+   logic 		   sumshiftzeroE, sumshiftzeroM;
+   logic 		   prodinfE, prodinfM;
+   logic [63:0] 	   FmaResultM, FmaResultW;
+   logic [4:0] 		   FmaFlagsM, FmaFlagsW;
+   
+   // add/cvt signals
+   logic [63:0] 	   AddSumE, AddSumTcE;
+   logic [3:0] 		   AddSelInvE;
+   logic [10:0] 	   AddExpPostSumE;
+   logic 		   AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
+   logic 		   AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
+   logic 		   AddConvertE;
+   logic [63:0] 	   AddFloat1E, AddFloat2E;
+   logic [11:0] 	   AddExp1DenormE, AddExp2DenormE;
+   logic [10:0] 	   AddExponentE;
+   logic [2:0] 		   AddRmE;
+   logic [3:0] 		   AddOpTypeE;
+   logic 		   AddPE, AddOvEnE, AddUnEnE;    
+   logic 		   AddDenormM;
+   logic [63:0] 	   AddSumM, AddSumTcM;
+   logic [3:0] 		   AddSelInvM;
+   logic [10:0] 	   AddExpPostSumM;
+   logic 		   AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
+   logic 		   AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
+   logic 		   AddConvertM, AddSignM;
+   logic [63:0] 	   AddFloat1M, AddFloat2M;
+   logic [11:0] 	   AddExp1DenormM, AddExp2DenormM;
+   logic [10:0] 	   AddExponentM;
+   logic [63:0] 	   AddOp1M, AddOp2M;
+   logic [2:0] 		   AddRmM;
+   logic [3:0] 		   AddOpTypeM;
+   logic 		   AddPM, AddOvEnM, AddUnEnM;  
+   logic [63:0] 	   FAddResultM, FAddResultW;
+   logic [4:0] 		   FAddFlagsM, FAddFlagsW;
+   
+   // cmp signals 
+   logic [7:0] 		   WE, WM;
+   logic [7:0] 		   XE, XM;
+   logic 		   ANaNE, ANaNM;
+   logic 		   BNaNE, BNaNM;
+   logic 		   AzeroE, AzeroM;
+   logic 		   BzeroE, BzeroM;
+   logic 		   CmpInvalidM, CmpInvalidW;
+   logic [1:0] 		   CmpFCCM, CmpFCCW; 
+   logic [63:0] 	   FCmpResultM, FCmpResultW;
+   
+   // fsgn signals
+   logic [63:0] 	   SgnResultE, SgnResultM, SgnResultW;
+   logic [4:0] 		   SgnFlagsE, SgnFlagsM, SgnFlagsW;
+   
+   // instantiation of W stage regfile signals
+   logic [`XLEN-1:0] 	   SrcAW;
+   
+   // classify signals
+   logic [63:0] 	   ClassResultE, ClassResultM, ClassResultW;
+   
+   // 64-bit FPU result   
+   logic [63:0] 	   FPUResult64W, FPUResult64E;                                           
+   logic [4:0] 		   FPUFlagsW;
+   
+   // pipeline control logic
+   logic 		   PipeEnableDE;
+   logic 		   PipeEnableEM;
+   logic 		   PipeEnableMW;
+   logic 		   PipeClearDE;
+   logic 		   PipeClearEM;
+   logic 		   PipeClearMW;
+   
+   // temporarily assign pipe clear and enable signals
+   // to never flush & always be running
+   localparam PipeClear = 1'b0;
+   localparam PipeEnable = 1'b1;
+   always_comb begin      
+      PipeEnableDE = ~StallE;
+      PipeEnableEM = ~StallM;
+      PipeEnableMW = ~StallW;
+      PipeClearDE = FlushE;
+      PipeClearEM = FlushM;
+      PipeClearMW = FlushW;      
+   end
+   
+   //DECODE STAGE
+   
+   // Hazard unit for FPU
+   fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
+   
+   // top-level controller for FPU
+   fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
+   
+   // regfile instantiation
    FPregfile fpregfile (clk, reset, FWriteEnW,
 			InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
 			FPUResult64W,
 			FRD1D, FRD2D, FRD3D);	
-
-
-
-
-
-
-
-
-
-  //*****************
-  //fpregfile D/E pipe registers
-  //*****************
-  flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E);
-  flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E);
-  flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E);
-
-  //*****************
-  //other  D/E pipe registers
-  //*****************
-  flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE);
-  flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
-  flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
-  flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
-  flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
-  flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE);
-  flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE);
-  flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput1D, FForwardInput1E);
-  flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E);
-  flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E);
-  flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E);
-  flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
-  flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E);
-  flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
-
-
-
-
-
-
-
-
-
-
-
-
-
-  //EXECUTION STAGE
-
-
-
-  // input muxs for forwarding
-  mux4  #(64)  FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE);
-  mux3  #(64)  FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E);
-  mux2  #(64)  FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E);
-  mux2  #(64)  FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E);
-
-  fma1 fma1 (.*);
-
-  //first and only instance of floating-point divider
-  logic fpdivClk;
-  
-  clockgater fpdivclkg(.E(FDivStartE),
-		       .SE(DivBusyM),
-		       .CLK(clk),
-		       .ECLK(fpdivClk));
-  
-  fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk));
-
-  //first of two-stage instance of floating-point add/cvt unit
-  fpuaddcvt1 fpadd1 (.*);
-
-  //first of two-stage instance of floating-point comparator
-  fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]);
-
-  //first and only instance of floating-point sign converter
-  fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*);
-
-  //first and only instance of floating-point classify unit
-  fpuclassify fpuclass (.*);
-
-  
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  //*****************
-  //fpregfile D/E pipe registers
-  //*****************
-  flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M);
-  flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M);
-  flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M);
-
-  //*****************
-  //fma E/M pipe registers
-  //*****************  
-  flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
-  flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
-  flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
-  flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
-  flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
-  flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
-  flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
-  flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
-  flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
-  flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
-  flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
-  flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
-  flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
-  flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
-  flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
-  flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
-  flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
-  flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
-  flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
-  flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
-  flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
-  flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
-  flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
-  flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
-  flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
-
-  //*****************
-  //fpadd E/M pipe registers
-  //*****************
-  flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
-  flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
-  flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
-  flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
-  flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
-  flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
-  flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
-  flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
-  flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
-  flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
-  flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
-  flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
-  flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
-  flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
-  flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); 
-  flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
-  flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
-  flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
-  flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
-  flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
-  flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
-  flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
-  flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
-  flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
-  flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
-
-  //*****************
-  //fpcmp E/M pipe registers
-  //*****************
-  flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
-  flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
-  flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
-  flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
-  flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
-  flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
-
-  //put this in for the event we want to delay fsgn - will otherwise bypass
-  //*****************
-  //fpsgn E/M pipe registers
-  //***************** 
-  flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
-  flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
-
-  //*****************
-  //other E/M pipe registers
-  //*****************
-  flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM);
-  flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
-  flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
-  flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
-  flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
-  flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM);
-  flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
-  flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
-
-  //*****************
-  //fpuclassify E/M pipe registers
-  //***************** 
-  flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM);
-
-
-
-
-
-
-
-
-  //BEGIN MEMORY STAGE
-
-  assign FWriteDataM = FInput1M;
-
-  mux2  #(64)  FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
-
-  fma2 fma2(.*);
-
-  //second instance of two-stage floating-point add/cvt unit
-  fpuaddcvt2 fpadd2 (.*);
-
-  //second instance of two-stage floating-point comparator
-  fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*);
-
-
-
-
-
-
-
-
-
-
-  
-  //*****************
-  //fma M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
-  flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
-
-  //*****************
-  //fpdiv M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); 
-  flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW);
-  flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
-
-  //*****************
-  //fpadd M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); 
-  flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); 
-
-  //*****************
-  //fpcmp M/W pipe registers
-  //*****************
-  flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
-  flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
-  flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); 
-
-  //*****************
-  //fpsgn M/W pipe registers
-  //***************** 
-  flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
-  flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
-
-  //*****************
-  //other M/W pipe registers
-  //*****************
-  flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW);
-  flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
-  flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
-  flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
-  flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
-  flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
-  flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
-
-
-  //*****************
-  //fpuclassify M/W pipe registers
-  //***************** 
-  flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW);
-
-
-
-
-
-
-
+   
+   //*****************
+   // fpregfile D/E pipe registers
+   //*****************
+   flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E);
+   flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E);
+   flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E);
+   
+   //*****************
+   // other  D/E pipe registers
+   //*****************
+   flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE);
+   flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
+   flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
+   flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
+   flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
+   flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE);
+   flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE);
+   flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput1D, FForwardInput1E);
+   flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E);
+   flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E);
+   flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E);
+   flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
+   flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E);
+   flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
+   
+   //EXECUTION STAGE
+   
+   // input muxs for forwarding
+   mux4  #(64)  FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE);
+   mux3  #(64)  FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E);
+   mux2  #(64)  FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E);
+   mux2  #(64)  FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E);
+   
+   fma1 fma1 (.*);
+   
+   // first and only instance of floating-point divider
+   logic fpdivClk;
+   
+   clockgater fpdivclkg(.E(FDivStartE),
+			.SE(DivBusyM),
+			.CLK(clk),
+			.ECLK(fpdivClk));
+   
+   fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk));
+   
+   // first of two-stage instance of floating-point add/cvt unit
+   fpuaddcvt1 fpadd1 (.*);
+   
+   // first of two-stage instance of floating-point comparator
+   fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]);
+   
+   // first and only instance of floating-point sign converter
+   fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*);
+   
+   // first and only instance of floating-point classify unit
+   fpuclassify fpuclass (.*);
+   
+   //*****************
+   //fpregfile D/E pipe registers
+   //*****************
+   flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M);
+   flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M);
+   flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M);
+   
+   //*****************
+   // fma E/M pipe registers
+   //*****************  
+   flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
+   flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
+   flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
+   flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
+   flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
+   flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
+   flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
+   flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
+   flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
+   flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
+   flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
+   flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
+   flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
+   flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
+   flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
+   flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
+   flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
+   flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
+   flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
+   flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
+   flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
+   flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
+   flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
+   flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
+   flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
+   
+   //*****************
+   // fpadd E/M pipe registers
+   //*****************
+   flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
+   flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
+   flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
+   flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
+   flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
+   flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
+   flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
+   flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
+   flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
+   flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
+   flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
+   flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
+   flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
+   flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
+   flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); 
+   flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
+   flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
+   flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
+   flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
+   flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
+   flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
+   flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
+   flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
+   flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
+   flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
+   
+   //*****************
+   // fpcmp E/M pipe registers
+   //*****************
+   flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
+   flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
+   flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
+   flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
+   flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
+   flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
+   
+   // put this in for the event we want to delay fsgn - will otherwise bypass
+   //*****************
+   // fpsgn E/M pipe registers
+   //***************** 
+   flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
+   flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
+   
+   //*****************
+   // other E/M pipe registers
+   //*****************
+   flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM);
+   flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
+   flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
+   flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
+   flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
+   flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM);
+   flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
+   flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
+   
+   //*****************
+   // fpuclassify E/M pipe registers
+   //***************** 
+   flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM);
+   
+   //BEGIN MEMORY STAGE
+   
+   assign FWriteDataM = FInput1M;
+   
+   mux2  #(64)  FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
+   
+   fma2 fma2(.*);
+   
+   // second instance of two-stage floating-point add/cvt unit
+   fpuaddcvt2 fpadd2 (.*);
+   
+   // second instance of two-stage floating-point comparator
+   fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), 
+		   .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*);
+   
+   //*****************
+   // fma M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
+   flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
+   
+   //*****************
+   // fpdiv M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); 
+   flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW);
+   flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
+   
+   //*****************
+   // fpadd M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); 
+   flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); 
+   
+   //*****************
+   // fpcmp M/W pipe registers
+   //*****************
+   flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
+   flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
+   flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); 
+   
+   //*****************
+   // fpsgn M/W pipe registers
+   //***************** 
+   flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
+   flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
+   
+   //*****************
+   // other M/W pipe registers
+   //*****************
+   flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW);
+   flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
+   flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
+   flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
+   flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
+   flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
+   flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
+   
+   //*****************
+   // fpuclassify M/W pipe registers
+   //***************** 
+   flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW);
 
   //#########################################
-  //BEGIN WRITEBACK STAGE
+  // BEGIN WRITEBACK STAGE
   //#########################################
-
-  always_comb begin
-	case (FResultSelW)
-		// div/sqrt
-		3'b000 : FPUFlagsW = FDivFlagsW;
-		// cmp		
-		3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
-		//fma/mult
-		3'b010 : FPUFlagsW = FmaFlagsW;
-		// sgn inj
-		3'b011 : FPUFlagsW = SgnFlagsW;
-		// add/sub/cnvt
-		3'b100 : FPUFlagsW = FAddFlagsW;
-		// classify
-		3'b101 : FPUFlagsW = 5'b0;
-		// output SrcAW
-		3'b110 : FPUFlagsW = 5'b0;
-		// output FRD1
-		3'b111 : FPUFlagsW = 5'b0;
-		default : FPUFlagsW = 5'bxxxxx;
-	endcase
-  end
-
-
-  always_comb begin
-	case (FResultSelW)
-		// div/sqrt
-		3'b000 : FPUResult64W = FDivResultW;
-		// cmp		
-		3'b001 : FPUResult64W = FCmpResultW;
-		//fma/mult
-		3'b010 : FPUResult64W = FmaResultW;
-		// sgn inj
-		3'b011 : FPUResult64W = SgnResultW;
-		// add/sub/cnvt
-		3'b100 : FPUResult64W = FAddResultW;
-		// classify
-		3'b101 : FPUResult64W = ClassResultW;
-		// output SrcAW
-		3'b110 : FPUResult64W = SrcAW;
-		// Load/Store/Move to FP-register
-		3'b111 : FPUResult64W = FLoadStoreResultW;
-		default : FPUResult64W = {64{1'bx}};
-	endcase
-  end
-  //interface between XLEN size datapath and double-precision sized
-  //floating-point results
-  //
-  //define offsets for LSB zero extension or truncation
-  always_comb begin
-           
-  //zero extension 
+   
+   always_comb begin
+      case (FResultSelW)
+	// div/sqrt
+	3'b000 : FPUFlagsW = FDivFlagsW;
+	// cmp		
+	3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
+	//fma/mult
+	3'b010 : FPUFlagsW = FmaFlagsW;
+	// sgn inj
+	3'b011 : FPUFlagsW = SgnFlagsW;
+	// add/sub/cnvt
+	3'b100 : FPUFlagsW = FAddFlagsW;
+	// classify
+	3'b101 : FPUFlagsW = 5'b0;
+	// output SrcAW
+	3'b110 : FPUFlagsW = 5'b0;
+	// output FRD1
+	3'b111 : FPUFlagsW = 5'b0;
+	default : FPUFlagsW = 5'bxxxxx;
+      endcase
+   end
+   
+   always_comb begin
+      case (FResultSelW)
+	// div/sqrt
+	3'b000 : FPUResult64W = FDivResultW;
+	// cmp		
+	3'b001 : FPUResult64W = FCmpResultW;
+	//fma/mult
+	3'b010 : FPUResult64W = FmaResultW;
+	// sgn inj
+	3'b011 : FPUResult64W = SgnResultW;
+	// add/sub/cnvt
+	3'b100 : FPUResult64W = FAddResultW;
+	// classify
+	3'b101 : FPUResult64W = ClassResultW;
+	// output SrcAW
+	3'b110 : FPUResult64W = SrcAW;
+	// Load/Store/Move to FP-register
+	3'b111 : FPUResult64W = FLoadStoreResultW;
+	default : FPUResult64W = {64{1'bx}};
+      endcase
+   end // always_comb
+   
+   // interface between XLEN size datapath and double-precision sized
+   // floating-point results
+   //
+   // define offsets for LSB zero extension or truncation
+   always_comb begin      
+      // zero extension 
       FPUResultW = FPUResult64W[63:64-`XLEN];
-      SetFflagsM = FPUFlagsW;
+      SetFflagsM = FPUFlagsW;      
+   end
+  
+endmodule // fpu
 
-  end  
-endmodule

From 0670c57fd2638defa89c97712dfaedaed5ddf3c9 Mon Sep 17 00:00:00 2001
From: Ross Thompson <stephen.thompson.37@us.af.mil>
Date: Tue, 1 Jun 2021 15:05:22 -0500
Subject: [PATCH 16/19] The clock gater was not implemented correctly.  Now it
 is level sensitive to a low clock.

---
 wally-pipelined/src/generic/clockgater.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wally-pipelined/src/generic/clockgater.sv b/wally-pipelined/src/generic/clockgater.sv
index dc51829d..c06a1cbd 100644
--- a/wally-pipelined/src/generic/clockgater.sv
+++ b/wally-pipelined/src/generic/clockgater.sv
@@ -38,7 +38,7 @@ module clockgater
   logic 	enable_q;
   
 
-  always @(E or SE) begin
+  always @(~CLK) begin
     enable_q <= E | SE;
   end
   assign ECLK = enable_q & CLK;

From eba7ce64f56fc49a2fb4017290af23ac9a820712 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Tue, 1 Jun 2021 17:39:54 -0400
Subject: [PATCH 17/19] delete div.bak

---
 wally-pipelined/src/muldiv/div.bak | 1560 ----------------------------
 1 file changed, 1560 deletions(-)
 delete mode 100755 wally-pipelined/src/muldiv/div.bak

diff --git a/wally-pipelined/src/muldiv/div.bak b/wally-pipelined/src/muldiv/div.bak
deleted file mode 100755
index 4266ae61..00000000
--- a/wally-pipelined/src/muldiv/div.bak
+++ /dev/null
@@ -1,1560 +0,0 @@
-///////////////////////////////////////////
-// mul.sv
-//
-// Written: James.Stine@okstate.edu 1 February 2021
-// Modified: 
-//
-// Purpose: Integer Divide instructions
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-// *** <Thomas Fleming> I added these verilator controls to clean up the
-// lint output. The linter warnings should be fixed, but now the output is at
-// least readable.
-/* verilator lint_off COMBDLY */
-/* verilator lint_off IMPLICIT */
-
-`include "wally-config.vh"
-
-module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
-
-   input logic [63:0]  N, D;
-   input logic 	       clk;
-   input logic 	       reset;
-   input logic 	       start;
-   input logic 	       S;   
-   
-   output logic [63:0] Qf;
-   output logic [63:0] remf;
-   output logic        div0;
-   output logic        done;
-   output logic        divBusy;   
-
-   logic 	       divdone;   
-   logic 	       enable;
-   logic 	       state0;
-   logic 	       V;   
-   logic [7:0] 	       Num;
-   logic [5:0] 	       P, NumIter, RemShift;
-   logic [63:0]        op1, op2, op1shift, Rem5;
-   logic [64:0]        Qd, Rd, Qd2, Rd2;
-   logic [63:0]        Q, rem0;
-   logic [3:0] 	       quotient;
-   logic 	       otfzero; 
-   logic 	       shiftResult;
-   logic 	       enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp;
-
-   logic [63:0]        twoD;
-   logic [63:0]        twoN;
-   logic 	       SignD;
-   logic 	       SignN;
-   logic [63:0]        QT, remT;
-   logic 	       D_NegOne;
-   logic 	       Max_N;
-
-   // Check if negative (two's complement)
-   //   If so, convert to positive
-   adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD);
-   adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN);   
-   assign SignD = D[63];
-   assign SignN = N[63];   
-   // Max N and D = -1 (Overflow)
-   assign Max_N = (~|N[62:0]) & N[63];
-   assign D_NegOne = &D;
-
-   // Divider goes the distance to 37 cycles
-   // (thanks to the evil divisor for D = 0x1) 
-   
-   // Shift D, if needed (for integer)
-   // needed to allow qst to be in range for integer
-   // division [1,2) and allow integer divide to work.
-   //
-   // The V or valid bit can be used to determine if D
-   // is 0 and thus a divide by 0 exception.  This div0
-   // exception is given to FSM to tell the operation to 
-   // quit gracefully.
-
-   lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD));
-   shift_left #(64) p2 (twoD, P, op2);   
-   assign op1 = twoN;
-   assign div0 = ~V;
-
-   // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0)
-   // v = 2 since \rho < 1 (add 4 to make sure its a ceil)
-   adder #(8) cpa3 ({2'b0, P}, 
-		    {5'h0, shiftResult, ~shiftResult, 1'b0}, 
-		    Num);      
-   
-   // Determine whether need to add just Q/Rem
-   assign shiftResult = P[0];   
-   // div by 2 (ceil)
-   assign NumIter = Num[6:1];   
-   assign RemShift = P;
-
-   // FSM to control integer divider
-   //   assume inputs are postive edge and
-   //   datapath (divider) is negative edge
-   fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv,
-	       start, div0, NumIter, ~clk, reset);
-
-   flopr #(1) rega (~clk, reset, donev, done);
-   flopr #(1) regb (~clk, reset, divdonev, divdone);
-   flopr #(1) regc (~clk, reset, otfzerov, otfzero);
-   flopr #(1) regd (~clk, reset, enablev, enable);
-   flopr #(1) rege (~clk, reset, state0v, state0);
-   flopr #(1) regf (~clk, reset, divBusyv, divBusy);      
-   
-   // To obtain a correct remainder the last bit of the
-   // quotient has to be aligned with a radix-r boundary.
-   // Since the quotient is in the range 1/2 < q < 2 (one
-   // integer bit and m fractional bits), this is achieved by
-   // shifting N right by v+s so that (m+v+s) mod k = 0.  And,
-   // the quotient has to be aligned to the integer position.
-
-   divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, 
-		  enable, otfzero, shiftResult);
-
-   // Storage registers to hold contents stable
-   flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2);
-   flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2);         
-
-   // Probably not needed - just assigns results
-   assign Q = Qd2[63:0];
-   assign Rem5 = Rd2[64:1];  
-   
-   // Adjust remainder by m 
-   shift_right #(64) p4 (Rem5, RemShift, rem0);   
-
-   // Adjust Q/Rem for Signed
-   assign tcQ = (SignN ^ SignD) & S;
-   assign tcR = SignN & S;
-   // Signed Divide
-   // - When N and D are negative: Remainder is negative (undergoes a two's complement).
-   // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement).
-   // - When D is negative: Quotient is negative (undergoes a two's complement).
-   adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT);
-   adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT);         
-
-   // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec)
-   exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf);
-
-endmodule // int32div
-
-module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, 
-		   enable, otfzero, shiftResult); 
-
-   input logic [63:0]   op1, op2;
-   input logic 		clk, state0;
-   input logic 		reset;
-   input logic 		enable;
-   input logic 		otfzero;
-   input logic 		shiftResult;   
-   
-   output logic [64:0] 	rem0;
-   output logic [64:0] 	Q;
-   output logic [3:0] 	quotient;   
-
-   logic [67:0] 	Sum, Carry;   
-   logic [64:0] 	Qstar;   
-   logic [64:0] 	QMstar;   
-   logic [7:0] 		qtotal;   
-   logic [67:0] 	SumN, CarryN, SumN2, CarryN2;
-   logic [67:0] 	divi1, divi2, divi1c, divi2c, dive1;
-   logic [67:0] 	mdivi_temp, mdivi;   
-   logic 		zero;
-   logic [1:0] 		qsel;
-   logic [1:0] 		Qin, QMin;
-   logic 		CshiftQ, CshiftQM;
-   logic [67:0] 	rem1, rem2, rem3;
-   logic [67:0] 	SumR, CarryR;
-   logic [64:0] 	Qt;   
-
-   // Create one's complement values of Divisor (for q*D)
-   assign divi1 = {3'h0, op2, 1'b0};
-   assign divi2 = {2'h0, op2, 2'b0};
-   assign divi1c = ~divi1;
-   assign divi2c = ~divi2;
-   // Shift x1 if not mod k
-   mux2 #(68) mx1 ({3'b000, op1, 1'b0},  {4'h0, op1}, shiftResult, dive1);   
-
-   // I I I . F F F F F ... (Robertson Criteria - \rho * qmax * D)
-   mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN);
-   mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN);
-   // Simplify QST
-   adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal);   
-   // q = {+2, +1, -1, -2} else q = 0
-   qst4 pd1 (qtotal[7:1], divi1[63:61], quotient);
-   assign ulp = quotient[2]|quotient[3];
-   assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]);
-   // Map to binary encoding
-   assign qsel[1] = quotient[3]|quotient[2];
-   assign qsel[0] = quotient[3]|quotient[1];   
-   mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp);
-   mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi);
-   csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry);
-   // regs : save CSA
-   flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2);
-   flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2);
-   // OTF
-   ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM);   
-   otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, 
-		   otfzero, enable, Qstar, QMstar);
-
-   // Correction and generation of Remainder
-   adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1);
-   // Add back +D as correction
-   csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR);
-   adder #(68) cpa3 (SumR, CarryR, rem2);   
-   // Choose remainder (Rem or Rem+D)
-   mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3);
-   // Choose correct Q or QM
-   mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt);
-   // Final results
-   assign rem0 = rem3[64:0];
-   assign Q = Qt;   
-   
-endmodule // divide4x64
-
-module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM);
-
-   input logic [3:0] quot;
-
-   output logic [1:0] Qin;
-   output logic [1:0] QMin;
-   output logic       CshiftQ;
-   output logic       CshiftQM;
-
-   // Load/Store Control for OTF
-   assign Qin[1] = (quot[1]) | (quot[3]) | (quot[0]);
-   assign Qin[0] = (quot[1]) | (quot[2]);
-   assign QMin[1] = (quot[1]) | (!quot[3]&!quot[2]&!quot[1]&!quot[0]);
-   assign QMin[0] = (quot[3]) | (quot[0]) | 
-		    (!quot[3]&!quot[2]&!quot[1]&!quot[0]);
-   assign CshiftQ = (quot[1]) | (quot[0]);
-   assign CshiftQM = (quot[3]) | (quot[2]);   
-
-endmodule 
-
-// On-the-fly Conversion per Ercegovac/Lang
-
-module otf #(parameter WIDTH=8) 
-   (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q);
-   
-   input logic [1:0]        Qin, QMin;
-   input logic 		    CshiftQ, CshiftQM;   
-   input logic 		    clk;
-   input logic 	            reset;
-   input logic 		    enable;   
-
-   output logic [WIDTH-1:0] R2Q;
-   output logic [WIDTH-1:0] R1Q;   
-
-   logic [WIDTH-1:0] 	    Qstar, QMstar;      
-   logic [WIDTH-1:0] 	    M1Q, M2Q;
-   
-   // QM
-   mux2 #(WIDTH)  m1 (QMstar, Qstar, CshiftQM, M1Q);
-   flopenr #(WIDTH) r1 (clk, reset, enable, {M1Q[WIDTH-3:0], QMin}, R1Q);
-   // Q
-   mux2 #(WIDTH)  m2 (Qstar, QMstar, CshiftQ, M2Q);
-   flopenr #(WIDTH) r2 (clk, reset, enable, {M2Q[WIDTH-3:0], Qin}, R2Q);
-   
-   assign Qstar = R2Q;
-   assign QMstar = R1Q;
-
-endmodule // otf8
-
-module adder #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b,
-				   output logic [WIDTH-1:0] y);
-
-   assign y = a + b;
-
-endmodule // adder
-
-module fa (input logic a, b, c, output logic sum, carry);
-
-   assign sum = a^b^c;
-   assign carry = a&b|a&c|b&c;   
-
-endmodule // fa
-
-module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c,
-				 output logic [WIDTH-1:0] sum, carry);
-
-   logic [WIDTH:0] 					  carry_temp;   
-   genvar 						  i;
-   generate
-      for (i=0;i<WIDTH;i=i+1)
-	begin : genbit
-	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
-	end
-   endgenerate
-   //assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     // trmimmed excess bit dh 5/3/21
-   assign carry = {carry_temp[WIDTH-1:1], 1'b0};     
-
-endmodule // adder
-
-module eqcmp #(parameter WIDTH = 8)
-   (input  logic [WIDTH-1:0] a, b,
-    output logic y);
-   
-   assign y = (a == b);
-   
-endmodule // eqcmp
-
-module qst4 (input logic [6:0] s, input logic [2:0] d,
-	     output logic [3:0] q);
-   
-   
-   assign q[3] = (!s[6]&s[5]) | (!d[2]&!s[6]&s[4]) | (!s[6]&s[4]&s[3]) | 
-		 (!d[1]&!s[6]&s[4]&s[2]) | (!d[0]&!s[6]&s[4]&s[2]) | 
-		 (!d[1]&!d[0]&!s[6]&s[4]&s[1]) | 
-		 (!d[2]&!d[1]&!d[0]&!s[6]&s[3]&s[2]) | 
-		 (!d[2]&!d[1]&!s[6]&s[3]&s[2]&s[1]) | 
-		 (!d[2]&!d[0]&!s[6]&s[3]&s[2]&s[1]&s[0]);
-   
-   assign q[2] = (d[2]&!s[6]&!s[5]&!s[4]&s[3]) | 
-		 (!s[6]&!s[5]&!s[4]&s[3]&!s[2]) | 
-		 (!d[2]&!s[6]&!s[5]&!s[4]&!s[3]&s[2]) | 
-		 (d[2]&d[1]&d[0]&!s[6]&!s[5]&s[4]&!s[3]) | 
-		 (d[2]&d[1]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]) | 
-		 (d[2]&d[0]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]) | 
-		 (d[2]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]&!s[1]) | 
-		 (!d[2]&d[1]&d[0]&!s[6]&!s[5]&!s[4]&s[2]) | 
-		 (!d[1]&!s[6]&!s[5]&!s[4]&!s[3]&s[2]&s[1]) | 
-		 (!d[2]&d[1]&!s[6]&!s[5]&!s[4]&s[2]&!s[1]) | 
-		 (!d[2]&d[0]&!s[6]&!s[5]&!s[4]&s[2]&!s[1]) | 
-		 (!d[2]&d[1]&!s[6]&!s[5]&!s[4]&s[2]&!s[0]);
-   
-   assign q[1] = (d[2]&s[6]&s[5]&s[4]&!s[3]) | 
-		 (d[1]&s[6]&s[5]&s[4]&!s[3]) | (s[6]&s[5]&s[4]&!s[3]&s[2]) | 
-		 (d[2]&s[6]&s[5]&!s[4]&s[3]&s[2]) | 
-		 (d[0]&s[6]&s[5]&s[4]&!s[3]&s[1]) | 
-		 (d[2]&d[1]&d[0]&s[6]&s[5]&!s[4]&s[3]) | 
-		 (d[2]&d[1]&s[6]&s[5]&!s[4]&s[3]&s[1]) | 
-		 (!d[2]&s[6]&s[5]&s[4]&s[3]&!s[2]&!s[1]) | 
-		 (!d[2]&!d[1]&!d[0]&s[6]&s[5]&s[4]&s[3]&!s[2]) | 
-		 (d[1]&d[0]&s[6]&s[5]&!s[4]&s[3]&s[2]&s[1]) | 
-		 (!d[2]&d[0]&s[6]&s[5]&s[4]&!s[2]&!s[1]&s[0]) | 
-		 (!d[2]&!d[1]&!d[0]&s[6]&s[5]&s[4]&!s[2]&s[1]&s[0]);
-   
-   assign q[0] = (s[6]&!s[5]) | (s[6]&!s[4]&!s[3]) | 
-		 (!d[2]&!d[1]&s[6]&!s[4]) | (!d[2]&!d[0]&s[6]&!s[4]) | 
-		 (!d[2]&s[6]&!s[4]&!s[2]) | (!d[1]&s[6]&!s[4]&!s[2]) | 
-		 (!d[2]&s[6]&!s[4]&!s[1]) | (!d[0]&s[6]&!s[4]&!s[2]&!s[1]) | 
-		 (!d[2]&!d[1]&!d[0]&s[6]&!s[3]&!s[2]&!s[1]) | 
-		 (!d[2]&!d[1]&!d[0]&s[6]&!s[3]&!s[2]&!s[0]) | 
-		 (!d[2]&!d[1]&s[6]&!s[3]&!s[2]&!s[1]&!s[0]);
-   
-endmodule // qst4
-
-module lz2 (P, V, B0, B1);
-
-   input logic  B0;
-   input logic 	B1;
-
-   output logic P;
-   output logic V;
-
-   assign V = B0 | B1;
-   assign P = B0 & ~B1;
-   
-endmodule // lz2
-
-module lz4 (ZP, ZV, B0, B1, V0, V1);
-   
-   input logic        B0;
-   input logic        B1;
-   input logic        V0;
-   input logic        V1;
-   
-   output logic [1:0] ZP;
-   output logic       ZV;
-   
-   assign ZP[0] = V0 ? B0 : B1;
-   assign ZP[1] = ~V0;
-   assign ZV = V0 | V1;
-
-endmodule // lz4
-
-module lz8 (ZP, ZV, B);
-   
-   input logic [7:0]  B;
-
-   logic 	      s1p0;
-   logic 	      s1v0;
-   logic 	      s1p1;
-   logic 	      s1v1;
-   logic 	      s2p0;
-   logic 	      s2v0;
-   logic 	      s2p1;
-   logic 	      s2v1;
-   logic [1:0] 	      ZPa;
-   logic [1:0] 	      ZPb;
-   logic 	      ZVa;
-   logic 	      ZVb;
-   
-   output logic [2:0] ZP;
-   output logic       ZV;
-   
-   lz2 l1(s1p0, s1v0, B[2], B[3]);
-   lz2 l2(s1p1, s1v1, B[0], B[1]);
-   lz4 l3(ZPa, ZVa, s1p0, s1p1, s1v0, s1v1);
-
-   lz2 l4(s2p0, s2v0, B[6], B[7]);
-   lz2 l5(s2p1, s2v1, B[4], B[5]);
-   lz4 l6(ZPb, ZVb, s2p0, s2p1, s2v0, s2v1);
-
-   assign ZP[1:0] = ZVb ? ZPb : ZPa;
-   assign ZP[2]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule // lz8
-
-module lz16 (ZP, ZV, B);
-
-   input logic [15:0]  B;
-
-   logic [2:0] 	       ZPa;
-   logic [2:0] 	       ZPb;
-   logic 	       ZVa;
-   logic 	       ZVb;   
-
-   output logic [3:0]  ZP;
-   output logic        ZV;
-
-   lz8 l1(ZPa, ZVa, B[7:0]);
-   lz8 l2(ZPb, ZVb, B[15:8]);
-
-   assign ZP[2:0] = ZVb ? ZPb : ZPa;
-   assign ZP[3]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule // lz16
-
-module lz32 (ZP, ZV, B);
-
-   input logic [31:0] B;
-
-   logic [3:0] 	      ZPa;
-   logic [3:0] 	      ZPb;
-   logic 	      ZVa;
-   logic 	      ZVb;
-   
-   output logic [4:0] ZP;
-   output logic       ZV;
-   
-   lz16 l1(ZPa, ZVa, B[15:0]);
-   lz16 l2(ZPb, ZVb, B[31:16]);
-   
-   assign ZP[3:0] = ZVb ? ZPb : ZPa;
-   assign ZP[4]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule // lz32
-
-module lz64 (ZP, ZV, B);
-
-   input logic [63:0]  B;
-   
-   logic [4:0] 	       ZPa;
-   logic [4:0] 	       ZPb;
-   logic 	       ZVa;
-   logic 	       ZVb;
-   
-   output logic [5:0]  ZP;
-   output logic        ZV;
-   
-   lz32 l1(ZPa, ZVa, B[31:0]);
-   lz32 l2(ZPb, ZVb, B[63:32]);
-   
-   assign ZP[4:0] = ZVb ? ZPb : ZPa;
-   assign ZP[5]   = ~ZVb;
-   assign ZV = ZVa | ZVb;
-
-endmodule // lz64
-
-// FSM Control for Integer Divider
-module fsm64 (en, state0, done, divdone, otfzero, divBusy,
-	      start, error, NumIter, clk, reset);
-
-   input logic [5:0]  NumIter;   
-   input logic 	      clk;
-   input logic 	      reset;
-   input logic 	      start;
-   input logic 	      error;   
-   
-   output logic       done;      
-   output logic       en;
-   output logic       state0;
-   output logic       divdone;
-   output logic       otfzero;
-   output logic       divBusy;   
-   
-   logic 	      LT, EQ;
-   logic 	      Divide0;   
-   logic [5:0] 	      CURRENT_STATE;
-   logic [5:0] 	      NEXT_STATE;   
-   
-   parameter [5:0] 
-     S0=6'd0, S1=6'd1, S2=6'd2,
-     S3=6'd3, S4=6'd4, S5=6'd5,
-     S6=6'd6, S7=6'd7, S8=6'd8,
-     S9=6'd9, S10=6'd10, S11=6'd11,
-     S12=6'd12, S13=6'd13, S14=6'd14,
-     S15=6'd15, S16=6'd16, S17=6'd17,
-     S18=6'd18, S19=6'd19, S20=6'd20,
-     S21=6'd21, S22=6'd22, S23=6'd23,
-     S24=6'd24, S25=6'd25, S26=6'd26,
-     S27=6'd27, S28=6'd28, S29=6'd29,
-     S30=6'd30, S31=6'd31, S32=6'd32,
-     S33=6'd33, S34=6'd34, S35=6'd35,
-     S36=6'd36, Done=6'd37;      
-   
-   always @(posedge clk)
-     begin
-	if(reset==1'b1)
-	  CURRENT_STATE<=S0;
-	else
-	  CURRENT_STATE<=NEXT_STATE;
-     end
-
-   // Going to cheat and hard code number of states 
-   // needed into FSM instead of using a counter
-   // FIXME: could counter be better
-
-   // Cheated and made 8 - let synthesis do its magic
-   magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {2'h0, NumIter});
-
-   always @(CURRENT_STATE or start)
-     begin
- 	case(CURRENT_STATE)
-	  S0:
-	    begin
-	       if (start==1'b0)
-		 begin
-		    otfzero = 1'b1;   
-		    en = 1'b0;
-		    divBusy = 1'b0;		    
-		    state0 = 1'b0;
-		    divdone = 1'b0;		    
-		    done = 1'b0;
-		    NEXT_STATE <= S0;
-		 end 
-	       else 
-		 begin
-		    otfzero = 1'b0;	       		    
-		    en = 1'b1;
-		    divBusy = 1'b1;		    		    
-		    state0 = 1'b1;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		    
-		    done = 1'b0;
-		    divdone = 1'b0;		 		 
-		    NEXT_STATE <= S1;
-		 end 
-	    end	    
-	  S1:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S2;
-		 end
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S2;
-		 end		    
-	    end // case: S1	  
-	  S2:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S3;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S3;
-		 end		    	       	       
-	    end // case: S2
-	  S3:
-	    begin	       
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S4;
-		 end 
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S4;
-		 end		    	       
-	    end // case: S3
-	  S4:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S5;
-		 end 	       	    
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S5;
-		 end		       	       
-	    end // case: S4
-	  S5:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S6;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S6;
-		 end		    	       	       	       
-	    end // case: S5
-	  S6:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S7;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S7;
-		 end		    	       	       
-	    end // case: S6
-	  S7:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S8;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S8;
-		 end		    	       	       
-	    end // case: S7
-	  S8:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S9;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S9;
-		 end		    	       	       
-	    end // case: S8
-	  S9:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S10;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S10;
-		 end		    	       	       
-	    end // case: S9
-	  S10:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S11;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S11;
-		 end		    	       	       
-	    end // case: S10
-	  S11:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S12;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S12;
-		 end		    	       	       
-	    end // case: S11
-	  S12:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S13;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S13;
-		 end		    	       	       
-	    end // case: S12
-	  S13:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S14;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S14;
-		 end		    	       	       
-	    end // case: S13
-	  S14:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S15;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S15;
-		 end		    	       	       
-	    end // case: S14
-	  S15:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S16;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S16;
-		 end		    	       	       
-	    end // case: S15
-	  S16:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S17;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S17;
-		 end		    	       	       
-	    end // case: S16
-	  S17:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S18;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S18;
-		 end		    	       	       
-	    end // case: S17
-	  S18:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S19;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S19;
-		 end		    	       	       
-	    end // case: S18
-	  S19:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S20;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S20;
-		 end		    	       	       
-	    end // case: S19
-	  S20:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S21;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S21;
-		 end		    	       	       
-	    end // case: S20
-	  S21:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S22;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S22;
-		 end		    	       	       
-	    end // case: S21
-	  S22:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;
-		    NEXT_STATE <= S23;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S23;
-		 end		    	       	       
-	    end // case: S22
-	  S23:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S24;		    
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S24;
-		 end		    	       	       
-	    end // case: S23 
-	  S24:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S25;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S25;
-		 end		    	       	       
-	    end // case: S24
-	  S25:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S26;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S26;
-		 end		    	       	       
-	    end // case: S25
-	  S26:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S27;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S27;
-		 end		    	       	       
-	    end // case: S26
-	  S27:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S28;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S28;
-		 end		    	       	       
-	    end // case: S27
-	  S28:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S29;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S29;
-		 end		    	       	       
-	    end // case: S28
-	  S29:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S30;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S30;
-		 end		    	       	       
-	    end // case: S29
-	  S30:
-	    begin
-	       otfzero = 1'b0;
-     	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S31;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S31;
-		 end		    	       	       
-	    end // case: S30
-	  S31:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S32;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S32;
-		 end		    	       	       
-	    end // case: S31  
-	  S32:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S33;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S33;
-		 end		    	       	       
-	    end // case: S32
-	  S33:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S34;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S34;
-		 end		    	       	       
-	    end // case: S33
-	  S34:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S35;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S35;
-		 end		    	       	       
-	    end // case: S34  	  
-	  S35:
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       if (LT|EQ)
-		 begin
-		    en = 1'b1;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
-		    NEXT_STATE <= S36;
-		 end // if (LT|EQ)
-	       else
-		 begin
-		    en = 1'b0;
-		    state0 = 1'b0;
-		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S36;
-		 end		    	       	       
-	    end // case: S35	  
-	  S36:
-	    begin
-	       otfzero = 1'b1;
-	       divBusy = 1'b1;	       
-	       state0 = 1'b0;
-	       done = 1'b1;
-	       if (EQ)
-		 begin
-		    divdone = 1'b1;
-		    en = 1'b1;
-		 end
-	       else
-		 begin
-		    divdone = 1'b0;
-		    en = 1'b0;
-		 end
-	       NEXT_STATE <= S0;
-	    end // case: S36
-	  default: 
-	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
-	       en = 1'b0;
-	       state0 = 1'b0;
-	       done = 1'b0;
-	       divdone = 1'b0;
-	       NEXT_STATE <= S0;
-	    end
-	endcase // case(CURRENT_STATE)	
-     end // always @ (CURRENT_STATE or X)   
-
-endmodule // fsm64
-
-// 2-bit magnitude comparator
-// This module compares two 2-bit values A and B. LT is '1' if A < B 
-// and GT is '1'if A > B. LT and GT are both '0' if A = B.
-
-module magcompare2b (LT, GT, A, B);
-
-   input logic [1:0] A;
-   input logic [1:0] B;
-   
-   output logic      LT;
-   output logic      GT;
-   
-   // Determine if A < B  using a minimized sum-of-products expression
-   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-   // Determine if A > B  using a minimized sum-of-products expression
-   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-endmodule // magcompare2b
-
-// J. E. Stine and M. J. Schulte, "A combined two's complement and
-// floating-point comparator," 2005 IEEE International Symposium on
-// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
-// doi: 10.1109/ISCAS.2005.1464531
-
-module magcompare8 (LT, EQ, A, B);
-
-   input logic [7:0]  A;
-   input logic [7:0]  B;
-   
-   logic [3:0] 	      s;
-   logic [3:0] 	      t;
-   logic [1:0] 	      u;
-   logic [1:0] 	      v;
-   logic 	      GT;
-   //wire 	LT;   
-   
-   output logic       EQ;
-   output logic       LT;   
-   
-   magcompare2b mag1 (s[0], t[0], A[1:0], B[1:0]);
-   magcompare2b mag2 (s[1], t[1], A[3:2], B[3:2]);
-   magcompare2b mag3 (s[2], t[2], A[5:4], B[5:4]);
-   magcompare2b mag4 (s[3], t[3], A[7:6], B[7:6]);
-   
-   magcompare2b mag5 (u[0], v[0], t[1:0], s[1:0]);
-   magcompare2b mag6 (u[1], v[1], t[3:2], s[3:2]);
-
-   magcompare2b mag7 (LT, GT, v[1:0], u[1:0]);
-   
-   assign EQ = ~(GT | LT);   
-
-endmodule // magcompare8
-
-module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
-
-   input logic [63:0] Q;
-   input logic [63:0] rem;
-   input logic [63:0] op1;      
-   input logic 	      S;
-   input logic 	      div0;
-   input logic 	      Max_N;
-   input logic 	      D_NegOne;
-   
-   output logic [63:0] Qf;
-   output logic [63:0] remf;
-
-   // Needs to be optimized
-   always_comb
-     case ({div0, S, Max_N, D_NegOne})
-       4'b0000 : Qf = Q;
-       4'b0001 : Qf = Q;
-       4'b0010 : Qf = Q;              
-       4'b0011 : Qf = Q;              
-       4'b0100 : Qf = Q;
-       4'b0101 : Qf = Q;
-       4'b0110 : Qf = Q;       
-       4'b0111 : Qf = {1'b1, 31'h0};
-       4'b1000 : Qf = {64{1'b1}};
-       4'b1001 : Qf = {64{1'b1}};
-       4'b1010 : Qf = {64{1'b1}};
-       4'b1011 : Qf = {64{1'b1}};              
-       4'b1100 : Qf = {64{1'b1}};
-       4'b1101 : Qf = {64{1'b1}};       
-       4'b1110 : Qf = {64{1'b1}};       
-       4'b1111 : Qf = {64{1'b1}};              
-       default: Qf = Q;       
-     endcase 
-
-   always_comb
-     case ({div0, S, Max_N, D_NegOne})
-       4'b0000 : remf = rem;
-       4'b0001 : remf = rem;
-       4'b0010 : remf = rem;
-       4'b0011 : remf = rem;
-       4'b0100 : remf = rem;
-       4'b0101 : remf = rem;
-       4'b0110 : remf = rem;
-       4'b0111 : remf = 64'h0;     
-       4'b1000 : remf = op1;
-       4'b1001 : remf = op1;
-       4'b1010 : remf = op1;
-       4'b1011 : remf = op1;       
-       4'b1100 : remf = op1;
-       4'b1101 : remf = op1;
-       4'b1110 : remf = op1;       
-       4'b1111 : remf = op1;              
-       default: remf = rem;
-     endcase 
-
-endmodule // exception_int
-
-/* verilator lint_on COMBDLY */
-/* verilator lint_on IMPLICIT */
-

From 40cfa8693564e3f74a5f1a0cd437d0d0ff03d577 Mon Sep 17 00:00:00 2001
From: Kip Macsai-Goren <kipmacsaigoren@github.com>
Date: Tue, 1 Jun 2021 17:49:45 -0400
Subject: [PATCH 18/19] Edited and added constants to support SV48

---
 .../config/buildroot/wally-constants.vh       | 26 +++++++++++++------
 .../config/busybear/wally-constants.vh        | 26 +++++++++++++------
 .../config/coremark/wally-constants.vh        | 26 +++++++++++++------
 .../config/coremark_bare/wally-constants.vh   | 26 +++++++++++++------
 .../config/rv32ic/wally-constants.vh          | 12 ++++++++-
 .../config/rv64BP/wally-constants.vh          | 26 +++++++++++++------
 .../config/rv64ic/wally-constants.vh          | 26 +++++++++++++------
 .../config/rv64icfd/wally-constants.vh        | 26 +++++++++++++------
 .../config/rv64imc/wally-constants.vh         | 26 +++++++++++++------
 9 files changed, 155 insertions(+), 65 deletions(-)

diff --git a/wally-pipelined/config/buildroot/wally-constants.vh b/wally-pipelined/config/buildroot/wally-constants.vh
index 43d95863..cc6c27fc 100644
--- a/wally-pipelined/config/buildroot/wally-constants.vh
+++ b/wally-pipelined/config/buildroot/wally-constants.vh
@@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accomodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants nexessary for different memory virtualization modes.
+//              These are specific to sv49, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
diff --git a/wally-pipelined/config/busybear/wally-constants.vh b/wally-pipelined/config/busybear/wally-constants.vh
index 43d95863..cc6c27fc 100644
--- a/wally-pipelined/config/busybear/wally-constants.vh
+++ b/wally-pipelined/config/busybear/wally-constants.vh
@@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accomodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants nexessary for different memory virtualization modes.
+//              These are specific to sv49, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
diff --git a/wally-pipelined/config/coremark/wally-constants.vh b/wally-pipelined/config/coremark/wally-constants.vh
index 43d95863..cc6c27fc 100644
--- a/wally-pipelined/config/coremark/wally-constants.vh
+++ b/wally-pipelined/config/coremark/wally-constants.vh
@@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accomodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants nexessary for different memory virtualization modes.
+//              These are specific to sv49, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
diff --git a/wally-pipelined/config/coremark_bare/wally-constants.vh b/wally-pipelined/config/coremark_bare/wally-constants.vh
index 43d95863..cc6c27fc 100644
--- a/wally-pipelined/config/coremark_bare/wally-constants.vh
+++ b/wally-pipelined/config/coremark_bare/wally-constants.vh
@@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accomodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants nexessary for different memory virtualization modes.
+//              These are specific to sv49, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
diff --git a/wally-pipelined/config/rv32ic/wally-constants.vh b/wally-pipelined/config/rv32ic/wally-constants.vh
index ec4a48b4..f4c5ce9a 100644
--- a/wally-pipelined/config/rv32ic/wally-constants.vh
+++ b/wally-pipelined/config/rv32ic/wally-constants.vh
@@ -2,7 +2,10 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 31 May 2021
+//              added svmode constants. These aren't strictly necessary since we're just checking one bit,
+//              but they're here to stay consistent and to make sure we dont wind up
+//              a "NO_TRANSLATE undefined" situation.
 //
 // Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
 //          These macros should not be changed, except in the event of an
@@ -31,3 +34,10 @@
 `define PPN_BITS 22
 `define PPN_HIGH_SEGMENT_BITS 12
 `define PA_BITS  34
+`define SVMODE_BITS 1
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8 // These two are only here to stop
+`define SV48 9 // the verilator from yelling at me
diff --git a/wally-pipelined/config/rv64BP/wally-constants.vh b/wally-pipelined/config/rv64BP/wally-constants.vh
index 43d95863..cc6c27fc 100644
--- a/wally-pipelined/config/rv64BP/wally-constants.vh
+++ b/wally-pipelined/config/rv64BP/wally-constants.vh
@@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accomodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants nexessary for different memory virtualization modes.
+//              These are specific to sv49, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
diff --git a/wally-pipelined/config/rv64ic/wally-constants.vh b/wally-pipelined/config/rv64ic/wally-constants.vh
index 43d95863..cc6c27fc 100644
--- a/wally-pipelined/config/rv64ic/wally-constants.vh
+++ b/wally-pipelined/config/rv64ic/wally-constants.vh
@@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accomodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants nexessary for different memory virtualization modes.
+//              These are specific to sv49, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
diff --git a/wally-pipelined/config/rv64icfd/wally-constants.vh b/wally-pipelined/config/rv64icfd/wally-constants.vh
index 43d95863..cc6c27fc 100644
--- a/wally-pipelined/config/rv64icfd/wally-constants.vh
+++ b/wally-pipelined/config/rv64icfd/wally-constants.vh
@@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accomodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants nexessary for different memory virtualization modes.
+//              These are specific to sv49, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9
diff --git a/wally-pipelined/config/rv64imc/wally-constants.vh b/wally-pipelined/config/rv64imc/wally-constants.vh
index 43d95863..cc6c27fc 100644
--- a/wally-pipelined/config/rv64imc/wally-constants.vh
+++ b/wally-pipelined/config/rv64imc/wally-constants.vh
@@ -2,11 +2,14 @@
 // wally-constants.vh
 //
 // Written: tfleming@hmc.edu 4 March 2021
-// Modified:
+// Modified: Kmacsaigoren@hmc.edu 31 May 2021
+//              Added constants for checking sv mode and changed existing constants to accomodate
+//              both sv48 and sv39
 //
-// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
-//          These macros should not be changed, except in the event of an
-//          update to the architecture or particularly special circumstances.
+// Purpose: Specify constants nexessary for different memory virtualization modes.
+//              These are specific to sv49, defined in section 4.5 of the privileged spec.
+//              However, despite different constants for different modes, the hardware helps distinguish between
+//              each mode.
 //
 // A component of the Wally configurable RISC-V project.
 //
@@ -25,9 +28,16 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
-// Virtual Memory Constants (sv39)
+// Virtual Memory Constants (sv48)
 `define VPN_SEGMENT_BITS 9
-`define VPN_BITS 27
+`define VPN_BITS 36
+`define PPN_HIGH_SEGMENT_BITS 17
 `define PPN_BITS 44
-`define PPN_HIGH_SEGMENT_BITS 26
-`define PA_BITS  56
+`define PA_BITS 56
+`define SVMODE_BITS 4
+// constants to check SATP_MODE against
+// defined in Table 4.3 of the privileged spec
+`define NO_TRANSLATE 0
+`define SV32 1
+`define SV39 8
+`define SV48 9

From 5187574e8a45a3a993e88aafb0fda0b372c82e1f Mon Sep 17 00:00:00 2001
From: Kip Macsai-Goren <kipmacsaigoren@github.com>
Date: Tue, 1 Jun 2021 17:50:37 -0400
Subject: [PATCH 19/19] implemented Sv48.

---
 wally-pipelined/src/mmu/cam_line.sv          | 20 +++--
 wally-pipelined/src/mmu/page_number_mixer.sv | 87 +++++++++++++++-----
 wally-pipelined/src/mmu/pagetablewalker.sv   | 60 ++++++++++----
 wally-pipelined/src/mmu/tlb.sv               | 33 +++++---
 wally-pipelined/src/mmu/tlb_cam.sv           | 25 +++---
 5 files changed, 160 insertions(+), 65 deletions(-)

diff --git a/wally-pipelined/src/mmu/cam_line.sv b/wally-pipelined/src/mmu/cam_line.sv
index b7577573..6bab0b60 100644
--- a/wally-pipelined/src/mmu/cam_line.sv
+++ b/wally-pipelined/src/mmu/cam_line.sv
@@ -2,7 +2,9 @@
 // cam_line.sv
 //
 // Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//            Implemented SV48 on top of SV39. This included adding SvMode input signal and the wally constants
+//            Mostly this was done to make the PageNumberMixer work.
 //
 // Purpose: CAM line for the translation lookaside buffer (TLB)
 //          Determines whether a virtual address matches the stored key.
@@ -24,12 +26,17 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
+`include "wally-constants.vh"
+
 module cam_line #(parameter KEY_BITS = 20,
                   parameter HIGH_SEGMENT_BITS = 10) (
   input                 clk, reset,
 
+  // input to scheck which SvMode is running
+  input [`SVMODE_BITS-1:0] SvMode,
+  
   // The requested page number to compare against the key
-  input  [KEY_BITS-1:0] VirtualPageNumber,
+  input [KEY_BITS-1:0]  VirtualPageNumber,
 
   // Signals to write a new entry to this line
   input                 CAMLineWrite,
@@ -38,10 +45,11 @@ module cam_line #(parameter KEY_BITS = 20,
   // Flush this line (set valid to 0)
   input                 TLBFlush,
 
-  // This entry is a key for a giga, mega, or kilopage.
+  // This entry is a key for a tera, giga, mega, or kilopage.
   // PageType == 2'b00 --> kilopage
   // PageType == 2'b01 --> megapage
-  // PageType == 2'b11 --> gigapage
+  // PageType == 2'b10 --> gigapage
+  // PageType == 2'b11 --> terapage
   output [1:0]          PageType,  // *** should this be the stored version or the always updated one?
   output                Match
 );
@@ -67,9 +75,9 @@ module cam_line #(parameter KEY_BITS = 20,
   flopenr #(KEY_BITS) keyflop(clk, reset, CAMLineWrite, VirtualPageNumber, Key);
 
   // Calculate the actual query key based on the input key and the page type.
-  // For example, a megapage in sv39 only cares about VPN2 and VPN1, so VPN0
+  // For example, a megapage in SV39 only cares about VPN2 and VPN1, so VPN0
   // should automatically match.
-  page_number_mixer #(KEY_BITS, HIGH_SEGMENT_BITS) mixer(VirtualPageNumber, Key, PageType, VirtualPageNumberQuery);
+  page_number_mixer #(KEY_BITS, HIGH_SEGMENT_BITS) mixer(VirtualPageNumber, Key, PageType, SvMode, VirtualPageNumberQuery);
 
   assign Match = ({1'b1, VirtualPageNumberQuery} == {Valid, Key});
 
diff --git a/wally-pipelined/src/mmu/page_number_mixer.sv b/wally-pipelined/src/mmu/page_number_mixer.sv
index 57b8e4b7..03851018 100644
--- a/wally-pipelined/src/mmu/page_number_mixer.sv
+++ b/wally-pipelined/src/mmu/page_number_mixer.sv
@@ -2,7 +2,11 @@
 // page_number_mixer.sv
 //
 // Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//              Implemented SV48 on top of SV39. This included adding a 3rd Segment to each of the pagenumbers,
+//              Ensuring that the BITS and HIGH_SEGMENT_BITS inputs were correct everywhere this module gets instatniated,
+//              Adding seveeral muxes to decide the bit selection to turn pagenumbers into segments based on SV mode,
+//              Adding support for terapage/newgigapage encoding.
 //
 // Purpose: Takes two page numbers and replaces segments of the first page
 //          number with segments from the second, based on the page type.
@@ -25,22 +29,29 @@
 ///////////////////////////////////////////
 
 `include "wally-config.vh"
+`include "wally-constants.vh"
 
 module page_number_mixer #(parameter BITS = 20,
                            parameter HIGH_SEGMENT_BITS = 10) (
-    input  [BITS-1:0] PageNumber,
-    input  [BITS-1:0] MixPageNumber,
-    input  [1:0]      PageType,
-    output [BITS-1:0] PageNumberCombined
+    input  [BITS-1:0]         PageNumber,
+    input  [BITS-1:0]         MixPageNumber,
+    input  [1:0]              PageType,
+    input  [`SVMODE_BITS-1:0] SvMode,
+
+    output [BITS-1:0]         PageNumberCombined
 );
 
+  // The upper segment might have a different width than the lower segments.
+  // For example, an SV39 PTE has 26 bits for PPN2 and 9 bits for the other
+  // segments. This is outside the 'if XLEN' b/c the constant is already configured
+  // to the correct value for the XLEN in the relevant wally-constants.vh file.
+  localparam LOW_SEGMENT_BITS = `VPN_SEGMENT_BITS;
+  // *** each time this module is implemented, low segment bits is either
+  // `VPN_SEGMENT_BITS or `PPN_LOW_SEGMENT_BITS (if it existed)
+  // in every mode so far, these are the same, so it's left as it is above. 
+
   generate
-    // *** Just checking XLEN is not enough to support sv39 AND sv48.
     if (`XLEN == 32) begin
-      // The upper segment might have a different width than the lower segments.
-      // For example, an sv39 PTE has 26 bits for PPN2 and 9 bits for the other
-      // segments.
-      localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS);
 
       logic [HIGH_SEGMENT_BITS-1:0] Segment1, MixSegment1, Segment1Combined;
       logic [LOW_SEGMENT_BITS-1:0]  Segment0, MixSegment0, Segment0Combined;
@@ -58,28 +69,60 @@ module page_number_mixer #(parameter BITS = 20,
       // Reswizzle segments of the combined page number
       assign PageNumberCombined = {Segment1Combined, Segment0Combined};
     end else begin
-      // The upper segment might have a different width than the lower segments.
-      // For example, an sv39 PTE has 26 bits for PPN2 and 9 bits for the other
-      // segments.
-      localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS) / 2;
 
-      logic [HIGH_SEGMENT_BITS-1:0] Segment2, MixSegment2, Segment2Combined;
+      // After segment 0 and 1 of the page number, the width of each segment is dependant on the SvMode.
+      // For this reason, each segment bus is the width of its widest value across each mode
+      // when a smaller value needs to be loaded in to a wider bus, it's loaded in the least significant bits
+      // and left padded with zeros. MAKE SURE that if a value is being padded with zeros here,
+      // that it's padded with zeros everywhere else in the MMU ans beyond to avoid false misses in the TLB.
+      logic [HIGH_SEGMENT_BITS-1:0] Segment3, MixSegment3, Segment3Combined;
+      logic [HIGH_SEGMENT_BITS + LOW_SEGMENT_BITS-1:0]  Segment2, MixSegment2, Segment2Combined;
       logic [LOW_SEGMENT_BITS-1:0]  Segment1, MixSegment1, Segment1Combined;
       logic [LOW_SEGMENT_BITS-1:0]  Segment0, MixSegment0, Segment0Combined;
+      
 
       // Unswizzle segments of the input page number
-      assign {Segment2, Segment1, Segment0} = PageNumber;
-      assign {MixSegment2, MixSegment1, MixSegment0} = MixPageNumber;
+      // *** these muxes assume that only Sv48 and SV39 are implemented in rv64. for future SV57 and up,
+      //      there will have to be more muxes to select which value each segment gets.
+      //      as a cool reminder: BITS is the width of the page number, virt or phys, coming into this module
+      //      while high segment bits is the width of the highest segment of that page number.
+      //      Note for future work: this module has to work with both VPNs and PPNs and due to their differing 
+      //         widths and the fact that the ppn has one longer segment at the top makes the muxes below very confusing.
+      //      Potentially very annoying thing for future workers: the number of bits in a ppn is always 44 (for SV39 and48)
+      //         but in SV57 and above, this might be a new longer length. In that case these selectors will most likely
+      //         become even more complicated and confusing.
+      assign Segment3 = (SvMode == `SV48) ? 
+                        PageNumber[BITS-1:3*LOW_SEGMENT_BITS] : // take the top segment or not
+                        {HIGH_SEGMENT_BITS{1'b0}}; // for virtual page numbers in SV39, both options should be zeros.
+      assign Segment2 = (SvMode == `SV48) ? 
+                        {{HIGH_SEGMENT_BITS{1'b0}}, PageNumber[3*LOW_SEGMENT_BITS-1:2*LOW_SEGMENT_BITS]} : // just take another low segment left padded with zeros.
+                        PageNumber[BITS-1:2*LOW_SEGMENT_BITS]; // otherwise take the rest of the PageNumber
+      assign Segment1 = PageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS];
+      assign Segment0 = PageNumber[LOW_SEGMENT_BITS-1:0];
+
+
+      assign MixSegment3 = (SvMode == `SV48) ? 
+                        MixPageNumber[BITS-1:3*LOW_SEGMENT_BITS] : // take the top segment or not
+                        {HIGH_SEGMENT_BITS{1'b0}}; // for virtual page numbers in SV39, both options should be zeros.
+      assign MixSegment2 = (SvMode == `SV48) ? 
+                        {{HIGH_SEGMENT_BITS{1'b0}}, MixPageNumber[3*LOW_SEGMENT_BITS-1:2*LOW_SEGMENT_BITS]} : // just take another low segment left padded with zeros.
+                        MixPageNumber[BITS-1:2*LOW_SEGMENT_BITS]; // otherwise take the rest of the PageNumber
+      assign MixSegment1 = MixPageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS];
+      assign MixSegment0 = MixPageNumber[LOW_SEGMENT_BITS-1:0];
+
 
       // Pass through the high segment
-      assign Segment2Combined = Segment2;
+      assign Segment3Combined = Segment3;
 
-      // Either pass through or zero out segments 1 and 0 based on the page type
-      mux2 #(LOW_SEGMENT_BITS) segment1mux(Segment1, MixSegment1, PageType[1], Segment1Combined);
-      mux2 #(LOW_SEGMENT_BITS) segment0mux(Segment0, MixSegment0, PageType[0], Segment0Combined);
+      // Either pass through or zero out lower segments based on the page type
+      assign Segment2Combined = (PageType[1] && PageType[0]) ? MixSegment2 : Segment2; // terapage (page == 11)
+      assign Segment1Combined = (PageType[1]) ? MixSegment1 : Segment1; // gigapage and higher (page == 10 or 11)
+      assign Segment0Combined = (PageType[1] || PageType[0]) ? MixSegment0 : Segment0; // megapage and higher (page == 01 or 10 or 11)
 
       // Reswizzle segments of the combined page number
-      assign PageNumberCombined = {Segment2Combined, Segment1Combined, Segment0Combined};
+      assign PageNumberCombined = (SvMode == `SV48) ? 
+                                  {Segment3Combined, Segment2Combined[LOW_SEGMENT_BITS-1:0], Segment1Combined, Segment0Combined} :
+                                  {Segment2Combined, Segment1Combined, Segment0Combined};
     end
   endgenerate
 endmodule
diff --git a/wally-pipelined/src/mmu/pagetablewalker.sv b/wally-pipelined/src/mmu/pagetablewalker.sv
index f2aada44..b0e4fe8e 100644
--- a/wally-pipelined/src/mmu/pagetablewalker.sv
+++ b/wally-pipelined/src/mmu/pagetablewalker.sv
@@ -2,7 +2,10 @@
 // pagetablewalker.sv
 //
 // Written: tfleming@hmc.edu 2 March 2021
-// Modified: 
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//            implemented SV48 on top of SV39. This included, adding a level of the FSM for the extra page number segment
+//            adding support for terapage encoding, and for setting the TranslationPAdr using the new level,
+//            adding the internal SvMode signal
 //
 // Purpose: Page Table Walker
 //          Part of the Memory Management Unit (MMU)
@@ -70,6 +73,7 @@ module pagetablewalker (
   logic [`XLEN-1:0]     SavedPTE, CurrentPTE;
   logic [`PA_BITS-1:0]  TranslationPAdr;
   logic [`PPN_BITS-1:0] CurrentPPN;
+  logic [`SVMODE_BITS-1:0]  SvMode;
   logic                 MemStore;
 
   // PTE Control Bits
@@ -82,6 +86,8 @@ module pagetablewalker (
   logic [`XLEN-1:0] PageTableEntry;
   logic [1:0] PageType;
 
+  assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS];
+
   assign BasePageTablePPN = SATP_REGW[`PPN_BITS-1:0];
 
   assign MemStore = MemRWM[0];
@@ -105,11 +111,12 @@ module pagetablewalker (
   assign PageTypeF = PageType;
   assign PageTypeM = PageType;
 
-  localparam IDLE = 3'h0;
+  localparam LEVEL0 = 3'h0;
   localparam LEVEL1 = 3'h1;
-  localparam LEVEL0 = 3'h2;
-  localparam LEAF = 3'h3;
-  localparam FAULT = 3'h4;
+  // space left for more levels
+  localparam LEAF = 3'h5;
+  localparam IDLE = 3'h6;
+  localparam FAULT = 3'h7;
 
   logic [2:0] WalkerState, NextWalkerState;
 
@@ -208,18 +215,32 @@ module pagetablewalker (
       assign MMUPAdr = TranslationPAdr[31:0];
 
     end else begin
-      localparam LEVEL2 = 3'h5;
+      localparam LEVEL2 = 3'h2;
+      localparam LEVEL3 = 3'h3;
 
-      logic [8:0] VPN2, VPN1, VPN0;
+      logic [8:0] VPN3, VPN2, VPN1, VPN0;
 
-      logic GigapageMisaligned, BadGigapage;
+      logic TerapageMisaligned, GigapageMisaligned, BadTerapage, BadGigapage;
 
       flopenl #(3) mmureg(HCLK, ~HRESETn, 1'b1, NextWalkerState, IDLE, WalkerState);
 
       always_comb begin
         case (WalkerState)
-          IDLE:   if      (MMUTranslate)           NextWalkerState = LEVEL2;
+          IDLE:   if      (MMUTranslate)           NextWalkerState = LEVEL3;
                   else                             NextWalkerState = IDLE;
+          LEVEL3: if      (SvMode != `SV48)         NextWalkerState = LEVEL2;
+                  // 3rd level used if SV48 is enabled.
+                  else begin
+                    if      (~MMUReady)              NextWalkerState = LEVEL3;
+                    // *** <FUTURE WORK> According to the architecture, we should
+                    // fault upon finding a superpage that is misaligned or has 0
+                    // access bit. The following commented line of code is
+                    // supposed to perform that check. However, it is untested.
+                    else if (ValidPTE && LeafPTE && ~BadTerapage) NextWalkerState = LEAF;
+                    // else if (ValidPTE && LeafPTE)    NextWalkerState = LEAF;  // *** Once the above line is properly tested, delete this line.
+                    else if (ValidPTE && ~LeafPTE)   NextWalkerState = LEVEL2;
+                    else                             NextWalkerState = FAULT;
+                  end
           LEVEL2: if      (~MMUReady)              NextWalkerState = LEVEL2;
                   // *** <FUTURE WORK> According to the architecture, we should
                   // fault upon finding a superpage that is misaligned or has 0
@@ -242,24 +263,29 @@ module pagetablewalker (
                   else if (ValidPTE && LeafPTE && ~AccessAlert)
                                                    NextWalkerState = LEAF;
                   else                             NextWalkerState = FAULT;
-          LEAF:   if      (MMUTranslate)           NextWalkerState = LEVEL2;
+          LEAF:   if      (MMUTranslate)           NextWalkerState = LEVEL3;
                   else                             NextWalkerState = IDLE;
-          FAULT:  if      (MMUTranslate)           NextWalkerState = LEVEL2;
+          FAULT:  if      (MMUTranslate)           NextWalkerState = LEVEL3;
                   else                             NextWalkerState = IDLE;
           // Default case should never happen, but is included for linter.
           default:                                 NextWalkerState = IDLE;
         endcase
       end
 
+      // A terapage is a level 3 leaf page. This page must have zero PPN[2],
+      // zero PPN[1], and zero PPN[0]
+      assign TerapageMisaligned = |(CurrentPPN[26:0]);
       // A gigapage is a Level 2 leaf page. This page must have zero PPN[1] and
       // zero PPN[0]
       assign GigapageMisaligned = |(CurrentPPN[17:0]);
       // A megapage is a Level 1 leaf page. This page must have zero PPN[0].
       assign MegapageMisaligned = |(CurrentPPN[8:0]);
 
+      assign BadTerapage = TerapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme
       assign BadGigapage = GigapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme
       assign BadMegapage = MegapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme
 
+      assign VPN3 = TranslationVAdr[47:39];
       assign VPN2 = TranslationVAdr[38:30];
       assign VPN1 = TranslationVAdr[29:21];
       assign VPN0 = TranslationVAdr[20:12];
@@ -282,8 +308,13 @@ module pagetablewalker (
           IDLE: begin
             MMUStall = '0;
           end
+          LEVEL3: begin
+            TranslationPAdr = {BasePageTablePPN, VPN3, 3'b000};
+            // *** this is a huge breaking point. if we're going through level3 every time, even when sv48 is off,
+            // what should translationPAdr be when level3 is just off?
+          end
           LEVEL2: begin
-            TranslationPAdr = {BasePageTablePPN, VPN2, 3'b000};
+            TranslationPAdr = {(SvMode == `SV48) ? CurrentPPN : BasePageTablePPN, VPN2, 3'b000};
           end
           LEVEL1: begin
             TranslationPAdr = {CurrentPPN, VPN1, 3'b000};
@@ -295,8 +326,9 @@ module pagetablewalker (
             // Keep physical address alive to prevent HADDR dropping to 0
             TranslationPAdr = {CurrentPPN, VPN0, 3'b000};
             PageTableEntry = CurrentPTE;
-            PageType = (WalkerState == LEVEL2) ? 2'b11 : 
-                                ((WalkerState == LEVEL1) ? 2'b01 : 2'b00);
+            PageType = (WalkerState == LEVEL3) ? 2'b11 :
+                                ((WalkerState == LEVEL2) ? 2'b10 : 
+                                ((WalkerState == LEVEL1) ? 2'b01 : 2'b00));
             DTLBWriteM = DTLBMissM;
             ITLBWriteF = ~DTLBMissM;  // Prefer data over instructions
           end
diff --git a/wally-pipelined/src/mmu/tlb.sv b/wally-pipelined/src/mmu/tlb.sv
index 7ed594e4..1828c98e 100644
--- a/wally-pipelined/src/mmu/tlb.sv
+++ b/wally-pipelined/src/mmu/tlb.sv
@@ -2,7 +2,9 @@
 // tlb.sv
 //
 // Written: jtorrey@hmc.edu 16 February 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//            Implemented SV48 on top of SV39. This included adding the SvMode signal,
+//            and using it to decide the translate signal and get the virtual page number
 //
 // Purpose: Translation lookaside buffer
 //          Cache of virtural-to-physical address translations
@@ -25,7 +27,7 @@
 ///////////////////////////////////////////
 
 /**
- * sv32 specs
+ * SV32 specs
  * ----------
  * Virtual address [31:0] (32 bits)
  *    [________________________________]
@@ -85,14 +87,11 @@ module tlb #(parameter ENTRY_BITS = 3,
   output             TLBPageFault
 );
 
-  logic SvMode;
   logic Translate;
   logic TLBAccess, ReadAccess, WriteAccess;
 
-  // *** If we want to support multiple virtual memory modes (ie sv39 AND sv48),
-  // we could have some muxes that control which parameters are current.
-  // Although then some of the signals are not big enough. But that's a problem
-  // for much later.
+  // Store current virtual memory mode (SV32, SV39, SV48, ect...)
+  logic [`SVMODE_BITS-1:0] SvMode;
 
   // Index (currently random) to write the next TLB entry
   logic [ENTRY_BITS-1:0] WriteIndex;
@@ -116,17 +115,24 @@ module tlb #(parameter ENTRY_BITS = 3,
   // Whether the virtual address has a match in the CAM
   logic                  CAMHit;
 
-  // Grab the sv bit from SATP
+  // Grab the sv mode from SATP
+  assign SvMode = SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS];
+
+  // The bus width is always the largest it could be for that XLEN. For example, vpn will be 36 bits wide in rv64
+  // this, even though it could be 27 bits (SV39) or 36 bits (SV48) wide. When the value of VPN is narrower,
+  // is shorter, the extra bits are used as padded zeros on the left of the full value.
   generate
     if (`XLEN == 32) begin
-      assign SvMode = SATP_REGW[31];  // *** change to an enum somehow?
+      assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12];
     end else begin
-      assign SvMode = SATP_REGW[63]; // currently just a boolean whether translation enabled
+      assign VirtualPageNumber = (SvMode == `SV48) ?
+                                 VirtualAddress[`VPN_BITS+11:12] :
+                                 {{`VPN_SEGMENT_BITS{1'b0}}, VirtualAddress[3*`VPN_SEGMENT_BITS+11:12]};
     end
   endgenerate
 
   // Whether translation should occur
-  assign Translate = SvMode & (PrivilegeModeW != `M_MODE);
+  assign Translate = (SvMode != `NO_TRANSLATE) & (PrivilegeModeW != `M_MODE);
 
   // Determine how the TLB is currently being used
   // Note that we use ReadAccess for both loads and instruction fetches
@@ -134,7 +140,7 @@ module tlb #(parameter ENTRY_BITS = 3,
   assign WriteAccess = TLBAccessType[0];
   assign TLBAccess = ReadAccess || WriteAccess;
 
-  assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12];
+  
   assign PageOffset        = VirtualAddress[11:0];
 
   // TLB entries are evicted according to the LRU algorithm
@@ -188,9 +194,10 @@ module tlb #(parameter ENTRY_BITS = 3,
   // page number. For 4 KB pages, the entire virtual page number is replaced.
   // For superpages, some segments are considered offsets into a larger page.
   page_number_mixer #(`PPN_BITS, `PPN_HIGH_SEGMENT_BITS)
-    physical_mixer(PhysicalPageNumber,
+    physical_mixer(PhysicalPageNumber, 
       {{EXTRA_PHYSICAL_BITS{1'b0}}, VirtualPageNumber},
       HitPageType,
+      SvMode,
       PhysicalPageNumberMixed);
 
   // Provide physical address only on TLBHits to cause catastrophic errors if
diff --git a/wally-pipelined/src/mmu/tlb_cam.sv b/wally-pipelined/src/mmu/tlb_cam.sv
index 330bb382..78d9ff8d 100644
--- a/wally-pipelined/src/mmu/tlb_cam.sv
+++ b/wally-pipelined/src/mmu/tlb_cam.sv
@@ -2,7 +2,9 @@
 // tlb_cam.sv
 //
 // Written: jtorrey@hmc.edu 16 February 2021
-// Modified:
+// Modified: kmacsaigoren@hmc.edu 1 June 2021
+//            Implemented SV48 on top of SV39. This included adding the SvMode signal input and wally constants
+//            Mostly this was to make the cam_lines work.
 //
 // Purpose: Stores virtual page numbers with cached translations.
 //          Determines whether a given virtual page number is in the TLB.
@@ -24,18 +26,21 @@
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 
+`include "wally-constants.vh"
+
 module tlb_cam #(parameter ENTRY_BITS = 3,
                  parameter KEY_BITS   = 20,
                  parameter HIGH_SEGMENT_BITS = 10) (
-  input                    clk, reset,
-  input  [KEY_BITS-1:0]    VirtualPageNumber,
-  input  [1:0]             PageTypeWrite,
-  input  [ENTRY_BITS-1:0]  WriteIndex,
-  input                    TLBWrite,
-  input                    TLBFlush,
-  output [ENTRY_BITS-1:0]  VPNIndex,
-  output [1:0]             HitPageType,
-  output                   CAMHit
+  input                     clk, reset,
+  input  [KEY_BITS-1:0]     VirtualPageNumber,
+  input  [1:0]              PageTypeWrite,
+  input  [ENTRY_BITS-1:0]   WriteIndex,
+  input  [`SVMODE_BITS-1:0] SvMode,
+  input                     TLBWrite,
+  input                     TLBFlush,
+  output [ENTRY_BITS-1:0]   VPNIndex,
+  output [1:0]              HitPageType,
+  output                    CAMHit
 );
 
   localparam NENTRIES = 2**ENTRY_BITS;