From 60cfa0d69cdeadaae7151b1bb57d7e7cd5191c27 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 4 Nov 2022 15:21:09 -0700
Subject: [PATCH 01/10] HPTW cleanup

---
 pipelined/src/mmu/hptw.sv       | 118 +++++++++++++-------------------
 pipelined/src/mmu/tlb.sv        |  20 +++---
 pipelined/src/mmu/tlbcontrol.sv |  14 +---
 pipelined/src/mmu/vm64check.sv  |  50 ++++++++++++++
 4 files changed, 110 insertions(+), 92 deletions(-)
 create mode 100644 pipelined/src/mmu/vm64check.sv

diff --git a/pipelined/src/mmu/hptw.sv b/pipelined/src/mmu/hptw.sv
index 7b303ff43..e2b2573ed 100644
--- a/pipelined/src/mmu/hptw.sv
+++ b/pipelined/src/mmu/hptw.sv
@@ -42,7 +42,7 @@ module hptw
    input logic [1:0]           STATUS_MPP,
    input logic [1:0]           PrivilegeModeW,
    (* mark_debug = "true" *) input logic ITLBMissOrDAFaultNoTrapF, DTLBMissOrDAFaultNoTrapM, // TLB Miss
-   input logic [`XLEN-1:0]     HPTWReadPTE, // page table entry from LSU
+   input logic [`XLEN-1:0]     HPTWReadPTE, // page table entry from LSU  *** change to ReadDataM
    input logic                 DCacheStallM, // stall from LSU
    output logic [`XLEN-1:0]    PTE, // page table entry to TLBs
    output logic [1:0]          PageType, // page type to TLBs
@@ -106,7 +106,6 @@ module hptw
 
   if(`HPTW_WRITES_SUPPORTED) begin : hptwwrites
 
-    logic                     SV39Mode;
     logic                     ReadAccess, WriteAccess;
     logic                     InvalidRead, InvalidWrite;
     logic                     UpperBitsUnequalPageFault; 
@@ -136,19 +135,9 @@ module hptw
     assign ImproperPrivilege = ((EffectivePrivilegeMode == `U_MODE) & ~PTE_U) |
                                ((EffectivePrivilegeMode == `S_MODE) & PTE_U & (~STATUS_SUM & DTLBWalk));
 
-    // *** turn into module common with code in tlbcontrol.
-    if (`XLEN==64) begin:rv64
-      assign SV39Mode = (SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS] == `SV39);
-      // page fault if upper bits aren't all the same
-      logic UpperEqual39, UpperEqual48;
-      assign UpperEqual39 = &(TranslationVAdr[63:38]) | ~|(TranslationVAdr[63:38]);
-      assign UpperEqual48 = &(TranslationVAdr[63:47]) | ~|(TranslationVAdr[63:47]); 
-      assign UpperBitsUnequalPageFault = SV39Mode ? ~UpperEqual39 : ~UpperEqual48;
-    end else begin
-      assign SV39Mode = 0;
-      assign UpperBitsUnequalPageFault = 0;
-    end           
-
+    // Check for page faults
+	vm64check vm64check(.SATP_MODE(SATP_REGW[`XLEN-1:`XLEN-`SVMODE_BITS]), .VAdr(TranslationVAdr), 
+	                               .SV39Mode(), .UpperBitsUnequalPageFault);
     assign InvalidRead = ReadAccess & ~Readable & (~STATUS_MXR | ~Executable);
     assign InvalidWrite = WriteAccess & ~Writable;
     assign OtherPageFault = DTLBWalk? ImproperPrivilege | InvalidRead | InvalidWrite | UpperBitsUnequalPageFault | Misaligned | ~Valid :
@@ -190,26 +179,26 @@ module hptw
 
 	// HPTWAdr muxing
 	if (`XLEN==32) begin // RV32
-	logic [9:0] VPN;
-	logic [`PPN_BITS-1:0] PPN;
-	assign VPN = ((WalkerState == L1_ADR) | (WalkerState == L1_RD)) ? TranslationVAdr[31:22] : TranslationVAdr[21:12]; // select VPN field based on HPTW state
-	assign PPN = ((WalkerState == L1_ADR) | (WalkerState == L1_RD)) ? BasePageTablePPN : CurrentPPN; 
-	assign HPTWReadAdr = {PPN, VPN, 2'b00};
-	assign HPTWSize = 3'b010;
+		logic [9:0] VPN;
+		logic [`PPN_BITS-1:0] PPN;
+		assign VPN = ((WalkerState == L1_ADR) | (WalkerState == L1_RD)) ? TranslationVAdr[31:22] : TranslationVAdr[21:12]; // select VPN field based on HPTW state
+		assign PPN = ((WalkerState == L1_ADR) | (WalkerState == L1_RD)) ? BasePageTablePPN : CurrentPPN; 
+		assign HPTWReadAdr = {PPN, VPN, 2'b00};
+		assign HPTWSize = 3'b010;
 	end else begin // RV64
-	logic [8:0] VPN;
-	logic [`PPN_BITS-1:0] PPN;
-	always_comb
-		case (WalkerState) // select VPN field based on HPTW state
-			L3_ADR, L3_RD:  			VPN = TranslationVAdr[47:39];
-			L2_ADR, L2_RD:    VPN = TranslationVAdr[38:30];
-			L1_ADR, L1_RD: 	VPN = TranslationVAdr[29:21];
-			default:		 						VPN = TranslationVAdr[20:12];
-		endcase
-	assign PPN = ((WalkerState == L3_ADR) | (WalkerState == L3_RD) | 
-					(SvMode != `SV48 & ((WalkerState == L2_ADR) | (WalkerState == L2_RD)))) ? BasePageTablePPN : CurrentPPN;
-	assign HPTWReadAdr = {PPN, VPN, 3'b000};
-	assign HPTWSize = 3'b011;
+		logic [8:0] VPN;
+		logic [`PPN_BITS-1:0] PPN;
+		always_comb
+			case (WalkerState) // select VPN field based on HPTW state
+				L3_ADR, L3_RD:  VPN = TranslationVAdr[47:39];
+				L2_ADR, L2_RD:  VPN = TranslationVAdr[38:30];
+				L1_ADR, L1_RD: 	VPN = TranslationVAdr[29:21];
+				default:		VPN = TranslationVAdr[20:12];
+			endcase
+		assign PPN = ((WalkerState == L3_ADR) | (WalkerState == L3_RD) | 
+						(SvMode != `SV48 & ((WalkerState == L2_ADR) | (WalkerState == L2_RD)))) ? BasePageTablePPN : CurrentPPN;
+		assign HPTWReadAdr = {PPN, VPN, 3'b000};
+		assign HPTWSize = 3'b011;
 	end
 
 	// Initial state and misalignment for RV32/64
@@ -228,44 +217,33 @@ module hptw
 	end
 
 	// Page Table Walker FSM
-	// If the setup time on the D$ RAM is short, it should be possible to merge the LEVELx_READ and LEVELx states
-	// to decrease the latency of the HPTW.  However, if the D$ is a cycle limiter, it's better to leave the
-	// HPTW as shown below to keep the D$ setup time out of the critical path.
-	// *** Is this really true.  Talk with Ross.  Seems like it's the next state logic on critical path instead.
-	// *** address TYPE(statetype)
 	flopenl #(.TYPE(statetype)) WalkerStateReg(clk, reset, 1'b1, NextWalkerState, IDLE, WalkerState); 
 	always_comb 
-	case (WalkerState)
-	IDLE: if (TLBMiss)	 		NextWalkerState = InitialWalkerState;
-		  else 					NextWalkerState = IDLE;
-	L3_ADR:                     NextWalkerState = L3_RD; // first access in SV48
-	L3_RD: if (DCacheStallM)    NextWalkerState = L3_RD;
-           else     			NextWalkerState = L2_ADR;
-	L2_ADR: if (InitialWalkerState == L2_ADR)    NextWalkerState = L2_RD; // first access in SV39
-			else if (ValidLeafPTE & ~Misaligned) NextWalkerState = LEAF; // could shortcut this by a cyle for all Lx_ADR superpages
-			else if (ValidNonLeafPTE)            NextWalkerState = L2_RD;
-			else 				                 NextWalkerState = LEAF;
-	L2_RD: if (DCacheStallM)                     NextWalkerState = L2_RD;
-           else                                  NextWalkerState = L1_ADR;
-	L1_ADR: if (InitialWalkerState == L1_ADR)    NextWalkerState = L1_RD; // first access in SV32
-			else if (ValidLeafPTE & ~Misaligned) NextWalkerState = LEAF; // could shortcut this by a cyle for all Lx_ADR superpages
-			else if (ValidNonLeafPTE)            NextWalkerState = L1_RD;
-			else 				                 NextWalkerState = LEAF;	
-	L1_RD: if (DCacheStallM)                     NextWalkerState = L1_RD;
-           else                                  NextWalkerState = L0_ADR;
-	L0_ADR: if (ValidLeafPTE & ~Misaligned)      NextWalkerState = LEAF; // could shortcut this by a cyle for all Lx_ADR superpages
-			else if (ValidNonLeafPTE)            NextWalkerState = L0_RD;
-			else                                 NextWalkerState = LEAF;
-	L0_RD: if (DCacheStallM)                     NextWalkerState = L0_RD;
-           else                                  NextWalkerState = LEAF;
-    LEAF: if (DAPageFault) NextWalkerState = UPDATE_PTE;
-          else NextWalkerState = IDLE;
-     UPDATE_PTE: if(`HPTW_WRITES_SUPPORTED & DCacheStallM) NextWalkerState = UPDATE_PTE;
-                else NextWalkerState = LEAF;
-	default: begin
-		NextWalkerState = IDLE; // should never be reached
-	end
-	endcase // case (WalkerState)
+		case (WalkerState) // next-state decode for the page table walker FSM
+			IDLE:   if (TLBMiss)      NextWalkerState = InitialWalkerState; // a TLB miss starts a walk at InitialWalkerState
+			        else              NextWalkerState = IDLE;
+			L3_ADR:                   NextWalkerState = L3_RD; // first access in SV48
+			L3_RD:  if (DCacheStallM) NextWalkerState = L3_RD; // stay here while the D$ stalls
+			        else              NextWalkerState = L2_ADR;
+			L2_ADR: if (InitialWalkerState == L2_ADR | ValidNonLeafPTE) NextWalkerState = L2_RD; // first access in SV39
+			        else              NextWalkerState = LEAF; // PTE is not a valid non-leaf entry: stop descending
+			L2_RD:  if (DCacheStallM) NextWalkerState = L2_RD; // stay here while the D$ stalls
+			        else              NextWalkerState = L1_ADR;
+			L1_ADR: if (InitialWalkerState == L1_ADR | ValidNonLeafPTE) NextWalkerState = L1_RD; // first access in SV32
+			        // ValidNonLeafPTE is already covered by the condition above, so no separate else-if arm is needed
+			        else              NextWalkerState = LEAF; // PTE is not a valid non-leaf entry: stop descending
+			L1_RD:  if (DCacheStallM) NextWalkerState = L1_RD; // stay here while the D$ stalls
+			        else              NextWalkerState = L0_ADR;
+			L0_ADR: if (ValidNonLeafPTE) NextWalkerState = L0_RD;
+			        else              NextWalkerState = LEAF; // PTE is not a valid non-leaf entry: stop descending
+			L0_RD:  if (DCacheStallM) NextWalkerState = L0_RD; // stay here while the D$ stalls
+			        else              NextWalkerState = LEAF;
+			LEAF:   if (DAPageFault)  NextWalkerState = UPDATE_PTE; // DAPageFault sends the walker to UPDATE_PTE
+			        else              NextWalkerState = IDLE;
+			UPDATE_PTE: if(`HPTW_WRITES_SUPPORTED & DCacheStallM) NextWalkerState = UPDATE_PTE; // stay here while the D$ stalls
+			        else              NextWalkerState = LEAF; // return to LEAF, where DAPageFault is evaluated again
+			default:                  NextWalkerState = IDLE; // should never be reached
+		endcase // case (WalkerState)
 
   assign IgnoreRequestTLB = WalkerState == IDLE & TLBMiss;
   assign SelHPTW = WalkerState != IDLE;
diff --git a/pipelined/src/mmu/tlb.sv b/pipelined/src/mmu/tlb.sv
index 6954e1d95..2f4fd5560 100644
--- a/pipelined/src/mmu/tlb.sv
+++ b/pipelined/src/mmu/tlb.sv
@@ -116,16 +116,16 @@ module tlb #(parameter TLB_ENTRIES = 8,
   // we cache Misaligned along with the PTE?  This only has to be computed once
   // in the hptw as it is always the same regardless of the VPN.
   if(`XLEN == 32) begin
-	assign MegapageMisaligned = |(PPN[9:0]); // must have zero PPN0
-	assign Misaligned = (HitPageType == 2'b01) & MegapageMisaligned;
+    assign MegapageMisaligned = |(PPN[9:0]); // must have zero PPN0
+    assign Misaligned = (HitPageType == 2'b01) & MegapageMisaligned;
   end else begin
-	logic 				 GigapageMisaligned, TerapageMisaligned;
-	assign TerapageMisaligned = |(PPN[26:0]); // must have zero PPN2, PPN1, PPN0
-	assign GigapageMisaligned = |(PPN[17:0]); // must have zero PPN1 and PPN0
-	assign MegapageMisaligned = |(PPN[8:0]); // must have zero PPN0		  
-	assign Misaligned = ((HitPageType == 2'b11) & TerapageMisaligned) | 
-						((HitPageType == 2'b10) & GigapageMisaligned) | 
-						((HitPageType == 2'b01) & MegapageMisaligned);
+    logic 				 GigapageMisaligned, TerapageMisaligned;
+    assign TerapageMisaligned = |(PPN[26:0]); // must have zero PPN2, PPN1, PPN0
+    assign GigapageMisaligned = |(PPN[17:0]); // must have zero PPN1 and PPN0
+    assign MegapageMisaligned = |(PPN[8:0]); // must have zero PPN0		  
+    assign Misaligned = ((HitPageType == 2'b11) & TerapageMisaligned) | 
+              ((HitPageType == 2'b10) & GigapageMisaligned) | 
+              ((HitPageType == 2'b01) & MegapageMisaligned);
   end
 
   assign VPN = VAdr[`VPN_BITS+11:12];
@@ -137,7 +137,7 @@ module tlb #(parameter TLB_ENTRIES = 8,
 
   tlblru #(TLB_ENTRIES) lru(.clk, .reset, .TLBWrite, .TLBFlush, .Matches, .CAMHit, .WriteEnables);
   tlbcam #(TLB_ENTRIES, `VPN_BITS + `ASID_BITS, `VPN_SEGMENT_BITS) 
-    tlbcam(.clk, .reset, .VPN, .PageTypeWriteVal, .SV39Mode, .TLBFlush, .WriteEnables, .PTE_Gs, 
+  tlbcam(.clk, .reset, .VPN, .PageTypeWriteVal, .SV39Mode, .TLBFlush, .WriteEnables, .PTE_Gs, 
            .SATP_ASID, .Matches, .HitPageType, .CAMHit);
   tlbram #(TLB_ENTRIES) tlbram(.clk, .reset, .PTE, .Matches, .WriteEnables, .PPN, .PTEAccessBits, .PTE_Gs);
 
diff --git a/pipelined/src/mmu/tlbcontrol.sv b/pipelined/src/mmu/tlbcontrol.sv
index 5a9e4852d..8b3da2f35 100644
--- a/pipelined/src/mmu/tlbcontrol.sv
+++ b/pipelined/src/mmu/tlbcontrol.sv
@@ -68,22 +68,12 @@ module tlbcontrol #(parameter ITLB = 0) (
   // Grab the sv mode from SATP and determine whether translation should occur
   assign EffectivePrivilegeMode = (ITLB == 1) ? PrivilegeModeW : (STATUS_MPRV ? STATUS_MPP : PrivilegeModeW); // DTLB uses MPP mode when MPRV is 1
   assign Translate = (SATP_MODE != `NO_TRANSLATE) & (EffectivePrivilegeMode != `M_MODE) & ~DisableTranslation; 
-  if (`XLEN==64) begin:rv64
-      assign SV39Mode = (SATP_MODE == `SV39);
-      // page fault if upper bits aren't all the same
-      logic UpperEqual39, UpperEqual48;
-      assign UpperEqual39 = &(VAdr[63:38]) | ~|(VAdr[63:38]);
-      assign UpperEqual48 = &(VAdr[63:47]) | ~|(VAdr[63:47]); 
-      assign UpperBitsUnequalPageFault = SV39Mode ? ~UpperEqual39 : ~UpperEqual48;
-  end else begin
-      assign SV39Mode = 0;
-      assign UpperBitsUnequalPageFault = 0;
-  end           
 
   // Determine whether TLB is being used
   assign TLBAccess = ReadAccess | WriteAccess;
 
   // Check whether upper bits of virtual addresss are all equal
+  vm64check vm64check(.SATP_MODE, .VAdr, .SV39Mode, .UpperBitsUnequalPageFault);
 
   // unswizzle useful PTE bits
   assign {PTE_D, PTE_A} = PTEAccessBits[7:6];
@@ -99,7 +89,7 @@ module tlbcontrol #(parameter ITLB = 0) (
       assign DAPageFault = Translate & TLBHit & ~PTE_A & ~TLBPageFault;
       assign TLBPageFault = (Translate  & TLBHit & (ImproperPrivilege | ~PTE_X | UpperBitsUnequalPageFault | Misaligned | ~PTE_V));
     end else begin
-    // fault for software handling if access bit is off
+      // fault for software handling if access bit is off
       assign DAPageFault = ~PTE_A;
       assign TLBPageFault = (Translate  & TLBHit & (ImproperPrivilege | ~PTE_X | DAPageFault | UpperBitsUnequalPageFault | Misaligned | ~PTE_V));
     end
diff --git a/pipelined/src/mmu/vm64check.sv b/pipelined/src/mmu/vm64check.sv
new file mode 100644
index 000000000..cedeb5267
--- /dev/null
+++ b/pipelined/src/mmu/vm64check.sv
@@ -0,0 +1,50 @@
+///////////////////////////////////////////
+// vm64check.sv
+//
+// Written: David_Harris@hmc.edu 4 November 2022
+// Modified: 
+//
+// Purpose: Check for good upper address bits in RV64 mode
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module vm64check ( // flags virtual addresses whose upper bits are not all identical (XLEN==64 only)
+   input  logic [`SVMODE_BITS-1:0] SATP_MODE, // mode field, compared against `SV39 below
+   input  logic [`XLEN-1:0]        VAdr,      // virtual address to check
+   output logic                    SV39Mode, UpperBitsUnequalPageFault // SATP_MODE is `SV39; upper-bit check failed
+);
+  // Both outputs are tied to 0 unless XLEN is 64; the fault output is not qualified here by whether translation is enabled.
+  if (`XLEN==64) begin:rv64
+      assign SV39Mode = (SATP_MODE == `SV39);
+      // page fault if upper bits aren't all the same (all ones or all zeros)
+      logic UpperEqual39, UpperEqual48;
+      assign UpperEqual39 = &(VAdr[63:38]) | ~|(VAdr[63:38]); // bits 63:38 are all 1 or all 0
+      assign UpperEqual48 = &(VAdr[63:47]) | ~|(VAdr[63:47]); // bits 63:47 are all 1 or all 0
+      assign UpperBitsUnequalPageFault = SV39Mode ? ~UpperEqual39 : ~UpperEqual48; // any mode other than SV39 uses the 63:47 check
+  end else begin
+      assign SV39Mode = 0;
+      assign UpperBitsUnequalPageFault = 0; // no upper-bit check when XLEN is not 64
+  end           
+endmodule

From 53a88fec8f104537cc236c49ef35b43be8a8e4ff Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 4 Nov 2022 15:21:51 -0700
Subject: [PATCH 02/10] Reorder embench tests to prevent crash

---
 pipelined/testbench/tests.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelined/testbench/tests.vh b/pipelined/testbench/tests.vh
index 91d3dcf12..633ecb81d 100644
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
@@ -55,9 +55,9 @@ string tvpaths[] = '{
     "bd_speedopt_speed/src/matmult-int/matmult-int",
     // "bd_speedopt_speed/src/md5sum/md5sum", //commenting out tests from embench 2.0. When embench 2.0 launches stabilty, add these tests back
     "bd_speedopt_speed/src/minver/minver",
-    "bd_speedopt_speed/src/nbody/nbody",
     "bd_speedopt_speed/src/nettle-aes/nettle-aes",
     "bd_speedopt_speed/src/nettle-sha256/nettle-sha256",
+    "bd_speedopt_speed/src/nbody/nbody",
     "bd_speedopt_speed/src/nsichneu/nsichneu",
     "bd_speedopt_speed/src/picojpeg/picojpeg",
     // "bd_speedopt_speed/src/primecount/primecount",

From 90ef371abc97b01ab73bf6f31bf17eb40132585f Mon Sep 17 00:00:00 2001
From: Kip Macsai-Goren <kmacsaigoren@hmc.edu>
Date: Sat, 5 Nov 2022 13:34:24 -0700
Subject: [PATCH 03/10] fixed fifo timeout handling. error now in data ready
 interrupt

---
 .../rv32i_m/privilege/src/WALLY-TEST-LIB-32.h                | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h
index 454d05be5..ca197876c 100644
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-TEST-LIB-32.h
@@ -1072,9 +1072,9 @@ uart_data_wait:
     li t3, 0x10000002 // IIR
     li a4, 0x61
 uart_read_LSR_IIR:
-    lb t4, 0(t3) // save IIR before reading LSR mgith clear it
+    lbu t4, 0(t3) // save IIR before reading LSR might clear it
     //  check if IIR is the rxfifotimeout interrupt. if it is, then read the fifo then go back and repeat this.
-    li t5, 6
+    li t5, 0xCC // Value in IIR for Fifo Enabled, with timeout interrupt pending
     beq t4, t5, uart_rxfifo_timout
     lb t5, 0(t2) // read LSR
     andi t6, t5, 0x61  // wait until all transmissions are done and data is ready
@@ -1083,7 +1083,6 @@ uart_read_LSR_IIR:
 uart_rxfifo_timout:
     li t4, 0x10000000 // read from the fifo
     lb t5, 0(t4)
-    lb t5, 0(t4)
     //read the fifo until empty
     j uart_read_LSR_IIR
 

From 6bc4c1318eb5e37f47aab51691a63f5860333883 Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Sun, 6 Nov 2022 21:53:48 +0000
Subject: [PATCH 04/10] Added new macros for int div preprocessing, added p, n,
 and rightshiftx logic

---
 pipelined/config/shared/wally-shared.vh       |  8 +++-
 pipelined/src/fpu/fdivsqrt/fdivsqrt.sv        |  3 +-
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 42 +++++++++++++------
 3 files changed, 38 insertions(+), 15 deletions(-)

diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index ca93d7e7b..97feac9e7 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -110,7 +110,7 @@
 
 // division constants
 `define RADIX 32'h4
-`define DIVCOPIES 32'h3
+`define DIVCOPIES 32'h2
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : `NF+3)
 // `define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input
 `define DIVN (`NF<`XLEN ? `XLEN : (`NF + 3)) // length of input
@@ -118,12 +118,16 @@
 `define EXTRAINTBITS ((`NF < `XLEN) ? 0 : (`NF - `XLEN + 3))
 `define DIVRESLEN ((`NF>`XLEN) ? (`NF + 4) : `XLEN)
 `define LOGR ((`RADIX==2) ? 32'h1 : 32'h2)
-// FPDUR = ceil(DIVRESLEN/(LOGR*DIVCOPIES))
+`define RK (`DIVCOPIES*`LOGR) // r*k used for intdiv preproc
+`define LOGK ($clog2(`DIVCOPIES))
+`define LOGRK ($clog2(`RK))
+// FPDUR = ceil(DIVRESLEN/(LOGR*DIVCOPIES)) 
 // one iteration is required for the integer bit for minimally redundent radix-4
 `define FPDUR ((`DIVN+2+(`LOGR*`DIVCOPIES)-1)/(`LOGR*`DIVCOPIES)+(`RADIX/4))
 `define DURLEN ($clog2(`FPDUR+1))
 `define QLEN (`FPDUR*`LOGR*`DIVCOPIES)
 `define DIVb (`QLEN-1)
+`define DIVBLEN ($clog2(`DIVb))
 
 
 `define USE_SRAM 0
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
index 604a0711f..3f6199933 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -64,10 +64,11 @@ module fdivsqrt(
   logic Firstun;
   logic WZero;
   logic SpecialCaseM;
+  logic [`DIVBLEN:0] n;
 
   fdivsqrtpreproc fdivsqrtpreproc(
     .clk, .DivStartE, .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
-    .Sqrt(SqrtE), .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, 
+    .Sqrt(SqrtE), .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, .n,
     .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E);
   fdivsqrtfsm fdivsqrtfsm(
     .clk, .reset, .FmtE, .XsE, .SqrtE, 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index f1882ad6f..ae015a583 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -41,7 +41,8 @@ module fdivsqrtpreproc (
   input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
 	input  logic [2:0] 	Funct3E, Funct3M,
 	input  logic MDUE, W64E,
-  output logic  [`NE+1:0] QeM,
+  output logic [`DIVBLEN:0] n,
+  output logic [`NE+1:0] QeM,
   output logic [`DIVb+3:0] X,
   output logic [`DIVN-2:0] Dpreproc
 );
@@ -53,33 +54,50 @@ module fdivsqrtpreproc (
   logic  [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
   logic  [`NE+1:0] Qe;
   // Intdiv signals
-  logic  [`DIVN-1:0] ZeroBufX, ZeroBufY;
+  logic  [`DIVb-1:0] ZeroBufX, ZeroBufY;
   logic  [`XLEN-1:0] PosA, PosB;
-  logic  Signed, Aneg, Bneg;
+  logic  As, Bs;
+  logic  [`XLEN-1:0] A64, B64;
+  logic  [`DIVBLEN:0] p, ZeroDiff, IntBits, RightShiftX;
+  logic  [`DIVBLEN:0] pPlusr, pPrTrunc, pPrCeil;
+  logic  [`DIVb+3:0] PreShiftX;
 
   // ***can probably merge X LZC with conversion
   // cout the number of leading zeros
-  // Muxes needed for Int; add after Cedar Commit
-  assign ZeroBufX = MDUE ? {ForwardedSrcAE, {`DIVN-`XLEN{1'b0}}} : {Xm, {`DIVN-`NF-1{1'b0}}};
-  assign ZeroBufY = MDUE ? {ForwardedSrcBE, {`DIVN-`XLEN{1'b0}}} : {Ym, {`DIVN-`NF-1{1'b0}}};
+
+  assign As = ~Funct3E[0] & (W64E ? ForwardedSrcAE[31] : ForwardedSrcAE[`XLEN-1]); // operand A is negative: signed op (funct3[0]=0 for DIV/REM) and sign bit set; W ops take the sign from bit 31
+  assign Bs = ~Funct3E[0] & (W64E ? ForwardedSrcBE[31] : ForwardedSrcBE[`XLEN-1]); // operand B is negative, same rule as As
+  assign A64 = W64E ? {{(`XLEN-32){As}}, ForwardedSrcAE[31:0]} : ForwardedSrcAE; // W ops: extend the low word with As (As is 0 for unsigned ops, giving zero extension)
+  assign B64 = W64E ? {{(`XLEN-32){Bs}}, ForwardedSrcBE[31:0]} : ForwardedSrcBE; // W ops: extend the low word with Bs
+  
+  assign PosA = As ? -A64 : A64;
+  assign PosB = Bs ? -B64 : B64;
+
+  assign ZeroBufX = MDUE ? {PosA, {`DIVb-`XLEN{1'b0}}} : {Xm, {`DIVb-`NF-1{1'b0}}};
+  assign ZeroBufY = MDUE ? {PosB, {`DIVb-`XLEN{1'b0}}} : {Ym, {`DIVb-`NF-1{1'b0}}};
   lzc #(`NF+1) lzcX (Xm, XZeroCnt);
   lzc #(`NF+1) lzcY (Ym, YZeroCnt);
 
-  assign Signed = Funct3E[0];
-  assign Aneg = ForwardedSrcAE[`XLEN-1] & Signed;
-  assign Bneg = ForwardedSrcBE[`XLEN-1] & Signed;
-  assign PosA = Aneg ? -ForwardedSrcAE : ForwardedSrcAE;
-  assign PosB = Bneg ? -ForwardedSrcBE : ForwardedSrcBE;
-
   assign PreprocX = Xm[`NF-1:0]<<XZeroCnt;
   assign PreprocY = Ym[`NF-1:0]<<YZeroCnt;
 
+  // assign ZeroDiff = YZeroCnt - XZeroCnt;
+  // assign p = ZeroDiff[`DIVBLEN] ? '0 : ZeroDiff;
+
+  // assign pPlusr = (`DIVBLEN)'(`LOGR) + p;
+  // assign pPrTrunc = pPlusr[`LOGRK-1:0];
+  // assign pPrCeil = (pPlusr >> `LOGRK) + |(pPrTrunc);
+  // assign n = (pPrCeil << `LOGK) - ((`DIVBLEN)'b1);
+  // assign IntBits = (`DIVBLEN)'(`RK) + p;
+  // assign RightShiftX = (`DIVBLEN)'(`RK) - {{(`DIVBLEN-`RK){1'b0}}, IntBits[`RK-1:0]};
+
   assign SqrtX = Xe[0]^XZeroCnt[0] ? {1'b0, ~XZero, PreprocX} : {~XZero, PreprocX, 1'b0};
   assign DivX = {3'b000, ~XZero, PreprocX, {`DIVb-`NF{1'b0}}};
 
   // *** explain why X is shifted between radices (initial assignment of WS=RX)
   if (`RADIX == 2)  assign X = Sqrt ? {3'b111, SqrtX, {`DIVb-1-`NF{1'b0}}} : DivX;
   else              assign X = Sqrt ? {2'b11, SqrtX, {`DIVb-1-`NF{1'b0}}, 1'b0} : DivX;
+  // assign X = MDUE ? PreShiftX >> RightShiftX : PreShiftX;
   assign Dpreproc = {PreprocY, {`DIVN-1-`NF{1'b0}}};
 
   //           radix 2     radix 4

From 2cbe2fd70b53196297ead05eda5fe313548f8460 Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Sun, 6 Nov 2022 22:08:18 +0000
Subject: [PATCH 05/10] Added n, p, and m signals between fdivsqrt submodules.
 Added w64 and mdue to divsqrt testbench.

---
 pipelined/src/fpu/fdivsqrt/fdivsqrt.sv         | 5 +++--
 pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 1 +
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv  | 4 ++--
 pipelined/testbench/testbench-fp.sv            | 1 +
 4 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
index 3f6199933..cab1531e9 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -64,11 +64,11 @@ module fdivsqrt(
   logic Firstun;
   logic WZero;
   logic SpecialCaseM;
-  logic [`DIVBLEN:0] n;
+  logic [`DIVBLEN:0] n, p, m;
 
   fdivsqrtpreproc fdivsqrtpreproc(
     .clk, .DivStartE, .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
-    .Sqrt(SqrtE), .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, .n,
+    .Sqrt(SqrtE), .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, .n, .p, .m,
     .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E);
   fdivsqrtfsm fdivsqrtfsm(
     .clk, .reset, .FmtE, .XsE, .SqrtE, 
@@ -83,5 +83,6 @@ module fdivsqrt(
   fdivsqrtpostproc fdivsqrtpostproc(
     .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .Firstun, 
     .SqrtM, .SpecialCaseM, .RemOp(Funct3E[1]),
+    .n, .p, .m,
     .QmM, .WZero, .DivSM);
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 92bb1bd9b..9e9bdb10b 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -39,6 +39,7 @@ module fdivsqrtpostproc(
   input  logic SqrtM,
   input  logic SpecialCaseM,
   input  logic RemOp,
+  input  logic [`DIVBLEN:0] n, p, m,
   output logic [`DIVb:0] QmM, 
   output logic WZero,
   output logic DivSM
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index ae015a583..893863032 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -41,7 +41,7 @@ module fdivsqrtpreproc (
   input  logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, // *** these are the src outputs before the mux choosing between them and PCE to put in srcA/B
 	input  logic [2:0] 	Funct3E, Funct3M,
 	input  logic MDUE, W64E,
-  output logic [`DIVBLEN:0] n,
+  output logic [`DIVBLEN:0] n, p, m,
   output logic [`NE+1:0] QeM,
   output logic [`DIVb+3:0] X,
   output logic [`DIVN-2:0] Dpreproc
@@ -58,7 +58,7 @@ module fdivsqrtpreproc (
   logic  [`XLEN-1:0] PosA, PosB;
   logic  As, Bs;
   logic  [`XLEN-1:0] A64, B64;
-  logic  [`DIVBLEN:0] p, ZeroDiff, IntBits, RightShiftX;
+  logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
   logic  [`DIVBLEN:0] pPlusr, pPrTrunc, pPrCeil;
   logic  [`DIVb+3:0] PreShiftX;
 
diff --git a/pipelined/testbench/testbench-fp.sv b/pipelined/testbench/testbench-fp.sv
index 748be2280..228bc88f9 100644
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@@ -718,6 +718,7 @@ module testbenchfp;
   if (TEST === "div" | TEST === "sqrt" | TEST === "all") begin: fdivsqrt
     fdivsqrt fdivsqrt(.clk, .reset, .XsE(Xs), .FmtE(ModFmt), .XmE(Xm), .YmE(Ym), .XeE(Xe), .YeE(Ye), .SqrtE(OpCtrlVal[0]), .SqrtM(OpCtrlVal[0]),
                     .XInfE(XInf), .YInfE(YInf), .XZeroE(XZero), .YZeroE(YZero), .XNaNE(XNaN), .YNaNE(YNaN), .DivStartE(DivStart), 
+                    .MDUE(1'b0), .W64E(1'b0),
                     .StallE(1'b0), .StallM(1'b0), .DivSM(DivSticky), .DivBusy, .QeM(DivCalcExp),
                     .QmM(Quot), .DivDone);
   end

From 83051a53515a35fd2615b8405d2e8b1a1cef36c4 Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Sun, 6 Nov 2022 22:21:35 +0000
Subject: [PATCH 06/10] Changed lzc names, started int/fp size merge in preproc

---
 pipelined/config/shared/wally-shared.vh       |  2 +-
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 30 +++++++++----------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index 97feac9e7..a69814b58 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -127,7 +127,7 @@
 `define DURLEN ($clog2(`FPDUR+1))
 `define QLEN (`FPDUR*`LOGR*`DIVCOPIES)
 `define DIVb (`QLEN-1)
-`define DIVBLEN ($clog2(`DIVb))
+`define DIVBLEN ($clog2(`DIVb+1)-1)
 
 
 `define USE_SRAM 0
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 893863032..4d90185c3 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -51,7 +51,7 @@ module fdivsqrtpreproc (
   logic  [`NF-1:0] PreprocB, PreprocY;
   logic  [`NF+1:0] SqrtX;
   logic  [`DIVb+3:0] DivX;
-  logic  [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
+  logic  [`DIVBLEN:0] L;
   logic  [`NE+1:0] Qe;
   // Intdiv signals
   logic  [`DIVb-1:0] ZeroBufX, ZeroBufY;
@@ -75,13 +75,13 @@ module fdivsqrtpreproc (
 
   assign ZeroBufX = MDUE ? {PosA, {`DIVb-`XLEN{1'b0}}} : {Xm, {`DIVb-`NF-1{1'b0}}};
   assign ZeroBufY = MDUE ? {PosB, {`DIVb-`XLEN{1'b0}}} : {Ym, {`DIVb-`NF-1{1'b0}}};
-  lzc #(`NF+1) lzcX (Xm, XZeroCnt);
-  lzc #(`NF+1) lzcY (Ym, YZeroCnt);
+  lzc #(`DIVb) lzcX (ZeroBufX, L);
+  lzc #(`DIVb) lzcY (ZeroBufY, m);
 
-  assign PreprocX = Xm[`NF-1:0]<<XZeroCnt;
-  assign PreprocY = Ym[`NF-1:0]<<YZeroCnt;
+  assign PreprocX = Xm[`NF-1:0]<<L;
+  assign PreprocY = Ym[`NF-1:0]<<m;
 
-  // assign ZeroDiff = YZeroCnt - XZeroCnt;
+  // assign ZeroDiff = m - L;
   // assign p = ZeroDiff[`DIVBLEN] ? '0 : ZeroDiff;
 
   // assign pPlusr = (`DIVBLEN)'(`LOGR) + p;
@@ -91,7 +91,7 @@ module fdivsqrtpreproc (
   // assign IntBits = (`DIVBLEN)'(`RK) + p;
   // assign RightShiftX = (`DIVBLEN)'(`RK) - {{(`DIVBLEN-`RK){1'b0}}, IntBits[`RK-1:0]};
 
-  assign SqrtX = Xe[0]^XZeroCnt[0] ? {1'b0, ~XZero, PreprocX} : {~XZero, PreprocX, 1'b0};
+  assign SqrtX = Xe[0]^L[0] ? {1'b0, ~XZero, PreprocX} : {~XZero, PreprocX, 1'b0};
   assign DivX = {3'b000, ~XZero, PreprocX, {`DIVb-`NF{1'b0}}};
 
   // *** explain why X is shifted between radices (initial assignment of WS=RX)
@@ -110,17 +110,17 @@ module fdivsqrtpreproc (
   // r = 1 or 2
   // DIVRESLEN/(r*`DIVCOPIES)
   flopen #(`NE+2) expflop(clk, DivStartE, Qe, QeM);
-  expcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZero, .XZeroCnt, .YZeroCnt, .Qe);
+  expcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZero, .L, .m, .Qe);
 
 endmodule
 
 module expcalc(
-  input logic  [`FMTBITS-1:0] Fmt,
+  input  logic [`FMTBITS-1:0] Fmt,
   input  logic [`NE-1:0] Xe, Ye,
-  input logic Sqrt,
-  input logic XZero, 
-  input logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt,
-  output logic  [`NE+1:0] Qe
+  input  logic Sqrt,
+  input  logic XZero, 
+  input  logic [`DIVBLEN:0] L, m,
+  output logic [`NE+1:0] Qe
   );
   logic [`NE-2:0] Bias;
   logic [`NE+1:0] SXExp;
@@ -151,10 +151,10 @@ module expcalc(
             2'h2: Bias =  (`NE-1)'(`H_BIAS);
         endcase
   end
-  assign SXExp = {2'b0, Xe} - {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, XZeroCnt} - (`NE+1)'(`BIAS);
+  assign SXExp = {2'b0, Xe} - {{(`NE+1-`DIVBLEN){1'b0}}, L} - (`NE+2)'(`BIAS);
   assign SExp  = {SXExp[`NE+1], SXExp[`NE+1:1]} + {2'b0, Bias};
   // correct exponent for denormalized input's normalization shifts
-  assign DExp = ({2'b0, Xe} - {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, XZeroCnt} - {2'b0, Ye} + {{`NE+1-$unsigned($clog2(`NF+2)){1'b0}}, YZeroCnt} + {3'b0, Bias})&{`NE+2{~XZero}};
+  assign DExp  = ({2'b0, Xe} - {{(`NE+1-`DIVBLEN){1'b0}}, L} - {2'b0, Ye} + {{(`NE+1-`DIVBLEN){1'b0}}, m} + {3'b0, Bias}) & {`NE+2{~XZero}};
   
   assign Qe = Sqrt ? SExp : DExp;
 endmodule
\ No newline at end of file

From 350d4d254f697e33dd5f716a40e437068d67635c Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Sun, 6 Nov 2022 22:24:21 +0000
Subject: [PATCH 07/10] p calculation

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 4d90185c3..50f3e68f0 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -81,8 +81,8 @@ module fdivsqrtpreproc (
   assign PreprocX = Xm[`NF-1:0]<<L;
   assign PreprocY = Ym[`NF-1:0]<<m;
 
-  // assign ZeroDiff = m - L;
-  // assign p = ZeroDiff[`DIVBLEN] ? '0 : ZeroDiff;
+  assign ZeroDiff = m - L;
+  assign p = ZeroDiff[`DIVBLEN] ? '0 : ZeroDiff;
 
   // assign pPlusr = (`DIVBLEN)'(`LOGR) + p;
   // assign pPrTrunc = pPlusr[`LOGRK-1:0];

From a49ea2a16dd0cba752af47da79d5fe5879e3f662 Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Sun, 6 Nov 2022 22:31:48 +0000
Subject: [PATCH 08/10] Added n and rightshiftx

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 50f3e68f0..a16e5f795 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -59,7 +59,8 @@ module fdivsqrtpreproc (
   logic  As, Bs;
   logic  [`XLEN-1:0] A64, B64;
   logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
-  logic  [`DIVBLEN:0] pPlusr, pPrTrunc, pPrCeil;
+  logic  [`DIVBLEN:0] pPlusr, pPrCeil;
+  logic  [`LOGRK-1:0] pPrTrunc;
   logic  [`DIVb+3:0] PreShiftX;
 
   // ***can probably merge X LZC with conversion
@@ -84,12 +85,12 @@ module fdivsqrtpreproc (
   assign ZeroDiff = m - L;
   assign p = ZeroDiff[`DIVBLEN] ? '0 : ZeroDiff;
 
-  // assign pPlusr = (`DIVBLEN)'(`LOGR) + p;
-  // assign pPrTrunc = pPlusr[`LOGRK-1:0];
-  // assign pPrCeil = (pPlusr >> `LOGRK) + |(pPrTrunc);
-  // assign n = (pPrCeil << `LOGK) - ((`DIVBLEN)'b1);
-  // assign IntBits = (`DIVBLEN)'(`RK) + p;
-  // assign RightShiftX = (`DIVBLEN)'(`RK) - {{(`DIVBLEN-`RK){1'b0}}, IntBits[`RK-1:0]};
+  assign pPlusr = (`DIVBLEN)'(`LOGR) + p;
+  assign pPrTrunc = pPlusr[`LOGRK-1:0];
+  assign pPrCeil = (pPlusr >> `LOGRK) + {{`DIVBLEN-1{1'b0}}, |(pPrTrunc)};
+  assign n = (pPrCeil << `LOGK) - 1;
+  assign IntBits = (`DIVBLEN)'(`RK) + p;
+  assign RightShiftX = (`DIVBLEN)'(`RK) - {{(`DIVBLEN-`RK){1'b0}}, IntBits[`RK-1:0]};
 
   assign SqrtX = Xe[0]^L[0] ? {1'b0, ~XZero, PreprocX} : {~XZero, PreprocX, 1'b0};
   assign DivX = {3'b000, ~XZero, PreprocX, {`DIVb-`NF{1'b0}}};

From c3e635c78808960cd27c03d5fa070836a268ec7a Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Sun, 6 Nov 2022 22:40:21 +0000
Subject: [PATCH 09/10] Finished Int Preprocessing (fdivsqrtpreproc.sv)

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index a16e5f795..44a57af7b 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -96,9 +96,9 @@ module fdivsqrtpreproc (
   assign DivX = {3'b000, ~XZero, PreprocX, {`DIVb-`NF{1'b0}}};
 
   // *** explain why X is shifted between radices (initial assignment of WS=RX)
-  if (`RADIX == 2)  assign X = Sqrt ? {3'b111, SqrtX, {`DIVb-1-`NF{1'b0}}} : DivX;
-  else              assign X = Sqrt ? {2'b11, SqrtX, {`DIVb-1-`NF{1'b0}}, 1'b0} : DivX;
-  // assign X = MDUE ? PreShiftX >> RightShiftX : PreShiftX;
+  if (`RADIX == 2)  assign PreShiftX = Sqrt ? {3'b111, SqrtX, {`DIVb-1-`NF{1'b0}}} : DivX;
+  else              assign PreShiftX = Sqrt ? {2'b11, SqrtX, {`DIVb-1-`NF{1'b0}}, 1'b0} : DivX;
+  assign X = MDUE ? PreShiftX >> RightShiftX : PreShiftX;
   assign Dpreproc = {PreprocY, {`DIVN-1-`NF{1'b0}}};
 
   //           radix 2     radix 4

From 54f09f3616bb89be85f3d45aa9f3102163bd46b8 Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Sun, 6 Nov 2022 23:09:09 +0000
Subject: [PATCH 10/10] Added conditional OTFC swap for simplified int
 postprocessing

---
 pipelined/config/shared/wally-shared.vh       | 1 +
 pipelined/src/fpu/fdivsqrt/fdivsqrt.sv        | 5 +++--
 pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv    | 1 +
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 6 +++++-
 pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv  | 2 +-
 5 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index a69814b58..506cc7c50 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -127,6 +127,7 @@
 `define DURLEN ($clog2(`FPDUR+1))
 `define QLEN (`FPDUR*`LOGR*`DIVCOPIES)
 `define DIVb (`QLEN-1)
+`define DIVa (`DIVb+4-`XLEN)
 `define DIVBLEN ($clog2(`DIVb+1)-1)
 
 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
index cab1531e9..3f9c7e8a5 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -65,10 +65,11 @@ module fdivsqrt(
   logic WZero;
   logic SpecialCaseM;
   logic [`DIVBLEN:0] n, p, m;
+  logic OTFCSwap;
 
   fdivsqrtpreproc fdivsqrtpreproc(
     .clk, .DivStartE, .Xm(XmE), .QeM, .Xe(XeE), .Fmt(FmtE), .Ye(YeE), 
-    .Sqrt(SqrtE), .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, .n, .p, .m,
+    .Sqrt(SqrtE), .Ym(YmE), .XZero(XZeroE), .X, .Dpreproc, .n, .p, .m, .OTFCSwap,
     .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .MDUE, .W64E);
   fdivsqrtfsm fdivsqrtfsm(
     .clk, .reset, .FmtE, .XsE, .SqrtE, 
@@ -78,7 +79,7 @@ module fdivsqrt(
   fdivsqrtiter fdivsqrtiter(
     .clk, .Firstun, .D, .FirstU, .FirstUM, .FirstC, .SqrtE, .SqrtM, 
     .X,.Dpreproc, .FirstWS(WS), .FirstWC(WC),
-    .DivStartE, .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE,
+    .DivStartE, .Xe(XeE), .Ye(YeE), .XZeroE, .YZeroE, .OTFCSwap,
     .DivBusy);
   fdivsqrtpostproc fdivsqrtpostproc(
     .WS, .WC, .D, .FirstU, .FirstUM, .FirstC, .Firstun, 
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 17cc3f5c2..d234144c4 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -38,6 +38,7 @@ module fdivsqrtiter(
   input  logic XZeroE, YZeroE, 
   input  logic SqrtE,
   input  logic SqrtM,
+  input  logic OTFCSwap,
   input  logic [`DIVb+3:0] X,
   input  logic [`DIVN-2:0] Dpreproc,
   output logic [`DIVN-2:0]  D, // U0.N-1
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 44a57af7b..756c5cc9f 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -42,6 +42,7 @@ module fdivsqrtpreproc (
 	input  logic [2:0] 	Funct3E, Funct3M,
 	input  logic MDUE, W64E,
   output logic [`DIVBLEN:0] n, p, m,
+  output logic OTFCSwap,
   output logic [`NE+1:0] QeM,
   output logic [`DIVb+3:0] X,
   output logic [`DIVN-2:0] Dpreproc
@@ -56,7 +57,7 @@ module fdivsqrtpreproc (
   // Intdiv signals
   logic  [`DIVb-1:0] ZeroBufX, ZeroBufY;
   logic  [`XLEN-1:0] PosA, PosB;
-  logic  As, Bs;
+  logic  As, Bs, OTFCSwapTemp;
   logic  [`XLEN-1:0] A64, B64;
   logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
   logic  [`DIVBLEN:0] pPlusr, pPrCeil;
@@ -70,6 +71,8 @@ module fdivsqrtpreproc (
   assign Bs = ForwardedSrcBE[`XLEN-1] & Funct3E[0];
   assign A64 = W64E ? {{(`XLEN-32){As}}, ForwardedSrcAE[31:0]} : ForwardedSrcAE;
   assign B64 = W64E ? {{(`XLEN-32){Bs}}, ForwardedSrcBE[31:0]} : ForwardedSrcBE;
+
+  assign OTFCSwapTemp = (As ^ Bs) & MDUE;
   
   assign PosA = As ? -A64 : A64;
   assign PosB = Bs ? -B64 : B64;
@@ -111,6 +114,7 @@ module fdivsqrtpreproc (
   // r = 1 or 2
   // DIVRESLEN/(r*`DIVCOPIES)
   flopen #(`NE+2) expflop(clk, DivStartE, Qe, QeM);
+  flopen #(1) swapflop(clk, DivStartE, OTFCSwapTemp, OTFCSwap);
   expcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZero, .L, .m, .Qe);
 
 endmodule
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
index 8ed1664af..09f82da81 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@@ -61,7 +61,7 @@ module fdivsqrtstage2 (
 	// 0001 = -2
   fdivsqrtqsel2 qsel2(WS[`DIVb+3:`DIVb], WC[`DIVb+3:`DIVb], up, uz, un);
 
-  // Sqrt F generatin
+  // Sqrt F generation
   fdivsqrtfgen2 fgen2(.up, .uz, .C(CNext), .U, .UM, .F);
 
   // Divisor multiple