From a129e2750246e72fadb983a4699844aa80653abf Mon Sep 17 00:00:00 2001
From: Ross Thompson <ross1728@gmail.com>
Date: Tue, 27 Dec 2022 15:07:01 -0600
Subject: [PATCH 01/14] signal name changes in ram2p.

---
 pipelined/src/generic/mem/ram2p1r1wb.sv | 34 ++++++++++++-------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/pipelined/src/generic/mem/ram2p1r1wb.sv b/pipelined/src/generic/mem/ram2p1r1wb.sv
index c11246d3..7ded3a20 100644
--- a/pipelined/src/generic/mem/ram2p1r1wb.sv
+++ b/pipelined/src/generic/mem/ram2p1r1wb.sv
@@ -49,21 +49,21 @@ module ram2p1r1wb
    input logic              reset,
   
    // port 1 is read only
-   input logic [DEPTH-1:0]  RA1,
-   output logic [WIDTH-1:0] RD1,
-   input logic              REN1,
+   input logic [DEPTH-1:0]  ra1,
+   output logic [WIDTH-1:0] rd1,
+   input logic              ren1,
   
    // port 2 is write only
-   input logic [DEPTH-1:0]  WA1,
-   input logic [WIDTH-1:0]  WD1,
-   input logic              WEN1,
-   input logic [WIDTH-1:0]  BitWEN1
+   input logic [DEPTH-1:0]  wa2,
+   input logic [WIDTH-1:0]  wd2,
+   input logic              wen2,
+   input logic [WIDTH-1:0]  bwe2
 );
   
 
-  logic [DEPTH-1:0]         RA1Q, WA1Q;
-  logic                     WEN1Q;
-  logic [WIDTH-1:0]         WD1Q;
+  logic [DEPTH-1:0]         ra1q, wa2q;
+  logic                     wen2q;
+  logic [width-1:0]         wd2q;
 
   logic [WIDTH-1:0]         mem[2**DEPTH-1:0];
   logic [WIDTH-1:0]         bwe;
@@ -76,18 +76,18 @@ module ram2p1r1wb
   //  prefer not to have two-cycle write latency
   //  will require branch predictor changes
   
-  flopenr #(DEPTH) RA1Reg(clk, reset, REN1, RA1, RA1Q);
-  flopenr #(DEPTH) WA1Reg(clk, reset, REN1, WA1, WA1Q);
-  flopr   #(1)     WEN1Reg(clk, reset, WEN1, WEN1Q);
-  flopenr #(WIDTH) WD1Reg(clk, reset, REN1, WD1, WD1Q);
+  flopenr #(DEPTH) ra1Reg(clk, reset, ren1, ra1, ra1q);
+  flopenr #(DEPTH) wa2Reg(clk, reset, ren1, wa2, wa2q);
+  flopr   #(1)     wen2Reg(clk, reset, wen2, wen2q);
+  flopenr #(WIDTH) wd2Reg(clk, reset, ren1, wd2, wd2q);
 
   // read port
-  assign RD1 = mem[RA1Q];
+  assign rd1 = mem[ra1q];
   
   // write port
-  assign bwe = {WIDTH{WEN1Q}} & BitWEN1;
+  assign bwe = {WIDTH{wen2q}} & bwe2;
   always_ff @(posedge clk)
-    mem[WA1Q] <= WD1Q & bwe | mem[WA1Q] & ~bwe;
+    mem[wa2q] <= wd2q & bwe | mem[wa2q] & ~bwe;
  
 endmodule  
 

From 654b10894c270c10f28d188b03603250f1e54af3 Mon Sep 17 00:00:00 2001
From: Ross Thompson <ross1728@gmail.com>
Date: Thu, 29 Dec 2022 17:07:50 -0600
Subject: [PATCH 02/14] Re-enabled the branch predictor in rv64gc.

---
 pipelined/config/rv64gc/wally-config.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelined/config/rv64gc/wally-config.vh b/pipelined/config/rv64gc/wally-config.vh
index 6ca3a56c..65d27aa3 100644
--- a/pipelined/config/rv64gc/wally-config.vh
+++ b/pipelined/config/rv64gc/wally-config.vh
@@ -139,7 +139,7 @@
 `define PLIC_GPIO_ID 3
 `define PLIC_UART_ID 10
 
-`define BPRED_ENABLED 0
+`define BPRED_ENABLED 1
 `define BPTYPE "BPGSHARE" // BPLOCALPAg or BPGLOBAL or BPTWOBIT or BPGSHARE
 `define TESTSBP 0
 `define BPRED_SIZE 10

From 872ff619e397b1fe63bbf1a4dca878d6642c63bb Mon Sep 17 00:00:00 2001
From: Ross Thompson <ross1728@gmail.com>
Date: Thu, 29 Dec 2022 17:13:48 -0600
Subject: [PATCH 03/14] Fixed problems with changes to ram2p.

---
 pipelined/src/generic/mem/ram2p1r1wb.sv     |  2 +-
 pipelined/src/ifu/BTBPredictor.sv           | 14 +++++++-------
 pipelined/src/ifu/globalHistoryPredictor.sv | 14 +++++++-------
 pipelined/src/ifu/gsharePredictor.sv        | 14 +++++++-------
 pipelined/src/ifu/localHistoryPredictor.sv  | 14 +++++++-------
 pipelined/src/ifu/twoBitPredictor.sv        | 14 +++++++-------
 6 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/pipelined/src/generic/mem/ram2p1r1wb.sv b/pipelined/src/generic/mem/ram2p1r1wb.sv
index 7ded3a20..dac1290f 100644
--- a/pipelined/src/generic/mem/ram2p1r1wb.sv
+++ b/pipelined/src/generic/mem/ram2p1r1wb.sv
@@ -63,7 +63,7 @@ module ram2p1r1wb
 
   logic [DEPTH-1:0]         ra1q, wa2q;
   logic                     wen2q;
-  logic [width-1:0]         wd2q;
+  logic [WIDTH-1:0]         wd2q;
 
   logic [WIDTH-1:0]         mem[2**DEPTH-1:0];
   logic [WIDTH-1:0]         bwe;
diff --git a/pipelined/src/ifu/BTBPredictor.sv b/pipelined/src/ifu/BTBPredictor.sv
index f03bbdd5..3c90497e 100644
--- a/pipelined/src/ifu/BTBPredictor.sv
+++ b/pipelined/src/ifu/BTBPredictor.sv
@@ -105,13 +105,13 @@ module BTBPredictor
   // *** optimize for byte write enables
   ram2p1r1wb #(Depth, `XLEN+5) memory(.clk(clk),
           .reset(reset),
-          .RA1(LookUpPCIndex),
-          .RD1({{InstrClass, TargetPC}}),
-          .REN1(~StallF),
-          .WA1(UpdatePCIndex),
-          .WD1({UpdateInstrClass, UpdateTarget}),
-          .WEN1(UpdateEN),
-          .BitWEN1({5'h1F, {`XLEN{1'b1}}})); // *** definitely not right.
+          .ra1(LookUpPCIndex),
+          .rd1({{InstrClass, TargetPC}}),
+          .ren1(~StallF),
+          .wa2(UpdatePCIndex),
+          .wd2({UpdateInstrClass, UpdateTarget}),
+          .wen2(UpdateEN),
+          .bwe2({5'h1F, {`XLEN{1'b1}}})); // *** definitely not right.
 
 
 endmodule
diff --git a/pipelined/src/ifu/globalHistoryPredictor.sv b/pipelined/src/ifu/globalHistoryPredictor.sv
index 6d06dc8c..29cb735d 100644
--- a/pipelined/src/ifu/globalHistoryPredictor.sv
+++ b/pipelined/src/ifu/globalHistoryPredictor.sv
@@ -116,12 +116,12 @@ module globalHistoryPredictor
   ram2p1r1wb #(k, 2) PHT(.clk(clk),
     .reset(reset),
     //.RA1(GHR[k-1:0]),
-    .RA1(GHRLookup),
-    .RD1(BPPredF),
-    .REN1(~StallF),
-    .WA1(PHTUpdateAdr),
-    .WD1(UpdateBPPredE),
-    .WEN1(PHTUpdateEN),
-    .BitWEN1(2'b11));
+    .ra1(GHRLookup),
+    .rd1(BPPredF),
+    .ren1(~StallF),
+    .wa2(PHTUpdateAdr),
+    .wd2(UpdateBPPredE),
+    .wen2(PHTUpdateEN),
+    .bwe2(2'b11));
 
 endmodule
diff --git a/pipelined/src/ifu/gsharePredictor.sv b/pipelined/src/ifu/gsharePredictor.sv
index ff111a3e..fa780194 100644
--- a/pipelined/src/ifu/gsharePredictor.sv
+++ b/pipelined/src/ifu/gsharePredictor.sv
@@ -113,12 +113,12 @@ module gsharePredictor
   ram2p1r1wb #(`BPRED_SIZE, 2) PHT(.clk(clk),
     .reset(reset),
     //.RA1(GHR[`BPRED_SIZE-1:0]),
-    .RA1(GHRLookup ^ PCNextF[`BPRED_SIZE:1]),
-    .RD1(BPPredF),
-    .REN1(~StallF),
-    .WA1(PHTUpdateAdr ^ PCE[`BPRED_SIZE:1]),
-    .WD1(UpdateBPPredE),
-    .WEN1(PHTUpdateEN),
-    .BitWEN1(2'b11));
+    .ra1(GHRLookup ^ PCNextF[`BPRED_SIZE:1]),
+    .rd1(BPPredF),
+    .ren1(~StallF),
+    .wa2(PHTUpdateAdr ^ PCE[`BPRED_SIZE:1]),
+    .wd2(UpdateBPPredE),
+    .wen2(PHTUpdateEN),
+    .bwe2(2'b11));
 
 endmodule // gsharePredictor
diff --git a/pipelined/src/ifu/localHistoryPredictor.sv b/pipelined/src/ifu/localHistoryPredictor.sv
index 97b2b6f5..02ad4cf1 100644
--- a/pipelined/src/ifu/localHistoryPredictor.sv
+++ b/pipelined/src/ifu/localHistoryPredictor.sv
@@ -86,13 +86,13 @@ module localHistoryPredictor
   // LHRE refers to the address that the past k branches points to in the exectution stage
   ram2p1r1wb #(k, 2) PHT(.clk(clk), 
     .reset(reset),
-    .RA1(ForwardLHRNext),
-    .RD1(PredictionMemory),
-    .REN1(~StallF),
-    .WA1(LHRFNext),
-    .WD1(UpdatePrediction),
-    .WEN1(UpdateEN),
-    .BitWEN1(2'b11));
+    .ra1(ForwardLHRNext),
+    .rd1(PredictionMemory),
+    .ren1(~StallF),
+    .wa2(LHRFNext),
+    .wd2(UpdatePrediction),
+    .wen2(UpdateEN),
+    .bwe2(2'b11));
 
 
   
diff --git a/pipelined/src/ifu/twoBitPredictor.sv b/pipelined/src/ifu/twoBitPredictor.sv
index 5ffb29d3..7459ea6a 100644
--- a/pipelined/src/ifu/twoBitPredictor.sv
+++ b/pipelined/src/ifu/twoBitPredictor.sv
@@ -62,13 +62,13 @@ module twoBitPredictor
 
   ram2p1r1wb #(Depth, 2) PHT(.clk(clk),
     .reset(reset),
-    .RA1(LookUpPCIndex),
-    .RD1(PredictionMemory),
-    .REN1(~StallF),
-    .WA1(UpdatePCIndex),
-    .WD1(UpdatePrediction),
-    .WEN1(UpdateEN),
-    .BitWEN1(2'b11));
+    .ra1(LookUpPCIndex),
+    .rd1(PredictionMemory),
+    .ren1(~StallF),
+    .wa2(UpdatePCIndex),
+    .wd2(UpdatePrediction),
+    .wen2(UpdateEN),
+    .bwe2(2'b11));
 
   // need to forward when updating to the same address as reading.
   // first we compare to see if the update and lookup addreses are the same

From ef37070eee0217b60f0d539cda24e92bb4327b29 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Thu, 29 Dec 2022 21:09:23 -0800
Subject: [PATCH 04/14] Fixed register timing failure on SpecialCaseM in
 fdivsqrt

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index a5735ba3..c16abd9b 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -69,7 +69,8 @@ module fdivsqrtfsm(
     assign ISpecialCaseE = AZeroE | BZeroE; // *** why is AZeroE part of this.  Should other special cases be considered?
     assign SpecialCaseE  = MDUE ? ISpecialCaseE : FSpecialCaseE;
   end else assign SpecialCaseE = FSpecialCaseE;
-  flopenr #(1) SpecialCaseReg(clk, reset, ~StallM, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
+  //flopenr #(1) SpecialCaseReg(clk, reset, ~StallM, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
+  flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
 
 // DIVN = `NF+3
 // NS = NF + 1

From e9b314f9023e3be52825c66d20576ef75bd788ca Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 06:40:25 -0800
Subject: [PATCH 05/14] fdiv cleanup, reduce number of rv32f fma_b15 tests
 being run to speed up regression

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv      |  1 -
 pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 16 +++++++---------
 pipelined/testbench/tests.vh                   |  4 ++--
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index c16abd9b..a950ea7b 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -69,7 +69,6 @@ module fdivsqrtfsm(
     assign ISpecialCaseE = AZeroE | BZeroE; // *** why is AZeroE part of this.  Should other special cases be considered?
     assign SpecialCaseE  = MDUE ? ISpecialCaseE : FSpecialCaseE;
   end else assign SpecialCaseE = FSpecialCaseE;
-  //flopenr #(1) SpecialCaseReg(clk, reset, ~StallM, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
   flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
 
 // DIVN = `NF+3
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 8bedd384..6d955d61 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -157,13 +157,6 @@ module fdivsqrtpostproc(
       end else begin
         NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR)));
         PreResultM = IntQuotM;
-        /*
-        if (~ALTBM & NegQuotM) begin
-          PreResultM = {3'b111, -IntQuotM};
-        end else begin
-          PreResultM = {3'b000, IntQuotM};
-        end*/
-        //PreResultM = {IntQuotM[`DIVb], IntQuotM[`DIVb], IntQuotM[`DIVb], IntQuotM}; // Suspicious Sign Extender
       end
     
 
@@ -171,7 +164,12 @@ module fdivsqrtpostproc(
     
     assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
     assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
-    // *** conditional on RV64
-    assign FPIntDivResultM = (W64M ? {{(`XLEN-32){SpecialFPIntDivResultM[31]}}, SpecialFPIntDivResultM[31:0]} : SpecialFPIntDivResultM[`XLEN-1:0]); // Sign extending in case of W64
+
+    // sign extend result for W64
+    if (`XLEN==64)
+      assign FPIntDivResultM = (W64M ? {{(`XLEN-32){SpecialFPIntDivResultM[31]}}, SpecialFPIntDivResultM[31:0]} : 
+                                       SpecialFPIntDivResultM[`XLEN-1:0]); // Sign extending in case of W64
+    else
+      assign FPIntDivResultM = SpecialFPIntDivResultM[`XLEN-1:0];
   end
 endmodule
\ No newline at end of file
diff --git a/pipelined/testbench/tests.vh b/pipelined/testbench/tests.vh
index 61e45d9e..48a29303 100644
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
@@ -1098,7 +1098,7 @@ string imperas32f[] = '{
     "rv64i_m/F/src/flw-align-01.S",
     "rv64i_m/F/src/fmadd_b1-01.S",
     "rv64i_m/F/src/fmadd_b14-01.S",
-    "rv64i_m/F/src/fmadd_b15-01.S",
+    //"rv64i_m/F/src/fmadd_b15-01.S",
     "rv64i_m/F/src/fmadd_b16-01.S",
     "rv64i_m/F/src/fmadd_b17-01.S",
     "rv64i_m/F/src/fmadd_b18-01.S",
@@ -1473,7 +1473,7 @@ string imperas32f[] = '{
     "rv32i_m/F/src/fmin_b19-01.S",
     "rv32i_m/F/src/fmsub_b1-01.S",
     "rv32i_m/F/src/fmsub_b14-01.S",
-    "rv32i_m/F/src/fmsub_b15-01.S",
+    //"rv32i_m/F/src/fmsub_b15-01.S",
     "rv32i_m/F/src/fmsub_b16-01.S",
     "rv32i_m/F/src/fmsub_b17-01.S",
     "rv32i_m/F/src/fmsub_b18-01.S",

From 0e9bd5dab558910d5b577b5660b5790f71991eab Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 06:45:51 -0800
Subject: [PATCH 06/14] fdivsqrtpreproc shift simplification

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index b3f42a7c..cb883365 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -57,7 +57,6 @@ module fdivsqrtpreproc (
   // Intdiv signals
   logic  [`DIVb-1:0] IFNormLenX, IFNormLenD;
   logic  [`DIVBLEN:0] mE;
-  logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
   logic  [`DIVBLEN:0] pPlusr, pPrCeil, p, ell;
   logic  [`LOGRK:0] pPrTrunc;
   logic  [`DIVb+3:0]  PreShiftX;
@@ -71,6 +70,7 @@ module fdivsqrtpreproc (
     logic  AsE, BsE, ALTBE, NegQuotE;
     logic  [`XLEN-1:0]  AE, BE;
     logic  [`XLEN-1:0] PosA, PosB;
+    logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
 
     // Extract inputs, signs, zero, depending on W64 mode if applicable
     assign signedDiv = ~Funct3E[0];
@@ -108,12 +108,12 @@ module fdivsqrtpreproc (
 
   /* verilator lint_off WIDTH */
     // right shift amount to complete in discrete number of steps
-    assign pPlusr = (`DIVBLEN)'(`LOGR) + p;
+    assign pPlusr = `LOGR + p;
     assign pPrTrunc = pPlusr % `RK;
-    assign pPrCeil = (pPlusr >> `LOGRK) + {{`DIVBLEN{1'b0}}, |(pPrTrunc)};
-    assign nE = (pPrCeil * (`DIVBLEN+1)'(`DIVCOPIES)) - {{(`DIVBLEN){1'b0}}, 1'b1};
-    assign IntBits = (`DIVBLEN)'(`LOGR) + p - {{(`DIVBLEN){1'b0}}, 1'b1};
-    assign RightShiftX = ((`DIVBLEN)'(`RK) - 1) - (IntBits % `RK);
+    assign pPrCeil = (pPlusr >> `LOGRK) + |pPrTrunc;
+    assign nE = (pPrCeil * `DIVCOPIES) - 1;
+    assign IntBits = `LOGR + p - 1;
+    assign RightShiftX = `RK - 1 - IntBits % `RK;
   /* verilator lint_on WIDTH */
 
     // Selet integer or floating-point operands

From dba3ffe767f27f3719b75a381ca793998bd66e95 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 06:47:40 -0800
Subject: [PATCH 07/14] Reduced size of preproc right shift

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index cb883365..d0a06079 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -70,7 +70,8 @@ module fdivsqrtpreproc (
     logic  AsE, BsE, ALTBE, NegQuotE;
     logic  [`XLEN-1:0]  AE, BE;
     logic  [`XLEN-1:0] PosA, PosB;
-    logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
+    logic  [`DIVBLEN:0] ZeroDiff, IntBits;
+    logic  [`LOGRK-1:0] RightShiftX;
 
     // Extract inputs, signs, zero, depending on W64 mode if applicable
     assign signedDiv = ~Funct3E[0];

From 4fb839686750a27cccd0769c5bf2aaf7f3e69180 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 06:55:20 -0800
Subject: [PATCH 08/14] Clean up sqrt initialization mux

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 29 ++++++++-----------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index d0a06079..e5645674 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -51,20 +51,14 @@ module fdivsqrtpreproc (
 );
 
   logic  [`DIVb-1:0] XPreproc;
-  logic  [`DIVb:0] SqrtX;
-  logic  [`DIVb+3:0] DivX;
+  logic  [`DIVb:0] PreSqrtX;
+  logic  [`DIVb+3:0] DivX, SqrtX;
   logic  [`NE+1:0] QeE;
-  // Intdiv signals
   logic  [`DIVb-1:0] IFNormLenX, IFNormLenD;
-  logic  [`DIVBLEN:0] mE;
-  logic  [`DIVBLEN:0] pPlusr, pPrCeil, p, ell;
-  logic  [`LOGRK:0] pPrTrunc;
+  logic  [`DIVBLEN:0] mE, ell;
   logic  [`DIVb+3:0]  PreShiftX;
   logic  NumZeroE;
 
-  // ***can probably merge X LZC with conversion
-  // cout the number of leading zeros
-
   if (`IDIV_ON_FPU) begin
     logic signedDiv;
     logic  AsE, BsE, ALTBE, NegQuotE;
@@ -72,6 +66,8 @@ module fdivsqrtpreproc (
     logic  [`XLEN-1:0] PosA, PosB;
     logic  [`DIVBLEN:0] ZeroDiff, IntBits;
     logic  [`LOGRK-1:0] RightShiftX;
+    logic  [`DIVBLEN:0] pPlusr, pPrCeil, p;
+    logic  [`LOGRK-1:0] pPrTrunc;
 
     // Extract inputs, signs, zero, depending on W64 mode if applicable
     assign signedDiv = ~Funct3E[0];
@@ -149,16 +145,15 @@ module fdivsqrtpreproc (
   assign DPreproc = IFNormLenD << (mE + {{`DIVBLEN{1'b0}}, 1'b1}); 
 
   //  append leading 1 (for nonzero inputs) and zero-extend
-  assign SqrtX = (Xe[0]^ell[0]) ? {1'b0, ~NumZeroE, XPreproc[`DIVb-1:1]} : {~NumZeroE, XPreproc}; // Bottom bit of XPreproc is always zero because DIVb is larger than XLEN and NF
+  assign PreSqrtX = (Xe[0]^ell[0]) ? {1'b0, ~NumZeroE, XPreproc[`DIVb-1:1]} : {~NumZeroE, XPreproc}; // Bottom bit of XPreproc is always zero because DIVb is larger than XLEN and NF
   assign DivX = {3'b000, ~NumZeroE, XPreproc};
-
-  // *** explain why X is shifted between radices (initial assignment of WS=RX)
-  if (`RADIX == 2)  assign PreShiftX = Sqrt ? {3'b111, SqrtX} : DivX;
-  else              assign PreShiftX = Sqrt ? {2'b11, SqrtX, 1'b0} : DivX;
-
+  // Sqrt is initialized after a first step of R(X-1), which depends on Radix
+  if (`RADIX == 2)  assign SqrtX = {3'b111, PreSqrtX};
+  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
+  assign PreShiftX = Sqrt ? SqrtX : DivX;
+ 
   // Floating-point exponent
   fdivsqrtexpcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZero(XZeroE), .ell, .m(mE), .Qe(QeE));
-
-  flopen #(`NE+2)    expreg(clk, IFDivStartE, QeE, QeM);
+  flopen #(`NE+2) expreg(clk, IFDivStartE, QeE, QeM);
 endmodule
 

From 3c475455d9208ba5a1cbfb28a698cd11ad2b02fb Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:00:48 -0800
Subject: [PATCH 09/14] Clean up sqrt preproc

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index e5645674..63d391ae 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -104,7 +104,7 @@ module fdivsqrtpreproc (
     assign p = ALTBE ? '0 : ZeroDiff;
 
   /* verilator lint_off WIDTH */
-    // right shift amount to complete in discrete number of steps
+    // calculate number of cycles nE and right shift amount RightShiftX needed to complete in a discrete number of steps
     assign pPlusr = `LOGR + p;
     assign pPrTrunc = pPlusr % `RK;
     assign pPrCeil = (pPlusr >> `LOGRK) + |pPrTrunc;
@@ -145,6 +145,7 @@ module fdivsqrtpreproc (
   assign DPreproc = IFNormLenD << (mE + {{`DIVBLEN{1'b0}}, 1'b1}); 
 
   //  append leading 1 (for nonzero inputs) and zero-extend
+  // *** explain this next line
   assign PreSqrtX = (Xe[0]^ell[0]) ? {1'b0, ~NumZeroE, XPreproc[`DIVb-1:1]} : {~NumZeroE, XPreproc}; // Bottom bit of XPreproc is always zero because DIVb is larger than XLEN and NF
   assign DivX = {3'b000, ~NumZeroE, XPreproc};
   // Sqrt is initialized after a first step of R(X-1), which depends on Radix

From ba976d66e40868e8cac69764dfa833aad1bcb73c Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:01:44 -0800
Subject: [PATCH 10/14] Radix 4 divsqrt

---
 pipelined/config/shared/wally-shared.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index cc24c42f..044bd7d7 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -109,7 +109,7 @@
 `define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+8) ? (`DIVRESLEN+`NF) : (3*`NF+6))
 
 // division constants
-`define RADIX 32'h2
+`define RADIX 32'h4
 `define DIVCOPIES 32'h4
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : `NF+3)
 // `define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input

From 61230c967ceb460022f7db428267ae838a90085c Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:10:47 -0800
Subject: [PATCH 11/14] simplified sign handling mux

---
 .../src/fpu/fdivsqrt/fdivsqrtpostproc.sv      | 27 +++++--------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 6d955d61..1dd11b3f 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -52,9 +52,6 @@ module fdivsqrtpostproc(
   logic [`DIVb:0] PreQmM;
   logic NegStickyM;
   logic weq0E, weq0M, WZeroM;
-  logic [`DIVBLEN:0] NormShiftM;
-  logic [`DIVb:0] NormQuotM;
-  logic [`DIVb+3:0] IntQuotM, IntRemM, NormRemM;
   logic signed [`DIVb+3:0] PreResultM, PreFPIntDivResultM;
   logic [`XLEN-1:0] SpecialFPIntDivResultM;
 
@@ -104,27 +101,17 @@ module fdivsqrtpostproc(
   assign QmM = SqrtM ? (PreQmM << 1) : PreQmM;
 
   if (`IDIV_ON_FPU) begin
+    logic [`DIVBLEN:0] NormShiftM;
+    logic [`DIVb:0] NormQuotM;
+    logic [`DIVb+3:0] IntQuotM, IntRemM, NormRemM, NormRemDM;
+
     assign W = $signed(Sum) >>> `LOGR;
     assign DM = {4'b0001, D};
 
     // Integer division: sign handling for div and rem
-    always_comb 
-      if (~AsM)
-        if (NegStickyM) begin
-          NormQuotM = FirstUM;
-          NormRemM  = W + DM;
-        end else begin
-          NormQuotM = FirstU;
-          NormRemM  = W;
-        end
-      else 
-        if (NegStickyM) begin
-          NormQuotM = FirstUM;
-          NormRemM  = -(W + DM);
-        end else begin 
-          NormQuotM = FirstU;
-          NormRemM  = -W;
-        end
+    mux2 #(`DIVb+1) normquotmux(FirstU, FirstUM, NegStickyM, NormQuotM);
+    mux2 #(`DIVb+4) normremdmux(W, W+DM, NegStickyM, NormRemDM);
+    mux2 #(`DIVb+4) normremsmux(NormRemDM, -NormRemDM, AsM, NormRemM);
 
     // Integer division: Special cases
     always_comb

From 30dc45c76408a1d88293fa79f6dae4037a359bbd Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:17:38 -0800
Subject: [PATCH 12/14] removed duplicate quotient mux

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 1dd11b3f..258f0eb5 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -102,14 +102,12 @@ module fdivsqrtpostproc(
 
   if (`IDIV_ON_FPU) begin
     logic [`DIVBLEN:0] NormShiftM;
-    logic [`DIVb:0] NormQuotM;
     logic [`DIVb+3:0] IntQuotM, IntRemM, NormRemM, NormRemDM;
 
     assign W = $signed(Sum) >>> `LOGR;
     assign DM = {4'b0001, D};
 
-    // Integer division: sign handling for div and rem
-    mux2 #(`DIVb+1) normquotmux(FirstU, FirstUM, NegStickyM, NormQuotM);
+    // Integer remainder: sticky and sign correction muxes
     mux2 #(`DIVb+4) normremdmux(W, W+DM, NegStickyM, NormRemDM);
     mux2 #(`DIVb+4) normremsmux(NormRemDM, -NormRemDM, AsM, NormRemM);
 
@@ -129,7 +127,7 @@ module fdivsqrtpostproc(
             IntRemM  = '0;
           end 
         end else begin 
-          PreIntQuotM = {3'b000, NormQuotM};
+          PreIntQuotM = {3'b000, PreQmM};
           IntRemM  = NormRemM;
         end 
         // flip sign if necessary
@@ -147,7 +145,7 @@ module fdivsqrtpostproc(
       end
     
 
-    // division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
+    // integer division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
     
     assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
     assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases

From bd16fd79d42506a7263f8180f0889ac81aa0fb4d Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:34:26 -0800
Subject: [PATCH 13/14] started simplifying integer division special cases

---
 .../src/fpu/fdivsqrt/fdivsqrtpostproc.sv      | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 258f0eb5..80d9e4b0 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -111,6 +111,43 @@ module fdivsqrtpostproc(
     mux2 #(`DIVb+4) normremdmux(W, W+DM, NegStickyM, NormRemDM);
     mux2 #(`DIVb+4) normremsmux(NormRemDM, -NormRemDM, AsM, NormRemM);
 
+    // special case logic
+    always_comb
+      if (ALTBM) begin
+        if (RemOpM) PreFPIntDivResultM = {{(`DIVb-`XLEN+4){1'b0}}, AM};
+        else        PreFPIntDivResultM = '0;
+ //       IntQuotM = '0;
+ //       IntRemM  = {{(`DIVb-`XLEN+4){1'b0}}, AM};
+      end else begin
+        logic [`DIVb+3:0] PreIntQuotM;
+        if (WZeroM) begin
+          if (weq0M) begin
+            PreIntQuotM = {3'b000, FirstU};
+            IntRemM  = '0;
+          end else begin
+            PreIntQuotM = {3'b000, FirstUM};
+            IntRemM  = '0;
+          end 
+        end else begin 
+          PreIntQuotM = {3'b000, PreQmM};
+          IntRemM  = NormRemM;
+        end 
+        // flip sign if necessary
+        if (NegQuotM) IntQuotM = -PreIntQuotM;
+        else          IntQuotM =  PreIntQuotM;
+        if (RemOpM) begin
+          NormShiftM = ALTBM ? '0 : (mM + (`DIVBLEN+1)'(`DIVa)); // no postshift if forwarding input A to remainder
+          PreResultM = IntRemM;
+        end else begin
+          NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR)));
+          PreResultM = IntQuotM;
+        end
+        PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
+      end
+
+    assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
+
+/*
     // Integer division: Special cases
     always_comb
       if (ALTBM) begin
@@ -149,6 +186,7 @@ module fdivsqrtpostproc(
     
     assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
     assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
+*/
 
     // sign extend result for W64
     if (`XLEN==64)

From 58218dbdd178dbc980f22432e2c5be845ca4f03e Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:40:28 -0800
Subject: [PATCH 14/14] continued simplifying integer division special cases

---
 .../src/fpu/fdivsqrt/fdivsqrtpostproc.sv      | 53 +++----------------
 1 file changed, 7 insertions(+), 46 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 80d9e4b0..c78738a4 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -113,9 +113,12 @@ module fdivsqrtpostproc(
 
     // special case logic
     always_comb
-      if (ALTBM) begin
-        if (RemOpM) PreFPIntDivResultM = {{(`DIVb-`XLEN+4){1'b0}}, AM};
-        else        PreFPIntDivResultM = '0;
+      if (BZeroM) begin 
+        if (RemOpM) SpecialFPIntDivResultM = AM;
+        else        SpecialFPIntDivResultM = {(`XLEN){1'b1}};
+      end else if (ALTBM) begin
+        if (RemOpM) SpecialFPIntDivResultM = AM;
+        else        SpecialFPIntDivResultM = '0;
  //       IntQuotM = '0;
  //       IntRemM  = {{(`DIVb-`XLEN+4){1'b0}}, AM};
       end else begin
@@ -143,51 +146,9 @@ module fdivsqrtpostproc(
           PreResultM = IntQuotM;
         end
         PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
+        SpecialFPIntDivResultM = PreFPIntDivResultM[`XLEN-1:0];
       end
 
-    assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
-
-/*
-    // Integer division: Special cases
-    always_comb
-      if (ALTBM) begin
-        IntQuotM = '0;
-        IntRemM  = {{(`DIVb-`XLEN+4){1'b0}}, AM};
-      end else begin
-        logic [`DIVb+3:0] PreIntQuotM;
-        if (WZeroM) begin
-          if (weq0M) begin
-            PreIntQuotM = {3'b000, FirstU};
-            IntRemM  = '0;
-          end else begin
-            PreIntQuotM = {3'b000, FirstUM};
-            IntRemM  = '0;
-          end 
-        end else begin 
-          PreIntQuotM = {3'b000, PreQmM};
-          IntRemM  = NormRemM;
-        end 
-        // flip sign if necessary
-        if (NegQuotM) IntQuotM = -PreIntQuotM;
-        else          IntQuotM =  PreIntQuotM;
-      end
-    
-    always_comb
-      if (RemOpM) begin
-        NormShiftM = ALTBM ? '0 : (mM + (`DIVBLEN+1)'(`DIVa)); // no postshift if forwarding input A to remainder
-        PreResultM = IntRemM;
-      end else begin
-        NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR)));
-        PreResultM = IntQuotM;
-      end
-    
-
-    // integer division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
-    
-    assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
-    assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
-*/
-
     // sign extend result for W64
     if (`XLEN==64)
       assign FPIntDivResultM = (W64M ? {{(`XLEN-32){SpecialFPIntDivResultM[31]}}, SpecialFPIntDivResultM[31:0]} :