From b469831b5354f750a83b8de2f7e882c456ef9736 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Wed, 28 Dec 2022 17:46:53 -0600
Subject: [PATCH 01/15] one bit removed from initial alignment shift

---
 pipelined/config/shared/wally-shared.vh       |  4 ++--
 pipelined/src/fpu/fma/fma.sv                  | 11 +++++-----
 pipelined/src/fpu/fma/fmaadd.sv               | 12 +++++------
 pipelined/src/fpu/fma/fmaalign.sv             | 20 +++++++++----------
 pipelined/src/fpu/fma/fmalza.sv               |  2 +-
 pipelined/src/fpu/fpu.sv                      |  8 ++++----
 pipelined/src/fpu/postproc/fmashiftcalc.sv    | 14 ++++++-------
 pipelined/src/fpu/postproc/postprocess.sv     | 12 +++++------
 pipelined/src/fpu/postproc/shiftcorrection.sv |  4 ++--
 pipelined/testbench/testbench-fp.sv           |  4 ++--
 10 files changed, 46 insertions(+), 45 deletions(-)

diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index cc24c42f6..cb2930a7a 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -104,9 +104,9 @@
 `define CVTLEN ((`NF<`XLEN) ? (`XLEN) : (`NF))
 `define LLEN ((`FLEN<`XLEN) ? (`XLEN) : (`FLEN))
 `define LOGCVTLEN $unsigned($clog2(`CVTLEN+1))
-`define NORMSHIFTSZ ((`QLEN+`NF+1) > (3*`NF+8) ? (`QLEN+`NF+1) : (3*`NF+8))
+`define NORMSHIFTSZ ((`QLEN+`NF+1) > (3*`NF+7) ? (`QLEN+`NF+1) : (3*`NF+7))//change
 `define LOGNORMSHIFTSZ ($clog2(`NORMSHIFTSZ))
-`define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+8) ? (`DIVRESLEN+`NF) : (3*`NF+6))
+`define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+7) ? (`DIVRESLEN+`NF) : (3*`NF+5))//change
 
 // division constants
 `define RADIX 32'h2
diff --git a/pipelined/src/fpu/fma/fma.sv b/pipelined/src/fpu/fma/fma.sv
index c1a69e6fc..eb2213da5 100644
--- a/pipelined/src/fpu/fma/fma.sv
+++ b/pipelined/src/fpu/fma/fma.sv
@@ -37,18 +37,18 @@ module fma(
     input logic                 XZero, YZero, ZZero, // is the input zero
     input logic  [2:0]          OpCtrl,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
     output logic                ZmSticky,  // sticky bit that is calculated during alignment
-    output logic [3*`NF+5:0]    Sm,           // the positive sum's significand
+    output logic [3*`NF+4:0]    Sm,//change           // the positive sum's significand
     output logic                InvA,          // Was A inverted for effective subtraction (P-A or -P+A)
     output logic                As,       // the aligned addend's sign (modified Z sign for other opperations)
     output logic                Ps,          // the product's sign
     output logic                Ss,          // the sum's sign
     output logic [`NE+1:0]      Se,
-    output logic [$clog2(3*`NF+7)-1:0]          SCnt        // normalization shift count
+    output logic [$clog2(3*`NF+6)-1:0]          SCnt//change        // normalization shift count
 );
 
     logic [2*`NF+1:0]   Pm;           // the product's significand in U(2.2Nf) format
-    logic [3*`NF+5:0]   Am;     // addend aligned's mantissa for addition in U(NF+5.2NF+1)
-    logic [3*`NF+5:0]   AmInv;   // aligned addend's mantissa possibly inverted
+    logic [3*`NF+4:0]   Am;//change     // addend aligned's mantissa for addition in U(NF+5.2NF+1)
+    logic [3*`NF+4:0]   AmInv; //change   // aligned addend's mantissa possibly inverted
     logic [2*`NF+1:0]   PmKilled;      // the product's mantissa possibly killed
     logic               KillProd;  // set the product to zero before addition if the product is too small to matter
     logic [`NE+1:0]     Pe;       // the product's exponent B(NE+2.0) format; adds 2 bits to allow for size of number and negative sign
@@ -85,7 +85,8 @@ module fma(
         
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .KillProd, .ZmSticky, .AmInv, .PmKilled, .InvA, .Sm, .Se, .Ss);
 
-    fmalza #(3*`NF+6) lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
+    //change
+    fmalza #(3*`NF+5) lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
 endmodule
 
 
diff --git a/pipelined/src/fpu/fma/fmaadd.sv b/pipelined/src/fpu/fma/fmaadd.sv
index adb8f4504..0991e44b0 100644
--- a/pipelined/src/fpu/fma/fmaadd.sv
+++ b/pipelined/src/fpu/fma/fmaadd.sv
@@ -31,7 +31,7 @@
 `include "wally-config.vh"
 
 module fmaadd(
-    input logic  [3*`NF+5:0]    Am, // aligned addend's mantissa for addition in U(NF+5.2NF+1)
+    input logic  [3*`NF+4:0]    Am, //change // aligned addend's mantissa for addition in U(NF+5.2NF+1)
     input logic  [2*`NF+1:0]    Pm,       // the product's mantissa
     input logic                 Ps, // the product sign and the alligend addeded's sign (Modified Z sign for other opperations)
     input logic                InvA,          // invert the aligned addend
@@ -39,13 +39,13 @@ module fmaadd(
     input logic                 ZmSticky,
     input logic  [`NE-1:0]      Ze,
     input logic  [`NE+1:0]      Pe,
-    output logic [3*`NF+5:0]    AmInv,  // aligned addend possibly inverted
+    output logic [3*`NF+4:0]    AmInv,//change // aligned addend possibly inverted
     output logic [2*`NF+1:0]    PmKilled,     // the product's mantissa possibly killed
     output logic                Ss,          
     output logic [`NE+1:0]      Se,
-    output logic [3*`NF+5:0]    Sm           // the positive sum
+    output logic [3*`NF+4:0]    Sm//change           // the positive sum
 );
-    logic [3*`NF+5:0]    PreSum, NegPreSum; // possibly negitive sum
+    logic [3*`NF+4:0]    PreSum, NegPreSum;//change // possibly negitive sum
     logic [3*`NF+5:0]    PreSumdebug, NegPreSumdebug; // possibly negitive sum
     logic                NegSum;        // was the sum negitive
     logic                NegSumdebug;        // was the sum negitive
@@ -65,8 +65,8 @@ module fmaadd(
     //          ie ~(InvA&ZmSticky&~KillProd)&InvA = (~ZmSticky|KillProd)&InvA
     //      addend - prod where product is killed (and not exactly zero) then don't add +1 from negation 
     //          ie ~(InvA&ZmSticky&KillProd)&InvA = (~ZmSticky|~KillProd)&InvA
-    assign {NegSum, PreSum} = {{`NF+3{1'b0}}, PmKilled, 2'b0} + {InvA, AmInv} + {{3*`NF+6{1'b0}}, (~ZmSticky|KillProd)&InvA};
-    assign NegPreSum = Am + {{`NF+2{1'b1}}, ~PmKilled, 2'b0} + {(3*`NF+3)'(0), (~ZmSticky|~KillProd)&InvA, 2'b0};
+    assign {NegSum, PreSum} = {{`NF+2{1'b0}}, PmKilled, 2'b0} + {InvA, AmInv} + {{3*`NF+5{1'b0}}, (~ZmSticky|KillProd)&InvA};//change
+    assign NegPreSum = Am + {{`NF+1{1'b1}}, ~PmKilled, 2'b0} + {(3*`NF+2)'(0), (~ZmSticky|~KillProd)&InvA, 2'b0};//change
      
     // Choose the positive sum and accompanying LZA result.
     assign Sm = NegSum ? NegPreSum : PreSum;
diff --git a/pipelined/src/fpu/fma/fmaalign.sv b/pipelined/src/fpu/fma/fmaalign.sv
index e423c19dd..85b28c7b5 100644
--- a/pipelined/src/fpu/fma/fmaalign.sv
+++ b/pipelined/src/fpu/fma/fmaalign.sv
@@ -35,14 +35,14 @@ module fmaalign(
     input logic  [`NE-1:0]      Xe, Ye, Ze,      // biased exponents in B(NE.0) format
     input logic  [`NF:0]        Zm,      // significand in U(0.NF) format]
     input logic                 XZero, YZero, ZZero, // is the input zero
-    output logic [3*`NF+5:0]    Am, // addend aligned for addition in U(NF+5.2NF+1)
+    output logic [3*`NF+4:0]    Am,//change // addend aligned for addition in U(NF+5.2NF+1)
     output logic                ZmSticky,  // Sticky bit calculated from the aliged addend
     output logic                KillProd       // should the product be set to zero
 );
 
     logic [`NE+1:0]     ACnt;           // how far to shift the addend to align with the product in Q(NE+2.0) format
-    logic [4*`NF+5:0]   ZmShifted;        // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
-    logic [4*`NF+5:0]   ZmPreshifted;     // input to the alignment shifter U(NF+5.3NF+1)
+    logic [4*`NF+4:0]   ZmShifted;//change        // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
+    logic [4*`NF+4:0]   ZmPreshifted;//change     // input to the alignment shifter U(NF+5.3NF+1)
     logic KillZ;
 
     ///////////////////////////////////////////////////////////////////////////////
@@ -53,16 +53,16 @@ module fmaalign(
     //      - negitive means Z is larger, so shift Z left
     //      - positive means the product is larger, so shift Z right
     // This could have been done using Pe, but ACnt is on the critical path so we replicate logic for speed
-    assign ACnt = {2'b0, Xe} + {2'b0, Ye} - {2'b0, (`NE)'(`BIAS)} + (`NE+2)'(`NF+3) - {2'b0, Ze};
+    assign ACnt = {2'b0, Xe} + {2'b0, Ye} - {2'b0, (`NE)'(`BIAS)} + (`NE+2)'(`NF+2) - {2'b0, Ze};
 
     // Defualt Addition with only inital left shift
-    //          |   54'b0    |  106'b(product)  | 2'b0 |
+    //          |   53'b0    |  106'b(product)  | 2'b0 |
     //          | addnend |
 
-    assign ZmPreshifted = {Zm,(3*`NF+5)'(0)};
+    assign ZmPreshifted = {Zm,(3*`NF+4)'(0)}; //change
     
     assign KillProd = (ACnt[`NE+1]&~ZZero)|XZero|YZero;
-    assign KillZ = $signed(ACnt)>$signed((`NE+2)'(3)*(`NE+2)'(`NF)+(`NE+2)'(5));
+    assign KillZ = $signed(ACnt)>$signed((`NE+2)'(3)*(`NE+2)'(`NF)+(`NE+2)'(4));//change
 
     always_comb
         begin
@@ -72,7 +72,7 @@ module fmaalign(
         //          |   54'b0    |  106'b(product)  | 2'b0 |
         //  | addnend |
         if (KillProd) begin
-            ZmShifted = {(`NF+3)'(0), Zm, (2*`NF+2)'(0)};
+            ZmShifted = {(`NF+2)'(0), Zm, (2*`NF+2)'(0)};//change
             ZmSticky = ~(XZero|YZero);
 
         // If the addend is too small to effect the addition        
@@ -90,12 +90,12 @@ module fmaalign(
         //                                  | addnend |
         end else begin
             ZmShifted = ZmPreshifted >> ACnt;
-            ZmSticky = |(ZmShifted[`NF-1:0]);
+            ZmSticky = |(ZmShifted[`NF-1:0]); 
 
         end
     end
 
-    assign Am = ZmShifted[4*`NF+5:`NF];
+    assign Am = ZmShifted[4*`NF+4:`NF];//change
 
 endmodule
 
diff --git a/pipelined/src/fpu/fma/fmalza.sv b/pipelined/src/fpu/fma/fmalza.sv
index 1f6677ddc..182075bde 100644
--- a/pipelined/src/fpu/fma/fmalza.sv
+++ b/pipelined/src/fpu/fma/fmalza.sv
@@ -42,7 +42,7 @@ module fmalza #(WIDTH) ( // [Schmookler & Nowka, Leading zero anticipation and d
    logic [WIDTH-1:0]  B, P, G, K;
     logic [WIDTH-1:0] Pp1, Gm1, Km1;
 
-    assign B = {{(`NF+2){1'b0}}, Pm}; // Zero extend product
+    assign B = {{(`NF+1){1'b0}}, Pm};//change // Zero extend product
 
     assign P = A^B;
     assign G = A&B;
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index 4ae12462d..1ebd391c0 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -109,14 +109,14 @@ module fpu (
    logic 		      XExpMaxE;                           // is the exponent all ones (max value)
 
    // Fma Signals
-   logic [3*`NF+5:0] SmE, SmM;                       
+   logic [3*`NF+4:0] SmE, SmM;//change             
    logic 			   ZmStickyE, ZmStickyM;
    logic [`NE+1:0]   SeE,SeM;
    logic 			   InvAE, InvAM;
    logic 			   AsE, AsM;
    logic 			   PsE, PsM;
    logic 			   SsE, SsM;
-   logic [$clog2(3*`NF+7)-1:0] SCntE, SCntM;
+   logic [$clog2(3*`NF+6)-1:0] SCntE, SCntM;//change
 
    // Cvt Signals
    logic [`NE:0]           CeE, CeM;    // the calculated expoent
@@ -352,8 +352,8 @@ module fpu (
             {XsE, YsE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE, ZDenormE},
             {XsM, YsM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM, ZDenormM});     
    flopenrc #(1)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, PreNVE, PreNVM);      
-   flopenrc #(3*`NF+6) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM); 
-  flopenrc #($clog2(3*`NF+7)+7+`NE) EMRegFma4(clk, reset, FlushM, ~StallM, 
+   flopenrc #(3*`NF+5) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM);//change 
+  flopenrc #($clog2(3*`NF+6)+7+`NE) EMRegFma4(clk, reset, FlushM, ~StallM, //change
                            {ZmStickyE, InvAE, SCntE, AsE, PsE, SsE, SeE},
                            {ZmStickyM, InvAM, SCntM, AsM, PsM, SsM, SeM});
    flopenrc #(`NE+`LOGCVTLEN+`CVTLEN+4) EMRegCvt(clk, reset, FlushM, ~StallM, 
diff --git a/pipelined/src/fpu/postproc/fmashiftcalc.sv b/pipelined/src/fpu/postproc/fmashiftcalc.sv
index 1e8012784..5c301da7d 100644
--- a/pipelined/src/fpu/postproc/fmashiftcalc.sv
+++ b/pipelined/src/fpu/postproc/fmashiftcalc.sv
@@ -30,15 +30,15 @@
 `include "wally-config.vh"
 
 module fmashiftcalc(
-    input logic  [3*`NF+5:0]            FmaSm,       // the positive sum
-    input logic  [$clog2(3*`NF+7)-1:0]  FmaSCnt,   // normalization shift count
+    input logic  [3*`NF+4:0]            FmaSm,//change       // the positive sum
+    input logic  [$clog2(3*`NF+6)-1:0]  FmaSCnt,//change   // normalization shift count
     input logic  [`FMTBITS-1:0]         Fmt,       // precision 1 = double 0 = single
     input logic [`NE+1:0] FmaSe,
     output logic [`NE+1:0]              NormSumExp,          // exponent of the normalized sum not taking into account denormal or zero results
     output logic                        FmaSZero,    // is the result denormalized - calculated before LZA corection
     output logic                        FmaPreResultDenorm,    // is the result denormalized - calculated before LZA corection
-    output logic [$clog2(3*`NF+7)-1:0]  FmaShiftAmt,   // normalization shift count
-    output logic [3*`NF+7:0]            FmaShiftIn        // is the sum zero
+    output logic [$clog2(3*`NF+6)-1:0]  FmaShiftAmt,//change   // normalization shift count
+    output logic [3*`NF+6:0]            FmaShiftIn//change        // is the sum zero
 );
     logic [`NE+1:0]             PreNormSumExp;       // the exponent of the normalized sum with the `FLEN bias
     logic [`NE+1:0] BiasCorr;
@@ -50,7 +50,7 @@ module fmashiftcalc(
     // Determine if the sum is zero
     assign FmaSZero = ~(|FmaSm);
     // calculate the sum's exponent
-    assign PreNormSumExp = FmaSe + {{`NE+2-$unsigned($clog2(3*`NF+7)){1'b1}}, ~FmaSCnt} + (`NE+2)'(`NF+4);
+    assign PreNormSumExp = FmaSe + {{`NE+2-$unsigned($clog2(3*`NF+6)){1'b1}}, ~FmaSCnt} + (`NE+2)'(`NF+3);//change
 
     //convert the sum's exponent into the proper percision
     if (`FPSIZES == 1) begin
@@ -150,7 +150,7 @@ module fmashiftcalc(
     //  - shift once if killing a product and the result is denormalized
     assign FmaShiftIn = {2'b0, FmaSm};
     if (`FPSIZES == 1)
-        assign FmaShiftAmt = FmaPreResultDenorm ? FmaSe[$clog2(3*`NF+7)-1:0]+($clog2(3*`NF+7))'(`NF+3): FmaSCnt+1;
+        assign FmaShiftAmt = FmaPreResultDenorm ? FmaSe[$clog2(3*`NF+6)-1:0]+($clog2(3*`NF+6))'(`NF+2): FmaSCnt+1;//change
     else
-        assign FmaShiftAmt = FmaPreResultDenorm ? FmaSe[$clog2(3*`NF+7)-1:0]+($clog2(3*`NF+7))'(`NF+3)+BiasCorr[$clog2(3*`NF+7)-1:0]: FmaSCnt+1;
+        assign FmaShiftAmt = FmaPreResultDenorm ? FmaSe[$clog2(3*`NF+6)-1:0]+($clog2(3*`NF+6))'(`NF+2)+BiasCorr[$clog2(3*`NF+6)-1:0]: FmaSCnt+1;//change
 endmodule
diff --git a/pipelined/src/fpu/postproc/postprocess.sv b/pipelined/src/fpu/postproc/postprocess.sv
index 368f3ef77..7c758b28c 100644
--- a/pipelined/src/fpu/postproc/postprocess.sv
+++ b/pipelined/src/fpu/postproc/postprocess.sv
@@ -47,10 +47,10 @@ module postprocess (
     input logic                             FmaAs,   // the modified Z sign - depends on instruction
     input logic                             FmaPs,      // the product's sign
     input logic  [`NE+1:0]                  FmaSe,
-    input logic  [3*`NF+5:0]                FmaSm,       // the positive sum
+    input logic  [3*`NF+4:0]                FmaSm,//change      // the positive sum
     input logic                             FmaZmS,  // sticky bit that is calculated during alignment
     input logic                             FmaSs,
-    input logic  [$clog2(3*`NF+7)-1:0]      FmaSCnt,   // the normalization shift count
+    input logic  [$clog2(3*`NF+6)-1:0]      FmaSCnt,//change   // the normalization shift count
     //divide signals
     input logic                             DivS,
 //    input logic                             DivDone,
@@ -89,10 +89,10 @@ module postprocess (
     // fma signals
     logic [`NE+1:0] FmaMe;     // exponent of the normalized sum
     logic FmaSZero;        // is the sum zero
-    logic [3*`NF+7:0] FmaShiftIn;        // shift input
+    logic [3*`NF+6:0] FmaShiftIn;//change        // shift input
     logic [`NE+1:0] NormSumExp;          // exponent of the normalized sum not taking into account denormal or zero results
     logic FmaPreResultDenorm;    // is the result denormalized - calculated before LZA corection
-    logic [$clog2(3*`NF+7)-1:0] FmaShiftAmt;   // normalization shift count
+    logic [$clog2(3*`NF+6)-1:0] FmaShiftAmt;//change   // normalization shift count
     // division singals
     logic [`LOGNORMSHIFTSZ-1:0] DivShiftAmt;
     logic [`NORMSHIFTSZ-1:0] DivShiftIn;
@@ -152,8 +152,8 @@ module postprocess (
     always_comb
         case(PostProcSel)
             2'b10: begin // fma
-                ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(3*`NF+7){1'b0}}, FmaShiftAmt};
-                ShiftIn =  {FmaShiftIn, {`NORMSHIFTSZ-(3*`NF+8){1'b0}}};
+                ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(3*`NF+6){1'b0}}, FmaShiftAmt};//change
+                ShiftIn =  {FmaShiftIn, {`NORMSHIFTSZ-(3*`NF+7){1'b0}}};//change
             end
             2'b00: begin // cvt
                 ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(`CVTLEN+1){1'b0}}, CvtShiftAmt};
diff --git a/pipelined/src/fpu/postproc/shiftcorrection.sv b/pipelined/src/fpu/postproc/shiftcorrection.sv
index eca97bcf9..588daa945 100644
--- a/pipelined/src/fpu/postproc/shiftcorrection.sv
+++ b/pipelined/src/fpu/postproc/shiftcorrection.sv
@@ -43,7 +43,7 @@ module shiftcorrection(
     output logic [`NE+1:0]          Qe,
     output logic [`NE+1:0]          FmaMe         // exponent of the normalized sum
 );
-    logic [3*`NF+5:0]      CorrSumShifted;     // the shifted sum after LZA correction
+    logic [3*`NF+4:0]      CorrSumShifted;//change     // the shifted sum after LZA correction
     logic [`CORRSHIFTSZ-1:0] CorrQmShifted;
     logic                  ResDenorm;    // is the result denormalized
     logic                  LZAPlus1; // add one or two to the sum's exponent due to LZA correction
@@ -56,7 +56,7 @@ module shiftcorrection(
     assign CorrQmShifted = (LZAPlus1|(DivQe==1&~LZAPlus1)) ? Shifted[`NORMSHIFTSZ-2:`NORMSHIFTSZ-`CORRSHIFTSZ-1] : Shifted[`NORMSHIFTSZ-3:`NORMSHIFTSZ-`CORRSHIFTSZ-2];
     // if the result of the divider was calculated to be denormalized, then the result was correctly normalized, so select the top shifted bits
     always_comb
-        if(FmaOp)                       Mf = {CorrSumShifted, {`CORRSHIFTSZ-(3*`NF+6){1'b0}}};
+        if(FmaOp)                       Mf = {CorrSumShifted, {`CORRSHIFTSZ-(3*`NF+5){1'b0}}};//change
         else if (DivOp&~DivResDenorm)   Mf = CorrQmShifted;
         else                            Mf = Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ];
     // Determine sum's exponent
diff --git a/pipelined/testbench/testbench-fp.sv b/pipelined/testbench/testbench-fp.sv
index c20dd3ad7..d09534829 100644
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@@ -94,8 +94,8 @@ module testbenchfp;
   logic [`NE+1:0]	      Se;
   logic 				        ZmSticky;
   logic 					      KillProd; 
-  logic [$clog2(3*`NF+7)-1:0]	SCnt;
-  logic [3*`NF+5:0]	    Sm;       
+  logic [$clog2(3*`NF+6)-1:0]	SCnt;
+  logic [3*`NF+4:0]	    Sm;       
   logic 			          InvA;
   logic 			          NegSum;
   logic 			          As;

From e5a76817df697311ea71462ce6220c5a3108127b Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Thu, 29 Dec 2022 15:54:17 -0600
Subject: [PATCH 02/15] minor optimizations and renaming

---
 pipelined/src/fpu/fma/fma.sv        |  8 ++++----
 pipelined/src/fpu/fma/fmaadd.sv     | 11 ++++++-----
 pipelined/src/fpu/fma/fmaalign.sv   |  9 +++++----
 pipelined/src/fpu/fpu.sv            |  2 +-
 pipelined/testbench/testbench-fp.sv |  6 +++---
 5 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/pipelined/src/fpu/fma/fma.sv b/pipelined/src/fpu/fma/fma.sv
index eb2213da5..4d60f477f 100644
--- a/pipelined/src/fpu/fma/fma.sv
+++ b/pipelined/src/fpu/fma/fma.sv
@@ -36,7 +36,7 @@ module fma(
     input logic  [`NF:0]        Xm, Ym, Zm,    // input's significands in U(0.NF) format
     input logic                 XZero, YZero, ZZero, // is the input zero
     input logic  [2:0]          OpCtrl,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-    output logic                ZmSticky,  // sticky bit that is calculated during alignment
+    output logic                ASticky,  // sticky bit that is calculated during alignment
     output logic [3*`NF+4:0]    Sm,//change           // the positive sum's significand
     output logic                InvA,          // Was A inverted for effective subtraction (P-A or -P+A)
     output logic                As,       // the aligned addend's sign (modified Z sign for other opperations)
@@ -75,7 +75,7 @@ module fma(
     fmasign sign(.OpCtrl, .Xs, .Ys, .Zs, .Ps, .As, .InvA);
 
     fmaalign align(.Ze, .Zm, .XZero, .YZero, .ZZero, .Xe, .Ye,
-                .Am, .ZmSticky, .KillProd);
+                .Am, .ASticky, .KillProd);
                         
 
 
@@ -83,10 +83,10 @@ module fma(
     // // Addition/LZA
     // ///////////////////////////////////////////////////////////////////////////////
         
-    fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .KillProd, .ZmSticky, .AmInv, .PmKilled, .InvA, .Sm, .Se, .Ss);
+    fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .KillProd, .ASticky, .AmInv, .PmKilled, .InvA, .Sm, .Se, .Ss);
 
     //change
-    fmalza #(3*`NF+5) lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
+    fmalza #(3*`NF+5) lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ASticky&KillProd}), .Cin(InvA & ~(ASticky & ~KillProd)), .sub(InvA), .SCnt);
 endmodule
 
 
diff --git a/pipelined/src/fpu/fma/fmaadd.sv b/pipelined/src/fpu/fma/fmaadd.sv
index 0991e44b0..509adb674 100644
--- a/pipelined/src/fpu/fma/fmaadd.sv
+++ b/pipelined/src/fpu/fma/fmaadd.sv
@@ -36,7 +36,7 @@ module fmaadd(
     input logic                 Ps, // the product sign and the alligend addeded's sign (Modified Z sign for other opperations)
     input logic                InvA,          // invert the aligned addend
     input logic                 KillProd,      // should the product be set to 0
-    input logic                 ZmSticky,
+    input logic                 ASticky,
     input logic  [`NE-1:0]      Ze,
     input logic  [`NE+1:0]      Pe,
     output logic [3*`NF+4:0]    AmInv,//change // aligned addend possibly inverted
@@ -62,11 +62,12 @@ module fmaadd(
     //      - calculate a positive and negitive sum in parallel
     // if there was a small negitive number killed in the alignment stage one needs to be subtracted from the sum
     //      prod - addend where some of the addend is put into the sticky bit then don't add +1 from negation 
-    //          ie ~(InvA&ZmSticky&~KillProd)&InvA = (~ZmSticky|KillProd)&InvA
+    //          ie ~(InvA&ASticky&~KillProd)&InvA = (~ASticky|KillProd)&InvA
     //      addend - prod where product is killed (and not exactly zero) then don't add +1 from negation 
-    //          ie ~(InvA&ZmSticky&KillProd)&InvA = (~ZmSticky|~KillProd)&InvA
-    assign {NegSum, PreSum} = {{`NF+2{1'b0}}, PmKilled, 2'b0} + {InvA, AmInv} + {{3*`NF+5{1'b0}}, (~ZmSticky|KillProd)&InvA};//change
-    assign NegPreSum = Am + {{`NF+1{1'b1}}, ~PmKilled, 2'b0} + {(3*`NF+2)'(0), (~ZmSticky|~KillProd)&InvA, 2'b0};//change
+    //          ie ~(InvA&ASticky&KillProd)&InvA = (~ASticky|~KillProd)&InvA
+    //          in this case this result is only ever selected when InvA=1 so we can remove &InvA
+    assign {NegSum, PreSum} = {{`NF+2{1'b0}}, PmKilled, 2'b0} + {InvA, AmInv} + {{3*`NF+5{1'b0}}, (~ASticky|KillProd)&InvA};//change
+    assign NegPreSum = Am + {{`NF+1{1'b1}}, ~PmKilled, 2'b0} + {(3*`NF+2)'(0), ~ASticky|~KillProd, 2'b0};//change
      
     // Choose the positive sum and accompanying LZA result.
     assign Sm = NegSum ? NegPreSum : PreSum;
diff --git a/pipelined/src/fpu/fma/fmaalign.sv b/pipelined/src/fpu/fma/fmaalign.sv
index 85b28c7b5..67dc0b824 100644
--- a/pipelined/src/fpu/fma/fmaalign.sv
+++ b/pipelined/src/fpu/fma/fmaalign.sv
@@ -36,7 +36,7 @@ module fmaalign(
     input logic  [`NF:0]        Zm,      // significand in U(0.NF) format]
     input logic                 XZero, YZero, ZZero, // is the input zero
     output logic [3*`NF+4:0]    Am,//change // addend aligned for addition in U(NF+5.2NF+1)
-    output logic                ZmSticky,  // Sticky bit calculated from the aliged addend
+    output logic                ASticky,  // Sticky bit calculated from the aliged addend
     output logic                KillProd       // should the product be set to zero
 );
 
@@ -44,6 +44,7 @@ module fmaalign(
     logic [4*`NF+4:0]   ZmShifted;//change        // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
     logic [4*`NF+4:0]   ZmPreshifted;//change     // input to the alignment shifter U(NF+5.3NF+1)
     logic KillZ;
+    logic PmSticky, tmpZmSticky;
 
     ///////////////////////////////////////////////////////////////////////////////
     // Alignment shifter
@@ -73,7 +74,7 @@ module fmaalign(
         //  | addnend |
         if (KillProd) begin
             ZmShifted = {(`NF+2)'(0), Zm, (2*`NF+2)'(0)};//change
-            ZmSticky = ~(XZero|YZero);
+            ASticky = ~(XZero|YZero);
 
         // If the addend is too small to effect the addition        
         //      - The addend has to shift two past the end of the product to be considered too small
@@ -83,14 +84,14 @@ module fmaalign(
         //                                                      | addnend |
         end else if (KillZ)  begin
             ZmShifted = 0;
-            ZmSticky = ~ZZero;
+            ASticky = ~ZZero;
 
         // If the Addend is shifted right
         //          |   54'b0    |  106'b(product)  | 2'b0 |
         //                                  | addnend |
         end else begin
             ZmShifted = ZmPreshifted >> ACnt;
-            ZmSticky = |(ZmShifted[`NF-1:0]); 
+            ASticky = |(ZmShifted[`NF-1:0]); 
 
         end
     end
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index 1ebd391c0..507cd9086 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -258,7 +258,7 @@ module fpu (
             .As(AsE), .Ps(PsE), .Ss(SsE), .Se(SeE),
             .Sm(SmE), 
             .InvA(InvAE), .SCnt(SCntE), 
-            .ZmSticky(ZmStickyE)); 
+            .ASticky(ZmStickyE)); 
 
    // divide and squareroot
    //    - fdiv
diff --git a/pipelined/testbench/testbench-fp.sv b/pipelined/testbench/testbench-fp.sv
index d09534829..f5986c839 100644
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@@ -92,7 +92,7 @@ module testbenchfp;
   logic                 Ss;
   logic [`NE+1:0]	      Pe;
   logic [`NE+1:0]	      Se;
-  logic 				        ZmSticky;
+  logic 				        ASticky;
   logic 					      KillProd; 
   logic [$clog2(3*`NF+6)-1:0]	SCnt;
   logic [3*`NF+4:0]	    Sm;       
@@ -690,7 +690,7 @@ module testbenchfp;
             .Xm(Xm), .Ym(Ym), .Zm(Zm),
             .XZero, .YZero, .ZZero, .Ss, .Se,
             .OpCtrl(OpCtrlVal), .Sm, .InvA, .SCnt, .As, .Ps,
-            .ZmSticky); 
+            .ASticky); 
   end
               
   postprocess postprocess(.Xs(Xs), .Ys(Ys), .PostProcSel(UnitVal[1:0]),
@@ -700,7 +700,7 @@ module testbenchfp;
               .XZero(XZero), .YZero(YZero), .ZZero(ZZero), .CvtShiftAmt(CvtShiftAmtE),
               .XInf(XInf), .YInf(YInf), .ZInf(ZInf), .CvtCs(CvtResSgnE), .ToInt(WriteIntVal),
               .XSNaN(XSNaN), .YSNaN(YSNaN), .ZSNaN(ZSNaN), .CvtLzcIn(CvtLzcInE), .IntZero,
-              .FmaZmS(ZmSticky), .FmaSe(Se),
+              .FmaZmS(ASticky), .FmaSe(Se),
               .FmaSm(Sm), .FmaSCnt(SCnt), .FmaAs(As), .FmaPs(Ps), .Fmt(ModFmt), .Frm(FrmVal), 
               .PostProcFlg(Flg), .PostProcRes(FpRes), .FCvtIntRes(IntRes));
   

From 0ecbb45b785ae5c5a8f3128a3bfe13b592640e7b Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Thu, 29 Dec 2022 21:09:23 -0800
Subject: [PATCH 03/15] Fixed register timing failure on SpecialCaseM in
 fdivsqrt

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index a5735ba3b..c16abd9b9 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -69,7 +69,8 @@ module fdivsqrtfsm(
     assign ISpecialCaseE = AZeroE | BZeroE; // *** why is AZeroE part of this.  Should other special cases be considered?
     assign SpecialCaseE  = MDUE ? ISpecialCaseE : FSpecialCaseE;
   end else assign SpecialCaseE = FSpecialCaseE;
-  flopenr #(1) SpecialCaseReg(clk, reset, ~StallM, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
+  //flopenr #(1) SpecialCaseReg(clk, reset, ~StallM, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
+  flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
 
 // DIVN = `NF+3
 // NS = NF + 1

From 18f19ce44d4e1cd9cbed6500aadc139178e452e1 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 06:40:25 -0800
Subject: [PATCH 04/15] fdiv cleanup, reduce number of rv32f fma_b15 tests
 being run to speed up regression

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv      |  1 -
 pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 16 +++++++---------
 pipelined/testbench/tests.vh                   |  4 ++--
 3 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
index c16abd9b9..a950ea7b3 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@@ -69,7 +69,6 @@ module fdivsqrtfsm(
     assign ISpecialCaseE = AZeroE | BZeroE; // *** why is AZeroE part of this.  Should other special cases be considered?
     assign SpecialCaseE  = MDUE ? ISpecialCaseE : FSpecialCaseE;
   end else assign SpecialCaseE = FSpecialCaseE;
-  //flopenr #(1) SpecialCaseReg(clk, reset, ~StallM, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
   flopenr #(1) SpecialCaseReg(clk, reset, IFDivStartE, SpecialCaseE, SpecialCaseM); // save SpecialCase for checking in fdivsqrtpostproc
 
 // DIVN = `NF+3
diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 8bedd3841..6d955d611 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -157,13 +157,6 @@ module fdivsqrtpostproc(
       end else begin
         NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR)));
         PreResultM = IntQuotM;
-        /*
-        if (~ALTBM & NegQuotM) begin
-          PreResultM = {3'b111, -IntQuotM};
-        end else begin
-          PreResultM = {3'b000, IntQuotM};
-        end*/
-        //PreResultM = {IntQuotM[`DIVb], IntQuotM[`DIVb], IntQuotM[`DIVb], IntQuotM}; // Suspicious Sign Extender
       end
     
 
@@ -171,7 +164,12 @@ module fdivsqrtpostproc(
     
     assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
     assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
-    // *** conditional on RV64
-    assign FPIntDivResultM = (W64M ? {{(`XLEN-32){SpecialFPIntDivResultM[31]}}, SpecialFPIntDivResultM[31:0]} : SpecialFPIntDivResultM[`XLEN-1:0]); // Sign extending in case of W64
+
+    // sign extend result for W64
+    if (`XLEN==64)
+      assign FPIntDivResultM = (W64M ? {{(`XLEN-32){SpecialFPIntDivResultM[31]}}, SpecialFPIntDivResultM[31:0]} : 
+                                       SpecialFPIntDivResultM[`XLEN-1:0]); // Sign extending in case of W64
+    else
+      assign FPIntDivResultM = SpecialFPIntDivResultM[`XLEN-1:0];
   end
 endmodule
\ No newline at end of file
diff --git a/pipelined/testbench/tests.vh b/pipelined/testbench/tests.vh
index 61e45d9e4..48a29303b 100644
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
@@ -1098,7 +1098,7 @@ string imperas32f[] = '{
     "rv64i_m/F/src/flw-align-01.S",
     "rv64i_m/F/src/fmadd_b1-01.S",
     "rv64i_m/F/src/fmadd_b14-01.S",
-    "rv64i_m/F/src/fmadd_b15-01.S",
+    //"rv64i_m/F/src/fmadd_b15-01.S",
     "rv64i_m/F/src/fmadd_b16-01.S",
     "rv64i_m/F/src/fmadd_b17-01.S",
     "rv64i_m/F/src/fmadd_b18-01.S",
@@ -1473,7 +1473,7 @@ string imperas32f[] = '{
     "rv32i_m/F/src/fmin_b19-01.S",
     "rv32i_m/F/src/fmsub_b1-01.S",
     "rv32i_m/F/src/fmsub_b14-01.S",
-    "rv32i_m/F/src/fmsub_b15-01.S",
+    //"rv32i_m/F/src/fmsub_b15-01.S",
     "rv32i_m/F/src/fmsub_b16-01.S",
     "rv32i_m/F/src/fmsub_b17-01.S",
     "rv32i_m/F/src/fmsub_b18-01.S",

From d2273e7037d37738e459b83d2792034b40cdbbc0 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 06:45:51 -0800
Subject: [PATCH 05/15] fdivsqrtpreproc shift simplification

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index b3f42a7c4..cb8833658 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -57,7 +57,6 @@ module fdivsqrtpreproc (
   // Intdiv signals
   logic  [`DIVb-1:0] IFNormLenX, IFNormLenD;
   logic  [`DIVBLEN:0] mE;
-  logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
   logic  [`DIVBLEN:0] pPlusr, pPrCeil, p, ell;
   logic  [`LOGRK:0] pPrTrunc;
   logic  [`DIVb+3:0]  PreShiftX;
@@ -71,6 +70,7 @@ module fdivsqrtpreproc (
     logic  AsE, BsE, ALTBE, NegQuotE;
     logic  [`XLEN-1:0]  AE, BE;
     logic  [`XLEN-1:0] PosA, PosB;
+    logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
 
     // Extract inputs, signs, zero, depending on W64 mode if applicable
     assign signedDiv = ~Funct3E[0];
@@ -108,12 +108,12 @@ module fdivsqrtpreproc (
 
   /* verilator lint_off WIDTH */
     // right shift amount to complete in discrete number of steps
-    assign pPlusr = (`DIVBLEN)'(`LOGR) + p;
+    assign pPlusr = `LOGR + p;
     assign pPrTrunc = pPlusr % `RK;
-    assign pPrCeil = (pPlusr >> `LOGRK) + {{`DIVBLEN{1'b0}}, |(pPrTrunc)};
-    assign nE = (pPrCeil * (`DIVBLEN+1)'(`DIVCOPIES)) - {{(`DIVBLEN){1'b0}}, 1'b1};
-    assign IntBits = (`DIVBLEN)'(`LOGR) + p - {{(`DIVBLEN){1'b0}}, 1'b1};
-    assign RightShiftX = ((`DIVBLEN)'(`RK) - 1) - (IntBits % `RK);
+    assign pPrCeil = (pPlusr >> `LOGRK) + |pPrTrunc;
+    assign nE = (pPrCeil * `DIVCOPIES) - 1;
+    assign IntBits = `LOGR + p - 1;
+    assign RightShiftX = `RK - 1 - IntBits % `RK;
   /* verilator lint_on WIDTH */
 
     // Selet integer or floating-point operands

From 802c440254facdf781ff4df1f93c461887788d6e Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 06:47:40 -0800
Subject: [PATCH 06/15] Reduced size of preproc right shift

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index cb8833658..d0a060795 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -70,7 +70,8 @@ module fdivsqrtpreproc (
     logic  AsE, BsE, ALTBE, NegQuotE;
     logic  [`XLEN-1:0]  AE, BE;
     logic  [`XLEN-1:0] PosA, PosB;
-    logic  [`DIVBLEN:0] ZeroDiff, IntBits, RightShiftX;
+    logic  [`DIVBLEN:0] ZeroDiff, IntBits;
+    logic  [`LOGRK-1:0] RightShiftX;
 
     // Extract inputs, signs, zero, depending on W64 mode if applicable
     assign signedDiv = ~Funct3E[0];

From 27588af00e753db11e2503b9e76174a871783fc1 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 06:55:20 -0800
Subject: [PATCH 07/15] Clean up sqrt initialization mux

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 29 ++++++++-----------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index d0a060795..e56456742 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -51,20 +51,14 @@ module fdivsqrtpreproc (
 );
 
   logic  [`DIVb-1:0] XPreproc;
-  logic  [`DIVb:0] SqrtX;
-  logic  [`DIVb+3:0] DivX;
+  logic  [`DIVb:0] PreSqrtX;
+  logic  [`DIVb+3:0] DivX, SqrtX;
   logic  [`NE+1:0] QeE;
-  // Intdiv signals
   logic  [`DIVb-1:0] IFNormLenX, IFNormLenD;
-  logic  [`DIVBLEN:0] mE;
-  logic  [`DIVBLEN:0] pPlusr, pPrCeil, p, ell;
-  logic  [`LOGRK:0] pPrTrunc;
+  logic  [`DIVBLEN:0] mE, ell;
   logic  [`DIVb+3:0]  PreShiftX;
   logic  NumZeroE;
 
-  // ***can probably merge X LZC with conversion
-  // cout the number of leading zeros
-
   if (`IDIV_ON_FPU) begin
     logic signedDiv;
     logic  AsE, BsE, ALTBE, NegQuotE;
@@ -72,6 +66,8 @@ module fdivsqrtpreproc (
     logic  [`XLEN-1:0] PosA, PosB;
     logic  [`DIVBLEN:0] ZeroDiff, IntBits;
     logic  [`LOGRK-1:0] RightShiftX;
+    logic  [`DIVBLEN:0] pPlusr, pPrCeil, p;
+    logic  [`LOGRK-1:0] pPrTrunc;
 
     // Extract inputs, signs, zero, depending on W64 mode if applicable
     assign signedDiv = ~Funct3E[0];
@@ -149,16 +145,15 @@ module fdivsqrtpreproc (
   assign DPreproc = IFNormLenD << (mE + {{`DIVBLEN{1'b0}}, 1'b1}); 
 
   //  append leading 1 (for nonzero inputs) and zero-extend
-  assign SqrtX = (Xe[0]^ell[0]) ? {1'b0, ~NumZeroE, XPreproc[`DIVb-1:1]} : {~NumZeroE, XPreproc}; // Bottom bit of XPreproc is always zero because DIVb is larger than XLEN and NF
+  assign PreSqrtX = (Xe[0]^ell[0]) ? {1'b0, ~NumZeroE, XPreproc[`DIVb-1:1]} : {~NumZeroE, XPreproc}; // Bottom bit of XPreproc is always zero because DIVb is larger than XLEN and NF
   assign DivX = {3'b000, ~NumZeroE, XPreproc};
-
-  // *** explain why X is shifted between radices (initial assignment of WS=RX)
-  if (`RADIX == 2)  assign PreShiftX = Sqrt ? {3'b111, SqrtX} : DivX;
-  else              assign PreShiftX = Sqrt ? {2'b11, SqrtX, 1'b0} : DivX;
-
+  // Sqrt is initialized after a first step of R(X-1), which depends on Radix
+  if (`RADIX == 2)  assign SqrtX = {3'b111, PreSqrtX};
+  else              assign SqrtX = {2'b11, PreSqrtX, 1'b0};
+  assign PreShiftX = Sqrt ? SqrtX : DivX;
+ 
   // Floating-point exponent
   fdivsqrtexpcalc expcalc(.Fmt, .Xe, .Ye, .Sqrt, .XZero(XZeroE), .ell, .m(mE), .Qe(QeE));
-
-  flopen #(`NE+2)    expreg(clk, IFDivStartE, QeE, QeM);
+  flopen #(`NE+2) expreg(clk, IFDivStartE, QeE, QeM);
 endmodule
 

From 2c6c3e799d3282dc4624039897da5a43230f8b13 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:00:48 -0800
Subject: [PATCH 08/15] Clean up sqrt preproc

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index e56456742..63d391ae9 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -104,7 +104,7 @@ module fdivsqrtpreproc (
     assign p = ALTBE ? '0 : ZeroDiff;
 
   /* verilator lint_off WIDTH */
-    // right shift amount to complete in discrete number of steps
+    // calculate number of cycles nE and right shift amount RightShiftX to complete in a discrete number of steps
     assign pPlusr = `LOGR + p;
     assign pPrTrunc = pPlusr % `RK;
     assign pPrCeil = (pPlusr >> `LOGRK) + |pPrTrunc;
@@ -145,6 +145,7 @@ module fdivsqrtpreproc (
   assign DPreproc = IFNormLenD << (mE + {{`DIVBLEN{1'b0}}, 1'b1}); 
 
   //  append leading 1 (for nonzero inputs) and zero-extend
+  // *** explain this next line
   assign PreSqrtX = (Xe[0]^ell[0]) ? {1'b0, ~NumZeroE, XPreproc[`DIVb-1:1]} : {~NumZeroE, XPreproc}; // Bottom bit of XPreproc is always zero because DIVb is larger than XLEN and NF
   assign DivX = {3'b000, ~NumZeroE, XPreproc};
   // Sqrt is initialized after a first step of R(X-1), which depends on Radix

From 55f25457c91fcf80b7a9e31ce1fd5032e2c12efd Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:01:44 -0800
Subject: [PATCH 09/15] Radix 4 divsqrt

---
 pipelined/config/shared/wally-shared.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index cc24c42f6..044bd7d7e 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -109,7 +109,7 @@
 `define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+8) ? (`DIVRESLEN+`NF) : (3*`NF+6))
 
 // division constants
-`define RADIX 32'h2
+`define RADIX 32'h4
 `define DIVCOPIES 32'h4
 `define DIVLEN ((`NF < `XLEN) ? (`XLEN) : `NF+3)
 // `define DIVN (`NF < `XLEN ? `XLEN : `NF+1) // length of input

From 1e65bfd0585c744d78c0120502fc9abb2224a961 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:10:47 -0800
Subject: [PATCH 10/15] simplified sign handling mux

---
 .../src/fpu/fdivsqrt/fdivsqrtpostproc.sv      | 27 +++++--------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 6d955d611..1dd11b3fc 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -52,9 +52,6 @@ module fdivsqrtpostproc(
   logic [`DIVb:0] PreQmM;
   logic NegStickyM;
   logic weq0E, weq0M, WZeroM;
-  logic [`DIVBLEN:0] NormShiftM;
-  logic [`DIVb:0] NormQuotM;
-  logic [`DIVb+3:0] IntQuotM, IntRemM, NormRemM;
   logic signed [`DIVb+3:0] PreResultM, PreFPIntDivResultM;
   logic [`XLEN-1:0] SpecialFPIntDivResultM;
 
@@ -104,27 +101,17 @@ module fdivsqrtpostproc(
   assign QmM = SqrtM ? (PreQmM << 1) : PreQmM;
 
   if (`IDIV_ON_FPU) begin
+    logic [`DIVBLEN:0] NormShiftM;
+    logic [`DIVb:0] NormQuotM;
+    logic [`DIVb+3:0] IntQuotM, IntRemM, NormRemM, NormRemDM;
+
     assign W = $signed(Sum) >>> `LOGR;
     assign DM = {4'b0001, D};
 
     // Integer division: sign handling for div and rem
-    always_comb 
-      if (~AsM)
-        if (NegStickyM) begin
-          NormQuotM = FirstUM;
-          NormRemM  = W + DM;
-        end else begin
-          NormQuotM = FirstU;
-          NormRemM  = W;
-        end
-      else 
-        if (NegStickyM) begin
-          NormQuotM = FirstUM;
-          NormRemM  = -(W + DM);
-        end else begin 
-          NormQuotM = FirstU;
-          NormRemM  = -W;
-        end
+    mux2 #(`DIVb+1) normquotmux(FirstU, FirstUM, NegStickyM, NormQuotM);
+    mux2 #(`DIVb+4) normremdmux(W, W+DM, NegStickyM, NormRemDM);
+    mux2 #(`DIVb+4) normremsmux(NormRemDM, -NormRemDM, AsM, NormRemM);
 
     // Integer division: Special cases
     always_comb

From 6ae25537ea9ca626c8f00f05499eda2dc4ffe194 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:17:38 -0800
Subject: [PATCH 11/15] removed duplicate quotient mux

---
 pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 1dd11b3fc..258f0eb57 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -102,14 +102,12 @@ module fdivsqrtpostproc(
 
   if (`IDIV_ON_FPU) begin
     logic [`DIVBLEN:0] NormShiftM;
-    logic [`DIVb:0] NormQuotM;
     logic [`DIVb+3:0] IntQuotM, IntRemM, NormRemM, NormRemDM;
 
     assign W = $signed(Sum) >>> `LOGR;
     assign DM = {4'b0001, D};
 
-    // Integer division: sign handling for div and rem
-    mux2 #(`DIVb+1) normquotmux(FirstU, FirstUM, NegStickyM, NormQuotM);
+    // Integer remainder: sticky and sign correction muxes
     mux2 #(`DIVb+4) normremdmux(W, W+DM, NegStickyM, NormRemDM);
     mux2 #(`DIVb+4) normremsmux(NormRemDM, -NormRemDM, AsM, NormRemM);
 
@@ -129,7 +127,7 @@ module fdivsqrtpostproc(
             IntRemM  = '0;
           end 
         end else begin 
-          PreIntQuotM = {3'b000, NormQuotM};
+          PreIntQuotM = {3'b000, PreQmM};
           IntRemM  = NormRemM;
         end 
         // flip sign if necessary
@@ -147,7 +145,7 @@ module fdivsqrtpostproc(
       end
     
 
-    // division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
+    // integer division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
     
     assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
     assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases

From 1006305d759f45aefb6f7cfc1a63a0e4ab224dea Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:34:26 -0800
Subject: [PATCH 12/15] started simplifying integer division special cases

---
 .../src/fpu/fdivsqrt/fdivsqrtpostproc.sv      | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 258f0eb57..80d9e4b01 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -111,6 +111,43 @@ module fdivsqrtpostproc(
     mux2 #(`DIVb+4) normremdmux(W, W+DM, NegStickyM, NormRemDM);
     mux2 #(`DIVb+4) normremsmux(NormRemDM, -NormRemDM, AsM, NormRemM);
 
+    // special case logic
+    always_comb
+      if (ALTBM) begin
+        if (RemOpM) PreFPIntDivResultM = {{(`DIVb-`XLEN+4){1'b0}}, AM};
+        else        PreFPIntDivResultM = '0;
+ //       IntQuotM = '0;
+ //       IntRemM  = {{(`DIVb-`XLEN+4){1'b0}}, AM};
+      end else begin
+        logic [`DIVb+3:0] PreIntQuotM;
+        if (WZeroM) begin
+          if (weq0M) begin
+            PreIntQuotM = {3'b000, FirstU};
+            IntRemM  = '0;
+          end else begin
+            PreIntQuotM = {3'b000, FirstUM};
+            IntRemM  = '0;
+          end 
+        end else begin 
+          PreIntQuotM = {3'b000, PreQmM};
+          IntRemM  = NormRemM;
+        end 
+        // flip sign if necessary
+        if (NegQuotM) IntQuotM = -PreIntQuotM;
+        else          IntQuotM =  PreIntQuotM;
+        if (RemOpM) begin
+          NormShiftM = ALTBM ? '0 : (mM + (`DIVBLEN+1)'(`DIVa)); // no postshift if forwarding input A to remainder
+          PreResultM = IntRemM;
+        end else begin
+          NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR)));
+          PreResultM = IntQuotM;
+        end
+        PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
+      end
+
+    assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
+
+/*
     // Integer division: Special cases
     always_comb
       if (ALTBM) begin
@@ -149,6 +186,7 @@ module fdivsqrtpostproc(
     
     assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
     assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
+*/
 
     // sign extend result for W64
     if (`XLEN==64)

From 919525ca17e62942bf50eaf9411fe386b9e0fc98 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 30 Dec 2022 07:40:28 -0800
Subject: [PATCH 13/15] continued simplifying integer division special cases

---
 .../src/fpu/fdivsqrt/fdivsqrtpostproc.sv      | 53 +++----------------
 1 file changed, 7 insertions(+), 46 deletions(-)

diff --git a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
index 80d9e4b01..c78738a4a 100644
--- a/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/pipelined/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@@ -113,9 +113,12 @@ module fdivsqrtpostproc(
 
     // special case logic
     always_comb
-      if (ALTBM) begin
-        if (RemOpM) PreFPIntDivResultM = {{(`DIVb-`XLEN+4){1'b0}}, AM};
-        else        PreFPIntDivResultM = '0;
+      if (BZeroM) begin 
+        if (RemOpM) SpecialFPIntDivResultM = AM;
+        else        SpecialFPIntDivResultM = {(`XLEN){1'b1}};
+      end else if (ALTBM) begin
+        if (RemOpM) SpecialFPIntDivResultM = AM;
+        else        SpecialFPIntDivResultM = '0;
  //       IntQuotM = '0;
  //       IntRemM  = {{(`DIVb-`XLEN+4){1'b0}}, AM};
       end else begin
@@ -143,51 +146,9 @@ module fdivsqrtpostproc(
           PreResultM = IntQuotM;
         end
         PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
+        SpecialFPIntDivResultM = PreFPIntDivResultM[`XLEN-1:0];
       end
 
-    assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
-
-/*
-    // Integer division: Special cases
-    always_comb
-      if (ALTBM) begin
-        IntQuotM = '0;
-        IntRemM  = {{(`DIVb-`XLEN+4){1'b0}}, AM};
-      end else begin
-        logic [`DIVb+3:0] PreIntQuotM;
-        if (WZeroM) begin
-          if (weq0M) begin
-            PreIntQuotM = {3'b000, FirstU};
-            IntRemM  = '0;
-          end else begin
-            PreIntQuotM = {3'b000, FirstUM};
-            IntRemM  = '0;
-          end 
-        end else begin 
-          PreIntQuotM = {3'b000, PreQmM};
-          IntRemM  = NormRemM;
-        end 
-        // flip sign if necessary
-        if (NegQuotM) IntQuotM = -PreIntQuotM;
-        else          IntQuotM =  PreIntQuotM;
-      end
-    
-    always_comb
-      if (RemOpM) begin
-        NormShiftM = ALTBM ? '0 : (mM + (`DIVBLEN+1)'(`DIVa)); // no postshift if forwarding input A to remainder
-        PreResultM = IntRemM;
-      end else begin
-        NormShiftM = ((`DIVBLEN+1)'(`DIVb) - (nM * (`DIVBLEN+1)'(`LOGR)));
-        PreResultM = IntQuotM;
-      end
-    
-
-    // integer division takes the result from the next cycle, which is shifted to the left one more time so the square root also needs to be shifted
-    
-    assign PreFPIntDivResultM = $signed(PreResultM >>> NormShiftM);
-    assign SpecialFPIntDivResultM = BZeroM ? (RemOpM ? AM : {(`XLEN){1'b1}}) : PreFPIntDivResultM[`XLEN-1:0]; // special cases
-*/
-
     // sign extend result for W64
     if (`XLEN==64)
       assign FPIntDivResultM = (W64M ? {{(`XLEN-32){SpecialFPIntDivResultM[31]}}, SpecialFPIntDivResultM[31:0]} : 

From e0ec45489a7f200db4a58f5f20c3acda90c3170b Mon Sep 17 00:00:00 2001
From: Ross Thompson <ross1728@gmail.com>
Date: Fri, 30 Dec 2022 10:51:35 -0600
Subject: [PATCH 14/15] Updated constraints to remove DivBusyE.

---
 fpga/constraints/debug2.xdc | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/fpga/constraints/debug2.xdc b/fpga/constraints/debug2.xdc
index eed201843..bdc073ee1 100644
--- a/fpga/constraints/debug2.xdc
+++ b/fpga/constraints/debug2.xdc
@@ -324,9 +324,9 @@ set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe62]
 connect_debug_port u_ila_0/probe62 [get_nets [list wallypipelinedsoc/core/hzu/FCvtIntStallD ]]
 
 create_debug_port u_ila_0 probe
-set_property port_width 1 [get_debug_ports u_ila_0/probe63]
+set_property port_width 7 [get_debug_ports u_ila_0/probe63]
 set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe63]
-connect_debug_port u_ila_0/probe63 [get_nets [list wallypipelinedsoc/core/hzu/DivBusyE ]]
+connect_debug_port u_ila_0/probe63 [get_nets [list {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][1]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][2]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][3]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][4]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][5]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][6]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][7]} ]]
 
 create_debug_port u_ila_0 probe
 set_property port_width 1 [get_debug_ports u_ila_0/probe64]
@@ -1148,7 +1148,3 @@ set_property port_width 53 [get_debug_ports u_ila_0/probe224]
 set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe224]
 connect_debug_port u_ila_0/probe224 [get_nets [list {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][1]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][2]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][3]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][4]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][5]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][6]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][7]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][8]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][9]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][10]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][11]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][12]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][13]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][14]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][15]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][16]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][17]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][18]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][19]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][20]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][21]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][22]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][23]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][24]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][25]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][26]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][27]} 
{wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][28]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][29]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][30]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][31]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][32]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][33]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][34]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][35]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][36]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][37]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][38]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][39]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][40]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][41]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][42]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][43]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][44]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][45]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][46]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][47]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][48]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][49]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][50]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][51]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][52]} {wallypipelinedsoc/uncore.uncore/plic.plic/irqs_at_max_priority[0][53]} ]]
 
-create_debug_port u_ila_0 probe
-set_property port_width 7 [get_debug_ports u_ila_0/probe225]
-set_property PROBE_TYPE DATA_AND_TRIGGER [get_debug_ports u_ila_0/probe225]
-connect_debug_port u_ila_0/probe225 [get_nets [list {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][1]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][2]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][3]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][4]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][5]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][6]} {wallypipelinedsoc/uncore.uncore/plic.plic/threshMask[0][7]} ]]

From 668c698bb491dc3e1e1310d55ac69ada994976f5 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Fri, 30 Dec 2022 12:07:44 -0600
Subject: [PATCH 15/15] removed the second bit from fma alignment shift

---
 pipelined/config/shared/wally-shared.vh       |  4 +-
 pipelined/src/fpu/fma/fma.sv                  | 60 +++++++++--------
 pipelined/src/fpu/fma/fmaadd.sv               | 12 ++--
 pipelined/src/fpu/fma/fmaalign.sv             | 25 ++++----
 pipelined/src/fpu/fma/fmalza.sv               | 12 ++--
 pipelined/src/fpu/fpu.sv                      | 18 +++---
 pipelined/src/fpu/postproc/fmashiftcalc.sv    | 28 ++++----
 pipelined/src/fpu/postproc/postprocess.sv     | 37 ++++++-----
 pipelined/src/fpu/postproc/round.sv           |  4 +-
 pipelined/src/fpu/postproc/shiftcorrection.sv |  4 +-
 pipelined/testbench/testbench-fp.sv           | 64 +++++++++----------
 11 files changed, 137 insertions(+), 131 deletions(-)

diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index e047d947a..5c210ebdc 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -104,9 +104,9 @@
 `define CVTLEN ((`NF<`XLEN) ? (`XLEN) : (`NF))
 `define LLEN ((`FLEN<`XLEN) ? (`XLEN) : (`FLEN))
 `define LOGCVTLEN $unsigned($clog2(`CVTLEN+1))
-`define NORMSHIFTSZ ((`QLEN+`NF+1) > (3*`NF+7) ? (`QLEN+`NF+1) : (3*`NF+7))//change
+`define NORMSHIFTSZ ((`QLEN+`NF+1) > (3*`NF+6) ? (`QLEN+`NF+1) : (3*`NF+6))
 `define LOGNORMSHIFTSZ ($clog2(`NORMSHIFTSZ))
-`define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+7) ? (`DIVRESLEN+`NF) : (3*`NF+5))//change
+`define CORRSHIFTSZ ((`DIVRESLEN+`NF) > (3*`NF+6) ? (`DIVRESLEN+`NF) : (3*`NF+4))
 
 // division constants
 `define RADIX 32'h4
diff --git a/pipelined/src/fpu/fma/fma.sv b/pipelined/src/fpu/fma/fma.sv
index 4d60f477f..ed854c0a1 100644
--- a/pipelined/src/fpu/fma/fma.sv
+++ b/pipelined/src/fpu/fma/fma.sv
@@ -31,27 +31,37 @@
 `include "wally-config.vh"
 
 module fma(
-    input logic                 Xs, Ys, Zs,    // input's signs
-    input logic  [`NE-1:0]      Xe, Ye, Ze,    // input's biased exponents in B(NE.0) format
-    input logic  [`NF:0]        Xm, Ym, Zm,    // input's significands in U(0.NF) format
-    input logic                 XZero, YZero, ZZero, // is the input zero
-    input logic  [2:0]          OpCtrl,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-    output logic                ASticky,  // sticky bit that is calculated during alignment
-    output logic [3*`NF+4:0]    Sm,//change           // the positive sum's significand
-    output logic                InvA,          // Was A inverted for effective subtraction (P-A or -P+A)
-    output logic                As,       // the aligned addend's sign (modified Z sign for other opperations)
-    output logic                Ps,          // the product's sign
-    output logic                Ss,          // the sum's sign
-    output logic [`NE+1:0]      Se,
-    output logic [$clog2(3*`NF+6)-1:0]          SCnt//change        // normalization shift count
+    input logic                         Xs, Ys, Zs, // input's signs
+    input logic  [`NE-1:0]              Xe, Ye, Ze, // input's biased exponents in B(NE.0) format
+    input logic  [`NF:0]                Xm, Ym, Zm, // input's significands in U(0.NF) format
+    input logic                         XZero, YZero, ZZero, // is the input zero
+    input logic  [2:0]                  OpCtrl,   // operation control
+    output logic                        ASticky,  // sticky bit that is calculated during alignment
+    output logic [3*`NF+3:0]            Sm,   // the positive sum's significand
+    output logic                        InvA, // Was A inverted for effective subtraction (P-A or -P+A)
+    output logic                        As,   // the aligned addend's sign (modified Z sign for other operations)
+    output logic                        Ps,   // the product's sign
+    output logic                        Ss,   // the sum's sign
+    output logic [`NE+1:0]              Se,   // the sum's exponent
+    output logic [$clog2(3*`NF+5)-1:0]  SCnt  // normalization shift count
 );
 
-    logic [2*`NF+1:0]   Pm;           // the product's significand in U(2.2Nf) format
-    logic [3*`NF+4:0]   Am;//change     // addend aligned's mantissa for addition in U(NF+5.2NF+1)
-    logic [3*`NF+4:0]   AmInv; //change   // aligned addend's mantissa possibly inverted
-    logic [2*`NF+1:0]   PmKilled;      // the product's mantissa possibly killed
-    logic               KillProd;  // set the product to zero before addition if the product is too small to matter
-    logic [`NE+1:0]     Pe;       // the product's exponent B(NE+2.0) format; adds 2 bits to allow for size of number and negative sign
+    //  OpCtrl:
+    //    Fma: {not multiply-add?, negate prod?, negate Z?}
+    //        000 - fmadd
+    //        001 - fmsub
+    //        010 - fnmsub
+    //        011 - fnmadd
+    //        100 - mul
+    //        110 - add
+    //        111 - sub
+
+    logic [2*`NF+1:0]   Pm;          // the product's significand in U(2.2Nf) format
+    logic [3*`NF+3:0]   Am;         // addend aligned's mantissa for addition in U(NF+4.2NF)
+    logic [3*`NF+3:0]   AmInv;      // aligned addend's mantissa possibly inverted
+    logic [2*`NF+1:0]   PmKilled;   // the product's mantissa possibly killed U(2.2Nf)
+    logic               KillProd;   // set the product to zero before addition if the product is too small to matter
+    logic [`NE+1:0]     Pe;         // the product's exponent B(NE+2.0) format; adds 2 bits to allow for size of number and negative sign
 
     ///////////////////////////////////////////////////////////////////////////////
     // Calculate the product
@@ -68,25 +78,23 @@ module fma(
     // multiplication of the mantissa's
     fmamult mult(.Xm, .Ym, .Pm);
    
-    ///////////////////////////////////////////////////////////////////////////////
-    // Alignment shifter
-    ///////////////////////////////////////////////////////////////////////////////
     // calculate the signs and take the opperation into account
     fmasign sign(.OpCtrl, .Xs, .Ys, .Zs, .Ps, .As, .InvA);
 
+    ///////////////////////////////////////////////////////////////////////////////
+    // Alignment shifter
+    ///////////////////////////////////////////////////////////////////////////////
     fmaalign align(.Ze, .Zm, .XZero, .YZero, .ZZero, .Xe, .Ye,
                 .Am, .ASticky, .KillProd);
                         
-
-
     // ///////////////////////////////////////////////////////////////////////////////
     // // Addition/LZA
     // ///////////////////////////////////////////////////////////////////////////////
         
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .KillProd, .ASticky, .AmInv, .PmKilled, .InvA, .Sm, .Se, .Ss);
 
-    //change
-    fmalza #(3*`NF+5) lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ASticky&KillProd}), .Cin(InvA & ~(ASticky & ~KillProd)), .sub(InvA), .SCnt);
+    fmalza #(3*`NF+4) lza(.A(AmInv), .Pm({PmKilled, InvA&Ps&ASticky&KillProd}), .Cin(InvA & ~(ASticky & ~KillProd)), .sub(InvA), .SCnt);
+    
 endmodule
 
 
diff --git a/pipelined/src/fpu/fma/fmaadd.sv b/pipelined/src/fpu/fma/fmaadd.sv
index 509adb674..7ce641dbe 100644
--- a/pipelined/src/fpu/fma/fmaadd.sv
+++ b/pipelined/src/fpu/fma/fmaadd.sv
@@ -31,7 +31,7 @@
 `include "wally-config.vh"
 
 module fmaadd(
-    input logic  [3*`NF+4:0]    Am, //change // aligned addend's mantissa for addition in U(NF+5.2NF+1)
+    input logic  [3*`NF+3:0]    Am, // aligned addend's mantissa for addition in U(NF+4.2NF)
     input logic  [2*`NF+1:0]    Pm,       // the product's mantissa
     input logic                 Ps, // the product sign and the alligend addeded's sign (Modified Z sign for other opperations)
     input logic                InvA,          // invert the aligned addend
@@ -39,13 +39,13 @@ module fmaadd(
     input logic                 ASticky,
     input logic  [`NE-1:0]      Ze,
     input logic  [`NE+1:0]      Pe,
-    output logic [3*`NF+4:0]    AmInv,//change // aligned addend possibly inverted
+    output logic [3*`NF+3:0]    AmInv, // aligned addend possibly inverted
     output logic [2*`NF+1:0]    PmKilled,     // the product's mantissa possibly killed
     output logic                Ss,          
     output logic [`NE+1:0]      Se,
-    output logic [3*`NF+4:0]    Sm//change           // the positive sum
+    output logic [3*`NF+3:0]    Sm          // the positive sum
 );
-    logic [3*`NF+4:0]    PreSum, NegPreSum;//change // possibly negitive sum
+    logic [3*`NF+3:0]    PreSum, NegPreSum; // possibly negative sum
     logic [3*`NF+5:0]    PreSumdebug, NegPreSumdebug; // possibly negitive sum
     logic                NegSum;        // was the sum negitive
     logic                NegSumdebug;        // was the sum negitive
@@ -66,8 +66,8 @@ module fmaadd(
     //      addend - prod where product is killed (and not exactly zero) then don't add +1 from negation 
     //          ie ~(InvA&ASticky&KillProd)&InvA = (~ASticky|~KillProd)&InvA
     //          in this case this result is only ever selected when InvA=1 so we can remove &InvA
-    assign {NegSum, PreSum} = {{`NF+2{1'b0}}, PmKilled, 2'b0} + {InvA, AmInv} + {{3*`NF+5{1'b0}}, (~ASticky|KillProd)&InvA};//change
-    assign NegPreSum = Am + {{`NF+1{1'b1}}, ~PmKilled, 2'b0} + {(3*`NF+2)'(0), ~ASticky|~KillProd, 2'b0};//change
+    assign {NegSum, PreSum} = {{`NF+2{1'b0}}, PmKilled, 1'b0} + {InvA, AmInv} + {{3*`NF+4{1'b0}}, (~ASticky|KillProd)&InvA};
+    assign NegPreSum = Am + {{`NF+1{1'b1}}, ~PmKilled, 1'b0} + {(3*`NF+2)'(0), ~ASticky|~KillProd, 1'b0};
      
     // Choose the positive sum and accompanying LZA result.
     assign Sm = NegSum ? NegPreSum : PreSum;
diff --git a/pipelined/src/fpu/fma/fmaalign.sv b/pipelined/src/fpu/fma/fmaalign.sv
index 67dc0b824..fc4d9c614 100644
--- a/pipelined/src/fpu/fma/fmaalign.sv
+++ b/pipelined/src/fpu/fma/fmaalign.sv
@@ -35,16 +35,15 @@ module fmaalign(
     input logic  [`NE-1:0]      Xe, Ye, Ze,      // biased exponents in B(NE.0) format
     input logic  [`NF:0]        Zm,      // significand in U(0.NF) format]
     input logic                 XZero, YZero, ZZero, // is the input zero
-    output logic [3*`NF+4:0]    Am,//change // addend aligned for addition in U(NF+5.2NF+1)
+    output logic [3*`NF+3:0]    Am, // addend aligned for addition in U(NF+4.2NF)
     output logic                ASticky,  // Sticky bit calculated from the aliged addend
     output logic                KillProd       // should the product be set to zero
 );
 
     logic [`NE+1:0]     ACnt;           // how far to shift the addend to align with the product in Q(NE+2.0) format
-    logic [4*`NF+4:0]   ZmShifted;//change        // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
-    logic [4*`NF+4:0]   ZmPreshifted;//change     // input to the alignment shifter U(NF+5.3NF+1)
+    logic [4*`NF+3:0]   ZmShifted;        // output of the alignment shifter including sticky bits U(NF+4.3NF)
+    logic [4*`NF+3:0]   ZmPreshifted;     // input to the alignment shifter U(NF+4.3NF)
     logic KillZ;
-    logic PmSticky, tmpZmSticky;
 
     ///////////////////////////////////////////////////////////////////////////////
     // Alignment shifter
@@ -57,38 +56,38 @@ module fmaalign(
     assign ACnt = {2'b0, Xe} + {2'b0, Ye} - {2'b0, (`NE)'(`BIAS)} + (`NE+2)'(`NF+2) - {2'b0, Ze};
 
     // Defualt Addition with only inital left shift
-    //          |   53'b0    |  106'b(product)  | 2'b0 |
+    //          |   53'b0    |  106'b(product)  | 1'b0 |
     //          | addnend |
 
-    assign ZmPreshifted = {Zm,(3*`NF+4)'(0)}; //change
+    assign ZmPreshifted = {Zm,(3*`NF+3)'(0)};
     
     assign KillProd = (ACnt[`NE+1]&~ZZero)|XZero|YZero;
-    assign KillZ = $signed(ACnt)>$signed((`NE+2)'(3)*(`NE+2)'(`NF)+(`NE+2)'(4));//change
+    assign KillZ = $signed(ACnt)>$signed((`NE+2)'(3)*(`NE+2)'(`NF)+(`NE+2)'(3));
 
     always_comb
         begin
         
         // If the product is too small to effect the sum, kill the product
 
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //          |   53'b0    |  106'b(product)  | 1'b0 |
         //  | addnend |
         if (KillProd) begin
-            ZmShifted = {(`NF+2)'(0), Zm, (2*`NF+2)'(0)};//change
+            ZmShifted = {(`NF+2)'(0), Zm, (2*`NF+1)'(0)};
             ASticky = ~(XZero|YZero);
 
         // If the addend is too small to effect the addition        
         //      - The addend has to shift two past the end of the product to be considered too small
         //      - The 2 extra bits are needed for rounding
 
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //          |   53'b0    |  106'b(product)  | 1'b0 |
         //                                                      | addnend |
         end else if (KillZ)  begin
             ZmShifted = 0;
             ASticky = ~ZZero;
 
         // If the Addend is shifted right
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                                  | addnend |
+        //          |   53'b0    |  106'b(product)  | 1'b0 |
+        //                                    | addnend |
         end else begin
             ZmShifted = ZmPreshifted >> ACnt;
             ASticky = |(ZmShifted[`NF-1:0]); 
@@ -96,7 +95,7 @@ module fmaalign(
         end
     end
 
-    assign Am = ZmShifted[4*`NF+4:`NF];//change
+    assign Am = ZmShifted[4*`NF+3:`NF];
 
 endmodule
 
diff --git a/pipelined/src/fpu/fma/fmalza.sv b/pipelined/src/fpu/fma/fmalza.sv
index 182075bde..9a0de74c8 100644
--- a/pipelined/src/fpu/fma/fmalza.sv
+++ b/pipelined/src/fpu/fma/fmalza.sv
@@ -31,18 +31,18 @@
 `include "wally-config.vh"
 
 module fmalza #(WIDTH) ( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
-    input logic [WIDTH-1:0] 	       A, // addend
-    input logic [2*`NF+3:0] 	       Pm, // product
-    input logic 		       Cin, // carry in
-    input logic sub,
-    output logic [$clog2(WIDTH+1)-1:0] SCnt   // normalization shift count for the positive result
+    input logic [WIDTH-1:0]             A,      // addend
+    input logic [2*`NF+2:0]             Pm,     // product
+    input logic 		                Cin,    // carry in
+    input logic                         sub,
+    output logic [$clog2(WIDTH+1)-1:0]  SCnt    // normalization shift count for the positive result
     ); 
 
    logic [WIDTH:0] 	       F;
    logic [WIDTH-1:0]  B, P, G, K;
     logic [WIDTH-1:0] Pp1, Gm1, Km1;
 
-    assign B = {{(`NF+1){1'b0}}, Pm};//change // Zero extend product
+    assign B = {{(`NF+1){1'b0}}, Pm}; // Zero extend product
 
     assign P = A^B;
     assign G = A&B;
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index 507cd9086..1f749e9e0 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -109,14 +109,14 @@ module fpu (
    logic 		      XExpMaxE;                           // is the exponent all ones (max value)
 
    // Fma Signals
-   logic [3*`NF+4:0] SmE, SmM;//change             
-   logic 			   ZmStickyE, ZmStickyM;
+   logic [3*`NF+3:0] SmE, SmM;            
+   logic 			   FmaAStickyE, FmaAStickyM;
    logic [`NE+1:0]   SeE,SeM;
    logic 			   InvAE, InvAM;
    logic 			   AsE, AsM;
    logic 			   PsE, PsM;
    logic 			   SsE, SsM;
-   logic [$clog2(3*`NF+6)-1:0] SCntE, SCntM;//change
+   logic [$clog2(3*`NF+5)-1:0] SCntE, SCntM;
 
    // Cvt Signals
    logic [`NE:0]           CeE, CeM;    // the calculated expoent
@@ -258,7 +258,7 @@ module fpu (
             .As(AsE), .Ps(PsE), .Ss(SsE), .Se(SeE),
             .Sm(SmE), 
             .InvA(InvAE), .SCnt(SCntE), 
-            .ASticky(ZmStickyE)); 
+            .ASticky(FmaAStickyE)); 
 
    // divide and squareroot
    //    - fdiv
@@ -352,10 +352,10 @@ module fpu (
             {XsE, YsE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE, ZDenormE},
             {XsM, YsM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM, ZDenormM});     
    flopenrc #(1)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, PreNVE, PreNVM);      
-   flopenrc #(3*`NF+5) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM);//change 
-  flopenrc #($clog2(3*`NF+6)+7+`NE) EMRegFma4(clk, reset, FlushM, ~StallM, //change
-                           {ZmStickyE, InvAE, SCntE, AsE, PsE, SsE, SeE},
-                           {ZmStickyM, InvAM, SCntM, AsM, PsM, SsM, SeM});
+   flopenrc #(3*`NF+4) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM);
+  flopenrc #($clog2(3*`NF+5)+7+`NE) EMRegFma4(clk, reset, FlushM, ~StallM,
+                           {FmaAStickyE, InvAE, SCntE, AsE, PsE, SsE, SeE},
+                           {FmaAStickyM, InvAM, SCntM, AsM, PsM, SsM, SeM});
    flopenrc #(`NE+`LOGCVTLEN+`CVTLEN+4) EMRegCvt(clk, reset, FlushM, ~StallM, 
                            {CeE, CvtShiftAmtE, CvtResDenormUfE, CsE, IntZeroE, CvtLzcInE},
                            {CeM, CvtShiftAmtM, CvtResDenormUfM, CsM, IntZeroM, CvtLzcInM});
@@ -375,7 +375,7 @@ module fpu (
    assign FpLoadStoreM = FResSelM[1];
 
    postprocess postprocess(.Xs(XsM), .Ys(YsM), .Xm(XmM), .Ym(YmM), .Zm(ZmM), .Frm(FrmM), .Fmt(FmtM), 
-                           .FmaZmS(ZmStickyM), .XZero(XZeroM), .YZero(YZeroM), .ZZero(ZZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
+                           .FmaASticky(FmaAStickyM), .XZero(XZeroM), .YZero(YZeroM), .ZZero(ZZeroM), .XInf(XInfM), .YInf(YInfM), .DivQm(QmM), .FmaSs(SsM),
                            .ZInf(ZInfM), .XNaN(XNaNM), .YNaN(YNaNM), .ZNaN(ZNaNM), .XSNaN(XSNaNM), .YSNaN(YSNaNM), .ZSNaN(ZSNaNM), .FmaSm(SmM), .DivQe(QeM), /*.DivDone(DivDoneM), */
                            .ZDenorm(ZDenormM), .FmaAs(AsM), .FmaPs(PsM), .OpCtrl(OpCtrlM), .FmaSCnt(SCntM), .FmaSe(SeM),
                            .CvtCe(CeM), .CvtResDenormUf(CvtResDenormUfM),.CvtShiftAmt(CvtShiftAmtM), .CvtCs(CsM), .ToInt(FWriteIntM), .DivS(DivSM),
diff --git a/pipelined/src/fpu/postproc/fmashiftcalc.sv b/pipelined/src/fpu/postproc/fmashiftcalc.sv
index 5c301da7d..1110b70f0 100644
--- a/pipelined/src/fpu/postproc/fmashiftcalc.sv
+++ b/pipelined/src/fpu/postproc/fmashiftcalc.sv
@@ -30,18 +30,18 @@
 `include "wally-config.vh"
 
 module fmashiftcalc(
-    input logic  [3*`NF+4:0]            FmaSm,//change       // the positive sum
-    input logic  [$clog2(3*`NF+6)-1:0]  FmaSCnt,//change   // normalization shift count
-    input logic  [`FMTBITS-1:0]         Fmt,       // precision 1 = double 0 = single
-    input logic [`NE+1:0] FmaSe,
-    output logic [`NE+1:0]              NormSumExp,          // exponent of the normalized sum not taking into account denormal or zero results
-    output logic                        FmaSZero,    // is the result denormalized - calculated before LZA corection
-    output logic                        FmaPreResultDenorm,    // is the result denormalized - calculated before LZA corection
-    output logic [$clog2(3*`NF+6)-1:0]  FmaShiftAmt,//change   // normalization shift count
-    output logic [3*`NF+6:0]            FmaShiftIn//change        // is the sum zero
+    input logic  [3*`NF+3:0]            FmaSm,      // the positive sum
+    input logic  [$clog2(3*`NF+5)-1:0]  FmaSCnt,    // normalization shift count
+    input logic  [`FMTBITS-1:0]         Fmt,        // precision 1 = double 0 = single
+    input logic  [`NE+1:0]              FmaSe,      // sum's exponent
+    output logic [`NE+1:0]              NormSumExp, // exponent of the normalized sum not taking into account denormal or zero results
+    output logic                        FmaSZero,   // is the sum zero
+    output logic                        FmaPreResultDenorm, // is the result denormalized - calculated before LZA correction
+    output logic [$clog2(3*`NF+5)-1:0]  FmaShiftAmt,    // normalization shift count
+    output logic [3*`NF+5:0]            FmaShiftIn      // shift input
 );
-    logic [`NE+1:0]             PreNormSumExp;       // the exponent of the normalized sum with the `FLEN bias
-    logic [`NE+1:0] BiasCorr;
+    logic [`NE+1:0] PreNormSumExp;  // the exponent of the normalized sum with the `FLEN bias
+    logic [`NE+1:0] BiasCorr;       // correction for bias
 
     ///////////////////////////////////////////////////////////////////////////////
     // Normalization
@@ -50,7 +50,7 @@ module fmashiftcalc(
     // Determine if the sum is zero
     assign FmaSZero = ~(|FmaSm);
     // calculate the sum's exponent
-    assign PreNormSumExp = FmaSe + {{`NE+2-$unsigned($clog2(3*`NF+6)){1'b1}}, ~FmaSCnt} + (`NE+2)'(`NF+3);//change
+    assign PreNormSumExp = FmaSe + {{`NE+2-$unsigned($clog2(3*`NF+5)){1'b1}}, ~FmaSCnt} + (`NE+2)'(`NF+3);
 
     //convert the sum's exponent into the proper percision
     if (`FPSIZES == 1) begin
@@ -150,7 +150,7 @@ module fmashiftcalc(
     //  - shift once if killing a product and the result is denormalized
     assign FmaShiftIn = {2'b0, FmaSm};
     if (`FPSIZES == 1)
-        assign FmaShiftAmt = FmaPreResultDenorm ? FmaSe[$clog2(3*`NF+6)-1:0]+($clog2(3*`NF+6))'(`NF+2): FmaSCnt+1;//change
+        assign FmaShiftAmt = FmaPreResultDenorm ? FmaSe[$clog2(3*`NF+5)-1:0]+($clog2(3*`NF+5))'(`NF+2): FmaSCnt+1;
     else
-        assign FmaShiftAmt = FmaPreResultDenorm ? FmaSe[$clog2(3*`NF+6)-1:0]+($clog2(3*`NF+6))'(`NF+2)+BiasCorr[$clog2(3*`NF+6)-1:0]: FmaSCnt+1;//change
+        assign FmaShiftAmt = FmaPreResultDenorm ? FmaSe[$clog2(3*`NF+5)-1:0]+($clog2(3*`NF+5))'(`NF+2)+BiasCorr[$clog2(3*`NF+5)-1:0]: FmaSCnt+1;
 endmodule
diff --git a/pipelined/src/fpu/postproc/postprocess.sv b/pipelined/src/fpu/postproc/postprocess.sv
index 7c758b28c..4637f3707 100644
--- a/pipelined/src/fpu/postproc/postprocess.sv
+++ b/pipelined/src/fpu/postproc/postprocess.sv
@@ -32,28 +32,27 @@
 
 module postprocess (
     // general signals
-    input logic                             Xs, Ys,  // input signs
+    input logic                             Xs, Ys,     // input signs
     input logic  [`NF:0]                    Xm, Ym, Zm, // input mantissas
-    input logic  [2:0]                      Frm,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
-    input logic  [`FMTBITS-1:0]             Fmt,       // precision 1 = double 0 = single
-    input logic  [2:0]                      OpCtrl,       // choose which opperation (look below for values)
+    input logic  [2:0]                      Frm,        // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic  [`FMTBITS-1:0]             Fmt,        // precision 1 = double 0 = single
+    input logic  [2:0]                      OpCtrl,     // choose which operation (look below for values)
     input logic                             XZero, YZero, ZZero, // inputs are zero
     input logic                             XInf, YInf, ZInf,    // inputs are infinity
     input logic                             XNaN, YNaN, ZNaN,    // inputs are NaN
     input logic                             XSNaN, YSNaN, ZSNaN, // inputs are signaling NaNs
-    input logic                             ZDenorm, // is the original precision denormalized
-    input logic  [1:0]                      PostProcSel, // select result to be written to fp register
+    input logic                             ZDenorm,        // is the original precision denormalized
+    input logic  [1:0]                      PostProcSel,    // select result to be written to fp register
     //fma signals
-    input logic                             FmaAs,   // the modified Z sign - depends on instruction
-    input logic                             FmaPs,      // the product's sign
-    input logic  [`NE+1:0]                  FmaSe,
-    input logic  [3*`NF+4:0]                FmaSm,//change      // the positive sum
-    input logic                             FmaZmS,  // sticky bit that is calculated during alignment
-    input logic                             FmaSs,
-    input logic  [$clog2(3*`NF+6)-1:0]      FmaSCnt,//change   // the normalization shift count
+    input logic                             FmaAs,  // the modified Z sign - depends on instruction
+    input logic                             FmaPs,  // the product's sign
+    input logic  [`NE+1:0]                  FmaSe,  // the sum's exponent
+    input logic  [3*`NF+3:0]                FmaSm,  // the positive sum
+    input logic                             FmaASticky, // sticky bit that is calculated during alignment
+    input logic                             FmaSs,  // the sum's sign
+    input logic  [$clog2(3*`NF+5)-1:0]      FmaSCnt,   // the normalization shift count
     //divide signals
     input logic                             DivS,
-//    input logic                             DivDone,
     input logic  [`NE+1:0]                  DivQe,
     input logic  [`DIVb:0]                  DivQm,
     // conversion signals
@@ -89,10 +88,10 @@ module postprocess (
     // fma signals
     logic [`NE+1:0] FmaMe;     // exponent of the normalized sum
     logic FmaSZero;        // is the sum zero
-    logic [3*`NF+6:0] FmaShiftIn;//change        // shift input
+    logic [3*`NF+5:0] FmaShiftIn;        // shift input
     logic [`NE+1:0] NormSumExp;          // exponent of the normalized sum not taking into account denormal or zero results
     logic FmaPreResultDenorm;    // is the result denormalized - calculated before LZA corection
-    logic [$clog2(3*`NF+6)-1:0] FmaShiftAmt;//change   // normalization shift count
+    logic [$clog2(3*`NF+5)-1:0] FmaShiftAmt;   // normalization shift count
     // division singals
     logic [`LOGNORMSHIFTSZ-1:0] DivShiftAmt;
     logic [`NORMSHIFTSZ-1:0] DivShiftIn;
@@ -152,8 +151,8 @@ module postprocess (
     always_comb
         case(PostProcSel)
             2'b10: begin // fma
-                ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(3*`NF+6){1'b0}}, FmaShiftAmt};//change
-                ShiftIn =  {FmaShiftIn, {`NORMSHIFTSZ-(3*`NF+7){1'b0}}};//change
+                ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(3*`NF+5){1'b0}}, FmaShiftAmt};
+                ShiftIn =  {FmaShiftIn, {`NORMSHIFTSZ-(3*`NF+6){1'b0}}};
             end
             2'b00: begin // cvt
                 ShiftAmt = {{`LOGNORMSHIFTSZ-$clog2(`CVTLEN+1){1'b0}}, CvtShiftAmt};
@@ -193,7 +192,7 @@ module postprocess (
                           
     roundsign roundsign(.FmaOp, .DivOp, .CvtOp, .Sqrt, .FmaSs, .Xs, .Ys, .CvtCs, .Ms);
 
-    round round(.OutFmt, .Frm, .FmaZmS, .Plus1, .PostProcSel, .CvtCe, .Qe,
+    round round(.OutFmt, .Frm, .FmaASticky, .Plus1, .PostProcSel, .CvtCe, .Qe,
                 .Ms, .FmaMe, .FmaOp, .CvtOp, .CvtResDenormUf, .Mf, .ToInt,  .CvtResUf,
                 .DivS, //.DivDone,
                 .DivOp, .UfPlus1, .FullRe, .Rf, .Re, .S, .R, .G, .Me);
diff --git a/pipelined/src/fpu/postproc/round.sv b/pipelined/src/fpu/postproc/round.sv
index c9e2b94e4..b24884dbf 100644
--- a/pipelined/src/fpu/postproc/round.sv
+++ b/pipelined/src/fpu/postproc/round.sv
@@ -48,7 +48,7 @@ module round(
     input logic                     CvtResDenormUf,
     input logic                     CvtResUf,
     input logic  [`CORRSHIFTSZ-1:0] Mf,
-    input logic                     FmaZmS,  // addend's sticky bit
+    input logic                     FmaASticky,  // addend's sticky bit
     input logic  [`NE+1:0]          FmaMe,         // exponent of the normalized sum
     input logic                     Ms,      // the result's sign
     input logic  [`NE:0]            CvtCe,    // the calculated expoent
@@ -175,7 +175,7 @@ module round(
 
     // only add the Addend sticky if doing an FMA opperation
     //      - the shifter shifts too far left when there's an underflow (shifting out all possible sticky bits)
-    assign S = FmaZmS&FmaOp | NormS | CvtResUf&CvtOp | FmaMe[`NE+1]&FmaOp | DivS&DivOp;
+    assign S = FmaASticky&FmaOp | NormS | CvtResUf&CvtOp | FmaMe[`NE+1]&FmaOp | DivS&DivOp;
     
     // determine round and LSB of the rounded value
     //      - underflow round bit is used to determint the underflow flag
diff --git a/pipelined/src/fpu/postproc/shiftcorrection.sv b/pipelined/src/fpu/postproc/shiftcorrection.sv
index 588daa945..172180465 100644
--- a/pipelined/src/fpu/postproc/shiftcorrection.sv
+++ b/pipelined/src/fpu/postproc/shiftcorrection.sv
@@ -43,7 +43,7 @@ module shiftcorrection(
     output logic [`NE+1:0]          Qe,
     output logic [`NE+1:0]          FmaMe         // exponent of the normalized sum
 );
-    logic [3*`NF+4:0]      CorrSumShifted;//change     // the shifted sum after LZA correction
+    logic [3*`NF+3:0]      CorrSumShifted;     // the shifted sum after LZA correction
     logic [`CORRSHIFTSZ-1:0] CorrQmShifted;
     logic                  ResDenorm;    // is the result denormalized
     logic                  LZAPlus1; // add one or two to the sum's exponent due to LZA correction
@@ -56,7 +56,7 @@ module shiftcorrection(
     assign CorrQmShifted = (LZAPlus1|(DivQe==1&~LZAPlus1)) ? Shifted[`NORMSHIFTSZ-2:`NORMSHIFTSZ-`CORRSHIFTSZ-1] : Shifted[`NORMSHIFTSZ-3:`NORMSHIFTSZ-`CORRSHIFTSZ-2];
     // if the result of the divider was calculated to be denormalized, then the result was correctly normalized, so select the top shifted bits
     always_comb
-        if(FmaOp)                       Mf = {CorrSumShifted, {`CORRSHIFTSZ-(3*`NF+5){1'b0}}};//change
+        if(FmaOp)                       Mf = {CorrSumShifted, {`CORRSHIFTSZ-(3*`NF+4){1'b0}}};
         else if (DivOp&~DivResDenorm)   Mf = CorrQmShifted;
         else                            Mf = Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ];
     // Determine sum's exponent
diff --git a/pipelined/testbench/testbench-fp.sv b/pipelined/testbench/testbench-fp.sv
index f5986c839..ac81a1376 100644
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@@ -53,39 +53,39 @@ module testbenchfp;
   logic [`FLEN*4+7:0] TestVectors[8388609:0];     // list of test vectors
 
   logic [1:0]           FmtVal;          // value of the current Fmt
-  logic [2:0]           UnitVal, OpCtrlVal, FrmVal; // vlaue of the currnet Unit/OpCtrl/FrmVal
+  logic [2:0]           UnitVal, OpCtrlVal, FrmVal; // value of the current Unit/OpCtrl/FrmVal
   logic                 WriteIntVal;                // value of the current WriteInt
   logic [`FLEN-1:0]     X, Y, Z;                    // inputs read from TestFloat
   logic [`XLEN-1:0]     SrcA;                       // integer input
   logic [`FLEN-1:0]	    Ans;                        // correct answer from TestFloat
-  logic [`FLEN-1:0]	    Res;                                                // result from other units
-  logic [4:0]	 	        AnsFlg;                                             // correct flags read from testfloat
-  logic [4:0]	 	        ResFlg, Flg;                                                            // Result flags
-  logic	[`FMTBITS-1:0]  ModFmt;  // format - 10 = half, 00 = single, 01 = double, 11 = quad
-  logic [`FLEN-1:0]     FpRes, FpCmpRes;  // Results from each unit
-  logic [`XLEN-1:0]     IntRes, CmpRes;  // Results from each unit
+  logic [`FLEN-1:0]	    Res;                        // result from other units
+  logic [4:0]	 	        AnsFlg;                     // correct flags read from testfloat
+  logic [4:0]	 	        ResFlg, Flg;                // Result flags
+  logic	[`FMTBITS-1:0]  ModFmt;                     // format - 10 = half, 00 = single, 01 = double, 11 = quad
+  logic [`FLEN-1:0]     FpRes, FpCmpRes;            // Results from each unit
+  logic [`XLEN-1:0]     IntRes, CmpRes;             // Results from each unit
   logic [4:0]           FmaFlg, CvtFlg, DivFlg, CmpFlg;  // Outputed flags
   logic                 AnsNaN, ResNaN, NaNGood;
-  logic                 Xs, Ys, Zs;                     // sign of the inputs
-  logic [`NE-1:0]       Xe, Ye, Ze;                     // exponent of the inputs
-  logic [`NF:0]         Xm, Ym, Zm;                     // mantissas of the inputs
-  logic                 XNaN, YNaN, ZNaN;                     // is the input NaN
-  logic                 XSNaN, YSNaN, ZSNaN;                  // is the input a signaling NaN
-  logic                 XDenorm, ZDenorm;            // is the input denormalized
-  logic                 XInf, YInf, ZInf;                   // is the input infinity
-  logic                 XZero, YZero, ZZero;                // is the input zero
-  logic                 XExpMax, YExpMax, ZExpMax;         // is the input's exponent all ones  
-  logic  [`CVTLEN-1:0]      CvtLzcInE;      // input to the Leading Zero Counter (priority encoder)
-  logic        IntZero;
-  logic CvtResSgnE;
-  logic [`NE:0]           CvtCalcExpE;    // the calculated expoent
+  logic                 Xs, Ys, Zs;                 // sign of the inputs
+  logic [`NE-1:0]       Xe, Ye, Ze;                 // exponent of the inputs
+  logic [`NF:0]         Xm, Ym, Zm;                 // mantissas of the inputs
+  logic                 XNaN, YNaN, ZNaN;           // is the input NaN
+  logic                 XSNaN, YSNaN, ZSNaN;        // is the input a signaling NaN
+  logic                 XDenorm, ZDenorm;           // is the input denormalized
+  logic                 XInf, YInf, ZInf;           // is the input infinity
+  logic                 XZero, YZero, ZZero;        // is the input zero
+  logic                 XExpMax, YExpMax, ZExpMax;  // is the input's exponent all ones  
+  logic  [`CVTLEN-1:0]  CvtLzcInE;                  // input to the Leading Zero Counter (priority encoder)
+  logic                 IntZero;
+  logic                 CvtResSgnE;
+  logic [`NE:0]         CvtCalcExpE;    // the calculated exponent
 	logic [`LOGCVTLEN-1:0] CvtShiftAmtE;  // how much to shift by
-	logic [`DIVb:0] Quot;
-  logic CvtResDenormUfE;
-  logic DivStart, FDivBusyE, OldFDivBusyE;
-  logic reset = 1'b0;
+	logic [`DIVb:0]       Quot;
+  logic                 CvtResDenormUfE;
+  logic                 DivStart, FDivBusyE, OldFDivBusyE;
+  logic                 reset = 1'b0;
   logic [$clog2(`NF+2)-1:0] XZeroCnt, YZeroCnt;
-  logic [`DURLEN-1:0] Dur;
+  logic [`DURLEN-1:0]   Dur;
 
   // in-between FMA signals
   logic                 Mult;
@@ -94,17 +94,17 @@ module testbenchfp;
   logic [`NE+1:0]	      Se;
   logic 				        ASticky;
   logic 					      KillProd; 
-  logic [$clog2(3*`NF+6)-1:0]	SCnt;
-  logic [3*`NF+4:0]	    Sm;       
+  logic [$clog2(3*`NF+5)-1:0]	SCnt;
+  logic [3*`NF+3:0]	    Sm;       
   logic 			          InvA;
   logic 			          NegSum;
   logic 			          As;
   logic 			          Ps;
-  logic       DivSticky;
-  logic       DivDone;
-  logic       DivNegSticky;
-  logic [`NE+1:0] DivCalcExp;
-  logic divsqrtop;
+  logic                 DivSticky;
+  logic                 DivDone;
+  logic                 DivNegSticky;
+  logic [`NE+1:0]       DivCalcExp;
+  logic                 divsqrtop;
 
 
   ///////////////////////////////////////////////////////////////////////////////////////////////