From c2b9e326caed3326ac0e8444ebdf9deb9ae8b148 Mon Sep 17 00:00:00 2001 From: David Harris Date: Mon, 13 May 2024 13:27:29 -0700 Subject: [PATCH 1/5] Fround cleanup --- src/fpu/fround.sv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fpu/fround.sv b/src/fpu/fround.sv index fb4911253..085e25771 100644 --- a/src/fpu/fround.sv +++ b/src/fpu/fround.sv @@ -51,7 +51,7 @@ module fround import cvw::*; #(parameter cvw_t P) ( // Unbiased exponent assign E = Xe - P.BIAS[P.NE-1:0]; - assign Xep1 = Xe + 1; + assign Xep1 = Xe + 1'b1; ////////////////////////////////////////// // Compute LSB L', rounding bit R' and Sticky bit T' @@ -85,7 +85,7 @@ module fround import cvw::*; #(parameter cvw_t P) ( assign Lnonneg = |(Xm & HotE); assign Rnonneg = |(Xm & HotEP1); assign Trunc = Xm & IMask; - assign {Two, Rnd} = Trunc + HotE; // Two means result is 10.000000 = 2.0 + assign {Two, Rnd} = Trunc + HotE; // Two means result overflowed to 10.000000 = 2.0 // mux and AND-OR logic to select final rounding bits mux2 #(1) Lmux(Lnonneg, 1'b0, Elt0, Lp); From 2dfada06871644dbb0209526c5eebba5dd2604e9 Mon Sep 17 00:00:00 2001 From: David Harris Date: Mon, 13 May 2024 14:01:36 -0700 Subject: [PATCH 2/5] Started parameterizing FMA --- config/shared/config-shared.vh | 7 +++++-- config/shared/parameter-defs.vh | 1 + src/cvw.sv | 1 + src/fpu/fma/fma.sv | 10 +++++----- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh index 2401cada2..ae511e8c7 100644 --- a/config/shared/config-shared.vh +++ b/config/shared/config-shared.vh @@ -115,6 +115,9 @@ localparam CVTLEN = (ZFA_SUPPORTED & D_SUPPORTED) ? `max(BASECVTLEN, 32'd84) : B localparam LLEN = `max($unsigned(FLEN), $unsigned(XLEN)); localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1)); +// size of FMA output +localparam FMALEN = 3*NF + 6; + // NORMSHIFTSIZE is the bits out of the normalization shifter // RV32F: max(32+23+1, 2(23)+4, 3(23)+6) = 3*23+6 = 75 // RV64F: max(64+23+1, 64 + 23 + 2, 3*23+6) = 89 @@ -125,8 +128,8 @@ localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1)); // because NORMSHIFTSZ becomes limited by convert rather than divider // The two extra bits are necessary because shiftcorrection dropped them for fcvt. 
// May be possible to remove these two bits by modifying shiftcorrection -localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (3*NF+8)); -//localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (3*NF+8)); +localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (FMALEN + 2)); +//localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (FMALEN + 2)); localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ)); // log_2(NORMSHIFTSZ) localparam CORRSHIFTSZ = NORMSHIFTSZ-2; // Drop leading 2 integer bits diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh index 96440490c..026794b4b 100644 --- a/config/shared/parameter-defs.vh +++ b/config/shared/parameter-defs.vh @@ -193,6 +193,7 @@ localparam cvw_t P = '{ CVTLEN : CVTLEN, LLEN : LLEN, LOGCVTLEN : LOGCVTLEN, + FMALEN : FMALEN, NORMSHIFTSZ : NORMSHIFTSZ, LOGNORMSHIFTSZ : LOGNORMSHIFTSZ, CORRSHIFTSZ : CORRSHIFTSZ, diff --git a/src/cvw.sv b/src/cvw.sv index 1f8e0a1c1..0a4cf1549 100644 --- a/src/cvw.sv +++ b/src/cvw.sv @@ -287,6 +287,7 @@ typedef struct packed { int LOGCVTLEN; int NORMSHIFTSZ; int LOGNORMSHIFTSZ; + int FMALEN; int CORRSHIFTSZ; // division constants diff --git a/src/fpu/fma/fma.sv b/src/fpu/fma/fma.sv index 3576b95df..8bf4d4cbb 100644 --- a/src/fpu/fma/fma.sv +++ b/src/fpu/fma/fma.sv @@ -34,13 +34,13 @@ module fma import cvw::*; #(parameter cvw_t P) ( input logic XZero, YZero, ZZero, // is the input zero input logic [2:0] OpCtrl, // operation control output logic ASticky, // sticky bit that is calculated during alignment - output logic [3*P.NF+5:0] Sm, // the positive sum's significand + output logic [P.FMALEN-1:0] Sm, // the positive sum's significand output logic InvA, // Was A inverted for effective subtraction (P-A or -P+A) output logic As, // the aligned addend's sign (modified Z sign for other operations) output logic Ps, // the product's sign output logic Ss, // the sum's sign output logic [P.NE+1:0] Se, // the sum's exponent - output logic [$clog2(3*P.NF+7)-1:0] SCnt // normalization shift count + output logic [$clog2(P.FMALEN+1)-1:0] SCnt // normalization shift count ); // OpCtrl: @@ -54,8 +54,8 @@ module fma import cvw::*; #(parameter cvw_t P) ( // 111 - sub logic [2*P.NF+1:0] Pm; // the product's significand in U(2.2Nf) format - logic [3*P.NF+5:0] Am; // addend aligned's mantissa for addition in U(NF+4.2NF) - logic [3*P.NF+5:0] AmInv; // aligned addend's mantissa possibly inverted + logic [P.FMALEN-1:0] Am; // addend aligned's mantissa for addition in U(NF+4.2NF) + logic [P.FMALEN-1:0] AmInv; // aligned addend's mantissa possibly inverted logic [2*P.NF+1:0] PmKilled; // the product's mantissa possibly killed U(2.2Nf) logic KillProd; // set the product to zero before addition if the product is too small to matter logic [P.NE+1:0] Pe; // the product's exponent B(NE+2.0) format; adds 2 bits to allow for size of number and negative sign @@ -89,6 +89,6 @@ module fma import cvw::*; #(parameter cvw_t P) ( fmaadd #(P) add(.Am, .Pm, .Ze, .Pe, .Ps, .KillProd, .ASticky, .AmInv, .PmKilled, .InvA, .Sm, .Se, .Ss); - fmalza #(3*P.NF+6, P.NF) lza(.A(AmInv), .Pm(PmKilled), .Cin(InvA & (~ASticky | KillProd)), .sub(InvA), .SCnt); + fmalza #(P.FMALEN, P.NF) lza(.A(AmInv), .Pm(PmKilled), .Cin(InvA & (~ASticky | KillProd)), .sub(InvA), .SCnt); endmodule From 175c18da017dacf91b7d0bed5d7fe370d5f1cf46 Mon Sep 17 00:00:00 2001 From: David Harris Date: Mon, 13 May 2024 15:16:00 -0700 Subject: [PATCH 3/5] Parameterized FMA. 
However, some offsets are not parameterized. See PR #793 for list of changes --- config/shared/config-shared.vh | 6 +- config/shared/parameter-defs.vh | 1 - src/cvw.sv | 1 - src/fpu/fma/fmaalign.sv | 10 +-- src/fpu/fpu.sv | 8 +- src/fpu/postproc/fmashiftcalc.sv | 16 ++-- src/fpu/postproc/postprocess.sv | 16 ++-- src/fpu/postproc/round.sv | 134 ++++++++++++++-------------- src/fpu/postproc/shiftcorrection.sv | 8 +- testbench/testbench_fp.sv | 4 +- 10 files changed, 100 insertions(+), 104 deletions(-) diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh index ae511e8c7..02d60c4f4 100644 --- a/config/shared/config-shared.vh +++ b/config/shared/config-shared.vh @@ -128,12 +128,10 @@ localparam FMALEN = 3*NF + 6; // because NORMSHIFTSZ becomes limited by convert rather than divider // The two extra bits are necessary because shiftcorrection dropped them for fcvt. // May be possible to remove these two bits by modifying shiftcorrection -localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (FMALEN + 2)); -//localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (FMALEN + 2)); +//localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (FMALEN + 2)); +localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (FMALEN + 2)); localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ)); // log_2(NORMSHIFTSZ) -localparam CORRSHIFTSZ = NORMSHIFTSZ-2; // Drop leading 2 integer bits - // Disable spurious Verilator warnings diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh index 026794b4b..1b99a9175 100644 --- a/config/shared/parameter-defs.vh +++ b/config/shared/parameter-defs.vh @@ -196,7 +196,6 @@ localparam cvw_t P = '{ FMALEN : FMALEN, NORMSHIFTSZ : NORMSHIFTSZ, LOGNORMSHIFTSZ : LOGNORMSHIFTSZ, - CORRSHIFTSZ : CORRSHIFTSZ, LOGR : LOGR, RK : RK, FPDUR : FPDUR, diff --git a/src/cvw.sv b/src/cvw.sv index 0a4cf1549..b43772b41 100644 --- a/src/cvw.sv +++ b/src/cvw.sv @@ -288,7 +288,6 @@ typedef struct packed { int NORMSHIFTSZ; int LOGNORMSHIFTSZ; int FMALEN; - int CORRSHIFTSZ; // division constants int LOGR ; diff --git a/src/fpu/fma/fmaalign.sv b/src/fpu/fma/fmaalign.sv index 4fc796fda..c6f0afebc 100644 --- a/src/fpu/fma/fmaalign.sv +++ b/src/fpu/fma/fmaalign.sv @@ -31,14 +31,14 @@ module fmaalign import cvw::*; #(parameter cvw_t P) ( input logic [P.NE-1:0] Xe, Ye, Ze, // biased exponents in B(NE.0) format input logic [P.NF:0] Zm, // significand in U(0.NF) format] input logic XZero, YZero, ZZero, // is the input zero - output logic [3*P.NF+5:0] Am, // addend aligned for addition in U(NF+5.2NF+1) + output logic [P.FMALEN-1:0] Am, // addend aligned for addition in U(NF+5.2NF+1) output logic ASticky, // Sticky bit calculated from the aliged addend output logic KillProd // should the product be set to zero ); logic [P.NE+1:0] ACnt; // how far to shift the addend to align with the product in Q(NE+2.0) format - logic [4*P.NF+5:0] ZmShifted; // output of the alignment shifter including sticky bits U(NF+5.3NF+1) - logic [4*P.NF+5:0] ZmPreshifted; // input to the alignment shifter U(NF+5.3NF+1) + logic [P.FMALEN+P.NF-1:0] ZmShifted; // output of the alignment shifter including sticky bits U(NF+5.3NF+1) + logic [P.FMALEN+P.NF-1:0] ZmPreshifted; // input to the alignment shifter U(NF+5.3NF+1) logic KillZ; // should the addend be killed /////////////////////////////////////////////////////////////////////////////// @@ -56,7 +56,7 @@ module fmaalign import cvw::*; #(parameter cvw_t P) ( // | 54'b0 | 
106'b(product) | 2'b0 | // | addnend | - assign ZmPreshifted = {Zm,(3*P.NF+5)'(0)}; + assign ZmPreshifted = {Zm,(P.FMALEN-1)'(0)}; assign KillProd = (ACnt[P.NE+1]&~ZZero)|XZero|YZero; assign KillZ = $signed(ACnt)>$signed((P.NE+2)'(3)*(P.NE+2)'(P.NF)+(P.NE+2)'(5)); @@ -87,6 +87,6 @@ module fmaalign import cvw::*; #(parameter cvw_t P) ( end end - assign Am = ZmShifted[4*P.NF+5:P.NF]; + assign Am = ZmShifted[P.FMALEN+P.NF-1:P.NF]; endmodule diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv index 8f4297ec0..4cf17890c 100755 --- a/src/fpu/fpu.sv +++ b/src/fpu/fpu.sv @@ -119,14 +119,14 @@ module fpu import cvw::*; #(parameter cvw_t P) ( // Fma Signals logic FmaAddSubE; // Multiply by 1.0 when adding or subtracting logic [1:0] FmaZSelE; // Select Z = Y when adding or subtracting, 0 when multiplying - logic [3*P.NF+5:0] SmE, SmM; // Sum significand + logic [P.FMALEN-1:0] SmE, SmM; // Sum significand logic FmaAStickyE, FmaAStickyM; // FMA addend sticky bit output logic [P.NE+1:0] SeE,SeM; // Sum exponent logic InvAE, InvAM; // Invert addend logic AsE, AsM; // Addend sign logic PsE, PsM; // Product sign logic SsE, SsM; // Sum sign - logic [$clog2(3*P.NF+7)-1:0] SCntE, SCntM; // LZA sum leading zero count + logic [$clog2(P.FMALEN+1)-1:0] SCntE, SCntM; // LZA sum leading zero count // Cvt Signals logic [P.NE:0] CeE, CeM; // convert intermediate expoent @@ -358,8 +358,8 @@ module fpu import cvw::*; #(parameter cvw_t P) ( {XsE, YsE, XZeroE, YZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE}, {XsM, YsM, XZeroM, YZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM}); flopenrc #(2) EMRegCmpFlg (clk, reset, FlushM, ~StallM, {PreNVE, PreNXE}, {PreNVM, PreNXM}); - flopenrc #(3*P.NF+6) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM); - flopenrc #($clog2(3*P.NF+7)+7+P.NE) EMRegFma4(clk, reset, FlushM, ~StallM, + flopenrc #(P.FMALEN) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM); + flopenrc #($clog2(P.FMALEN+1)+7+P.NE) EMRegFma4(clk, reset, FlushM, ~StallM, {FmaAStickyE, InvAE, SCntE, AsE, PsE, SsE, SeE}, {FmaAStickyM, InvAM, SCntM, AsM, PsM, SsM, SeM}); flopenrc #(P.NE+P.LOGCVTLEN+P.CVTLEN+4) EMRegCvt(clk, reset, FlushM, ~StallM, diff --git a/src/fpu/postproc/fmashiftcalc.sv b/src/fpu/postproc/fmashiftcalc.sv index 5b0f1175b..27f39e2a5 100644 --- a/src/fpu/postproc/fmashiftcalc.sv +++ b/src/fpu/postproc/fmashiftcalc.sv @@ -30,13 +30,13 @@ module fmashiftcalc import cvw::*; #(parameter cvw_t P) ( input logic [P.FMTBITS-1:0] Fmt, // precision 1 = double 0 = single input logic [P.NE+1:0] FmaSe, // sum's exponent - input logic [3*P.NF+5:0] FmaSm, // the positive sum - input logic [$clog2(3*P.NF+7)-1:0] FmaSCnt, // normalization shift count + input logic [P.FMALEN-1:0] FmaSm, // the positive sum + input logic [$clog2(P.FMALEN+1)-1:0] FmaSCnt, // normalization shift count output logic [P.NE+1:0] NormSumExp, // exponent of the normalized sum not taking into account Subnormal or zero results - output logic FmaSZero, // is the result subnormal - calculated before LZA corection + output logic FmaSZero, // is the sum zero output logic FmaPreResultSubnorm, // is the result subnormal - calculated before LZA corection - output logic [$clog2(3*P.NF+7)-1:0] FmaShiftAmt, // normalization shift count - output logic [3*P.NF+7:0] FmaShiftIn // is the sum zero + output logic [$clog2(P.FMALEN+1)-1:0] FmaShiftAmt, // normalization shift count + output logic [P.FMALEN+1:0] FmaShiftIn ); logic [P.NE+1:0] PreNormSumExp; // the exponent of the normalized sum with the P.FLEN bias logic [P.NE+1:0] 
BiasCorr; // correction for bias @@ -49,7 +49,7 @@ module fmashiftcalc import cvw::*; #(parameter cvw_t P) ( assign FmaSZero = ~(|FmaSm); // calculate the sum's exponent FmaSe-FmaSCnt+NF+2 - assign PreNormSumExp = FmaSe + {{P.NE+2-$unsigned($clog2(3*P.NF+7)){1'b1}}, ~FmaSCnt} + (P.NE+2)'(P.NF+4); + assign PreNormSumExp = FmaSe + {{P.NE+2-$unsigned($clog2(P.FMALEN+1)){1'b1}}, ~FmaSCnt} + (P.NE+2)'(P.NF+4); //convert the sum's exponent into the proper precision if (P.FPSIZES == 1) begin @@ -131,6 +131,6 @@ module fmashiftcalc import cvw::*; #(parameter cvw_t P) ( // set and calculate the shift input and amount // - shift once if killing a product and the result is subnormal assign FmaShiftIn = {2'b0, FmaSm}; - if (P.FPSIZES == 1) assign FmaShiftAmt = FmaPreResultSubnorm ? FmaSe[$clog2(3*P.NF+5)-1:0]+($clog2(3*P.NF+5))'(P.NF+3): FmaSCnt+1; - else assign FmaShiftAmt = FmaPreResultSubnorm ? FmaSe[$clog2(3*P.NF+5)-1:0]+($clog2(3*P.NF+5))'(P.NF+3)+BiasCorr[$clog2(3*P.NF+5)-1:0]: FmaSCnt+1; + if (P.FPSIZES == 1) assign FmaShiftAmt = FmaPreResultSubnorm ? FmaSe[$clog2(P.FMALEN-1)-1:0]+($clog2(P.FMALEN-1))'(P.NF+3): FmaSCnt+1; + else assign FmaShiftAmt = FmaPreResultSubnorm ? FmaSe[$clog2(P.FMALEN-1)-1:0]+($clog2(P.FMALEN-1))'(P.NF+3)+BiasCorr[$clog2(P.FMALEN-1)-1:0]: FmaSCnt+1; endmodule diff --git a/src/fpu/postproc/postprocess.sv b/src/fpu/postproc/postprocess.sv index aa181c5e0..4e893a82e 100644 --- a/src/fpu/postproc/postprocess.sv +++ b/src/fpu/postproc/postprocess.sv @@ -44,9 +44,9 @@ module postprocess import cvw::*; #(parameter cvw_t P) ( input logic FmaPs, // the product's sign input logic FmaSs, // Sum sign input logic [P.NE+1:0] FmaSe, // the sum's exponent - input logic [3*P.NF+5:0] FmaSm, // the positive sum + input logic [P.FMALEN-1:0] FmaSm, // the positive sum input logic FmaASticky, // sticky bit that is calculated during alignment - input logic [$clog2(3*P.NF+7)-1:0] FmaSCnt, // the normalization shift count + input logic [$clog2(P.FMALEN+1)-1:0] FmaSCnt, // the normalization shift count //divide signals input logic DivSticky, // divider sticky bit input logic [P.NE+1:0] DivUe, // divsqrt exponent @@ -70,8 +70,8 @@ module postprocess import cvw::*; #(parameter cvw_t P) ( logic Rs; // result sign logic [P.NF-1:0] Rf; // Result fraction logic [P.NE-1:0] Re; // Result exponent - logic Ms; // norMalized sign - logic [P.CORRSHIFTSZ-1:0] Mf; // norMalized fraction + logic Ms; // normalized sign + logic [P.NORMSHIFTSZ-1:0] Mf; // normalized fraction logic [P.NE+1:0] Me; // normalized exponent logic [P.NE+1:0] FullRe; // Re with bits to determine sign and overflow logic UfPlus1; // do you add one (for determining underflow flag) @@ -86,10 +86,10 @@ module postprocess import cvw::*; #(parameter cvw_t P) ( // fma signals logic [P.NE+1:0] FmaMe; // exponent of the normalized sum logic FmaSZero; // is the sum zero - logic [3*P.NF+7:0] FmaShiftIn; // fma shift input + logic [P.FMALEN+1:0] FmaShiftIn; // fma shift input logic [P.NE+1:0] NormSumExp; // exponent of the normalized sum not taking into account Subnormal or zero results logic FmaPreResultSubnorm; // is the result subnormal - calculated before LZA corection - logic [$clog2(3*P.NF+5)-1:0] FmaShiftAmt; // normalization shift amount for fma + logic [$clog2(P.FMALEN+1)-1:0] FmaShiftAmt; // normalization shift amount for fma // division signals logic [P.LOGNORMSHIFTSZ-1:0] DivShiftAmt; // divsqrt shif amount logic [P.NORMSHIFTSZ-1:0] DivShiftIn; // divsqrt shift input @@ -154,8 +154,8 @@ module postprocess import cvw::*; #(parameter cvw_t 
P) ( always_comb case(PostProcSel) 2'b10: begin // fma - ShiftAmt = {{P.LOGNORMSHIFTSZ-$clog2(3*P.NF+5){1'b0}}, FmaShiftAmt}; - ShiftIn = {FmaShiftIn, {P.NORMSHIFTSZ-(3*P.NF+8){1'b0}}}; + ShiftAmt = {{P.LOGNORMSHIFTSZ-$clog2(P.FMALEN-1){1'b0}}, FmaShiftAmt}; + ShiftIn = {FmaShiftIn, {P.NORMSHIFTSZ-(P.FMALEN+2){1'b0}}}; end 2'b00: begin // cvt ShiftAmt = {{P.LOGNORMSHIFTSZ-$clog2(P.CVTLEN+1){1'b0}}, CvtShiftAmt}; diff --git a/src/fpu/postproc/round.sv b/src/fpu/postproc/round.sv index 15c9b4538..c99d5185c 100644 --- a/src/fpu/postproc/round.sv +++ b/src/fpu/postproc/round.sv @@ -32,7 +32,7 @@ module round import cvw::*; #(parameter cvw_t P) ( input logic [2:0] Frm, // rounding mode input logic [1:0] PostProcSel, // select the postprocessor output input logic Ms, // normalized sign - input logic [P.CORRSHIFTSZ-1:0] Mf, // normalized fraction + input logic [P.NORMSHIFTSZ-1:0] Mf, // normalized fraction // fma input logic FmaOp, // is an fma operation being done? input logic [P.NE+1:0] FmaMe, // exponent of the normalized sum for fma @@ -123,61 +123,61 @@ module round import cvw::*; #(parameter cvw_t P) ( // | NF |1|1| // ^ ^ if floating point result // ^ if not an FMA result - if (XLENPOS == 1)assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes) | - (|Mf[P.CORRSHIFTSZ-P.XLEN-2:0]); + if (XLENPOS == 1)assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.NF-2:P.NORMSHIFTSZ-P.XLEN-1]&FpRes) | + (|Mf[P.NORMSHIFTSZ-P.XLEN-2:0]); // 2: NF > XLEN - if (XLENPOS == 2)assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF-1]&IntRes) | - (|Mf[P.CORRSHIFTSZ-P.NF-2:0]); + if (XLENPOS == 2)assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.XLEN-2:P.NORMSHIFTSZ-P.NF-1]&IntRes) | + (|Mf[P.NORMSHIFTSZ-P.NF-2:0]); end else if (P.FPSIZES == 2) begin // XLEN is either 64 or 32 // so half and single are always smaller then XLEN // 1: XLEN > NF > NF1 - if (XLENPOS == 1) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.NF-1]&FpRes&~OutFmt) | - (|Mf[P.CORRSHIFTSZ-P.NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes) | - (|Mf[P.CORRSHIFTSZ-P.XLEN-2:0]); + if (XLENPOS == 1) assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.NF1-2:P.NORMSHIFTSZ-P.NF-1]&FpRes&~OutFmt) | + (|Mf[P.NORMSHIFTSZ-P.NF-2:P.NORMSHIFTSZ-P.XLEN-1]&FpRes) | + (|Mf[P.NORMSHIFTSZ-P.XLEN-2:0]); // 2: NF > XLEN > NF1 - if (XLENPOS == 2) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&~OutFmt) | - (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF-1]&(IntRes|~OutFmt)) | - (|Mf[P.CORRSHIFTSZ-P.NF-2:0]); + if (XLENPOS == 2) assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.NF1-2:P.NORMSHIFTSZ-P.XLEN-1]&FpRes&~OutFmt) | + (|Mf[P.NORMSHIFTSZ-P.XLEN-2:P.NORMSHIFTSZ-P.NF-1]&(IntRes|~OutFmt)) | + (|Mf[P.NORMSHIFTSZ-P.NF-2:0]); // 3: NF > NF1 > XLEN - if (XLENPOS == 3) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF1-1]&IntRes) | - (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.NF-1]&(~OutFmt|IntRes)) | - (|Mf[P.CORRSHIFTSZ-P.NF-2:0]); + if (XLENPOS == 3) assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.XLEN-2:P.NORMSHIFTSZ-P.NF1-1]&IntRes) | + (|Mf[P.NORMSHIFTSZ-P.NF1-2:P.NORMSHIFTSZ-P.NF-1]&(~OutFmt|IntRes)) | + (|Mf[P.NORMSHIFTSZ-P.NF-2:0]); end else if (P.FPSIZES == 3) begin // 1: XLEN > NF > NF1 - if (XLENPOS == 1) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF2-2:P.CORRSHIFTSZ-P.NF1-1]&FpRes&(OutFmt==P.FMT2)) | - (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.NF-1]&FpRes&~(OutFmt==P.FMT)) | - (|Mf[P.CORRSHIFTSZ-P.NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes) | - (|Mf[P.CORRSHIFTSZ-P.XLEN-2:0]); + if (XLENPOS == 1) assign NormSticky = 
(|Mf[P.NORMSHIFTSZ-P.NF2-2:P.NORMSHIFTSZ-P.NF1-1]&FpRes&(OutFmt==P.FMT2)) | + (|Mf[P.NORMSHIFTSZ-P.NF1-2:P.NORMSHIFTSZ-P.NF-1]&FpRes&~(OutFmt==P.FMT)) | + (|Mf[P.NORMSHIFTSZ-P.NF-2:P.NORMSHIFTSZ-P.XLEN-1]&FpRes) | + (|Mf[P.NORMSHIFTSZ-P.XLEN-2:0]); // 2: NF > XLEN > NF1 - if (XLENPOS == 2) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF2-2:P.CORRSHIFTSZ-P.NF1-1]&FpRes&(OutFmt==P.FMT2)) | - (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&~(OutFmt==P.FMT)) | - (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF-1]&(IntRes|~(OutFmt==P.FMT))) | - (|Mf[P.CORRSHIFTSZ-P.NF-2:0]); + if (XLENPOS == 2) assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.NF2-2:P.NORMSHIFTSZ-P.NF1-1]&FpRes&(OutFmt==P.FMT2)) | + (|Mf[P.NORMSHIFTSZ-P.NF1-2:P.NORMSHIFTSZ-P.XLEN-1]&FpRes&~(OutFmt==P.FMT)) | + (|Mf[P.NORMSHIFTSZ-P.XLEN-2:P.NORMSHIFTSZ-P.NF-1]&(IntRes|~(OutFmt==P.FMT))) | + (|Mf[P.NORMSHIFTSZ-P.NF-2:0]); // 3: NF > NF1 > XLEN - if (XLENPOS == 3) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.NF2-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&(OutFmt==P.FMT2)) | - (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.NF1-1]&((OutFmt==P.FMT2)|IntRes)) | - (|Mf[P.CORRSHIFTSZ-P.NF1-2:P.CORRSHIFTSZ-P.NF-1]&(~(OutFmt==P.FMT)|IntRes)) | - (|Mf[P.CORRSHIFTSZ-P.NF-2:0]); + if (XLENPOS == 3) assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.NF2-2:P.NORMSHIFTSZ-P.XLEN-1]&FpRes&(OutFmt==P.FMT2)) | + (|Mf[P.NORMSHIFTSZ-P.XLEN-2:P.NORMSHIFTSZ-P.NF1-1]&((OutFmt==P.FMT2)|IntRes)) | + (|Mf[P.NORMSHIFTSZ-P.NF1-2:P.NORMSHIFTSZ-P.NF-1]&(~(OutFmt==P.FMT)|IntRes)) | + (|Mf[P.NORMSHIFTSZ-P.NF-2:0]); end else if (P.FPSIZES == 4) begin // Quad precision will always be greater than XLEN // 2: NF > XLEN > NF1 - if (XLENPOS == 2) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.H_NF-2:P.CORRSHIFTSZ-P.S_NF-1]&FpRes&(OutFmt==P.H_FMT)) | - (|Mf[P.CORRSHIFTSZ-P.S_NF-2:P.CORRSHIFTSZ-P.D_NF-1]&FpRes&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT))) | - (|Mf[P.CORRSHIFTSZ-P.D_NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&~(OutFmt==P.Q_FMT)) | - (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.Q_NF-1]&(~(OutFmt==P.Q_FMT)|IntRes)) | - (|Mf[P.CORRSHIFTSZ-P.Q_NF-2:0]); + if (XLENPOS == 2) assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.H_NF-2:P.NORMSHIFTSZ-P.S_NF-1]&FpRes&(OutFmt==P.H_FMT)) | + (|Mf[P.NORMSHIFTSZ-P.S_NF-2:P.NORMSHIFTSZ-P.D_NF-1]&FpRes&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT))) | + (|Mf[P.NORMSHIFTSZ-P.D_NF-2:P.NORMSHIFTSZ-P.XLEN-1]&FpRes&~(OutFmt==P.Q_FMT)) | + (|Mf[P.NORMSHIFTSZ-P.XLEN-2:P.NORMSHIFTSZ-P.Q_NF-1]&(~(OutFmt==P.Q_FMT)|IntRes)) | + (|Mf[P.NORMSHIFTSZ-P.Q_NF-2:0]); // 3: NF > NF1 > XLEN // The extra XLEN bit will be ored later when caculating the final sticky bit - the ufplus1 not needed for integer - if (XLENPOS == 3) assign NormSticky = (|Mf[P.CORRSHIFTSZ-P.H_NF-2:P.CORRSHIFTSZ-P.S_NF-1]&FpRes&(OutFmt==P.H_FMT)) | - (|Mf[P.CORRSHIFTSZ-P.S_NF-2:P.CORRSHIFTSZ-P.XLEN-1]&FpRes&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT))) | - (|Mf[P.CORRSHIFTSZ-P.XLEN-2:P.CORRSHIFTSZ-P.D_NF-1]&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT)|IntRes)) | - (|Mf[P.CORRSHIFTSZ-P.D_NF-2:P.CORRSHIFTSZ-P.Q_NF-1]&(~(OutFmt==P.Q_FMT)|IntRes)) | - (|Mf[P.CORRSHIFTSZ-P.Q_NF-2:0]); + if (XLENPOS == 3) assign NormSticky = (|Mf[P.NORMSHIFTSZ-P.H_NF-2:P.NORMSHIFTSZ-P.S_NF-1]&FpRes&(OutFmt==P.H_FMT)) | + (|Mf[P.NORMSHIFTSZ-P.S_NF-2:P.NORMSHIFTSZ-P.XLEN-1]&FpRes&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT))) | + (|Mf[P.NORMSHIFTSZ-P.XLEN-2:P.NORMSHIFTSZ-P.D_NF-1]&((OutFmt==P.S_FMT)|(OutFmt==P.H_FMT)|IntRes)) | + (|Mf[P.NORMSHIFTSZ-P.D_NF-2:P.NORMSHIFTSZ-P.Q_NF-1]&(~(OutFmt==P.Q_FMT)|IntRes)) | + (|Mf[P.NORMSHIFTSZ-P.Q_NF-2:0]); end @@ -188,32 +188,32 @@ module round import 
cvw::*; #(parameter cvw_t P) ( // determine round and LSB of the rounded value // - underflow round bit is used to determint the underflow flag if (P.FPSIZES == 1) begin - assign FpGuard = Mf[P.CORRSHIFTSZ-P.NF-1]; - assign FpLsbRes = Mf[P.CORRSHIFTSZ-P.NF]; - assign FpRound = Mf[P.CORRSHIFTSZ-P.NF-2]; + assign FpGuard = Mf[P.NORMSHIFTSZ-P.NF-1]; + assign FpLsbRes = Mf[P.NORMSHIFTSZ-P.NF]; + assign FpRound = Mf[P.NORMSHIFTSZ-P.NF-2]; end else if (P.FPSIZES == 2) begin - assign FpGuard = OutFmt ? Mf[P.CORRSHIFTSZ-P.NF-1] : Mf[P.CORRSHIFTSZ-P.NF1-1]; - assign FpLsbRes = OutFmt ? Mf[P.CORRSHIFTSZ-P.NF] : Mf[P.CORRSHIFTSZ-P.NF1]; - assign FpRound = OutFmt ? Mf[P.CORRSHIFTSZ-P.NF-2] : Mf[P.CORRSHIFTSZ-P.NF1-2]; + assign FpGuard = OutFmt ? Mf[P.NORMSHIFTSZ-P.NF-1] : Mf[P.NORMSHIFTSZ-P.NF1-1]; + assign FpLsbRes = OutFmt ? Mf[P.NORMSHIFTSZ-P.NF] : Mf[P.NORMSHIFTSZ-P.NF1]; + assign FpRound = OutFmt ? Mf[P.NORMSHIFTSZ-P.NF-2] : Mf[P.NORMSHIFTSZ-P.NF1-2]; end else if (P.FPSIZES == 3) begin always_comb case (OutFmt) P.FMT: begin - FpGuard = Mf[P.CORRSHIFTSZ-P.NF-1]; - FpLsbRes = Mf[P.CORRSHIFTSZ-P.NF]; - FpRound = Mf[P.CORRSHIFTSZ-P.NF-2]; + FpGuard = Mf[P.NORMSHIFTSZ-P.NF-1]; + FpLsbRes = Mf[P.NORMSHIFTSZ-P.NF]; + FpRound = Mf[P.NORMSHIFTSZ-P.NF-2]; end P.FMT1: begin - FpGuard = Mf[P.CORRSHIFTSZ-P.NF1-1]; - FpLsbRes = Mf[P.CORRSHIFTSZ-P.NF1]; - FpRound = Mf[P.CORRSHIFTSZ-P.NF1-2]; + FpGuard = Mf[P.NORMSHIFTSZ-P.NF1-1]; + FpLsbRes = Mf[P.NORMSHIFTSZ-P.NF1]; + FpRound = Mf[P.NORMSHIFTSZ-P.NF1-2]; end P.FMT2: begin - FpGuard = Mf[P.CORRSHIFTSZ-P.NF2-1]; - FpLsbRes = Mf[P.CORRSHIFTSZ-P.NF2]; - FpRound = Mf[P.CORRSHIFTSZ-P.NF2-2]; + FpGuard = Mf[P.NORMSHIFTSZ-P.NF2-1]; + FpLsbRes = Mf[P.NORMSHIFTSZ-P.NF2]; + FpRound = Mf[P.NORMSHIFTSZ-P.NF2-2]; end default: begin FpGuard = 1'bx; @@ -225,31 +225,31 @@ module round import cvw::*; #(parameter cvw_t P) ( always_comb case (OutFmt) 2'h3: begin - FpGuard = Mf[P.CORRSHIFTSZ-P.Q_NF-1]; - FpLsbRes = Mf[P.CORRSHIFTSZ-P.Q_NF]; - FpRound = Mf[P.CORRSHIFTSZ-P.Q_NF-2]; + FpGuard = Mf[P.NORMSHIFTSZ-P.Q_NF-1]; + FpLsbRes = Mf[P.NORMSHIFTSZ-P.Q_NF]; + FpRound = Mf[P.NORMSHIFTSZ-P.Q_NF-2]; end 2'h1: begin - FpGuard = Mf[P.CORRSHIFTSZ-P.D_NF-1]; - FpLsbRes = Mf[P.CORRSHIFTSZ-P.D_NF]; - FpRound = Mf[P.CORRSHIFTSZ-P.D_NF-2]; + FpGuard = Mf[P.NORMSHIFTSZ-P.D_NF-1]; + FpLsbRes = Mf[P.NORMSHIFTSZ-P.D_NF]; + FpRound = Mf[P.NORMSHIFTSZ-P.D_NF-2]; end 2'h0: begin - FpGuard = Mf[P.CORRSHIFTSZ-P.S_NF-1]; - FpLsbRes = Mf[P.CORRSHIFTSZ-P.S_NF]; - FpRound = Mf[P.CORRSHIFTSZ-P.S_NF-2]; + FpGuard = Mf[P.NORMSHIFTSZ-P.S_NF-1]; + FpLsbRes = Mf[P.NORMSHIFTSZ-P.S_NF]; + FpRound = Mf[P.NORMSHIFTSZ-P.S_NF-2]; end 2'h2: begin - FpGuard = Mf[P.CORRSHIFTSZ-P.H_NF-1]; - FpLsbRes = Mf[P.CORRSHIFTSZ-P.H_NF]; - FpRound = Mf[P.CORRSHIFTSZ-P.H_NF-2]; + FpGuard = Mf[P.NORMSHIFTSZ-P.H_NF-1]; + FpLsbRes = Mf[P.NORMSHIFTSZ-P.H_NF]; + FpRound = Mf[P.NORMSHIFTSZ-P.H_NF-2]; end endcase end - assign Guard = CvtToInt ? Mf[P.CORRSHIFTSZ-P.XLEN-1] : FpGuard; - assign LsbRes = CvtToInt ? Mf[P.CORRSHIFTSZ-P.XLEN] : FpLsbRes; - assign Round = CvtToInt ? Mf[P.CORRSHIFTSZ-P.XLEN-2] : FpRound; + assign Guard = CvtToInt ? Mf[P.NORMSHIFTSZ-P.XLEN-1] : FpGuard; + assign LsbRes = CvtToInt ? Mf[P.NORMSHIFTSZ-P.XLEN] : FpLsbRes; + assign Round = CvtToInt ? 
Mf[P.NORMSHIFTSZ-P.XLEN-2] : FpRound; always_comb begin // Determine if you add 1 @@ -296,7 +296,7 @@ module round import cvw::*; #(parameter cvw_t P) ( assign RoundAdd = {(P.Q_NE+1+P.H_NF)'(0), FpPlus1&(OutFmt==P.H_FMT), (P.S_NF-P.H_NF-1)'(0), FpPlus1&(OutFmt==P.S_FMT), (P.D_NF-P.S_NF-1)'(0), FpPlus1&(OutFmt==P.D_FMT), (P.Q_NF-P.D_NF-1)'(0), FpPlus1&(OutFmt==P.Q_FMT)}; // trim unneeded bits from fraction - assign RoundFrac = Mf[P.CORRSHIFTSZ-1:P.CORRSHIFTSZ-P.NF]; + assign RoundFrac = Mf[P.NORMSHIFTSZ-1:P.NORMSHIFTSZ-P.NF]; // select the exponent always_comb diff --git a/src/fpu/postproc/shiftcorrection.sv b/src/fpu/postproc/shiftcorrection.sv index 85e96c744..ad811a747 100644 --- a/src/fpu/postproc/shiftcorrection.sv +++ b/src/fpu/postproc/shiftcorrection.sv @@ -41,11 +41,11 @@ module shiftcorrection import cvw::*; #(parameter cvw_t P) ( input logic FmaSZero, // output output logic [P.NE+1:0] FmaMe, // exponent of the normalized sum - output logic [P.CORRSHIFTSZ-1:0] Mf, // the shifted sum after correction + output logic [P.NORMSHIFTSZ-1:0] Mf, // the shifted sum after correction output logic [P.NE+1:0] Ue // corrected exponent for divider ); - logic [P.CORRSHIFTSZ-1:0] CorrShifted; // the shifted sum after LZA correction + logic [P.NORMSHIFTSZ-1:0] CorrShifted; // the shifted sum after LZA correction logic ResSubnorm; // is the result Subnormal logic LZAPlus1; // add one or two to the sum's exponent due to LZA correction logic LeftShiftQm; // should the divsqrt result be shifted one to the left @@ -69,12 +69,12 @@ module shiftcorrection import cvw::*; #(parameter cvw_t P) ( assign RightShift = FmaOp ? LZAPlus1 : LeftShiftQm; // one bit right shift for FMA or division - mux2 #(P.NORMSHIFTSZ-2) corrmux(Shifted[P.NORMSHIFTSZ-3:0], Shifted[P.NORMSHIFTSZ-2:1], RightShift, CorrShifted); + mux2 #(P.NORMSHIFTSZ) corrmux({Shifted[P.NORMSHIFTSZ-3:0], 2'b00}, {Shifted[P.NORMSHIFTSZ-2:1], 2'b00}, RightShift, CorrShifted); // if the result of the divider was calculated to be subnormal, then the result was correctly normalized, so select the top shifted bits always_comb if (FmaOp | (DivOp & ~DivResSubnorm)) Mf = CorrShifted; - else Mf = Shifted[P.NORMSHIFTSZ-1:2]; + else Mf = Shifted[P.NORMSHIFTSZ-1:0]; // Determine sum's exponent // main exponent issues: diff --git a/testbench/testbench_fp.sv b/testbench/testbench_fp.sv index 75be5ca1e..f800a9fed 100644 --- a/testbench/testbench_fp.sv +++ b/testbench/testbench_fp.sv @@ -98,8 +98,8 @@ module testbench_fp; logic [P.NE+1:0] Se; logic ASticky; logic KillProd; - logic [$clog2(3*P.NF+7)-1:0] SCnt; - logic [3*P.NF+5:0] Sm; + logic [$clog2(P.FMALEN+1)-1:0] SCnt; + logic [P.FMALEN-1:0] Sm; logic InvA; logic NegSum; logic As; From c649cfba8377258d3b681e53a39f3bde352561e1 Mon Sep 17 00:00:00 2001 From: David Harris Date: Tue, 14 May 2024 10:28:31 -0700 Subject: [PATCH 4/5] Expanded fpcalc to support quad --- examples/fp/fpcalc/Makefile | 8 ++-- examples/fp/fpcalc/fpcalc.c | 94 +++++++++++++++++++++++++++++++++---- 2 files changed, 88 insertions(+), 14 deletions(-) diff --git a/examples/fp/fpcalc/Makefile b/examples/fp/fpcalc/Makefile index 196fdf3d2..e3165231b 100644 --- a/examples/fp/fpcalc/Makefile +++ b/examples/fp/fpcalc/Makefile @@ -2,14 +2,12 @@ CC = gcc CFLAGS = -O3 -Wno-format-overflow -LIBS = -lm -LFLAGS = -L. 
# Link against the riscv-isa-sim version of SoftFloat rather than # the regular version to get RISC-V NaN behavior IFLAGS = -I$(RISCV)/riscv-isa-sim/softfloat -LIBS = $(RISCV)/riscv-isa-sim/build/libsoftfloat.a +LIBS = $(RISCV)/riscv-isa-sim/build/libsoftfloat.a -lm -lquadmath #IFLAGS = -I../../../addins/SoftFloat-3e/source/include/ -#LIBS = ../../../addins/SoftFloat-3e/build/Linux-x86_64-GCC/softfloat.a +#LIBS = ../../../addins/SoftFloat-3e/build/Linux-x86_64-GCC/softfloat.a -lm -lquadmath SRCS = $(wildcard *.c) PROGS = $(patsubst %.c,%,$(SRCS)) @@ -17,7 +15,7 @@ PROGS = $(patsubst %.c,%,$(SRCS)) all: $(PROGS) %: %.c - $(CC) $(CFLAGS) $(IFLAGS) $(LFLAGS) -o $@ $< $(LIBS) + $(CC) $(CFLAGS) -DSOFTFLOAT_FAST_INT64 $(IFLAGS) $(LFLAGS) -o $@ $< $(LIBS) clean: rm -f $(PROGS) diff --git a/examples/fp/fpcalc/fpcalc.c b/examples/fp/fpcalc/fpcalc.c index 94bfc9ac1..5a075b69c 100644 --- a/examples/fp/fpcalc/fpcalc.c +++ b/examples/fp/fpcalc/fpcalc.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include // GCC Quad-Math Library #include "softfloat.h" #include "softfloat_types.h" @@ -26,6 +28,12 @@ typedef union dp { double d; } dp; +typedef union qp { + uint64_t v64[2]; + __uint128_t v; + __float128 q; +} qp; + int opSize = 0; @@ -140,6 +148,27 @@ void printF64(char *msg, float64_t f) { // msg, conv.v, conv.d, sci, exp, fract); } +void printF128 (char *msg, float128_t q) { + qp conv; + //__int128_t v128; + int i, j; + char buf[64]; + //v128 = q.v[1]; + //v128 = v128 << 64 | q.v[0]; // use union to convert between hexadecimal and floating-point views + //conv.v = v128; + conv.v64[0] = q.v[0]; // use union to convert between hexadecimal and floating-point views + conv.v64[1] = q.v[1]; // use union to convert between hexadecimal and floating-point views + printf("%s: ", msg); // print out nicely + + // Some compilers can understand %Q for printf on quad precision instead of the + // API call of quadmath_snprintf + // printf("0x%016" PRIx64 "_%016" PRIx64 " = %1.15Qe\n", q.v[1], q.v[0], conv.q); + quadmath_snprintf (buf, sizeof buf, "%1.15Qe", conv.q); + //printf("0x%032" PRIx12 " = %s\n", q.v, buf); + printf("0x%016" PRIx64 "_%016" PRIx64 " = %s\n", q.v[1], q.v[0], buf); + +} + void printFlags(void) { int NX = softfloat_exceptionFlags % 2; int UF = (softfloat_exceptionFlags >> 1) % 2; @@ -160,14 +189,32 @@ void softfloatInit(void) { softfloat_detectTininess = softfloat_tininess_afterRounding; // RISC-V behavior for tininess } -uint64_t parseNum(char *num) { - uint64_t result; +__uint128_t strtoul128(char *num, int base) { + __uint128_t result = 0; + int i; + for (i=0; i= '0' && num[i] <= '9') result += num[i] - '0'; + else if (num[i] >= 'a' && num[i] <= 'f') result += num[i] - 'a' + 10; + else if (num[i] >= 'A' && num[i] <= 'F') result += num[i] - 'A' + 10; + else { + printf("Error: bad character %c in number %s\n", num[i], num); + exit(1); + } + } + return result; +} + +__uint128_t parseNum(char *num) { +// uint64_t result; + __uint128_t result; int size; // size of operands in bytes (2= half, 4=single, 8 = double) if (strlen(num) < 8) size = 2; else if (strlen(num) < 16) size = 4; - else if (strlen(num) < 19) size = 8; + else if (strlen(num) < 32) size = 8; + else if (strlen(num) < 35) size = 16; // *** will need to increase else { - printf("Error: only half, single, and double precision supported"); + printf("Error: only half, single, double, or quad precision supported"); exit(1); } if (opSize != 0) { @@ -179,7 +226,7 @@ uint64_t parseNum(char *num) { opSize = size; //printf ("Operand size 
is %d\n", opSize); } - result = (uint64_t)strtoul(num, NULL, 16); + result = (__uint128_t)strtoul128(num, 16); //printf("Parsed %s as 0x%lx\n", num, result); return result; } @@ -206,7 +253,8 @@ char parseRound(char *rnd) { int main(int argc, char *argv[]) { - uint64_t xn, yn, zn; + //uint64_t xn, yn, zn; + __uint128_t xn, yn, zn; char op1, op2; char cmd[200]; @@ -217,6 +265,7 @@ int main(int argc, char *argv[]) exit(1); } else { softfloat_roundingMode = softfloat_round_near_even; + //printf("argv[0] = %s arvg[1] = %s argv[2] = %s argv[3] = %s\n", argv[0], argv[1], argv[2], argv[3]); xn = parseNum(argv[1]); yn = parseNum(argv[3]); op1 = parseOp(argv[2]); @@ -241,12 +290,22 @@ int main(int argc, char *argv[]) r = f32_mulAdd(x, y, z); printF32("X", x); printF32("Y", y); printF32("Z", z); printF32("result = X*Y+Z", r); printFlags(); - } else { // opSize = 8 + } else if (opSize == 8) { float64_t x, y, z, r; x.v = xn; y.v = yn; z.v = zn; r = f64_mulAdd(x, y, z); printF64("X", x); printF64("Y", y); printF64("Z", z); printF64("result = X*Y+Z", r); printFlags(); + } else { // opSize = 16 + float128_t x, y, z, r; + qp xc, yc, zc; + xc.v = xn; yc.v = yn; zc.v = zn; + x.v[0] = xc.v64[0]; x.v[1] = xc.v64[1]; + y.v[0] = yc.v64[0]; y.v[1] = yc.v64[1]; + z.v[0] = zc.v64[0]; z.v[1] = zc.v64[1]; + r = f128_mulAdd(x, y, z); + printF128("X", x); printF128("Y", y); printF128("Z", z); + printF128("result = X*Y+Z", r); printFlags(); } } } else { @@ -279,7 +338,7 @@ int main(int argc, char *argv[]) sprintf(cmd, "0x%08x %c 0x%08x", x.v, op1, y.v); printF32(cmd, r); printFlags(); - } else { // opSize = 8 + } else if (opSize == 8) { // opSize = 8 float64_t x, y, r; x.v = xn; y.v = yn; switch (op1) { @@ -293,7 +352,24 @@ int main(int argc, char *argv[]) printF64("X", x); printF64("Y", y); sprintf(cmd, "0x%016lx %c 0x%016lx", x.v, op1, y.v); printF64(cmd, r); printFlags(); - + } else { // opSize = 16 + float128_t x, y, r; + qp xc, yc; + xc.v = xn; yc.v = yn; + x.v[0] = xc.v64[0]; x.v[1] = xc.v64[1]; + y.v[0] = yc.v64[0]; y.v[1] = yc.v64[1]; + //x.v = xn; y.v = yn; + switch (op1) { + case 'x': r = f128_mul(x, y); break; + case '+': r = f128_add(x, y); break; + case '-': r = f128_sub(x, y); break; + case '/': r = f128_div(x, y); break; + case '%': r = f128_rem(x, y); break; + default: printf("Unknown op %c\n", op1); exit(1); + } + printF128("X", x); printF128("Y", y); + //sprintf(cmd, "0x%016lx %c 0x%016lx", x.v, op1, y.v); + printF128("Result", r); printFlags(); } } } From 990d40410bd248f3a87326117c32e897131142be Mon Sep 17 00:00:00 2001 From: David Harris Date: Tue, 14 May 2024 11:11:24 -0700 Subject: [PATCH 5/5] Test using fpcalc for fp_dataset.py --- examples/fp/fpcalc/fpcalc.c | 21 ++++++++++- tests/fp/quad/fp_dataset.py | 69 +++++++++++++++++++++++++---------- tests/fp/quad/fpdatasetgen.py | 6 ++- 3 files changed, 74 insertions(+), 22 deletions(-) mode change 100644 => 100755 tests/fp/quad/fp_dataset.py mode change 100644 => 100755 tests/fp/quad/fpdatasetgen.py diff --git a/examples/fp/fpcalc/fpcalc.c b/examples/fp/fpcalc/fpcalc.c index 5a075b69c..8264b1442 100644 --- a/examples/fp/fpcalc/fpcalc.c +++ b/examples/fp/fpcalc/fpcalc.c @@ -169,6 +169,24 @@ void printF128 (char *msg, float128_t q) { } +void printF128val(float128_t q) { + qp conv; + //__int128_t v128; + int i, j; + char buf[64]; + //v128 = q.v[1]; + //v128 = v128 << 64 | q.v[0]; // use union to convert between hexadecimal and floating-point views + //conv.v = v128; + conv.v64[0] = q.v[0]; // use union to convert between hexadecimal and floating-point 
views + conv.v64[1] = q.v[1]; // use union to convert between hexadecimal and floating-point views + + // Some compilers can understand %Q for printf on quad precision instead of the + // API call of quadmath_snprintf + // printf("0x%016" PRIx64 "_%016" PRIx64 " = %1.15Qe\n", q.v[1], q.v[0], conv.q); + //quadmath_snprintf (buf, sizeof buf, "%1.15Qe", conv.q); + printf("%016" PRIx64 "%016" PRIx64 "\n", q.v[1], q.v[0]); +} + void printFlags(void) { int NX = softfloat_exceptionFlags % 2; int UF = (softfloat_exceptionFlags >> 1) % 2; @@ -369,7 +387,8 @@ int main(int argc, char *argv[]) } printF128("X", x); printF128("Y", y); //sprintf(cmd, "0x%016lx %c 0x%016lx", x.v, op1, y.v); - printF128("Result", r); printFlags(); + printF128(cmd, r); printFlags(); + printF128val(r); } } } diff --git a/tests/fp/quad/fp_dataset.py b/tests/fp/quad/fp_dataset.py old mode 100644 new mode 100755 index 0b0bd32ca..d095d58f7 --- a/tests/fp/quad/fp_dataset.py +++ b/tests/fp/quad/fp_dataset.py @@ -5,6 +5,7 @@ import random import sys import math from decimal import * +import os sys.set_int_max_str_digits(10000) fzero = ['0x00000000', '0x80000000'] @@ -347,6 +348,35 @@ def comments_parser(coverpoints): cvpts.append((cvpt+ " #nosat",comment)) return cvpts +def softfloat_sub(a, b): + cmd = "$WALLY/examples/fp/fpcalc/fpcalc " + a + " - " + b + result = os.system(cmd) + print("cmd = ", cmd, "returns result = ", result) + return result + +# rs1, rs3, result are hexadecimal strings +def gen_rs2(iflen,opcode,rs1,rs3,result): + if opcode in 'fadd': + rs2 = softfloat_sub(result, rs1) + elif opcode in 'fsub': + rs2 = rs1 - fields_dec_converter(iflen,result[i][0]) + elif opcode in 'fmul': + rs2 = fields_dec_converter(iflen,result[i][0])/rs1 + elif opcode in 'fdiv': + if fields_dec_converter(iflen,result[i][0]) != 0: + rs2 = rs1/fields_dec_converter(iflen,result[i][0]) + elif opcode in 'fsqrt': + rs2 = fields_dec_converter(iflen,result[i][0])*fields_dec_converter(iflen,result[i][0]) + elif opcode in 'fmadd': + rs2 = (fields_dec_converter(iflen,result[i][0]) - rs3)/rs1 + elif opcode in 'fnmadd': + rs2 = (rs3 - fields_dec_converter(iflen,result[i][0]))/rs1 + elif opcode in 'fmsub': + rs2 = (fields_dec_converter(iflen,result[i][0]) + rs3)/rs1 + elif opcode in 'fnmsub': + rs2 = -1*(rs3 + fields_dec_converter(iflen,result[i][0]))/rs1 + return rs2 + def ibm_b1(flen, iflen, opcode, ops): ''' IBM Model B1 Definition: @@ -525,25 +555,26 @@ def ibm_b2(flen, iflen, opcode, ops, int_val = 100, seed = -1): #***Quad support rs1 = fields_dec_converter(iflen,'0x'+hex(int('1'+rs1_bin[2:],2))[3:]) #print(rs1) rs3 = fields_dec_converter(iflen,'0x'+hex(int('1'+rs3_bin[2:],2))[3:]) - if opcode in 'fadd': - rs2 = fields_dec_converter(iflen,result[i][0]) - rs1 - elif opcode in 'fsub': - rs2 = rs1 - fields_dec_converter(iflen,result[i][0]) - elif opcode in 'fmul': - rs2 = fields_dec_converter(iflen,result[i][0])/rs1 - elif opcode in 'fdiv': - if fields_dec_converter(iflen,result[i][0]) != 0: - rs2 = rs1/fields_dec_converter(iflen,result[i][0]) - elif opcode in 'fsqrt': - rs2 = fields_dec_converter(iflen,result[i][0])*fields_dec_converter(iflen,result[i][0]) - elif opcode in 'fmadd': - rs2 = (fields_dec_converter(iflen,result[i][0]) - rs3)/rs1 - elif opcode in 'fnmadd': - rs2 = (rs3 - fields_dec_converter(iflen,result[i][0]))/rs1 - elif opcode in 'fmsub': - rs2 = (fields_dec_converter(iflen,result[i][0]) + rs3)/rs1 - elif opcode in 'fnmsub': - rs2 = -1*(rs3 + fields_dec_converter(iflen,result[i][0]))/rs1 + rs2 = 
gen_rs2(iflen,opcode,"3FFF8000000000000000000000000000","3FFF4000000000000000000000000000","3FFF8800000000000000000000000000") + # if opcode in 'fadd': + # rs2 = fields_dec_converter(iflen,result[i][0]) - rs1 + # elif opcode in 'fsub': + # rs2 = rs1 - fields_dec_converter(iflen,result[i][0]) + # elif opcode in 'fmul': + # rs2 = fields_dec_converter(iflen,result[i][0])/rs1 + # elif opcode in 'fdiv': + # if fields_dec_converter(iflen,result[i][0]) != 0: + # rs2 = rs1/fields_dec_converter(iflen,result[i][0]) + # elif opcode in 'fsqrt': + # rs2 = fields_dec_converter(iflen,result[i][0])*fields_dec_converter(iflen,result[i][0]) + # elif opcode in 'fmadd': + # rs2 = (fields_dec_converter(iflen,result[i][0]) - rs3)/rs1 + # elif opcode in 'fnmadd': + # rs2 = (rs3 - fields_dec_converter(iflen,result[i][0]))/rs1 + # elif opcode in 'fmsub': + # rs2 = (fields_dec_converter(iflen,result[i][0]) + rs3)/rs1 + # elif opcode in 'fnmsub': + # rs2 = -1*(rs3 + fields_dec_converter(iflen,result[i][0]))/rs1 if(iflen==32): m = struct.unpack('f', struct.pack('f', rs2))[0] diff --git a/tests/fp/quad/fpdatasetgen.py b/tests/fp/quad/fpdatasetgen.py old mode 100644 new mode 100755 index 3d49e7082..6208b1ba3 --- a/tests/fp/quad/fpdatasetgen.py +++ b/tests/fp/quad/fpdatasetgen.py @@ -1,7 +1,9 @@ +#!/usr/bin/python + from fp_dataset import * #coverpoints=ibm_b1(128, 128, 'fadd.q', 2) #ibm_b1(flen, iflen, opcode, ops) -#coverpoints=ibm_b2(128,128,'fadd.q',2) #ibm_b2(flen, iflen, opcode, ops, int_val = 100, seed = -1) -coverpoints=ibm_b2(32,32,'fadd.s',2) #ibm_b2(flen, iflen, opcode, ops,seed = -1) +coverpoints=ibm_b2(128,128,'fadd.q',2) #ibm_b2(flen, iflen, opcode, ops, int_val = 100, seed = -1) +#coverpoints=ibm_b2(32,32,'fadd.s',2) #ibm_b2(flen, iflen, opcode, ops,seed = -1) #print(coverpoints) #quad_precision_hex = "0x3ff00000000000000000000000000001" # Example quad precision hexadecimal value #quad_precision_dec = fields_dec_converter(128, quad_precision_hex)
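
A note on the width changes in patches 2-3: the FMALEN cleanup is a pure renaming, since every vector width the diffs touch is an affine function of NF, so substituting FMALEN = 3*NF + 6 cannot change any size. A quick sanity check of the identities the diffs rely on, over all four supported fraction widths (a sketch, independent of any particular Wally config):

    # Check that each FMALEN-based width in patches 2-3 equals the
    # 3*NF+k expression it replaces (half/single/double/quad NF values).
    def clog2(x):                            # SystemVerilog $clog2 for x >= 1
        return (x - 1).bit_length()

    for NF in (10, 23, 52, 112):
        FMALEN = 3 * NF + 6                  # config-shared.vh
        assert FMALEN - 1 == 3 * NF + 5      # Sm/Am/AmInv msb (fma.sv)
        assert FMALEN + 2 == 3 * NF + 8      # FMA term of NORMSHIFTSZ, FmaShiftIn msb+1
        assert FMALEN + NF - 1 == 4 * NF + 5 # ZmShifted msb (fmaalign.sv)
        assert clog2(FMALEN + 1) == clog2(3 * NF + 7)  # SCnt width (fma.sv, fpu.sv)
        assert clog2(FMALEN - 1) == clog2(3 * NF + 5)  # FmaShiftAmt slices (fmashiftcalc.sv)
        # widening FmaShiftAmt's declaration from clog2(3*NF+5) to
        # clog2(FMALEN+1) bits is also a no-op for these NF values:
        assert clog2(FMALEN - 1) == clog2(FMALEN + 1)
    print("all FMALEN width identities hold")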
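
Patch 3 also retires CORRSHIFTSZ (= NORMSHIFTSZ-2) by making Mf a full NORMSHIFTSZ-bit vector: shiftcorrection's corrmux now pads two zero bits below CorrShifted, and every Mf index in round.sv moves from CORRSHIFTSZ-k to NORMSHIFTSZ-k, i.e. up by exactly the two padded bits. The LSB, guard, and round selections therefore land on the same bits as before, and the two extra zeros only ever feed a sticky OR. A toy model of that bookkeeping for the single-format (FPSIZES == 1) case (a sketch with made-up widths, not the RTL):

    import random

    NORMSHIFTSZ = 20          # toy width; the real one comes from config-shared.vh
    CORRSHIFTSZ = NORMSHIFTSZ - 2
    NF = 7                    # toy fraction width

    def bit(v, i):
        return (v >> i) & 1

    random.seed(1)
    for _ in range(1000):
        old_mf = random.getrandbits(CORRSHIFTSZ)  # pre-patch Mf
        new_mf = old_mf << 2                      # post-patch Mf: two zero LSBs
        for k in (NF - 1, NF, NF + 1):            # LSB, guard, round as in round.sv
            assert bit(old_mf, CORRSHIFTSZ - k - 1) == bit(new_mf, NORMSHIFTSZ - k - 1)
        # sticky = OR of everything below the round bit; the padded zeros vanish
        old_sticky = (old_mf & ((1 << (CORRSHIFTSZ - NF - 2)) - 1)) != 0
        new_sticky = (new_mf & ((1 << (NORMSHIFTSZ - NF - 2)) - 1)) != 0
        assert old_sticky == new_sticky
    print("CORRSHIFTSZ -> NORMSHIFTSZ re-indexing preserves L/G/R/T")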
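
Patch 4 extends fpcalc to quad by carrying operands as __uint128_t: strtoul128() folds up to 32 hex digits into one value, and the qp union exposes it as the two 64-bit halves that SoftFloat's float128_t expects, low word in v[0]. The same bookkeeping in Python, handy for cross-checking operand encodings (a sketch; the two constants are the quad values 1.5 and 1.25 hard-coded in patch 5's gen_rs2 test call):

    # Mirror strtoul128() plus the qp union split used by fpcalc.c:
    # float128_t.v[0] gets the low 64 bits, v[1] the high 64 bits.
    def split128(hexstr):
        v = int(hexstr, 16)                      # what strtoul128(num, 16) computes
        assert v < 1 << 128
        return v & 0xFFFFFFFFFFFFFFFF, v >> 64   # (v64[0], v64[1])

    lo, hi = split128("3FFF8000000000000000000000000000")   # quad 1.5
    assert (hi, lo) == (0x3FFF800000000000, 0)
    lo, hi = split128("3FFF4000000000000000000000000000")   # quad 1.25
    assert (hi, lo) == (0x3FFF400000000000, 0)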
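
Finally, patch 5's softfloat_sub() builds the right fpcalc command line but captures it with os.system(), which returns the process exit status rather than anything printed; the bare-hex printF128val() line added to fpcalc.c looks like the intended machine-readable channel, since it comes out last on stdout. One way gen_rs2 could actually read it (a sketch: softfloat_op and its output parsing are hypothetical, not part of the patch, and only the fadd branch is wired up, matching the patch's test call):

    import os
    import subprocess

    def softfloat_op(a, op, b):
        # a, b: quad operands as hex strings; op: '+', '-', 'x', '/', or '%'
        fpcalc = os.path.expandvars("$WALLY/examples/fp/fpcalc/fpcalc")
        r = subprocess.run([fpcalc, a, op, b],
                           capture_output=True, text=True, check=True)
        return r.stdout.strip().splitlines()[-1]  # bare hex from printF128val()

    def gen_rs2(iflen, opcode, rs1, rs3, result):
        if opcode in 'fadd':                      # rs2 = result - rs1, in quad
            return softfloat_op(result, '-', rs1)
        raise NotImplementedError("remaining branches still use the "
                                  "commented-out decimal math")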