From 1bd6351e1fd4b0f2ad0e248cd7999d2e098ba189 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Fri, 29 Jul 2022 22:54:49 +0000
Subject: [PATCH 01/16] re-added FStore2 in Cache

---
 pipelined/src/cache/cache.sv | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/pipelined/src/cache/cache.sv b/pipelined/src/cache/cache.sv
index d28697e21..609810e8c 100644
--- a/pipelined/src/cache/cache.sv
+++ b/pipelined/src/cache/cache.sv
@@ -162,12 +162,18 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTER
   logic [LINELEN-1:0] FinalWriteDataDup;
   assign FinalWriteDataDup = {WORDSPERLINE{FinalWriteData}};
 
-  onehotdecoder #(LOGWPL) adrdec(
-    .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecoded));
+  if(`LLEN>`XLEN)begin 
+    logic [2**LOGWPL-1:0] MemPAdrDecodedtmp;
+    onehotdecoder #(LOGWPL) adrdec(
+      .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecodedtmp));
+    assign MemPAdrDecoded = MemPAdrDecodedtmp|{MemPAdrDecodedtmp[2**LOGWPL-2:0]&{2**LOGWPL-1{FStore2}}, 1'b0};
+  end else
+    onehotdecoder #(LOGWPL) adrdec(
+      .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecoded));
   for(index = 0; index < 2**LOGWPL; index++) begin
     assign DemuxedByteMask[(index+1)*(`XLEN/8)-1:index*(`XLEN/8)] = MemPAdrDecoded[index] ? ByteMask : '0;
   end
-  // *** have to add back in fstore2
+  
   assign LineByteMux = SetValid & ~SetDirty ? '1 : ~DemuxedByteMask;  // If load miss set all muxes to 1.
   assign LineByteMask = ~SetValid & ~SetDirty ? '0 : ~SetValid & SetDirty ? DemuxedByteMask : '1; // if store hit only enable the word and subword bytes, else write all bytes.
 

From 257107f908519683bdadd5c9faf8e9b26c0957ee Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 18:07:38 +0000
Subject: [PATCH 02/16] Partitioned fma into separate files

---
 pipelined/regression/sim-wally-batch |   2 +-
 pipelined/src/fpu/fma.sv             | 231 +--------------------------
 pipelined/src/fpu/fmaadd.sv          |  83 ++++++++++
 pipelined/src/fpu/fmaalign.sv        | 101 ++++++++++++
 pipelined/src/fpu/fmaexpadd.sv       |  42 +++++
 pipelined/src/fpu/fmalza.sv          |  62 +++++++
 pipelined/src/fpu/fmamult.sv         |  38 +++++
 pipelined/src/fpu/fmasign.sv         |  47 ++++++
 pipelined/testbench/tests.vh         |   3 +-
 9 files changed, 384 insertions(+), 225 deletions(-)
 create mode 100644 pipelined/src/fpu/fmaadd.sv
 create mode 100644 pipelined/src/fpu/fmaalign.sv
 create mode 100644 pipelined/src/fpu/fmaexpadd.sv
 create mode 100644 pipelined/src/fpu/fmalza.sv
 create mode 100644 pipelined/src/fpu/fmamult.sv
 create mode 100644 pipelined/src/fpu/fmasign.sv

diff --git a/pipelined/regression/sim-wally-batch b/pipelined/regression/sim-wally-batch
index 8b5b5d628..7afcadb2e 100755
--- a/pipelined/regression/sim-wally-batch
+++ b/pipelined/regression/sim-wally-batch
@@ -1 +1 @@
-vsim -c -do "do wally-pipelined-batch.do rv32gc wally32periph"
+vsim -c -do "do wally-pipelined-batch.do rv64gc arch64d"
diff --git a/pipelined/src/fpu/fma.sv b/pipelined/src/fpu/fma.sv
index 067147ee6..fcf209f6b 100644
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 //
-// Written: me@KatherineParry.com, David Harris
-// Modified: 6/23/2021
+// Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
+// Modified: 
 //
 // Purpose: Floating point multiply-accumulate of configurable size
 // 
@@ -63,18 +63,18 @@ module fma(
    
 
    // calculate the product's exponent 
-    expadd expadd(.Fmt, .Xe, .Ye, .XZero, .YZero, .Pe);
+    fmaexpadd expadd(.Fmt, .Xe, .Ye, .XZero, .YZero, .Pe);
 
     // multiplication of the mantissa's
-    mult mult(.Xm, .Ym, .Pm);
+    fmamult mult(.Xm, .Ym, .Pm);
    
     ///////////////////////////////////////////////////////////////////////////////
     // Alignment shifter
     ///////////////////////////////////////////////////////////////////////////////
     // calculate the signs and take the opperation into account
-    sign sign(.OpCtrl, .Xs, .Ys, .Zs, .Ps, .As);
+    fmasign sign(.OpCtrl, .Xs, .Ys, .Zs, .Ps, .As);
 
-    align align(.Ze, .Zm, .XZero, .YZero, .ZZero, .Xe, .Ye,
+    fmaalign align(.Ze, .Zm, .XZero, .YZero, .ZZero, .Xe, .Ye,
                 .Am, .ZmSticky, .KillProd);
                         
 
@@ -83,223 +83,8 @@ module fma(
     // // Addition/LZA
     // ///////////////////////////////////////////////////////////////////////////////
         
-    add add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
+    fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
     
-    loa loa(.A(AmInv+{(3*`NF+6)'(0),InvA&~((ZmSticky&~KillProd))}), .P({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .SCnt);
+    fmalza lza(.A(AmInv+{(3*`NF+6)'(0),InvA&~((ZmSticky&~KillProd))}), .P({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .SCnt);
 endmodule
 
-
-module expadd(    
-    input  logic [`FMTBITS-1:0] Fmt,          // format of the output: single double half quad
-    input  logic [`NE-1:0]      Xe, Ye,  // input's exponents
-    input  logic                XZero, YZero,        // are the inputs zero
-    output logic [`NE+1:0]      Pe       // product's exponent B^(1023)NE+2
-);
-
-    // kill the exponent if the product is zero - either X or Y is 0
-    assign Pe = ({2'b0, Xe} + {2'b0, Ye} - {2'b0, (`NE)'(`BIAS)})&{`NE+2{~(XZero|YZero)}};
-
-endmodule
-
-
-
-
-
-module mult(
-    input logic [`NF:0] Xm, Ym,
-    output logic [2*`NF+1:0] Pm
-);
-    assign Pm = Xm * Ym;
-endmodule
-
-
-
-
-
-
-
-
-module sign(    
-    input  logic [2:0]  OpCtrl,               // opperation contol
-    input  logic        Xs, Ys, Zs,    // sign of the inputs
-    output logic        Ps,     // the product's sign - takes opperation into account
-    output logic        As   // aligned addend sign used in fma - takes opperation into account
-);
-
-    // Calculate the product's sign
-    //      Negate product's sign if FNMADD or FNMSUB
-    
-    // flip is negation opperation
-    assign Ps = Xs ^ Ys ^ (OpCtrl[1]&~OpCtrl[2]);
-    // flip if subtraction
-    assign As = Zs^OpCtrl[0];
-
-endmodule
-
-
-
-
-
-
-
-
-module align(
-    input logic  [`NE-1:0]      Xe, Ye, Ze,      // biased exponents in B(NE.0) format
-    input logic  [`NF:0]        Zm,      // significand in U(0.NF) format]
-    input logic                 XZero, YZero, ZZero, // is the input zero
-    output logic [3*`NF+5:0]    Am, // addend aligned for addition in U(NF+5.2NF+1)
-    output logic                ZmSticky,  // Sticky bit calculated from the aliged addend
-    output logic                KillProd       // should the product be set to zero
-);
-
-    logic [`NE+1:0]     ACnt;           // how far to shift the addend to align with the product in Q(NE+2.0) format
-    logic [4*`NF+5:0]   ZmShifted;        // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
-    logic [4*`NF+5:0]   ZmPreshifted;     // input to the alignment shifter U(NF+5.3NF+1)
-    logic KillZ;
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Alignment shifter
-    ///////////////////////////////////////////////////////////////////////////////
-
-    // determine the shift count for alignment
-    //      - negitive means Z is larger, so shift Z left
-    //      - positive means the product is larger, so shift Z right
-    // This could have been done using Pe, but ACnt is on the critical path so we replicate logic for speed
-    assign ACnt = {2'b0, Xe} + {2'b0, Ye} - {2'b0, (`NE)'(`BIAS)} + (`NE+2)'(`NF+3) - {2'b0, Ze};
-
-    // Defualt Addition without shifting
-    //          |   54'b0    |  106'b(product)  | 2'b0 |
-    //          | addnend |
-
-    // the 1'b0 before the added is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...)
-    assign ZmPreshifted = {Zm,(3*`NF+5)'(0)};
-    
-    assign KillProd = (ACnt[`NE+1]&~ZZero)|XZero|YZero;
-    assign KillZ = $signed(ACnt)>$signed((`NE+2)'(3)*(`NE+2)'(`NF)+(`NE+2)'(5));
-
-    always_comb
-        begin
-        
-        // If the product is too small to effect the sum, kill the product
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //  | addnend |
-        if (KillProd) begin
-            ZmShifted = {(`NF+3)'(0), Zm, (2*`NF+2)'(0)};
-            ZmSticky = ~(XZero|YZero);
-
-        // If the addend is too small to effect the addition        
-        //      - The addend has to shift two past the end of the addend to be considered too small
-        //      - The 2 extra bits are needed for rounding
-
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                                                      | addnend |
-        end else if (KillZ)  begin
-            ZmShifted = 0;
-            ZmSticky = ~ZZero;
-
-        // If the Addend is shifted right
-        //          |   54'b0    |  106'b(product)  | 2'b0 |
-        //                                  | addnend |
-        end else begin
-            ZmShifted = ZmPreshifted >> ACnt;
-            ZmSticky = |(ZmShifted[`NF-1:0]);
-
-        end
-    end
-
-    assign Am = ZmShifted[4*`NF+5:`NF];
-
-endmodule
-
-
-
-
-
-
-
-module add(
-    input logic  [3*`NF+5:0]    Am, // aligned addend's mantissa for addition in U(NF+5.2NF+1)
-    input logic  [2*`NF+1:0]    Pm,       // the product's mantissa
-    input logic                 Ps, As,// the product sign and the alligend addeded's sign (Modified Z sign for other opperations)
-    input logic                 KillProd,      // should the product be set to 0
-    input logic                 ZmSticky,
-    input logic  [`NE-1:0]      Ze,
-    input logic  [`NE+1:0]      Pe,
-    output logic [3*`NF+6:0]    AmInv,  // aligned addend possibly inverted
-    output logic [2*`NF+1:0]    PmKilled,     // the product's mantissa possibly killed
-    output logic                NegSum,        // was the sum negitive
-    output logic                InvA,          // do you invert the aligned addend
-    output logic                Ss,          
-    output logic [`NE+1:0]      Se,
-    output logic [3*`NF+5:0]    Sm           // the positive sum
-);
-    logic [3*`NF+6:0]    PreSum, NegPreSum; // possibly negitive sum
-
-    ///////////////////////////////////////////////////////////////////////////////
-    // Addition
-    ///////////////////////////////////////////////////////////////////////////////
-   
-    // Negate Z  when doing one of the following opperations:
-    //      -prod +  Z
-    //       prod -  Z
-    assign InvA = As ^ Ps;
-
-    // Choose an inverted or non-inverted addend - the one has to be added now for the LZA
-    assign AmInv = InvA ? {1'b1, ~Am} : {1'b0, Am};
-    // Kill the product if the product is too small to effect the addition (determined in fma1.sv)
-    assign PmKilled = Pm&{2*`NF+2{~KillProd}};
-    // Do the addition
-    //      - calculate a positive and negitive sum in parallel
-    //              Zsticky             Psticky
-    // PreSum    -1 = don't add 1     +1 = add 2
-    // NegPreSum +1 = add 2           -1 = don't add 1
-    // for NegPreSum the product is set to -1 whenever the product is killed, therefore add 1, 2 or 0
-    assign PreSum = {{`NF+3{1'b0}}, PmKilled, 1'b0, InvA&ZmSticky&KillProd} + AmInv + {{3*`NF+6{1'b0}}, InvA&~((ZmSticky&~KillProd))};
-    assign NegPreSum = {1'b0, Am} + {{`NF+3{1'b1}}, ~PmKilled, 2'b11} + {(3*`NF+5)'(0), ZmSticky&~KillProd, ~(ZmSticky)};
-     
-    // Is the sum negitive
-    assign NegSum = PreSum[3*`NF+6];
-
-    // Choose the positive sum and accompanying LZA result.
-    assign Sm = NegSum ? NegPreSum[3*`NF+5:0] : PreSum[3*`NF+5:0];
-    // is the result negitive
-    //  if p - z is the Sum negitive
-    //  if -p + z is the Sum positive
-    //  if -p - z then the Sum is negitive
-    assign Ss = NegSum^Ps; //*** move to execute stage
-    assign Se = KillProd ? {2'b0, Ze} : Pe;
-endmodule
-
-
-module loa( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
-    input logic  [3*`NF+6:0] A,     // addend
-    input logic  [2*`NF+3:0] P,     // product
-    output logic [$clog2(3*`NF+7)-1:0]       SCnt   // normalization shift count for the positive result
-    ); 
-    
-    logic [3*`NF+6:0] T;
-    logic [3*`NF+6:0] G;
-    logic [3*`NF+6:0] Z;
-    logic [3*`NF+6:0] f;
-
-    assign T[3*`NF+6:2*`NF+4] = A[3*`NF+6:2*`NF+4];
-    assign G[3*`NF+6:2*`NF+4] = 0;
-    assign Z[3*`NF+6:2*`NF+4] = ~A[3*`NF+6:2*`NF+4];
-    assign T[2*`NF+3:0] = A[2*`NF+3:0]^P;
-    assign G[2*`NF+3:0] = A[2*`NF+3:0]&P;
-    assign Z[2*`NF+3:0] = ~A[2*`NF+3:0]&~P;
-
-
-    // Apply function to determine Leading pattern
-    //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
-    //f[n] = ~T[n]&T[n-1]           note: n is the MSB
-    //f[i] = (T[i+1]&(G[i]&~Z[i-1] | Z[i]&~G[i-1])) | (~T[i+1]&(Z[i]&~Z[i-1] | G[i]&~G[i-1]))
-    assign f[3*`NF+6] = ~T[3*`NF+6]&T[3*`NF+5];
-    assign f[3*`NF+5:0] = (T[3*`NF+6:1]&(G[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | Z[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~T[3*`NF+6:1]&(Z[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));
-
-
-
-    lzc #(3*`NF+7) lzc (.num(f), .ZeroCnt(SCnt));
-  
-endmodule
diff --git a/pipelined/src/fpu/fmaadd.sv b/pipelined/src/fpu/fmaadd.sv
new file mode 100644
index 000000000..4b52208c6
--- /dev/null
+++ b/pipelined/src/fpu/fmaadd.sv
@@ -0,0 +1,83 @@
+///////////////////////////////////////////
+//
+// Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
+// Modified: 
+//
+// Purpose: FMA significand adder
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fmaadd(                     // FMA significand adder: sums the aligned addend and the (possibly killed) product
+    input logic  [3*`NF+5:0]    Am, // aligned addend's mantissa for addition in U(NF+5.2NF+1)
+    input logic  [2*`NF+1:0]    Pm,       // the product's mantissa
+    input logic                 Ps, As,// the product's sign and the aligned addend's sign (modified Z sign for other operations)
+    input logic                 KillProd,      // should the product be set to 0
+    input logic                 ZmSticky,      // sticky bit accompanying the aligned addend
+    input logic  [`NE-1:0]      Ze,            // exponent selected as Se when the product is killed
+    input logic  [`NE+1:0]      Pe,            // exponent selected as Se otherwise
+    output logic [3*`NF+6:0]    AmInv,  // aligned addend possibly inverted
+    output logic [2*`NF+1:0]    PmKilled,     // the product's mantissa possibly killed
+    output logic                NegSum,        // was the sum negative
+    output logic                InvA,          // do you invert the aligned addend
+    output logic                Ss,            // sign of the sum: NegSum ^ Ps
+    output logic [`NE+1:0]      Se,            // exponent of the sum: zero-extended Ze if KillProd, else Pe
+    output logic [3*`NF+5:0]    Sm           // the positive sum
+);
+    logic [3*`NF+6:0]    PreSum, NegPreSum; // possibly negative sum
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Addition
+    ///////////////////////////////////////////////////////////////////////////////
+   
+    // Negate Z when doing one of the following operations:
+    //      -prod +  Z
+    //       prod -  Z
+    assign InvA = As ^ Ps;
+
+    // Choose an inverted or non-inverted addend - the one has to be added now for the LZA
+    assign AmInv = InvA ? {1'b1, ~Am} : {1'b0, Am};
+    // Kill the product if it is too small to affect the addition (KillProd is an input; fma.sv wires it from fmaalign)
+    assign PmKilled = Pm&{2*`NF+2{~KillProd}};
+    // Do the addition
+    //      - calculate a positive and negative sum in parallel
+    //              Zsticky             Psticky
+    // PreSum    -1 = don't add 1     +1 = add 2
+    // NegPreSum +1 = add 2           -1 = don't add 1
+    // for NegPreSum the product is set to -1 whenever the product is killed, therefore add 1, 2 or 0
+    assign PreSum = {{`NF+3{1'b0}}, PmKilled, 1'b0, InvA&ZmSticky&KillProd} + AmInv + {{3*`NF+6{1'b0}}, InvA&~((ZmSticky&~KillProd))};
+    assign NegPreSum = {1'b0, Am} + {{`NF+3{1'b1}}, ~PmKilled, 2'b11} + {(3*`NF+5)'(0), ZmSticky&~KillProd, ~(ZmSticky)};
+     
+    // Is the sum negative (top bit of PreSum)
+    assign NegSum = PreSum[3*`NF+6];
+
+    // Choose the positive sum: NegPreSum when NegSum is set, otherwise PreSum (top bit dropped in both cases).
+    assign Sm = NegSum ? NegPreSum[3*`NF+5:0] : PreSum[3*`NF+5:0];
+    // is the result negative
+    //  if p - z is the Sum negative
+    //  if -p + z is the Sum positive
+    //  if -p - z then the Sum is negative
+    assign Ss = NegSum^Ps; //*** move to execute stage
+    assign Se = KillProd ? {2'b0, Ze} : Pe;
+endmodule
diff --git a/pipelined/src/fpu/fmaalign.sv b/pipelined/src/fpu/fmaalign.sv
new file mode 100644
index 000000000..f7c849993
--- /dev/null
+++ b/pipelined/src/fpu/fmaalign.sv
@@ -0,0 +1,101 @@
+
+///////////////////////////////////////////
+//
+// Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
+// Modified: 
+//
+// Purpose: FMA alignment shift
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fmaalign(                   // FMA alignment shifter: positions the addend Zm relative to the product and derives sticky/kill flags
+    input logic  [`NE-1:0]      Xe, Ye, Ze,      // biased exponents in B(NE.0) format
+    input logic  [`NF:0]        Zm,      // significand in U(0.NF) format
+    input logic                 XZero, YZero, ZZero, // is the input zero
+    output logic [3*`NF+5:0]    Am, // addend aligned for addition in U(NF+5.2NF+1)
+    output logic                ZmSticky,  // Sticky bit calculated from the aligned addend
+    output logic                KillProd       // should the product be set to zero
+);
+
+    logic [`NE+1:0]     ACnt;           // how far to shift the addend to align with the product in Q(NE+2.0) format
+    logic [4*`NF+5:0]   ZmShifted;        // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
+    logic [4*`NF+5:0]   ZmPreshifted;     // input to the alignment shifter U(NF+5.3NF+1)
+    logic KillZ;                          // set when signed ACnt exceeds 3*NF+5, i.e. the addend is shifted out entirely
+
+    ///////////////////////////////////////////////////////////////////////////////
+    // Alignment shifter
+    ///////////////////////////////////////////////////////////////////////////////
+
+    // determine the shift count for alignment
+    //      - negative means Z is larger; the product is then killed (KillProd) and Z is placed at a fixed position
+    //      - positive means the product is larger, so shift Z right
+    // This could have been done using Pe, but ACnt is on the critical path so we replicate logic for speed
+    assign ACnt = {2'b0, Xe} + {2'b0, Ye} - {2'b0, (`NE)'(`BIAS)} + (`NE+2)'(`NF+3) - {2'b0, Ze};
+
+    // Default Addition without shifting
+    //          |   54'b0    |  106'b(product)  | 2'b0 |
+    //          | addend |
+
+    // the 1'b0 before the addend is because the product's mantissa has two bits before the binary point (xx.xxxxxxxxxx...) -- NOTE(review): no 1'b0 appears in the concatenation below; confirm this comment is still current
+    assign ZmPreshifted = {Zm,(3*`NF+5)'(0)};
+    
+    assign KillProd = (ACnt[`NE+1]&~ZZero)|XZero|YZero;
+    assign KillZ = $signed(ACnt)>$signed((`NE+2)'(3)*(`NE+2)'(`NF)+(`NE+2)'(5));
+
+    always_comb
+        begin
+        
+        // If the product is too small to affect the sum, kill the product
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //  | addend |
+        if (KillProd) begin
+            ZmShifted = {(`NF+3)'(0), Zm, (2*`NF+2)'(0)};
+            ZmSticky = ~(XZero|YZero);   // sticky is set when the killed product is nonzero (neither X nor Y is zero)
+
+        // If the addend is too small to affect the addition        
+        //      - The addend has to shift two past the end of the addend to be considered too small
+        //      - The 2 extra bits are needed for rounding
+
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                                                      | addend |
+        end else if (KillZ)  begin
+            ZmShifted = 0;
+            ZmSticky = ~ZZero;           // sticky is set when the killed addend is nonzero
+
+        // If the Addend is shifted right
+        //          |   54'b0    |  106'b(product)  | 2'b0 |
+        //                                  | addend |
+        end else begin
+            ZmShifted = ZmPreshifted >> ACnt;
+            ZmSticky = |(ZmShifted[`NF-1:0]);   // OR of the low NF bits that are dropped from Am below
+
+        end
+    end
+
+    assign Am = ZmShifted[4*`NF+5:`NF];   // keep the upper 3*NF+6 bits; the low NF bits only feed the sticky
+
+endmodule
+
diff --git a/pipelined/src/fpu/fmaexpadd.sv b/pipelined/src/fpu/fmaexpadd.sv
new file mode 100644
index 000000000..1d208327b
--- /dev/null
+++ b/pipelined/src/fpu/fmaexpadd.sv
@@ -0,0 +1,42 @@
+///////////////////////////////////////////
+//
+// Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
+// Modified: 
+//
+// Purpose: FMA exponent addition
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fmaexpadd(                  // FMA exponent adder: Pe = Xe + Ye - BIAS, forced to zero when X or Y is zero
+    input  logic [`FMTBITS-1:0] Fmt,          // format of the output: single double half quad (not read in this module body)
+    input  logic [`NE-1:0]      Xe, Ye,  // inputs' exponents
+    input  logic                XZero, YZero,        // are the inputs zero
+    output logic [`NE+1:0]      Pe       // product's exponent B^(1023)NE+2
+);
+
+    // kill the exponent if the product is zero - either X or Y is 0
+    // (operands are zero-extended by two bits so the sum and bias subtraction fit in NE+2 bits)
+    assign Pe = ({2'b0, Xe} + {2'b0, Ye} - {2'b0, (`NE)'(`BIAS)})&{`NE+2{~(XZero|YZero)}};
+
+endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
new file mode 100644
index 000000000..3baaf2a08
--- /dev/null
+++ b/pipelined/src/fpu/fmalza.sv
@@ -0,0 +1,62 @@
+///////////////////////////////////////////
+//
+// Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
+// Modified: 
+//
+// Purpose: Leading Zero Anticipator
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
+    input logic  [3*`NF+6:0] A,     // addend
+    input logic  [2*`NF+3:0] P,     // product
+    output logic [$clog2(3*`NF+7)-1:0]       SCnt   // normalization shift count for the positive result
+    ); 
+    
+    logic [3*`NF+6:0] T;
+    logic [3*`NF+6:0] G;
+    logic [3*`NF+6:0] Z;
+    logic [3*`NF+6:0] f;
+
+    assign T[3*`NF+6:2*`NF+4] = A[3*`NF+6:2*`NF+4];
+    assign G[3*`NF+6:2*`NF+4] = 0;
+    assign Z[3*`NF+6:2*`NF+4] = ~A[3*`NF+6:2*`NF+4];
+    assign T[2*`NF+3:0] = A[2*`NF+3:0]^P;
+    assign G[2*`NF+3:0] = A[2*`NF+3:0]&P;
+    assign Z[2*`NF+3:0] = ~A[2*`NF+3:0]&~P;
+
+
+    // Apply function to determine Leading pattern
+    //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
+    //f[n] = ~T[n]&T[n-1]           note: n is the MSB
+    //f[i] = (T[i+1]&(G[i]&~Z[i-1] | Z[i]&~G[i-1])) | (~T[i+1]&(Z[i]&~Z[i-1] | G[i]&~G[i-1]))
+    assign f[3*`NF+6] = ~T[3*`NF+6]&T[3*`NF+5];
+    assign f[3*`NF+5:0] = (T[3*`NF+6:1]&(G[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | Z[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~T[3*`NF+6:1]&(Z[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));
+
+
+
+    lzc #(3*`NF+7) lzc (.num(f), .ZeroCnt(SCnt));
+  
+endmodule
diff --git a/pipelined/src/fpu/fmamult.sv b/pipelined/src/fpu/fmamult.sv
new file mode 100644
index 000000000..1e1b0981e
--- /dev/null
+++ b/pipelined/src/fpu/fmamult.sv
@@ -0,0 +1,38 @@
+///////////////////////////////////////////
+//
+// Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
+// Modified: 
+//
+// Purpose: FMA Significand Multiplier
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fmamult(                    // FMA significand multiplier
+    input logic [`NF:0] Xm, Ym,     // the two operands to multiply, NF+1 bits each
+    output logic [2*`NF+1:0] Pm     // full-width product, 2*NF+2 bits
+);
+    assign Pm = Xm * Ym;            // behavioral multiply; implementation is left to the tool
+endmodule
+
diff --git a/pipelined/src/fpu/fmasign.sv b/pipelined/src/fpu/fmasign.sv
new file mode 100644
index 000000000..66c1af83a
--- /dev/null
+++ b/pipelined/src/fpu/fmasign.sv
@@ -0,0 +1,47 @@
+///////////////////////////////////////////
+//
+// Written:  6/23/2021 me@KatherineParry.com, David_Harris@hmc.edu
+// Modified: 
+//
+// Purpose: FMA Sign Logic
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fmasign(                    // FMA sign logic: derives the product sign and the effective addend sign
+    input  logic [2:0]  OpCtrl,               // operation control
+    input  logic        Xs, Ys, Zs,    // sign of the inputs
+    output logic        Ps,     // the product's sign - takes operation into account
+    output logic        As   // aligned addend sign used in fma - takes operation into account
+);
+
+    // Calculate the product's sign
+    //      Negate product's sign if FNMADD or FNMSUB
+    
+    // flip if negation operation (OpCtrl[1] set and OpCtrl[2] clear)
+    assign Ps = Xs ^ Ys ^ (OpCtrl[1]&~OpCtrl[2]);
+    // flip if subtraction (OpCtrl[0] set)
+    assign As = Zs^OpCtrl[0];
+
+endmodule
diff --git a/pipelined/testbench/tests.vh b/pipelined/testbench/tests.vh
index 587733c39..fe3bd62f4 100644
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
@@ -1902,7 +1902,8 @@ string imperas32f[] = '{
     "rv32i_m/privilege/src/WALLY-gpio-01.S",
     "rv32i_m/privilege/src/WALLY-clint-01.S",
     "rv32i_m/privilege/src/WALLY-uart-01.S",
-    "rv32i_m/privilege/src/WALLY-plic-01.S"
+    "rv32i_m/privilege/src/WALLY-plic-01.S",
+    "rv32i_m/privilege/src/WALLY-plic-s-01.S"
  };
 
 

From 7f9b6014670f4f866c1f83e6c9d43a52fe22a239 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 18:23:39 +0000
Subject: [PATCH 03/16] fmalza edits to match textbook

---
 pipelined/src/fpu/fma.sv    |  2 +-
 pipelined/src/fpu/fmalza.sv | 33 ++++++++++++++-------------------
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/pipelined/src/fpu/fma.sv b/pipelined/src/fpu/fma.sv
index fcf209f6b..5f595b1fc 100644
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@@ -85,6 +85,6 @@ module fma(
         
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
     
-    fmalza lza(.A(AmInv+{(3*`NF+6)'(0),InvA&~((ZmSticky&~KillProd))}), .P({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .SCnt);
+    fmalza lza(.A(AmInv+{(3*`NF+6)'(0),InvA&~((ZmSticky&~KillProd))}), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .SCnt);
 endmodule
 
diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index 3baaf2a08..a05084e2d 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -31,32 +31,27 @@
 
 module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
     input logic  [3*`NF+6:0] A,     // addend
-    input logic  [2*`NF+3:0] P,     // product
+    input logic  [2*`NF+3:0] Pm,     // product
     output logic [$clog2(3*`NF+7)-1:0]       SCnt   // normalization shift count for the positive result
     ); 
+
+    localparam WIDTH = 3*`NF+7;
     
-    logic [3*`NF+6:0] T;
-    logic [3*`NF+6:0] G;
-    logic [3*`NF+6:0] Z;
-    logic [3*`NF+6:0] f;
+    logic [WIDTH-1:0] B, P, G, K, F;
+    logic [WIDTH-1:0] Pp1, Gm1, Km1;
 
-    assign T[3*`NF+6:2*`NF+4] = A[3*`NF+6:2*`NF+4];
-    assign G[3*`NF+6:2*`NF+4] = 0;
-    assign Z[3*`NF+6:2*`NF+4] = ~A[3*`NF+6:2*`NF+4];
-    assign T[2*`NF+3:0] = A[2*`NF+3:0]^P;
-    assign G[2*`NF+3:0] = A[2*`NF+3:0]&P;
-    assign Z[2*`NF+3:0] = ~A[2*`NF+3:0]&~P;
+    assign B = {{(`NF+3){1'b0}}, Pm}; // Zero extend product
 
+    assign P = A^B;
+    assign G = A&B;
+    assign K= ~A&~B;
 
     // Apply function to determine Leading pattern
     //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
-    //f[n] = ~T[n]&T[n-1]           note: n is the MSB
-    //f[i] = (T[i+1]&(G[i]&~Z[i-1] | Z[i]&~G[i-1])) | (~T[i+1]&(Z[i]&~Z[i-1] | G[i]&~G[i-1]))
-    assign f[3*`NF+6] = ~T[3*`NF+6]&T[3*`NF+5];
-    assign f[3*`NF+5:0] = (T[3*`NF+6:1]&(G[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | Z[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~T[3*`NF+6:1]&(Z[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));
+    //f[n] = ~P[n]&P[n-1]           note: n is the MSB
+    //f[i] = (P[i+1]&(G[i]&~K[i-1] | K[i]&~G[i-1])) | (~P[i+1]&(K[i]&~K[i-1] | G[i]&~G[i-1]))
+    assign F[WIDTH-1] = ~P[WIDTH-1]&P[WIDTH-2];
+    assign F[WIDTH-2:0] = (P[3*`NF+6:1]&(G[3*`NF+5:0]&{~K[3*`NF+4:0], 1'b0} | K[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~P[3*`NF+6:1]&(K[3*`NF+5:0]&{~K[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));
 
-
-
-    lzc #(3*`NF+7) lzc (.num(f), .ZeroCnt(SCnt));
-  
+    lzc #(3*`NF+7) lzc (.num(F), .ZeroCnt(SCnt));
 endmodule

From 3c08aabcd355cce36a39bad253e1f57a3bfc0464 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 11:36:21 -0700
Subject: [PATCH 04/16] LZA refactoring

---
 pipelined/src/fpu/fma.sv    |  2 +-
 pipelined/src/fpu/fmalza.sv | 26 ++++++++++++++++----------
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/pipelined/src/fpu/fma.sv b/pipelined/src/fpu/fma.sv
index 5f595b1fc..e698cdaf2 100644
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@@ -85,6 +85,6 @@ module fma(
         
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
     
-    fmalza lza(.A(AmInv+{(3*`NF+6)'(0),InvA&~((ZmSticky&~KillProd))}), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .SCnt);
+    fmalza lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .SCnt);
 endmodule
 
diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index a05084e2d..e69ba73f4 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -30,28 +30,34 @@
 `include "wally-config.vh"
 
 module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
-    input logic  [3*`NF+6:0] A,     // addend
-    input logic  [2*`NF+3:0] Pm,     // product
-    output logic [$clog2(3*`NF+7)-1:0]       SCnt   // normalization shift count for the positive result
+    input logic [3*`NF+6:0] 	       A, // addend
+    input logic [2*`NF+3:0] 	       Pm, // product
+    input logic                        Cin, // carry in
+    output logic [$clog2(3*`NF+7)-1:0] SCnt   // normalization shift count for the positive result
     ); 
 
     localparam WIDTH = 3*`NF+7;
     
-    logic [WIDTH-1:0] B, P, G, K, F;
-    logic [WIDTH-1:0] Pp1, Gm1, Km1;
+    logic [WIDTH-1:0] AA, B, P, G, K, F;
+    logic [WIDTH-2:0] Pp1, Gm1, Km1;
 
     assign B = {{(`NF+3){1'b0}}, Pm}; // Zero extend product
+   assign AA = A + Cin;
 
-    assign P = A^B;
-    assign G = A&B;
-    assign K= ~A&~B;
+    assign P = AA^B;
+    assign G = AA&B;
+    assign K= ~AA&~B;
 
+   assign Pp1 = P[WIDTH-1:1];
+   assign Gm1 = {G[WIDTH-3:0], Cin};
+   assign Km1 = {K[WIDTH-3:0], ~Cin};
+   
     // Apply function to determine Leading pattern
     //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
     //f[n] = ~P[n]&P[n-1]           note: n is the MSB
     //f[i] = (P[i+1]&(G[i]&~K[i-1] | K[i]&~G[i-1])) | (~P[i+1]&(K[i]&~K[i-1] | G[i]&~G[i-1]))
     assign F[WIDTH-1] = ~P[WIDTH-1]&P[WIDTH-2];
-    assign F[WIDTH-2:0] = (P[3*`NF+6:1]&(G[3*`NF+5:0]&{~K[3*`NF+4:0], 1'b0} | K[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~P[3*`NF+6:1]&(K[3*`NF+5:0]&{~K[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));
+    assign F[WIDTH-2:0] = (Pp1&(G[3*`NF+5:0]&{~K[3*`NF+4:0], 1'b0} | K[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~P[3*`NF+6:1]&(K[3*`NF+5:0]&{~K[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));
 
-    lzc #(3*`NF+7) lzc (.num(F), .ZeroCnt(SCnt));
+    lzc #(WIDTH) lzc (.num(F), .ZeroCnt(SCnt));
 endmodule

From 99462049e7998190cad96ff49234ee99c69634e8 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 12:20:23 -0700
Subject: [PATCH 05/16] LZA refactoring switched to Pp1, Gm1, Km1

---
 pipelined/src/fpu/fmalza.sv | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index e69ba73f4..afffca472 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -42,11 +42,11 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
     logic [WIDTH-2:0] Pp1, Gm1, Km1;
 
     assign B = {{(`NF+3){1'b0}}, Pm}; // Zero extend product
-   assign AA = A + Cin;
+//   assign AA = A + Cin;
 
-    assign P = AA^B;
-    assign G = AA&B;
-    assign K= ~AA&~B;
+    assign P = A^B;
+    assign G = A&B;
+    assign K= ~A&~B;
 
    assign Pp1 = P[WIDTH-1:1];
    assign Gm1 = {G[WIDTH-3:0], Cin};
@@ -57,7 +57,7 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
     //f[n] = ~P[n]&P[n-1]           note: n is the MSB
     //f[i] = (P[i+1]&(G[i]&~K[i-1] | K[i]&~G[i-1])) | (~P[i+1]&(K[i]&~K[i-1] | G[i]&~G[i-1]))
     assign F[WIDTH-1] = ~P[WIDTH-1]&P[WIDTH-2];
-    assign F[WIDTH-2:0] = (Pp1&(G[3*`NF+5:0]&{~K[3*`NF+4:0], 1'b0} | K[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~P[3*`NF+6:1]&(K[3*`NF+5:0]&{~K[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));
+    assign F[WIDTH-2:0] = (Pp1&(G[WIDTH-2:0]&~Km1 | K[WIDTH-2:0]&~Gm1)) | (~Pp1&(K[WIDTH-2:0]&~Km1 | G[WIDTH-2:0]&~Gm1));
 
     lzc #(WIDTH) lzc (.num(F), .ZeroCnt(SCnt));
 endmodule

From b34d2065c392d60cc4b80bb2d5e4adba5b582fff Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 12:30:42 -0700
Subject: [PATCH 06/16] LZA cleanup

---
 pipelined/src/fpu/fmalza.sv | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index afffca472..b7b40091d 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -38,7 +38,8 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
 
     localparam WIDTH = 3*`NF+7;
     
-    logic [WIDTH-1:0] AA, B, P, G, K, F;
+   logic [WIDTH-1:0] 		       B,F;
+   logic [WIDTH-1:0]  P, G, K;
     logic [WIDTH-2:0] Pp1, Gm1, Km1;
 
     assign B = {{(`NF+3){1'b0}}, Pm}; // Zero extend product
@@ -48,7 +49,7 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
     assign G = A&B;
     assign K= ~A&~B;
 
-   assign Pp1 = P[WIDTH-1:1];
+   assign Pp1 = {A[WIDTH-1], P[WIDTH-2:1]};
    assign Gm1 = {G[WIDTH-3:0], Cin};
    assign Km1 = {K[WIDTH-3:0], ~Cin};
    

From 2869d67e50c06e8746d90c10a3b65e401141467a Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 12:34:00 -0700
Subject: [PATCH 07/16] more lza cleanup

---
 pipelined/src/fpu/fmalza.sv | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index b7b40091d..f70b1bc93 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -39,15 +39,17 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
     localparam WIDTH = 3*`NF+7;
     
    logic [WIDTH-1:0] 		       B,F;
-   logic [WIDTH-1:0]  P, G, K;
+   logic [WIDTH-1:0] 		       P, G;
+   logic [WIDTH-2:0]  K;
     logic [WIDTH-2:0] Pp1, Gm1, Km1;
 
     assign B = {{(`NF+3){1'b0}}, Pm}; // Zero extend product
-//   assign AA = A + Cin;
 
+   // next steps***replace P[WIDTH-1] with sub, then remove one bit from A
+   
     assign P = A^B;
-    assign G = A&B;
-    assign K= ~A&~B;
+    assign G = A[WIDTH-2:0]&B[WIDTH-2:0];
+    assign K= ~A[WIDTH-2:0]&~B[WIDTH-2:0];
 
    assign Pp1 = {A[WIDTH-1], P[WIDTH-2:1]};
    assign Gm1 = {G[WIDTH-3:0], Cin};
@@ -58,7 +60,7 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
     //f[n] = ~P[n]&P[n-1]           note: n is the MSB
     //f[i] = (P[i+1]&(G[i]&~K[i-1] | K[i]&~G[i-1])) | (~P[i+1]&(K[i]&~K[i-1] | G[i]&~G[i-1]))
     assign F[WIDTH-1] = ~P[WIDTH-1]&P[WIDTH-2];
-    assign F[WIDTH-2:0] = (Pp1&(G[WIDTH-2:0]&~Km1 | K[WIDTH-2:0]&~Gm1)) | (~Pp1&(K[WIDTH-2:0]&~Km1 | G[WIDTH-2:0]&~Gm1));
+    assign F[WIDTH-2:0] = (Pp1&(G&~Km1 | K&~Gm1)) | (~Pp1&(K&~Km1 | G&~Gm1));
 
     lzc #(WIDTH) lzc (.num(F), .ZeroCnt(SCnt));
 endmodule

From 8ff3a693af891c99945ebac02205a2da7bb92ba5 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Mon, 1 Aug 2022 19:56:25 +0000
Subject: [PATCH 08/16] regression passes fpu tests

---
 pipelined/src/fpu/fpu.sv | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index d98079b2e..4b7a1ffea 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -319,10 +319,23 @@ module fpu (
    assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE);
 
    // select the result that may be written to the integer register - to IEU
+   
+   logic [`FLEN-1:0] SgnExtXE;
+   generate
+   if(`FPSIZES == 1)
+      assign SgnExtXE = XE;
+   else if(`FPSIZES == 2) 
+      mux2 #(`FLEN) sgnextmux ({{`FLEN-`LEN1{XsE}}, XE[`LEN1-1:0]}, XE, FmtE, SgnExtXE);
+   else if(`FPSIZES == 3 | `FPSIZES == 4)
+      mux4 #(`FLEN) fmulzeromux ({{`FLEN-`H_LEN{XsE}}, XE[`H_LEN-1:0]}, 
+                                 {{`FLEN-`S_LEN{XsE}}, XE[`S_LEN-1:0]}, 
+                                 {{`FLEN-`D_LEN{XsE}}, XE[`D_LEN-1:0]}, 
+                                 XE, FmtE, SgnExtXE); // NaN boxing zeroes
+   endgenerate
    if (`FLEN>`XLEN)
-      assign IntSrcXE = XE[`XLEN-1:0];
+      assign IntSrcXE = SgnExtXE[`XLEN-1:0];
    else 
-      assign IntSrcXE = {{`XLEN-`FLEN{XE[`FLEN-1:0]}}, XE};
+      assign IntSrcXE = {{`XLEN-`FLEN{XsE}}, SgnExtXE};
 
    mux3 #(`XLEN) IntResMux (ClassResE, IntSrcXE, CmpIntResE, {~FResSelE[1], FResSelE[0]}, FIntResE);
    // *** DH 5/25/22: CvtRes will move to mem stage.  Premux in execute to save area, then make sure stalls are ok

From d6b5e7a6ef60f3d1a45554319b932a904ebfd46f Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 15:37:09 -0700
Subject: [PATCH 09/16] lza cleanup

---
 pipelined/src/fpu/fma.sv    |  3 ++-
 pipelined/src/fpu/fmalza.sv | 13 +++++++------
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/pipelined/src/fpu/fma.sv b/pipelined/src/fpu/fma.sv
index e698cdaf2..68a509677 100644
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@@ -85,6 +85,7 @@ module fma(
         
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
     
-    fmalza lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .SCnt);
+    fmalza lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
 endmodule
 
+
diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index f70b1bc93..9de1d745e 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -32,26 +32,27 @@
 module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
     input logic [3*`NF+6:0] 	       A, // addend
     input logic [2*`NF+3:0] 	       Pm, // product
-    input logic                        Cin, // carry in
+    input logic 		       Cin, // carry in
+    input logic sub,
     output logic [$clog2(3*`NF+7)-1:0] SCnt   // normalization shift count for the positive result
     ); 
 
     localparam WIDTH = 3*`NF+7;
     
    logic [WIDTH-1:0] 		       B,F;
-   logic [WIDTH-1:0] 		       P, G;
-   logic [WIDTH-2:0]  K;
+   logic [WIDTH-2:0]   P, G, K;
     logic [WIDTH-2:0] Pp1, Gm1, Km1;
 
     assign B = {{(`NF+3){1'b0}}, Pm}; // Zero extend product
 
    // next steps***replace P[WIDTH-1] with sub, then remove one bit from A
    
-    assign P = A^B;
+    assign P = A[WIDTH-2:0]^B[WIDTH-2:0];
     assign G = A[WIDTH-2:0]&B[WIDTH-2:0];
     assign K= ~A[WIDTH-2:0]&~B[WIDTH-2:0];
 
-   assign Pp1 = {A[WIDTH-1], P[WIDTH-2:1]};
+   assign Pp1 = {sub, P[WIDTH-2:1]};
+//   assign Pp1 = {A[WIDTH-1], P[WIDTH-2:1]};
    assign Gm1 = {G[WIDTH-3:0], Cin};
    assign Km1 = {K[WIDTH-3:0], ~Cin};
    
@@ -59,7 +60,7 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
     //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
     //f[n] = ~P[n]&P[n-1]           note: n is the MSB
     //f[i] = (P[i+1]&(G[i]&~K[i-1] | K[i]&~G[i-1])) | (~P[i+1]&(K[i]&~K[i-1] | G[i]&~G[i-1]))
-    assign F[WIDTH-1] = ~P[WIDTH-1]&P[WIDTH-2];
+    assign F[WIDTH-1] = ~sub&P[WIDTH-2];
     assign F[WIDTH-2:0] = (Pp1&(G&~Km1 | K&~Gm1)) | (~Pp1&(K&~Km1 | G&~Gm1));
 
     lzc #(WIDTH) lzc (.num(F), .ZeroCnt(SCnt));

From c3e9719c991d2da019341cd46801decbcf8f8467 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 15:40:12 -0700
Subject: [PATCH 10/16] lza cleanup

---
 pipelined/src/fpu/fmalza.sv | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index 9de1d745e..c86459edb 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -39,14 +39,12 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
 
     localparam WIDTH = 3*`NF+7;
     
-   logic [WIDTH-1:0] 		       B,F;
-   logic [WIDTH-2:0]   P, G, K;
+   logic [WIDTH-1:0] 		       F;
+   logic [WIDTH-2:0]  B, P, G, K;
     logic [WIDTH-2:0] Pp1, Gm1, Km1;
 
-    assign B = {{(`NF+3){1'b0}}, Pm}; // Zero extend product
+    assign B = {{(`NF+2){1'b0}}, Pm}; // Zero extend product
 
-   // next steps***replace P[WIDTH-1] with sub, then remove one bit from A
-   
     assign P = A[WIDTH-2:0]^B[WIDTH-2:0];
     assign G = A[WIDTH-2:0]&B[WIDTH-2:0];
     assign K= ~A[WIDTH-2:0]&~B[WIDTH-2:0];

From f56b26ec400add075116f44f61c2e8b0f1399d05 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 15:43:48 -0700
Subject: [PATCH 11/16] lza cleanup

---
 pipelined/src/fpu/fma.sv    | 5 +++--
 pipelined/src/fpu/fmalza.sv | 9 ++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pipelined/src/fpu/fma.sv b/pipelined/src/fpu/fma.sv
index 68a509677..dec492eba 100644
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@@ -84,8 +84,9 @@ module fma(
     // ///////////////////////////////////////////////////////////////////////////////
         
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
-    
-    fmalza lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
+
+   
+    fmalza lza(.A(AmInv[3*`NF+5:0]), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
 endmodule
 
 
diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index c86459edb..d70f0267c 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -30,7 +30,7 @@
 `include "wally-config.vh"
 
 module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
-    input logic [3*`NF+6:0] 	       A, // addend
+    input logic [3*`NF+5:0] 	       A, // addend
     input logic [2*`NF+3:0] 	       Pm, // product
     input logic 		       Cin, // carry in
     input logic sub,
@@ -45,12 +45,11 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
 
     assign B = {{(`NF+2){1'b0}}, Pm}; // Zero extend product
 
-    assign P = A[WIDTH-2:0]^B[WIDTH-2:0];
-    assign G = A[WIDTH-2:0]&B[WIDTH-2:0];
-    assign K= ~A[WIDTH-2:0]&~B[WIDTH-2:0];
+    assign P = A[WIDTH-2:0]^B;
+    assign G = A[WIDTH-2:0]&B;
+    assign K= ~A[WIDTH-2:0]&~B;
 
    assign Pp1 = {sub, P[WIDTH-2:1]};
-//   assign Pp1 = {A[WIDTH-1], P[WIDTH-2:1]};
    assign Gm1 = {G[WIDTH-3:0], Cin};
    assign Km1 = {K[WIDTH-3:0], ~Cin};
    

From 91597bba87da84e99c683edfbd30937db56720ed Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 15:47:03 -0700
Subject: [PATCH 12/16] lza cleanup

---
 pipelined/src/fpu/fmalza.sv | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index d70f0267c..d71b398e7 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -37,28 +37,28 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
     output logic [$clog2(3*`NF+7)-1:0] SCnt   // normalization shift count for the positive result
     ); 
 
-    localparam WIDTH = 3*`NF+7;
+    localparam WIDTH = 3*`NF+6;
     
-   logic [WIDTH-1:0] 		       F;
-   logic [WIDTH-2:0]  B, P, G, K;
-    logic [WIDTH-2:0] Pp1, Gm1, Km1;
+   logic [WIDTH:0] 		       F;
+   logic [WIDTH-1:0]  B, P, G, K;
+    logic [WIDTH-1:0] Pp1, Gm1, Km1;
 
     assign B = {{(`NF+2){1'b0}}, Pm}; // Zero extend product
 
-    assign P = A[WIDTH-2:0]^B;
-    assign G = A[WIDTH-2:0]&B;
-    assign K= ~A[WIDTH-2:0]&~B;
+    assign P = A^B;
+    assign G = A&B;
+    assign K= ~A&~B;
 
-   assign Pp1 = {sub, P[WIDTH-2:1]};
-   assign Gm1 = {G[WIDTH-3:0], Cin};
-   assign Km1 = {K[WIDTH-3:0], ~Cin};
+   assign Pp1 = {sub, P[WIDTH-1:1]};
+   assign Gm1 = {G[WIDTH-2:0], Cin};
+   assign Km1 = {K[WIDTH-2:0], ~Cin};
    
     // Apply function to determine Leading pattern
     //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
     //f[n] = ~P[n]&P[n-1]           note: n is the MSB
     //f[i] = (P[i+1]&(G[i]&~K[i-1] | K[i]&~G[i-1])) | (~P[i+1]&(K[i]&~K[i-1] | G[i]&~G[i-1]))
-    assign F[WIDTH-1] = ~sub&P[WIDTH-2];
-    assign F[WIDTH-2:0] = (Pp1&(G&~Km1 | K&~Gm1)) | (~Pp1&(K&~Km1 | G&~Gm1));
+    assign F[WIDTH] = ~sub&P[WIDTH-1];
+    assign F[WIDTH-1:0] = (Pp1&(G&~Km1 | K&~Gm1)) | (~Pp1&(K&~Km1 | G&~Gm1));
 
-    lzc #(WIDTH) lzc (.num(F), .ZeroCnt(SCnt));
+    lzc #(WIDTH+1) lzc (.num(F), .ZeroCnt(SCnt));
 endmodule

From 3b937b73fdade1a6b1bf2a36233aa912db2f4414 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 16:01:02 -0700
Subject: [PATCH 13/16] lza cleanup

---
 pipelined/src/fpu/fmalza.sv | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index d71b398e7..fd180fbb6 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -55,8 +55,6 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
    
     // Apply function to determine Leading pattern
     //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
-    //f[n] = ~P[n]&P[n-1]           note: n is the MSB
-    //f[i] = (P[i+1]&(G[i]&~K[i-1] | K[i]&~G[i-1])) | (~P[i+1]&(K[i]&~K[i-1] | G[i]&~G[i-1]))
     assign F[WIDTH] = ~sub&P[WIDTH-1];
     assign F[WIDTH-1:0] = (Pp1&(G&~Km1 | K&~Gm1)) | (~Pp1&(K&~Km1 | G&~Gm1));
 

From 94fa7a00e7b28c1d9f32dd3d98e1579fe23917fe Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 16:13:16 -0700
Subject: [PATCH 14/16] Completed LZA simplification

---
 pipelined/src/fpu/fma.sv    |  4 ++--
 pipelined/src/fpu/fmaadd.sv | 12 ++++++------
 pipelined/src/fpu/fmalza.sv |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pipelined/src/fpu/fma.sv b/pipelined/src/fpu/fma.sv
index dec492eba..950b55ff1 100644
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@@ -51,7 +51,7 @@ module fma(
 
     logic [2*`NF+1:0]   Pm;           // the product's significand in U(2.2Nf) format
     logic [3*`NF+5:0]   Am;     // addend aligned's mantissa for addition in U(NF+5.2NF+1)
-    logic [3*`NF+6:0]   AmInv;   // aligned addend's mantissa possibly inverted
+    logic [3*`NF+5:0]   AmInv;   // aligned addend's mantissa possibly inverted
     logic [2*`NF+1:0]   PmKilled;      // the product's mantissa possibly killed
     ///////////////////////////////////////////////////////////////////////////////
     // Calculate the product
@@ -86,7 +86,7 @@ module fma(
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
 
    
-    fmalza lza(.A(AmInv[3*`NF+5:0]), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
+    fmalza lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
 endmodule
 
 
diff --git a/pipelined/src/fpu/fmaadd.sv b/pipelined/src/fpu/fmaadd.sv
index 4b52208c6..56ce5a74e 100644
--- a/pipelined/src/fpu/fmaadd.sv
+++ b/pipelined/src/fpu/fmaadd.sv
@@ -37,7 +37,7 @@ module fmaadd(
     input logic                 ZmSticky,
     input logic  [`NE-1:0]      Ze,
     input logic  [`NE+1:0]      Pe,
-    output logic [3*`NF+6:0]    AmInv,  // aligned addend possibly inverted
+    output logic [3*`NF+5:0]    AmInv,  // aligned addend possibly inverted
     output logic [2*`NF+1:0]    PmKilled,     // the product's mantissa possibly killed
     output logic                NegSum,        // was the sum negitive
     output logic                InvA,          // do you invert the aligned addend
@@ -45,7 +45,7 @@ module fmaadd(
     output logic [`NE+1:0]      Se,
     output logic [3*`NF+5:0]    Sm           // the positive sum
 );
-    logic [3*`NF+6:0]    PreSum, NegPreSum; // possibly negitive sum
+    logic [3*`NF+5:0]    PreSum, NegPreSum; // possibly negitive sum
 
     ///////////////////////////////////////////////////////////////////////////////
     // Addition
@@ -57,7 +57,7 @@ module fmaadd(
     assign InvA = As ^ Ps;
 
     // Choose an inverted or non-inverted addend - the one has to be added now for the LZA
-    assign AmInv = InvA ? {1'b1, ~Am} : {1'b0, Am};
+    assign AmInv = InvA ? ~Am : Am;
     // Kill the product if the product is too small to effect the addition (determined in fma1.sv)
     assign PmKilled = Pm&{2*`NF+2{~KillProd}};
     // Do the addition
@@ -66,11 +66,11 @@ module fmaadd(
     // PreSum    -1 = don't add 1     +1 = add 2
     // NegPreSum +1 = add 2           -1 = don't add 1
     // for NegPreSum the product is set to -1 whenever the product is killed, therefore add 1, 2 or 0
-    assign PreSum = {{`NF+3{1'b0}}, PmKilled, 1'b0, InvA&ZmSticky&KillProd} + AmInv + {{3*`NF+6{1'b0}}, InvA&~((ZmSticky&~KillProd))};
-    assign NegPreSum = {1'b0, Am} + {{`NF+3{1'b1}}, ~PmKilled, 2'b11} + {(3*`NF+5)'(0), ZmSticky&~KillProd, ~(ZmSticky)};
+    assign PreSum = {{`NF+2{1'b0}}, PmKilled, 1'b0, InvA&ZmSticky&KillProd} + AmInv + {{3*`NF+5{1'b0}}, InvA&~((ZmSticky&~KillProd))};
+    assign NegPreSum = Am + {{`NF+2{1'b1}}, ~PmKilled, 2'b11} + {(3*`NF+4)'(0), ZmSticky&~KillProd, ~(ZmSticky)};
      
     // Is the sum negitive
-    assign NegSum = PreSum[3*`NF+6];
+    assign NegSum = PreSum[3*`NF+5];
 
     // Choose the positive sum and accompanying LZA result.
     assign Sm = NegSum ? NegPreSum[3*`NF+5:0] : PreSum[3*`NF+5:0];
diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index fd180fbb6..65fe94266 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -39,7 +39,7 @@ module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection,
 
     localparam WIDTH = 3*`NF+6;
     
-   logic [WIDTH:0] 		       F;
+   logic [WIDTH:0] 	       F;
    logic [WIDTH-1:0]  B, P, G, K;
     logic [WIDTH-1:0] Pp1, Gm1, Km1;
 

From 7e4b04ff643624de43cdf64ea2144da46750324c Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 16:18:02 -0700
Subject: [PATCH 15/16] Parameterized fmalza

---
 pipelined/src/fpu/fma.sv    | 4 ++--
 pipelined/src/fpu/fmalza.sv | 8 +++-----
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/pipelined/src/fpu/fma.sv b/pipelined/src/fpu/fma.sv
index 950b55ff1..0106af7d5 100644
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@@ -85,8 +85,8 @@ module fma(
         
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
 
-   
-    fmalza lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
+    
+    fmalza #(3*`NF+6) lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
 endmodule
 
 
diff --git a/pipelined/src/fpu/fmalza.sv b/pipelined/src/fpu/fmalza.sv
index 65fe94266..8e92a5dc4 100644
--- a/pipelined/src/fpu/fmalza.sv
+++ b/pipelined/src/fpu/fmalza.sv
@@ -29,16 +29,14 @@
 
 `include "wally-config.vh"
 
-module fmalza( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
-    input logic [3*`NF+5:0] 	       A, // addend
+module fmalza #(WIDTH) ( // [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
+    input logic [WIDTH-1:0] 	       A, // addend
     input logic [2*`NF+3:0] 	       Pm, // product
     input logic 		       Cin, // carry in
     input logic sub,
-    output logic [$clog2(3*`NF+7)-1:0] SCnt   // normalization shift count for the positive result
+    output logic [$clog2(WIDTH+1)-1:0] SCnt   // normalization shift count for the positive result
     ); 
 
-    localparam WIDTH = 3*`NF+6;
-    
    logic [WIDTH:0] 	       F;
    logic [WIDTH-1:0]  B, P, G, K;
     logic [WIDTH-1:0] Pp1, Gm1, Km1;

From 8147f7539952bcc3866af219c7e718f58a6bee26 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Mon, 1 Aug 2022 19:40:55 -0700
Subject: [PATCH 16/16] Fixed fmaadd to work with new LZA

---
 pipelined/src/fpu/fma.sv    |  4 ++--
 pipelined/src/fpu/fmaadd.sv | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pipelined/src/fpu/fma.sv b/pipelined/src/fpu/fma.sv
index dec492eba..950b55ff1 100644
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@@ -51,7 +51,7 @@ module fma(
 
     logic [2*`NF+1:0]   Pm;           // the product's significand in U(2.2Nf) format
     logic [3*`NF+5:0]   Am;     // addend aligned's mantissa for addition in U(NF+5.2NF+1)
-    logic [3*`NF+6:0]   AmInv;   // aligned addend's mantissa possibly inverted
+    logic [3*`NF+5:0]   AmInv;   // aligned addend's mantissa possibly inverted
     logic [2*`NF+1:0]   PmKilled;      // the product's mantissa possibly killed
     ///////////////////////////////////////////////////////////////////////////////
     // Calculate the product
@@ -86,7 +86,7 @@ module fma(
     fmaadd add(.Am, .Pm, .Ze, .Pe, .Ps, .As, .KillProd, .ZmSticky, .AmInv, .PmKilled, .NegSum, .InvA, .Sm, .Se, .Ss);
 
    
-    fmalza lza(.A(AmInv[3*`NF+5:0]), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
+    fmalza lza(.A(AmInv), .Pm({PmKilled, 1'b0, InvA&Ps&ZmSticky&KillProd}), .Cin(InvA & ~(ZmSticky & ~KillProd)), .sub(InvA), .SCnt);
 endmodule
 
 
diff --git a/pipelined/src/fpu/fmaadd.sv b/pipelined/src/fpu/fmaadd.sv
index 4b52208c6..53ed023f8 100644
--- a/pipelined/src/fpu/fmaadd.sv
+++ b/pipelined/src/fpu/fmaadd.sv
@@ -37,7 +37,7 @@ module fmaadd(
     input logic                 ZmSticky,
     input logic  [`NE-1:0]      Ze,
     input logic  [`NE+1:0]      Pe,
-    output logic [3*`NF+6:0]    AmInv,  // aligned addend possibly inverted
+    output logic [3*`NF+5:0]    AmInv,  // aligned addend possibly inverted
     output logic [2*`NF+1:0]    PmKilled,     // the product's mantissa possibly killed
     output logic                NegSum,        // was the sum negitive
     output logic                InvA,          // do you invert the aligned addend
@@ -45,7 +45,7 @@ module fmaadd(
     output logic [`NE+1:0]      Se,
     output logic [3*`NF+5:0]    Sm           // the positive sum
 );
-    logic [3*`NF+6:0]    PreSum, NegPreSum; // possibly negitive sum
+    logic [3*`NF+5:0]    PreSum, NegPreSum; // possibly negitive sum
 
     ///////////////////////////////////////////////////////////////////////////////
     // Addition
@@ -57,7 +57,7 @@ module fmaadd(
     assign InvA = As ^ Ps;
 
     // Choose an inverted or non-inverted addend - the one has to be added now for the LZA
-    assign AmInv = InvA ? {1'b1, ~Am} : {1'b0, Am};
+    assign AmInv = InvA ? ~Am : Am;
     // Kill the product if the product is too small to effect the addition (determined in fma1.sv)
     assign PmKilled = Pm&{2*`NF+2{~KillProd}};
     // Do the addition
@@ -66,14 +66,14 @@ module fmaadd(
     // PreSum    -1 = don't add 1     +1 = add 2
     // NegPreSum +1 = add 2           -1 = don't add 1
     // for NegPreSum the product is set to -1 whenever the product is killed, therefore add 1, 2 or 0
-    assign PreSum = {{`NF+3{1'b0}}, PmKilled, 1'b0, InvA&ZmSticky&KillProd} + AmInv + {{3*`NF+6{1'b0}}, InvA&~((ZmSticky&~KillProd))};
-    assign NegPreSum = {1'b0, Am} + {{`NF+3{1'b1}}, ~PmKilled, 2'b11} + {(3*`NF+5)'(0), ZmSticky&~KillProd, ~(ZmSticky)};
+    assign {NegSum, PreSum} = {{`NF+3{1'b0}}, PmKilled, 1'b0, InvA&ZmSticky&KillProd} + {InvA, AmInv} + {{3*`NF+6{1'b0}}, InvA&~((ZmSticky&~KillProd))};
+    assign NegPreSum = Am + {{`NF+2{1'b1}}, ~PmKilled, 2'b11} + {(3*`NF+4)'(0), ZmSticky&~KillProd, ~(ZmSticky)};
      
     // Is the sum negitive
-    assign NegSum = PreSum[3*`NF+6];
+//    assign NegSum = PreSum[3*`NF+6];
 
     // Choose the positive sum and accompanying LZA result.
-    assign Sm = NegSum ? NegPreSum[3*`NF+5:0] : PreSum[3*`NF+5:0];
+    assign Sm = NegSum ? NegPreSum : PreSum;
     // is the result negitive
     //  if p - z is the Sum negitive
     //  if -p + z is the Sum positive