From 7e026f3e78f20d1d64237c983398eca3ae8f06a6 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Sat, 18 Dec 2021 10:21:17 -0800
Subject: [PATCH 1/7] Simplified Shifter Right input

---
 wally-pipelined/src/ieu/alu.sv | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wally-pipelined/src/ieu/alu.sv b/wally-pipelined/src/ieu/alu.sv
index a2aa1a44..0558cbe1 100644
--- a/wally-pipelined/src/ieu/alu.sv
+++ b/wally-pipelined/src/ieu/alu.sv
@@ -51,8 +51,8 @@ module alu #(parameter WIDTH=32) (
   assign {Carry, Sum} = A + CondInvB + {{(WIDTH-1){1'b0}}, SubArith};
   
   // Shifts
-  assign Right = (Funct3[2:0] == 3'b101); // sra or srl
-  shifter sh(A, B[5:0], Right, SubArith, W64, Shift);
+  assign Right = Funct3[2]; // sra or srl
+  shifter sh(A, B[`LOG_XLEN-1:0], Right, SubArith, W64, Shift);
   
   // condition code flags based on add/subtract output
   // Overflow occurs when the numbers being added have the same sign 

From 721d0b5bcf59a48047512fc85091a3bbd8f11f55 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Sat, 18 Dec 2021 10:25:40 -0800
Subject: [PATCH 2/7] Simplified shifter right input

---
 wally-pipelined/src/ieu/alu.sv     |  6 ++---
 wally-pipelined/src/ieu/shifter.sv | 42 +++++++++++++++---------------
 2 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/wally-pipelined/src/ieu/alu.sv b/wally-pipelined/src/ieu/alu.sv
index 0558cbe1..827aeb8a 100644
--- a/wally-pipelined/src/ieu/alu.sv
+++ b/wally-pipelined/src/ieu/alu.sv
@@ -33,7 +33,6 @@ module alu #(parameter WIDTH=32) (
   output logic [WIDTH-1:0] Sum);
 
   logic [WIDTH-1:0] CondInvB, Shift, SLT, SLTU, FullResult;
-  logic        Right;
   logic        Carry, Neg;
   logic        LT, LTU;
   logic        Overflow;
@@ -51,9 +50,8 @@ module alu #(parameter WIDTH=32) (
   assign {Carry, Sum} = A + CondInvB + {{(WIDTH-1){1'b0}}, SubArith};
   
   // Shifts
-  assign Right = Funct3[2]; // sra or srl
-  shifter sh(A, B[`LOG_XLEN-1:0], Right, SubArith, W64, Shift);
-  
+  shifter sh(.A, .Amt(B[`LOG_XLEN-1:0]), .Right(Funct3[2]), .Arith(SubArith), .W64, .Y(Shift));
+
   // condition code flags based on add/subtract output
   // Overflow occurs when the numbers being added have the same sign 
   // and the result has the opposite sign
diff --git a/wally-pipelined/src/ieu/shifter.sv b/wally-pipelined/src/ieu/shifter.sv
index fc170e75..232f7241 100644
--- a/wally-pipelined/src/ieu/shifter.sv
+++ b/wally-pipelined/src/ieu/shifter.sv
@@ -26,10 +26,10 @@
 `include "wally-config.vh"
 
 module shifter (
-  input  logic [`XLEN-1:0] a,
-  input  logic [`LOG_XLEN-1:0]       amt,
-  input  logic             right, arith, w64,
-  output logic [`XLEN-1:0] y);
+  input  logic [`XLEN-1:0]     A,
+  input  logic [`LOG_XLEN-1:0] Amt,
+  input  logic                 Right, Arith, W64,
+  output logic [`XLEN-1:0]     Y);
 
   logic [2*`XLEN-2:0]      z, zshift;
   logic [`LOG_XLEN-1:0]    amttrunc, offset;
@@ -42,34 +42,34 @@ module shifter (
   generate
     if (`XLEN==32) begin:shifter // RV32
       always_comb  // funnel mux
-        if (right) 
-          if (arith) z = {{31{a[31]}}, a};
-          else       z = {31'b0, a};
-        else         z = {a, 31'b0};
-      assign amttrunc = amt; // shift amount
+        if (Right) 
+          if (Arith) z = {{31{A[31]}}, A};
+          else       z = {31'b0, A};
+        else         z = {A, 31'b0};
+      assign amttrunc = Amt; // shift amount
     end else begin:shifter  // RV64
       always_comb  // funnel mux
-        if (w64) begin // 32-bit shifts
-          if (right)
-            if (arith) z = {64'b0, {31{a[31]}}, a[31:0]};
-            else       z = {95'b0, a[31:0]};
-          else         z = {32'b0, a[31:0], 63'b0};
+        if (W64) begin // 32-bit shifts
+          if (Right)
+            if (Arith) z = {64'b0, {31{A[31]}}, A[31:0]};
+            else       z = {95'b0, A[31:0]};
+          else         z = {32'b0, A[31:0], 63'b0};
         end else begin
-          if (right)
-            if (arith) z = {{63{a[63]}}, a};
-            else       z = {63'b0, a};
-          else         z = {a, 63'b0};         
+          if (Right)
+            if (Arith) z = {{63{A[63]}}, A};
+            else       z = {63'b0, A};
+          else         z = {A, 63'b0};         
         end
-      assign amttrunc = w64 ? {1'b0, amt[4:0]} : amt; // 32 or 64-bit shift
+      assign amttrunc = W64 ? {1'b0, Amt[4:0]} : Amt; // 32 or 64-bit shift
     end
   endgenerate
 
   // opposite offset for right shfits
-  assign offset = right ? amttrunc : ~amttrunc;
+  assign offset = Right ? amttrunc : ~amttrunc;
   
   // funnel operation
   assign zshift = z >> offset;
-  assign y = zshift[`XLEN-1:0];    
+  assign Y = zshift[`XLEN-1:0];    
 endmodule
 
 

From 67577d7c91ffaf574e96cdb28aee0aadf0ba813b Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Sat, 18 Dec 2021 21:26:00 -0800
Subject: [PATCH 3/7] Renamed RD1D to R1D, etc.

---
 wally-pipelined/src/ieu/alu.sv      |  8 ++++----
 wally-pipelined/src/ieu/datapath.sv | 14 +++++++-------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/wally-pipelined/src/ieu/alu.sv b/wally-pipelined/src/ieu/alu.sv
index 827aeb8a..a93ebd29 100644
--- a/wally-pipelined/src/ieu/alu.sv
+++ b/wally-pipelined/src/ieu/alu.sv
@@ -52,10 +52,10 @@ module alu #(parameter WIDTH=32) (
   // Shifts
   shifter sh(.A, .Amt(B[`LOG_XLEN-1:0]), .Right(Funct3[2]), .Arith(SubArith), .W64, .Y(Shift));
 
-  // condition code flags based on add/subtract output
-  // Overflow occurs when the numbers being added have the same sign 
-  // and the result has the opposite sign
-  assign Overflow = (A[WIDTH-1] ~^ CondInvB[WIDTH-1]) & (A[WIDTH-1] ^ Sum[WIDTH-1]);
+  // condition code flags based on subtract output
+  // Overflow occurs when the numbers being subtracted have the opposite sign 
+  // and the result has the opposite sign of A
+  assign Overflow = (A[WIDTH-1] ^ B[WIDTH-1]) & (A[WIDTH-1] ^ Sum[WIDTH-1]);
   assign Neg  = Sum[WIDTH-1];
   assign LT = Neg ^ Overflow;
   assign LTU = ~Carry;
diff --git a/wally-pipelined/src/ieu/datapath.sv b/wally-pipelined/src/ieu/datapath.sv
index c36077d2..9111a61f 100644
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@@ -66,11 +66,11 @@ module datapath (
 
   // Fetch stage signals
   // Decode stage signals
-  logic [`XLEN-1:0] RD1D, RD2D;
+  logic [`XLEN-1:0] R1D, R2D;
   logic [`XLEN-1:0] ExtImmD;
   logic [4:0]      RdD;
   // Execute stage signals
-  logic [`XLEN-1:0] RD1E, RD2E;
+  logic [`XLEN-1:0] R1E, R2E;
   logic [`XLEN-1:0] ExtImmE;
 
   // logic [`XLEN-1:0] ForwardedSrcAE, ForwardedSrcBE, SrcAE2, SrcBE2; // *** MAde forwardedsrcae an output to get rid of a mux in the critical path.
@@ -91,19 +91,19 @@ module datapath (
   assign Rs1D      = InstrD[19:15];
   assign Rs2D      = InstrD[24:20];
   assign RdD       = InstrD[11:7];
-  regfile regf(clk, reset, RegWriteW, Rs1D, Rs2D, RdW, WriteDataW, RD1D, RD2D);
+  regfile regf(clk, reset, RegWriteW, Rs1D, Rs2D, RdW, WriteDataW, R1D, R2D);
   extend ext(.InstrD(InstrD[31:7]), .ImmSrcD, .ExtImmD);
  
   // Execute stage pipeline register and logic
-  flopenrc #(`XLEN) RD1EReg(clk, reset, FlushE, ~StallE, RD1D, RD1E);
-  flopenrc #(`XLEN) RD2EReg(clk, reset, FlushE, ~StallE, RD2D, RD2E);
+  flopenrc #(`XLEN) RD1EReg(clk, reset, FlushE, ~StallE, R1D, R1E);
+  flopenrc #(`XLEN) RD2EReg(clk, reset, FlushE, ~StallE, R2D, R2E);
   flopenrc #(`XLEN) ExtImmEReg(clk, reset, FlushE, ~StallE, ExtImmD, ExtImmE);
   flopenrc #(5)     Rs1EReg(clk, reset, FlushE, ~StallE, Rs1D, Rs1E);
   flopenrc #(5)     Rs2EReg(clk, reset, FlushE, ~StallE, Rs2D, Rs2E);
   flopenrc #(5)     RdEReg(clk, reset, FlushE, ~StallE, RdD, RdE);
 	
-  mux3  #(`XLEN)  faemux(RD1E, WriteDataW, ResultM, ForwardAE, ForwardedSrcAE);
-  mux3  #(`XLEN)  fbemux(RD2E, WriteDataW, ResultM, ForwardBE, ForwardedSrcBE);
+  mux3  #(`XLEN)  faemux(R1E, WriteDataW, ResultM, ForwardAE, ForwardedSrcAE);
+  mux3  #(`XLEN)  fbemux(R2E, WriteDataW, ResultM, ForwardBE, ForwardedSrcBE);
   comparator #(`XLEN) comp(ForwardedSrcAE, ForwardedSrcBE, FlagsE);
   mux2  #(`XLEN)  srcamux(ForwardedSrcAE, PCE, ALUSrcAE, SrcAE);
   mux2  #(`XLEN)  srcbmux(ForwardedSrcBE, ExtImmE, ALUSrcBE, SrcBE);

From 406f129bedd9a9dc1a1830a5d5cbaa473d1c8098 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Sat, 18 Dec 2021 22:08:23 -0800
Subject: [PATCH 4/7] Controller fix

---
 wally-pipelined/src/ieu/controller.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wally-pipelined/src/ieu/controller.sv b/wally-pipelined/src/ieu/controller.sv
index 907aa650..94f3d65c 100644
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@@ -173,7 +173,7 @@ module controller(
   // ALU Decoding
   assign sltD = (Funct3D == 3'b010);
   assign sltuD = (Funct3D == 3'b011);
-  assign subD = (Funct3D == 3'b000 & Funct7D[5] & OpD[5]);
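+  assign subD = (Funct3D == 3'b000 & Funct7D[5] & OpD[5]);  // OpD[5] needed: it separates R-type sub from I-type addi, whose instr[30] is an immediate bit, not funct7 (assumes Funct7D = InstrD[31:25])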
   assign sraD = (Funct3D == 3'b101 & Funct7D[5]);
   assign SubArithD = ALUOpD & (subD | sraD | sltD | sltuD); // TRUE for R-type subtracts and sra, slt, sltu
   assign ALUControlD = {W64D, SubArithD, ALUOpD};

From f201af4bb7720b0344fe256ea9cf0ce148b17790 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Sun, 19 Dec 2021 11:49:15 -0800
Subject: [PATCH 5/7] Renamed zero to eq in flag generation

---
 wally-pipelined/src/ieu/comparator.sv | 6 +++---
 wally-pipelined/src/ieu/controller.sv | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/wally-pipelined/src/ieu/comparator.sv b/wally-pipelined/src/ieu/comparator.sv
index 14117274..0c161d6d 100644
--- a/wally-pipelined/src/ieu/comparator.sv
+++ b/wally-pipelined/src/ieu/comparator.sv
@@ -30,7 +30,7 @@ module comparator #(parameter WIDTH=32) (
   output logic [2:0]       flags);
 
   logic [WIDTH-1:0] bbar, diff;
-  logic             carry, zero, neg, overflow, lt, ltu;
+  logic             carry, eq, neg, overflow, lt, ltu;
 
   // NOTE: This can be replaced by some faster logic optimized
   // to just compute flags and not the difference.
@@ -40,13 +40,13 @@ module comparator #(parameter WIDTH=32) (
   assign {carry, diff} = a + bbar + 1;
 
   // condition code flags based on add/subtract output
-  assign zero = (diff == 0);
+  assign eq = (diff == 0);
   assign neg  = diff[WIDTH-1];
   // overflow occurs when the numbers being subtracted have the opposite sign 
   // and the result has the opposite sign fron the first
   assign overflow = (a[WIDTH-1] ^ b[WIDTH-1]) & (a[WIDTH-1] ^ diff[WIDTH-1]);
   assign lt = neg ^ overflow;
   assign ltu = ~carry;
-  assign flags = {zero, lt, ltu};
+  assign flags = {eq, lt, ltu};
 endmodule
 
diff --git a/wally-pipelined/src/ieu/controller.sv b/wally-pipelined/src/ieu/controller.sv
index 94f3d65c..b081d40f 100644
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@@ -97,7 +97,7 @@ module controller(
   logic        SubArithD;
   logic        subD, sraD, sltD, sltuD;
   logic        BranchTakenE;
-  logic        zeroE, ltE, ltuE;
+  logic        eqE, ltE, ltuE;
   logic        unused;
 	logic        BranchFlagE;
   logic        IEURegWriteE;
@@ -202,8 +202,8 @@ module controller(
                            {IEURegWriteE, ResultSrcE, MemRWE, JumpE, BranchE, ALUControlE, ALUSrcAE, ALUSrcBE, ALUResultSrcE, CSRReadE, CSRWriteE, PrivilegedE, Funct3E, W64E, MulDivE, AtomicE, InvalidateICacheE, FlushDCacheE, InstrValidE});
 
   // Branch Logic
-  assign {zeroE, ltE, ltuE} = FlagsE;
-  mux4 #(1) branchflagmux(zeroE, 1'b0, ltE, ltuE, Funct3E[2:1], BranchFlagE);
+  assign {eqE, ltE, ltuE} = FlagsE;
+  mux4 #(1) branchflagmux(eqE, 1'b0, ltE, ltuE, Funct3E[2:1], BranchFlagE);
   assign BranchTakenE = BranchFlagE ^ Funct3E[0];
     
   assign PCSrcE = JumpE | BranchE & BranchTakenE;

From e3f2a252cdbb69e20c59d431dcea284c158782a5 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Sun, 19 Dec 2021 13:51:46 -0800
Subject: [PATCH 6/7] fixed some small errors in FMA

---
 wally-pipelined/src/fpu/fma.sv | 171 +++++++++++----------------------
 1 file changed, 56 insertions(+), 115 deletions(-)

diff --git a/wally-pipelined/src/fpu/fma.sv b/wally-pipelined/src/fpu/fma.sv
index 6ad3f986..a90848f5 100644
--- a/wally-pipelined/src/fpu/fma.sv
+++ b/wally-pipelined/src/fpu/fma.sv
@@ -28,6 +28,7 @@
 // `define NE   11//(`Q_SUPPORTED ? 15 : `D_SUPPORTED ? 11 : 8)
 // `define NF   52//(`Q_SUPPORTED ? 112 : `D_SUPPORTED ? 52 : 23)
 // `define XLEN 64
+`define NANPAYLOAD 1
 module fma(
     input logic                 clk,
     input logic                 reset,
@@ -117,9 +118,8 @@ module fma1(
     logic [3*`NF+6:0]   AlignedAddendInv;   // aligned addend possibly inverted
     logic [2*`NF+1:0]   ProdManKilled;      // the product's mantissa possibly killed
     logic [3*`NF+4:0]   NegProdManKilled;   // a negated ProdManKilled
-    logic [8:0]         PNormCnt, NNormCnt; // the positive and nagitive LOA results
     logic [3*`NF+6:0]   PreSum, NegPreSum;  // positive and negitve versions of the sum
-
+    logic [`NE-1:0]     XExpVal, YExpVal;   // exponent value after taking into account denormals
     ///////////////////////////////////////////////////////////////////////////////
     // Calculate the product
     //      - When multipliying two fp numbers, add the exponents
@@ -130,7 +130,7 @@ module fma1(
    
 
    // calculate the product's exponent 
-    expadd expadd(.FmtE, .XExpE, .YExpE, .XZeroE, .YZeroE, .XDenormE, .YDenormE, 
+    expadd expadd(.FmtE, .XExpE, .YExpE, .XZeroE, .YZeroE, .XDenormE, .YDenormE, .XExpVal, .YExpVal, 
                     .Denorm, .ProdExpE);
 
     // multiplication of the mantissa's
@@ -140,7 +140,7 @@ module fma1(
     // Alignment shifter
     ///////////////////////////////////////////////////////////////////////////////
 
-    align align(.ZExpE, .ZManE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .ProdExpE, .Denorm,
+    align align(.ZExpE, .ZManE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .ProdExpE, .Denorm, .XExpVal, .YExpVal,
                         .AlignedAddendE, .AddendStickyE, .KillProdE);
                         
     // calculate the signs and take the opperation into account
@@ -150,9 +150,9 @@ module fma1(
     // // Addition/LZA
     // ///////////////////////////////////////////////////////////////////////////////
         
-    add add(.AlignedAddendE, .ProdManE, .PSgnE, .ZSgnEffE, .KillProdE, .AlignedAddendInv, .ProdManKilled, .NegProdManKilled, .NegSumE, .PreSum, .NegPreSum, .InvZE, .XZeroE, .YZeroE);
+    add add(.AlignedAddendE, .ProdManE, .PSgnE, .ZSgnEffE, .KillProdE, .AlignedAddendInv, .ProdManKilled, .NegSumE, .PreSum, .NegPreSum, .InvZE, .XZeroE, .YZeroE);
     
-    loa loa(.A(AlignedAddendInv+{162'b0,InvZE}), .P(ProdManKilled), .NegSumE, .NormCntE);
+    loa loa(.A(AlignedAddendInv+{162'b0,InvZE}), .P(ProdManKilled), .NormCntE);
 
     // Choose the positive sum and accompanying LZA result.
     assign SumE = NegSumE ? NegPreSum[3*`NF+5:0] : PreSum[3*`NF+5:0];
@@ -167,11 +167,11 @@ module expadd(
     input  logic [`NE-1:0]  XExpE, YExpE,  // input exponents
     input  logic            XDenormE, YDenormE,    // are the inputs denormalized
     input  logic            XZeroE, YZeroE,        // are the inputs zero
+    output logic [`NE-1:0]  XExpVal, YExpVal,      // Exponent value after taking into account denormals
     output logic [`NE-1:0]  Denorm,        // value of denormalized exponent
     output logic [`NE+1:0]  ProdExpE       // product's exponent B^(1023)NE+2
 );
 
-    logic [`NE-1:0] XExpVal, YExpVal;       // Exponent value after taking into account denormals
 
     // denormalized numbers have diffrent values depending on which precison it is.
     //      double - 1
@@ -233,6 +233,7 @@ module align(
     input logic  [`NF:0]        ZManE,      // fractions in U(0.NF) format]
     input logic                 ZDenormE,   // is the input denormal
     input logic                 XZeroE, YZeroE, ZZeroE, // is the input zero
+    input logic  [`NE-1:0]      XExpVal, YExpVal,       // Exponent value after taking into account denormals
     input logic  [`NE+1:0]      ProdExpE,       // the product's exponent
     input logic  [`NE-1:0]      Denorm,         // the biased value of a denormalized number
     output logic [3*`NF+5:0]    AlignedAddendE, // Z aligned for addition in U(NF+5.2NF+1)
@@ -254,7 +255,8 @@ module align(
     //      - positive means the product is larger, so shift Z right
     //      - Denormal numbers have a diffrent exponent value depending on the precision
     assign ZExpVal = ZDenormE ? Denorm : ZExpE;
-    assign AlignCnt = ProdExpE - {2'b0, ZExpVal} + (`NF+3);
+    // assign AlignCnt = ProdExpE - {2'b0, ZExpVal} + (`NF+3);
+    assign AlignCnt = XZeroE|YZeroE ? -1 : {2'b0, XExpVal} + {2'b0, YExpVal} - 1020+`NF - {2'b0, ZExpVal};
 
     // Defualt Addition without shifting
     //          |   54'b0    |  106'b(product)  | 2'b0 |
@@ -312,14 +314,14 @@ module add(
     input logic                 PSgnE, ZSgnEffE,// the product and modified Z signs
     input logic                 KillProdE,      // should the product be set to 0
     input logic                 XZeroE, YZeroE, // is the input zero
-    output logic [3*`NF+6:0] AlignedAddendInv,  // aligned addend possibly inverted
-    output logic [2*`NF+1:0] ProdManKilled,     // the product's mantissa possibly killed
-    output logic [3*`NF+4:0] NegProdManKilled,  // a negated ProdManKilled
+    output logic [3*`NF+6:0]    AlignedAddendInv,  // aligned addend possibly inverted
+    output logic [2*`NF+1:0]    ProdManKilled,     // the product's mantissa possibly killed
     output logic                NegSumE,        // was the sum negitive
     output logic                InvZE,          // do you invert Z
-    output logic [3*`NF+6:0]   PreSum, NegPreSum// possibly negitive sum
+    output logic [3*`NF+6:0]    PreSum, NegPreSum// possibly negitive sum
 );
 
+    logic [3*`NF+4:0] NegProdManKilled;  // a negated ProdManKilled
     ///////////////////////////////////////////////////////////////////////////////
     // Addition
     ///////////////////////////////////////////////////////////////////////////////
@@ -334,17 +336,17 @@ module add(
     // Kill the product if the product is too small to effect the addition (determined in fma1.sv)
     assign ProdManKilled = ProdManE&{2*`NF+2{~KillProdE}};
     // Negate ProdMan for LZA and the negitive sum calculation
-    assign NegProdManKilled = {{`NF+3{~(XZeroE|YZeroE|KillProdE)}}, ~ProdManKilled&{2*`NF+2{~(XZeroE|YZeroE)}}};
+    assign NegProdManKilled = {{`NF+3{~(XZeroE|YZeroE|KillProdE)}}, ~ProdManKilled&{2*`NF+2{~(XZeroE|YZeroE|KillProdE)}}};
 
 
-    // Is the sum negitive
-    assign NegSumE = (AlignedAddendE > {54'b0, ProdManKilled, 2'b0})&InvZE; //***use this to avoid addition and final muxing???
 
     // Do the addition
     //      - calculate a positive and negitive sum in parallel
     assign PreSum = AlignedAddendInv + {55'b0, ProdManKilled, 2'b0} + {{3*`NF+6{1'b0}}, InvZE};
-    assign NegPreSum = AlignedAddendE + {NegProdManKilled, 2'b0} + {{(3*`NF+3){1'b0}},~(XZeroE|YZeroE),2'b0};
+    assign NegPreSum = AlignedAddendE + {NegProdManKilled, 2'b0} + {{(3*`NF+3){1'b0}},~(XZeroE|YZeroE|KillProdE),2'b0};
      
+    // Is the sum negative
+    assign NegSumE = PreSum[3*`NF+6];
 
 endmodule
 
@@ -352,28 +354,32 @@ endmodule
 module loa( //https://ieeexplore.ieee.org/abstract/document/930098
     input logic  [3*`NF+6:0] A,     // addend
     input logic  [2*`NF+1:0] P,     // product
-    input logic              NegSumE, // is the sum negitive
     output logic [8:0]       NormCntE   // normalization shift count for the positive result
     ); 
     
-
     logic [3*`NF+6:0] T;
-    logic [3*`NF+5:0] G;
-    logic [3*`NF+5:0] Z;
+    logic [3*`NF+6:0] G;
+    logic [3*`NF+6:0] Z;
     assign T[3*`NF+6:2*`NF+4] = A[3*`NF+6:2*`NF+4];
-    assign G[3*`NF+5:2*`NF+4] = 0;
-    assign Z[3*`NF+5:2*`NF+4] = ~A[3*`NF+5:2*`NF+4];
+    assign G[3*`NF+6:2*`NF+4] = 0;
+    assign Z[3*`NF+6:2*`NF+4] = ~A[3*`NF+6:2*`NF+4];
     assign T[2*`NF+3:2] = A[2*`NF+3:2]^P;
     assign G[2*`NF+3:2] = A[2*`NF+3:2]&P;
     assign Z[2*`NF+3:2] = ~A[2*`NF+3:2]&~P;
     assign T[1:0] = A[1:0];
     assign G[1:0] = 0;
     assign Z[1:0] = ~A[1:0];
-    
+
 
     // Apply function to determine Leading pattern
+    //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
+    //f[n] = ~T[n]&T[n-1]           note: n is the MSB
+    //f[i] = (T[i+1]&(G[i]&~Z[i-1] | Z[i]&~G[i-1])) | (~T[i+1]&(Z[i]&~Z[i-1] | G[i]&~G[i-1]))
     logic [3*`NF+6:0] f;
-    assign f = NegSumE ? T^{~G[3*`NF+5:0],1'b1} : T^{~Z[3*`NF+5:0], 1'b1};
+    assign f[3*`NF+6] = ~T[3*`NF+6]&T[3*`NF+5];
+    assign f[3*`NF+5:0] = (T[3*`NF+6:1]&(G[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | Z[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~T[3*`NF+6:1]&(Z[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));
+
+
 
     lzc lzc(.f, .NormCntE);
   
@@ -426,7 +432,7 @@ module fma2(
 
     logic [`NF-1:0]     ResultFrac; // Result fraction
     logic [`NE-1:0]     ResultExp;  // Result exponent
-    logic               ResultSgn;  // Result sign
+    logic               ResultSgn, ResultSgnTmp;  // Result sign
     logic [`NE+1:0]     SumExp;     // exponent of the normalized sum
     logic [`NE+1:0]     FullResultExp;  // ResultExp with bits to determine sign and overflow
     logic [`NF+2:0]     NormSum;        // normalized sum
@@ -464,7 +470,7 @@ module fma2(
     // round to infinity
     // round to nearest max magnitude
 
-    fmaround fmaround(.FmtM, .FrmM, .Sticky, .UfSticky, .NormSum, .AddendStickyM, .NormSumSticky, .ZZeroM, .InvZM, .ResultSgn, .SumExp,
+    fmaround fmaround(.FmtM, .FrmM, .Sticky, .UfSticky, .NormSum, .AddendStickyM, .NormSumSticky, .ZZeroM, .InvZM, .ResultSgnTmp, .SumExp,
         .CalcPlus1, .Plus1, .UfPlus1, .Minus1, .FullResultExp, .ResultFrac, .ResultExp, .Round, .Guard, .UfLSBNormSum);
 
 
@@ -476,7 +482,7 @@ module fma2(
     ///////////////////////////////////////////////////////////////////////////////
 
  
-    resultsign resultsign(.FrmM, .PSgnM, .ZSgnEffM, .Underflow, .InvZM, .NegSumM, .SumZero, .ResultSgn);
+    resultsign resultsign(.FrmM, .PSgnM, .ZSgnEffM, .Underflow, .InvZM, .NegSumM, .SumZero, .ResultSgnTmp, .ResultSgn);
 
 
 
@@ -512,11 +518,12 @@ module resultsign(
     input logic         InvZM,
     input logic         NegSumM,
     input logic         SumZero,
+    output logic        ResultSgnTmp,
     output logic        ResultSgn
 );
 
     logic ZeroSgn;
-    logic ResultSgnTmp;
+    // logic ResultSgnTmp;
 
     // Determine the sign if the sum is zero
     //      if cancelation then 0 unless round to -infinity
@@ -554,15 +561,24 @@ module resultselect(
 );
     logic [`FLEN-1:0]   XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results
 
-    assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, XManM[`NF-2:0]} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, XManM[50:29]};
-    assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, YManM[`NF-2:0]} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, YManM[50:29]};
-    assign ZNaNResult = FmtM ? {ZSgnEffM, ZExpM, 1'b1, ZManM[`NF-2:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], 1'b1, ZManM[50:29]};
+    generate if(`NANPAYLOAD) begin
+        assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, XManM[`NF-2:0]} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, XManM[50:29]};
+        assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, YManM[`NF-2:0]} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, YManM[50:29]};
+        assign ZNaNResult = FmtM ? {ZSgnEffM, ZExpM, 1'b1, ZManM[`NF-2:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], 1'b1, ZManM[50:29]};
+    end else begin
+        assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, 51'b0} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, 22'b0};
+        assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, 51'b0} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, 22'b0};
+        assign ZNaNResult = FmtM ? {ZSgnEffM, ZExpM, 1'b1, 51'b0} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], 1'b1, 22'b0};
+    end
+    endgenerate
+    
+    
     assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, {`NE-1{1'b1}}, 1'b0, {`NF{1'b1}}} :
                                                                                                                           {ResultSgn, {`NE{1'b1}}, {`NF{1'b0}}} :
                                     ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {{32{1'b1}}, ResultSgn, 8'hfe, {23{1'b1}}} :
                                                                                                                           {{32{1'b1}}, ResultSgn, 8'hff, 23'b0};
     assign InvalidResult = FmtM ? {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}} : {{32{1'b1}}, ResultSgn, 8'hff, 1'b1, 22'b0};
-    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZManM[`NF-1:0]} - {62'b0, (Minus1&AddendStickyM) + (Plus1&AddendStickyM)}} : {{32{1'b1}}, ResultSgn, {ZExpM[`NE-1],ZExpM[6:0], ZManM[51:29]} - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}};
+    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZManM[`NF-1:0]} - {62'b0, (Minus1&AddendStickyM)} + {62'b0, (Plus1&AddendStickyM)}} : {{32{1'b1}}, ResultSgn, {ZExpM[`NE-1],ZExpM[6:0], ZManM[51:29]} - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}};
     assign UnderflowResult = FmtM ? {ResultSgn, {`FLEN-1{1'b0}}} + {63'b0,(CalcPlus1&(AddendStickyM|FrmM[1]))} : {{32{1'b1}}, {ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}};
     assign FMAResM = XNaNM ? XNaNResult :
                         YNaNM ? YNaNResult :
@@ -579,81 +595,6 @@ module resultselect(
 
 endmodule
 
-
-// module normalize(
-//     input logic  [3*`NF+5:0]    SumM,       // the positive sum
-//     input logic  [`NE-1:0]      ZExpM,      // exponent of Z
-//     input logic  [`NE+1:0]      ProdExpM,   // X exponent + Y exponent - bias
-//     input logic  [8:0]          NormCntM,   // normalization shift count
-//     input logic                 FmtM,       // precision 1 = double 0 = single
-//     input logic                 KillProdM,  // is the product set to zero
-//     input logic                 AddendStickyM,  // the sticky bit caclulated from the aligned addend
-//     input logic                 NegSumM,    // was the sum negitive
-//     output logic [`NF+2:0]      NormSum,        // normalized sum
-//     output logic                SumZero,        // is the sum zero
-//     output logic                NormSumSticky, UfSticky,    // sticky bits
-//     output logic [`NE+1:0]      SumExp,         // exponent of the normalized sum
-//     output logic                ResultDenorm    // is the result denormalized
-// );
-//     logic [`NE+1:0]     FracLen;            // length of the fraction
-//     logic [`NE+1:0]     SumExpTmp;          // exponent of the normalized sum not taking into account denormal or zero results
-//     logic [8:0]         DenormShift;        // right shift if the result is denormalized //***change this later
-//     logic [3*`NF+5:0]   CorrSumShifted;     // the shifted sum after LZA correction
-//     logic [3*`NF+7:0]   SumShifted;         // the shifted sum before LZA correction
-//     logic [`NE+1:0]     SumExpTmpTmp;       // the exponent of the normalized sum with the `FLEN bias
-//     logic               PreResultDenorm;    // is the result denormalized - calculated before LZA corection
-//     logic               PreResultDenorm2;    // is the result denormalized - calculated before LZA corection
-//     logic               LZAPlus1;           // add one to the sum's exponent due to LZA correction
-
-//     ///////////////////////////////////////////////////////////////////////////////
-//     // Normalization
-//     ///////////////////////////////////////////////////////////////////////////////
-
-//     // Determine if the sum is zero
-//     assign SumZero = ~(|SumM);
-
-//     // determine the length of the fraction based on precision
-//     assign FracLen = FmtM ? `NF+1 : 13'd24;
-
-//     // calculate the sum's exponent
-//     assign SumExpTmpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCntM} + 1 - (`NF+4)); // ****try moving this into previous stage
-//     assign SumExpTmp = FmtM ? SumExpTmpTmp : (SumExpTmpTmp-1023+127)&{`NE+2{|SumExpTmpTmp}}; // ***move this ^ the subtraction by a constant isn't simplified
-    
-//     logic SumDLTEZ, SumDGEFL, SumSLTEZ, SumSGEFL;
-//     assign SumDLTEZ = SumExpTmpTmp[`NE+1] | ~|SumExpTmpTmp;
-//     assign SumDGEFL = ($signed(SumExpTmpTmp)>=$signed(-(13'd`NF+13'd1)));
-//     assign SumSLTEZ = $signed(SumExpTmpTmp) <= $signed(13'd1023-13'd127);
-//     assign SumSGEFL = ($signed(SumExpTmpTmp)>=$signed(-13'd24+13'd1023-13'd127)) | ~|SumExpTmpTmp;
-//     assign PreResultDenorm2 = (FmtM ? SumDLTEZ : SumSLTEZ) & (FmtM ? SumDGEFL : SumSGEFL) & ~SumZero; //***make sure math good
-//     // always_comb begin
-//     //     assert (PreResultDenorm == PreResultDenorm2) else $fatal ("PreResultDenorms not equal");
-//     // end
-
-
-
-//     // Determine if the result is denormal
-//     // assign PreResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-FracLen)) & ~SumZero;
-
-//     // Determine the shift needed for denormal results
-//     //  - if not denorm add 1 to shift out the leading 1
-//     assign DenormShift = PreResultDenorm2 ? SumExpTmp[8:0] : 1; //*** change this when changing the size of DenormShift also change to an and opperation
-//     // Normalize the sum
-//     assign SumShifted = {2'b0, SumM} << NormCntM+DenormShift; //*** fix mux's with constants in them //***NormCnt can be simplified
-//     // LZA correction
-//     assign LZAPlus1 = SumShifted[3*`NF+7];
-//     assign CorrSumShifted =  LZAPlus1 ? SumShifted[3*`NF+6:1] : SumShifted[3*`NF+5:0];
-//     assign NormSum = CorrSumShifted[3*`NF+5:2*`NF+3];
-//     // Calculate the sticky bit
-//     assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | (|CorrSumShifted[136:2*`NF+3]&~FmtM);
-//     assign UfSticky = AddendStickyM | NormSumSticky;
-
-//     // Determine sum's exponent
-//     assign SumExp = (SumExpTmp+{12'b0, LZAPlus1}+{12'b0, ~|SumExpTmp&SumShifted[3*`NF+6]}) & {`NE+2{~(SumZero|ResultDenorm)}};
-//     // recalculate if the result is denormalized
-//     assign ResultDenorm = PreResultDenorm2&~SumShifted[3*`NF+6]&~SumShifted[3*`NF+7];
-
-// endmodule
-
 module normalize(
     input logic  [3*`NF+5:0]    SumM,       // the positive sum
     input logic  [`NE-1:0]      ZExpM,      // exponent of Z
@@ -733,7 +674,7 @@ module normalize(
     assign LZAPlus1 = SumShifted[3*`NF+7];
     assign LZAPlus2 = SumShifted[3*`NF+8];
 	// the only possible mantissa for a plus two is all zeroes - a one has to propigate all the way through a sum. so we can leave the bottom statement alone
-    assign CorrSumShifted =  LZAPlus1 ? SumShifted[3*`NF+6:1] : SumShifted[3*`NF+5:0];
+    assign CorrSumShifted =  LZAPlus1&~KillProdM ? SumShifted[3*`NF+6:1] : SumShifted[3*`NF+5:0];
     assign NormSum = CorrSumShifted[3*`NF+5:2*`NF+3];
     // Calculate the sticky bit
     assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | (|CorrSumShifted[136:2*`NF+3]&~FmtM);
@@ -757,7 +698,7 @@ module fmaround(
     input logic             ZZeroM,         // is Z zero
     input logic             InvZM,          // invert Z
     input logic  [`NE+1:0]  SumExp,         // exponent of the normalized sum
-    input logic             ResultSgn,      // the result's sign
+    input logic             ResultSgnTmp,      // the result's sign
     output logic            CalcPlus1, Plus1, UfPlus1, Minus1,  // do you add or subtract on from the result
     output logic [`NE+1:0]  FullResultExp,      // ResultExp with bits to determine sign and overflow
     output logic [`NF-1:0]  ResultFrac,         // Result fraction
@@ -824,8 +765,8 @@ module fmaround(
         case (FrmM)
             3'b000: CalcPlus1 = Guard & (Round | ((Sticky)&~(~Round&SubBySmallNum)) | (~Round&~(Sticky)&LSBNormSum&~SubBySmallNum));//round to nearest even
             3'b001: CalcPlus1 = 0;//round to zero
-            3'b010: CalcPlus1 = ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round down
-            3'b011: CalcPlus1 = ~ResultSgn & ~(SubBySmallNum & ~Guard & ~Round);//round up
+            3'b010: CalcPlus1 = ResultSgnTmp & ~(SubBySmallNum & ~Guard & ~Round);//round down
+            3'b011: CalcPlus1 = ~ResultSgnTmp & ~(SubBySmallNum & ~Guard & ~Round);//round up
             3'b100: CalcPlus1 = (Guard & (Round | ((Sticky)&~(~Round&SubBySmallNum)) | (~Round&~(Sticky)&~SubBySmallNum)));//round to nearest max magnitude
             default: CalcPlus1 = 1'bx;
         endcase
@@ -833,8 +774,8 @@ module fmaround(
         case (FrmM)
             3'b000: UfCalcPlus1 = UfGuard & (UfRound | (UfSticky&UfRound|~UfSubBySmallNum) | (~Sticky&UfLSBNormSum&~UfSubBySmallNum));//round to nearest even
             3'b001: UfCalcPlus1 = 0;//round to zero
-            3'b010: UfCalcPlus1 = ResultSgn & ~(UfSubBySmallNum & ~UfGuard & ~UfRound);//round down
-            3'b011: UfCalcPlus1 = ~ResultSgn & ~(UfSubBySmallNum & ~UfGuard & ~UfRound);//round up
+            3'b010: UfCalcPlus1 = ResultSgnTmp & ~(UfSubBySmallNum & ~UfGuard & ~UfRound);//round down
+            3'b011: UfCalcPlus1 = ~ResultSgnTmp & ~(UfSubBySmallNum & ~UfGuard & ~UfRound);//round up
             3'b100: UfCalcPlus1 = (UfGuard & (UfRound | (UfSticky&~(~UfRound&UfSubBySmallNum)) | (~Sticky&~UfSubBySmallNum)));//round to nearest max magnitude
             default: UfCalcPlus1 = 1'bx;
         endcase
@@ -842,8 +783,8 @@ module fmaround(
         case (FrmM)
             3'b000: CalcMinus1 = 0;//round to nearest even
             3'b001: CalcMinus1 = SubBySmallNum & ~Guard & ~Round;//round to zero
-            3'b010: CalcMinus1 = ~ResultSgn & ~Guard & ~Round & SubBySmallNum;//round down
-            3'b011: CalcMinus1 = ResultSgn & ~Guard & ~Round & SubBySmallNum;//round up
+            3'b010: CalcMinus1 = ~ResultSgnTmp & ~Guard & ~Round & SubBySmallNum;//round down
+            3'b011: CalcMinus1 = ResultSgnTmp & ~Guard & ~Round & SubBySmallNum;//round up
             3'b100: CalcMinus1 = 0;//round to nearest max magnitude
             default: CalcMinus1 = 1'bx;
         endcase

From 9e6c9c38c0e78eae6ad768fdd0b30a10c7f09a81 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Sun, 19 Dec 2021 13:53:45 -0800
Subject: [PATCH 7/7] ALUControl cleanup

---
 wally-pipelined/src/ieu/controller.sv | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/wally-pipelined/src/ieu/controller.sv b/wally-pipelined/src/ieu/controller.sv
index b081d40f..040fa018 100644
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@@ -170,10 +170,10 @@ module controller(
   assign CSRZeroSrcD = InstrD[14] ? (InstrD[19:15] == 0) : (Rs1D == 0); // Is a CSR instruction using zero as the source?
   assign CSRWriteD = CSRReadD & !(CSRZeroSrcD && InstrD[13]); // Don't write if setting or clearing zeros
 
-  // ALU Decoding
+  // ALU Decoding is lazy, only using funct7[5] to distinguish add/sub and srl/sra
   assign sltD = (Funct3D == 3'b010);
   assign sltuD = (Funct3D == 3'b011);
-  assign subD = (Funct3D == 3'b000 & Funct7D[5] & OpD[5]);  // OpD[5] needed; ***explain why
+  assign subD = (Funct3D == 3'b000 & Funct7D[5] & OpD[5]);  // OpD[5] needed to distinguish sub from addi
   assign sraD = (Funct3D == 3'b101 & Funct7D[5]);
   assign SubArithD = ALUOpD & (subD | sraD | sltD | sltuD); // TRUE for R-type subtracts and sra, slt, sltu
   assign ALUControlD = {W64D, SubArithD, ALUOpD};
@@ -205,12 +205,11 @@ module controller(
   assign {eqE, ltE, ltuE} = FlagsE;
   mux4 #(1) branchflagmux(eqE, 1'b0, ltE, ltuE, Funct3E[2:1], BranchFlagE);
   assign BranchTakenE = BranchFlagE ^ Funct3E[0];
-    
   assign PCSrcE = JumpE | BranchE & BranchTakenE;
 
+  // Other execute stage controller signals
   assign MemReadE = MemRWE[1];
   assign SCE = (ResultSrcE == 3'b100);
-
   assign RegWriteE = IEURegWriteE | FWriteIntE; // IRF register writes could come from IEU or FPU controllers
   
   // Memory stage pipeline control register