From 648c09e5ef15f517392ecd831f4953d371b9169c Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 2 Jul 2021 11:04:13 -0400
Subject: [PATCH 1/4] Optimized PMP checker logic and added support for
 configurable number of PMP registers

---
 wally-pipelined/src/ifu/ifu.sv                |  2 +-
 wally-pipelined/src/lsu/lsu.sv                |  2 +-
 wally-pipelined/src/mmu/mmu.sv                |  4 +-
 wally-pipelined/src/mmu/pmpchecker.sv         | 79 ++++++++++--------
 wally-pipelined/src/privileged/csr.sv         |  2 +-
 wally-pipelined/src/privileged/csrm.sv        | 80 +++++++++----------
 wally-pipelined/src/privileged/privileged.sv  |  2 +-
 .../src/wally/wallypipelinedhart.sv           |  2 +-
 8 files changed, 89 insertions(+), 84 deletions(-)

diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv
index afae5ff4f..b08a1503e 100644
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@@ -79,7 +79,7 @@ module ifu (
   input  logic [2:0]       HSIZE, HBURST,
   input  logic             HWRITE,
   input  logic             ExecuteAccessF, //read, write, and atomic access are all set to zero because this mmu is onlt working with instructinos in the F stage.
-  input  logic [63:0]      PMPCFG01_REGW, PMPCFG23_REGW, // *** all of these come from the privileged unit, so they're gonna have to come over into ifu and dmem
+  input  var logic [63:0]      PMPCFG_ARRAY_REGW[`PMP_ENTRIES/8-1:0],
   input  var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0], 
 
   output logic             PMPInstrAccessFaultF, PMAInstrAccessFaultF,
diff --git a/wally-pipelined/src/lsu/lsu.sv b/wally-pipelined/src/lsu/lsu.sv
index ffa79adfe..8c9de2ff7 100644
--- a/wally-pipelined/src/lsu/lsu.sv
+++ b/wally-pipelined/src/lsu/lsu.sv
@@ -70,7 +70,7 @@ module lsu (
   input  logic [2:0]       HSIZE, HBURST,
   input  logic             HWRITE,
   input  logic             AtomicAccessM, WriteAccessM, ReadAccessM, // execute access is hardwired to zero in this mmu because we're only working with data in the M stage.
-  input  logic [63:0]      PMPCFG01_REGW, PMPCFG23_REGW, // *** all of these come from the privileged unit, so thwyre gonna have to come over into ifu and dmem
+  input  var logic [63:0]      PMPCFG_ARRAY_REGW[`PMP_ENTRIES/8-1:0],
   input  var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0], // *** this one especially has a large note attached to it in pmpchecker.
 
   output  logic            PMALoadAccessFaultM, PMAStoreAccessFaultM,
diff --git a/wally-pipelined/src/mmu/mmu.sv b/wally-pipelined/src/mmu/mmu.sv
index ff315f128..32309baaf 100644
--- a/wally-pipelined/src/mmu/mmu.sv
+++ b/wally-pipelined/src/mmu/mmu.sv
@@ -70,8 +70,8 @@ module mmu #(parameter ENTRY_BITS = 3,
   input  logic [2:0]       HSIZE, HBURST,
   input  logic             HWRITE,
   input  logic             AtomicAccessM, ExecuteAccessF, WriteAccessM, ReadAccessM,
-  input  logic [63:0]      PMPCFG01_REGW, PMPCFG23_REGW, // *** all of these come from the privileged unit, so thwyre gonna have to come over into ifu and dmem
-  input  var logic  [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0], 
+  input  var logic [63:0]      PMPCFG_ARRAY_REGW[`PMP_ENTRIES/8-1:0],
+  input  var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0], 
 
   output logic             SquashBusAccess, // *** send to privileged unit
   output logic             PMPInstrAccessFaultF, PMPLoadAccessFaultM, PMPStoreAccessFaultM,
diff --git a/wally-pipelined/src/mmu/pmpchecker.sv b/wally-pipelined/src/mmu/pmpchecker.sv
index f88d56fa0..5344249c7 100644
--- a/wally-pipelined/src/mmu/pmpchecker.sv
+++ b/wally-pipelined/src/mmu/pmpchecker.sv
@@ -35,7 +35,6 @@ module pmpchecker (
 
   input  logic [1:0]       PrivilegeModeW,
 
-  input  logic [63:0]      PMPCFG01_REGW, PMPCFG23_REGW,
 
   // *** ModelSim has a switch -svinputport which controls whether input ports
   // are nets (wires) or vars by default. The default setting of this switch is
@@ -48,6 +47,7 @@ module pmpchecker (
   // boundary. It would be better to store the PMP address registers in a module
   // somewhere in the CSR hierarchy and do PMP checking _within_ that module, so
   // we don't have to pass around 16 whole registers.
+  input  var logic [63:0]      PMPCFG_ARRAY_REGW[`PMP_ENTRIES/8-1:0],
   input  var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0],
 
   input  logic             ExecuteAccessF, WriteAccessM, ReadAccessM,
@@ -60,29 +60,23 @@ module pmpchecker (
 );
 
   // Bit i is high when the address falls in PMP region i
-  logic [15:0] Regions;
-  logic [3:0]  MatchedRegion;
-  logic        Match, EnforcePMP;
+  logic [`PMP_ENTRIES-1:0] Regions, FirstMatch;
+  //logic [3:0]  MatchedRegion;
+  logic        EnforcePMP;
 
-  logic [7:0] PMPCFG [15:0];
+  logic [7:0] PMPCFG [`PMP_ENTRIES-1:0];
 
   // Bit i is high when the address is greater than or equal to PMPADR[i]
   // Used for determining whether TOR PMP regions match
-  logic [15:0] AboveRegion;
+  logic [`PMP_ENTRIES-1:0] AboveRegion;
 
   // Bit i is high if PMP register i is non-null
-  logic [15:0] ActiveRegion;
+  logic [`PMP_ENTRIES-1:0] ActiveRegion;
 
-  logic L_Bit, X_Bit, W_Bit, R_Bit;
-  logic InvalidExecute, InvalidWrite, InvalidRead;
+  logic [`PMP_ENTRIES-1:0] L_Bits, X_Bits, W_Bits, R_Bits;
+  //logic InvalidExecute, InvalidWrite, InvalidRead;
 
-  // *** extend to optionally 64 configurations
-
-  assign {PMPCFG[15], PMPCFG[14], PMPCFG[13], PMPCFG[12],
-          PMPCFG[11], PMPCFG[10], PMPCFG[9], PMPCFG[8]} = PMPCFG23_REGW;
-
-  assign {PMPCFG[7], PMPCFG[6], PMPCFG[5], PMPCFG[4],
-          PMPCFG[3], PMPCFG[2], PMPCFG[1], PMPCFG[0]} = PMPCFG01_REGW;
+  genvar i,j;
 
   pmpadrdec pmpadrdec(.HADDR(HADDR), .AdrMode(PMPCFG[0][4:3]),
                       .CurrentPMPAdr(PMPADDR_ARRAY_REGW[0]),
@@ -92,7 +86,6 @@ module pmpchecker (
   assign ActiveRegion[0] = |PMPCFG[0][4:3];
 
   generate // *** only for PMP_ENTRIES > 0
-    genvar i;
     for (i = 1; i < `PMP_ENTRIES; i++) begin
       pmpadrdec pmpadrdec(.HADDR(HADDR), .AdrMode(PMPCFG[i][4:3]),
                           .CurrentPMPAdr(PMPADDR_ARRAY_REGW[i]),
@@ -104,12 +97,34 @@ module pmpchecker (
     end
   endgenerate
 
-  assign Match = |Regions;
+  //assign Match = |Regions; 
 
-  // Only enforce PMP checking for S and U modes when at least one PMP is active
-  assign EnforcePMP = |ActiveRegion;
-
-  // *** extend to up to 64, fold bit extraction to avoid need for binary encoding of region
+  // verilator lint_off UNOPTFLAT
+  logic [`PMP_ENTRIES-1:0] NoLowerMatch;
+//  assign NoLowerMatch[0] = 1;
+  generate
+    // verilator lint_off WIDTH
+    for (j=0; j<`PMP_ENTRIES; j = j+8) begin
+      assign {PMPCFG[j+7], PMPCFG[j+6], PMPCFG[j+5], PMPCFG[j+4],
+              PMPCFG[j+3], PMPCFG[j+2], PMPCFG[j+1], PMPCFG[j]} = PMPCFG_ARRAY_REGW[j/8];
+    end
+    // verilator lint_on WIDTH
+    for (i=0; i<`PMP_ENTRIES; i++) begin // priority chain: the lowest-numbered matching region is the one selected
+      if (i==0) begin
+        assign FirstMatch[i] = Regions[i];    // region 0 has no lower-numbered competitor
+        assign NoLowerMatch[i] = ~Regions[i]; // NoLowerMatch[i] is high when none of regions 0..i match
+      end else begin
+        assign FirstMatch[i] = Regions[i] & NoLowerMatch[i-1]; // use [i-1]: NoLowerMatch[i] already contains ~Regions[i] and would force this to 0
+        assign NoLowerMatch[i] = NoLowerMatch[i-1] & ~Regions[i];
+      end
+      assign L_Bits[i] = PMPCFG[i][7] & FirstMatch[i]; // cfg bit 7, gated so only the selected region contributes
+      assign X_Bits[i] = PMPCFG[i][2] & FirstMatch[i]; // cfg bit 2
+      assign W_Bits[i] = PMPCFG[i][1] & FirstMatch[i]; // cfg bit 1
+      assign R_Bits[i] = PMPCFG[i][0] & FirstMatch[i]; // cfg bit 0
+    end
+    // verilator lint_on UNOPTFLAT
+  endgenerate
+/*  // *** extend to up to 64, fold bit extraction to avoid need for binary encoding of region
   always_comb
     casez (Regions)
       16'b???????????????1: MatchedRegion = 0;
@@ -134,22 +149,18 @@ module pmpchecker (
   assign L_Bit = PMPCFG[MatchedRegion][7] && Match;
   assign X_Bit = PMPCFG[MatchedRegion][2] && Match;
   assign W_Bit = PMPCFG[MatchedRegion][1] && Match;
-  assign R_Bit = PMPCFG[MatchedRegion][0] && Match;
+  assign R_Bit = PMPCFG[MatchedRegion][0] && Match; 
 
   assign InvalidExecute = ExecuteAccessF && ~X_Bit;
   assign InvalidWrite   = WriteAccessM   && ~W_Bit;
-  assign InvalidRead    = ReadAccessM    && ~R_Bit;
+  assign InvalidRead    = ReadAccessM    && ~R_Bit;*/
 
-  // *** don't cause faults when there are no PMPs
-  assign PMPInstrAccessFaultF = (PrivilegeModeW == `M_MODE) ?
-                                  Match && L_Bit && InvalidExecute :
-                                  EnforcePMP && InvalidExecute;
-  assign PMPStoreAccessFaultM = (PrivilegeModeW == `M_MODE) ?
-                                  Match && L_Bit && InvalidWrite :
-                                  EnforcePMP && InvalidWrite;
-  assign PMPLoadAccessFaultM  = (PrivilegeModeW == `M_MODE) ?
-                                  Match && L_Bit && InvalidRead :
-                                  EnforcePMP && InvalidRead;
+  // Only enforce PMP checking for S and U modes when at least one PMP is active or in Machine mode when L bit is set in selected region
+  assign EnforcePMP = (PrivilegeModeW == `M_MODE) ? |L_Bits : |ActiveRegion;
+
+  assign PMPInstrAccessFaultF = EnforcePMP && ExecuteAccessF && ~|X_Bits;
+  assign PMPStoreAccessFaultM = EnforcePMP && WriteAccessM   && ~|W_Bits;
+  assign PMPLoadAccessFaultM  = EnforcePMP && ReadAccessM    && ~|R_Bits;
 
   assign PMPSquashBusAccess = PMPInstrAccessFaultF || PMPLoadAccessFaultM || PMPStoreAccessFaultM;
 
diff --git a/wally-pipelined/src/privileged/csr.sv b/wally-pipelined/src/privileged/csr.sv
index 213bcde33..dfac55711 100644
--- a/wally-pipelined/src/privileged/csr.sv
+++ b/wally-pipelined/src/privileged/csr.sv
@@ -60,7 +60,7 @@ module csr #(parameter
   output logic             STATUS_MIE, STATUS_SIE,
   output logic             STATUS_MXR, STATUS_SUM,
   output logic             STATUS_MPRV,
-  output logic [63:0]      PMPCFG01_REGW, PMPCFG23_REGW,
+  output var logic [63:0]      PMPCFG_ARRAY_REGW[`PMP_ENTRIES/8-1:0],
   output var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0],
   input  logic [4:0]       SetFflagsM,
   output logic [2:0]       FRM_REGW, 
diff --git a/wally-pipelined/src/privileged/csrm.sv b/wally-pipelined/src/privileged/csrm.sv
index 33b903a83..f30ebb4ff 100644
--- a/wally-pipelined/src/privileged/csrm.sv
+++ b/wally-pipelined/src/privileged/csrm.sv
@@ -48,25 +48,9 @@ module csrm #(parameter
   MTVAL = 12'h343,
   MIP = 12'h344,
   PMPCFG0 = 12'h3A0,
-  PMPCFG1 = 12'h3A1,
-  PMPCFG2 = 12'h3A2,
-  PMPCFG3 = 12'h3A3,
+  // .. up to 15 more at consecutive addresses
   PMPADDR0 = 12'h3B0,
-  PMPADDR1 = 12'h3B1,
-  PMPADDR2 = 12'h3B2,
-  PMPADDR3 = 12'h3B3,
-  PMPADDR4 = 12'h3B4,
-  PMPADDR5 = 12'h3B5,
-  PMPADDR6 = 12'h3B6,
-  PMPADDR7 = 12'h3B7,
-  PMPADDR8 = 12'h3B8,
-  PMPADDR9 = 12'h3B9,
-  PMPADDR10 = 12'h3BA,
-  PMPADDR11 = 12'h3BB,
-  PMPADDR12 = 12'h3BC,
-  PMPADDR13 = 12'h3BD,
-  PMPADDR14 = 12'h3BE,
-  PMPADDR15 = 12'h3BF,
+  // ... up to 63 more at consecutive addresses
   TSELECT = 12'h7A0,
   TDATA1 = 12'h7A1,
   TDATA2 = 12'h7A2,
@@ -90,7 +74,7 @@ module csrm #(parameter
     output logic [31:0]      MCOUNTEREN_REGW, MCOUNTINHIBIT_REGW, 
     output logic [`XLEN-1:0]      MEDELEG_REGW, MIDELEG_REGW,
     // 64-bit registers in RV64, or two 32-bit registers in RV32
-    output logic [63:0]      PMPCFG01_REGW, PMPCFG23_REGW,
+    output var logic [63:0]      PMPCFG_ARRAY_REGW[`PMP_ENTRIES/8-1:0],
     output var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0],
     input  logic [11:0]      MIP_REGW, MIE_REGW,
     output logic             WriteMSTATUSM,
@@ -103,8 +87,8 @@ module csrm #(parameter
   logic            WriteMTVECM, WriteMEDELEGM, WriteMIDELEGM;
   logic            WriteMSCRATCHM, WriteMEPCM, WriteMCAUSEM, WriteMTVALM;
   logic            WriteMCOUNTERENM, WriteMCOUNTINHIBITM;
-  logic            WritePMPCFG0M, WritePMPCFG2M;
-  logic            WritePMPADDRM [15:0]; 
+  logic [`PMP_ENTRIES/8-1:0] WritePMPCFGM, WritePMPCFGHM ;
+  logic [`PMP_ENTRIES-1:0]   WritePMPADDRM ; 
 
   localparam MISA_26 = (`MISA) & 32'h03ffffff;
 
@@ -120,7 +104,7 @@ module csrm #(parameter
   assign WriteMEPCM = MTrapM | (CSRMWriteM && (CSRAdrM == MEPC)) && ~StallW;
   assign WriteMCAUSEM = MTrapM | (CSRMWriteM && (CSRAdrM == MCAUSE)) && ~StallW;
   assign WriteMTVALM = MTrapM | (CSRMWriteM && (CSRAdrM == MTVAL)) && ~StallW;
-  assign WritePMPCFG0M = (CSRMWriteM && (CSRAdrM == PMPCFG0)) && ~StallW;
+/*  assign WritePMPCFG0M = (CSRMWriteM && (CSRAdrM == PMPCFG0)) && ~StallW;
   assign WritePMPCFG2M = (CSRMWriteM && (CSRAdrM == PMPCFG2)) && ~StallW;
   assign WritePMPADDRM[0] = (CSRMWriteM && (CSRAdrM == PMPADDR0)) && ~StallW;
   assign WritePMPADDRM[1] = (CSRMWriteM && (CSRAdrM == PMPADDR1)) && ~StallW;
@@ -137,10 +121,13 @@ module csrm #(parameter
   assign WritePMPADDRM[12] = (CSRMWriteM && (CSRAdrM == PMPADDR12)) && ~StallW;
   assign WritePMPADDRM[13] = (CSRMWriteM && (CSRAdrM == PMPADDR13)) && ~StallW;
   assign WritePMPADDRM[14] = (CSRMWriteM && (CSRAdrM == PMPADDR14)) && ~StallW;
-  assign WritePMPADDRM[15] = (CSRMWriteM && (CSRAdrM == PMPADDR15)) && ~StallW;
+  assign WritePMPADDRM[15] = (CSRMWriteM && (CSRAdrM == PMPADDR15)) && ~StallW; */
   assign WriteMCOUNTERENM = CSRMWriteM && (CSRAdrM == MCOUNTEREN) && ~StallW;
   assign WriteMCOUNTINHIBITM = CSRMWriteM && (CSRAdrM == MCOUNTINHIBIT) && ~StallW;
 
+
+
+
   assign IllegalCSRMWriteReadonlyM = CSRMWriteM && (CSRAdrM == MVENDORID || CSRAdrM == MARCHID || CSRAdrM == MIMPID || CSRAdrM == MHARTID);
 
   // CSRs
@@ -172,33 +159,39 @@ module csrm #(parameter
   flopenl #(32)   MCOUNTINHIBITreg(clk, reset, WriteMCOUNTINHIBITM, CSRWriteValM[31:0], 32'hFFFFFFFF, MCOUNTINHIBIT_REGW);
 
   // There are PMP_ENTRIES = 0, 16, or 64 PMPADDR registers, each of which has its own flop
+
+  // *** need to add support for locked PMPCFG and PMPADR
+  genvar i;
   generate
-    genvar i;
-    for (i = 0; i < `PMP_ENTRIES; i++) begin: pmp_flop
+    for(i=0; i<`PMP_ENTRIES; i++) begin
+      assign WritePMPADDRM[i] = (CSRMWriteM && (CSRAdrM == PMPADDR0+i)) && ~StallW;
       flopenr #(`XLEN) PMPADDRreg(clk, reset, WritePMPADDRM[i], CSRWriteValM, PMPADDR_ARRAY_REGW[i]);
     end
+    for (i=0; i<`PMP_ENTRIES/8; i++) begin
+      if (`XLEN==64) begin
+        assign WritePMPCFGM[i] = (CSRMWriteM && (CSRAdrM == PMPCFG0+2*i)) && ~StallW;
+        flopenr #(`XLEN) PMPCFGreg(clk, reset, WritePMPCFGM[i], CSRWriteValM, PMPCFG_ARRAY_REGW[i]);
+      end else begin
+        assign WritePMPCFGM[i]  = (CSRMWriteM && (CSRAdrM == PMPCFG0+2*i)) && ~StallW;
+        assign WritePMPCFGHM[i] = (CSRMWriteM && (CSRAdrM == PMPCFG0+2*i+1)) && ~StallW;
+        flopenr #(`XLEN) PMPCFGreg(clk, reset, WritePMPCFGM[i], CSRWriteValM, PMPCFG_ARRAY_REGW[i][31:0]);
+        flopenr #(`XLEN) PMPCFGHreg(clk, reset, WritePMPCFGHM[i], CSRWriteValM, PMPCFG_ARRAY_REGW[i][63:32]);
+      end
+    end
   endgenerate
 
-  // PMPCFG registers are a pair of 64-bit in RV64 and four 32-bit in RV32
-  generate
-    if (`XLEN==64) begin
-      flopenr #(`XLEN) PMPCFG01reg(clk, reset, WritePMPCFG0M, CSRWriteValM, PMPCFG01_REGW);
-      flopenr #(`XLEN) PMPCFG23reg(clk, reset, WritePMPCFG2M, CSRWriteValM, PMPCFG23_REGW);      
-    end else begin
-      logic WritePMPCFG1M, WritePMPCFG3M;
-      assign WritePMPCFG1M = MTrapM | (CSRMWriteM && (CSRAdrM == PMPCFG1));
-      assign WritePMPCFG3M = MTrapM | (CSRMWriteM && (CSRAdrM == PMPCFG3));
-      flopenr #(`XLEN) PMPCFG0reg(clk, reset, WritePMPCFG0M, CSRWriteValM, PMPCFG01_REGW[31:0]);
-      flopenr #(`XLEN) PMPCFG1reg(clk, reset, WritePMPCFG1M, CSRWriteValM, PMPCFG01_REGW[63:32]);            
-      flopenr #(`XLEN) PMPCFG2reg(clk, reset, WritePMPCFG2M, CSRWriteValM, PMPCFG23_REGW[31:0]);
-      flopenr #(`XLEN) PMPCFG3reg(clk, reset, WritePMPCFG3M, CSRWriteValM, PMPCFG23_REGW[63:32]);            
-    end
-  endgenerate
   // Read machine mode CSRs
+  // verilator lint_off WIDTH
   always_comb begin
     IllegalCSRMAccessM = !(`S_SUPPORTED | `U_SUPPORTED & `N_SUPPORTED) && 
                           (CSRAdrM == MEDELEG || CSRAdrM == MIDELEG); // trap on DELEG register access when no S or N-mode
-    case (CSRAdrM) 
+    if (CSRAdrM >= PMPADDR0 && CSRAdrM < PMPADDR0 + `PMP_ENTRIES) // reading a PMPADDR entry
+      CSRMReadValM = PMPADDR_ARRAY_REGW[CSRAdrM - PMPADDR0];
+    else if (CSRAdrM >= PMPCFG0 && CSRAdrM < PMPCFG0 + `PMP_ENTRIES/4) begin // two CSR addresses per 64-bit array entry, mirroring the PMPCFG0+2*i write decode
+      if (~CSRAdrM[0]) CSRMReadValM = PMPCFG_ARRAY_REGW[(CSRAdrM - PMPCFG0) >> 1][`XLEN-1:0]; // even address: low XLEN bits of the entry
+      else             CSRMReadValM = {{(`XLEN-32){1'b0}}, PMPCFG_ARRAY_REGW[(CSRAdrM - PMPCFG0) >> 1][63:32]}; // odd address: upper 32 bits, zero-extended
+    end
+    else case (CSRAdrM) 
       MISA_ADR:  CSRMReadValM = MISA_REGW;
       MVENDORID: CSRMReadValM = 0;
       MARCHID:   CSRMReadValM = 0;
@@ -219,7 +212,7 @@ module csrm #(parameter
       MTVAL:     CSRMReadValM = MTVAL_REGW;
       MCOUNTEREN:CSRMReadValM = {{(`XLEN-32){1'b0}}, MCOUNTEREN_REGW};
       MCOUNTINHIBIT:CSRMReadValM = {{(`XLEN-32){1'b0}}, MCOUNTINHIBIT_REGW};
-      PMPCFG0:   CSRMReadValM = PMPCFG01_REGW[`XLEN-1:0];
+/*      PMPCFG0:   CSRMReadValM = PMPCFG01_REGW[`XLEN-1:0];
       PMPCFG1:   CSRMReadValM = {{(`XLEN-32){1'b0}}, PMPCFG01_REGW[63:32]};
       PMPCFG2:   CSRMReadValM = PMPCFG23_REGW[`XLEN-1:0];
       PMPCFG3:   CSRMReadValM = {{(`XLEN-32){1'b0}}, PMPCFG23_REGW[63:32]};
@@ -238,11 +231,12 @@ module csrm #(parameter
       PMPADDR12: CSRMReadValM = PMPADDR_ARRAY_REGW[12];
       PMPADDR13: CSRMReadValM = PMPADDR_ARRAY_REGW[13];
       PMPADDR14: CSRMReadValM = PMPADDR_ARRAY_REGW[14];
-      PMPADDR15: CSRMReadValM = PMPADDR_ARRAY_REGW[15];
+      PMPADDR15: CSRMReadValM = PMPADDR_ARRAY_REGW[15]; */
       default: begin
                  CSRMReadValM = 0;
                  IllegalCSRMAccessM = 1;
       end
     endcase
   end
+  // verilator lint_on WIDTH
 endmodule
diff --git a/wally-pipelined/src/privileged/privileged.sv b/wally-pipelined/src/privileged/privileged.sv
index 1275cd4b8..5ed8c8807 100644
--- a/wally-pipelined/src/privileged/privileged.sv
+++ b/wally-pipelined/src/privileged/privileged.sv
@@ -68,7 +68,7 @@ module privileged (
   output logic [1:0]       PrivilegeModeW,
   output logic [`XLEN-1:0] SATP_REGW,
   output logic             STATUS_MXR, STATUS_SUM,
-  output logic [63:0]      PMPCFG01_REGW, PMPCFG23_REGW,
+  output var logic [63:0]      PMPCFG_ARRAY_REGW[`PMP_ENTRIES/8-1:0],
   output var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0], 
   output logic [2:0]       FRM_REGW
 );
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index a77c3ab01..9358417b1 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -126,7 +126,7 @@ module wallypipelinedhart (
   logic             DSquashBusAccessM, ISquashBusAccessF;
   logic [5:0]            DHSELRegionsM, IHSELRegionsF;
   var logic [`XLEN-1:0] PMPADDR_ARRAY_REGW [`PMP_ENTRIES-1:0];
-  logic [63:0]      PMPCFG01_REGW, PMPCFG23_REGW; // signals being sent from privileged unit to pmp/pma in dmem and ifu.
+  var logic [63:0]      PMPCFG_ARRAY_REGW[`PMP_ENTRIES/8-1:0];
   assign            HSELRegions = ExecuteAccessF ? IHSELRegionsF : DHSELRegionsM; // *** this is a pure guess on how one of these should be selected. it passes tests, but is it the right way to do this?
 
   // IMem stalls

From cd6cabac2f1669784e3e7184911a69ab3d32c064 Mon Sep 17 00:00:00 2001
From: David Harris <david_harris@hmc.edu>
Date: Fri, 2 Jul 2021 11:05:25 -0400
Subject: [PATCH 2/4] Optimized PMP checker logic and added support for
 configurable number of PMP registers

---
 wally-pipelined/config/rv64ic/wally-config.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wally-pipelined/config/rv64ic/wally-config.vh b/wally-pipelined/config/rv64ic/wally-config.vh
index 954e126bb..a6f1c0133 100644
--- a/wally-pipelined/config/rv64ic/wally-config.vh
+++ b/wally-pipelined/config/rv64ic/wally-config.vh
@@ -53,7 +53,7 @@
 `define DTLB_ENTRY_BITS 5
 
 // Legal number of PMP entries are 0, 16, or 64
-`define PMP_ENTRIES 16
+`define PMP_ENTRIES 64
 
 // Address space
 `define RESET_VECTOR 64'h0000000080000000

From 3f61e313d2707d69c8b1db73806941eb0d536c32 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Fri, 2 Jul 2021 12:40:58 -0400
Subject: [PATCH 3/4] FPU update

---
 wally-pipelined/src/fpu/FPregfile.sv        |  54 --
 wally-pipelined/src/fpu/bk128.sv            | 599 --------------------
 wally-pipelined/src/fpu/bk13.sv             |  97 ----
 wally-pipelined/src/fpu/bk14.sv             |  86 ---
 wally-pipelined/src/fpu/csa.sv              |  70 ---
 wally-pipelined/src/fpu/divconv.sv          |  18 +-
 wally-pipelined/src/fpu/fctrl.sv            |  67 ++-
 wally-pipelined/src/fpu/fma2.sv             |  10 +-
 wally-pipelined/src/fpu/fpadd_denorm.sv     |   4 +-
 wally-pipelined/src/fpu/fpdiv.sv            | 256 ---------
 wally-pipelined/src/fpu/fpu.sv              | 369 ++++++------
 wally-pipelined/src/fpu/fpuaddcvt1.sv       |   4 +-
 wally-pipelined/src/fpu/fpuaddcvt2.sv       |  14 +-
 wally-pipelined/src/fpu/fpuclassify.sv      |  50 --
 wally-pipelined/src/fpu/fpucmp1.sv          | 465 ---------------
 wally-pipelined/src/fpu/fpucmp2.sv          | 243 --------
 wally-pipelined/src/fpu/fpuhazard.sv        |  67 ---
 wally-pipelined/src/fpu/freg.sv             | 515 -----------------
 wally-pipelined/src/fpu/fsgn.sv             |  19 +-
 wally-pipelined/src/fpu/ling_bk13.sv        |  89 ---
 wally-pipelined/src/fpu/lzd_denorm.sv       |   1 +
 wally-pipelined/src/fpu/mult_R4_64_64_cs.sv |   0
 wally-pipelined/src/fpu/rounder_denorm.sv   |   6 +-
 wally-pipelined/src/fpu/sbtm_a4.sv          | 204 -------
 wally-pipelined/src/fpu/sk14.sv             |  90 ---
 25 files changed, 254 insertions(+), 3143 deletions(-)
 delete mode 100644 wally-pipelined/src/fpu/FPregfile.sv
 delete mode 100755 wally-pipelined/src/fpu/bk128.sv
 delete mode 100755 wally-pipelined/src/fpu/bk13.sv
 delete mode 100755 wally-pipelined/src/fpu/bk14.sv
 delete mode 100644 wally-pipelined/src/fpu/csa.sv
 delete mode 100755 wally-pipelined/src/fpu/fpdiv.sv
 delete mode 100644 wally-pipelined/src/fpu/fpuclassify.sv
 delete mode 100755 wally-pipelined/src/fpu/fpucmp1.sv
 delete mode 100755 wally-pipelined/src/fpu/fpucmp2.sv
 delete mode 100644 wally-pipelined/src/fpu/fpuhazard.sv
 delete mode 100755 wally-pipelined/src/fpu/freg.sv
 delete mode 100755 wally-pipelined/src/fpu/ling_bk13.sv
 mode change 100755 => 100644 wally-pipelined/src/fpu/mult_R4_64_64_cs.sv
 delete mode 100755 wally-pipelined/src/fpu/sbtm_a4.sv
 delete mode 100755 wally-pipelined/src/fpu/sk14.sv

diff --git a/wally-pipelined/src/fpu/FPregfile.sv b/wally-pipelined/src/fpu/FPregfile.sv
deleted file mode 100644
index 99d18bce9..000000000
--- a/wally-pipelined/src/fpu/FPregfile.sv
+++ /dev/null
@@ -1,54 +0,0 @@
-///////////////////////////////////////////
-// regfile.sv
-//
-// Written: David_Harris@hmc.edu 9 January 2021
-// Modified: 
-//
-// Purpose: 4-port register file
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module FPregfile (
-  input  logic             clk, reset,
-  input  logic             we4, 
-  input  logic [ 4:0]      a1, a2, a3, a4, 
-  input  logic [63:0] wd4,    //KEP `XLEN-1 changed to 63 (lint warning) *** figure out if double can be suported when XLEN = 32
-  output logic [63:0] rd1, rd2, rd3);
-
-  logic [63:0] rf[31:0];
-  integer i;
-
-  // three ported register file
-  // read three ports combinationally (A1/RD1, A2/RD2, A3/RD3)
-  // write fourth port on rising edge of clock (A4/WD4/WE4)
-  // write occurs on falling edge of clock
-  
-  // reset is intended for simulation only, not synthesis
-    
-   always_ff @(negedge clk or posedge reset)
-     if (reset) for(i=0; i<32; i++) rf[i] <= 0;
-     else if (we4) rf[a4] <= wd4;	
-   
-   assign #2 rd1 = rf[a1];
-   assign #2 rd2 = rf[a2];
-   assign #2 rd3 = rf[a3];
-   
-endmodule // regfile
-
diff --git a/wally-pipelined/src/fpu/bk128.sv b/wally-pipelined/src/fpu/bk128.sv
deleted file mode 100755
index a302a0310..000000000
--- a/wally-pipelined/src/fpu/bk128.sv
+++ /dev/null
@@ -1,599 +0,0 @@
-// Brent-Kung Carry-save Prefix Adder
-
-module bk128 (cout, sum, a, b, cin);
-   
-   input [127:0] a, b;
-   input 	 cin;
-   
-   output [127:0] sum;
-   output 	  cout;
-
-   wire [128:0]   p,g,t;
-   wire [127:0]   c;
-
-   // pre-computation
-   assign p={a^b,1'b0};
-   assign g={a&b, cin};
-   assign t[1]=p[1];
-   assign t[2]=p[2];
-   assign t[3]=p[3]^g[2];
-   assign t[4]=p[4];
-   assign t[5]=p[5]^g[4];
-   assign t[6]=p[6];
-   assign t[7]=p[7]^g[6];
-   assign t[8]=p[8];
-   assign t[9]=p[9]^g[8];
-   assign t[10]=p[10];
-   assign t[11]=p[11]^g[10];
-   assign t[12]=p[12];
-   assign t[13]=p[13]^g[12];
-   assign t[14]=p[14];
-   assign t[15]=p[15]^g[14];
-   assign t[16]=p[16];
-   assign t[17]=p[17]^g[16];
-   assign t[18]=p[18];
-   assign t[19]=p[19]^g[18];
-   assign t[20]=p[20];
-   assign t[21]=p[21]^g[20];
-   assign t[22]=p[22];
-   assign t[23]=p[23]^g[22];
-   assign t[24]=p[24];
-   assign t[25]=p[25]^g[24];
-   assign t[26]=p[26];
-   assign t[27]=p[27]^g[26];
-   assign t[28]=p[28];
-   assign t[29]=p[29]^g[28];
-   assign t[30]=p[30];
-   assign t[31]=p[31]^g[30];
-   assign t[32]=p[32];
-   assign t[33]=p[33]^g[32];
-   assign t[34]=p[34];
-   assign t[35]=p[35]^g[34];
-   assign t[36]=p[36];
-   assign t[37]=p[37]^g[36];
-   assign t[38]=p[38];
-   assign t[39]=p[39]^g[38];
-   assign t[40]=p[40];
-   assign t[41]=p[41]^g[40];
-   assign t[42]=p[42];
-   assign t[43]=p[43]^g[42];
-   assign t[44]=p[44];
-   assign t[45]=p[45]^g[44];
-   assign t[46]=p[46];
-   assign t[47]=p[47]^g[46];
-   assign t[48]=p[48];
-   assign t[49]=p[49]^g[48];
-   assign t[50]=p[50];
-   assign t[51]=p[51]^g[50];
-   assign t[52]=p[52];
-   assign t[53]=p[53]^g[52];
-   assign t[54]=p[54];
-   assign t[55]=p[55]^g[54];
-   assign t[56]=p[56];
-   assign t[57]=p[57]^g[56];
-   assign t[58]=p[58];
-   assign t[59]=p[59]^g[58];
-   assign t[60]=p[60];
-   assign t[61]=p[61]^g[60];
-   assign t[62]=p[62];
-   assign t[63]=p[63]^g[62];
-   assign t[64]=p[64];
-   assign t[65]=p[65]^g[64];
-   assign t[66]=p[66];
-   assign t[67]=p[67]^g[66];
-   assign t[68]=p[68];
-   assign t[69]=p[69]^g[68];
-   assign t[70]=p[70];
-   assign t[71]=p[71]^g[70];
-   assign t[72]=p[72];
-   assign t[73]=p[73]^g[72];
-   assign t[74]=p[74];
-   assign t[75]=p[75]^g[74];
-   assign t[76]=p[76];
-   assign t[77]=p[77]^g[76];
-   assign t[78]=p[78];
-   assign t[79]=p[79]^g[78];
-   assign t[80]=p[80];
-   assign t[81]=p[81]^g[80];
-   assign t[82]=p[82];
-   assign t[83]=p[83]^g[82];
-   assign t[84]=p[84];
-   assign t[85]=p[85]^g[84];
-   assign t[86]=p[86];
-   assign t[87]=p[87]^g[86];
-   assign t[88]=p[88];
-   assign t[89]=p[89]^g[88];
-   assign t[90]=p[90];
-   assign t[91]=p[91]^g[90];
-   assign t[92]=p[92];
-   assign t[93]=p[93]^g[92];
-   assign t[94]=p[94];
-   assign t[95]=p[95]^g[94];
-   assign t[96]=p[96];
-   assign t[97]=p[97]^g[96];
-   assign t[98]=p[98];
-   assign t[99]=p[99]^g[98];
-   assign t[100]=p[100];
-   assign t[101]=p[101]^g[100];
-   assign t[102]=p[102];
-   assign t[103]=p[103]^g[102];
-   assign t[104]=p[104];
-   assign t[105]=p[105]^g[104];
-   assign t[106]=p[106];
-   assign t[107]=p[107]^g[106];
-   assign t[108]=p[108];
-   assign t[109]=p[109]^g[108];
-   assign t[110]=p[110];
-   assign t[111]=p[111]^g[110];
-   assign t[112]=p[112];
-   assign t[113]=p[113]^g[112];
-   assign t[114]=p[114];
-   assign t[115]=p[115]^g[114];
-   assign t[116]=p[116];
-   assign t[117]=p[117]^g[116];
-   assign t[118]=p[118];
-   assign t[119]=p[119]^g[118];
-   assign t[120]=p[120];
-   assign t[121]=p[121]^g[120];
-   assign t[122]=p[122];
-   assign t[123]=p[123]^g[122];
-   assign t[124]=p[124];
-   assign t[125]=p[125]^g[124];
-   assign t[126]=p[126];
-   assign t[127]=p[127]^g[126];
-   assign t[128]=p[128];
-
-   // prefix tree
-   brent_kung_cs128 prefix_tree(c, p[127:0], g[127:0]);
-
-   // post-computation
-   assign sum=p[128:1]^c;
-   assign cout=g[128]|(p[128]&c[127]);
-
-endmodule
-
-module brent_kung_cs128 (c, p, g);
-   
-   input [127:0] p;
-   input [127:0] g;
-   output [128:1] c;
-
-
-   // parallel-prefix, Brent-Kung
-
-   // Stage 1: Generates G/P pairs that span 1 bits
-   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-   black b_15_14 (G_15_14, P_15_14, {g[15],g[14]}, {p[15],p[14]});
-
-   black b_17_16 (G_17_16, P_17_16, {g[17],g[16]}, {p[17],p[16]});
-   black b_19_18 (G_19_18, P_19_18, {g[19],g[18]}, {p[19],p[18]});
-   black b_21_20 (G_21_20, P_21_20, {g[21],g[20]}, {p[21],p[20]});
-   black b_23_22 (G_23_22, P_23_22, {g[23],g[22]}, {p[23],p[22]});
-   black b_25_24 (G_25_24, P_25_24, {g[25],g[24]}, {p[25],p[24]});
-   black b_27_26 (G_27_26, P_27_26, {g[27],g[26]}, {p[27],p[26]});
-   black b_29_28 (G_29_28, P_29_28, {g[29],g[28]}, {p[29],p[28]});
-   black b_31_30 (G_31_30, P_31_30, {g[31],g[30]}, {p[31],p[30]});
-
-   black b_33_32 (G_33_32, P_33_32, {g[33],g[32]}, {p[33],p[32]});
-   black b_35_34 (G_35_34, P_35_34, {g[35],g[34]}, {p[35],p[34]});
-   black b_37_36 (G_37_36, P_37_36, {g[37],g[36]}, {p[37],p[36]});
-   black b_39_38 (G_39_38, P_39_38, {g[39],g[38]}, {p[39],p[38]});
-   black b_41_40 (G_41_40, P_41_40, {g[41],g[40]}, {p[41],p[40]});
-   black b_43_42 (G_43_42, P_43_42, {g[43],g[42]}, {p[43],p[42]});
-   black b_45_44 (G_45_44, P_45_44, {g[45],g[44]}, {p[45],p[44]});
-   black b_47_46 (G_47_46, P_47_46, {g[47],g[46]}, {p[47],p[46]});
-
-   black b_49_48 (G_49_48, P_49_48, {g[49],g[48]}, {p[49],p[48]});
-   black b_51_50 (G_51_50, P_51_50, {g[51],g[50]}, {p[51],p[50]});
-   black b_53_52 (G_53_52, P_53_52, {g[53],g[52]}, {p[53],p[52]});
-   black b_55_54 (G_55_54, P_55_54, {g[55],g[54]}, {p[55],p[54]});
-   black b_57_56 (G_57_56, P_57_56, {g[57],g[56]}, {p[57],p[56]});
-   black b_59_58 (G_59_58, P_59_58, {g[59],g[58]}, {p[59],p[58]});
-   black b_61_60 (G_61_60, P_61_60, {g[61],g[60]}, {p[61],p[60]});
-   black b_63_62 (G_63_62, P_63_62, {g[63],g[62]}, {p[63],p[62]});
-
-   black b_65_64 (G_65_64, P_65_64, {g[65],g[64]}, {p[65],p[64]});
-   black b_67_66 (G_67_66, P_67_66, {g[67],g[66]}, {p[67],p[66]});
-   black b_69_68 (G_69_68, P_69_68, {g[69],g[68]}, {p[69],p[68]});
-   black b_71_70 (G_71_70, P_71_70, {g[71],g[70]}, {p[71],p[70]});
-   black b_73_72 (G_73_72, P_73_72, {g[73],g[72]}, {p[73],p[72]});
-   black b_75_74 (G_75_74, P_75_74, {g[75],g[74]}, {p[75],p[74]});
-   black b_77_76 (G_77_76, P_77_76, {g[77],g[76]}, {p[77],p[76]});
-   black b_79_78 (G_79_78, P_79_78, {g[79],g[78]}, {p[79],p[78]});
-
-   black b_81_80 (G_81_80, P_81_80, {g[81],g[80]}, {p[81],p[80]});
-   black b_83_82 (G_83_82, P_83_82, {g[83],g[82]}, {p[83],p[82]});
-   black b_85_84 (G_85_84, P_85_84, {g[85],g[84]}, {p[85],p[84]});
-   black b_87_86 (G_87_86, P_87_86, {g[87],g[86]}, {p[87],p[86]});
-   black b_89_88 (G_89_88, P_89_88, {g[89],g[88]}, {p[89],p[88]});
-   black b_91_90 (G_91_90, P_91_90, {g[91],g[90]}, {p[91],p[90]});
-   black b_93_92 (G_93_92, P_93_92, {g[93],g[92]}, {p[93],p[92]});
-   black b_95_94 (G_95_94, P_95_94, {g[95],g[94]}, {p[95],p[94]});
-
-   black b_97_96 (G_97_96, P_97_96, {g[97],g[96]}, {p[97],p[96]});
-   black b_99_98 (G_99_98, P_99_98, {g[99],g[98]}, {p[99],p[98]});
-   black b_101_100 (G_101_100, P_101_100, {g[101],g[100]}, {p[101],p[100]});
-   black b_103_102 (G_103_102, P_103_102, {g[103],g[102]}, {p[103],p[102]});
-   black b_105_104 (G_105_104, P_105_104, {g[105],g[104]}, {p[105],p[104]});
-   black b_107_106 (G_107_106, P_107_106, {g[107],g[106]}, {p[107],p[106]});
-   black b_109_108 (G_109_108, P_109_108, {g[109],g[108]}, {p[109],p[108]});
-   black b_111_110 (G_111_110, P_111_110, {g[111],g[110]}, {p[111],p[110]});
-
-   black b_113_112 (G_113_112, P_113_112, {g[113],g[112]}, {p[113],p[112]});
-   black b_115_114 (G_115_114, P_115_114, {g[115],g[114]}, {p[115],p[114]});
-   black b_117_116 (G_117_116, P_117_116, {g[117],g[116]}, {p[117],p[116]});
-   black b_119_118 (G_119_118, P_119_118, {g[119],g[118]}, {p[119],p[118]});
-   black b_121_120 (G_121_120, P_121_120, {g[121],g[120]}, {p[121],p[120]});
-   black b_123_122 (G_123_122, P_123_122, {g[123],g[122]}, {p[123],p[122]});
-   black b_125_124 (G_125_124, P_125_124, {g[125],g[124]}, {p[125],p[124]});
-   black b_127_126 (G_127_126, P_127_126, {g[127],g[126]}, {p[127],p[126]});
-
-
-   // Stage 2: Generates G/P pairs that span 2 bits
-   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-   black b_15_12 (G_15_12, P_15_12, {G_15_14,G_13_12}, {P_15_14,P_13_12});
-   black b_19_16 (G_19_16, P_19_16, {G_19_18,G_17_16}, {P_19_18,P_17_16});
-   black b_23_20 (G_23_20, P_23_20, {G_23_22,G_21_20}, {P_23_22,P_21_20});
-   black b_27_24 (G_27_24, P_27_24, {G_27_26,G_25_24}, {P_27_26,P_25_24});
-   black b_31_28 (G_31_28, P_31_28, {G_31_30,G_29_28}, {P_31_30,P_29_28});
-
-   black b_35_32 (G_35_32, P_35_32, {G_35_34,G_33_32}, {P_35_34,P_33_32});
-   black b_39_36 (G_39_36, P_39_36, {G_39_38,G_37_36}, {P_39_38,P_37_36});
-   black b_43_40 (G_43_40, P_43_40, {G_43_42,G_41_40}, {P_43_42,P_41_40});
-   black b_47_44 (G_47_44, P_47_44, {G_47_46,G_45_44}, {P_47_46,P_45_44});
-   black b_51_48 (G_51_48, P_51_48, {G_51_50,G_49_48}, {P_51_50,P_49_48});
-   black b_55_52 (G_55_52, P_55_52, {G_55_54,G_53_52}, {P_55_54,P_53_52});
-   black b_59_56 (G_59_56, P_59_56, {G_59_58,G_57_56}, {P_59_58,P_57_56});
-   black b_63_60 (G_63_60, P_63_60, {G_63_62,G_61_60}, {P_63_62,P_61_60});
-
-   black b_67_64 (G_67_64, P_67_64, {G_67_66,G_65_64}, {P_67_66,P_65_64});
-   black b_71_68 (G_71_68, P_71_68, {G_71_70,G_69_68}, {P_71_70,P_69_68});
-   black b_75_72 (G_75_72, P_75_72, {G_75_74,G_73_72}, {P_75_74,P_73_72});
-   black b_79_76 (G_79_76, P_79_76, {G_79_78,G_77_76}, {P_79_78,P_77_76});
-   black b_83_80 (G_83_80, P_83_80, {G_83_82,G_81_80}, {P_83_82,P_81_80});
-   black b_87_84 (G_87_84, P_87_84, {G_87_86,G_85_84}, {P_87_86,P_85_84});
-   black b_91_88 (G_91_88, P_91_88, {G_91_90,G_89_88}, {P_91_90,P_89_88});
-   black b_95_92 (G_95_92, P_95_92, {G_95_94,G_93_92}, {P_95_94,P_93_92});
-
-   black b_99_96 (G_99_96, P_99_96, {G_99_98,G_97_96}, {P_99_98,P_97_96});
-   black b_103_100 (G_103_100, P_103_100, {G_103_102,G_101_100}, {P_103_102,P_101_100});
-   black b_107_104 (G_107_104, P_107_104, {G_107_106,G_105_104}, {P_107_106,P_105_104});
-   black b_111_108 (G_111_108, P_111_108, {G_111_110,G_109_108}, {P_111_110,P_109_108});
-   black b_115_112 (G_115_112, P_115_112, {G_115_114,G_113_112}, {P_115_114,P_113_112});
-   black b_119_116 (G_119_116, P_119_116, {G_119_118,G_117_116}, {P_119_118,P_117_116});
-   black b_123_120 (G_123_120, P_123_120, {G_123_122,G_121_120}, {P_123_122,P_121_120});
-   black b_127_124 (G_127_124, P_127_124, {G_127_126,G_125_124}, {P_127_126,P_125_124});
-
-
-   // Stage 3: Generates G/P pairs that span 4 bits
-   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-   black b_15_8 (G_15_8, P_15_8, {G_15_12,G_11_8}, {P_15_12,P_11_8});
-   black b_23_16 (G_23_16, P_23_16, {G_23_20,G_19_16}, {P_23_20,P_19_16});
-   black b_31_24 (G_31_24, P_31_24, {G_31_28,G_27_24}, {P_31_28,P_27_24});
-   black b_39_32 (G_39_32, P_39_32, {G_39_36,G_35_32}, {P_39_36,P_35_32});
-   black b_47_40 (G_47_40, P_47_40, {G_47_44,G_43_40}, {P_47_44,P_43_40});
-   black b_55_48 (G_55_48, P_55_48, {G_55_52,G_51_48}, {P_55_52,P_51_48});
-   black b_63_56 (G_63_56, P_63_56, {G_63_60,G_59_56}, {P_63_60,P_59_56});
-
-   black b_71_64 (G_71_64, P_71_64, {G_71_68,G_67_64}, {P_71_68,P_67_64});
-   black b_79_72 (G_79_72, P_79_72, {G_79_76,G_75_72}, {P_79_76,P_75_72});
-   black b_87_80 (G_87_80, P_87_80, {G_87_84,G_83_80}, {P_87_84,P_83_80});
-   black b_95_88 (G_95_88, P_95_88, {G_95_92,G_91_88}, {P_95_92,P_91_88});
-   black b_103_96 (G_103_96, P_103_96, {G_103_100,G_99_96}, {P_103_100,P_99_96});
-   black b_111_104 (G_111_104, P_111_104, {G_111_108,G_107_104}, {P_111_108,P_107_104});
-   black b_119_112 (G_119_112, P_119_112, {G_119_116,G_115_112}, {P_119_116,P_115_112});
-   black b_127_120 (G_127_120, P_127_120, {G_127_124,G_123_120}, {P_127_124,P_123_120});
-
-
-   // Stage 4: Generates G/P pairs that span 8 bits
-   grey g_15_0 (G_15_0, {G_15_8,G_7_0}, P_15_8);
-   black b_31_16 (G_31_16, P_31_16, {G_31_24,G_23_16}, {P_31_24,P_23_16});
-   black b_47_32 (G_47_32, P_47_32, {G_47_40,G_39_32}, {P_47_40,P_39_32});
-   black b_63_48 (G_63_48, P_63_48, {G_63_56,G_55_48}, {P_63_56,P_55_48});
-   black b_79_64 (G_79_64, P_79_64, {G_79_72,G_71_64}, {P_79_72,P_71_64});
-   black b_95_80 (G_95_80, P_95_80, {G_95_88,G_87_80}, {P_95_88,P_87_80});
-   black b_111_96 (G_111_96, P_111_96, {G_111_104,G_103_96}, {P_111_104,P_103_96});
-   black b_127_112 (G_127_112, P_127_112, {G_127_120,G_119_112}, {P_127_120,P_119_112});
-
-
-   // Stage 5: Generates G/P pairs that span 16 bits
-   grey g_31_0 (G_31_0, {G_31_16,G_15_0}, P_31_16);
-   black b_63_32 (G_63_32, P_63_32, {G_63_48,G_47_32}, {P_63_48,P_47_32});
-   black b_95_64 (G_95_64, P_95_64, {G_95_80,G_79_64}, {P_95_80,P_79_64});
-   black b_127_96 (G_127_96, P_127_96, {G_127_112,G_111_96}, {P_127_112,P_111_96});
-
-   // Stage 6: Generates G/P pairs that span 32 bits
-   grey g_63_0 (G_63_0, {G_63_32,G_31_0}, P_63_32);
-   black b_127_64 (G_127_64, P_127_64, {G_127_96,G_95_64}, {P_127_96,P_95_64});
-
-   // Stage 7: Generates G/P pairs that span 64 bits
-   grey g_127_0 (G_127_0, {G_127_64,G_63_0}, P_127_64);
-
-   // Stage 8: Generates G/P pairs that span 32 bits
-   grey g_95_0 (G_95_0, {G_95_64,G_63_0}, P_95_64);
-
-   // Stage 9: Generates G/P pairs that span 16 bits
-   grey g_47_0 (G_47_0, {G_47_32,G_31_0}, P_47_32);
-   grey g_79_0 (G_79_0, {G_79_64,G_63_0}, P_79_64);
-   grey g_111_0 (G_111_0, {G_111_96,G_95_0}, P_111_96);
-
-   // Stage 10: Generates G/P pairs that span 8 bits
-   grey g_23_0 (G_23_0, {G_23_16,G_15_0}, P_23_16);
-   grey g_39_0 (G_39_0, {G_39_32,G_31_0}, P_39_32);
-   grey g_55_0 (G_55_0, {G_55_48,G_47_0}, P_55_48);
-   grey g_71_0 (G_71_0, {G_71_64,G_63_0}, P_71_64);
-   grey g_87_0 (G_87_0, {G_87_80,G_79_0}, P_87_80);
-   grey g_103_0 (G_103_0, {G_103_96,G_95_0}, P_103_96);
-   grey g_119_0 (G_119_0, {G_119_112,G_111_0}, P_119_112);
-
-   // Stage 11: Generates G/P pairs that span 4 bits
-   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-   grey g_19_0 (G_19_0, {G_19_16,G_15_0}, P_19_16);
-   grey g_27_0 (G_27_0, {G_27_24,G_23_0}, P_27_24);
-   grey g_35_0 (G_35_0, {G_35_32,G_31_0}, P_35_32);
-   grey g_43_0 (G_43_0, {G_43_40,G_39_0}, P_43_40);
-   grey g_51_0 (G_51_0, {G_51_48,G_47_0}, P_51_48);
-   grey g_59_0 (G_59_0, {G_59_56,G_55_0}, P_59_56);
-   grey g_67_0 (G_67_0, {G_67_64,G_63_0}, P_67_64);
-   grey g_75_0 (G_75_0, {G_75_72,G_71_0}, P_75_72);
-   grey g_83_0 (G_83_0, {G_83_80,G_79_0}, P_83_80);
-   grey g_91_0 (G_91_0, {G_91_88,G_87_0}, P_91_88);
-   grey g_99_0 (G_99_0, {G_99_96,G_95_0}, P_99_96);
-   grey g_107_0 (G_107_0, {G_107_104,G_103_0}, P_107_104);
-   grey g_115_0 (G_115_0, {G_115_112,G_111_0}, P_115_112);
-   grey g_123_0 (G_123_0, {G_123_120,G_119_0}, P_123_120);
-
-   // Stage 12: Generates G/P pairs that span 2 bits
-   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-   grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
-   grey g_17_0 (G_17_0, {G_17_16,G_15_0}, P_17_16);
-   grey g_21_0 (G_21_0, {G_21_20,G_19_0}, P_21_20);
-   grey g_25_0 (G_25_0, {G_25_24,G_23_0}, P_25_24);
-   grey g_29_0 (G_29_0, {G_29_28,G_27_0}, P_29_28);
-   grey g_33_0 (G_33_0, {G_33_32,G_31_0}, P_33_32);
-   grey g_37_0 (G_37_0, {G_37_36,G_35_0}, P_37_36);
-   grey g_41_0 (G_41_0, {G_41_40,G_39_0}, P_41_40);
-   grey g_45_0 (G_45_0, {G_45_44,G_43_0}, P_45_44);
-   grey g_49_0 (G_49_0, {G_49_48,G_47_0}, P_49_48);
-   grey g_53_0 (G_53_0, {G_53_52,G_51_0}, P_53_52);
-   grey g_57_0 (G_57_0, {G_57_56,G_55_0}, P_57_56);
-   grey g_61_0 (G_61_0, {G_61_60,G_59_0}, P_61_60);
-   grey g_65_0 (G_65_0, {G_65_64,G_63_0}, P_65_64);
-   grey g_69_0 (G_69_0, {G_69_68,G_67_0}, P_69_68);
-   grey g_73_0 (G_73_0, {G_73_72,G_71_0}, P_73_72);
-   grey g_77_0 (G_77_0, {G_77_76,G_75_0}, P_77_76);
-   grey g_81_0 (G_81_0, {G_81_80,G_79_0}, P_81_80);
-   grey g_85_0 (G_85_0, {G_85_84,G_83_0}, P_85_84);
-   grey g_89_0 (G_89_0, {G_89_88,G_87_0}, P_89_88);
-   grey g_93_0 (G_93_0, {G_93_92,G_91_0}, P_93_92);
-   grey g_97_0 (G_97_0, {G_97_96,G_95_0}, P_97_96);
-   grey g_101_0 (G_101_0, {G_101_100,G_99_0}, P_101_100);
-   grey g_105_0 (G_105_0, {G_105_104,G_103_0}, P_105_104);
-   grey g_109_0 (G_109_0, {G_109_108,G_107_0}, P_109_108);
-   grey g_113_0 (G_113_0, {G_113_112,G_111_0}, P_113_112);
-   grey g_117_0 (G_117_0, {G_117_116,G_115_0}, P_117_116);
-   grey g_121_0 (G_121_0, {G_121_120,G_119_0}, P_121_120);
-   grey g_125_0 (G_125_0, {G_125_124,G_123_0}, P_125_124);
-
-   // Last grey cell stage 
-   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
-   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
-   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
-   grey g_14_0 (G_14_0, {g[14],G_13_0}, p[14]);
-   grey g_16_0 (G_16_0, {g[16],G_15_0}, p[16]);
-   grey g_18_0 (G_18_0, {g[18],G_17_0}, p[18]);
-   grey g_20_0 (G_20_0, {g[20],G_19_0}, p[20]);
-   grey g_22_0 (G_22_0, {g[22],G_21_0}, p[22]);
-   grey g_24_0 (G_24_0, {g[24],G_23_0}, p[24]);
-   grey g_26_0 (G_26_0, {g[26],G_25_0}, p[26]);
-   grey g_28_0 (G_28_0, {g[28],G_27_0}, p[28]);
-   grey g_30_0 (G_30_0, {g[30],G_29_0}, p[30]);
-   grey g_32_0 (G_32_0, {g[32],G_31_0}, p[32]);
-   grey g_34_0 (G_34_0, {g[34],G_33_0}, p[34]);
-   grey g_36_0 (G_36_0, {g[36],G_35_0}, p[36]);
-   grey g_38_0 (G_38_0, {g[38],G_37_0}, p[38]);
-   grey g_40_0 (G_40_0, {g[40],G_39_0}, p[40]);
-   grey g_42_0 (G_42_0, {g[42],G_41_0}, p[42]);
-   grey g_44_0 (G_44_0, {g[44],G_43_0}, p[44]);
-   grey g_46_0 (G_46_0, {g[46],G_45_0}, p[46]);
-   grey g_48_0 (G_48_0, {g[48],G_47_0}, p[48]);
-   grey g_50_0 (G_50_0, {g[50],G_49_0}, p[50]);
-   grey g_52_0 (G_52_0, {g[52],G_51_0}, p[52]);
-   grey g_54_0 (G_54_0, {g[54],G_53_0}, p[54]);
-   grey g_56_0 (G_56_0, {g[56],G_55_0}, p[56]);
-   grey g_58_0 (G_58_0, {g[58],G_57_0}, p[58]);
-   grey g_60_0 (G_60_0, {g[60],G_59_0}, p[60]);
-   grey g_62_0 (G_62_0, {g[62],G_61_0}, p[62]);
-   grey g_64_0 (G_64_0, {g[64],G_63_0}, p[64]);
-   grey g_66_0 (G_66_0, {g[66],G_65_0}, p[66]);
-   grey g_68_0 (G_68_0, {g[68],G_67_0}, p[68]);
-   grey g_70_0 (G_70_0, {g[70],G_69_0}, p[70]);
-   grey g_72_0 (G_72_0, {g[72],G_71_0}, p[72]);
-   grey g_74_0 (G_74_0, {g[74],G_73_0}, p[74]);
-   grey g_76_0 (G_76_0, {g[76],G_75_0}, p[76]);
-   grey g_78_0 (G_78_0, {g[78],G_77_0}, p[78]);
-   grey g_80_0 (G_80_0, {g[80],G_79_0}, p[80]);
-   grey g_82_0 (G_82_0, {g[82],G_81_0}, p[82]);
-   grey g_84_0 (G_84_0, {g[84],G_83_0}, p[84]);
-   grey g_86_0 (G_86_0, {g[86],G_85_0}, p[86]);
-   grey g_88_0 (G_88_0, {g[88],G_87_0}, p[88]);
-   grey g_90_0 (G_90_0, {g[90],G_89_0}, p[90]);
-   grey g_92_0 (G_92_0, {g[92],G_91_0}, p[92]);
-   grey g_94_0 (G_94_0, {g[94],G_93_0}, p[94]);
-   grey g_96_0 (G_96_0, {g[96],G_95_0}, p[96]);
-   grey g_98_0 (G_98_0, {g[98],G_97_0}, p[98]);
-   grey g_100_0 (G_100_0, {g[100],G_99_0}, p[100]);
-   grey g_102_0 (G_102_0, {g[102],G_101_0}, p[102]);
-   grey g_104_0 (G_104_0, {g[104],G_103_0}, p[104]);
-   grey g_106_0 (G_106_0, {g[106],G_105_0}, p[106]);
-   grey g_108_0 (G_108_0, {g[108],G_107_0}, p[108]);
-   grey g_110_0 (G_110_0, {g[110],G_109_0}, p[110]);
-   grey g_112_0 (G_112_0, {g[112],G_111_0}, p[112]);
-   grey g_114_0 (G_114_0, {g[114],G_113_0}, p[114]);
-   grey g_116_0 (G_116_0, {g[116],G_115_0}, p[116]);
-   grey g_118_0 (G_118_0, {g[118],G_117_0}, p[118]);
-   grey g_120_0 (G_120_0, {g[120],G_119_0}, p[120]);
-   grey g_122_0 (G_122_0, {g[122],G_121_0}, p[122]);
-   grey g_124_0 (G_124_0, {g[124],G_123_0}, p[124]);
-   grey g_126_0 (G_126_0, {g[126],G_125_0}, p[126]);
-
-   // Final Stage: Apply c_k+1=G_k_0
-   assign c[1]=g[0];
-   assign c[2]=G_1_0;
-   assign c[3]=G_2_0;
-   assign c[4]=G_3_0;
-   assign c[5]=G_4_0;
-   assign c[6]=G_5_0;
-   assign c[7]=G_6_0;
-   assign c[8]=G_7_0;
-   assign c[9]=G_8_0;
-
-   assign c[10]=G_9_0;
-   assign c[11]=G_10_0;
-   assign c[12]=G_11_0;
-   assign c[13]=G_12_0;
-   assign c[14]=G_13_0;
-   assign c[15]=G_14_0;
-   assign c[16]=G_15_0;
-   assign c[17]=G_16_0;
-
-   assign c[18]=G_17_0;
-   assign c[19]=G_18_0;
-   assign c[20]=G_19_0;
-   assign c[21]=G_20_0;
-   assign c[22]=G_21_0;
-   assign c[23]=G_22_0;
-   assign c[24]=G_23_0;
-   assign c[25]=G_24_0;
-
-   assign c[26]=G_25_0;
-   assign c[27]=G_26_0;
-   assign c[28]=G_27_0;
-   assign c[29]=G_28_0;
-   assign c[30]=G_29_0;
-   assign c[31]=G_30_0;
-   assign c[32]=G_31_0;
-   assign c[33]=G_32_0;
-
-   assign c[34]=G_33_0;
-   assign c[35]=G_34_0;
-   assign c[36]=G_35_0;
-   assign c[37]=G_36_0;
-   assign c[38]=G_37_0;
-   assign c[39]=G_38_0;
-   assign c[40]=G_39_0;
-   assign c[41]=G_40_0;
-
-   assign c[42]=G_41_0;
-   assign c[43]=G_42_0;
-   assign c[44]=G_43_0;
-   assign c[45]=G_44_0;
-   assign c[46]=G_45_0;
-   assign c[47]=G_46_0;
-   assign c[48]=G_47_0;
-   assign c[49]=G_48_0;
-
-   assign c[50]=G_49_0;
-   assign c[51]=G_50_0;
-   assign c[52]=G_51_0;
-   assign c[53]=G_52_0;
-   assign c[54]=G_53_0;
-   assign c[55]=G_54_0;
-   assign c[56]=G_55_0;
-   assign c[57]=G_56_0;
-
-   assign c[58]=G_57_0;
-   assign c[59]=G_58_0;
-   assign c[60]=G_59_0;
-   assign c[61]=G_60_0;
-   assign c[62]=G_61_0;
-   assign c[63]=G_62_0;
-   assign c[64]=G_63_0;
-   assign c[65]=G_64_0;
-
-   assign c[66]=G_65_0;
-   assign c[67]=G_66_0;
-   assign c[68]=G_67_0;
-   assign c[69]=G_68_0;
-   assign c[70]=G_69_0;
-   assign c[71]=G_70_0;
-   assign c[72]=G_71_0;
-   assign c[73]=G_72_0;
-
-   assign c[74]=G_73_0;
-   assign c[75]=G_74_0;
-   assign c[76]=G_75_0;
-   assign c[77]=G_76_0;
-   assign c[78]=G_77_0;
-   assign c[79]=G_78_0;
-   assign c[80]=G_79_0;
-   assign c[81]=G_80_0;
-
-   assign c[82]=G_81_0;
-   assign c[83]=G_82_0;
-   assign c[84]=G_83_0;
-   assign c[85]=G_84_0;
-   assign c[86]=G_85_0;
-   assign c[87]=G_86_0;
-   assign c[88]=G_87_0;
-   assign c[89]=G_88_0;
-
-   assign c[90]=G_89_0;
-   assign c[91]=G_90_0;
-   assign c[92]=G_91_0;
-   assign c[93]=G_92_0;
-   assign c[94]=G_93_0;
-   assign c[95]=G_94_0;
-   assign c[96]=G_95_0;
-   assign c[97]=G_96_0;
-
-   assign c[98]=G_97_0;
-   assign c[99]=G_98_0;
-   assign c[100]=G_99_0;
-   assign c[101]=G_100_0;
-   assign c[102]=G_101_0;
-   assign c[103]=G_102_0;
-   assign c[104]=G_103_0;
-   assign c[105]=G_104_0;
-
-   assign c[106]=G_105_0;
-   assign c[107]=G_106_0;
-   assign c[108]=G_107_0;
-   assign c[109]=G_108_0;
-   assign c[110]=G_109_0;
-   assign c[111]=G_110_0;
-   assign c[112]=G_111_0;
-   assign c[113]=G_112_0;
-
-   assign c[114]=G_113_0;
-   assign c[115]=G_114_0;
-   assign c[116]=G_115_0;
-   assign c[117]=G_116_0;
-   assign c[118]=G_117_0;
-   assign c[119]=G_118_0;
-   assign c[120]=G_119_0;
-   assign c[121]=G_120_0;
-
-   assign c[122]=G_121_0;
-   assign c[123]=G_122_0;
-   assign c[124]=G_123_0;
-   assign c[125]=G_124_0;
-   assign c[126]=G_125_0;
-   assign c[127]=G_126_0;
-   assign c[128]=G_127_0;
-
-endmodule // brent_kung_cs
-
-
diff --git a/wally-pipelined/src/fpu/bk13.sv b/wally-pipelined/src/fpu/bk13.sv
deleted file mode 100755
index 84158db98..000000000
--- a/wally-pipelined/src/fpu/bk13.sv
+++ /dev/null
@@ -1,97 +0,0 @@
-// Brent-Kung Carry-save Prefix Adder
-
-module bk13 (cout, sum, a, b, cin);
-	 input [12:0] a, b;
-	 input cin;
-	 output [12:0] sum;
-	 output cout;
-
-	 wire [13:0] p,g,t;
-	 wire [12:0] c;
-
-// pre-computation
-	 assign p={a^b,1'b0};
-	 assign g={a&b, cin};
-	 assign t[1]=p[1];
-	 assign t[2]=p[2];
-	 assign t[3]=p[3]^g[2];
-	 assign t[4]=p[4];
-	 assign t[5]=p[5]^g[4];
-	 assign t[6]=p[6];
-	 assign t[7]=p[7]^g[6];
-	 assign t[8]=p[8];
-	 assign t[9]=p[9]^g[8];
-	 assign t[10]=p[10];
-	 assign t[11]=p[11]^g[10];
-	 assign t[12]=p[12];
-	 assign t[13]=p[13];
-
-// prefix tree
-	 brent_kung_cs13 prefix_tree(c, p[12:0], g[12:0]);
-
-// post-computation
-	 assign sum=p[13:1]^c;
-	 assign cout=g[13]|(p[13]&c[12]);
-
-endmodule
-
-module brent_kung_cs13 (c, p, g);
-	
-	input [13:0] p;
-	input [13:0] g;
-	output [13:1] c;
-
-
-	// parallel-prefix, Brent-Kung
-
-	// Stage 1: Generates G/P pairs that span 1 bits
-	grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-	black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-	black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-	black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-	black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-	black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-	black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-
-	// Stage 2: Generates G/P pairs that span 2 bits
-	grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-	black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-	black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-
-	// Stage 3: Generates G/P pairs that span 4 bits
-	grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-
-	// Stage 4: Generates G/P pairs that span 8 bits
-
-	// Stage 5: Generates G/P pairs that span 4 bits
-	grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-
-	// Stage 6: Generates G/P pairs that span 2 bits
-	grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-	grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-
-	// Last grey cell stage 
-	grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-	grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-	grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
-	grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-	grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
-	grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
-
-	// Final Stage: Apply c_k+1=G_k_0
-	assign c[1]=g[0];
-	assign c[2]=G_1_0;
-	assign c[3]=G_2_0;
-	assign c[4]=G_3_0;
-	assign c[5]=G_4_0;
-	assign c[6]=G_5_0;
-	assign c[7]=G_6_0;
-	assign c[8]=G_7_0;
-	assign c[9]=G_8_0;
-
-	assign c[10]=G_9_0;
-	assign c[11]=G_10_0;
-	assign c[12]=G_11_0;
-	assign c[13]=G_12_0;
-
-endmodule
diff --git a/wally-pipelined/src/fpu/bk14.sv b/wally-pipelined/src/fpu/bk14.sv
deleted file mode 100755
index 46872167e..000000000
--- a/wally-pipelined/src/fpu/bk14.sv
+++ /dev/null
@@ -1,86 +0,0 @@
-// Brent-Kung Prefix Adder
-
-module bk14 (cout, sum, a, b, cin);
-	 input [13:0] a, b;
-	 input cin;
-	 output [13:0] sum;
-	 output cout;
-
-	 wire [14:0] p,g;
-	 wire [13:0] c;
-
-// pre-computation
-	 assign p={a^b,1'b0};
-	 assign g={a&b, cin};
-
-// prefix tree
-	 brent_kung14 prefix_tree(c, p[13:0], g[13:0]);
-
-// post-computation
-	 assign sum=p[14:1]^c;
-	 assign cout=g[14]|(p[14]&c[13]);
-
-endmodule
-
-module brent_kung14 (c, p, g);
-	
-	input [13:0] p;
-	input [13:0] g;
-	output [14:1] c;
-
-
-	// parallel-prefix, Brent-Kung
-
-	// Stage 1: Generates G/P pairs that span 1 bits
-	grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-	black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-	black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-	black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-	black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-	black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-	black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-
-	// Stage 2: Generates G/P pairs that span 2 bits
-	grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-	black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-	black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-
-	// Stage 3: Generates G/P pairs that span 4 bits
-	grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-
-	// Stage 4: Generates G/P pairs that span 8 bits
-
-	// Stage 5: Generates G/P pairs that span 4 bits
-	grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-
-	// Stage 6: Generates G/P pairs that span 2 bits
-	grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-	grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-	grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
-
-	// Last grey cell stage 
-	grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-	grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-	grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
-	grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-	grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
-	grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
-
-	// Final Stage: Apply c_k+1=G_k_0
-	assign c[1]=g[0];
-	assign c[2]=G_1_0;
-	assign c[3]=G_2_0;
-	assign c[4]=G_3_0;
-	assign c[5]=G_4_0;
-	assign c[6]=G_5_0;
-	assign c[7]=G_6_0;
-	assign c[8]=G_7_0;
-	assign c[9]=G_8_0;
-
-	assign c[10]=G_9_0;
-	assign c[11]=G_10_0;
-	assign c[12]=G_11_0;
-	assign c[13]=G_12_0;
-	assign c[14]=G_13_0;
-
-endmodule
diff --git a/wally-pipelined/src/fpu/csa.sv b/wally-pipelined/src/fpu/csa.sv
deleted file mode 100644
index 1e5682cfc..000000000
--- a/wally-pipelined/src/fpu/csa.sv
+++ /dev/null
@@ -1,70 +0,0 @@
-module ha (C, S, A, B) ;
-   
-   input  A, B;
-   output S, C;
-
-   assign S = A^B;
-   assign C = A&B;
-
-endmodule // HA
-
-// module fa (input logic a, b, c, output logic sum, carry);
-   
-//    assign sum = a^b^c;
-//    assign carry = a&b|a&c|b&c;   
-   
-// endmodule // fa
-
-// module csa #(parameter WIDTH=8) (a, b,c, sum, carry, cout);
-
-//    input logic [WIDTH-1:0] a, b, c;
-   
-//    output logic [WIDTH-1:0] sum, carry;
-//    output logic 	    cout;   
-
-//    logic [WIDTH:0] 	    carry_temp;   
-//    genvar 		    i;
-//    generate
-//       for (i=0;i<WIDTH;i=i+1)
-// 	begin : genbit
-// 	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
-// 	end
-//    endgenerate
-//    assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};
-//    assign cout = carry_temp[WIDTH];   
-   
-// endmodule // csa
-
-module FA_array (S, C, A, B, Ci) ;
-   parameter n = 32;
-   input  [n-1:0] A;
-   input  [n-1:0] B;
-   input  [n-1:0] Ci;
-   output [n-1:0] S;
-   output [n-1:0] C;
-
-   wire   [n-1:0] n0;
-   wire   [n-1:0] n1;
-   wire   [n-1:0] n2;
-
-   genvar 	  i;
-   generate
-      for (i = 0; i < n; i = i + 1) begin : index
-	 fa FA1(.sum(S[i]), .carry(C[i]), .a(A[i]), .b(B[i]), .c(Ci[i]));
-      end
-   endgenerate
-   
-endmodule // FA_array
-
-module HA_array (S, C, A, B) ;
-   parameter n = 32;
-   input  [n-1:0] A, B;
-   output [n-1:0] S, C;
-   genvar 	  i;
-   generate
-      for (i = 0; i < n; i = i + 1) begin : index
-	 ha ha1(.S(S[i]), .C(C[i]), .A(A[i]), .B(B[i]));
-      end
-   endgenerate
-   
-endmodule // HA_array
\ No newline at end of file
diff --git a/wally-pipelined/src/fpu/divconv.sv b/wally-pipelined/src/fpu/divconv.sv
index 455a0aaf4..a583141b0 100644
--- a/wally-pipelined/src/fpu/divconv.sv
+++ b/wally-pipelined/src/fpu/divconv.sv
@@ -68,9 +68,9 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
    mux2 #(64) mx5 (muxb_out, mcand_q, sel_muxr&op_type, mplier);   
    mux2 #(64) mx6 (muxa_out, mcand_q, sel_muxr, mcand);
    // TDM multiplier (carry/save)
-   multiplier mult1 (mcand, mplier, Sum, Carry);
+   multiplier mult1 (mcand, mplier, Sum, Carry);   // ***multiply
    // Q*D - N (reversed but changed in rounder.v to account for sign reversal)
-   csa #(128) csa1 (Sum, Carry, constant, Sum2, Carry2);
+   csa #(128) csa1 (Sum, Carry, constant, Sum2, Carry2); //***adder
    // Add ulp for subtraction in remainder
    mux2 #(1) mx7 (1'b0, 1'b1, sel_muxr, muxr_out);
 
@@ -80,15 +80,15 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
    mux2 #(64) mxA ({64'hFFFF_FFFF_FFFF_F9FF}, {64'hFFFF_FF3F_FFFF_FFFF}, P, qm_const);
    
    // CPA (from CSA)/Remainder addition/subtraction
-   ldf128 cpa1 (cout1, mul_out, Sum2, Carry2, muxr_out);
+   ldf128 cpa1 (cout1, mul_out, Sum2, Carry2, muxr_out); //***adder
    // Assuming [1,2) - q1
-   ldf64 cpa2 (cout2, q_out1, regb_out, q_const, 1'b0);
-   ldf64 cpa3 (cout3, qp_out1, regb_out, qp_const, 1'b0);
-   ldf64 cpa4 (cout4, qm_out1, regb_out, qm_const, 1'b1);   
+   ldf64 cpa2 (cout2, q_out1, regb_out, q_const, 1'b0); //***adder
+   ldf64 cpa3 (cout3, qp_out1, regb_out, qp_const, 1'b0); //***adder
+   ldf64 cpa4 (cout4, qm_out1, regb_out, qm_const, 1'b1);    //***adder
    // Assuming [0.5,1) - q0
-   ldf64 cpa5 (cout5, q_out0, {regb_out[62:0], vss}, q_const, 1'b0);
-   ldf64 cpa6 (cout6, qp_out0, {regb_out[62:0], vss}, qp_const, 1'b0);
-   ldf64 cpa7 (cout7, qm_out0, {regb_out[62:0], vss}, qm_const, 1'b1);
+   ldf64 cpa5 (cout5, q_out0, {regb_out[62:0], vss}, q_const, 1'b0); //***adder
+   ldf64 cpa6 (cout6, qp_out0, {regb_out[62:0], vss}, qp_const, 1'b0); //***adder
+   ldf64 cpa7 (cout7, qm_out0, {regb_out[62:0], vss}, qm_const, 1'b1); //***adder
    // One's complement instead of two's complement (for hw efficiency)
    assign three = {~mul_out[126], mul_out[126], ~mul_out[125:63]};   
    mux2 #(64) mxTC (~mul_out[126:63], three[64:1],  op_type, twocmp_out);
diff --git a/wally-pipelined/src/fpu/fctrl.sv b/wally-pipelined/src/fpu/fctrl.sv
index 3be9b281a..45f2e7efd 100755
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@@ -64,30 +64,38 @@ module fctrl (
                                 else if (Funct3D[1:0] == 2'b00) ControlsD = `FCTRLW'b0_1_100_0100_00_01_0_0; // fmv.x.w
                                 else if (Funct3D[1:0] == 2'b01) ControlsD = `FCTRLW'b0_1_100_0101_00_01_0_0; // fmv.x.d
                                 else                            ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
-                    7'b1100000: case(Rs2D[0])
-                                  1'b0:    ControlsD = `FCTRLW'b0_1_010_0110_00_00_0_0; // fcvt.s.w
-                                  1'b1:    ControlsD = `FCTRLW'b0_1_010_0101_00_00_0_0; // fcvt.s.wu
+                    7'b1100000: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0001_00_00_0_0; // fcvt.s.w
+                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0101_00_00_0_0; // fcvt.s.wu
+                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1001_00_00_0_0; // fcvt.s.l
+                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1101_00_00_0_0; // fcvt.s.lu
                                   default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                 endcase
-                    7'b1101000: case(Rs2D[0])
-                                  1'b0:    ControlsD = `FCTRLW'b1_1_010_0100_00_00_0_0; // fcvt.w.s
-                                  1'b1:    ControlsD = `FCTRLW'b1_1_010_0101_00_00_0_0; // fcvt.wu.s
+                    7'b1101000: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b1_1_100_0010_00_00_0_0; // fcvt.w.s
+                                  2'b01:    ControlsD = `FCTRLW'b1_1_100_0110_00_00_0_0; // fcvt.wu.s
+                                  2'b10:    ControlsD = `FCTRLW'b1_1_100_1010_00_00_0_0; // fcvt.l.s
+                                  2'b11:    ControlsD = `FCTRLW'b1_1_100_1110_00_00_0_0; // fcvt.lu.s
                                   default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                 endcase
                     7'b1111000: ControlsD = `FCTRLW'b1_0_100_0000_00_00_0_0; // fmv.w.x
-                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_0010_00_00_0_0; // fcvt.s.d
-                    7'b1100001: case(Rs2D[0])
-                                  1'b0:    ControlsD = `FCTRLW'b0_1_010_1110_00_00_0_0; // fcvt.d.w
-                                  1'b1:    ControlsD = `FCTRLW'b0_1_010_1111_00_00_0_0; // fcvt.d.wu
+                    7'b0100000: ControlsD = `FCTRLW'b1_0_010_0000_00_00_0_0; // fcvt.s.d
+                    7'b1100001: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b0_1_100_0001_00_00_0_0; // fcvt.d.w
+                                  2'b01:    ControlsD = `FCTRLW'b0_1_100_0101_00_00_0_0; // fcvt.d.wu
+                                  2'b10:    ControlsD = `FCTRLW'b0_1_100_1001_00_00_0_0; // fcvt.d.l
+                                  2'b11:    ControlsD = `FCTRLW'b0_1_100_1101_00_00_0_0; // fcvt.d.lu
                                   default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                 endcase
-                    7'b1101001: case(Rs2D[0])
-                                  1'b0:    ControlsD = `FCTRLW'b1_0_010_1100_00_00_0_0; // fcvt.w.d
-                                  1'b1:    ControlsD = `FCTRLW'b1_0_010_1101_00_00_0_0; // fcvt.wu.d
+                    7'b1101001: case(Rs2D[1:0])
+                                  2'b00:    ControlsD = `FCTRLW'b1_0_100_0010_00_00_0_0; // fcvt.w.d
+                                  2'b01:    ControlsD = `FCTRLW'b1_0_100_0110_00_00_0_0; // fcvt.wu.d
+                                  2'b10:    ControlsD = `FCTRLW'b1_0_100_1010_00_00_0_0; // fcvt.l.d
+                                  2'b11:    ControlsD = `FCTRLW'b1_0_100_1110_00_00_0_0; // fcvt.lu.d
                                   default: ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                                 endcase
                     7'b1111001: ControlsD = `FCTRLW'b1_0_100_0001_00_00_0_0; // fmv.d.x
-                    7'b0100001: ControlsD = `FCTRLW'b1_0_010_1000_00_00_0_0; // fcvt.d.s
+                    7'b0100001: ControlsD = `FCTRLW'b1_0_100_0000_00_00_0_0; // fcvt.d.s
                     default:    ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
                   endcase
       default:      ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
@@ -130,17 +138,26 @@ module fctrl (
   // add/sub/cnvt
       //  fadd      = 0000
       //  fsub      = 0001
-      //  fcvt.w.s  = 0100
-      //  fcvt.wu.s = 0101
-      //  fcvt.s.w  = 0110
-      //  fcvt.s.wu = 0111
-      //  fcvt.s.d  = 0010
-      //  fcvt.w.d  = 1100
-      //  fcvt.wu.d = 1101
-      //  fcvt.d.w  = 1110
-      //  fcvt.d.wu = 1111
-      //  fcvt.d.s  = 1000
-      //		   { is double and not add/sub, is to/from int, is to int or float to double,      is unsigned or sub}
+  // cnvt
+      //  fcvt.w.s  = 0010
+      //  fcvt.wu.s = 0110
+      //  fcvt.s.w  = 0001
+      //  fcvt.s.wu = 0101
+      //  fcvt.s.d  = 0000
+      //  fcvt.l.s  = 1010
+      //  fcvt.lu.s = 1110
+      //  fcvt.s.l  = 1001
+      //  fcvt.s.lu = 1101
+      //  fcvt.w.d  = 0010
+      //  fcvt.wu.d = 0110
+      //  fcvt.d.w  = 0001
+      //  fcvt.d.wu = 0101
+      //  fcvt.d.s  = 0000
+      //  fcvt.l.d  = 1010
+      //  fcvt.lu.d = 1110
+      //  fcvt.d.l  = 1001
+      //  fcvt.d.lu = 1101
+      //  bit order: {long, unsigned, to int, from int}; Fmt controls the output for fp -> fp
 
       //  fmv.w.x = ???0
       //  fmv.w.d = ???1
diff --git a/wally-pipelined/src/fpu/fma2.sv b/wally-pipelined/src/fpu/fma2.sv
index 131f98394..518b7a76c 100644
--- a/wally-pipelined/src/fpu/fma2.sv
+++ b/wally-pipelined/src/fpu/fma2.sv
@@ -16,8 +16,8 @@ module fma2(
     input logic                 XZeroM, YZeroM, ZZeroM, // inputs are zero
     input logic                 XInfM, YInfM, ZInfM,    // inputs are infinity
     input logic                 XNaNM, YNaNM, ZNaNM,    // inputs are NaN
-    output logic    [63:0]      FmaResultM,     // FMA final result
-    output logic    [4:0]       FmaFlagsM);     // FMA flags {invalid, divide by zero, overflow, underflow, inexact}
+    output logic    [63:0]      FMAResM,     // FMA final result
+    output logic    [4:0]       FMAFlgM);     // FMA flags {invalid, divide by zero, overflow, underflow, inexact}
    
 
 
@@ -57,7 +57,7 @@ module fma2(
     logic [12:0]    MaxExp;     // maximum value of the exponent
     logic [12:0]    FracLen;    // length of the fraction
     logic           SigNaN;     // is an input a signaling NaN
-    logic           UnderflowFlag;  // Underflow singal used in FmaFlagsM (used to avoid a circular depencency)
+    logic           UnderflowFlag;  // Underflow signal used in FMAFlgM (used to avoid a circular dependency)
     logic [63:0] XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results
 
    
@@ -316,7 +316,7 @@ module fma2(
     // Combine flags
     //      - FMA can't set the Divide by zero flag
     //      - Don't set the underflow flag if the result was rounded up to a normal number
-    assign FmaFlagsM = {Invalid, 1'b0, Overflow, UnderflowFlag, Inexact};
+    assign FMAFlgM = {Invalid, 1'b0, Overflow, UnderflowFlag, Inexact};
 
 
 
@@ -337,7 +337,7 @@ module fma2(
     assign InvalidResult = FmtM ? {ResultSgn, 11'h7ff, 1'b1, 51'b0} : {ResultSgn, 8'hff, 1'b1, 54'b0};
     assign KillProdResult = FmtM ?{ResultSgn, Addend[62:0] - {62'b0, (Minus1&AddendStickyM)}} + {62'b0, (Plus1&AddendStickyM)} : {ResultSgn, Addend[62:32] - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}, 32'b0};
     assign UnderflowResult = FmtM ? {ResultSgn, 63'b0} + {63'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))} : {{ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}, 32'b0};
-    assign FmaResultM = XNaNM ? XNaNResult :
+    assign FMAResM = XNaNM ? XNaNResult :
                         YNaNM ? YNaNResult :
                         ZNaNM ? ZNaNResult :
                         Invalid ? InvalidResult : // has to be before inf
diff --git a/wally-pipelined/src/fpu/fpadd_denorm.sv b/wally-pipelined/src/fpu/fpadd_denorm.sv
index eabfcd3a1..43de30879 100755
--- a/wally-pipelined/src/fpu/fpadd_denorm.sv
+++ b/wally-pipelined/src/fpu/fpadd_denorm.sv
@@ -229,11 +229,11 @@ module fpadd (AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);
    assign corr_sign = ~op_type[2]&~op_type[1]&op_type[0]&swap;
    
    // 64-bit Mantissa Adder/Subtractor
-   cla64 add1 (sum, mantissaA3, mantissaB3, sub);
+   cla64 add1 (sum, mantissaA3, mantissaB3, sub); //***adder
 
    // 64-bit Mantissa Subtractor - to get the two's complement of the 
    // result when the sign from the adder/subtractor is negative. 
-   cla_sub64 sub1 (sum_tc, mantissaB3, mantissaA3);
+   cla_sub64 sub1 (sum_tc, mantissaB3, mantissaA3); //***adder
 
    // Determine the correct sign of the result
    assign sign_corr = ((corr_sign ^ signA) & ~convert) ^ sum[63];   
diff --git a/wally-pipelined/src/fpu/fpdiv.sv b/wally-pipelined/src/fpu/fpdiv.sv
deleted file mode 100755
index 8c305f3ea..000000000
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ /dev/null
@@ -1,256 +0,0 @@
-//
-// File name : fpdiv
-// Title     : Floating-Point Divider/Square-Root
-// project   : FPU
-// Library   : fpdiv
-// Author(s) : James E. Stine, Jr.
-// Purpose   : definition of main unit to floating-point div/sqrt
-// notes :   
-//
-// Copyright Oklahoma State University
-//
-// Basic Operations
-//
-// Step 1: Load operands, set flags, and convert SP to DP
-// Step 2: Check for special inputs ( +/- Infinity,  NaN)
-// Step 3: Exponent Logic
-// Step 4: Divide/Sqrt using Goldschmidt
-// Step 5: Normalize the result.//
-//   Shift left until normalized.  Normalized when the value to the 
-//   left of the binrary point is 1.
-// Step 6: Round the result.// 
-// Step 7: Put quotient/remainder onto output.
-//
-
-// `timescale 1ps/1ps
-module fpdiv (FDivSqrtDoneE, FDivResultM, FDivFlagsM, DivDenormM, DivInput1E, DivInput2E, FrmE, DivOpType, FmtE, DivOvEn, DivUnEn,
-	      FDivStartE, reset, clk, FDivBusyE, HoldInputs);
-
-   input [63:0] DivInput1E;		// 1st input operand (A)
-   input [63:0] DivInput2E;		// 2nd input operand (B)
-   input [2:0] 	FrmE;		// Rounding mode - specify values 
-   input 	DivOpType;	// Function opcode
-   input 	FmtE;   		// Result Precision (0 for double, 1 for single) //***will need to swap this
-   input 	DivOvEn;		// Overflow trap enabled
-   input 	DivUnEn;   	// Underflow trap enabled
-
-   input 	FDivStartE;
-   input 	reset;
-   input 	clk;   
-
-   output [63:0] FDivResultM;	// Result of operation
-   output [4:0]  FDivFlagsM;   	// IEEE exception flags 
-   output 	 DivDenormM;   	// DivDenormM on input or output
-   output 	 FDivSqrtDoneE;
-   output    FDivBusyE, HoldInputs;
-
-   supply1 	  vdd;
-   supply0 	  vss;   
-
-   wire [63:0] 	 Float1; 
-   wire [63:0] 	 Float2;
-   wire [63:0] 	 IntValue;
-   
-   wire [12:0] 	 exp1, exp2, expF;
-   wire [12:0] 	 exp_diff, bias;
-   wire [13:0] 	 exp_sqrt;
-   wire [12:0] 	 exp_s;
-   wire [12:0] 	 exp_c;
-   
-   wire [10:0] 	 exponent, exp_pre;
-   wire [63:0] 	 Result;   
-   wire [52:0] 	 mantissaA;
-   wire [52:0] 	 mantissaB; 
-   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
-   
-   wire [5:0] 	 align_shift;
-   wire [5:0] 	 norm_shift;
-   wire [2:0] 	 sel_inv;
-   wire		 op1_Norm, op2_Norm;
-   wire		 opA_Norm, opB_Norm;
-   wire		 Invalid;
-   wire 	 DenormIn, DenormIO;
-   wire [4:0] 	 FlagsIn;   	
-   wire 	 exp_gt63;
-   wire 	 Sticky_out;
-   wire 	 signResult, sign_corr;
-   wire          corr_sign;
-   wire 	 zeroB;         
-   wire 	 convert;
-   wire          swap;
-   wire          sub;
-   
-   wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
-   wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
-   wire [127:0]  regr_out;
-   wire [2:0] 	 sel_muxa, sel_muxb;
-   wire 	 sel_muxr;   
-   wire 	 load_rega, load_regb, load_regc, load_regd, load_regr, load_regs;
-
-   wire 	 donev, sel_muxrv, sel_muxsv;
-   wire [1:0] 	 sel_muxav, sel_muxbv;   
-   wire 	 load_regav, load_regbv, load_regcv;
-   wire 	 load_regrv, load_regsv;
-   
-   logic exp_cout1, exp_cout2, exp_odd, open;
-
-
-   // Convert the input operands to their appropriate forms based on 
-   // the orignal operands, the DivOpType , and their precision FmtE. 
-   // Single precision inputs are converted to double precision 
-   // and the sign of the first operand is set appropratiately based on
-   // if the operation is absolute value or negation. 
-   convert_inputs_div divconv1 (Float1, Float2, DivInput1E, DivInput2E, DivOpType, FmtE);
-
-   // Test for exceptions and return the "Invalid Operation" and
-   // "Denormalized" Input FDivFlagsM. The "sel_inv" is used in
-   // the third pipeline stage to select the result. Also, op1_Norm
-   // and op2_Norm are one if DivInput1E and DivInput2E are not zero or denormalized.
-   // sub is one if the effective operation is subtaction. 
-   exception_div divexc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
-		   Float1, Float2, DivOpType);
-
-   // Determine Sign/Mantissa
-   assign signResult = ((Float1[63]^Float2[63])&~DivOpType) | Float1[63]&DivOpType;
-   assign mantissaA = {vdd, Float1[51:0]};
-   assign mantissaB = {vdd, Float2[51:0]};
-   // Perform Exponent Subtraction - expA - expB + Bias   
-   assign exp1 = {2'b0, Float1[62:52]};
-   assign exp2 = {2'b0, Float2[62:52]};
-   // bias : DP = 2^{11-1}-1 = 1023
-   assign bias = {3'h0, 10'h3FF};
-   // Divide exponent
-   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c);
-   exp_add explogic1 (exp_cout1, {open, exp_diff}, 
-		      {vss, exp_s}, {vss, exp_c}, 1'b1);
-   // Sqrt exponent (check if exponent is odd)
-   assign exp_odd = Float1[52] ? vss : vdd;
-   exp_add explogic2 (exp_cout2, exp_sqrt, 
-		      {vss, exp1}, {4'h0, 10'h3ff}, exp_odd);
-   // Choose correct exponent
-   assign expF = DivOpType ? exp_sqrt[13:1] : exp_diff;   
-
-   // Main Goldschmidt/Division Routine
-   divconv goldy (q1, qm1, qp1, q0, qm0, qp0, 
-		  rega_out, regb_out, regc_out, regd_out,
-		  regr_out, mantissaB, mantissaA, 
-		  sel_muxa, sel_muxb, sel_muxr, 
-		  reset, clk,
-		  load_rega, load_regb, load_regc, load_regd,
-		  load_regr, load_regs, FmtE, DivOpType, exp_odd);
-
-   // FSM : control divider
-   fsm control (FDivSqrtDoneE, load_rega, load_regb, load_regc, load_regd, 
-		load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
-		clk, reset, FDivStartE, DivOpType, FDivBusyE, HoldInputs);
-   
-   // Round the mantissa to a 52-bit value, with the leading one
-   // removed. The rounding units also handles special cases and 
-   // set the exception flags.
-   //***add max magnitude and swap negitive and positive infinity
-   rounder_div divround1 (Result, DenormIO, FlagsIn, 
-		   FrmE, FmtE, DivOvEn, DivUnEn, expF, 
-   		   sel_inv, Invalid, DenormIn, signResult, 
-		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
-
-   // Store the final result and the exception flags in registers.
-   flopenr #(64) rega (clk, reset, FDivSqrtDoneE, Result, FDivResultM);
-   flopenr #(1) regb (clk, reset, FDivSqrtDoneE, DenormIO, DivDenormM);   
-   flopenr #(5) regc (clk, reset, FDivSqrtDoneE, FlagsIn, FDivFlagsM);   
-   
-endmodule // fpadd
-
-//
-// Brent-Kung Prefix Adder 
-//   (yes, it is 14 bits as my generator is broken for 13 bits :( 
-//    assume, synthesizer will delete stuff not needed )
-//
-module exp_add (cout, sum, a, b, cin);
-   
-   input [13:0] a, b;
-   input 	cin;
-   
-   output [13:0] sum;
-   output 	 cout;
-
-   wire [14:0] 	 p,g;
-   wire [13:0] 	 c;
-
-   // pre-computation
-   assign p={a^b,1'b0};
-   assign g={a&b, cin};
-
-   // prefix tree
-   brent_kung prefix_tree(c, p[13:0], g[13:0]);
-
-   // post-computation
-   assign sum=p[14:1]^c;
-   assign cout=g[14]|(p[14]&c[13]);
-
-endmodule // exp_add
-
-module brent_kung (c, p, g);
-   
-   input [13:0] p;
-   input [13:0] g;
-   output [14:1] c;
-
-   logic G_1_0, G_3_2,G_5_4,G_7_6,G_9_8,G_11_10,G_13_12,G_3_0,G_7_4,G_11_8;
-   logic P_3_2,P_5_4,P_7_6,P_9_8,P_11_10,P_13_12,P_7_4,P_11_8;
-   logic G_7_0,G_11_0,G_5_0,G_9_0,G_13_0,G_2_0,G_4_0,G_6_0,G_8_0,G_10_0,G_12_0;
-   // parallel-prefix, Brent-Kung
-
-   // Stage 1: Generates G/FmtE pairs that span 1 bits
-   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-
-   // Stage 2: Generates G/FmtE pairs that span 2 bits
-   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-
-   // Stage 3: Generates G/FmtE pairs that span 4 bits
-   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-
-   // Stage 4: Generates G/FmtE pairs that span 8 bits
-
-   // Stage 5: Generates G/FmtE pairs that span 4 bits
-   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-
-   // Stage 6: Generates G/FmtE pairs that span 2 bits
-   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-   grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
-
-   // Last grey cell stage 
-   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
-   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
-   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
-
-   // Final Stage: Apply c_k+1=G_k_0
-   assign c[1]=g[0];
-   assign c[2]=G_1_0;
-   assign c[3]=G_2_0;
-   assign c[4]=G_3_0;
-   assign c[5]=G_4_0;
-   assign c[6]=G_5_0;
-   assign c[7]=G_6_0;
-   assign c[8]=G_7_0;
-   assign c[9]=G_8_0;
-
-   assign c[10]=G_9_0;
-   assign c[11]=G_10_0;
-   assign c[12]=G_11_0;
-   assign c[13]=G_12_0;
-   assign c[14]=G_13_0;
-
-endmodule // brent_kung
-
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index 5c15268ed..ff29dfd70 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -34,7 +34,7 @@ module fpu (
   input logic [`XLEN-1:0]  SrcAM,      // Integer input being written into fpreg
   input logic 		         StallE, StallM, StallW,
   input logic 		         FlushE, FlushM, FlushW,
-  output logic 		      FStallD,    // Stall the decode stage if Div/Sqrt instruction
+  output logic 		      FStallD,    // Stall the decode stage
   output logic 		      FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
   output logic [`XLEN-1:0] FWriteDataE,      // Data to be written to memory
   output logic [`XLEN-1:0] FIntResM,     
@@ -42,48 +42,38 @@ module fpu (
   output logic 		      IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
   output logic [4:0] 	   SetFflagsM,       // FPU flags
   output logic [`XLEN-1:0] FPUResultW);      // FPU result
-
+// *** change FMA to do 16 - 32 - 64 - 128 FEXPBITS 
    // control logic signal instantiation
    logic 		   FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;              // FP register write enable
-   logic [2:0] 	FrmD, FrmE, FrmM, FrmW;                                  // FP rounding mode
+   logic [2:0] 	FrmD, FrmE, FrmM;                                  // FP rounding mode
    logic 		   FmtD, FmtE, FmtM, FmtW;                                  // FP precision 0-single 1-double
    logic 		   FDivStartD, FDivStartE;                                  // Start division
    logic 		   FWriteIntD;                                              // Write to integer register
-   logic 		   FOutputInput2D, FOutputInput2E;                          // Put Input2 in Input1 if a store instruction
-   logic [1:0] 	FMemRWD;                                        // Read and write enable for memory
-   logic [1:0]    ForwardXD, ForwardXE;                        // Input1 forwarding mux control signal
-   logic [1:0] 	ForwardYD, ForwardYE;                        // Input2 forwarding mux control signal
-   logic [1:0]		   ForwardZD, ForwardZE;                        // Input3 forwarding mux control signal
-   logic 		   SrcYUsedD;                                            // Is input 2 used
-   logic 		   SrcZUsedD;                                            // Is input 3 used
+   logic [1:0]    ForwardXE, ForwardYE, ForwardZE;                        // Input 1-3 (X, Y, Z) forwarding mux control signals
    logic [2:0] 	FResultSelD, FResultSelE, FResultSelM, FResultSelW;      // Select FP result
-   logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM, FOpCtrlW;                  // Select which opperation to do in each component
-   logic [1:0]         FResSelD, FResSelE, FResSelM;  
-   logic [1:0]         FIntResSelD, FIntResSelE, FIntResSelM;                                   
+   logic [3:0] 	FOpCtrlD, FOpCtrlE, FOpCtrlM;                  // Select which operation to do in each component
+   logic [1:0]    FResSelD, FResSelE, FResSelM;  
+   logic [1:0]    FIntResSelD, FIntResSelE, FIntResSelM;                                   
    logic [4:0] 	Adr1E, Adr2E, Adr3E;
    
    // regfile signals
    logic [4:0]    RdE, RdM, RdW;                                           // what adress to write to    // ***Can take from ieu insted of pipelining
-   logic [63:0] 	FWDM;                                                    // Write data for FP register
    logic [63:0] 	FRD1D, FRD2D, FRD3D;                                     // Read Data from FP register - decode stage
    logic [63:0] 	FRD1E, FRD2E, FRD3E;                                     // Read Data from FP register - execute stage
-   logic [63:0] 	SrcXE, SrcXM, SrcXW;                         // Input 1 to the various units (after forwarding)
    logic [`XLEN-1:0]   SrcXMAligned;
-   logic [63:0] 	SrcYE, SrcYM, SrcYW;                                      // Input 2 to the various units (after forwarding)
+   logic [63:0] 	SrcXE, SrcXM;                         // Input 1 to the various units (after forwarding)
+   logic [63:0] 	SrcYE, SrcYM;                                      // Input 2 to the various units (after forwarding)
    logic [63:0] 	SrcZE, SrcZM;                                      // Input 3 to the various units (after forwarding)
-   logic [63:0] 	FLoadResultW, FLoadStoreResultM, FLoadStoreResultW;      // Result for load, store, and move to int-reg instructions
    
    // div/sqrt signals
-   logic 		   DivDenormE, DivDenormM, DivDenormW;
-   logic 		   DivOvEn, DivUnEn;
-   logic [63:0] 	FDivResultE, FDivResultM, FDivResultW;
-   logic [4:0] 	FDivFlagsE, FDivFlagsM, FDivFlagsW;
-   logic          FDivSqrtDoneE, FDivSqrtDoneM;
+   logic [63:0] 	FDivResultM, FDivResultW;
+   logic [4:0]    FDivSqrtFlgM, FDivSqrtFlgW;
+   logic          FDivSqrtDoneE;
    logic [63:0] 	DivInput1E, DivInput2E;
    logic          HoldInputs;                                              // keep forwarded inputs arround durring division
    
    // FMA signals
-	logic [105:0]	ProdManE, ProdManM;
+	logic [105:0]	ProdManE, ProdManM; ///*** put pipeline stages in units
 	logic [161:0]	AlignedAddendE, AlignedAddendM;                       
 	logic [12:0]	ProdExpE, ProdExpM;
 	logic 			AddendStickyE, AddendStickyM;
@@ -91,93 +81,112 @@ module fpu (
 	logic				XZeroE, YZeroE, ZZeroE, XZeroM, YZeroM, ZZeroM;
 	logic				XInfE, YInfE, ZInfE, XInfM, YInfM, ZInfM;
 	logic				XNaNE, YNaNE, ZNaNE, XNaNM, YNaNM, ZNaNM;
-   logic [63:0]   FmaResultM, FmaResultW;
-   logic [4:0]    FmaFlagsM, FmaFlagsW;
+   logic [63:0]   FMAResM, FMAResW;
+   logic [4:0]    FMAFlgM, FMAFlgW;
 
    // add/cvt signals
-   logic [63:0] 	AddSumE, AddSumTcE;
-   logic [3:0] 	AddSelInvE;
-   logic [10:0] 	AddExpPostSumE;
-   logic 		   AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
-   logic 		   AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
-   logic 		   AddConvertE;
-   logic [63:0] 	AddFloat1E, AddFloat2E;
-   logic [11:0] 	AddExp1DenormE, AddExp2DenormE;
-   logic [10:0] 	AddExponentE;
-   logic [2:0] 	AddRmE;
-   logic [3:0] 	AddOpTypeE;
-   logic 		   AddPE, AddOvEnE, AddUnEnE;    
-   logic 		   AddDenormM;
-   logic [63:0] 	AddSumM, AddSumTcM;
-   logic [3:0] 	AddSelInvM;
-   logic [10:0] 	AddExpPostSumM;
-   logic 		   AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
-   logic 		   AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
-   logic 		   AddConvertM, AddSignM;
-   logic [63:0] 	AddFloat1M, AddFloat2M;
-   logic [11:0] 	AddExp1DenormM, AddExp2DenormM;
-   logic [10:0] 	AddExponentM;
-   logic [63:0] 	AddOp1M, AddOp2M;
-   logic [2:0] 	AddRmM;
-   logic [3:0] 	AddOpTypeM;
-   logic 		   AddPM, AddOvEnM, AddUnEnM;  
-   logic [63:0] 	FAddResultM, FAddResultW;
-   logic [4:0] 	FAddFlagsM, FAddFlagsW;
+   logic [63:0] 	AddSumE, AddSumM;
+   logic [63:0]   AddSumTcE, AddSumTcM;
+   logic [3:0] 	AddSelInvE, AddSelInvM;
+   logic [10:0] 	AddExpPostSumE,AddExpPostSumM;
+   logic 		   AddCorrSignE, AddCorrSignM;
+   logic          AddOp1NormE, AddOp1NormM;
+   logic          AddOp2NormE, AddOp2NormM;
+   logic          AddOpANormE,  AddOpANormM;
+   logic          AddOpBNormE, AddOpBNormM;
+   logic          AddInvalidE, AddInvalidM;
+   logic 		   AddDenormInE, AddDenormInM;
+   logic          AddSwapE, AddSwapM;
+   logic          AddNormOvflowE, AddNormOvflowM; //***this isn't used in addcvt2
+   logic          AddSignAE, AddSignAM;
+   logic 		   AddConvertE, AddConvertM;
+   logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
+   logic [11:0] 	AddExp1DenormE, AddExp2DenormE, AddExp1DenormM, AddExp2DenormM;
+   logic [10:0] 	AddExponentE, AddExponentM;
+   logic [63:0] 	FAddResM, FAddResW;
+   logic [4:0] 	FAddFlgM, FAddFlgW;  
    
    // cmp signals 
-   logic 		   CmpInvalidE, CmpInvalidM, CmpInvalidW;
-   logic [63:0] 	FCmpResultE, FCmpResultM, FCmpResultW;
+   logic 		   CmpNVE, CmpNVM, CmpNVW;
+   logic [63:0] 	CmpResE, CmpResM, CmpResW;
    
    // fsgn signals
-   logic [63:0] 	SgnResultE, SgnResultM, SgnResultW;
-   logic [4:0] 	SgnFlagsE, SgnFlagsM, SgnFlagsW;
+   logic [63:0] 	SgnResE, SgnResM;
+   logic        	SgnNVE, SgnNVM, SgnNVW;
    logic [63:0]   FResM, FResW;
-   logic    FFlgM, FFlgW;
+   logic          FFlgM, FFlgW;
    
    // instantiation of W stage regfile signals
-   logic [63:0] 	AlignedSrcAM, ForwardSrcAM, SrcAW;
+   logic [63:0] 	AlignedSrcAM;
    
    // classify signals
-   logic [63:0] 	ClassResultE, ClassResultM, ClassResultW;
+   logic [63:0] 	ClassResE, ClassResM;
    
    // 64-bit FPU result   
-   logic [63:0] 	FPUResult64W, FPUResult64E;                                           
+   logic [63:0] 	FPUResult64W;                                           
    logic [4:0] 	FPUFlagsW;
    
    
+
+
+
+
+
+
+
    //DECODE STAGE
    
    
    // top-level controller for FPU
-   fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
+   fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
+               .FRM_REGW, .IllegalFPUInstrD, .FWriteEnD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+               .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
    
    // regfile instantiation
-   FPregfile fpregfile (clk, reset, FWriteEnW,
+   fregfile fregfile (clk, reset, FWriteEnW,
 			InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
 			FPUResult64W,
 			FRD1D, FRD2D, FRD3D);	
    
+
+
+
+
+
+
+
+
    //*****************
-   // fpregfile D/E pipe registers
+   // D/E pipe registers
    //*****************
    flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
    flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
    flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-   
-   //*****************
-   // other  D/E pipe registers
-   //*****************
-   flopenrc #(1) CtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
-   flopenrc #(15) CtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+   flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
+   flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
                                                          {Adr1E,         Adr2E,         Adr3E});
-   flopenrc #(22) DECtrlReg(clk, reset, FlushE, ~StallE, 
+   flopenrc #(22) DECtrlReg3(clk, reset, FlushE, ~StallE, 
                         {FWriteEnD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, InstrD[11:7], FOpCtrlD, FWriteIntD},
                         {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE});
 
+
+
+
+
+
+
+
+
+
+
+
+
+
    //EXECUTION STAGE
    
    // Hazard unit for FPU
-   fpuhazard hazard(.*);
+   fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FWriteEnM, .FWriteEnW, .RdM, .RdW, .FResultSelM, .FStallD, 
+                     .ForwardXE, .ForwardYE, .ForwardZE);
 
    // forwarding muxs
    mux3  #(64)  fxemux(FRD1E, FPUResult64W, FResM, ForwardXE, SrcXE);
@@ -186,7 +195,9 @@ module fpu (
 
    
    // first of two-stage instance of floating-point fused multiply-add unit
-   fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE(FOpCtrlE[2:0]),.*);
+   fma1 fma1 (.X(SrcXE), .Y(SrcYE), .Z(SrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .ProdManE, .AlignedAddendE,
+               .ProdExpE, .AddendStickyE, .KillProdE, .XZeroE, .YZeroE, .ZZeroE, .XInfE, .YInfE, .ZInfE,
+               .XNaNE, .YNaNE, .ZNaNE );
    
    // first and only instance of floating-point divider
    logic fpdivClk;
@@ -204,174 +215,140 @@ module fpu (
                .en(~HoldInputs), .clear(FDivSqrtDoneE),
                .reset(reset),  .clk(clk));
 
-   fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .*);
+   fdivsqrt fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
+                     .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
+                     .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
    
 
 
    // first of two-stage instance of floating-point add/cvt unit
-   fpuaddcvt1 fpadd1 (.*);
+   fpuaddcvt1 fpadd1 (.SrcXE, .SrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
+                     .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
+                     .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
+                     .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
    
-   // first of two-stage instance of floating-point comparator
-   fpucmp1 fpcmp1 (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpInvalidE, FCmpResultE);
+   // first and only instance of floating-point comparator
+   fcmp fcmp (SrcXE, SrcYE, FOpCtrlE[2:0], FmtE, CmpNVE, CmpResE);
    
    // first and only instance of floating-point sign converter
-   fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*);
+   fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .SrcXE, .SrcYE, .SgnResE, .SgnNVE);
    
    // first and only instance of floating-point classify unit
-   fpuclassify fpuclass (.*);
+   fclassify fclassify (.SrcXE, .FmtE, .ClassResE);
 
    // output for store instructions
    assign FWriteDataE = FmtE ? SrcYE[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcYE[63:32]};
-   
+   //***swap to mux
+
+
+
+
+
+
+
+
+
+
    //*****************
-   //fpregfile D/E pipe registers
+   // E/M pipe registers
    //*****************
    flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, SrcXE, SrcXM);
    flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, SrcYE, SrcYM);
    flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, SrcZE, SrcZM);
    
-   //*****************
-   // fma E/M pipe registers
-   //*****************  
-  flopenrc #(106) EMRegFma3(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
-  flopenrc #(162) EMRegFma4(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
-  flopenrc #(13) EMRegFma6(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
-  flopenrc #(1) EMRegFma7(clk, reset, FlushM, ~StallM, AddendStickyE, AddendStickyM); 
-  flopenrc #(1) EMRegFma8(clk, reset, FlushM, ~StallM, KillProdE, KillProdM); 
-  flopenrc #(1) EMRegFma10(clk, reset, FlushM, ~StallM, XZeroE, XZeroM); 
-  flopenrc #(1) EMRegFma11(clk, reset, FlushM, ~StallM, YZeroE, YZeroM); 
-  flopenrc #(1) EMRegFma12(clk, reset, FlushM, ~StallM, ZZeroE, ZZeroM); 
-  flopenrc #(1) EMRegFma16(clk, reset, FlushM, ~StallM, XInfE, XInfM); 
-  flopenrc #(1) EMRegFma17(clk, reset, FlushM, ~StallM, YInfE, YInfM); 
-  flopenrc #(1) EMRegFma18(clk, reset, FlushM, ~StallM, ZInfE, ZInfM); 
-  flopenrc #(1) EMRegFma19(clk, reset, FlushM, ~StallM, XNaNE, XNaNM); 
-  flopenrc #(1) EMRegFma20(clk, reset, FlushM, ~StallM, YNaNE, YNaNM); 
-  flopenrc #(1) EMRegFma21(clk, reset, FlushM, ~StallM, ZNaNE, ZNaNM);  
+   flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
+   flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
+   flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
+   flopenrc #(11) EMRegFma4(clk, reset, FlushM, ~StallM, 
+                              {AddendStickyE, KillProdE, XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE},
+                              {AddendStickyM, KillProdM, XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM});
 
-   //*****************
-   // fpadd E/M pipe registers
-   //*****************
    flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
    flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
-   flopenrc #(4)  EMRegAdd3(clk, reset, FlushM, ~StallM, AddSelInvE, AddSelInvM); 
-   flopenrc #(11) EMRegAdd4(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
-   flopenrc #(1) EMRegAdd5(clk, reset, FlushM, ~StallM, AddCorrSignE, AddCorrSignM); 
-   flopenrc #(1) EMRegAdd6(clk, reset, FlushM, ~StallM, AddOp1NormE, AddOp1NormM); 
-   flopenrc #(1) EMRegAdd7(clk, reset, FlushM, ~StallM, AddOp2NormE, AddOp2NormM); 
-   flopenrc #(1) EMRegAdd8(clk, reset, FlushM, ~StallM, AddOpANormE, AddOpANormM); 
-   flopenrc #(1) EMRegAdd9(clk, reset, FlushM, ~StallM, AddOpBNormE, AddOpBNormM); 
-   flopenrc #(1) EMRegAdd10(clk, reset, FlushM, ~StallM, AddInvalidE, AddInvalidM); 
-   flopenrc #(1) EMRegAdd11(clk, reset, FlushM, ~StallM, AddDenormInE, AddDenormInM); 
-   flopenrc #(1) EMRegAdd12(clk, reset, FlushM, ~StallM, AddConvertE, AddConvertM); 
-   flopenrc #(1) EMRegAdd13(clk, reset, FlushM, ~StallM, AddSwapE, AddSwapM); 
-   flopenrc #(1) EMRegAdd14(clk, reset, FlushM, ~StallM, AddNormOvflowE, AddNormOvflowM); 
-   flopenrc #(1) EMRegAdd15(clk, reset, FlushM, ~StallM, AddSignAE, AddSignAM); 
-   flopenrc #(64) EMRegAdd16(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
-   flopenrc #(64) EMRegAdd17(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
-   flopenrc #(12) EMRegAdd18(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
-   flopenrc #(12) EMRegAdd19(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
-   flopenrc #(11) EMRegAdd20(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM); 
-   flopenrc #(3) EMRegAdd23(clk, reset, FlushM, ~StallM, AddRmE, AddRmM); 
-   flopenrc #(4) EMRegAdd24(clk, reset, FlushM, ~StallM, AddOpTypeE, AddOpTypeM); 
-   flopenrc #(1) EMRegAdd25(clk, reset, FlushM, ~StallM, AddPE, AddPM); 
-   flopenrc #(1) EMRegAdd26(clk, reset, FlushM, ~StallM, AddOvEnE, AddOvEnM); 
-   flopenrc #(1) EMRegAdd27(clk, reset, FlushM, ~StallM, AddUnEnE, AddUnEnM); 
+   flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
+   flopenrc #(64) EMRegAdd4(clk, reset, FlushM, ~StallM, AddFloat1E, AddFloat1M); 
+   flopenrc #(64) EMRegAdd5(clk, reset, FlushM, ~StallM, AddFloat2E, AddFloat2M); 
+   flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
+   flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
+   flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
+   flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, 
+                           {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
+                           {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
+
+   flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
+   flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
    
-   //*****************
-   // fpcmp E/M pipe registers
-   //*****************
-   flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpInvalidE, CmpInvalidM); 
-   flopenrc #(64) EMRegCmp3(clk, reset, FlushM, ~StallM, FCmpResultE, FCmpResultM); 
+   flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
+   flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
    
-   //*****************
-   // fpsgn E/M pipe registers
-   //***************** 
-   flopenrc #(64) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnResultE, SgnResultM);
-   flopenrc #(5) EMRegSgn3(clk, reset, FlushM, ~StallM, SgnFlagsE, SgnFlagsM);
-   
-   //*****************
-   // other E/M pipe registers
-   //*****************
    flopenrc #(22) EMCtrlReg(clk, reset, FlushM, ~StallM,
                         {FWriteEnE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE, FOpCtrlE, FWriteIntE},
                         {FWriteEnM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});
+
+   flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
    
-   //*****************
-   // fpuclassify E/M pipe registers
-   //***************** 
-   flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResultE, ClassResultM);
-   
+
+
+
+
+
+
+
    //BEGIN MEMORY STAGE
    
-   mux3  #(64)  FResMux(AlignedSrcAM, SgnResultM, FCmpResultM, FResSelM, FResM);
-   assign FFlgM = CmpInvalidM & FResSelM[1];
+   mux3  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, FResSelM, FResM);
+   mux3  #(1)  FFlgMux(1'b0, SgnNVM, CmpNVM, FResSelM, FFlgM);
 
+   //***change to mux
    assign SrcXMAligned = FmtM ? SrcXM[63:64-`XLEN] : {{`XLEN-32{1'b0}}, SrcXM[63:32]};
-   mux3  #(`XLEN)  IntResMux(FCmpResultM[`XLEN-1:0], SrcXMAligned, ClassResultM[`XLEN-1:0], FIntResSelM, FIntResM);
+   mux3  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXMAligned, ClassResM[`XLEN-1:0], FIntResSelM, FIntResM);
 
    // second instance of two-stage FMA unit
-   fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .*);
+   fma2 fma2(.X(SrcXM), .Y(SrcYM), .Z(SrcZM), .FOpCtrlM(FOpCtrlM[2:0]), .FrmM, .FmtM, 
+            .ProdManM, .AlignedAddendM, .ProdExpM, .AddendStickyM, .KillProdM, 
+            .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
+            .FMAResM, .FMAFlgM);
    
    // second instance of two-stage floating-point add/cvt unit
-   fpuaddcvt2 fpadd2 (.*);
+   fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
+                     .AddExp1DenormM, .AddExp2DenormM, .AddExponentM, .AddExpPostSumM, .AddSelInvM, 
+                     .AddOp1NormM, .AddOp2NormM, .AddOpANormM, .AddOpBNormM, .AddInvalidM, .AddDenormInM, 
+                     .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
    
    // Align SrcA to MSB when single precicion
    mux2  #(64)  SrcAMux({SrcAM[31:0], 32'b0}, {{64-`XLEN{1'b0}}, SrcAM}, FmtM, AlignedSrcAM);
       
 
 
-      
+
+
+
+
+
+
+         
    //*****************
-   //fpregfile M/W pipe registers
+   // M/W pipe registers
    //*****************
-   flopenrc #(64) MWFpReg1(clk, reset, FlushW, ~StallW, SrcXM, SrcXW);
-   flopenrc #(64) MWFpReg2(clk, reset, FlushW, ~StallW, SrcYM, SrcYW);
+   flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
+   flopenrc #(5) MWRegFma2(clk, reset, FlushW, ~StallW, FMAFlgM, FMAFlgW); 
    
-   //*****************
-   // fma M/W pipe registers
-   //*****************
-   flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FmaResultM, FmaResultW); 
-   flopenrc #(5) MWRegFma2(clk, reset, FlushW, ~StallW, FmaFlagsM, FmaFlagsW); 
-   
-   //*****************
-   // fpdiv M/W pipe registers
-   //*****************
    flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
-   flopenrc #(5) MWRegDiv2(clk, reset, FlushW, ~StallW, FDivFlagsM, FDivFlagsW);
-   flopenrc #(1) MWRegDiv3(clk, reset, FlushW, ~StallW, DivDenormM, DivDenormW); 
+   flopenrc #(5) MWRegDiv2(clk, reset, FlushW, ~StallW, FDivSqrtFlgM, FDivSqrtFlgW);
    
-   //*****************
-   // fpadd M/W pipe registers
-   //*****************
-   flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResultM, FAddResultW); 
-   flopenrc #(5) MWRegAdd2(clk, reset, FlushW, ~StallW, FAddFlagsM, FAddFlagsW); 
+   flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
+   flopenrc #(5) MWRegAdd2(clk, reset, FlushW, ~StallW, FAddFlgM, FAddFlgW); 
    
-   //*****************
-   // fpcmp M/W pipe registers
-   //*****************
-   flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpInvalidM, CmpInvalidW); 
-   // flopenrc #(2) MWRegCmp2(clk, reset, FlushW, ~StallW, CmpFCCM, CmpFCCW); 
-   flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, FCmpResultM, FCmpResultW); 
+   flopenrc #(1) MWRegCmp1(clk, reset, FlushW, ~StallW, CmpNVM, CmpNVW); 
+   flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);
+
+   flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
+   flopenrc #(1) MWRegClass1(clk, reset, FlushW, ~StallW, FFlgM, FFlgW);
    
-   //*****************
-   // fpsgn M/W pipe registers
-   //***************** 
-   flopenrc #(64) MWRegSgn1(clk, reset, FlushW, ~StallW, SgnResultM, SgnResultW);
-   flopenrc #(5) MWRegSgn2(clk, reset, FlushW, ~StallW, SgnFlagsM, SgnFlagsW);
-   
-   //*****************
-   // other M/W pipe registers
-   //*****************
    flopenrc #(11) MWCtrlReg(clk, reset, FlushW, ~StallW,
                         {FWriteEnM, FResultSelM, RdM, FmtM, FWriteIntM},
                         {FWriteEnW, FResultSelW, RdW, FmtW, FWriteIntW});
    
-   //*****************
-   // fpuclassify M/W pipe registers
-   //***************** 
-   flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, ClassResultM, ClassResultW);
-   flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
-   flopenrc #(1) MWRegClass1(clk, reset, FlushW, ~StallW, FFlgM, FFlgW);
    
 
 
@@ -385,13 +362,13 @@ module fpu (
 
 
 
-
+//***turn into muxes
    always_comb begin
       case (FResultSelW)
 	3'b000 : FPUFlagsW = 5'b0;
-	3'b001 : FPUFlagsW = FmaFlagsW;
-	3'b010 : FPUFlagsW = FAddFlagsW;
-	3'b011 : FPUFlagsW = FDivFlagsW;
+	3'b001 : FPUFlagsW = FMAFlgW;
+	3'b010 : FPUFlagsW = FAddFlgW;
+	3'b011 : FPUFlagsW = FDivSqrtFlgW;
 	3'b100 : FPUFlagsW = {4'b0,FFlgW};
 	default : FPUFlagsW = 5'bxxxxx;
       endcase
@@ -400,8 +377,8 @@ module fpu (
    always_comb begin
       case (FResultSelW)
 	3'b000 : FPUResult64W = FmtW ? {ReadDataW, {64-`XLEN{1'b0}}} : {ReadDataW[31:0], 32'b0};
-	3'b001 : FPUResult64W = FmaResultW;
-	3'b010 : FPUResult64W = FAddResultW;
+	3'b001 : FPUResult64W = FMAResW;
+	3'b010 : FPUResult64W = FAddResW;
 	3'b011 : FPUResult64W = FDivResultW;
 	3'b100 : FPUResult64W = FResW;
 	default : FPUResult64W = 64'bxxxxx;
@@ -415,7 +392,9 @@ module fpu (
    // define offsets for LSB zero extension or truncation
    always_comb begin      
       // zero extension 
+//***turn into mux
       FPUResultW = FmtW ? FPUResult64W[63:64-`XLEN] : {{`XLEN-32{1'b0}}, FPUResult64W[63:32]};
+      //*** put into mem stage
       SetFflagsM = FPUFlagsW;      
    end
   
diff --git a/wally-pipelined/src/fpu/fpuaddcvt1.sv b/wally-pipelined/src/fpu/fpuaddcvt1.sv
index 8f045dcdb..1b86b1984 100755
--- a/wally-pipelined/src/fpu/fpuaddcvt1.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt1.sv
@@ -183,11 +183,11 @@ module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE,
    assign AddCorrSignE = ~FOpCtrlE[2]&~FOpCtrlE[1]&FOpCtrlE[0]&AddSwapE;
 
    // 64-bit Mantissa Adder/Subtractor
-   cla64 add1 (AddSumE, mantissaA3, mantissaB3, sub);
+   cla64 add1 (AddSumE, mantissaA3, mantissaB3, sub); //***adder
 
    // 64-bit Mantissa Subtractor - to get the two's complement of the 
    // result when the sign from the adder/subtractor is negative. 
-   cla_sub64 sub1 (AddSumTcE, mantissaB3, mantissaA3);
+   cla_sub64 sub1 (AddSumTcE, mantissaB3, mantissaA3); //***adder
  
    // Finds normal underflow result to determine whether to round final exponent down
    //***KEP used to be (AddSumE == 16'h0) I am unsure what it's supposed to be
diff --git a/wally-pipelined/src/fpu/fpuaddcvt2.sv b/wally-pipelined/src/fpu/fpuaddcvt2.sv
index 46eac200f..1fe8ac658 100755
--- a/wally-pipelined/src/fpu/fpuaddcvt2.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt2.sv
@@ -27,7 +27,7 @@
 //
 
 
-module fpuaddcvt2 (FAddResultM, FAddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, FrmM, FOpCtrlM, FmtM);
+module fpuaddcvt2 (FAddResM, FAddFlgM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, FrmM, FOpCtrlM, FmtM);
 
    input [2:0] 	FrmM;		// Rounding mode - specify values 
    input [3:0]	FOpCtrlM;	// Function opcode
@@ -51,9 +51,9 @@ module fpuaddcvt2 (FAddResultM, FAddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddS
    input          AddSwapM;
    // input 	 AddNormOvflowM;
 
-   output [63:0] FAddResultM;	// Result of operation
-   output [4:0]  FAddFlagsM;   	// IEEE exception flags 
-   output 	 AddDenormM;   	// AddDenormM on input or output   
+   output [63:0] FAddResM;	// Result of operation
+   output [4:0]  FAddFlgM;   	// IEEE exception flags 
+   wire 	 AddDenormM;   	// AddDenormM on input or output   
 
    wire          P;
    assign P = ~FmtM | FOpCtrlM[2];
@@ -145,7 +145,7 @@ module fpuaddcvt2 (FAddResultM, FAddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddS
    // exactly where the rounding point is. The rounding units also
    // handles special cases and set the exception flags.
 
-   // Changed DenormIO -> AddDenormM and FlagsIn -> FAddFlagsM in order to
+   // Changed DenormIO -> AddDenormM and FlagsIn -> FAddFlgM in order to
    // help in processor reservation station detection of load/stores. In
    // other words, the processor would like to know ahead of time that
    // if the result is an exception then don't load or store.
@@ -155,8 +155,8 @@ module fpuaddcvt2 (FAddResultM, FAddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddS
 		   AddNormOvflowM, normal_underflow, AddSwapM, FOpCtrlM, AddSumM);
 
    // Store the final result and the exception flags in registers.
-   assign FAddResultM = Result;
-   assign {AddDenormM, FAddFlagsM} = {DenormIO, FlagsIn};
+   assign FAddResM = Result;
+   assign {AddDenormM, FAddFlgM} = {DenormIO, FlagsIn};
    
 endmodule // fpadd
 
diff --git a/wally-pipelined/src/fpu/fpuclassify.sv b/wally-pipelined/src/fpu/fpuclassify.sv
deleted file mode 100644
index b320b2f07..000000000
--- a/wally-pipelined/src/fpu/fpuclassify.sv
+++ /dev/null
@@ -1,50 +0,0 @@
-
-`include "wally-config.vh"
-
-module fpuclassify (
-    input  logic [63:0] SrcXE,
-    input  logic        FmtE,           // 0-single 1-double
-    output logic [63:0] ClassResultE
-    );
-
-    logic [31:0] single;
-    logic [63:0] double;
-    logic sign;
-    logic infinity, NaN, zero, normal, subnormal;
-    logic ExpNotZero, ExpOnes, ManNotZero, ExpZero, ManZero, FirstBitMan;
-   
-    // single and double precision layouts
-    assign single = SrcXE[63:32];
-    assign double = SrcXE;
-    assign sign = SrcXE[63];
-
-    // basic calculations for readabillity
-    assign ExpNotZero = FmtE ? |double[62:52] : |single[30:23];
-    assign ExpZero = ~ExpNotZero;
-    assign ExpOnes = FmtE ? &double[62:52] : &single[30:23];
-    assign ManNotZero = FmtE ? |double[51:0] : |single[22:0];
-    assign ManZero = ~ManNotZero;
-    assign FirstBitMan = FmtE ? double[51] : single[22];
-
-    // determine the type of number
-    assign NaN      = ExpOnes & ManNotZero;
-    assign infinity = ExpOnes & ManZero;
-    assign zero     = ExpZero & ManZero;
-    assign subnormal= ExpZero & ManNotZero;
-    assign normal   = ExpNotZero;
-
-    // determine sub category and combine into the result
-    //  bit 0 - -infinity
-    //  bit 1 - -normal
-    //  bit 2 - -subnormal
-    //  bit 3 - -zero
-    //  bit 4 - +zero
-    //  bit 5 - +subnormal
-    //  bit 6 - +normal
-    //  bit 7 - +infinity
-    //  bit 8 - signaling NaN
-    //  bit 9 - quiet NaN
-    assign ClassResultE = {{54{1'b0}}, FirstBitMan&NaN, ~FirstBitMan&NaN, ~sign&infinity, ~sign&normal, 
-                                    ~sign&subnormal, ~sign&zero, sign&zero, sign&subnormal, sign&normal, sign&infinity};
-
-endmodule
diff --git a/wally-pipelined/src/fpu/fpucmp1.sv b/wally-pipelined/src/fpu/fpucmp1.sv
deleted file mode 100755
index 3a8245e63..000000000
--- a/wally-pipelined/src/fpu/fpucmp1.sv
+++ /dev/null
@@ -1,465 +0,0 @@
-
-//
-// File name : fpcomp.v
-// Title     : Floating-Point Comparator
-// project   : FPU
-// Library   : fpcomp
-// Author(s) : James E. Stine
-// Purpose   : definition of main unit to floating-point comparator
-// notes :   
-//
-// Copyright Oklahoma State University
-//
-// Floating Point Comparator (Algorithm)
-//
-// 1.) Performs sign-extension if the inputs are 32-bit integers.
-// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
-// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
-//     and correct for sign bits
-//
-// This module takes 64-bits inputs op1 and op2, VSS, and VDD
-// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
-// operands being compared as indicated below.
-//	FOpCtrlE	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	(unused)
-//
-// The comparator produces a 2-bit signal FCC, which
-// indicates the result of the comparison:
-//
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-//
-// It also produces an invalid operation flag, which is one
-// if either of the input operands is a signaling NaN per 754
-
-`include "wally-config.vh"
-module fpucmp1 (   
-   input logic [63:0] op1, 
-   input logic [63:0] op2,
-   input logic [2:0]  FOpCtrlE,
-   input logic 	      FmtE,
-
-   
-   output logic       Invalid, 		 // Invalid Operation
-   // output logic [1:0] FCC,  		 // Condition Codes 
-   output logic [63:0] FCmpResultE);
-   // Perform magnitude comparison between the 63 least signficant bits
-   // of the input operands. Only LT and EQ are returned, since GT can
-   // be determined from these values. 
-   logic [1:0] FCC;  		 // Condition Codes 
-   logic [7:0]	      w, x;
-   logic	      ANaN, BNaN;
-   logic	      Azero, Bzero;
-   logic 	      LT;                // magnitude op1 < magnitude op2
-   logic 	      EQ;                // magnitude op1 = magnitude op2
-   
-   magcompare64b_1 magcomp1 (w, x, {~op1[63], op1[62:0]}, {~op2[63], op2[62:0]});
-
-   // Determine final values based on output of magnitude comparison, 
-   // sign bits, and special case testing. 
-   exception_cmp_1 exc1 (ANaN, BNaN, Azero, Bzero, op1, op2, FOpCtrlE);
-   
-   // Perform magnitude comparison between the 63 least signficant bits
-   // of the input operands. Only LT and EQ are returned, since GT can
-   // be determined from these values. 
-   magcompare64b_2 magcomp2 (LT, EQ, w, x);
-
-   // Determine final values based on output of magnitude comparison, 
-   // sign bits, and special case testing. 
-   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .FOpCtrlE(FOpCtrlE), .A(op1), .B(op2), .*);
-
-endmodule // fpcomp
-
-// module magcompare2b (LT, GT, A, B);
-
-//    input logic [1:0] A;
-//    input logic [1:0] B;
-   
-//    output logic     LT;
-//    output logic     GT;
-
-//    // Determine if A < B  using a minimized sum-of-products expression
-//    assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-//    // Determine if A > B  using a minimized sum-of-products expression
-//    assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-// endmodule // magcompare2b
-
-// 2-bit magnitude comparator
-// This module compares two 2-bit values A and B. LT is '1' if A < B 
-// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
-// this version actually incorporates don't cares into the equation to
-// simplify the optimization
-
-module magcompare2c (LT, GT, A, B);
-
-   input logic [1:0] A;
-   input logic [1:0] B;
-   
-   output logic      LT;
-   output logic      GT;
-
-   assign LT = B[1] | (!A[1]&B[0]);
-   assign GT = A[1] | (!B[1]&A[0]);
-
-endmodule // magcompare2b
-
-// This module compares two 64-bit values A and B. LT is '1' if A < B 
-// and EQ is '1'if A = B. LT and GT are both '0' if A > B.
-// This structure was modified so
-// that it only does a strict magnitdude comparison, and only
-// returns flags for less than (LT) and eqaual to (EQ). It uses a tree 
-// of 63 2-bit magnitude comparators, followed by one OR gates.
-//
-// J. E. Stine and M. J. Schulte, "A combined two's complement and
-// floating-point comparator," 2005 IEEE International Symposium on
-// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
-// doi: 10.1109/ISCAS.2005.1464531
-
-module magcompare64b_1 (w, x,  A, B);
-
-   input logic [63:0] A;
-   input logic [63:0] B;
-   
-   logic [31:0]       s;
-   logic [31:0]       t;
-   logic [15:0]       u;
-   logic [15:0]       v;
-   output logic [7:0] 	      w;
-   output logic [7:0] 	      x;
-   
-   magcompare2b mag1(s[0], t[0], A[1:0], B[1:0]);
-   magcompare2b mag2(s[1], t[1], A[3:2], B[3:2]);
-   magcompare2b mag3(s[2], t[2], A[5:4], B[5:4]);
-   magcompare2b mag4(s[3], t[3], A[7:6], B[7:6]);
-   magcompare2b mag5(s[4], t[4], A[9:8], B[9:8]);
-   magcompare2b mag6(s[5], t[5], A[11:10], B[11:10]);
-   magcompare2b mag7(s[6], t[6], A[13:12], B[13:12]);
-   magcompare2b mag8(s[7], t[7], A[15:14], B[15:14]);
-   magcompare2b mag9(s[8], t[8], A[17:16], B[17:16]);
-   magcompare2b magA(s[9], t[9], A[19:18], B[19:18]);
-   magcompare2b magB(s[10], t[10], A[21:20], B[21:20]);
-   magcompare2b magC(s[11], t[11], A[23:22], B[23:22]);
-   magcompare2b magD(s[12], t[12], A[25:24], B[25:24]);
-   magcompare2b magE(s[13], t[13], A[27:26], B[27:26]);
-   magcompare2b magF(s[14], t[14], A[29:28], B[29:28]);
-   magcompare2b mag10(s[15], t[15], A[31:30], B[31:30]);
-   magcompare2b mag11(s[16], t[16], A[33:32], B[33:32]);
-   magcompare2b mag12(s[17], t[17], A[35:34], B[35:34]);
-   magcompare2b mag13(s[18], t[18], A[37:36], B[37:36]);
-   magcompare2b mag14(s[19], t[19], A[39:38], B[39:38]);
-   magcompare2b mag15(s[20], t[20], A[41:40], B[41:40]);
-   magcompare2b mag16(s[21], t[21], A[43:42], B[43:42]);
-   magcompare2b mag17(s[22], t[22], A[45:44], B[45:44]);
-   magcompare2b mag18(s[23], t[23], A[47:46], B[47:46]);
-   magcompare2b mag19(s[24], t[24], A[49:48], B[49:48]);
-   magcompare2b mag1A(s[25], t[25], A[51:50], B[51:50]);
-   magcompare2b mag1B(s[26], t[26], A[53:52], B[53:52]);
-   magcompare2b mag1C(s[27], t[27], A[55:54], B[55:54]);
-   magcompare2b mag1D(s[28], t[28], A[57:56], B[57:56]);
-   magcompare2b mag1E(s[29], t[29], A[59:58], B[59:58]);
-   magcompare2b mag1F(s[30], t[30], A[61:60], B[61:60]);
-   magcompare2b mag20(s[31], t[31], A[63:62], B[63:62]);
-
-   magcompare2c mag21(u[0], v[0], t[1:0], s[1:0]);
-   magcompare2c mag22(u[1], v[1], t[3:2], s[3:2]);
-   magcompare2c mag23(u[2], v[2], t[5:4], s[5:4]);
-   magcompare2c mag24(u[3], v[3], t[7:6], s[7:6]);
-   magcompare2c mag25(u[4], v[4], t[9:8], s[9:8]);
-   magcompare2c mag26(u[5], v[5], t[11:10], s[11:10]);
-   magcompare2c mag27(u[6], v[6], t[13:12], s[13:12]);
-   magcompare2c mag28(u[7], v[7], t[15:14], s[15:14]);
-   magcompare2c mag29(u[8], v[8], t[17:16], s[17:16]);
-   magcompare2c mag2A(u[9], v[9], t[19:18], s[19:18]);
-   magcompare2c mag2B(u[10], v[10], t[21:20], s[21:20]);
-   magcompare2c mag2C(u[11], v[11], t[23:22], s[23:22]);
-   magcompare2c mag2D(u[12], v[12], t[25:24], s[25:24]);
-   magcompare2c mag2E(u[13], v[13], t[27:26], s[27:26]);
-   magcompare2c mag2F(u[14], v[14], t[29:28], s[29:28]);
-   magcompare2c mag30(u[15], v[15], t[31:30], s[31:30]);
-
-   magcompare2c mag31(w[0], x[0], v[1:0], u[1:0]);
-   magcompare2c mag32(w[1], x[1], v[3:2], u[3:2]);
-   magcompare2c mag33(w[2], x[2], v[5:4], u[5:4]);
-   magcompare2c mag34(w[3], x[3], v[7:6], u[7:6]);
-   magcompare2c mag35(w[4], x[4], v[9:8], u[9:8]);
-   magcompare2c mag36(w[5], x[5], v[11:10], u[11:10]);
-   magcompare2c mag37(w[6], x[6], v[13:12], u[13:12]);
-   magcompare2c mag38(w[7], x[7], v[15:14], u[15:14]);
-
-endmodule // magcompare64b
-
-// This module takes 64-bits inputs A and B, two magnitude comparison
-// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
-// operands being compared as indicated below.
-//	FOpCtrlE	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	bfloat precision numbers
-//
-// The comparator produces a 2-bit signal fcc, which
-// indicates the result of the comparison as follows:
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-// It also produces a invalid operation flag, which is one
-// if either of the input operands is a signaling NaN.
-
-module exception_cmp_1 (ANaN, BNaN, Azero, Bzero, A, B, FOpCtrlE);
-
-   input logic [63:0] A;
-   input logic [63:0] B;
-   input logic [2:0]  FOpCtrlE;
-
-   logic 		      dp, sp, hp;
-
-   output logic 	      ANaN;
-   output logic 	      BNaN;
-   output logic               Azero;
-   output logic               Bzero;
-
-   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
-   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
-   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
-
-   // Test if A or B is NaN.
-   assign ANaN = (A[62]&A[61]&A[60]&A[59]&A[58]) & 
-		 ((sp&A[57]&A[56]&A[55]&(A[54]|A[53])) | 
-		 (dp&A[57]&A[56]&A[55]&A[54]&A[53]&A[52]&(A[51]|A[50])) |
-		 (hp&(A[57]|A[56])));
-
-   assign BNaN = (B[62]&B[61]&B[60]&B[59]&B[58]) & 
-		 ((sp&B[57]&B[56]&B[55]&(B[54]|B[53])) | 
-		 (dp&B[57]&B[56]&B[55]&B[54]&B[53]&B[52]&(B[51]|B[50])) |
-		 (hp&(B[57]|B[56])));
-
-   // Test if A is +0 or -0 when viewed as a floating point number (i.e,
-   // the 63 least siginficant bits of A are zero). 
-   // Depending on how this synthesizes, it may work better to replace
-   // this with assign Azero = ~(A[62] | A[61] | ... | A[0])
-   assign Azero = (A[62:0] == 63'h0);
-   assign Bzero = (B[62:0] == 63'h0);
-
-endmodule // exception_cmp
-//
-// File name : fpcomp.v
-// Title     : Floating-Point Comparator
-// project   : FPU
-// Library   : fpcomp
-// Author(s) : James E. Stine
-// Purpose   : definition of main unit to floating-point comparator
-// notes :   
-//
-// Copyright Oklahoma State University
-//
-// Floating Point Comparator (Algorithm)
-//
-// 1.) Performs sign-extension if the inputs are 32-bit integers.
-// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
-// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
-//     and correct for sign bits
-//
-// This module takes 64-bits inputs op1 and op2, VSS, and VDD
-// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
-// operands being compared as indicated below.
-//	FOpCtrlE	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	(unused)
-//
-// The comparator produces a 2-bit signal FCC, which
-// indicates the result of the comparison:
-//
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-//
-// It also produces an invalid operation flag, which is one
-// if either of the input operands is a signaling NaN per 754
-
-
-/*module magcompare2b (LT, GT, A, B);
-
-   input logic [1:0] A;
-   input logic [1:0] B;
-   
-   output logic     LT;
-   output logic     GT;
-
-   // Determine if A < B  using a minimized sum-of-products expression
-   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-   // Determine if A > B  using a minimized sum-of-products expression
-   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-endmodule*/ // magcompare2b
-
-// 2-bit magnitude comparator
-// This module compares two 2-bit values A and B. LT is '1' if A < B 
-// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
-// this version actually incorporates don't cares into the equation to
-// simplify the optimization
-
-// module magcompare2c (LT, GT, A, B);
-
-//    input logic [1:0] A;
-//    input logic [1:0] B;
-   
-//    output logic      LT;
-//    output logic      GT;
-
-//    assign LT = B[1] | (!A[1]&B[0]);
-//    assign GT = A[1] | (!B[1]&A[0]);
-
-// endmodule // magcompare2b
-
-// This module compares two 64-bit values A and B. LT is '1' if A < B 
-// and EQ is '1'if A = B. LT and GT are both '0' if A > B.
-// This structure was modified so
-// that it only does a strict magnitdude comparison, and only
-// returns flags for less than (LT) and eqaual to (EQ). It uses a tree 
-// of 63 2-bit magnitude comparators, followed by one OR gates.
-//
-// J. E. Stine and M. J. Schulte, "A combined two's complement and
-// floating-point comparator," 2005 IEEE International Symposium on
-// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
-// doi: 10.1109/ISCAS.2005.1464531
-
-module magcompare64b_2 (LT, EQ, w, x);
-
-   input logic [7:0]  w;
-   input logic [7:0]  x;
-   logic [3:0] 	      y;
-   logic [3:0] 	      z;
-   logic [1:0] 	      a;
-   logic [1:0] 	      b;   
-   logic 	      GT;
-   
-   output logic       LT;
-   output logic       EQ;
-   
-   magcompare2c mag39(y[0], z[0], x[1:0], w[1:0]);
-   magcompare2c mag3A(y[1], z[1], x[3:2], w[3:2]);
-   magcompare2c mag3B(y[2], z[2], x[5:4], w[5:4]);
-   magcompare2c mag3C(y[3], z[3], x[7:6], w[7:6]);
-   
-   magcompare2c mag3D(a[0], b[0], z[1:0], y[1:0]);
-   magcompare2c mag3E(a[1], b[1], z[3:2], y[3:2]);
-   
-   magcompare2c mag3F(LT, GT, b[1:0], a[1:0]);
-
-   assign EQ = ~(LT | GT);
-
-endmodule // magcompare64b
-
-// This module takes 64-bits inputs A and B, two magnitude comparison
-// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
-// operands being compared as indicated below.
-//	FOpCtrlE	Description
-//	 00	double precision numbers
-//	 01	single precision numbers
-//	 10	half precision numbers
-//	 11	bfloat precision numbers
-//
-// The comparator produces a 2-bit signal fcc, which
-// indicates the result of the comparison as follows:
-//     fcc 	decscription
-//      00	A = B	
-//      01	A < B	
-//      10	A > B	
-//      11	A and B	are unordered (i.e., A or B is NaN)
-// It also produces a invalid operation flag, which is one
-// if either of the input operands is a signaling NaN.
-
-module exception_cmp_2 (
-   input logic [63:0] A,
-   input logic [63:0] B,
-   input logic 	      FmtE,
-   input logic 	      LT_mag,
-   input logic 	      EQ_mag,
-   input logic [2:0]  FOpCtrlE,
-   
-   output logic       invalid,
-   output logic [1:0] fcc,
-   output logic [63:0] FCmpResultE,
-
-   input logic 	      Azero,
-   input logic 	      Bzero,   
-   input logic 	      ANaN,
-   input logic 	      BNaN);
-   
-   logic 	      dp;   
-   logic 	      sp;
-   logic 	      hp;   
-   logic 	      ASNaN;
-   logic 	      BSNaN;
-   logic 	      UO;
-   logic 	      GT;
-   logic 	      LT;
-   logic 	      EQ;
-   logic [62:0]       sixtythreezeros = 63'h0;
-
-   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
-   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
-   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
-
-   // Values are unordered if ((A is NaN) OR (B is NaN)) AND (a floating 
-   // point comparison is being performed. 
-   assign UO = (ANaN | BNaN);
-
-   // Test if A or B is a signaling NaN.
-   assign ASNaN = ANaN & (sp&~A[53] | dp&~A[50] | hp&~A[56]);
-   assign BSNaN = BNaN & (sp&~B[53] | dp&~B[50] | hp&~B[56]);
-
-   // If either A or B is a signaling NaN the "Invalid Operation"
-   // exception flag is set to one; otherwise it is zero.    
-   assign invalid = (ASNaN | BSNaN);
-
-   // A and B are equal if (their magnitudes are equal) AND ((their signs are
-   // equal) or (their magnitudes are zero AND they are floating point
-   // numbers)). Also, A and B are not equal if they are unordered.
-   assign EQ = (EQ_mag | (Azero&Bzero)) & (~UO);
-   
-   // A is less than B if (A is negative and B is posiive) OR
-   // (A and B are positive and the magnitude of A is less than
-   // the magnitude of B) or (A and B are negative integers and
-   // the magnitude of A is less than the magnitude of B) or
-   // (A and B are negative floating point numbers and
-   // the magnitude of A is greater than the magnitude of B).
-   // Also, A is not less than B if A and B are equal or unordered.
-   assign LT = ((~LT_mag & A[63] & B[63]) |
-		(LT_mag & ~(A[63] & B[63])))&~EQ&~UO;
-   
-   // A is greater than B when LT, EQ, and UO are are false.
-   assign GT = ~(LT | EQ | UO);
-
-   // Note: it may be possible to optimize the setting of fcc 
-   // a little more, but it is probably not worth the effort. 
-
-   // Set the bits of fcc based on LT, GT, EQ, and UO
-   assign fcc[0] = LT | UO;
-   assign fcc[1] = GT | UO;  
-
-   always_comb begin
-      case (FOpCtrlE[2:0])
-         3'b111: FCmpResultE = LT ? A : B;//min 
-         3'b101: FCmpResultE = GT ? A : B;//max
-         3'b010: FCmpResultE = {63'b0, EQ};//equal
-         3'b001: FCmpResultE = {63'b0, LT};//less than
-         3'b011: FCmpResultE = {63'b0, LT|EQ};//less than or equal
-         default: FCmpResultE = 64'b0;
-      endcase
-   end 
-
-endmodule // exception_cmp
diff --git a/wally-pipelined/src/fpu/fpucmp2.sv b/wally-pipelined/src/fpu/fpucmp2.sv
deleted file mode 100755
index ee14afb94..000000000
--- a/wally-pipelined/src/fpu/fpucmp2.sv
+++ /dev/null
@@ -1,243 +0,0 @@
-// //
-// // File name : fpcomp.v
-// // Title     : Floating-Point Comparator
-// // project   : FPU
-// // Library   : fpcomp
-// // Author(s) : James E. Stine
-// // Purpose   : definition of main unit to floating-point comparator
-// // notes :   
-// //
-// // Copyright Oklahoma State University
-// //
-// // Floating Point Comparator (Algorithm)
-// //
-// // 1.) Performs sign-extension if the inputs are 32-bit integers.
-// // 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
-// // 3.) Check for special cases (+0=-0, unordered, and infinite values) 
-// //     and correct for sign bits
-// //
-// // This module takes 64-bits inputs op1 and op2, VSS, and VDD
-// // signals, and a 2-bit signal Sel that indicates the type of 
-// // operands being compared as indicated below.
-// //	Sel	Description
-// //	 00	double precision numbers
-// //	 01	single precision numbers
-// //	 10	half precision numbers
-// //	 11	(unused)
-// //
-// // The comparator produces a 2-bit signal FCC, which
-// // indicates the result of the comparison:
-// //
-// //     fcc 	decscription
-// //      00	A = B	
-// //      01	A < B	
-// //      10	A > B	
-// //      11	A and B	are unordered (i.e., A or B is NaN)
-// //
-// // It also produces an invalid operation flag, which is one
-// // if either of the input operands is a signaling NaN per 754
-
-// module fpucmp2 (   
-//    input logic [63:0] op1, 
-//    input logic [63:0] op2,
-//    input logic [1:0]  Sel,
-//    input logic [7:0]  w, x,
-//    input logic        ANaN, BNaN,
-//    input logic        Azero, Bzero,
-//    input logic [3:0]  FOpCtrlM,
-//    input logic 	      FmtM,
-   
-//    output logic       Invalid, 		 // Invalid Operation
-//    output logic [1:0] FCC,  		 // Condition Codes 
-//    output logic [63:0] FCmpResultM);
-   
-//    logic 	      LT;                // magnitude op1 < magnitude op2
-//    logic 	      EQ;                // magnitude op1 = magnitude op2
-   
-//    // Perform magnitude comparison between the 63 least signficant bits
-//    // of the input operands. Only LT and EQ are returned, since GT can
-//    // be determined from these values. 
-//    magcompare64b_2 magcomp2 (LT, EQ, w, x);
-
-//    // Determine final values based on output of magnitude comparison, 
-//    // sign bits, and special case testing. 
-//    exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .Sel(Sel), .A(op1), .B(op2), .*);
-   
-
-// endmodule // fpcomp
-
-// /*module magcompare2b (LT, GT, A, B);
-
-//    input logic [1:0] A;
-//    input logic [1:0] B;
-   
-//    output logic     LT;
-//    output logic     GT;
-
-//    // Determine if A < B  using a minimized sum-of-products expression
-//    assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
-//    // Determine if A > B  using a minimized sum-of-products expression
-//    assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
-
-// endmodule*/ // magcompare2b
-
-// // 2-bit magnitude comparator
-// // This module compares two 2-bit values A and B. LT is '1' if A < B 
-// // and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
-// // this version actually incorporates don't cares into the equation to
-// // simplify the optimization
-
-// // module magcompare2c (LT, GT, A, B);
-
-// //    input logic [1:0] A;
-// //    input logic [1:0] B;
-   
-// //    output logic      LT;
-// //    output logic      GT;
-
-// //    assign LT = B[1] | (!A[1]&B[0]);
-// //    assign GT = A[1] | (!B[1]&A[0]);
-
-// // endmodule // magcompare2b
-
-// // This module compares two 64-bit values A and B. LT is '1' if A < B 
-// // and EQ is '1'if A = B. LT and GT are both '0' if A > B.
-// // This structure was modified so
-// // that it only does a strict magnitdude comparison, and only
-// // returns flags for less than (LT) and eqaual to (EQ). It uses a tree 
-// // of 63 2-bit magnitude comparators, followed by one OR gates.
-// //
-// // J. E. Stine and M. J. Schulte, "A combined two's complement and
-// // floating-point comparator," 2005 IEEE International Symposium on
-// // Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
-// // doi: 10.1109/ISCAS.2005.1464531
-
-// module magcompare64b_2 (LT, EQ, w, x);
-
-//    input logic [7:0]  w;
-//    input logic [7:0]  x;
-//    logic [3:0] 	      y;
-//    logic [3:0] 	      z;
-//    logic [1:0] 	      a;
-//    logic [1:0] 	      b;   
-//    logic 	      GT;
-   
-//    output logic       LT;
-//    output logic       EQ;
-   
-//    magcompare2c mag39(y[0], z[0], x[1:0], w[1:0]);
-//    magcompare2c mag3A(y[1], z[1], x[3:2], w[3:2]);
-//    magcompare2c mag3B(y[2], z[2], x[5:4], w[5:4]);
-//    magcompare2c mag3C(y[3], z[3], x[7:6], w[7:6]);
-   
-//    magcompare2c mag3D(a[0], b[0], z[1:0], y[1:0]);
-//    magcompare2c mag3E(a[1], b[1], z[3:2], y[3:2]);
-   
-//    magcompare2c mag3F(LT, GT, b[1:0], a[1:0]);
-
-//    assign EQ = ~(LT | GT);
-
-// endmodule // magcompare64b
-
-// // This module takes 64-bits inputs A and B, two magnitude comparison
-// // flags LT_mag and EQ_mag, and a 2-bit signal Sel that indicates the type of 
-// // operands being compared as indicated below.
-// //	Sel	Description
-// //	 00	double precision numbers
-// //	 01	single precision numbers
-// //	 10	half precision numbers
-// //	 11	bfloat precision numbers
-// //
-// // The comparator produces a 2-bit signal fcc, which
-// // indicates the result of the comparison as follows:
-// //     fcc 	decscription
-// //      00	A = B	
-// //      01	A < B	
-// //      10	A > B	
-// //      11	A and B	are unordered (i.e., A or B is NaN)
-// // It also produces a invalid operation flag, which is one
-// // if either of the input operands is a signaling NaN.
-
-// module exception_cmp_2 (
-//    input logic [63:0] A,
-//    input logic [63:0] B,
-//    input logic 	      FmtM,
-//    input logic 	      LT_mag,
-//    input logic 	      EQ_mag,
-//    input logic [1:0]  Sel,
-//    input logic [3:0]  FOpCtrlM,
-   
-//    output logic       invalid,
-//    output logic [1:0] fcc,
-//    output logic [63:0] FCmpResultM,
-
-//    input logic 	      Azero,
-//    input logic 	      Bzero,   
-//    input logic 	      ANaN,
-//    input logic 	      BNaN);
-   
-//    logic 	      dp;   
-//    logic 	      sp;
-//    logic 	      hp;   
-//    logic 	      ASNaN;
-//    logic 	      BSNaN;
-//    logic 	      UO;
-//    logic 	      GT;
-//    logic 	      LT;
-//    logic 	      EQ;
-//    logic [62:0]       sixtythreezeros = 63'h0;
-
-//    assign dp = !Sel[1]&!Sel[0];
-//    assign sp = !Sel[1]&Sel[0];
-//    assign hp = Sel[1]&!Sel[0];
-
-//    // Values are unordered if ((A is NaN) OR (B is NaN)) AND (a floating 
-//    // point comparison is being performed. 
-//    assign UO = (ANaN | BNaN);
-
-//    // Test if A or B is a signaling NaN.
-//    assign ASNaN = ANaN & (sp&~A[53] | dp&~A[50] | hp&~A[56]);
-//    assign BSNaN = BNaN & (sp&~B[53] | dp&~B[50] | hp&~B[56]);
-
-//    // If either A or B is a signaling NaN the "Invalid Operation"
-//    // exception flag is set to one; otherwise it is zero.    
-//    assign invalid = (ASNaN | BSNaN);
-
-//    // A and B are equal if (their magnitudes are equal) AND ((their signs are
-//    // equal) or (their magnitudes are zero AND they are floating point
-//    // numbers)). Also, A and B are not equal if they are unordered.
-//    assign EQ = (EQ_mag | (Azero&Bzero)) & (~UO);
-   
-//    // A is less than B if (A is negative and B is posiive) OR
-//    // (A and B are positive and the magnitude of A is less than
-//    // the magnitude of B) or (A and B are negative integers and
-//    // the magnitude of A is less than the magnitude of B) or
-//    // (A and B are negative floating point numbers and
-//    // the magnitude of A is greater than the magnitude of B).
-//    // Also, A is not less than B if A and B are equal or unordered.
-//    assign LT = ((~LT_mag & A[63] & B[63]) |
-// 		(LT_mag & ~(A[63] & B[63])))&~EQ&~UO;
-   
-//    // A is greater than B when LT, EQ, and UO are are false.
-//    assign GT = ~(LT | EQ | UO);
-
-//    // Note: it may be possible to optimize the setting of fcc 
-//    // a little more, but it is probably not worth the effort. 
-
-//    // Set the bits of fcc based on LT, GT, EQ, and UO
-//    assign fcc[0] = LT | UO;
-//    assign fcc[1] = GT | UO;  
-
-//    always_comb begin
-//       case (FOpCtrlM[2:0])
-//          3'b111: FCmpResultM = LT ? A : B;//min 
-//          3'b101: FCmpResultM = GT ? A : B;//max
-//          3'b010: FCmpResultM = FmtM ? {63'b0, EQ} : {31'b0, EQ, 32'b0};//equal
-//          3'b001: FCmpResultM = FmtM ? {63'b0, LT} : {31'b0, LT, 32'b0};//less than
-//          3'b011: FCmpResultM = FmtM ? {63'b0, LT|EQ} : {31'b0, LT|EQ, 32'b0};//less than or equal
-//          default: FCmpResultM = 64'b0;
-//       endcase
-//    end 
-
-
-// endmodule // exception_cmp
diff --git a/wally-pipelined/src/fpu/fpuhazard.sv b/wally-pipelined/src/fpu/fpuhazard.sv
deleted file mode 100644
index 4d0895a77..000000000
--- a/wally-pipelined/src/fpu/fpuhazard.sv
+++ /dev/null
@@ -1,67 +0,0 @@
-///////////////////////////////////////////
-// fpuhazard.sv
-//
-// Written: me@KatherineParry.com 19 May 2021
-// Modified: 
-//
-// Purpose: Determine forwarding, stalls and flushes for the FPU
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module fpuhazard(
-    input logic [4:0] Adr1E, Adr2E, Adr3E,
-    input logic FWriteEnM, FWriteEnW, 
-	  input logic [4:0] RdM, RdW,
-    input logic [2:0] FResultSelM,
-    output logic FStallD,
-    output logic [1:0] ForwardXE, ForwardYE, ForwardZE
-);
-
-
-  always_comb begin
-    // set ReadData as default
-    ForwardXE = 2'b00; // choose FRD1E
-    ForwardYE = 2'b00; // choose FRD2E
-    ForwardZE = 2'b00; // choose FRD3E
-    FStallD = 0;
-
-      if ((Adr1E == RdM) & FWriteEnM)
-      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardXE = 2'b10; // choose FResM
-        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr1E == RdW) & FWriteEnW) ForwardXE = 2'b01; // choose FPUResult64W
-    
-
-      if ((Adr2E == RdM) & FWriteEnM)
-      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardYE = 2'b10; // choose FResM
-        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr2E == RdW) & FWriteEnW) ForwardYE = 2'b01; // choose FPUResult64W
-
- 
-      if ((Adr3E == RdM) & FWriteEnM)
-      // if the result will be FResM
-        if(FResultSelM == 3'b100) ForwardZE = 2'b10; // choose FResM
-        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr3E == RdW) & FWriteEnW) ForwardZE = 2'b01; // choose FPUResult64W
-
-  end 
-
-endmodule
diff --git a/wally-pipelined/src/fpu/freg.sv b/wally-pipelined/src/fpu/freg.sv
deleted file mode 100755
index b7e167131..000000000
--- a/wally-pipelined/src/fpu/freg.sv
+++ /dev/null
@@ -1,515 +0,0 @@
-
-`include "wally-config.vh"
-//  `include "../../config/rv64icfd/wally-config.vh" //debug
-
-module freg1adr (
-  input  logic 	       	   FmtW,
-  input  logic             reset,
-  input  logic             clear,
-  input  logic             clk,
-  input  logic [4:0]       rd,
-  input  logic             write,
-  input  logic [4:0]       adr1,
-  input  logic [`XLEN-1:0] writeData,
-  output logic [`XLEN-1:0] readData);
-
-  //note - not word aligning based on precision of 
-  //operation (FmtW)
-
-  //reg number should remain static, but it doesn't hurt
-  //to parameterize
-  parameter numRegs = 32;
-
-  //intermediary signals - useful for debugging
-  //and easy instatiation of generated modules
-  logic [`XLEN-1:0] [numRegs-1:0] regInput;
-  logic [`XLEN-1:0] [numRegs-1:0] regOutput;
-
-  //generate fp registers themselves
-  genvar i;
-  generate
-  	for (i = 0; i < numRegs; i = i + 1) begin:register
-
-  		floprc #(`XLEN) freg[i](.clk(clk), .reset(reset), .clear(clear), .d(regInput[i][`XLEN-1:0]), .q(regOutput[i][`XLEN-1:0])); 
-	end
-
-  endgenerate
-
-  //this could be done with:
-  //
-  //assign readData = regOutput[adr1];
-  //
-  //but always_comb allows for finer control
-
-
-  //address decoder
-  //only 1 for this fp register set
-  //used with fpsign
-  //defaults to outputting zeroes
-  always_comb begin
-  	case(adr1)
-		5'b00000 : readData = regOutput[0];
-		5'b00001 : readData = regOutput[1];
-		5'b00010 : readData = regOutput[2];
-		5'b00011 : readData = regOutput[3];
-		5'b00100 : readData = regOutput[4];
-		5'b00101 : readData = regOutput[5];
-		5'b00110 : readData = regOutput[6];
-		5'b00111 : readData = regOutput[7];
-		5'b01000 : readData = regOutput[8];
-		5'b01001 : readData = regOutput[9];
-		5'b01010 : readData = regOutput[10];
-		5'b01011 : readData = regOutput[11];
-		5'b01100 : readData = regOutput[12];
-		5'b01101 : readData = regOutput[13];
-		5'b01110 : readData = regOutput[14];
-		5'b01111 : readData = regOutput[15];
-		5'b10000 : readData = regOutput[16];
-		5'b10001 : readData = regOutput[17];
-		5'b10010 : readData = regOutput[18];
-		5'b10011 : readData = regOutput[19];
-		5'b10100 : readData = regOutput[20];
-		5'b10101 : readData = regOutput[21];
-		5'b10110 : readData = regOutput[22];
-		5'b10111 : readData = regOutput[23];
-		5'b11000 : readData = regOutput[24];
-		5'b11001 : readData = regOutput[25];
-		5'b11010 : readData = regOutput[26];
-		5'b11011 : readData = regOutput[27];
-		5'b11100 : readData = regOutput[28];
-		5'b11101 : readData = regOutput[29];
-		5'b11110 : readData = regOutput[30];
-		5'b11111 : readData = regOutput[31];
-		default : readData = `XLEN'h0;
-	endcase
-  end
-
-  //destination register decoder
-  //only change input values on write
-  //defaults to undefined with invalid address
-  //
-  //note - this is an intermediary signal, so
-  //this is not asynch assignment. FF in flopr
-  //will not update data until clk pulse
-  always_comb begin
-	  if(write) begin
-		case(rd)	
-			5'b00000 : regInput[0] = writeData;
-			5'b00001 : regInput[1] = writeData;
-			5'b00010 : regInput[2] = writeData;
-			5'b00011 : regInput[3] = writeData;	
-			5'b00100 : regInput[4] = writeData;
-			5'b00101 : regInput[5] = writeData;
-			5'b00110 : regInput[6] = writeData;
-			5'b00111 : regInput[7] = writeData;
-			5'b01000 : regInput[8] = writeData;
-			5'b01000 : regInput[9] = writeData;
-			5'b01001 : regInput[10] = writeData;
-			5'b01010 : regInput[11] = writeData;
-			5'b01111 : regInput[12] = writeData;
-			5'b01101 : regInput[13] = writeData;
-			5'b01110 : regInput[14] = writeData;
-			5'b01111 : regInput[15] = writeData;
-			5'b10000 : regInput[16] = writeData;
-			5'b10001 : regInput[17] = writeData;
-			5'b10010 : regInput[18] = writeData;
-			5'b10011 : regInput[19] = writeData;	
-			5'b10100 : regInput[20] = writeData;
-			5'b10101 : regInput[21] = writeData;
-			5'b10110 : regInput[22] = writeData;
-			5'b10111 : regInput[23] = writeData;
-			5'b11000 : regInput[24] = writeData;
-			5'b11000 : regInput[25] = writeData;
-			5'b11001 : regInput[26] = writeData;
-			5'b11010 : regInput[27] = writeData;
-			5'b11111 : regInput[28] = writeData;
-			5'b11101 : regInput[29] = writeData;
-			5'b11110 : regInput[30] = writeData;
-			5'b11111 : regInput[31] = writeData;
-			default : regInput[0] = `XLEN'hx;
-		endcase
-	end	
-  end
-
-endmodule
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//********
-//formatting separation
-//********
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-module freg2adr (
-  input  logic 	           FmtW,
-  input  logic             reset,
-  input  logic             clear,
-  input  logic             clk,
-  input  logic [4:0]       rd,
-  input  logic             write,
-  input  logic [4:0]       adr1,
-  input  logic [4:0]       adr2,
-  input  logic [`XLEN-1:0] writeData,
-  output logic [`XLEN-1:0] readData1,
-  output logic [`XLEN-1:0] readData2);
-
-  //note - not word aligning based on precision of 
-  //operation (FmtW)
-
-  //reg number should remain static, but it doesn't hurt
-  //to parameterize
-  parameter numRegs = 32;
-
-  //intermediary signals - useful for debugging
-  //and easy instatiation of generated modules
-  logic [`XLEN-1:0] [numRegs-1:0] regInput;
-  logic [`XLEN-1:0] [numRegs-1:0] regOutput;
-
-  //generate fp registers themselves
-  genvar i;
-  generate
-  	for (i = 0; i < numRegs; i = i + 1) begin:register
-
-  		floprc #(`XLEN) freg[i](.clk(clk), .reset(reset), .clear(clear), .d(regInput[i][`XLEN-1:0]), .q(regOutput[i][`XLEN-1:0])); 
-	end
-
-  endgenerate
-
-  //address decoder
-  //2 are used for this fp register set
-  //used with fpadd/cvt, fpdiv/sqrt, and fpcmp
-  //defaults to outputting zeroes
-  always_comb begin
-
-	//adderss 1 decoder
-  	case(adr1)
-		5'b00000 : readData1 = regOutput[0];
-		5'b00001 : readData1 = regOutput[1];
-		5'b00010 : readData1 = regOutput[2];
-		5'b00011 : readData1 = regOutput[3];
-		5'b00100 : readData1 = regOutput[4];
-		5'b00101 : readData1 = regOutput[5];
-		5'b00110 : readData1 = regOutput[6];
-		5'b00111 : readData1 = regOutput[7];
-		5'b01000 : readData1 = regOutput[8];
-		5'b01001 : readData1 = regOutput[9];
-		5'b01010 : readData1 = regOutput[10];
-		5'b01011 : readData1 = regOutput[11];
-		5'b01100 : readData1 = regOutput[12];
-		5'b01101 : readData1 = regOutput[13];
-		5'b01110 : readData1 = regOutput[14];
-		5'b01111 : readData1 = regOutput[15];
-		5'b10000 : readData1 = regOutput[16];
-		5'b10001 : readData1 = regOutput[17];
-		5'b10010 : readData1 = regOutput[18];
-		5'b10011 : readData1 = regOutput[19];
-		5'b10100 : readData1 = regOutput[20];
-		5'b10101 : readData1 = regOutput[21];
-		5'b10110 : readData1 = regOutput[22];
-		5'b10111 : readData1 = regOutput[23];
-		5'b11000 : readData1 = regOutput[24];
-		5'b11001 : readData1 = regOutput[25];
-		5'b11010 : readData1 = regOutput[26];
-		5'b11011 : readData1 = regOutput[27];
-		5'b11100 : readData1 = regOutput[28];
-		5'b11101 : readData1 = regOutput[29];
-		5'b11110 : readData1 = regOutput[30];
-		5'b11111 : readData1 = regOutput[31];
-		default : readData1 = `XLEN'h0;
-	endcase
-
-	//address 2 decoder
-  	case(adr2)
-		5'b00000 : readData2 = regOutput[0];
-		5'b00001 : readData2 = regOutput[1];
-		5'b00010 : readData2 = regOutput[2];
-		5'b00011 : readData2 = regOutput[3];
-		5'b00100 : readData2 = regOutput[4];
-		5'b00101 : readData2 = regOutput[5];
-		5'b00110 : readData2 = regOutput[6];
-		5'b00111 : readData2 = regOutput[7];
-		5'b01000 : readData2 = regOutput[8];
-		5'b01001 : readData2 = regOutput[9];
-		5'b01010 : readData2 = regOutput[10];
-		5'b01011 : readData2 = regOutput[11];
-		5'b01100 : readData2 = regOutput[12];
-		5'b01101 : readData2 = regOutput[13];
-		5'b01110 : readData2 = regOutput[14];
-		5'b01111 : readData2 = regOutput[15];
-		5'b10000 : readData2 = regOutput[16];
-		5'b10001 : readData2 = regOutput[17];
-		5'b10010 : readData2 = regOutput[18];
-		5'b10011 : readData2 = regOutput[19];
-		5'b10100 : readData2 = regOutput[20];
-		5'b10101 : readData2 = regOutput[21];
-		5'b10110 : readData2 = regOutput[22];
-		5'b10111 : readData2 = regOutput[23];
-		5'b11000 : readData2 = regOutput[24];
-		5'b11001 : readData2 = regOutput[25];
-		5'b11010 : readData2 = regOutput[26];
-		5'b11011 : readData2 = regOutput[27];
-		5'b11100 : readData2 = regOutput[28];
-		5'b11101 : readData2 = regOutput[29];
-		5'b11110 : readData2 = regOutput[30];
-		5'b11111 : readData2 = regOutput[31];
-		default : readData2 = `XLEN'h0;
-	endcase
-  end
-
-  //destination register decoder
-  //only change input values on write
-  //defaults to undefined with invalid address
-  //
-  //note - this is an intermediary signal, so
-  //this is not asynch assignment. FF in flopr
-  //will not update data until clk pulse
-  always_comb begin
-	  if(write) begin
-		case(rd)	
-			5'b00000 : regInput[0] = writeData;
-			5'b00001 : regInput[1] = writeData;
-			5'b00010 : regInput[2] = writeData;
-			5'b00011 : regInput[3] = writeData;	
-			5'b00100 : regInput[4] = writeData;
-			5'b00101 : regInput[5] = writeData;
-			5'b00110 : regInput[6] = writeData;
-			5'b00111 : regInput[7] = writeData;
-			5'b01000 : regInput[8] = writeData;
-			5'b01000 : regInput[9] = writeData;
-			5'b01001 : regInput[10] = writeData;
-			5'b01010 : regInput[11] = writeData;
-			5'b01111 : regInput[12] = writeData;
-			5'b01101 : regInput[13] = writeData;
-			5'b01110 : regInput[14] = writeData;
-			5'b01111 : regInput[15] = writeData;
-			5'b10000 : regInput[16] = writeData;
-			5'b10001 : regInput[17] = writeData;
-			5'b10010 : regInput[18] = writeData;
-			5'b10011 : regInput[19] = writeData;	
-			5'b10100 : regInput[20] = writeData;
-			5'b10101 : regInput[21] = writeData;
-			5'b10110 : regInput[22] = writeData;
-			5'b10111 : regInput[23] = writeData;
-			5'b11000 : regInput[24] = writeData;
-			5'b11000 : regInput[25] = writeData;
-			5'b11001 : regInput[26] = writeData;
-			5'b11010 : regInput[27] = writeData;
-			5'b11111 : regInput[28] = writeData;
-			5'b11101 : regInput[29] = writeData;
-			5'b11110 : regInput[30] = writeData;
-			5'b11111 : regInput[31] = writeData;
-			default : regInput[0] = `XLEN'hx;
-		endcase
-	end	
-  end
-
-endmodule
-
-//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-//********
-//formatting separation
-//********
-/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
-
-module freg3adr (
-  input  logic    	   FmtW,
-  input  logic             reset,
-  input  logic             clear,
-  input  logic             clk,
-  input  logic [4:0]       rd,
-  input  logic             write,
-  input  logic [4:0]       adr1,
-  input  logic [4:0]       adr2,
-  input  logic [4:0]       adr3,
-  input  logic [`XLEN-1:0] writeData,
-  output logic [`XLEN-1:0] readData1,
-  output logic [`XLEN-1:0] readData2,
-  output logic [`XLEN-1:0] readData3);
-
-  //note - not word aligning based on precision of 
-  //operation (FmtW)
-
-  //reg number should remain static, but it doesn't hurt
-  //to parameterize
-  parameter numRegs = 32;
-
-  //intermediary signals - useful for debugging
-  //and easy instatiation of generated modules
-  logic [numRegs-1:0] [`XLEN-1:0] regInput;
-  logic [numRegs-1:0] [`XLEN-1:0] regOutput;
-
-  //generate fp registers themselves
-  genvar i;
-  generate
-  	for (i = 0; i < numRegs; i = i + 1) begin:register
-
-  		floprc #(`XLEN) freg(.clk(clk), .reset(reset), .clear(clear), .d(regInput[i][`XLEN-1:0]), .q(regOutput[i][`XLEN-1:0])); 
-	end
-
-  endgenerate
-
-  //address decoder
-  //3 are used for this fp register set
-  //used exclusively for fma
-  //defaults to outputting zeroes
-  always_comb begin
-
-	//adderss 1 decoder
-  	case(adr1)
-		5'b00000 : readData1 = regOutput[0];
-		5'b00001 : readData1 = regOutput[1];
-		5'b00010 : readData1 = regOutput[2];
-		5'b00011 : readData1 = regOutput[3];
-		5'b00100 : readData1 = regOutput[4];
-		5'b00101 : readData1 = regOutput[5];
-		5'b00110 : readData1 = regOutput[6];
-		5'b00111 : readData1 = regOutput[7];
-		5'b01000 : readData1 = regOutput[8];
-		5'b01001 : readData1 = regOutput[9];
-		5'b01010 : readData1 = regOutput[10];
-		5'b01011 : readData1 = regOutput[11];
-		5'b01100 : readData1 = regOutput[12];
-		5'b01101 : readData1 = regOutput[13];
-		5'b01110 : readData1 = regOutput[14];
-		5'b01111 : readData1 = regOutput[15];
-		5'b10000 : readData1 = regOutput[16];
-		5'b10001 : readData1 = regOutput[17];
-		5'b10010 : readData1 = regOutput[18];
-		5'b10011 : readData1 = regOutput[19];
-		5'b10100 : readData1 = regOutput[20];
-		5'b10101 : readData1 = regOutput[21];
-		5'b10110 : readData1 = regOutput[22];
-		5'b10111 : readData1 = regOutput[23];
-		5'b11000 : readData1 = regOutput[24];
-		5'b11001 : readData1 = regOutput[25];
-		5'b11010 : readData1 = regOutput[26];
-		5'b11011 : readData1 = regOutput[27];
-		5'b11100 : readData1 = regOutput[28];
-		5'b11101 : readData1 = regOutput[29];
-		5'b11110 : readData1 = regOutput[30];
-		5'b11111 : readData1 = regOutput[31];
-		default : readData1 = `XLEN'h0;
-	endcase
-
-	//address 2 decoder
-  	case(adr2)
-		5'b00000 : readData2 = regOutput[0];
-		5'b00001 : readData2 = regOutput[1];
-		5'b00010 : readData2 = regOutput[2];
-		5'b00011 : readData2 = regOutput[3];
-		5'b00100 : readData2 = regOutput[4];
-		5'b00101 : readData2 = regOutput[5];
-		5'b00110 : readData2 = regOutput[6];
-		5'b00111 : readData2 = regOutput[7];
-		5'b01000 : readData2 = regOutput[8];
-		5'b01001 : readData2 = regOutput[9];
-		5'b01010 : readData2 = regOutput[10];
-		5'b01011 : readData2 = regOutput[11];
-		5'b01100 : readData2 = regOutput[12];
-		5'b01101 : readData2 = regOutput[13];
-		5'b01110 : readData2 = regOutput[14];
-		5'b01111 : readData2 = regOutput[15];
-		5'b10000 : readData2 = regOutput[16];
-		5'b10001 : readData2 = regOutput[17];
-		5'b10010 : readData2 = regOutput[18];
-		5'b10011 : readData2 = regOutput[19];
-		5'b10100 : readData2 = regOutput[20];
-		5'b10101 : readData2 = regOutput[21];
-		5'b10110 : readData2 = regOutput[22];
-		5'b10111 : readData2 = regOutput[23];
-		5'b11000 : readData2 = regOutput[24];
-		5'b11001 : readData2 = regOutput[25];
-		5'b11010 : readData2 = regOutput[26];
-		5'b11011 : readData2 = regOutput[27];
-		5'b11100 : readData2 = regOutput[28];
-		5'b11101 : readData2 = regOutput[29];
-		5'b11110 : readData2 = regOutput[30];
-		5'b11111 : readData2 = regOutput[31];
-		default : readData2 = `XLEN'h0;
-	endcase
-
-	//address 3 decoder
-  	case(adr3)
-		5'b00000 : readData3 = regOutput[0];
-		5'b00001 : readData3 = regOutput[1];
-		5'b00010 : readData3 = regOutput[2];
-		5'b00011 : readData3 = regOutput[3];
-		5'b00100 : readData3 = regOutput[4];
-		5'b00101 : readData3 = regOutput[5];
-		5'b00110 : readData3 = regOutput[6];
-		5'b00111 : readData3 = regOutput[7];
-		5'b01000 : readData3 = regOutput[8];
-		5'b01001 : readData3 = regOutput[9];
-		5'b01010 : readData3 = regOutput[10];
-		5'b01011 : readData3 = regOutput[11];
-		5'b01100 : readData3 = regOutput[12];
-		5'b01101 : readData3 = regOutput[13];
-		5'b01110 : readData3 = regOutput[14];
-		5'b01111 : readData3 = regOutput[15];
-		5'b10000 : readData3 = regOutput[16];
-		5'b10001 : readData3 = regOutput[17];
-		5'b10010 : readData3 = regOutput[18];
-		5'b10011 : readData3 = regOutput[19];
-		5'b10100 : readData3 = regOutput[20];
-		5'b10101 : readData3 = regOutput[21];
-		5'b10110 : readData3 = regOutput[22];
-		5'b10111 : readData3 = regOutput[23];
-		5'b11000 : readData3 = regOutput[24];
-		5'b11001 : readData3 = regOutput[25];
-		5'b11010 : readData3 = regOutput[26];
-		5'b11011 : readData3 = regOutput[27];
-		5'b11100 : readData3 = regOutput[28];
-		5'b11101 : readData3 = regOutput[29];
-		5'b11110 : readData3 = regOutput[30];
-		5'b11111 : readData3 = regOutput[31];
-		default : readData3 = `XLEN'h0;
-	endcase
-  end
-
-  //destination register decoder
-  //only change input values on write
-  //defaults to undefined with invalid address
-  //
-  //note - this is an intermediary signal, so
-  //this is not asynch assignment. FF in flopr
-  //will not update data until clk pulse
-  always_comb begin
-	  if(write) begin
-		case(rd)	
-			5'b00000 : regInput[0] = writeData;
-			5'b00001 : regInput[1] = writeData;
-			5'b00010 : regInput[2] = writeData;
-			5'b00011 : regInput[3] = writeData;	
-			5'b00100 : regInput[4] = writeData;
-			5'b00101 : regInput[5] = writeData;
-			5'b00110 : regInput[6] = writeData;
-			5'b00111 : regInput[7] = writeData;
-			5'b01000 : regInput[8] = writeData;
-			5'b01001 : regInput[9] = writeData;
-			5'b01010 : regInput[10] = writeData;
-			5'b01011 : regInput[11] = writeData;
-			5'b01100 : regInput[12] = writeData;
-			5'b01101 : regInput[13] = writeData;
-			5'b01110 : regInput[14] = writeData;
-			5'b01111 : regInput[15] = writeData;
-			5'b10000 : regInput[16] = writeData;
-			5'b10001 : regInput[17] = writeData;
-			5'b10010 : regInput[18] = writeData;
-			5'b10011 : regInput[19] = writeData;	
-			5'b10100 : regInput[20] = writeData;
-			5'b10101 : regInput[21] = writeData;
-			5'b10110 : regInput[22] = writeData;
-			5'b10111 : regInput[23] = writeData;
-			5'b11000 : regInput[24] = writeData;
-			5'b11001 : regInput[25] = writeData;
-			5'b11010 : regInput[26] = writeData;
-			5'b11011 : regInput[27] = writeData;
-			5'b11100 : regInput[28] = writeData;
-			5'b11101 : regInput[29] = writeData;
-			5'b11110 : regInput[30] = writeData;
-			5'b11111 : regInput[31] = writeData;
-			default : regInput[0] = `XLEN'hx;
-		endcase
-	end	
-  end
-
-endmodule
diff --git a/wally-pipelined/src/fpu/fsgn.sv b/wally-pipelined/src/fpu/fsgn.sv
index 62d0e7d7c..7df9386c7 100755
--- a/wally-pipelined/src/fpu/fsgn.sv
+++ b/wally-pipelined/src/fpu/fsgn.sv
@@ -1,13 +1,12 @@
 //performs the fsgnj/fsgnjn/fsgnjx RISCV instructions
 
-module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SrcXE, SrcYE);
+module fsgn (
+	input  logic [63:0]  SrcXE, SrcYE,
+	input  logic [1:0]   SgnOpCodeE,
+	output logic [63:0]  SgnResE,
+	output logic   SgnNVE);
 
-	input  [63:0]  SrcXE, SrcYE;
-	input  [1:0]   SgnOpCodeE;
-	output [63:0]  SgnResultE;
-	output [4:0]   SgnFlagsE;
-
-	wire AonesExp;
+	logic AonesExp;
 
 	//op code designation:
 	//
@@ -16,8 +15,8 @@ module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SrcXE, SrcYE);
 	//10 - fsgnjx - XOR sign values of SrcXE & SrcYE
 	//
 	
-	assign SgnResultE[63] = SgnOpCodeE[1] ? (SrcXE[63] ^ SrcYE[63]) : (SrcYE[63] ^ SgnOpCodeE[0]);
-	assign SgnResultE[62:0] = SrcXE[62:0];
+	assign SgnResE[63] = SgnOpCodeE[1] ? (SrcXE[63] ^ SrcYE[63]) : (SrcYE[63] ^ SgnOpCodeE[0]);
+	assign SgnResE[62:0] = SrcXE[62:0];
 
 	//If the exponent is all ones, then the value is either Inf or NaN,
 	//both of which will produce a QNaN/SNaN value of some sort. This will 
@@ -26,6 +25,6 @@ module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SrcXE, SrcYE);
 
 	//the only flag that can occur during this operation is invalid
 	//due to changing sign on already existing NaN
-	assign SgnFlagsE = {AonesExp & SgnResultE[63], 1'b0, 1'b0, 1'b0, 1'b0};
+	assign SgnNVE = AonesExp & SgnResE[63];
 
 endmodule
diff --git a/wally-pipelined/src/fpu/ling_bk13.sv b/wally-pipelined/src/fpu/ling_bk13.sv
deleted file mode 100755
index a35c7a8f7..000000000
--- a/wally-pipelined/src/fpu/ling_bk13.sv
+++ /dev/null
@@ -1,89 +0,0 @@
-// Brent-Kung Prefix Adder
-
-module ling_bk13 (cout, sum, a, b, cin);
-	 input [12:0] a, b;
-	 input cin;
-	 output [12:0] sum;
-	 output cout;
-
-	 wire [13:0] p,g;
-	 wire [13:1] h,c;
-
-// pre-computation
-	 assign p={a|b,1'b1};
-	 assign g={a&b, cin};
-
-// prefix tree
-	 ling_brent_kung prefix_tree(h, c, p[12:0], g[12:0]);
-
-// post-computation
-	 assign h[13]=g[13]|c[13];
-	 assign sum=p[13:1]^h|g[13:1]&c;
-	 assign cout=p[13]&h[13];
-
-endmodule
-
-module ling_brent_kung (h, c, p, g);
-	
-	input [12:0] p;
-	input [13:0] g;
-	output [13:1] h;
-	output [13:1] c;
-
-
-	// parallel-prefix, Brent-Kung
-
-	// Stage 1: Generates H/I pairs that span 1 bits
-	rgry g_1_0 (H_1_0, {g[1],g[0]});
-	rblk b_3_2 (H_3_2, I_3_2, {g[3],g[2]}, {p[2],p[1]});
-	rblk b_5_4 (H_5_4, I_5_4, {g[5],g[4]}, {p[4],p[3]});
-	rblk b_7_6 (H_7_6, I_7_6, {g[7],g[6]}, {p[6],p[5]});
-	rblk b_9_8 (H_9_8, I_9_8, {g[9],g[8]}, {p[8],p[7]});
-	rblk b_11_10 (H_11_10, I_11_10, {g[11],g[10]}, {p[10],p[9]});
-	rblk b_13_12 (H_13_12, I_13_12, {g[13],g[12]}, {p[12],p[11]});
-
-	// Stage 2: Generates H/I pairs that span 2 bits
-	grey g_3_0 (H_3_0, {H_3_2,H_1_0}, I_3_2);
-	black b_7_4 (H_7_4, I_7_4, {H_7_6,H_5_4}, {I_7_6,I_5_4});
-	black b_11_8 (H_11_8, I_11_8, {H_11_10,H_9_8}, {I_11_10,I_9_8});
-
-	// Stage 3: Generates H/I pairs that span 4 bits
-	grey g_7_0 (H_7_0, {H_7_4,H_3_0}, I_7_4);
-
-	// Stage 4: Generates H/I pairs that span 8 bits
-
-	// Stage 5: Generates H/I pairs that span 4 bits
-	grey g_11_0 (H_11_0, {H_11_8,H_7_0}, I_11_8);
-
-	// Stage 6: Generates H/I pairs that span 2 bits
-	grey g_5_0 (H_5_0, {H_5_4,H_3_0}, I_5_4);
-	grey g_9_0 (H_9_0, {H_9_8,H_7_0}, I_9_8);
-
-	// Last grey cell stage 
-	grey g_2_0 (H_2_0, {g[2],H_1_0}, p[1]);
-	grey g_4_0 (H_4_0, {g[4],H_3_0}, p[3]);
-	grey g_6_0 (H_6_0, {g[6],H_5_0}, p[5]);
-	grey g_8_0 (H_8_0, {g[8],H_7_0}, p[7]);
-	grey g_10_0 (H_10_0, {g[10],H_9_0}, p[9]);
-	grey g_12_0 (H_12_0, {g[12],H_11_0}, p[11]);
-
-	// Final Stage: Apply c_k+1=p_k&H_k_0
-	assign c[1]=g[0];
-
-	assign h[1]=H_1_0;		assign c[2]=p[1]&H_1_0;
-	assign h[2]=H_2_0;		assign c[3]=p[2]&H_2_0;
-	assign h[3]=H_3_0;		assign c[4]=p[3]&H_3_0;
-	assign h[4]=H_4_0;		assign c[5]=p[4]&H_4_0;
-	assign h[5]=H_5_0;		assign c[6]=p[5]&H_5_0;
-	assign h[6]=H_6_0;		assign c[7]=p[6]&H_6_0;
-	assign h[7]=H_7_0;		assign c[8]=p[7]&H_7_0;
-	assign h[8]=H_8_0;		assign c[9]=p[8]&H_8_0;
-
-	assign h[9]=H_9_0;		assign c[10]=p[9]&H_9_0;
-	assign h[10]=H_10_0;		assign c[11]=p[10]&H_10_0;
-	assign h[11]=H_11_0;		assign c[12]=p[11]&H_11_0;
-	assign h[12]=H_12_0;		assign c[13]=p[12]&H_12_0;
-
-endmodule
-
-
diff --git a/wally-pipelined/src/fpu/lzd_denorm.sv b/wally-pipelined/src/fpu/lzd_denorm.sv
index 21efbf5fc..860a33817 100755
--- a/wally-pipelined/src/fpu/lzd_denorm.sv
+++ b/wally-pipelined/src/fpu/lzd_denorm.sv
@@ -168,3 +168,4 @@ module lz52 (ZP, ZV, B);
 
 endmodule // lz52
 
+
diff --git a/wally-pipelined/src/fpu/mult_R4_64_64_cs.sv b/wally-pipelined/src/fpu/mult_R4_64_64_cs.sv
old mode 100755
new mode 100644
diff --git a/wally-pipelined/src/fpu/rounder_denorm.sv b/wally-pipelined/src/fpu/rounder_denorm.sv
index 70df0656b..b6793594c 100755
--- a/wally-pipelined/src/fpu/rounder_denorm.sv
+++ b/wally-pipelined/src/fpu/rounder_denorm.sv
@@ -115,11 +115,11 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
    assign B_12_overflow = {8'h0, 3'b0, normal_overflow};
    assign B_12_underflow = {8'h0, 3'b0, normal_underflow};
 
-   cla52 add1(Tmant, Cout, A[62:11], B);
+   cla52 add1(Tmant, Cout, A[62:11], B); //***adder
 
-   cla12 add1_exp(Texp_addone, Cout_overflow, Texp, B_12_overflow);
+   cla12 add1_exp(Texp_addone, Cout_overflow, Texp, B_12_overflow); //***adder
 
-   cla_sub12 sub1_exp(Texp_subone, Texp, B_12_underflow);
+   cla_sub12 sub1_exp(Texp_subone, Texp, B_12_underflow); //***adder
 
    // Now that rounding is done, we compute the final exponent
    // and test for special cases. 
diff --git a/wally-pipelined/src/fpu/sbtm_a4.sv b/wally-pipelined/src/fpu/sbtm_a4.sv
deleted file mode 100755
index 7ffe4c617..000000000
--- a/wally-pipelined/src/fpu/sbtm_a4.sv
+++ /dev/null
@@ -1,204 +0,0 @@
-module sbtm_a4 (input  logic [7:0] a,
-		output logic [13:0] y);
-   always_comb
-     case(a)
-       8'b01000000: y = 14'b10110100010111;
-       8'b01000001: y = 14'b10110010111111;
-       8'b01000010: y = 14'b10110001101000;
-       8'b01000011: y = 14'b10110000010011;
-       8'b01000100: y = 14'b10101111000001;
-       8'b01000101: y = 14'b10101101110000;
-       8'b01000110: y = 14'b10101100100001;
-       8'b01000111: y = 14'b10101011010011;
-       8'b01001000: y = 14'b10101010000111;
-       8'b01001001: y = 14'b10101000111101;
-       8'b01001010: y = 14'b10100111110100;
-       8'b01001011: y = 14'b10100110101101;
-       8'b01001100: y = 14'b10100101100111;
-       8'b01001101: y = 14'b10100100100010;
-       8'b01001110: y = 14'b10100011011111;
-       8'b01001111: y = 14'b10100010011101;
-       8'b01010000: y = 14'b10100001011100;
-       8'b01010001: y = 14'b10100000011100;
-       8'b01010010: y = 14'b10011111011110;
-       8'b01010011: y = 14'b10011110100001;
-       8'b01010100: y = 14'b10011101100100;
-       8'b01010101: y = 14'b10011100101001;
-       8'b01010110: y = 14'b10011011101111;
-       8'b01010111: y = 14'b10011010110110;
-       8'b01011000: y = 14'b10011001111110;
-       8'b01011001: y = 14'b10011001000110;
-       8'b01011010: y = 14'b10011000010000;
-       8'b01011011: y = 14'b10010111011011;
-       8'b01011100: y = 14'b10010110100110;
-       8'b01011101: y = 14'b10010101110011;
-       8'b01011110: y = 14'b10010101000000;
-       8'b01011111: y = 14'b10010100001110;
-       8'b01100000: y = 14'b10010011011100;
-       8'b01100001: y = 14'b10010010101100;
-       8'b01100010: y = 14'b10010001111100;
-       8'b01100011: y = 14'b10010001001101;
-       8'b01100100: y = 14'b10010000011111;
-       8'b01100101: y = 14'b10001111110001;
-       8'b01100110: y = 14'b10001111000100;
-       8'b01100111: y = 14'b10001110011000;
-       8'b01101000: y = 14'b10001101101100;
-       8'b01101001: y = 14'b10001101000001;
-       8'b01101010: y = 14'b10001100010110;
-       8'b01101011: y = 14'b10001011101100;
-       8'b01101100: y = 14'b10001011000011;
-       8'b01101101: y = 14'b10001010011010;
-       8'b01101110: y = 14'b10001001110010;
-       8'b01101111: y = 14'b10001001001010;
-       8'b01110000: y = 14'b10001000100011;
-       8'b01110001: y = 14'b10000111111101;
-       8'b01110010: y = 14'b10000111010111;
-       8'b01110011: y = 14'b10000110110001;
-       8'b01110100: y = 14'b10000110001100;
-       8'b01110101: y = 14'b10000101100111;
-       8'b01110110: y = 14'b10000101000011;
-       8'b01110111: y = 14'b10000100011111;
-       8'b01111000: y = 14'b10000011111100;
-       8'b01111001: y = 14'b10000011011001;
-       8'b01111010: y = 14'b10000010110111;
-       8'b01111011: y = 14'b10000010010101;
-       8'b01111100: y = 14'b10000001110011;
-       8'b01111101: y = 14'b10000001010010;
-       8'b01111110: y = 14'b10000000110001;
-       8'b01111111: y = 14'b10000000010001;       
-       8'b10000000: y = 14'b01111111110001;
-       8'b10000001: y = 14'b01111111010001;
-       8'b10000010: y = 14'b01111110110010;
-       8'b10000011: y = 14'b01111110010011;
-       8'b10000100: y = 14'b01111101110101;
-       8'b10000101: y = 14'b01111101010110;
-       8'b10000110: y = 14'b01111100111001;
-       8'b10000111: y = 14'b01111100011011;
-       8'b10001000: y = 14'b01111011111110;
-       8'b10001001: y = 14'b01111011100001;
-       8'b10001010: y = 14'b01111011000100;
-       8'b10001011: y = 14'b01111010101000;
-       8'b10001100: y = 14'b01111010001100;
-       8'b10001101: y = 14'b01111001110000;
-       8'b10001110: y = 14'b01111001010101;
-       8'b10001111: y = 14'b01111000111010;
-       8'b10010000: y = 14'b01111000011111;
-       8'b10010001: y = 14'b01111000000100;
-       8'b10010010: y = 14'b01110111101010;
-       8'b10010011: y = 14'b01110111010000;
-       8'b10010100: y = 14'b01110110110110;
-       8'b10010101: y = 14'b01110110011101;
-       8'b10010110: y = 14'b01110110000100;
-       8'b10010111: y = 14'b01110101101011;
-       8'b10011000: y = 14'b01110101010010;
-       8'b10011001: y = 14'b01110100111001;
-       8'b10011010: y = 14'b01110100100001;
-       8'b10011011: y = 14'b01110100001001;
-       8'b10011100: y = 14'b01110011110001;
-       8'b10011101: y = 14'b01110011011010;
-       8'b10011110: y = 14'b01110011000010;
-       8'b10011111: y = 14'b01110010101011;
-       8'b10100000: y = 14'b01110010010100;
-       8'b10100001: y = 14'b01110001111110;
-       8'b10100010: y = 14'b01110001100111;
-       8'b10100011: y = 14'b01110001010001;
-       8'b10100100: y = 14'b01110000111011;
-       8'b10100101: y = 14'b01110000100101;
-       8'b10100110: y = 14'b01110000001111;
-       8'b10100111: y = 14'b01101111111010;
-       8'b10101000: y = 14'b01101111100101;
-       8'b10101001: y = 14'b01101111010000;
-       8'b10101010: y = 14'b01101110111011;
-       8'b10101011: y = 14'b01101110100110;
-       8'b10101100: y = 14'b01101110010001;
-       8'b10101101: y = 14'b01101101111101;
-       8'b10101110: y = 14'b01101101101001;
-       8'b10101111: y = 14'b01101101010101;
-       8'b10110000: y = 14'b01101101000001;
-       8'b10110001: y = 14'b01101100101101;
-       8'b10110010: y = 14'b01101100011010;
-       8'b10110011: y = 14'b01101100000110;
-       8'b10110100: y = 14'b01101011110011;
-       8'b10110101: y = 14'b01101011100000;
-       8'b10110110: y = 14'b01101011001101;
-       8'b10110111: y = 14'b01101010111010;
-       8'b10111000: y = 14'b01101010101000;
-       8'b10111001: y = 14'b01101010010101;
-       8'b10111010: y = 14'b01101010000011;
-       8'b10111011: y = 14'b01101001110001;
-       8'b10111100: y = 14'b01101001011111;
-       8'b10111101: y = 14'b01101001001101;
-       8'b10111110: y = 14'b01101000111100;
-       8'b10111111: y = 14'b01101000101010;
-       8'b11000000: y = 14'b01101000011001;
-       8'b11000001: y = 14'b01101000000111;
-       8'b11000010: y = 14'b01100111110110;
-       8'b11000011: y = 14'b01100111100101;
-       8'b11000100: y = 14'b01100111010100;
-       8'b11000101: y = 14'b01100111000011;
-       8'b11000110: y = 14'b01100110110011;
-       8'b11000111: y = 14'b01100110100010;
-       8'b11001000: y = 14'b01100110010010;
-       8'b11001001: y = 14'b01100110000010;
-       8'b11001010: y = 14'b01100101110010;
-       8'b11001011: y = 14'b01100101100001;
-       8'b11001100: y = 14'b01100101010010;
-       8'b11001101: y = 14'b01100101000010;
-       8'b11001110: y = 14'b01100100110010;
-       8'b11001111: y = 14'b01100100100011;
-       8'b11010000: y = 14'b01100100010011;
-       8'b11010001: y = 14'b01100100000100;
-       8'b11010010: y = 14'b01100011110101;
-       8'b11010011: y = 14'b01100011100101;
-       8'b11010100: y = 14'b01100011010110;
-       8'b11010101: y = 14'b01100011000111;
-       8'b11010110: y = 14'b01100010111001;
-       8'b11010111: y = 14'b01100010101010;
-       8'b11011000: y = 14'b01100010011011;
-       8'b11011001: y = 14'b01100010001101;
-       8'b11011010: y = 14'b01100001111110;
-       8'b11011011: y = 14'b01100001110000;
-       8'b11011100: y = 14'b01100001100010;
-       8'b11011101: y = 14'b01100001010100;
-       8'b11011110: y = 14'b01100001000110;
-       8'b11011111: y = 14'b01100000111000;
-       8'b11100000: y = 14'b01100000101010;
-       8'b11100001: y = 14'b01100000011100;
-       8'b11100010: y = 14'b01100000001111;
-       8'b11100011: y = 14'b01100000000001;
-       8'b11100100: y = 14'b01011111110100;
-       8'b11100101: y = 14'b01011111100110;
-       8'b11100110: y = 14'b01011111011001;
-       8'b11100111: y = 14'b01011111001100;
-       8'b11101000: y = 14'b01011110111111;
-       8'b11101001: y = 14'b01011110110010;
-       8'b11101010: y = 14'b01011110100101;
-       8'b11101011: y = 14'b01011110011000;
-       8'b11101100: y = 14'b01011110001011;
-       8'b11101101: y = 14'b01011101111110;
-       8'b11101110: y = 14'b01011101110010;
-       8'b11101111: y = 14'b01011101100101;
-       8'b11110000: y = 14'b01011101011001;
-       8'b11110001: y = 14'b01011101001100;
-       8'b11110010: y = 14'b01011101000000;
-       8'b11110011: y = 14'b01011100110100;
-       8'b11110100: y = 14'b01011100101000;
-       8'b11110101: y = 14'b01011100011100;
-       8'b11110110: y = 14'b01011100010000;
-       8'b11110111: y = 14'b01011100000100;
-       8'b11111000: y = 14'b01011011111000;
-       8'b11111001: y = 14'b01011011101100;
-       8'b11111010: y = 14'b01011011100000;
-       8'b11111011: y = 14'b01011011010101;
-       8'b11111100: y = 14'b01011011001001;
-       8'b11111101: y = 14'b01011010111101;
-       8'b11111110: y = 14'b01011010110010;
-       8'b11111111: y = 14'b01011010100111;
-       default: y = 14'bxxxxxxxxxxxxxx;
-     endcase // case (a)
-    
-endmodule // sbtm_a0
-
-    
-    
-    
\ No newline at end of file
diff --git a/wally-pipelined/src/fpu/sk14.sv b/wally-pipelined/src/fpu/sk14.sv
deleted file mode 100755
index 8d6aadb59..000000000
--- a/wally-pipelined/src/fpu/sk14.sv
+++ /dev/null
@@ -1,90 +0,0 @@
-// Sklansky Prefix Adder
-
-module sk14 (cout, sum, a, b, cin);
-	 input [13:0] a, b;
-	 input cin;
-	 output [13:0] sum;
-	 output cout;
-
-	 wire [14:0] p,g;
-	 wire [13:0] c;
-
-// pre-computation
-	 assign p={a^b,1'b0};
-	 assign g={a&b, cin};
-
-// prefix tree
-	 sklansky prefix_tree(c, p[13:0], g[13:0]);
-
-// post-computation
-	 assign sum=p[14:1]^c;
-	 assign cout=g[14]|(p[14]&c[13]);
-
-endmodule
-
-module sklansky (c, p, g);
-	
-	input [14:0] p;
-	input [14:0] g;
-	output [14:1] c;
-
-
-	// parallel-prefix, Sklansky
-	// Stage 1: Generates G/P pairs that span 1 bits
-	grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-	black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-	black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-	black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-	black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-	black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-	black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-	// Stage 2: Generates G/P pairs that span 2 bits
-	grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-	grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-	black b_6_4 (G_6_4, P_6_4, {g[6],G_5_4}, {p[6],P_5_4});
-	black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-	black b_10_8 (G_10_8, P_10_8, {g[10],G_9_8}, {p[10],P_9_8});
-	black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-	black b_14_12 (G_14_12, P_14_12, {g[14],G_13_12}, {p[14],P_13_12});
-	black b_15_12 (G_15_12, P_15_12, {G_15_14,G_13_12}, {P_15_14,P_13_12});
-
-	// Stage 3: Generates G/P pairs that span 4 bits
-	grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-	grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-	grey g_6_0 (G_6_0, {G_6_4,G_3_0}, P_6_4);
-	grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-	black b_12_8 (G_12_8, P_12_8, {g[12],G_11_8}, {p[12],P_11_8});
-	black b_13_8 (G_13_8, P_13_8, {G_13_12,G_11_8}, {P_13_12,P_11_8});
-	black b_14_8 (G_14_8, P_14_8, {G_14_12,G_11_8}, {P_14_12,P_11_8});
-	black b_15_8 (G_15_8, P_15_8, {G_15_12,G_11_8}, {P_15_12,P_11_8});
-
-	// Stage 4: Generates G/P pairs that span 8 bits
-	grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-	grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-	grey g_10_0 (G_10_0, {G_10_8,G_7_0}, P_10_8);
-	grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-	grey g_12_0 (G_12_0, {G_12_8,G_7_0}, P_12_8);
-	grey g_13_0 (G_13_0, {G_13_8,G_7_0}, P_13_8);
-	grey g_14_0 (G_14_0, {G_14_8,G_7_0}, P_14_8);
-	grey g_15_0 (G_15_0, {G_15_8,G_7_0}, P_15_8);
-
-
-	// Final Stage: Apply c_k+1=G_k_0
-	assign c[1]=g[0];
-	assign c[2]=G_1_0;
-	assign c[3]=G_2_0;
-	assign c[4]=G_3_0;
-	assign c[5]=G_4_0;
-	assign c[6]=G_5_0;
-	assign c[7]=G_6_0;
-	assign c[8]=G_7_0;
-	assign c[9]=G_8_0;
-
-	assign c[10]=G_9_0;
-	assign c[11]=G_10_0;
-	assign c[12]=G_11_0;
-	assign c[13]=G_12_0;
-	assign c[14]=G_13_0;
-
-endmodule
-

From 72406b8a88139c67358fe93bf561e9832d812099 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Fri, 2 Jul 2021 12:53:05 -0400
Subject: [PATCH 4/4] FPU update - missing files

---
 wally-pipelined/src/fpu/fclassify.sv |  62 ++++
 wally-pipelined/src/fpu/fcmp.sv      | 465 +++++++++++++++++++++++++++
 wally-pipelined/src/fpu/fdivsqrt.sv  | 256 +++++++++++++++
 wally-pipelined/src/fpu/fhazard.sv   |  67 ++++
 wally-pipelined/src/fpu/fregfile.sv  |  54 ++++
 5 files changed, 904 insertions(+)
 create mode 100644 wally-pipelined/src/fpu/fclassify.sv
 create mode 100755 wally-pipelined/src/fpu/fcmp.sv
 create mode 100755 wally-pipelined/src/fpu/fdivsqrt.sv
 create mode 100644 wally-pipelined/src/fpu/fhazard.sv
 create mode 100644 wally-pipelined/src/fpu/fregfile.sv

diff --git a/wally-pipelined/src/fpu/fclassify.sv b/wally-pipelined/src/fpu/fclassify.sv
new file mode 100644
index 000000000..a15edcb4a
--- /dev/null
+++ b/wally-pipelined/src/fpu/fclassify.sv
@@ -0,0 +1,62 @@
+
+`include "wally-config.vh"
+
+module fclassify (
+    input  logic [63:0] SrcXE,          // operand; single precision is taken from bits [63:32]
+    input  logic        FmtE,           // 0-Single 1-Double
+    output logic [63:0] ClassResE       // class mask in bits [9:0]; bits [63:10] are zero
+    );
+
+    logic [31:0] Single;
+    logic [63:0] Double;
+    logic Sgn;
+    logic Inf, NaN, Zero, Norm, Denorm;
+    logic PInf, QNaN, PZero, PNorm, PDenorm;
+    logic NInf, SNaN, NZero, NNorm, NDenorm;
+    logic MaxExp, ExpZero, ManZero, FirstBitFrac;
+   
+    // Single and Double precision layouts
+    assign Single = SrcXE[63:32];
+    assign Double = SrcXE;
+    assign Sgn = SrcXE[63];
+
+    // basic calculations for readability:
+    // exponent all zeros / all ones, fraction all zeros, fraction MSB
+    assign ExpZero = FmtE ? ~|Double[62:52] : ~|Single[30:23];
+    assign MaxExp = FmtE ? &Double[62:52] : &Single[30:23];
+    assign ManZero = FmtE ? ~|Double[51:0] : ~|Single[22:0];
+    assign FirstBitFrac = FmtE ? Double[51] : Single[22];
+
+    // determine the type of number (the five types are mutually exclusive)
+    assign NaN      = MaxExp & ~ManZero;
+    assign Inf = MaxExp & ManZero;
+    assign Zero     = ExpZero & ManZero;
+    assign Denorm= ExpZero & ~ManZero;
+    assign Norm   = ~ExpZero & ~MaxExp;   // an all-ones exponent is Inf/NaN, not a normal number
+
+    // determine the sub categories
+    assign QNaN = FirstBitFrac&NaN;
+    assign SNaN = ~FirstBitFrac&NaN;
+    assign PInf = ~Sgn&Inf;
+    assign NInf = Sgn&Inf;
+    assign PNorm = ~Sgn&Norm;
+    assign NNorm = Sgn&Norm;
+    assign PDenorm = ~Sgn&Denorm;
+    assign NDenorm = Sgn&Denorm;
+    assign PZero = ~Sgn&Zero;
+    assign NZero = Sgn&Zero;
+
+    // determine sub category and combine into the result
+    //  bit 0 - -Inf
+    //  bit 1 - -Norm
+    //  bit 2 - -Denorm
+    //  bit 3 - -Zero
+    //  bit 4 - +Zero
+    //  bit 5 - +Denorm
+    //  bit 6 - +Norm
+    //  bit 7 - +Inf
+    //  bit 8 - signaling NaN
+    //  bit 9 - quiet NaN
+    assign ClassResE = {{54{1'b0}}, QNaN, SNaN, PInf, PNorm,  PDenorm, PZero, NZero, NDenorm, NNorm, NInf};
+
+endmodule
diff --git a/wally-pipelined/src/fpu/fcmp.sv b/wally-pipelined/src/fpu/fcmp.sv
new file mode 100755
index 000000000..f47d7c9ef
--- /dev/null
+++ b/wally-pipelined/src/fpu/fcmp.sv
@@ -0,0 +1,465 @@
+
+//
+// File name : fpcomp.v
+// Title     : Floating-Point Comparator
+// project   : FPU
+// Library   : fpcomp
+// Author(s) : James E. Stine
+// Purpose   : definition of main unit to floating-point comparator
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Floating Point Comparator (Algorithm)
+//
+// 1.) Performs sign-extension if the inputs are 32-bit integers.
+// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
+// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
+//     and correct for sign bits
+//
+// This module takes 64-bits inputs op1 and op2, VSS, and VDD
+// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	(unused)
+//
+// The comparator produces a 2-bit signal FCC, which
+// indicates the result of the comparison:
+//
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+//
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN per 754
+
+`include "wally-config.vh"
+module fcmp (
+   input  logic [63:0] op1,           // operand A
+   input  logic [63:0] op2,           // operand B
+   input  logic [2:0]  FOpCtrlE,      // passed unchanged to both exception stages
+   input  logic        FmtE,          // reaches exception_cmp_2 through the .* connection
+   output logic        Invalid,       // Invalid Operation
+   output logic [63:0] CmpResE);      // driven by exception_cmp_2 through the .* connection
+
+   // Nets linking the two magnitude-compare stages and the two exception stages.
+   logic [1:0]         FCC;           // Condition Codes (kept internal, not a port)
+   logic [7:0]         w, x;          // intermediate flags from magcompare64b_1
+   logic               ANaN, BNaN;
+   logic               Azero, Bzero;
+   logic               LT;            // magnitude op1 < magnitude op2
+   logic               EQ;            // magnitude op1 = magnitude op2
+
+   // First part of the magnitude comparison. The sign bit of each operand is
+   // inverted on its way into the comparator tree.
+   magcompare64b_1 magcomp1 (.w(w), .x(x),
+                             .A({~op1[63], op1[62:0]}),
+                             .B({~op2[63], op2[62:0]}));
+
+   // NaN and zero detection on the unmodified operands.
+   exception_cmp_1 exc1 (.ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero),
+                         .A(op1), .B(op2), .FOpCtrlE(FOpCtrlE));
+
+   // Second part of the magnitude comparison. Only LT and EQ are returned,
+   // since GT can be determined from these values.
+   magcompare64b_2 magcomp2 (.LT(LT), .EQ(EQ), .w(w), .x(x));
+
+   // Determine final values based on output of magnitude comparison,
+   // sign bits, and special case testing.
+   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ),
+                         .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero),
+                         .FOpCtrlE(FOpCtrlE), .A(op1), .B(op2), .*);
+endmodule // fcmp
+
+// module magcompare2b (LT, GT, A, B);
+
+//    input logic [1:0] A;
+//    input logic [1:0] B;
+   
+//    output logic     LT;
+//    output logic     GT;
+
+//    // Determine if A < B  using a minimized sum-of-products expression
+//    assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+//    // Determine if A > B  using a minimized sum-of-products expression
+//    assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
+
+// endmodule // magcompare2b
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
+// this version actually incorporates don't cares into the equation to
+// simplify the optimization
+
+module magcompare2c (
+   output logic       LT,
+   output logic       GT,
+   input  logic [1:0] A,
+   input  logic [1:0] B);
+
+   // Minimized equations that rely on don't-care input combinations, so the
+   // outputs are not a full 2-bit compare for every A/B pair (e.g. A = B = 3).
+   assign LT = (~A[1] & B[0]) | B[1];
+   assign GT = (~B[1] & A[0]) | A[1];
+
+endmodule // magcompare2c
+
+// This module compares two 64-bit values A and B. LT is '1' if A < B
+// and EQ is '1' if A = B. LT and EQ are both '0' if A > B.
+// This structure was modified so
+// that it only does a strict magnitude comparison, and only
+// returns flags for less than (LT) and equal to (EQ). It uses a tree
+// of 63 2-bit magnitude comparators, followed by one OR gate.
+//
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+module magcompare64b_1 (w, x,  A, B);
+   // First three levels of the comparator tree: 64-bit A and B in, 8-bit w and x out.
+   input logic [63:0] A;
+   input logic [63:0] B;
+   
+   logic [31:0]       s;   // level 1: first output port of each magcompare2b
+   logic [31:0]       t;   // level 1: second output port of each magcompare2b
+   logic [15:0]       u;   // level 2: LT port of each magcompare2c
+   logic [15:0]       v;   // level 2: GT port of each magcompare2c
+   output logic [7:0] 	      w;   // level 3: LT port of each magcompare2c
+   output logic [7:0] 	      x;   // level 3: GT port of each magcompare2c
+   // Level 1: one cell per 2-bit slice of A and B. NOTE(review): magcompare2b appears only as commented-out code in this file; confirm it is defined elsewhere in the build.
+   magcompare2b mag1(s[0], t[0], A[1:0], B[1:0]);
+   magcompare2b mag2(s[1], t[1], A[3:2], B[3:2]);
+   magcompare2b mag3(s[2], t[2], A[5:4], B[5:4]);
+   magcompare2b mag4(s[3], t[3], A[7:6], B[7:6]);
+   magcompare2b mag5(s[4], t[4], A[9:8], B[9:8]);
+   magcompare2b mag6(s[5], t[5], A[11:10], B[11:10]);
+   magcompare2b mag7(s[6], t[6], A[13:12], B[13:12]);
+   magcompare2b mag8(s[7], t[7], A[15:14], B[15:14]);
+   magcompare2b mag9(s[8], t[8], A[17:16], B[17:16]);
+   magcompare2b magA(s[9], t[9], A[19:18], B[19:18]);
+   magcompare2b magB(s[10], t[10], A[21:20], B[21:20]);
+   magcompare2b magC(s[11], t[11], A[23:22], B[23:22]);
+   magcompare2b magD(s[12], t[12], A[25:24], B[25:24]);
+   magcompare2b magE(s[13], t[13], A[27:26], B[27:26]);
+   magcompare2b magF(s[14], t[14], A[29:28], B[29:28]);
+   magcompare2b mag10(s[15], t[15], A[31:30], B[31:30]);
+   magcompare2b mag11(s[16], t[16], A[33:32], B[33:32]);
+   magcompare2b mag12(s[17], t[17], A[35:34], B[35:34]);
+   magcompare2b mag13(s[18], t[18], A[37:36], B[37:36]);
+   magcompare2b mag14(s[19], t[19], A[39:38], B[39:38]);
+   magcompare2b mag15(s[20], t[20], A[41:40], B[41:40]);
+   magcompare2b mag16(s[21], t[21], A[43:42], B[43:42]);
+   magcompare2b mag17(s[22], t[22], A[45:44], B[45:44]);
+   magcompare2b mag18(s[23], t[23], A[47:46], B[47:46]);
+   magcompare2b mag19(s[24], t[24], A[49:48], B[49:48]);
+   magcompare2b mag1A(s[25], t[25], A[51:50], B[51:50]);
+   magcompare2b mag1B(s[26], t[26], A[53:52], B[53:52]);
+   magcompare2b mag1C(s[27], t[27], A[55:54], B[55:54]);
+   magcompare2b mag1D(s[28], t[28], A[57:56], B[57:56]);
+   magcompare2b mag1E(s[29], t[29], A[59:58], B[59:58]);
+   magcompare2b mag1F(s[30], t[30], A[61:60], B[61:60]);
+   magcompare2b mag20(s[31], t[31], A[63:62], B[63:62]);
+   // Level 2: each cell merges two adjacent level-1 results; t drives port A and s drives port B.
+   magcompare2c mag21(u[0], v[0], t[1:0], s[1:0]);
+   magcompare2c mag22(u[1], v[1], t[3:2], s[3:2]);
+   magcompare2c mag23(u[2], v[2], t[5:4], s[5:4]);
+   magcompare2c mag24(u[3], v[3], t[7:6], s[7:6]);
+   magcompare2c mag25(u[4], v[4], t[9:8], s[9:8]);
+   magcompare2c mag26(u[5], v[5], t[11:10], s[11:10]);
+   magcompare2c mag27(u[6], v[6], t[13:12], s[13:12]);
+   magcompare2c mag28(u[7], v[7], t[15:14], s[15:14]);
+   magcompare2c mag29(u[8], v[8], t[17:16], s[17:16]);
+   magcompare2c mag2A(u[9], v[9], t[19:18], s[19:18]);
+   magcompare2c mag2B(u[10], v[10], t[21:20], s[21:20]);
+   magcompare2c mag2C(u[11], v[11], t[23:22], s[23:22]);
+   magcompare2c mag2D(u[12], v[12], t[25:24], s[25:24]);
+   magcompare2c mag2E(u[13], v[13], t[27:26], s[27:26]);
+   magcompare2c mag2F(u[14], v[14], t[29:28], s[29:28]);
+   magcompare2c mag30(u[15], v[15], t[31:30], s[31:30]);
+   // Level 3: each cell merges two adjacent level-2 results; v drives port A and u drives port B.
+   magcompare2c mag31(w[0], x[0], v[1:0], u[1:0]);
+   magcompare2c mag32(w[1], x[1], v[3:2], u[3:2]);
+   magcompare2c mag33(w[2], x[2], v[5:4], u[5:4]);
+   magcompare2c mag34(w[3], x[3], v[7:6], u[7:6]);
+   magcompare2c mag35(w[4], x[4], v[9:8], u[9:8]);
+   magcompare2c mag36(w[5], x[5], v[11:10], u[11:10]);
+   magcompare2c mag37(w[6], x[6], v[13:12], u[13:12]);
+   magcompare2c mag38(w[7], x[7], v[15:14], u[15:14]);
+
+endmodule // magcompare64b
+
+// This module takes 64-bits inputs A and B, two magnitude comparison
+// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	bfloat precision numbers
+//
+// The comparator produces a 2-bit signal fcc, which
+// indicates the result of the comparison as follows:
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN.
+
+module exception_cmp_1 (
+   output logic        ANaN,
+   output logic        BNaN,
+   output logic        Azero,
+   output logic        Bzero,
+   input  logic [63:0] A,
+   input  logic [63:0] B,
+   input  logic [2:0]  FOpCtrlE);
+
+   logic               dp, sp, hp;        // format selects decoded from FOpCtrlE[1:0]
+   logic               AExpHi, BExpHi;    // bits [62:58] all ones
+
+   // Only FOpCtrlE[1:0] is decoded; FOpCtrlE[2] is unused in this module.
+   // NOTE(review): confirm FOpCtrlE (rather than a format signal) is the intended select.
+   assign dp = ~(FOpCtrlE[1] | FOpCtrlE[0]);
+   assign sp = ~FOpCtrlE[1] & FOpCtrlE[0];
+   assign hp = FOpCtrlE[1] & ~FOpCtrlE[0];
+
+   // The five bits below the sign are required to be ones for every format.
+   assign AExpHi = &A[62:58];
+   assign BExpHi = &B[62:58];
+
+   // NaN test: any remaining exponent bits of the selected format must be all
+   // ones and at least one of the two fraction bits examined must be set.
+   assign ANaN = AExpHi & ((sp & (&A[57:55]) & (|A[54:53])) |
+                           (dp & (&A[57:52]) & (|A[51:50])) |
+                           (hp & (|A[57:56])));
+   assign BNaN = BExpHi & ((sp & (&B[57:55]) & (|B[54:53])) |
+                           (dp & (&B[57:52]) & (|B[51:50])) |
+                           (hp & (|B[57:56])));
+
+   // Zero test: +0 or -0 when viewed as a floating point number, i.e. the
+   // 63 least significant bits are all zero (the sign bit is ignored).
+   assign Azero = (A[62:0] == 63'h0);
+   assign Bzero = (B[62:0] == 63'h0);
+endmodule // exception_cmp_1
+//
+// File name : fpcomp.v
+// Title     : Floating-Point Comparator
+// project   : FPU
+// Library   : fpcomp
+// Author(s) : James E. Stine
+// Purpose   : definition of main unit to floating-point comparator
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Floating Point Comparator (Algorithm)
+//
+// 1.) Performs sign-extension if the inputs are 32-bit integers.
+// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
+// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
+//     and correct for sign bits
+//
+// This module takes 64-bits inputs op1 and op2, VSS, and VDD
+// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	(unused)
+//
+// The comparator produces a 2-bit signal FCC, which
+// indicates the result of the comparison:
+//
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+//
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN per 754
+
+
+/*module magcompare2b (LT, GT, A, B);
+
+   input logic [1:0] A;
+   input logic [1:0] B;
+   
+   output logic     LT;
+   output logic     GT;
+
+   // Determine if A < B  using a minimized sum-of-products expression
+   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+   // Determine if A > B  using a minimized sum-of-products expression
+   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
+
+endmodule*/ // magcompare2b
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
+// this version actually incorporates don't cares into the equation to
+// simplify the optimization
+
+// module magcompare2c (LT, GT, A, B);
+
+//    input logic [1:0] A;
+//    input logic [1:0] B;
+   
+//    output logic      LT;
+//    output logic      GT;
+
+//    assign LT = B[1] | (!A[1]&B[0]);
+//    assign GT = A[1] | (!B[1]&A[0]);
+
+// endmodule // magcompare2b
+
+// This module compares two 64-bit values A and B. LT is '1' if A < B
+// and EQ is '1' if A = B. LT and EQ are both '0' if A > B.
+// This structure was modified so
+// that it only does a strict magnitude comparison, and only
+// returns flags for less than (LT) and equal to (EQ). It uses a tree
+// of 63 2-bit magnitude comparators, followed by one OR gate.
+//
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+module magcompare64b_2 (
+   output logic       LT,
+   output logic       EQ,
+   input  logic [7:0] w,
+   input  logic [7:0] x);
+
+   logic [3:0]        y, z;   // results of the first reduction level
+   logic [1:0]        a, b;   // results of the second reduction level
+   logic              GT;     // internal only; used to derive EQ
+
+   // Level 1: four cells, each fed two bits of x (port A) and two bits of w (port B).
+   magcompare2c mag39(.LT(y[0]), .GT(z[0]), .A(x[1:0]), .B(w[1:0]));
+   magcompare2c mag3A(.LT(y[1]), .GT(z[1]), .A(x[3:2]), .B(w[3:2]));
+   magcompare2c mag3B(.LT(y[2]), .GT(z[2]), .A(x[5:4]), .B(w[5:4]));
+   magcompare2c mag3C(.LT(y[3]), .GT(z[3]), .A(x[7:6]), .B(w[7:6]));
+
+   // Level 2: two cells, z on port A and y on port B.
+   magcompare2c mag3D(.LT(a[0]), .GT(b[0]), .A(z[1:0]), .B(y[1:0]));
+   magcompare2c mag3E(.LT(a[1]), .GT(b[1]), .A(z[3:2]), .B(y[3:2]));
+
+   // Level 3: the last cell produces the final LT and GT.
+   magcompare2c mag3F(.LT(LT), .GT(GT), .A(b[1:0]), .B(a[1:0]));
+
+   // Equal exactly when neither LT nor GT is raised.
+   assign EQ = ~LT & ~GT;
+endmodule // magcompare64b_2
+
+// This module takes 64-bit inputs A and B, two magnitude comparison
+// flags LT_mag and EQ_mag, and the signal FOpCtrlE, whose low two bits
+// indicate the type of operands being compared as indicated below.
+//	FOpCtrlE[1:0]	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	bfloat precision numbers
+//
+// The comparator produces a 2-bit signal fcc, which
+// indicates the result of the comparison as follows:
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN.
+
+module exception_cmp_2 (
+   input logic [63:0] A,
+   input logic [63:0] B,
+   input logic        FmtE,       // not referenced inside this module
+   input logic        LT_mag,     // "less than" flag from the magnitude comparator
+   input logic        EQ_mag,     // "equal" flag from the magnitude comparator
+   input logic [2:0]  FOpCtrlE,   // operation select, see the case statement below
+
+   output logic        invalid,   // one if either operand is a signaling NaN
+   output logic [1:0]  fcc,       // 00: A = B, 01: A < B, 10: A > B, 11: unordered
+   output logic [63:0] CmpResE,   // result of the operation selected by FOpCtrlE
+
+   input logic        Azero,      // A is zero
+   input logic        Bzero,      // B is zero
+   input logic        ANaN,       // A is NaN
+   input logic        BNaN);      // B is NaN
+
+   // Internal flags
+   logic dp, sp, hp;      // precision selects decoded from FOpCtrlE[1:0]
+   logic ASNaN, BSNaN;    // operand is a signaling NaN
+   logic UO;              // unordered
+   logic GT, LT, EQ;      // ordered comparison results
+   logic OnlyBNaN;        // B is NaN while A is not
+
+   // NOTE(review): the precision selects are decoded from the same
+   // FOpCtrlE bits that choose the operation below, and FmtE is never
+   // read.  That looks unintended - confirm against the FPU decoder.
+   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
+   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
+   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
+
+   // Values are unordered if (A is NaN) OR (B is NaN).
+   assign UO = (ANaN | BNaN);
+
+   // Test if A or B is a signaling NaN.
+   assign ASNaN = ANaN & (sp&~A[53] | dp&~A[50] | hp&~A[56]);
+   assign BSNaN = BNaN & (sp&~B[53] | dp&~B[50] | hp&~B[56]);
+
+   // If either A or B is a signaling NaN the "Invalid Operation"
+   // exception flag is set to one; otherwise it is zero.
+   assign invalid = (ASNaN | BSNaN);
+
+   // A and B are equal if the magnitude comparator reports equality or
+   // both operands are zero.  Also, A and B are not equal if they are
+   // unordered.
+   assign EQ = (EQ_mag | (Azero&Bzero)) & (~UO);
+
+   // LT follows LT_mag, except that it is inverted when both sign bits
+   // (A[63] and B[63]) are set: for two negative floating point numbers
+   // the larger magnitude is the smaller value.
+   // Also, A is not less than B if A and B are equal or unordered.
+   assign LT = ((~LT_mag & A[63] & B[63]) |
+                (LT_mag & ~(A[63] & B[63])))&~EQ&~UO;
+
+   // A is greater than B when LT, EQ, and UO are false.
+   assign GT = ~(LT | EQ | UO);
+
+   // Set the bits of fcc based on LT, GT, and UO
+   // (00: equal, 01: less than, 10: greater than, 11: unordered)
+   assign fcc[0] = LT | UO;
+   assign fcc[1] = GT | UO;
+
+   // min/max must return the non-NaN operand when exactly one operand
+   // is NaN.  LT and GT are both zero for unordered operands, so without
+   // this term B would be returned even when B is the NaN.
+   // When both operands are NaN, B is still returned.
+   // NOTE(review): RISC-V asks for the canonical NaN in that case.
+   assign OnlyBNaN = BNaN & ~ANaN;
+
+   always_comb begin
+      case (FOpCtrlE[2:0])
+         3'b111: CmpResE = (LT | OnlyBNaN) ? A : B;//min
+         3'b101: CmpResE = (GT | OnlyBNaN) ? A : B;//max
+         3'b010: CmpResE = {63'b0, EQ};//equal
+         3'b001: CmpResE = {63'b0, LT};//less than
+         3'b011: CmpResE = {63'b0, LT|EQ};//less than or equal
+         default: CmpResE = 64'b0;
+      endcase
+   end
+
+endmodule // exception_cmp_2
diff --git a/wally-pipelined/src/fpu/fdivsqrt.sv b/wally-pipelined/src/fpu/fdivsqrt.sv
new file mode 100755
index 000000000..6d8da23f2
--- /dev/null
+++ b/wally-pipelined/src/fpu/fdivsqrt.sv
@@ -0,0 +1,256 @@
+//
+// File name : fpdiv
+// Title     : Floating-Point Divider/Square-Root
+// project   : FPU
+// Library   : fpdiv
+// Author(s) : James E. Stine, Jr.
+// Purpose   : definition of main unit to floating-point div/sqrt
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Basic Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Exponent Logic
+// Step 4: Divide/Sqrt using Goldschmidt
+// Step 5: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 6: Round the result.
+// Step 7: Put quotient/remainder onto output.
+//
+
+// `timescale 1ps/1ps
+module fdivsqrt (
+   output        FDivSqrtDoneE,  // driven by the control FSM; also gates the output registers
+   output [63:0] FDivResultM,    // registered result of the operation
+   output [4:0]  FDivSqrtFlgM,   // registered IEEE exception flags
+   input  [63:0] DivInput1E,     // 1st input operand (A)
+   input  [63:0] DivInput2E,     // 2nd input operand (B)
+   input  [2:0]  FrmE,           // rounding mode
+   input         DivOpType,      // 0 selects the divide exponent/sign, 1 the square-root ones
+   input         FmtE,           // result precision (0 for double, 1 for single) //***will need to swap this
+   input         DivOvEn,        // overflow trap enabled
+   input         DivUnEn,        // underflow trap enabled
+   input         FDivStartE,     // start request, passed to the control FSM
+   input         reset,
+   input         clk,
+   output        FDivBusyE,      // driven by the control FSM
+   output        HoldInputs);    // driven by the control FSM
+
+   //----------------------------------------------------------------
+   // Internal nets
+   //----------------------------------------------------------------
+
+   // Operands after input conversion
+   wire [63:0]  Float1;
+   wire [63:0]  Float2;
+
+   // Outputs of the exception check
+   wire [2:0]   sel_inv;
+   wire         Invalid;
+   wire         DenormIn;
+   wire         op1_Norm, op2_Norm;
+
+   // Sign and significands
+   wire         signResult;
+   wire [52:0]  mantissaA;
+   wire [52:0]  mantissaB;
+
+   // Exponent datapath
+   wire [12:0]  exp1, exp2, bias;
+   wire [12:0]  exp_s, exp_c;      // csa1 outputs (presumably carry-save sum and carry)
+   wire [12:0]  exp_diff;          // divide exponent
+   wire [13:0]  exp_sqrt;          // square-root exponent before halving
+   wire [12:0]  expF;              // exponent handed to the rounder
+   logic        exp_cout1, exp_cout2, exp_odd, open;
+
+   // Outputs of the iterative datapath
+   wire [63:0]  q1, qm1, qp1, q0, qm0, qp0;
+   wire [63:0]  rega_out, regb_out, regc_out, regd_out;
+   wire [127:0] regr_out;
+
+   // Control from the FSM to the iterative datapath
+   wire [2:0]   sel_muxa, sel_muxb;
+   wire         sel_muxr;
+   wire         load_rega, load_regb, load_regc, load_regd, load_regr, load_regs;
+
+   // Outputs of the rounder, and the registered denormal flag
+   wire [63:0]  Result;
+   wire [4:0]   FlagsIn;
+   wire         DenormIO;
+   wire         DivDenormM;
+
+   //----------------------------------------------------------------
+   // Operand preparation
+   //----------------------------------------------------------------
+
+   // Convert the raw operands to the internal format according to
+   // DivOpType and the precision FmtE.
+   convert_inputs_div divconv1 (Float1, Float2, DivInput1E, DivInput2E, DivOpType, FmtE);
+
+   // Check the converted operands for exceptional values.  sel_inv,
+   // Invalid and DenormIn are consumed by the rounder further down.
+   exception_div divexc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm,
+                          Float1, Float2, DivOpType);
+
+   // Result sign: sign of operand 1 when DivOpType is set, otherwise
+   // the XOR of both operand signs.
+   assign signResult = (DivOpType & Float1[63]) | (~DivOpType & (Float1[63] ^ Float2[63]));
+
+   // Significands: fraction bits with a one prepended
+   assign mantissaA = {1'b1, Float1[51:0]};
+   assign mantissaB = {1'b1, Float2[51:0]};
+
+   //----------------------------------------------------------------
+   // Exponent logic
+   //----------------------------------------------------------------
+
+   // Exponent fields zero-extended to 13 bits
+   assign exp1 = {2'b0, Float1[62:52]};
+   assign exp2 = {2'b0, Float2[62:52]};
+   // bias : DP = 2^{11-1}-1 = 1023
+   assign bias = 13'h03FF;
+
+   // Divide exponent: exp1 + ~exp2 + bias, carry-in of one (expA - expB + bias)
+   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c); //***adder
+   exp_add explogic1 (exp_cout1, {open, exp_diff}, //***adder?
+                      {1'b0, exp_s}, {1'b0, exp_c}, 1'b1);
+
+   // Square-root exponent: exp1 + 1023, plus one when exponent bit 0
+   // (Float1[52]) is clear; the low bit of the sum is dropped below.
+   assign exp_odd = ~Float1[52];
+   exp_add explogic2 (exp_cout2, exp_sqrt, //***adder?
+                      {1'b0, exp1}, 14'h03FF, exp_odd);
+
+   // Choose the exponent that matches the operation
+   assign expF = DivOpType ? exp_sqrt[13:1] : exp_diff;
+
+   //----------------------------------------------------------------
+   // Iteration, control, rounding
+   //----------------------------------------------------------------
+
+   // Main Goldschmidt/Division Routine
+   divconv goldy (q1, qm1, qp1, q0, qm0, qp0,
+                  rega_out, regb_out, regc_out, regd_out,
+                  regr_out, mantissaB, mantissaA,
+                  sel_muxa, sel_muxb, sel_muxr,
+                  reset, clk,
+                  load_rega, load_regb, load_regc, load_regd,
+                  load_regr, load_regs, FmtE, DivOpType, exp_odd);
+
+   // FSM : control divider
+   fsm control (FDivSqrtDoneE, load_rega, load_regb, load_regc, load_regd,
+                load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr,
+                clk, reset, FDivStartE, DivOpType, FDivBusyE, HoldInputs);
+
+   // Round the mantissa; the rounding unit also handles special cases
+   // and sets the exception flags.
+   //***add max magnitude and swap negitive and positive infinity
+   rounder_div divround1 (Result, DenormIO, FlagsIn,
+                          FrmE, FmtE, DivOvEn, DivUnEn, expF,
+                          sel_inv, Invalid, DenormIn, signResult,
+                          q1, qm1, qp1, q0, qm0, qp0, regr_out);
+
+   // Store the final result and the exception flags, gated by FDivSqrtDoneE
+   flopenr #(64) rega (clk, reset, FDivSqrtDoneE, Result, FDivResultM);
+   flopenr #(1)  regb (clk, reset, FDivSqrtDoneE, DenormIO, DivDenormM);
+   flopenr #(5)  regc (clk, reset, FDivSqrtDoneE, FlagsIn, FDivSqrtFlgM);
+endmodule // fdivsqrt
+
+//
+// Brent-Kung Prefix Adder 
+//   (yes, it is 14 bits as my generator is broken for 13 bits :( 
+//    assume, synthesizer will delete stuff not needed )
+//
+module exp_add (
+   output        cout,   // carry out of the top operand bit
+   output [13:0] sum,    // sum bits
+   input  [13:0] a, b,   // addends
+   input         cin);   // carry in
+
+   // Index 0 of the generate vector carries cin (propagate is 0 there),
+   // so operand bit i sits at index i+1 of prop/gen.
+   wire [14:0] prop, gen;
+   wire [13:0] carry;     // prefix-tree outputs, one per operand bit
+
+   // Pre-computation: per-bit generate and propagate
+   assign gen  = {a & b, cin};
+   assign prop = {a ^ b, 1'b0};
+
+   // Prefix tree over indices 13..0 (cin plus operand bits 12..0)
+   brent_kung prefix_tree(carry, prop[13:0], gen[13:0]);
+
+   // Post-computation: sum bits, then the carry out of index 14
+   assign sum  = prop[14:1] ^ carry;
+   assign cout = gen[14] | (prop[14] & carry[13]);
+
+endmodule // exp_add
+
+module brent_kung (
+   output [14:1] c,    // c[k+1] is the prefix result G_k_0; c[1] is g[0]
+   input  [13:0] p,    // per-bit propagate inputs (p[0] is not used)
+   input  [13:0] g);   // per-bit generate inputs
+
+   // parallel-prefix, Brent-Kung
+   //
+   // Naming: G_i_j / P_i_j are the outputs of the cell that spans
+   // bit positions i down to j.  Cells named "grey" produce only a G
+   // output, cells named "black" produce both G and P.
+
+   // spans of two bits
+   logic G_1_0, G_3_2, G_5_4, G_7_6, G_9_8, G_11_10, G_13_12;
+   logic P_3_2, P_5_4, P_7_6, P_9_8, P_11_10, P_13_12;
+   // spans of four bits
+   logic G_3_0, G_7_4, G_11_8;
+   logic P_7_4, P_11_8;
+   // remaining prefixes that reach down to bit 0
+   logic G_7_0, G_11_0, G_5_0, G_9_0, G_13_0;
+   logic G_2_0, G_4_0, G_6_0, G_8_0, G_10_0, G_12_0;
+
+   // Stage 1: G/P pairs that span 2 bits
+   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
+   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
+   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
+   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
+   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
+   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
+   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
+
+   // Stage 2: G/P pairs that span 4 bits
+   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
+   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
+   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
+
+   // Stage 3: prefix over bits 7..0
+   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
+
+   // Stage 4: prefix over bits 11..0
+   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
+
+   // Stage 5: prefixes over bits 5..0, 9..0 and 13..0
+   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
+   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
+   grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
+
+   // Last grey cell stage: prefixes that end on an even bit
+   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
+   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
+   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
+   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
+   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
+   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
+
+   // Final Stage: c[k+1] = G_k_0, listed from c[14] down to c[1]
+   assign c = {G_13_0, G_12_0,
+               G_11_0, G_10_0,
+               G_9_0,  G_8_0,
+               G_7_0,  G_6_0,
+               G_5_0,  G_4_0,
+               G_3_0,  G_2_0,
+               G_1_0,  g[0]};
+
+endmodule // brent_kung
+
diff --git a/wally-pipelined/src/fpu/fhazard.sv b/wally-pipelined/src/fpu/fhazard.sv
new file mode 100644
index 000000000..53f7dde2c
--- /dev/null
+++ b/wally-pipelined/src/fpu/fhazard.sv
@@ -0,0 +1,67 @@
+///////////////////////////////////////////
+// fhazard.sv
+//
+// Written: me@KatherineParry.com 19 May 2021
+// Modified: 
+//
+// Purpose: Determine forwarding, stalls and flushes for the FPU
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fhazard(
+    input  logic [4:0] Adr1E, Adr2E, Adr3E,
+    input  logic       FWriteEnM, FWriteEnW,
+    input  logic [4:0] RdM, RdW,
+    input  logic [2:0] FResultSelM,
+    output logic       FStallD,
+    output logic [1:0] ForwardXE, ForwardYE, ForwardZE
+);
+
+  // Forward select encoding: 00 FRD*E (register read), 01 FPUResult64W, 10 FResM
+  //
+  // Resolve one source address against RdM and RdW; returns {stall, select}:
+  //   matches RdM with FWriteEnM, FResultSelM == 3'b100  -> {0, 10}
+  //   matches RdM with FWriteEnM, any other FResultSelM  -> {1, 00}  result not ready
+  //   otherwise matches RdW with FWriteEnW               -> {0, 01}
+  //   otherwise                                          -> {0, 00}
+  function automatic logic [2:0] resolve(input logic [4:0] adr);
+    logic       stall;
+    logic [1:0] sel;
+    stall = 1'b0;
+    sel   = 2'b00;
+    if ((adr == RdM) & FWriteEnM) begin
+      if (FResultSelM == 3'b100) sel = 2'b10;
+      else                       stall = 1'b1;
+    end else if ((adr == RdW) & FWriteEnW) begin
+      sel = 2'b01;
+    end
+    return {stall, sel};
+  endfunction
+
+  logic StallX, StallY, StallZ;   // per-operand stall requests
+
+  always_comb begin
+    {StallX, ForwardXE} = resolve(Adr1E);
+    {StallY, ForwardYE} = resolve(Adr2E);
+    {StallZ, ForwardZE} = resolve(Adr3E);
+    FStallD = StallX | StallY | StallZ;   // any operand still waiting stalls
+  end
+
+endmodule
diff --git a/wally-pipelined/src/fpu/fregfile.sv b/wally-pipelined/src/fpu/fregfile.sv
new file mode 100644
index 000000000..78c24b3e6
--- /dev/null
+++ b/wally-pipelined/src/fpu/fregfile.sv
@@ -0,0 +1,54 @@
+///////////////////////////////////////////
+// fregfile.sv
+//
+// Written: David_Harris@hmc.edu 9 January 2021
+// Modified: 
+//
+// Purpose: 4-port register file
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fregfile (
+  input  logic        clk, reset,
+  input  logic        we4,              // write enable for port 4
+  input  logic [ 4:0] a1, a2, a3, a4,   // read addresses a1-a3, write address a4
+  input  logic [63:0] wd4,              //KEP `XLEN-1 changed to 63 (lint warning) *** figure out if double can be suported when XLEN = 32
+  output logic [63:0] rd1, rd2, rd3);   // read data for a1, a2, a3
+
+  // 32 entries of 64 bits each
+  logic [63:0] rf[31:0];
+
+  // Write port (a4/wd4/we4): updates on the falling edge of clk.
+  // An asynchronous reset clears every entry; reset is intended for
+  // simulation only, not synthesis.
+  always_ff @(negedge clk or posedge reset) begin
+    if (reset) begin
+      for (int idx = 0; idx < 32; idx++) rf[idx] <= 0;
+    end else if (we4) begin
+      rf[a4] <= wd4;
+    end
+  end
+
+  // Read ports (a1/rd1, a2/rd2, a3/rd3): combinational, with a delay of 2
+  assign #2 rd1 = rf[a1];
+  assign #2 rd2 = rf[a2];
+  assign #2 rd3 = rf[a3];
+endmodule // fregfile
+