From 8ae43a15d4f91fac7b5cef39991f6c4c471cd7ac Mon Sep 17 00:00:00 2001
From: Kip Macsai-Goren <kipmacsaigoren@github.com>
Date: Mon, 24 May 2021 20:59:26 -0400
Subject: [PATCH 01/14] partially complete MSTATUS test of sd, xs, fs, mie,
 mpp, mpie, sie, spie bitfields

---
 wally-pipelined/testbench/testbench-imperas.sv | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index ddee23a1..f1e83994 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -443,7 +443,7 @@ string tests32f[] = '{
   };
 
   string tests64p[] = '{
-    //"rv64p/WALLY-MSTATUS", "2010",
+    "rv64p/WALLY-MSTATUS", "2000",
     "rv64p/WALLY-MCAUSE", "3000",
     "rv64p/WALLY-SCAUSE", "2000",
     "rv64p/WALLY-MEPC", "5000",
@@ -464,6 +464,7 @@ string tests32f[] = '{
   };
 
   string tests32p[] = '{
+    "rv32p/WALLY-MSTATUS", "2000",
     "rv32p/WALLY-MCAUSE", "3000",
     "rv32p/WALLY-SCAUSE", "2000",
     "rv32p/WALLY-MEPC", "5000",
@@ -652,7 +653,7 @@ string tests32f[] = '{
               errors = errors+1;
               $display("  Error on test %s result %d: adr = %h sim = %h, signature = %h", 
                     tests[test], i, (testadr+i)*`XLEN/8, dut.uncore.dtim.RAM[testadr+i], signature[i]);
-              $stop;//***debug
+              // $stop;//***debug
             end
           end
           i = i + 1;

From bb5404e14aae7639ba04ccb7cb72387e2d0f6412 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Tue, 25 May 2021 13:21:59 -0500
Subject: [PATCH 02/14] Update FPregfile to use more compact code and better
 structure for ease in reading

---
 wally-pipelined/src/fpu/FPregfile.sv  |   54 ++
 wally-pipelined/src/fpu/FPregfile.sv~ |   52 ++
 wally-pipelined/src/fpu/fpu.sv        | 1230 ++++++++++++-------------
 3 files changed, 689 insertions(+), 647 deletions(-)
 create mode 100644 wally-pipelined/src/fpu/FPregfile.sv
 create mode 100644 wally-pipelined/src/fpu/FPregfile.sv~

diff --git a/wally-pipelined/src/fpu/FPregfile.sv b/wally-pipelined/src/fpu/FPregfile.sv
new file mode 100644
index 00000000..2f27b2ba
--- /dev/null
+++ b/wally-pipelined/src/fpu/FPregfile.sv
@@ -0,0 +1,54 @@
+///////////////////////////////////////////
+// FPregfile.sv
+//
+// Written: David_Harris@hmc.edu 9 January 2021
+// Modified: 
+//
+// Purpose: 4-port register file
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module FPregfile (
+  input  logic             clk, reset,       // clock; asynchronous active-high reset
+  input  logic             we4,              // write enable for port 4
+  input  logic [ 4:0]      a1, a2, a3, a4,   // read addresses a1-a3, write address a4
+  input  logic [`XLEN-1:0] wd4,              // write data for port 4
+  output logic [`XLEN-1:0] rd1, rd2, rd3);   // read data for a1, a2, a3
+  // NOTE(review): data ports are XLEN bits wide, but fpu.sv drives wd4 from the 64-bit FPUResultDirW -- confirm behavior when XLEN is 32
+  logic [`XLEN-1:0] rf[31:0];                // all 32 entries are storage (no hardwired-zero entry)
+  integer i;
+
+  // four ported register file: three read ports, one write port
+  // read three ports combinationally (A1/RD1, A2/RD2, A3/RD3)
+  // write fourth port (A4/WD4/WE4) when WE4 is asserted
+  // write occurs on falling edge of clock
+  // all 32 entries are cleared while reset is high
+  // reset is intended for simulation only, not synthesis
+    
+   always_ff @(negedge clk or posedge reset)
+     if (reset) for(i=0; i<32; i++) rf[i] <= 0;
+     else if (we4) rf[a4] <= wd4;	
+   // read outputs are modelled with a #2 delay
+   assign #2 rd1 = rf[a1];
+   assign #2 rd2 = rf[a2];
+   assign #2 rd3 = rf[a3];
+   
+endmodule // FPregfile
+
diff --git a/wally-pipelined/src/fpu/FPregfile.sv~ b/wally-pipelined/src/fpu/FPregfile.sv~
new file mode 100644
index 00000000..73b62a57
--- /dev/null
+++ b/wally-pipelined/src/fpu/FPregfile.sv~
@@ -0,0 +1,52 @@
+///////////////////////////////////////////
+// regfile.sv
+//
+// Written: David_Harris@hmc.edu 9 January 2021
+// Modified: 
+//
+// Purpose: 3-port register file
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module regfile (
+  input  logic             clk, reset,   // clock; asynchronous active-high reset
+  input  logic             we3,          // write enable for port 3
+  input  logic [ 4:0]      a1, a2, a3,   // read addresses a1-a2, write address a3
+  input  logic [`XLEN-1:0] wd3,          // write data for port 3
+  output logic [`XLEN-1:0] rd1, rd2);    // read data for a1, a2
+  // NOTE(review): this file is FPregfile.sv~ yet declares module regfile -- looks like an editor backup; confirm it should be committed
+  logic [`XLEN-1:0] rf[31:1];            // storage for registers 1-31 only
+  integer i;
+
+  // three ported register file: two read ports, one write port
+  // read two ports combinationally (A1/RD1, A2/RD2)
+  // write third port (A3/WD3/WE3) when WE3 is asserted
+  // write occurs on falling edge of clock
+  // register 0 hardwired to 0: reads of address 0 return 0
+  // NOTE(review): the write path does not test a3 against 0, and rf has no index 0
+  // reset is intended for simulation only, not synthesis
+    
+  always_ff @(negedge clk or posedge reset)
+    if (reset) for(i=1; i<32; i++) rf[i] <= 0;
+    else if (we3) rf[a3] <= wd3;	
+  // read outputs are modelled with a #2 delay
+  assign #2 rd1 = (a1 != 0) ? rf[a1] : 0;
+  assign #2 rd2 = (a2 != 0) ? rf[a2] : 0;
+endmodule
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index 3298e83b..cbc0f482 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -49,662 +49,598 @@ module fpu (
   output logic             IllegalFPUInstrD,
   output logic [`XLEN-1:0] FPUResultW);
 
-  //NOTE:
-  //For readability and ease of modification, logic signals will be
-  //instantiated as they occur within the pipeline. This will keep local
-  //signals, modules, and combinational logic closely defined.
-
-  //used for OSU DP-size hardware to wally XLEN interfacing
-
-  integer XLENDIFF;
-  assign XLENDIFF = `XLEN - 64;
-  integer XLENDIFFN;
-  assign XLENDIFFN = 63 - `XLEN;
-
-  //#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
-  //BEGIN PIPELINE CONTROL LOGIC
-  //
+   //NOTE:
+   //For readability and ease of modification, logic signals will be
+   //instantiated as they occur within the pipeline. This will keep local
+   //signals, modules, and combinational logic closely defined.
    
-  logic	                   PipeEnableDE;
-  logic	                   PipeEnableEM;
-  logic	                   PipeEnableMW;
-  logic                    PipeClearDE;
-  logic                    PipeClearEM;
-  logic                    PipeClearMW;
-
-  //temporarily assign pipe clear and enable signals
-  //to never flush & always be running
-  localparam PipeClear = 1'b0;
-  localparam PipeEnable = 1'b1;
-  always_comb begin
-
-	  PipeEnableDE = ~StallE;
-	  PipeEnableEM = ~StallM;
-	  PipeEnableMW = ~StallW;
-	  PipeClearDE = FlushE;
-	  PipeClearEM = FlushM;
-	  PipeClearMW = FlushW;
-
-  end
-
-  //
-  //END PIPELINE CONTROL LOGIC
-  //#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#*#
-
-  //#########################################
-  //BEGIN DECODE STAGE
-  //
- 
-  //wally-spec D stage control logic signal instantiation
-  logic                    FRegWriteD;
-  logic [2:0]              FResultSelD;
-  logic [2:0]              FrmD;
-  logic                    FmtD;
-  logic                    DivSqrtStartD;
-  logic [3:0]              OpCtrlD;
-  logic                    FWriteIntD;
-  logic                    OutputInput2D;
-  logic [1:0]              FMemRWD;
-
-  logic                    DivBusyM;
-	logic [1:0]              Input1MuxD, Input2MuxD;
-  logic                    Input3MuxD;
-  logic                    In2UsedD, In3UsedD;
-  //Hazard unit for FPU
-  fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
-
-  //top-level controller for FPU
-  fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
-
-  //instantiation of D stage regfile signals (includes some W stage signals
-  //for easy reference)
-  logic [2:0]              FrmW;
-  logic                    FmtW;
-  logic                    FRegWriteW;
-  logic [4:0]              RdW, Rs1D, Rs2D, Rs3D;
-  logic [`XLEN-1:0]        WriteDataW;
-  logic [63:0] FPUResultDirW; 
-  logic [`XLEN-1:0]        ReadData1D, ReadData2D, ReadData3D; 
-
-  //regfile instantiation
-  freg3adr fpregfile (FmtW, reset, PipeClear, clk, RdW, FRegWriteW, InstrD[19:15], InstrD[24:20], InstrD[31:27], FPUResultDirW, ReadData1D, ReadData2D, ReadData3D);
-
-  //always_comb begin
-  //   FrmW = InstrD[14:12];
-  //end
-  //
-  //END DECODE STAGE
-  //#########################################
-
-  //*****************************************
-  //BEGIN D/E PIPE
-  //
-
-  //wally-spec E stage control logic signal instantiation
-  logic                    FRegWriteE;
-  logic [2:0]              FResultSelE;
-  logic [2:0]              FrmE;
-  logic                    FmtE;
-  logic                    DivSqrtStartE;
-  logic [3:0]              OpCtrlE;
-	logic [1:0]              Input1MuxE, Input2MuxE;
-  logic                    Input3MuxE;
-  logic [63:0]             FPUResultDirE;
-  logic                    FWriteIntE;
-  logic                    OutputInput2E;
-  logic [1:0]              FMemRWE;
-
-  //instantiation of E stage regfile signals
-  logic [4:0]              RdE;
-  logic [`XLEN-1:0]        ReadData1E, ReadData2E, ReadData3E;
-  logic [`XLEN-1:0]        Input1E, Input2E, Input3E, Input1tmpE;
-
-  //instantiation of E/M stage div/sqrt signals
-  logic                    DivSqrtDone, DivDenormM;
-  logic [63:0]             DivResultM;
-  logic [4:0]              DivFlagsM;
-  logic [63:0]             DivOp1, DivOp2;
-  logic [2:0]              DivFrm;
-  logic                    DivOpType;
-  logic                    DivP;
-  logic                    DivOvEn, DivUnEn;
-  logic                    DivStart;
-
-  //instantiate E stage FMA signals here
-  logic [12:0]		aligncntE; 
-  logic [105:0]		rE; 
-  logic [105:0]		sE; 
-  logic [163:0]		tE;	
-  logic [8:0]		normcntE; 
-  logic [12:0]		aeE; 
-  logic 		bsE;
-  logic 		killprodE; 
-  logic 		prodofE; 
-  logic			xzeroE;
-  logic			yzeroE;
-  logic			zzeroE;
-  logic			xdenormE;
-  logic			ydenormE;
-  logic			zdenormE;
-  logic			xinfE;
-  logic			yinfE;
-  logic			zinfE;
-  logic			xnanE;
-  logic			ynanE;
-  logic			znanE;
-  logic			nanE;
-  logic	[8:0]		sumshiftE;
-  logic			sumshiftzeroE;
-  logic                 prodinfE;
-  
-  //instantiation of E stage add/cvt signals
-  logic [63:0]             AddSumE, AddSumTcE;
-  logic [3:0]              AddSelInvE;
-  logic [10:0]             AddExpPostSumE;
-  logic                    AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
-  logic                    AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
-  logic                    AddConvertE;
-  logic [63:0]             AddFloat1E, AddFloat2E;
-  logic [11:0]             AddExp1DenormE, AddExp2DenormE;
-  logic [10:0]             AddExponentE;
-  logic [63:0]             AddOp1E, AddOp2E;
-  logic [2:0]              AddRmE;
-  logic [3:0]              AddOpTypeE;
-  logic                    AddPE, AddOvEnE, AddUnEnE;  
-
-  //instantiation of E stage cmp signals 
-  logic [7:0]              WE, XE;
-  logic                    ANaNE, BNaNE, AzeroE, BzeroE;
-  logic [63:0]             CmpOp1E, CmpOp2E;
-  logic [1:0]              CmpSelE;
-
-  //instantiation of E/M stage fsgn signals (due to bypass logic)
-  logic [63:0]             SgnOp1E, SgnOp2E;
-  logic [1:0]              SgnOpCodeE, SgnOpCodeM;
-  logic [63:0]             SgnResultE, SgnResultM;
-  logic [4:0]              SgnFlagsE, SgnFlagsM;
-
-  //*****************
-  //fpregfile D/E pipe registers
-  //*****************
-  flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, ReadData1D, ReadData1E);
-  flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, ReadData2D, ReadData2E);
-  flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, ReadData3D, ReadData3E);
-
-  //*****************
-  //other  D/E pipe registers
-  //*****************
-  flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FRegWriteD, FRegWriteE);
-  flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
-  flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
-  flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
-  flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
-  flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, OpCtrlD, OpCtrlE);
-  flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, DivSqrtStartD, DivSqrtStartE);
-  flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, Input1MuxD, Input1MuxE);
-  flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, Input2MuxD, Input2MuxE);
-  flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, Input3MuxD, Input3MuxE);
-  flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResultDirW, FPUResultDirE);
-  flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
-  flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, OutputInput2D, OutputInput2E);
-  flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
-
-  //
-  //END D/E PIPE
-  //*****************************************
-
-  //#########################################
-  //BEGIN EXECUTION STAGE
-  //
-
-
+   //used for OSU DP-size hardware to wally XLEN interfacing
+   
+   integer 		   XLENDIFF;
+   assign XLENDIFF = `XLEN - 64;
+   integer 		   XLENDIFFN;
+   assign XLENDIFFN = 63 - `XLEN;
+   
+   // BEGIN PIPELINE CONTROL LOGIC
+   logic 		   PipeEnableDE;
+   logic 		   PipeEnableEM;
+   logic 		   PipeEnableMW;
+   logic 		   PipeClearDE;
+   logic 		   PipeClearEM;
+   logic 		   PipeClearMW;
+   
+   //temporarily assign pipe clear and enable signals
+   //to never flush & always be running
+   localparam PipeClear = 1'b0;
+   localparam PipeEnable = 1'b1;
+   always_comb begin
+      PipeEnableDE = ~StallE;
+      PipeEnableEM = ~StallM;
+      PipeEnableMW = ~StallW;
+      PipeClearDE = FlushE;
+      PipeClearEM = FlushM;
+      PipeClearMW = FlushW;
+   end   
+   
+   // Wally-spec D stage control logic signal instantiation
+   logic                    FRegWriteD;
+   logic [2:0] 		    FResultSelD;
+   logic [2:0] 		    FrmD;
+   logic                    FmtD;
+   logic                    DivSqrtStartD;
+   logic [3:0] 		    OpCtrlD;
+   logic                    FWriteIntD;
+   logic                    OutputInput2D;
+   logic [1:0] 		    FMemRWD;
+   
+   logic 		    DivBusyM;
+   logic [1:0] 		    Input1MuxD, Input2MuxD;
+   logic 		    Input3MuxD;
+   logic                    In2UsedD, In3UsedD;
+   
+   //Hazard unit for FPU
+   fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
+   
+   //top-level controller for FPU
+   fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
+   
+   //instantiation of D stage regfile signals (includes some W stage signals
+   //for easy reference)
+   logic [2:0] 		    FrmW;
+   logic                    FmtW;
+   logic                    FRegWriteW;
+   logic [4:0] 		    RdW, Rs1D, Rs2D, Rs3D;
+   logic [`XLEN-1:0] 	    WriteDataW;
+   logic [63:0] 	    FPUResultDirW; 
+   logic [`XLEN-1:0] 	    ReadData1D, ReadData2D, ReadData3D; 
+   
+   //regfile instantiation
+   //freg3adr fpregfile (FmtW, reset, PipeClear, clk, RdW, 
+   //		       FRegWriteW, 
+   //		       InstrD[19:15], InstrD[24:20], InstrD[31:27], 
+   //		       FPUResultDirW, 
+   //		       ReadData1D, ReadData2D, ReadData3D);
+   FPregfile fpregfile (clk, reset, FRegWriteW,
+			InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
+			FPUResultDirW,
+			ReadData1D, ReadData2D, ReadData3D);		
 
+  // wally-spec E stage control logic signal instantiation
+   logic                    FRegWriteE;
+   logic [2:0] 		    FResultSelE;
+   logic [2:0] 		    FrmE;
+   logic                    FmtE;
+   logic                    DivSqrtStartE;
+   logic [3:0] 		    OpCtrlE;
+   logic [1:0] 		    Input1MuxE, Input2MuxE;
+   logic                    Input3MuxE;
+   logic [63:0] 	    FPUResultDirE;
+   logic                    FWriteIntE;
+   logic                    OutputInput2E;
+   logic [1:0] 		    FMemRWE;
+   
+   //instantiation of E stage regfile signals
+   logic [4:0] 		    RdE;
+   logic [`XLEN-1:0] 	    ReadData1E, ReadData2E, ReadData3E;
+   logic [`XLEN-1:0] 	    Input1E, Input2E, Input3E, Input1tmpE;
+   
+   //instantiation of E/M stage div/sqrt signals
+   logic                    DivSqrtDone, DivDenormM;
+   logic [63:0] 	    DivResultM;
+   logic [4:0] 		    DivFlagsM;
+   logic [63:0] 	    DivOp1, DivOp2;
+   logic [2:0] 		    DivFrm;
+   logic                    DivOpType;
+   logic                    DivP;
+   logic                    DivOvEn, DivUnEn;
+   logic                    DivStart;
+   
+   //instantiate E stage FMA signals here
+   logic [12:0] 	    aligncntE; 
+   logic [105:0] 	    rE; 
+   logic [105:0] 	    sE; 
+   logic [163:0] 	    tE;	
+   logic [8:0] 		    normcntE; 
+   logic [12:0] 	    aeE; 
+   logic 		    bsE;
+   logic 		    killprodE; 
+   logic 		    prodofE; 
+   logic 		    xzeroE;
+   logic 		    yzeroE;
+   logic 		    zzeroE;
+   logic 		    xdenormE;
+   logic 		    ydenormE;
+   logic 		    zdenormE;
+   logic 		    xinfE;
+   logic 		    yinfE;
+   logic 		    zinfE;
+   logic 		    xnanE;
+   logic 		    ynanE;
+   logic 		    znanE;
+   logic 		    nanE;
+   logic [8:0] 		    sumshiftE;
+   logic 		    sumshiftzeroE;
+   logic 		    prodinfE;
+   
+   //instantiation of E stage add/cvt signals
+   logic [63:0] 	    AddSumE, AddSumTcE;
+   logic [3:0] 		    AddSelInvE;
+   logic [10:0] 	    AddExpPostSumE;
+   logic                    AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
+   logic                    AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
+   logic                    AddConvertE;
+   logic [63:0] 	    AddFloat1E, AddFloat2E;
+   logic [11:0] 	    AddExp1DenormE, AddExp2DenormE;
+   logic [10:0] 	    AddExponentE;
+   logic [63:0] 	    AddOp1E, AddOp2E;
+   logic [2:0] 		    AddRmE;
+   logic [3:0] 		    AddOpTypeE;
+   logic                    AddPE, AddOvEnE, AddUnEnE;  
+   
+   //instantiation of E stage cmp signals 
+   logic [7:0] 		    WE, XE;
+   logic                    ANaNE, BNaNE, AzeroE, BzeroE;
+   logic [63:0] 	    CmpOp1E, CmpOp2E;
+   logic [1:0] 		    CmpSelE;
+   
+   //instantiation of E/M stage fsgn signals (due to bypass logic)
+   logic [63:0] 	    SgnOp1E, SgnOp2E;
+   logic [1:0] 		    SgnOpCodeE, SgnOpCodeM;
+   logic [63:0] 	    SgnResultE, SgnResultM;
+   logic [4:0] 		    SgnFlagsE, SgnFlagsM;
+   
+   //*****************
+   //fpregfile D/E pipe registers
+   //*****************
+   flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, ReadData1D, ReadData1E);
+   flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, ReadData2D, ReadData2E);
+   flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, ReadData3D, ReadData3E);
+   
+   //*****************
+   //other  D/E pipe registers
+   //*****************
+   flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FRegWriteD, FRegWriteE);
+   flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
+   flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
+   flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
+   flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
+   flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, OpCtrlD, OpCtrlE);
+   flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, DivSqrtStartD, DivSqrtStartE);
+   flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, Input1MuxD, Input1MuxE);
+   flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, Input2MuxD, Input2MuxE);
+   flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, Input3MuxD, Input3MuxE);
+   flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResultDirW, FPUResultDirE);
+   flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
+   flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, OutputInput2D, OutputInput2E);
+   flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
+   
   // input muxs for forwarding
-  
-  mux4  #(64)  Input1Emux(ReadData1E, FPUResultDirW, FPUResultDirE, SrcAM, Input1MuxE, Input1tmpE);
-  mux3  #(64)  Input2Emux(ReadData2E, FPUResultDirW, FPUResultDirE, Input2MuxE, Input2E);
-  mux2  #(64)  Input3Emux(ReadData3E, FPUResultDirE, Input3MuxE, Input3E);
-  mux2  #(64)  OutputInput2mux(Input1tmpE, Input2E, OutputInput2E, Input1E);
+   mux4  #(64)  Input1Emux(ReadData1E, FPUResultDirW, FPUResultDirE, SrcAM, Input1MuxE, Input1tmpE);
+   mux3  #(64)  Input2Emux(ReadData2E, FPUResultDirW, FPUResultDirE, Input2MuxE, Input2E);
+   mux2  #(64)  Input3Emux(ReadData3E, FPUResultDirE, Input3MuxE, Input3E);
+   mux2  #(64)  OutputInput2mux(Input1tmpE, Input2E, OutputInput2E, Input1E);
 
+   fma1 fma1 (.*);
 
-
-  fma1 fma1 (.*);
-
-  //first and only instance of floating-point divider
-  fpdiv fpdivsqrt (.*);
-
-  //first of two-stage instance of floating-point add/cvt unit
-  fpuaddcvt1 fpadd1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, Input1E, Input2E, FrmE, OpCtrlE, FmtE);
-
-  //first of two-stage instance of floating-point comparator
-  fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, Input1E, Input2E, OpCtrlE[1:0]);
-
-  //first and only instance of floating-point sign converter
-  fpusgn fpsgn (.*);
-
-  //interface between XLEN size datapath and double-precision sized
-  //floating-point results
-  //
-  //define offsets for LSB zero extension or truncation
-  always_comb begin
-
-  //truncate to 64 bits
-  //(causes warning during compilation - case never reached) 
-//   if(`XLEN > 64) begin // ***KEP this isn't usedand it causes a lint error
-//         DivOp1 = Input1E[`XLEN-1:`XLEN-64];
-// 	DivOp2 = Input2E[`XLEN-1:`XLEN-64];
-//         AddOp1E = Input1E[`XLEN-1:`XLEN-64];
-// 	AddOp2E = Input2E[`XLEN-1:`XLEN-64];
-//         CmpOp1E = Input1E[`XLEN-1:`XLEN-64];
-// 	CmpOp2E = Input2E[`XLEN-1:`XLEN-64];
-//         SgnOp1E = Input1E[`XLEN-1:`XLEN-64];
-// 	SgnOp2E = Input2E[`XLEN-1:`XLEN-64];
-//   end
-//   //zero extend to 64 bits
-//   else begin
-//         DivOp1 = {Input1E,{64-`XLEN{1'b0}}};
-// 	DivOp2 = {Input2E,{64-`XLEN{1'b0}}};
-//         AddOp1E = {Input1E,{64-`XLEN{1'b0}}};
-// 	AddOp2E = {Input2E,{64-`XLEN{1'b0}}};
-//         CmpOp1E = {Input1E,{64-`XLEN{1'b0}}};
-// 	CmpOp2E = {Input2E,{64-`XLEN{1'b0}}};
-//         SgnOp1E = {Input1E,{64-`XLEN{1'b0}}};
-// 	SgnOp2E = {Input2E,{64-`XLEN{1'b0}}};
-//   end
-
-  //assign op codes
-  AddOpTypeE[3:0] = OpCtrlE[3:0];
-  CmpSelE[1:0] = OpCtrlE[1:0];
-  DivOpType = OpCtrlE[0];
-  SgnOpCodeE[1:0] = OpCtrlE[1:0];
-
-  end 
-
-  //E stage control signal interfacing between wally spec and OSU fp hardware
-  //op codes
-
-  //
-  //END EXECUTION STAGE
-  //#########################################
-
-  //*****************************************
-  //BEGIN E/M PIPE
-  //
-
-  //wally-spec M stage control logic signal instantiation
-  logic                    FRegWriteM;
-  logic [2:0]              FResultSelM;
-  logic [2:0]              FrmM;
-  logic                    FmtM;
-  logic [3:0]              OpCtrlM;
-
-  //instantiate M stage FMA signals here ***rename fma signals and resize for XLEN
-  logic [63:0]		FmaResultM;
-  logic [4:0]	 	FmaFlagsM;
-  logic [12:0]		aligncntM; 
-  logic [105:0]		rM; 
-  logic [105:0]		sM; 
-  logic [163:0]		tM;	
-  logic [8:0]		normcntM; 
-  logic [12:0]		aeM; 
-  logic 		bsM;
-  logic 		killprodM; 
-  logic 		prodofM; 
-  logic			xzeroM;
-  logic			yzeroM;
-  logic			zzeroM;
-  logic			xdenormM;
-  logic			ydenormM;
-  logic			zdenormM;
-  logic			xinfM;
-  logic			yinfM;
-  logic			zinfM;
-  logic			xnanM;
-  logic			ynanM;
-  logic			znanM;
-  logic			nanM;
-  logic	[8:0]		sumshiftM;
-  logic			sumshiftzeroM;
-  logic                 prodinfM;
-
-  //instantiation of M stage regfile signals
-  logic [4:0]              RdM;
-  logic [`XLEN-1:0]        Input1M, Input2M, Input3M;
-  logic [`XLEN-1:0]        LoadStoreResultM;
-
-  //instantiation of M stage add/cvt signals
-  logic [63:0]             AddResultM;
-  logic [4:0]              AddFlagsM;
-  logic                    AddDenormM;
-  logic [63:0]             AddSumM, AddSumTcM;
-  logic [3:0]              AddSelInvM;
-  logic [10:0]             AddExpPostSumM;
-  logic                    AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
-  logic                    AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
-  logic                    AddConvertM, AddSignM;
-  logic [63:0]             AddFloat1M, AddFloat2M;
-  logic [11:0]             AddExp1DenormM, AddExp2DenormM;
-  logic [10:0]             AddExponentM;
-  logic [63:0]             AddOp1M, AddOp2M;
-  logic [2:0]              AddRmM;
-  logic [3:0]              AddOpTypeM;
-  logic                    AddPM, AddOvEnM, AddUnEnM;  
-
-  //instantiation of M stage cmp signals
-  logic                    CmpInvalidM;
-  logic [1:0]              CmpFCCM; 
-  logic [7:0]              WM, XM;
-  logic                    ANaNM, BNaNM, AzeroM, BzeroM;
-  logic [63:0]             CmpOp1M, CmpOp2M;
-  logic [1:0]              CmpSelM;
-
-
-  //*****************
-  //fpregfile D/E pipe registers
-  //*****************
-  flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, Input1E, Input1M);
-  flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, Input2E, Input2M);
-  flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, Input3E, Input3M);
-
-  //*****************
-  //fma E/M pipe registers
-  //*****************  
-  flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
-  flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
-  flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
-  flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
-  flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
-  flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
-  flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
-  flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
-  flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
-  flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
-  flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
-  flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
-  flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
-  flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
-  flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
-  flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
-  flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
-  flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
-  flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
-  flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
-  flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
-  flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
-  flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
-  flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
-  flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
-
-  //*****************
-  //fpadd E/M pipe registers
-  //*****************
-  flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
-  flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
-  flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
-  flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
-  flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
-  flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
-  flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
-  flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
-  flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
-  flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
-  flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
-  flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
-  flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
-  flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
-  flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignM); 
-  flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
-  flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
-  flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
-  flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
-  flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
-  flopenrc #(64) EMRegAdd21(clk, reset, PipeClearEM, PipeEnableEM, AddOp1E, AddOp1M); 
-  flopenrc #(64) EMRegAdd22(clk, reset, PipeClearEM, PipeEnableEM, AddOp2E, AddOp2M); 
-  flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
-  flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
-  flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
-  flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
-  flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
-
-  //*****************
-  //fpcmp E/M pipe registers
-  //*****************
-  flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
-  flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
-  flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
-  flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
-  flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
-  flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
-  flopenrc #(64) EMRegCmp7(clk, reset, PipeClearEM, PipeEnableEM, CmpOp1E, CmpOp1M); 
-  flopenrc #(64) EMRegCmp8(clk, reset, PipeClearEM, PipeEnableEM, CmpOp2E, CmpOp2M); 
-  flopenrc #(2) EMRegCmp9(clk, reset, PipeClearEM, PipeEnableEM, CmpSelE, CmpSelM);
-
-  //put this in for the event we want to delay fsgn - will otherwise bypass
-  //*****************
-  //fpsgn E/M pipe registers
-  //***************** 
-  flopenrc #(2) EMRegSgn1(clk, reset, PipeClearEM, PipeEnableEM, SgnOpCodeE, SgnOpCodeM);
-  flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
-  flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
-
-  //*****************
-  //other E/M pipe registers
-  //*****************
-  flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FRegWriteE, FRegWriteM);
-  flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
-  flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
-  flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
-  flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
-  flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, OpCtrlE, OpCtrlM);
-  flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
-  flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
-
-  //
-  //END E/M PIPE
-  //*****************************************
-
-  //#########################################
-  //BEGIN MEMORY STAGE
-  //
-
-  
+   //first and only instance of floating-point divider
+   fpdiv fpdivsqrt (.*);
+   
+   //first of two-stage instance of floating-point add/cvt unit
+   fpuaddcvt1 fpadd1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, 
+		      AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, 
+		      AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, 
+		      AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, 
+		      AddExp1DenormE, AddExp2DenormE, AddExponentE, 
+		      Input1E, Input2E, FrmE, OpCtrlE, FmtE);
+   
+   //first of two-stage instance of floating-point comparator
+   fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, Input1E, Input2E, OpCtrlE[1:0]);
+   
+   //first and only instance of floating-point sign converter
+   fpusgn fpsgn (.*);
+   
+   //interface between XLEN size datapath and double-precision sized
+   //floating-point results
+   //
+   //define offsets for LSB zero extension or truncation
+   always_comb begin
+      
+      //truncate to 64 bits
+      //(causes warning during compilation - case never reached) 
+      //   if(`XLEN > 64) begin // ***KEP this isn't used and it causes a lint error
+      //         DivOp1 = Input1E[`XLEN-1:`XLEN-64];
+      // 	DivOp2 = Input2E[`XLEN-1:`XLEN-64];
+      //         AddOp1E = Input1E[`XLEN-1:`XLEN-64];
+      // 	AddOp2E = Input2E[`XLEN-1:`XLEN-64];
+      //         CmpOp1E = Input1E[`XLEN-1:`XLEN-64];
+      // 	CmpOp2E = Input2E[`XLEN-1:`XLEN-64];
+      //         SgnOp1E = Input1E[`XLEN-1:`XLEN-64];
+      // 	SgnOp2E = Input2E[`XLEN-1:`XLEN-64];
+      //   end
+      //   //zero extend to 64 bits
+      //   else begin
+      //         DivOp1 = {Input1E,{64-`XLEN{1'b0}}};
+      // 	DivOp2 = {Input2E,{64-`XLEN{1'b0}}};
+      //         AddOp1E = {Input1E,{64-`XLEN{1'b0}}};
+      // 	AddOp2E = {Input2E,{64-`XLEN{1'b0}}};
+      //         CmpOp1E = {Input1E,{64-`XLEN{1'b0}}};
+      // 	CmpOp2E = {Input2E,{64-`XLEN{1'b0}}};
+      //         SgnOp1E = {Input1E,{64-`XLEN{1'b0}}};
+      // 	SgnOp2E = {Input2E,{64-`XLEN{1'b0}}};
+      //   end
+      
+      //assign op codes
+      AddOpTypeE[3:0] = OpCtrlE[3:0];
+      CmpSelE[1:0] = OpCtrlE[1:0];
+      DivOpType = OpCtrlE[0];
+      SgnOpCodeE[1:0] = OpCtrlE[1:0];
+      
+   end 
+   
+   //E stage control signal interfacing between wally spec and OSU fp hardware
+   //op codes
+   
+   //wally-spec M stage control logic signal instantiation
+   logic                    FRegWriteM;
+   logic [2:0] 		    FResultSelM;
+   logic [2:0] 		    FrmM;
+   logic                    FmtM;
+   logic [3:0] 		    OpCtrlM;
+   
+   //instantiate M stage FMA signals here ***rename fma signals and resize for XLEN
+   logic [63:0] 	    FmaResultM;
+   logic [4:0] 		    FmaFlagsM;
+   logic [12:0] 	    aligncntM; 
+   logic [105:0] 	    rM; 
+   logic [105:0] 	    sM; 
+   logic [163:0] 	    tM;	
+   logic [8:0] 		    normcntM; 
+   logic [12:0] 	    aeM; 
+   logic 		    bsM;
+   logic 		    killprodM; 
+   logic 		    prodofM; 
+   logic 		    xzeroM;
+   logic 		    yzeroM;
+   logic 		    zzeroM;
+   logic 		    xdenormM;
+   logic 		    ydenormM;
+   logic 		    zdenormM;
+   logic 		    xinfM;
+   logic 		    yinfM;
+   logic 		    zinfM;
+   logic 		    xnanM;
+   logic 		    ynanM;
+   logic 		    znanM;
+   logic 		    nanM;
+   logic [8:0] 		    sumshiftM;
+   logic 		    sumshiftzeroM;
+   logic 		    prodinfM;
+   
+   //instantiation of M stage regfile signals
+   logic [4:0] 		    RdM;
+   logic [`XLEN-1:0] 	    Input1M, Input2M, Input3M;
+   logic [`XLEN-1:0] 	    LoadStoreResultM;
+   
+   //instantiation of M stage add/cvt signals
+   logic [63:0] 	    AddResultM;
+   logic [4:0] 		    AddFlagsM;
+   logic                    AddDenormM;
+   logic [63:0] 	    AddSumM, AddSumTcM;
+   logic [3:0] 		    AddSelInvM;
+   logic [10:0] 	    AddExpPostSumM;
+   logic                    AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
+   logic                    AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
+   logic                    AddConvertM, AddSignM;
+   logic [63:0] 	    AddFloat1M, AddFloat2M;
+   logic [11:0] 	    AddExp1DenormM, AddExp2DenormM;
+   logic [10:0] 	    AddExponentM;
+   logic [63:0] 	    AddOp1M, AddOp2M;
+   logic [2:0] 		    AddRmM;
+   logic [3:0] 		    AddOpTypeM;
+   logic                    AddPM, AddOvEnM, AddUnEnM;  
+   
+   //instantiation of M stage cmp signals
+   logic                    CmpInvalidM;
+   logic [1:0] 		    CmpFCCM; 
+   logic [7:0] 		    WM, XM;
+   logic                    ANaNM, BNaNM, AzeroM, BzeroM;
+   logic [63:0] 	    CmpOp1M, CmpOp2M;
+   logic [1:0] 		    CmpSelM;
+   
+   
+   //*****************
+   //fpregfile E/M pipe registers
+   //*****************
+   flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, Input1E, Input1M);
+   flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, Input2E, Input2M);
+   flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, Input3E, Input3M);
+   
+   //*****************
+   //fma E/M pipe registers
+   //*****************  
+   flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
+   flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
+   flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
+   flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
+   flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
+   flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
+   flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
+   flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
+   flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
+   flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
+   flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
+   flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
+   flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
+   flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
+   flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
+   flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
+   flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
+   flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
+   flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
+   flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
+   flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
+   flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
+   flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
+   flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
+   flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
+   
+   //*****************
+   //fpadd E/M pipe registers
+   //*****************
+   flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
+   flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
+   flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
+   flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
+   flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
+   flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
+   flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
+   flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
+   flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
+   flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
+   flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
+   flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
+   flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
+   flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
+   flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignM); 
+   flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
+   flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
+   flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
+   flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
+   flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
+   flopenrc #(64) EMRegAdd21(clk, reset, PipeClearEM, PipeEnableEM, AddOp1E, AddOp1M); 
+   flopenrc #(64) EMRegAdd22(clk, reset, PipeClearEM, PipeEnableEM, AddOp2E, AddOp2M); 
+   flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
+   flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
+   flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
+   flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
+   flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
+   
+   //*****************
+   //fpcmp E/M pipe registers
+   //*****************
+   flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
+   flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
+   flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
+   flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
+   flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
+   flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
+   flopenrc #(64) EMRegCmp7(clk, reset, PipeClearEM, PipeEnableEM, CmpOp1E, CmpOp1M); 
+   flopenrc #(64) EMRegCmp8(clk, reset, PipeClearEM, PipeEnableEM, CmpOp2E, CmpOp2M); 
+   flopenrc #(2) EMRegCmp9(clk, reset, PipeClearEM, PipeEnableEM, CmpSelE, CmpSelM);
+   
+   //put this in for the event we want to delay fsgn - will otherwise bypass
+   //*****************
+   //fpsgn E/M pipe registers
+   //***************** 
+   flopenrc #(2) EMRegSgn1(clk, reset, PipeClearEM, PipeEnableEM, SgnOpCodeE, SgnOpCodeM);
+   flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
+   flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
+   
+   //*****************
+   //other E/M pipe registers
+   //*****************
+   flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FRegWriteE, FRegWriteM);
+   flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
+   flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
+   flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
+   flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
+   flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, OpCtrlE, OpCtrlM);
+   flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
+   flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
+   
   assign FWriteDataM = Input1M;
-
   mux2  #(64)  LoadStoreResultMux(HRDATA, Input1M, |OpCtrlM[2:1], LoadStoreResultM);
-
   fma2 fma2(.*);
 
-  //second instance of two-stage floating-point add/cvt unit
-  fpuaddcvt2 fpadd2 (.*);
-
-  //second instance of two-stage floating-point comparator
-  fpucmp2 fpcmp2 (CmpInvalidM, CmpFCCM, ANaNM, BNaNM, AzeroM, BzeroM, WM, XM, CmpSelM, CmpOp1M, CmpOp2M);
-
-  //
-  //END MEMORY STAGE
-  //#########################################
-
-
-  //*****************************************
-  //BEGIN M/W PIPE
-  //
-  
-  //wally-spec W stage control logic signal instantiation
-  logic [2:0]              FResultSelW;
-
-  //instantiate W stage fma signals here
-  logic [63:0]             FmaResultW;
-  logic [4:0]              FmaFlagsW;
-
-  //instantiation of W stage div/sqrt signals
-  logic                    DivDenormW;
-  logic [63:0]             DivResultW;
-  logic [4:0]              DivFlagsW;
-
-  //instantiation of W stage fsgn signals
-  logic [63:0]            SgnResultW;
-  logic [4:0]             SgnFlagsW;
-
-  //instantiation of W stage regfile signals
-  logic [`XLEN-1:0]        LoadStoreResultW;
-  logic [`XLEN-1:0]        SrcAW;
-
-  //instantiation of W stage add/cvt signals
-  logic [63:0]             AddResultW;
-  logic [4:0]              AddFlagsW;
-  logic                    AddDenormW;
-
-  //instantiation of W stage cmp signals
-  logic [63:0]             CmpResultW;
-  logic                    CmpInvalidW;
-  logic [1:0]              CmpFCCW; 
-
-  //instantiation of W stage classify signals
-  logic [63:0]             ClassResultW;
-  logic [4:0]              ClassFlagsW;
-
-  //*****************
-  //fma M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
-  flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
-
-  //*****************
-  //fpdiv M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, DivResultM, DivResultW); 
-  flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, DivFlagsM, DivFlagsW);
-  flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
-
-  //*****************
-  //fpadd M/W pipe registers
-  //*****************
-  flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, AddResultM, AddResultW); 
-  flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, AddFlagsM, AddFlagsW); 
-  flopenrc #(1) MWRegAdd3(clk, reset, PipeClearMW, PipeEnableMW, AddDenormM, AddDenormW); 
-
-  //*****************
-  //fpcmp M/W pipe registers
-  //*****************
-  flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
-  flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
-
-  //*****************
-  //fpsgn M/W pipe registers
-  //***************** 
-  flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
-  flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
-
-  //*****************
-  //other M/W pipe registers
-  //*****************
-  flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FRegWriteM, FRegWriteW);
-  flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
-  flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
-  flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
-  flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
-  flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, LoadStoreResultM, LoadStoreResultW);
-  flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
-
-  ////END M/W PIPE
-  //*****************************************
-
-
-  //#########################################
-  //BEGIN WRITEBACK STAGE
-  //
-
-  //flag signal mux via in-line ternaries
-  logic [4:0] FPUFlagsW;
-  //if bit 2 is active set to sign flags - otherwise:
-  //iff bit one is high - if bit zero is active set to fma flags - otherwise
-  //set to cmp flags
-  //iff bit one is low - if bit zero is active set to add/cvt flags - otherwise
-  //set to div/sqrt flags
-  //assign FPUFlagsW = (FResultSelW[2]) ? (SgnFlagsW) : (
-//	             (FResultSelW[1]) ? 
-//		     ( (FResultSelW[0]) ? (FmaFlagsW) : ({CmpInvalidW,4'b0000}) ) 
-//		     : ( (FResultSelW[0]) ? (AddFlagsW) : (DivFlagsW) ) 
-//                     );
-  always_comb begin
-	case (FResultSelW)
-		// div/sqrt
-		3'b000 : FPUFlagsW = DivFlagsW;
-		// cmp		
-		3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
-		//fma/mult
-		3'b010 : FPUFlagsW = FmaFlagsW;
-		// sgn inj
-		3'b011 : FPUFlagsW = SgnFlagsW;
-		// add/sub/cnvt
-		3'b100 : FPUFlagsW = AddFlagsW;
-		// classify
-		3'b101 : FPUFlagsW = ClassFlagsW;
-		// output SrcAW
-		3'b110 : FPUFlagsW = 5'b0;
-		// output ReadData1
-		3'b111 : FPUFlagsW = 5'b0;
-		default : FPUFlagsW = 5'bxxxxx;
-	endcase
-  end
-
-  //result mux via in-line ternaries
-  //the uses the same logic as for flag signals
-  //assign FPUResultDirW = (FResultSelW[2]) ? (SgnResultW) : (
-  //	             (FResultSelW[1]) ? 
-  //		     ( (FResultSelW[0]) ? (FmaResultW) : ({62'b0,CmpFCCW}) ) 
-  //		     : ( (FResultSelW[0]) ? (AddResultW) : (DivResultW) ) 
-  //                   );
-
-
-  always_comb begin
-	case (FResultSelW)
-		// div/sqrt
-		3'b000 : FPUResultDirW = DivResultW;
-		// cmp		
-		3'b001 : FPUResultDirW = CmpResultW;
-		//fma/mult
-		3'b010 : FPUResultDirW = FmaResultW;
-		// sgn inj
-		3'b011 : FPUResultDirW = SgnResultW;
-		// add/sub/cnvt
-		3'b100 : FPUResultDirW = AddResultW;
-		// classify
-		3'b101 : FPUResultDirW = ClassResultW;
-		// output SrcAW
-		3'b110 : FPUResultDirW = SrcAW;
-		// Load/Store/Move to FP-register
-		3'b111 : FPUResultDirW = LoadStoreResultW;
-		default : FPUResultDirW = {64{1'bx}};
-	endcase
-  end
-  //interface between XLEN size datapath and double-precision sized
-  //floating-point results
-  //
-  //define offsets for LSB zero extension or truncation
-  always_comb begin
-           
-  //zero extension  
-
-// Teo 04/13/2021
-// Commented out XLENDIFF{1'b0} due to error:
-// Repetition multiplier must be constant.
-
-  //if(`XLEN > 64) begin
-  //    FPUResultW = {FPUResultDirW,{XLENDIFF{1'b0}}};
-  //end
-  //truncate
-  //else begin
+   //second instance of two-stage floating-point add/cvt unit
+   fpuaddcvt2 fpadd2 (.*);
+   
+   //second instance of two-stage floating-point comparator
+   fpucmp2 fpcmp2 (CmpInvalidM, CmpFCCM, ANaNM, BNaNM, AzeroM, BzeroM, WM, XM, CmpSelM, CmpOp1M, CmpOp2M);
+   
+   //wally-spec W stage control logic signal instantiation
+   logic [2:0] 		    FResultSelW;
+   
+   //instantiate W stage fma signals here
+   logic [63:0] 	    FmaResultW;
+   logic [4:0] 		    FmaFlagsW;
+   
+   //instantiation of W stage div/sqrt signals
+   logic                    DivDenormW;
+   logic [63:0] 	    DivResultW;
+   logic [4:0] 		    DivFlagsW;
+   
+   //instantiation of W stage fsgn signals
+   logic [63:0] 	    SgnResultW;
+   logic [4:0] 		    SgnFlagsW;
+   
+   //instantiation of W stage regfile signals
+   logic [`XLEN-1:0] 	    LoadStoreResultW;
+   logic [`XLEN-1:0] 	    SrcAW;
+   
+   //instantiation of W stage add/cvt signals
+   logic [63:0] 	    AddResultW;
+   logic [4:0] 		    AddFlagsW;
+   logic                    AddDenormW;
+   
+   //instantiation of W stage cmp signals
+   logic [63:0] 	    CmpResultW;
+   logic                    CmpInvalidW;
+   logic [1:0] 		    CmpFCCW; 
+   
+   //instantiation of W stage classify signals
+   logic [63:0] 	    ClassResultW;
+   logic [4:0] 		    ClassFlagsW;
+   
+   //*****************
+   //fma M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
+   flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
+   
+   //*****************
+   //fpdiv M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, DivResultM, DivResultW); 
+   flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, DivFlagsM, DivFlagsW);
+   flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
+   
+   //*****************
+   //fpadd M/W pipe registers
+   //*****************
+   flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, AddResultM, AddResultW); 
+   flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, AddFlagsM, AddFlagsW); 
+   flopenrc #(1) MWRegAdd3(clk, reset, PipeClearMW, PipeEnableMW, AddDenormM, AddDenormW); 
+   
+   //*****************
+   //fpcmp M/W pipe registers
+   //*****************
+   flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
+   flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
+   
+   //*****************
+   //fpsgn M/W pipe registers
+   //***************** 
+   flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
+   flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
+   
+   //*****************
+   //other M/W pipe registers
+   //*****************
+   flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FRegWriteM, FRegWriteW);
+   flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
+   flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
+   flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
+   flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
+   flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, LoadStoreResultM, LoadStoreResultW);
+   flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
+   
+   //flag signal mux via in-line ternaries
+   logic [4:0] 		    FPUFlagsW;
+   //if bit 2 is active set to sign flags - otherwise:
+   //iff bit one is high - if bit zero is active set to fma flags - otherwise
+   //set to cmp flags
+   //iff bit one is low - if bit zero is active set to add/cvt flags - otherwise
+   //set to div/sqrt flags
+   //assign FPUFlagsW = (FResultSelW[2]) ? (SgnFlagsW) : (
+   //	             (FResultSelW[1]) ? 
+   //		     ( (FResultSelW[0]) ? (FmaFlagsW) : ({CmpInvalidW,4'b0000}) ) 
+   //		     : ( (FResultSelW[0]) ? (AddFlagsW) : (DivFlagsW) ) 
+   //                     );
+   always_comb begin
+      case (FResultSelW)
+	// div/sqrt
+	3'b000 : FPUFlagsW = DivFlagsW;
+	// cmp		
+	3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
+	//fma/mult
+	3'b010 : FPUFlagsW = FmaFlagsW;
+	// sgn inj
+	3'b011 : FPUFlagsW = SgnFlagsW;
+	// add/sub/cnvt
+	3'b100 : FPUFlagsW = AddFlagsW;
+	// classify
+	3'b101 : FPUFlagsW = ClassFlagsW;
+	// output SrcAW
+	3'b110 : FPUFlagsW = 5'b0;
+	// output ReadData1
+	3'b111 : FPUFlagsW = 5'b0;
+	default : FPUFlagsW = 5'bxxxxx;
+      endcase
+   end
+   
+   //result mux via in-line ternaries
+   //this uses the same logic as for the flag signals
+   //assign FPUResultDirW = (FResultSelW[2]) ? (SgnResultW) : (
+   //	             (FResultSelW[1]) ? 
+   //		     ( (FResultSelW[0]) ? (FmaResultW) : ({62'b0,CmpFCCW}) ) 
+   //		     : ( (FResultSelW[0]) ? (AddResultW) : (DivResultW) ) 
+   //                   );
+   
+   
+   always_comb begin
+      case (FResultSelW)
+	// div/sqrt
+	3'b000 : FPUResultDirW = DivResultW;
+	// cmp		
+	3'b001 : FPUResultDirW = CmpResultW;
+	//fma/mult
+	3'b010 : FPUResultDirW = FmaResultW;
+	// sgn inj
+	3'b011 : FPUResultDirW = SgnResultW;
+	// add/sub/cnvt
+	3'b100 : FPUResultDirW = AddResultW;
+	// classify
+	3'b101 : FPUResultDirW = ClassResultW;
+	// output SrcAW
+	3'b110 : FPUResultDirW = SrcAW;
+	// Load/Store/Move to FP-register
+	3'b111 : FPUResultDirW = LoadStoreResultW;
+	default : FPUResultDirW = {64{1'bx}};
+      endcase
+   end
+   //interface between XLEN size datapath and double-precision sized
+   //floating-point results
+   //
+   //define offsets for LSB zero extension or truncation
+   always_comb begin
+      
+      //zero extension  
+      
+      // Teo 04/13/2021
+      // Commented out XLENDIFF{1'b0} due to error:
+      // Repetition multiplier must be constant.
+      
+      //if(`XLEN > 64) begin
+      //    FPUResultW = {FPUResultDirW,{XLENDIFF{1'b0}}};
+      //end
+      //truncate
+      //else begin
       FPUResultW = FPUResultDirW[63:64-`XLEN];
       SetFflagsM = FPUFlagsW;
-  //end
+      //end
+      
+   end  
+   
+endmodule // fpu
 
-  end  
-
-  //
-  //END WRITEBACK STAGE
-  //#########################################
-
-
-
-endmodule

From e7190b06903f8fd2162824beddbd2d88b26ed0e7 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Tue, 25 May 2021 20:04:34 -0400
Subject: [PATCH 04/14] renamed top level FPU wires

---
 wally-pipelined/src/fpu/fctrl.sv              |   43 +-
 wally-pipelined/src/fpu/fma1.sv               |   24 +-
 wally-pipelined/src/fpu/fma2.sv               |   28 +-
 wally-pipelined/src/fpu/fpdiv.sv              |   54 +-
 wally-pipelined/src/fpu/fpu.sv                | 1080 ++++++++---------
 wally-pipelined/src/fpu/fpuaddcvt1.sv         |  121 +-
 wally-pipelined/src/fpu/fpuaddcvt2.sv         |   46 +-
 wally-pipelined/src/fpu/fpucmp1.sv            |    2 +-
 wally-pipelined/src/fpu/fpuhazard.sv          |   40 +-
 wally-pipelined/src/fpu/fsgn.sv               |   16 +-
 wally-pipelined/src/fpu/special.sv            |   62 +-
 .../src/wally/wallypipelinedhart.sv           |    2 +-
 12 files changed, 707 insertions(+), 811 deletions(-)

diff --git a/wally-pipelined/src/fpu/fctrl.sv b/wally-pipelined/src/fpu/fctrl.sv
index f24368e1..840c9530 100755
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@@ -6,16 +6,15 @@ module fctrl (
   input  logic [2:0] Funct3D,
   input  logic [2:0] FRM_REGW,
   output logic       IllegalFPUInstrD,
-  output logic       FRegWriteD,
-  output logic       DivSqrtStartD,
-  //output logic [2:0] regSelD,
+  output logic       FWriteEnD,
+  output logic       FDivStartD,
   output logic [2:0] FResultSelD,
-  output logic [3:0] OpCtrlD,
+  output logic [3:0] FOpCtrlD,
   output logic       FmtD,
   output logic [2:0] FrmD,
   output logic [1:0] FMemRWD,
-  output logic       OutputInput2D,
-  output logic       In2UsedD, In3UsedD,
+  output logic       FOutputInput2D,
+  output logic       FInput2UsedD, FInput3UsedD,
   output logic       FWriteIntD);
 
 
@@ -102,9 +101,9 @@ module fctrl (
     end
   end
 
-  assign OutputInput2D = OpD == 7'b0100111;
+  assign FOutputInput2D = OpD == 7'b0100111;
 
-  assign FMemRWD[0] = OutputInput2D;
+  assign FMemRWD[0] = FOutputInput2D;
   assign FMemRWD[1] = OpD == 7'b0000111;
 
 
@@ -131,7 +130,7 @@ module fctrl (
   //this value is used enough to be shorthand
 
   //if op is div/sqrt - start div/sqrt
-  assign DivSqrtStartD = ~|FResultSelD; // is FResultSelD == 000
+  assign FDivStartD = ~|FResultSelD; // is FResultSelD == 000
 
   //operation control for each fp operation
   //has to be expanded over standard to account for
@@ -144,7 +143,7 @@ module fctrl (
   //version I used for this repo
 
   //let's do separate SOP for each type of operation
-//  assign OpCtrlD[3] = 1'b0;
+//  assign FOpCtrlD[3] = 1'b0;
 //
 //
 
@@ -152,12 +151,12 @@ module fctrl (
  
   always_comb begin
     IllegalFPUInstr1D = 0;
-    In3UsedD = 0;
+    FInput3UsedD = 0;
     case (FResultSelD)
       // div/sqrt
       //  fdiv  = ???0
       //  fsqrt = ???1
-      3'b000 : begin OpCtrlD = {3'b0, Funct7D[5]}; In2UsedD = ~Funct7D[5]; end
+      3'b000 : begin FOpCtrlD = {3'b0, Funct7D[5]}; FInput2UsedD = ~Funct7D[5]; end
       // cmp		
       //  fmin = ?100
       //  fmax = ?101
@@ -165,7 +164,7 @@ module fctrl (
       //  flt  = ?001
       //  fle  = ?011
       //		   {?,    is min or max, is eq or le, is lt or le}
-      3'b001 : begin OpCtrlD = {1'b0, Funct7D[2], ~Funct3D[0], ~(|Funct3D[2:1])}; In2UsedD = 1'b1; end
+      3'b001 : begin FOpCtrlD = {1'b0, Funct7D[2], ~Funct3D[0], ~(|Funct3D[2:1])}; FInput2UsedD = 1'b1; end
       //fma/mult	
       //  fmadd  = ?000
       //  fmsub  = ?001
@@ -173,12 +172,12 @@ module fctrl (
       //  fnmsub = ?011
       //  fmul   = ?100
       //		  {?, is mul, is negitive, is sub}
-      3'b010 : begin OpCtrlD = {1'b0, OpD[4:2]}; In2UsedD = 1'b1; In3UsedD = ~OpD[4]; end
+      3'b010 : begin FOpCtrlD = {1'b0, OpD[4:2]}; FInput2UsedD = 1'b1; FInput3UsedD = ~OpD[4]; end
       // sgn inj
       //  fsgnj  = ??00
       //  fsgnjn = ??01
       //  fsgnjx = ??10
-      3'b011 : begin OpCtrlD = {2'b0, Funct3D[1:0]}; In2UsedD = 1'b1; end
+      3'b011 : begin FOpCtrlD = {2'b0, Funct3D[1:0]}; FInput2UsedD = 1'b1; end
       // add/sub/cnvt
       //  fadd      = 0000
       //  fsub      = 0001
@@ -193,13 +192,13 @@ module fctrl (
       //  fcvt.d.wu = 1111
       //  fcvt.d.s  = 1000
       //		   { is double and not add/sub, is to/from int, is to int or float to double,      is unsigned or sub
-      3'b100 : begin OpCtrlD = {Funct7D[0]&Funct7D[5], Funct7D[6], Funct7D[3] | (~Funct7D[6]&Funct7D[5]&~Funct7D[0]), Rs2D[0]|(Funct7D[2]&~Funct7D[5])}; In2UsedD = ~Funct7D[5]; end
+      3'b100 : begin FOpCtrlD = {Funct7D[0]&Funct7D[5], Funct7D[6], Funct7D[3] | (~Funct7D[6]&Funct7D[5]&~Funct7D[0]), Rs2D[0]|(Funct7D[2]&~Funct7D[5])}; FInput2UsedD = ~Funct7D[5]; end
       // classify	  {?, ?, ?, ?}
-      3'b101 : begin OpCtrlD = 4'b0; In2UsedD = 1'b0; end
+      3'b101 : begin FOpCtrlD = 4'b0; FInput2UsedD = 1'b0; end
       // output SrcAW
       //  fmv.w.x = ???0
       //  fmv.w.d = ???1
-      3'b110 : begin OpCtrlD = {3'b0, Funct7D[0]}; In2UsedD = 1'b0; end
+      3'b110 : begin FOpCtrlD = {3'b0, Funct7D[0]}; FInput2UsedD = 1'b0; end
       // output Input1
       //  flw       = ?000
       //  fld       = ?001 
@@ -207,9 +206,9 @@ module fctrl (
       //  fsd       = ?011 // output Input2
       //  fmv.x.w  = ?100
       //  fmv.x.d  = ?101
-      //		   {?, is mv, is store, is double or fcvt.d.w}
-      3'b111 : begin OpCtrlD = {1'b0, OpD[6:5], Funct3D[0] | (OpD[6]&Funct7D[0])}; In2UsedD = OpD[5]; end
-      default : begin OpCtrlD = 4'b0; IllegalFPUInstr1D = 1'b1; In2UsedD = 1'b0; end
+      //		   {?, is mv, is store, is double or fmv}
+      3'b111 : begin FOpCtrlD = {1'b0, OpD[6:5], Funct3D[0] | (OpD[6]&Funct7D[0])}; FInput2UsedD = OpD[5]; end
+      default : begin FOpCtrlD = 4'b0; IllegalFPUInstr1D = 1'b1; FInput2UsedD = 1'b0; end
     endcase
   end
 
@@ -219,5 +218,5 @@ module fctrl (
   //			is add/cvt       and  is to int  or is classify		 or     is cmp	       	and not max/min or is output ReadData1 and is mv
   assign FWriteIntD = ((FResultSelD == 3'b100)&Funct7D[3]) | (FResultSelD == 3'b101) | ((FResultSelD == 3'b001)&~Funct7D[2]) | ((FResultSelD == 3'b111)&OpD[6]);
   // 		      if not writting to int reg and not a store function and not move
-  assign FRegWriteD = ~FWriteIntD & ~OpD[5] & ~((FResultSelD == 3'b111)&OpD[6]) & isFP;
+  assign FWriteEnD = ~FWriteIntD & ~OpD[5] & ~((FResultSelD == 3'b111)&OpD[6]) & isFP;
 endmodule
diff --git a/wally-pipelined/src/fpu/fma1.sv b/wally-pipelined/src/fpu/fma1.sv
index 59b51600..e4f818c8 100644
--- a/wally-pipelined/src/fpu/fma1.sv
+++ b/wally-pipelined/src/fpu/fma1.sv
@@ -15,13 +15,13 @@
 //    normalize Normalization shifter
 //    round     Rounding of result
 //    exception Handles exceptional cases
-//    bypass    Handles bypass of result to Input1E or Input3E inputs
+//    bypass    Handles bypass of result to FInput1E or FInput3E inputs
 //    sign      One bit sign handling block 
 //    special   Catch special cases (inputs = 0  / infinity /  etc.) 
 //
-//   The FMAC computes FmaResultM=Input1E*Input2E+Input3E, rounded with the mode specified by
+//   The FMAC computes FmaResultM=FInput1E*FInput2E+FInput3E, rounded with the mode specified by
 //   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the Input1E or Input3E inputs for use on the next cycle.  In addition,  four signals
+//   the FInput1E or FInput3E inputs for use on the next cycle.  In addition,  four signals
 //   are produced: trap, overflow, underflow, and inexact.  Trap indicates
 //   an infinity, NaN, or denormalized number to be handled in software;
 //   the other three signals are IEEE flags.
@@ -29,15 +29,15 @@
 /////////////////////////////////////////////////////////////////////////////
 
 /////////////////////////////////////////////////////////////////////////////
-module fma1(Input1E, Input2E, Input3E, FrmE,  
+module fma1(FInput1E, FInput2E, FInput3E, FrmE,  
 			rE, sE, tE, bsE, killprodE, sumshiftE, sumshiftzeroE,  aligncntE, aeE
 			, xzeroE, yzeroE, zzeroE, xnanE,ynanE, znanE, xdenormE, ydenormE, zdenormE,
 			xinfE, yinfE, zinfE, nanE, prodinfE);
 /////////////////////////////////////////////////////////////////////////////
  
-	input logic 		[63:0]		Input1E;		// input 1
-	input logic		[63:0]		Input2E;     // input 2 
-	input logic 		[63:0]		Input3E;     // input 3
+	input logic 		[63:0]		FInput1E;		// input 1
+	input logic		[63:0]		FInput2E;     // input 2 
+	input logic 		[63:0]		FInput3E;     // input 3
 	input logic 		[2:0]	 	FrmE;          	// Rounding mode
 	output logic 		[12:0]		aligncntE;    	// status flags
 	output logic 		[105:0]		rE; 				// one result of partial product sum
@@ -45,7 +45,7 @@ module fma1(Input1E, Input2E, Input3E, FrmE,
 	output logic 		[163:0]		tE;				// output logic of alignment shifter	
 	output logic 		[12:0]		aeE; 		// multiplier expoent
 	output logic 					bsE;				// sticky bit of addend
-	output logic 					killprodE; 		// Input3E >> product
+	output logic 					killprodE; 		// FInput3E >> product
 	output logic					xzeroE;
 	output logic					yzeroE;
 	output logic					zzeroE;
@@ -68,7 +68,7 @@ module fma1(Input1E, Input2E, Input3E, FrmE,
 //	output logic 		[12:0]		aligncntE; 		// shift count for alignment
 
 
-	logic 					prodof; 		// Input1E*Input2E out of range
+	logic 					prodof; 		// FInput1E*FInput2E out of range
 
 
 
@@ -84,12 +84,12 @@ module fma1(Input1E, Input2E, Input3E, FrmE,
 
 //   Instantiate fraction datapath
 
-	multiply		multiply(.xman(Input1E[51:0]), .yman(Input2E[51:0]), .*);
-	align			align(.zman(Input3E[51:0]),.*);
+	multiply		multiply(.xman(FInput1E[51:0]), .yman(FInput2E[51:0]), .*);
+	align			align(.zman(FInput3E[51:0]),.*);
 
 // Instantiate exponent datapath
 
-	expgen1			expgen1(.xexp(Input1E[62:52]),.yexp(Input2E[62:52]),.zexp(Input3E[62:52]),.*);
+	expgen1			expgen1(.xexp(FInput1E[62:52]),.yexp(FInput2E[62:52]),.zexp(FInput3E[62:52]),.*);
 // Instantiate special case detection across datapath & exponent path 
 
 	special			special(.*);
diff --git a/wally-pipelined/src/fpu/fma2.sv b/wally-pipelined/src/fpu/fma2.sv
index 23e6bb6b..467a4d28 100644
--- a/wally-pipelined/src/fpu/fma2.sv
+++ b/wally-pipelined/src/fpu/fma2.sv
@@ -15,13 +15,13 @@
 //    normalize Normalization shifter
 //    round     Rounding of result
 //    exception Handles exceptional cases
-//    bypass    Handles bypass of result to Input1M or Input3M input logics
+//    bypass    Handles bypass of result to FInput1M or FInput3M input logics
 //    sign      One bit sign handling block 
 //    special   Catch special cases (input logics = 0  / infinity /  etc.) 
 //
-//   The FMAC computes FmaResultM=Input1M*Input2M+Input3M, rounded with the mode specified by
+//   The FMAC computes FmaResultM=FInput1M*FInput2M+FInput3M, rounded with the mode specified by
 //   RN, RZ, RM, or RP.  The result is optionally bypassed back to
-//   the Input1M or Input3M input logics for use on the next cycle.  In addition,  four signals
+//   the FInput1M or FInput3M input logics for use on the next cycle.  In addition,  four signals
 //   are produced: trap, overflow, underflow, and inexact.  Trap indicates
 //   an infinity, NaN, or denormalized number to be handled in software;
 //   the other three signals are IMMM flags.
@@ -29,7 +29,7 @@
 /////////////////////////////////////////////////////////////////////////////
 
 /////////////////////////////////////////////////////////////////////////////
-module fma2(Input1M, Input2M, Input3M, FrmM,
+module fma2(FInput1M, FInput2M, FInput3M, FrmM,
 			FmaResultM, FmaFlagsM, aligncntM, rM, sM,
 			tM,	normcntM, aeM, bsM,killprodM,
 			xzeroM,	yzeroM,zzeroM,xdenormM,ydenormM,
@@ -39,9 +39,9 @@ module fma2(Input1M, Input2M, Input3M, FrmM,
 );
 /////////////////////////////////////////////////////////////////////////////
  
-	input logic 		[63:0]		Input1M;		// input logic 1
-	input logic		[63:0]		Input2M;     // input logic 2 
-	input logic 		[63:0]		Input3M;     // input logic 3
+	input logic 		[63:0]		FInput1M;		// input logic 1
+	input logic		[63:0]		FInput2M;     // input logic 2 
+	input logic 		[63:0]		FInput3M;     // input logic 3
 	input logic 		[2:0]	 	FrmM;          	// Rounding mode
 	input logic 		[12:0]		aligncntM;    	// status flags
 	input logic 		[105:0]		rM; 				// one result of partial product sum
@@ -50,7 +50,7 @@ module fma2(Input1M, Input2M, Input3M, FrmM,
 	input logic 		[8:0]		normcntM; 		// shift count for normalizer
 	input logic 		[12:0]		aeM; 		// multiplier expoent
 	input logic 					bsM;				// sticky bit of addend
-	input logic 					killprodM; 		// Input3M >> product
+	input logic 					killprodM; 		// FInput3M >> product
 	input logic					prodinfM;
 	input logic					xzeroM;
 	input logic					yzeroM;
@@ -69,7 +69,7 @@ module fma2(Input1M, Input2M, Input3M, FrmM,
 	input logic					sumshiftzeroM;
 
 
-	output logic 		[63:0]		FmaResultM;     // output FmaResultM=Input1M*Input2M+Input3M
+	output logic 		[63:0]		FmaResultM;     // output FmaResultM=FInput1M*FInput2M+FInput3M
 	output logic 		[4:0]		FmaFlagsM;    	// status flags
 	
 
@@ -120,18 +120,18 @@ module fma2(Input1M, Input2M, Input3M, FrmM,
 
 	add				add(.*);
 	lza				lza(.*);
-	normalize		normalize(.zexp(Input3M[62:52]),.*); 
-	round			round(.xman(Input1M[51:0]), .yman(Input2M[51:0]),.zman(Input3M[51:0]),.*);
+	normalize		normalize(.zexp(FInput3M[62:52]),.*); 
+	round			round(.xman(FInput1M[51:0]), .yman(FInput2M[51:0]),.zman(FInput3M[51:0]),.*);
 
 // Instantiate exponent datapath
 
-	expgen2			expgen2(.xexp(Input1M[62:52]),.yexp(Input2M[62:52]),.zexp(Input3M[62:52]),.*);
+	expgen2			expgen2(.xexp(FInput1M[62:52]),.yexp(FInput2M[62:52]),.zexp(FInput3M[62:52]),.*);
 
 
 // Instantiate control logic
  
-sign				sign(.xsign(Input1M[63]),.ysign(Input2M[63]),.zsign(Input3M[63]),.*); 
-flag2				flag2(.xsign(Input1M[63]),.ysign(Input2M[63]),.zsign(Input3M[63]),.vbits(v[1:0]),.*); 
+sign				sign(.xsign(FInput1M[63]),.ysign(FInput2M[63]),.zsign(FInput3M[63]),.*); 
+flag2				flag2(.xsign(FInput1M[63]),.ysign(FInput2M[63]),.zsign(FInput3M[63]),.vbits(v[1:0]),.*); 
 
 assign FmaResultM = {wsign,wexp,wman};
 
diff --git a/wally-pipelined/src/fpu/fpdiv.sv b/wally-pipelined/src/fpu/fpdiv.sv
index 1574b79e..0d493359 100755
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@@ -23,25 +23,25 @@
 //
 
 // `timescale 1ps/1ps
-module fpdiv (DivSqrtDone, DivResultM, DivFlagsM, DivDenormM, DivOp1, DivOp2, DivFrm, DivOpType, DivP, DivOvEn, DivUnEn,
-	      DivStart, reset, clk, DivBusyM);
+module fpdiv (FDivSqrtDoneM, FDivResultM, FDivFlagsM, DivDenormM, FInput1E, FInput2E, FrmE, DivOpType, FmtE, DivOvEn, DivUnEn,
+	      FDivStartE, reset, clk, DivBusyM);
 
-   input [63:0] DivOp1;		// 1st input operand (A)
-   input [63:0] DivOp2;		// 2nd input operand (B)
-   input [2:0] 	DivFrm;		// Rounding mode - specify values 
+   input [63:0] FInput1E;		// 1st input operand (A)
+   input [63:0] FInput2E;		// 2nd input operand (B)
+   input [2:0] 	FrmE;		// Rounding mode - specify values 
    input 	DivOpType;	// Function opcode
-   input 	DivP;   		// Result Precision (0 for double, 1 for single)
+   input 	FmtE;   		// Result Precision (0 for double, 1 for single) -- NOTE(review): fpu.sv comments Fmt as 0-single 1-double; confirm polarity
    input 	DivOvEn;		// Overflow trap enabled
    input 	DivUnEn;   	// Underflow trap enabled
 
-   input 	DivStart;
+   input 	FDivStartE;
    input 	reset;
    input 	clk;   
 
-   output [63:0] DivResultM;	// Result of operation
-   output [4:0]  DivFlagsM;   	// IEEE exception flags 
+   output [63:0] FDivResultM;	// Result of operation
+   output [4:0]  FDivFlagsM;   	// IEEE exception flags 
    output 	 DivDenormM;   	// DivDenormM on input or output
-   output 	 DivSqrtDone;
+   output 	 FDivSqrtDoneM;
    output    DivBusyM;
 
    supply1 	  vdd;
@@ -94,16 +94,16 @@ module fpdiv (DivSqrtDone, DivResultM, DivFlagsM, DivDenormM, DivOp1, DivOp2, Di
    
    logic exp_cout1, exp_cout2, exp_odd, open;
    // Convert the input operands to their appropriate forms based on 
-   // the orignal operands, the DivOpType , and their precision DivP. 
+   // the original operands, the DivOpType, and their precision FmtE.
    // Single precision inputs are converted to double precision 
    // and the sign of the first operand is set appropratiately based on
    // if the operation is absolute value or negation. 
-   convert_inputs_div divconv1 (Float1, Float2, DivOp1, DivOp2, DivOpType, DivP);
+   convert_inputs_div divconv1 (Float1, Float2, FInput1E, FInput2E, DivOpType, FmtE);
 
    // Test for exceptions and return the "Invalid Operation" and
-   // "Denormalized" Input DivFlagsM. The "sel_inv" is used in
+   // "Denormalized" Input flags. The "sel_inv" is used in
    // the third pipeline stage to select the result. Also, op1_Norm
-   // and op2_Norm are one if DivOp1 and DivOp2 are not zero or denormalized.
+   // and op2_Norm are one if FInput1E and FInput2E are not zero or denormalized.
    // sub is one if the effective operation is subtaction. 
    exception_div divexc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
 		   Float1, Float2, DivOpType);
@@ -135,26 +135,26 @@ module fpdiv (DivSqrtDone, DivResultM, DivFlagsM, DivDenormM, DivOp1, DivOp2, Di
 		  sel_muxa, sel_muxb, sel_muxr, 
 		  reset, clk,
 		  load_rega, load_regb, load_regc, load_regd,
-		  load_regr, load_regs, DivP, DivOpType, exp_odd);
+		  load_regr, load_regs, FmtE, DivOpType, exp_odd);
 
    // FSM : control divider
-   fsm control (DivSqrtDone, load_rega, load_regb, load_regc, load_regd, 
+   fsm control (FDivSqrtDoneM, load_rega, load_regb, load_regc, load_regd, 
 		load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
-		clk, reset, DivStart, DivOpType, DivBusyM);
+		clk, reset, FDivStartE, DivOpType, DivBusyM);
    
    // Round the mantissa to a 52-bit value, with the leading one
    // removed. The rounding units also handles special cases and 
    // set the exception flags.
    //***add max magnitude and swap negitive and positive infinity
    rounder_div divround1 (Result, DenormIO, FlagsIn, 
-		   DivFrm, DivP, DivOvEn, DivUnEn, expF, 
+		   FrmE, FmtE, DivOvEn, DivUnEn, expF, 
    		   sel_inv, Invalid, DenormIn, signResult, 
 		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
 
    // Store the final result and the exception flags in registers.
-   flopenr #(64) rega (clk, reset, DivSqrtDone, Result, DivResultM);
-   flopenr #(1) regb (clk, reset, DivSqrtDone, DenormIO, DivDenormM);   
-   flopenr #(5) regc (clk, reset, DivSqrtDone, FlagsIn, DivFlagsM);   
+   flopenr #(64) rega (clk, reset, FDivSqrtDoneM, Result, FDivResultM);
+   flopenr #(1) regb (clk, reset, FDivSqrtDoneM, DenormIO, DivDenormM);   
+   flopenr #(5) regc (clk, reset, FDivSqrtDoneM, FlagsIn, FDivFlagsM);   
    
 endmodule // fpadd
 
@@ -198,7 +198,7 @@ module brent_kung (c, p, g);
    logic G_7_0,G_11_0,G_5_0,G_9_0,G_13_0,G_2_0,G_4_0,G_6_0,G_8_0,G_10_0,G_12_0;
    // parallel-prefix, Brent-Kung
 
-   // Stage 1: Generates G/DivP pairs that span 1 bits
+   // Stage 1: Generates G/P pairs that span 1 bit
    grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
    black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
    black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
@@ -207,20 +207,20 @@ module brent_kung (c, p, g);
    black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
    black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
 
-   // Stage 2: Generates G/DivP pairs that span 2 bits
+   // Stage 2: Generates G/P pairs that span 2 bits
    grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
    black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
    black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
 
-   // Stage 3: Generates G/DivP pairs that span 4 bits
+   // Stage 3: Generates G/P pairs that span 4 bits
    grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
 
-   // Stage 4: Generates G/DivP pairs that span 8 bits
+   // Stage 4: Generates G/P pairs that span 8 bits
 
-   // Stage 5: Generates G/DivP pairs that span 4 bits
+   // Stage 5: Generates G/P pairs that span 4 bits
    grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
 
-   // Stage 6: Generates G/DivP pairs that span 2 bits
+   // Stage 6: Generates G/P pairs that span 2 bits
    grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
    grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
    grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index cbc0f482..9f40300a 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -23,10 +23,8 @@
 ///////////////////////////////////////////
 
 `include "wally-config.vh"
-//  `include "../../config/rv64icfd/wally-config.vh" //debug
 
 module fpu (
-  //input  logic [2:0]       FrmD,
   input  logic [2:0]       FRM_REGW,    // Rounding mode from CSR
   input  logic             reset,
   //input  logic             clear,     // *** not being used anywhere
@@ -42,605 +40,501 @@ module fpu (
   output logic [31:0]      FSROutW,
   output logic [1:0]       FMemRWM,
 	output logic             FStallD,
-  output logic             FWriteIntW,
-  output logic             FWriteIntM,
-  output logic [`XLEN-1:0] FWriteDataM,       // Integer input being written into fpreg
-  output logic             DivSqrtDoneE,
+  output logic             FWriteIntM, FWriteIntW,
+  output logic [`XLEN-1:0] FWriteDataM,
+  output logic             FDivSqrtDoneM,
   output logic             IllegalFPUInstrD,
   output logic [`XLEN-1:0] FPUResultW);
 
-   //NOTE:
-   //For readability and ease of modification, logic signals will be
-   //instantiated as they occur within the pipeline. This will keep local
-   //signals, modules, and combinational logic closely defined.
-   
-   //used for OSU DP-size hardware to wally XLEN interfacing
-   
-   integer 		   XLENDIFF;
-   assign XLENDIFF = `XLEN - 64;
-   integer 		   XLENDIFFN;
-   assign XLENDIFFN = 63 - `XLEN;
-   
-   // BEGIN PIPELINE CONTROL LOGIC
-   logic 		   PipeEnableDE;
-   logic 		   PipeEnableEM;
-   logic 		   PipeEnableMW;
-   logic 		   PipeClearDE;
-   logic 		   PipeClearEM;
-   logic 		   PipeClearMW;
-   
-   //temporarily assign pipe clear and enable signals
-   //to never flush & always be running
-   localparam PipeClear = 1'b0;
-   localparam PipeEnable = 1'b1;
-   always_comb begin
-      PipeEnableDE = ~StallE;
-      PipeEnableEM = ~StallM;
-      PipeEnableMW = ~StallW;
-      PipeClearDE = FlushE;
-      PipeClearEM = FlushM;
-      PipeClearMW = FlushW;
-   end   
-   
-   // Wally-spec D stage control logic signal instantiation
-   logic                    FRegWriteD;
-   logic [2:0] 		    FResultSelD;
-   logic [2:0] 		    FrmD;
-   logic                    FmtD;
-   logic                    DivSqrtStartD;
-   logic [3:0] 		    OpCtrlD;
-   logic                    FWriteIntD;
-   logic                    OutputInput2D;
-   logic [1:0] 		    FMemRWD;
-   
-   logic 		    DivBusyM;
-   logic [1:0] 		    Input1MuxD, Input2MuxD;
-   logic 		    Input3MuxD;
-   logic                    In2UsedD, In3UsedD;
-   
-   //Hazard unit for FPU
-   fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
-   
-   //top-level controller for FPU
-   fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
-   
-   //instantiation of D stage regfile signals (includes some W stage signals
-   //for easy reference)
-   logic [2:0] 		    FrmW;
-   logic                    FmtW;
-   logic                    FRegWriteW;
-   logic [4:0] 		    RdW, Rs1D, Rs2D, Rs3D;
-   logic [`XLEN-1:0] 	    WriteDataW;
-   logic [63:0] 	    FPUResultDirW; 
-   logic [`XLEN-1:0] 	    ReadData1D, ReadData2D, ReadData3D; 
-   
-   //regfile instantiation
-   //freg3adr fpregfile (FmtW, reset, PipeClear, clk, RdW, 
-   //		       FRegWriteW, 
-   //		       InstrD[19:15], InstrD[24:20], InstrD[31:27], 
-   //		       FPUResultDirW, 
-   //		       ReadData1D, ReadData2D, ReadData3D);
-   FPregfile fpregfile (clk, reset, FRegWriteW,
+
+
+
+
+  //control logic signal instantiation
+  logic             FWriteEnD, FWriteEnE, FWriteEnM, FWriteEnW;             // FP register write enable
+  logic [2:0]       FrmD, FrmE, FrmM, FrmW;                                 // FP rounding mode
+  logic             FmtD, FmtE, FmtM, FmtW;                                 // FP precision 0-single 1-double
+  logic             FDivStartD, FDivStartE;                                 // Start division
+  logic             FWriteIntD, FWriteIntE;                                 // Write to integer register
+  logic             FOutputInput2D, FOutputInput2E;                         // Put Input2 in Input1 if a store instruction
+  logic [1:0]       FMemRWD, FMemRWE;                                       // Read and write enable for memory
+  logic [1:0]       FForwardInput1D, FForwardInput1E;                       // Input1 forwarding mux control signal
+  logic [1:0]       FForwardInput2D, FForwardInput2E;                       // Input2 forwarding mux control signal
+  logic             FForwardInput3D, FForwardInput3E;                       // Input3 forwarding mux control signal
+  logic             FInput2UsedD;                                           // Is input 2 used
+  logic             FInput3UsedD;                                           // Is input 3 used
+  logic [2:0]       FResultSelD, FResultSelE, FResultSelM, FResultSelW;     // Select FP result
+  logic [3:0]       FOpCtrlD, FOpCtrlE, FOpCtrlM;                           // Select which operation to do in each component
+  
+  // regfile signals
+  logic [4:0]       RdE, RdM, RdW; // ***Can take from ieu
+  logic [`XLEN-1:0] FWDM;                                                   // Write data for FP register
+  logic [`XLEN-1:0] FRD1D, FRD2D, FRD3D;                                    // Read Data from FP register
+  logic [`XLEN-1:0] FRD1E, FRD2E, FRD3E;
+  logic [`XLEN-1:0] FInput1E, FInput1M, FInput1tmpE;
+  logic [`XLEN-1:0] FInput2E, FInput2M;
+  logic [`XLEN-1:0] FInput3E, FInput3M;
+  logic [`XLEN-1:0] FLoadStoreResultM, FLoadStoreResultW;                   // Result for load, store, and move to int-reg instructions
+
+  // div/sqrt signals
+  logic             DivDenormM, DivDenormW;
+  logic             DivOvEn, DivUnEn;
+  logic             DivBusyM;
+  logic [63:0]      FDivResultM, FDivResultW;
+  logic [4:0]       FDivFlagsM, FDivFlagsW;
+
+  // FMA signals
+  logic [12:0]		  aligncntE, aligncntM; 
+  logic [105:0]		  rE, rM; 
+  logic [105:0]		  sE, sM; 
+  logic [163:0]		  tE, tM;	
+  logic [8:0]		    normcntE, normcntM; 
+  logic [12:0]		  aeE, aeM; 
+  logic 		        bsE, bsM;
+  logic 		        killprodE, killprodM; 
+  logic 		        prodofE, prodofM; 
+  logic			        xzeroE, xzeroM;
+  logic			        yzeroE, yzeroM;
+  logic			        zzeroE, zzeroM;
+  logic			        xdenormE, xdenormM;
+  logic			        ydenormE, ydenormM;
+  logic			        zdenormE, zdenormM;
+  logic			        xinfE, xinfM;
+  logic			        yinfE, yinfM;
+  logic			        zinfE, zinfM;
+  logic			        xnanE, xnanM;
+  logic			        ynanE, ynanM;
+  logic			        znanE, znanM;
+  logic			        nanE, nanM;
+  logic	[8:0]		    sumshiftE, sumshiftM;
+  logic			        sumshiftzeroE, sumshiftzeroM;
+  logic             prodinfE, prodinfM;
+  logic [63:0]      FmaResultM, FmaResultW;
+  logic [4:0]       FmaFlagsM, FmaFlagsW;
+  
+  // add/cvt signals
+  logic [63:0]      AddSumE, AddSumTcE;
+  logic [3:0]       AddSelInvE;
+  logic [10:0]      AddExpPostSumE;
+  logic             AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
+  logic             AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
+  logic             AddConvertE;
+  logic [63:0]      AddFloat1E, AddFloat2E;
+  logic [11:0]      AddExp1DenormE, AddExp2DenormE;
+  logic [10:0]      AddExponentE;
+  logic [2:0]       AddRmE;
+  logic [3:0]       AddOpTypeE;
+  logic             AddPE, AddOvEnE, AddUnEnE;    
+  logic             AddDenormM;
+  logic [63:0]      AddSumM, AddSumTcM;
+  logic [3:0]       AddSelInvM;
+  logic [10:0]      AddExpPostSumM;
+  logic             AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
+  logic             AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
+  logic             AddConvertM, AddSignM;
+  logic [63:0]      AddFloat1M, AddFloat2M;
+  logic [11:0]      AddExp1DenormM, AddExp2DenormM;
+  logic [10:0]      AddExponentM;
+  logic [63:0]      AddOp1M, AddOp2M;
+  logic [2:0]       AddRmM;
+  logic [3:0]       AddOpTypeM;
+  logic             AddPM, AddOvEnM, AddUnEnM;  
+  logic [63:0]      FAddResultM, FAddResultW;
+  logic [4:0]       FAddFlagsM, FAddFlagsW;
+
+  //cmp signals 
+  logic [7:0]       WE, WM;
+  logic [7:0]       XE, XM;
+  logic             ANaNE, ANaNM;
+  logic             BNaNE, BNaNM;
+  logic             AzeroE, AzeroM;
+  logic             BzeroE, BzeroM;
+  logic             CmpInvalidM, CmpInvalidW;
+  logic [1:0]       CmpFCCM, CmpFCCW; 
+  logic [63:0]      FCmpResultW;
+
+  // fsgn signals
+  logic [63:0]      SgnResultE, SgnResultM, SgnResultW;
+  logic [4:0]       SgnFlagsE, SgnFlagsM, SgnFlagsW;
+
+  //instantiation of W stage regfile signals
+  logic [`XLEN-1:0] SrcAW;
+
+  // classify signals
+  logic [63:0]      ClassResultE, ClassResultM, ClassResultW;
+  logic [4:0]       ClassFlagsE, ClassFlagsM, ClassFlagsW;
+
+  // other
+  logic [63:0]      FPUResult64W, FPUResult64E;                                           // 64-bit FPU result
+  logic [4:0]       FPUFlagsW;
+
+  // pipeline control logic
+  logic	                   PipeEnableDE;
+  logic	                   PipeEnableEM;
+  logic	                   PipeEnableMW;
+  logic                    PipeClearDE;
+  logic                    PipeClearEM;
+  logic                    PipeClearMW;
+
+  //PipeClear/PipeEnable are placeholder constants (never flush, always run);
+  //the per-stage enables/clears below are driven by StallE/M/W and FlushE/M/W
+  localparam PipeClear = 1'b0;
+  localparam PipeEnable = 1'b1;
+  always_comb begin
+
+	  PipeEnableDE = ~StallE;
+	  PipeEnableEM = ~StallM;
+	  PipeEnableMW = ~StallW;
+	  PipeClearDE = FlushE;
+	  PipeClearEM = FlushM;
+	  PipeClearMW = FlushW;
+
+  end
+
+ 
+
+
+
+
+
+
+
+
+
+
+
+  //DECODE STAGE
+
+  //Hazard unit for FPU
+  fpuhazard hazard(.Adr1(InstrD[19:15]), .Adr2(InstrD[24:20]), .Adr3(InstrD[31:27]), .*);
+
+  //top-level controller for FPU
+  fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .*);
+
+
+  //regfile instantiation
+   FPregfile fpregfile (clk, reset, FWriteEnW,
 			InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
-			FPUResultDirW,
-			ReadData1D, ReadData2D, ReadData3D);		
+			FPUResult64W,
+			FRD1D, FRD2D, FRD3D);	
+
+
+
+
+
+
+
+
+
+  //*****************
+  //fpregfile D/E pipe registers
+  //*****************
+  flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, FRD1D, FRD1E);
+  flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, FRD2D, FRD2E);
+  flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, FRD3D, FRD3E);
+
+  //*****************
+  //other  D/E pipe registers
+  //*****************
+  flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FWriteEnD, FWriteEnE);
+  flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
+  flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
+  flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
+  flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
+  flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, FOpCtrlD, FOpCtrlE);
+  flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, FDivStartD, FDivStartE);
+  flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput1D, FForwardInput1E);
+  flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput2D, FForwardInput2E);
+  flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, FForwardInput3D, FForwardInput3E);
+  flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResult64W, FPUResult64E);
+  flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
+  flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, FOutputInput2D, FOutputInput2E);
+  flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
+
+
+
+
+
+
+
+
+
+
+
+
+
+  //EXECUTION STAGE
+
+
 
-  // wally-spec E stage control logic signal instantiation
-   logic                    FRegWriteE;
-   logic [2:0] 		    FResultSelE;
-   logic [2:0] 		    FrmE;
-   logic                    FmtE;
-   logic                    DivSqrtStartE;
-   logic [3:0] 		    OpCtrlE;
-   logic [1:0] 		    Input1MuxE, Input2MuxE;
-   logic                    Input3MuxE;
-   logic [63:0] 	    FPUResultDirE;
-   logic                    FWriteIntE;
-   logic                    OutputInput2E;
-   logic [1:0] 		    FMemRWE;
-   
-   //instantiation of E stage regfile signals
-   logic [4:0] 		    RdE;
-   logic [`XLEN-1:0] 	    ReadData1E, ReadData2E, ReadData3E;
-   logic [`XLEN-1:0] 	    Input1E, Input2E, Input3E, Input1tmpE;
-   
-   //instantiation of E/M stage div/sqrt signals
-   logic                    DivSqrtDone, DivDenormM;
-   logic [63:0] 	    DivResultM;
-   logic [4:0] 		    DivFlagsM;
-   logic [63:0] 	    DivOp1, DivOp2;
-   logic [2:0] 		    DivFrm;
-   logic                    DivOpType;
-   logic                    DivP;
-   logic                    DivOvEn, DivUnEn;
-   logic                    DivStart;
-   
-   //instantiate E stage FMA signals here
-   logic [12:0] 	    aligncntE; 
-   logic [105:0] 	    rE; 
-   logic [105:0] 	    sE; 
-   logic [163:0] 	    tE;	
-   logic [8:0] 		    normcntE; 
-   logic [12:0] 	    aeE; 
-   logic 		    bsE;
-   logic 		    killprodE; 
-   logic 		    prodofE; 
-   logic 		    xzeroE;
-   logic 		    yzeroE;
-   logic 		    zzeroE;
-   logic 		    xdenormE;
-   logic 		    ydenormE;
-   logic 		    zdenormE;
-   logic 		    xinfE;
-   logic 		    yinfE;
-   logic 		    zinfE;
-   logic 		    xnanE;
-   logic 		    ynanE;
-   logic 		    znanE;
-   logic 		    nanE;
-   logic [8:0] 		    sumshiftE;
-   logic 		    sumshiftzeroE;
-   logic 		    prodinfE;
-   
-   //instantiation of E stage add/cvt signals
-   logic [63:0] 	    AddSumE, AddSumTcE;
-   logic [3:0] 		    AddSelInvE;
-   logic [10:0] 	    AddExpPostSumE;
-   logic                    AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE;
-   logic                    AddDenormInE, AddSwapE, AddNormOvflowE, AddSignAE;
-   logic                    AddConvertE;
-   logic [63:0] 	    AddFloat1E, AddFloat2E;
-   logic [11:0] 	    AddExp1DenormE, AddExp2DenormE;
-   logic [10:0] 	    AddExponentE;
-   logic [63:0] 	    AddOp1E, AddOp2E;
-   logic [2:0] 		    AddRmE;
-   logic [3:0] 		    AddOpTypeE;
-   logic                    AddPE, AddOvEnE, AddUnEnE;  
-   
-   //instantiation of E stage cmp signals 
-   logic [7:0] 		    WE, XE;
-   logic                    ANaNE, BNaNE, AzeroE, BzeroE;
-   logic [63:0] 	    CmpOp1E, CmpOp2E;
-   logic [1:0] 		    CmpSelE;
-   
-   //instantiation of E/M stage fsgn signals (due to bypass logic)
-   logic [63:0] 	    SgnOp1E, SgnOp2E;
-   logic [1:0] 		    SgnOpCodeE, SgnOpCodeM;
-   logic [63:0] 	    SgnResultE, SgnResultM;
-   logic [4:0] 		    SgnFlagsE, SgnFlagsM;
-   
-   //*****************
-   //fpregfile D/E pipe registers
-   //*****************
-   flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, ReadData1D, ReadData1E);
-   flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, ReadData2D, ReadData2E);
-   flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, ReadData3D, ReadData3E);
-   
-   //*****************
-   //other  D/E pipe registers
-   //*****************
-   flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FRegWriteD, FRegWriteE);
-   flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
-   flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
-   flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, FmtD, FmtE);
-   flopenrc #(5) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, InstrD[11:7], RdE);
-   flopenrc #(4) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, OpCtrlD, OpCtrlE);
-   flopenrc #(1) DEReg10(clk, reset, PipeClearDE, PipeEnableDE, DivSqrtStartD, DivSqrtStartE);
-   flopenrc #(2) DEReg11(clk, reset, PipeClearDE, PipeEnableDE, Input1MuxD, Input1MuxE);
-   flopenrc #(2) DEReg12(clk, reset, PipeClearDE, PipeEnableDE, Input2MuxD, Input2MuxE);
-   flopenrc #(1) DEReg13(clk, reset, PipeClearDE, PipeEnableDE, Input3MuxD, Input3MuxE);
-   flopenrc #(64) DEReg14(clk, reset, PipeClearDE, PipeEnableDE, FPUResultDirW, FPUResultDirE);
-   flopenrc #(1) DEReg15(clk, reset, PipeClearDE, PipeEnableDE, FWriteIntD, FWriteIntE);
-   flopenrc #(1) DEReg16(clk, reset, PipeClearDE, PipeEnableDE, OutputInput2D, OutputInput2E);
-   flopenrc #(2) DEReg17(clk, reset, PipeClearDE, PipeEnableDE, FMemRWD, FMemRWE);
-   
   // input muxs for forwarding
-   mux4  #(64)  Input1Emux(ReadData1E, FPUResultDirW, FPUResultDirE, SrcAM, Input1MuxE, Input1tmpE);
-   mux3  #(64)  Input2Emux(ReadData2E, FPUResultDirW, FPUResultDirE, Input2MuxE, Input2E);
-   mux2  #(64)  Input3Emux(ReadData3E, FPUResultDirE, Input3MuxE, Input3E);
-   mux2  #(64)  OutputInput2mux(Input1tmpE, Input2E, OutputInput2E, Input1E);
+  mux4  #(64)  FInput1Emux(FRD1E, FPUResult64W, FPUResult64E, SrcAM, FForwardInput1E, FInput1tmpE);
+  mux3  #(64)  FInput2Emux(FRD2E, FPUResult64W, FPUResult64E, FForwardInput2E, FInput2E);
+  mux2  #(64)  FInput3Emux(FRD3E, FPUResult64E, FForwardInput3E, FInput3E);
+  mux2  #(64)  FOutputInput2mux(FInput1tmpE, FInput2E, FOutputInput2E, FInput1E);
 
-   fma1 fma1 (.*);
+  fma1 fma1 (.*);
+
+  //first and only instance of floating-point divider
+  fpdiv fpdivsqrt (.DivOpType(FOpCtrlE[0]), .*);
+
+  //first of two-stage instance of floating-point add/cvt unit
+  fpuaddcvt1 fpadd1 (.*);
+
+  //first of two-stage instance of floating-point comparator
+  fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, FInput1E, FInput2E, FOpCtrlE[1:0]);
+
+  //first and only instance of floating-point sign converter
+  fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+  //*****************
+  //fpregfile E/M pipe registers
+  //*****************
+  flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, FInput1E, FInput1M);
+  flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, FInput2E, FInput2M);
+  flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, FInput3E, FInput3M);
+
+  //*****************
+  //fma E/M pipe registers
+  //*****************  
+  flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
+  flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
+  flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
+  flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
+  flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
+  flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
+  flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
+  flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
+  flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
+  flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
+  flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
+  flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
+  flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
+  flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
+  flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
+  flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
+  flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
+  flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
+  flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
+  flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
+  flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
+  flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
+  flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
+  flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
+  flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
+
+  //*****************
+  //fpadd E/M pipe registers
+  //*****************
+  flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
+  flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
+  flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
+  flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
+  flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
+  flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
+  flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
+  flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
+  flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
+  flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
+  flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
+  flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
+  flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
+  flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
+  flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignM); 
+  flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
+  flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
+  flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
+  flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
+  flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
+  flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
+  flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
+  flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
+  flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
+  flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
+
+  //*****************
+  //fpcmp E/M pipe registers
+  //*****************
+  flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
+  flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
+  flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
+  flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
+  flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
+  flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
+
+  //put this in for the event we want to delay fsgn - will otherwise bypass
+  //*****************
+  //fpsgn E/M pipe registers
+  //***************** 
+  flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
+  flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
+
+  //*****************
+  //other E/M pipe registers
+  //*****************
+  flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FWriteEnE, FWriteEnM);
+  flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
+  flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
+  flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
+  flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
+  flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, FOpCtrlE, FOpCtrlM);
+  flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
+  flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
+
+
+
+
+
+
+
+
+
+
+  //BEGIN MEMORY STAGE
+
+  assign FWriteDataM = FInput1M;
+
+  mux2  #(64)  FLoadStoreResultMux(HRDATA, FInput1M, |FOpCtrlM[2:1], FLoadStoreResultM);
 
-   //first and only instance of floating-point divider
-   fpdiv fpdivsqrt (.*);
-   
-   //first of two-stage instance of floating-point add/cvt unit
-   fpuaddcvt1 fpadd1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, 
-		      AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, 
-		      AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, 
-		      AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, 
-		      AddExp1DenormE, AddExp2DenormE, AddExponentE, 
-		      Input1E, Input2E, FrmE, OpCtrlE, FmtE);
-   
-   //first of two-stage instance of floating-point comparator
-   fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, Input1E, Input2E, OpCtrlE[1:0]);
-   
-   //first and only instance of floating-point sign converter
-   fpusgn fpsgn (.*);
-   
-   //interface between XLEN size datapath and double-precision sized
-   //floating-point results
-   //
-   //define offsets for LSB zero extension or truncation
-   always_comb begin
-      
-      //truncate to 64 bits
-      //(causes warning during compilation - case never reached) 
-      //   if(`XLEN > 64) begin // ***KEP this isn't usedand it causes a lint error
-      //         DivOp1 = Input1E[`XLEN-1:`XLEN-64];
-      // 	DivOp2 = Input2E[`XLEN-1:`XLEN-64];
-      //         AddOp1E = Input1E[`XLEN-1:`XLEN-64];
-      // 	AddOp2E = Input2E[`XLEN-1:`XLEN-64];
-      //         CmpOp1E = Input1E[`XLEN-1:`XLEN-64];
-      // 	CmpOp2E = Input2E[`XLEN-1:`XLEN-64];
-      //         SgnOp1E = Input1E[`XLEN-1:`XLEN-64];
-      // 	SgnOp2E = Input2E[`XLEN-1:`XLEN-64];
-      //   end
-      //   //zero extend to 64 bits
-      //   else begin
-      //         DivOp1 = {Input1E,{64-`XLEN{1'b0}}};
-      // 	DivOp2 = {Input2E,{64-`XLEN{1'b0}}};
-      //         AddOp1E = {Input1E,{64-`XLEN{1'b0}}};
-      // 	AddOp2E = {Input2E,{64-`XLEN{1'b0}}};
-      //         CmpOp1E = {Input1E,{64-`XLEN{1'b0}}};
-      // 	CmpOp2E = {Input2E,{64-`XLEN{1'b0}}};
-      //         SgnOp1E = {Input1E,{64-`XLEN{1'b0}}};
-      // 	SgnOp2E = {Input2E,{64-`XLEN{1'b0}}};
-      //   end
-      
-      //assign op codes
-      AddOpTypeE[3:0] = OpCtrlE[3:0];
-      CmpSelE[1:0] = OpCtrlE[1:0];
-      DivOpType = OpCtrlE[0];
-      SgnOpCodeE[1:0] = OpCtrlE[1:0];
-      
-   end 
-   
-   //E stage control signal interfacing between wally spec and OSU fp hardware
-   //op codes
-   
-   //wally-spec M stage control logic signal instantiation
-   logic                    FRegWriteM;
-   logic [2:0] 		    FResultSelM;
-   logic [2:0] 		    FrmM;
-   logic                    FmtM;
-   logic [3:0] 		    OpCtrlM;
-   
-   //instantiate M stage FMA signals here ***rename fma signals and resize for XLEN
-   logic [63:0] 	    FmaResultM;
-   logic [4:0] 		    FmaFlagsM;
-   logic [12:0] 	    aligncntM; 
-   logic [105:0] 	    rM; 
-   logic [105:0] 	    sM; 
-   logic [163:0] 	    tM;	
-   logic [8:0] 		    normcntM; 
-   logic [12:0] 	    aeM; 
-   logic 		    bsM;
-   logic 		    killprodM; 
-   logic 		    prodofM; 
-   logic 		    xzeroM;
-   logic 		    yzeroM;
-   logic 		    zzeroM;
-   logic 		    xdenormM;
-   logic 		    ydenormM;
-   logic 		    zdenormM;
-   logic 		    xinfM;
-   logic 		    yinfM;
-   logic 		    zinfM;
-   logic 		    xnanM;
-   logic 		    ynanM;
-   logic 		    znanM;
-   logic 		    nanM;
-   logic [8:0] 		    sumshiftM;
-   logic 		    sumshiftzeroM;
-   logic 		    prodinfM;
-   
-   //instantiation of M stage regfile signals
-   logic [4:0] 		    RdM;
-   logic [`XLEN-1:0] 	    Input1M, Input2M, Input3M;
-   logic [`XLEN-1:0] 	    LoadStoreResultM;
-   
-   //instantiation of M stage add/cvt signals
-   logic [63:0] 	    AddResultM;
-   logic [4:0] 		    AddFlagsM;
-   logic                    AddDenormM;
-   logic [63:0] 	    AddSumM, AddSumTcM;
-   logic [3:0] 		    AddSelInvM;
-   logic [10:0] 	    AddExpPostSumM;
-   logic                    AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM;
-   logic                    AddDenormInM, AddSwapM, AddNormOvflowM, AddSignAM;
-   logic                    AddConvertM, AddSignM;
-   logic [63:0] 	    AddFloat1M, AddFloat2M;
-   logic [11:0] 	    AddExp1DenormM, AddExp2DenormM;
-   logic [10:0] 	    AddExponentM;
-   logic [63:0] 	    AddOp1M, AddOp2M;
-   logic [2:0] 		    AddRmM;
-   logic [3:0] 		    AddOpTypeM;
-   logic                    AddPM, AddOvEnM, AddUnEnM;  
-   
-   //instantiation of M stage cmp signals
-   logic                    CmpInvalidM;
-   logic [1:0] 		    CmpFCCM; 
-   logic [7:0] 		    WM, XM;
-   logic                    ANaNM, BNaNM, AzeroM, BzeroM;
-   logic [63:0] 	    CmpOp1M, CmpOp2M;
-   logic [1:0] 		    CmpSelM;
-   
-   
-   //*****************
-   //fpregfile D/E pipe registers
-   //*****************
-   flopenrc #(64) EMFpReg1(clk, reset, PipeClearEM, PipeEnableEM, Input1E, Input1M);
-   flopenrc #(64) EMFpReg2(clk, reset, PipeClearEM, PipeEnableEM, Input2E, Input2M);
-   flopenrc #(64) EMFpReg3(clk, reset, PipeClearEM, PipeEnableEM, Input3E, Input3M);
-   
-   //*****************
-   //fma E/M pipe registers
-   //*****************  
-   flopenrc #(13) EMRegFma1(clk, reset, PipeClearEM, PipeEnableEM, aligncntE, aligncntM); 
-   flopenrc #(106) EMRegFma2(clk, reset, PipeClearEM, PipeEnableEM, rE, rM); 
-   flopenrc #(106) EMRegFma3(clk, reset, PipeClearEM, PipeEnableEM, sE, sM); 
-   flopenrc #(164) EMRegFma4(clk, reset, PipeClearEM, PipeEnableEM, tE, tM); 
-   flopenrc #(9) EMRegFma5(clk, reset, PipeClearEM, PipeEnableEM, normcntE, normcntM); 
-   flopenrc #(13) EMRegFma6(clk, reset, PipeClearEM, PipeEnableEM, aeE, aeM);  
-   flopenrc #(1) EMRegFma7(clk, reset, PipeClearEM, PipeEnableEM, bsE, bsM); 
-   flopenrc #(1) EMRegFma8(clk, reset, PipeClearEM, PipeEnableEM, killprodE, killprodM); 
-   flopenrc #(1) EMRegFma9(clk, reset, PipeClearEM, PipeEnableEM, prodofE, prodofM); 
-   flopenrc #(1) EMRegFma10(clk, reset, PipeClearEM, PipeEnableEM, xzeroE, xzeroM); 
-   flopenrc #(1) EMRegFma11(clk, reset, PipeClearEM, PipeEnableEM, yzeroE, yzeroM); 
-   flopenrc #(1) EMRegFma12(clk, reset, PipeClearEM, PipeEnableEM, zzeroE, zzeroM); 
-   flopenrc #(1) EMRegFma13(clk, reset, PipeClearEM, PipeEnableEM, xdenormE, xdenormM); 
-   flopenrc #(1) EMRegFma14(clk, reset, PipeClearEM, PipeEnableEM, ydenormE, ydenormM); 
-   flopenrc #(1) EMRegFma15(clk, reset, PipeClearEM, PipeEnableEM, zdenormE, zdenormM); 
-   flopenrc #(1) EMRegFma16(clk, reset, PipeClearEM, PipeEnableEM, xinfE, xinfM); 
-   flopenrc #(1) EMRegFma17(clk, reset, PipeClearEM, PipeEnableEM, yinfE, yinfM); 
-   flopenrc #(1) EMRegFma18(clk, reset, PipeClearEM, PipeEnableEM, zinfE, zinfM); 
-   flopenrc #(1) EMRegFma19(clk, reset, PipeClearEM, PipeEnableEM, xnanE, xnanM); 
-   flopenrc #(1) EMRegFma20(clk, reset, PipeClearEM, PipeEnableEM, ynanE, ynanM); 
-   flopenrc #(1) EMRegFma21(clk, reset, PipeClearEM, PipeEnableEM, znanE, znanM); 
-   flopenrc #(1) EMRegFma22(clk, reset, PipeClearEM, PipeEnableEM, nanE, nanM); 
-   flopenrc #(9) EMRegFma23(clk, reset, PipeClearEM, PipeEnableEM, sumshiftE, sumshiftM); 
-   flopenrc #(1) EMRegFma24(clk, reset, PipeClearEM, PipeEnableEM, sumshiftzeroE, sumshiftzeroM); 
-   flopenrc #(1) EMRegFma25(clk, reset, PipeClearEM, PipeEnableEM, prodinfE, prodinfM); 
-   
-   //*****************
-   //fpadd E/M pipe registers
-   //*****************
-   flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
-   flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
-   flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
-   flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
-   flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
-   flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
-   flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
-   flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
-   flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
-   flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
-   flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
-   flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
-   flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
-   flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
-   flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignM); 
-   flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
-   flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
-   flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
-   flopenrc #(12) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
-   flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
-   flopenrc #(64) EMRegAdd21(clk, reset, PipeClearEM, PipeEnableEM, AddOp1E, AddOp1M); 
-   flopenrc #(64) EMRegAdd22(clk, reset, PipeClearEM, PipeEnableEM, AddOp2E, AddOp2M); 
-   flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
-   flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
-   flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
-   flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
-   flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
-   
-   //*****************
-   //fpcmp E/M pipe registers
-   //*****************
-   flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
-   flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
-   flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
-   flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
-   flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
-   flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
-   flopenrc #(64) EMRegCmp7(clk, reset, PipeClearEM, PipeEnableEM, CmpOp1E, CmpOp1M); 
-   flopenrc #(64) EMRegCmp8(clk, reset, PipeClearEM, PipeEnableEM, CmpOp2E, CmpOp2M); 
-   flopenrc #(2) EMRegCmp9(clk, reset, PipeClearEM, PipeEnableEM, CmpSelE, CmpSelM);
-   
-   //put this in for the event we want to delay fsgn - will otherwise bypass
-   //*****************
-   //fpsgn E/M pipe registers
-   //***************** 
-   flopenrc #(2) EMRegSgn1(clk, reset, PipeClearEM, PipeEnableEM, SgnOpCodeE, SgnOpCodeM);
-   flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
-   flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
-   
-   //*****************
-   //other E/M pipe registers
-   //*****************
-   flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FRegWriteE, FRegWriteM);
-   flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
-   flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
-   flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, FmtE, FmtM);
-   flopenrc #(5) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, RdE, RdM);
-   flopenrc #(4) EMReg6(clk, reset, PipeClearEM, PipeEnableEM, OpCtrlE, OpCtrlM);
-   flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
-   flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
-   
-  assign FWriteDataM = Input1M;
-  mux2  #(64)  LoadStoreResultMux(HRDATA, Input1M, |OpCtrlM[2:1], LoadStoreResultM);
   fma2 fma2(.*);
 
-   //second instance of two-stage floating-point add/cvt unit
-   fpuaddcvt2 fpadd2 (.*);
-   
-   //second instance of two-stage floating-point comparator
-   fpucmp2 fpcmp2 (CmpInvalidM, CmpFCCM, ANaNM, BNaNM, AzeroM, BzeroM, WM, XM, CmpSelM, CmpOp1M, CmpOp2M);
-   
-   //wally-spec W stage control logic signal instantiation
-   logic [2:0] 		    FResultSelW;
-   
-   //instantiate W stage fma signals here
-   logic [63:0] 	    FmaResultW;
-   logic [4:0] 		    FmaFlagsW;
-   
-   //instantiation of W stage div/sqrt signals
-   logic                    DivDenormW;
-   logic [63:0] 	    DivResultW;
-   logic [4:0] 		    DivFlagsW;
-   
-   //instantiation of W stage fsgn signals
-   logic [63:0] 	    SgnResultW;
-   logic [4:0] 		    SgnFlagsW;
-   
-   //instantiation of W stage regfile signals
-   logic [`XLEN-1:0] 	    LoadStoreResultW;
-   logic [`XLEN-1:0] 	    SrcAW;
-   
-   //instantiation of W stage add/cvt signals
-   logic [63:0] 	    AddResultW;
-   logic [4:0] 		    AddFlagsW;
-   logic                    AddDenormW;
-   
-   //instantiation of W stage cmp signals
-   logic [63:0] 	    CmpResultW;
-   logic                    CmpInvalidW;
-   logic [1:0] 		    CmpFCCW; 
-   
-   //instantiation of W stage classify signals
-   logic [63:0] 	    ClassResultW;
-   logic [4:0] 		    ClassFlagsW;
-   
-   //*****************
-   //fma M/W pipe registers
-   //*****************
-   flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
-   flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
-   
-   //*****************
-   //fpdiv M/W pipe registers
-   //*****************
-   flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, DivResultM, DivResultW); 
-   flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, DivFlagsM, DivFlagsW);
-   flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
-   
-   //*****************
-   //fpadd M/W pipe registers
-   //*****************
-   flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, AddResultM, AddResultW); 
-   flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, AddFlagsM, AddFlagsW); 
-   flopenrc #(1) MWRegAdd3(clk, reset, PipeClearMW, PipeEnableMW, AddDenormM, AddDenormW); 
-   
-   //*****************
-   //fpcmp M/W pipe registers
-   //*****************
-   flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
-   flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
-   
-   //*****************
-   //fpsgn M/W pipe registers
-   //***************** 
-   flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
-   flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
-   
-   //*****************
-   //other M/W pipe registers
-   //*****************
-   flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FRegWriteM, FRegWriteW);
-   flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
-   flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
-   flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
-   flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
-   flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, LoadStoreResultM, LoadStoreResultW);
-   flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
-   
-   //flag signal mux via in-line ternaries
-   logic [4:0] 		    FPUFlagsW;
-   //if bit 2 is active set to sign flags - otherwise:
-   //iff bit one is high - if bit zero is active set to fma flags - otherwise
-   //set to cmp flags
-   //iff bit one is low - if bit zero is active set to add/cvt flags - otherwise
-   //set to div/sqrt flags
-   //assign FPUFlagsW = (FResultSelW[2]) ? (SgnFlagsW) : (
-   //	             (FResultSelW[1]) ? 
-   //		     ( (FResultSelW[0]) ? (FmaFlagsW) : ({CmpInvalidW,4'b0000}) ) 
-   //		     : ( (FResultSelW[0]) ? (AddFlagsW) : (DivFlagsW) ) 
-   //                     );
-   always_comb begin
-      case (FResultSelW)
-	// div/sqrt
-	3'b000 : FPUFlagsW = DivFlagsW;
-	// cmp		
-	3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
-	//fma/mult
-	3'b010 : FPUFlagsW = FmaFlagsW;
-	// sgn inj
-	3'b011 : FPUFlagsW = SgnFlagsW;
-	// add/sub/cnvt
-	3'b100 : FPUFlagsW = AddFlagsW;
-	// classify
-	3'b101 : FPUFlagsW = ClassFlagsW;
-	// output SrcAW
-	3'b110 : FPUFlagsW = 5'b0;
-	// output ReadData1
-	3'b111 : FPUFlagsW = 5'b0;
-	default : FPUFlagsW = 5'bxxxxx;
-      endcase
-   end
-   
-   //result mux via in-line ternaries
-   //the uses the same logic as for flag signals
-   //assign FPUResultDirW = (FResultSelW[2]) ? (SgnResultW) : (
-   //	             (FResultSelW[1]) ? 
-   //		     ( (FResultSelW[0]) ? (FmaResultW) : ({62'b0,CmpFCCW}) ) 
-   //		     : ( (FResultSelW[0]) ? (AddResultW) : (DivResultW) ) 
-   //                   );
-   
-   
-   always_comb begin
-      case (FResultSelW)
-	// div/sqrt
-	3'b000 : FPUResultDirW = DivResultW;
-	// cmp		
-	3'b001 : FPUResultDirW = CmpResultW;
-	//fma/mult
-	3'b010 : FPUResultDirW = FmaResultW;
-	// sgn inj
-	3'b011 : FPUResultDirW = SgnResultW;
-	// add/sub/cnvt
-	3'b100 : FPUResultDirW = AddResultW;
-	// classify
-	3'b101 : FPUResultDirW = ClassResultW;
-	// output SrcAW
-	3'b110 : FPUResultDirW = SrcAW;
-	// Load/Store/Move to FP-register
-	3'b111 : FPUResultDirW = LoadStoreResultW;
-	default : FPUResultDirW = {64{1'bx}};
-      endcase
-   end
-   //interface between XLEN size datapath and double-precision sized
-   //floating-point results
-   //
-   //define offsets for LSB zero extension or truncation
-   always_comb begin
-      
-      //zero extension  
-      
-      // Teo 04/13/2021
-      // Commented out XLENDIFF{1'b0} due to error:
-      // Repetition multiplier must be constant.
-      
-      //if(`XLEN > 64) begin
-      //    FPUResultW = {FPUResultDirW,{XLENDIFF{1'b0}}};
-      //end
-      //truncate
-      //else begin
-      FPUResultW = FPUResultDirW[63:64-`XLEN];
-      SetFflagsM = FPUFlagsW;
-      //end
-      
-   end  
-   
-endmodule // fpu
+  //second instance of two-stage floating-point add/cvt unit
+  fpuaddcvt2 fpadd2 (.*);
 
+  //second instance of two-stage floating-point comparator
+  fpucmp2 fpcmp2 (CmpInvalidM, CmpFCCM, ANaNM, BNaNM, AzeroM, BzeroM, WM, XM, {1'b0, FmtM}, FInput1M, FInput2M);
+
+
+
+
+
+
+
+
+
+
+  
+  //*****************
+  //fma M/W pipe registers
+  //*****************
+  flopenrc #(64) MWRegFma1(clk, reset, PipeClearMW, PipeEnableMW, FmaResultM, FmaResultW); 
+  flopenrc #(5) MWRegFma2(clk, reset, PipeClearMW, PipeEnableMW, FmaFlagsM, FmaFlagsW); 
+
+  //*****************
+  //fpdiv M/W pipe registers
+  //*****************
+  flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, FDivResultM, FDivResultW); 
+  flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, FDivFlagsM, FDivFlagsW);
+  flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
+
+  //*****************
+  //fpadd M/W pipe registers
+  //*****************
+  flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, FAddResultM, FAddResultW); 
+  flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, FAddFlagsM, FAddFlagsW); 
+
+  //*****************
+  //fpcmp M/W pipe registers
+  //*****************
+  flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
+  flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
+
+  //*****************
+  //fpsgn M/W pipe registers
+  //***************** 
+  flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
+  flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);
+
+  //*****************
+  //other M/W pipe registers
+  //*****************
+  flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FWriteEnM, FWriteEnW);
+  flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
+  flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, FmtM, FmtW);
+  flopenrc #(5) MWReg4(clk, reset, PipeClearMW, PipeEnableMW, RdM, RdW);
+  flopenrc #(`XLEN) MWReg5(clk, reset, PipeClearMW, PipeEnableMW, SrcAM, SrcAW);
+  flopenrc #(64) MWReg6(clk, reset, PipeClearMW, PipeEnableMW, FLoadStoreResultM, FLoadStoreResultW);
+  flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
+
+
+
+
+
+
+
+
+
+
+  //#########################################
+  //BEGIN WRITEBACK STAGE
+  //#########################################
+
+  always_comb begin
+	case (FResultSelW)
+		// div/sqrt
+		3'b000 : FPUFlagsW = FDivFlagsW;
+		// cmp		
+		3'b001 : FPUFlagsW = {CmpInvalidW, 4'b0};
+		//fma/mult
+		3'b010 : FPUFlagsW = FmaFlagsW;
+		// sgn inj
+		3'b011 : FPUFlagsW = SgnFlagsW;
+		// add/sub/cnvt
+		3'b100 : FPUFlagsW = FAddFlagsW;
+		// classify
+		3'b101 : FPUFlagsW = ClassFlagsW;
+		// output SrcAW
+		3'b110 : FPUFlagsW = 5'b0;
+		// output FRD1
+		3'b111 : FPUFlagsW = 5'b0;
+		default : FPUFlagsW = 5'bxxxxx;
+	endcase
+  end
+
+
+  always_comb begin
+	case (FResultSelW)
+		// div/sqrt
+		3'b000 : FPUResult64W = FDivResultW;
+		// cmp		
+		3'b001 : FPUResult64W = FCmpResultW;
+		//fma/mult
+		3'b010 : FPUResult64W = FmaResultW;
+		// sgn inj
+		3'b011 : FPUResult64W = SgnResultW;
+		// add/sub/cnvt
+		3'b100 : FPUResult64W = FAddResultW;
+		// classify
+		3'b101 : FPUResult64W = ClassResultW;
+		// output SrcAW
+		3'b110 : FPUResult64W = SrcAW;
+		// Load/Store/Move to FP-register
+		3'b111 : FPUResult64W = FLoadStoreResultW;
+		default : FPUResult64W = {64{1'bx}};
+	endcase
+  end
+  //interface between XLEN size datapath and double-precision sized
+  //floating-point results
+  //
+  //define offsets for LSB zero extension or truncation
+  always_comb begin
+           
+  //truncation: keep the upper XLEN bits of the 64-bit result
+      FPUResultW = FPUResult64W[63:64-`XLEN];
+      SetFflagsM = FPUFlagsW;
+
+  end  
+endmodule
diff --git a/wally-pipelined/src/fpu/fpuaddcvt1.sv b/wally-pipelined/src/fpu/fpuaddcvt1.sv
index d50cb4e2..e1228f32 100755
--- a/wally-pipelined/src/fpu/fpuaddcvt1.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt1.sv
@@ -27,16 +27,15 @@
 //
 
 
-module fpuaddcvt1 (sum, sum_tc, sel_inv, exponent_postsum, corr_sign, op1_Norm, op2_Norm, opA_Norm, opB_Norm, Invalid, DenormIn, convert, swap, normal_overflow, signA, Float1, Float2, exp1_denorm, exp2_denorm, exponent, op1, op2, rm, op_type, Pin);
+module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, FInput1E, FInput2E, FOpCtrlE, FmtE);
 
-   input logic [63:0] op1;		// 1st input operand (A)
-   input logic [63:0] op2;		// 2nd input operand (B)
-   input logic [2:0] 	rm;		// Rounding mode - specify values 
-   input logic [3:0]	op_type;	// Function opcode
-   input logic 	Pin;   		// Result Precision (1 for double, 0 for single)
+   input logic [63:0] FInput1E;		// 1st input operand (A)
+   input logic [63:0] FInput2E;		// 2nd input operand (B)
+   input logic [3:0]	FOpCtrlE;	// Function opcode
+   input logic 	FmtE;   		// Result Precision (1 for double, 0 for single)
 
    wire          P;
-   assign P = ~Pin | op_type[2];
+   assign P = ~FmtE | FOpCtrlE[2];
 
    wire [63:0] 	 IntValue;
    wire [11:0] 	 exp1, exp2;
@@ -54,44 +53,44 @@ module fpuaddcvt1 (sum, sum_tc, sel_inv, exponent_postsum, corr_sign, op1_Norm,
    wire 	 zeroB;
    wire [5:0]	 align_shift; 
 
-   output logic [63:0] 	 Float1; 
-   output logic [63:0] 	 Float2;
-   output logic [10:0] 	 exponent;
-   output logic [10:0]	 exponent_postsum;
-   output logic [11:0]	 exp1_denorm, exp2_denorm;//KEP used to be [10:0]
-   output logic [63:0] sum, sum_tc;
-   output logic [3:0]  sel_inv;
-   output logic        corr_sign;
-   output logic 	 signA;
-   output logic	 op1_Norm, op2_Norm;
-   output logic	 opA_Norm, opB_Norm;
-   output logic	 Invalid;
-   output logic 	 DenormIn;
+   output logic [63:0] 	 AddFloat1E; 
+   output logic [63:0] 	 AddFloat2E;
+   output logic [10:0] 	 AddExponentE;
+   output logic [10:0]	 AddExpPostSumE;
+   output logic [11:0]	 AddExp1DenormE, AddExp2DenormE;//KEP used to be [10:0]
+   output logic [63:0] AddSumE, AddSumTcE;
+   output logic [3:0]  AddSelInvE;
+   output logic        AddCorrSignE;
+   output logic 	 AddSignAE;
+   output logic	 AddOp1NormE, AddOp2NormE;
+   output logic	 AddOpANormE, AddOpBNormE;
+   output logic	 AddInvalidE;
+   output logic 	 AddDenormInE;
 //   output logic 	 exp_valid;
-   output logic 	 convert;
-   output logic        swap;
-   output logic 	 normal_overflow;
+   output logic 	 AddConvertE;
+   output logic        AddSwapE;
+   output logic 	 AddNormOvflowE;
    wire [5:0]	 ZP_mantissaA;
    wire [5:0]	 ZP_mantissaB;
    wire		 ZV_mantissaA;
    wire		 ZV_mantissaB;
 
    // Convert the input operands to their appropriate forms based on 
-   // the orignal operands, the op_type , and their precision P. 
+   // the original operands, FOpCtrlE, and their precision P. 
    // Single precision inputs are converted to double precision 
    // and the sign of the first operand is set appropratiately based on
    // if the operation is absolute value or negation. 
 
-   convert_inputs conv1 (Float1, Float2, op1, op2, op_type, P);
+   convert_inputs conv1 (AddFloat1E, AddFloat2E, FInput1E, FInput2E, FOpCtrlE, P);
 
    // Test for exceptions and return the "Invalid Operation" and
-   // "Denormalized" Input Flags. The "sel_inv" is used in
-   // the third pipeline stage to select the result. Also, op1_Norm
-   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
+   // "Denormalized" Input Flags. The "AddSelInvE" is used in
+   // the third pipeline stage to select the result. Also, AddOp1NormE
+   // and AddOp2NormE are one if FInput1E and FInput2E are not zero or denormalized.
    // sub is one if the effective operation is subtaction. 
 
-   exception exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, sub, 
-		   Float1, Float2, op_type);
+   exception exc1 (AddSelInvE, AddInvalidE, AddDenormInE, AddOp1NormE, AddOp2NormE, sub, 
+		   AddFloat1E, AddFloat2E, FOpCtrlE);
 
    // Perform Exponent Subtraction (used for alignment). For performance
    // both exponent subtractions are performed in parallel. This was 
@@ -99,25 +98,25 @@ module fpuaddcvt1 (sum, sum_tc, sel_inv, exponent_postsum, corr_sign, op1_Norm,
    // the two parallel additions. The input values are zero-extended to 12 
    // bits prior to performing the addition. 
 
-   assign exp1 = {1'b0, Float1[62:52]};
-   assign exp2 = {1'b0, Float2[62:52]};
+   assign exp1 = {1'b0, AddFloat1E[62:52]};
+   assign exp2 = {1'b0, AddFloat2E[62:52]};
    assign exp_diff1 = exp1 - exp2;
-   assign exp_diff2 = DenormIn ? ({Float2[63], exp2[10:0]} - {Float1[63], exp1[10:0]}): exp2 - exp1;
+   assign exp_diff2 = AddDenormInE ? ({AddFloat2E[63], exp2[10:0]} - {AddFloat1E[63], exp1[10:0]}): exp2 - exp1;
 
-   // The second operand (B) should be set to zero, if op_type does not
+   // The second operand (B) should be set to zero, if FOpCtrlE does not
    // specify addition or subtraction
-   assign zeroB = op_type[2] | op_type[1];
+   assign zeroB = FOpCtrlE[2] | FOpCtrlE[1];
 
    // Swapped operands if zeroB is not one and exp1 < exp2. 
-   // Swapping causes exp2 to be used for the result exponent. 
+   // Swapping causes exp2 to be used for the result exponent. 
    // Only the exponent of the larger operand is used to determine
    // the final result. 
-   assign swap = exp_diff1[11] & ~zeroB;
-   assign exponent = swap ? exp2[10:0] : exp1[10:0];
-   assign exponent_postsum = swap ? exp2[10:0] : exp1[10:0];
-   assign mantissaA = swap ? Float2[51:0] : Float1[51:0];
-   assign mantissaB = swap ? Float1[51:0] : Float2[51:0];
-   assign signA     = swap ? Float2[63] : Float1[63];   
+   assign AddSwapE = exp_diff1[11] & ~zeroB;
+   assign AddExponentE = AddSwapE ? exp2[10:0] : exp1[10:0];
+   assign AddExpPostSumE = AddSwapE ? exp2[10:0] : exp1[10:0];
+   assign mantissaA = AddSwapE ? AddFloat2E[51:0] : AddFloat1E[51:0];
+   assign mantissaB = AddSwapE ? AddFloat1E[51:0] : AddFloat2E[51:0];
+   assign AddSignAE     = AddSwapE ? AddFloat2E[63] : AddFloat1E[63];   
 
    // Leading-Zero Detector. Determine the size of the shift needed for
    // normalization. If sum_corrected is all zeros, the exp_valid is 
@@ -127,12 +126,12 @@ module fpuaddcvt1 (sum, sum_tc, sel_inv, exponent_postsum, corr_sign, op1_Norm,
    lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);
 
    // Denormalized exponents created by subtracting the leading zeroes from the original exponents
-   assign exp1_denorm = swap ? (exp1 - {6'b0, ZP_mantissaB}) : (exp1 - {6'b0, ZP_mantissaA}); //KEP extended ZP_mantissa 
-   assign exp2_denorm = swap ? (exp2 - {6'b0, ZP_mantissaA}) : (exp2 - {6'b0, ZP_mantissaB});
+   assign AddExp1DenormE = AddSwapE ? (exp1 - {6'b0, ZP_mantissaB}) : (exp1 - {6'b0, ZP_mantissaA}); //KEP extended ZP_mantissa 
+   assign AddExp2DenormE = AddSwapE ? (exp2 - {6'b0, ZP_mantissaA}) : (exp2 - {6'b0, ZP_mantissaB});
 
    // Determine the alignment shift and limit it to 63. If any bit from 
    // exp_shift[6] to exp_shift[11] is one, then shift is set to all ones. 
-   assign exp_shift = swap ? exp_diff2 : exp_diff1;
+   assign exp_shift = AddSwapE ? exp_diff2 : exp_diff1;
    assign exp_gt63 = exp_shift[11] | exp_shift[10] | exp_shift[9] 
      | exp_shift[8] | exp_shift[7] | exp_shift[6];
    assign align_shift = exp_shift[5:0] | {6{exp_gt63}}; //KEP used to be all of exp_shift
@@ -147,10 +146,10 @@ module fpuaddcvt1 (sum, sum_tc, sel_inv, exponent_postsum, corr_sign, op1_Norm,
    // and loss of sign information. The two bits to the right of the 
    // original mantissa form the "guard" and "round" bits that are used
    // to round the result. 
-   assign opA_Norm = swap ? op2_Norm : op1_Norm;
-   assign opB_Norm = swap ? op1_Norm : op2_Norm;
-   assign mantissaA1 = {2'h0, opA_Norm, mantissaA[51:0]&{52{opA_Norm}}, 2'h0};
-   assign mantissaB1 = {2'h0, opB_Norm, mantissaB[51:0]&{52{opB_Norm}}, 2'h0};
+   assign AddOpANormE = AddSwapE ? AddOp2NormE : AddOp1NormE;
+   assign AddOpBNormE = AddSwapE ? AddOp1NormE : AddOp2NormE;
+   assign mantissaA1 = {2'h0, AddOpANormE, mantissaA[51:0]&{52{AddOpANormE}}, 2'h0};
+   assign mantissaB1 = {2'h0, AddOpBNormE, mantissaB[51:0]&{52{AddOpBNormE}}, 2'h0};
 
    // Perform mantissa alignment using a 57-bit barrel shifter 
    // If any of the bits shifted out are one, Sticky_out is set. 
@@ -160,8 +159,8 @@ module fpuaddcvt1 (sum, sum_tc, sel_inv, exponent_postsum, corr_sign, op1_Norm,
 
    // Place either the sign-extened 32-bit value or the original 64-bit value 
    // into IntValue (to be used for integer to floating point conversion)
-   assign IntValue [31:0] = op1[31:0];
-   assign IntValue [63:32] = op_type[0] ? {32{op1[31]}} : op1[63:32];
+   assign IntValue [31:0] = FInput1E[31:0];
+   assign IntValue [63:32] = FOpCtrlE[0] ? {32{FInput1E[31]}} : FInput1E[63:32];
 
    // If doing an integer to floating point conversion, mantissaA3 is set to 
    // IntVal and the prenomalized exponent is set to 1084. Otherwise, 
@@ -169,30 +168,30 @@ module fpuaddcvt1 (sum, sum_tc, sel_inv, exponent_postsum, corr_sign, op1_Norm,
    // and the exponent value is left unchanged. 
    // Under denormalized cases, the exponent before the rounder is set to 1
    // if the normal shift value is 11.
-   assign convert       = ~op_type[2] & op_type[1];
-   assign mantissaA3    = (op_type[3]) ? (op_type[0] ? Float1 : ~Float1) : (DenormIn ? ({12'h0, mantissaA}) : (convert ? IntValue : {mantissaA1, 7'h0}));
+   assign AddConvertE       = ~FOpCtrlE[2] & FOpCtrlE[1];
+   assign mantissaA3    = (FOpCtrlE[3]) ? (FOpCtrlE[0] ? AddFloat1E : ~AddFloat1E) : (AddDenormInE ? ({12'h0, mantissaA}) : (AddConvertE ? IntValue : {mantissaA1, 7'h0}));
 
    // Put zero in for mantissaB3, if zeroB is one. Otherwise, B is extended to 
    // 64-bits by setting the 7 LSBs to the Sticky_out bit followed by six  
    // zeros. 
-   assign mantissaB3[63:7] = (op_type[3]) ? (57'h0) : (DenormIn ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
-   assign mantissaB3[6]    = (op_type[3]) ? (1'b0) : (DenormIn ? mantissaB[6] : Sticky_out & ~zeroB);
-   assign mantissaB3[5:0]  = (op_type[3]) ? (6'h01) : (DenormIn ? mantissaB[5:0] : 6'h0);
+   assign mantissaB3[63:7] = (FOpCtrlE[3]) ? (57'h0) : (AddDenormInE ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
+   assign mantissaB3[6]    = (FOpCtrlE[3]) ? (1'b0) : (AddDenormInE ? mantissaB[6] : Sticky_out & ~zeroB);
+   assign mantissaB3[5:0]  = (FOpCtrlE[3]) ? (6'h01) : (AddDenormInE ? mantissaB[5:0] : 6'h0);
 
    // The sign of the result needs to be corrected if the true
    // operation is subtraction and the input operands were swapped. 
-   assign corr_sign = ~op_type[2]&~op_type[1]&op_type[0]&swap;
+   assign AddCorrSignE = ~FOpCtrlE[2]&~FOpCtrlE[1]&FOpCtrlE[0]&AddSwapE;
 
    // 64-bit Mantissa Adder/Subtractor
-   cla64 add1 (sum, mantissaA3, mantissaB3, sub);
+   cla64 add1 (AddSumE, mantissaA3, mantissaB3, sub);
 
    // 64-bit Mantissa Subtractor - to get the two's complement of the 
    // result when the sign from the adder/subtractor is negative. 
-   cla_sub64 sub1 (sum_tc, mantissaB3, mantissaA3);
+   cla_sub64 sub1 (AddSumTcE, mantissaB3, mantissaA3);
  
    // Finds normal underflow result to determine whether to round final exponent down
-   //***KEP used to be (sum == 16'h0) I am unsure what it's supposed to be
-   assign normal_overflow = (DenormIn & (sum == 64'h0) & (opA_Norm | opB_Norm) & ~op_type[0]) ? 1'b1 : (sum[63] ? sum_tc[52] : sum[52]);
+   //***KEP used to be (AddSumE == 16'h0) I am unsure what it's supposed to be
+   assign AddNormOvflowE = (AddDenormInE & (AddSumE == 64'h0) & (AddOpANormE | AddOpBNormE) & ~FOpCtrlE[0]) ? 1'b1 : (AddSumE[63] ? AddSumTcE[52] : AddSumE[52]);
 
 endmodule // fpadd
 
diff --git a/wally-pipelined/src/fpu/fpuaddcvt2.sv b/wally-pipelined/src/fpu/fpuaddcvt2.sv
index e040d2d2..36dabf08 100755
--- a/wally-pipelined/src/fpu/fpuaddcvt2.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt2.sv
@@ -27,15 +27,13 @@
 //
 
 
-module fpuaddcvt2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, AddOp1M, AddOp2M, AddRmM, AddOpTypeM, AddPM, AddOvEnM, AddUnEnM);
+module fpuaddcvt2 (FAddResultM, FAddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, FrmM, FOpCtrlM, FmtM);
 
-   input [63:0] AddOp1M;		// 1st input operand (A)
-   input [63:0] AddOp2M;		// 2nd input operand (B)
-   input [2:0] 	AddRmM;		// Rounding mode - specify values 
-   input [3:0]	AddOpTypeM;	// Function opcode
-   input 	AddPM;   		// Result Precision (0 for double, 1 for single)
-   input 	AddOvEnM;		// Overflow trap enabled
-   input 	AddUnEnM;   	// Underflow trap enabled
+   input [2:0] 	FrmM;		// Rounding mode - specify values 
+   input [3:0]	FOpCtrlM;	// Function opcode
+   input 	FmtM;   		// Result Precision (0 for double, 1 for single)
+   // input 	AddOvEnM;		// Overflow trap enabled
+   // input 	AddUnEnM;   	// Underflow trap enabled
    input [63:0] AddSumM, AddSumTcM;
    input [63:0] 	 AddFloat1M; 
    input [63:0] 	 AddFloat2M;
@@ -53,12 +51,12 @@ module fpuaddcvt2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSel
    input          AddSwapM;
    // input 	 AddNormOvflowM;
 
-   output [63:0] AddResultM;	// Result of operation
-   output [4:0]  AddFlagsM;   	// IEEE exception flags 
+   output [63:0] FAddResultM;	// Result of operation
+   output [4:0]  FAddFlagsM;   	// IEEE exception flags 
    output 	 AddDenormM;   	// AddDenormM on input or output   
 
    wire          P;
-   assign P = AddPM | AddOpTypeM[2];
+   assign P = FmtM | FOpCtrlM[2]; // NOTE(review): fpuaddcvt1 derives P as ~FmtE | FOpCtrlE[2]; the two stages use opposite Fmt polarity -- confirm which is intended
 
    wire [10:0]   exp_pre;
    wire [63:0] 	 Result;   
@@ -82,6 +80,12 @@ module fpuaddcvt2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSel
    wire [63:0]   sum_corr;
    logic AddNormOvflowM;
  
+ 
+   logic 	AddOvEnM;		// Overflow trap enabled
+   logic 	AddUnEnM;   	// Underflow trap enabled
+
+   assign AddOvEnM = 1'b1;
+   assign AddUnEnM = 1'b1;
    //AddExponentM value pre-rounding with considerations for denormalized
    //cases/conversion cases
    assign exp_pre       = AddDenormInM ?
@@ -101,7 +105,7 @@ module fpuaddcvt2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSel
    assign mantissa_comp_sum_tc = AddSwapM ? Float2_sum_tc_comp : Float1_sum_tc_comp;
 
    // Determines the correct comparison result based on operation and sign of resulting AddSumM
-   assign mantissa_comp = (AddOpTypeM[0] ^ AddSumM[63]) ? mantissa_comp_sum_tc : mantissa_comp_sum;
+   assign mantissa_comp = (FOpCtrlM[0] ^ AddSumM[63]) ? mantissa_comp_sum_tc : mantissa_comp_sum;
 
    // If the signs are different and both operands aren't denormalized
    // the normal underflow bit is needed and therefore updated.
@@ -113,12 +117,12 @@ module fpuaddcvt2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSel
    // If the AddSumM is negative, use its two complement instead. 
    // This value has to be 64-bits to correctly handle the 
    // case 10...00
-   assign sum_corr = (AddDenormInM & (AddOpANormM | AddOpBNormM) & ( ( (AddFloat1M[63] ~^ AddFloat2M[63]) & AddOpTypeM[0] ) | ((AddFloat1M[63] ^ AddFloat2M[63]) & ~AddOpTypeM[0]) ))
-			 ? (AddSumM[63] ? AddSumM : AddSumTcM) : ( (AddOpTypeM[3]) ? AddSumM : (AddSumM[63] ? AddSumTcM : AddSumM));
+   assign sum_corr = (AddDenormInM & (AddOpANormM | AddOpBNormM) & ( ( (AddFloat1M[63] ~^ AddFloat2M[63]) & FOpCtrlM[0] ) | ((AddFloat1M[63] ^ AddFloat2M[63]) & ~FOpCtrlM[0]) ))
+			 ? (AddSumM[63] ? AddSumM : AddSumTcM) : ( (FOpCtrlM[3]) ? AddSumM : (AddSumM[63] ? AddSumTcM : AddSumM));
 
    // Finds normal underflow result to determine whether to round final AddExponentM down
    //KEP used to be (AddSumM == 16'h0) not sure what it is supposed to be
-   assign AddNormOvflowM = (AddDenormInM & (AddSumM == 64'h0) & (AddOpANormM | AddOpBNormM) & ~AddOpTypeM[0]) ? 1'b1 : (AddSumM[63] ? AddSumTcM[52] : AddSumM[52]);
+   assign AddNormOvflowM = (AddDenormInM & (AddSumM == 64'h0) & (AddOpANormM | AddOpBNormM) & ~FOpCtrlM[0]) ? 1'b1 : (AddSumM[63] ? AddSumTcM[52] : AddSumM[52]);
 
    // Leading-Zero Detector. Determine the size of the shift needed for
    // normalization. If sum_corrected is all zeros, the exp_valid is 
@@ -132,7 +136,7 @@ module fpuaddcvt2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSel
    // be right shifted. It outputs the normalized AddSumM. 
    barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift_denorm);
   
-   assign sum_norm_w_bypass = (AddOpTypeM[3]) ? (AddOpTypeM[0] ? ~sum_corr : sum_corr) : (sum_norm);
+   assign sum_norm_w_bypass = (FOpCtrlM[3]) ? (FOpCtrlM[0] ? ~sum_corr : sum_corr) : (sum_norm);
 
    // Round the mantissa to a 52-bit value, with the leading one
    // removed. If the result is a single precision number, the actual 
@@ -141,18 +145,18 @@ module fpuaddcvt2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSel
    // exactly where the rounding point is. The rounding units also
    // handles special cases and set the exception flags.
 
-   // Changed DenormIO -> AddDenormM and FlagsIn -> AddFlagsM in order to
+   // Changed DenormIO -> AddDenormM and FlagsIn -> FAddFlagsM in order to
    // help in processor reservation station detection of load/stores. In
    // other words, the processor would like to know ahead of time that
    // if the result is an exception then don't load or store.
-   rounder round1 (Result, DenormIO, FlagsIn, AddRmM, P, AddOvEnM, AddUnEnM, exp_valid, 
+   rounder round1 (Result, DenormIO, FlagsIn, FrmM, P, AddOvEnM, AddUnEnM, exp_valid, 
 		   AddSelInvM, AddInvalidM, AddDenormInM, AddConvertM, sign_corr, exp_pre, norm_shift, sum_norm_w_bypass,
 		   AddExpPostSumM, AddOp1NormM, AddOp2NormM, AddFloat1M[63:52], AddFloat2M[63:52],
-		   AddNormOvflowM, normal_underflow, AddSwapM, AddOpTypeM, AddSumM);
+		   AddNormOvflowM, normal_underflow, AddSwapM, FOpCtrlM, AddSumM);
 
    // Store the final result and the exception flags in registers.
-   assign AddResultM = Result;
-   assign {AddDenormM, AddFlagsM} = {DenormIO, FlagsIn};
+   assign FAddResultM = Result;
+   assign {AddDenormM, FAddFlagsM} = {DenormIO, FlagsIn};
    
 endmodule // fpadd
 
diff --git a/wally-pipelined/src/fpu/fpucmp1.sv b/wally-pipelined/src/fpu/fpucmp1.sv
index 71bdea3b..1cf267f2 100755
--- a/wally-pipelined/src/fpu/fpucmp1.sv
+++ b/wally-pipelined/src/fpu/fpucmp1.sv
@@ -37,7 +37,7 @@
 // It also produces an invalid operation flag, which is one
 // if either of the input operands is a signaling NaN per 754
 
-module fpucmp1 (w, x, ANaN, BNaN, Azero, Bzero, op1, op2, Sel);
+module fpucmp1 (w, x, ANaN, BNaN, Azero, Bzero, op1, op2, Sel);///***fix Sel to match spec
    
    input logic [63:0] op1; 
    input logic [63:0] op2;
diff --git a/wally-pipelined/src/fpu/fpuhazard.sv b/wally-pipelined/src/fpu/fpuhazard.sv
index ba748251..4c1344dc 100644
--- a/wally-pipelined/src/fpu/fpuhazard.sv
+++ b/wally-pipelined/src/fpu/fpuhazard.sv
@@ -27,45 +27,45 @@
 
 module fpuhazard(
     input logic [4:0] Adr1, Adr2, Adr3,
-    input logic  FRegWriteE, FRegWriteM, FRegWriteW, 
+    input logic FWriteEnE, FWriteEnM, FWriteEnW, 
 	  input logic [4:0] RdE, RdM, RdW,
-	  input logic  DivBusyM,
+	  input logic DivBusyM,
 	  input logic	RegWriteD,
     input logic [2:0] FResultSelD, FResultSelE,
     input logic IllegalFPUInstrD,
-    input logic In2UsedD, In3UsedD,
+    input logic FInput2UsedD, FInput3UsedD,
   // Stall outputs
 	  output logic FStallD,
-    output logic [1:0] Input1MuxD, Input2MuxD, 
-    output logic Input3MuxD
+    output logic [1:0] FForwardInput1D, FForwardInput2D, 
+    output logic FForwardInput3D
 );
 
 
   always_comb begin
     // set ReadData as default
-    Input1MuxD = 2'b00; 
-    Input2MuxD = 2'b00;
-    Input3MuxD = 1'b0;
+    FForwardInput1D = 2'b00; 
+    FForwardInput2D = 2'b00;
+    FForwardInput3D = 1'b0;
     FStallD = DivBusyM;
     if (~IllegalFPUInstrD) begin
 //					if taking a value from int register
-      if ((Adr1 == RdE) & (FRegWriteE | ((FResultSelE == 3'b110) & RegWriteD))) 
-        if (FResultSelE == 3'b110) Input1MuxD = 2'b11; // choose SrcAM
+      if ((Adr1 == RdE) & (FWriteEnE | ((FResultSelE == 3'b110) & RegWriteD))) 
+        if (FResultSelE == 3'b110) FForwardInput1D = 2'b11; // choose SrcAM
         else FStallD = 1'b1;                           // otherwise stall
-      else if ((Adr1 == RdM) & FRegWriteM) Input1MuxD = 2'b01; // choose FPUResultDirW
-      else if ((Adr1 == RdW) & FRegWriteW) Input1MuxD = 2'b11; // choose FPUResultDirE
+      else if ((Adr1 == RdM) & FWriteEnM) FForwardInput1D = 2'b01; // choose FPUResultDirW
+      else if ((Adr1 == RdW) & FWriteEnW) FForwardInput1D = 2'b11; // choose FPUResultDirE -- NOTE(review): 2'b11 is also the SrcAM select above, and the Adr2 chain uses 2'b10 for RdW; confirm the mux encoding
     
 
-      if(In2UsedD)
-        if      ((Adr2 == RdE) & FRegWriteE) FStallD = 1'b1;
-        else if ((Adr2 == RdM) & FRegWriteM) Input2MuxD = 2'b01; // choose FPUResultDirW
-        else if ((Adr2 == RdW) & FRegWriteW) Input2MuxD = 2'b10; // choose FPUResultDirE
+      if(FInput2UsedD)
+        if      ((Adr2 == RdE) & FWriteEnE) FStallD = 1'b1;
+        else if ((Adr2 == RdM) & FWriteEnM) FForwardInput2D = 2'b01; // choose FPUResultDirW
+        else if ((Adr2 == RdW) & FWriteEnW) FForwardInput2D = 2'b10; // choose FPUResultDirE
 
 
-      if(In3UsedD)
-        if      ((Adr3 == RdE) & FRegWriteE) FStallD = 1'b1;
-        else if ((Adr3 == RdM) & FRegWriteM) FStallD = 1'b1;
-        else if ((Adr3 == RdW) & FRegWriteW) Input3MuxD = 1'b1; // choose FPUResultDirE
+      if(FInput3UsedD)
+        if      ((Adr3 == RdE) & FWriteEnE) FStallD = 1'b1;
+        else if ((Adr3 == RdM) & FWriteEnM) FStallD = 1'b1;
+        else if ((Adr3 == RdW) & FWriteEnW) FForwardInput3D = 1'b1; // choose FPUResultDirE
     end
 
   end 
diff --git a/wally-pipelined/src/fpu/fsgn.sv b/wally-pipelined/src/fpu/fsgn.sv
index 4f4748bd..2850af86 100755
--- a/wally-pipelined/src/fpu/fsgn.sv
+++ b/wally-pipelined/src/fpu/fsgn.sv
@@ -1,8 +1,8 @@
 //performs the fsgnj/fsgnjn/fsgnjx RISCV instructions
 
-module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SgnOp1E, SgnOp2E);
+module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, FInput1E, FInput2E);
 
-	input  [63:0]  SgnOp1E, SgnOp2E;
+	input  [63:0]  FInput1E, FInput2E;
 	input  [1:0]   SgnOpCodeE;
 	output [63:0]  SgnResultE;
 	output [4:0]   SgnFlagsE;
@@ -11,18 +11,18 @@ module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SgnOp1E, SgnOp2E);
 
 	//op code designation:
 	//
-	//00 - fsgnj - directly copy over sign value of SgnOp2E
-	//01 - fsgnjn - negate sign value of SgnOp2E
-	//10 - fsgnjx - XOR sign values of SgnOp1E & SgnOp2E
+	//00 - fsgnj - directly copy over sign value of FInput2E
+	//01 - fsgnjn - negate sign value of FInput2E
+	//10 - fsgnjx - XOR sign values of FInput1E & FInput2E
 	//
 	
-	assign SgnResultE[63] = SgnOpCodeE[1] ? (SgnOp1E[63] ^ SgnOp2E[63]) : (SgnOp2E[63] ^ SgnOpCodeE[0]);
-	assign SgnResultE[62:0] = SgnOp1E[62:0];
+	assign SgnResultE[63] = SgnOpCodeE[1] ? (FInput1E[63] ^ FInput2E[63]) : (FInput2E[63] ^ SgnOpCodeE[0]);
+	assign SgnResultE[62:0] = FInput1E[62:0];
 
 	//If the exponent is all ones, then the value is either Inf or NaN,
 	//both of which will produce a QNaN/SNaN value of some sort. This will 
 	//set the invalid flag high.
-	assign AonesExp = SgnOp1E[62]&SgnOp1E[61]&SgnOp1E[60]&SgnOp1E[59]&SgnOp1E[58]&SgnOp1E[57]&SgnOp1E[56]&SgnOp1E[55]&SgnOp1E[54]&SgnOp1E[53]&SgnOp1E[52];
+	assign AonesExp = FInput1E[62]&FInput1E[61]&FInput1E[60]&FInput1E[59]&FInput1E[58]&FInput1E[57]&FInput1E[56]&FInput1E[55]&FInput1E[54]&FInput1E[53]&FInput1E[52];
 
 	//the only flag that can occur during this operation is invalid
 	//due to changing sign on already existing NaN
diff --git a/wally-pipelined/src/fpu/special.sv b/wally-pipelined/src/fpu/special.sv
index 711fd12d..8ca265bb 100644
--- a/wally-pipelined/src/fpu/special.sv
+++ b/wally-pipelined/src/fpu/special.sv
@@ -10,46 +10,46 @@
 /////////////////////////////////////////////////////////////////////////////
 
 /////////////////////////////////////////////////////////////////////////////
-module special(Input1E, Input2E, Input3E, xzeroE, yzeroE, zzeroE,
+module special(FInput1E, FInput2E, FInput3E, xzeroE, yzeroE, zzeroE,
 				xnanE, ynanE, znanE, xdenormE, ydenormE, zdenormE, xinfE, yinfE, zinfE);
 /////////////////////////////////////////////////////////////////////////////
 
-	input logic   	[63:0]     	Input1E;              // Input Input1E
-	input logic     	[63:0]     	Input2E;           	// Input Input2E
-	input logic      	[63:0]    	Input3E;            	// Input Input3E 
-	output logic				xzeroE;		// Input Input1E = 0
-	output logic				yzeroE;		// Input Input2E = 0
-	output logic				zzeroE;		// Input Input3E = 0
-	output logic				xnanE;		// Input1E is NaN
-	output logic				ynanE;		// Input2E is NaN
-	output logic				znanE;		// Input3E is NaN
-	output logic				xdenormE;	// Input1E is denormalized
-	output logic				ydenormE;	// Input2E is denormalized
-	output logic				zdenormE;	// Input3E is denormalized
-	output logic				xinfE;		// Input1E is infinity
-	output logic				yinfE;		// Input2E is infinity
-	output logic				zinfE;		// Input3E is infinity
+	input logic   	[63:0]     	FInput1E;              // Input FInput1E
+	input logic     	[63:0]     	FInput2E;           	// Input FInput2E
+	input logic      	[63:0]    	FInput3E;            	// Input FInput3E 
+	output logic				xzeroE;		// Input FInput1E = 0
+	output logic				yzeroE;		// Input FInput2E = 0
+	output logic				zzeroE;		// Input FInput3E = 0
+	output logic				xnanE;		// FInput1E is NaN
+	output logic				ynanE;		// FInput2E is NaN
+	output logic				znanE;		// FInput3E is NaN
+	output logic				xdenormE;	// FInput1E is denormalized
+	output logic				ydenormE;	// FInput2E is denormalized
+	output logic				zdenormE;	// FInput3E is denormalized
+	output logic				xinfE;		// FInput1E is infinity
+	output logic				yinfE;		// FInput2E is infinity
+	output logic				zinfE;		// FInput3E is infinity
 
 	// In the actual circuit design, the gates looking at bits
 	// 51:0 and at bits 62:52 should be shared among the various detectors.
 
 	// Check if input is NaN
 
-	assign xnanE = &Input1E[62:52] && |Input1E[51:0]; 
-	assign ynanE = &Input2E[62:52] && |Input2E[51:0]; 
-	assign znanE = &Input3E[62:52] && |Input3E[51:0];
+	assign xnanE = &FInput1E[62:52] && |FInput1E[51:0]; 
+	assign ynanE = &FInput2E[62:52] && |FInput2E[51:0]; 
+	assign znanE = &FInput3E[62:52] && |FInput3E[51:0];
 
 	// Check if input is denormalized
 
-	assign xdenormE = ~(|Input1E[62:52]) && |Input1E[51:0]; 
-	assign ydenormE = ~(|Input2E[62:52]) && |Input2E[51:0]; 
-	assign zdenormE = ~(|Input3E[62:52]) && |Input3E[51:0];
+	assign xdenormE = ~(|FInput1E[62:52]) && |FInput1E[51:0]; 
+	assign ydenormE = ~(|FInput2E[62:52]) && |FInput2E[51:0]; 
+	assign zdenormE = ~(|FInput3E[62:52]) && |FInput3E[51:0];
 
 	// Check if input is infinity
 
-	assign xinfE = &Input1E[62:52] && ~(|Input1E[51:0]); 
-	assign yinfE = &Input2E[62:52] && ~(|Input2E[51:0]); 
-	assign zinfE = &Input3E[62:52] && ~(|Input3E[51:0]);
+	assign xinfE = &FInput1E[62:52] && ~(|FInput1E[51:0]); 
+	assign yinfE = &FInput2E[62:52] && ~(|FInput2E[51:0]); 
+	assign zinfE = &FInput3E[62:52] && ~(|FInput3E[51:0]);
 
 	// Check if inputs are all zero
 	// Also forces denormalized inputs to zero.
@@ -57,11 +57,11 @@ module special(Input1E, Input2E, Input3E, xzeroE, yzeroE, zzeroE,
 	// to just check if the exponent is zero.
 	
 	// KATHERINE - commented following (21/01/11)
-	// assign xzeroE = ~(|Input1E[62:0]) || xdenormE;
-	// assign yzeroE = ~(|Input2E[62:0]) || ydenormE;
-	// assign zzeroE = ~(|Input3E[62:0]) || zdenormE;
+	// assign xzeroE = ~(|FInput1E[62:0]) || xdenormE;
+	// assign yzeroE = ~(|FInput2E[62:0]) || ydenormE;
+	// assign zzeroE = ~(|FInput3E[62:0]) || zdenormE;
 	// KATHERINE - removed denorm to prevent output logicing zero when computing with a denormalized number
-	assign xzeroE = ~(|Input1E[62:0]);
-	assign yzeroE = ~(|Input2E[62:0]);
-	assign zzeroE = ~(|Input3E[62:0]);
+	assign xzeroE = ~(|FInput1E[62:0]);
+	assign yzeroE = ~(|FInput2E[62:0]);
+	assign zzeroE = ~(|FInput3E[62:0]);
  endmodule
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index cb4a60a8..eab0885d 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -100,7 +100,7 @@ module wallypipelinedhart (
   logic       FStallD;
   logic       FWriteIntW, FWriteIntM;
   logic [31:0]      FSROutW;
-  logic             DivSqrtDoneE;
+  logic             FDivSqrtDoneM;
   logic             IllegalFPUInstrD, IllegalFPUInstrE;
   logic [`XLEN-1:0] FPUResultW;
 

From 77260643ebe6f9fa6e0e02dfe083659a056cd94a Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Wed, 26 May 2021 09:12:37 -0500
Subject: [PATCH 05/14] Add regression test for fpadd

---
 wally-pipelined/src/fpu/fpadd/adder.v         | 758 ++++++++++++++++++
 wally-pipelined/src/fpu/fpadd/cla52.v         | 202 +++++
 wally-pipelined/src/fpu/fpadd/cla64.v         | 420 ++++++++++
 .../src/fpu/fpadd/convert_inputs.v            |  61 ++
 wally-pipelined/src/fpu/fpadd/exception.v     | 120 +++
 wally-pipelined/src/fpu/fpadd/f32_add_rd.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f32_add_rne.do  |  56 ++
 wally-pipelined/src/fpu/fpadd/f32_add_ru.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f32_add_rz.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f32_f64_rne.do  |  56 ++
 wally-pipelined/src/fpu/fpadd/f32_sub_rd.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f32_sub_rne.do  |  56 ++
 wally-pipelined/src/fpu/fpadd/f32_sub_ru.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f32_sub_rz.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f64_add_rd.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f64_add_rne.do  |  56 ++
 wally-pipelined/src/fpu/fpadd/f64_add_ru.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f64_add_rz.do   |  58 ++
 wally-pipelined/src/fpu/fpadd/f64_f32_rne.do  |  56 ++
 wally-pipelined/src/fpu/fpadd/f64_sub_rd.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f64_sub_rne.do  |  56 ++
 wally-pipelined/src/fpu/fpadd/f64_sub_ru.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/f64_sub_rz.do   |  56 ++
 wally-pipelined/src/fpu/fpadd/fpadd.v         | 216 +++++
 wally-pipelined/src/fpu/fpadd/lzd.v           | 137 ++++
 wally-pipelined/src/fpu/fpadd/rounder.v       | 214 +++++
 wally-pipelined/src/fpu/fpadd/shifter.v       | 119 +++
 wally-pipelined/src/fpu/fpadd/tb.v            |  86 ++
 .../src/fpu/fpadd/tb_f32_add_rd.sv            |  79 ++
 .../src/fpu/fpadd/tb_f32_add_rne.sv           |  79 ++
 .../src/fpu/fpadd/tb_f32_add_ru.sv            |  79 ++
 .../src/fpu/fpadd/tb_f32_add_rz.sv            |  79 ++
 .../src/fpu/fpadd/tb_f32_f64_rne.sv           |  75 ++
 .../src/fpu/fpadd/tb_f32_sub_rd.sv            |  79 ++
 .../src/fpu/fpadd/tb_f32_sub_rne.sv           |  79 ++
 .../src/fpu/fpadd/tb_f32_sub_ru.sv            |  79 ++
 .../src/fpu/fpadd/tb_f32_sub_rz.sv            |  79 ++
 .../src/fpu/fpadd/tb_f64_add_rd.sv            |  78 ++
 .../src/fpu/fpadd/tb_f64_add_rne.sv           |  78 ++
 .../src/fpu/fpadd/tb_f64_add_ru.sv            |  78 ++
 .../src/fpu/fpadd/tb_f64_add_rz.sv            |  78 ++
 .../src/fpu/fpadd/tb_f64_f32_rne.sv           |  79 ++
 .../src/fpu/fpadd/tb_f64_sub_rd.sv            |  78 ++
 .../src/fpu/fpadd/tb_f64_sub_rne.sv           |  78 ++
 .../src/fpu/fpadd/tb_f64_sub_ru.sv            |  78 ++
 .../src/fpu/fpadd/tb_f64_sub_rz.sv            |  78 ++
 46 files changed, 4753 insertions(+)
 create mode 100755 wally-pipelined/src/fpu/fpadd/adder.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/cla52.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/cla64.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/convert_inputs.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/exception.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_add_rd.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_add_rne.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_add_ru.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_add_rz.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_f64_rne.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_sub_rd.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_sub_rne.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_sub_ru.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f32_sub_rz.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_add_rd.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_add_rne.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_add_ru.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_add_rz.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_f32_rne.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_sub_rd.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_sub_rne.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_sub_ru.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/f64_sub_rz.do
 create mode 100755 wally-pipelined/src/fpu/fpadd/fpadd.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/lzd.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/rounder.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/shifter.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb.v
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_add_rd.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_add_rne.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_add_ru.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_add_rz.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_f64_rne.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_sub_rd.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_sub_rne.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_sub_ru.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f32_sub_rz.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_add_rd.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_add_rne.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_add_ru.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_add_rz.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_f32_rne.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_sub_rd.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_sub_rne.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_sub_ru.sv
 create mode 100755 wally-pipelined/src/fpu/fpadd/tb_f64_sub_rz.sv

diff --git a/wally-pipelined/src/fpu/fpadd/adder.v b/wally-pipelined/src/fpu/fpadd/adder.v
new file mode 100755
index 00000000..3d4124af
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/adder.v
@@ -0,0 +1,758 @@
+// The following modules make up the basic building blocks that
+// are used by the cla64, cla_sub64, and cla52.
+
+// INVBLOCK: one-bit inverter; GOUT is the logical complement of GIN.
+module INVBLOCK (
+   input  wire GIN,   // signal to invert
+   output wire GOUT   // complement of GIN
+);
+   // Built-in gate primitive instead of a continuous assignment; same truth table.
+   not (GOUT, GIN);
+endmodule // INVBLOCK
+
+
+// XXOR1: three-input sum cell; SUM = (A XNOR B) XOR GIN.
+module XXOR1 (
+   input  wire A,     // first operand bit
+   input  wire B,     // second operand bit
+   input  wire GIN,   // third XOR input
+   output wire SUM    // (A XNOR B) XOR GIN
+);
+   wire ab_xnor = A ~^ B;          // same value as ~(A ^ B)
+   assign SUM   = ab_xnor ^ GIN;
+endmodule // XXOR1
+
+
+// BLOCK0: per-bit cell that outputs the NOR and the NAND of A and B.
+module BLOCK0 (
+   input  wire A,      // operand bit
+   input  wire B,      // operand bit
+   output wire POUT,   // NOR(A, B)
+   output wire GOUT    // NAND(A, B)
+);
+   // De Morgan forms of ~(A | B) and ~(A & B).
+   assign POUT = ~A & ~B;
+   assign GOUT = ~A | ~B;
+endmodule // BLOCK0
+
+
+// BLOCK1: merge cell; POUT = NOR(PIN1, PIN2), GOUT = ~(GIN2 & (PIN2 | GIN1)).
+module BLOCK1 (
+   input  wire PIN1,   // first P input
+   input  wire PIN2,   // second P input (also feeds GOUT)
+   input  wire GIN1,   // first G input
+   input  wire GIN2,   // second G input
+   output wire POUT,   // NOR of the two P inputs
+   output wire GOUT    // OR-AND-invert of (PIN2 | GIN1) with GIN2
+);
+   wire p2_or_g1 = PIN2 | GIN1;          // inner OR term
+   assign POUT   = ~PIN1 & ~PIN2;        // De Morgan form of ~(PIN1 | PIN2)
+   assign GOUT   = ~(GIN2 & p2_or_g1);
+endmodule // BLOCK1
+
+
+// BLOCK2: merge cell; POUT = NAND(PIN1, PIN2), GOUT = ~(GIN2 | (PIN2 & GIN1)).
+module BLOCK2 (
+   input  wire PIN1,   // first P input
+   input  wire PIN2,   // second P input (also feeds GOUT)
+   input  wire GIN1,   // first G input
+   input  wire GIN2,   // second G input
+   output wire POUT,   // NAND of the two P inputs
+   output wire GOUT    // AND-OR-invert of (PIN2 & GIN1) with GIN2
+);
+   wire p2_and_g1 = PIN2 & GIN1;         // inner AND term
+   assign POUT    = ~PIN1 | ~PIN2;       // De Morgan form of ~(PIN1 & PIN2)
+   assign GOUT    = ~(GIN2 | p2_and_g1);
+endmodule // BLOCK2
+
+
+// BLOCK1A: G-only variant of BLOCK1; GOUT = ~(GIN2 & (PIN2 | GIN1)), no POUT.
+module BLOCK1A (
+   input  wire PIN2,   // P input ORed with GIN1
+   input  wire GIN1,   // first G input
+   input  wire GIN2,   // second G input
+   output wire GOUT    // OR-AND-invert result
+);
+   wire p2_or_g1 = PIN2 | GIN1;          // inner OR term
+   assign GOUT   = ~(GIN2 & p2_or_g1);
+endmodule // BLOCK1A
+
+
+// BLOCK2A: G-only variant of BLOCK2; GOUT = ~(GIN2 | (PIN2 & GIN1)), no POUT.
+module BLOCK2A (
+   input  wire PIN2,   // P input ANDed with GIN1
+   input  wire GIN1,   // first G input
+   input  wire GIN2,   // second G input
+   output wire GOUT    // AND-OR-invert result
+);
+   wire p2_and_g1 = PIN2 & GIN1;         // inner AND term
+   assign GOUT    = ~(GIN2 | p2_and_g1);
+endmodule // BLOCK2A
+
+module PRESTAGE_64 ( A, B, CIN, POUT, GOUT );
+   // First stage: one BLOCK0 per bit turns A[i]/B[i] into POUT[i] and GOUT[i+1]; GOUT[0] is CIN inverted.
+   input  [0:63] A;
+   input [0:63]  B;
+   input 	 CIN;
+   // POUT has 64 entries; GOUT has 65 because entry 0 is reserved for the inverted CIN.
+   output [0:63] POUT;
+   output [0:64] GOUT;
+   // BLOCK0 yields POUT = NOR(A,B) and GOUT = NAND(A,B); note the +1 offset on the GOUT index.
+   BLOCK0 U10 (A[0] , B[0] , POUT[0] , GOUT[1] );
+   BLOCK0 U11 (A[1] , B[1] , POUT[1] , GOUT[2] );
+   BLOCK0 U12 (A[2] , B[2] , POUT[2] , GOUT[3] );
+   BLOCK0 U13 (A[3] , B[3] , POUT[3] , GOUT[4] );
+   BLOCK0 U14 (A[4] , B[4] , POUT[4] , GOUT[5] );
+   BLOCK0 U15 (A[5] , B[5] , POUT[5] , GOUT[6] );
+   BLOCK0 U16 (A[6] , B[6] , POUT[6] , GOUT[7] );
+   BLOCK0 U17 (A[7] , B[7] , POUT[7] , GOUT[8] );
+   BLOCK0 U18 (A[8] , B[8] , POUT[8] , GOUT[9] );
+   BLOCK0 U19 (A[9] , B[9] , POUT[9] , GOUT[10] );
+   BLOCK0 U110 (A[10] , B[10] , POUT[10] , GOUT[11] );
+   BLOCK0 U111 (A[11] , B[11] , POUT[11] , GOUT[12] );
+   BLOCK0 U112 (A[12] , B[12] , POUT[12] , GOUT[13] );
+   BLOCK0 U113 (A[13] , B[13] , POUT[13] , GOUT[14] );
+   BLOCK0 U114 (A[14] , B[14] , POUT[14] , GOUT[15] );
+   BLOCK0 U115 (A[15] , B[15] , POUT[15] , GOUT[16] );
+   BLOCK0 U116 (A[16] , B[16] , POUT[16] , GOUT[17] );
+   BLOCK0 U117 (A[17] , B[17] , POUT[17] , GOUT[18] );
+   BLOCK0 U118 (A[18] , B[18] , POUT[18] , GOUT[19] );
+   BLOCK0 U119 (A[19] , B[19] , POUT[19] , GOUT[20] );
+   BLOCK0 U120 (A[20] , B[20] , POUT[20] , GOUT[21] );
+   BLOCK0 U121 (A[21] , B[21] , POUT[21] , GOUT[22] );
+   BLOCK0 U122 (A[22] , B[22] , POUT[22] , GOUT[23] );
+   BLOCK0 U123 (A[23] , B[23] , POUT[23] , GOUT[24] );
+   BLOCK0 U124 (A[24] , B[24] , POUT[24] , GOUT[25] );
+   BLOCK0 U125 (A[25] , B[25] , POUT[25] , GOUT[26] );
+   BLOCK0 U126 (A[26] , B[26] , POUT[26] , GOUT[27] );
+   BLOCK0 U127 (A[27] , B[27] , POUT[27] , GOUT[28] );
+   BLOCK0 U128 (A[28] , B[28] , POUT[28] , GOUT[29] );
+   BLOCK0 U129 (A[29] , B[29] , POUT[29] , GOUT[30] );
+   BLOCK0 U130 (A[30] , B[30] , POUT[30] , GOUT[31] );
+   BLOCK0 U131 (A[31] , B[31] , POUT[31] , GOUT[32] );
+   BLOCK0 U132 (A[32] , B[32] , POUT[32] , GOUT[33] );
+   BLOCK0 U133 (A[33] , B[33] , POUT[33] , GOUT[34] );
+   BLOCK0 U134 (A[34] , B[34] , POUT[34] , GOUT[35] );
+   BLOCK0 U135 (A[35] , B[35] , POUT[35] , GOUT[36] );
+   BLOCK0 U136 (A[36] , B[36] , POUT[36] , GOUT[37] );
+   BLOCK0 U137 (A[37] , B[37] , POUT[37] , GOUT[38] );
+   BLOCK0 U138 (A[38] , B[38] , POUT[38] , GOUT[39] );
+   BLOCK0 U139 (A[39] , B[39] , POUT[39] , GOUT[40] );
+   BLOCK0 U140 (A[40] , B[40] , POUT[40] , GOUT[41] );
+   BLOCK0 U141 (A[41] , B[41] , POUT[41] , GOUT[42] );
+   BLOCK0 U142 (A[42] , B[42] , POUT[42] , GOUT[43] );
+   BLOCK0 U143 (A[43] , B[43] , POUT[43] , GOUT[44] );
+   BLOCK0 U144 (A[44] , B[44] , POUT[44] , GOUT[45] );
+   BLOCK0 U145 (A[45] , B[45] , POUT[45] , GOUT[46] );
+   BLOCK0 U146 (A[46] , B[46] , POUT[46] , GOUT[47] );
+   BLOCK0 U147 (A[47] , B[47] , POUT[47] , GOUT[48] );
+   BLOCK0 U148 (A[48] , B[48] , POUT[48] , GOUT[49] );
+   BLOCK0 U149 (A[49] , B[49] , POUT[49] , GOUT[50] );
+   BLOCK0 U150 (A[50] , B[50] , POUT[50] , GOUT[51] );
+   BLOCK0 U151 (A[51] , B[51] , POUT[51] , GOUT[52] );
+   BLOCK0 U152 (A[52] , B[52] , POUT[52] , GOUT[53] );
+   BLOCK0 U153 (A[53] , B[53] , POUT[53] , GOUT[54] );
+   BLOCK0 U154 (A[54] , B[54] , POUT[54] , GOUT[55] );
+   BLOCK0 U155 (A[55] , B[55] , POUT[55] , GOUT[56] );
+   BLOCK0 U156 (A[56] , B[56] , POUT[56] , GOUT[57] );
+   BLOCK0 U157 (A[57] , B[57] , POUT[57] , GOUT[58] );
+   BLOCK0 U158 (A[58] , B[58] , POUT[58] , GOUT[59] );
+   BLOCK0 U159 (A[59] , B[59] , POUT[59] , GOUT[60] );
+   BLOCK0 U160 (A[60] , B[60] , POUT[60] , GOUT[61] );
+   BLOCK0 U161 (A[61] , B[61] , POUT[61] , GOUT[62] );
+   BLOCK0 U162 (A[62] , B[62] , POUT[62] , GOUT[63] );
+   BLOCK0 U163 (A[63] , B[63] , POUT[63] , GOUT[64] );
+   INVBLOCK U2 (CIN , GOUT[0] ); // GOUT[0] = ~CIN
+   // NOTE(review): outputs look like inverted propagate/generate terms for the carry tree -- confirm against the cla64 wrapper.
+endmodule // PRESTAGE_64
+
+
+module DBLC_0_64 ( PIN, GIN, POUT, GOUT );
+   // Level 0 (index distance 1): GOUT[k] is built from GIN[k], PIN[k-1] and GIN[k-1]; POUT[j] from PIN[j] and PIN[j+1].
+   input  [0:63] PIN;
+   input [0:64]  GIN;
+   // POUT has one entry fewer than PIN (63 vs 64); GOUT keeps all 65 entries.
+   output [0:62] POUT;
+   output [0:64] GOUT;
+   // GOUT[0] is GIN[0] inverted; GOUT[1] uses BLOCK1A (no POUT); GOUT[2..64] use full BLOCK1 cells.
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   BLOCK1A U21 (PIN[0] , GIN[0] , GIN[1] , GOUT[1] );
+   BLOCK1 U32 (PIN[0] , PIN[1] , GIN[1] , GIN[2] , POUT[0] , GOUT[2] );
+   BLOCK1 U33 (PIN[1] , PIN[2] , GIN[2] , GIN[3] , POUT[1] , GOUT[3] );
+   BLOCK1 U34 (PIN[2] , PIN[3] , GIN[3] , GIN[4] , POUT[2] , GOUT[4] );
+   BLOCK1 U35 (PIN[3] , PIN[4] , GIN[4] , GIN[5] , POUT[3] , GOUT[5] );
+   BLOCK1 U36 (PIN[4] , PIN[5] , GIN[5] , GIN[6] , POUT[4] , GOUT[6] );
+   BLOCK1 U37 (PIN[5] , PIN[6] , GIN[6] , GIN[7] , POUT[5] , GOUT[7] );
+   BLOCK1 U38 (PIN[6] , PIN[7] , GIN[7] , GIN[8] , POUT[6] , GOUT[8] );
+   BLOCK1 U39 (PIN[7] , PIN[8] , GIN[8] , GIN[9] , POUT[7] , GOUT[9] );
+   BLOCK1 U310 (PIN[8] , PIN[9] , GIN[9] , GIN[10] , POUT[8] , GOUT[10] );
+   BLOCK1 U311 (PIN[9] , PIN[10] , GIN[10] , GIN[11] , POUT[9] , GOUT[11] );
+   BLOCK1 U312 (PIN[10] , PIN[11] , GIN[11] , GIN[12] , POUT[10] , GOUT[12] );
+   BLOCK1 U313 (PIN[11] , PIN[12] , GIN[12] , GIN[13] , POUT[11] , GOUT[13] );
+   BLOCK1 U314 (PIN[12] , PIN[13] , GIN[13] , GIN[14] , POUT[12] , GOUT[14] );
+   BLOCK1 U315 (PIN[13] , PIN[14] , GIN[14] , GIN[15] , POUT[13] , GOUT[15] );
+   BLOCK1 U316 (PIN[14] , PIN[15] , GIN[15] , GIN[16] , POUT[14] , GOUT[16] );
+   BLOCK1 U317 (PIN[15] , PIN[16] , GIN[16] , GIN[17] , POUT[15] , GOUT[17] );
+   BLOCK1 U318 (PIN[16] , PIN[17] , GIN[17] , GIN[18] , POUT[16] , GOUT[18] );
+   BLOCK1 U319 (PIN[17] , PIN[18] , GIN[18] , GIN[19] , POUT[17] , GOUT[19] );
+   BLOCK1 U320 (PIN[18] , PIN[19] , GIN[19] , GIN[20] , POUT[18] , GOUT[20] );
+   BLOCK1 U321 (PIN[19] , PIN[20] , GIN[20] , GIN[21] , POUT[19] , GOUT[21] );
+   BLOCK1 U322 (PIN[20] , PIN[21] , GIN[21] , GIN[22] , POUT[20] , GOUT[22] );
+   BLOCK1 U323 (PIN[21] , PIN[22] , GIN[22] , GIN[23] , POUT[21] , GOUT[23] );
+   BLOCK1 U324 (PIN[22] , PIN[23] , GIN[23] , GIN[24] , POUT[22] , GOUT[24] );
+   BLOCK1 U325 (PIN[23] , PIN[24] , GIN[24] , GIN[25] , POUT[23] , GOUT[25] );
+   BLOCK1 U326 (PIN[24] , PIN[25] , GIN[25] , GIN[26] , POUT[24] , GOUT[26] );
+   BLOCK1 U327 (PIN[25] , PIN[26] , GIN[26] , GIN[27] , POUT[25] , GOUT[27] );
+   BLOCK1 U328 (PIN[26] , PIN[27] , GIN[27] , GIN[28] , POUT[26] , GOUT[28] );
+   BLOCK1 U329 (PIN[27] , PIN[28] , GIN[28] , GIN[29] , POUT[27] , GOUT[29] );
+   BLOCK1 U330 (PIN[28] , PIN[29] , GIN[29] , GIN[30] , POUT[28] , GOUT[30] );
+   BLOCK1 U331 (PIN[29] , PIN[30] , GIN[30] , GIN[31] , POUT[29] , GOUT[31] );
+   BLOCK1 U332 (PIN[30] , PIN[31] , GIN[31] , GIN[32] , POUT[30] , GOUT[32] );
+   BLOCK1 U333 (PIN[31] , PIN[32] , GIN[32] , GIN[33] , POUT[31] , GOUT[33] );
+   BLOCK1 U334 (PIN[32] , PIN[33] , GIN[33] , GIN[34] , POUT[32] , GOUT[34] );
+   BLOCK1 U335 (PIN[33] , PIN[34] , GIN[34] , GIN[35] , POUT[33] , GOUT[35] );
+   BLOCK1 U336 (PIN[34] , PIN[35] , GIN[35] , GIN[36] , POUT[34] , GOUT[36] );
+   BLOCK1 U337 (PIN[35] , PIN[36] , GIN[36] , GIN[37] , POUT[35] , GOUT[37] );
+   BLOCK1 U338 (PIN[36] , PIN[37] , GIN[37] , GIN[38] , POUT[36] , GOUT[38] );
+   BLOCK1 U339 (PIN[37] , PIN[38] , GIN[38] , GIN[39] , POUT[37] , GOUT[39] );
+   BLOCK1 U340 (PIN[38] , PIN[39] , GIN[39] , GIN[40] , POUT[38] , GOUT[40] );
+   BLOCK1 U341 (PIN[39] , PIN[40] , GIN[40] , GIN[41] , POUT[39] , GOUT[41] );
+   BLOCK1 U342 (PIN[40] , PIN[41] , GIN[41] , GIN[42] , POUT[40] , GOUT[42] );
+   BLOCK1 U343 (PIN[41] , PIN[42] , GIN[42] , GIN[43] , POUT[41] , GOUT[43] );
+   BLOCK1 U344 (PIN[42] , PIN[43] , GIN[43] , GIN[44] , POUT[42] , GOUT[44] );
+   BLOCK1 U345 (PIN[43] , PIN[44] , GIN[44] , GIN[45] , POUT[43] , GOUT[45] );
+   BLOCK1 U346 (PIN[44] , PIN[45] , GIN[45] , GIN[46] , POUT[44] , GOUT[46] );
+   BLOCK1 U347 (PIN[45] , PIN[46] , GIN[46] , GIN[47] , POUT[45] , GOUT[47] );
+   BLOCK1 U348 (PIN[46] , PIN[47] , GIN[47] , GIN[48] , POUT[46] , GOUT[48] );
+   BLOCK1 U349 (PIN[47] , PIN[48] , GIN[48] , GIN[49] , POUT[47] , GOUT[49] );
+   BLOCK1 U350 (PIN[48] , PIN[49] , GIN[49] , GIN[50] , POUT[48] , GOUT[50] );
+   BLOCK1 U351 (PIN[49] , PIN[50] , GIN[50] , GIN[51] , POUT[49] , GOUT[51] );
+   BLOCK1 U352 (PIN[50] , PIN[51] , GIN[51] , GIN[52] , POUT[50] , GOUT[52] );
+   BLOCK1 U353 (PIN[51] , PIN[52] , GIN[52] , GIN[53] , POUT[51] , GOUT[53] );
+   BLOCK1 U354 (PIN[52] , PIN[53] , GIN[53] , GIN[54] , POUT[52] , GOUT[54] );
+   BLOCK1 U355 (PIN[53] , PIN[54] , GIN[54] , GIN[55] , POUT[53] , GOUT[55] );
+   BLOCK1 U356 (PIN[54] , PIN[55] , GIN[55] , GIN[56] , POUT[54] , GOUT[56] );
+   BLOCK1 U357 (PIN[55] , PIN[56] , GIN[56] , GIN[57] , POUT[55] , GOUT[57] );
+   BLOCK1 U358 (PIN[56] , PIN[57] , GIN[57] , GIN[58] , POUT[56] , GOUT[58] );
+   BLOCK1 U359 (PIN[57] , PIN[58] , GIN[58] , GIN[59] , POUT[57] , GOUT[59] );
+   BLOCK1 U360 (PIN[58] , PIN[59] , GIN[59] , GIN[60] , POUT[58] , GOUT[60] );
+   BLOCK1 U361 (PIN[59] , PIN[60] , GIN[60] , GIN[61] , POUT[59] , GOUT[61] );
+   BLOCK1 U362 (PIN[60] , PIN[61] , GIN[61] , GIN[62] , POUT[60] , GOUT[62] );
+   BLOCK1 U363 (PIN[61] , PIN[62] , GIN[62] , GIN[63] , POUT[61] , GOUT[63] );
+   BLOCK1 U364 (PIN[62] , PIN[63] , GIN[63] , GIN[64] , POUT[62] , GOUT[64] );
+   // NOTE(review): every output goes through an inverting cell, so signal polarity presumably flips at each level -- confirm.
+endmodule // DBLC_0_64
+
+
+module DBLC_1_64 ( PIN, GIN, POUT, GOUT );
+   // Level 1 (index distance 2): GOUT[k] is built from GIN[k], PIN[k-2] and GIN[k-2]; POUT[j] from PIN[j] and PIN[j+2].
+   input  [0:62] PIN;
+   input [0:64]  GIN;
+   // POUT has two entries fewer than PIN (61 vs 63); GOUT keeps all 65 entries.
+   output [0:60] POUT;
+   output [0:64] GOUT;
+   // GOUT[0..1] are GIN inverted; GOUT[2..3] use BLOCK2A (no POUT); GOUT[4..64] use full BLOCK2 cells.
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   INVBLOCK U11 (GIN[1] , GOUT[1] );
+   BLOCK2A U22 (PIN[0] , GIN[0] , GIN[2] , GOUT[2] );
+   BLOCK2A U23 (PIN[1] , GIN[1] , GIN[3] , GOUT[3] );
+   BLOCK2 U34 (PIN[0] , PIN[2] , GIN[2] , GIN[4] , POUT[0] , GOUT[4] );
+   BLOCK2 U35 (PIN[1] , PIN[3] , GIN[3] , GIN[5] , POUT[1] , GOUT[5] );
+   BLOCK2 U36 (PIN[2] , PIN[4] , GIN[4] , GIN[6] , POUT[2] , GOUT[6] );
+   BLOCK2 U37 (PIN[3] , PIN[5] , GIN[5] , GIN[7] , POUT[3] , GOUT[7] );
+   BLOCK2 U38 (PIN[4] , PIN[6] , GIN[6] , GIN[8] , POUT[4] , GOUT[8] );
+   BLOCK2 U39 (PIN[5] , PIN[7] , GIN[7] , GIN[9] , POUT[5] , GOUT[9] );
+   BLOCK2 U310 (PIN[6] , PIN[8] , GIN[8] , GIN[10] , POUT[6] , GOUT[10] );
+   BLOCK2 U311 (PIN[7] , PIN[9] , GIN[9] , GIN[11] , POUT[7] , GOUT[11] );
+   BLOCK2 U312 (PIN[8] , PIN[10] , GIN[10] , GIN[12] , POUT[8] , GOUT[12] );
+   BLOCK2 U313 (PIN[9] , PIN[11] , GIN[11] , GIN[13] , POUT[9] , GOUT[13] );
+   BLOCK2 U314 (PIN[10] , PIN[12] , GIN[12] , GIN[14] , POUT[10] , GOUT[14] );
+   BLOCK2 U315 (PIN[11] , PIN[13] , GIN[13] , GIN[15] , POUT[11] , GOUT[15] );
+   BLOCK2 U316 (PIN[12] , PIN[14] , GIN[14] , GIN[16] , POUT[12] , GOUT[16] );
+   BLOCK2 U317 (PIN[13] , PIN[15] , GIN[15] , GIN[17] , POUT[13] , GOUT[17] );
+   BLOCK2 U318 (PIN[14] , PIN[16] , GIN[16] , GIN[18] , POUT[14] , GOUT[18] );
+   BLOCK2 U319 (PIN[15] , PIN[17] , GIN[17] , GIN[19] , POUT[15] , GOUT[19] );
+   BLOCK2 U320 (PIN[16] , PIN[18] , GIN[18] , GIN[20] , POUT[16] , GOUT[20] );
+   BLOCK2 U321 (PIN[17] , PIN[19] , GIN[19] , GIN[21] , POUT[17] , GOUT[21] );
+   BLOCK2 U322 (PIN[18] , PIN[20] , GIN[20] , GIN[22] , POUT[18] , GOUT[22] );
+   BLOCK2 U323 (PIN[19] , PIN[21] , GIN[21] , GIN[23] , POUT[19] , GOUT[23] );
+   BLOCK2 U324 (PIN[20] , PIN[22] , GIN[22] , GIN[24] , POUT[20] , GOUT[24] );
+   BLOCK2 U325 (PIN[21] , PIN[23] , GIN[23] , GIN[25] , POUT[21] , GOUT[25] );
+   BLOCK2 U326 (PIN[22] , PIN[24] , GIN[24] , GIN[26] , POUT[22] , GOUT[26] );
+   BLOCK2 U327 (PIN[23] , PIN[25] , GIN[25] , GIN[27] , POUT[23] , GOUT[27] );
+   BLOCK2 U328 (PIN[24] , PIN[26] , GIN[26] , GIN[28] , POUT[24] , GOUT[28] );
+   BLOCK2 U329 (PIN[25] , PIN[27] , GIN[27] , GIN[29] , POUT[25] , GOUT[29] );
+   BLOCK2 U330 (PIN[26] , PIN[28] , GIN[28] , GIN[30] , POUT[26] , GOUT[30] );
+   BLOCK2 U331 (PIN[27] , PIN[29] , GIN[29] , GIN[31] , POUT[27] , GOUT[31] );
+   BLOCK2 U332 (PIN[28] , PIN[30] , GIN[30] , GIN[32] , POUT[28] , GOUT[32] );
+   BLOCK2 U333 (PIN[29] , PIN[31] , GIN[31] , GIN[33] , POUT[29] , GOUT[33] );
+   BLOCK2 U334 (PIN[30] , PIN[32] , GIN[32] , GIN[34] , POUT[30] , GOUT[34] );
+   BLOCK2 U335 (PIN[31] , PIN[33] , GIN[33] , GIN[35] , POUT[31] , GOUT[35] );
+   BLOCK2 U336 (PIN[32] , PIN[34] , GIN[34] , GIN[36] , POUT[32] , GOUT[36] );
+   BLOCK2 U337 (PIN[33] , PIN[35] , GIN[35] , GIN[37] , POUT[33] , GOUT[37] );
+   BLOCK2 U338 (PIN[34] , PIN[36] , GIN[36] , GIN[38] , POUT[34] , GOUT[38] );
+   BLOCK2 U339 (PIN[35] , PIN[37] , GIN[37] , GIN[39] , POUT[35] , GOUT[39] );
+   BLOCK2 U340 (PIN[36] , PIN[38] , GIN[38] , GIN[40] , POUT[36] , GOUT[40] );
+   BLOCK2 U341 (PIN[37] , PIN[39] , GIN[39] , GIN[41] , POUT[37] , GOUT[41] );
+   BLOCK2 U342 (PIN[38] , PIN[40] , GIN[40] , GIN[42] , POUT[38] , GOUT[42] );
+   BLOCK2 U343 (PIN[39] , PIN[41] , GIN[41] , GIN[43] , POUT[39] , GOUT[43] );
+   BLOCK2 U344 (PIN[40] , PIN[42] , GIN[42] , GIN[44] , POUT[40] , GOUT[44] );
+   BLOCK2 U345 (PIN[41] , PIN[43] , GIN[43] , GIN[45] , POUT[41] , GOUT[45] );
+   BLOCK2 U346 (PIN[42] , PIN[44] , GIN[44] , GIN[46] , POUT[42] , GOUT[46] );
+   BLOCK2 U347 (PIN[43] , PIN[45] , GIN[45] , GIN[47] , POUT[43] , GOUT[47] );
+   BLOCK2 U348 (PIN[44] , PIN[46] , GIN[46] , GIN[48] , POUT[44] , GOUT[48] );
+   BLOCK2 U349 (PIN[45] , PIN[47] , GIN[47] , GIN[49] , POUT[45] , GOUT[49] );
+   BLOCK2 U350 (PIN[46] , PIN[48] , GIN[48] , GIN[50] , POUT[46] , GOUT[50] );
+   BLOCK2 U351 (PIN[47] , PIN[49] , GIN[49] , GIN[51] , POUT[47] , GOUT[51] );
+   BLOCK2 U352 (PIN[48] , PIN[50] , GIN[50] , GIN[52] , POUT[48] , GOUT[52] );
+   BLOCK2 U353 (PIN[49] , PIN[51] , GIN[51] , GIN[53] , POUT[49] , GOUT[53] );
+   BLOCK2 U354 (PIN[50] , PIN[52] , GIN[52] , GIN[54] , POUT[50] , GOUT[54] );
+   BLOCK2 U355 (PIN[51] , PIN[53] , GIN[53] , GIN[55] , POUT[51] , GOUT[55] );
+   BLOCK2 U356 (PIN[52] , PIN[54] , GIN[54] , GIN[56] , POUT[52] , GOUT[56] );
+   BLOCK2 U357 (PIN[53] , PIN[55] , GIN[55] , GIN[57] , POUT[53] , GOUT[57] );
+   BLOCK2 U358 (PIN[54] , PIN[56] , GIN[56] , GIN[58] , POUT[54] , GOUT[58] );
+   BLOCK2 U359 (PIN[55] , PIN[57] , GIN[57] , GIN[59] , POUT[55] , GOUT[59] );
+   BLOCK2 U360 (PIN[56] , PIN[58] , GIN[58] , GIN[60] , POUT[56] , GOUT[60] );
+   BLOCK2 U361 (PIN[57] , PIN[59] , GIN[59] , GIN[61] , POUT[57] , GOUT[61] );
+   BLOCK2 U362 (PIN[58] , PIN[60] , GIN[60] , GIN[62] , POUT[58] , GOUT[62] );
+   BLOCK2 U363 (PIN[59] , PIN[61] , GIN[61] , GIN[63] , POUT[59] , GOUT[63] );
+   BLOCK2 U364 (PIN[60] , PIN[62] , GIN[62] , GIN[64] , POUT[60] , GOUT[64] );
+   // NOTE(review): uses the BLOCK2 family where DBLC_0_64 uses BLOCK1, presumably to match the flipped input polarity -- confirm.
+endmodule // DBLC_1_64
+
+
+module DBLC_2_64 ( PIN, GIN, POUT, GOUT );
+   // Level 2 (index distance 4): GOUT[k] is built from GIN[k], PIN[k-4] and GIN[k-4]; POUT[j] from PIN[j] and PIN[j+4].
+   input  [0:60] PIN;
+   input [0:64]  GIN;
+   // POUT has four entries fewer than PIN (57 vs 61); GOUT keeps all 65 entries.
+   output [0:56] POUT;
+   output [0:64] GOUT;
+   // GOUT[0..3] are GIN inverted; GOUT[4..7] use BLOCK1A (no POUT); GOUT[8..64] use full BLOCK1 cells.
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   INVBLOCK U11 (GIN[1] , GOUT[1] );
+   INVBLOCK U12 (GIN[2] , GOUT[2] );
+   INVBLOCK U13 (GIN[3] , GOUT[3] );
+   BLOCK1A U24 (PIN[0] , GIN[0] , GIN[4] , GOUT[4] );
+   BLOCK1A U25 (PIN[1] , GIN[1] , GIN[5] , GOUT[5] );
+   BLOCK1A U26 (PIN[2] , GIN[2] , GIN[6] , GOUT[6] );
+   BLOCK1A U27 (PIN[3] , GIN[3] , GIN[7] , GOUT[7] );
+   BLOCK1 U38 (PIN[0] , PIN[4] , GIN[4] , GIN[8] , POUT[0] , GOUT[8] );
+   BLOCK1 U39 (PIN[1] , PIN[5] , GIN[5] , GIN[9] , POUT[1] , GOUT[9] );
+   BLOCK1 U310 (PIN[2] , PIN[6] , GIN[6] , GIN[10] , POUT[2] , GOUT[10] );
+   BLOCK1 U311 (PIN[3] , PIN[7] , GIN[7] , GIN[11] , POUT[3] , GOUT[11] );
+   BLOCK1 U312 (PIN[4] , PIN[8] , GIN[8] , GIN[12] , POUT[4] , GOUT[12] );
+   BLOCK1 U313 (PIN[5] , PIN[9] , GIN[9] , GIN[13] , POUT[5] , GOUT[13] );
+   BLOCK1 U314 (PIN[6] , PIN[10] , GIN[10] , GIN[14] , POUT[6] , GOUT[14] );
+   BLOCK1 U315 (PIN[7] , PIN[11] , GIN[11] , GIN[15] , POUT[7] , GOUT[15] );
+   BLOCK1 U316 (PIN[8] , PIN[12] , GIN[12] , GIN[16] , POUT[8] , GOUT[16] );
+   BLOCK1 U317 (PIN[9] , PIN[13] , GIN[13] , GIN[17] , POUT[9] , GOUT[17] );
+   BLOCK1 U318 (PIN[10] , PIN[14] , GIN[14] , GIN[18] , POUT[10] , GOUT[18] );
+   BLOCK1 U319 (PIN[11] , PIN[15] , GIN[15] , GIN[19] , POUT[11] , GOUT[19] );
+   BLOCK1 U320 (PIN[12] , PIN[16] , GIN[16] , GIN[20] , POUT[12] , GOUT[20] );
+   BLOCK1 U321 (PIN[13] , PIN[17] , GIN[17] , GIN[21] , POUT[13] , GOUT[21] );
+   BLOCK1 U322 (PIN[14] , PIN[18] , GIN[18] , GIN[22] , POUT[14] , GOUT[22] );
+   BLOCK1 U323 (PIN[15] , PIN[19] , GIN[19] , GIN[23] , POUT[15] , GOUT[23] );
+   BLOCK1 U324 (PIN[16] , PIN[20] , GIN[20] , GIN[24] , POUT[16] , GOUT[24] );
+   BLOCK1 U325 (PIN[17] , PIN[21] , GIN[21] , GIN[25] , POUT[17] , GOUT[25] );
+   BLOCK1 U326 (PIN[18] , PIN[22] , GIN[22] , GIN[26] , POUT[18] , GOUT[26] );
+   BLOCK1 U327 (PIN[19] , PIN[23] , GIN[23] , GIN[27] , POUT[19] , GOUT[27] );
+   BLOCK1 U328 (PIN[20] , PIN[24] , GIN[24] , GIN[28] , POUT[20] , GOUT[28] );
+   BLOCK1 U329 (PIN[21] , PIN[25] , GIN[25] , GIN[29] , POUT[21] , GOUT[29] );
+   BLOCK1 U330 (PIN[22] , PIN[26] , GIN[26] , GIN[30] , POUT[22] , GOUT[30] );
+   BLOCK1 U331 (PIN[23] , PIN[27] , GIN[27] , GIN[31] , POUT[23] , GOUT[31] );
+   BLOCK1 U332 (PIN[24] , PIN[28] , GIN[28] , GIN[32] , POUT[24] , GOUT[32] );
+   BLOCK1 U333 (PIN[25] , PIN[29] , GIN[29] , GIN[33] , POUT[25] , GOUT[33] );
+   BLOCK1 U334 (PIN[26] , PIN[30] , GIN[30] , GIN[34] , POUT[26] , GOUT[34] );
+   BLOCK1 U335 (PIN[27] , PIN[31] , GIN[31] , GIN[35] , POUT[27] , GOUT[35] );
+   BLOCK1 U336 (PIN[28] , PIN[32] , GIN[32] , GIN[36] , POUT[28] , GOUT[36] );
+   BLOCK1 U337 (PIN[29] , PIN[33] , GIN[33] , GIN[37] , POUT[29] , GOUT[37] );
+   BLOCK1 U338 (PIN[30] , PIN[34] , GIN[34] , GIN[38] , POUT[30] , GOUT[38] );
+   BLOCK1 U339 (PIN[31] , PIN[35] , GIN[35] , GIN[39] , POUT[31] , GOUT[39] );
+   BLOCK1 U340 (PIN[32] , PIN[36] , GIN[36] , GIN[40] , POUT[32] , GOUT[40] );
+   BLOCK1 U341 (PIN[33] , PIN[37] , GIN[37] , GIN[41] , POUT[33] , GOUT[41] );
+   BLOCK1 U342 (PIN[34] , PIN[38] , GIN[38] , GIN[42] , POUT[34] , GOUT[42] );
+   BLOCK1 U343 (PIN[35] , PIN[39] , GIN[39] , GIN[43] , POUT[35] , GOUT[43] );
+   BLOCK1 U344 (PIN[36] , PIN[40] , GIN[40] , GIN[44] , POUT[36] , GOUT[44] );
+   BLOCK1 U345 (PIN[37] , PIN[41] , GIN[41] , GIN[45] , POUT[37] , GOUT[45] );
+   BLOCK1 U346 (PIN[38] , PIN[42] , GIN[42] , GIN[46] , POUT[38] , GOUT[46] );
+   BLOCK1 U347 (PIN[39] , PIN[43] , GIN[43] , GIN[47] , POUT[39] , GOUT[47] );
+   BLOCK1 U348 (PIN[40] , PIN[44] , GIN[44] , GIN[48] , POUT[40] , GOUT[48] );
+   BLOCK1 U349 (PIN[41] , PIN[45] , GIN[45] , GIN[49] , POUT[41] , GOUT[49] );
+   BLOCK1 U350 (PIN[42] , PIN[46] , GIN[46] , GIN[50] , POUT[42] , GOUT[50] );
+   BLOCK1 U351 (PIN[43] , PIN[47] , GIN[47] , GIN[51] , POUT[43] , GOUT[51] );
+   BLOCK1 U352 (PIN[44] , PIN[48] , GIN[48] , GIN[52] , POUT[44] , GOUT[52] );
+   BLOCK1 U353 (PIN[45] , PIN[49] , GIN[49] , GIN[53] , POUT[45] , GOUT[53] );
+   BLOCK1 U354 (PIN[46] , PIN[50] , GIN[50] , GIN[54] , POUT[46] , GOUT[54] );
+   BLOCK1 U355 (PIN[47] , PIN[51] , GIN[51] , GIN[55] , POUT[47] , GOUT[55] );
+   BLOCK1 U356 (PIN[48] , PIN[52] , GIN[52] , GIN[56] , POUT[48] , GOUT[56] );
+   BLOCK1 U357 (PIN[49] , PIN[53] , GIN[53] , GIN[57] , POUT[49] , GOUT[57] );
+   BLOCK1 U358 (PIN[50] , PIN[54] , GIN[54] , GIN[58] , POUT[50] , GOUT[58] );
+   BLOCK1 U359 (PIN[51] , PIN[55] , GIN[55] , GIN[59] , POUT[51] , GOUT[59] );
+   BLOCK1 U360 (PIN[52] , PIN[56] , GIN[56] , GIN[60] , POUT[52] , GOUT[60] );
+   BLOCK1 U361 (PIN[53] , PIN[57] , GIN[57] , GIN[61] , POUT[53] , GOUT[61] );
+   BLOCK1 U362 (PIN[54] , PIN[58] , GIN[58] , GIN[62] , POUT[54] , GOUT[62] );
+   BLOCK1 U363 (PIN[55] , PIN[59] , GIN[59] , GIN[63] , POUT[55] , GOUT[63] );
+   BLOCK1 U364 (PIN[56] , PIN[60] , GIN[60] , GIN[64] , POUT[56] , GOUT[64] );
+   // NOTE(review): back to the BLOCK1 family, as in DBLC_0_64, presumably because polarity has flipped twice -- confirm.
+endmodule // DBLC_2_64
+
+
+module DBLC_3_64 ( PIN, GIN, POUT, GOUT );
+   // Level 3 (index distance 8): GOUT[k] is built from GIN[k], PIN[k-8] and GIN[k-8]; POUT[j] from PIN[j] and PIN[j+8].
+   input  [0:56] PIN;
+   input [0:64]  GIN;
+   // POUT has eight entries fewer than PIN (49 vs 57); GOUT keeps all 65 entries.
+   output [0:48] POUT;
+   output [0:64] GOUT;
+   // GOUT[0..7] are GIN inverted; GOUT[8..15] use BLOCK2A (no POUT); GOUT[16..64] use full BLOCK2 cells.
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   INVBLOCK U11 (GIN[1] , GOUT[1] );
+   INVBLOCK U12 (GIN[2] , GOUT[2] );
+   INVBLOCK U13 (GIN[3] , GOUT[3] );
+   INVBLOCK U14 (GIN[4] , GOUT[4] );
+   INVBLOCK U15 (GIN[5] , GOUT[5] );
+   INVBLOCK U16 (GIN[6] , GOUT[6] );
+   INVBLOCK U17 (GIN[7] , GOUT[7] );
+   BLOCK2A U28 (PIN[0] , GIN[0] , GIN[8] , GOUT[8] );
+   BLOCK2A U29 (PIN[1] , GIN[1] , GIN[9] , GOUT[9] );
+   BLOCK2A U210 (PIN[2] , GIN[2] , GIN[10] , GOUT[10] );
+   BLOCK2A U211 (PIN[3] , GIN[3] , GIN[11] , GOUT[11] );
+   BLOCK2A U212 (PIN[4] , GIN[4] , GIN[12] , GOUT[12] );
+   BLOCK2A U213 (PIN[5] , GIN[5] , GIN[13] , GOUT[13] );
+   BLOCK2A U214 (PIN[6] , GIN[6] , GIN[14] , GOUT[14] );
+   BLOCK2A U215 (PIN[7] , GIN[7] , GIN[15] , GOUT[15] );
+   BLOCK2 U316 (PIN[0] , PIN[8] , GIN[8] , GIN[16] , POUT[0] , GOUT[16] );
+   BLOCK2 U317 (PIN[1] , PIN[9] , GIN[9] , GIN[17] , POUT[1] , GOUT[17] );
+   BLOCK2 U318 (PIN[2] , PIN[10] , GIN[10] , GIN[18] , POUT[2] , GOUT[18] );
+   BLOCK2 U319 (PIN[3] , PIN[11] , GIN[11] , GIN[19] , POUT[3] , GOUT[19] );
+   BLOCK2 U320 (PIN[4] , PIN[12] , GIN[12] , GIN[20] , POUT[4] , GOUT[20] );
+   BLOCK2 U321 (PIN[5] , PIN[13] , GIN[13] , GIN[21] , POUT[5] , GOUT[21] );
+   BLOCK2 U322 (PIN[6] , PIN[14] , GIN[14] , GIN[22] , POUT[6] , GOUT[22] );
+   BLOCK2 U323 (PIN[7] , PIN[15] , GIN[15] , GIN[23] , POUT[7] , GOUT[23] );
+   BLOCK2 U324 (PIN[8] , PIN[16] , GIN[16] , GIN[24] , POUT[8] , GOUT[24] );
+   BLOCK2 U325 (PIN[9] , PIN[17] , GIN[17] , GIN[25] , POUT[9] , GOUT[25] );
+   BLOCK2 U326 (PIN[10] , PIN[18] , GIN[18] , GIN[26] , POUT[10] , GOUT[26] );
+   BLOCK2 U327 (PIN[11] , PIN[19] , GIN[19] , GIN[27] , POUT[11] , GOUT[27] );
+   BLOCK2 U328 (PIN[12] , PIN[20] , GIN[20] , GIN[28] , POUT[12] , GOUT[28] );
+   BLOCK2 U329 (PIN[13] , PIN[21] , GIN[21] , GIN[29] , POUT[13] , GOUT[29] );
+   BLOCK2 U330 (PIN[14] , PIN[22] , GIN[22] , GIN[30] , POUT[14] , GOUT[30] );
+   BLOCK2 U331 (PIN[15] , PIN[23] , GIN[23] , GIN[31] , POUT[15] , GOUT[31] );
+   BLOCK2 U332 (PIN[16] , PIN[24] , GIN[24] , GIN[32] , POUT[16] , GOUT[32] );
+   BLOCK2 U333 (PIN[17] , PIN[25] , GIN[25] , GIN[33] , POUT[17] , GOUT[33] );
+   BLOCK2 U334 (PIN[18] , PIN[26] , GIN[26] , GIN[34] , POUT[18] , GOUT[34] );
+   BLOCK2 U335 (PIN[19] , PIN[27] , GIN[27] , GIN[35] , POUT[19] , GOUT[35] );
+   BLOCK2 U336 (PIN[20] , PIN[28] , GIN[28] , GIN[36] , POUT[20] , GOUT[36] );
+   BLOCK2 U337 (PIN[21] , PIN[29] , GIN[29] , GIN[37] , POUT[21] , GOUT[37] );
+   BLOCK2 U338 (PIN[22] , PIN[30] , GIN[30] , GIN[38] , POUT[22] , GOUT[38] );
+   BLOCK2 U339 (PIN[23] , PIN[31] , GIN[31] , GIN[39] , POUT[23] , GOUT[39] );
+   BLOCK2 U340 (PIN[24] , PIN[32] , GIN[32] , GIN[40] , POUT[24] , GOUT[40] );
+   BLOCK2 U341 (PIN[25] , PIN[33] , GIN[33] , GIN[41] , POUT[25] , GOUT[41] );
+   BLOCK2 U342 (PIN[26] , PIN[34] , GIN[34] , GIN[42] , POUT[26] , GOUT[42] );
+   BLOCK2 U343 (PIN[27] , PIN[35] , GIN[35] , GIN[43] , POUT[27] , GOUT[43] );
+   BLOCK2 U344 (PIN[28] , PIN[36] , GIN[36] , GIN[44] , POUT[28] , GOUT[44] );
+   BLOCK2 U345 (PIN[29] , PIN[37] , GIN[37] , GIN[45] , POUT[29] , GOUT[45] );
+   BLOCK2 U346 (PIN[30] , PIN[38] , GIN[38] , GIN[46] , POUT[30] , GOUT[46] );
+   BLOCK2 U347 (PIN[31] , PIN[39] , GIN[39] , GIN[47] , POUT[31] , GOUT[47] );
+   BLOCK2 U348 (PIN[32] , PIN[40] , GIN[40] , GIN[48] , POUT[32] , GOUT[48] );
+   BLOCK2 U349 (PIN[33] , PIN[41] , GIN[41] , GIN[49] , POUT[33] , GOUT[49] );
+   BLOCK2 U350 (PIN[34] , PIN[42] , GIN[42] , GIN[50] , POUT[34] , GOUT[50] );
+   BLOCK2 U351 (PIN[35] , PIN[43] , GIN[43] , GIN[51] , POUT[35] , GOUT[51] );
+   BLOCK2 U352 (PIN[36] , PIN[44] , GIN[44] , GIN[52] , POUT[36] , GOUT[52] );
+   BLOCK2 U353 (PIN[37] , PIN[45] , GIN[45] , GIN[53] , POUT[37] , GOUT[53] );
+   BLOCK2 U354 (PIN[38] , PIN[46] , GIN[46] , GIN[54] , POUT[38] , GOUT[54] );
+   BLOCK2 U355 (PIN[39] , PIN[47] , GIN[47] , GIN[55] , POUT[39] , GOUT[55] );
+   BLOCK2 U356 (PIN[40] , PIN[48] , GIN[48] , GIN[56] , POUT[40] , GOUT[56] );
+   BLOCK2 U357 (PIN[41] , PIN[49] , GIN[49] , GIN[57] , POUT[41] , GOUT[57] );
+   BLOCK2 U358 (PIN[42] , PIN[50] , GIN[50] , GIN[58] , POUT[42] , GOUT[58] );
+   BLOCK2 U359 (PIN[43] , PIN[51] , GIN[51] , GIN[59] , POUT[43] , GOUT[59] );
+   BLOCK2 U360 (PIN[44] , PIN[52] , GIN[52] , GIN[60] , POUT[44] , GOUT[60] );
+   BLOCK2 U361 (PIN[45] , PIN[53] , GIN[53] , GIN[61] , POUT[45] , GOUT[61] );
+   BLOCK2 U362 (PIN[46] , PIN[54] , GIN[54] , GIN[62] , POUT[46] , GOUT[62] );
+   BLOCK2 U363 (PIN[47] , PIN[55] , GIN[55] , GIN[63] , POUT[47] , GOUT[63] );
+   BLOCK2 U364 (PIN[48] , PIN[56] , GIN[56] , GIN[64] , POUT[48] , GOUT[64] );
+   // NOTE(review): BLOCK2 family again, as in DBLC_1_64; polarity presumably alternates level by level -- confirm.
+endmodule // DBLC_3_64
+
+
+module DBLC_4_64 ( PIN, GIN, POUT, GOUT );
+   // Level 4 (index distance 16): GOUT[k] is built from GIN[k], PIN[k-16] and GIN[k-16]; POUT[j] from PIN[j] and PIN[j+16].
+   input  [0:48] PIN;
+   input [0:64]  GIN;
+   // POUT has sixteen entries fewer than PIN (33 vs 49); GOUT keeps all 65 entries.
+   output [0:32] POUT;
+   output [0:64] GOUT;
+   // GOUT[0..15] are GIN inverted; GOUT[16..31] use BLOCK1A (no POUT); GOUT[32..64] use full BLOCK1 cells.
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   INVBLOCK U11 (GIN[1] , GOUT[1] );
+   INVBLOCK U12 (GIN[2] , GOUT[2] );
+   INVBLOCK U13 (GIN[3] , GOUT[3] );
+   INVBLOCK U14 (GIN[4] , GOUT[4] );
+   INVBLOCK U15 (GIN[5] , GOUT[5] );
+   INVBLOCK U16 (GIN[6] , GOUT[6] );
+   INVBLOCK U17 (GIN[7] , GOUT[7] );
+   INVBLOCK U18 (GIN[8] , GOUT[8] );
+   INVBLOCK U19 (GIN[9] , GOUT[9] );
+   INVBLOCK U110 (GIN[10] , GOUT[10] );
+   INVBLOCK U111 (GIN[11] , GOUT[11] );
+   INVBLOCK U112 (GIN[12] , GOUT[12] );
+   INVBLOCK U113 (GIN[13] , GOUT[13] );
+   INVBLOCK U114 (GIN[14] , GOUT[14] );
+   INVBLOCK U115 (GIN[15] , GOUT[15] );
+   BLOCK1A U216 (PIN[0] , GIN[0] , GIN[16] , GOUT[16] );
+   BLOCK1A U217 (PIN[1] , GIN[1] , GIN[17] , GOUT[17] );
+   BLOCK1A U218 (PIN[2] , GIN[2] , GIN[18] , GOUT[18] );
+   BLOCK1A U219 (PIN[3] , GIN[3] , GIN[19] , GOUT[19] );
+   BLOCK1A U220 (PIN[4] , GIN[4] , GIN[20] , GOUT[20] );
+   BLOCK1A U221 (PIN[5] , GIN[5] , GIN[21] , GOUT[21] );
+   BLOCK1A U222 (PIN[6] , GIN[6] , GIN[22] , GOUT[22] );
+   BLOCK1A U223 (PIN[7] , GIN[7] , GIN[23] , GOUT[23] );
+   BLOCK1A U224 (PIN[8] , GIN[8] , GIN[24] , GOUT[24] );
+   BLOCK1A U225 (PIN[9] , GIN[9] , GIN[25] , GOUT[25] );
+   BLOCK1A U226 (PIN[10] , GIN[10] , GIN[26] , GOUT[26] );
+   BLOCK1A U227 (PIN[11] , GIN[11] , GIN[27] , GOUT[27] );
+   BLOCK1A U228 (PIN[12] , GIN[12] , GIN[28] , GOUT[28] );
+   BLOCK1A U229 (PIN[13] , GIN[13] , GIN[29] , GOUT[29] );
+   BLOCK1A U230 (PIN[14] , GIN[14] , GIN[30] , GOUT[30] );
+   BLOCK1A U231 (PIN[15] , GIN[15] , GIN[31] , GOUT[31] );
+   BLOCK1 U332 (PIN[0] , PIN[16] , GIN[16] , GIN[32] , POUT[0] , GOUT[32] );
+   BLOCK1 U333 (PIN[1] , PIN[17] , GIN[17] , GIN[33] , POUT[1] , GOUT[33] );
+   BLOCK1 U334 (PIN[2] , PIN[18] , GIN[18] , GIN[34] , POUT[2] , GOUT[34] );
+   BLOCK1 U335 (PIN[3] , PIN[19] , GIN[19] , GIN[35] , POUT[3] , GOUT[35] );
+   BLOCK1 U336 (PIN[4] , PIN[20] , GIN[20] , GIN[36] , POUT[4] , GOUT[36] );
+   BLOCK1 U337 (PIN[5] , PIN[21] , GIN[21] , GIN[37] , POUT[5] , GOUT[37] );
+   BLOCK1 U338 (PIN[6] , PIN[22] , GIN[22] , GIN[38] , POUT[6] , GOUT[38] );
+   BLOCK1 U339 (PIN[7] , PIN[23] , GIN[23] , GIN[39] , POUT[7] , GOUT[39] );
+   BLOCK1 U340 (PIN[8] , PIN[24] , GIN[24] , GIN[40] , POUT[8] , GOUT[40] );
+   BLOCK1 U341 (PIN[9] , PIN[25] , GIN[25] , GIN[41] , POUT[9] , GOUT[41] );
+   BLOCK1 U342 (PIN[10] , PIN[26] , GIN[26] , GIN[42] , POUT[10] , GOUT[42] );
+   BLOCK1 U343 (PIN[11] , PIN[27] , GIN[27] , GIN[43] , POUT[11] , GOUT[43] );
+   BLOCK1 U344 (PIN[12] , PIN[28] , GIN[28] , GIN[44] , POUT[12] , GOUT[44] );
+   BLOCK1 U345 (PIN[13] , PIN[29] , GIN[29] , GIN[45] , POUT[13] , GOUT[45] );
+   BLOCK1 U346 (PIN[14] , PIN[30] , GIN[30] , GIN[46] , POUT[14] , GOUT[46] );
+   BLOCK1 U347 (PIN[15] , PIN[31] , GIN[31] , GIN[47] , POUT[15] , GOUT[47] );
+   BLOCK1 U348 (PIN[16] , PIN[32] , GIN[32] , GIN[48] , POUT[16] , GOUT[48] );
+   BLOCK1 U349 (PIN[17] , PIN[33] , GIN[33] , GIN[49] , POUT[17] , GOUT[49] );
+   BLOCK1 U350 (PIN[18] , PIN[34] , GIN[34] , GIN[50] , POUT[18] , GOUT[50] );
+   BLOCK1 U351 (PIN[19] , PIN[35] , GIN[35] , GIN[51] , POUT[19] , GOUT[51] );
+   BLOCK1 U352 (PIN[20] , PIN[36] , GIN[36] , GIN[52] , POUT[20] , GOUT[52] );
+   BLOCK1 U353 (PIN[21] , PIN[37] , GIN[37] , GIN[53] , POUT[21] , GOUT[53] );
+   BLOCK1 U354 (PIN[22] , PIN[38] , GIN[38] , GIN[54] , POUT[22] , GOUT[54] );
+   BLOCK1 U355 (PIN[23] , PIN[39] , GIN[39] , GIN[55] , POUT[23] , GOUT[55] );
+   BLOCK1 U356 (PIN[24] , PIN[40] , GIN[40] , GIN[56] , POUT[24] , GOUT[56] );
+   BLOCK1 U357 (PIN[25] , PIN[41] , GIN[41] , GIN[57] , POUT[25] , GOUT[57] );
+   BLOCK1 U358 (PIN[26] , PIN[42] , GIN[42] , GIN[58] , POUT[26] , GOUT[58] );
+   BLOCK1 U359 (PIN[27] , PIN[43] , GIN[43] , GIN[59] , POUT[27] , GOUT[59] );
+   BLOCK1 U360 (PIN[28] , PIN[44] , GIN[44] , GIN[60] , POUT[28] , GOUT[60] );
+   BLOCK1 U361 (PIN[29] , PIN[45] , GIN[45] , GIN[61] , POUT[29] , GOUT[61] );
+   BLOCK1 U362 (PIN[30] , PIN[46] , GIN[46] , GIN[62] , POUT[30] , GOUT[62] );
+   BLOCK1 U363 (PIN[31] , PIN[47] , GIN[47] , GIN[63] , POUT[31] , GOUT[63] );
+   BLOCK1 U364 (PIN[32] , PIN[48] , GIN[48] , GIN[64] , POUT[32] , GOUT[64] );
+   // NOTE(review): BLOCK1 family, as in DBLC_0_64 and DBLC_2_64; polarity presumably alternates level by level -- confirm.
+endmodule // DBLC_4_64
+
+
+module DBLC_5_64 ( PIN, GIN, POUT, GOUT );
+   // Level 5 (index distance 32): GOUT[k] is built from GIN[k], PIN[k-32] and GIN[k-32]; the only POUT is from PIN[0] and PIN[32].
+   input  [0:32] PIN;
+   input [0:64]  GIN;
+   // POUT shrinks to a single entry; GOUT keeps all 65 entries.
+   output [0:0]  POUT;
+   output [0:64] GOUT;
+   // GOUT[0..31] are GIN inverted; GOUT[32..63] use BLOCK2A (no POUT); only GOUT[64] uses a full BLOCK2 cell.
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   INVBLOCK U11 (GIN[1] , GOUT[1] );
+   INVBLOCK U12 (GIN[2] , GOUT[2] );
+   INVBLOCK U13 (GIN[3] , GOUT[3] );
+   INVBLOCK U14 (GIN[4] , GOUT[4] );
+   INVBLOCK U15 (GIN[5] , GOUT[5] );
+   INVBLOCK U16 (GIN[6] , GOUT[6] );
+   INVBLOCK U17 (GIN[7] , GOUT[7] );
+   INVBLOCK U18 (GIN[8] , GOUT[8] );
+   INVBLOCK U19 (GIN[9] , GOUT[9] );
+   INVBLOCK U110 (GIN[10] , GOUT[10] );
+   INVBLOCK U111 (GIN[11] , GOUT[11] );
+   INVBLOCK U112 (GIN[12] , GOUT[12] );
+   INVBLOCK U113 (GIN[13] , GOUT[13] );
+   INVBLOCK U114 (GIN[14] , GOUT[14] );
+   INVBLOCK U115 (GIN[15] , GOUT[15] );
+   INVBLOCK U116 (GIN[16] , GOUT[16] );
+   INVBLOCK U117 (GIN[17] , GOUT[17] );
+   INVBLOCK U118 (GIN[18] , GOUT[18] );
+   INVBLOCK U119 (GIN[19] , GOUT[19] );
+   INVBLOCK U120 (GIN[20] , GOUT[20] );
+   INVBLOCK U121 (GIN[21] , GOUT[21] );
+   INVBLOCK U122 (GIN[22] , GOUT[22] );
+   INVBLOCK U123 (GIN[23] , GOUT[23] );
+   INVBLOCK U124 (GIN[24] , GOUT[24] );
+   INVBLOCK U125 (GIN[25] , GOUT[25] );
+   INVBLOCK U126 (GIN[26] , GOUT[26] );
+   INVBLOCK U127 (GIN[27] , GOUT[27] );
+   INVBLOCK U128 (GIN[28] , GOUT[28] );
+   INVBLOCK U129 (GIN[29] , GOUT[29] );
+   INVBLOCK U130 (GIN[30] , GOUT[30] );
+   INVBLOCK U131 (GIN[31] , GOUT[31] );
+   BLOCK2A U232 (PIN[0] , GIN[0] , GIN[32] , GOUT[32] );
+   BLOCK2A U233 (PIN[1] , GIN[1] , GIN[33] , GOUT[33] );
+   BLOCK2A U234 (PIN[2] , GIN[2] , GIN[34] , GOUT[34] );
+   BLOCK2A U235 (PIN[3] , GIN[3] , GIN[35] , GOUT[35] );
+   BLOCK2A U236 (PIN[4] , GIN[4] , GIN[36] , GOUT[36] );
+   BLOCK2A U237 (PIN[5] , GIN[5] , GIN[37] , GOUT[37] );
+   BLOCK2A U238 (PIN[6] , GIN[6] , GIN[38] , GOUT[38] );
+   BLOCK2A U239 (PIN[7] , GIN[7] , GIN[39] , GOUT[39] );
+   BLOCK2A U240 (PIN[8] , GIN[8] , GIN[40] , GOUT[40] );
+   BLOCK2A U241 (PIN[9] , GIN[9] , GIN[41] , GOUT[41] );
+   BLOCK2A U242 (PIN[10] , GIN[10] , GIN[42] , GOUT[42] );
+   BLOCK2A U243 (PIN[11] , GIN[11] , GIN[43] , GOUT[43] );
+   BLOCK2A U244 (PIN[12] , GIN[12] , GIN[44] , GOUT[44] );
+   BLOCK2A U245 (PIN[13] , GIN[13] , GIN[45] , GOUT[45] );
+   BLOCK2A U246 (PIN[14] , GIN[14] , GIN[46] , GOUT[46] );
+   BLOCK2A U247 (PIN[15] , GIN[15] , GIN[47] , GOUT[47] );
+   BLOCK2A U248 (PIN[16] , GIN[16] , GIN[48] , GOUT[48] );
+   BLOCK2A U249 (PIN[17] , GIN[17] , GIN[49] , GOUT[49] );
+   BLOCK2A U250 (PIN[18] , GIN[18] , GIN[50] , GOUT[50] );
+   BLOCK2A U251 (PIN[19] , GIN[19] , GIN[51] , GOUT[51] );
+   BLOCK2A U252 (PIN[20] , GIN[20] , GIN[52] , GOUT[52] );
+   BLOCK2A U253 (PIN[21] , GIN[21] , GIN[53] , GOUT[53] );
+   BLOCK2A U254 (PIN[22] , GIN[22] , GIN[54] , GOUT[54] );
+   BLOCK2A U255 (PIN[23] , GIN[23] , GIN[55] , GOUT[55] );
+   BLOCK2A U256 (PIN[24] , GIN[24] , GIN[56] , GOUT[56] );
+   BLOCK2A U257 (PIN[25] , GIN[25] , GIN[57] , GOUT[57] );
+   BLOCK2A U258 (PIN[26] , GIN[26] , GIN[58] , GOUT[58] );
+   BLOCK2A U259 (PIN[27] , GIN[27] , GIN[59] , GOUT[59] );
+   BLOCK2A U260 (PIN[28] , GIN[28] , GIN[60] , GOUT[60] );
+   BLOCK2A U261 (PIN[29] , GIN[29] , GIN[61] , GOUT[61] );
+   BLOCK2A U262 (PIN[30] , GIN[30] , GIN[62] , GOUT[62] );
+   BLOCK2A U263 (PIN[31] , GIN[31] , GIN[63] , GOUT[63] );
+   BLOCK2 U364 (PIN[0] , PIN[32] , GIN[32] , GIN[64] , POUT[0] , GOUT[64] );
+   // NOTE(review): this appears to be the last tree level (distance 32 covers indices up to 64) -- confirm against the instantiating module.
+endmodule // DBLC_5_64
+
+
+module XORSTAGE_64 ( A, B, PBIT, CARRY, SUM, COUT );
+   // Final stage of the 64-bit adder: one XXOR1 cell per sum bit plus one BLOCK1A cell for the carry out.
+   input  [0:63] A;        // first operand
+   input [0:63]  B;        // second operand
+   input 	 PBIT;     // propagate term from the DBLC tree (POUT[0] of DBLCTREE_64)
+   input [0:64]  CARRY;    // generate outputs of the DBLC tree, used as per-bit carries
+   
+   output [0:63] SUM;      // sum bits
+   output 	 COUT;     // carry out
+   
+   XXOR1 U20 (A[0] , B[0] , CARRY[0] , SUM[0] );   // SUM[i] is formed from A[i], B[i] and CARRY[i] (XXOR1 defined elsewhere)
+   XXOR1 U21 (A[1] , B[1] , CARRY[1] , SUM[1] );
+   XXOR1 U22 (A[2] , B[2] , CARRY[2] , SUM[2] );
+   XXOR1 U23 (A[3] , B[3] , CARRY[3] , SUM[3] );
+   XXOR1 U24 (A[4] , B[4] , CARRY[4] , SUM[4] );
+   XXOR1 U25 (A[5] , B[5] , CARRY[5] , SUM[5] );
+   XXOR1 U26 (A[6] , B[6] , CARRY[6] , SUM[6] );
+   XXOR1 U27 (A[7] , B[7] , CARRY[7] , SUM[7] );
+   XXOR1 U28 (A[8] , B[8] , CARRY[8] , SUM[8] );
+   XXOR1 U29 (A[9] , B[9] , CARRY[9] , SUM[9] );
+   XXOR1 U210 (A[10] , B[10] , CARRY[10] , SUM[10] );
+   XXOR1 U211 (A[11] , B[11] , CARRY[11] , SUM[11] );
+   XXOR1 U212 (A[12] , B[12] , CARRY[12] , SUM[12] );
+   XXOR1 U213 (A[13] , B[13] , CARRY[13] , SUM[13] );
+   XXOR1 U214 (A[14] , B[14] , CARRY[14] , SUM[14] );
+   XXOR1 U215 (A[15] , B[15] , CARRY[15] , SUM[15] );
+   XXOR1 U216 (A[16] , B[16] , CARRY[16] , SUM[16] );
+   XXOR1 U217 (A[17] , B[17] , CARRY[17] , SUM[17] );
+   XXOR1 U218 (A[18] , B[18] , CARRY[18] , SUM[18] );
+   XXOR1 U219 (A[19] , B[19] , CARRY[19] , SUM[19] );
+   XXOR1 U220 (A[20] , B[20] , CARRY[20] , SUM[20] );
+   XXOR1 U221 (A[21] , B[21] , CARRY[21] , SUM[21] );
+   XXOR1 U222 (A[22] , B[22] , CARRY[22] , SUM[22] );
+   XXOR1 U223 (A[23] , B[23] , CARRY[23] , SUM[23] );
+   XXOR1 U224 (A[24] , B[24] , CARRY[24] , SUM[24] );
+   XXOR1 U225 (A[25] , B[25] , CARRY[25] , SUM[25] );
+   XXOR1 U226 (A[26] , B[26] , CARRY[26] , SUM[26] );
+   XXOR1 U227 (A[27] , B[27] , CARRY[27] , SUM[27] );
+   XXOR1 U228 (A[28] , B[28] , CARRY[28] , SUM[28] );
+   XXOR1 U229 (A[29] , B[29] , CARRY[29] , SUM[29] );
+   XXOR1 U230 (A[30] , B[30] , CARRY[30] , SUM[30] );
+   XXOR1 U231 (A[31] , B[31] , CARRY[31] , SUM[31] );
+   XXOR1 U232 (A[32] , B[32] , CARRY[32] , SUM[32] );
+   XXOR1 U233 (A[33] , B[33] , CARRY[33] , SUM[33] );
+   XXOR1 U234 (A[34] , B[34] , CARRY[34] , SUM[34] );
+   XXOR1 U235 (A[35] , B[35] , CARRY[35] , SUM[35] );
+   XXOR1 U236 (A[36] , B[36] , CARRY[36] , SUM[36] );
+   XXOR1 U237 (A[37] , B[37] , CARRY[37] , SUM[37] );
+   XXOR1 U238 (A[38] , B[38] , CARRY[38] , SUM[38] );
+   XXOR1 U239 (A[39] , B[39] , CARRY[39] , SUM[39] );
+   XXOR1 U240 (A[40] , B[40] , CARRY[40] , SUM[40] );
+   XXOR1 U241 (A[41] , B[41] , CARRY[41] , SUM[41] );
+   XXOR1 U242 (A[42] , B[42] , CARRY[42] , SUM[42] );
+   XXOR1 U243 (A[43] , B[43] , CARRY[43] , SUM[43] );
+   XXOR1 U244 (A[44] , B[44] , CARRY[44] , SUM[44] );
+   XXOR1 U245 (A[45] , B[45] , CARRY[45] , SUM[45] );
+   XXOR1 U246 (A[46] , B[46] , CARRY[46] , SUM[46] );
+   XXOR1 U247 (A[47] , B[47] , CARRY[47] , SUM[47] );
+   XXOR1 U248 (A[48] , B[48] , CARRY[48] , SUM[48] );
+   XXOR1 U249 (A[49] , B[49] , CARRY[49] , SUM[49] );
+   XXOR1 U250 (A[50] , B[50] , CARRY[50] , SUM[50] );
+   XXOR1 U251 (A[51] , B[51] , CARRY[51] , SUM[51] );
+   XXOR1 U252 (A[52] , B[52] , CARRY[52] , SUM[52] );
+   XXOR1 U253 (A[53] , B[53] , CARRY[53] , SUM[53] );
+   XXOR1 U254 (A[54] , B[54] , CARRY[54] , SUM[54] );
+   XXOR1 U255 (A[55] , B[55] , CARRY[55] , SUM[55] );
+   XXOR1 U256 (A[56] , B[56] , CARRY[56] , SUM[56] );
+   XXOR1 U257 (A[57] , B[57] , CARRY[57] , SUM[57] );
+   XXOR1 U258 (A[58] , B[58] , CARRY[58] , SUM[58] );
+   XXOR1 U259 (A[59] , B[59] , CARRY[59] , SUM[59] );
+   XXOR1 U260 (A[60] , B[60] , CARRY[60] , SUM[60] );
+   XXOR1 U261 (A[61] , B[61] , CARRY[61] , SUM[61] );
+   XXOR1 U262 (A[62] , B[62] , CARRY[62] , SUM[62] );
+   XXOR1 U263 (A[63] , B[63] , CARRY[63] , SUM[63] );
+   BLOCK1A U1 (PBIT , CARRY[0] , CARRY[64] , COUT );   // COUT is formed from PBIT, CARRY[0] and CARRY[64]
+   
+endmodule // XORSTAGE_64
+
+
+module DBLCTREE_64 ( PIN, GIN, GOUT, POUT );
+   // Six cascaded DBLC stages (DBLC_0_64 .. DBLC_5_64) wired output-to-input.
+   input  [0:63] PIN;            // propagate terms entering the tree
+   input  [0:64] GIN;            // generate terms entering the tree
+   
+   output [0:64] GOUT;           // generate terms after the last stage
+   output [0:0]  POUT;           // propagate term after the last stage
+   
+   // Generate buses between stages: always 65 bits wide.
+   wire [0:64]   gen_s0, gen_s1, gen_s2, gen_s3, gen_s4;
+   
+   // Propagate buses between stages: the width drops by 1, 2, 4, 8 and 16
+   // bits, and the last stage drops a further 32 to leave the 1-bit POUT.
+   wire [0:62]   prop_s0;
+   wire [0:60]   prop_s1;
+   wire [0:56]   prop_s2;
+   wire [0:48]   prop_s3;
+   wire [0:32]   prop_s4;
+   
+   DBLC_0_64 U_0 (.GIN(GIN) ,    .PIN(PIN) ,     .GOUT(gen_s0) , .POUT(prop_s0) );
+   DBLC_1_64 U_1 (.GIN(gen_s0) , .PIN(prop_s0) , .GOUT(gen_s1) , .POUT(prop_s1) );
+   DBLC_2_64 U_2 (.GIN(gen_s1) , .PIN(prop_s1) , .GOUT(gen_s2) , .POUT(prop_s2) );
+   DBLC_3_64 U_3 (.GIN(gen_s2) , .PIN(prop_s2) , .GOUT(gen_s3) , .POUT(prop_s3) );
+   DBLC_4_64 U_4 (.GIN(gen_s3) , .PIN(prop_s3) , .GOUT(gen_s4) , .POUT(prop_s4) );
+   DBLC_5_64 U_5 (.GIN(gen_s4) , .PIN(prop_s4) , .GOUT(GOUT) ,   .POUT(POUT) );
+   
+endmodule // DBLCTREE_64
+
+
+module DBLCADDER_64_64 ( OPA, OPB, CIN, SUM, COUT );
+   // 64-bit adder assembled from three stages: PRESTAGE_64 -> DBLCTREE_64 -> XORSTAGE_64.
+   input  [0:63] OPA;            // first operand
+   input  [0:63] OPB;            // second operand
+   input         CIN;            // carry in
+   
+   output [0:63] SUM;            // sum bits
+   output        COUT;           // carry out
+   
+   wire [0:63]   prop_terms;     // propagate terms produced by the prestage
+   wire [0:64]   gen_terms;      // generate terms produced by the prestage
+   wire [0:64]   carries;        // tree generate outputs, consumed as carries
+   wire [0:0]    tree_prop;      // propagate term left over by the tree
+   
+   PRESTAGE_64 U1 (OPA , OPB , CIN , prop_terms , gen_terms );
+   DBLCTREE_64 U2 (.PIN(prop_terms) , .GIN(gen_terms) , .GOUT(carries) , .POUT(tree_prop) );
+   XORSTAGE_64 U3 (.A(OPA) , .B(OPB) , .PBIT(tree_prop[0]) , .CARRY(carries) , .SUM(SUM) , .COUT(COUT) );
+   
+endmodule 
diff --git a/wally-pipelined/src/fpu/fpadd/cla52.v b/wally-pipelined/src/fpu/fpadd/cla52.v
new file mode 100755
index 00000000..00fca299
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/cla52.v
@@ -0,0 +1,202 @@
+// This module implements a 52-bit carry lookahead adder. It is used
+// for rounding in the floating point adder. Built on DBLCADDER_64_64 with bits 52-63 tied low.
+
+module cla52 (S, CO, X, Y);
+   
+   input  [51:0] X;        // first operand
+   input [51:0]  Y;        // second operand
+   
+   output [51:0] S;        // 52-bit sum
+   output 	 CO;       // carry out of the 52-bit addition
+   
+   wire [0:63] 	 A,B,Q;    // 64-bit adder operands and result; index i corresponds to X[i]/Y[i]/S[i]
+   wire 	 LOGIC0;   // constant 0 used to pad the unused upper bits
+   wire 	 CIN;      // carry in, tied to 0
+   wire 	 CO_64;    // carry out of the 64-bit adder (not used)
+   
+   assign LOGIC0 = 0;
+   assign CIN = 0;
+   DBLCADDER_64_64 U1 (A , B , CIN, Q , CO_64);
+   assign A[0] = X[0];
+   assign B[0] = Y[0];
+   assign A[1] = X[1];
+   assign B[1] = Y[1];
+   assign A[2] = X[2];
+   assign B[2] = Y[2];
+   assign A[3] = X[3];
+   assign B[3] = Y[3];
+   assign A[4] = X[4];
+   assign B[4] = Y[4];
+   assign A[5] = X[5];
+   assign B[5] = Y[5];
+   assign A[6] = X[6];
+   assign B[6] = Y[6];
+   assign A[7] = X[7];
+   assign B[7] = Y[7];
+   assign A[8] = X[8];
+   assign B[8] = Y[8];
+   assign A[9] = X[9];
+   assign B[9] = Y[9];
+   assign A[10] = X[10];
+   assign B[10] = Y[10];
+   assign A[11] = X[11];
+   assign B[11] = Y[11];
+   assign A[12] = X[12];
+   assign B[12] = Y[12];
+   assign A[13] = X[13];
+   assign B[13] = Y[13];
+   assign A[14] = X[14];
+   assign B[14] = Y[14];
+   assign A[15] = X[15];
+   assign B[15] = Y[15];
+   assign A[16] = X[16];
+   assign B[16] = Y[16];
+   assign A[17] = X[17];
+   assign B[17] = Y[17];
+   assign A[18] = X[18];
+   assign B[18] = Y[18];
+   assign A[19] = X[19];
+   assign B[19] = Y[19];
+   assign A[20] = X[20];
+   assign B[20] = Y[20];
+   assign A[21] = X[21];
+   assign B[21] = Y[21];
+   assign A[22] = X[22];
+   assign B[22] = Y[22];
+   assign A[23] = X[23];
+   assign B[23] = Y[23];
+   assign A[24] = X[24];
+   assign B[24] = Y[24];
+   assign A[25] = X[25];
+   assign B[25] = Y[25];
+   assign A[26] = X[26];
+   assign B[26] = Y[26];
+   assign A[27] = X[27];
+   assign B[27] = Y[27];
+   assign A[28] = X[28];
+   assign B[28] = Y[28];
+   assign A[29] = X[29];
+   assign B[29] = Y[29];
+   assign A[30] = X[30];
+   assign B[30] = Y[30];
+   assign A[31] = X[31];
+   assign B[31] = Y[31];
+   assign A[32] = X[32];
+   assign B[32] = Y[32];
+   assign A[33] = X[33];
+   assign B[33] = Y[33];
+   assign A[34] = X[34];
+   assign B[34] = Y[34];
+   assign A[35] = X[35];
+   assign B[35] = Y[35];
+   assign A[36] = X[36];
+   assign B[36] = Y[36];
+   assign A[37] = X[37];
+   assign B[37] = Y[37];
+   assign A[38] = X[38];
+   assign B[38] = Y[38];
+   assign A[39] = X[39];
+   assign B[39] = Y[39];
+   assign A[40] = X[40];
+   assign B[40] = Y[40];
+   assign A[41] = X[41];
+   assign B[41] = Y[41];
+   assign A[42] = X[42];
+   assign B[42] = Y[42];
+   assign A[43] = X[43];
+   assign B[43] = Y[43];
+   assign A[44] = X[44];
+   assign B[44] = Y[44];
+   assign A[45] = X[45];
+   assign B[45] = Y[45];
+   assign A[46] = X[46];
+   assign B[46] = Y[46];
+   assign A[47] = X[47];
+   assign B[47] = Y[47];
+   assign A[48] = X[48];
+   assign B[48] = Y[48];
+   assign A[49] = X[49];
+   assign B[49] = Y[49];
+   assign A[50] = X[50];
+   assign B[50] = Y[50];
+   assign A[51] = X[51];
+   assign B[51] = Y[51];
+   assign A[52] = LOGIC0;   // bits 52-63 of both operands are padded with zeros
+   assign B[52] = LOGIC0;
+   assign A[53] = LOGIC0;
+   assign B[53] = LOGIC0;
+   assign A[54] = LOGIC0;
+   assign B[54] = LOGIC0;
+   assign A[55] = LOGIC0;
+   assign B[55] = LOGIC0;
+   assign A[56] = LOGIC0;
+   assign B[56] = LOGIC0;
+   assign A[57] = LOGIC0;
+   assign B[57] = LOGIC0;
+   assign A[58] = LOGIC0;
+   assign B[58] = LOGIC0;
+   assign A[59] = LOGIC0;
+   assign B[59] = LOGIC0;
+   assign A[60] = LOGIC0;
+   assign B[60] = LOGIC0;
+   assign A[61] = LOGIC0;
+   assign B[61] = LOGIC0;
+   assign A[62] = LOGIC0;
+   assign B[62] = LOGIC0;
+   assign A[63] = LOGIC0;
+   assign B[63] = LOGIC0;
+   assign S[0] = Q[0];
+   assign S[1] = Q[1];
+   assign S[2] = Q[2];
+   assign S[3] = Q[3];
+   assign S[4] = Q[4];
+   assign S[5] = Q[5];
+   assign S[6] = Q[6];
+   assign S[7] = Q[7];
+   assign S[8] = Q[8];
+   assign S[9] = Q[9];
+   assign S[10] = Q[10];
+   assign S[11] = Q[11];
+   assign S[12] = Q[12];
+   assign S[13] = Q[13];
+   assign S[14] = Q[14];
+   assign S[15] = Q[15];
+   assign S[16] = Q[16];
+   assign S[17] = Q[17];
+   assign S[18] = Q[18];
+   assign S[19] = Q[19];
+   assign S[20] = Q[20];
+   assign S[21] = Q[21];
+   assign S[22] = Q[22];
+   assign S[23] = Q[23];
+   assign S[24] = Q[24];
+   assign S[25] = Q[25];
+   assign S[26] = Q[26];
+   assign S[27] = Q[27];
+   assign S[28] = Q[28];
+   assign S[29] = Q[29];
+   assign S[30] = Q[30];
+   assign S[31] = Q[31];
+   assign S[32] = Q[32];
+   assign S[33] = Q[33];
+   assign S[34] = Q[34];
+   assign S[35] = Q[35];
+   assign S[36] = Q[36];
+   assign S[37] = Q[37];
+   assign S[38] = Q[38];
+   assign S[39] = Q[39];
+   assign S[40] = Q[40];
+   assign S[41] = Q[41];
+   assign S[42] = Q[42];
+   assign S[43] = Q[43];
+   assign S[44] = Q[44];
+   assign S[45] = Q[45];
+   assign S[46] = Q[46];
+   assign S[47] = Q[47];
+   assign S[48] = Q[48];
+   assign S[49] = Q[49];
+   assign S[50] = Q[50];
+   assign S[51] = Q[51];
+   assign CO    = Q[52];   // A[52] and B[52] are 0, so sum bit 52 is presumably just the carry out of bit 51
+   
+endmodule //cla52
diff --git a/wally-pipelined/src/fpu/fpadd/cla64.v b/wally-pipelined/src/fpu/fpadd/cla64.v
new file mode 100755
index 00000000..a0809e9d
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/cla64.v
@@ -0,0 +1,420 @@
+// This module implements a 64-bit carry lookahead adder/subtractor. 
+// It is used to perform the primary addition in the floating point
+// adder
+
+module cla64 (S, X, Y, Sub);
+   
+   input  [63:0] X;      // first operand
+   input [63:0]  Y;      // second operand
+   input 	 Sub;    // 1 selects subtraction: Y is inverted and Sub is fed to the adder's carry in
+   output [63:0] S;      // result
+   wire 	 CO;     // carry out of the adder (not used)
+   wire [0:63] 	 A,B,Q, Bbar;   // index i corresponds to X[i]/Y[i]/S[i]; Bbar is B conditionally inverted
+   
+   DBLCADDER_64_64 U1 (A , Bbar , Sub , Q , CO );
+   assign A[0] = X[0];
+   assign B[0] = Y[0];
+   assign A[1] = X[1];
+   assign B[1] = Y[1];
+   assign A[2] = X[2];
+   assign B[2] = Y[2];
+   assign A[3] = X[3];
+   assign B[3] = Y[3];
+   assign A[4] = X[4];
+   assign B[4] = Y[4];
+   assign A[5] = X[5];
+   assign B[5] = Y[5];
+   assign A[6] = X[6];
+   assign B[6] = Y[6];
+   assign A[7] = X[7];
+   assign B[7] = Y[7];
+   assign A[8] = X[8];
+   assign B[8] = Y[8];
+   assign A[9] = X[9];
+   assign B[9] = Y[9];
+   assign A[10] = X[10];
+   assign B[10] = Y[10];
+   assign A[11] = X[11];
+   assign B[11] = Y[11];
+   assign A[12] = X[12];
+   assign B[12] = Y[12];
+   assign A[13] = X[13];
+   assign B[13] = Y[13];
+   assign A[14] = X[14];
+   assign B[14] = Y[14];
+   assign A[15] = X[15];
+   assign B[15] = Y[15];
+   assign A[16] = X[16];
+   assign B[16] = Y[16];
+   assign A[17] = X[17];
+   assign B[17] = Y[17];
+   assign A[18] = X[18];
+   assign B[18] = Y[18];
+   assign A[19] = X[19];
+   assign B[19] = Y[19];
+   assign A[20] = X[20];
+   assign B[20] = Y[20];
+   assign A[21] = X[21];
+   assign B[21] = Y[21];
+   assign A[22] = X[22];
+   assign B[22] = Y[22];
+   assign A[23] = X[23];
+   assign B[23] = Y[23];
+   assign A[24] = X[24];
+   assign B[24] = Y[24];
+   assign A[25] = X[25];
+   assign B[25] = Y[25];
+   assign A[26] = X[26];
+   assign B[26] = Y[26];
+   assign A[27] = X[27];
+   assign B[27] = Y[27];
+   assign A[28] = X[28];
+   assign B[28] = Y[28];
+   assign A[29] = X[29];
+   assign B[29] = Y[29];
+   assign A[30] = X[30];
+   assign B[30] = Y[30];
+   assign A[31] = X[31];
+   assign B[31] = Y[31];
+   assign A[32] = X[32];
+   assign B[32] = Y[32];
+   assign A[33] = X[33];
+   assign B[33] = Y[33];
+   assign A[34] = X[34];
+   assign B[34] = Y[34];
+   assign A[35] = X[35];
+   assign B[35] = Y[35];
+   assign A[36] = X[36];
+   assign B[36] = Y[36];
+   assign A[37] = X[37];
+   assign B[37] = Y[37];
+   assign A[38] = X[38];
+   assign B[38] = Y[38];
+   assign A[39] = X[39];
+   assign B[39] = Y[39];
+   assign A[40] = X[40];
+   assign B[40] = Y[40];
+   assign A[41] = X[41];
+   assign B[41] = Y[41];
+   assign A[42] = X[42];
+   assign B[42] = Y[42];
+   assign A[43] = X[43];
+   assign B[43] = Y[43];
+   assign A[44] = X[44];
+   assign B[44] = Y[44];
+   assign A[45] = X[45];
+   assign B[45] = Y[45];
+   assign A[46] = X[46];
+   assign B[46] = Y[46];
+   assign A[47] = X[47];
+   assign B[47] = Y[47];
+   assign A[48] = X[48];
+   assign B[48] = Y[48];
+   assign A[49] = X[49];
+   assign B[49] = Y[49];
+   assign A[50] = X[50];
+   assign B[50] = Y[50];
+   assign A[51] = X[51];
+   assign B[51] = Y[51];
+   assign A[52] = X[52];
+   assign B[52] = Y[52];
+   assign A[53] = X[53];
+   assign B[53] = Y[53];
+   assign A[54] = X[54];
+   assign B[54] = Y[54];
+   assign A[55] = X[55];
+   assign B[55] = Y[55];
+   assign A[56] = X[56];
+   assign B[56] = Y[56];
+   assign A[57] = X[57];
+   assign B[57] = Y[57];
+   assign A[58] = X[58];
+   assign B[58] = Y[58];
+   assign A[59] = X[59];
+   assign B[59] = Y[59];
+   assign A[60] = X[60];
+   assign B[60] = Y[60];
+   assign A[61] = X[61];
+   assign B[61] = Y[61];
+   assign A[62] = X[62];
+   assign B[62] = Y[62];
+   assign A[63] = X[63];
+   assign B[63] = Y[63];
+   assign S[0] = Q[0];
+   assign S[1] = Q[1];
+   assign S[2] = Q[2];
+   assign S[3] = Q[3];
+   assign S[4] = Q[4];
+   assign S[5] = Q[5];
+   assign S[6] = Q[6];
+   assign S[7] = Q[7];
+   assign S[8] = Q[8];
+   assign S[9] = Q[9];
+   assign S[10] = Q[10];
+   assign S[11] = Q[11];
+   assign S[12] = Q[12];
+   assign S[13] = Q[13];
+   assign S[14] = Q[14];
+   assign S[15] = Q[15];
+   assign S[16] = Q[16];
+   assign S[17] = Q[17];
+   assign S[18] = Q[18];
+   assign S[19] = Q[19];
+   assign S[20] = Q[20];
+   assign S[21] = Q[21];
+   assign S[22] = Q[22];
+   assign S[23] = Q[23];
+   assign S[24] = Q[24];
+   assign S[25] = Q[25];
+   assign S[26] = Q[26];
+   assign S[27] = Q[27];
+   assign S[28] = Q[28];
+   assign S[29] = Q[29];
+   assign S[30] = Q[30];
+   assign S[31] = Q[31];
+   assign S[32] = Q[32];
+   assign S[33] = Q[33];
+   assign S[34] = Q[34];
+   assign S[35] = Q[35];
+   assign S[36] = Q[36];
+   assign S[37] = Q[37];
+   assign S[38] = Q[38];
+   assign S[39] = Q[39];
+   assign S[40] = Q[40];
+   assign S[41] = Q[41];
+   assign S[42] = Q[42];
+   assign S[43] = Q[43];
+   assign S[44] = Q[44];
+   assign S[45] = Q[45];
+   assign S[46] = Q[46];
+   assign S[47] = Q[47];
+   assign S[48] = Q[48];
+   assign S[49] = Q[49];
+   assign S[50] = Q[50];
+   assign S[51] = Q[51];
+   assign S[52] = Q[52];
+   assign S[53] = Q[53];
+   assign S[54] = Q[54];
+   assign S[55] = Q[55];
+   assign S[56] = Q[56];
+   assign S[57] = Q[57];
+   assign S[58] = Q[58];
+   assign S[59] = Q[59];
+   assign S[60] = Q[60];
+   assign S[61] = Q[61];
+   assign S[62] = Q[62];
+   assign S[63] = Q[63];
+   assign Bbar = B ^ {64{Sub}};   // every bit of B is XORed with Sub: unchanged for add, inverted for subtract
+   
+endmodule // cla64
+
+// This module performs 64-bit subtraction. It is used to get the two's complement
+// of main addition or subtraction in the floating point adder. 
+
+module cla_sub64 (S, X, Y);
+   
+   input  [63:0] X;      // minuend
+   input [63:0]  Y;      // subtrahend
+   
+   output [63:0] S;      // result of adding X, the inverted Y and a carry in of 1
+   
+   wire 	 CO;     // carry out of the adder (not used)
+   wire 	 VDD = 1'b1;   // constant 1 driven into the adder's carry in
+   wire [0:63] 	 A,B,Q, Bbar;   // index i corresponds to X[i]/Y[i]/S[i]; Bbar is the inverted B
+   
+   DBLCADDER_64_64 U1 (A , Bbar , VDD, Q , CO );
+   assign A[0] = X[0];
+   assign B[0] = Y[0];
+   assign A[1] = X[1];
+   assign B[1] = Y[1];
+   assign A[2] = X[2];
+   assign B[2] = Y[2];
+   assign A[3] = X[3];
+   assign B[3] = Y[3];
+   assign A[4] = X[4];
+   assign B[4] = Y[4];
+   assign A[5] = X[5];
+   assign B[5] = Y[5];
+   assign A[6] = X[6];
+   assign B[6] = Y[6];
+   assign A[7] = X[7];
+   assign B[7] = Y[7];
+   assign A[8] = X[8];
+   assign B[8] = Y[8];
+   assign A[9] = X[9];
+   assign B[9] = Y[9];
+   assign A[10] = X[10];
+   assign B[10] = Y[10];
+   assign A[11] = X[11];
+   assign B[11] = Y[11];
+   assign A[12] = X[12];
+   assign B[12] = Y[12];
+   assign A[13] = X[13];
+   assign B[13] = Y[13];
+   assign A[14] = X[14];
+   assign B[14] = Y[14];
+   assign A[15] = X[15];
+   assign B[15] = Y[15];
+   assign A[16] = X[16];
+   assign B[16] = Y[16];
+   assign A[17] = X[17];
+   assign B[17] = Y[17];
+   assign A[18] = X[18];
+   assign B[18] = Y[18];
+   assign A[19] = X[19];
+   assign B[19] = Y[19];
+   assign A[20] = X[20];
+   assign B[20] = Y[20];
+   assign A[21] = X[21];
+   assign B[21] = Y[21];
+   assign A[22] = X[22];
+   assign B[22] = Y[22];
+   assign A[23] = X[23];
+   assign B[23] = Y[23];
+   assign A[24] = X[24];
+   assign B[24] = Y[24];
+   assign A[25] = X[25];
+   assign B[25] = Y[25];
+   assign A[26] = X[26];
+   assign B[26] = Y[26];
+   assign A[27] = X[27];
+   assign B[27] = Y[27];
+   assign A[28] = X[28];
+   assign B[28] = Y[28];
+   assign A[29] = X[29];
+   assign B[29] = Y[29];
+   assign A[30] = X[30];
+   assign B[30] = Y[30];
+   assign A[31] = X[31];
+   assign B[31] = Y[31];
+   assign A[32] = X[32];
+   assign B[32] = Y[32];
+   assign A[33] = X[33];
+   assign B[33] = Y[33];
+   assign A[34] = X[34];
+   assign B[34] = Y[34];
+   assign A[35] = X[35];
+   assign B[35] = Y[35];
+   assign A[36] = X[36];
+   assign B[36] = Y[36];
+   assign A[37] = X[37];
+   assign B[37] = Y[37];
+   assign A[38] = X[38];
+   assign B[38] = Y[38];
+   assign A[39] = X[39];
+   assign B[39] = Y[39];
+   assign A[40] = X[40];
+   assign B[40] = Y[40];
+   assign A[41] = X[41];
+   assign B[41] = Y[41];
+   assign A[42] = X[42];
+   assign B[42] = Y[42];
+   assign A[43] = X[43];
+   assign B[43] = Y[43];
+   assign A[44] = X[44];
+   assign B[44] = Y[44];
+   assign A[45] = X[45];
+   assign B[45] = Y[45];
+   assign A[46] = X[46];
+   assign B[46] = Y[46];
+   assign A[47] = X[47];
+   assign B[47] = Y[47];
+   assign A[48] = X[48];
+   assign B[48] = Y[48];
+   assign A[49] = X[49];
+   assign B[49] = Y[49];
+   assign A[50] = X[50];
+   assign B[50] = Y[50];
+   assign A[51] = X[51];
+   assign B[51] = Y[51];
+   assign A[52] = X[52];
+   assign B[52] = Y[52];
+   assign A[53] = X[53];
+   assign B[53] = Y[53];
+   assign A[54] = X[54];
+   assign B[54] = Y[54];
+   assign A[55] = X[55];
+   assign B[55] = Y[55];
+   assign A[56] = X[56];
+   assign B[56] = Y[56];
+   assign A[57] = X[57];
+   assign B[57] = Y[57];
+   assign A[58] = X[58];
+   assign B[58] = Y[58];
+   assign A[59] = X[59];
+   assign B[59] = Y[59];
+   assign A[60] = X[60];
+   assign B[60] = Y[60];
+   assign A[61] = X[61];
+   assign B[61] = Y[61];
+   assign A[62] = X[62];
+   assign B[62] = Y[62];
+   assign A[63] = X[63];
+   assign B[63] = Y[63];
+   assign S[0] = Q[0];
+   assign S[1] = Q[1];
+   assign S[2] = Q[2];
+   assign S[3] = Q[3];
+   assign S[4] = Q[4];
+   assign S[5] = Q[5];
+   assign S[6] = Q[6];
+   assign S[7] = Q[7];
+   assign S[8] = Q[8];
+   assign S[9] = Q[9];
+   assign S[10] = Q[10];
+   assign S[11] = Q[11];
+   assign S[12] = Q[12];
+   assign S[13] = Q[13];
+   assign S[14] = Q[14];
+   assign S[15] = Q[15];
+   assign S[16] = Q[16];
+   assign S[17] = Q[17];
+   assign S[18] = Q[18];
+   assign S[19] = Q[19];
+   assign S[20] = Q[20];
+   assign S[21] = Q[21];
+   assign S[22] = Q[22];
+   assign S[23] = Q[23];
+   assign S[24] = Q[24];
+   assign S[25] = Q[25];
+   assign S[26] = Q[26];
+   assign S[27] = Q[27];
+   assign S[28] = Q[28];
+   assign S[29] = Q[29];
+   assign S[30] = Q[30];
+   assign S[31] = Q[31];
+   assign S[32] = Q[32];
+   assign S[33] = Q[33];
+   assign S[34] = Q[34];
+   assign S[35] = Q[35];
+   assign S[36] = Q[36];
+   assign S[37] = Q[37];
+   assign S[38] = Q[38];
+   assign S[39] = Q[39];
+   assign S[40] = Q[40];
+   assign S[41] = Q[41];
+   assign S[42] = Q[42];
+   assign S[43] = Q[43];
+   assign S[44] = Q[44];
+   assign S[45] = Q[45];
+   assign S[46] = Q[46];
+   assign S[47] = Q[47];
+   assign S[48] = Q[48];
+   assign S[49] = Q[49];
+   assign S[50] = Q[50];
+   assign S[51] = Q[51];
+   assign S[52] = Q[52];
+   assign S[53] = Q[53];
+   assign S[54] = Q[54];
+   assign S[55] = Q[55];
+   assign S[56] = Q[56];
+   assign S[57] = Q[57];
+   assign S[58] = Q[58];
+   assign S[59] = Q[59];
+   assign S[60] = Q[60];
+   assign S[61] = Q[61];
+   assign S[62] = Q[62];
+   assign S[63] = Q[63];
+   assign Bbar = ~B;   // inverted B plus the carry in of 1 gives X + (~Y) + 1, i.e. X - Y
+   
+endmodule // cla_sub64
\ No newline at end of file
diff --git a/wally-pipelined/src/fpu/fpadd/convert_inputs.v b/wally-pipelined/src/fpu/fpadd/convert_inputs.v
new file mode 100755
index 00000000..7ad93453
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/convert_inputs.v
@@ -0,0 +1,61 @@
+// convert_inputs: operand pre-conditioning for the floating point adder.
+// Takes the raw operands (op1, op2), the operation code (op_type) and the
+// result precision (P). When a conversion is required, both operands are
+// widened from the single precision fields held in bits 63:32 to the double
+// precision layout. The sign of op1 is also adjusted for negation and
+// absolute value. The conditioned operands are Float1 and Float2.
+
+module convert_inputs(Float1, Float2, op1, op2, op_type, P);
+   
+   input [63:0]  op1;            // First operand (A)
+   input [63:0]  op2;            // Second operand (B)
+   input [2:0] 	 op_type;        // Operation code
+   input 	 P;              // Result precision (0 = double, 1 = single)
+
+   output [63:0] Float1;	// op1 after conversion and sign adjustment
+   output [63:0] Float2;	// op2 after conversion
+   
+   wire 	 widen;          // Operands need single -> double widening
+   wire 	 is_neg;         // op_type selects negation
+   wire 	 is_abs;         // op_type selects absolute value
+   wire 	 exp1_zero;	// Exponent field op1[62:55] is all zeros
+   wire 	 exp2_zero;	// Exponent field op2[62:55] is all zeros
+   wire 	 exp1_ones;	// Exponent field op1[62:55] is all ones
+   wire 	 exp2_ones;	// Exponent field op2[62:55] is all ones
+
+   // Widening is needed when exactly one of "op_type is 11X" and
+   // "P is one" holds, hence the exclusive OR.
+   assign widen = P ^ (op_type[1] & op_type[2]);
+
+   // Classify the 8-bit exponent fields with reduction operators. A zero
+   // exponent must stay zero, and an all-ones exponent must stay all ones,
+   // after widening.
+   assign exp1_zero = ~|op1[62:55];
+   assign exp2_zero = ~|op2[62:55];
+   assign exp1_ones =  &op1[62:55];
+   assign exp2_ones =  &op2[62:55];
+
+   // Fill bit replicated three times below the exponent MSB when widening:
+   // set for an all-ones exponent, or when the MSB is clear and the field is nonzero.
+   wire 	 pad1 = exp1_ones | ~(op1[62] | exp1_zero);
+   wire 	 pad2 = exp2_ones | ~(op2[62] | exp2_zero);
+
+   // Bits 62:29 either pass through or take the widened exponent/fraction;
+   // bits 28:0 are forced to zero whenever widening is active.
+   assign Float1[62:29] = widen ? {op1[62], pad1, pad1, pad1, op1[61:32]}
+                                : op1[62:29];
+   assign Float1[28:0]  = widen ? 29'b0 : op1[28:0];
+
+   assign Float2[62:29] = widen ? {op2[62], pad2, pad2, pad2, op2[61:32]}
+                                : op2[62:29];
+   assign Float2[28:0]  = widen ? 29'b0 : op2[28:0];
+
+   // Sign handling: negation (op_type = 101) flips the sign of op1 and
+   // absolute value (op_type = 100) clears it. The sign of op2 is unchanged.
+   assign is_neg = op_type[0] & op_type[2] & ~op_type[1];
+   assign is_abs = op_type[2] & ~(op_type[1] | op_type[0]);
+   assign Float1[63] = is_abs ? 1'b0 : (op1[63] ^ is_neg);
+   assign Float2[63] = op2[63];
+
+endmodule // convert_inputs
+
diff --git a/wally-pipelined/src/fpu/fpadd/exception.v b/wally-pipelined/src/fpu/fpadd/exception.v
new file mode 100755
index 00000000..8f5b1cd4
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/exception.v
@@ -0,0 +1,120 @@
+// Exception logic for the floating point adder: classifies operands A and B and derives
+// Ztype, Invalid, Denorm, ANorm, BNorm and Sub. Note: may later move to where the result is computed.
+
+module exception (Ztype, Invalid, Denorm, ANorm, BNorm, Sub, A, B, op_type);
+
+   input [63:0] A;		// 1st input operand (op1)
+   input [63:0] B;		// 2nd input operand (op2)
+   input [2:0] 	op_type;   	// Function opcode
+   output [3:0] Ztype;		// Indicates type of result (Z)
+   output 	Invalid;	// Invalid operation exception
+   output 	Denorm;		// Denormalized input
+   output       ANorm;          // A is not zero or Denorm
+   output       BNorm;          // B is not zero or Denorm
+   output       Sub;		// The effective operation is subtraction
+   wire		AzeroM;	 	// '1' if the mantissa of A is zero
+   wire		BzeroM;		// '1' if the mantissa of B is zero
+   wire		AzeroE;	 	// '1' if the exponent of A is zero
+   wire		BzeroE;		// '1' if the exponent of B is zero
+   wire		AonesE;	 	// '1' if the exponent of A is all ones
+   wire		BonesE;		// '1' if the exponent of B is all ones
+   wire		ADenorm; 	// '1' if A is a denormalized number
+   wire		BDenorm; 	// '1' if B is a denormalized number
+   wire		AInf;	 	// '1' if A is infinite
+   wire		BInf;	 	// '1' if B is infinite
+   wire		AZero;	 	// '1' if A is 0
+   wire		BZero;	 	// '1' if B is 0
+   wire		ANaN;	 	// '1' if A is a not-a-number
+   wire		BNaN; 		// '1' if B is a not-a-number
+   wire		ASNaN;	 	// '1' if A is a signalling not-a-number
+   wire		BSNaN;	 	// '1' if B is a signalling not-a-number
+   wire		ZQNaN;	 	// '1' if result Z is a quiet NaN
+   wire		ZPInf;	 	// '1' if result Z positive infinity
+   wire		ZNInf;	 	// '1' if result Z negative infinity
+   wire         add_sub;	// '1' if operation is add or subtract
+   wire 	converts;       // '1' for every op_type other than 00X (add/subtract)
+   
+   parameter [51:0]  fifty_two_zeros = 52'h0000000000000; // Use parameter?
+
+
+   // High for every op_type except 00X (add/subtract); this covers the converts
+   assign converts      = ~(~op_type[1] & ~op_type[2]);
+   
+   // Determine if mantissas are all zeros
+   assign AzeroM = (A[51:0] == fifty_two_zeros);
+   assign BzeroM = (B[51:0] == fifty_two_zeros);
+
+   // Determine if exponents are all ones or all zeros 
+   assign AonesE = A[62]&A[61]&A[60]&A[59]&A[58]&A[57]&A[56]&A[55]&A[54]&A[53]&A[52];
+   assign BonesE = B[62]&B[61]&B[60]&B[59]&B[58]&B[57]&B[56]&B[55]&B[54]&B[53]&B[52];
+   assign AzeroE = ~(A[62]|A[61]|A[60]|A[59]|A[58]|A[57]|A[56]|A[55]|A[54]|A[53]|A[52]);
+   assign BzeroE = ~(B[62]|B[61]|B[60]|B[59]|B[58]|B[57]|B[56]|B[55]|B[54]|B[53]|B[52]);
+
+   // Determine special cases. Note: Zero is not really a special case. 
+   assign ADenorm = AzeroE & ~AzeroM;
+   assign BDenorm = BzeroE & ~BzeroM;
+   assign AInf = AonesE & AzeroM;
+   assign BInf = BonesE & BzeroM;
+   assign ANaN = AonesE & ~AzeroM;
+   assign BNaN = BonesE & ~BzeroM;
+   assign ASNaN = ANaN & ~A[51];
+   assign BSNaN = BNaN & ~B[51];
+   assign AZero = AzeroE & AzeroM;
+   assign BZero = BzeroE & BzeroM;   // zero needs a zero exponent AND a zero mantissa, as for AZero
+
+   // A and B are normalized if their exponents are not zero. 
+   assign ANorm = ~AzeroE;
+   assign BNorm = ~BzeroE;
+
+   // An "Invalid Operation" exception occurs if (A or B is a signalling NaN)
+   // or (A and B are both Infinite and the "effective operation" is 
+   // subtraction). 
+   assign add_sub = ~op_type[2] & ~op_type[1];
+   assign Invalid = (ASNaN | BSNaN | 
+		     (add_sub & AInf & BInf & (A[63]^B[63]^op_type[0]))) & ~converts;
+
+   // The Denorm flag is set if (A is denormalized and op_type is not 01X, i.e. not integer
+   // conversion) or (B is denormalized and the operation is addition or subtraction). 
+   assign Denorm = ADenorm&(op_type[2]|~op_type[1]) | BDenorm & add_sub;
+
+   // The result is a quiet NaN if (an "Invalid Operation" exception occurs) 
+   // or (A is a NaN) or (B is a NaN and the operation uses B).
+   assign ZQNaN = Invalid | ANaN | (BNaN & add_sub);
+
+   // ZPInf is set if ((A is Inf with A[63] set) or (B is -Inf and the operation is
+   // subtraction) or (B is +Inf and the operation is addition)) and (the result is
+   // not a quiet NaN). NOTE(review): the A term tests A[63]=1 - confirm sign polarity.
+   assign ZPInf = (AInf&A[63] | add_sub&BInf&(~B[63]^op_type[0]))&~ZQNaN;
+
+   // ZNInf is set if ((A is Inf with A[63] clear) or (B is +Inf and the operation is
+   // subtraction) or (B is -Inf and the operation is addition)) and the result is
+   // not a quiet NaN. NOTE(review): the A term tests A[63]=0 - confirm sign polarity.
+   assign ZNInf = (AInf&~A[63] | add_sub&BInf&(B[63]^op_type[0]))&~ZQNaN;
+
+   // Set the type of the result as follows:
+   // (needs optimization - got lazy or was late)
+   // Ztype	Result 
+   //  0000	Normal
+   //  0001	Quiet NaN
+   //  0010     Negative Infinity
+   //  0011     Positive Infinity
+   //  0100     +Bzero and +Azero (and vice-versa)
+   //  0101     +Bzero and -Azero (and vice-versa)
+   //  1000     Convert SP to DP (and vice-versa)
+
+   assign Ztype[0] = ((ZQNaN | ZPInf) & ~(~op_type[2] & op_type[1])) | 
+		     ((AZero & BZero & (A[63]^B[63]^op_type[0])) 
+		      & ~converts);
+   assign Ztype[1] = ((ZNInf | ZPInf) & ~(~op_type[2] & op_type[1])) | 
+		     (((AZero & BZero & A[63] & B[63] & ~op_type[0]) |
+		       (AZero & BZero & A[63] & ~B[63] & op_type[0])) 
+		      & ~converts);
+   assign Ztype[2] = ((AZero & BZero & ~op_type[1] & ~op_type[2]) 
+		      & ~converts);
+   assign Ztype[3] = (op_type[1] & op_type[2] & ~op_type[0]);
+
+   // Determine if the effective operation is subtraction
+   assign Sub = add_sub & (A[63]^B[63]^op_type[0]);
+
+endmodule // exception
+
diff --git a/wally-pipelined/src/fpu/fpadd/f32_add_rd.do b/wally-pipelined/src/fpu/fpadd/f32_add_rd.do
new file mode 100755
index 00000000..607fda62
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_add_rd.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_add_rd.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   38,932 vectors, 389,365ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f32_add_rne.do b/wally-pipelined/src/fpu/fpadd/f32_add_rne.do
new file mode 100755
index 00000000..bc5ede61
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_add_rne.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_add_rne.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   39,052 vectors, 390,565ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f32_add_ru.do b/wally-pipelined/src/fpu/fpadd/f32_add_ru.do
new file mode 100755
index 00000000..faf652d8
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_add_ru.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_add_ru.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   38,946 vectors, 389,500ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f32_add_rz.do b/wally-pipelined/src/fpu/fpadd/f32_add_rz.do
new file mode 100755
index 00000000..f24385db
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_add_rz.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_add_rz.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   39,111 vectors, 391,150ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f32_f64_rne.do b/wally-pipelined/src/fpu/fpadd/f32_f64_rne.do
new file mode 100755
index 00000000..4f5cc284
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_f64_rne.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_f64_rne.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   544 vectors (run length presumably sized at ~10ns/vector - TODO confirm)
+run 5480ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f32_sub_rd.do b/wally-pipelined/src/fpu/fpadd/f32_sub_rd.do
new file mode 100755
index 00000000..f4e8f6f7
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_sub_rd.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_sub_rd.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   38,932 vectors, 389,365ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f32_sub_rne.do b/wally-pipelined/src/fpu/fpadd/f32_sub_rne.do
new file mode 100755
index 00000000..e8efd2a2
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_sub_rne.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_sub_rne.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   39,052 vectors, 390,565ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f32_sub_ru.do b/wally-pipelined/src/fpu/fpadd/f32_sub_ru.do
new file mode 100755
index 00000000..677584f1
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_sub_ru.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_sub_ru.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   38,946 vectors, 389,500ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f32_sub_rz.do b/wally-pipelined/src/fpu/fpadd/f32_sub_rz.do
new file mode 100755
index 00000000..031da39b
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f32_sub_rz.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f32_sub_rz.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   39,111 vectors, 391,150ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_add_rd.do b/wally-pipelined/src/fpu/fpadd/f64_add_rd.do
new file mode 100755
index 00000000..cb6005b0
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_add_rd.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_add_rd.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   38,932 vectors, 389,365ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_add_rne.do b/wally-pipelined/src/fpu/fpadd/f64_add_rne.do
new file mode 100755
index 00000000..c22ba168
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_add_rne.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_add_rne.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   39,052 vectors, 390,565ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_add_ru.do b/wally-pipelined/src/fpu/fpadd/f64_add_ru.do
new file mode 100755
index 00000000..18f340a0
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_add_ru.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_add_ru.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   38,946 vectors, 389,500ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_add_rz.do b/wally-pipelined/src/fpu/fpadd/f64_add_rz.do
new file mode 100755
index 00000000..b527719e
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_add_rz.do
@@ -0,0 +1,58 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_add_rz.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   39,111 vectors, 391,150ns
+# Run long enough to cover every vector (100ns exercises only the first few)
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_f32_rne.do b/wally-pipelined/src/fpu/fpadd/f64_f32_rne.do
new file mode 100755
index 00000000..9376da17
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_f32_rne.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_f32_rne.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   565 vectors (run length presumably sized at ~10ns/vector - TODO confirm)
+run 5750ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_sub_rd.do b/wally-pipelined/src/fpu/fpadd/f64_sub_rd.do
new file mode 100755
index 00000000..fcbbbfcd
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_sub_rd.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_sub_rd.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   38,927 vectors, 389,315ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_sub_rne.do b/wally-pipelined/src/fpu/fpadd/f64_sub_rne.do
new file mode 100755
index 00000000..007c92e7
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_sub_rne.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_sub_rne.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   39,059 vectors, 390,635ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_sub_ru.do b/wally-pipelined/src/fpu/fpadd/f64_sub_ru.do
new file mode 100755
index 00000000..e5afa415
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_sub_ru.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_sub_ru.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   38,937 vectors, 389,415ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/f64_sub_rz.do b/wally-pipelined/src/fpu/fpadd/f64_sub_rz.do
new file mode 100755
index 00000000..cc807b08
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/f64_sub_rz.do
@@ -0,0 +1,56 @@
+# Copyright 1991-2016 Mentor Graphics Corporation
+# 
+# Modification by Oklahoma State University
+# Use with Testbench 
+# James Stine, 2008
+# Go Cowboys!!!!!!
+#
+# All Rights Reserved.
+#
+# THIS WORK CONTAINS TRADE SECRET AND PROPRIETARY INFORMATION
+# WHICH IS THE PROPERTY OF MENTOR GRAPHICS CORPORATION
+# OR ITS LICENSORS AND IS SUBJECT TO LICENSE TERMS.
+
+# Use this run.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do run.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do run.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# Create a fresh "work" library, deleting any previous one first
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# Compile the adder sources together with this script's testbench
+vlog convert_inputs.v exception.v lzd.v shifter.v adder.v cla52.v cla64.v rounder.v fpadd.v tb_f64_sub_rz.sv
+
+# Load the testbench; +acc preserves signal visibility for the wave window
+vsim -voptargs=+acc work.tb
+
+view wave
+
+# Display input and output signals as hexadecimal values
+# (adds every signal under /tb recursively)
+add wave -hex -r /tb/*
+
+# Set wave window display options
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {75 ns}
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+
+# Run the simulation
+#   39,113 vectors, 391,175ns
+run 405000ns
+quit
diff --git a/wally-pipelined/src/fpu/fpadd/fpadd.v b/wally-pipelined/src/fpu/fpadd/fpadd.v
new file mode 100755
index 00000000..7f5f05eb
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/fpadd.v
@@ -0,0 +1,216 @@
+//
+// File name : fpadd
+// Title     : Floating-Point Adder/Subtractor
+// project   : FPU
+// Library   : fpadd
+// Author(s) : James E. Stine, Jr.
+// Purpose   : definition of main unit to floating-point add/sub
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Basic Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
+//         or if (exp1 = exp2 AND mnt1 < mnt2)
+// Step 4: Shift the mantissa corresponding to the smaller exponent, 
+//          and extend precision by three bits to the right.
+// Step 5: Add or subtract the mantissas.
+// Step 6: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 7: Round the result.
+// Step 8: Put sum onto output.
+//
+
+
+module fpadd (AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);
+   // Top level of the floating-point adder/subtractor datapath.
+   input [63:0] op1;		// 1st input operand (A)
+   input [63:0] op2;		// 2nd input operand (B)
+   input [2:0] 	rm;		// Rounding mode - specify values 
+   input [2:0]	op_type;	// Function opcode
+   input 	P;   		// Result Precision (0 for double, 1 for single)
+   input 	OvEn;		// Overflow trap enabled
+   input 	UnEn;   	// Underflow trap enabled
+
+   output [63:0] AS_Result;	// Result of operation
+   output [4:0]  Flags;   	// IEEE exception flags 
+   output 	 Denorm;   	// Denorm on input or output   
+
+   wire [63:0] 	 Float1; 
+   wire [63:0] 	 Float2;
+   wire [63:0] 	 IntValue;
+   wire [11:0] 	 exp1, exp2;
+   wire [11:0] 	 exp_diff1, exp_diff2;
+   wire [10:0] 	 exponent, exp_pre;
+   wire [11:0] 	 exp_shift;
+   wire [63:0] 	 Result;   
+   wire [51:0] 	 mantissaA;
+   wire [56:0] 	 mantissaA1;
+   wire [63:0] 	 mantissaA3;
+   wire [51:0] 	 mantissaB; 
+   wire [56:0] 	 mantissaB1, mantissaB2;
+   wire [63:0] 	 mantissaB3;
+   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
+   wire [5:0] 	 align_shift;
+   wire [5:0] 	 norm_shift;
+   wire [3:0] 	 sel_inv;
+   wire		 op1_Norm, op2_Norm;
+   wire		 opA_Norm, opB_Norm;
+   wire		 Invalid;
+   wire 	 DenormIn, DenormIO;
+   wire [4:0] 	 FlagsIn;   	
+   wire 	 exp_valid;
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire 	 signA, sign_corr;
+   wire          corr_sign;
+   wire 	 zeroB;         
+   wire 	 convert;
+   wire          swap;
+   wire          sub;
+
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the op_type, and their precision P. 
+   // Single precision inputs are converted to double precision 
+   // and the sign of the first operand is set appropriately based on
+   // if the operation is absolute value or negation. 
+
+   convert_inputs conv1 (Float1, Float2, op1, op2, op_type, P);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" Input Flags. The "sel_inv" is used in
+   // the third pipeline stage to select the result. Also, op1_Norm
+   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
+   // sub is one if the effective operation is subtraction. 
+
+   exception exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, sub, 
+		   Float1, Float2, op_type);
+
+   // Perform Exponent Subtraction (used for alignment). For performance
+   // both exponent subtractions are performed in parallel. This was 
+   // changed to a behavior level to allow the tools to  try to optimize
+   // the two parallel additions. The input values are zero-extended to 12 
+   // bits prior to performing the addition. 
+
+   assign exp1 = {1'b0, Float1[62:52]};
+   assign exp2 = {1'b0, Float2[62:52]};
+   assign exp_diff1 = exp1 - exp2;
+   assign exp_diff2 = exp2 - exp1;
+
+   // The second operand (B) should be set to zero, if op_type does not
+   // specify addition or subtraction
+   assign zeroB = op_type[2] | op_type[1];
+
+   // Swapped operands if zeroB is not one and exp1 < exp2. 
+   // Swapping causes exp2 to be used for the result exponent. 
+   // Only the exponent of the larger operand is used to determine
+   // the final result. 
+   assign swap = exp_diff1[11] & ~zeroB;
+   assign exponent = swap ? exp2[10:0] : exp1[10:0];
+   assign mantissaA = swap ? Float2[51:0] : Float1[51:0];
+   assign mantissaB = swap ? Float1[51:0] : Float2[51:0];
+   assign signA     = swap ? Float2[63] : Float1[63];   
+
+   // Determine the alignment shift and limit it to 63. If any bit from 
+   // exp_shift[6] to exp_shift[11] is one, the 6-bit shift is all ones. 
+   assign exp_shift = swap ? exp_diff2 : exp_diff1;
+   assign exp_gt63 = exp_shift[11] | exp_shift[10] | exp_shift[9] 
+     | exp_shift[8] | exp_shift[7] | exp_shift[6];
+   assign align_shift = exp_shift[5:0] | {6{exp_gt63}};
+
+   // Unpack the 52-bit mantissas to 57-bit numbers of the form.
+   //    001.M[51]M[50] ... M[1]M[0]00
+   // Unless the number has an exponent of zero, in which case it
+   // is unpacked as
+   //    000.00 ... 00
+   // This effectively flushes denormalized values to zero. 
+   // The three bits to the left of the binary point prevent overflow
+   // and loss of sign information. The two bits to the right of the 
+   // original mantissa form the "guard" and "round" bits that are used
+   // to round the result. 
+   assign opA_Norm = swap ? op2_Norm : op1_Norm;
+   assign opB_Norm = swap ? op1_Norm : op2_Norm;
+   assign mantissaA1 = {2'h0, opA_Norm, mantissaA[51:0]&{52{opA_Norm}}, 2'h0};
+   assign mantissaB1 = {2'h0, opB_Norm, mantissaB[51:0]&{52{opB_Norm}}, 2'h0};
+
+   // Perform mantissa alignment using a 57-bit barrel shifter 
+   // If any of the bits shifted out are one, Sticky_out is set. 
+   // The size of the barrel shifter could be reduced by two bits
+   // by not adding the leading two zeros until after the shift. 
+   barrel_shifter_r57 bs1 (mantissaB2, Sticky_out, mantissaB1, align_shift);
+
+   // Place either the sign-extended 32-bit value or the original 64-bit value 
+   // into IntValue (to be used for integer to floating point conversion)
+   assign IntValue [31:0] = op1[31:0];
+   assign IntValue [63:32] = op_type[0] ? {32{op1[31]}} : op1[63:32];
+
+   // If doing an integer to floating point conversion, mantissaA3 is set to 
+   // IntValue and the prenormalized exponent is set to 1084. Otherwise, 
+   // mantissaA3 is simply extended to 64-bits by setting the 7 LSBs to zero, 
+   // and the exponent value is left unchanged. 
+   assign convert       = ~op_type[2] & op_type[1];
+   assign mantissaA3    = convert ? IntValue : {mantissaA1, 7'h0};
+   assign exp_pre       = convert ? 11'b10000111100 : exponent;
+
+   // Put zero in for mantissaB3, if zeroB is one. Otherwise, B is extended to 
+   // 64-bits by setting the 7 LSBs to the Sticky_out bit followed by six  
+   // zeros. 
+   assign mantissaB3[63:7] = mantissaB2 & {57{~zeroB}};
+   assign mantissaB3[6]    = Sticky_out & ~zeroB;
+   assign mantissaB3[5:0]  = 6'h0;
+
+   // The sign of the result needs to be corrected if the true
+   // operation is subtraction and the input operands were swapped. 
+   assign corr_sign = ~op_type[2]&~op_type[1]&op_type[0]&swap;
+   
+   // 64-bit Mantissa Adder/Subtractor
+   cla64 add1 (sum, mantissaA3, mantissaB3, sub);
+
+   // 64-bit Mantissa Subtractor - to get the two's complement of the 
+   // result when the sign from the adder/subtractor is negative. 
+   cla_sub64 sub1 (sum_tc, mantissaB3, mantissaA3);
+
+   // Determine the correct sign of the result
+   assign sign_corr = ((corr_sign ^ signA) & ~convert) ^ sum[63];   
+   
+   // If the sum is negative, use its two's complement instead. 
+   // This value has to be 64-bits to correctly handle the 
+   // case 10...00
+   assign sum_corr = sum[63] ? sum_tc : sum;
+
+   // Leading-Zero Detector. Determine the size of the shift needed for
+   // normalization. If sum_corr is all zeros, then exp_valid is 
+   // zero; otherwise, it is one. 
+   lz64 lzd1 (norm_shift, exp_valid, sum_corr);
+
+   // Barrel shifter used for normalization. It takes as inputs  
+   // the corrected sum and the amount by which the sum should 
+   // be left shifted. It outputs the normalized sum. 
+   barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift);
+   
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. If the result is a single precision number, the actual 
+   // mantissa is in the upper 23 bits and the lower 29 bits are zero. 
+   // At this point, normalization has already been performed, so we know 
+   // exactly where the rounding point is. The rounding unit also
+   // handles special cases and sets the exception flags.
+
+   // Changed DenormIO -> Denorm and FlagsIn -> Flags in order to
+   // help in processor reservation station detection of load/stores. In
+   // other words, the processor would like to know ahead of time that
+   // if the result is an exception then don't load or store.
+   rounder round1 (Result, DenormIO, FlagsIn, rm, P, OvEn, UnEn, exp_valid, 
+		   sel_inv, Invalid, DenormIn, convert, sign_corr, exp_pre, 
+		   norm_shift, sum_norm);
+
+   // Drive the final result and the exception flags onto the outputs.
+   assign AS_Result = Result;
+   assign {Denorm, Flags} = {DenormIO, FlagsIn};
+   
+endmodule // fpadd
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/lzd.v b/wally-pipelined/src/fpu/fpadd/lzd.v
new file mode 100755
index 00000000..b3a14160
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/lzd.v
@@ -0,0 +1,137 @@
+// V. G. Oklobdzija, "Algorithmic design of a hierarchical and modular
+//   leading zero detector circuit," in Electronics Letters, vol. 29,
+//   no. 3, pp. 283-284, 4 Feb. 1993, doi: 10.1049/el:19930193.
+      
+module lz2 (
+   output P,    // position bit: 1 only when B1 is 0 and B0 is 1
+   output V,    // valid: at least one of the two inputs is 1
+   input  B0,
+   input  B1
+);
+
+   // Two-input base cell of the leading-zero detector tree.
+   assign P = ~B1 & B0;
+   assign V = B1 | B0;
+
+endmodule // lz2
+
+// Note: This module is not made out of two lz2's - why not? (MJS)
+
+module lz4 (
+   output [1:0] ZP,   // merged position code
+   output       ZV,   // valid: either half reported a one
+   input        B0,   // position bit of the half selected when V0 is 1
+   input        B1,   // position bit of the other half
+   input        V0,   // valid flag paired with B0
+   input        V1    // valid flag paired with B1
+);
+
+   // Merge two lz2 results: the V0 half has priority. ZP[1] flags that
+   // the V0 half was empty; ZP[0] forwards the chosen half's position bit.
+   assign ZV = V1 | V0;
+   assign ZP = {~V0, (V0 ? B0 : B1)};
+
+endmodule // lz4
+
+// Note: This module is not made out of two lz4's - why not? (MJS)
+
+module lz8 (
+   output [2:0] ZP,   // position code built by the priority muxing below
+   output       ZV,   // valid: B contains at least one 1
+   input  [7:0] B
+);
+
+   // lz2 results for each bit pair, named after the bits they cover
+   wire       p32, v32;
+   wire       p10, v10;
+   wire       p76, v76;
+   wire       p54, v54;
+
+   // lz4 results for the low nibble B[3:0] and the high nibble B[7:4]
+   wire [1:0] lo_pos, hi_pos;
+   wire       lo_vld, hi_vld;
+
+   // Low nibble
+   lz2 l1 (.P(p32), .V(v32), .B0(B[2]), .B1(B[3]));
+   lz2 l2 (.P(p10), .V(v10), .B0(B[0]), .B1(B[1]));
+   lz4 l3 (.ZP(lo_pos), .ZV(lo_vld), .B0(p32), .B1(p10), .V0(v32), .V1(v10));
+
+   // High nibble
+   lz2 l4 (.P(p76), .V(v76), .B0(B[6]), .B1(B[7]));
+   lz2 l5 (.P(p54), .V(v54), .B0(B[4]), .B1(B[5]));
+   lz4 l6 (.ZP(hi_pos), .ZV(hi_vld), .B0(p76), .B1(p54), .V0(v76), .V1(v54));
+
+   // The high nibble has priority: its code is used whenever it holds a 1.
+   // ZP[2] is set when the high nibble holds no 1.
+   assign ZV      = hi_vld | lo_vld;
+   assign ZP[2]   = ~hi_vld;
+   assign ZP[1:0] = hi_vld ? hi_pos : lo_pos;
+
+endmodule // lz8
+
+module lz16 (
+   output [3:0]  ZP,   // position code
+   output        ZV,   // valid: B contains at least one 1
+   input  [15:0] B
+);
+
+   // lz8 results for the low byte B[7:0] and the high byte B[15:8]
+   wire [2:0] lo_pos, hi_pos;
+   wire       lo_vld, hi_vld;
+
+   lz8 l1 (.ZP(lo_pos), .ZV(lo_vld), .B(B[7:0]));
+   lz8 l2 (.ZP(hi_pos), .ZV(hi_vld), .B(B[15:8]));
+
+   // The high byte has priority: its code is used whenever it holds a 1.
+   // ZP[3] is set when the high byte holds no 1.
+   assign ZV      = hi_vld | lo_vld;
+   assign ZP[3]   = ~hi_vld;
+   assign ZP[2:0] = hi_vld ? hi_pos : lo_pos;
+
+endmodule // lz16
+
+module lz32 (
+   output [4:0]  ZP,   // position code
+   output        ZV,   // valid: B contains at least one 1
+   input  [31:0] B
+);
+
+   // lz16 results for the low half B[15:0] and the high half B[31:16]
+   wire [3:0] lo_pos, hi_pos;
+   wire       lo_vld, hi_vld;
+
+   lz16 l1 (.ZP(lo_pos), .ZV(lo_vld), .B(B[15:0]));
+   lz16 l2 (.ZP(hi_pos), .ZV(hi_vld), .B(B[31:16]));
+
+   // The high half has priority: its code is used whenever it holds a 1.
+   // ZP[4] is set when the high half holds no 1.
+   assign ZV      = hi_vld | lo_vld;
+   assign ZP[4]   = ~hi_vld;
+   assign ZP[3:0] = hi_vld ? hi_pos : lo_pos;
+
+endmodule // lz32
+
+// This module returns the number of leading zeros ZP in the 64-bit 
+// number B. If there are no ones in B, then ZP and ZV are both 0.
+
+module lz64 (
+   output [5:0]  ZP,   // leading-zero count of B; forced to 0 when ZV is 0
+   output        ZV,   // valid: B contains at least one 1
+   input  [63:0] B
+);
+
+   // lz32 results for the low word B[31:0] and the high word B[63:32]
+   wire [4:0] lo_pos, hi_pos;
+   wire       lo_vld, hi_vld;
+
+   lz32 l1 (.ZP(lo_pos), .ZV(lo_vld), .B(B[31:0]));
+   lz32 l2 (.ZP(hi_pos), .ZV(hi_vld), .B(B[63:32]));
+
+   // Same priority merge as the smaller cells, except that every bit of
+   // ZP is gated with ZV so an all-zero input reports a count of zero.
+   assign ZV      = hi_vld | lo_vld;
+   assign ZP[5]   = ZV & ~hi_vld;
+   assign ZP[4:0] = {5{ZV}} & (hi_vld ? hi_pos : lo_pos);
+
+endmodule // lz64
+
diff --git a/wally-pipelined/src/fpu/fpadd/rounder.v b/wally-pipelined/src/fpu/fpadd/rounder.v
new file mode 100755
index 00000000..b994acb3
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/rounder.v
@@ -0,0 +1,214 @@
+// The rounder takes as inputs a 64-bit value to be rounded, A, the 
+// exponent of the value to be rounded, the sign of the final result, Sign, 
+// the precision of the results, P, and the three-bit rounding mode, rm. 
+// It produces a rounded 52-bit result, Z, the exponent of the rounded 
+// result, Z_exp, and a flag that indicates if the result was rounded,
+// Inexact. With rm[2] = 0, rm[1:0] selects the mode (rm[2] = 1: ties away).
+//	rm[1:0]		Mode
+//      00 		round-to-nearest-even
+//	01 		round-toward-zero
+//      10 		round-toward-plus infinity
+//      11  		round-toward-minus infinity
+// The rounding algorithm determines if '1' should be added to the 
+// truncated signficant result, based on three significant bits 
+// (least (L), round (R) and sticky (S)), the rounding mode (rm)
+// and the sign of the final result (Sign). Visually, L and R appear as
+//    xxxxxL,Rxxxxxxx
+// where , denotes the rounding boundary. S is the logical OR of all the
+// bits to the right of R. 
+
+module rounder (Result, DenormIO, Flags, rm, P, OvEn, 
+		UnEn, exp_valid, sel_inv, Invalid, DenormIn, convert, Asign, Aexp, 
+		norm_shift, A);
+
+   input  [2:0]  rm;         // rounding mode; decoded in add_one below
+   input         P;          // precision: 1 selects the single-precision L/R/S bits
+   input         OvEn;       // overflow trap enable
+   input         UnEn;       // underflow trap enable
+   input         exp_valid;  // when 0 the result is forced to zero (see Rzero)
+   input [3:0] 	 sel_inv;    // special-result select; 000 in [2:0] means Valid
+   input	 Invalid;    // passed straight through to Flags
+   input	 DenormIn;   // denormalized input seen; ORed into DenormIO
+   input         convert;    // conversion operation in progress
+   input         Asign;      // sign of the value being rounded
+   input [10:0]  Aexp;       // exponent before normalization adjustment
+   input [5:0] 	 norm_shift; // normalization shift, subtracted from Aexp
+   input [63:0]  A;          // normalized value to be rounded
+   
+   output [63:0] Result;
+   output 	 DenormIO;
+   output [4:0]  Flags;
+   
+   wire          Rsign;
+   wire [10:0] 	 Rexp;
+   wire [11:0] 	 Texp;
+   wire [51:0] 	 Rmant;
+   wire [51:0] 	 Tmant;
+   wire          Rzero;
+   wire          VSS = 1'b0;
+   wire          VDD = 1'b1;
+   wire [51:0] 	 B;			// Value used to add the "ones"
+   wire		 S_SP;			// Single precision sticky bit
+   wire		 S_DP;			// Double precision sticky bit
+   wire		 S;			// Actual sticky bit
+   wire		 R;			// Round bit
+   wire		 L;			// Least significant bit
+   wire		 add_one;		// '1' if one should be added
+   wire		 UnFlow_SP, UnFlow_DP, UnderFlow; 
+   wire		 OvFlow_SP, OvFlow_DP, OverFlow;		
+   wire		 Inexact;
+   wire		 Round_zero;
+   wire		 Infinite;
+   wire		 VeryLarge;
+   wire		 Largest;
+   wire		 Adj_exp;
+   wire		 Valid;
+   wire		 NaN;
+   wire		 Cout;
+   wire		 Texp_l7z;
+   wire		 Texp_l7o;
+   wire		 OvCon;
+
+   // Determine the sticky bits for double and single precision
+   assign S_DP= A[9]|A[8]|A[7]|A[6]|A[5]|A[4]|A[3]|A[2]|A[1]|A[0];
+   assign S_SP = S_DP |A[38]|A[37]|A[36]|A[35]|A[34]|A[33]|A[32]|A[31]|A[30]|
+                 A[29]|A[28]|A[27]|A[26]|A[25]|A[24]|A[23]|A[22]|A[21]|A[20]|
+                 A[19]|A[18]|A[17]|A[16]|A[15]|A[14]|A[13]|A[12]|A[11]|A[10];
+
+   // Set the least (L), round (R), and sticky (S) bits based on
+   // the precision. 
+   assign {L, R, S} = P ? {A[40],A[39],S_SP} : {A[11],A[10],S_DP};
+
+   // Add one if ((the rounding mode is round-to-nearest) and (R is one) and
+   // (S or L is one)) or ((the rounding mode is towards plus or minus 
+   // infinity (rm[1] = 1)) and (the sign and rm[0] are the same) and 
+   // (R or S is one)). 
+
+   // Appended term allows for roundTiesAway: when rm[2] is one, one is added
+   // whenever the round bit R is one, regardless of the sign of the result.
+
+   assign add_one = ~rm[2] & ((~rm[1]&~rm[0]&R&(L|S)) | (rm[1]&(Asign^~rm[0])&(R|S))) | (rm[2] & R);
+
+   // Add one using a 52-bit adder. The one is added to the LSB B[0] for
+   // double precision or to B[29] for single precision. 
+   // This could be simplified by using a specialized adder.
+   // The current adder is actually 64-bits. The leading one 
+   // for normalized results is not included in the addition.
+   assign B = {{22{VSS}}, add_one&P, {28{VSS}}, add_one&~P};
+   cla52 add1(Tmant, Cout, A[62:11], B);
+
+   // Now that rounding is done, we compute the final exponent
+   // and test for special cases. 
+
+   // Compute the value of the exponent by subtracting the shift 
+   // value from the previous exponent and then adding 2 + cout. 
+   // If needed this could be optimized to use a specialized 
+   // adder. 
+
+   assign Texp    = {VSS, Aexp} - {{6{VSS}}, norm_shift} +{{10{VSS}}, VDD, Cout};   
+   
+   // Overflow only occurs for double precision, if Texp[10] to Texp[0] are 
+   // all ones. To encourage sharing with single precision overflow detection,
+   // the lower 7 bits are tested separately. 
+   assign Texp_l7o  = Texp[6]&Texp[5]&Texp[4]&Texp[3]&Texp[2]&Texp[1]&Texp[0];
+   assign OvFlow_DP = Texp[10]&Texp[9]&Texp[8]&Texp[7]&Texp_l7o;
+
+   // Overflow occurs for single precision if (Texp[10] is one) and 
+   // ((Texp[9] or Texp[8] or Texp[7] is one) or (Texp[6] to Texp[0] 
+   // are all ones)). 
+   assign OvFlow_SP = Texp[10]&(Texp[9]|Texp[8]|Texp[7]|Texp_l7o);
+
+   // Underflow occurs for double precision if (Texp[11] is one)  or Texp[10] to 
+   // Texp[0] are all zeros. 
+   assign Texp_l7z  = ~Texp[6]&~Texp[5]&~Texp[4]&~Texp[3]&~Texp[2]&~Texp[1]&~Texp[0];
+   assign UnFlow_DP = Texp[11] | ~Texp[10]&~Texp[9]&~Texp[8]&~Texp[7]&Texp_l7z;
+
+   // Underflow occurs for single precision if (Texp[10] is zero) and (Texp[9],
+   // Texp[8] or Texp[7] is zero, or Texp[6] to Texp[0] are all zeros). 
+   assign UnFlow_SP = (~Texp[10]&(~Texp[9]|~Texp[8]|~Texp[7]|Texp_l7z));
+   
+   // Set the overflow and underflow flags. They should not be set if
+   // the input was infinite or NaN or the output of the adder is zero.
+   // sel_inv[2:0] = 000: Valid
+   // sel_inv[2:0] = 001: NaN
+   assign Valid = (~sel_inv[2]&~sel_inv[1]&~sel_inv[0]);
+   assign NaN   = ~sel_inv[2]&~sel_inv[1]& sel_inv[0];
+   assign UnderFlow = ((P & UnFlow_SP | UnFlow_DP)&Valid&exp_valid) |
+		      (~Aexp[10]&Aexp[9]&Aexp[8]&Aexp[7]&~Aexp[6]
+		       &~Aexp[5]&~Aexp[4]&~Aexp[3]&~Aexp[2]
+		       &~Aexp[1]&~Aexp[0]&sel_inv[3]);
+   assign OverFlow  = (P & OvFlow_SP | OvFlow_DP)&Valid&~UnderFlow&exp_valid;
+
+   // The DenormIO is set if underflow has occurred or if there was a
+   // denormalized input. 
+   assign DenormIO = DenormIn | UnderFlow;
+
+   // The final result is Inexact if any rounding occurred ((i.e., R or S 
+   // is one), or (if the result overflows ) or (if the result underflows and the 
+   // underflow trap is not enabled)) and (value of the result was not previously set 
+   // by an exception case). 
+   assign Inexact = (R|S|OverFlow|(UnderFlow&~UnEn))&Valid;
+
+   // Set the IEEE Exception Flags. Bit order, MSB to LSB: Underflow, 
+   // Div_By_0 (tied to zero here), Overflow, Invalid, Inexact. 
+   assign Flags = {UnderFlow, VSS, OverFlow, Invalid, Inexact};
+
+   // Determine the final result. 
+
+   // The sign of the final result is one if the result is not zero and
+   // the sign of A is one, or if the result is zero and the rounding 
+   // mode is round-to-minus infinity. The final result is zero, if exp_valid
+   // is zero. If underflow occurs, then the result is set to zero.
+   //   
+   // For Zero (goes equally for subtraction although 
+   // signs may alter operands sign):
+   // -0 + -0 = -0 (always)
+   // +0 + +0 = +0 (always)
+   // -0 + +0 = +0 (for RN, RZ, RU) 
+   // -0 + +0 = -0 (for RD) 
+   assign Rzero = ~exp_valid | UnderFlow;
+   assign Rsign = ((Asign&exp_valid | 
+		    (sel_inv[2]&~sel_inv[1]&sel_inv[0]&rm[1]&rm[0] |
+		     sel_inv[2]&sel_inv[1]&~sel_inv[0] |		  
+		     ~exp_valid&rm[1]&rm[0]&~sel_inv[2] | 
+		     UnderFlow&rm[1]&rm[0]) & ~convert) & ~sel_inv[3]) |
+		  (Asign & sel_inv[3]);
+   
+   // The exponent of the final result is zero if the final result is 
+   // zero or a denorm, all ones if the final result is NaN or Infinite
+   // or overflow occurred and the magnitude of the number is 
+   // not rounded toward zero, and all ones with an LSB of zero
+   // if overflow occurred and the magnitude of the number is 
+   // rounded toward zero. If the result is single precision, 
+   // Texp[7] should be inverted. When the Overflow trap is enabled (OvEn = 1)
+   // and overflow occurs and the operation is not conversion, bits 10 and 9 are 
+   // inverted for double precision, and bits 7 and 6 are inverted for single precision. 
+   assign Round_zero = ~rm[1]&rm[0] | ~Asign&rm[0] | Asign&rm[1]&~rm[0];
+   assign VeryLarge = OverFlow & ~OvEn;
+   assign Infinite   = (VeryLarge & ~Round_zero) | (~sel_inv[2] & sel_inv[1]);
+   assign Largest = VeryLarge & Round_zero;
+   assign Adj_exp = OverFlow & OvEn & ~convert;
+   assign Rexp[10:1] = ({10{~Valid}} | 
+			{Texp[10]&~Adj_exp, Texp[9]&~Adj_exp, Texp[8], 
+			 (Texp[7]^P)&~(Adj_exp&P), Texp[6]&~(Adj_exp&P), Texp[5:1]} | 
+		        {10{VeryLarge}})&{10{~Rzero | NaN}};
+   assign Rexp[0]    = ({~Valid} | Texp[0] | Infinite)&(~Rzero | NaN)&~Largest;
+   
+   // If the result is zero or infinity, the mantissa is all zeros. 
+   // If the result is NaN, the mantissa is 10...0
+   // If the result is the largest floating point number, the mantissa
+   // is all ones. Otherwise, the mantissa is not changed. 
+   assign Rmant[51] = Largest | NaN | (Tmant[51]&~Infinite&~Rzero);
+   assign Rmant[50:0] = {51{Largest}} | (Tmant[50:0]&{51{~Infinite&Valid&~Rzero}});
+
+   // For single precision, the 8 least significant bits of the exponent
+   // and 23 most significant bits of the mantissa contain bits used 
+   // for the final result. A double precision result is returned if 
+   // overflow has occurred, the overflow trap is enabled, and a conversion
+   // is being performed. 
+   assign OvCon = OverFlow & OvEn & convert;
+   assign Result = (P&~OvCon) ? {Rsign, Rexp[7:0], Rmant[51:29], {32{VSS}}}
+	           : {Rsign, Rexp, Rmant};
+
+endmodule // rounder
+
diff --git a/wally-pipelined/src/fpu/fpadd/shifter.v b/wally-pipelined/src/fpu/fpadd/shifter.v
new file mode 100755
index 00000000..7a85fc6a
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/shifter.v
@@ -0,0 +1,119 @@
+
+// MJS - This module implements a 57-bit 2-to-1 multiplexor, which is
+// used in the barrel shifter for significand alignment.
+
+module mux21x57 (
+   output [56:0] Z,
+   input  [56:0] A,     // driven onto Z when Sel is 0
+   input  [56:0] B,     // driven onto Z when Sel is 1
+   input         Sel
+);
+
+   // 57-bit wide two-way select
+   assign Z = Sel ? B : A;
+
+endmodule // mux21x57
+
+// MJS - This module implements a 64-bit 2-to-1 multiplexor, which is
+// used in the barrel shifter for significand normalization. 
+
+module mux21x64 (
+   output [63:0] Z,
+   input  [63:0] A,     // driven onto Z when Sel is 0
+   input  [63:0] B,     // driven onto Z when Sel is 1
+   input         Sel
+);
+
+   // 64-bit wide two-way select
+   assign Z = Sel ? B : A;
+
+endmodule // mux21x64
+
+// The implementation of the barrel shifter was modified to use 
+// fewer gates. It is now implemented using six 64-bit 2-to-1 muxes. The 
+// barrel shifter takes a 64-bit input A and shifts it left by up to 
+// 63 bits, as specified by Shift, to produce a 64-bit output Z. 
+// Bits to the right are filled with zeros. 
+// The shift is implemented using 6 stages that shift by 32,
+// 16, 8, 4, 2, and 1 bits. 
+
+module barrel_shifter_l64 (
+   output [63:0] Z,       // A shifted left by Shift, zero-filled on the right
+   input  [63:0] A,
+   input  [5:0]  Shift
+);
+
+   // Value after each conditional shift stage; the name gives the shift
+   // distance of the stage that produced it.
+   wire [63:0] s32;
+   wire [63:0] s16;
+   wire [63:0] s8;
+   wire [63:0] s4;
+   wire [63:0] s2;
+
+   // Six cascaded 2-to-1 muxes. Each stage either passes its input
+   // through (Shift bit is 0) or selects the same value moved left by
+   // a fixed distance with zeros appended (Shift bit is 1). Shift[5]
+   // controls the 32-bit stage and Shift[0] the 1-bit stage, so the
+   // total distance equals the value of Shift.
+   mux21x64 mx01 (.Z(s32), .A(A),   .B({A[31:0],   32'h0}), .Sel(Shift[5]));
+   mux21x64 mx02 (.Z(s16), .A(s32), .B({s32[47:0], 16'h0}), .Sel(Shift[4]));
+   mux21x64 mx03 (.Z(s8),  .A(s16), .B({s16[55:0],  8'h0}), .Sel(Shift[3]));
+   mux21x64 mx04 (.Z(s4),  .A(s8),  .B({s8[59:0],   4'h0}), .Sel(Shift[2]));
+   mux21x64 mx05 (.Z(s2),  .A(s4),  .B({s4[61:0],  2'b00}), .Sel(Shift[1]));
+   mux21x64 mx06 (.Z(Z),   .A(s2),  .B({s2[62:0],   1'b0}), .Sel(Shift[0]));
+
+endmodule // barrel_shifter_l64
+
+// The implementation of the barrel shifter was modified to use 
+// fewer gates. It is now implemented using six 57-bit 2-to-1 muxes. The 
+// barrel shifter takes a 57-bit input A and right shifts it by up to 
+// 63-bits, as specified by Shift, to produce a 57-bit output Z. 
+// It also computes a Sticky bit, which is set to 
+// one if any of the bits that were shifted out was one.
+// Bits shifted into the left are filled with zeros. 
+// The 63 bit shift is implemented using 6 stages of shifts of 32
+// 16, 8, 4, 2, and 1 bits.
+
+module barrel_shifter_r57 (
+   output [56:0] Z,       // A shifted right by Shift, zero-filled on the left
+   output        Sticky,  // 1 when any bit shifted out of A was 1
+   input  [56:0] A,
+   input  [5:0]  Shift
+);
+
+   // Value after each conditional shift stage, named by shift distance
+   wire [56:0] s32;
+   wire [56:0] s16;
+   wire [56:0] s8;
+   wire [56:0] s4;
+   wire [56:0] s2;
+
+   // Bits discarded by each stage, gated by that stage's Shift bit so
+   // that a stage which does not shift contributes only zeros.
+   wire [62:0] dropped;
+
+   // Six cascaded 2-to-1 muxes; Shift[5] controls the 32-bit stage and
+   // Shift[0] the 1-bit stage. Zeros are shifted in from the left.
+   mux21x57 mx01 (.Z(s32), .A(A),   .B({32'h0,   A[56:32]}), .Sel(Shift[5]));
+   mux21x57 mx02 (.Z(s16), .A(s32), .B({16'h0, s32[56:16]}), .Sel(Shift[4]));
+   mux21x57 mx03 (.Z(s8),  .A(s16), .B({ 8'h0,  s16[56:8]}), .Sel(Shift[3]));
+   mux21x57 mx04 (.Z(s4),  .A(s8),  .B({ 4'h0,   s8[56:4]}), .Sel(Shift[2]));
+   mux21x57 mx05 (.Z(s2),  .A(s4),  .B({2'b00,   s4[56:2]}), .Sel(Shift[1]));
+   mux21x57 mx06 (.Z(Z),   .A(s2),  .B({ 1'b0,   s2[56:1]}), .Sel(Shift[0]));
+
+   // Collect what each stage throws away. A stage that shifts by k bits
+   // discards the k low-order bits of its own input, so the low bits of
+   // every stage input are captured here, masked by whether that stage
+   // is actually shifting.
+   assign dropped[62]    =        Shift[0]   &  s2[0];
+   assign dropped[61:60] = { 2{Shift[1]}} &  s4[1:0];
+   assign dropped[59:56] = { 4{Shift[2]}} &  s8[3:0];
+   assign dropped[55:48] = { 8{Shift[3]}} & s16[7:0];
+   assign dropped[47:32] = {16{Shift[4]}} & s32[15:0];
+   assign dropped[31:0]  = {32{Shift[5]}} &   A[31:0];
+
+   // Sticky is the OR of every discarded bit
+   assign Sticky = |dropped;
+
+endmodule // barrel_shifter_r57
\ No newline at end of file
diff --git a/wally-pipelined/src/fpu/fpadd/tb.v b/wally-pipelined/src/fpu/fpadd/tb.v
new file mode 100755
index 00000000..e3c65559
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb.v
@@ -0,0 +1,86 @@
+//
+// File name : tb.v
+// Title     : stimulus
+// project   : mult
+// Library   : test
+// Author(s) : James E. Stine, Jr.
+// Purpose   : definition of modules for testbench 
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+
+// Top level stimulus module
+
+module stimulus;
+
+   reg clk;  // Always declared so can simulate based on clock
+    
+   // Declare variables for stimulating input
+   reg [63:0]  op1;
+   reg [63:0]  op2;
+   reg [2:0] rm;        // same width as the 3-bit rm port of fpadd
+   reg [2:0] op_type;
+   reg P;
+   reg OvEn;
+   reg UnEn;
+   
+   wire [63:0] AS_Result;
+   wire [4:0] Flags;
+   wire Denorm;
+
+   integer     handle3;
+   integer     desc3;      
+
+   // Instantiate the design under test (floating-point adder/subtractor)
+   fpadd dut (AS_Result, Flags, Denorm, op1, op2, rm, op_type, P , OvEn, UnEn);
+   
+   // Setup the clock to toggle every 25 time units 
+   initial 
+     begin	
+	clk = 1'b1;
+	forever #25 clk = ~clk;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("tb.out");
+     end
+   
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#5 $fdisplay(desc3, "%h %h || %h", op1, op2, AS_Result); // log to tb.out
+     end
+   
+   // Stimulate the Input Signals
+   initial
+     begin
+	// Add your test vectors here
+	$display("%h", AS_Result);
+	#0   rm = 3'b000;
+	#0   op_type = 3'b000;
+	#0   P = 1'b0;
+	#0   OvEn = 1'b0;
+	#0   UnEn = 1'b0;
+	#0   op1 = 64'h4031e147ae147ae1;
+	#0   op2 = 64'h4046e147ae147ae1;
+	$display("%h", AS_Result);
+	#200;
+	#0   rm = 3'b000;
+	#0   op_type = 3'b000;
+	#0   P = 1'b0;
+	#0   OvEn = 1'b0;
+	#0   UnEn = 1'b0;
+	#0   op1 = 64'h4031e147ae147ae1;
+	#0   op2 = 64'h4046e147ae147ae1;
+	$display("%h", AS_Result);
+	
+     end
+
+endmodule // stimulus
+
+
+
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_add_rd.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_add_rd.sv
new file mode 100755
index 00000000..9b2060cb
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_add_rd.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+   // Self-checking testbench for fpadd: f32 add vectors, rm = 3'b011.
+   logic [31:0]  op1;		
+   logic [31:0]  op2;		
+   logic [2:0] 	 rm;		// same width as the 3-bit rm port of fpadd
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test; operands occupy the upper 32 bits
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, {op2, 32'h0}, 
+	      rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_add_rd.out");
+	$readmemh("f32_add_rd.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b000;
+	#0  P = 1'b1;
+	#0  rm = 3'b011;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5  $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // check results on falling edge of clk
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+	  
+	  vectornum = vectornum + 1;
+	  // An entry never written by $readmemh is still all x, which marks
+	  // the end of the vectors. The literal must be as wide as the entry
+	  // (104 bits): a narrower x literal is zero-extended and never matches.
+	  if (testvectors[vectornum] === 104'bx) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               $finish;  // stop here so the summary is reported exactly once
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_add_rne.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_add_rne.sv
new file mode 100755
index 00000000..49e70bae
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_add_rne.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+   // Self-checking testbench for fpadd: f32 add vectors, rm = 3'b000.
+   logic [31:0]  op1;		
+   logic [31:0]  op2;		
+   logic [2:0] 	 rm;		// same width as the 3-bit rm port of fpadd
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test; operands occupy the upper 32 bits
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, {op2, 32'h0}, 
+	      rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_add_rne.out");
+	$readmemh("f32_add_rne.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b000;
+	#0  P = 1'b1;
+	#0  rm = 3'b000;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5  $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // check results on falling edge of clk
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+	  
+	  vectornum = vectornum + 1;
+	  // An entry never written by $readmemh is still all x, which marks
+	  // the end of the vectors. The literal must be as wide as the entry
+	  // (104 bits): a narrower x literal is zero-extended and never matches.
+	  if (testvectors[vectornum] === 104'bx) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               $finish;  // stop here so the summary is reported exactly once
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_add_ru.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_add_ru.sv
new file mode 100755
index 00000000..c6dabea3
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_add_ru.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+
+   logic [31:0]  op1;		
+   logic [31:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, {op2, 32'h0}, 
+	      rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_add_ru.out");
+	$readmemh("f32_add_ru.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b000;
+	#0  P = 1'b1;
+	#0  rm = 2'b10;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5  $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result[63:32] with
+   // yexpected, count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_add_rz.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_add_rz.sv
new file mode 100755
index 00000000..95ee9287
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_add_rz.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+
+   logic [31:0]  op1;		
+   logic [31:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, {op2, 32'h0}, 
+	      rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_add_rz.out");
+	$readmemh("f32_add_rz.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b000;
+	#0  P = 1'b1;
+	#0  rm = 2'b01;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5  $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result[63:32] with
+   // yexpected, count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_f64_rne.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_f64_rne.sv
new file mode 100755
index 00000000..d0766c2b
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_f64_rne.sv
@@ -0,0 +1,75 @@
+// testbench
+module tb ();
+
+   logic [31:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_f64_rne.out");
+	$readmemh("f32_f64_rne.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b110;
+	#0  P = 1'b0;
+	#0  rm = 2'b00;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;
+	#0  op2 = 64'h0;	
+	#1; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check on each falling clock edge: compare result with yexpected and count mismatches.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+	  vectornum = vectornum + 1;
+	  // Unwritten entries are all x; the old sized 56'bx zero-extended and never matched.
+	  if (testvectors[vectornum] === 'x) begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	  end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rd.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rd.sv
new file mode 100755
index 00000000..366e4d76
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rd.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+
+   logic [31:0]  op1;		
+   logic [31:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, {op2, 32'h0}, 
+	      rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_sub_rd.out");
+	$readmemh("f32_sub_rd.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b001;
+	#0  P = 1'b1;
+	#0  rm = 2'b11;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5  $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result[63:32] with
+   // yexpected, count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rne.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rne.sv
new file mode 100755
index 00000000..b8fca359
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rne.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+
+   logic [31:0]  op1;		
+   logic [31:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, {op2, 32'h0}, 
+	      rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_sub_rne.out");
+	$readmemh("f32_sub_rne.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b001;
+	#0  P = 1'b1;
+	#0  rm = 2'b00;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5  $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result[63:32] with
+   // yexpected, count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_sub_ru.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_sub_ru.sv
new file mode 100755
index 00000000..158ff474
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_sub_ru.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+
+   logic [31:0]  op1;		
+   logic [31:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, {op2, 32'h0}, 
+	      rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_sub_ru.out");
+	$readmemh("f32_sub_ru.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)        // drive one test vector per clock cycle
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b001;    // same op_type as the other f32_sub_* benches
+	#0  P = 1'b1;
+	#0  rm = 2'b10;          // same rm as the other *_ru benches
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5  $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags); // log Flags too, like the sibling benches
+     end
+
+   // Check the DUT on each falling clock edge: compare result[63:32] with
+   // yexpected, count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rz.sv b/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rz.sv
new file mode 100755
index 00000000..ef8eb65e
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f32_sub_rz.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+
+   logic [31:0]  op1;		
+   logic [31:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, {op1, 32'h0}, {op2, 32'h0}, 
+	      rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f32_sub_rz.out");
+	$readmemh("f32_sub_rz.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b001;
+	#0  P = 1'b1;
+	#0  rm = 2'b01;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5  $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result[63:32] with
+   // yexpected, count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_add_rd.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_add_rd.sv
new file mode 100755
index 00000000..0f37bca4
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_add_rd.sv
@@ -0,0 +1,78 @@
+// testbench
+module tb ();
+
+   logic [63:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [199:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f64_add_rd.out");
+	$readmemh("f64_add_rd.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b000;
+	#0  P = 1'b0;
+	#0  rm = 2'b11;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result with yexpected,
+   // count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_add_rne.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_add_rne.sv
new file mode 100755
index 00000000..1e473357
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_add_rne.sv
@@ -0,0 +1,78 @@
+// Testbench for fpadd: applies the vectors in f64_add_rne.tv with rm = 2'b00.
+module tb ();
+
+   logic [63:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		// 2 bits wide, matching the other fpadd benches
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [199:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;       // loaded from each vector, not compared
+
+   integer 	handle3;               // descriptor returned by $fopen
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always                              // free-running clock, period 10
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f64_add_rne.out");
+	$readmemh("f64_add_rne.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+   // Each cycle: set the constant controls, load one vector, log the outcome.
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b000;
+	#0  P = 1'b0;
+	#0  rm = 2'b00;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result with yexpected,
+   // count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_add_ru.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_add_ru.sv
new file mode 100755
index 00000000..191cca38
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_add_ru.sv
@@ -0,0 +1,78 @@
+// testbench
+module tb ();
+
+   logic [63:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [199:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f64_add_ru.out");
+	$readmemh("f64_add_ru.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b000;
+	#0  P = 1'b0;
+	#0  rm = 2'b10;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);	
+     end
+
+   // Check the DUT on each falling clock edge: compare result with yexpected,
+   // count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_add_rz.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_add_rz.sv
new file mode 100755
index 00000000..6a4df797
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_add_rz.sv
@@ -0,0 +1,78 @@
+// Testbench for fpadd: applies the vectors in f64_add_rz.tv with rm = 2'b01.
+module tb ();
+
+   logic [63:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		// 2 bits wide, matching the other fpadd benches
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [199:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;       // loaded from each vector, not compared
+
+   integer 	handle3;               // descriptor returned by $fopen
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always                              // free-running clock, period 10
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f64_add_rz.out");
+	$readmemh("f64_add_rz.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+   // Each cycle: set the constant controls, load one vector, log the outcome.
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b000;
+	#0  P = 1'b0;
+	#0  rm = 2'b01;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);	
+     end
+
+   // Check the DUT on each falling clock edge: compare result with yexpected,
+   // count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_f32_rne.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_f32_rne.sv
new file mode 100755
index 00000000..53eb2598
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_f32_rne.sv
@@ -0,0 +1,79 @@
+// testbench
+module tb ();
+
+   logic [63:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [31:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [103:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f64_f32_rne.out");
+	$readmemh("f64_f32_rne.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b110;
+	#0  P = 1'b1;
+	#0  rm = 2'b00;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;
+	#0  op2 = 64'h0;	
+	#1; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result[63:32] with
+   // yexpected, count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result[63:32] !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rd.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rd.sv
new file mode 100755
index 00000000..a427ebf9
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rd.sv
@@ -0,0 +1,78 @@
+// testbench
+module tb ();
+
+   logic [63:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [199:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f64_sub_rd.out");
+	$readmemh("f64_sub_rd.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b001;
+	#0  P = 1'b0;
+	#0  rm = 2'b11;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result with yexpected,
+   // count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rne.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rne.sv
new file mode 100755
index 00000000..dd25bd03
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rne.sv
@@ -0,0 +1,78 @@
+// testbench
+module tb ();
+
+   logic [63:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [199:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f64_sub_rne.out");
+	$readmemh("f64_sub_rne.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b001;
+	#0  P = 1'b0;
+	#0  rm = 2'b00;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result with yexpected,
+   // count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_sub_ru.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_sub_ru.sv
new file mode 100755
index 00000000..d16ea7c2
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_sub_ru.sv
@@ -0,0 +1,78 @@
+// testbench
+module tb ();
+
+   logic [63:0]  op1;		
+   logic [63:0]  op2;		
+   logic [1:0] 	 rm;		
+   logic [2:0]	 op_type;	
+   logic 	 P;   		
+   logic 	 OvEn;		
+   logic 	 UnEn;   	
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;   	
+   logic 	 Denorm;   	
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic 	 reset;   
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [199:0] testvectors[50000:0]; // array of testvectors
+   logic [7:0] 	 flags_expected;
+
+   integer 	handle3;
+   integer 	desc3;   
+   
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);   
+
+   always     
+     begin
+	clk = 1; #5; clk = 0; #5;
+     end
+   
+   initial
+     begin
+	handle3 = $fopen("f64_sub_ru.out");
+	$readmemh("f64_sub_ru.tv", testvectors);
+	vectornum = 0; errors = 0;
+	reset = 1; #27; reset = 0;
+     end
+
+   always @(posedge clk)
+     begin
+	desc3 = handle3;
+	#0  op_type = 3'b001;
+	#0  P = 1'b0;
+	#0  rm = 2'b10;
+	#0  OvEn = 1'b0;
+	#0  UnEn = 1'b0;	
+	#1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	#5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // Check the DUT on each falling clock edge: compare result with yexpected,
+   // count mismatches, then advance to the next test vector.
+   always @(negedge clk)
+     if (~reset) 
+       begin // skip during reset
+	  if (result !== yexpected) begin  
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+	  end
+
+	  vectornum = vectornum + 1;
+	  // Entries $readmemh never wrote are all x.  The old sized 56'bx was
+	  // zero-extended to the vector width and never matched; 'x fills it.
+	  if (testvectors[vectornum] === 'x) 
+	    begin 
+               $display("%d tests completed with %d errors", 
+			vectornum, errors);
+               #2 $stop; // let the posedge block's pending $fdisplay run, then halt
+	    end	
+       end // if (~reset)
+   
+endmodule // tb
+
+
diff --git a/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rz.sv b/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rz.sv
new file mode 100755
index 00000000..e68ec215
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpadd/tb_f64_sub_rz.sv
@@ -0,0 +1,78 @@
+// testbench
+module tb ();
+   // Testbench for fpadd with op_type 3'b001 and rm 2'b01 (f64 subtract, round-to-zero per the .tv name).
+   logic [63:0]  op1;
+   logic [63:0]  op2;
+   logic [1:0]   rm;
+   logic [2:0]   op_type;
+   logic         P;
+   logic         OvEn;
+   logic         UnEn;
+
+   logic [63:0]  result;
+   logic [4:0]   Flags;
+   logic         Denorm;
+
+   logic         clk;
+   logic [63:0]  yexpected;
+   logic         reset;
+   logic [63:0]  vectornum, errors;    // bookkeeping variables
+   logic [199:0] testvectors[50000:0]; // each entry: {op1, op2, yexpected, flags_expected}
+   logic [7:0]   flags_expected;       // unpacked from the vector but not compared against Flags
+
+   integer       handle3;
+   integer       desc3;
+
+   // instantiate device under test
+   fpadd dut (result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);
+
+   always
+     begin
+        clk = 1; #5; clk = 0; #5;
+     end
+
+   initial
+     begin
+        handle3 = $fopen("f64_sub_rz.out");
+        $readmemh("f64_sub_rz.tv", testvectors);
+        vectornum = 0; errors = 0;
+        reset = 1; #27; reset = 0;
+     end
+
+   // drive the controls and the next vector just after each rising edge, then log the result
+   always @(posedge clk)
+     begin
+        desc3 = handle3;
+        #0  op_type = 3'b001;
+        #0  P = 1'b0;
+        #0  rm = 2'b01;
+        #0  OvEn = 1'b0;
+        #0  UnEn = 1'b0;
+        #1; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+        #5 $fdisplay(desc3, "%h_%h_%h_%b", op1, op2, result, Flags);
+     end
+
+   // check results on falling edge of clk
+   always @(negedge clk)
+     if (~reset)
+       begin // skip during reset
+          if (result !== yexpected) begin
+             $display("Error: inputs = %h %h", op1, op2);
+             $display("  outputs = %h (%h expected)", result, yexpected);
+             errors = errors + 1;
+          end
+
+          vectornum = vectornum + 1;
+          // An unloaded entry is all x. The literal must span all 200 bits: a narrower
+          // 56'bx is zero-extended for ===, so the end of the vectors was never detected.
+          if (testvectors[vectornum] === 200'bx)
+            begin
+               $display("%d tests completed with %d errors",
+                        vectornum, errors);
+               #2 $fclose(handle3); // let the last $fdisplay (6 after posedge) run first
+               $stop;
+            end
+       end // if (~reset)
+endmodule // tb
+
+

From bb99480fcab3c86a5870b925307acdf3ac2d9e62 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Wed, 26 May 2021 09:13:09 -0500
Subject: [PATCH 06/14] delete old file for FPregfile

---
 wally-pipelined/src/fpu/FPregfile.sv~ | 52 ---------------------------
 1 file changed, 52 deletions(-)
 delete mode 100644 wally-pipelined/src/fpu/FPregfile.sv~

diff --git a/wally-pipelined/src/fpu/FPregfile.sv~ b/wally-pipelined/src/fpu/FPregfile.sv~
deleted file mode 100644
index 73b62a57..00000000
--- a/wally-pipelined/src/fpu/FPregfile.sv~
+++ /dev/null
@@ -1,52 +0,0 @@
-///////////////////////////////////////////
-// regfile.sv
-//
-// Written: David_Harris@hmc.edu 9 January 2021
-// Modified: 
-//
-// Purpose: 3-port register file
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-`include "wally-config.vh"
-
-module regfile (
-  input  logic             clk, reset,
-  input  logic             we3, 
-  input  logic [ 4:0]      a1, a2, a3, 
-  input  logic [`XLEN-1:0] wd3, 
-  output logic [`XLEN-1:0] rd1, rd2);
-
-  logic [`XLEN-1:0] rf[31:1];
-  integer i;
-
-  // three ported register file
-  // read two ports combinationally (A1/RD1, A2/RD2)
-  // write third port on rising edge of clock (A3/WD3/WE3)
-  // write occurs on falling edge of clock
-  // register 0 hardwired to 0
-  
-  // reset is intended for simulation only, not synthesis
-    
-  always_ff @(negedge clk or posedge reset)
-    if (reset) for(i=1; i<32; i++) rf[i] <= 0;
-    else if (we3) rf[a3] <= wd3;	
-
-  assign #2 rd1 = (a1 != 0) ? rf[a1] : 0;
-  assign #2 rd2 = (a2 != 0) ? rf[a2] : 0;
-endmodule

From 309e6c3dc19c913d8aa3ea346c4e34f823cc7fcc Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Wed, 26 May 2021 12:33:33 -0400
Subject: [PATCH 07/14] FADD and FSUB imperas tests pass

---
 wally-pipelined/src/fpu/fctrl.sv               | 2 +-
 wally-pipelined/src/fpu/fpu.sv                 | 2 +-
 wally-pipelined/src/fpu/fpuaddcvt1.sv          | 2 +-
 wally-pipelined/src/fpu/fpuaddcvt2.sv          | 2 +-
 wally-pipelined/testbench/testbench-imperas.sv | 6 +++---
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/wally-pipelined/src/fpu/fctrl.sv b/wally-pipelined/src/fpu/fctrl.sv
index 840c9530..e925ad13 100755
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@@ -192,7 +192,7 @@ module fctrl (
       //  fcvt.d.wu = 1111
       //  fcvt.d.s  = 1000
       //		   { is double and not add/sub, is to/from int, is to int or float to double,      is unsigned or sub
-      3'b100 : begin FOpCtrlD = {Funct7D[0]&Funct7D[5], Funct7D[6], Funct7D[3] | (~Funct7D[6]&Funct7D[5]&~Funct7D[0]), Rs2D[0]|(Funct7D[2]&~Funct7D[5])}; FInput2UsedD = ~Funct7D[5]; end
+      3'b100 : begin FOpCtrlD = {Funct7D[0]&Funct7D[5], Funct7D[6], Funct7D[3] | (~Funct7D[6]&Funct7D[5]&~Funct7D[0]), (Rs2D[0]&Funct7D[5])|(Funct7D[2]&~Funct7D[5])}; FInput2UsedD = ~Funct7D[5]; end
       // classify	  {?, ?, ?, ?}
       3'b101 : begin FOpCtrlD = 4'b0; FInput2UsedD = 1'b0; end
       // output SrcAW
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index 9f40300a..34db50e7 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -353,7 +353,7 @@ module fpu (
   flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
   flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
   flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
-  flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignM); 
+  flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); 
   flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
   flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
   flopenrc #(12) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
diff --git a/wally-pipelined/src/fpu/fpuaddcvt1.sv b/wally-pipelined/src/fpu/fpuaddcvt1.sv
index e1228f32..febd47d1 100755
--- a/wally-pipelined/src/fpu/fpuaddcvt1.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt1.sv
@@ -108,7 +108,7 @@ module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE,
    assign zeroB = FOpCtrlE[2] | FOpCtrlE[1];
 
    // Swapped operands if zeroB is not one and exp1 < exp2. 
-   // SwapFmtEg causes exp2 to be used for the result exponent. 
+   // Swapping causes exp2 to be used for the result exponent. 
    // Only the exponent of the larger operand is used to determine
    // the final result. 
    assign AddSwapE = exp_diff1[11] & ~zeroB;
diff --git a/wally-pipelined/src/fpu/fpuaddcvt2.sv b/wally-pipelined/src/fpu/fpuaddcvt2.sv
index 36dabf08..46eac200 100755
--- a/wally-pipelined/src/fpu/fpuaddcvt2.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt2.sv
@@ -56,7 +56,7 @@ module fpuaddcvt2 (FAddResultM, FAddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddS
    output 	 AddDenormM;   	// AddDenormM on input or output   
 
    wire          P;
-   assign P = FmtM | FOpCtrlM[2];
+   assign P = ~FmtM | FOpCtrlM[2];
 
    wire [10:0]   exp_pre;
    wire [63:0] 	 Result;   
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index f1e83994..5636455b 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -118,7 +118,7 @@ string tests32f[] = '{
   };
 
   string tests64d[] = '{
-    // "rv64d/I-FADD-D-01", "2000",
+    "rv64d/I-FADD-D-01", "2000",
     // "rv64d/I-FCLASS-D-01", "2000",
     // "rv64d/I-FCVT-D-L-01", "2000",
     // "rv64d/I-FCVT-D-LU-01", "2000",
@@ -142,14 +142,14 @@ string tests32f[] = '{
     // "rv64d/I-FMSUB-D-01", "2000",
     // "rv64d/I-FMUL-D-01", "2000",
     "rv64d/I-FMV-D-X-01", "2000",
-    "rv64d/I-FMV-X-D-01", "2000"
+    "rv64d/I-FMV-X-D-01", "2000",
     // "rv64d/I-FNMADD-D-01", "2000",
     // "rv64d/I-FNMSUB-D-01", "2000",
     // "rv64d/I-FSGNJ-D-01", "2000",
     // "rv64d/I-FSGNJN-D-01", "2000",
     // "rv64d/I-FSGNJX-D-01", "2000",
     // "rv64d/I-FSQRTD-01", "2000",
-    // "rv64d/I-FSUB-D-01", "2000"
+    "rv64d/I-FSUB-D-01", "2000"
   };
 
   string tests64a[] = '{

From 1459d840eda2d4672a977d017ac33496e3aa6c5c Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Thu, 27 May 2021 15:23:28 -0400
Subject: [PATCH 08/14] All compare instructions pass imperas tests

---
 wally-pipelined/src/fpu/fctrl.sv              |  2 +-
 wally-pipelined/src/fpu/fpu.sv                |  9 +--
 wally-pipelined/src/fpu/fpucmp2.sv            | 64 +++++++++++--------
 wally-pipelined/src/hazard/hazard.sv          |  4 +-
 wally-pipelined/src/ieu/forward.sv            |  5 +-
 wally-pipelined/src/ieu/ieu.sv                |  3 +-
 .../src/wally/wallypipelinedhart.sv           |  4 +-
 .../testbench/testbench-imperas.sv            | 18 +++---
 8 files changed, 63 insertions(+), 46 deletions(-)

diff --git a/wally-pipelined/src/fpu/fctrl.sv b/wally-pipelined/src/fpu/fctrl.sv
index e925ad13..220ccd8f 100755
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@@ -158,7 +158,7 @@ module fctrl (
       //  fsqrt = ???1
       3'b000 : begin FOpCtrlD = {3'b0, Funct7D[5]}; FInput2UsedD = ~Funct7D[5]; end
       // cmp		
-      //  fmin = ?100
+      //  fmin = ?111
       //  fmax = ?101
       //  feq  = ?010
       //  flt  = ?001
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index 34db50e7..aa1039be 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -40,7 +40,7 @@ module fpu (
   output logic [31:0]      FSROutW,
   output logic [1:0]       FMemRWM,
 	output logic             FStallD,
-  output logic             FWriteIntM, FWriteIntW,
+  output logic             FWriteIntE, FWriteIntM, FWriteIntW,
   output logic [`XLEN-1:0] FWriteDataM,
   output logic             FDivSqrtDoneM,
   output logic             IllegalFPUInstrD,
@@ -55,7 +55,7 @@ module fpu (
   logic [2:0]       FrmD, FrmE, FrmM, FrmW;                                 // FP rounding mode
   logic             FmtD, FmtE, FmtM, FmtW;                                 // FP precision 0-single 1-double
   logic             FDivStartD, FDivStartE;                                 // Start division
-  logic             FWriteIntD, FWriteIntE;                                 // Write to integer register
+  logic             FWriteIntD;                                 // Write to integer register
   logic             FOutputInput2D, FOutputInput2E;                         // Put Input2 in Input1 if a store instruction
   logic [1:0]       FMemRWD, FMemRWE;                                       // Read and write enable for memory
   logic [1:0]       FForwardInput1D, FForwardInput1E;                       // Input1 forwarding mux control signal
@@ -151,7 +151,7 @@ module fpu (
   logic             BzeroE, BzeroM;
   logic             CmpInvalidM, CmpInvalidW;
   logic [1:0]       CmpFCCM, CmpFCCW; 
-  logic [63:0]      FCmpResultW;
+  logic [63:0]      FCmpResultM, FCmpResultW;
 
   // fsgn signals
   logic [63:0]      SgnResultE, SgnResultM, SgnResultW;
@@ -415,7 +415,7 @@ module fpu (
   fpuaddcvt2 fpadd2 (.*);
 
   //second instance of two-stage floating-point comparator
-  fpucmp2 fpcmp2 (CmpInvalidM, CmpFCCM, ANaNM, BNaNM, AzeroM, BzeroM, WM, XM, {1'b0, FmtM}, FInput1M, FInput2M);
+  fpucmp2 fpcmp2 (.Invalid(CmpInvalidM), .FCC(CmpFCCM), .ANaN(ANaNM), .BNaN(BNaNM), .Azero(AzeroM), .Bzero(BzeroM), .w(WM), .x(XM), .Sel({1'b0, FmtM}), .op1(FInput1M), .op2(FInput2M), .*);
 
 
 
@@ -451,6 +451,7 @@ module fpu (
   //*****************
   flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
   flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
+  flopenrc #(64) MWRegCmp3(clk, reset, PipeClearMW, PipeEnableMW, FCmpResultM, FCmpResultW); 
 
   //*****************
   //fpsgn M/W pipe registers
diff --git a/wally-pipelined/src/fpu/fpucmp2.sv b/wally-pipelined/src/fpu/fpucmp2.sv
index 766f7f57..e2820688 100755
--- a/wally-pipelined/src/fpu/fpucmp2.sv
+++ b/wally-pipelined/src/fpu/fpucmp2.sv
@@ -37,17 +37,18 @@
 // It also produces an invalid operation flag, which is one
 // if either of the input operands is a signaling NaN per 754
 
-module fpucmp2 (Invalid, FCC, ANaN, BNaN, Azero, Bzero, w, x, Sel, op1, op2);
+module fpucmp2 (   
+   input logic [63:0] op1, 
+   input logic [63:0] op2,
+   input logic [1:0]  Sel,
+   input logic [7:0]  w, x,
+   input logic        ANaN, BNaN,
+   input logic        Azero, Bzero,
+   input logic [3:0]  FOpCtrlM,
    
-   input logic [63:0] op1; 
-   input logic [63:0] op2;
-   input logic [1:0]  Sel;
-   input logic [7:0]  w, x;
-   input logic        ANaN, BNaN;
-   input logic        Azero, Bzero;
-   
-   output logic       Invalid; 		 // Invalid Operation
-   output logic [1:0] FCC;  		 // Condition Codes 
+   output logic       Invalid, 		 // Invalid Operation
+   output logic [1:0] FCC,  		 // Condition Codes 
+   output logic [63:0] FCmpResultM);
    
    logic 	      LT;                // magnitude op1 < magnitude op2
    logic 	      EQ;                // magnitude op1 = magnitude op2
@@ -59,7 +60,7 @@ module fpucmp2 (Invalid, FCC, ANaN, BNaN, Azero, Bzero, w, x, Sel, op1, op2);
 
    // Determine final values based on output of magnitude comparison, 
    // sign bits, and special case testing. 
-   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .Sel(Sel), .A(op1), .B(op2));
+   exception_cmp_2 exc2 (.invalid(Invalid), .fcc(FCC), .LT_mag(LT), .EQ_mag(EQ), .ANaN(ANaN), .BNaN(BNaN), .Azero(Azero), .Bzero(Bzero), .Sel(Sel), .A(op1), .B(op2), .*);
    
 
 endmodule // fpcomp
@@ -156,24 +157,26 @@ endmodule // magcompare64b
 // It also produces a invalid operation flag, which is one
 // if either of the input operands is a signaling NaN.
 
-module exception_cmp_2 (invalid, fcc, LT_mag, EQ_mag, ANaN, BNaN, Azero, Bzero, Sel, A, B);
-
-   input logic [63:0] A;
-   input logic [63:0] B;
-   input logic 	      LT_mag;
-   input logic 	      EQ_mag;
-   input logic [1:0]  Sel;
+module exception_cmp_2 (
+   input logic [63:0] A,
+   input logic [63:0] B,
+   input logic 	      LT_mag,
+   input logic 	      EQ_mag,
+   input logic [1:0]  Sel,
+   input logic [3:0]  FOpCtrlM,
    
-   output logic       invalid;
-   output logic [1:0] fcc;   
+   output logic       invalid,
+   output logic [1:0] fcc,
+   output logic [63:0] FCmpResultM,
 
+   input logic 	      Azero,
+   input logic 	      Bzero,   
+   input logic 	      ANaN,
+   input logic 	      BNaN);
+   
    logic 	      dp;   
    logic 	      sp;
    logic 	      hp;   
-   input logic 	      Azero;
-   input logic 	      Bzero;   
-   input logic 	      ANaN;
-   input logic 	      BNaN;
    logic 	      ASNaN;
    logic 	      BSNaN;
    logic 	      UO;
@@ -221,6 +224,17 @@ module exception_cmp_2 (invalid, fcc, LT_mag, EQ_mag, ANaN, BNaN, Azero, Bzero,
 
    // Set the bits of fcc based on LT, GT, EQ, and UO
    assign fcc[0] = LT | UO;
-   assign fcc[1] = GT | UO;   
+   assign fcc[1] = GT | UO;  
+
+   always_comb begin
+      case (FOpCtrlM[2:0])
+         3'b111: FCmpResultM = LT ? A : B;//min 
+         3'b101: FCmpResultM = GT ? A : B;//max
+         3'b010: FCmpResultM = {63'b0, EQ};//equal
+         3'b001: FCmpResultM = {63'b0, LT};//less than
+         3'b011: FCmpResultM = {63'b0, LT | EQ};//less than or equal
+         default: FCmpResultM = 64'b0;
+      endcase
+   end 
 
 endmodule // exception_cmp
diff --git a/wally-pipelined/src/hazard/hazard.sv b/wally-pipelined/src/hazard/hazard.sv
index 7bd59286..35aa9835 100644
--- a/wally-pipelined/src/hazard/hazard.sv
+++ b/wally-pipelined/src/hazard/hazard.sv
@@ -30,7 +30,7 @@ module hazard(
 	      input logic  reset,
   // Detect hazards
 	      input logic  BPPredWrongE, CSRWritePendingDEM, RetM, TrapM,
-	      input logic  LoadStallD, MulDivStallD, CSRRdStallD,
+	      input logic  FPUStallD, LoadStallD, MulDivStallD, CSRRdStallD,
 	      input logic  DataStall, ICacheStallF,
         input logic  FStallD,
 	      input logic  DivBusyE,
@@ -59,7 +59,7 @@ module hazard(
   assign BranchFlushDE = BPPredWrongE | RetM | TrapM;
 
   assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE);
-  assign StallDCause = (LoadStallD | MulDivStallD | CSRRdStallD | FStallD) & ~(BranchFlushDE);    // stall in decode if instruction is a load/mul/csr dependent on previous
+  assign StallDCause = (FPUStallD | LoadStallD | MulDivStallD | CSRRdStallD | FStallD) & ~(BranchFlushDE);    // stall in decode if instruction is a load/mul/csr dependent on previous
 //  assign StallDCause = LoadStallD | MulDivStallD | CSRRdStallD;    // stall in decode if instruction is a load/mul/csr dependent on previous
   assign StallECause = DivBusyE;
   assign StallMCause = 0; 
diff --git a/wally-pipelined/src/ieu/forward.sv b/wally-pipelined/src/ieu/forward.sv
index f00a6ecb..cdc6d270 100644
--- a/wally-pipelined/src/ieu/forward.sv
+++ b/wally-pipelined/src/ieu/forward.sv
@@ -31,10 +31,10 @@ module forward(
   input logic 	     MemReadE, MulDivE, CSRReadE,
   input logic 	     RegWriteM, RegWriteW,
   input logic 	     DivDoneE, DivBusyE,
-  input logic	     FWriteIntM, FWriteIntW,
+  input logic	     FWriteIntE, FWriteIntM, FWriteIntW,
   // Forwarding controls
   output logic [1:0] ForwardAE, ForwardBE,
-  output logic 	     LoadStallD, MulDivStallD, CSRRdStallD
+  output logic 	     FPUStallD, LoadStallD, MulDivStallD, CSRRdStallD
 );
   
   always_comb begin
@@ -52,6 +52,7 @@ module forward(
   end
 
   // Stall on dependent operations that finish in Mem Stage and can't bypass in time
+   assign FPUStallD = FWriteIntE & ((Rs1D == RdE) | (Rs2D == RdE)); 
    assign LoadStallD = MemReadE & ((Rs1D == RdE) | (Rs2D == RdE));  
    assign MulDivStallD = MulDivE & ((Rs1D == RdE) | (Rs2D == RdE)) | MulDivE | DivBusyE; // *** extend with stalls for divide
    assign CSRRdStallD = CSRReadE & ((Rs1D == RdE) | (Rs2D == RdE));
diff --git a/wally-pipelined/src/ieu/ieu.sv b/wally-pipelined/src/ieu/ieu.sv
index 8b1993be..a4ab9b06 100644
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@@ -35,6 +35,7 @@ module ieu (
   // Execute Stage interface
   input logic [`XLEN-1:0]  PCE, 
   input logic [`XLEN-1:0]  PCLinkE,
+  input logic 		   FWriteIntE, 
   output logic [`XLEN-1:0] PCTargetE,
   output logic 		   MulDivE, W64E,
   output logic [2:0] 	   Funct3E,
@@ -59,7 +60,7 @@ module ieu (
   // hazards
   input logic 		   StallE, StallM, StallW,
   input logic 		   FlushE, FlushM, FlushW,
-  output logic 		   LoadStallD, MulDivStallD, CSRRdStallD,
+  output logic 		   FPUStallD, LoadStallD, MulDivStallD, CSRRdStallD,
   output logic 		   PCSrcE,
   input logic 		   DivDoneE,
   input logic 		   DivBusyE,
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index eab0885d..e49cc6c6 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -86,7 +86,7 @@ module wallypipelinedhart (
 
   logic        PCSrcE;
   logic        CSRWritePendingDEM;
-  logic        LoadStallD, MulDivStallD, CSRRdStallD;
+  logic        FPUStallD, LoadStallD, MulDivStallD, CSRRdStallD;
   logic       DivDoneE;
   logic       DivBusyE;
   logic       DivDoneW;
@@ -98,7 +98,7 @@ module wallypipelinedhart (
   logic [`XLEN-1:0] FWriteDataM;
   logic       SquashSCW;
   logic       FStallD;
-  logic       FWriteIntW, FWriteIntM;
+  logic       FWriteIntE, FWriteIntW, FWriteIntM;
   logic [31:0]      FSROutW;
   logic             FDivSqrtDoneM;
   logic             IllegalFPUInstrD, IllegalFPUInstrE;
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index 5636455b..b8f97b26 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -118,6 +118,11 @@ string tests32f[] = '{
   };
 
   string tests64d[] = '{
+    "rv64d/I-FMAX-D-01", "2000",
+    "rv64d/I-FMIN-D-01", "2000",
+    "rv64d/I-FLE-D-01", "2000",
+    "rv64d/I-FLT-D-01", "2000",
+    "rv64d/I-FEQ-D-01", "2000",
     "rv64d/I-FADD-D-01", "2000",
     // "rv64d/I-FCLASS-D-01", "2000",
     // "rv64d/I-FCVT-D-L-01", "2000",
@@ -131,23 +136,18 @@ string tests32f[] = '{
     // "rv64d/I-FCVT-W-D-01", "2000",
     // "rv64d/I-FCVT-WU-D-01", "2000",
     // "rv64d/I-FDIV-D-01", "2000",
-    // "rv64d/I-FEQ-D-01", "2000",
     "rv64d/I-FSD-01", "2000",
     "rv64d/I-FLD-01", "2420",
-    // "rv64d/I-FLE-D-01", "2000",
-    // "rv64d/I-FLT-D-01", "2000",
-    // "rv64d/I-FMADD-D-01", "2000",
-    // "rv64d/I-FMAX-D-01", "2000",
-    // "rv64d/I-FMIN-D-01", "2000",
+    "rv64d/I-FMADD-D-01", "2000",
     // "rv64d/I-FMSUB-D-01", "2000",
     // "rv64d/I-FMUL-D-01", "2000",
     "rv64d/I-FMV-D-X-01", "2000",
     "rv64d/I-FMV-X-D-01", "2000",
     // "rv64d/I-FNMADD-D-01", "2000",
     // "rv64d/I-FNMSUB-D-01", "2000",
-    // "rv64d/I-FSGNJ-D-01", "2000",
-    // "rv64d/I-FSGNJN-D-01", "2000",
-    // "rv64d/I-FSGNJX-D-01", "2000",
+    "rv64d/I-FSGNJ-D-01", "2000",
+    "rv64d/I-FSGNJN-D-01", "2000",
+    "rv64d/I-FSGNJX-D-01", "2000",
     // "rv64d/I-FSQRTD-01", "2000",
     "rv64d/I-FSUB-D-01", "2000"
   };

From 778ba6bbf5e7ae8c7d680941043bd41a344af76f Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Thu, 27 May 2021 18:53:55 -0400
Subject: [PATCH 09/14] classify unit created and passes imperas tests

---
 wally-pipelined/src/fpu/fpu.sv                | 17 +++++--
 wally-pipelined/src/fpu/fpuclassify.sv        | 50 +++++++++++++++++++
 .../testbench/testbench-imperas.sv            |  4 +-
 3 files changed, 66 insertions(+), 5 deletions(-)
 create mode 100644 wally-pipelined/src/fpu/fpuclassify.sv

diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index aa1039be..c876b313 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -162,7 +162,6 @@ module fpu (
 
   // classify signals
   logic [63:0]      ClassResultE, ClassResultM, ClassResultW;
-  logic [4:0]       ClassFlagsE, ClassFlagsM, ClassFlagsW;
 
   // other
   logic [63:0]      FPUResult64W, FPUResult64E;                                           // 64-bit FPU result
@@ -287,6 +286,11 @@ module fpu (
   //first and only instance of floating-point sign converter
   fpusgn fpsgn (.SgnOpCodeE(FOpCtrlE[1:0]),.*);
 
+  //first and only instance of floating-point classify unit
+  fpuclassify fpuclass (.*);
+
+  
+
 
 
 
@@ -394,7 +398,10 @@ module fpu (
   flopenrc #(1) EMReg7(clk, reset, PipeClearEM, PipeEnableEM, FWriteIntE, FWriteIntM);
   flopenrc #(2) EMReg8(clk, reset, PipeClearEM, PipeEnableEM, FMemRWE, FMemRWM);
 
-
+  //*****************
+  //fpuclassify E/M pipe registers
+  //***************** 
+  flopenrc #(64) EMRegClass(clk, reset, PipeClearEM, PipeEnableEM, ClassResultE, ClassResultM);
 
 
 
@@ -471,6 +478,10 @@ module fpu (
   flopenrc #(1) MWReg7(clk, reset, PipeClearMW, PipeEnableMW, FWriteIntM, FWriteIntW);
 
 
+  //*****************
+  //fpuclassify M/W pipe registers
+  //***************** 
+  flopenrc #(64) MWRegClass(clk, reset, PipeClearMW, PipeEnableMW, ClassResultM, ClassResultW);
 
 
 
@@ -496,7 +507,7 @@ module fpu (
 		// add/sub/cnvt
 		3'b100 : FPUFlagsW = FAddFlagsW;
 		// classify
-		3'b101 : FPUFlagsW = ClassFlagsW;
+		3'b101 : FPUFlagsW = 5'b0;
 		// output SrcAW
 		3'b110 : FPUFlagsW = 5'b0;
 		// output FRD1
diff --git a/wally-pipelined/src/fpu/fpuclassify.sv b/wally-pipelined/src/fpu/fpuclassify.sv
new file mode 100644
index 00000000..ee03cb52
--- /dev/null
+++ b/wally-pipelined/src/fpu/fpuclassify.sv
@@ -0,0 +1,50 @@
+`include "wally-config.vh"
+
+module fpuclassify (
+    input  logic [63:0] FInput1E,       // operand; a single-precision value is read from bits [63:32]
+    input  logic        FmtE,           // 0-single 1-double
+    output logic [63:0] ClassResultE    // class mask (bit list below) padded with zeros to 64 bits
+    );
+    // Classifies FInput1E; exactly one of the ten class bits is set for any input.
+    logic [31:0] single;
+    logic [63:0] double;
+    logic sign;
+    logic infinity, NaN, zero, normal, subnormal;
+    logic ExpNotZero, ExpOnes, ManNotZero, ExpZero, ManZero, FirstBitMan;
+
+    // single and double precision layouts
+    assign single = FInput1E[63:32];
+    assign double = FInput1E;
+    assign sign = FInput1E[63];
+
+    // basic calculations for readability
+    assign ExpNotZero = FmtE ? |double[62:52] : |single[30:23];
+    assign ExpZero = ~ExpNotZero;
+    assign ExpOnes = FmtE ? &double[62:52] : &single[30:23];
+    assign ManNotZero = FmtE ? |double[51:0] : |single[22:0];
+    assign ManZero = ~ManNotZero;
+    assign FirstBitMan = FmtE ? double[51] : single[22];
+
+    // determine the type of number
+    assign NaN      = ExpOnes & ManNotZero;
+    assign infinity = ExpOnes & ManZero;
+    assign zero     = ExpZero & ManZero;
+    assign subnormal= ExpZero & ManNotZero;
+    assign normal   = ExpNotZero & ~ExpOnes; // an all-ones exponent is infinity/NaN, not normal
+
+    // determine sub category and combine into the result
+    //  bit 0 - -infinity
+    //  bit 1 - -normal
+    //  bit 2 - -subnormal
+    //  bit 3 - -zero
+    //  bit 4 - +zero
+    //  bit 5 - +subnormal
+    //  bit 6 - +normal
+    //  bit 7 - +infinity
+    //  bit 8 - signaling NaN
+    //  bit 9 - quiet NaN
+    // the trailing {64-`XLEN} zeros shift the mask into the upper `XLEN bits when `XLEN < 64
+    assign ClassResultE = {{`XLEN-10{1'b0}}, FirstBitMan&NaN, ~FirstBitMan&NaN, ~sign&infinity, ~sign&normal, 
+                                    ~sign&subnormal, ~sign&zero, sign&zero, sign&subnormal, sign&normal, sign&infinity, {64-`XLEN{1'b0}}};
+
+endmodule
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index b8f97b26..c5abff91 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -124,7 +124,7 @@ string tests32f[] = '{
     "rv64d/I-FLT-D-01", "2000",
     "rv64d/I-FEQ-D-01", "2000",
     "rv64d/I-FADD-D-01", "2000",
-    // "rv64d/I-FCLASS-D-01", "2000",
+    "rv64d/I-FCLASS-D-01", "2000",
     // "rv64d/I-FCVT-D-L-01", "2000",
     // "rv64d/I-FCVT-D-LU-01", "2000",
     // "rv64d/I-FCVT-D-S-01", "2000",
@@ -653,7 +653,7 @@ string tests32f[] = '{
               errors = errors+1;
               $display("  Error on test %s result %d: adr = %h sim = %h, signature = %h", 
                     tests[test], i, (testadr+i)*`XLEN/8, dut.uncore.dtim.RAM[testadr+i], signature[i]);
-              // $stop;//***debug
+              $stop;//***debug
             end
           end
           i = i + 1;

From 39ae7435437bde3135ca5873a5a0ddb507ff548f Mon Sep 17 00:00:00 2001
From: bbracker <bbracker@hmc.edu>
Date: Fri, 28 May 2021 23:11:37 -0400
Subject: [PATCH 10/14] turns out I should not have tried renaming FStallD to
 FPUStallD because that name was already used! All the same it does feel weird
 to have two such signals floating around \(ah pun!\)

---
 wally-pipelined/regression/wally-pipelined.do |   2 +-
 .../regression/wave-dos/peripheral-waves.do   |   5 +
 wally-pipelined/src/hazard/hazard.sv          |  40 +++----
 wally-pipelined/src/ifu/ifu.sv                |   9 +-
 wally-pipelined/src/privileged/csr.sv         |  38 ++++---
 wally-pipelined/src/privileged/csrsr.sv       | 102 +++++++++---------
 wally-pipelined/src/privileged/privileged.sv  |   4 +-
 .../src/wally/wallypipelinedhart.sv           |   4 +-
 .../testbench/testbench-imperas.sv            |  13 ++-
 9 files changed, 118 insertions(+), 99 deletions(-)

diff --git a/wally-pipelined/regression/wally-pipelined.do b/wally-pipelined/regression/wally-pipelined.do
index 51335b82..500e1fe6 100644
--- a/wally-pipelined/regression/wally-pipelined.do
+++ b/wally-pipelined/regression/wally-pipelined.do
@@ -40,7 +40,7 @@ vsim workopt
 
 view wave
 -- display input and output signals as hexidecimal values
-do ./wave-dos/default-waves.do
+do ./wave-dos/peripheral-waves.do
 
 -- Run the Simulation 
 #run 5000 
diff --git a/wally-pipelined/regression/wave-dos/peripheral-waves.do b/wally-pipelined/regression/wave-dos/peripheral-waves.do
index f92c1af5..3c4945c7 100644
--- a/wally-pipelined/regression/wave-dos/peripheral-waves.do
+++ b/wally-pipelined/regression/wave-dos/peripheral-waves.do
@@ -48,6 +48,11 @@ add wave /testbench/dut/hart/ieu/dp/RegWriteW
 add wave -hex /testbench/dut/hart/ieu/dp/ResultW
 add wave -hex /testbench/dut/hart/ieu/dp/RdW
 add wave -divider
+add wave -hex /testbench/dut/hart/priv/csr/ProposedEPCM
+add wave -hex /testbench/dut/hart/priv/csr/TrapM
+add wave -hex /testbench/dut/hart/priv/csr/UnalignedNextEPCM
+add wave -hex /testbench/dut/hart/priv/csr/genblk1/csrm/WriteMEPCM
+add wave -hex /testbench/dut/hart/priv/csr/genblk1/csrm/MEPC_REGW
 add wave -divider
 
 # peripherals
diff --git a/wally-pipelined/src/hazard/hazard.sv b/wally-pipelined/src/hazard/hazard.sv
index 35aa9835..72857fb3 100644
--- a/wally-pipelined/src/hazard/hazard.sv
+++ b/wally-pipelined/src/hazard/hazard.sv
@@ -30,16 +30,15 @@ module hazard(
 	      input logic  reset,
   // Detect hazards
 	      input logic  BPPredWrongE, CSRWritePendingDEM, RetM, TrapM,
-	      input logic  FPUStallD, LoadStallD, MulDivStallD, CSRRdStallD,
+	      input logic  LoadStallD, MulDivStallD, CSRRdStallD,
 	      input logic  DataStall, ICacheStallF,
-        input logic  FStallD,
+        input logic  FPUStallD,
 	      input logic  DivBusyE,
   // Stall & flush outputs
 	      output logic StallF, StallD, StallE, StallM, StallW,
 	      output logic FlushF, FlushD, FlushE, FlushM, FlushW
 );
 
-  logic BranchFlushDE;
   logic StallFCause, StallDCause, StallECause, StallMCause, StallWCause;
   logic FirstUnstalledD, FirstUnstalledE, FirstUnstalledM, FirstUnstalledW;
 
@@ -56,34 +55,29 @@ module hazard(
   // A stage must stall if the next stage is stalled
   // If any stages are stalled, the first stage that isn't stalled must flush.
 
-  assign BranchFlushDE = BPPredWrongE | RetM | TrapM;
-
-  assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE);
-  assign StallDCause = (FPUStallD | LoadStallD | MulDivStallD | CSRRdStallD | FStallD) & ~(BranchFlushDE);    // stall in decode if instruction is a load/mul/csr dependent on previous
-//  assign StallDCause = LoadStallD | MulDivStallD | CSRRdStallD;    // stall in decode if instruction is a load/mul/csr dependent on previous
+  assign StallFCause = CSRWritePendingDEM && ~(TrapM || RetM || BPPredWrongE);
+  assign StallDCause = (LoadStallD || MulDivStallD || CSRRdStallD || FPUStallD) && ~(TrapM || RetM || BPPredWrongE);    // stall in decode if instruction is a load/mul/csr dependent on previous
   assign StallECause = DivBusyE;
   assign StallMCause = 0; 
-  assign StallWCause = DataStall | ICacheStallF;
+  assign StallWCause = DataStall || ICacheStallF;
 
-  // Each stage stalls if the next stage is stalled or there is a cause to stall this stage.
-  assign StallF = StallD | StallFCause;
-
-  assign StallD = StallE | StallDCause;
-  assign StallE = StallM | StallECause;
-  assign StallM = StallW | StallMCause;
+  assign StallF = StallFCause || StallD;
+  assign StallD = StallDCause || StallE;
+  assign StallE = StallECause || StallM;
+  assign StallM = StallMCause || StallW;
   assign StallW = StallWCause;
 
   //assign FirstUnstalledD = (~StallD & StallF & ~MulDivStallD);
-  assign FirstUnstalledD = (~StallD & StallF);
   //assign FirstUnstalledE = (~StallE & StallD & ~MulDivStallD);
-  assign FirstUnstalledE = (~StallE & StallD);
-  assign FirstUnstalledM = (~StallM & StallE);
-  assign FirstUnstalledW = (~StallW & StallM);;
+  assign FirstUnstalledD = (~StallD && StallF);
+  assign FirstUnstalledE = (~StallE && StallD);
+  assign FirstUnstalledM = (~StallM && StallE);
+  assign FirstUnstalledW = (~StallW && StallM);
   
   // Each stage flushes if the previous stage is the last one stalled (for cause) or the system has reason to flush
   assign FlushF = BPPredWrongE;
-  assign FlushD = FirstUnstalledD || BranchFlushDE;  // PCSrcE |InstrStall | CSRWritePendingDEM | RetM | TrapM;
-  assign FlushE = FirstUnstalledE || BranchFlushDE;  // LoadStallD | PCSrcE | RetM | TrapM;
-  assign FlushM = FirstUnstalledM || RetM || TrapM;
-  assign FlushW = FirstUnstalledW | TrapM;
+  assign FlushD = FirstUnstalledD || TrapM || RetM || BPPredWrongE;
+  assign FlushE = FirstUnstalledE || TrapM || RetM || BPPredWrongE;
+  assign FlushM = FirstUnstalledM || TrapM || RetM;
+  assign FlushW = FirstUnstalledW || TrapM;
 endmodule
diff --git a/wally-pipelined/src/ifu/ifu.sv b/wally-pipelined/src/ifu/ifu.sv
index 994288bd..28f7597e 100644
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@@ -37,7 +37,8 @@ module ifu (
   output logic [`XLEN-1:0] InstrPAdrF,
   output logic             InstrReadF,
   output logic             ICacheStallF,
-  // Decode  
+  // Decode
+  output logic [`XLEN-1:0] PCD, 
   // Execute
   output logic [`XLEN-1:0] PCLinkE,
   input logic 		   PCSrcE, 
@@ -47,7 +48,7 @@ module ifu (
   // Mem
   input logic 		   RetM, TrapM, 
   input logic [`XLEN-1:0]  PrivilegedNextPCM, 
-  output logic [31:0] 	   InstrD, InstrM,
+  output logic [31:0] 	   InstrD, InstrE, InstrM, InstrW,
   output logic [`XLEN-1:0] PCM, 
   output logic [4:0] 	   InstrClassM,
   output logic 		   BPPredDirWrongM,
@@ -76,9 +77,9 @@ module ifu (
   logic             misaligned, BranchMisalignedFaultE, BranchMisalignedFaultM, TrapMisalignedFaultM;
   logic             PrivilegedChangePCM;
   logic             IllegalCompInstrD;
-  logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCD, PCW, PCLinkD, PCLinkM, PCNextPF, PCPF;
+  logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCW, PCLinkD, PCLinkM, PCNextPF, PCPF;
   logic             CompressedF;
-  logic [31:0]      InstrRawD, InstrE, InstrW;
+  logic [31:0]      InstrRawD;
   localparam [31:0]      nop = 32'h00000013; // instruction for NOP
   logic 	    reset_q; // *** look at this later.
 
diff --git a/wally-pipelined/src/privileged/csr.sv b/wally-pipelined/src/privileged/csr.sv
index 744b8f9b..89d71fb5 100644
--- a/wally-pipelined/src/privileged/csr.sv
+++ b/wally-pipelined/src/privileged/csr.sv
@@ -34,8 +34,8 @@ module csr #(parameter
   ) (
   input  logic             clk, reset,
   input  logic             FlushW, StallD, StallE, StallM, StallW,
-  input  logic [31:0]      InstrM, 
-  input  logic [`XLEN-1:0] PCM, SrcAM,
+  input  logic [31:0]      InstrD,InstrE,InstrM, 
+  input  logic [`XLEN-1:0] PCF, PCD, PCE, PCM, SrcAM,
   input  logic             InterruptM,
   input  logic             CSRReadM, CSRWriteM, TrapM, MTrapM, STrapM, UTrapM, mretM, sretM, uretM,
   input  logic             TimerIntM, ExtIntM, SwIntM,
@@ -47,6 +47,9 @@ module csr #(parameter
   input  logic [4:0]       InstrClassM,
   input  logic [1:0]       NextPrivilegeModeM, PrivilegeModeW,
   input  logic [`XLEN-1:0] CauseM, NextFaultMtvalM,
+  input  logic             BreakpointFaultM, EcallFaultM,
+  input  logic             InstrMisalignedFaultM, InstrAccessFaultM, IllegalInstrFaultM,
+  input  logic             LoadMisalignedFaultM, StoreMisalignedFaultM, LoadAccessFaultM, StoreAccessFaultM,
   output logic [1:0]       STATUS_MPP,
   output logic             STATUS_SPP, STATUS_TSR,
   output logic [`XLEN-1:0] MEPC_REGW, SEPC_REGW, UEPC_REGW, UTVEC_REGW, STVEC_REGW, MTVEC_REGW,
@@ -65,6 +68,7 @@ module csr #(parameter
   output logic             IllegalCSRAccessM
 );
 
+  localparam NOP = 32'h13;
   logic [`XLEN-1:0] CSRMReadValM, CSRSReadValM, CSRUReadValM, CSRNReadValM, CSRCReadValM, CSRReadValM;
   logic [`XLEN-1:0] CSRSrcM, CSRRWM, CSRRSM, CSRRCM, CSRWriteValM;
  
@@ -73,22 +77,32 @@ module csr #(parameter
   logic            WriteMSTATUSM, WriteSSTATUSM, WriteUSTATUSM;
   logic            CSRMWriteM, CSRSWriteM, CSRUWriteM;
 
-  logic [`XLEN-1:0] UnalignedNextEPCM, NextEPCM, preservedPCM, readPCM, NextCauseM, NextMtvalM;
-
-  always_ff @(posedge clk) begin
-      preservedPCM <= PCM;
-  end
-
-  mux2 #(`XLEN) pcmux(PCM, preservedPCM, InterruptM, readPCM);
-  //flop #(`XLEN) CSRReadPCMreg(clk, reset, PCM, readPCM);
+  logic MStageFailed;
+  logic [`XLEN-1:0] ProposedEPCM, UnalignedNextEPCM, NextEPCM, NextCauseM, NextMtvalM;
 
   logic [11:0] CSRAdrM;
   logic [11:0] SIP_REGW, SIE_REGW;
   //logic [11:0] UIP_REGW, UIE_REGW = 0; // N user-mode exceptions not supported
   logic        IllegalCSRCAccessM, IllegalCSRMAccessM, IllegalCSRSAccessM, IllegalCSRUAccessM, IllegalCSRNAccessM, InsufficientCSRPrivilegeM;
-
   logic IllegalCSRMWriteReadonlyM;
 
+  assign MStageFailed = BreakpointFaultM || EcallFaultM || InstrMisalignedFaultM || InstrAccessFaultM || IllegalInstrFaultM || LoadMisalignedFaultM || StoreMisalignedFaultM || LoadAccessFaultM || StoreAccessFaultM; // high when any of the listed M-stage fault/ecall/breakpoint signals is asserted
+  always_comb begin // choose ProposedEPCM, the PC that UnalignedNextEPCM takes when TrapM is asserted
+    if (MStageFailed) // a listed fault is active: use the PC of the first stage, searching M, E, D, whose instruction is not NOP
+      casez({InstrD==NOP,InstrE==NOP,InstrM==NOP})
+        3'b??0: ProposedEPCM = PCM; // M holds a non-NOP instruction
+        3'b?01: ProposedEPCM = PCE; // M is NOP, E is not
+        3'b011: ProposedEPCM = PCD; // M and E are NOP, D is not
+        3'b111: ProposedEPCM = PCF; // M, E and D are all NOP: fall back to the fetch PC
+      endcase
+    else // no listed fault (presumably an interrupt -- TODO confirm): same search, but starting at E
+      casez({InstrD==NOP,InstrE==NOP})
+        2'b?0: ProposedEPCM = PCE; // E holds a non-NOP instruction
+        2'b01: ProposedEPCM = PCD; // E is NOP, D is not
+        2'b11: ProposedEPCM = PCF; // E and D are both NOP: fall back to the fetch PC
+      endcase
+  end // NOTE(review): a stage is skipped whenever its instruction equals NOP (32'h13), so a genuine program NOP is skipped too -- confirm this is intended
+  
   generate
     if (`ZCSR_SUPPORTED) begin
       // modify CSRs
@@ -109,7 +123,7 @@ module csr #(parameter
 
       // write CSRs
       assign CSRAdrM = InstrM[31:20];
-      assign UnalignedNextEPCM = TrapM ? readPCM : CSRWriteValM;
+      assign UnalignedNextEPCM = TrapM ? ProposedEPCM : CSRWriteValM;
       assign NextEPCM = `C_SUPPORTED ? {UnalignedNextEPCM[`XLEN-1:1], 1'b0} : {UnalignedNextEPCM[`XLEN-1:2], 2'b00}; // 3.1.15 alignment
       assign NextCauseM = TrapM ? CauseM : CSRWriteValM;
       assign NextMtvalM = TrapM ? NextFaultMtvalM : CSRWriteValM;
diff --git a/wally-pipelined/src/privileged/csrsr.sv b/wally-pipelined/src/privileged/csrsr.sv
index 8c5c7a3d..0b36df49 100644
--- a/wally-pipelined/src/privileged/csrsr.sv
+++ b/wally-pipelined/src/privileged/csrsr.sv
@@ -109,74 +109,74 @@ module csrsr (
   // complex register with reset, write enable, and the ability to update other bits in certain cases
   always_ff @(posedge clk, posedge reset)
     if (reset) begin
-      STATUS_SUM_INT <= 0;
-      STATUS_MPRV_INT <= 0; // Per Priv 3.3
-      STATUS_FS_INT <= 0; //2'b01; // busybear: change all these reset values to 0
-      STATUS_MPP <= 0; //`M_MODE;
-      STATUS_SPP <= 0; //1'b1;
-      STATUS_MPIE <= 0; //1;
-      STATUS_SPIE <= 0; //`S_SUPPORTED;
-      STATUS_UPIE <= 0; // `U_SUPPORTED;
-      STATUS_MIE <= 0; // Per Priv 3.3
-      STATUS_SIE <= 0; //`S_SUPPORTED;
-      STATUS_UIE <= 0; //`U_SUPPORTED;
+      STATUS_SUM_INT <= #1 0;
+      STATUS_MPRV_INT <= #1 0; // Per Priv 3.3
+      STATUS_FS_INT <= #1 0; //2'b01; // busybear: change all these reset values to 0
+      STATUS_MPP <= #1 0; //`M_MODE;
+      STATUS_SPP <= #1 0; //1'b1;
+      STATUS_MPIE <= #1 0; //1;
+      STATUS_SPIE <= #1 0; //`S_SUPPORTED;
+      STATUS_UPIE <= #1 0; // `U_SUPPORTED;
+      STATUS_MIE <= #1 0; // Per Priv 3.3
+      STATUS_SIE <= #1 0; //`S_SUPPORTED;
+      STATUS_UIE <= #1 0; //`U_SUPPORTED;
     end else if (~StallW) begin
       if (WriteMSTATUSM) begin
-        STATUS_SUM_INT <= CSRWriteValM[18];
-        STATUS_MPRV_INT <= CSRWriteValM[17];
-        STATUS_FS_INT <= CSRWriteValM[14:13];
-        STATUS_MPP <= STATUS_MPP_NEXT;
-        STATUS_SPP <= `S_SUPPORTED & CSRWriteValM[8];
-        STATUS_MPIE <= CSRWriteValM[7];
-        STATUS_SPIE <= `S_SUPPORTED & CSRWriteValM[5];
-        STATUS_UPIE <= `U_SUPPORTED & CSRWriteValM[4];
-        STATUS_MIE <= CSRWriteValM[3];
-        STATUS_SIE <= `S_SUPPORTED & CSRWriteValM[1];
-        STATUS_UIE <= `U_SUPPORTED & CSRWriteValM[0];
+        STATUS_SUM_INT <= #1 CSRWriteValM[18];
+        STATUS_MPRV_INT <= #1 CSRWriteValM[17];
+        STATUS_FS_INT <= #1 CSRWriteValM[14:13];
+        STATUS_MPP <= #1 STATUS_MPP_NEXT;
+        STATUS_SPP <= #1 `S_SUPPORTED & CSRWriteValM[8];
+        STATUS_MPIE <= #1 CSRWriteValM[7];
+        STATUS_SPIE <= #1 `S_SUPPORTED & CSRWriteValM[5];
+        STATUS_UPIE <= #1 `U_SUPPORTED & CSRWriteValM[4];
+        STATUS_MIE <= #1 CSRWriteValM[3];
+        STATUS_SIE <= #1 `S_SUPPORTED & CSRWriteValM[1];
+        STATUS_UIE <= #1 `U_SUPPORTED & CSRWriteValM[0];
       end else if (WriteSSTATUSM) begin // write a subset of the STATUS bits
-        STATUS_SUM_INT <= CSRWriteValM[18];
-        STATUS_FS_INT <= CSRWriteValM[14:13];
-        STATUS_SPP <= `S_SUPPORTED & CSRWriteValM[8];
-        STATUS_SPIE <= `S_SUPPORTED & CSRWriteValM[5];
-        STATUS_UPIE <= `U_SUPPORTED & CSRWriteValM[4];
-        STATUS_SIE <= `S_SUPPORTED & CSRWriteValM[1];
-        STATUS_UIE <= `U_SUPPORTED & CSRWriteValM[0];      
+        STATUS_SUM_INT <= #1 CSRWriteValM[18];
+        STATUS_FS_INT <= #1 CSRWriteValM[14:13];
+        STATUS_SPP <= #1 `S_SUPPORTED & CSRWriteValM[8];
+        STATUS_SPIE <= #1 `S_SUPPORTED & CSRWriteValM[5];
+        STATUS_UPIE <= #1 `U_SUPPORTED & CSRWriteValM[4];
+        STATUS_SIE <= #1 `S_SUPPORTED & CSRWriteValM[1];
+        STATUS_UIE <= #1 `U_SUPPORTED & CSRWriteValM[0];      
       end else if (WriteUSTATUSM) begin // write a subset of the STATUS bits
-        STATUS_FS_INT <= CSRWriteValM[14:13];
-        STATUS_UPIE <= `U_SUPPORTED & CSRWriteValM[4];
-        STATUS_UIE <= `U_SUPPORTED & CSRWriteValM[0];      
+        STATUS_FS_INT <= #1 CSRWriteValM[14:13];
+        STATUS_UPIE <= #1 `U_SUPPORTED & CSRWriteValM[4];
+        STATUS_UIE <= #1 `U_SUPPORTED & CSRWriteValM[0];      
       end else begin
-        if (FloatRegWriteW) STATUS_FS_INT <=2'b11; // mark Float State dirty
+        if (FloatRegWriteW) STATUS_FS_INT <= #1 2'b11; // mark Float State dirty (delay is #1 like every other STATUS update; value is 2'b11)
         if (TrapM) begin
           // Update interrupt enables per Privileged Spec p. 21
           // y = PrivilegeModeW
           // x = NextPrivilegeModeM
           // Modes: 11 = Machine, 01 = Supervisor, 00 = User
           if (NextPrivilegeModeM == `M_MODE) begin
-            STATUS_MPIE <= STATUS_MIE;
-            STATUS_MIE <= 0;
-            STATUS_MPP <= PrivilegeModeW;
+            STATUS_MPIE <= #1 STATUS_MIE;
+            STATUS_MIE <= #1 0;
+            STATUS_MPP <= #1 PrivilegeModeW;
           end else if (NextPrivilegeModeM == `S_MODE) begin
-            STATUS_SPIE <= STATUS_SIE;
-            STATUS_SIE <= 0;
-            STATUS_SPP <= PrivilegeModeW[0]; // *** seems to disagree with P. 56
+            STATUS_SPIE <= #1 STATUS_SIE;
+            STATUS_SIE <= #1 0;
+            STATUS_SPP <= #1 PrivilegeModeW[0]; // *** seems to disagree with P. 56
           end else begin // user mode
-            STATUS_UPIE <= STATUS_UIE;
-            STATUS_UIE <= 0;
+            STATUS_UPIE <= #1 STATUS_UIE;
+            STATUS_UIE <= #1 0;
           end
         end else if (mretM) begin // Privileged 3.1.6.1
-          STATUS_MIE <= STATUS_MPIE;
-          STATUS_MPIE <= 1;
-          STATUS_MPP <= `U_SUPPORTED ? `U_MODE : `M_MODE; // per spec, not sure why
-          STATUS_MPRV_INT <= 0; // per 20210108 draft spec
+          STATUS_MIE <= #1 STATUS_MPIE;
+          STATUS_MPIE <= #1 1;
+          STATUS_MPP <= #1 `U_SUPPORTED ? `U_MODE : `M_MODE; // per spec, not sure why
+          STATUS_MPRV_INT <= #1 0; // per 20210108 draft spec
         end else if (sretM) begin
-          STATUS_SIE <= STATUS_SPIE;
-          STATUS_SPIE <= `S_SUPPORTED;
-          STATUS_SPP <= 0; // Privileged 4.1.1
-          STATUS_MPRV_INT <= 0; // per 20210108 draft spec
+          STATUS_SIE <= #1 STATUS_SPIE;
+          STATUS_SPIE <= #1 `S_SUPPORTED;
+          STATUS_SPP <= #1 0; // Privileged 4.1.1
+          STATUS_MPRV_INT <= #1 0; // per 20210108 draft spec
         end else if (uretM) begin
-          STATUS_UIE <= STATUS_UPIE;
-          STATUS_UPIE <= `U_SUPPORTED;
+          STATUS_UIE <= #1 STATUS_UPIE;
+          STATUS_UPIE <= #1 `U_SUPPORTED;
         end
         // *** add code to track STATUS_FS_INT for dirty floating point registers
       end
diff --git a/wally-pipelined/src/privileged/privileged.sv b/wally-pipelined/src/privileged/privileged.sv
index 2e3af3e2..41d685c4 100644
--- a/wally-pipelined/src/privileged/privileged.sv
+++ b/wally-pipelined/src/privileged/privileged.sv
@@ -31,8 +31,8 @@ module privileged (
   input  logic             FlushW,
   input  logic             CSRReadM, CSRWriteM,
   input  logic [`XLEN-1:0] SrcAM,
-  input  logic [31:0]      InstrM,
-  input  logic [`XLEN-1:0] PCM,
+  input  logic [`XLEN-1:0] PCF,PCD,PCE,PCM,
+  input  logic [31:0]      InstrD, InstrE, InstrM, InstrW,
   output logic [`XLEN-1:0] CSRReadValW,
   output logic [`XLEN-1:0] PrivilegedNextPCM,
   output logic             RetM, TrapM,
diff --git a/wally-pipelined/src/wally/wallypipelinedhart.sv b/wally-pipelined/src/wally/wallypipelinedhart.sv
index e49cc6c6..00ae8493 100644
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@@ -68,8 +68,8 @@ module wallypipelinedhart (
   logic [`XLEN-1:0] SrcAM;
   logic [2:0] Funct3E;
   //  logic [31:0] InstrF;
-  logic [31:0] InstrD, InstrM;
-  logic [`XLEN-1:0] PCE, PCM, PCLinkE, PCLinkW;
+  logic [31:0] InstrD, InstrE, InstrM, InstrW;
+  logic [`XLEN-1:0] PCD, PCE, PCM, PCLinkE, PCLinkW;
   logic [`XLEN-1:0] PCTargetE;
   logic [`XLEN-1:0] CSRReadValW, MulDivResultW;
   logic [`XLEN-1:0] PrivilegedNextPCM;
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index c5abff91..ea693900 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -29,6 +29,7 @@
 module testbench();
   parameter DEBUG = 0;
   parameter TESTSPERIPH = 0; // set to 0 for regression
+  parameter TESTSPRIV = 0; // set to 0 for regression
   
   logic        clk;
   logic        reset;
@@ -516,9 +517,11 @@ string tests32f[] = '{
         tests = testsBP64;
 	// testsbp should not run the other tests. It starts at address 0 rather than
 	// 0x8000_0000, the next if must remain an else if.	
-      end else if (TESTSPERIPH) begin 
+      end else if (TESTSPERIPH)
         tests = tests64periph;
-      end else begin
+      else if (TESTSPRIV)
+        tests = tests64p;
+      else begin
         tests = {tests64p,tests64i,tests64periph};
         if (`C_SUPPORTED) tests = {tests, tests64ic};
         else              tests = {tests, tests64iNOc};
@@ -531,9 +534,11 @@ string tests32f[] = '{
       //tests = {tests64a, tests};
     end else begin // RV32
       // *** add the 32 bit bp tests
-      if (TESTSPERIPH) begin 
+      if (TESTSPERIPH)
         tests = tests32periph;
-      end else begin
+      else if (TESTSPRIV)
+        tests = tests32p;
+      else begin
           tests = {tests32i, tests32p};//,tests32periph}; *** broken at the moment
           if (`C_SUPPORTED % 2 == 1) tests = {tests, tests32ic};    
           else                       tests = {tests, tests32iNOc};

From 12c34c25f3f122c90b8a99f6fdb4590f721fe0d2 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 31 May 2021 08:36:19 -0400
Subject: [PATCH 11/14] Modify elements of generics for LZD and shifter wrote
 for integer divider.

---
 wally-pipelined/src/generic/lzd.sv   | 195 +++++++++++++++++++++++++++
 wally-pipelined/src/generic/lzd.sv~  | 195 +++++++++++++++++++++++++++
 wally-pipelined/src/generic/shift.sv |  76 +++++++++++
 wally-pipelined/src/muldiv/div.sv    | 146 +-------------------
 4 files changed, 471 insertions(+), 141 deletions(-)
 create mode 100755 wally-pipelined/src/generic/lzd.sv
 create mode 100755 wally-pipelined/src/generic/lzd.sv~
 create mode 100755 wally-pipelined/src/generic/shift.sv

diff --git a/wally-pipelined/src/generic/lzd.sv b/wally-pipelined/src/generic/lzd.sv
new file mode 100755
index 00000000..98642c15
--- /dev/null
+++ b/wally-pipelined/src/generic/lzd.sv
@@ -0,0 +1,195 @@
+///////////////////////////////////////////
+// lzd.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Hierarchical leading-zero detector (LZD), used by the integer divider
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+
+// Original idea came from  V. G. Oklobdzija, "An algorithmic and novel
+// design of a leading zero detector circuit: comparison with logic
+// synthesis," in IEEE Transactions on Very Large Scale Integration
+// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi:
+// 10.1109/92.273153.
+
+// Modified to be more hierarchical
+
+// 2-bit leading-zero detector: the leaf cell of the LZD tree.
+//   B : input pair; B[1] is the more significant bit
+//   V : 1 when B contains at least one 1
+//   P : 1 only when B[1] is 0 and B[0] is 1 (one leading zero), else 0
+module lzd2 (output logic       P,
+             output logic       V,
+             input  logic [1:0] B);
+
+   assign V = |B;             // some bit of B is set
+   assign P = ~B[1] & B[0];   // leading bit clear, next bit set
+endmodule // lzd2
+
+module lzd_hier #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]          B,   // word to scan
+    output logic [$clog2(WIDTH)-1:0] ZP,  // ZP output of the selected fixed-width detector
+    output logic 		     ZV);  // ZV output of the selected fixed-width detector
+   // Width-parameterized wrapper: instantiates the fixed-width detector whose size equals WIDTH.
+   if (WIDTH == 128)
+     lzd128 lz127 (ZP, ZV, B);	      
+   else if (WIDTH == 64)
+     lzd64 lz64 (ZP, ZV, B);	   
+   else if (WIDTH == 32)
+     lzd32 lz32 (ZP, ZV, B);
+   else if (WIDTH == 16)
+     lzd16 lz16 (ZP, ZV, B);
+   else if (WIDTH == 8)
+     lzd8 lz8 (ZP, ZV, B);
+   else if (WIDTH == 4)
+     lzd4 lz4 (ZP, ZV, B);
+   // NOTE(review): any other WIDTH instantiates nothing here, so ZP and ZV are left undriven.
+endmodule // lzd_hier
+
+module lzd4 (ZP, ZV, B);
+   // 4-bit LZD built from two lzd2 cells. ZV = some bit of B is 1; ZP = leading-zero count of B when ZV = 1.
+   input logic [3:0]  B;
+
+   logic  	       ZPa;
+   logic  	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [1:0]  ZP;
+   output logic        ZV;
+   // a = lower half, b = upper half. Uses this file's lzd2 (div.sv's lz2 has a different port list).
+   lzd2 l1(ZPa, ZVa, B[1:0]);
+   lzd2 l2(ZPb, ZVb, B[3:2]);
+   // The upper half decides when it holds a 1; otherwise the top ZP bit accounts for the empty upper half.
+   assign ZP[0:0] = ZVb ? ZPb : ZPa;
+   assign ZP[1]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd4
+
+module lzd8 (ZP, ZV, B);
+   // 8-bit LZD built from two lzd4 halves (a = lower, b = upper); same merge rule as lzd4.
+   input logic [7:0]  B;
+
+   logic [1:0] 	       ZPa;
+   logic [1:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [2:0]  ZP;
+   output logic        ZV;
+
+   lzd4 l1(ZPa, ZVa, B[3:0]);
+   lzd4 l2(ZPb, ZVb, B[7:4]);
+
+   assign ZP[1:0] = ZVb ? ZPb : ZPa;
+   assign ZP[2]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd8
+
+module lzd16 (ZP, ZV, B);
+   // 16-bit LZD built from two lzd8 halves (a = lower, b = upper); same merge rule as lzd4.
+   input logic [15:0]  B;
+
+   logic [2:0] 	       ZPa;
+   logic [2:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [3:0]  ZP;
+   output logic        ZV;
+
+   lzd8 l1(ZPa, ZVa, B[7:0]);
+   lzd8 l2(ZPb, ZVb, B[15:8]);
+
+   assign ZP[2:0] = ZVb ? ZPb : ZPa;
+   assign ZP[3]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd16
+
+module lzd32 (ZP, ZV, B);
+   // 32-bit LZD built from two lzd16 halves (a = lower, b = upper); same merge rule as lzd4.
+   input logic [31:0] B;
+
+   logic [3:0] 	      ZPa;
+   logic [3:0] 	      ZPb;
+   logic 	      ZVa;
+   logic 	      ZVb;
+   
+   output logic [4:0] ZP;
+   output logic       ZV;
+   
+   lzd16 l1(ZPa, ZVa, B[15:0]);
+   lzd16 l2(ZPb, ZVb, B[31:16]);
+   
+   assign ZP[3:0] = ZVb ? ZPb : ZPa;
+   assign ZP[4]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd32
+
+module lzd64 (ZP, ZV, B);
+   // 64-bit LZD built from two lzd32 halves (a = lower, b = upper); same merge rule as lzd4.
+   input logic [63:0]  B;
+   
+   logic [4:0] 	       ZPa;
+   logic [4:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [5:0]  ZP;
+   output logic        ZV;
+   
+   lzd32 l1(ZPa, ZVa, B[31:0]);
+   lzd32 l2(ZPb, ZVb, B[63:32]);
+   
+   assign ZP[4:0] = ZVb ? ZPb : ZPa;
+   assign ZP[5]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd64
+
+module lzd128 (ZP, ZV, B);
+   // 128-bit LZD built from two lzd64 halves (a = lower, b = upper); same merge rule as lzd4.
+   input logic [127:0]  B;
+   
+   logic [5:0] 	       ZPa;
+   logic [5:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [6:0]  ZP;
+   output logic        ZV;
+   // Halves must be disjoint 64-bit slices: [63:0] and [127:64].
+   lzd64 l1(ZPa, ZVa, B[63:0]);
+   lzd64 l2(ZPb, ZVb, B[127:64]);
+   
+   assign ZP[5:0] = ZVb ? ZPb : ZPa;
+   assign ZP[6]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lzd128
+
+/* verilator lint_on DECLFILENAME */
diff --git a/wally-pipelined/src/generic/lzd.sv~ b/wally-pipelined/src/generic/lzd.sv~
new file mode 100755
index 00000000..bfffe5e5
--- /dev/null
+++ b/wally-pipelined/src/generic/lzd.sv~
@@ -0,0 +1,195 @@
+///////////////////////////////////////////
+// lzd.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Hierarchical leading-zero detector (LZD), used by the integer divider
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+
+// Original idea came from  V. G. Oklobdzija, "An algorithmic and novel
+// design of a leading zero detector circuit: comparison with logic
+// synthesis," in IEEE Transactions on Very Large Scale Integration
+// (VLSI) Systems, vol. 2, no. 1, pp. 124-128, March 1994, doi:
+// 10.1109/92.273153.
+
+// Modified to be more hierarchical
+
+module lz2 (P, V, B);
+   // 2-bit leading-zero cell: V flags any 1 in B; P is 1 only when B == 2'b01.
+   input logic  [1:0] B;
+
+   output logic P;
+   output logic V;
+
+   assign V = B[0] | B[1];
+   assign P = B[0] & ~B[1];
+   
+endmodule // lz2
+
+module lzd_hier #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]          B,   // word to scan
+    output logic [$clog2(WIDTH)-1:0] ZP,  // ZP output of the selected fixed-width detector
+    output logic 		     ZV);  // ZV output of the selected fixed-width detector
+   // Width-parameterized wrapper: instantiates the fixed-width detector whose size equals WIDTH.
+   if (WIDTH == 128)
+     lz128 lzd127 (ZP, ZV, B);	      
+   else if (WIDTH == 64)
+     lz64 lzd64 (ZP, ZV, B);	   
+   else if (WIDTH == 32)
+     lz32 lzd32 (ZP, ZV, B);
+   else if (WIDTH == 16)
+     lz16 lzd16 (ZP, ZV, B);
+   else if (WIDTH == 8)
+     lz8 lzd8 (ZP, ZV, B);
+   else if (WIDTH == 4)
+     lz4 lzd4 (ZP, ZV, B);
+   // NOTE(review): any other WIDTH instantiates nothing here, so ZP and ZV are left undriven.
+endmodule // lzd_hier
+
+module lz4 (ZP, ZV, B);
+   // 4-bit LZD built from two lz2 cells (a = lower half, b = upper half).
+   input logic [3:0]  B;
+
+   logic  	       ZPa;
+   logic  	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [1:0]  ZP;
+   output logic        ZV;
+
+   lz2 l1(ZPa, ZVa, B[1:0]);
+   lz2 l2(ZPb, ZVb, B[3:2]);
+   // The upper half decides when it holds a 1; otherwise the top ZP bit is set and the lower half's result is used.
+   assign ZP[0:0] = ZVb ? ZPb : ZPa;
+   assign ZP[1]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule 
+
+module lz8 (ZP, ZV, B);
+   // 8-bit LZD built from two lz4 halves (a = lower half, b = upper half).
+   input logic [7:0]  B;
+
+   logic [1:0] 	       ZPa;
+   logic [1:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [2:0]  ZP;
+   output logic        ZV;
+
+   lz4 l1(ZPa, ZVa, B[3:0]);
+   lz4 l2(ZPb, ZVb, B[7:4]);
+   // The upper half decides when it holds a 1; otherwise the top ZP bit is set and the lower half's result is used.
+   assign ZP[1:0] = ZVb ? ZPb : ZPa;
+   assign ZP[2]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule 
+
+module lz16 (ZP, ZV, B);
+   // 16-bit LZD built from two lz8 halves (a = lower half, b = upper half).
+   input logic [15:0]  B;
+
+   logic [2:0] 	       ZPa;
+   logic [2:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [3:0]  ZP;
+   output logic        ZV;
+
+   lz8 l1(ZPa, ZVa, B[7:0]);
+   lz8 l2(ZPb, ZVb, B[15:8]);
+   // The upper half decides when it holds a 1; otherwise the top ZP bit is set and the lower half's result is used.
+   assign ZP[2:0] = ZVb ? ZPb : ZPa;
+   assign ZP[3]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz16
+
+module lz32 (ZP, ZV, B);
+   // 32-bit LZD built from two lz16 halves (a = lower half, b = upper half).
+   input logic [31:0] B;
+
+   logic [3:0] 	      ZPa;
+   logic [3:0] 	      ZPb;
+   logic 	      ZVa;
+   logic 	      ZVb;
+   
+   output logic [4:0] ZP;
+   output logic       ZV;
+   
+   lz16 l1(ZPa, ZVa, B[15:0]);
+   lz16 l2(ZPb, ZVb, B[31:16]);
+   // The upper half decides when it holds a 1; otherwise the top ZP bit is set and the lower half's result is used.
+   assign ZP[3:0] = ZVb ? ZPb : ZPa;
+   assign ZP[4]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz32
+
+module lz64 (ZP, ZV, B);
+   // 64-bit LZD built from two lz32 halves (a = lower half, b = upper half).
+   input logic [63:0]  B;
+   
+   logic [4:0] 	       ZPa;
+   logic [4:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [5:0]  ZP;
+   output logic        ZV;
+   
+   lz32 l1(ZPa, ZVa, B[31:0]);
+   lz32 l2(ZPb, ZVb, B[63:32]);
+   // The upper half decides when it holds a 1; otherwise the top ZP bit is set and the lower half's result is used.
+   assign ZP[4:0] = ZVb ? ZPb : ZPa;
+   assign ZP[5]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz64
+
+module lz128 (ZP, ZV, B);
+   // 128-bit LZD built from two lz64 halves (a = lower half, b = upper half).
+   input logic [127:0]  B;
+   
+   logic [5:0] 	       ZPa;
+   logic [5:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [6:0]  ZP;
+   output logic        ZV;
+   // Halves must be disjoint 64-bit slices: [63:0] and [127:64].
+   lz64 l1(ZPa, ZVa, B[63:0]);
+   lz64 l2(ZPb, ZVb, B[127:64]);
+   // The upper half decides when it holds a 1; otherwise the top ZP bit is set and the lower half's result is used.
+   assign ZP[5:0] = ZVb ? ZPb : ZPa;
+   assign ZP[6]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz128
+
+/* verilator lint_on DECLFILENAME */
diff --git a/wally-pipelined/src/generic/shift.sv b/wally-pipelined/src/generic/shift.sv
new file mode 100755
index 00000000..88152588
--- /dev/null
+++ b/wally-pipelined/src/generic/shift.sv
@@ -0,0 +1,76 @@
+///////////////////////////////////////////
+// shift.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Parameterized zero-fill left/right shifters, used by the integer divider
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+/* verilator lint_off DECLFILENAME */
+/* verilator lint_off UNOPTFLAT */
+
+module shift_right #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]         A,      // value to shift
+    input logic [$clog2(WIDTH)-1:0] Shift,  // shift amount
+    output logic [WIDTH-1:0] 	    Z);     // A shifted right, vacated upper bits filled with 0
+   // Zero-fill right shifter: a chain of $clog2(WIDTH) mux2 stages, largest shift first.
+   logic [WIDTH-1:0] 		    stage [$clog2(WIDTH):0];
+   // Stage i shifts by WIDTH/2**(i+1); this equals the weight of its Shift bit only when WIDTH is a power of two.
+   genvar 			    i;
+
+   assign stage[0] = A;   
+   generate
+      for (i=0;i<$clog2(WIDTH);i=i+1)
+	begin : genbit
+	   mux2 #(WIDTH) mux_inst (stage[i], 
+				   {{(WIDTH/(2**(i+1))){1'b0}}, stage[i][WIDTH-1:WIDTH/(2**(i+1))]}, 
+				   Shift[$clog2(WIDTH)-i-1], 
+				   stage[i+1]);
+	end
+   endgenerate
+   assign Z = stage[$clog2(WIDTH)];   
+
+endmodule // shift_right
+
+module shift_left #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0]         A,      // value to shift
+    input logic [$clog2(WIDTH)-1:0] Shift,  // shift amount
+    output logic [WIDTH-1:0] 	    Z);     // A shifted left, vacated lower bits filled with 0
+   // Zero-fill left shifter: a chain of $clog2(WIDTH) mux2 stages, largest shift first.
+   logic [WIDTH-1:0] 		    stage [$clog2(WIDTH):0];
+   genvar 			    i;
+   // Stage i shifts by WIDTH/2**(i+1); this equals the weight of its Shift bit only when WIDTH is a power of two.
+   assign stage[0] = A;   
+   generate
+      for (i=0;i<$clog2(WIDTH);i=i+1)
+	begin : genbit
+	   mux2 #(WIDTH) mux_inst (stage[i], 
+				   {stage[i][WIDTH-1-WIDTH/(2**(i+1)):0], {(WIDTH/(2**(i+1))){1'b0}}}, 
+				   Shift[$clog2(WIDTH)-i-1], 
+				   stage[i+1]);
+	end
+   endgenerate
+   assign Z = stage[$clog2(WIDTH)];   
+
+endmodule // shift_left
+
+/* verilator lint_on DECLFILENAME */
+/* verilator lint_on UNOPTFLAT */
diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index db830ca3..4266ae61 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -78,11 +78,7 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    assign D_NegOne = &D;
 
    // Divider goes the distance to 37 cycles
-   // (thanks the evil divisor for D = 0x1) 
-   // but could theoretically be stopped when
-   // divdone is asserted.  The enable signal
-   // turns off register storage thus invalidating
-   // any future cycles.
+   // (thanks to the evil divisor for D = 0x1) 
    
    // Shift D, if needed (for integer)
    // needed to allow qst to be in range for integer
@@ -93,8 +89,8 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    // exception is given to FSM to tell the operation to 
    // quit gracefully.
 
-   lz64 p1 (P, V, twoD);
-   shifter_l64 p2 (op2, twoD, P);
+   lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD));
+   shift_left #(64) p2 (twoD, P, op2);   
    assign op1 = twoN;
    assign div0 = ~V;
 
@@ -141,9 +137,8 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    assign Q = Qd2[63:0];
    assign Rem5 = Rd2[64:1];  
    
-   // Adjust remainder by m (no need to adjust by
-   // n ln(r)
-   shifter_r64 p4 (rem0, Rem5, RemShift);
+   // Adjust remainder by m 
+   shift_right #(64) p4 (Rem5, RemShift, rem0);   
 
    // Adjust Q/Rem for Signed
    assign tcQ = (SignN ^ SignD) & S;
@@ -368,8 +363,6 @@ module qst4 (input logic [6:0] s, input logic [2:0] d,
    
 endmodule // qst4
 
-// LZD
-
 module lz2 (P, V, B0, B1);
 
    input logic  B0;
@@ -497,7 +490,6 @@ module lz64 (ZP, ZV, B);
 endmodule // lz64
 
 // FSM Control for Integer Divider
-
 module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 	      start, error, NumIter, clk, reset);
 
@@ -1505,134 +1497,6 @@ module magcompare8 (LT, EQ, A, B);
 
 endmodule // magcompare8
 
-module shifter_l64 (Z, A, Shift);
-
-   input logic [63:0]  A;
-   input logic [5:0]   Shift;
-   
-   logic [63:0]        stage1;
-   logic [63:0]        stage2;
-   logic [63:0]        stage3;
-   logic [63:0]        stage4;
-   logic [63:0]        stage5;   
-   
-   output logic [63:0] Z;      
-   
-   mux2 #(64) mx01(A,      {A[31:0], 32'h0}, Shift[5], stage1);   
-   mux2 #(64) mx02(stage1, {stage1[47:0], 16'h0}, Shift[4], stage2);
-   mux2 #(64) mx03(stage2, {stage2[55:0], 8'h0}, Shift[3], stage3);
-   mux2 #(64) mx04(stage3, {stage3[59:0], 4'h0}, Shift[2], stage4);
-   mux2 #(64) mx05(stage4, {stage4[61:0], 2'h0}, Shift[1], stage5);
-   mux2 #(64) mx06(stage5, {stage5[62:0], 1'h0}, Shift[0], Z);
-
-endmodule // shifter_l64
-
-module shifter_r64 (Z, A, Shift);
-
-   input logic [63:0]  A;
-   input logic [5:0]   Shift;
-   
-   logic [63:0]        stage1;
-   logic [63:0]        stage2;
-   logic [63:0]        stage3;
-   logic [63:0]        stage4;
-   logic [63:0]        stage5;   		  
-   
-   output logic [63:0] Z;
-   
-   mux2 #(64) mx01(A, {32'h0, A[63:32]}, Shift[5], stage1);		  
-   mux2 #(64) mx02(stage1, {16'h0, stage1[63:16]}, Shift[4], stage2);
-   mux2 #(64) mx03(stage2, {8'h0, stage2[63:8]}, Shift[3], stage3);
-   mux2 #(64) mx04(stage3, {4'h0, stage3[63:4]}, Shift[2], stage4);
-   mux2 #(64) mx05(stage4, {2'h0, stage4[63:2]}, Shift[1], stage5);
-   mux2 #(64) mx06(stage5, {1'h0, stage5[63:1]},  Shift[0], Z);
-   
-endmodule // shifter_r64
-
-module shifter_l32 (Z, A, Shift);
-
-   input logic [31:0]  A;
-   input logic [4:0]   Shift;
-   
-   logic [31:0]        stage1;
-   logic [31:0]        stage2;
-   logic [31:0]        stage3;
-   logic [31:0]        stage4;
-   
-   output logic [31:0] Z;      
-
-   mux2 #(32) mx01(A,      {A[15:0], 16'h0},    Shift[4], stage1);
-   mux2 #(32) mx02(stage1, {stage1[23:0], 8'h0}, Shift[3], stage2);
-   mux2 #(32) mx03(stage2, {stage2[27:0], 4'h0},  Shift[2], stage3);
-   mux2 #(32) mx04(stage3, {stage3[29:0], 2'h0},   Shift[1], stage4);
-   mux2 #(32) mx05(stage4, {stage4[30:0], 1'h0},    Shift[0], Z);
-
-endmodule // shifter_l32
-
-module shifter_r32 (Z, A, Shift);
-
-   input logic [31:0]  A;
-   input logic [4:0]   Shift;
-   
-   logic [31:0]        stage1;
-   logic [31:0]        stage2;
-   logic [31:0]        stage3;
-   logic [31:0]        stage4;
-   
-   output logic [31:0] Z;
-   
-   mux2 #(32) mx01(A,      {16'h0, A[31:16]},   Shift[4], stage1);
-   mux2 #(32) mx02(stage1, {8'h0, stage1[31:8]}, Shift[3], stage2);
-   mux2 #(32) mx03(stage2, {4'h0, stage2[31:4]},  Shift[2], stage3);
-   mux2 #(32) mx04(stage3, {2'h0, stage3[31:2]},   Shift[1], stage4);
-   mux2 #(32) mx05(stage4, {1'h0, stage4[31:1]},    Shift[0], Z);
-   
-endmodule // shifter_r32
-
-module shift_right #(parameter WIDTH=8) 
-   (input logic [`XLEN-1:0]         A,
-    input logic [$clog2(`XLEN)-1:0] Shift,
-    output logic [`XLEN-1:0] 	    Z);
-   
-   logic [`XLEN-1:0] 							 stage [$clog2(`XLEN):0];
-   genvar 								 i;
-   
-   assign stage[0] = A;   
-   generate
-      for (i=0;i<$clog2(`XLEN);i=i+1)
-	begin : genbit
-	   mux2 #(`XLEN) mux_inst (stage[i], 
-				   {{(`XLEN/(2**(i+1))){1'b0}}, stage[i][`XLEN-1:`XLEN/(2**(i+1))]}, 
-				   Shift[$clog2(`XLEN)-i-1], 
-				   stage[i+1]);
-	end
-   endgenerate
-   assign Z = stage[$clog2(`XLEN)];   
-
-endmodule // shift_right
-
-module shift_left #(parameter WIDTH=8) 
-   (input logic [`XLEN-1:0]         A,
-    input logic [$clog2(`XLEN)-1:0] Shift,
-    output logic [`XLEN-1:0] 	    Z);
-   
-   logic [`XLEN-1:0] 							stage [$clog2(`XLEN):0];
-   genvar 								i;
-   
-   assign stage[0] = A;   
-   generate
-      for (i=0;i<$clog2(`XLEN);i=i+1)
-	begin : genbit
-	   mux2 #(`XLEN) mux_inst (stage[i], 
-				   {stage[i][`XLEN-1-`XLEN/(2**(i+1)):0], {(`XLEN/(2**(i+1))){1'b0}}}, 
-				   Shift[$clog2(`XLEN)-i-1], 
-				   stage[i+1]);
-	end
-   endgenerate
-   assign Z = stage[$clog2(`XLEN)];   
-
-endmodule // shift_right
-
 module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
 
    input logic [63:0] Q;

From 9954d16fc91017dae8df34f0b60f6ab188242708 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 31 May 2021 09:12:21 -0400
Subject: [PATCH 12/14] Add enhancements to integer divider including:   -
 better comments   - optimize FSM to end earlier   - passes for 32-bit or
 64-bit depending on parameter to intdiv

Left div.bak in just in case have to revert back to original for now.
---
 wally-pipelined/src/muldiv/div.bak   | 1560 ++++++++++++++++++++++++++
 wally-pipelined/src/muldiv/div.sv    |  614 ++++------
 wally-pipelined/src/muldiv/muldiv.sv |    3 +-
 3 files changed, 1773 insertions(+), 404 deletions(-)
 create mode 100755 wally-pipelined/src/muldiv/div.bak

diff --git a/wally-pipelined/src/muldiv/div.bak b/wally-pipelined/src/muldiv/div.bak
new file mode 100755
index 00000000..4266ae61
--- /dev/null
+++ b/wally-pipelined/src/muldiv/div.bak
@@ -0,0 +1,1560 @@
+///////////////////////////////////////////
+// div.sv
+//
+// Written: James.Stine@okstate.edu 1 February 2021
+// Modified: 
+//
+// Purpose: Integer Divide instructions
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// *** <Thomas Fleming> I added these verilator controls to clean up the
+// lint output. The linter warnings should be fixed, but now the output is at
+// least readable.
+/* verilator lint_off COMBDLY */
+/* verilator lint_off IMPLICIT */
+
+`include "wally-config.vh"
+
+module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
+   // 64-bit integer divide top level: normalize operands, iterate divide4x64 under fsm64, then fix sign/exceptions.
+   input logic [63:0]  N, D;      // dividend (N) and divisor (D)
+   input logic 	       clk;
+   input logic 	       reset;
+   input logic 	       start;     // divide request, passed to fsm64
+   input logic 	       S;         // 1 = signed: negative N/D are two's-complemented below
+   
+   output logic [63:0] Qf;        // quotient out of exception_int
+   output logic [63:0] remf;      // remainder out of exception_int
+   output logic        div0;      // ~V from the leading-zero detector, i.e. D == 0
+   output logic        done;
+   output logic        divBusy;   
+
+   logic 	       divdone;   
+   logic 	       enable;
+   logic 	       state0;
+   logic 	       V;   
+   logic [7:0] 	       Num;
+   logic [5:0] 	       P, NumIter, RemShift;
+   logic [63:0]        op1, op2, Rem5;
+   logic [64:0]        Qd, Rd, Qd2, Rd2;
+   logic [63:0]        Q, rem0;
+   logic [3:0] 	       quotient;
+   logic 	       otfzero; 
+   logic 	       shiftResult;
+   logic 	       enablev, state0v, donev, divdonev, otfzerov, divBusyv;
+
+   logic [63:0]        twoD;
+   logic [63:0]        twoN;
+   logic 	       SignD, SignN;
+   logic 	       tcQ, tcR;  // request two's complement of quotient / remainder
+   logic [63:0]        QT, remT;
+   logic 	       D_NegOne;
+   logic 	       Max_N;
+
+   // Check if negative (two's complement)
+   //   If so, convert to positive
+   adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD);
+   adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN);   
+   assign SignD = D[63];
+   assign SignN = N[63];   
+   // Max N and D = -1 (Overflow)
+   assign Max_N = (~|N[62:0]) & N[63];
+   assign D_NegOne = &D;
+
+   // Divider goes the distance to 37 cycles
+   // (thanks to the evil divisor for D = 0x1) 
+   
+   // Shift D, if needed (for integer)
+   // needed to allow qst to be in range for integer
+   // division [1,2) and allow integer divide to work.
+   //
+   // The V or valid bit can be used to determine if D
+   // is 0 and thus a divide by 0 exception.  This div0
+   // exception is given to FSM to tell the operation to 
+   // quit gracefully.
+
+   lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD));
+   shift_left #(64) p2 (twoD, P, op2);   
+   assign op1 = twoN;
+   assign div0 = ~V;
+
+   // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0)
+   // v = 2 since \rho < 1 (add 4 to make sure it is a ceil)
+   adder #(8) cpa3 ({2'b0, P}, 
+		    {5'h0, shiftResult, ~shiftResult, 1'b0}, 
+		    Num);      
+   
+   // Determine whether need to add just Q/Rem
+   assign shiftResult = P[0];   
+   // div by 2 (ceil)
+   assign NumIter = Num[6:1];   
+   assign RemShift = P;
+
+   // FSM to control integer divider
+   //   assume inputs are positive edge and
+   //   datapath (divider) is negative edge
+   fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv,
+	       start, div0, NumIter, ~clk, reset);
+
+   flopr #(1) rega (~clk, reset, donev, done);
+   flopr #(1) regb (~clk, reset, divdonev, divdone);
+   flopr #(1) regc (~clk, reset, otfzerov, otfzero);
+   flopr #(1) regd (~clk, reset, enablev, enable);
+   flopr #(1) rege (~clk, reset, state0v, state0);
+   flopr #(1) regf (~clk, reset, divBusyv, divBusy);      
+   
+   // To obtain a correct remainder the last bit of the
+   // quotient has to be aligned with a radix-r boundary.
+   // Since the quotient is in the range 1/2 < q < 2 (one
+   // integer bit and m fractional bits), this is achieved by
+   // shifting N right by v+s so that (m+v+s) mod k = 0.  And,
+   // the quotient has to be aligned to the integer position.
+
+   divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, 
+		  enable, otfzero, shiftResult);
+
+   // Storage registers to hold contents stable
+   flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2);
+   flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2);         
+
+   // Probably not needed - just assigns results
+   assign Q = Qd2[63:0];
+   assign Rem5 = Rd2[64:1];  
+   
+   // Adjust remainder by m 
+   shift_right #(64) p4 (Rem5, RemShift, rem0);   
+
+   // Adjust Q/Rem for Signed
+   assign tcQ = (SignN ^ SignD) & S;
+   assign tcR = SignN & S;
+   // Signed Divide
+   // - When N and D are negative: Remainder is negative (undergoes a two's complement).
+   // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement).
+   // - When D is negative: Quotient is negative (undergoes a two's complement).
+   adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT);
+   adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT);         
+
+   // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec)
+   exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf);
+
+endmodule // div
+
+module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, 
+		   enable, otfzero, shiftResult); 
+   // Divide recurrence datapath: carry-save partial remainder, qst4 digit selection and on-the-fly quotient.
+   input logic [63:0]   op1, op2;
+   input logic 		clk, state0;
+   input logic 		reset;
+   input logic 		enable;
+   input logic 		otfzero;
+   input logic 		shiftResult;   
+   
+   output logic [64:0] 	rem0;
+   output logic [64:0] 	Q;
+   output logic [3:0] 	quotient;   
+
+   logic [67:0] 	Sum, Carry;   
+   logic [64:0] 	Qstar;   
+   logic [64:0] 	QMstar;   
+   logic [7:0] 		qtotal;   
+   logic [67:0] 	SumN, CarryN, SumN2, CarryN2;
+   logic [67:0] 	divi1, divi2, divi1c, divi2c, dive1;
+   logic [67:0] 	mdivi_temp, mdivi;   
+   logic 		zero, ulp;   // ulp: +1 injected at the carry LSB when a complemented divisor multiple is selected
+   logic [1:0] 		qsel;
+   logic [1:0] 		Qin, QMin;
+   logic 		CshiftQ, CshiftQM;
+   logic [67:0] 	rem1, rem2, rem3;
+   logic [67:0] 	SumR, CarryR;
+   logic [64:0] 	Qt;   
+
+   // Create one's complement values of Divisor (for q*D)
+   assign divi1 = {3'h0, op2, 1'b0};
+   assign divi2 = {2'h0, op2, 2'b0};
+   assign divi1c = ~divi1;
+   assign divi2c = ~divi2;
+   // Shift x1 if not mod k
+   mux2 #(68) mx1 ({3'b000, op1, 1'b0},  {4'h0, op1}, shiftResult, dive1);   
+
+   // I I I . F F F F F ... (Robertson Criteria - \rho * qmax * D)
+   mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN);
+   mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN);
+   // Simplify QST
+   adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal);   
+   // q = {+2, +1, -1, -2} else q = 0
+   qst4 pd1 (qtotal[7:1], divi1[63:61], quotient);
+   assign ulp = quotient[2]|quotient[3];
+   assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]);
+   // Map to binary encoding
+   assign qsel[1] = quotient[3]|quotient[2];
+   assign qsel[0] = quotient[3]|quotient[1];   
+   mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp);
+   mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi);
+   csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry);
+   // regs : save CSA
+   flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2);
+   flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2);
+   // OTF
+   ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM);   
+   otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, 
+		   otfzero, enable, Qstar, QMstar);
+
+   // Correction and generation of Remainder
+   adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1);
+   // Add back +D as correction
+   csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR);
+   adder #(68) cpa3 (SumR, CarryR, rem2);   
+   // Choose remainder (Rem or Rem+D)
+   mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3);
+   // Choose correct Q or QM
+   mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt);
+   // Final results
+   assign rem0 = rem3[64:0];
+   assign Q = Qt;   
+   
+endmodule // divide4x64
+
+module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM);
+   // Decodes the selected quotient digit into the 2-bit values and mux selects consumed by otf.
+   input logic [3:0] quot;    // digit select from qst4; presumably one-hot with all-zero = digit 0 (confirm)
+
+   output logic [1:0] Qin;    // two bits appended to the Q register in otf
+   output logic [1:0] QMin;   // two bits appended to the QM register in otf
+   output logic       CshiftQ;   // select for the Q-side mux2 in otf
+   output logic       CshiftQM;  // select for the QM-side mux2 in otf
+
+   // Load/Store Control for OTF
+   assign Qin[1] = (quot[1]) | (quot[3]) | (quot[0]);
+   assign Qin[0] = (quot[1]) | (quot[2]);
+   assign QMin[1] = (quot[1]) | (!quot[3]&!quot[2]&!quot[1]&!quot[0]);
+   assign QMin[0] = (quot[3]) | (quot[0]) | 
+		    (!quot[3]&!quot[2]&!quot[1]&!quot[0]);
+   assign CshiftQ = (quot[1]) | (quot[0]);
+   assign CshiftQM = (quot[3]) | (quot[2]);   
+
+endmodule // ls_control
+
+// On-the-fly Conversion per Ercegovac/Lang
+
+module otf #(parameter WIDTH=8) 
+   (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q);
+   // Keeps two WIDTH-bit registers (R2Q = Q, R1Q = QM); each enabled clock appends a 2-bit digit to each.
+   input logic [1:0]        Qin, QMin;
+   input logic 		    CshiftQ, CshiftQM;   
+   input logic 		    clk;
+   input logic 	            reset;
+   input logic 		    enable;   
+
+   output logic [WIDTH-1:0] R2Q;
+   output logic [WIDTH-1:0] R1Q;   
+
+   logic [WIDTH-1:0] 	    Qstar, QMstar;      
+   logic [WIDTH-1:0] 	    M1Q, M2Q;
+   
+   // QM: next R1Q = {selected register shifted left by 2, QMin}; CshiftQM picks between QMstar and Qstar
+   mux2 #(WIDTH)  m1 (QMstar, Qstar, CshiftQM, M1Q);
+   flopenr #(WIDTH) r1 (clk, reset, enable, {M1Q[WIDTH-3:0], QMin}, R1Q);
+   // Q: next R2Q = {selected register shifted left by 2, Qin}; CshiftQ picks between Qstar and QMstar
+   mux2 #(WIDTH)  m2 (Qstar, QMstar, CshiftQ, M2Q);
+   flopenr #(WIDTH) r2 (clk, reset, enable, {M2Q[WIDTH-3:0], Qin}, R2Q);
+   
+   assign Qstar = R2Q;
+   assign QMstar = R1Q;
+
+endmodule // otf
+
+module adder #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b,
+				   output logic [WIDTH-1:0] y);
+   // y = a + b truncated to WIDTH bits; there is no carry-in or carry-out port.
+   assign y = a + b;
+
+endmodule // adder
+
+module fa (input logic a, b, c, output logic sum, carry);
+   // 1-bit full adder: sum is the XOR of the three inputs, carry is their majority.
+   assign sum = a^b^c;
+   assign carry = a&b|a&c|b&c;   
+
+endmodule // fa
+
+module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c,
+				 output logic [WIDTH-1:0] sum, carry);
+   // 3:2 carry-save adder built from WIDTH fa cells; carry is already shifted left by one bit, with carry[0] = 0.
+   logic [WIDTH:0] 					  carry_temp;   
+   genvar 						  i;
+   generate
+      for (i=0;i<WIDTH;i=i+1)
+	begin : genbit
+	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
+	end
+   endgenerate
+   //assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     // trimmed excess bit dh 5/3/21
+   assign carry = {carry_temp[WIDTH-1:1], 1'b0};     // carry_temp[WIDTH] (carry out of the top bit) is dropped
+
+endmodule // csa
+
+module eqcmp #(parameter WIDTH = 8)
+   (input  logic [WIDTH-1:0] a, b,
+    output logic y);
+   // y is 1 when a and b compare equal with ==.
+   assign y = (a == b);
+   
+endmodule // eqcmp
+
+module qst4 (input logic [6:0] s, input logic [2:0] d,
+	     output logic [3:0] q);
+   // Quotient digit selection table: s is the 7-bit partial-remainder estimate, d is 3 divisor bits (see divide4x64).
+   // q drives the divisor-multiple muxes and ls_control; presumably at most one bit is set, none for digit 0 (confirm).
+   assign q[3] = (!s[6]&s[5]) | (!d[2]&!s[6]&s[4]) | (!s[6]&s[4]&s[3]) | 
+		 (!d[1]&!s[6]&s[4]&s[2]) | (!d[0]&!s[6]&s[4]&s[2]) | 
+		 (!d[1]&!d[0]&!s[6]&s[4]&s[1]) | 
+		 (!d[2]&!d[1]&!d[0]&!s[6]&s[3]&s[2]) | 
+		 (!d[2]&!d[1]&!s[6]&s[3]&s[2]&s[1]) | 
+		 (!d[2]&!d[0]&!s[6]&s[3]&s[2]&s[1]&s[0]);
+   
+   assign q[2] = (d[2]&!s[6]&!s[5]&!s[4]&s[3]) | 
+		 (!s[6]&!s[5]&!s[4]&s[3]&!s[2]) | 
+		 (!d[2]&!s[6]&!s[5]&!s[4]&!s[3]&s[2]) | 
+		 (d[2]&d[1]&d[0]&!s[6]&!s[5]&s[4]&!s[3]) | 
+		 (d[2]&d[1]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]) | 
+		 (d[2]&d[0]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]) | 
+		 (d[2]&!s[6]&!s[5]&s[4]&!s[3]&!s[2]&!s[1]) | 
+		 (!d[2]&d[1]&d[0]&!s[6]&!s[5]&!s[4]&s[2]) | 
+		 (!d[1]&!s[6]&!s[5]&!s[4]&!s[3]&s[2]&s[1]) | 
+		 (!d[2]&d[1]&!s[6]&!s[5]&!s[4]&s[2]&!s[1]) | 
+		 (!d[2]&d[0]&!s[6]&!s[5]&!s[4]&s[2]&!s[1]) | 
+		 (!d[2]&d[1]&!s[6]&!s[5]&!s[4]&s[2]&!s[0]);
+   
+   assign q[1] = (d[2]&s[6]&s[5]&s[4]&!s[3]) | 
+		 (d[1]&s[6]&s[5]&s[4]&!s[3]) | (s[6]&s[5]&s[4]&!s[3]&s[2]) | 
+		 (d[2]&s[6]&s[5]&!s[4]&s[3]&s[2]) | 
+		 (d[0]&s[6]&s[5]&s[4]&!s[3]&s[1]) | 
+		 (d[2]&d[1]&d[0]&s[6]&s[5]&!s[4]&s[3]) | 
+		 (d[2]&d[1]&s[6]&s[5]&!s[4]&s[3]&s[1]) | 
+		 (!d[2]&s[6]&s[5]&s[4]&s[3]&!s[2]&!s[1]) | 
+		 (!d[2]&!d[1]&!d[0]&s[6]&s[5]&s[4]&s[3]&!s[2]) | 
+		 (d[1]&d[0]&s[6]&s[5]&!s[4]&s[3]&s[2]&s[1]) | 
+		 (!d[2]&d[0]&s[6]&s[5]&s[4]&!s[2]&!s[1]&s[0]) | 
+		 (!d[2]&!d[1]&!d[0]&s[6]&s[5]&s[4]&!s[2]&s[1]&s[0]);
+   
+   assign q[0] = (s[6]&!s[5]) | (s[6]&!s[4]&!s[3]) | 
+		 (!d[2]&!d[1]&s[6]&!s[4]) | (!d[2]&!d[0]&s[6]&!s[4]) | 
+		 (!d[2]&s[6]&!s[4]&!s[2]) | (!d[1]&s[6]&!s[4]&!s[2]) | 
+		 (!d[2]&s[6]&!s[4]&!s[1]) | (!d[0]&s[6]&!s[4]&!s[2]&!s[1]) | 
+		 (!d[2]&!d[1]&!d[0]&s[6]&!s[3]&!s[2]&!s[1]) | 
+		 (!d[2]&!d[1]&!d[0]&s[6]&!s[3]&!s[2]&!s[0]) | 
+		 (!d[2]&!d[1]&s[6]&!s[3]&!s[2]&!s[1]&!s[0]);
+   
+endmodule // qst4
+
+module lz2 (P, V, B0, B1);
+   // 2-bit leading-zero cell; lz8 wires B1 to the higher-order bit of the pair.
+   input logic  B0;
+   input logic 	B1;
+
+   output logic P;   // 1 when B1 is 0 and B0 is 1
+   output logic V;   // 1 when either input bit is set
+
+   assign V = B0 | B1;
+   assign P = B0 & ~B1;
+   
+endmodule // lz2
+
+module lz4 (ZP, ZV, B0, B1, V0, V1);
+   // Merges two lz2 results into a 2-bit position; lz8 wires B0/V0 to the higher-order pair.
+   input logic        B0;
+   input logic        B1;
+   input logic        V0;
+   input logic        V1;
+   
+   output logic [1:0] ZP;
+   output logic       ZV;
+   // Pair 0 wins when it is valid; otherwise ZP[1] is set and pair 1's position is used.
+   assign ZP[0] = V0 ? B0 : B1;
+   assign ZP[1] = ~V0;
+   assign ZV = V0 | V1;
+
+endmodule // lz4
+
+module lz8 (ZP, ZV, B);
+   // 8-bit leading-zero detector built from lz2/lz4 cells: ZV = any bit of B set, ZP = 3-bit position.
+   input logic [7:0]  B;
+
+   logic 	      s1p0;
+   logic 	      s1v0;
+   logic 	      s1p1;
+   logic 	      s1v1;
+   logic 	      s2p0;
+   logic 	      s2v0;
+   logic 	      s2p1;
+   logic 	      s2v1;
+   logic [1:0] 	      ZPa;
+   logic [1:0] 	      ZPb;
+   logic 	      ZVa;
+   logic 	      ZVb;
+   
+   output logic [2:0] ZP;
+   output logic       ZV;
+   // Lower nibble B[3:0] -> ZPa/ZVa
+   lz2 l1(s1p0, s1v0, B[2], B[3]);
+   lz2 l2(s1p1, s1v1, B[0], B[1]);
+   lz4 l3(ZPa, ZVa, s1p0, s1p1, s1v0, s1v1);
+   // Upper nibble B[7:4] -> ZPb/ZVb
+   lz2 l4(s2p0, s2v0, B[6], B[7]);
+   lz2 l5(s2p1, s2v1, B[4], B[5]);
+   lz4 l6(ZPb, ZVb, s2p0, s2p1, s2v0, s2v1);
+   // Upper nibble takes priority when it contains a set bit
+   assign ZP[1:0] = ZVb ? ZPb : ZPa;
+   assign ZP[2]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz8
+
+module lz16 (ZP, ZV, B);
+   // 16-bit leading-zero detector from two lz8 halves; the upper half B[15:8] has priority.
+   input logic [15:0]  B;
+
+   logic [2:0] 	       ZPa;
+   logic [2:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;   
+
+   output logic [3:0]  ZP;
+   output logic        ZV;
+
+   lz8 l1(ZPa, ZVa, B[7:0]);
+   lz8 l2(ZPb, ZVb, B[15:8]);
+   // ZP[3] is set when the upper half is all zero, in which case the lower half's position is used
+   assign ZP[2:0] = ZVb ? ZPb : ZPa;
+   assign ZP[3]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz16
+
+module lz32 (ZP, ZV, B);
+   // 32-bit leading-zero detector from two lz16 halves; the upper half B[31:16] has priority.
+   input logic [31:0] B;
+
+   logic [3:0] 	      ZPa;
+   logic [3:0] 	      ZPb;
+   logic 	      ZVa;
+   logic 	      ZVb;
+   
+   output logic [4:0] ZP;
+   output logic       ZV;
+   
+   lz16 l1(ZPa, ZVa, B[15:0]);
+   lz16 l2(ZPb, ZVb, B[31:16]);
+   // ZP[4] is set when the upper half is all zero, in which case the lower half's position is used
+   assign ZP[3:0] = ZVb ? ZPb : ZPa;
+   assign ZP[4]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz32
+
+module lz64 (ZP, ZV, B);
+   // 64-bit leading-zero detector from two lz32 halves; the upper half B[63:32] has priority.
+   input logic [63:0]  B;
+   
+   logic [4:0] 	       ZPa;
+   logic [4:0] 	       ZPb;
+   logic 	       ZVa;
+   logic 	       ZVb;
+   
+   output logic [5:0]  ZP;
+   output logic        ZV;
+   
+   lz32 l1(ZPa, ZVa, B[31:0]);
+   lz32 l2(ZPb, ZVb, B[63:32]);
+   // ZP[5] is set when the upper half is all zero, in which case the lower half's position is used
+   assign ZP[4:0] = ZVb ? ZPb : ZPa;
+   assign ZP[5]   = ~ZVb;
+   assign ZV = ZVa | ZVb;
+
+endmodule // lz64
+
+// FSM Control for Integer Divider
+module fsm64 (en, state0, done, divdone, otfzero, divBusy,
+	      start, error, NumIter, clk, reset);
+
+   input logic [5:0]  NumIter;   
+   input logic 	      clk;
+   input logic 	      reset;
+   input logic 	      start;
+   input logic 	      error;   
+   
+   output logic       done;      
+   output logic       en;
+   output logic       state0;
+   output logic       divdone;
+   output logic       otfzero;
+   output logic       divBusy;   
+   
+   logic 	      LT, EQ;
+   logic 	      Divide0;   
+   logic [5:0] 	      CURRENT_STATE;
+   logic [5:0] 	      NEXT_STATE;   
+   
+   parameter [5:0] 
+     S0=6'd0, S1=6'd1, S2=6'd2,
+     S3=6'd3, S4=6'd4, S5=6'd5,
+     S6=6'd6, S7=6'd7, S8=6'd8,
+     S9=6'd9, S10=6'd10, S11=6'd11,
+     S12=6'd12, S13=6'd13, S14=6'd14,
+     S15=6'd15, S16=6'd16, S17=6'd17,
+     S18=6'd18, S19=6'd19, S20=6'd20,
+     S21=6'd21, S22=6'd22, S23=6'd23,
+     S24=6'd24, S25=6'd25, S26=6'd26,
+     S27=6'd27, S28=6'd28, S29=6'd29,
+     S30=6'd30, S31=6'd31, S32=6'd32,
+     S33=6'd33, S34=6'd34, S35=6'd35,
+     S36=6'd36, Done=6'd37;      
+   
+   always @(posedge clk)
+     begin
+	if(reset==1'b1)
+	  CURRENT_STATE<=S0;
+	else
+	  CURRENT_STATE<=NEXT_STATE;
+     end
+
+   // Going to cheat and hard code number of states 
+   // needed into FSM instead of using a counter
+   // FIXME: could counter be better
+
+   // Cheated and made 8 - let synthesis do its magic
+   magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {2'h0, NumIter});
+
+   always @(CURRENT_STATE or start)
+     begin
+ 	case(CURRENT_STATE)
+	  S0:
+	    begin
+	       if (start==1'b0)
+		 begin
+		    otfzero = 1'b1;   
+		    en = 1'b0;
+		    divBusy = 1'b0;		    
+		    state0 = 1'b0;
+		    divdone = 1'b0;		    
+		    done = 1'b0;
+		    NEXT_STATE <= S0;
+		 end 
+	       else 
+		 begin
+		    otfzero = 1'b0;	       		    
+		    en = 1'b1;
+		    divBusy = 1'b1;		    		    
+		    state0 = 1'b1;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		    
+		    done = 1'b0;
+		    divdone = 1'b0;		 		 
+		    NEXT_STATE <= S1;
+		 end 
+	    end	    
+	  S1:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S2;
+		 end
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S2;
+		 end		    
+	    end // case: S1	  
+	  S2:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S3;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S3;
+		 end		    	       	       
+	    end // case: S2
+	  S3:
+	    begin	       
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S4;
+		 end 
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S4;
+		 end		    	       
+	    end // case: S3
+	  S4:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S5;
+		 end 	       	    
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S5;
+		 end		       	       
+	    end // case: S4
+	  S5:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S6;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S6;
+		 end		    	       	       	       
+	    end // case: S5
+	  S6:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S7;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S7;
+		 end		    	       	       
+	    end // case: S6
+	  S7:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S8;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S8;
+		 end		    	       	       
+	    end // case: S7
+	  S8:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S9;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S9;
+		 end		    	       	       
+	    end // case: S8
+	  S9:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S10;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S10;
+		 end		    	       	       
+	    end // case: S9
+	  S10:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S11;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S11;
+		 end		    	       	       
+	    end // case: S10
+	  S11:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S12;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S12;
+		 end		    	       	       
+	    end // case: S11
+	  S12:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S13;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S13;
+		 end		    	       	       
+	    end // case: S12
+	  S13:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S14;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S14;
+		 end		    	       	       
+	    end // case: S13
+	  S14:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S15;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S15;
+		 end		    	       	       
+	    end // case: S14
+	  S15:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S16;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S16;
+		 end		    	       	       
+	    end // case: S15
+	  S16:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S17;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S17;
+		 end		    	       	       
+	    end // case: S16
+	  S17:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S18;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S18;
+		 end		    	       	       
+	    end // case: S17
+	  S18:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S19;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S19;
+		 end		    	       	       
+	    end // case: S18
+	  S19:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S20;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S20;
+		 end		    	       	       
+	    end // case: S19
+	  S20:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S21;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S21;
+		 end		    	       	       
+	    end // case: S20
+	  S21:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S22;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S22;
+		 end		    	       	       
+	    end // case: S21
+	  S22:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;
+		    NEXT_STATE <= S23;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S23;
+		 end		    	       	       
+	    end // case: S22
+	  S23:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S24;		    
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S24;
+		 end		    	       	       
+	    end // case: S23 
+	  S24:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S25;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S25;
+		 end		    	       	       
+	    end // case: S24
+	  S25:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S26;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S26;
+		 end		    	       	       
+	    end // case: S25
+	  S26:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S27;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S27;
+		 end		    	       	       
+	    end // case: S26
+	  S27:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S28;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S28;
+		 end		    	       	       
+	    end // case: S27
+	  S28:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S29;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S29;
+		 end		    	       	       
+	    end // case: S28
+	  S29:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S30;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S30;
+		 end		    	       	       
+	    end // case: S29
+	  S30:
+	    begin
+	       otfzero = 1'b0;
+     	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S31;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S31;
+		 end		    	       	       
+	    end // case: S30
+	  S31:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S32;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S32;
+		 end		    	       	       
+	    end // case: S31  
+	  S32:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S33;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S33;
+		 end		    	       	       
+	    end // case: S32
+	  S33:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S34;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S34;
+		 end		    	       	       
+	    end // case: S33
+	  S34:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S35;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S35;
+		 end		    	       	       
+	    end // case: S34  	  
+	  S35:
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       if (LT|EQ)
+		 begin
+		    en = 1'b1;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    if (EQ)
+		      divdone = 1'b1;		    
+		    else
+		      divdone = 1'b0;		 		 
+		    NEXT_STATE <= S36;
+		 end // if (LT|EQ)
+	       else
+		 begin
+		    en = 1'b0;
+		    state0 = 1'b0;
+		    done = 1'b0;
+		    divdone = 1'b0;
+		    NEXT_STATE <= S36;
+		 end		    	       	       
+	    end // case: S35	  
+	  S36:
+	    begin
+	       otfzero = 1'b1;
+	       divBusy = 1'b1;	       
+	       state0 = 1'b0;
+	       done = 1'b1;
+	       if (EQ)
+		 begin
+		    divdone = 1'b1;
+		    en = 1'b1;
+		 end
+	       else
+		 begin
+		    divdone = 1'b0;
+		    en = 1'b0;
+		 end
+	       NEXT_STATE <= S0;
+	    end // case: S36
+	  default: 
+	    begin
+	       otfzero = 1'b0;
+	       divBusy = 1'b1;	       
+	       en = 1'b0;
+	       state0 = 1'b0;
+	       done = 1'b0;
+	       divdone = 1'b0;
+	       NEXT_STATE <= S0;
+	    end
+	endcase // case(CURRENT_STATE)	
+     end // always @ (CURRENT_STATE or X)   
+
+endmodule // fsm64
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1' if A > B. LT and GT are both '0' if A = B.
+
+module magcompare2b (LT, GT, A, B);
+
+   input  logic [1:0] A, B;   // unsigned 2-bit operands
+   output logic       LT, GT; // LT: A < B, GT: A > B; both 0 when A == B
+
+   logic [1:0]        nA, nB; // bitwise complements of the operands
+   assign nA = ~A;
+   assign nB = ~B;
+
+   // Minimized sum-of-products forms of A < B and A > B
+   assign LT = (nA[1] & B[1]) | (nA[0] & B[0] & nA[1]) | (nA[0] & B[0] & B[1]);
+   assign GT = (A[1] & nB[1]) | (A[0] & nB[0] & A[1]) | (A[0] & nB[0] & nB[1]);
+
+endmodule // magcompare2b
+
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+module magcompare8 (LT, EQ, A, B);
+
+   input  logic [7:0] A, B;  // unsigned 8-bit operands
+   output logic       LT;    // 1 when A < B
+   output logic       EQ;    // 1 when neither LT nor the internal GT is set
+
+   logic [3:0]        pairLT, pairGT;  // level 1: per 2-bit slice results
+   logic [1:0]        nibLT, nibGT;    // level 2: per 4-bit slice results
+   logic              GT;              // level 3: A > B
+
+   // Level 1: compare each 2-bit slice of A against the matching slice of B
+   genvar             k;
+   generate
+      for (k = 0; k < 4; k = k + 1) begin : slice
+         magcompare2b cmp (pairLT[k], pairGT[k], A[2*k+1:2*k], B[2*k+1:2*k]);
+      end
+   endgenerate
+
+   // Level 2: merge adjacent slices (GT bits as first operand, LT bits as second)
+   magcompare2b lo (nibLT[0], nibGT[0], pairGT[1:0], pairLT[1:0]);
+   magcompare2b hi (nibLT[1], nibGT[1], pairGT[3:2], pairLT[3:2]);
+
+   // Level 3: merge the two 4-bit results with the same operand arrangement
+   magcompare2b top (LT, GT, nibGT[1:0], nibLT[1:0]);
+
+   assign EQ = ~(LT | GT);
+
+endmodule // magcompare8
+
+module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
+
+   // Final result selection for the 64-bit integer divider: overrides the
+   // computed quotient/remainder for divide-by-zero and signed overflow.
+   // Inputs:
+   //   Q        - quotient candidate, passed through in the normal case
+   //   rem      - remainder candidate, passed through in the normal case
+   //   op1      - value returned as the remainder when div0 is set
+   //   S        - overflow qualifier; presumably 1 = signed operation
+   //              (confirm against the instantiating divider)
+   //   div0     - divide-by-zero flag
+   //   Max_N    - overflow qualifier; presumably the dividend is the most
+   //              negative value (confirm against the instantiating divider)
+   //   D_NegOne - overflow qualifier; presumably the divisor is all ones (-1)
+   // Outputs:
+   //   Qf       - final quotient
+   //   remf     - final remainder
+   // Selection (div0 has priority):
+   //   div0 = 1                  : Qf = all ones,                remf = op1
+   //   S & Max_N & D_NegOne = 1  : Qf = 64'h8000_0000_0000_0000, remf = 0
+   //   otherwise (incl. X/Z)     : Qf = Q,                       remf = rem
+
+   input logic [63:0] Q;
+   input logic [63:0] rem;
+   input logic [63:0] op1;
+   input logic        S;
+   input logic        div0;
+   input logic        Max_N;
+   input logic        D_NegOne;
+
+   output logic [63:0] Qf;
+   output logic [63:0] remf;
+
+   always_comb
+     case ({div0, S, Max_N, D_NegOne})
+       // Signed overflow: the quotient is the 64-bit minimum value.  The
+       // sign bit must be followed by 63 zeros so the constant is a full
+       // 64 bits; a narrower constant would be zero-extended and leave
+       // bit 63 clear.
+       4'b0111 :
+         begin
+            Qf   = {1'b1, 63'h0};
+            remf = 64'h0;
+         end
+       // Divide by zero, regardless of the other qualifiers.
+       4'b1000, 4'b1001, 4'b1010, 4'b1011,
+       4'b1100, 4'b1101, 4'b1110, 4'b1111 :
+         begin
+            Qf   = {64{1'b1}};
+            remf = op1;
+         end
+       // No exception (also taken when the selector contains X/Z).
+       default :
+         begin Qf = Q; remf = rem; end
+     endcase
+
+endmodule // exception_int
+
+/* verilator lint_on COMBDLY */
+/* verilator lint_on IMPLICIT */
+
diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index 4266ae61..107b002f 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -1,5 +1,5 @@
 ///////////////////////////////////////////
-// mul.sv
+// div.sv
 //
 // Written: James.Stine@okstate.edu 1 February 2021
 // Modified: 
@@ -29,54 +29,53 @@
 /* verilator lint_off COMBDLY */
 /* verilator lint_off IMPLICIT */
 
-`include "wally-config.vh"
+module intdiv #(parameter WIDTH=64) 
+   (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
 
-module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
-
-   input logic [63:0]  N, D;
-   input logic 	       clk;
-   input logic 	       reset;
-   input logic 	       start;
-   input logic 	       S;   
+   input logic [WIDTH-1:0]   N, D;
+   input logic 		     clk;
+   input logic 		     reset;
+   input logic 		     start;
+   input logic 		     S;   
+   
+   output logic [WIDTH-1:0]  Qf;
+   output logic [WIDTH-1:0]  remf;
+   output logic 	     div0;
+   output logic 	     done;
+   output logic 	     divBusy;   
+   
+   logic 		     enable;
+   logic 		     state0;
+   logic 		     V;   
+   logic [$clog2(WIDTH):0]   Num;
+   logic [$clog2(WIDTH)-1:0] P, NumIter, RemShift;
+   logic [WIDTH-1:0] 	     op1, op2, op1shift, Rem5;
+   logic [WIDTH:0] 	     Qd, Rd, Qd2, Rd2;
+   logic [WIDTH-1:0] 	     Q, rem0;
+   logic [3:0] 		     quotient;
+   logic 		     otfzero; 
+   logic 		     shiftResult;
+   logic 		     enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp;   
+   
+   logic [WIDTH-1:0] 	     twoD;
+   logic [WIDTH-1:0] 	     twoN;
+   logic 		     SignD;
+   logic 		     SignN;
+   logic [WIDTH-1:0] 	     QT, remT;
+   logic 		     D_NegOne;
+   logic 		     Max_N;      
    
-   output logic [63:0] Qf;
-   output logic [63:0] remf;
-   output logic        div0;
-   output logic        done;
-   output logic        divBusy;   
-
-   logic 	       divdone;   
-   logic 	       enable;
-   logic 	       state0;
-   logic 	       V;   
-   logic [7:0] 	       Num;
-   logic [5:0] 	       P, NumIter, RemShift;
-   logic [63:0]        op1, op2, op1shift, Rem5;
-   logic [64:0]        Qd, Rd, Qd2, Rd2;
-   logic [63:0]        Q, rem0;
-   logic [3:0] 	       quotient;
-   logic 	       otfzero; 
-   logic 	       shiftResult;
-   logic 	       enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp;
-
-   logic [63:0]        twoD;
-   logic [63:0]        twoN;
-   logic 	       SignD;
-   logic 	       SignN;
-   logic [63:0]        QT, remT;
-   logic 	       D_NegOne;
-   logic 	       Max_N;
 
    // Check if negative (two's complement)
    //   If so, convert to positive
-   adder #(64) cpa1 ((D ^ {64{D[63]&S}}), {63'h0, D[63]&S}, twoD);
-   adder #(64) cpa2 ((N ^ {64{N[63]&S}}), {63'h0, N[63]&S}, twoN);   
-   assign SignD = D[63];
-   assign SignN = N[63];   
+   adder #(WIDTH) cpa1 ((D ^ {WIDTH{D[WIDTH-1]&S}}), {{WIDTH-1{1'b0}}, D[WIDTH-1]&S}, twoD);
+   adder #(WIDTH) cpa2 ((N ^ {WIDTH{N[WIDTH-1]&S}}), {{WIDTH-1{1'b0}}, N[WIDTH-1]&S}, twoN);   
+   assign SignD = D[WIDTH-1];
+   assign SignN = N[WIDTH-1];   
    // Max N and D = -1 (Overflow)
-   assign Max_N = (~|N[62:0]) & N[63];
+   assign Max_N = (~|N[WIDTH-2:0]) & N[WIDTH-1];
    assign D_NegOne = &D;
-
+   
    // Divider goes the distance to 37 cycles
    // (thanks to the evil divisor for D = 0x1) 
    
@@ -89,31 +88,31 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    // exception is given to FSM to tell the operation to 
    // quit gracefully.
 
-   lzd_hier #(64) p1 (.ZP(P), .ZV(V), .B(twoD));
-   shift_left #(64) p2 (twoD, P, op2);   
-   assign op1 = twoN;
+   lzd_hier #(WIDTH) p1 (.ZP(P), .ZV(V), .B(twoD));
+   shift_left #(WIDTH) p2 (twoD, P, op2);
+   assign op1 = twoN;   
    assign div0 = ~V;
 
-   // #iter: N = m+v+s = m+(s+2) = m+2+s (mod k = 0)
+   // #iter: N = m+v+s = m+2+s (mod k = 0)
    // v = 2 since \rho < 1 (add 4 to make sure its a ceil)
-   adder #(8) cpa3 ({2'b0, P}, 
-		    {5'h0, shiftResult, ~shiftResult, 1'b0}, 
-		    Num);      
+   // k = 2 (r = 2^k)
+   adder #($clog2(WIDTH)+1) cpa3 ({1'b0, P}, 
+				  {{$clog2(WIDTH)+1-3{1'b0}}, shiftResult, ~shiftResult, 1'b0}, 
+				  Num);      
    
    // Determine whether need to add just Q/Rem
    assign shiftResult = P[0];   
    // div by 2 (ceil)
-   assign NumIter = Num[6:1];   
+   assign NumIter = Num[$clog2(WIDTH):1];   
    assign RemShift = P;
 
    // FSM to control integer divider
    //   assume inputs are postive edge and
    //   datapath (divider) is negative edge
-   fsm64 fsm1 (enablev, state0v, donev, divdonev, otfzerov, divBusyv,
-	       start, div0, NumIter, ~clk, reset);
+   fsm64 #($clog2(WIDTH)) fsm1 (enablev, state0v, donev, otfzerov, divBusyv,
+				start, div0, NumIter, ~clk, reset);
 
    flopr #(1) rega (~clk, reset, donev, done);
-   flopr #(1) regb (~clk, reset, divdonev, divdone);
    flopr #(1) regc (~clk, reset, otfzerov, otfzero);
    flopr #(1) regd (~clk, reset, enablev, enable);
    flopr #(1) rege (~clk, reset, state0v, state0);
@@ -125,64 +124,66 @@ module div (Qf, remf, done, divBusy, div0, N, D, clk, reset, start, S);
    // integer bit and m fractional bits), this is achieved by
    // shifting N right by v+s so that (m+v+s) mod k = 0.  And,
    // the quotient has to be aligned to the integer position.
-
-   divide4x64 p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, 
-		  enable, otfzero, shiftResult);
+   divide4 #(WIDTH) p3 (Qd, Rd, quotient, op1, op2, clk, reset, state0, 
+			enable, otfzero, shiftResult);
 
    // Storage registers to hold contents stable
-   flopenr #(65) reg3 (clk, reset, enable, Rd, Rd2);
-   flopenr #(65) reg4 (clk, reset, enable, Qd, Qd2);         
+   flopenr #(WIDTH+1) reg3 (clk, reset, enable, Rd, Rd2);
+   flopenr #(WIDTH+1) reg4 (clk, reset, enable, Qd, Qd2);         
 
    // Probably not needed - just assigns results
-   assign Q = Qd2[63:0];
-   assign Rem5 = Rd2[64:1];  
+   assign Q = Qd2[WIDTH-1:0];
+   assign Rem5 = Rd2[WIDTH:1];  
    
-   // Adjust remainder by m 
-   shift_right #(64) p4 (Rem5, RemShift, rem0);   
+   // Adjust remainder by m
+   shift_right #(WIDTH) p4 (Rem5, RemShift, rem0);
 
    // Adjust Q/Rem for Signed
    assign tcQ = (SignN ^ SignD) & S;
    assign tcR = SignN & S;
-   // Signed Divide
+
+   // When Dividend (N) and/or Divisor (D) are negative (first bit is '1'):
    // - When N and D are negative: Remainder is negative (undergoes a two's complement).
    // - When N is negative: Quotient and Remainder are both negative (undergo a two's complement).
    // - When D is negative: Quotient is negative (undergoes a two's complement).
-   adder #(64) cpa4 ((rem0 ^ {64{tcR}}), {63'h0, tcR}, remT);
-   adder #(64) cpa5 ((Q ^ {64{tcQ}}), {63'h0, tcQ}, QT);         
+   adder #(WIDTH) cpa4 ((rem0 ^ {WIDTH{tcR}}), {{WIDTH-1{1'b0}}, tcR}, remT);
+   adder #(WIDTH) cpa5 ((Q ^ {WIDTH{tcQ}}), {{WIDTH-1{1'b0}}, tcQ}, QT);         
 
    // RISC-V has exceptions for divide by 0 and overflow (see Table 6.1 of spec)
-   exception_int exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf);
-
+   exception_int #(WIDTH) exc (QT, remT, N, S, div0, Max_N, D_NegOne, Qf, remf);
+   
 endmodule // int32div
 
-module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0, 
-		   enable, otfzero, shiftResult); 
+// Division by Recurrence (r=4)
+module divide4 #(parameter WIDTH=64) 
+   (Q, rem0, quotient, op1, op2, clk, reset, state0, 
+    enable, otfzero, shiftResult); 
 
-   input logic [63:0]   op1, op2;
-   input logic 		clk, state0;
-   input logic 		reset;
-   input logic 		enable;
-   input logic 		otfzero;
-   input logic 		shiftResult;   
+   input logic [WIDTH-1:0]   op1, op2;
+   input logic 		     clk, state0;
+   input logic 		     reset;
+   input logic 		     enable;
+   input logic 		     otfzero;
+   input logic 		     shiftResult;   
    
-   output logic [64:0] 	rem0;
-   output logic [64:0] 	Q;
-   output logic [3:0] 	quotient;   
+   output logic [WIDTH:0]    rem0;
+   output logic [WIDTH:0]    Q;
+   output logic [3:0] 	     quotient;   
 
-   logic [67:0] 	Sum, Carry;   
-   logic [64:0] 	Qstar;   
-   logic [64:0] 	QMstar;   
-   logic [7:0] 		qtotal;   
-   logic [67:0] 	SumN, CarryN, SumN2, CarryN2;
-   logic [67:0] 	divi1, divi2, divi1c, divi2c, dive1;
-   logic [67:0] 	mdivi_temp, mdivi;   
-   logic 		zero;
-   logic [1:0] 		qsel;
-   logic [1:0] 		Qin, QMin;
-   logic 		CshiftQ, CshiftQM;
-   logic [67:0] 	rem1, rem2, rem3;
-   logic [67:0] 	SumR, CarryR;
-   logic [64:0] 	Qt;   
+   logic [WIDTH+3:0] 	     Sum, Carry;   
+   logic [WIDTH:0] 	     Qstar;   
+   logic [WIDTH:0] 	     QMstar;   
+   logic [7:0] 		     qtotal;   
+   logic [WIDTH+3:0] 	     SumN, CarryN, SumN2, CarryN2;
+   logic [WIDTH+3:0] 	     divi1, divi2, divi1c, divi2c, dive1;
+   logic [WIDTH+3:0] 	     mdivi_temp, mdivi;   
+   logic 		     zero;
+   logic [1:0] 		     qsel;
+   logic [1:0] 		     Qin, QMin;
+   logic 		     CshiftQ, CshiftQM;
+   logic [WIDTH+3:0] 	     rem1, rem2, rem3;
+   logic [WIDTH+3:0] 	     SumR, CarryR;
+   logic [WIDTH:0] 	     Qt;   
 
    // Create one's complement values of Divisor (for q*D)
    assign divi1 = {3'h0, op2, 1'b0};
@@ -190,42 +191,42 @@ module divide4x64 (Q, rem0, quotient, op1, op2, clk, reset, state0,
    assign divi1c = ~divi1;
    assign divi2c = ~divi2;
    // Shift x1 if not mod k
-   mux2 #(68) mx1 ({3'b000, op1, 1'b0},  {4'h0, op1}, shiftResult, dive1);   
+   mux2 #(WIDTH+4) mx1 ({3'b000, op1, 1'b0},  {4'h0, op1}, shiftResult, dive1);   
 
    // I I I . F F F F F ... (Robertson Criteria - \rho * qmax * D)
-   mux2 #(68) mx2 ({CarryN2[65:0], 2'h0}, 68'h0, state0, CarryN);
-   mux2 #(68) mx3 ({SumN2[65:0], 2'h0}, dive1, state0, SumN);
+   mux2 #(WIDTH+4) mx2 ({CarryN2[WIDTH+1:0], 2'h0}, {WIDTH+4{1'b0}}, state0, CarryN);
+   mux2 #(WIDTH+4) mx3 ({SumN2[WIDTH+1:0], 2'h0}, dive1, state0, SumN);
    // Simplify QST
-   adder #(8) cpa1 (SumN[67:60], CarryN[67:60], qtotal);   
+   adder #(8) cpa1 (SumN[WIDTH+3:WIDTH-4], CarryN[WIDTH+3:WIDTH-4], qtotal);   
    // q = {+2, +1, -1, -2} else q = 0
-   qst4 pd1 (qtotal[7:1], divi1[63:61], quotient);
+   qst4 pd1 (qtotal[7:1], divi1[WIDTH-1:WIDTH-3], quotient);
    assign ulp = quotient[2]|quotient[3];
    assign zero = ~(quotient[3]|quotient[2]|quotient[1]|quotient[0]);
    // Map to binary encoding
    assign qsel[1] = quotient[3]|quotient[2];
    assign qsel[0] = quotient[3]|quotient[1];   
-   mux4 #(68) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp);
-   mux2 #(68) mx5 (mdivi_temp, 68'h0, zero, mdivi);
-   csa #(68) csa1 (mdivi, SumN, {CarryN[67:1], ulp}, Sum, Carry);
+   mux4 #(WIDTH+4) mx4 (divi2, divi1, divi1c, divi2c, qsel, mdivi_temp);
+   mux2 #(WIDTH+4) mx5 (mdivi_temp, {WIDTH+4{1'b0}}, zero, mdivi);
+   csa #(WIDTH+4) csa1 (mdivi, SumN, {CarryN[WIDTH+3:1], ulp}, Sum, Carry);
    // regs : save CSA
-   flopenr #(68) reg1 (clk, reset, enable, Sum, SumN2);
-   flopenr #(68) reg2 (clk, reset, enable, Carry, CarryN2);
+   flopenr #(WIDTH+4) reg1 (clk, reset, enable, Sum, SumN2);
+   flopenr #(WIDTH+4) reg2 (clk, reset, enable, Carry, CarryN2);
    // OTF
    ls_control otf1 (quotient, Qin, QMin, CshiftQ, CshiftQM);   
-   otf #(65) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, 
-		   otfzero, enable, Qstar, QMstar);
+   otf #(WIDTH+1) otf2 (Qin, QMin, CshiftQ, CshiftQM, clk, 
+			otfzero, enable, Qstar, QMstar);
 
    // Correction and generation of Remainder
-   adder #(68) cpa2 (SumN2[67:0], CarryN2[67:0], rem1);
+   adder #(WIDTH+4) cpa2 (SumN2[WIDTH+3:0], CarryN2[WIDTH+3:0], rem1);
    // Add back +D as correction
-   csa #(68) csa2 (CarryN2[67:0], SumN2[67:0], divi1, SumR, CarryR);
-   adder #(68) cpa3 (SumR, CarryR, rem2);   
+   csa #(WIDTH+4) csa2 (CarryN2[WIDTH+3:0], SumN2[WIDTH+3:0], divi1, SumR, CarryR);
+   adder #(WIDTH+4) cpa3 (SumR, CarryR, rem2);   
    // Choose remainder (Rem or Rem+D)
-   mux2 #(68) mx6 (rem1, rem2, rem1[67], rem3);
+   mux2 #(WIDTH+4) mx6 (rem1, rem2, rem1[WIDTH+3], rem3);
    // Choose correct Q or QM
-   mux2 #(65) mx7 (Qstar, QMstar, rem1[67], Qt);
+   mux2 #(WIDTH+1) mx7 (Qstar, QMstar, rem1[WIDTH+3], Qt);
    // Final results
-   assign rem0 = rem3[64:0];
+   assign rem0 = rem3[WIDTH:0];
    assign Q = Qt;   
    
 endmodule // divide4x64
@@ -304,10 +305,9 @@ module csa #(parameter WIDTH=8) (input logic [WIDTH-1:0] a, b, c,
 	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
 	end
    endgenerate
-   //assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     // trmimmed excess bit dh 5/3/21
-   assign carry = {carry_temp[WIDTH-1:1], 1'b0};     
+   assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     
 
-endmodule // adder
+endmodule // csa
 
 module eqcmp #(parameter WIDTH = 8)
    (input  logic [WIDTH-1:0] a, b,
@@ -490,26 +490,24 @@ module lz64 (ZP, ZV, B);
 endmodule // lz64
 
 // FSM Control for Integer Divider
-module fsm64 (en, state0, done, divdone, otfzero, divBusy,
-	      start, error, NumIter, clk, reset);
+module fsm64 #(parameter WIDTH=6)
+  (en, state0, done, otfzero, divBusy, start, error, NumIter, clk, reset);
 
-   input logic [5:0]  NumIter;   
-   input logic 	      clk;
-   input logic 	      reset;
-   input logic 	      start;
-   input logic 	      error;   
+   input logic [WIDTH-1:0]  NumIter;   
+   input logic 		    clk;
+   input logic 		    reset;
+   input logic 		    start;
+   input logic 		    error;   
    
-   output logic       done;      
-   output logic       en;
-   output logic       state0;
-   output logic       divdone;
-   output logic       otfzero;
-   output logic       divBusy;   
+   output logic 	    done;      
+   output logic 	    en;
+   output logic 	    state0;
+   output logic 	    otfzero;
+   output logic 	    divBusy;   
    
-   logic 	      LT, EQ;
-   logic 	      Divide0;   
-   logic [5:0] 	      CURRENT_STATE;
-   logic [5:0] 	      NEXT_STATE;   
+   logic 		    LT, EQ;
+   logic [5:0] 		    CURRENT_STATE;
+   logic [5:0] 		    NEXT_STATE;   
    
    parameter [5:0] 
      S0=6'd0, S1=6'd1, S2=6'd2,
@@ -534,12 +532,8 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 	  CURRENT_STATE<=NEXT_STATE;
      end
 
-   // Going to cheat and hard code number of states 
-   // needed into FSM instead of using a counter
-   // FIXME: could counter be better
-
    // Cheated and made 8 - let synthesis do its magic
-   magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {2'h0, NumIter});
+   magcompare8 comp1 (LT, EQ, {2'h0, CURRENT_STATE}, {{8-WIDTH{1'b0}}, NumIter});
 
    always @(CURRENT_STATE or start)
      begin
@@ -552,7 +546,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    divBusy = 1'b0;		    
 		    state0 = 1'b0;
-		    divdone = 1'b0;		    
 		    done = 1'b0;
 		    NEXT_STATE <= S0;
 		 end 
@@ -560,30 +553,21 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		 begin
 		    otfzero = 1'b0;	       		    
 		    en = 1'b1;
-		    divBusy = 1'b1;		    		    
+		    divBusy = 1'b1;		    
 		    state0 = 1'b1;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		    
 		    done = 1'b0;
-		    divdone = 1'b0;		 		 
 		    NEXT_STATE <= S1;
 		 end 
 	    end	    
 	  S1:
 	    begin
-	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
+	       otfzero = 1'b0;	   
+	       divBusy = 1'b1;
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S2;
 		 end
 	       else
@@ -591,8 +575,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S2;
+		    NEXT_STATE <= S36;
 		 end		    
 	    end // case: S1	  
 	  S2:
@@ -604,10 +587,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S3;
 		 end // if (LT|EQ)
 	       else
@@ -615,8 +594,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S3;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S2
 	  S3:
@@ -628,10 +606,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S4;
 		 end 
 	       else
@@ -639,8 +613,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S4;
+		    NEXT_STATE <= S36;
 		 end		    	       
 	    end // case: S3
 	  S4:
@@ -652,10 +625,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S5;
 		 end 	       	    
 	       else
@@ -663,8 +632,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S5;
+		    NEXT_STATE <= S36;
 		 end		       	       
 	    end // case: S4
 	  S5:
@@ -676,10 +644,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S6;
 		 end // if (LT|EQ)
 	       else
@@ -687,8 +651,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S6;
+		    NEXT_STATE <= S36;
 		 end		    	       	       	       
 	    end // case: S5
 	  S6:
@@ -700,10 +663,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S7;
 		 end // if (LT|EQ)
 	       else
@@ -711,8 +670,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S7;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S6
 	  S7:
@@ -724,10 +682,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S8;
 		 end // if (LT|EQ)
 	       else
@@ -735,8 +689,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S8;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S7
 	  S8:
@@ -748,10 +701,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S9;
 		 end // if (LT|EQ)
 	       else
@@ -759,8 +708,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S9;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S8
 	  S9:
@@ -772,10 +720,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S10;
 		 end // if (LT|EQ)
 	       else
@@ -783,8 +727,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S10;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S9
 	  S10:
@@ -796,10 +739,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S11;
 		 end // if (LT|EQ)
 	       else
@@ -807,8 +746,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S11;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S10
 	  S11:
@@ -820,10 +758,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S12;
 		 end // if (LT|EQ)
 	       else
@@ -831,8 +765,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S12;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S11
 	  S12:
@@ -844,10 +777,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S13;
 		 end // if (LT|EQ)
 	       else
@@ -855,8 +784,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S13;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S12
 	  S13:
@@ -868,10 +796,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S14;
 		 end // if (LT|EQ)
 	       else
@@ -879,23 +803,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S14;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S13
 	  S14:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S15;
 		 end // if (LT|EQ)
 	       else
@@ -903,23 +822,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S15;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S14
 	  S15:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S16;
 		 end // if (LT|EQ)
 	       else
@@ -927,23 +841,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S16;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S15
 	  S16:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S17;
 		 end // if (LT|EQ)
 	       else
@@ -951,23 +860,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S17;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S16
 	  S17:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S18;
 		 end // if (LT|EQ)
 	       else
@@ -975,23 +879,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S18;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S17
 	  S18:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S19;
 		 end // if (LT|EQ)
 	       else
@@ -999,23 +898,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S19;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S18
 	  S19:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S20;
 		 end // if (LT|EQ)
 	       else
@@ -1023,23 +917,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S20;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S19
 	  S20:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S21;
 		 end // if (LT|EQ)
 	       else
@@ -1047,23 +936,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S21;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S20
 	  S21:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S22;
 		 end // if (LT|EQ)
 	       else
@@ -1071,23 +955,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S22;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S21
 	  S22:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;
 		    NEXT_STATE <= S23;
 		 end // if (LT|EQ)
 	       else
@@ -1095,23 +974,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S23;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S22
 	  S23:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S24;		    
 		 end // if (LT|EQ)
 	       else
@@ -1119,23 +993,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S24;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S23 
 	  S24:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S25;
 		 end // if (LT|EQ)
 	       else
@@ -1143,23 +1012,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S25;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S24
 	  S25:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S26;
 		 end // if (LT|EQ)
 	       else
@@ -1167,23 +1031,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S26;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S25
 	  S26:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S27;
 		 end // if (LT|EQ)
 	       else
@@ -1191,23 +1050,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S27;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S26
 	  S27:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S28;
 		 end // if (LT|EQ)
 	       else
@@ -1215,23 +1069,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S28;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S27
 	  S28:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S29;
 		 end // if (LT|EQ)
 	       else
@@ -1239,23 +1088,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S29;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S28
 	  S29:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S30;
 		 end // if (LT|EQ)
 	       else
@@ -1263,23 +1107,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S30;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S29
 	  S30:
 	    begin
 	       otfzero = 1'b0;
-     	       divBusy = 1'b1;	       
+	       divBusy = 1'b1;	       
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S31;
 		 end // if (LT|EQ)
 	       else
@@ -1287,8 +1126,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S31;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S30
 	  S31:
@@ -1300,10 +1138,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S32;
 		 end // if (LT|EQ)
 	       else
@@ -1311,8 +1145,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S32;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S31  
 	  S32:
@@ -1324,10 +1157,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S33;
 		 end // if (LT|EQ)
 	       else
@@ -1335,8 +1164,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S33;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S32
 	  S33:
@@ -1348,10 +1176,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S34;
 		 end // if (LT|EQ)
 	       else
@@ -1359,23 +1183,18 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S34;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S33
 	  S34:
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
+	       divBusy = 1'b1;
 	       if (LT|EQ)
 		 begin
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S35;
 		 end // if (LT|EQ)
 	       else
@@ -1383,8 +1202,7 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
-		    NEXT_STATE <= S35;
+		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S34  	  
 	  S35:
@@ -1396,10 +1214,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b1;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    if (EQ)
-		      divdone = 1'b1;		    
-		    else
-		      divdone = 1'b0;		 		 
 		    NEXT_STATE <= S36;
 		 end // if (LT|EQ)
 	       else
@@ -1407,7 +1221,6 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 		    en = 1'b0;
 		    state0 = 1'b0;
 		    done = 1'b0;
-		    divdone = 1'b0;
 		    NEXT_STATE <= S36;
 		 end		    	       	       
 	    end // case: S35	  
@@ -1419,12 +1232,10 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 	       done = 1'b1;
 	       if (EQ)
 		 begin
-		    divdone = 1'b1;
 		    en = 1'b1;
 		 end
 	       else
 		 begin
-		    divdone = 1'b0;
 		    en = 1'b0;
 		 end
 	       NEXT_STATE <= S0;
@@ -1432,11 +1243,10 @@ module fsm64 (en, state0, done, divdone, otfzero, divBusy,
 	  default: 
 	    begin
 	       otfzero = 1'b0;
-	       divBusy = 1'b1;	       
+	       divBusy = 1'b0;	       
 	       en = 1'b0;
 	       state0 = 1'b0;
 	       done = 1'b0;
-	       divdone = 1'b0;
 	       NEXT_STATE <= S0;
 	    end
 	endcase // case(CURRENT_STATE)	
@@ -1497,38 +1307,39 @@ module magcompare8 (LT, EQ, A, B);
 
 endmodule // magcompare8
 
-module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
+// RISC-V Exception Logic for Divide by 0 and Overflow (Signed Integer Divide)
+module exception_int #(parameter WIDTH=8) 
+   (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
 
-   input logic [63:0] Q;
-   input logic [63:0] rem;
-   input logic [63:0] op1;      
-   input logic 	      S;
-   input logic 	      div0;
-   input logic 	      Max_N;
-   input logic 	      D_NegOne;
+   input logic [WIDTH-1:0] Q;
+   input logic [WIDTH-1:0] rem;
+   input logic [WIDTH-1:0] op1;      
+   input logic 		   S;
+   input logic 		   div0;
+   input logic 		   Max_N;
+   input logic 		   D_NegOne;
    
-   output logic [63:0] Qf;
-   output logic [63:0] remf;
+   output logic [WIDTH-1:0] Qf;
+   output logic [WIDTH-1:0] remf;
 
-   // Needs to be optimized
    always_comb
      case ({div0, S, Max_N, D_NegOne})
        4'b0000 : Qf = Q;
        4'b0001 : Qf = Q;
-       4'b0010 : Qf = Q;              
-       4'b0011 : Qf = Q;              
+       4'b0010 : Qf = Q;       
+       4'b0011 : Qf = Q;
        4'b0100 : Qf = Q;
-       4'b0101 : Qf = Q;
+       4'b0101 : Qf = Q;       
        4'b0110 : Qf = Q;       
-       4'b0111 : Qf = {1'b1, 31'h0};
-       4'b1000 : Qf = {64{1'b1}};
-       4'b1001 : Qf = {64{1'b1}};
-       4'b1010 : Qf = {64{1'b1}};
-       4'b1011 : Qf = {64{1'b1}};              
-       4'b1100 : Qf = {64{1'b1}};
-       4'b1101 : Qf = {64{1'b1}};       
-       4'b1110 : Qf = {64{1'b1}};       
-       4'b1111 : Qf = {64{1'b1}};              
+       4'b0111 : Qf = {1'b1, {WIDTH-1{1'h0}}};       
+       4'b1000 : Qf = {WIDTH{1'b1}};
+       4'b1001 : Qf = {WIDTH{1'b1}};
+       4'b1010 : Qf = {WIDTH{1'b1}};
+       4'b1011 : Qf = {WIDTH{1'b1}};       
+       4'b1100 : Qf = {WIDTH{1'b1}};
+       4'b1101 : Qf = {WIDTH{1'b1}};
+       4'b1110 : Qf = {WIDTH{1'b1}};
+       4'b1111 : Qf = {WIDTH{1'b1}};       
        default: Qf = Q;       
      endcase 
 
@@ -1536,18 +1347,18 @@ module exception_int (Q, rem, op1, S, div0, Max_N, D_NegOne, Qf, remf);
      case ({div0, S, Max_N, D_NegOne})
        4'b0000 : remf = rem;
        4'b0001 : remf = rem;
-       4'b0010 : remf = rem;
+       4'b0010 : remf = rem;       
        4'b0011 : remf = rem;
        4'b0100 : remf = rem;
        4'b0101 : remf = rem;
        4'b0110 : remf = rem;
-       4'b0111 : remf = 64'h0;     
+       4'b0111 : remf = {WIDTH{1'h0}};
        4'b1000 : remf = op1;
        4'b1001 : remf = op1;
        4'b1010 : remf = op1;
        4'b1011 : remf = op1;       
        4'b1100 : remf = op1;
-       4'b1101 : remf = op1;
+       4'b1101 : remf = op1;       
        4'b1110 : remf = op1;       
        4'b1111 : remf = op1;              
        default: remf = rem;
@@ -1557,4 +1368,3 @@ endmodule // exception_int
 
 /* verilator lint_on COMBDLY */
 /* verilator lint_on IMPLICIT */
-
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index 17c4aac5..f4096fd1 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -78,7 +78,7 @@ module muldiv (
 				    .en(startDivideE), .clear(DivDoneE),
 				    .reset(reset),  .clk(~gclk));	 
 	 assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]);	 
-	 div div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide);
+	 intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide);
 
 	 // Added for debugging of start signal for divide
 	 assign startDivideE = MulDivE&DivStartE&~DivBusyE;
@@ -93,7 +93,6 @@ module muldiv (
 	 
 	 // Select result
 	 always_comb
-	   //           case (DivDoneE ? Funct3E_Q : Funct3E)
            case (Funct3E)	   
              3'b000: PrelimResultE = ProdE[`XLEN-1:0];
              3'b001: PrelimResultE = ProdE[`XLEN*2-1:`XLEN];

From 46a232b862249262e91fd0241c48f7b662bac599 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 31 May 2021 09:16:30 -0400
Subject: [PATCH 13/14] Cosmetic changes on integer divider

---
 wally-pipelined/src/muldiv/div.sv    | 7 ++++---
 wally-pipelined/src/muldiv/muldiv.sv | 1 -
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index 107b002f..8b4e0463 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -55,7 +55,7 @@ module intdiv #(parameter WIDTH=64)
    logic [3:0] 		     quotient;
    logic 		     otfzero; 
    logic 		     shiftResult;
-   logic 		     enablev, state0v, donev, divdonev, oftzerov, divBusyv, ulp;   
+   logic 		     enablev, state0v, donev, oftzerov, divBusyv, ulp;   
    
    logic [WIDTH-1:0] 	     twoD;
    logic [WIDTH-1:0] 	     twoN;
@@ -231,6 +231,7 @@ module divide4 #(parameter WIDTH=64)
    
 endmodule // divide4x64
 
+// Load/Control for OTFC
 module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM);
 
    input logic [3:0] quot;
@@ -251,8 +252,7 @@ module ls_control (quot, Qin, QMin, CshiftQ, CshiftQM);
 
 endmodule 
 
-// On-the-fly Conversion per Ercegovac/Lang
-
+// On-the-fly Conversion (OTFC) per Ercegovac/Lang
 module otf #(parameter WIDTH=8) 
    (Qin, QMin, CshiftQ, CshiftQM, clk, reset, enable, R2Q, R1Q);
    
@@ -317,6 +317,7 @@ module eqcmp #(parameter WIDTH = 8)
    
 endmodule // eqcmp
 
+// Quotient Selection Table (QST) for radix 4 (r=4)
 module qst4 (input logic [6:0] s, input logic [2:0] d,
 	     output logic [3:0] q);
    
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index f4096fd1..ccabe341 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -47,7 +47,6 @@ module muldiv (
 	 logic [`XLEN-1:0] MulDivResultE, MulDivResultM;
 	 logic [`XLEN-1:0] PrelimResultE;
 	 logic [`XLEN-1:0] QuotE, RemE;
-	 //logic [`XLEN-1:0] Q, R;	 
 	 logic [`XLEN*2-1:0] ProdE; 
 
 	 logic 		     enable_q;	 

From ddbdd0d5a27d485537994516a0671225c3cb7219 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Mon, 31 May 2021 23:27:42 -0400
Subject: [PATCH 14/14] Modify muldiv.sv to handle W instructions for 64-bits

---
 wally-pipelined/src/muldiv/div.sv              |  1 -
 wally-pipelined/src/muldiv/muldiv.sv           | 17 ++++++++++++++---
 wally-pipelined/testbench/testbench-imperas.sv | 10 +++++-----
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/wally-pipelined/src/muldiv/div.sv b/wally-pipelined/src/muldiv/div.sv
index 8b4e0463..10af5eee 100755
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
@@ -87,7 +87,6 @@ module intdiv #(parameter WIDTH=64)
    // is 0 and thus a divide by 0 exception.  This div0
    // exception is given to FSM to tell the operation to 
    // quit gracefully.
-
    lzd_hier #(WIDTH) p1 (.ZP(P), .ZV(V), .B(twoD));
    shift_left #(WIDTH) p2 (twoD, P, op2);
    assign op1 = twoN;   
diff --git a/wally-pipelined/src/muldiv/muldiv.sv b/wally-pipelined/src/muldiv/muldiv.sv
index ccabe341..0c26a5df 100644
--- a/wally-pipelined/src/muldiv/muldiv.sv
+++ b/wally-pipelined/src/muldiv/muldiv.sv
@@ -53,6 +53,7 @@ module muldiv (
 	 logic [2:0] 	     Funct3E_Q;
 	 logic 		     div0error;
 	 logic [`XLEN-1:0]   N, D;
+	 logic [`XLEN-1:0]   Num0, Den0;	 
 
 	 logic 		     gclk;
 	 logic 		     DivStartE;
@@ -69,13 +70,23 @@ module muldiv (
 	 end
 	 assign gclk = enable_q & clk;
 
+	 // W-type ops use only the low 32 bits: sign-extend for signed divide, zero-extend otherwise
+	 if (`XLEN == 64) begin // RV64 has W-type instructions
+            assign Num0 = W64E ? {{32{SrcAE[31]&signedDivide}}, SrcAE[31:0]} : SrcAE;
+            assign Den0 = W64E ? {{32{SrcBE[31]&signedDivide}}, SrcBE[31:0]} : SrcBE;
+	 end else begin // RV32 has no W-type instructions
+            assign Num0 = SrcAE; // numerator passes through unchanged
+            assign Den0 = SrcBE; // denominator is the second operand (was SrcAE, giving N/N)
+	 end
+
 	 // capture the Numerator/Denominator	 
-	 flopenrc #(`XLEN) reg_num (.d(SrcAE), .q(N),
+	 flopenrc #(`XLEN) reg_num (.d(Num0), .q(N),
 				    .en(startDivideE), .clear(DivDoneE),
 				    .reset(reset),  .clk(~gclk));
-	 flopenrc #(`XLEN) reg_den (.d(SrcBE), .q(D),
+	 flopenrc #(`XLEN) reg_den (.d(Den0), .q(D),
 				    .en(startDivideE), .clear(DivDoneE),
-				    .reset(reset),  .clk(~gclk));	 
+				    .reset(reset),  .clk(~gclk));
+	 
 	 assign signedDivide = (Funct3E[2]&~Funct3E[1]&~Funct3E[0]) | (Funct3E[2]&Funct3E[1]&~Funct3E[0]);	 
 	 intdiv #(`XLEN) div (QuotE, RemE, DivDoneE, DivBusyE, div0error, N, D, gclk, reset, startDivideE, signedDivide);
 
diff --git a/wally-pipelined/testbench/testbench-imperas.sv b/wally-pipelined/testbench/testbench-imperas.sv
index ea693900..6d8f1049 100644
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@@ -166,12 +166,12 @@ string tests32f[] = '{
     "rv64m/I-MULW-01", "3000",
     "rv64m/I-DIV-01", "3000",
     "rv64m/I-DIVU-01", "3000",
-    //"rv64m/I-DIVUW-01", "3000",
-    //"rv64m/I-DIVW-01", "3000",
+    "rv64m/I-DIVUW-01", "3000",
+    "rv64m/I-DIVW-01", "3000",
     "rv64m/I-REM-01", "3000",
-    "rv64m/I-REMU-01", "3000"
-    //"rv64m/I-REMUW-01", "3000",
-    //"rv64m/I-REMW-01", "3000"
+    "rv64m/I-REMU-01", "3000",
+    "rv64m/I-REMUW-01", "3000",
+    "rv64m/I-REMW-01", "3000"
   };
 
   string tests64ic[] = '{