From a91c0c8fc714017f25d0aa6a4144e68ee4028efd Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Wed, 6 Oct 2021 08:26:09 -0500
Subject: [PATCH] Make changes to fpdiv - still working on clock issue with fsm
 that was changed from posedge to negedge - also updated fpdivsqrt rounding to
 handle testfloat

---
 wally-pipelined/src/fpu/convert_inputs.sv |  29 +-
 wally-pipelined/src/fpu/exception_div.sv  |  27 +-
 wally-pipelined/src/fpu/fpdiv.sv          | 155 +++---
 wally-pipelined/src/fpu/fpu.sv            | 613 ++++++++++------------
 wally-pipelined/src/fpu/fregfile.sv       |  33 +-
 wally-pipelined/src/fpu/fsm.sv            | 146 +++---
 wally-pipelined/src/fpu/rounder_div.sv    | 109 ++--
 wally-pipelined/src/fpu/sbtm_a0.sv        |  29 +-
 wally-pipelined/src/fpu/sbtm_a1.sv        |  29 +-
 wally-pipelined/src/fpu/sbtm_a2.sv        |  29 +-
 wally-pipelined/src/fpu/sbtm_a3.sv        |  27 +-
 wally-pipelined/src/fpu/sbtm_div.sv       |  24 +
 wally-pipelined/src/fpu/sbtm_sqrt.sv      |  24 +
 13 files changed, 698 insertions(+), 576 deletions(-)

diff --git a/wally-pipelined/src/fpu/convert_inputs.sv b/wally-pipelined/src/fpu/convert_inputs.sv
index bf56cb006..9a0584baa 100755
--- a/wally-pipelined/src/fpu/convert_inputs.sv
+++ b/wally-pipelined/src/fpu/convert_inputs.sv
@@ -1,9 +1,26 @@
-// This module takes as inputs two operands (op1 and op2) 
-// the operation type (op_type) and the result precision (P). 
-// Based on the operation and precision , it conditionally
-// converts single precision values to double precision values
-// and modifies the sign of op1. The converted operands are Float1
-// and Float2.
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Conditionally convert single-precision operands to double precision and adjust the sign of op1
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
 module convert_inputs(
    input [63:0]  op1,      // 1st input operand (A)
diff --git a/wally-pipelined/src/fpu/exception_div.sv b/wally-pipelined/src/fpu/exception_div.sv
index 374320683..3e701d2fb 100755
--- a/wally-pipelined/src/fpu/exception_div.sv
+++ b/wally-pipelined/src/fpu/exception_div.sv
@@ -23,9 +23,10 @@ module exception_div (
    logic 	      BNaN; 		// '1' if B is a not-a-number
    logic 	      ASNaN;	 	// '1' if A is a signalling not-a-number
    logic 	      BSNaN;	 	// '1' if B is a signalling not-a-number
-   logic 	      ZQNaN;	 	// '1' if result Z is a quiet NaN
+   logic 	      ZSNaN;	 	// '1' if result Z is a quiet NaN
    logic 	      ZInf;	 	// '1' if result Z is an infnity
-   logic 	      Zero;             // '1' if result is zero   
+   logic 	      Zero;             // '1' if result is zero
+   logic              NegSqrt;          // '1' if sqrt and operand is negative   
    
    //***take this module out and add more registers or just recalculate it all
    // Determine if mantissas are all zeros
@@ -48,32 +49,34 @@ module exception_div (
    assign AZero = AzeroE & AzeroM;
    assign BZero = BzeroE & BzeroE;
 
+   // Result is NaN if the operation is a sqrt (op_type = 1) and the operand is negative and nonzero
+   assign NegSqrt = (A[63] & op_type & ~AZero);
+
    // An "Invalid Operation" exception occurs if (A or B is a signalling NaN)
    // or (A and B are both Infinite)
    assign Invalid = ASNaN | BSNaN | (((AInf & BInf) | (AZero & BZero))&~op_type) | 
-		    (A[63] & op_type);
-
+		    NegSqrt;
 
    // The result is a quiet NaN if (an "Invalid Operation" exception occurs) 
    // or (A is a NaN) or (B is a NaN).
-   assign ZQNaN = Invalid | ANaN | BNaN;
+   assign ZSNaN = Invalid | ANaN | BNaN;
 
    //  The result is zero
    assign Zero = (AZero | BInf)&~op_type | AZero&op_type;   
 
    // The result is +Inf if ((A is Inf) or (B is 0)) and (the
    // result is not a quiet NaN).  
-   assign ZInf = (AInf | BZero)&~ZQNaN&~op_type | AInf&op_type&~ZQNaN;   
+   assign ZInf = (AInf | BZero)&~ZSNaN&~op_type | AInf&op_type&~ZSNaN;   
 
    // Set the type of the result as follows:
    // Ztype	Result 
    //  000     Normal
-   //  001     Quiet NaN
    //  010     Infinity
    //  011     Zero
-   //  110     DivZero
-   assign Ztype[0] = ZQNaN | Zero;
-   assign Ztype[1] = ZInf | Zero;
-   assign Ztype[2] = BZero&~op_type;   
-
+   //  110     Div by 0 (not produced by the assignments below)
+   //  111     NaN (ZSNaN = Invalid | ANaN | BNaN)
+   assign Ztype[2] = (ZSNaN);
+   assign Ztype[1] = (ZSNaN) | (Zero) | (ZInf);
+   assign Ztype[0] = (ZSNaN) | (Zero);
+   
 endmodule // exception
diff --git a/wally-pipelined/src/fpu/fpdiv.sv b/wally-pipelined/src/fpu/fpdiv.sv
index a2534149f..0a937b5b0 100755
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@@ -1,92 +1,86 @@
+///////////////////////////////////////////
 //
-// File name : fpdiv
-// Title     : Floating-Point Divider/Square-Root
-// project   : FPU
-// Library   : fpdiv
-// Author(s) : James E. Stine, Jr.
-// Purpose   : definition of main unit to floating-point div/sqrt
-// notes :   
+// Written: James Stine
+// Modified: 8/1/2018
 //
-// Copyright Oklahoma State University
+// Purpose: Floating point divider/square root top unit (Goldschmidt)
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
-// Basic Operations
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
 //
-// Step 1: Load operands, set flags, and convert SP to DP
-// Step 2: Check for special inputs ( +/- Infinity,  NaN)
-// Step 3: Exponent Logic
-// Step 4: Divide/Sqrt using Goldschmidt
-// Step 5: Normalize the result.//
-//   Shift left until normalized.  Normalized when the value to the 
-//   left of the binrary point is 1.
-// Step 6: Round the result.// 
-// Step 7: Put quotient/remainder onto output.
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
 // `timescale 1ps/1ps
 module fpdiv (
-   input logic 	      clk,
-   input logic 	      reset,
-   input logic 	      start,
-   input logic [63:0]   op1,		// 1st input operand (A)
-   input logic [63:0]   op2,		// 2nd input operand (B)
-   input logic [1:0]    rm,		// Rounding mode - specify values 
-   input logic 	      op_type,	// Function opcode
-   input logic 	      P,   		// Result Precision (0 for double, 1 for single)
-   input logic 	      OvEn,		// Overflow trap enabled
-   input logic 	      UnEn,   	// Underflow trap enabled
-   output logic         done,
-   output logic         FDivBusyE,
-   output logic [63:0]  AS_Result,	// Result of operation
-   output logic [4:0]   Flags);   	// IEEE exception flags 
+  input logic 	      clk,
+  input logic 	      reset,
+  input logic 	      start,
+  input logic [63:0]  op1, 
+  input logic [63:0]  op2, 
+  input logic [1:0]   rm, 
+  input logic 	      op_type, 
+  input logic 	      P, 
+  input logic 	      OvEn, 
+  input logic 	      UnEn,
+  input logic 	      XNaNQ,
+  input logic 	      YNaNQ,
+  input logic 	      XZeroQ,
+  input logic 	      YZeroQ,
+  input logic 	      XInfQ,
+  input logic 	      YInfQ, 
 
-
-   logic [63:0]   Float1; 
-   logic [63:0] 	Float2;
+  output logic 	      done,
+  output logic 	      FDivBusyE,
+  output logic [63:0] AS_Result, 
+  output logic [4:0]  Flags);
    
-   logic [12:0] 	exp1, exp2, expF;
-   logic [12:0] 	exp_diff, bias;
-   logic [13:0] 	exp_sqrt;
-   logic [12:0] 	exp_s;
-   logic [12:0] 	exp_c;
+   logic [63:0]       Float1; 
+   logic [63:0]       Float2;
    
-   logic [10:0] 	exponent;
-   logic [63:0] 	Result;   
-   logic [52:0] 	mantissaA;
-   logic [52:0] 	mantissaB; 
+   logic [12:0]       exp1, exp2, expF;
+   logic [12:0]       exp_diff, bias;
+   logic [13:0]       exp_sqrt;
+   logic [63:0]       Result;   
+   logic [52:0]       mantissaA;
+   logic [52:0]       mantissaB; 
    
-   logic [2:0] 	sel_inv;
-   logic		      Invalid;
-   logic [4:0] 	FlagsIn;   	
+   logic [2:0] 	      sel_inv;
+   logic 	      Invalid;
+   logic [4:0] 	      FlagsIn;   	
    logic 	      signResult;      
    logic 	      convert;
-   logic          sub;
+   logic 	      sub;
    
-   logic [63:0] 	q1, qm1, qp1, q0, qm0, qp0;
-   logic [63:0] 	rega_out, regb_out, regc_out, regd_out;
-   logic [127:0]  regr_out;
-   logic [2:0] 	sel_muxa, sel_muxb;
+   logic [63:0]       q1, qm1, qp1, q0, qm0, qp0;
+   logic [63:0]       rega_out, regb_out, regc_out, regd_out;
+   logic [127:0]      regr_out;
+   logic [2:0] 	      sel_muxa, sel_muxb;
    logic 	      sel_muxr;   
    logic 	      load_rega, load_regb, load_regc, load_regd, load_regr;
-
-   logic 	      load_regs;
-   logic          exp_cout1, exp_cout2;
-   logic          exp_odd, open;
    
-   // div/sqrt
-         //  fdiv  = 0
-         //  fsqrt = 1
+   logic 	      load_regs;
+   logic 	      exp_cout1, exp_cout2;
+   logic 	      exp_odd, open;
+   
+   //  op_type : fdiv=0, fsqrt=1
    assign Float1 = op1;
    assign Float2 = op_type ? op1 : op2;   
-
-   // Test for exceptions and return the "Invalid Operation" and
-   // "Denormalized" Input Flags. The "sel_inv" is used in
-   // the third pipeline stage to select the result. Also, op1_Norm
-   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
-   // sub is one if the effective operation is subtaction.   
-   exception_div exc1 (.A(Float1), .B(Float2), .op_type,
-                     // output:
-                     .Ztype(sel_inv), .Invalid);
-
+   
+   // Exception detection
+   exception_div exc1 (.A(Float1), .B(Float2), .op_type, .Ztype(sel_inv), .Invalid);
+   
    // Determine Sign/Mantissa
    assign signResult = (Float1[63]^Float2[63]);
    assign mantissaA = {1'b1, Float1[51:0]};
@@ -103,29 +97,30 @@ module fpdiv (
    assign {exp_cout2, exp_sqrt} = {1'b0, exp1} + {4'h0, 10'h3ff} + exp_odd;
    // Choose correct exponent
    assign expF = op_type ? exp_sqrt[13:1] : exp_diff;   
-
+   
    // Main Goldschmidt/Division Routine   
    divconv goldy (.q1, .qm1, .qp1, .q0, .qm0, .qp0, .rega_out, .regb_out, .regc_out, .regd_out,
 		  .regr_out, .d(mantissaB), .n(mantissaA), .sel_muxa, .sel_muxb, .sel_muxr, 
 		  .reset, .clk,  .load_rega, .load_regb, .load_regc, .load_regd,
 		  .load_regr, .load_regs, .P, .op_type, .exp_odd);
-
+   
    // FSM : control divider   
    fsm control (.clk, .reset, .start, .op_type,
-               // outputs:
-               .done, .load_rega, .load_regb, .load_regc, .load_regd, 
-		         .load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
-		         .divBusy(FDivBusyE));
+		.done, .load_rega, .load_regb, .load_regc, .load_regd, 
+		.load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
+		.divBusy(FDivBusyE));
    
    // Round the mantissa to a 52-bit value, with the leading one
    // removed. The rounding units also handles special cases and 
    // set the exception flags.   
    rounder_div round1 (.rm, .P, .OvEn, .UnEn, .exp_diff(expF), 
-   		            .sel_inv, .Invalid, .SignR(signResult), 
-		               .q1, .qm1, .qp1, .q0, .qm0, .qp0, .regr_out, 
-                     // outputs:
-                     .Result, .Flags(FlagsIn));
-
+   		       .sel_inv, .Invalid, .SignR(signResult),
+		       .Float1(op1), .Float2(op2),
+		       .XNaNQ, .YNaNQ, .XZeroQ, .YZeroQ, 
+		       .XInfQ, .YInfQ, .op_type,		       
+		       .q1, .qm1, .qp1, .q0, .qm0, .qp0, .regr_out, 
+                       .Result, .Flags(FlagsIn));
+   
    // Store the final result and the exception flags in registers.
    flopenr #(64) rega (clk, reset, done, Result, AS_Result);  
    flopenr #(5) regc (clk, reset, done, FlagsIn, Flags);   
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index cadfafae0..34aa3edd3 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -1,6 +1,6 @@
 ///////////////////////////////////////////
 //
-// Written: Katherine Parry, Bret Mathis
+// Written: Katherine Parry, James Stine, Brett Mathis
 // Modified: 6/23/2021
 //
 // Purpose: FPU
@@ -25,24 +25,24 @@
 `include "wally-config.vh"
 
 module fpu (
-  input logic 		          clk,
-  input logic 		          reset,
-  input logic  [2:0] 	      FRM_REGW, // Rounding mode from CSR
-  input logic  [31:0]       InstrD,   // instruction from IFU
-  input logic  [`XLEN-1:0]  ReadDataW,// Read data from memory
-  input logic  [`XLEN-1:0]  SrcAE,    // Integer input being processed (from IEU)
-  input logic  [`XLEN-1:0]  SrcAM,    // Integer input being written into fpreg (from IEU)
-  input logic 		          StallE, StallM, StallW, // stall signals from HZU
-  input logic 		          FlushE, FlushM, FlushW, // flush signals from HZU
-  input logic  [4:0] 	      RdE, RdM, RdW,  // which FP register to write to (from IEU)
-  output logic 		          FRegWriteM,     // FP register write enable
-  output logic 		          FStallD,        // Stall the decode stage
-  output logic 		          FWriteIntE, FWriteIntM, FWriteIntW, // integer register write enable
-  output logic [`XLEN-1:0]  FWriteDataE,  // Data to be written to memory
-  output logic [`XLEN-1:0]  FIntResM,     // data to be written to integer register
-  output logic 		          FDivBusyE,    // Is the divide/sqrt unit busy (stall execute stage)
-  output logic 		          IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
-  output logic [4:0] 	      SetFflagsM        // FMA flags (to privileged unit)
+  input logic 		   clk,
+  input logic 		   reset,
+  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
+  input logic [31:0] 	   InstrD, // instruction from IFU
+  input logic [`XLEN-1:0]  ReadDataW,// Read data from memory
+  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed (from IEU)
+  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg (from IEU)
+  input logic 		   StallE, StallM, StallW, // stall signals from HZU
+  input logic 		   FlushE, FlushM, FlushW, // flush signals from HZU
+  input logic [4:0] 	   RdE, RdM, RdW, // which FP register to write to (from IEU)
+  output logic 		   FRegWriteM, // FP register write enable
+  output logic 		   FStallD, // Stall the decode stage
+  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW, // integer register write enable
+  output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
+  output logic [`XLEN-1:0] FIntResM, // data to be written to integer register
+  output logic 		   FDivBusyE, // Is the divide/sqrt unit busy (stall execute stage)
+  output logic 		   IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+  output logic [4:0] 	   SetFflagsM        // FMA flags (to privileged unit)
   );
 
   //*** make everything FLEN at some point
@@ -59,338 +59,257 @@ module fpu (
   
   generate if (`F_SUPPORTED | `D_SUPPORTED) begin : fpu
 
-  // control signals
-	logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
-	logic [2:0] FrmD, FrmE, FrmM;                   // FP rounding mode
-	logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
-	logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
-	logic 		  FWriteIntD;                         // Write to integer register
-	logic [1:0] FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
-	logic [1:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
-	logic [2:0] FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which opperation to do in each component
-	logic [2:0] FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
-	logic [1:0] FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer resister
-	logic [4:0] Adr1E, Adr2E, Adr3E;                    // adresses of each input
+     // control signals
+     logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
+     logic [2:0] 	  FrmD, FrmE, FrmM;                   // FP rounding mode
+     logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
+     logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
+     logic 		  FWriteIntD;                         // Write to integer register
+     logic [1:0] 	  FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
+     logic [1:0] 	  FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
+     logic [2:0] 	  FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which opperation to do in each component
+     logic [2:0] 	  FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
+     logic [1:0] 	  FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer resister
+     logic [4:0] 	  Adr1E, Adr2E, Adr3E;                    // adresses of each input
+     
+     // regfile signals
+     logic [63:0] 	  FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
+     logic [63:0] 	  FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
+     logic [63:0] 	  FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
+     logic [63:0] 	  FPreSrcYE, FSrcYE;               // Input 2 to the various units (after forwarding)
+     logic [63:0] 	  FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
+     
+     // unpacking signals
+     logic 		  XSgnE, YSgnE, ZSgnE;     // input's sign - execute stage
+     logic 		  XSgnM, YSgnM;     // input's sign - memory stage
+     logic [10:0] 	  XExpE, YExpE, ZExpE;     // input's exponent - execute stage
+     logic [10:0] 	  XExpM, YExpM, ZExpM;     // input's exponent - memory stage
+     logic [52:0] 	  XManE, YManE, ZManE;  // input's fraction - execute stage
+     logic [52:0] 	  XManM, YManM, ZManM;  // input's fraction - memory stage
+     logic [10:0] 	  BiasE;                   // bias based on precision (single=7f double=3ff - max expoent/2)
+     logic 		  XNaNE, YNaNE, ZNaNE;           // is the input a NaN - execute stage
+     logic 		  XNaNM, YNaNM, ZNaNM;           // is the input a NaN - memory stage
+     logic 		  XSNaNE, YSNaNE, ZSNaNE;        // is the input a signaling NaN - execute stage
+     logic 		  XSNaNM, YSNaNM, ZSNaNM;        // is the input a signaling NaN - memory stage
+     logic 		  XDenormE, YDenormE, ZDenormE;  // is the input denormalized
+     logic 		  XZeroE, YZeroE, ZZeroE;        // is the input zero - execute stage
+     logic 		  XZeroM, YZeroM, ZZeroM;        // is the input zero - memory stage
+     logic 		  XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
+     logic 		  XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
+     logic 		  XExpMaxE;                      // is the exponent all ones (max value)
+     logic 		  XNormE;                 // is normal     
+     
+     // result and flag signals
+     logic [63:0] 	  FDivResM, FDivResW; // divide/squareroot result
+     logic [4:0] 	  FDivFlgM, FDivFlgW; // divide/squareroot flags  
+     logic [63:0] 	  FMAResM, FMAResW;   // FMA/multiply result
+     logic [4:0] 	  FMAFlgM, FMAFlgW;   // FMA/multiply result	
+     logic [63:0] 	  ReadResW;           // read result (load instruction)
+     logic [63:0] 	  CvtFpResE, CvtFpResM, CvtFpResW; // add/FP -> FP convert result
+     logic [4:0] 	  CvtFpFlgE, CvtFpFlgM, CvtFpFlgW; // add/FP -> FP convert flags
+     logic [63:0] 	  CvtResE, CvtResM;   // FP <-> int convert result
+     logic [4:0] 	  CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this	
+     logic [63:0] 	  ClassResE, ClassResM; // classify result
+     logic [63:0] 	  CmpResE, CmpResM; // compare result
+     logic 		  CmpNVE, CmpNVM;   // compare invalid flag (Not Valid)     
+     logic [63:0] 	  SgnResE, SgnResM; // sign injection result
+     logic 		  SgnNVE, SgnNVM;   // sign injection invalid flag (Not Valid)     
+     logic [63:0] 	  FResE, FResM, FResW;     // selected result that is ready in the memory stage
+     logic [4:0] 	  FFlgE, FFlgM;            // selected flag that is ready in the memory stage     
+     logic [`XLEN-1:0] 	  FIntResE;     
+     logic [63:0] 	  FPUResultW;    // final FP result being written to the FP register
+     
+     // other signals
+     logic 		  FDivSqrtDoneE;          // is divide done
+     logic [63:0] 	  DivInput1E, DivInput2E; // inputs to divide/squareroot unit
+     logic 		  FDivClk;                // clock for divide/squareroot unit
+     logic [63:0] 	  AlignedSrcAE;           // align SrcA to the floating point format
+
+     // DECODE STAGE
+     // calculate FP control signals
+     fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
+		  .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+		  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
 	
-	// regfile signals
-	logic [63:0] 	    FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
-	logic [63:0] 	    FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
-	logic [63:0] 	    FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
-	logic [63:0] 	    FPreSrcYE, FSrcYE;               // Input 2 to the various units (after forwarding)
-	logic [63:0] 	    FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
-	
-	// unpacking signals
-	logic 		   XSgnE, YSgnE, ZSgnE;     // input's sign - execute stage
-	logic 		   XSgnM, YSgnM;     // input's sign - memory stage
-	logic [10:0] XExpE, YExpE, ZExpE;     // input's exponent - execute stage
-	logic [10:0] XExpM, YExpM, ZExpM;     // input's exponent - memory stage
-	logic [52:0] XManE, YManE, ZManE;  // input's fraction - execute stage
-	logic [52:0] XManM, YManM, ZManM;  // input's fraction - memory stage
-	logic [10:0] BiasE;                   // bias based on precision (single=7f double=3ff - max expoent/2)
-	logic 		   XNaNE, YNaNE, ZNaNE;           // is the input a NaN - execute stage
-	logic 		   XNaNM, YNaNM, ZNaNM;           // is the input a NaN - memory stage
-	logic 		   XSNaNE, YSNaNE, ZSNaNE;        // is the input a signaling NaN - execute stage
-	logic 		   XSNaNM, YSNaNM, ZSNaNM;        // is the input a signaling NaN - memory stage
-	logic 		   XDenormE, YDenormE, ZDenormE;  // is the input denormalized
-	logic 		   XZeroE, YZeroE, ZZeroE;        // is the input zero - execute stage
-	logic 		   XZeroM, YZeroM, ZZeroM;        // is the input zero - memory stage
-	logic 		   XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
-	logic 		   XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
-	logic 		   XExpMaxE;                      // is the exponent all ones (max value)
-	logic 		   XNormE;                 // is normal
-	
-	
-	// result and flag signals
-	logic [63:0]  FDivResM, FDivResW; // divide/squareroot result
-	logic [4:0] 	FDivFlgM, FDivFlgW; // divide/squareroot flags
+     // FP register file
+     //    - can read 3 registers and write 1 register every cycle
+     fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
+			.a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
+			.wd4(FPUResultW),
+			.rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
+
+     // D/E pipeline registers
+     flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
+     flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
+     flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
+     flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+                             {Adr1E, Adr2E, Adr3E});
+     flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+			       {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
+			       {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
+
+     // EXECUTION STAGE
+     // Hazard unit for FPU  
+     //    - determines if any forwarding or stalls are needed
+     fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
+                     .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
+     
+     // forwarding muxs
+     mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
+     mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
+     mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
+     mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
+			   {2'b0, {10{1'b1}}, 52'b0}, 
+			   {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01)}, 
+			   FSrcYE); // Force Z to be 0 for multiply instructions
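+     // Force Z to be 0 for multiply instructions (NOTE(review): the same comment on fyaddmux above looks stale - that mux substitutes the constant 1.0 for Y)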
+     mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
+       
+     // unpacking unit
+     //    - splits FP inputs into their various parts
+     //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infinity)
+     unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
+			 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+			 .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
+			 .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
+     
+     // FMA
+     //   - two stage FMA
+     //   - execute stage - multiplication and addend shifting
+     //   - memory stage  - addition and rounding
+     //   - handles FMA and multiply instructions
+     fma fma (.clk, .reset, .FlushM, .StallM, 
+	      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+	      .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
+	      .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
+	      .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
+	      .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
+	      .FOpCtrlE,
+	      .FmtE, .FmtM, .FrmM, 
+	      .FMAFlgM, .FMAResM);
+     
+     // clock gater
+     //    - creates a clock that only runs during divide/sqrt instructions
+     //    - using the separate clock gives the divide/sqrt unit some time to get set up
+     // *** the module says not to use in synthesis
+     clockgater fpdivclkg(.E(FDivStartE),
+			  .SE(1'b0),
+			  .CLK(clk),
+			  .ECLK(FDivClk));
+     
+     // capture the inputs for divide/sqrt
+     //    - if not captured, any forwarded inputs will change during computation
+     //        - this problem is caused by stalling the execute stage
+     //    - the other units don't have this problem, only div/sqrt stalls the execute stage
+     flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
+				.en(1'b1), .clear(FDivSqrtDoneE),
+				.reset(reset),  .clk(FDivBusyE));
+     flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
+				.en(1'b1), .clear(FDivSqrtDoneE),
+				.reset(reset),  .clk(FDivBusyE));
+      flopenrc #(6) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE}), 
+				.q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ}),
+				.en(1'b1), .clear(FDivSqrtDoneE),
+				.reset(reset),  .clk(FDivBusyE));
+            
+      // fpdivsqrt using Goldschmidt's iteration
+      fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
+		      .reset, .clk(FDivClk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
+		      .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ,
+		      .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
+
+     // convert from single to double and vice versa
+     cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
+     
+     // compare unit
+     //    - computation is done in one stage
+     //    - writes to FP file during min/max instructions
+     //    - other comparisons write a 1 or 0 to the integer register
+     fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
+		.FSrcXE, .FSrcYE, .FOpCtrlE, 
+		.FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
+		.Invalid(CmpNVE), .CmpResE);
+     
+     // sign injection unit
+     //    - computation is done in one stage
+     fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
+		.SgnNVE, .SgnResE);
+     
+     // classify
+     //    - computation is done in one stage
+     //    - most of the work is done in the unpacking unit
+     //    - result is written to the integer register
+     fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
+			  .XSNaNE, .ClassResE);
+     
+     fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
+		.CvtResE, .CvtFlgE);
+     
+     // data to be stored in memory - to IEU
+     //    - FP uses NaN-boxing format
+     //        - if there are any unused bits the most significant bits are filled with 1s
+     assign FWriteDataE = FSrcYE[`XLEN-1:0];     
+     
+     // Align SrcA to MSB when single precision
+     mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAE[31:0]}, {{64-`XLEN{1'b1}}, SrcAE}, FmtE, AlignedSrcAE);
+     
+     // select a result that may be written to the FP register
+     mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
+     mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
+     
+     // select the result that may be written to the integer register - to IEU
+     mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], 
+			       CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
+     
+     // E/M pipe registers
+
+     // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
+     flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
+     flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
+     flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
+     flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
+			     {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
+			     {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
+     flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); 
+     flopenrc #(5)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
+     flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
+     flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM,
+			      {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
+			      {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
+     
+     // BEGIN MEMORY STAGE
+     // FPU flag selection - to privileged
+     mux4  #(5)  FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
   
-	logic [63:0]  FMAResM, FMAResW;   // FMA/multiply result
-	logic [4:0] 	FMAFlgM, FMAFlgW;   // FMA/multiply result
-	
-	logic [63:0] 	ReadResW;           // read result (load instruction)
+     // M/W pipe registers
+     flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
+     flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
+     flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, CvtFpResM, CvtFpResW); 
+     flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
+     flopenrc #(5)  MWCtrlReg(clk, reset, FlushW, ~StallW,
+			      {FRegWriteM, FResultSelM, FmtM, FWriteIntM},
+			      {FRegWriteW, FResultSelW, FmtW, FWriteIntW});
+     
+     // BEGIN WRITEBACK STAGE
+     
+     // put ReadData into NaN-boxing format
+     //    - if there are any unused bits the most significant bits are filled with 1s
+     //    - for load instruction
+     mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
+     
+     // select the result to be written to the FP register
+     mux4  #(64)  FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
 
-	logic [63:0] 	CvtFpResE, CvtFpResM, CvtFpResW; // add/FP -> FP convert result
-	logic [4:0] 	CvtFpFlgE, CvtFpFlgM, CvtFpFlgW; // add/FP -> FP convert flags
-
-	logic [63:0] 	CvtResE, CvtResM;   // FP <-> int convert result
-	logic [4:0] 	CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this
-	
-	logic [63:0] 	ClassResE, ClassResM; // classify result
-
-	logic [63:0] 	CmpResE, CmpResM; // compare result
-	logic 		    CmpNVE, CmpNVM;   // compare invalid flag (Not Valid)
-	
-	logic [63:0] 	SgnResE, SgnResM; // sign injection result
-	logic 		    SgnNVE, SgnNVM;   // sign injection invalid flag (Not Valid)
-
-	logic [63:0] 	FResE, FResM, FResW;     // selected result that is ready in the memory stage
-	logic [4:0] 	FFlgE, FFlgM;            // selected flag that is ready in the memory stage
-
-	logic [`XLEN-1:0]  FIntResE;
-
-	logic [63:0] 	   FPUResultW;    // final FP result being written to the FP register
-		
-	// other signals
-	logic 		    FDivSqrtDoneE;          // is divide done
-	logic [63:0] 	DivInput1E, DivInput2E; // inputs to divide/squareroot unit
-	logic 		    FDivClk;                // clock for divide/squareroot unit
-	logic [63:0] 	AlignedSrcAE;           // align SrcA to the floating point format
-
-
-
-
-
-  ////////////////////////////////////////////////////////////////////////////////////////
-	//DECODE STAGE
-	////////////////////////////////////////////////////////////////////////////////////////
-
-
-
-	// calculate FP control signals
-	fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
-              // outputs:
-              .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
-              .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
-	
-	// FP register file
-  //    - can read 3 registers and write 1 register every cycle
-	fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
-			   .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
-         .wd4(FPUResultW),
-         // outputs:
-			   .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
-	
-
-
-
-
-	////////////////////////////////////////////////////////////////////////////////////////
-	// D/E pipeline registers
-	////////////////////////////////////////////////////////////////////////////////////////
-
-	flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
-	flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
-	flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-	flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-                                                       {Adr1E,         Adr2E,         Adr3E});
-	flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-				  {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
-				  {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
-	
-
-
-
-
-
-  
-	////////////////////////////////////////////////////////////////////////////////////////
-	//EXECUTION STAGE
-	////////////////////////////////////////////////////////////////////////////////////////
-
-
-	// Hazard unit for FPU  
-  //    - determines if any forwarding or stalls are needed
-	fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
-                  // outputs:
-                  .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
-	
-
-	// forwarding muxs
-	mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
-	mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
-	mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
-	mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, {2'b0, {10{1'b1}}, 52'b0}, {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01)}, FSrcYE); // Force Z to be 0 for multiply instructions
-	mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE); // Force Z to be 0 for multiply instructions
- 	
-   
-  // unpacking unit
-  //    - splits FP inputs into their various parts
-  //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity)
-	unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
-                      // outputs:
-                      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
-                      .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
-                      .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
-
-  // FMA
-  //    - two stage FMA
-  //        - execute stage - multiplication and addend shifting
-  //        - memory stage  - addition and rounding
-  //    - handles FMA and multiply instructions
-  //    - contains some E/M pipleine registers
-  // *** currently handles FLEN and 32 bits(dont know if 32 works with 128 - easy to fix) - change to handle only the supported formats
-	fma fma (.clk, .reset, .FlushM, .StallM, 
-		 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
-     .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
-		 .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
-     .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
-     .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
-		 .FOpCtrlE,
-		 .FmtE, .FmtM, .FrmM, 
-     // outputs:
-     .FMAFlgM, .FMAResM);
-	
-	// clock gater
-  //    - creates a clock that only runs durring divide/sqrt instructions
-  //    - using the seperate clock gives the divide/sqrt unit some to get set up
-  // *** the module says not to use in synthisis
-	clockgater fpdivclkg(.E(FDivStartE),
-			     .SE(1'b0),
-			     .CLK(clk),
-			     .ECLK(FDivClk));
-	
-	// capture the inputs for divide/sqrt
-  //    - if not captured any forwarded inputs will change durring computation
-  //        - this problem is caused by stalling the execute stage
-  //    - the other units don't have this problem, only div/sqrt stalls the execute stage
-	flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
-				   .en(1'b1), .clear(FDivSqrtDoneE),
-				   .reset(reset),  .clk(FDivBusyE));
-	flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
-				   .en(1'b1), .clear(FDivSqrtDoneE),
-				   .reset(reset),  .clk(FDivBusyE));
-	
-	// output for store instructions
-  //*** change to use the unpacking unit if possible
-	fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
-			             .reset, .clk(FDivClk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
-                   // outputs:
-			             .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
-	
-	// convert from signle to double and vice versa
-	cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
-	
-	// compare unit
-  //    - computation is done in one stage
-  //    - writes to FP file durring min/max instructions
-  //    - other comparisons write a 1 or 0 to the integer register
-	fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
-            .FSrcXE, .FSrcYE, .FOpCtrlE, 
-            .FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
-            // outputs:
-		        .Invalid(CmpNVE), .CmpResE);
-	
-	// sign injection unit
-  //    - computation is done in one stage
-	fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
-            // outputs:
-            .SgnNVE, .SgnResE);
-	
-	// classify
-  //    - computation is done in one stage
-  //    - most of the work is done in the unpacking unit
-  //    - result is written to the integer register
-	fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
-                      // outputs:
-                      .XSNaNE, .ClassResE);
-	
-	fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
-            // outputs: 
-            .CvtResE, .CvtFlgE);
-	
-	// data to be stored in memory - to IEU
-  //    - FP uses NaN-blocking format
-  //        - if there are any unsused bits the most significant bits are filled with 1s
-	assign FWriteDataE = FSrcYE[`XLEN-1:0];
-	
-
-	// Align SrcA to MSB when single precicion
-	mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAE[31:0]}, {{64-`XLEN{1'b1}}, SrcAE}, FmtE, AlignedSrcAE);
-
-  // select a result that may be written to the FP register
-	mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
-	mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
-	
-  // select the result that may be written to the integer register - to IEU
-	mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
-	
-
-
-  //***will synth remove registers of values that are always zero?
-	////////////////////////////////////////////////////////////////////////////////////////
-	// E/M pipe registers
-	////////////////////////////////////////////////////////////////////////////////////////
-
-	// flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
-	flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
-	flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
-	flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
-	flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
-				{XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
-				{XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});
-	
-	flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); 
-	flopenrc #(5)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM); 
-	
-	flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
-	// flopenrc #(1) EMRegSgnFlg(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
-
-	//flopenrc #(64) EMRegCvtFpRes(clk, reset, FlushM, ~StallM, CvtFpResE, CvtFpResM);
-	//flopenrc #(5) EMRegCvtFpFlg(clk, reset, FlushM, ~StallM, CvtFpFlgE, CvtFpFlgM);
-	
-	// flopenrc #(64) EMRegCvtRes(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
-	// flopenrc #(5) EMRegCvtFlg(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
-  
-	// flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
-	
-	flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM,
-				 {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
-				 {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
-	
-	
-
-
-
-
-	////////////////////////////////////////////////////////////////////////////////////////
-	//BEGIN MEMORY STAGE
-	////////////////////////////////////////////////////////////////////////////////////////
-
-
-  // FPU flag selection - to privileged
-	mux4  #(5)  FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
-	
-
-
-
-  
-	////////////////////////////////////////////////////////////////////////////////////////
-	// M/W pipe registers
-	////////////////////////////////////////////////////////////////////////////////////////
-	flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-	flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
-	flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, CvtFpResM, CvtFpResW); 
-	flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
-	flopenrc #(5)  MWCtrlReg(clk, reset, FlushW, ~StallW,
-				{FRegWriteM, FResultSelM, FmtM, FWriteIntM},
-				{FRegWriteW, FResultSelW, FmtW, FWriteIntW});
-	
-
-
-
-	////////////////////////////////////////////////////////////////////////////////////////
-	// BEGIN WRITEBACK STAGE
-	////////////////////////////////////////////////////////////////////////////////////////
-
-  // put ReadData into NaN-blocking format
-  //    - if there are any unsused bits the most significant bits are filled with 1s
-  //    - for load instruction
-	mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
-
-  // select the result to be written to the FP register
-	mux4  #(64)  FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
-	
-	
   end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low
-	assign FStallD = 0;
-	assign FWriteIntE = 0; 
-	assign FWriteIntM = 0;
-	assign FWriteIntW = 0;
-	assign FWriteDataE = 0;
-	assign FIntResM = 0;
-	assign FDivBusyE = 0;
-	assign IllegalFPUInstrD = 1;
-	assign SetFflagsM = 0;
+     assign FStallD = 0;
+     assign FWriteIntE = 0; 
+     assign FWriteIntM = 0;
+     assign FWriteIntW = 0;
+     assign FWriteDataE = 0;
+     assign FIntResM = 0;
+     assign FDivBusyE = 0;
+     assign IllegalFPUInstrD = 1;
+     assign SetFflagsM = 0;
   end
   endgenerate 
    
diff --git a/wally-pipelined/src/fpu/fregfile.sv b/wally-pipelined/src/fpu/fregfile.sv
index 4b001bc93..fd8e0f608 100644
--- a/wally-pipelined/src/fpu/fregfile.sv
+++ b/wally-pipelined/src/fpu/fregfile.sv
@@ -1,10 +1,9 @@
 ///////////////////////////////////////////
-// regfile.sv
 //
 // Written: David_Harris@hmc.edu 9 January 2021
-// Modified: 
+// Modified: James Stine 
 //
-// Purpose: 4-port register file
+// Purpose: 3-port output register file
 // 
 // A component of the Wally configurable RISC-V project.
 // 
@@ -26,22 +25,20 @@
 `include "wally-config.vh"
 
 module fregfile (
-  input  logic        clk, reset,
-  input  logic        we4, 
-  input  logic [ 4:0] a1, a2, a3, a4, 
-  input  logic [63:0] wd4,
+  input logic 	      clk, reset,
+  input logic 	      we4, 
+  input logic [4:0]   a1, a2, a3, a4, 
+  input logic [63:0]  wd4,
   output logic [63:0] rd1, rd2, rd3);
-
-  logic [63:0] rf[31:0];
-  integer i;
-
-  // three ported register file
-  // read three ports combinationally (A1/RD1, A2/RD2, A3/RD3)
-  // write fourth port on rising edge of clock (A4/WD4/WE4)
-  // write occurs on falling edge of clock
-  
-  // reset is intended for simulation only, not synthesis
-    
+   
+   logic [63:0]       rf[31:0];
+   integer 	      i;
+   
+   // three ported register file
+   // read three ports combinationally (A1/RD1, A2/RD2, A3/RD3)
+   // write fourth port (A4/WD4/WE4) when we4 is asserted
+   // write occurs on falling edge of clock
+   
    always_ff @(negedge clk or posedge reset)
      if (reset) for(i=0; i<32; i++) rf[i] <= 0;
      else if (we4) rf[a4] <= wd4;	
diff --git a/wally-pipelined/src/fpu/fsm.sv b/wally-pipelined/src/fpu/fsm.sv
index 00f959930..a0e874bc7 100755
--- a/wally-pipelined/src/fpu/fsm.sv
+++ b/wally-pipelined/src/fpu/fsm.sv
@@ -1,49 +1,63 @@
-module fsm (
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 9/28/2021
+//
+// Purpose: FSM for floating point divider/square root unit (Goldschmidt)
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
-   input logic 			clk,
-   input logic 			reset,
-   input logic 			start,
-   input logic  		op_type,
-   output logic 		done,      // End of cycles
-   output logic 		load_rega, // enable for regA
-   output logic 		load_regb, // enable for regB
-   output logic 		load_regc, // enable for regC
-   output logic 		load_regd, // enable for regD
-   output logic 		load_regr, // enable for rem
-   output logic 		load_regs, // enable for q,qm,qp 
-   output logic [2:0] 	sel_muxa,  // Select muxA
-   output logic [2:0] 	sel_muxb,  // Select muxB
-   output logic 		sel_muxr,  // Select rem mux
-   output logic			divBusy	   // calculation is happening
+module fsm (
+   input logic 	      clk,
+   input logic 	      reset,
+   input logic 	      start,
+   input logic 	      op_type,
+   output logic       done, 
+   output logic       load_rega, 
+   output logic       load_regb, 
+   output logic       load_regc, 
+   output logic       load_regd,
+   output logic       load_regr,
+   output logic       load_regs,
+   output logic [2:0] sel_muxa, 
+   output logic [2:0] sel_muxb, 
+   output logic       sel_muxr, 
+   output logic       divBusy	   
    );
 
-
-   reg [4:0] 	CURRENT_STATE;
-   reg [4:0] 	NEXT_STATE;   
-
-   parameter [4:0] 
-     S0=5'd0, S1=5'd1, S2=5'd2,
-     S3=5'd3, S4=5'd4, S5=5'd5,
-     S6=5'd6, S7=5'd7, S8=5'd8,
-     S9=5'd9, S10=5'd10,
-     S13=5'd13, S14=5'd14, S15=5'd15,     
-     S16=5'd16, S17=5'd17, S18=5'd18,
-     S19=5'd19, S20=5'd20, S21=5'd21,
-     S22=5'd22, S23=5'd23, S24=5'd24,
-     S25=5'd25, S26=5'd26, S27=5'd27,
-     S28=5'd28, S29=5'd29, S30=5'd30;
+   typedef enum       logic [4:0] {S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
+				   S10, S11, S12, S13, S14, S15, S16, S17, S18, S19,
+				   S20, S21, S22, S23, S24, S25, S26, S27, S28, S29,
+				   S30} statetype;
+   
+   statetype current_state, next_state;
    
    always @(negedge clk)
      begin
-	if(reset==1'b1)
-	  CURRENT_STATE=S0;
+	if (reset == 1'b1)
+	  current_state = S0;
 	else
-	  CURRENT_STATE=NEXT_STATE;
+	  current_state = next_state;
      end
 
    always @(*)
      begin
- 	case(CURRENT_STATE)
+ 	case(current_state)
 	  S0:  // iteration 0
 	    begin
 	       if (start==1'b0)
@@ -59,7 +73,7 @@ module fsm (
 		    sel_muxa = 3'b000;
 		    sel_muxb = 3'b000;
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S0;
+		    next_state = S0;
 		 end 
 	       else if (start==1'b1 && op_type==1'b0) 
 		 begin
@@ -74,7 +88,7 @@ module fsm (
 		    sel_muxa = 3'b001;
 		    sel_muxb = 3'b001;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S1;
+		    next_state = S1;
 		 end // if (start==1'b1 && op_type==1'b0)
 	       else if (start==1'b1 && op_type==1'b1) 
 		 begin
@@ -89,7 +103,7 @@ module fsm (
 		    sel_muxa = 3'b010;
 		    sel_muxb = 3'b000;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S13;
+		    next_state = S13;
 		 end 	   
 	       else
 		 begin
@@ -104,7 +118,7 @@ module fsm (
 		    sel_muxa = 3'b000;
 		    sel_muxb = 3'b000;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S0;
+		    next_state = S0;
 		 end
 	    end // case: S0
 	  S1:
@@ -120,7 +134,7 @@ module fsm (
 	       sel_muxa = 3'b010;
 	       sel_muxb = 3'b000;		    
 	       sel_muxr = 1'b0;	
-	       NEXT_STATE = S2;
+	       next_state = S2;
 	    end	  
 	  S2: // iteration 1
 	    begin
@@ -135,7 +149,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S3;
+	       next_state = S3;
 	    end
 	  S3:
 	    begin
@@ -150,7 +164,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S4;
+	       next_state = S4;
 	    end
 	  S4: // iteration 2
 	    begin
@@ -165,7 +179,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S5;
+	       next_state = S5;
 	    end
 	  S5:
 	    begin
@@ -180,7 +194,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;  // add
-	       NEXT_STATE = S6;
+	       next_state = S6;
 	    end
 	  S6: // iteration 3
 	    begin
@@ -195,7 +209,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S8;
+	       next_state = S8;
 	    end
 	  S7:
 	    begin
@@ -210,7 +224,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S8;
+	       next_state = S8;
 	    end // case: S7
 	  S8: // q,qm,qp
 	    begin
@@ -225,7 +239,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S9;
+	       next_state = S9;
 	    end 
 	  S9:  // rem
 	    begin
@@ -240,7 +254,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b1;
-	       NEXT_STATE = S10;
+	       next_state = S10;
 	    end 	  
 	  S10:  // done
 	    begin
@@ -255,7 +269,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end 
 	  S13:  // start of sqrt path
 	    begin
@@ -270,7 +284,7 @@ module fsm (
 	       sel_muxa = 3'b010;
 	       sel_muxb = 3'b001;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S14;
+	       next_state = S14;
 	    end
 	  S14:  
 	    begin
@@ -285,7 +299,7 @@ module fsm (
 	       sel_muxa = 3'b001;
 	       sel_muxb = 3'b100;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S15;
+	       next_state = S15;
 	    end 
 	  S15:  // iteration 1
 	    begin
@@ -300,7 +314,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S16;
+	       next_state = S16;
 	    end
 	  S16:  
 	    begin
@@ -315,7 +329,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S17;
+	       next_state = S17;
 	    end
 	  S17:  
 	    begin
@@ -330,7 +344,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S18;
+	       next_state = S18;
 	    end
 	  S18:  // iteration 2
 	    begin
@@ -345,7 +359,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S19;
+	       next_state = S19;
 	    end
 	  S19:  
 	    begin
@@ -360,7 +374,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S20;
+	       next_state = S20;
 	    end
 	  S20:  
 	    begin
@@ -375,7 +389,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S21;
+	       next_state = S21;
 	    end
 	  S21:  // iteration 3
 	    begin
@@ -390,7 +404,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S22;
+	       next_state = S22;
 	    end
 	  S22:  
 	    begin
@@ -405,7 +419,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S23;
+	       next_state = S23;
 	    end
 	  S23:  
 	    begin
@@ -420,7 +434,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S24;
+	       next_state = S24;
 	    end 
 	  S24: // q,qm,qp
 	    begin
@@ -435,7 +449,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S25;
+	       next_state = S25;
 	    end 	  
 	  S25:  // rem
 	    begin
@@ -450,7 +464,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b110;
 	       sel_muxr = 1'b1;
-	       NEXT_STATE = S26;
+	       next_state = S26;
 	    end 
 	  S26:  // done
 	    begin
@@ -465,7 +479,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end 
 	  default: 
 	    begin
@@ -480,9 +494,9 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end
-	endcase // case(CURRENT_STATE)	
-     end // always @ (CURRENT_STATE or X)   
+	endcase // case(current_state)	
+     end // always @(*)
 
 endmodule // fsm
diff --git a/wally-pipelined/src/fpu/rounder_div.sv b/wally-pipelined/src/fpu/rounder_div.sv
index ff7c4830f..1d2ff1cc3 100755
--- a/wally-pipelined/src/fpu/rounder_div.sv
+++ b/wally-pipelined/src/fpu/rounder_div.sv
@@ -1,37 +1,55 @@
+///////////////////////////////////////////
 //
-// The rounder takes as inputs a 64-bit value to be rounded, A, the 
-// exponent of the value to be rounded, the sign of the final result, Sign, 
-// the precision of the results, P, and the two-bit rounding mode, rm. 
-// It produces a rounded 52-bit result, Z, the exponent of the rounded 
-// result, Z_exp, and a flag that indicates if the result was rounded,
-// Inexact. The rounding mode has the following values.
-//	    rm		Mode
-//      00 		round-to-nearest-even
-//	    01 		round-toward-zero
-//      10 		round-toward-plus infinity
-//      11  	round-toward-minus infinity
+// Written: James Stine
+// Modified: 8/1/2018
 //
+// Purpose: Floating point divider/square root rounder unit (Goldschmidt)
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
 module rounder_div (
-    input logic [1:0]   rm,
-    input logic         P,
-    input logic         OvEn,
-    input logic         UnEn,
-    input logic [12:0]  exp_diff,
-    input logic [2:0]   sel_inv,
-    input logic         Invalid,
-    input logic 	    SignR,
-   
-    input logic [63:0]  q1,
-    input logic [63:0]  qm1,
-    input logic [63:0]  qp1,
-    input logic [63:0]  q0,
-    input logic [63:0]  qm0,
-    input logic [63:0]  qp0,   
+    input logic [1:0] 	rm,
+    input logic 	P,
+    input logic 	OvEn,
+    input logic 	UnEn,
+    input logic [12:0] 	exp_diff,
+    input logic [2:0] 	sel_inv,
+    input logic 	Invalid,
+    input logic 	SignR,
+    input logic [63:0] 	Float1,
+    input logic [63:0] 	Float2,
+    input logic 	XNaNQ,
+    input logic 	YNaNQ,
+    input logic 	XZeroQ,
+    input logic 	YZeroQ, 
+    input logic 	XInfQ,
+    input logic 	YInfQ,
+    input logic 	op_type, 
+    input logic [63:0] 	q1,
+    input logic [63:0] 	qm1,
+    input logic [63:0] 	qp1,
+    input logic [63:0] 	q0,
+    input logic [63:0] 	qm0,
+    input logic [63:0] 	qp0, 
     input logic [127:0] regr_out,
    
     output logic [63:0] Result,
-    output logic [4:0]  Flags
+    output logic [4:0] 	Flags
     );
       
    logic 	       Rsign;
@@ -56,11 +74,15 @@ module rounder_div (
    logic 	       Texp_l7z;
    logic 	       Texp_l7o;
    logic 	       OvCon;
-   logic           zero_rem;
-   logic [1:0] 	   mux_mant;
+   logic 	       zero_rem;
+   logic [1:0] 	       mux_mant;
    logic 	       sign_rem;
-   logic [63:0]    q, qm, qp;
-   logic 	       exp_ovf;   
+   logic [63:0]        q, qm, qp;
+   logic 	       exp_ovf;
+
+   logic [50:0]        NaN_out;
+   logic 	       NaN_Sign_out;   
+   logic 	       Sign_out;     
 
    // Remainder = 0?
    assign zero_rem = ~(|regr_out);
@@ -117,12 +139,11 @@ module rounder_div (
    // the input was infinite or NaN or the output of the adder is zero.
    // 00 = Valid
    // 10 = NaN
-   assign Valid = (~sel_inv[2]&~sel_inv[1]&~sel_inv[0]);
-   assign NaN = ~sel_inv[1]& sel_inv[0];
+   assign Valid = ~sel_inv[2]&~sel_inv[1]&~sel_inv[0];
+   assign NaN = sel_inv[2]&sel_inv[1]&sel_inv[0]; 
    assign UnderFlow = (P & UnFlow_SP | UnFlow_DP) & Valid;
    assign OverFlow  = (P & OvFlow_SP | OvFlow_DP) & Valid;
-   assign Div0 = sel_inv[2]&sel_inv[1]&~sel_inv[0];
-
+   assign Div0 = YZeroQ&~XZeroQ&~op_type&~NaN;   
 
    // The final result is Inexact if any rounding occurred ((i.e., R or S 
    // is one), or (if the result overflows ) or (if the result underflows and the 
@@ -161,18 +182,26 @@ module rounder_div (
    // If the result is zero or infinity, the mantissa is all zeros. 
    // If the result is NaN, the mantissa is 10...0
    // If the result the largest floating point number, the mantissa
-   // is all ones. Otherwise, the mantissa is not changed. 
-   assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero);
-   assign Rmant[50:0] = {51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}});
+   // is all ones. Otherwise, the mantissa is not changed.
+   assign NaN_out = ~XNaNQ&YNaNQ ? Float2[50:0] : Float1[50:0];
+   assign NaN_Sign_out = ~XNaNQ&YNaNQ ? Float2[63] : Float1[63];
+   assign Sign_out = (XZeroQ&YZeroQ | XInfQ&YInfQ)&~op_type | Rsign&~XNaNQ&~YNaNQ | 
+   		     NaN_Sign_out&(XNaNQ|YNaNQ);
 
+   // FIXME (jes) - Imperas gives sNaN a Sign=0, whereas x86 gives Sign=1.
+   // Disabled term (appears to belong to Sign_out above - confirm): | Float1[63]&op_type;
+   assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero);
+   assign Rmant[50:0] = ({51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}}) |
+			(NaN_out&{51{NaN}}))&({51{~(op_type&Float1[63]&~XZeroQ)}});
+   
    // For single precision, the 8 least significant bits of the exponent
    // and 23 most significant bits of the mantissa contain bits used 
    // for the final result. A double precision result is returned if 
    // overflow has occurred, the overflow trap is enabled, and a conversion
    // is being performed. 
    assign OvCon = OverFlow & OvEn;
-   assign Result = (P&~OvCon) ? { {32{1'b1}}, Rsign, Rexp[7:0], Rmant[51:29]}
-	           : {Rsign, Rexp, Rmant};
+   assign Result = (P&~OvCon) ? { {32{1'b1}}, Sign_out, Rexp[7:0], Rmant[51:29]}
+	           : {Sign_out, Rexp, Rmant};
 
 endmodule // rounder
 
diff --git a/wally-pipelined/src/fpu/sbtm_a0.sv b/wally-pipelined/src/fpu/sbtm_a0.sv
index 83953787b..61dd183bb 100644
--- a/wally-pipelined/src/fpu/sbtm_a0.sv
+++ b/wally-pipelined/src/fpu/sbtm_a0.sv
@@ -1,5 +1,30 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_a0 (input  logic [6:0] a,
-		            output logic [12:0] y);
+		output logic [12:0] y);
+   
    always_comb
      case(a)
        7'b0000000: y = 13'b1111111100010;
@@ -137,4 +162,4 @@ endmodule // sbtm_a0
 
     
     
-    
\ No newline at end of file
+    
diff --git a/wally-pipelined/src/fpu/sbtm_a1.sv b/wally-pipelined/src/fpu/sbtm_a1.sv
index 76e4bdec9..88845283c 100644
--- a/wally-pipelined/src/fpu/sbtm_a1.sv
+++ b/wally-pipelined/src/fpu/sbtm_a1.sv
@@ -1,5 +1,30 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_a1 (input  logic [6:0] a,
-		            output logic [4:0] y);
+		output logic [4:0] y);
+   
    always_comb
      case(a)
        7'b0000000: y = 5'b11100;
@@ -137,4 +162,4 @@ endmodule // sbtm_a0
 
     
     
-    
\ No newline at end of file
+    
diff --git a/wally-pipelined/src/fpu/sbtm_a2.sv b/wally-pipelined/src/fpu/sbtm_a2.sv
index ae407ec81..8d32ad157 100755
--- a/wally-pipelined/src/fpu/sbtm_a2.sv
+++ b/wally-pipelined/src/fpu/sbtm_a2.sv
@@ -1,5 +1,30 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_a2 (input  logic [7:0] a,
-		            output logic [13:0] y);
+		output logic [13:0] y);
+   
    always_comb
      case(a)
        8'b01000000: y = 14'b10110100010111;
@@ -201,4 +226,4 @@ endmodule // sbtm_a0
 
     
     
-    
\ No newline at end of file
+    
diff --git a/wally-pipelined/src/fpu/sbtm_a3.sv b/wally-pipelined/src/fpu/sbtm_a3.sv
index c6b367933..5958c3bf6 100755
--- a/wally-pipelined/src/fpu/sbtm_a3.sv
+++ b/wally-pipelined/src/fpu/sbtm_a3.sv
@@ -1,5 +1,30 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_a3 (input  logic [7:0] a,
-		            output logic [5:0] y);
+		output logic [5:0] y);
+   
    always_comb
      case(a)
        8'b01000000: y = 6'b100110;
diff --git a/wally-pipelined/src/fpu/sbtm_div.sv b/wally-pipelined/src/fpu/sbtm_div.sv
index 53b56dbd7..999106d86 100644
--- a/wally-pipelined/src/fpu/sbtm_div.sv
+++ b/wally-pipelined/src/fpu/sbtm_div.sv
@@ -1,3 +1,27 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup for divide portion of fpdivsqrt
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_div (input logic [11:0] a, output logic [10:0] ia_out);
 
    // bit partitions
diff --git a/wally-pipelined/src/fpu/sbtm_sqrt.sv b/wally-pipelined/src/fpu/sbtm_sqrt.sv
index 27ffbeccf..fdf0bb6df 100644
--- a/wally-pipelined/src/fpu/sbtm_sqrt.sv
+++ b/wally-pipelined/src/fpu/sbtm_sqrt.sv
@@ -1,3 +1,27 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup for sqrt part of fpdivsqrt
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_sqrt (input logic [11:0] a, output logic [10:0] y);
 
    // bit partitions