From 5bcae393c93b11c83420ba1e05ea51d40c474b3a Mon Sep 17 00:00:00 2001
From: Skylar Litz <slitz@hmc.edu>
Date: Mon, 4 Oct 2021 18:23:31 -0400
Subject: [PATCH 1/3] added delayed MIP signal

---
 wally-pipelined/testbench/testbench-linux.sv | 22 ++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/wally-pipelined/testbench/testbench-linux.sv b/wally-pipelined/testbench/testbench-linux.sv
index 76a1841b..73077e7d 100644
--- a/wally-pipelined/testbench/testbench-linux.sv
+++ b/wally-pipelined/testbench/testbench-linux.sv
@@ -38,7 +38,7 @@
 
 module testbench();
   
-  parameter waveOnICount = `BUSYBEAR*140000 + `BUILDROOT*8700000; // # of instructions at which to turn on waves in graphical sim
+  parameter waveOnICount = `BUSYBEAR*140000 + `BUILDROOT*3100000; // # of instructions at which to turn on waves in graphical sim
   string ProgramAddrMapFile, ProgramLabelMapFile;
 
   ///////////////////////////////////////////////////////////////////////////////
@@ -137,6 +137,7 @@ module testbench();
   integer           NumCSRWIndex;
   integer           NumCSRPostWIndex;
   logic [`XLEN-1:0] InstrCountW;
+  integer           RequestDelayedMIP;
   
   // ------
   // Macros
@@ -246,9 +247,16 @@ module testbench();
           MarkerIndex += 2;
           // match MIP to QEMU's because interrupts are imprecise
           if(ExpectedCSRArrayM[NumCSRM].substr(0, 2) == "mip") begin
-            $display("%tns: Updating MIP to %x",$time,ExpectedCSRArrayValueM[NumCSRM]);
-            MIPexpected = ExpectedCSRArrayValueM[NumCSRM];
-            force dut.hart.priv.csr.genblk1.csri.MIP_REGW = MIPexpected;
+            $display("%tns: ExpectedCSRArrayValueM[7] (MEPC) = %x",$time,ExpectedCSRArrayValueM[7]);
+            $display("%tns: ExpectedPCM = %x",$time,ExpectedPCM);
+            // compare against the MEPC *value* (entry 7), not its name string; if PC != MEPC, request a delayed MIP update
+            if(ExpectedPCM != ExpectedCSRArrayValueM[7]) begin
+              RequestDelayedMIP = 1; // picked up by the write-back (negedge clk) checker
+            end else begin
+              $display("%tns: Updating MIP to %x",$time,ExpectedCSRArrayValueM[NumCSRM]);
+              MIPexpected = ExpectedCSRArrayValueM[NumCSRM];
+              force dut.hart.priv.csr.genblk1.csri.MIP_REGW = MIPexpected; // match MIP to the trace immediately
+            end
           end 
           NumCSRM++;      
         end
@@ -326,6 +334,12 @@ module testbench();
   
   // step2: make all checks in the write back stage.
   always @(negedge clk) begin
+    if(RequestDelayedMIP) begin // raised by the memory-stage CSR check when the expected PC differs from the MEPC entry
+      $display("%tns: Updating MIP to %x",$time,ExpectedCSRArrayValueW[NumCSRM]);
+      MIPexpected = ExpectedCSRArrayValueW[NumCSRM]; // NOTE(review): NumCSRM is incremented after the request is raised, so this may no longer index the mip entry - confirm
+      force dut.hart.priv.csr.genblk1.csri.MIP_REGW = MIPexpected;
+      RequestDelayedMIP = 0; // one-shot: clear the request once the forced value is applied
+    end
     // always check PC, instruction bits
     if (checkInstrW) begin
       InstrCountW += 1;

From a91c0c8fc714017f25d0aa6a4144e68ee4028efd Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Wed, 6 Oct 2021 08:26:09 -0500
Subject: [PATCH 2/3] Make changes to fpdiv - still working on clock issue with
 fsm that was changed from posedge to negedge - also updated fpdivsqrt
 rounding to handle testfloat

---
 wally-pipelined/src/fpu/convert_inputs.sv |  29 +-
 wally-pipelined/src/fpu/exception_div.sv  |  27 +-
 wally-pipelined/src/fpu/fpdiv.sv          | 155 +++---
 wally-pipelined/src/fpu/fpu.sv            | 613 ++++++++++------------
 wally-pipelined/src/fpu/fregfile.sv       |  33 +-
 wally-pipelined/src/fpu/fsm.sv            | 146 +++---
 wally-pipelined/src/fpu/rounder_div.sv    | 109 ++--
 wally-pipelined/src/fpu/sbtm_a0.sv        |  29 +-
 wally-pipelined/src/fpu/sbtm_a1.sv        |  29 +-
 wally-pipelined/src/fpu/sbtm_a2.sv        |  29 +-
 wally-pipelined/src/fpu/sbtm_a3.sv        |  27 +-
 wally-pipelined/src/fpu/sbtm_div.sv       |  24 +
 wally-pipelined/src/fpu/sbtm_sqrt.sv      |  24 +
 13 files changed, 698 insertions(+), 576 deletions(-)

diff --git a/wally-pipelined/src/fpu/convert_inputs.sv b/wally-pipelined/src/fpu/convert_inputs.sv
index bf56cb00..9a0584ba 100755
--- a/wally-pipelined/src/fpu/convert_inputs.sv
+++ b/wally-pipelined/src/fpu/convert_inputs.sv
@@ -1,9 +1,26 @@
-// This module takes as inputs two operands (op1 and op2) 
-// the operation type (op_type) and the result precision (P). 
-// Based on the operation and precision , it conditionally
-// converts single precision values to double precision values
-// and modifies the sign of op1. The converted operands are Float1
-// and Float2.
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Input operand conversion (single to double precision, op1 sign adjustment) based on op_type and P
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
 module convert_inputs(
    input [63:0]  op1,      // 1st input operand (A)
diff --git a/wally-pipelined/src/fpu/exception_div.sv b/wally-pipelined/src/fpu/exception_div.sv
index 37432068..3e701d2f 100755
--- a/wally-pipelined/src/fpu/exception_div.sv
+++ b/wally-pipelined/src/fpu/exception_div.sv
@@ -23,9 +23,10 @@ module exception_div (
    logic 	      BNaN; 		// '1' if B is a not-a-number
    logic 	      ASNaN;	 	// '1' if A is a signalling not-a-number
    logic 	      BSNaN;	 	// '1' if B is a signalling not-a-number
-   logic 	      ZQNaN;	 	// '1' if result Z is a quiet NaN
+   logic 	      ZSNaN;	 	// '1' if result Z is a NaN (invalid operation or NaN operand)
    logic 	      ZInf;	 	// '1' if result Z is an infnity
-   logic 	      Zero;             // '1' if result is zero   
+   logic 	      Zero;             // '1' if result is zero
+   logic              NegSqrt;          // '1' if sqrt and operand is negative   
    
    //***take this module out and add more registers or just recalculate it all
    // Determine if mantissas are all zeros
@@ -48,32 +49,34 @@ module exception_div (
    assign AZero = AzeroE & AzeroM;
    assign BZero = BzeroE & BzeroE;
 
+   // Invalid (NaN result) if the operation is a sqrt and the operand is negative and nonzero
+   assign NegSqrt = (A[63] & op_type & ~AZero);
+
    // An "Invalid Operation" exception occurs if (A or B is a signalling NaN)
    // or (A and B are both Infinite)
    assign Invalid = ASNaN | BSNaN | (((AInf & BInf) | (AZero & BZero))&~op_type) | 
-		    (A[63] & op_type);
-
+		    NegSqrt;
 
    // The result is a quiet NaN if (an "Invalid Operation" exception occurs) 
    // or (A is a NaN) or (B is a NaN).
-   assign ZQNaN = Invalid | ANaN | BNaN;
+   assign ZSNaN = Invalid | ANaN | BNaN;
 
    //  The result is zero
    assign Zero = (AZero | BInf)&~op_type | AZero&op_type;   
 
    // The result is +Inf if ((A is Inf) or (B is 0)) and (the
    // result is not a quiet NaN).  
-   assign ZInf = (AInf | BZero)&~ZQNaN&~op_type | AInf&op_type&~ZQNaN;   
+   assign ZInf = (AInf | BZero)&~ZSNaN&~op_type | AInf&op_type&~ZSNaN;   
 
    // Set the type of the result as follows:
    // Ztype	Result 
    //  000     Normal
-   //  001     Quiet NaN
    //  010     Infinity
    //  011     Zero
-   //  110     DivZero
-   assign Ztype[0] = ZQNaN | Zero;
-   assign Ztype[1] = ZInf | Zero;
-   assign Ztype[2] = BZero&~op_type;   
-
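+   //  110     Div by 0 (NOTE(review): unreachable with the assigns below, since Ztype[2] implies Ztype[1:0]=11 - confirm intent)
+   //  111     NaN (ZSNaN: invalid operation or NaN operand)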
+   assign Ztype[2] = (ZSNaN);
+   assign Ztype[1] = (ZSNaN) | (Zero) | (ZInf);
+   assign Ztype[0] = (ZSNaN) | (Zero);
+   
 endmodule // exception
diff --git a/wally-pipelined/src/fpu/fpdiv.sv b/wally-pipelined/src/fpu/fpdiv.sv
index a2534149..0a937b5b 100755
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@@ -1,92 +1,86 @@
+///////////////////////////////////////////
 //
-// File name : fpdiv
-// Title     : Floating-Point Divider/Square-Root
-// project   : FPU
-// Library   : fpdiv
-// Author(s) : James E. Stine, Jr.
-// Purpose   : definition of main unit to floating-point div/sqrt
-// notes :   
+// Written: James Stine
+// Modified: 8/1/2018
 //
-// Copyright Oklahoma State University
+// Purpose: Floating point divider/square root top unit (Goldschmidt)
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
-// Basic Operations
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
 //
-// Step 1: Load operands, set flags, and convert SP to DP
-// Step 2: Check for special inputs ( +/- Infinity,  NaN)
-// Step 3: Exponent Logic
-// Step 4: Divide/Sqrt using Goldschmidt
-// Step 5: Normalize the result.//
-//   Shift left until normalized.  Normalized when the value to the 
-//   left of the binrary point is 1.
-// Step 6: Round the result.// 
-// Step 7: Put quotient/remainder onto output.
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
 // `timescale 1ps/1ps
 module fpdiv (
-   input logic 	      clk,
-   input logic 	      reset,
-   input logic 	      start,
-   input logic [63:0]   op1,		// 1st input operand (A)
-   input logic [63:0]   op2,		// 2nd input operand (B)
-   input logic [1:0]    rm,		// Rounding mode - specify values 
-   input logic 	      op_type,	// Function opcode
-   input logic 	      P,   		// Result Precision (0 for double, 1 for single)
-   input logic 	      OvEn,		// Overflow trap enabled
-   input logic 	      UnEn,   	// Underflow trap enabled
-   output logic         done,
-   output logic         FDivBusyE,
-   output logic [63:0]  AS_Result,	// Result of operation
-   output logic [4:0]   Flags);   	// IEEE exception flags 
+  input logic 	      clk,
+  input logic 	      reset,
+  input logic 	      start,     // start request, passed to the control FSM
+  input logic [63:0]  op1,       // 1st input operand (A)
+  input logic [63:0]  op2,       // 2nd input operand (B)
+  input logic [1:0]   rm,        // Rounding mode
+  input logic 	      op_type,   // Function opcode: fdiv=0, fsqrt=1
+  input logic 	      P,         // Result Precision (0 for double, 1 for single)
+  input logic 	      OvEn,      // Overflow trap enabled
+  input logic 	      UnEn,      // Underflow trap enabled
+  input logic 	      XNaNQ,     // operand classification flags, passed through to rounder_div
+  input logic 	      YNaNQ,     //   (NOTE(review): presumably X/Y is-NaN, is-zero, is-infinity
+  input logic 	      XZeroQ,    //    as captured by the instantiating module - confirm)
+  input logic 	      YZeroQ,
+  input logic 	      XInfQ,
+  input logic 	      YInfQ, 
+
+  output logic 	      done,      // completion strobe from the control FSM; enables the result/flag registers
+  output logic 	      FDivBusyE, // driven by the control FSM's divBusy output
+  output logic [63:0] AS_Result, // Result of operation (registered on done)
+  output logic [4:0]  Flags);    // IEEE exception flags (registered on done)
    
-   logic [12:0] 	exp1, exp2, expF;
-   logic [12:0] 	exp_diff, bias;
-   logic [13:0] 	exp_sqrt;
-   logic [12:0] 	exp_s;
-   logic [12:0] 	exp_c;
+   logic [63:0]       Float1; 
+   logic [63:0]       Float2;
    
-   logic [10:0] 	exponent;
-   logic [63:0] 	Result;   
-   logic [52:0] 	mantissaA;
-   logic [52:0] 	mantissaB; 
+   logic [12:0]       exp1, exp2, expF;
+   logic [12:0]       exp_diff, bias;
+   logic [13:0]       exp_sqrt;
+   logic [63:0]       Result;   
+   logic [52:0]       mantissaA;
+   logic [52:0]       mantissaB; 
    
-   logic [2:0] 	sel_inv;
-   logic		      Invalid;
-   logic [4:0] 	FlagsIn;   	
+   logic [2:0] 	      sel_inv;
+   logic 	      Invalid;
+   logic [4:0] 	      FlagsIn;   	
    logic 	      signResult;      
    logic 	      convert;
-   logic          sub;
+   logic 	      sub;
    
-   logic [63:0] 	q1, qm1, qp1, q0, qm0, qp0;
-   logic [63:0] 	rega_out, regb_out, regc_out, regd_out;
-   logic [127:0]  regr_out;
-   logic [2:0] 	sel_muxa, sel_muxb;
+   logic [63:0]       q1, qm1, qp1, q0, qm0, qp0;
+   logic [63:0]       rega_out, regb_out, regc_out, regd_out;
+   logic [127:0]      regr_out;
+   logic [2:0] 	      sel_muxa, sel_muxb;
    logic 	      sel_muxr;   
    logic 	      load_rega, load_regb, load_regc, load_regd, load_regr;
-
-   logic 	      load_regs;
-   logic          exp_cout1, exp_cout2;
-   logic          exp_odd, open;
    
-   // div/sqrt
-         //  fdiv  = 0
-         //  fsqrt = 1
+   logic 	      load_regs;
+   logic 	      exp_cout1, exp_cout2;
+   logic 	      exp_odd, open;
+   
+   //  op_type : fdiv=0, fsqrt=1
    assign Float1 = op1;
    assign Float2 = op_type ? op1 : op2;   
-
-   // Test for exceptions and return the "Invalid Operation" and
-   // "Denormalized" Input Flags. The "sel_inv" is used in
-   // the third pipeline stage to select the result. Also, op1_Norm
-   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
-   // sub is one if the effective operation is subtaction.   
-   exception_div exc1 (.A(Float1), .B(Float2), .op_type,
-                     // output:
-                     .Ztype(sel_inv), .Invalid);
-
+   
+   // Exception detection
+   exception_div exc1 (.A(Float1), .B(Float2), .op_type, .Ztype(sel_inv), .Invalid);
+   
    // Determine Sign/Mantissa
    assign signResult = (Float1[63]^Float2[63]);
    assign mantissaA = {1'b1, Float1[51:0]};
@@ -103,29 +97,30 @@ module fpdiv (
    assign {exp_cout2, exp_sqrt} = {1'b0, exp1} + {4'h0, 10'h3ff} + exp_odd;
    // Choose correct exponent
    assign expF = op_type ? exp_sqrt[13:1] : exp_diff;   
-
+   
    // Main Goldschmidt/Division Routine   
    divconv goldy (.q1, .qm1, .qp1, .q0, .qm0, .qp0, .rega_out, .regb_out, .regc_out, .regd_out,
 		  .regr_out, .d(mantissaB), .n(mantissaA), .sel_muxa, .sel_muxb, .sel_muxr, 
 		  .reset, .clk,  .load_rega, .load_regb, .load_regc, .load_regd,
 		  .load_regr, .load_regs, .P, .op_type, .exp_odd);
-
+   
    // FSM : control divider   
    fsm control (.clk, .reset, .start, .op_type,
-               // outputs:
-               .done, .load_rega, .load_regb, .load_regc, .load_regd, 
-		         .load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
-		         .divBusy(FDivBusyE));
+		.done, .load_rega, .load_regb, .load_regc, .load_regd, 
+		.load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
+		.divBusy(FDivBusyE));
    
    // Round the mantissa to a 52-bit value, with the leading one
    // removed. The rounding units also handles special cases and 
    // set the exception flags.   
    rounder_div round1 (.rm, .P, .OvEn, .UnEn, .exp_diff(expF), 
-   		            .sel_inv, .Invalid, .SignR(signResult), 
-		               .q1, .qm1, .qp1, .q0, .qm0, .qp0, .regr_out, 
-                     // outputs:
-                     .Result, .Flags(FlagsIn));
-
+   		       .sel_inv, .Invalid, .SignR(signResult),
+		       .Float1(op1), .Float2(op2),
+		       .XNaNQ, .YNaNQ, .XZeroQ, .YZeroQ, 
+		       .XInfQ, .YInfQ, .op_type,		       
+		       .q1, .qm1, .qp1, .q0, .qm0, .qp0, .regr_out, 
+                       .Result, .Flags(FlagsIn));
+   
    // Store the final result and the exception flags in registers.
    flopenr #(64) rega (clk, reset, done, Result, AS_Result);  
    flopenr #(5) regc (clk, reset, done, FlagsIn, Flags);   
diff --git a/wally-pipelined/src/fpu/fpu.sv b/wally-pipelined/src/fpu/fpu.sv
index cadfafae..34aa3edd 100755
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@@ -1,6 +1,6 @@
 ///////////////////////////////////////////
 //
-// Written: Katherine Parry, Bret Mathis
+// Written: Katherine Parry, James Stine, Brett Mathis
 // Modified: 6/23/2021
 //
 // Purpose: FPU
@@ -25,24 +25,24 @@
 `include "wally-config.vh"
 
 module fpu (
-  input logic 		          clk,
-  input logic 		          reset,
-  input logic  [2:0] 	      FRM_REGW, // Rounding mode from CSR
-  input logic  [31:0]       InstrD,   // instruction from IFU
-  input logic  [`XLEN-1:0]  ReadDataW,// Read data from memory
-  input logic  [`XLEN-1:0]  SrcAE,    // Integer input being processed (from IEU)
-  input logic  [`XLEN-1:0]  SrcAM,    // Integer input being written into fpreg (from IEU)
-  input logic 		          StallE, StallM, StallW, // stall signals from HZU
-  input logic 		          FlushE, FlushM, FlushW, // flush signals from HZU
-  input logic  [4:0] 	      RdE, RdM, RdW,  // which FP register to write to (from IEU)
-  output logic 		          FRegWriteM,     // FP register write enable
-  output logic 		          FStallD,        // Stall the decode stage
-  output logic 		          FWriteIntE, FWriteIntM, FWriteIntW, // integer register write enable
-  output logic [`XLEN-1:0]  FWriteDataE,  // Data to be written to memory
-  output logic [`XLEN-1:0]  FIntResM,     // data to be written to integer register
-  output logic 		          FDivBusyE,    // Is the divide/sqrt unit busy (stall execute stage)
-  output logic 		          IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
-  output logic [4:0] 	      SetFflagsM        // FMA flags (to privileged unit)
+  input logic 		   clk,
+  input logic 		   reset,
+  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
+  input logic [31:0] 	   InstrD, // instruction from IFU
+  input logic [`XLEN-1:0]  ReadDataW,// Read data from memory
+  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed (from IEU)
+  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg (from IEU)
+  input logic 		   StallE, StallM, StallW, // stall signals from HZU
+  input logic 		   FlushE, FlushM, FlushW, // flush signals from HZU
+  input logic [4:0] 	   RdE, RdM, RdW, // which FP register to write to (from IEU)
+  output logic 		   FRegWriteM, // FP register write enable
+  output logic 		   FStallD, // Stall the decode stage
+  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW, // integer register write enable
+  output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
+  output logic [`XLEN-1:0] FIntResM, // data to be written to integer register
+  output logic 		   FDivBusyE, // Is the divide/sqrt unit busy (stall execute stage)
+  output logic 		   IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+  output logic [4:0] 	   SetFflagsM        // FMA flags (to privileged unit)
   );
 
   //*** make everything FLEN at some point
@@ -59,338 +59,257 @@ module fpu (
   
   generate if (`F_SUPPORTED | `D_SUPPORTED) begin : fpu
 
-  // control signals
-	logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
-	logic [2:0] FrmD, FrmE, FrmM;                   // FP rounding mode
-	logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
-	logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
-	logic 		  FWriteIntD;                         // Write to integer register
-	logic [1:0] FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
-	logic [1:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
-	logic [2:0] FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which opperation to do in each component
-	logic [2:0] FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
-	logic [1:0] FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer resister
-	logic [4:0] Adr1E, Adr2E, Adr3E;                    // adresses of each input
+     // control signals
+     logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
+     logic [2:0] 	  FrmD, FrmE, FrmM;                   // FP rounding mode
+     logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
+     logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
+     logic 		  FWriteIntD;                         // Write to integer register
+     logic [1:0] 	  FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
+     logic [1:0] 	  FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
+     logic [2:0] 	  FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which operation to do in each component
+     logic [2:0] 	  FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
+     logic [1:0] 	  FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer register
+     logic [4:0] 	  Adr1E, Adr2E, Adr3E;                    // addresses of each input
+     
+     // regfile signals
+     logic [63:0] 	  FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
+     logic [63:0] 	  FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
+     logic [63:0] 	  FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
+     logic [63:0] 	  FPreSrcYE, FSrcYE;               // Input 2 to the various units (after forwarding)
+     logic [63:0] 	  FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
+     
+     // unpacking signals
+     logic 		  XSgnE, YSgnE, ZSgnE;     // input's sign - execute stage
+     logic 		  XSgnM, YSgnM;     // input's sign - memory stage
+     logic [10:0] 	  XExpE, YExpE, ZExpE;     // input's exponent - execute stage
+     logic [10:0] 	  XExpM, YExpM, ZExpM;     // input's exponent - memory stage
+     logic [52:0] 	  XManE, YManE, ZManE;  // input's fraction - execute stage
+     logic [52:0] 	  XManM, YManM, ZManM;  // input's fraction - memory stage
+     logic [10:0] 	  BiasE;                   // bias based on precision (single=7f double=3ff - max exponent/2)
+     logic 		  XNaNE, YNaNE, ZNaNE;           // is the input a NaN - execute stage
+     logic 		  XNaNM, YNaNM, ZNaNM;           // is the input a NaN - memory stage
+     logic 		  XSNaNE, YSNaNE, ZSNaNE;        // is the input a signaling NaN - execute stage
+     logic 		  XSNaNM, YSNaNM, ZSNaNM;        // is the input a signaling NaN - memory stage
+     logic 		  XDenormE, YDenormE, ZDenormE;  // is the input denormalized
+     logic 		  XZeroE, YZeroE, ZZeroE;        // is the input zero - execute stage
+     logic 		  XZeroM, YZeroM, ZZeroM;        // is the input zero - memory stage
+     logic 		  XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
+     logic 		  XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
+     logic 		  XExpMaxE;                      // is the exponent all ones (max value)
+     logic 		  XNormE;                 // is normal     
+     
+     // result and flag signals
+     logic [63:0] 	  FDivResM, FDivResW; // divide/squareroot result
+     logic [4:0] 	  FDivFlgM, FDivFlgW; // divide/squareroot flags  
+     logic [63:0] 	  FMAResM, FMAResW;   // FMA/multiply result
+     logic [4:0] 	  FMAFlgM, FMAFlgW;   // FMA/multiply flags
+     logic [63:0] 	  ReadResW;           // read result (load instruction)
+     logic [63:0] 	  CvtFpResE, CvtFpResM, CvtFpResW; // add/FP -> FP convert result
+     logic [4:0] 	  CvtFpFlgE, CvtFpFlgM, CvtFpFlgW; // add/FP -> FP convert flags
+     logic [63:0] 	  CvtResE, CvtResM;   // FP <-> int convert result
+     logic [4:0] 	  CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this	
+     logic [63:0] 	  ClassResE, ClassResM; // classify result
+     logic [63:0] 	  CmpResE, CmpResM; // compare result
+     logic 		  CmpNVE, CmpNVM;   // compare invalid flag (Not Valid)     
+     logic [63:0] 	  SgnResE, SgnResM; // sign injection result
+     logic 		  SgnNVE, SgnNVM;   // sign injection invalid flag (Not Valid)     
+     logic [63:0] 	  FResE, FResM, FResW;     // selected result that is ready in the memory stage
+     logic [4:0] 	  FFlgE, FFlgM;            // selected flag that is ready in the memory stage     
+     logic [`XLEN-1:0] 	  FIntResE;     
+     logic [63:0] 	  FPUResultW;    // final FP result being written to the FP register
+     
+     // other signals
+     logic 		  FDivSqrtDoneE;          // is divide done
+     logic [63:0] 	  DivInput1E, DivInput2E; // inputs to divide/squareroot unit
+     logic 		  FDivClk;                // clock for divide/squareroot unit
+     logic [63:0] 	  AlignedSrcAE;           // align SrcA to the floating point format
+
+     // DECODE STAGE
+     // calculate FP control signals
+     fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
+		  .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+		  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
 	
-	// regfile signals
-	logic [63:0] 	    FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
-	logic [63:0] 	    FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
-	logic [63:0] 	    FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
-	logic [63:0] 	    FPreSrcYE, FSrcYE;               // Input 2 to the various units (after forwarding)
-	logic [63:0] 	    FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
-	
-	// unpacking signals
-	logic 		   XSgnE, YSgnE, ZSgnE;     // input's sign - execute stage
-	logic 		   XSgnM, YSgnM;     // input's sign - memory stage
-	logic [10:0] XExpE, YExpE, ZExpE;     // input's exponent - execute stage
-	logic [10:0] XExpM, YExpM, ZExpM;     // input's exponent - memory stage
-	logic [52:0] XManE, YManE, ZManE;  // input's fraction - execute stage
-	logic [52:0] XManM, YManM, ZManM;  // input's fraction - memory stage
-	logic [10:0] BiasE;                   // bias based on precision (single=7f double=3ff - max expoent/2)
-	logic 		   XNaNE, YNaNE, ZNaNE;           // is the input a NaN - execute stage
-	logic 		   XNaNM, YNaNM, ZNaNM;           // is the input a NaN - memory stage
-	logic 		   XSNaNE, YSNaNE, ZSNaNE;        // is the input a signaling NaN - execute stage
-	logic 		   XSNaNM, YSNaNM, ZSNaNM;        // is the input a signaling NaN - memory stage
-	logic 		   XDenormE, YDenormE, ZDenormE;  // is the input denormalized
-	logic 		   XZeroE, YZeroE, ZZeroE;        // is the input zero - execute stage
-	logic 		   XZeroM, YZeroM, ZZeroM;        // is the input zero - memory stage
-	logic 		   XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
-	logic 		   XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
-	logic 		   XExpMaxE;                      // is the exponent all ones (max value)
-	logic 		   XNormE;                 // is normal
-	
-	
-	// result and flag signals
-	logic [63:0]  FDivResM, FDivResW; // divide/squareroot result
-	logic [4:0] 	FDivFlgM, FDivFlgW; // divide/squareroot flags
+     // FP register file
+     //    - can read 3 registers and write 1 register every cycle
+     fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
+			.a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
+			.wd4(FPUResultW),
+			.rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
+
+     // D/E pipeline registers
+     flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
+     flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
+     flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
+     flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+                             {Adr1E, Adr2E, Adr3E});
+     flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+			       {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
+			       {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
+
+     // EXECUTION STAGE
+     // Hazard unit for FPU  
+     //    - determines if any forwarding or stalls are needed
+     fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
+                     .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
+     
+     // forwarding muxs
+     mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
+     mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
+     mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
+     mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, // forwarded Y, or single-precision 1.0 (0x3F800000) with the upper 32 bits set to ones
+			   {2'b0, {10{1'b1}}, 52'b0}, // or double-precision 1.0 (0x3FF0000000000000)
+			   {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==2'b01)}, 
+			   FSrcYE); // Replace Y with the constant 1.0 when selected (NOTE(review): presumably for add/sub routed through the FMA - confirm)
+     // Force Z to be 0 for multiply instructions     
+     mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
+       
+     // unpacking unit
+     //    - splits FP inputs into their various parts
+     //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infinity)
+     unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
+			 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+			 .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
+			 .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
+     
+     // FMA
+     //   - two stage FMA
+     //   - execute stage - multiplication and addend shifting
+     //   - memory stage  - addition and rounding
+     //   - handles FMA and multiply instructions
+     fma fma (.clk, .reset, .FlushM, .StallM, 
+	      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+	      .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
+	      .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
+	      .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
+	      .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
+	      .FOpCtrlE,
+	      .FmtE, .FmtM, .FrmM, 
+	      .FMAFlgM, .FMAResM);
+     
+     // clock gater
+     //    - creates a clock that only runs during divide/sqrt instructions
+     //    - using the separate clock gives the divide/sqrt unit some time to get set up
+     // *** the module says not to use in synthesis
+     clockgater fpdivclkg(.E(FDivStartE),
+			  .SE(1'b0),
+			  .CLK(clk),
+			  .ECLK(FDivClk));
+     
+     // capture the inputs for divide/sqrt
+     //    - if not captured any forwarded inputs will change during computation
+     //        - this problem is caused by stalling the execute stage
+     //    - the other units don't have this problem, only div/sqrt stalls the execute stage
+     flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
+				.en(1'b1), .clear(FDivSqrtDoneE),
+				.reset(reset),  .clk(FDivBusyE));
+     flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
+				.en(1'b1), .clear(FDivSqrtDoneE),
+				.reset(reset),  .clk(FDivBusyE));
+      flopenrc #(6) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE}), 
+				.q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ}),
+				.en(1'b1), .clear(FDivSqrtDoneE),
+				.reset(reset),  .clk(FDivBusyE));
+            
+      // fpdivsqrt using Goldschmidt's iteration
+      fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
+		      .reset, .clk(FDivClk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
+		      .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ,
+		      .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
+
+     // convert from single to double and vice versa
+     cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
+     
+     // compare unit
+     //    - computation is done in one stage
+     //    - writes to FP file during min/max instructions
+     //    - other comparisons write a 1 or 0 to the integer register
+     fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
+		.FSrcXE, .FSrcYE, .FOpCtrlE, 
+		.FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
+		.Invalid(CmpNVE), .CmpResE);
+     
+     // sign injection unit
+     //    - computation is done in one stage
+     fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
+		.SgnNVE, .SgnResE);
+     
+     // classify
+     //    - computation is done in one stage
+     //    - most of the work is done in the unpacking unit
+     //    - result is written to the integer register
+     fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
+			  .XSNaNE, .ClassResE);
+     
+     fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
+		.CvtResE, .CvtFlgE);
+     
+     // data to be stored in memory - to IEU
+     //    - FP uses NaN-boxing format
+     //        - if there are any unused bits the most significant bits are filled with 1s
+     assign FWriteDataE = FSrcYE[`XLEN-1:0];     
+     
+     // Align SrcA to MSB when single precision
+     mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAE[31:0]}, {{64-`XLEN{1'b1}}, SrcAE}, FmtE, AlignedSrcAE);
+     
+     // select a result that may be written to the FP register
+     mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
+     mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
+     
+     // select the result that may be written to the integer register - to IEU
+     mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], 
+			       CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
+     
+     // E/M pipe registers
+
+     // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
+     flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
+     flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
+     flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
+     flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
+			     {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
+			     {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
+     flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); 
+     flopenrc #(5)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
+     flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
+     flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM,
+			      {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
+			      {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
+     
+     // BEGIN MEMORY STAGE
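+     // FPU flag selection - to privileged (NOTE(review): select is FResultSelW but the flag inputs are M-stage; confirm FResultSelM was not intended)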
+     mux4  #(5)  FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
   
-	logic [63:0]  FMAResM, FMAResW;   // FMA/multiply result
-	logic [4:0] 	FMAFlgM, FMAFlgW;   // FMA/multiply result
-	
-	logic [63:0] 	ReadResW;           // read result (load instruction)
+     // M/W pipe registers
+     flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
+     flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
+     flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, CvtFpResM, CvtFpResW); 
+     flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
+     flopenrc #(5)  MWCtrlReg(clk, reset, FlushW, ~StallW,
+			      {FRegWriteM, FResultSelM, FmtM, FWriteIntM},
+			      {FRegWriteW, FResultSelW, FmtW, FWriteIntW});
+     
+     // BEGIN WRITEBACK STAGE
+     
+     // put ReadData into NaN-boxing format
+     //    - if there are any unused bits the most significant bits are filled with 1s
+     //    - for load instruction
+     mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
+     
+     // select the result to be written to the FP register
+     mux4  #(64)  FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
 
-	logic [63:0] 	CvtFpResE, CvtFpResM, CvtFpResW; // add/FP -> FP convert result
-	logic [4:0] 	CvtFpFlgE, CvtFpFlgM, CvtFpFlgW; // add/FP -> FP convert flags
-
-	logic [63:0] 	CvtResE, CvtResM;   // FP <-> int convert result
-	logic [4:0] 	CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this
-	
-	logic [63:0] 	ClassResE, ClassResM; // classify result
-
-	logic [63:0] 	CmpResE, CmpResM; // compare result
-	logic 		    CmpNVE, CmpNVM;   // compare invalid flag (Not Valid)
-	
-	logic [63:0] 	SgnResE, SgnResM; // sign injection result
-	logic 		    SgnNVE, SgnNVM;   // sign injection invalid flag (Not Valid)
-
-	logic [63:0] 	FResE, FResM, FResW;     // selected result that is ready in the memory stage
-	logic [4:0] 	FFlgE, FFlgM;            // selected flag that is ready in the memory stage
-
-	logic [`XLEN-1:0]  FIntResE;
-
-	logic [63:0] 	   FPUResultW;    // final FP result being written to the FP register
-		
-	// other signals
-	logic 		    FDivSqrtDoneE;          // is divide done
-	logic [63:0] 	DivInput1E, DivInput2E; // inputs to divide/squareroot unit
-	logic 		    FDivClk;                // clock for divide/squareroot unit
-	logic [63:0] 	AlignedSrcAE;           // align SrcA to the floating point format
-
-
-
-
-
-  ////////////////////////////////////////////////////////////////////////////////////////
-	//DECODE STAGE
-	////////////////////////////////////////////////////////////////////////////////////////
-
-
-
-	// calculate FP control signals
-	fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
-              // outputs:
-              .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
-              .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
-	
-	// FP register file
-  //    - can read 3 registers and write 1 register every cycle
-	fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
-			   .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
-         .wd4(FPUResultW),
-         // outputs:
-			   .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
-	
-
-
-
-
-	////////////////////////////////////////////////////////////////////////////////////////
-	// D/E pipeline registers
-	////////////////////////////////////////////////////////////////////////////////////////
-
-	flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
-	flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
-	flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-	flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-                                                       {Adr1E,         Adr2E,         Adr3E});
-	flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-				  {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
-				  {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
-	
-
-
-
-
-
-  
-	////////////////////////////////////////////////////////////////////////////////////////
-	//EXECUTION STAGE
-	////////////////////////////////////////////////////////////////////////////////////////
-
-
-	// Hazard unit for FPU  
-  //    - determines if any forwarding or stalls are needed
-	fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
-                  // outputs:
-                  .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
-	
-
-	// forwarding muxs
-	mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
-	mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
-	mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
-	mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, {2'b0, {10{1'b1}}, 52'b0}, {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01)}, FSrcYE); // Force Z to be 0 for multiply instructions
-	mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE); // Force Z to be 0 for multiply instructions
- 	
-   
-  // unpacking unit
-  //    - splits FP inputs into their various parts
-  //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infifnity)
-	unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
-                      // outputs:
-                      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
-                      .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
-                      .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
-
-  // FMA
-  //    - two stage FMA
-  //        - execute stage - multiplication and addend shifting
-  //        - memory stage  - addition and rounding
-  //    - handles FMA and multiply instructions
-  //    - contains some E/M pipleine registers
-  // *** currently handles FLEN and 32 bits(dont know if 32 works with 128 - easy to fix) - change to handle only the supported formats
-	fma fma (.clk, .reset, .FlushM, .StallM, 
-		 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
-     .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
-		 .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
-     .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
-     .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
-		 .FOpCtrlE,
-		 .FmtE, .FmtM, .FrmM, 
-     // outputs:
-     .FMAFlgM, .FMAResM);
-	
-	// clock gater
-  //    - creates a clock that only runs durring divide/sqrt instructions
-  //    - using the seperate clock gives the divide/sqrt unit some to get set up
-  // *** the module says not to use in synthisis
-	clockgater fpdivclkg(.E(FDivStartE),
-			     .SE(1'b0),
-			     .CLK(clk),
-			     .ECLK(FDivClk));
-	
-	// capture the inputs for divide/sqrt
-  //    - if not captured any forwarded inputs will change durring computation
-  //        - this problem is caused by stalling the execute stage
-  //    - the other units don't have this problem, only div/sqrt stalls the execute stage
-	flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
-				   .en(1'b1), .clear(FDivSqrtDoneE),
-				   .reset(reset),  .clk(FDivBusyE));
-	flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
-				   .en(1'b1), .clear(FDivSqrtDoneE),
-				   .reset(reset),  .clk(FDivBusyE));
-	
-	// output for store instructions
-  //*** change to use the unpacking unit if possible
-	fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
-			             .reset, .clk(FDivClk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
-                   // outputs:
-			             .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
-	
-	// convert from signle to double and vice versa
-	cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
-	
-	// compare unit
-  //    - computation is done in one stage
-  //    - writes to FP file durring min/max instructions
-  //    - other comparisons write a 1 or 0 to the integer register
-	fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
-            .FSrcXE, .FSrcYE, .FOpCtrlE, 
-            .FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
-            // outputs:
-		        .Invalid(CmpNVE), .CmpResE);
-	
-	// sign injection unit
-  //    - computation is done in one stage
-	fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
-            // outputs:
-            .SgnNVE, .SgnResE);
-	
-	// classify
-  //    - computation is done in one stage
-  //    - most of the work is done in the unpacking unit
-  //    - result is written to the integer register
-	fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
-                      // outputs:
-                      .XSNaNE, .ClassResE);
-	
-	fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
-            // outputs: 
-            .CvtResE, .CvtFlgE);
-	
-	// data to be stored in memory - to IEU
-  //    - FP uses NaN-blocking format
-  //        - if there are any unsused bits the most significant bits are filled with 1s
-	assign FWriteDataE = FSrcYE[`XLEN-1:0];
-	
-
-	// Align SrcA to MSB when single precicion
-	mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAE[31:0]}, {{64-`XLEN{1'b1}}, SrcAE}, FmtE, AlignedSrcAE);
-
-  // select a result that may be written to the FP register
-	mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
-	mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
-	
-  // select the result that may be written to the integer register - to IEU
-	mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
-	
-
-
-  //***will synth remove registers of values that are always zero?
-	////////////////////////////////////////////////////////////////////////////////////////
-	// E/M pipe registers
-	////////////////////////////////////////////////////////////////////////////////////////
-
-	// flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
-	flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
-	flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
-	flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
-	flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
-				{XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
-				{XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});
-	
-	flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); 
-	flopenrc #(5)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM); 
-	
-	flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
-	// flopenrc #(1) EMRegSgnFlg(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
-
-	//flopenrc #(64) EMRegCvtFpRes(clk, reset, FlushM, ~StallM, CvtFpResE, CvtFpResM);
-	//flopenrc #(5) EMRegCvtFpFlg(clk, reset, FlushM, ~StallM, CvtFpFlgE, CvtFpFlgM);
-	
-	// flopenrc #(64) EMRegCvtRes(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
-	// flopenrc #(5) EMRegCvtFlg(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
-  
-	// flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
-	
-	flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM,
-				 {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
-				 {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
-	
-	
-
-
-
-
-	////////////////////////////////////////////////////////////////////////////////////////
-	//BEGIN MEMORY STAGE
-	////////////////////////////////////////////////////////////////////////////////////////
-
-
-  // FPU flag selection - to privileged
-	mux4  #(5)  FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
-	
-
-
-
-  
-	////////////////////////////////////////////////////////////////////////////////////////
-	// M/W pipe registers
-	////////////////////////////////////////////////////////////////////////////////////////
-	flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-	flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
-	flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, CvtFpResM, CvtFpResW); 
-	flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
-	flopenrc #(5)  MWCtrlReg(clk, reset, FlushW, ~StallW,
-				{FRegWriteM, FResultSelM, FmtM, FWriteIntM},
-				{FRegWriteW, FResultSelW, FmtW, FWriteIntW});
-	
-
-
-
-	////////////////////////////////////////////////////////////////////////////////////////
-	// BEGIN WRITEBACK STAGE
-	////////////////////////////////////////////////////////////////////////////////////////
-
-  // put ReadData into NaN-blocking format
-  //    - if there are any unsused bits the most significant bits are filled with 1s
-  //    - for load instruction
-	mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
-
-  // select the result to be written to the FP register
-	mux4  #(64)  FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
-	
-	
   end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low
-	assign FStallD = 0;
-	assign FWriteIntE = 0; 
-	assign FWriteIntM = 0;
-	assign FWriteIntW = 0;
-	assign FWriteDataE = 0;
-	assign FIntResM = 0;
-	assign FDivBusyE = 0;
-	assign IllegalFPUInstrD = 1;
-	assign SetFflagsM = 0;
+     assign FStallD = 0;
+     assign FWriteIntE = 0; 
+     assign FWriteIntM = 0;
+     assign FWriteIntW = 0;
+     assign FWriteDataE = 0;
+     assign FIntResM = 0;
+     assign FDivBusyE = 0;
+     assign IllegalFPUInstrD = 1;
+     assign SetFflagsM = 0;
   end
   endgenerate 
    
diff --git a/wally-pipelined/src/fpu/fregfile.sv b/wally-pipelined/src/fpu/fregfile.sv
index 4b001bc9..fd8e0f60 100644
--- a/wally-pipelined/src/fpu/fregfile.sv
+++ b/wally-pipelined/src/fpu/fregfile.sv
@@ -1,10 +1,9 @@
 ///////////////////////////////////////////
-// regfile.sv
 //
 // Written: David_Harris@hmc.edu 9 January 2021
-// Modified: 
+// Modified: James Stine 
 //
-// Purpose: 4-port register file
+// Purpose: 3-port output register file
 // 
 // A component of the Wally configurable RISC-V project.
 // 
@@ -26,22 +25,20 @@
 `include "wally-config.vh"
 
 module fregfile (
-  input  logic        clk, reset,
-  input  logic        we4, 
-  input  logic [ 4:0] a1, a2, a3, a4, 
-  input  logic [63:0] wd4,
+  input logic 	      clk, reset,
+  input logic 	      we4, 
+  input logic [4:0]   a1, a2, a3, a4, 
+  input logic [63:0]  wd4,
   output logic [63:0] rd1, rd2, rd3);
-
-  logic [63:0] rf[31:0];
-  integer i;
-
-  // three ported register file
-  // read three ports combinationally (A1/RD1, A2/RD2, A3/RD3)
-  // write fourth port on rising edge of clock (A4/WD4/WE4)
-  // write occurs on falling edge of clock
-  
-  // reset is intended for simulation only, not synthesis
-    
+   
+   logic [63:0]       rf[31:0];
+   integer 	      i;
+   
+   // three ported register file
+   // read three ports combinationally (A1/RD1, A2/RD2, A3/RD3)
+   // write fourth port on falling edge of clock (A4/WD4/WE4)
+   // write occurs on falling edge of clock   
+   
    always_ff @(negedge clk or posedge reset)
      if (reset) for(i=0; i<32; i++) rf[i] <= 0;
      else if (we4) rf[a4] <= wd4;	
diff --git a/wally-pipelined/src/fpu/fsm.sv b/wally-pipelined/src/fpu/fsm.sv
index 00f95993..a0e874bc 100755
--- a/wally-pipelined/src/fpu/fsm.sv
+++ b/wally-pipelined/src/fpu/fsm.sv
@@ -1,49 +1,63 @@
-module fsm (
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 9/28/2021
+//
+// Purpose: FSM for floating point divider/square root unit (Goldschmidt)
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
-   input logic 			clk,
-   input logic 			reset,
-   input logic 			start,
-   input logic  		op_type,
-   output logic 		done,      // End of cycles
-   output logic 		load_rega, // enable for regA
-   output logic 		load_regb, // enable for regB
-   output logic 		load_regc, // enable for regC
-   output logic 		load_regd, // enable for regD
-   output logic 		load_regr, // enable for rem
-   output logic 		load_regs, // enable for q,qm,qp 
-   output logic [2:0] 	sel_muxa,  // Select muxA
-   output logic [2:0] 	sel_muxb,  // Select muxB
-   output logic 		sel_muxr,  // Select rem mux
-   output logic			divBusy	   // calculation is happening
+module fsm (
+   input logic 	      clk,
+   input logic 	      reset,
+   input logic 	      start,
+   input logic 	      op_type,
+   output logic       done, 
+   output logic       load_rega, 
+   output logic       load_regb, 
+   output logic       load_regc, 
+   output logic       load_regd,
+   output logic       load_regr,
+   output logic       load_regs,
+   output logic [2:0] sel_muxa, 
+   output logic [2:0] sel_muxb, 
+   output logic       sel_muxr, 
+   output logic       divBusy	   
    );
 
-
-   reg [4:0] 	CURRENT_STATE;
-   reg [4:0] 	NEXT_STATE;   
-
-   parameter [4:0] 
-     S0=5'd0, S1=5'd1, S2=5'd2,
-     S3=5'd3, S4=5'd4, S5=5'd5,
-     S6=5'd6, S7=5'd7, S8=5'd8,
-     S9=5'd9, S10=5'd10,
-     S13=5'd13, S14=5'd14, S15=5'd15,     
-     S16=5'd16, S17=5'd17, S18=5'd18,
-     S19=5'd19, S20=5'd20, S21=5'd21,
-     S22=5'd22, S23=5'd23, S24=5'd24,
-     S25=5'd25, S26=5'd26, S27=5'd27,
-     S28=5'd28, S29=5'd29, S30=5'd30;
+   typedef enum       logic [4:0] {S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
+				   S10, S11, S12, S13, S14, S15, S16, S17, S18, S19,
+				   S20, S21, S22, S23, S24, S25, S26, S27, S28, S29,
+				   S30} statetype;
+   
+   statetype current_state, next_state;
    
    always @(negedge clk)
      begin
-	if(reset==1'b1)
-	  CURRENT_STATE=S0;
+	if (reset == 1'b1)
+	  current_state = S0;
 	else
-	  CURRENT_STATE=NEXT_STATE;
+	  current_state = next_state;
      end
 
    always @(*)
      begin
- 	case(CURRENT_STATE)
+ 	case(current_state)
 	  S0:  // iteration 0
 	    begin
 	       if (start==1'b0)
@@ -59,7 +73,7 @@ module fsm (
 		    sel_muxa = 3'b000;
 		    sel_muxb = 3'b000;
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S0;
+		    next_state = S0;
 		 end 
 	       else if (start==1'b1 && op_type==1'b0) 
 		 begin
@@ -74,7 +88,7 @@ module fsm (
 		    sel_muxa = 3'b001;
 		    sel_muxb = 3'b001;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S1;
+		    next_state = S1;
 		 end // if (start==1'b1 && op_type==1'b0)
 	       else if (start==1'b1 && op_type==1'b1) 
 		 begin
@@ -89,7 +103,7 @@ module fsm (
 		    sel_muxa = 3'b010;
 		    sel_muxb = 3'b000;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S13;
+		    next_state = S13;
 		 end 	   
 	       else
 		 begin
@@ -104,7 +118,7 @@ module fsm (
 		    sel_muxa = 3'b000;
 		    sel_muxb = 3'b000;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S0;
+		    next_state = S0;
 		 end
 	    end // case: S0
 	  S1:
@@ -120,7 +134,7 @@ module fsm (
 	       sel_muxa = 3'b010;
 	       sel_muxb = 3'b000;		    
 	       sel_muxr = 1'b0;	
-	       NEXT_STATE = S2;
+	       next_state = S2;
 	    end	  
 	  S2: // iteration 1
 	    begin
@@ -135,7 +149,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S3;
+	       next_state = S3;
 	    end
 	  S3:
 	    begin
@@ -150,7 +164,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S4;
+	       next_state = S4;
 	    end
 	  S4: // iteration 2
 	    begin
@@ -165,7 +179,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S5;
+	       next_state = S5;
 	    end
 	  S5:
 	    begin
@@ -180,7 +194,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;  // add
-	       NEXT_STATE = S6;
+	       next_state = S6;
 	    end
 	  S6: // iteration 3
 	    begin
@@ -195,7 +209,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S8;
+	       next_state = S8;
 	    end
 	  S7:
 	    begin
@@ -210,7 +224,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S8;
+	       next_state = S8;
 	    end // case: S7
 	  S8: // q,qm,qp
 	    begin
@@ -225,7 +239,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S9;
+	       next_state = S9;
 	    end 
 	  S9:  // rem
 	    begin
@@ -240,7 +254,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b1;
-	       NEXT_STATE = S10;
+	       next_state = S10;
 	    end 	  
 	  S10:  // done
 	    begin
@@ -255,7 +269,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end 
 	  S13:  // start of sqrt path
 	    begin
@@ -270,7 +284,7 @@ module fsm (
 	       sel_muxa = 3'b010;
 	       sel_muxb = 3'b001;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S14;
+	       next_state = S14;
 	    end
 	  S14:  
 	    begin
@@ -285,7 +299,7 @@ module fsm (
 	       sel_muxa = 3'b001;
 	       sel_muxb = 3'b100;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S15;
+	       next_state = S15;
 	    end 
 	  S15:  // iteration 1
 	    begin
@@ -300,7 +314,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S16;
+	       next_state = S16;
 	    end
 	  S16:  
 	    begin
@@ -315,7 +329,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S17;
+	       next_state = S17;
 	    end
 	  S17:  
 	    begin
@@ -330,7 +344,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S18;
+	       next_state = S18;
 	    end
 	  S18:  // iteration 2
 	    begin
@@ -345,7 +359,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S19;
+	       next_state = S19;
 	    end
 	  S19:  
 	    begin
@@ -360,7 +374,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S20;
+	       next_state = S20;
 	    end
 	  S20:  
 	    begin
@@ -375,7 +389,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S21;
+	       next_state = S21;
 	    end
 	  S21:  // iteration 3
 	    begin
@@ -390,7 +404,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S22;
+	       next_state = S22;
 	    end
 	  S22:  
 	    begin
@@ -405,7 +419,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S23;
+	       next_state = S23;
 	    end
 	  S23:  
 	    begin
@@ -420,7 +434,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S24;
+	       next_state = S24;
 	    end 
 	  S24: // q,qm,qp
 	    begin
@@ -435,7 +449,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S25;
+	       next_state = S25;
 	    end 	  
 	  S25:  // rem
 	    begin
@@ -450,7 +464,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b110;
 	       sel_muxr = 1'b1;
-	       NEXT_STATE = S26;
+	       next_state = S26;
 	    end 
 	  S26:  // done
 	    begin
@@ -465,7 +479,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end 
 	  default: 
 	    begin
@@ -480,9 +494,9 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end
-	endcase // case(CURRENT_STATE)	
-     end // always @ (CURRENT_STATE or X)   
+	endcase // case(current_state)	
+     end // always @ (current_state or X)   
 
 endmodule // fsm
diff --git a/wally-pipelined/src/fpu/rounder_div.sv b/wally-pipelined/src/fpu/rounder_div.sv
index ff7c4830..1d2ff1cc 100755
--- a/wally-pipelined/src/fpu/rounder_div.sv
+++ b/wally-pipelined/src/fpu/rounder_div.sv
@@ -1,37 +1,55 @@
+///////////////////////////////////////////
 //
-// The rounder takes as inputs a 64-bit value to be rounded, A, the 
-// exponent of the value to be rounded, the sign of the final result, Sign, 
-// the precision of the results, P, and the two-bit rounding mode, rm. 
-// It produces a rounded 52-bit result, Z, the exponent of the rounded 
-// result, Z_exp, and a flag that indicates if the result was rounded,
-// Inexact. The rounding mode has the following values.
-//	    rm		Mode
-//      00 		round-to-nearest-even
-//	    01 		round-toward-zero
-//      10 		round-toward-plus infinity
-//      11  	round-toward-minus infinity
+// Written: James Stine
+// Modified: 8/1/2018
 //
+// Purpose: Floating point divider/square root rounder unit (Goldschmidt)
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
 
 module rounder_div (
-    input logic [1:0]   rm,
-    input logic         P,
-    input logic         OvEn,
-    input logic         UnEn,
-    input logic [12:0]  exp_diff,
-    input logic [2:0]   sel_inv,
-    input logic         Invalid,
-    input logic 	    SignR,
-   
-    input logic [63:0]  q1,
-    input logic [63:0]  qm1,
-    input logic [63:0]  qp1,
-    input logic [63:0]  q0,
-    input logic [63:0]  qm0,
-    input logic [63:0]  qp0,   
+    input logic [1:0] 	rm,
+    input logic 	P,
+    input logic 	OvEn,
+    input logic 	UnEn,
+    input logic [12:0] 	exp_diff,
+    input logic [2:0] 	sel_inv,
+    input logic 	Invalid,
+    input logic 	SignR,
+    input logic [63:0] 	Float1,
+    input logic [63:0] 	Float2,
+    input logic 	XNaNQ,
+    input logic 	YNaNQ,
+    input logic 	XZeroQ,
+    input logic 	YZeroQ, 
+    input logic 	XInfQ,
+    input logic 	YInfQ,
+    input logic 	op_type, 
+    input logic [63:0] 	q1,
+    input logic [63:0] 	qm1,
+    input logic [63:0] 	qp1,
+    input logic [63:0] 	q0,
+    input logic [63:0] 	qm0,
+    input logic [63:0] 	qp0, 
     input logic [127:0] regr_out,
    
     output logic [63:0] Result,
-    output logic [4:0]  Flags
+    output logic [4:0] 	Flags
     );
       
    logic 	       Rsign;
@@ -56,11 +74,15 @@ module rounder_div (
    logic 	       Texp_l7z;
    logic 	       Texp_l7o;
    logic 	       OvCon;
-   logic           zero_rem;
-   logic [1:0] 	   mux_mant;
+   logic 	       zero_rem;
+   logic [1:0] 	       mux_mant;
    logic 	       sign_rem;
-   logic [63:0]    q, qm, qp;
-   logic 	       exp_ovf;   
+   logic [63:0]        q, qm, qp;
+   logic 	       exp_ovf;
+
+   logic [50:0]        NaN_out;
+   logic 	       NaN_Sign_out;   
+   logic 	       Sign_out;     
 
    // Remainder = 0?
    assign zero_rem = ~(|regr_out);
@@ -117,12 +139,11 @@ module rounder_div (
    // the input was infinite or NaN or the output of the adder is zero.
    // 00 = Valid
    // 10 = NaN
-   assign Valid = (~sel_inv[2]&~sel_inv[1]&~sel_inv[0]);
-   assign NaN = ~sel_inv[1]& sel_inv[0];
+   assign Valid = ~sel_inv[2]&~sel_inv[1]&~sel_inv[0];
+   assign NaN = sel_inv[2]&sel_inv[1]&sel_inv[0]; 
    assign UnderFlow = (P & UnFlow_SP | UnFlow_DP) & Valid;
    assign OverFlow  = (P & OvFlow_SP | OvFlow_DP) & Valid;
-   assign Div0 = sel_inv[2]&sel_inv[1]&~sel_inv[0];
-
+   assign Div0 = YZeroQ&~XZeroQ&~op_type&~NaN;   
 
    // The final result is Inexact if any rounding occurred ((i.e., R or S 
    // is one), or (if the result overflows ) or (if the result underflows and the 
@@ -161,18 +182,26 @@ module rounder_div (
    // If the result is zero or infinity, the mantissa is all zeros. 
    // If the result is NaN, the mantissa is 10...0
    // If the result the largest floating point number, the mantissa
-   // is all ones. Otherwise, the mantissa is not changed. 
-   assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero);
-   assign Rmant[50:0] = {51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}});
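+   // is all ones. Otherwise, the mantissa is not changed. When NaN is set, NaN_out (bits [50:0] of Float2 if only Y is NaN, else Float1) is ORed into Rmant[50:0].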
+   assign NaN_out = ~XNaNQ&YNaNQ ? Float2[50:0] : Float1[50:0];
+   assign NaN_Sign_out = ~XNaNQ&YNaNQ ? Float2[63] : Float1[63];
+   assign Sign_out = (XZeroQ&YZeroQ | XInfQ&YInfQ)&~op_type | Rsign&~XNaNQ&~YNaNQ | 
+   		     NaN_Sign_out&(XNaNQ|YNaNQ);
 
+   // FIXME (jes) - Imperas gives sNaN a Sign=0 where x86 gives Sign=1.
+   // Disabled extra term (presumably for Sign_out above): | Float1[63]&op_type;
+   assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero);
+   assign Rmant[50:0] = ({51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}}) |
+			(NaN_out&{51{NaN}}))&({51{~(op_type&Float1[63]&~XZeroQ)}});
+   
    // For single precision, the 8 least significant bits of the exponent
    // and 23 most significant bits of the mantissa contain bits used 
    // for the final result. A double precision result is returned if 
    // overflow has occurred, the overflow trap is enabled, and a conversion
    // is being performed. 
    assign OvCon = OverFlow & OvEn;
-   assign Result = (P&~OvCon) ? { {32{1'b1}}, Rsign, Rexp[7:0], Rmant[51:29]}
-	           : {Rsign, Rexp, Rmant};
+   assign Result = (P&~OvCon) ? { {32{1'b1}}, Sign_out, Rexp[7:0], Rmant[51:29]}
+	           : {Sign_out, Rexp, Rmant};
 
 endmodule // rounder
 
diff --git a/wally-pipelined/src/fpu/sbtm_a0.sv b/wally-pipelined/src/fpu/sbtm_a0.sv
index 83953787..61dd183b 100644
--- a/wally-pipelined/src/fpu/sbtm_a0.sv
+++ b/wally-pipelined/src/fpu/sbtm_a0.sv
@@ -1,5 +1,30 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_a0 (input  logic [6:0] a,
-		            output logic [12:0] y);
+		output logic [12:0] y);
+   
    always_comb
      case(a)
        7'b0000000: y = 13'b1111111100010;
@@ -137,4 +162,4 @@ endmodule // sbtm_a0
 
     
     
-    
\ No newline at end of file
+    
diff --git a/wally-pipelined/src/fpu/sbtm_a1.sv b/wally-pipelined/src/fpu/sbtm_a1.sv
index 76e4bdec..88845283 100644
--- a/wally-pipelined/src/fpu/sbtm_a1.sv
+++ b/wally-pipelined/src/fpu/sbtm_a1.sv
@@ -1,5 +1,30 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_a1 (input  logic [6:0] a,
-		            output logic [4:0] y);
+		output logic [4:0] y);
+   
    always_comb
      case(a)
        7'b0000000: y = 5'b11100;
@@ -137,4 +162,4 @@ endmodule // sbtm_a0
 
     
     
-    
\ No newline at end of file
+    
diff --git a/wally-pipelined/src/fpu/sbtm_a2.sv b/wally-pipelined/src/fpu/sbtm_a2.sv
index ae407ec8..8d32ad15 100755
--- a/wally-pipelined/src/fpu/sbtm_a2.sv
+++ b/wally-pipelined/src/fpu/sbtm_a2.sv
@@ -1,5 +1,30 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_a2 (input  logic [7:0] a,
-		            output logic [13:0] y);
+		output logic [13:0] y);
+   
    always_comb
      case(a)
        8'b01000000: y = 14'b10110100010111;
@@ -201,4 +226,4 @@ endmodule // sbtm_a0
 
     
     
-    
\ No newline at end of file
+    
diff --git a/wally-pipelined/src/fpu/sbtm_a3.sv b/wally-pipelined/src/fpu/sbtm_a3.sv
index c6b36793..5958c3bf 100755
--- a/wally-pipelined/src/fpu/sbtm_a3.sv
+++ b/wally-pipelined/src/fpu/sbtm_a3.sv
@@ -1,5 +1,30 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_a3 (input  logic [7:0] a,
-		            output logic [5:0] y);
+		output logic [5:0] y);
+   
    always_comb
      case(a)
        8'b01000000: y = 6'b100110;
diff --git a/wally-pipelined/src/fpu/sbtm_div.sv b/wally-pipelined/src/fpu/sbtm_div.sv
index 53b56dbd..999106d8 100644
--- a/wally-pipelined/src/fpu/sbtm_div.sv
+++ b/wally-pipelined/src/fpu/sbtm_div.sv
@@ -1,3 +1,27 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup for divide portion of fpdivsqrt
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_div (input logic [11:0] a, output logic [10:0] ia_out);
 
    // bit partitions
diff --git a/wally-pipelined/src/fpu/sbtm_sqrt.sv b/wally-pipelined/src/fpu/sbtm_sqrt.sv
index 27ffbecc..fdf0bb6d 100644
--- a/wally-pipelined/src/fpu/sbtm_sqrt.sv
+++ b/wally-pipelined/src/fpu/sbtm_sqrt.sv
@@ -1,3 +1,27 @@
+///////////////////////////////////////////
+//
+// Written: James Stine
+// Modified: 8/1/2018
+//
+// Purpose: Bipartite Lookup for sqrt part of fpdivsqrt
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
 module sbtm_sqrt (input logic [11:0] a, output logic [10:0] y);
 
    // bit partitions

From 2afa6e7a6e2930194b6af9ea43b9b020ca0cd8e3 Mon Sep 17 00:00:00 2001
From: "James E. Stine" <james.stine@okstate.edu>
Date: Wed, 6 Oct 2021 08:56:01 -0500
Subject: [PATCH 3/3] Add test vectors (TV) for the testbenches (to be added
 shortly); fma had to be left off due to size. The TVs were slightly modified
 within TestFloat to add underscores for readability. The scripts used to
 create these TVs are also included.

---
 .../testbench/fp/create_vectors32.csh         | 30 +++++++++++++++++++
 .../testbench/fp/create_vectors64.csh         | 30 +++++++++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100755 wally-pipelined/testbench/fp/create_vectors32.csh
 create mode 100755 wally-pipelined/testbench/fp/create_vectors64.csh

diff --git a/wally-pipelined/testbench/fp/create_vectors32.csh b/wally-pipelined/testbench/fp/create_vectors32.csh
new file mode 100755
index 00000000..1a43eb4c
--- /dev/null
+++ b/wally-pipelined/testbench/fp/create_vectors32.csh
@@ -0,0 +1,30 @@
+#!/bin/sh
+./testfloat_gen -rnear_even f32_add > f32_add_rne.tv  # rne: round to nearest, ties to even
+./testfloat_gen -rminMag f32_add > f32_add_rz.tv  # rz: round to minimum magnitude (toward zero)
+./testfloat_gen -rmax f32_add > f32_add_ru.tv  # ru: round to maximum (up)
+./testfloat_gen -rmin f32_add > f32_add_rd.tv  # rd: round to minimum (down)
+# Each command saves testfloat_gen's stdout as a .tv file in the current directory; the same four rounding-mode runs repeat for every operation. Next: f32_sub.
+./testfloat_gen -rnear_even f32_sub > f32_sub_rne.tv
+./testfloat_gen -rminMag f32_sub > f32_sub_rz.tv
+./testfloat_gen -rmax f32_sub > f32_sub_ru.tv
+./testfloat_gen -rmin f32_sub > f32_sub_rd.tv
+# f32_mul vectors.
+./testfloat_gen -rnear_even f32_mul > f32_mul_rne.tv
+./testfloat_gen -rminMag f32_mul > f32_mul_rz.tv
+./testfloat_gen -rmax f32_mul > f32_mul_ru.tv
+./testfloat_gen -rmin f32_mul > f32_mul_rd.tv
+# f32_mulAdd vectors; note the output files use the shorter name f32_fma_*.
+./testfloat_gen -rnear_even f32_mulAdd > f32_fma_rne.tv
+./testfloat_gen -rminMag f32_mulAdd > f32_fma_rz.tv
+./testfloat_gen -rmax f32_mulAdd > f32_fma_ru.tv
+./testfloat_gen -rmin f32_mulAdd > f32_fma_rd.tv
+# f32_div vectors.
+./testfloat_gen -rnear_even f32_div > f32_div_rne.tv
+./testfloat_gen -rminMag f32_div > f32_div_rz.tv
+./testfloat_gen -rmax f32_div > f32_div_ru.tv
+./testfloat_gen -rmin f32_div > f32_div_rd.tv
+# f32_sqrt vectors.
+./testfloat_gen -rnear_even f32_sqrt > f32_sqrt_rne.tv
+./testfloat_gen -rminMag f32_sqrt > f32_sqrt_rz.tv
+./testfloat_gen -rmax f32_sqrt > f32_sqrt_ru.tv
+./testfloat_gen -rmin f32_sqrt > f32_sqrt_rd.tv
diff --git a/wally-pipelined/testbench/fp/create_vectors64.csh b/wally-pipelined/testbench/fp/create_vectors64.csh
new file mode 100755
index 00000000..bb0c0fda
--- /dev/null
+++ b/wally-pipelined/testbench/fp/create_vectors64.csh
@@ -0,0 +1,30 @@
+#!/bin/sh
+./testfloat_gen -rnear_even f64_add > f64_add_rne.tv  # rne: round to nearest, ties to even
+./testfloat_gen -rminMag f64_add > f64_add_rz.tv  # rz: round to minimum magnitude (toward zero)
+./testfloat_gen -rmax f64_add > f64_add_ru.tv  # ru: round to maximum (up)
+./testfloat_gen -rmin f64_add > f64_add_rd.tv  # rd: round to minimum (down)
+# Each command saves testfloat_gen's stdout as a .tv file in the current directory; the same four rounding-mode runs repeat for every operation. Next: f64_sub.
+./testfloat_gen -rnear_even f64_sub > f64_sub_rne.tv
+./testfloat_gen -rminMag f64_sub > f64_sub_rz.tv
+./testfloat_gen -rmax f64_sub > f64_sub_ru.tv
+./testfloat_gen -rmin f64_sub > f64_sub_rd.tv
+# f64_mul vectors.
+./testfloat_gen -rnear_even f64_mul > f64_mul_rne.tv
+./testfloat_gen -rminMag f64_mul > f64_mul_rz.tv
+./testfloat_gen -rmax f64_mul > f64_mul_ru.tv
+./testfloat_gen -rmin f64_mul > f64_mul_rd.tv
+# f64_mulAdd vectors; note the output files use the shorter name f64_fma_*.
+./testfloat_gen -rnear_even f64_mulAdd > f64_fma_rne.tv
+./testfloat_gen -rminMag f64_mulAdd > f64_fma_rz.tv
+./testfloat_gen -rmax f64_mulAdd > f64_fma_ru.tv
+./testfloat_gen -rmin f64_mulAdd > f64_fma_rd.tv
+# f64_div vectors.
+./testfloat_gen -rnear_even f64_div > f64_div_rne.tv
+./testfloat_gen -rminMag f64_div > f64_div_rz.tv
+./testfloat_gen -rmax f64_div > f64_div_ru.tv
+./testfloat_gen -rmin f64_div > f64_div_rd.tv
+# f64_sqrt vectors.
+./testfloat_gen -rnear_even f64_sqrt > f64_sqrt_rne.tv
+./testfloat_gen -rminMag f64_sqrt > f64_sqrt_rz.tv
+./testfloat_gen -rmax f64_sqrt > f64_sqrt_ru.tv
+./testfloat_gen -rmin f64_sqrt > f64_sqrt_rd.tv