Make changes to fpdiv. Still working on a clock issue with the FSM, which was changed from posedge to negedge. Also updated fpdivsqrt rounding to handle TestFloat.

2021-10-06 08:26:09 -05:00 · 2021-10-06 08:26:09 -05:00 · a91c0c8fc7
commit a91c0c8fc7
parent 5bcae393c9
13 changed files with 698 additions and 576 deletions
--- a/wally-pipelined/src/fpu/convert_inputs.sv
+++ b/wally-pipelined/src/fpu/convert_inputs.sv
@ -1,9 +1,26 @@
-// This module takes as inputs two operands (op1 and op2) 
+///////////////////////////////////////////
-// the operation type (op_type) and the result precision (P). 
+//
-// Based on the operation and precision , it conditionally
+// Written: James Stine
-// converts single precision values to double precision values
+// Modified: 8/1/2018
-// and modifies the sign of op1. The converted operands are Float1
+//
-// and Float2.
+// Purpose: Floating point divider/square root top unit (Goldschmidt)
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 module convert_inputs(
   input [63:0]  op1,      // 1st input operand (A)
--- a/wally-pipelined/src/fpu/exception_div.sv
+++ b/wally-pipelined/src/fpu/exception_div.sv
@ -23,9 +23,10 @@ module exception_div (
   logic 	      BNaN; 		// '1' if B is a not-a-number
   logic 	      ASNaN;	 	// '1' if A is a signalling not-a-number
   logic 	      BSNaN;	 	// '1' if B is a signalling not-a-number
-   logic 	      ZQNaN;	 	// '1' if result Z is a quiet NaN
+   logic 	      ZSNaN;	 	// '1' if result Z is a NaN (driven by Invalid | ANaN | BNaN). NOTE(review): name says SNaN but nearby comments say quiet NaN - confirm
   logic 	      ZInf;	 	// '1' if result Z is an infnity
-   logic 	      Zero;             // '1' if result is zero   
+   logic 	      Zero;             // '1' if result is zero
   logic              NegSqrt;          // '1' if sqrt and operand is negative   
   //***take this module out and add more registers or just recalculate it all
   // Determine if mantissas are all zeros
@ -48,32 +49,34 @@ module exception_div (
   // An operand is zero only when both its exponent field and its mantissa
   // field are all zeros.
   assign AZero = AzeroE & AzeroM;
   // Fix: BZero previously ANDed BzeroE with itself, so the mantissa check was
   // dropped and any B with a zero exponent (i.e. a denormal) was flagged as zero.
   // NOTE(review): assumes BzeroM is declared alongside AzeroM - confirm.
   assign BZero = BzeroE & BzeroM;
   // Result is NaN if the operation is a sqrt and the operand is negative and nonzero
   assign NegSqrt = (A[63] & op_type & ~AZero);
   // An "Invalid Operation" exception occurs if (A or B is a signalling NaN),
   // or (for divide: A and B are both Infinite, or A and B are both Zero),
   // or (for sqrt: the operand is negative)
   assign Invalid = ASNaN | BSNaN | (((AInf & BInf) | (AZero & BZero))&~op_type) | 
-		    (A[63] & op_type);
+		    NegSqrt;
   // The result is a quiet NaN if (an "Invalid Operation" exception occurs) 
   // or (A is a NaN) or (B is a NaN).
-   assign ZQNaN = Invalid | ANaN | BNaN;
+   assign ZSNaN = Invalid | ANaN | BNaN;
   //  The result is zero
   assign Zero = (AZero | BInf)&~op_type | AZero&op_type;   
   // The result is +Inf if ((A is Inf) or (B is 0)) and (the
   // result is not a quiet NaN).  
-   assign ZInf = (AInf | BZero)&~ZQNaN&~op_type | AInf&op_type&~ZQNaN;   
+   assign ZInf = (AInf | BZero)&~ZSNaN&~op_type | AInf&op_type&~ZSNaN;   
   // Set the type of the result as follows:
   // Ztype	Result 
   //  000     Normal
   //  001     Quiet NaN
   //  010     Infinity
   //  011     Zero
-   //  110     DivZero
+   //  110     Div by 0
-   assign Ztype[0] = ZQNaN | Zero;
+   //  111     SNaN
-   assign Ztype[1] = ZInf | Zero;
+   assign Ztype[2] = (ZSNaN);
-   assign Ztype[2] = BZero&~op_type;   
+   assign Ztype[1] = (ZSNaN) | (Zero) | (ZInf);
-
+   assign Ztype[0] = (ZSNaN) | (Zero);
 endmodule // exception
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@ -1,92 +1,86 @@
 ///////////////////////////////////////////
 //
-// File name : fpdiv
+// Written: James Stine
-// Title     : Floating-Point Divider/Square-Root
+// Modified: 8/1/2018
 // project   : FPU
 // Library   : fpdiv
 // Author(s) : James E. Stine, Jr.
 // Purpose   : definition of main unit to floating-point div/sqrt
 // notes :   
 //
-// Copyright Oklahoma State University
+// Purpose: Floating point divider/square root top unit (Goldschmidt)
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
-// Basic Operations
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
-// Step 1: Load operands, set flags, and convert SP to DP
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 // Step 2: Check for special inputs ( +/- Infinity,  NaN)
 // Step 3: Exponent Logic
 // Step 4: Divide/Sqrt using Goldschmidt
 // Step 5: Normalize the result.//
 //   Shift left until normalized.  Normalized when the value to the 
 //   left of the binary point is 1.
 // Step 6: Round the result.// 
 // Step 7: Put quotient/remainder onto output.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 // `timescale 1ps/1ps
 module fpdiv (
-   input logic 	      clk,
+  input logic 	      clk,
-   input logic 	      reset,
+  input logic 	      reset,
-   input logic 	      start,
+  input logic 	      start,
-   input logic [63:0]   op1,		// 1st input operand (A)
+  input logic [63:0]  op1, 
-   input logic [63:0]   op2,		// 2nd input operand (B)
+  input logic [63:0]  op2, 
-   input logic [1:0]    rm,		// Rounding mode - specify values 
+  input logic [1:0]   rm, 
-   input logic 	      op_type,	// Function opcode
+  input logic 	      op_type, 
-   input logic 	      P,   		// Result Precision (0 for double, 1 for single)
+  input logic 	      P, 
-   input logic 	      OvEn,		// Overflow trap enabled
+  input logic 	      OvEn, 
-   input logic 	      UnEn,   	// Underflow trap enabled
+  input logic 	      UnEn,
-   output logic         done,
+  input logic 	      XNaNQ,
-   output logic         FDivBusyE,
+  input logic 	      YNaNQ,
-   output logic [63:0]  AS_Result,	// Result of operation
+  input logic 	      XZeroQ,
-   output logic [4:0]   Flags);   	// IEEE exception flags 
+  input logic 	      YZeroQ,
  input logic 	      XInfQ,
  input logic 	      YInfQ, 
-
+  output logic 	      done,
-   logic [63:0]   Float1; 
+  output logic 	      FDivBusyE,
-   logic [63:0] 	Float2;
+  output logic [63:0] AS_Result, 
  output logic [4:0]  Flags);
-   logic [12:0] 	exp1, exp2, expF;
+   logic [63:0]       Float1; 
-   logic [12:0] 	exp_diff, bias;
+   logic [63:0]       Float2;
   logic [13:0] 	exp_sqrt;
   logic [12:0] 	exp_s;
   logic [12:0] 	exp_c;
-   logic [10:0] 	exponent;
+   logic [12:0]       exp1, exp2, expF;
-   logic [63:0] 	Result;   
+   logic [12:0]       exp_diff, bias;
-   logic [52:0] 	mantissaA;
+   logic [13:0]       exp_sqrt;
-   logic [52:0] 	mantissaB; 
+   logic [63:0]       Result;   
   logic [52:0]       mantissaA;
   logic [52:0]       mantissaB; 
-   logic [2:0] 	sel_inv;
+   logic [2:0] 	      sel_inv;
-   logic		      Invalid;
+   logic 	      Invalid;
-   logic [4:0] 	FlagsIn;   	
+   logic [4:0] 	      FlagsIn;   	
   logic 	      signResult;      
   logic 	      convert;
-   logic          sub;
+   logic 	      sub;
-   logic [63:0] 	q1, qm1, qp1, q0, qm0, qp0;
+   logic [63:0]       q1, qm1, qp1, q0, qm0, qp0;
-   logic [63:0] 	rega_out, regb_out, regc_out, regd_out;
+   logic [63:0]       rega_out, regb_out, regc_out, regd_out;
-   logic [127:0]  regr_out;
+   logic [127:0]      regr_out;
-   logic [2:0] 	sel_muxa, sel_muxb;
+   logic [2:0] 	      sel_muxa, sel_muxb;
   logic 	      sel_muxr;   
   logic 	      load_rega, load_regb, load_regc, load_regd, load_regr;
   logic 	      load_regs;
   logic          exp_cout1, exp_cout2;
   logic          exp_odd, open;
-   // div/sqrt
+   logic 	      load_regs;
-         //  fdiv  = 0
+   logic 	      exp_cout1, exp_cout2;
-         //  fsqrt = 1
+   logic 	      exp_odd, open;
   //  op_type : fdiv=0, fsqrt=1
   assign Float1 = op1;
   assign Float2 = op_type ? op1 : op2;   
-
+   
-   // Test for exceptions and return the "Invalid Operation" and
+   // Exception detection
-   // "Denormalized" Input Flags. The "sel_inv" is used in
+   exception_div exc1 (.A(Float1), .B(Float2), .op_type, .Ztype(sel_inv), .Invalid);
-   // the third pipeline stage to select the result. Also, op1_Norm
+   
   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
   // sub is one if the effective operation is subtraction.
   exception_div exc1 (.A(Float1), .B(Float2), .op_type,
                     // output:
                     .Ztype(sel_inv), .Invalid);
   // Determine Sign/Mantissa
   assign signResult = (Float1[63]^Float2[63]);
   assign mantissaA = {1'b1, Float1[51:0]};
@ -103,29 +97,30 @@ module fpdiv (
   assign {exp_cout2, exp_sqrt} = {1'b0, exp1} + {4'h0, 10'h3ff} + exp_odd;
   // Choose correct exponent
   assign expF = op_type ? exp_sqrt[13:1] : exp_diff;   
-
+   
   // Main Goldschmidt/Division Routine   
   divconv goldy (.q1, .qm1, .qp1, .q0, .qm0, .qp0, .rega_out, .regb_out, .regc_out, .regd_out,
 		  .regr_out, .d(mantissaB), .n(mantissaA), .sel_muxa, .sel_muxb, .sel_muxr, 
 		  .reset, .clk,  .load_rega, .load_regb, .load_regc, .load_regd,
 		  .load_regr, .load_regs, .P, .op_type, .exp_odd);
-
+   
   // FSM : control divider   
   fsm control (.clk, .reset, .start, .op_type,
-               // outputs:
+		.done, .load_rega, .load_regb, .load_regc, .load_regd, 
-               .done, .load_rega, .load_regb, .load_regc, .load_regd, 
+		.load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
-		         .load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
+		.divBusy(FDivBusyE));
 		         .divBusy(FDivBusyE));
   // Round the mantissa to a 52-bit value, with the leading one
   // removed. The rounding units also handles special cases and 
   // set the exception flags.   
   rounder_div round1 (.rm, .P, .OvEn, .UnEn, .exp_diff(expF), 
-   		            .sel_inv, .Invalid, .SignR(signResult), 
+   		       .sel_inv, .Invalid, .SignR(signResult),
-		               .q1, .qm1, .qp1, .q0, .qm0, .qp0, .regr_out, 
+		       .Float1(op1), .Float2(op2),
-                     // outputs:
+		       .XNaNQ, .YNaNQ, .XZeroQ, .YZeroQ, 
-                     .Result, .Flags(FlagsIn));
+		       .XInfQ, .YInfQ, .op_type,		       
-
+		       .q1, .qm1, .qp1, .q0, .qm0, .qp0, .regr_out, 
                       .Result, .Flags(FlagsIn));
   // Store the final result and the exception flags in registers.
   flopenr #(64) rega (clk, reset, done, Result, AS_Result);  
   flopenr #(5) regc (clk, reset, done, FlagsIn, Flags);   
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -1,6 +1,6 @@
 ///////////////////////////////////////////
 //
-// Written: Katherine Parry, Bret Mathis
+// Written: Katherine Parry, James Stine, Brett Mathis
 // Modified: 6/23/2021
 //
 // Purpose: FPU
@ -25,24 +25,24 @@
 `include "wally-config.vh"
 module fpu (
-  input logic 		          clk,
+  input logic 		   clk,
-  input logic 		          reset,
+  input logic 		   reset,
-  input logic  [2:0] 	      FRM_REGW, // Rounding mode from CSR
+  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
-  input logic  [31:0]       InstrD,   // instruction from IFU
+  input logic [31:0] 	   InstrD, // instruction from IFU
-  input logic  [`XLEN-1:0]  ReadDataW,// Read data from memory
+  input logic [`XLEN-1:0]  ReadDataW,// Read data from memory
-  input logic  [`XLEN-1:0]  SrcAE,    // Integer input being processed (from IEU)
+  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed (from IEU)
-  input logic  [`XLEN-1:0]  SrcAM,    // Integer input being written into fpreg (from IEU)
+  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg (from IEU)
-  input logic 		          StallE, StallM, StallW, // stall signals from HZU
+  input logic 		   StallE, StallM, StallW, // stall signals from HZU
-  input logic 		          FlushE, FlushM, FlushW, // flush signals from HZU
+  input logic 		   FlushE, FlushM, FlushW, // flush signals from HZU
-  input logic  [4:0] 	      RdE, RdM, RdW,  // which FP register to write to (from IEU)
+  input logic [4:0] 	   RdE, RdM, RdW, // which FP register to write to (from IEU)
-  output logic 		          FRegWriteM,     // FP register write enable
+  output logic 		   FRegWriteM, // FP register write enable
-  output logic 		          FStallD,        // Stall the decode stage
+  output logic 		   FStallD, // Stall the decode stage
-  output logic 		          FWriteIntE, FWriteIntM, FWriteIntW, // integer register write enable
+  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW, // integer register write enable
-  output logic [`XLEN-1:0]  FWriteDataE,  // Data to be written to memory
+  output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
-  output logic [`XLEN-1:0]  FIntResM,     // data to be written to integer register
+  output logic [`XLEN-1:0] FIntResM, // data to be written to integer register
-  output logic 		          FDivBusyE,    // Is the divide/sqrt unit busy (stall execute stage)
+  output logic 		   FDivBusyE, // Is the divide/sqrt unit busy (stall execute stage)
-  output logic 		          IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+  output logic 		   IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
-  output logic [4:0] 	      SetFflagsM        // FMA flags (to privileged unit)
+  output logic [4:0] 	   SetFflagsM        // FMA flags (to privileged unit)
  );
  //*** make everything FLEN at some point
@ -59,338 +59,257 @@ module fpu (
  generate if (`F_SUPPORTED | `D_SUPPORTED) begin : fpu
-  // control signals
+     // control signals
-	logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
+     logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
-	logic [2:0] FrmD, FrmE, FrmM;                   // FP rounding mode
+     logic [2:0] 	  FrmD, FrmE, FrmM;                   // FP rounding mode
-	logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
+     logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
-	logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
+     logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
-	logic 		  FWriteIntD;                         // Write to integer register
+     logic 		  FWriteIntD;                         // Write to integer register
-	logic [1:0] FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
+     logic [1:0] 	  FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
-	logic [1:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
+     logic [1:0] 	  FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
-	logic [2:0] FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which opperation to do in each component
+     logic [2:0] 	  FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which opperation to do in each component
-	logic [2:0] FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
+     logic [2:0] 	  FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
-	logic [1:0] FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer resister
+     logic [1:0] 	  FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer resister
-	logic [4:0] Adr1E, Adr2E, Adr3E;                    // adresses of each input
+     logic [4:0] 	  Adr1E, Adr2E, Adr3E;                    // adresses of each input
     // regfile signals
     logic [63:0] 	  FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
     logic [63:0] 	  FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
     logic [63:0] 	  FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
     logic [63:0] 	  FPreSrcYE, FSrcYE;               // Input 2 to the various units (after forwarding)
     logic [63:0] 	  FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
     // unpacking signals
     logic 		  XSgnE, YSgnE, ZSgnE;     // input's sign - execute stage
     logic 		  XSgnM, YSgnM;     // input's sign - memory stage
     logic [10:0] 	  XExpE, YExpE, ZExpE;     // input's exponent - execute stage
     logic [10:0] 	  XExpM, YExpM, ZExpM;     // input's exponent - memory stage
     logic [52:0] 	  XManE, YManE, ZManE;  // input's fraction - execute stage
     logic [52:0] 	  XManM, YManM, ZManM;  // input's fraction - memory stage
     logic [10:0] 	  BiasE;                   // bias based on precision (single=7f double=3ff - max expoent/2)
     logic 		  XNaNE, YNaNE, ZNaNE;           // is the input a NaN - execute stage
     logic 		  XNaNM, YNaNM, ZNaNM;           // is the input a NaN - memory stage
     logic 		  XSNaNE, YSNaNE, ZSNaNE;        // is the input a signaling NaN - execute stage
     logic 		  XSNaNM, YSNaNM, ZSNaNM;        // is the input a signaling NaN - memory stage
     logic 		  XDenormE, YDenormE, ZDenormE;  // is the input denormalized
     logic 		  XZeroE, YZeroE, ZZeroE;        // is the input zero - execute stage
     logic 		  XZeroM, YZeroM, ZZeroM;        // is the input zero - memory stage
     logic 		  XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
     logic 		  XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
     logic 		  XExpMaxE;                      // is the exponent all ones (max value)
     logic 		  XNormE;                 // is normal     
     // result and flag signals
     logic [63:0] 	  FDivResM, FDivResW; // divide/squareroot result
     logic [4:0] 	  FDivFlgM, FDivFlgW; // divide/squareroot flags  
     logic [63:0] 	  FMAResM, FMAResW;   // FMA/multiply result
     logic [4:0] 	  FMAFlgM, FMAFlgW;   // FMA/multiply result	
     logic [63:0] 	  ReadResW;           // read result (load instruction)
     logic [63:0] 	  CvtFpResE, CvtFpResM, CvtFpResW; // add/FP -> FP convert result
     logic [4:0] 	  CvtFpFlgE, CvtFpFlgM, CvtFpFlgW; // add/FP -> FP convert flags
     logic [63:0] 	  CvtResE, CvtResM;   // FP <-> int convert result
     logic [4:0] 	  CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this	
     logic [63:0] 	  ClassResE, ClassResM; // classify result
     logic [63:0] 	  CmpResE, CmpResM; // compare result
     logic 		  CmpNVE, CmpNVM;   // compare invalid flag (Not Valid)     
     logic [63:0] 	  SgnResE, SgnResM; // sign injection result
     logic 		  SgnNVE, SgnNVM;   // sign injection invalid flag (Not Valid)     
     logic [63:0] 	  FResE, FResM, FResW;     // selected result that is ready in the memory stage
     logic [4:0] 	  FFlgE, FFlgM;            // selected flag that is ready in the memory stage     
     logic [`XLEN-1:0] 	  FIntResE;     
     logic [63:0] 	  FPUResultW;    // final FP result being written to the FP register
     // other signals
     logic 		  FDivSqrtDoneE;          // is divide done
     logic [63:0] 	  DivInput1E, DivInput2E; // inputs to divide/squareroot unit
     logic 		  FDivClk;                // clock for divide/squareroot unit
     logic [63:0] 	  AlignedSrcAE;           // align SrcA to the floating point format
     // DECODE STAGE
     // calculate FP control signals
     fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
 		  .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
 		  .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
-	// regfile signals
+     // FP register file
-	logic [63:0] 	    FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
+     //    - can read 3 registers and write 1 register every cycle
-	logic [63:0] 	    FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
+     fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
-	logic [63:0] 	    FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
+			.a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
-	logic [63:0] 	    FPreSrcYE, FSrcYE;               // Input 2 to the various units (after forwarding)
+			.wd4(FPUResultW),
-	logic [63:0] 	    FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
+			.rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
-	
+
-	// unpacking signals
+     // D/E pipeline registers
-	logic 		   XSgnE, YSgnE, ZSgnE;     // input's sign - execute stage
+     flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
-	logic 		   XSgnM, YSgnM;     // input's sign - memory stage
+     flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
-	logic [10:0] XExpE, YExpE, ZExpE;     // input's exponent - execute stage
+     flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-	logic [10:0] XExpM, YExpM, ZExpM;     // input's exponent - memory stage
+     flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-	logic [52:0] XManE, YManE, ZManE;  // input's fraction - execute stage
+                             {Adr1E, Adr2E, Adr3E});
-	logic [52:0] XManM, YManM, ZManM;  // input's fraction - memory stage
+     flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-	logic [10:0] BiasE;                   // bias based on precision (single=7f double=3ff - max expoent/2)
+			       {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
-	logic 		   XNaNE, YNaNE, ZNaNE;           // is the input a NaN - execute stage
+			       {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
-	logic 		   XNaNM, YNaNM, ZNaNM;           // is the input a NaN - memory stage
+
-	logic 		   XSNaNE, YSNaNE, ZSNaNE;        // is the input a signaling NaN - execute stage
+     // EXECUTION STAGE
-	logic 		   XSNaNM, YSNaNM, ZSNaNM;        // is the input a signaling NaN - memory stage
+     // Hazard unit for FPU  
-	logic 		   XDenormE, YDenormE, ZDenormE;  // is the input denormalized
+     //    - determines if any forwarding or stalls are needed
-	logic 		   XZeroE, YZeroE, ZZeroE;        // is the input zero - execute stage
+     fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
-	logic 		   XZeroM, YZeroM, ZZeroM;        // is the input zero - memory stage
+                     .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
-	logic 		   XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
+     
-	logic 		   XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
+     // forwarding muxs
-	logic 		   XExpMaxE;                      // is the exponent all ones (max value)
+     mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
-	logic 		   XNormE;                 // is normal
+     mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
-	
+     mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
-	
+     mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, 
-	// result and flag signals
+			   {2'b0, {10{1'b1}}, 52'b0}, 
-	logic [63:0]  FDivResM, FDivResW; // divide/squareroot result
+			   {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01)}, 
-	logic [4:0] 	FDivFlgM, FDivFlgW; // divide/squareroot flags
+			   FSrcYE); // Force Z to be 0 for multiply instructions
     // Force Z to be 0 for multiply instructions     
     mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE);
     // unpacking unit
     //    - splits FP inputs into their various parts
     //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infinity)
     unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
 			 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
 			 .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
 			 .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
     // FMA
     //   - two stage FMA
     //   - execute stage - multiplication and addend shifting
     //   - memory stage  - addition and rounding
     //   - handles FMA and multiply instructions
     fma fma (.clk, .reset, .FlushM, .StallM, 
 	      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
 	      .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
 	      .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
 	      .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
 	      .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
 	      .FOpCtrlE,
 	      .FmtE, .FmtM, .FrmM, 
 	      .FMAFlgM, .FMAResM);
     // clock gater
     //    - creates a clock that only runs during divide/sqrt instructions
     //    - using the separate clock gives the divide/sqrt unit some time to get set up
     // *** the module says not to use in synthesis
     clockgater fpdivclkg(.E(FDivStartE),
 			  .SE(1'b0),
 			  .CLK(clk),
 			  .ECLK(FDivClk));
     // capture the inputs for divide/sqrt
     //    - if not captured, any forwarded inputs will change during computation
     //        - this problem is caused by stalling the execute stage
     //    - the other units don't have this problem, only div/sqrt stalls the execute stage
     flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
 				.en(1'b1), .clear(FDivSqrtDoneE),
 				.reset(reset),  .clk(FDivBusyE));
     flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
 				.en(1'b1), .clear(FDivSqrtDoneE),
 				.reset(reset),  .clk(FDivBusyE));
      flopenrc #(6) reg_input3 (.d({XNaNE, YNaNE, XInfE, YInfE, XZeroE, YZeroE}), 
 				.q({XNaNQ, YNaNQ, XInfQ, YInfQ, XZeroQ, YZeroQ}),
 				.en(1'b1), .clear(FDivSqrtDoneE),
 				.reset(reset),  .clk(FDivBusyE));
      // fpdivsqrt using Goldschmidt's iteration
      fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
 		      .reset, .clk(FDivClk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
 		      .XNaNQ, .YNaNQ, .XInfQ, .YInfQ, .XZeroQ, .YZeroQ,
 		      .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
     // convert from single to double and vice versa
     cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
     // compare unit
     //    - computation is done in one stage
     //    - writes to FP file during min/max instructions
     //    - other comparisons write a 1 or 0 to the integer register
     fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
 		.FSrcXE, .FSrcYE, .FOpCtrlE, 
 		.FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
 		.Invalid(CmpNVE), .CmpResE);
     // sign injection unit
     //    - computation is done in one stage
     fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
 		.SgnNVE, .SgnResE);
     // classify
     //    - computation is done in one stage
     //    - most of the work is done in the unpacking unit
     //    - result is written to the integer register
     fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
 			  .XSNaNE, .ClassResE);
     fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
 		.CvtResE, .CvtFlgE);
     // data to be stored in memory - to IEU
     //    - FP uses NaN-boxing format
     //        - if there are any unused bits, the most significant bits are filled with 1s
     assign FWriteDataE = FSrcYE[`XLEN-1:0];     
     // Align SrcA to MSB when single precision
     mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAE[31:0]}, {{64-`XLEN{1'b1}}, SrcAE}, FmtE, AlignedSrcAE);
     // select a result that may be written to the FP register
     mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
     mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
     // select the result that may be written to the integer register - to IEU
     mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], 
 			       CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
     // E/M pipe registers
     // flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
     flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
     flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
     flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
     flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
 			     {XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
 			     {XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
     flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); 
     flopenrc #(5)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM);      
     flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
     flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM,
 			      {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
 			      {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
     // BEGIN MEMORY STAGE
     // FPU flag selection - to privileged
     mux4  #(5)  FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
-	logic [63:0]  FMAResM, FMAResW;   // FMA/multiply result
+     // M/W pipe registers
-	logic [4:0] 	FMAFlgM, FMAFlgW;   // FMA/multiply result
+     flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-	
+     flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
-	logic [63:0] 	ReadResW;           // read result (load instruction)
+     flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, CvtFpResM, CvtFpResW); 
     flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
     flopenrc #(5)  MWCtrlReg(clk, reset, FlushW, ~StallW,
 			      {FRegWriteM, FResultSelM, FmtM, FWriteIntM},
 			      {FRegWriteW, FResultSelW, FmtW, FWriteIntW});
     // BEGIN WRITEBACK STAGE
     // put ReadData into NaN-blocking format
     //    - if there are any unused bits the most significant bits are filled with 1s
     //    - for load instruction
     mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
     // select the result to be written to the FP register
     mux4  #(64)  FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
 	logic [63:0] 	CvtFpResE, CvtFpResM, CvtFpResW; // add/FP -> FP convert result
 	logic [4:0] 	CvtFpFlgE, CvtFpFlgM, CvtFpFlgW; // add/FP -> FP convert flags
 	logic [63:0] 	CvtResE, CvtResM;   // FP <-> int convert result
 	logic [4:0] 	CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this
 	logic [63:0] 	ClassResE, ClassResM; // classify result
 	logic [63:0] 	CmpResE, CmpResM; // compare result
 	logic 		    CmpNVE, CmpNVM;   // compare invalid flag (Not Valid)
 	logic [63:0] 	SgnResE, SgnResM; // sign injection result
 	logic 		    SgnNVE, SgnNVM;   // sign injection invalid flag (Not Valid)
 	logic [63:0] 	FResE, FResM, FResW;     // selected result that is ready in the memory stage
 	logic [4:0] 	FFlgE, FFlgM;            // selected flag that is ready in the memory stage
 	logic [`XLEN-1:0]  FIntResE;
 	logic [63:0] 	   FPUResultW;    // final FP result being written to the FP register
 	// other signals
 	logic 		    FDivSqrtDoneE;          // is divide done
 	logic [63:0] 	DivInput1E, DivInput2E; // inputs to divide/squareroot unit
 	logic 		    FDivClk;                // clock for divide/squareroot unit
 	logic [63:0] 	AlignedSrcAE;           // align SrcA to the floating point format
  ////////////////////////////////////////////////////////////////////////////////////////
 	//DECODE STAGE
 	////////////////////////////////////////////////////////////////////////////////////////
 	// calculate FP control signals
 	fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
              // outputs:
              .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
              .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
 	// FP register file
  //    - can read 3 registers and write 1 register every cycle
 	fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
 			   .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
         .wd4(FPUResultW),
         // outputs:
 			   .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
 	////////////////////////////////////////////////////////////////////////////////////////
 	// D/E pipeline registers
 	////////////////////////////////////////////////////////////////////////////////////////
 	flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
 	flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
 	flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
 	flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
                                                       {Adr1E,         Adr2E,         Adr3E});
 	flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
 				  {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
 				  {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
 	////////////////////////////////////////////////////////////////////////////////////////
 	//EXECUTION STAGE
 	////////////////////////////////////////////////////////////////////////////////////////
 	// Hazard unit for FPU  
  //    - determines if any forwarding or stalls are needed
 	fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
                  // outputs:
                  .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
 	// forwarding muxs
 	mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
 	mux3  #(64)  fyemux(FRD2E, FPUResultW, FResM, FForwardYE, FPreSrcYE);
 	mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
 	mux3  #(64)  fyaddmux(FPreSrcYE, {{32{1'b1}}, 2'b0, {7{1'b1}}, 23'b0}, {2'b0, {10{1'b1}}, 52'b0}, {FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01), ~FmtE&FOpCtrlE[2]&FOpCtrlE[1]&(FResultSelE==3'b01)}, FSrcYE); // Force Y to be 1.0 (NaN-boxed single or double encoding, chosen by FmtE) when FOpCtrlE[2:1] == 2'b11 and FResultSelE == 1; otherwise pass FPreSrcYE through -- presumably so add-type instructions can reuse the FMA, TODO confirm
 	mux3  #(64)  fzmulmux(FPreSrcZE, 64'b0, FPreSrcYE, {FOpCtrlE[2]&FOpCtrlE[1], FOpCtrlE[2]&~FOpCtrlE[1]}, FSrcZE); // Force Z to be 0 for multiply instructions
  // unpacking unit
  //    - splits FP inputs into their various parts
  //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infinity)
 	unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE, .FmtE, 
                      // outputs:
                      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
                      .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
                      .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
  // FMA
  //    - two stage FMA
  //        - execute stage - multiplication and addend shifting
  //        - memory stage  - addition and rounding
  //    - handles FMA and multiply instructions
  //    - contains some E/M pipeline registers
  // *** currently handles FLEN and 32 bits(dont know if 32 works with 128 - easy to fix) - change to handle only the supported formats
 	fma fma (.clk, .reset, .FlushM, .StallM, 
 		 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
     .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
 		 .XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
     .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
     .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
 		 .FOpCtrlE,
 		 .FmtE, .FmtM, .FrmM, 
     // outputs:
     .FMAFlgM, .FMAResM);
 	// clock gater
  //    - creates a clock that only runs during divide/sqrt instructions
  //    - using the separate clock gives the divide/sqrt unit some time to get set up
  // *** the module says not to use in synthesis
 	clockgater fpdivclkg(.E(FDivStartE),
 			     .SE(1'b0),
 			     .CLK(clk),
 			     .ECLK(FDivClk));
 	// capture the inputs for divide/sqrt
  //    - if not captured any forwarded inputs will change during computation
  //        - this problem is caused by stalling the execute stage
  //    - the other units don't have this problem, only div/sqrt stalls the execute stage
 	flopenrc #(64) reg_input1 (.d({XSgnE, XExpE, XManE[51:0]}), .q(DivInput1E),
 				   .en(1'b1), .clear(FDivSqrtDoneE),
 				   .reset(reset),  .clk(FDivBusyE));
 	flopenrc #(64) reg_input2 (.d({YSgnE, YExpE, YManE[51:0]}), .q(DivInput2E),
 				   .en(1'b1), .clear(FDivSqrtDoneE),
 				   .reset(reset),  .clk(FDivBusyE));
 	// output for store instructions
  //*** change to use the unpacking unit if possible
 	fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
 			             .reset, .clk(FDivClk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
                   // outputs:
 			             .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
 	// convert from single to double and vice versa
 	cvtfp cvtfp (.XExpE, .XManE, .XSgnE, .XZeroE, .XDenormE, .XInfE, .XNaNE, .XSNaNE, .FrmE, .FmtE, .CvtFpResE, .CvtFpFlgE);
 	// compare unit
  //    - computation is done in one stage
  //    - writes to FP file during min/max instructions
  //    - other comparisons write a 1 or 0 to the integer register
 	fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
            .FSrcXE, .FSrcYE, .FOpCtrlE, 
            .FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
            // outputs:
 		        .Invalid(CmpNVE), .CmpResE);
 	// sign injection unit
  //    - computation is done in one stage
 	fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
            // outputs:
            .SgnNVE, .SgnResE);
 	// classify
  //    - computation is done in one stage
  //    - most of the work is done in the unpacking unit
  //    - result is written to the integer register
 	fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
                      // outputs:
                      .XSNaNE, .ClassResE);
 	fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
            // outputs: 
            .CvtResE, .CvtFlgE);
 	// data to be stored in memory - to IEU
  //    - FP uses NaN-blocking format
  //        - if there are any unused bits the most significant bits are filled with 1s
 	assign FWriteDataE = FSrcYE[`XLEN-1:0];
 	// Align SrcA to MSB when single precision
 	mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAE[31:0]}, {{64-`XLEN{1'b1}}, SrcAE}, FmtE, AlignedSrcAE);
  // select a result that may be written to the FP register
 	mux5  #(64) FResMux(AlignedSrcAE, SgnResE, CmpResE, CvtResE, CvtFpResE, FResSelE, FResE);
 	mux5  #(5)  FFlgMux(5'b0, {4'b0, SgnNVE}, {4'b0, CmpNVE}, CvtFlgE, CvtFpFlgE, FResSelE, FFlgE);
  // select the result that may be written to the integer register - to IEU
 	mux4  #(`XLEN)  IntResMux(CmpResE[`XLEN-1:0], FSrcXE[`XLEN-1:0], ClassResE[`XLEN-1:0], CvtResE[`XLEN-1:0], FIntResSelE, FIntResE);
  //***will synth remove registers of values that are always zero?
 	////////////////////////////////////////////////////////////////////////////////////////
 	// E/M pipe registers
 	////////////////////////////////////////////////////////////////////////////////////////
 	// flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
 	flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
 	flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
 	flopenrc #(64) EMFpReg4(clk, reset, FlushM, ~StallM, {ZExpE,ZManE}, {ZExpM,ZManM});
 	flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
 				{XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
 				{XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});
 	flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, FResE, FResM); 
 	flopenrc #(5)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, FFlgE, FFlgM); 
 	flopenrc #(`XLEN) EMRegSgnRes(clk, reset, FlushM, ~StallM, FIntResE, FIntResM);
 	// flopenrc #(1) EMRegSgnFlg(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
 	//flopenrc #(64) EMRegCvtFpRes(clk, reset, FlushM, ~StallM, CvtFpResE, CvtFpResM);
 	//flopenrc #(5) EMRegCvtFpFlg(clk, reset, FlushM, ~StallM, CvtFpFlgE, CvtFpFlgM);
 	// flopenrc #(64) EMRegCvtRes(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
 	// flopenrc #(5) EMRegCvtFlg(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
 	// flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
 	flopenrc #(11) EMCtrlReg(clk, reset, FlushM, ~StallM,
 				 {FRegWriteE, FResultSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
 				 {FRegWriteM, FResultSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
 	////////////////////////////////////////////////////////////////////////////////////////
 	//BEGIN MEMORY STAGE
 	////////////////////////////////////////////////////////////////////////////////////////
  // FPU flag selection - to privileged
 	mux4  #(5)  FPUFlgMux(5'b0, FMAFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
 	////////////////////////////////////////////////////////////////////////////////////////
 	// M/W pipe registers
 	////////////////////////////////////////////////////////////////////////////////////////
 	flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
 	flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
 	flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, CvtFpResM, CvtFpResW); 
 	flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
 	flopenrc #(5)  MWCtrlReg(clk, reset, FlushW, ~StallW,
 				{FRegWriteM, FResultSelM, FmtM, FWriteIntM},
 				{FRegWriteW, FResultSelW, FmtW, FWriteIntW});
 	////////////////////////////////////////////////////////////////////////////////////////
 	// BEGIN WRITEBACK STAGE
 	////////////////////////////////////////////////////////////////////////////////////////
  // put ReadData into NaN-blocking format
  //    - if there are any unused bits the most significant bits are filled with 1s
  //    - for load instruction
 	mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
  // select the result to be written to the FP register
 	mux4  #(64)  FPUResultMux(ReadResW, FMAResW, FDivResW, FResW, FResultSelW, FPUResultW);
  end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low
-	assign FStallD = 0;
+     assign FStallD = 0;
-	assign FWriteIntE = 0; 
+     assign FWriteIntE = 0; 
-	assign FWriteIntM = 0;
+     assign FWriteIntM = 0;
-	assign FWriteIntW = 0;
+     assign FWriteIntW = 0;
-	assign FWriteDataE = 0;
+     assign FWriteDataE = 0;
-	assign FIntResM = 0;
+     assign FIntResM = 0;
-	assign FDivBusyE = 0;
+     assign FDivBusyE = 0;
-	assign IllegalFPUInstrD = 1;
+     assign IllegalFPUInstrD = 1;
-	assign SetFflagsM = 0;
+     assign SetFflagsM = 0;
  end
  endgenerate 
--- a/wally-pipelined/src/fpu/fregfile.sv
+++ b/wally-pipelined/src/fpu/fregfile.sv
@ -1,10 +1,9 @@
 ///////////////////////////////////////////
 // regfile.sv
 //
 // Written: David_Harris@hmc.edu 9 January 2021
-// Modified: 
+// Modified: James Stine 
 //
-// Purpose: 4-port register file
+// Purpose: 3-port output register file
 // 
 // A component of the Wally configurable RISC-V project.
 // 
@ -26,22 +25,20 @@
 `include "wally-config.vh"
 module fregfile (
-  input  logic        clk, reset,
+  input logic 	      clk, reset,
-  input  logic        we4, 
+  input logic 	      we4, 
-  input  logic [ 4:0] a1, a2, a3, a4, 
+  input logic [4:0]   a1, a2, a3, a4, 
-  input  logic [63:0] wd4,
+  input logic [63:0]  wd4,
  output logic [63:0] rd1, rd2, rd3);
-
+   
-  logic [63:0] rf[31:0];
+   logic [63:0]       rf[31:0];
-  integer i;
+   integer 	      i;
-
+   
-  // three ported register file
+   // three ported register file
-  // read three ports combinationally (A1/RD1, A2/RD2, A3/RD3)
+   // read three ports combinationally (A1/RD1, A2/RD2, A3/RD3)
-  // write fourth port on rising edge of clock (A4/WD4/WE4)
+   // write fourth port on falling edge of clock (A4/WD4/WE4)
-  // write occurs on falling edge of clock
+   // write occurs on falling edge of clock   
-  
+   
  // reset is intended for simulation only, not synthesis
   always_ff @(negedge clk or posedge reset)
     if (reset) for(i=0; i<32; i++) rf[i] <= 0;
     else if (we4) rf[a4] <= wd4;	
--- a/wally-pipelined/src/fpu/fsm.sv
+++ b/wally-pipelined/src/fpu/fsm.sv
@ -1,49 +1,63 @@
-module fsm (
+///////////////////////////////////////////
 //
 // Written: James Stine
 // Modified: 9/28/2021
 //
 // Purpose: FSM for floating point divider/square root unit (Goldschmidt)
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
-   input logic 			clk,
+module fsm (
-   input logic 			reset,
+   input logic 	      clk,
-   input logic 			start,
+   input logic 	      reset,
-   input logic  		op_type,
+   input logic 	      start,
-   output logic 		done,      // End of cycles
+   input logic 	      op_type,
-   output logic 		load_rega, // enable for regA
+   output logic       done, 
-   output logic 		load_regb, // enable for regB
+   output logic       load_rega, 
-   output logic 		load_regc, // enable for regC
+   output logic       load_regb, 
-   output logic 		load_regd, // enable for regD
+   output logic       load_regc, 
-   output logic 		load_regr, // enable for rem
+   output logic       load_regd,
-   output logic 		load_regs, // enable for q,qm,qp 
+   output logic       load_regr,
-   output logic [2:0] 	sel_muxa,  // Select muxA
+   output logic       load_regs,
-   output logic [2:0] 	sel_muxb,  // Select muxB
+   output logic [2:0] sel_muxa, 
-   output logic 		sel_muxr,  // Select rem mux
+   output logic [2:0] sel_muxb, 
-   output logic			divBusy	   // calculation is happening
+   output logic       sel_muxr, 
   output logic       divBusy	   
   );
-
+   typedef enum       logic [4:0] {S0, S1, S2, S3, S4, S5, S6, S7, S8, S9,
-   reg [4:0] 	CURRENT_STATE;
+				   S10, S11, S12, S13, S14, S15, S16, S17, S18, S19,
-   reg [4:0] 	NEXT_STATE;   
+				   S20, S21, S22, S23, S24, S25, S26, S27, S28, S29,
-
+				   S30} statetype;
-   parameter [4:0] 
+   
-     S0=5'd0, S1=5'd1, S2=5'd2,
+   statetype current_state, next_state;
     S3=5'd3, S4=5'd4, S5=5'd5,
     S6=5'd6, S7=5'd7, S8=5'd8,
     S9=5'd9, S10=5'd10,
     S13=5'd13, S14=5'd14, S15=5'd15,     
     S16=5'd16, S17=5'd17, S18=5'd18,
     S19=5'd19, S20=5'd20, S21=5'd21,
     S22=5'd22, S23=5'd23, S24=5'd24,
     S25=5'd25, S26=5'd26, S27=5'd27,
     S28=5'd28, S29=5'd29, S30=5'd30;
   always @(negedge clk)
     begin
-	if(reset==1'b1)
+	if (reset == 1'b1)
-	  CURRENT_STATE=S0;
+	  current_state = S0;
 	else
-	  CURRENT_STATE=NEXT_STATE;
+	  current_state = next_state;
     end
   always @(*)
     begin
- 	case(CURRENT_STATE)
+ 	case(current_state)
 	  S0:  // iteration 0
 	    begin
 	       if (start==1'b0)
@ -59,7 +73,7 @@ module fsm (
 		    sel_muxa = 3'b000;
 		    sel_muxb = 3'b000;
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S0;
+		    next_state = S0;
 		 end 
 	       else if (start==1'b1 && op_type==1'b0) 
 		 begin
@ -74,7 +88,7 @@ module fsm (
 		    sel_muxa = 3'b001;
 		    sel_muxb = 3'b001;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S1;
+		    next_state = S1;
 		 end // if (start==1'b1 && op_type==1'b0)
 	       else if (start==1'b1 && op_type==1'b1) 
 		 begin
@ -89,7 +103,7 @@ module fsm (
 		    sel_muxa = 3'b010;
 		    sel_muxb = 3'b000;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S13;
+		    next_state = S13;
 		 end 	   
 	       else
 		 begin
@ -104,7 +118,7 @@ module fsm (
 		    sel_muxa = 3'b000;
 		    sel_muxb = 3'b000;		    
 		    sel_muxr = 1'b0;
-		    NEXT_STATE = S0;
+		    next_state = S0;
 		 end
 	    end // case: S0
 	  S1:
@ -120,7 +134,7 @@ module fsm (
 	       sel_muxa = 3'b010;
 	       sel_muxb = 3'b000;		    
 	       sel_muxr = 1'b0;	
-	       NEXT_STATE = S2;
+	       next_state = S2;
 	    end	  
 	  S2: // iteration 1
 	    begin
@ -135,7 +149,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S3;
+	       next_state = S3;
 	    end
 	  S3:
 	    begin
@ -150,7 +164,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S4;
+	       next_state = S4;
 	    end
 	  S4: // iteration 2
 	    begin
@ -165,7 +179,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S5;
+	       next_state = S5;
 	    end
 	  S5:
 	    begin
@ -180,7 +194,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;  // add
-	       NEXT_STATE = S6;
+	       next_state = S6;
 	    end
 	  S6: // iteration 3
 	    begin
@ -195,7 +209,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S8;
+	       next_state = S8;
 	    end
 	  S7:
 	    begin
@ -210,7 +224,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S8;
+	       next_state = S8;
 	    end // case: S7
 	  S8: // q,qm,qp
 	    begin
@ -225,7 +239,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S9;
+	       next_state = S9;
 	    end 
 	  S9:  // rem
 	    begin
@ -240,7 +254,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b1;
-	       NEXT_STATE = S10;
+	       next_state = S10;
 	    end 	  
 	  S10:  // done
 	    begin
@ -255,7 +269,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end 
 	  S13:  // start of sqrt path
 	    begin
@ -270,7 +284,7 @@ module fsm (
 	       sel_muxa = 3'b010;
 	       sel_muxb = 3'b001;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S14;
+	       next_state = S14;
 	    end
 	  S14:  
 	    begin
@ -285,7 +299,7 @@ module fsm (
 	       sel_muxa = 3'b001;
 	       sel_muxb = 3'b100;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S15;
+	       next_state = S15;
 	    end 
 	  S15:  // iteration 1
 	    begin
@ -300,7 +314,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S16;
+	       next_state = S16;
 	    end
 	  S16:  
 	    begin
@ -315,7 +329,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S17;
+	       next_state = S17;
 	    end
 	  S17:  
 	    begin
@ -330,7 +344,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S18;
+	       next_state = S18;
 	    end
 	  S18:  // iteration 2
 	    begin
@ -345,7 +359,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S19;
+	       next_state = S19;
 	    end
 	  S19:  
 	    begin
@ -360,7 +374,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S20;
+	       next_state = S20;
 	    end
 	  S20:  
 	    begin
@ -375,7 +389,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S21;
+	       next_state = S21;
 	    end
 	  S21:  // iteration 3
 	    begin
@ -390,7 +404,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S22;
+	       next_state = S22;
 	    end
 	  S22:  
 	    begin
@ -405,7 +419,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b011;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S23;
+	       next_state = S23;
 	    end
 	  S23:  
 	    begin
@ -420,7 +434,7 @@ module fsm (
 	       sel_muxa = 3'b100;
 	       sel_muxb = 3'b010;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S24;
+	       next_state = S24;
 	    end 
 	  S24: // q,qm,qp
 	    begin
@ -435,7 +449,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S25;
+	       next_state = S25;
 	    end 	  
 	  S25:  // rem
 	    begin
@ -450,7 +464,7 @@ module fsm (
 	       sel_muxa = 3'b011;
 	       sel_muxb = 3'b110;
 	       sel_muxr = 1'b1;
-	       NEXT_STATE = S26;
+	       next_state = S26;
 	    end 
 	  S26:  // done
 	    begin
@ -465,7 +479,7 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end 
 	  default: 
 	    begin
@ -480,9 +494,9 @@ module fsm (
 	       sel_muxa = 3'b000;
 	       sel_muxb = 3'b000;
 	       sel_muxr = 1'b0;
-	       NEXT_STATE = S0;
+	       next_state = S0;
 	    end
-	endcase // case(CURRENT_STATE)	
+	endcase // case(current_state)	
-     end // always @ (CURRENT_STATE or X)   
+     end // always @ (current_state or X)   
 endmodule // fsm
--- a/wally-pipelined/src/fpu/rounder_div.sv
+++ b/wally-pipelined/src/fpu/rounder_div.sv
@ -1,37 +1,55 @@
 ///////////////////////////////////////////
 //
-// The rounder takes as inputs a 64-bit value to be rounded, A, the 
+// Written: James Stine
-// exponent of the value to be rounded, the sign of the final result, Sign, 
+// Modified: 8/1/2018
 // the precision of the results, P, and the two-bit rounding mode, rm. 
 // It produces a rounded 52-bit result, Z, the exponent of the rounded 
 // result, Z_exp, and a flag that indicates if the result was rounded,
 // Inexact. The rounding mode has the following values.
 //	    rm		Mode
 //      00 		round-to-nearest-even
 //	    01 		round-toward-zero
 //      10 		round-toward-plus infinity
 //      11  	round-toward-minus infinity
 //
 // Purpose: Floating point divider/square root rounder unit (Goldschmidt)
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 module rounder_div (
-    input logic [1:0]   rm,
+    input logic [1:0] 	rm,
-    input logic         P,
+    input logic 	P,
-    input logic         OvEn,
+    input logic 	OvEn,
-    input logic         UnEn,
+    input logic 	UnEn,
-    input logic [12:0]  exp_diff,
+    input logic [12:0] 	exp_diff,
-    input logic [2:0]   sel_inv,
+    input logic [2:0] 	sel_inv,
-    input logic         Invalid,
+    input logic 	Invalid,
-    input logic 	    SignR,
+    input logic 	SignR,
-   
+    input logic [63:0] 	Float1,
-    input logic [63:0]  q1,
+    input logic [63:0] 	Float2,
-    input logic [63:0]  qm1,
+    input logic 	XNaNQ,
-    input logic [63:0]  qp1,
+    input logic 	YNaNQ,
-    input logic [63:0]  q0,
+    input logic 	XZeroQ,
-    input logic [63:0]  qm0,
+    input logic 	YZeroQ, 
-    input logic [63:0]  qp0,   
+    input logic 	XInfQ,
    input logic 	YInfQ,
    input logic 	op_type, 
    input logic [63:0] 	q1,
    input logic [63:0] 	qm1,
    input logic [63:0] 	qp1,
    input logic [63:0] 	q0,
    input logic [63:0] 	qm0,
    input logic [63:0] 	qp0, 
    input logic [127:0] regr_out,
    output logic [63:0] Result,
-    output logic [4:0]  Flags
+    output logic [4:0] 	Flags
    );
   logic 	       Rsign;
@ -56,11 +74,15 @@ module rounder_div (
   logic 	       Texp_l7z;
   logic 	       Texp_l7o;
   logic 	       OvCon;
-   logic           zero_rem;
+   logic 	       zero_rem;
-   logic [1:0] 	   mux_mant;
+   logic [1:0] 	       mux_mant;
   logic 	       sign_rem;
-   logic [63:0]    q, qm, qp;
+   logic [63:0]        q, qm, qp;
-   logic 	       exp_ovf;   
+   logic 	       exp_ovf;
   logic [50:0]        NaN_out;
   logic 	       NaN_Sign_out;   
   logic 	       Sign_out;     
   // Remainder = 0?
   assign zero_rem = ~(|regr_out);
@ -117,12 +139,11 @@ module rounder_div (
   // the input was infinite or NaN or the output of the adder is zero.
   // 00 = Valid
   // 10 = NaN
-   assign Valid = (~sel_inv[2]&~sel_inv[1]&~sel_inv[0]);
+   assign Valid = ~sel_inv[2]&~sel_inv[1]&~sel_inv[0];
-   assign NaN = ~sel_inv[1]& sel_inv[0];
+   assign NaN = sel_inv[2]&sel_inv[1]&sel_inv[0]; 
   assign UnderFlow = (P & UnFlow_SP | UnFlow_DP) & Valid;
   assign OverFlow  = (P & OvFlow_SP | OvFlow_DP) & Valid;
-   assign Div0 = sel_inv[2]&sel_inv[1]&~sel_inv[0];
+   assign Div0 = YZeroQ&~XZeroQ&~op_type&~NaN;   
   // The final result is Inexact if any rounding occurred ((i.e., R or S 
   // is one), or (if the result overflows ) or (if the result underflows and the 
@ -161,18 +182,26 @@ module rounder_div (
   // If the result is zero or infinity, the mantissa is all zeros. 
   // If the result is NaN, the mantissa is 10...0
   // If the result the largest floating point number, the mantissa
-   // is all ones. Otherwise, the mantissa is not changed. 
+   // is all ones. Otherwise, the mantissa is not changed.
-   assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero);
+   assign NaN_out = ~XNaNQ&YNaNQ ? Float2[50:0] : Float1[50:0];
-   assign Rmant[50:0] = {51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}});
+   assign NaN_Sign_out = ~XNaNQ&YNaNQ ? Float2[63] : Float1[63];
   assign Sign_out = (XZeroQ&YZeroQ | XInfQ&YInfQ)&~op_type | Rsign&~XNaNQ&~YNaNQ | 
   		     NaN_Sign_out&(XNaNQ|YNaNQ);
   // FIXME (jes) - Imperas gives sNaN a Sign=0 where x86 gives Sign=1
   // | Float1[63]&op_type;
   assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero);
   assign Rmant[50:0] = ({51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}}) |
 			(NaN_out&{51{NaN}}))&({51{~(op_type&Float1[63]&~XZeroQ)}});
   // For single precision, the 8 least significant bits of the exponent
   // and 23 most significant bits of the mantissa contain bits used 
   // for the final result. A double precision result is returned if 
   // overflow has occurred, the overflow trap is enabled, and a conversion
   // is being performed. 
   assign OvCon = OverFlow & OvEn;
-   assign Result = (P&~OvCon) ? { {32{1'b1}}, Rsign, Rexp[7:0], Rmant[51:29]}
+   assign Result = (P&~OvCon) ? { {32{1'b1}}, Sign_out, Rexp[7:0], Rmant[51:29]}
-	           : {Rsign, Rexp, Rmant};
+	           : {Sign_out, Rexp, Rmant};
 endmodule // rounder
--- a/wally-pipelined/src/fpu/sbtm_a0.sv
+++ b/wally-pipelined/src/fpu/sbtm_a0.sv
@ -1,5 +1,30 @@
 ///////////////////////////////////////////
 //
 // Written: James Stine
 // Modified: 8/1/2018
 //
 // Purpose: Bipartite Lookup
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 module sbtm_a0 (input  logic [6:0] a,
-		            output logic [12:0] y);
+		output logic [12:0] y);
   always_comb
     case(a)
       7'b0000000: y = 13'b1111111100010;
@ -137,4 +162,4 @@ endmodule // sbtm_a0
-    
+    
--- a/wally-pipelined/src/fpu/sbtm_a1.sv
+++ b/wally-pipelined/src/fpu/sbtm_a1.sv
@ -1,5 +1,30 @@
 ///////////////////////////////////////////
 //
 // Written: James Stine
 // Modified: 8/1/2018
 //
 // Purpose: Bipartite Lookup
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 module sbtm_a1 (input  logic [6:0] a,
-		            output logic [4:0] y);
+		output logic [4:0] y);
   always_comb
     case(a)
       7'b0000000: y = 5'b11100;
@ -137,4 +162,4 @@ endmodule // sbtm_a0
-    
+    
--- a/wally-pipelined/src/fpu/sbtm_a2.sv
+++ b/wally-pipelined/src/fpu/sbtm_a2.sv
@ -1,5 +1,30 @@
 ///////////////////////////////////////////
 //
 // Written: James Stine
 // Modified: 8/1/2018
 //
 // Purpose: Bipartite Lookup
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 module sbtm_a2 (input  logic [7:0] a,
-		            output logic [13:0] y);
+		output logic [13:0] y);
   always_comb
     case(a)
       8'b01000000: y = 14'b10110100010111;
@ -201,4 +226,4 @@ endmodule // sbtm_a0
-    
+    
--- a/wally-pipelined/src/fpu/sbtm_a3.sv
+++ b/wally-pipelined/src/fpu/sbtm_a3.sv
@ -1,5 +1,30 @@
 ///////////////////////////////////////////
 //
 // Written: James Stine
 // Modified: 8/1/2018
 //
 // Purpose: Bipartite Lookup
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 module sbtm_a3 (input  logic [7:0] a,
-		            output logic [5:0] y);
+		output logic [5:0] y);
   always_comb
     case(a)
       8'b01000000: y = 6'b100110;
--- a/wally-pipelined/src/fpu/sbtm_div.sv
+++ b/wally-pipelined/src/fpu/sbtm_div.sv
@ -1,3 +1,27 @@
 ///////////////////////////////////////////
 //
 // Written: James Stine
 // Modified: 8/1/2018
 //
 // Purpose: Bipartite Lookup for divide portion of fpdivsqrt
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 module sbtm_div (input logic [11:0] a, output logic [10:0] ia_out);
   // bit partitions
--- a/wally-pipelined/src/fpu/sbtm_sqrt.sv
+++ b/wally-pipelined/src/fpu/sbtm_sqrt.sv
@ -1,3 +1,27 @@
 ///////////////////////////////////////////
 //
 // Written: James Stine
 // Modified: 8/1/2018
 //
 // Purpose: Bipartite Lookup for sqrt part of fpdivsqrt
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 module sbtm_sqrt (input logic [11:0] a, output logic [10:0] y);
   // bit partitions