FPU update - missing files

2025-02-11 06:05:49 +00:00 · 2021-07-02 12:53:05 -04:00 · 2021-07-02 12:53:05 -04:00 · 72406b8a88
commit 72406b8a88
parent 3f61e313d2
5 changed files with 904 additions and 0 deletions
--- a/wally-pipelined/src/fpu/fclassify.sv
+++ b/wally-pipelined/src/fpu/fclassify.sv
@ -0,0 +1,62 @@
+
+`include "wally-config.vh"
+
+module fclassify (
+    input  logic [63:0] SrcXE,
+    input  logic        FmtE,           // 0-Single 1-Double
+    output logic [63:0] ClassResE
+    );
+
+    // Classifies the floating-point operand SrcXE and reports the class as a
+    // one-hot mask in ClassResE[9:0] (bit assignments are listed next to the
+    // final assignment below). ClassResE[63:10] is always zero.
+    //
+    // When FmtE is 0 the single-precision value is read from the UPPER half
+    // of SrcXE (SrcXE[63:32]); when FmtE is 1 all 64 bits are a double.
+
+    logic [31:0] Single;
+    logic [63:0] Double;
+    logic Sgn;
+    logic Inf, NaN, Zero, Norm, Denorm;
+    logic PInf, QNaN, PZero, PNorm, PDenorm;
+    logic NInf, SNaN, NZero, NNorm, NDenorm;
+    logic MaxExp, ExpZero, ManZero, FirstBitFrac;
+   
+    // Single and Double precision layouts
+    assign Single = SrcXE[63:32];
+    assign Double = SrcXE;
+    assign Sgn = SrcXE[63];     // sign bit is bit 63 in both layouts
+
+    // Field tests shared by every class below
+    assign ExpZero      = FmtE ? ~|Double[62:52] : ~|Single[30:23]; // exponent all zeros
+    assign MaxExp       = FmtE ?  &Double[62:52] :  &Single[30:23]; // exponent all ones
+    assign ManZero      = FmtE ? ~|Double[51:0]  : ~|Single[22:0];  // fraction all zeros
+    assign FirstBitFrac = FmtE ?   Double[51]    :   Single[22];    // fraction MSB (quiet bit)
+
+    // determine the type of number
+    assign NaN    = MaxExp  & ~ManZero;
+    assign Inf    = MaxExp  &  ManZero;
+    assign Zero   = ExpZero &  ManZero;
+    assign Denorm = ExpZero & ~ManZero;
+    // A normal number has an exponent that is neither all zeros nor all ones.
+    // Excluding MaxExp keeps Inf and NaN from also being reported as normal,
+    // so exactly one class bit is set for every input.
+    assign Norm   = ~ExpZero & ~MaxExp;
+
+    // determine the sub categories
+    assign QNaN = FirstBitFrac&NaN;
+    assign SNaN = ~FirstBitFrac&NaN;
+    assign PInf = ~Sgn&Inf;
+    assign NInf = Sgn&Inf;
+    assign PNorm = ~Sgn&Norm;
+    assign NNorm = Sgn&Norm;
+    assign PDenorm = ~Sgn&Denorm;
+    assign NDenorm = Sgn&Denorm;
+    assign PZero = ~Sgn&Zero;
+    assign NZero = Sgn&Zero;
+
+    // combine the sub categories into the result
+    //  bit 0 - -Inf
+    //  bit 1 - -Norm
+    //  bit 2 - -Denorm
+    //  bit 3 - -Zero
+    //  bit 4 - +Zero
+    //  bit 5 - +Denorm
+    //  bit 6 - +Norm
+    //  bit 7 - +Inf
+    //  bit 8 - signaling NaN
+    //  bit 9 - quiet NaN
+    assign ClassResE = {{54{1'b0}}, QNaN, SNaN, PInf, PNorm,  PDenorm, PZero, NZero, NDenorm, NNorm, NInf};
+
+endmodule
--- a/wally-pipelined/src/fpu/fcmp.sv
+++ b/wally-pipelined/src/fpu/fcmp.sv
@ -0,0 +1,465 @@
+
+//
+// File name : fpcomp.v
+// Title     : Floating-Point Comparator
+// project   : FPU
+// Library   : fpcomp
+// Author(s) : James E. Stine
+// Purpose   : definition of main unit to floating-point comparator
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Floating Point Comparator (Algorithm)
+//
+// 1.) Performs sign-extension if the inputs are 32-bit integers.
+// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
+// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
+//     and correct for sign bits
+//
+// This module takes 64-bits inputs op1 and op2, VSS, and VDD
+// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	(unused)
+//
+// The comparator produces a 2-bit signal FCC, which
+// indicates the result of the comparison:
+//
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+//
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN per 754
+
+`include "wally-config.vh"
+module fcmp (   
+   input logic [63:0] op1, 
+   input logic [63:0] op2,
+   input logic [2:0]  FOpCtrlE,
+   input logic 	      FmtE,
+
+   
+   output logic       Invalid, 		 // Invalid Operation
+   // output logic [1:0] FCC,  		 // Condition Codes 
+   output logic [63:0] CmpResE);
+
+   // Top level of the floating-point comparator. It only wires together
+   // four sub-blocks:
+   //   magcomp1 / magcomp2 : two halves of a 64-bit comparator tree
+   //   exc1                : NaN and zero detection on the raw operands
+   //   exc2                : combines everything into Invalid and CmpResE
+   //
+   // NOTE(review): magcomp1 instantiates magcompare2b, whose only visible
+   // definitions in this file are commented out - presumably it is provided
+   // by another source file; confirm.
+
+   logic [1:0]  CondCodes;        // fcc from exc2; not driven out of this module
+   logic [7:0]  TreeW, TreeX;     // intermediate comparator-tree outputs
+   logic        ANaN, BNaN;
+   logic        Azero, Bzero;
+   logic        MagLT;            // "less than" flag from the comparator tree
+   logic        MagEQ;            // "equal" flag from the comparator tree
+   logic [63:0] CmpOp1, CmpOp2;   // operands as presented to the tree
+
+   // The comparator tree sees each operand with its sign bit inverted.
+   assign CmpOp1 = {~op1[63], op1[62:0]};
+   assign CmpOp2 = {~op2[63], op2[62:0]};
+
+   // First part of the comparator tree.
+   magcompare64b_1 magcomp1 (
+      .w(TreeW),
+      .x(TreeX),
+      .A(CmpOp1),
+      .B(CmpOp2));
+
+   // NaN / zero detection on the unmodified operands.
+   exception_cmp_1 exc1 (
+      .ANaN(ANaN),
+      .BNaN(BNaN),
+      .Azero(Azero),
+      .Bzero(Bzero),
+      .A(op1),
+      .B(op2),
+      .FOpCtrlE(FOpCtrlE));
+
+   // Remainder of the comparator tree; reduces TreeW/TreeX to LT and EQ.
+   magcompare64b_2 magcomp2 (
+      .LT(MagLT),
+      .EQ(MagEQ),
+      .w(TreeW),
+      .x(TreeX));
+
+   // Final result selection from the tree flags, sign bits and special
+   // cases.
+   exception_cmp_2 exc2 (
+      .A(op1),
+      .B(op2),
+      .FmtE(FmtE),
+      .LT_mag(MagLT),
+      .EQ_mag(MagEQ),
+      .FOpCtrlE(FOpCtrlE),
+      .invalid(Invalid),
+      .fcc(CondCodes),
+      .CmpResE(CmpResE),
+      .Azero(Azero),
+      .Bzero(Bzero),
+      .ANaN(ANaN),
+      .BNaN(BNaN));
+
+endmodule // fcmp
+
+// module magcompare2b (LT, GT, A, B);
+
+//    input logic [1:0] A;
+//    input logic [1:0] B;
+   
+//    output logic     LT;
+//    output logic     GT;
+
+//    // Determine if A < B  using a minimized sum-of-products expression
+//    assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+//    // Determine if A > B  using a minimized sum-of-products expression
+//    assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
+
+// endmodule // magcompare2b
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
+// this version actually incorporates don't cares into the equation to
+// simplify the optimization
+
+module magcompare2c (
+   output logic      LT,
+   output logic      GT,
+   input logic [1:0] A,
+   input logic [1:0] B);
+
+   // Reduced 2-bit comparator cell used inside the comparator trees.
+   //   LT is 1 when B[1] is set, or when A[1] is clear and B[0] is set.
+   //   GT is 1 when A[1] is set, or when B[1] is clear and A[0] is set.
+   // Both outputs can be 1 at once (e.g. A = B = 2'b11).
+
+   logic LowWinsLT;   // low-bit term, only counts when A[1] is clear
+   logic LowWinsGT;   // low-bit term, only counts when B[1] is clear
+
+   assign LowWinsLT = ~A[1] & B[0];
+   assign LowWinsGT = ~B[1] & A[0];
+
+   assign LT = B[1] | LowWinsLT;
+   assign GT = A[1] | LowWinsGT;
+
+endmodule // magcompare2c
+
+// This module compares two 64-bit values A and B. LT is '1' if A < B 
+// and EQ is '1' if A = B. LT and EQ are both '0' if A > B.
+// This structure was modified so
+// that it only does a strict magnitude comparison, and only
+// returns flags for less than (LT) and equal to (EQ). It uses a tree 
+// of 63 2-bit magnitude comparators, followed by one OR gate.
+//
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+module magcompare64b_1 (
+   output logic [7:0] w,
+   output logic [7:0] x,
+   input logic [63:0] A,
+   input logic [63:0] B);
+
+   // First three levels of the 64-bit comparator tree. Each level halves
+   // the number of flag pairs:
+   //   level 1: 32 magcompare2b cells on adjacent 2-bit slices of A and B
+   //   level 2: 16 magcompare2c cells on pairs of level-1 outputs
+   //   level 3:  8 magcompare2c cells on pairs of level-2 outputs
+   // The two 8-bit results w and x are finished by magcompare64b_2.
+   //
+   // Levels 2 and 3 feed the previous level's second output vector to the
+   // cell's third port and the first output vector to its fourth port.
+   //
+   // NOTE(review): magcompare2b is not defined in the visible part of this
+   // file (only commented-out copies) - presumably supplied elsewhere.
+
+   logic [31:0] L1OutA, L1OutB;   // level-1 first / second cell outputs
+   logic [15:0] L2OutA, L2OutB;   // level-2 first / second cell outputs
+
+   genvar i;
+   generate
+      for (i = 0; i < 32; i++) begin : level1
+         magcompare2b cmp (L1OutA[i], L1OutB[i], A[2*i +: 2], B[2*i +: 2]);
+      end
+
+      for (i = 0; i < 16; i++) begin : level2
+         magcompare2c cmp (L2OutA[i], L2OutB[i], L1OutB[2*i +: 2], L1OutA[2*i +: 2]);
+      end
+
+      for (i = 0; i < 8; i++) begin : level3
+         magcompare2c cmp (w[i], x[i], L2OutB[2*i +: 2], L2OutA[2*i +: 2]);
+      end
+   endgenerate
+
+endmodule // magcompare64b_1
+
+// This module takes 64-bits inputs A and B, two magnitude comparison
+// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	bfloat precision numbers
+//
+// The comparator produces a 2-bit signal fcc, which
+// indicates the result of the comparison as follows:
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+// It also produces a invalid operation flag, which is one
+// if either of the input operands is a signaling NaN.
+
+module exception_cmp_1 (
+   output logic        ANaN,
+   output logic        BNaN,
+   output logic        Azero,
+   output logic        Bzero,
+   input  logic [63:0] A,
+   input  logic [63:0] B,
+   input  logic [2:0]  FOpCtrlE);
+
+   // Detects NaN and zero operands for the comparator.
+   //
+   // Operand layouts assumed by the bit tests below (all left-aligned in the
+   // 64-bit word, sign in bit 63):
+   //   dp: exponent [62:52], fraction [51:0]
+   //   sp: exponent [62:55], fraction [54:32]
+   //   hp: exponent [62:58], fraction [57:48]
+   //
+   // NOTE(review): the precision is decoded from FOpCtrlE[1:0], but
+   // exception_cmp_2 decodes FOpCtrlE[2:0] as the compare operation
+   // (eq/lt/le/min/max). These two uses look inconsistent - confirm which
+   // signal is meant to carry the precision.
+
+   logic dp, sp, hp;
+
+   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
+   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
+   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
+
+   // An operand is NaN when its exponent is all ones and its fraction is
+   // non-zero. The whole fraction is tested, so NaNs whose payload lives
+   // only in the low fraction bits (e.g. 64'h7FF0000000000001) are detected
+   // too. A[62:58] is the part of the exponent common to all three formats.
+   assign ANaN = (&A[62:58]) &
+		 ((sp & (&A[57:55]) & (|A[54:32])) |
+		  (dp & (&A[57:52]) & (|A[51:0]))  |
+		  (hp & (|A[57:48])));
+
+   assign BNaN = (&B[62:58]) &
+		 ((sp & (&B[57:55]) & (|B[54:32])) |
+		  (dp & (&B[57:52]) & (|B[51:0]))  |
+		  (hp & (|B[57:48])));
+
+   // An operand is +0 or -0 when its 63 least significant bits are zero.
+   // Depending on how this synthesizes, it may work better to replace
+   // this with assign Azero = ~(A[62] | A[61] | ... | A[0])
+   assign Azero = (A[62:0] == 63'h0);
+   assign Bzero = (B[62:0] == 63'h0);
+
+endmodule // exception_cmp_1
+//
+// File name : fpcomp.v
+// Title     : Floating-Point Comparator
+// project   : FPU
+// Library   : fpcomp
+// Author(s) : James E. Stine
+// Purpose   : definition of main unit to floating-point comparator
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Floating Point Comparator (Algorithm)
+//
+// 1.) Performs sign-extension if the inputs are 32-bit integers.
+// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
+// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
+//     and correct for sign bits
+//
+// This module takes 64-bits inputs op1 and op2, VSS, and VDD
+// signals, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	(unused)
+//
+// The comparator produces a 2-bit signal FCC, which
+// indicates the result of the comparison:
+//
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+//
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN per 754
+
+
+/*module magcompare2b (LT, GT, A, B);
+
+   input logic [1:0] A;
+   input logic [1:0] B;
+   
+   output logic     LT;
+   output logic     GT;
+
+   // Determine if A < B  using a minimized sum-of-products expression
+   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+   // Determine if A > B  using a minimized sum-of-products expression
+   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
+
+endmodule*/ // magcompare2b
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
+// this version actually incorporates don't cares into the equation to
+// simplify the optimization
+
+// module magcompare2c (LT, GT, A, B);
+
+//    input logic [1:0] A;
+//    input logic [1:0] B;
+   
+//    output logic      LT;
+//    output logic      GT;
+
+//    assign LT = B[1] | (!A[1]&B[0]);
+//    assign GT = A[1] | (!B[1]&A[0]);
+
+// endmodule // magcompare2b
+
+// This module compares two 64-bit values A and B. LT is '1' if A < B 
+// and EQ is '1' if A = B. LT and EQ are both '0' if A > B.
+// This structure was modified so
+// that it only does a strict magnitude comparison, and only
+// returns flags for less than (LT) and equal to (EQ). It uses a tree 
+// of 63 2-bit magnitude comparators, followed by one OR gate.
+//
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+module magcompare64b_2 (
+   output logic      LT,
+   output logic      EQ,
+   input logic [7:0] w,
+   input logic [7:0] x);
+
+   // Last three levels of the comparator tree. Takes the two 8-bit flag
+   // vectors from magcompare64b_1 and reduces them 8 -> 4 -> 2 -> 1 with
+   // magcompare2c cells. EQ is asserted when the final cell reports
+   // neither LT nor GT.
+
+   logic [3:0] L4OutA, L4OutB;   // first / second cell outputs, 4-wide level
+   logic [1:0] L5OutA, L5OutB;   // first / second cell outputs, 2-wide level
+   logic       GT;
+
+   // 8 -> 4
+   magcompare2c mag39 (.LT(L4OutA[0]), .GT(L4OutB[0]), .A(x[1:0]), .B(w[1:0]));
+   magcompare2c mag3A (.LT(L4OutA[1]), .GT(L4OutB[1]), .A(x[3:2]), .B(w[3:2]));
+   magcompare2c mag3B (.LT(L4OutA[2]), .GT(L4OutB[2]), .A(x[5:4]), .B(w[5:4]));
+   magcompare2c mag3C (.LT(L4OutA[3]), .GT(L4OutB[3]), .A(x[7:6]), .B(w[7:6]));
+
+   // 4 -> 2
+   magcompare2c mag3D (.LT(L5OutA[0]), .GT(L5OutB[0]), .A(L4OutB[1:0]), .B(L4OutA[1:0]));
+   magcompare2c mag3E (.LT(L5OutA[1]), .GT(L5OutB[1]), .A(L4OutB[3:2]), .B(L4OutA[3:2]));
+
+   // 2 -> 1
+   magcompare2c mag3F (.LT(LT), .GT(GT), .A(L5OutB[1:0]), .B(L5OutA[1:0]));
+
+   assign EQ = ~LT & ~GT;
+
+endmodule // magcompare64b_2
+
+// This module takes 64-bits inputs A and B, two magnitude comparison
+// flags LT_mag and EQ_mag, and a 2-bit signal FOpCtrlE that indicates the type of 
+// operands being compared as indicated below.
+//	FOpCtrlE	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	bfloat precision numbers
+//
+// The comparator produces a 2-bit signal fcc, which
+// indicates the result of the comparison as follows:
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+// It also produces a invalid operation flag, which is one
+// if either of the input operands is a signaling NaN.
+
+module exception_cmp_2 (
+   input logic [63:0] A,
+   input logic [63:0] B,
+   input logic 	      FmtE,
+   input logic 	      LT_mag,
+   input logic 	      EQ_mag,
+   input logic [2:0]  FOpCtrlE,
+   
+   output logic       invalid,
+   output logic [1:0] fcc,
+   output logic [63:0] CmpResE,
+
+   input logic 	      Azero,
+   input logic 	      Bzero,   
+   input logic 	      ANaN,
+   input logic 	      BNaN);
+
+   // Combines the comparator-tree flags (LT_mag, EQ_mag), the operand sign
+   // bits and the NaN/zero flags into:
+   //   invalid : 1 when either operand is a signaling NaN
+   //   fcc     : 00 A = B, 01 A < B, 10 A > B, 11 unordered
+   //   CmpResE : result selected by FOpCtrlE[2:0] (see the case statement)
+   //
+   // NOTE(review): FmtE is not read in this module; the precision is decoded
+   // from FOpCtrlE[1:0], the same bits the case statement below uses to pick
+   // the operation. These two uses look inconsistent - confirm.
+   
+   logic 	      dp;   
+   logic 	      sp;
+   logic 	      hp;   
+   logic 	      ASNaN;
+   logic 	      BSNaN;
+   logic 	      UO;
+   logic 	      GT;
+   logic 	      LT;
+   logic 	      EQ;
+
+   assign dp = !FOpCtrlE[1]&!FOpCtrlE[0];
+   assign sp = !FOpCtrlE[1]&FOpCtrlE[0];
+   assign hp = FOpCtrlE[1]&!FOpCtrlE[0];
+
+   // Values are unordered if A is NaN or B is NaN.
+   assign UO = (ANaN | BNaN);
+
+   // A NaN is signaling when the most significant fraction bit (the quiet
+   // bit) is 0. With the left-aligned layouts used by the comparator that
+   // bit is [54] for single, [51] for double and [57] for half precision;
+   // this matches the quiet-bit position used by fclassify.
+   assign ASNaN = ANaN & (sp&~A[54] | dp&~A[51] | hp&~A[57]);
+   assign BSNaN = BNaN & (sp&~B[54] | dp&~B[51] | hp&~B[57]);
+
+   // If either A or B is a signaling NaN the "Invalid Operation"
+   // exception flag is set to one; otherwise it is zero.    
+   assign invalid = (ASNaN | BSNaN);
+
+   // A and B are equal when the tree reports equality or both operands are
+   // zero (+0 equals -0), and the operands are not unordered.
+   assign EQ = (EQ_mag | (Azero&Bzero)) & (~UO);
+   
+   // A is less than B when
+   //   both operands are negative and LT_mag is 0, or
+   //   LT_mag is 1 and the operands are not both negative,
+   // and the operands are neither equal nor unordered.
+   assign LT = ((~LT_mag & A[63] & B[63]) |
+		(LT_mag & ~(A[63] & B[63])))&~EQ&~UO;
+   
+   // A is greater than B when LT, EQ, and UO are all false.
+   assign GT = ~(LT | EQ | UO);
+
+   // Set the bits of fcc based on LT, GT, EQ, and UO
+   assign fcc[0] = LT | UO;
+   assign fcc[1] = GT | UO;  
+
+   // Result selection.
+   // NOTE(review): for min/max, an unordered compare makes LT and GT both
+   // 0, so B is returned even when B is the NaN operand - confirm this is
+   // the intended NaN handling.
+   always_comb begin
+      case (FOpCtrlE[2:0])
+         3'b111: CmpResE = LT ? A : B;//min 
+         3'b101: CmpResE = GT ? A : B;//max
+         3'b010: CmpResE = {63'b0, EQ};//equal
+         3'b001: CmpResE = {63'b0, LT};//less than
+         3'b011: CmpResE = {63'b0, LT|EQ};//less than or equal
+         default: CmpResE = 64'b0;
+      endcase
+   end 
+
+endmodule // exception_cmp_2
--- a/wally-pipelined/src/fpu/fdivsqrt.sv
+++ b/wally-pipelined/src/fpu/fdivsqrt.sv
@ -0,0 +1,256 @@
+//
+// File name : fpdiv
+// Title     : Floating-Point Divider/Square-Root
+// project   : FPU
+// Library   : fpdiv
+// Author(s) : James E. Stine, Jr.
+// Purpose   : definition of main unit to floating-point div/sqrt
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Basic Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Exponent Logic
+// Step 4: Divide/Sqrt using Goldschmidt
+// Step 5: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 6: Round the result.
+// Step 7: Put quotient/remainder onto output.
+//
+
+// `timescale 1ps/1ps
+// Top level of the floating-point divide / square-root unit. It converts the
+// operands, computes the result sign and exponent combinationally, runs the
+// iterative mantissa datapath (divconv) under control of an FSM, rounds the
+// result, and registers the result and flags when FDivSqrtDoneE is asserted.
+// All sub-modules except exp_add are defined outside this file.
+module fdivsqrt (FDivSqrtDoneE, FDivResultM, FDivSqrtFlgM, DivInput1E, DivInput2E, FrmE, DivOpType, FmtE, DivOvEn, DivUnEn,
+	      FDivStartE, reset, clk, FDivBusyE, HoldInputs);
+
+   input [63:0] DivInput1E;		// 1st input operand (A)
+   input [63:0] DivInput2E;		// 2nd input operand (B)
+   input [2:0] 	FrmE;		// Rounding mode - specify values 
+   input 	DivOpType;	// Function opcode (1 selects the square-root sign/exponent below, 0 the divide ones)
+   input 	FmtE;   		// Result Precision (0 for double, 1 for single) //***will need to swap this
+   input 	DivOvEn;		// Overflow trap enabled
+   input 	DivUnEn;   	// Underflow trap enabled
+
+   input 	FDivStartE;	// passed to the control FSM
+   input 	reset;
+   input 	clk;   
+
+   output [63:0] FDivResultM;	// Result of operation
+   output [4:0]  FDivSqrtFlgM;   	// IEEE exception flags 
+   output 	 FDivSqrtDoneE;	// driven by the FSM; also the enable of the output registers
+   output    FDivBusyE, HoldInputs;	// driven by the FSM
+
+   // Constant 1 / constant 0 nets
+   supply1 	  vdd;
+   supply0 	  vss;   
+
+   // NOTE(review): many of the nets declared below (e.g. IntValue, exponent,
+   // exp_pre, sum*, align_shift, norm_shift, opA_Norm, opB_Norm, exp_gt63,
+   // Sticky_out, sign_corr, corr_sign, zeroB, convert, swap, sub and the
+   // *v control nets) are not referenced anywhere else in this module.
+   wire [63:0] 	 Float1; 
+   wire [63:0] 	 Float2;
+   wire [63:0] 	 IntValue;
+   
+   wire 	 DivDenormM;   	// registered DenormIO; not driven to any output of this module
+   wire [12:0] 	 exp1, exp2, expF;
+   wire [12:0] 	 exp_diff, bias;
+   wire [13:0] 	 exp_sqrt;
+   wire [12:0] 	 exp_s;
+   wire [12:0] 	 exp_c;
+   
+   wire [10:0] 	 exponent, exp_pre;
+   wire [63:0] 	 Result;   
+   wire [52:0] 	 mantissaA;
+   wire [52:0] 	 mantissaB; 
+   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
+   
+   wire [5:0] 	 align_shift;
+   wire [5:0] 	 norm_shift;
+   wire [2:0] 	 sel_inv;
+   wire		 op1_Norm, op2_Norm;
+   wire		 opA_Norm, opB_Norm;
+   wire		 Invalid;
+   wire 	 DenormIn, DenormIO;
+   wire [4:0] 	 FlagsIn;   	
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire 	 signResult, sign_corr;
+   wire          corr_sign;
+   wire 	 zeroB;         
+   wire 	 convert;
+   wire          swap;
+   wire          sub;
+   
+   wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
+   wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
+   wire [127:0]  regr_out;
+   wire [2:0] 	 sel_muxa, sel_muxb;
+   wire 	 sel_muxr;   
+   wire 	 load_rega, load_regb, load_regc, load_regd, load_regr, load_regs;
+
+   wire 	 donev, sel_muxrv, sel_muxsv;
+   wire [1:0] 	 sel_muxav, sel_muxbv;   
+   wire 	 load_regav, load_regbv, load_regcv;
+   wire 	 load_regrv, load_regsv;
+   
+   // exp_cout1/exp_cout2 and open only absorb adder outputs that are not used
+   logic exp_cout1, exp_cout2, exp_odd, open;
+
+
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the DivOpType, and their precision FmtE. 
+   // Per the original author's description, single precision inputs are
+   // converted to double precision here.
+   // NOTE(review): convert_inputs_div is defined elsewhere; its exact
+   // behavior cannot be confirmed from this file.
+   convert_inputs_div divconv1 (Float1, Float2, DivInput1E, DivInput2E, DivOpType, FmtE);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" input flags. The "sel_inv" output is passed on to the
+   // rounder. Per the original author's description, op1_Norm and op2_Norm
+   // are one if DivInput1E and DivInput2E are not zero or denormalized.
+   exception_div divexc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
+		   Float1, Float2, DivOpType);
+
+   // Determine Sign/Mantissa
+   // Sign: XOR of the operand signs when DivOpType is 0, the sign of the
+   // first operand when DivOpType is 1.
+   assign signResult = ((Float1[63]^Float2[63])&~DivOpType) | Float1[63]&DivOpType;
+   // Mantissas: the 52 fraction bits with a leading 1 prepended.
+   assign mantissaA = {vdd, Float1[51:0]};
+   assign mantissaB = {vdd, Float2[51:0]};
+   // Perform Exponent Subtraction - expA - expB + Bias   
+   assign exp1 = {2'b0, Float1[62:52]};
+   assign exp2 = {2'b0, Float2[62:52]};
+   // bias : DP = 2^{11-1}-1 = 1023
+   assign bias = {3'h0, 10'h3FF};
+   // Divide exponent: exp1 + ~exp2 + bias is reduced to two vectors by csa1,
+   // then added with carry-in 1 (which completes the two's-complement
+   // negation of exp2). The top sum bit goes to "open" and is unused.
+   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c); //***adder
+   exp_add explogic1 (exp_cout1, {open, exp_diff}, //***adder?
+		      {vss, exp_s}, {vss, exp_c}, 1'b1);
+   // Sqrt exponent (check if exponent is odd)
+   // exp_odd is 1 when bit 52 of Float1 (the exponent LSB) is 0.
+   assign exp_odd = Float1[52] ? vss : vdd;
+   // exp_sqrt = exp1 + 1023 + exp_odd
+   exp_add explogic2 (exp_cout2, exp_sqrt, //***adder?
+		      {vss, exp1}, {4'h0, 10'h3ff}, exp_odd);
+   // Choose correct exponent: the sqrt sum with its LSB dropped (i.e. halved)
+   // when DivOpType is 1, otherwise the divide exponent.
+   assign expF = DivOpType ? exp_sqrt[13:1] : exp_diff;   
+
+   // Main Goldschmidt/Division Routine
+   // Note that mantissaB is passed before mantissaA.
+   divconv goldy (q1, qm1, qp1, q0, qm0, qp0, 
+		  rega_out, regb_out, regc_out, regd_out,
+		  regr_out, mantissaB, mantissaA, 
+		  sel_muxa, sel_muxb, sel_muxr, 
+		  reset, clk,
+		  load_rega, load_regb, load_regc, load_regd,
+		  load_regr, load_regs, FmtE, DivOpType, exp_odd);
+
+   // FSM : control divider
+   fsm control (FDivSqrtDoneE, load_rega, load_regb, load_regc, load_regd, 
+		load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
+		clk, reset, FDivStartE, DivOpType, FDivBusyE, HoldInputs);
+   
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. The rounding unit also handles special cases and 
+   // sets the exception flags.
+   //***add max magnitude and swap negative and positive infinity
+   rounder_div divround1 (Result, DenormIO, FlagsIn, 
+		   FrmE, FmtE, DivOvEn, DivUnEn, expF, 
+   		   sel_inv, Invalid, DenormIn, signResult, 
+		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
+
+   // Store the final result and the exception flags in registers that are
+   // enabled by FDivSqrtDoneE.
+   flopenr #(64) rega (clk, reset, FDivSqrtDoneE, Result, FDivResultM);
+   flopenr #(1) regb (clk, reset, FDivSqrtDoneE, DenormIO, DivDenormM);   
+   flopenr #(5) regc (clk, reset, FDivSqrtDoneE, FlagsIn, FDivSqrtFlgM);   
+   
+endmodule // fdivsqrt
+
+//
+// Brent-Kung Prefix Adder 
+//   (yes, it is 14 bits as my generator is broken for 13 bits :( 
+//    assume, synthesizer will delete stuff not needed )
+//
+module exp_add (
+   output        cout,
+   output [13:0] sum,
+   input  [13:0] a,
+   input  [13:0] b,
+   input         cin);
+
+   // 14-bit adder with carry-in, built around the brent_kung prefix tree:
+   //   sum  = a + b + cin (low 14 bits)
+   //   cout = carry out of bit 13
+   // The carry-in is folded into the prefix tree as generate bit 0, so the
+   // per-bit propagate/generate vectors are shifted up by one position.
+
+   wire [14:0] pbits;     // {a ^ b, 0}   : propagate, bit 0 is the cin slot
+   wire [14:0] gbits;     // {a & b, cin} : generate, bit 0 is cin itself
+   wire [13:0] carries;   // carry into each of the 14 sum positions
+
+   // pre-computation
+   assign pbits = {a ^ b, 1'b0};
+   assign gbits = {a & b, cin};
+
+   // prefix tree over the low 14 propagate/generate bits
+   brent_kung prefix_tree (carries, pbits[13:0], gbits[13:0]);
+
+   // post-computation
+   assign sum  = pbits[14:1] ^ carries;
+   assign cout = gbits[14] | (pbits[14] & carries[13]);
+
+endmodule // exp_add
+
+// 14-bit Brent-Kung parallel-prefix carry network.
+//   p, g : per-bit propagate and generate inputs (p[0] is not read)
+//   c    : c[k+1] is the group generate over bits k..0, for k = 0..13
+// Signal naming: G_i_j / P_i_j are the group generate / propagate spanning
+// bits i down to j.
+// NOTE(review): the grey and black cells are defined elsewhere; presumably a
+// grey cell produces only a group generate and a black cell produces both
+// group generate and group propagate - confirm against their definitions.
+module brent_kung (c, p, g);
+   
+   input [13:0] p;
+   input [13:0] g;
+   output [14:1] c;
+
+   logic G_1_0, G_3_2,G_5_4,G_7_6,G_9_8,G_11_10,G_13_12,G_3_0,G_7_4,G_11_8;
+   logic P_3_2,P_5_4,P_7_6,P_9_8,P_11_10,P_13_12,P_7_4,P_11_8;
+   logic G_7_0,G_11_0,G_5_0,G_9_0,G_13_0,G_2_0,G_4_0,G_6_0,G_8_0,G_10_0,G_12_0;
+   // parallel-prefix, Brent-Kung
+
+   // Stage 1: Generates G/P pairs that span 2 bits
+   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
+   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
+   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
+   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
+   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
+   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
+   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
+
+   // Stage 2: Generates G/P pairs that span 4 bits
+   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
+   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
+   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
+
+   // Stage 3: Generates the group generate that spans 8 bits (7..0)
+   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
+
+   // Stage 4: no cells at this level for a 14-bit tree
+
+   // Stage 5: Extends G_7_0 by the 4-bit group 11..8 to get G_11_0
+   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
+
+   // Stage 6: Extends existing prefixes by a 2-bit group
+   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
+   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
+   grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
+
+   // Last grey cell stage: extends each odd-index prefix by a single bit
+   // to produce the even-index prefixes
+   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
+   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
+   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
+   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
+   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
+   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
+
+   // Final Stage: Apply c_k+1=G_k_0
+   assign c[1]=g[0];
+   assign c[2]=G_1_0;
+   assign c[3]=G_2_0;
+   assign c[4]=G_3_0;
+   assign c[5]=G_4_0;
+   assign c[6]=G_5_0;
+   assign c[7]=G_6_0;
+   assign c[8]=G_7_0;
+   assign c[9]=G_8_0;
+
+   assign c[10]=G_9_0;
+   assign c[11]=G_10_0;
+   assign c[12]=G_11_0;
+   assign c[13]=G_12_0;
+   assign c[14]=G_13_0;
+
+endmodule // brent_kung
+
--- a/wally-pipelined/src/fpu/fhazard.sv
+++ b/wally-pipelined/src/fpu/fhazard.sv
@ -0,0 +1,67 @@
+///////////////////////////////////////////
+// fhazard.sv
+//
+// Written: me@KatherineParry.com 19 May 2021
+// Modified: 
+//
+// Purpose: Determine forwarding, stalls and flushes for the FPU
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fhazard(
+    input logic [4:0] Adr1E, Adr2E, Adr3E,
+    input logic FWriteEnM, FWriteEnW, 
+	  input logic [4:0] RdM, RdW,
+    input logic [2:0] FResultSelM,
+    output logic FStallD,
+    output logic [1:0] ForwardXE, ForwardYE, ForwardZE
+);
+
+  // Forwarding and stall decisions for the three FPU source operands.
+  // For each source address (Adr1E/Adr2E/Adr3E -> ForwardXE/YE/ZE):
+  //   - it matches RdM while FWriteEnM is set:
+  //       FResultSelM == 3'b100 -> forward code 2'b10 (FResM)
+  //       otherwise             -> request a stall (FStallD = 1)
+  //   - else it matches RdW while FWriteEnW is set -> forward code 2'b01
+  //     (FPUResult64W)
+  //   - else -> 2'b00 (register file read data)
+  // A memory-stage match takes priority over a writeback-stage match.
+
+  logic ResReadyM;                 // memory-stage result can be forwarded
+  logic HitXM, HitYM, HitZM;       // source matches the memory-stage destination
+  logic HitXW, HitYW, HitZW;       // source matches the writeback-stage destination
+
+  assign ResReadyM = (FResultSelM == 3'b100);
+
+  assign HitXM = (Adr1E == RdM) & FWriteEnM;
+  assign HitYM = (Adr2E == RdM) & FWriteEnM;
+  assign HitZM = (Adr3E == RdM) & FWriteEnM;
+
+  assign HitXW = (Adr1E == RdW) & FWriteEnW;
+  assign HitYW = (Adr2E == RdW) & FWriteEnW;
+  assign HitZW = (Adr3E == RdW) & FWriteEnW;
+
+  always_comb begin
+    // defaults: read data from the register file, no stall
+    ForwardXE = 2'b00; // choose FRD1E
+    ForwardYE = 2'b00; // choose FRD2E
+    ForwardZE = 2'b00; // choose FRD3E
+    FStallD = 0;
+
+    // operand X
+    if (HitXM) begin
+      if (ResReadyM) begin
+        ForwardXE = 2'b10; // choose FResM
+      end else begin
+        FStallD = 1;       // result not ready yet: stall
+      end
+    end else if (HitXW) begin
+      ForwardXE = 2'b01;   // choose FPUResult64W
+    end
+
+    // operand Y
+    if (HitYM) begin
+      if (ResReadyM) begin
+        ForwardYE = 2'b10; // choose FResM
+      end else begin
+        FStallD = 1;       // result not ready yet: stall
+      end
+    end else if (HitYW) begin
+      ForwardYE = 2'b01;   // choose FPUResult64W
+    end
+
+    // operand Z
+    if (HitZM) begin
+      if (ResReadyM) begin
+        ForwardZE = 2'b10; // choose FResM
+      end else begin
+        FStallD = 1;       // result not ready yet: stall
+      end
+    end else if (HitZW) begin
+      ForwardZE = 2'b01;   // choose FPUResult64W
+    end
+  end 
+
+endmodule
--- a/wally-pipelined/src/fpu/fregfile.sv
+++ b/wally-pipelined/src/fpu/fregfile.sv
@ -0,0 +1,54 @@
+///////////////////////////////////////////
+// fregfile.sv
+//
+// Written: David_Harris@hmc.edu 9 January 2021
+// Modified: 
+//
+// Purpose: 4-port register file (three read ports, one write port)
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module fregfile (
+  input  logic             clk, reset,
+  input  logic             we4, 
+  input  logic [ 4:0]      a1, a2, a3, a4, 
+  input  logic [63:0] wd4,    // KEP: width fixed at 64 instead of `XLEN (lint warning) *** figure out if double can be supported when XLEN = 32
+  output logic [63:0] rd1, rd2, rd3);
+
+  // 32-entry x 64-bit register file.
+  //   Read : three combinational ports (a1 -> rd1, a2 -> rd2, a3 -> rd3),
+  //          each modeled with a #2 delay.
+  //   Write: one port (a4 / wd4 / we4), captured on the FALLING edge of clk.
+  //   Reset: asynchronous, clears every entry; intended for simulation
+  //          only, not synthesis.
+
+  logic [63:0] rf[31:0];
+
+  always_ff @(negedge clk or posedge reset) begin
+    if (reset) begin
+      for (int idx = 0; idx < 32; idx++) rf[idx] <= 0;
+    end else if (we4) begin
+      rf[a4] <= wd4;
+    end
+  end
+
+  assign #2 rd1 = rf[a1];
+  assign #2 rd2 = rf[a2];
+  assign #2 rd3 = rf[a3];
+
+endmodule // fregfile
+