Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally into main

2021-07-14 00:21:39 -04:00 · 2021-07-14 00:21:39 -04:00 · 46e704b7ef
commit 46e704b7ef
parent 92899b33f8 46001fef27
65 changed files with 14647 additions and 2492 deletions
--- a/wally-pipelined/src/fpu/convert_inputs_div.sv
+++ b/wally-pipelined/src/fpu/convert_inputs_div.sv
@ -3,22 +3,21 @@
 // it conditionally converts single precision values to double 
 // precision values and modifies the sign of op1. 
 // The converted operands are Float1 and Float2.
-
 module convert_inputs_div (Float1, Float2b, op1, op2, op_type, P);
   
-   input [63:0]  op1;           // 1st input operand (A)
-   input [63:0]  op2;           // 2nd input operand (B)
-   input 	 P;             // Result Precision (0 for double, 1 for single)
-   input 	 op_type;       // Operation   
+   input logic [63:0]  op1;           // 1st input operand (A)
+   input logic [63:0]  op2;           // 2nd input operand (B)
+   input logic 	       P;             // Result Precision (0 for double, 1 for single)
+   input logic 	       op_type;       // Operation   

-   output [63:0] Float1;	// Converted 1st input operand
-   output [63:0] Float2b;	// Converted 2nd input operand   
+   output logic [63:0] Float1;	      // Converted 1st input operand
+   output logic [63:0] Float2b;	      // Converted 2nd input operand   

-   wire [63:0] 	 Float2;   
-   wire 	 Zexp1;		// One if the exponent of op1 is zero
-   wire 	 Zexp2;		// One if the exponent of op2 is zero
-   wire 	 Oexp1;		// One if the exponent of op1 is all ones
-   wire 	 Oexp2;		// One if the exponent of op2 is all ones
+   logic [63:0]        Float2;   
+   logic 	       Zexp1;	      // One if the exponent of op1 is zero
+   logic 	       Zexp2;	      // One if the exponent of op2 is zero
+   logic 	       Oexp1;	      // One if the exponent of op1 is all ones
+   logic 	       Oexp2;	      // One if the exponent of op2 is all ones

   // Test if the input exponent is zero, because if it is then the
   // exponent of the converted number should be zero. 
--- a/wally-pipelined/src/fpu/divconv.sv
+++ b/wally-pipelined/src/fpu/divconv.sv
@ -1,11 +1,7 @@
-// `timescale 1ps/1ps
-module divconv (q1, qm1, qp1, q0, qm0, qp0, 
-		rega_out, regb_out, regc_out, regd_out,
-		regr_out, d, n, 
-		sel_muxa, sel_muxb, sel_muxr, 
-		reset, clk,
-		load_rega, load_regb, load_regc, load_regd,
-		load_regr, load_regs, P, op_type, exp_odd);
+`timescale 1ps/1ps
+module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
+		regr_out, d, n, sel_muxa, sel_muxb, sel_muxr, reset, clk, load_rega, load_regb, 
+		load_regc, load_regd, load_regr, load_regs, P, op_type, exp_odd);

   input logic [52:0]   d, n;
   input logic [2:0] 	sel_muxa, sel_muxb;
@ -40,9 +36,7 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   logic [127:0] 	constant, constant2;
   logic [63:0] 	q_const, qp_const, qm_const;
   logic [63:0] 	d2, n2;   
-   logic [11:0] 	d3;  
-
-   logic cout1, cout2, cout3, cout4, cout5, cout6, cout7, muxr_out; 
+   logic [11:0] 	d3;   

   // Check if exponent is odd for sqrt
   // If exp_odd=1 and sqrt, then M/2 and use ia_addr=0 as IA
@ -68,9 +62,9 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   mux2 #(64) mx5 (muxb_out, mcand_q, sel_muxr&op_type, mplier);   
   mux2 #(64) mx6 (muxa_out, mcand_q, sel_muxr, mcand);
   // TDM multiplier (carry/save)
-   multiplier mult1 (mcand, mplier, Sum, Carry);   // ***multiply
+   multiplier mult1 (mcand, mplier, Sum, Carry);
   // Q*D - N (reversed but changed in rounder.v to account for sign reversal)
-   csa #(128) csa1 (Sum, Carry, constant, Sum2, Carry2); //***adder
+   csa #(128) csa1 (Sum, Carry, constant, Sum2, Carry2);
   // Add ulp for subtraction in remainder
   mux2 #(1) mx7 (1'b0, 1'b1, sel_muxr, muxr_out);

@ -80,15 +74,17 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   mux2 #(64) mxA ({64'hFFFF_FFFF_FFFF_F9FF}, {64'hFFFF_FF3F_FFFF_FFFF}, P, qm_const);
   
   // CPA (from CSA)/Remainder addition/subtraction
-   ldf128 cpa1 (cout1, mul_out, Sum2, Carry2, muxr_out); //***adder
+   adder #(128) cpa1 (Sum2, Carry2, muxr_out, mul_out, cout1);   
+   
   // Assuming [1,2) - q1
-   ldf64 cpa2 (cout2, q_out1, regb_out, q_const, 1'b0); //***adder
-   ldf64 cpa3 (cout3, qp_out1, regb_out, qp_const, 1'b0); //***adder
-   ldf64 cpa4 (cout4, qm_out1, regb_out, qm_const, 1'b1);    //***adder
-   // Assuming [0.5,1) - q0
-   ldf64 cpa5 (cout5, q_out0, {regb_out[62:0], vss}, q_const, 1'b0); //***adder
-   ldf64 cpa6 (cout6, qp_out0, {regb_out[62:0], vss}, qp_const, 1'b0); //***adder
-   ldf64 cpa7 (cout7, qm_out0, {regb_out[62:0], vss}, qm_const, 1'b1); //***adder
+   adder #(64) cpa2 (regb_out, q_const, 1'b0, q_out1, cout2);
+   adder #(64) cpa3 (regb_out, qp_const, 1'b0, qp_out1, cout3);
+   adder #(64) cpa4 (regb_out, qm_const, 1'b1, qm_out1, cout4);
+   // Assuming [0.5,1) - q0   
+   adder #(64) cpa5 ({regb_out[62:0], vss}, q_const, 1'b0, q_out0, cout5);
+   adder #(64) cpa6 ({regb_out[62:0], vss}, qp_const, 1'b0, qp_out0, cout6);
+   adder #(64) cpa7 ({regb_out[62:0], vss}, qm_const, 1'b1, qm_out0, cout7);    
+
   // One's complement instead of two's complement (for hw efficiency)
   assign three = {~mul_out[126], mul_out[126], ~mul_out[125:63]};   
   mux2 #(64) mxTC (~mul_out[126:63], three[64:1],  op_type, twocmp_out);
@ -110,126 +106,151 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   
 endmodule // divconv

-// module adder #(parameter WIDTH=8)
-//    (input  logic [WIDTH-1:0] a, b,
-//     output logic [WIDTH-1:0] y);
+module adder #(parameter WIDTH=8)
+   (input  logic [WIDTH-1:0] a, b,
+    input logic 	     cin,
+    output logic [WIDTH-1:0] y,
+    output logic 	     cout);
   
-//    assign y = a + b;
+   assign {cout, y} = a + b + cin;
   
-// endmodule // adder
+endmodule // adder

-// module flopenr #(parameter WIDTH = 8)
-//    (input  logic             clk, reset, en,
-//     input  logic [WIDTH-1:0] d, 
-//     output logic [WIDTH-1:0] q);
+module flopenr #(parameter WIDTH = 8)
+   (input  logic             clk, reset, en,
+    input  logic [WIDTH-1:0] d, 
+    output logic [WIDTH-1:0] q);

-//    always_ff @(posedge clk, posedge reset)
-//      if (reset)   q <= #10 0;
-//      else if (en) q <= #10 d;
+   always_ff @(posedge clk, posedge reset)
+     if (reset)   q <= #10 0;
+     else if (en) q <= #10 d;
   
-// endmodule // flopenr
+endmodule // flopenr

-// module flopr #(parameter WIDTH = 8)
-//    (input  logic             clk, reset,
-//     input  logic [WIDTH-1:0] d, 
-//     output logic [WIDTH-1:0] q);
+module flopr #(parameter WIDTH = 8)
+   (input  logic             clk, reset,
+    input  logic [WIDTH-1:0] d, 
+    output logic [WIDTH-1:0] q);

-//    always_ff @(posedge clk, posedge reset)
-//      if (reset) q <= #10 0;
-//      else       q <= #10 d;
+   always_ff @(posedge clk, posedge reset)
+     if (reset) q <= #10 0;
+     else       q <= #10 d;
   
-// endmodule // flopr
+endmodule // flopr

-// module flopenrc #(parameter WIDTH = 8)
-//    (input  logic             clk, reset, en, clear,
-//     input  logic [WIDTH-1:0] d, 
-//     output logic [WIDTH-1:0] q);
+module flopenrc #(parameter WIDTH = 8)
+   (input  logic             clk, reset, en, clear,
+    input  logic [WIDTH-1:0] d, 
+    output logic [WIDTH-1:0] q);

-//    always_ff @(posedge clk, posedge reset)
-//      if (reset)    q <= #10 0;
-//      else if (en) 
-//        if (clear) q <= #10 0;
-//        else       q <= #10 d;
+   always_ff @(posedge clk, posedge reset)
+     if (reset)    q <= #10 0;
+     else if (en) 
+       if (clear) q <= #10 0;
+       else       q <= #10 d;
   
-// endmodule // flopenrc
+endmodule // flopenrc

-// module floprc #(parameter WIDTH = 8)
-//    (input  logic             clk, reset, clear,
-//     input  logic [WIDTH-1:0] d, 
-//     output logic [WIDTH-1:0] q);
+module floprc #(parameter WIDTH = 8)
+   (input  logic             clk, reset, clear,
+    input  logic [WIDTH-1:0] d, 
+    output logic [WIDTH-1:0] q);

-//    always_ff @(posedge clk, posedge reset)
-//      if (reset) q <= #10 0;
-//      else       
-//        if (clear) q <= #10 0;
-//        else       q <= #10 d;
+   always_ff @(posedge clk, posedge reset)
+     if (reset) q <= #10 0;
+     else       
+       if (clear) q <= #10 0;
+       else       q <= #10 d;
   
-// endmodule // floprc
+endmodule // floprc

-// module mux2 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, 
-//     input  logic             s, 
-//     output logic [WIDTH-1:0] y);
+module mux2 #(parameter WIDTH = 8)
+   (input  logic [WIDTH-1:0] d0, d1, 
+    input  logic             s, 
+    output logic [WIDTH-1:0] y);

-//    assign y = s ? d1 : d0;
+   assign y = s ? d1 : d0;
   
-// endmodule // mux2
+endmodule // mux2

-// module mux3 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, d2,
-//     input  logic [1:0]       s, 
-//     output logic [WIDTH-1:0] y);
+module mux3 #(parameter WIDTH = 8)
+   (input  logic [WIDTH-1:0] d0, d1, d2,
+    input  logic [1:0]       s, 
+    output logic [WIDTH-1:0] y);

-//    assign y = s[1] ? d2 : (s[0] ? d1 : d0);
+   assign y = s[1] ? d2 : (s[0] ? d1 : d0);
   
-// endmodule // mux3
+endmodule // mux3

-// module mux4 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, d2, d3,
-//     input  logic [1:0]       s, 
-//     output logic [WIDTH-1:0] y);
+module mux4 #(parameter WIDTH = 8)
+   (input  logic [WIDTH-1:0] d0, d1, d2, d3,
+    input  logic [1:0]       s, 
+    output logic [WIDTH-1:0] y);

-//    assign y = s[1] ? (s[0] ? d3 : d2) : (s[0] ? d1 : d0);
+   assign y = s[1] ? (s[0] ? d3 : d2) : (s[0] ? d1 : d0);

-// endmodule // mux4
+endmodule // mux4

-// module mux5 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4,
-//     input  logic [2:0]       s,
-//     output logic [WIDTH-1:0] y);
+module mux5 #(parameter WIDTH = 8)
+   (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4,
+    input  logic [2:0]       s,
+    output logic [WIDTH-1:0] y);
   
-//    always_comb
-//      casez (s)
-//        3'b000 : y = d0;       
-//        3'b001 : y = d1;
-//        3'b010 : y = d2;
-//        3'b011 : y = d3;
-//        3'b1?? : y = d4;
-//      endcase // casez (s)
+   always_comb
+     casez (s)
+       3'b000 : y = d0;       
+       3'b001 : y = d1;
+       3'b010 : y = d2;
+       3'b011 : y = d3;
+       3'b1?? : y = d4;
+     endcase // casez (s)

-// endmodule // mux5
+endmodule // mux5

-// module mux6 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4, d5,
-//     input  logic [2:0]       s,
-//     output logic [WIDTH-1:0] y);
+module mux6 #(parameter WIDTH = 8)
+   (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4, d5,
+    input  logic [2:0]       s,
+    output logic [WIDTH-1:0] y);
   
-//    always_comb
-//      casez (s)
-//        3'b000 : y = d0;       
-//        3'b001 : y = d1;
-//        3'b010 : y = d2;
-//        3'b011 : y = d3;
-//        3'b10? : y = d4;
-//        3'b11? : y = d5;       
-//      endcase // casez (s)
+   always_comb
+     casez (s)
+       3'b000 : y = d0;       
+       3'b001 : y = d1;
+       3'b010 : y = d2;
+       3'b011 : y = d3;
+       3'b10? : y = d4;
+       3'b11? : y = d5;       
+     endcase // casez (s)

-// endmodule // mux6
+endmodule // mux6

-// module eqcmp #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] a, b,
-//     output logic             y);
+module eqcmp #(parameter WIDTH = 8)
+   (input  logic [WIDTH-1:0] a, b,
+    output logic             y);

-//    assign y = (a == b);
+   assign y = (a == b);
   
-// endmodule // eqcmp
+endmodule // eqcmp
+
+// One-bit full adder: sum is the XOR of the three inputs and carry is
+// their majority (1 when at least two inputs are 1).
+module fa (input logic a, b, c, output logic sum, carry);
+
+   assign carry = (a & b) | (c & (a | b));
+   assign sum   = (a ^ b) ^ c;
+
+endmodule // fa
+
+// Carry-save adder: reduces three WIDTH-bit operands to a sum vector and a
+// carry vector using one full adder (fa) per bit position.
+//   sum[i]   = sum output of the full adder at bit i
+//   carry[i] = carry output of the full adder at bit i-1 (carry[0] is 0)
+// The carry out of the most significant bit is not part of the result.
+// The part-select below requires WIDTH >= 2.
+module csa #(parameter WIDTH=8) 
+   (input logic [WIDTH-1:0] a, b, c,
+    output logic [WIDTH-1:0] sum, carry);
+
+   // carry_temp[i+1] holds the carry out of bit i; there is no bit 0,
+   // so no element of the vector is left undriven.
+   logic [WIDTH:1] 	     carry_temp;   
+   genvar 		     i;
+   generate
+      for (i=0;i<WIDTH;i=i+1)
+	begin : genbit
+	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
+	end
+   endgenerate
+   // Exactly WIDTH bits: carries shifted left by one with a zero shifted in.
+   // (Previously a WIDTH+1-bit concatenation whose top bit was truncated away.)
+   assign carry = {carry_temp[WIDTH-1:1], 1'b0};     
+   
+endmodule // csa
--- a/wally-pipelined/src/fpu/exception_div.sv
+++ b/wally-pipelined/src/fpu/exception_div.sv
@ -1,38 +1,36 @@
 // Exception logic for the floating point adder. Note: We may 
 // actually want to move to where the result is computed.
-
 module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);

-   input [63:0] A;		// 1st input operand (op1)
-   input [63:0] B;		// 2nd input operand (op2)
-   input 	op_type;        // Determine operation   
+   input logic [63:0] A;		// 1st input operand (op1)
+   input logic [63:0] B;		// 2nd input operand (op2)
+   input logic 	      op_type;          // Determine operation   
   
-   output [2:0] Ztype;		// Indicates type of result (Z)
-   output 	Invalid;	// Invalid operation exception
-   output 	Denorm;		// Denormalized input
-   output       ANorm;          // A is not zero or Denorm
-   output       BNorm;          // B is not zero or Denorm
+   output logic [2:0] Ztype;		// Indicates type of result (Z)
+   output logic       Invalid;	        // Invalid operation exception
+   output logic       Denorm;		// Denormalized input
+   output logic       ANorm;            // A is not zero or Denorm
+   output logic       BNorm;            // B is not zero or Denorm
   
-   wire		AzeroM;	 	// '1' if the mantissa of A is zero
-   wire		BzeroM;		// '1' if the mantissa of B is zero
-   wire		AzeroE;	 	// '1' if the exponent of A is zero
-   wire		BzeroE;		// '1' if the exponent of B is zero
-   wire		AonesE;	 	// '1' if the exponent of A is all ones
-   wire		BonesE;		// '1' if the exponent of B is all ones
-   wire		ADenorm; 	// '1' if A is a denomalized number
-   wire		BDenorm; 	// '1' if B is a denomalized number
-   wire		AInf;	 	// '1' if A is infinite
-   wire		BInf;	 	// '1' if B is infinite
-   wire		AZero;	 	// '1' if A is 0
-   wire		BZero;	 	// '1' if B is 0
-   wire		ANaN;	 	// '1' if A is a not-a-number
-   wire		BNaN; 		// '1' if B is a not-a-number
-   wire		ASNaN;	 	// '1' if A is a signalling not-a-number
-   wire		BSNaN;	 	// '1' if B is a signalling not-a-number
-   wire		ZQNaN;	 	// '1' if result Z is a quiet NaN
-   wire		ZInf;	 	// '1' if result Z is an infnity
-   wire 	square_root;    // '1' if square root operation
-   wire 	Zero;           // '1' if result is zero   
+   logic 	      AzeroM;	 	// '1' if the mantissa of A is zero
+   logic 	      BzeroM;		// '1' if the mantissa of B is zero
+   logic 	      AzeroE;	 	// '1' if the exponent of A is zero
+   logic 	      BzeroE;		// '1' if the exponent of B is zero
+   logic 	      AonesE;	 	// '1' if the exponent of A is all ones
+   logic 	      BonesE;		// '1' if the exponent of B is all ones
+   logic 	      ADenorm; 	        // '1' if A is a denomalized number
+   logic 	      BDenorm; 	        // '1' if B is a denomalized number
+   logic 	      AInf;	 	// '1' if A is infinite
+   logic 	      BInf;	 	// '1' if B is infinite
+   logic 	      AZero;	 	// '1' if A is 0
+   logic 	      BZero;	 	// '1' if B is 0
+   logic 	      ANaN;	 	// '1' if A is a not-a-number
+   logic 	      BNaN; 		// '1' if B is a not-a-number
+   logic 	      ASNaN;	 	// '1' if A is a signalling not-a-number
+   logic 	      BSNaN;	 	// '1' if B is a signalling not-a-number
+   logic 	      ZQNaN;	 	// '1' if result Z is a quiet NaN
+   logic 	      ZInf;	 	// '1' if result Z is an infnity
+   logic 	      Zero;             // '1' if result is zero   
   
   parameter [51:0]  fifty_two_zeros = 52'h0; // Use parameter?

@ -93,4 +91,3 @@ module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
   assign Ztype[2] = BZero&~op_type;   

 endmodule // exception
-
--- a/wally-pipelined/src/fpu/fdivsqrt.sv
+++ b/wally-pipelined/src/fpu/fdivsqrt.sv
@ -1,256 +0,0 @@
-//
-// File name : fpdiv
-// Title     : Floating-Point Divider/Square-Root
-// project   : FPU
-// Library   : fpdiv
-// Author(s) : James E. Stine, Jr.
-// Purpose   : definition of main unit to floating-point div/sqrt
-// notes :   
-//
-// Copyright Oklahoma State University
-//
-// Basic Operations
-//
-// Step 1: Load operands, set flags, and convert SP to DP
-// Step 2: Check for special inputs ( +/- Infinity,  NaN)
-// Step 3: Exponent Logic
-// Step 4: Divide/Sqrt using Goldschmidt
-// Step 5: Normalize the result.//
-//   Shift left until normalized.  Normalized when the value to the 
-//   left of the binrary point is 1.
-// Step 6: Round the result.// 
-// Step 7: Put quotient/remainder onto output.
-//
-
-// `timescale 1ps/1ps
-module fdivsqrt (FDivSqrtDoneE, FDivResultM, FDivSqrtFlgM, DivInput1E, DivInput2E, FrmE, DivOpType, FmtE, DivOvEn, DivUnEn,
-	      FDivStartE, reset, clk, FDivBusyE, HoldInputs);
-
-   input [63:0] DivInput1E;		// 1st input operand (A)
-   input [63:0] DivInput2E;		// 2nd input operand (B)
-   input [2:0] 	FrmE;		// Rounding mode - specify values 
-   input 	DivOpType;	// Function opcode
-   input 	FmtE;   		// Result Precision (0 for double, 1 for single) //***will need to swap this
-   input 	DivOvEn;		// Overflow trap enabled
-   input 	DivUnEn;   	// Underflow trap enabled
-
-   input 	FDivStartE;
-   input 	reset;
-   input 	clk;   
-
-   output [63:0] FDivResultM;	// Result of operation
-   output [4:0]  FDivSqrtFlgM;   	// IEEE exception flags 
-   output 	 FDivSqrtDoneE;
-   output    FDivBusyE, HoldInputs;
-
-   supply1 	  vdd;
-   supply0 	  vss;   
-
-   wire [63:0] 	 Float1; 
-   wire [63:0] 	 Float2;
-   wire [63:0] 	 IntValue;
-   
-   wire 	 DivDenormM;   	// DivDenormM on input or output
-   wire [12:0] 	 exp1, exp2, expF;
-   wire [12:0] 	 exp_diff, bias;
-   wire [13:0] 	 exp_sqrt;
-   wire [12:0] 	 exp_s;
-   wire [12:0] 	 exp_c;
-   
-   wire [10:0] 	 exponent, exp_pre;
-   wire [63:0] 	 Result;   
-   wire [52:0] 	 mantissaA;
-   wire [52:0] 	 mantissaB; 
-   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
-   
-   wire [5:0] 	 align_shift;
-   wire [5:0] 	 norm_shift;
-   wire [2:0] 	 sel_inv;
-   wire		 op1_Norm, op2_Norm;
-   wire		 opA_Norm, opB_Norm;
-   wire		 Invalid;
-   wire 	 DenormIn, DenormIO;
-   wire [4:0] 	 FlagsIn;   	
-   wire 	 exp_gt63;
-   wire 	 Sticky_out;
-   wire 	 signResult, sign_corr;
-   wire          corr_sign;
-   wire 	 zeroB;         
-   wire 	 convert;
-   wire          swap;
-   wire          sub;
-   
-   wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
-   wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
-   wire [127:0]  regr_out;
-   wire [2:0] 	 sel_muxa, sel_muxb;
-   wire 	 sel_muxr;   
-   wire 	 load_rega, load_regb, load_regc, load_regd, load_regr, load_regs;
-
-   wire 	 donev, sel_muxrv, sel_muxsv;
-   wire [1:0] 	 sel_muxav, sel_muxbv;   
-   wire 	 load_regav, load_regbv, load_regcv;
-   wire 	 load_regrv, load_regsv;
-   
-   logic exp_cout1, exp_cout2, exp_odd, open;
-
-
-   // Convert the input operands to their appropriate forms based on 
-   // the orignal operands, the DivOpType , and their precision FmtE. 
-   // Single precision inputs are converted to double precision 
-   // and the sign of the first operand is set appropratiately based on
-   // if the operation is absolute value or negation. 
-   convert_inputs_div divconv1 (Float1, Float2, DivInput1E, DivInput2E, DivOpType, FmtE);
-
-   // Test for exceptions and return the "Invalid Operation" and
-   // "Denormalized" Input FDivSqrtFlgM. The "sel_inv" is used in
-   // the third pipeline stage to select the result. Also, op1_Norm
-   // and op2_Norm are one if DivInput1E and DivInput2E are not zero or denormalized.
-   // sub is one if the effective operation is subtaction. 
-   exception_div divexc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
-		   Float1, Float2, DivOpType);
-
-   // Determine Sign/Mantissa
-   assign signResult = ((Float1[63]^Float2[63])&~DivOpType) | Float1[63]&DivOpType;
-   assign mantissaA = {vdd, Float1[51:0]};
-   assign mantissaB = {vdd, Float2[51:0]};
-   // Perform Exponent Subtraction - expA - expB + Bias   
-   assign exp1 = {2'b0, Float1[62:52]};
-   assign exp2 = {2'b0, Float2[62:52]};
-   // bias : DP = 2^{11-1}-1 = 1023
-   assign bias = {3'h0, 10'h3FF};
-   // Divide exponent
-   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c); //***adder
-   exp_add explogic1 (exp_cout1, {open, exp_diff}, //***adder?
-		      {vss, exp_s}, {vss, exp_c}, 1'b1);
-   // Sqrt exponent (check if exponent is odd)
-   assign exp_odd = Float1[52] ? vss : vdd;
-   exp_add explogic2 (exp_cout2, exp_sqrt, //***adder?
-		      {vss, exp1}, {4'h0, 10'h3ff}, exp_odd);
-   // Choose correct exponent
-   assign expF = DivOpType ? exp_sqrt[13:1] : exp_diff;   
-
-   // Main Goldschmidt/Division Routine
-   divconv goldy (q1, qm1, qp1, q0, qm0, qp0, 
-		  rega_out, regb_out, regc_out, regd_out,
-		  regr_out, mantissaB, mantissaA, 
-		  sel_muxa, sel_muxb, sel_muxr, 
-		  reset, clk,
-		  load_rega, load_regb, load_regc, load_regd,
-		  load_regr, load_regs, FmtE, DivOpType, exp_odd);
-
-   // FSM : control divider
-   fsm control (FDivSqrtDoneE, load_rega, load_regb, load_regc, load_regd, 
-		load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
-		clk, reset, FDivStartE, DivOpType, FDivBusyE, HoldInputs);
-   
-   // Round the mantissa to a 52-bit value, with the leading one
-   // removed. The rounding units also handles special cases and 
-   // set the exception flags.
-   //***add max magnitude and swap negitive and positive infinity
-   rounder_div divround1 (Result, DenormIO, FlagsIn, 
-		   FrmE, FmtE, DivOvEn, DivUnEn, expF, 
-   		   sel_inv, Invalid, DenormIn, signResult, 
-		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
-
-   // Store the final result and the exception flags in registers.
-   flopenr #(64) rega (clk, reset, FDivSqrtDoneE, Result, FDivResultM);
-   flopenr #(1) regb (clk, reset, FDivSqrtDoneE, DenormIO, DivDenormM);   
-   flopenr #(5) regc (clk, reset, FDivSqrtDoneE, FlagsIn, FDivSqrtFlgM);   
-   
-endmodule // fpadd
-
-//
-// Brent-Kung Prefix Adder 
-//   (yes, it is 14 bits as my generator is broken for 13 bits :( 
-//    assume, synthesizer will delete stuff not needed )
-//
-module exp_add (cout, sum, a, b, cin);
-   
-   input [13:0] a, b;
-   input 	cin;
-   
-   output [13:0] sum;
-   output 	 cout;
-
-   wire [14:0] 	 p,g;
-   wire [13:0] 	 c;
-
-   // pre-computation
-   assign p={a^b,1'b0};
-   assign g={a&b, cin};
-
-   // prefix tree
-   brent_kung prefix_tree(c, p[13:0], g[13:0]);
-
-   // post-computation
-   assign sum=p[14:1]^c;
-   assign cout=g[14]|(p[14]&c[13]);
-
-endmodule // exp_add
-
-module brent_kung (c, p, g);
-   
-   input [13:0] p;
-   input [13:0] g;
-   output [14:1] c;
-
-   logic G_1_0, G_3_2,G_5_4,G_7_6,G_9_8,G_11_10,G_13_12,G_3_0,G_7_4,G_11_8;
-   logic P_3_2,P_5_4,P_7_6,P_9_8,P_11_10,P_13_12,P_7_4,P_11_8;
-   logic G_7_0,G_11_0,G_5_0,G_9_0,G_13_0,G_2_0,G_4_0,G_6_0,G_8_0,G_10_0,G_12_0;
-   // parallel-prefix, Brent-Kung
-
-   // Stage 1: Generates G/FmtE pairs that span 1 bits
-   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-
-   // Stage 2: Generates G/FmtE pairs that span 2 bits
-   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-
-   // Stage 3: Generates G/FmtE pairs that span 4 bits
-   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-
-   // Stage 4: Generates G/FmtE pairs that span 8 bits
-
-   // Stage 5: Generates G/FmtE pairs that span 4 bits
-   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-
-   // Stage 6: Generates G/FmtE pairs that span 2 bits
-   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-   grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
-
-   // Last grey cell stage 
-   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
-   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
-   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
-
-   // Final Stage: Apply c_k+1=G_k_0
-   assign c[1]=g[0];
-   assign c[2]=G_1_0;
-   assign c[3]=G_2_0;
-   assign c[4]=G_3_0;
-   assign c[5]=G_4_0;
-   assign c[6]=G_5_0;
-   assign c[7]=G_6_0;
-   assign c[8]=G_7_0;
-   assign c[9]=G_8_0;
-
-   assign c[10]=G_9_0;
-   assign c[11]=G_10_0;
-   assign c[12]=G_11_0;
-   assign c[13]=G_12_0;
-   assign c[14]=G_13_0;
-
-endmodule // brent_kung
-
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@ -0,0 +1,151 @@
+//
+// File name : fpdiv
+// Title     : Floating-Point Divider/Square-Root
+// project   : FPU
+// Library   : fpdiv
+// Author(s) : James E. Stine, Jr.
+// Purpose   : definition of main unit to floating-point div/sqrt
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Basic Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Exponent Logic
+// Step 4: Divide/Sqrt using Goldschmidt
+// Step 5: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 6: Round the result.
+// Step 7: Put quotient/remainder onto output.
+//
+
+`timescale 1ps/1ps
+// Top level of the floating-point divide / square-root unit.
+// Converts and classifies the operands, computes the result exponent,
+// runs the divconv datapath under control of fsm_div, rounds the result,
+// and registers the result, denorm indicator and flags while done is high.
+module fpdiv (done, AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn,
+	      start, reset, clk);
+
+   input [63:0] op1;		// 1st input operand (A)
+   input [63:0] op2;		// 2nd input operand (B)
+   input [1:0] 	rm;		// Rounding mode - specify values 
+   input 	op_type;	// Function opcode (1 selects the sqrt exponent below)
+   input 	P;   		// Result Precision (0 for double, 1 for single)
+   input 	OvEn;		// Overflow trap enabled
+   input 	UnEn;   	// Underflow trap enabled
+   input 	start;
+   input 	reset;
+   input 	clk;   
+
+   output [63:0] AS_Result;	// Result of operation
+   output [4:0]  Flags;   	// IEEE exception flags 
+   output 	 Denorm;   	// Denorm on input or output
+   output 	 done;
+
+   supply1 	  vdd;
+   supply0 	  vss;   
+
+   wire [63:0] 	 Float1; 
+   wire [63:0] 	 Float2;
+   wire [63:0] 	 IntValue;
+   
+   wire [12:0] 	 exp1, exp2, expF;
+   wire [12:0] 	 exp_diff, bias;
+   wire [13:0] 	 exp_sqrt;
+   wire [12:0] 	 exp_s;
+   wire [12:0] 	 exp_c;
+   
+   wire [10:0] 	 exponent, exp_pre;
+   wire [63:0] 	 Result;   
+   wire [52:0] 	 mantissaA;
+   wire [52:0] 	 mantissaB; 
+   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
+   
+   wire [5:0] 	 align_shift;
+   wire [5:0] 	 norm_shift;
+   wire [2:0] 	 sel_inv;
+   wire		 op1_Norm, op2_Norm;
+   wire		 opA_Norm, opB_Norm;
+   wire		 Invalid;
+   wire 	 DenormIn, DenormIO;
+   wire [4:0] 	 FlagsIn;   	
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire 	 signResult, sign_corr;
+   wire          corr_sign;
+   wire 	 zeroB;         
+   wire 	 convert;
+   wire          swap;
+   wire          sub;
+   
+   wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
+   wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
+   wire [127:0]  regr_out;
+   wire [2:0] 	 sel_muxa, sel_muxb;
+   wire 	 sel_muxr;   
+   wire 	 load_rega, load_regb, load_regc, load_regd, load_regr;
+   wire 	 load_regs;
+
+   wire 	 donev, sel_muxrv, sel_muxsv;
+   wire [1:0] 	 sel_muxav, sel_muxbv;   
+   wire 	 load_regav, load_regbv, load_regcv;
+   wire 	 load_regrv, load_regsv;
+
+   // Nets that were previously created implicitly by their first use.
+   // Declared explicitly so the module also elaborates under
+   // `default_nettype none; widths are unchanged (1 bit each).
+   wire 	 exp_odd;              // inverse of Float1[52]; see assign below
+   wire 	 exp_cout1, exp_cout2; // carry outs of the exponent adders (not read here)
+   wire 	 open;                 // discarded top bit of the explogic1 sum
+   wire 	 error;                // NOTE(review): only connected to fsm_div; never driven or
+                                       // read in this module - confirm the fsm_div port direction
+   
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the op_type, and their precision P. 
+   // Single precision inputs are converted to double precision 
+   // and the sign of the first operand is set appropriately based on
+   // if the operation is absolute value or negation.   
+   convert_inputs_div conv1 (Float1, Float2, op1, op2, op_type, P);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" Input Flags. The "sel_inv" is used in
+   // the third pipeline stage to select the result. Also, op1_Norm
+   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
+   // sub is one if the effective operation is subtraction.   
+   exception_div exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
+		       Float1, Float2, op_type);
+
+   // Determine Sign/Mantissa
+   assign signResult = ((Float1[63]^Float2[63])&~op_type) | Float1[63]&op_type;
+   assign mantissaA = {vdd, Float1[51:0]};
+   assign mantissaB = {vdd, Float2[51:0]};
+   // Perform Exponent Subtraction - expA - expB + Bias   
+   assign exp1 = {2'b0, Float1[62:52]};
+   assign exp2 = {2'b0, Float2[62:52]};
+   // bias : DP = 2^{11-1}-1 = 1023
+   assign bias = {3'h0, 10'h3FF};
+   // Divide exponent
+   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c);
+   adder #(14) explogic1 ({vss, exp_s}, {vss, exp_c}, 1'b1, {open, exp_diff}, exp_cout1);
+   
+   // Sqrt exponent (check if exponent is odd)
+   assign exp_odd = Float1[52] ? vss : vdd;
+   adder #(14) explogic2 ({vss, exp1}, {4'h0, 10'h3ff}, exp_odd, exp_sqrt, exp_cout2);
+   // Choose correct exponent
+   assign expF = op_type ? exp_sqrt[13:1] : exp_diff;   
+
+   // Main Goldschmidt/Division Routine   
+   divconv goldy (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
+		  regr_out, mantissaB, mantissaA, sel_muxa, sel_muxb, sel_muxr, 
+		  reset, clk,  load_rega, load_regb, load_regc, load_regd,
+		  load_regr, load_regs, P, op_type, exp_odd);
+
+   // FSM : control divider   
+   fsm_div control (done, load_rega, load_regb, load_regc, load_regd, 
+		    load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
+		    clk, reset, start, error, op_type);
+   
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. The rounding unit also handles special cases and 
+   // sets the exception flags.   
+   rounder_div round1 (Result, DenormIO, FlagsIn, 
+		   rm, P, OvEn, UnEn, expF, 
+   		   sel_inv, Invalid, DenormIn, signResult, 
+		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
+
+   // Store the final result and the exception flags in registers.
+   // done is the register enable, so the outputs only update while done is high.
+   flopenr #(64) rega (clk, reset, done, Result, AS_Result);
+   flopenr #(1) regb (clk, reset, done, DenormIO, Denorm);   
+   flopenr #(5) regc (clk, reset, done, FlagsIn, Flags);   
+   
+endmodule // fpdiv
--- a/wally-pipelined/src/fpu/fpdivsqrt/adder_ip.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/adder_ip.sv
@ -1,9 +0,0 @@
-module adder_ip #(parameter WIDTH=8)
-   (input  logic [WIDTH-1:0] a, b,
-    input logic 	     cin,
-    output logic [WIDTH-1:0] y,
-    output logic 	     cout);
-   
-   assign {cout, y} = a + b + cin;
-   
-endmodule // adder
--- a/wally-pipelined/src/fpu/fpdivsqrt/convert_inputs_div.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/convert_inputs_div.sv
@ -3,8 +3,7 @@
 // it conditionally converts single precision values to double 
 // precision values and modifies the sign of op1. 
 // The converted operands are Float1 and Float2.
-
-module convert_inputs(Float1, Float2b, op1, op2, op_type, P);
+module convert_inputs_div (Float1, Float2b, op1, op2, op_type, P);
   
   input logic [63:0]  op1;           // 1st input operand (A)
   input logic [63:0]  op2;           // 2nd input operand (B)
--- a/wally-pipelined/src/fpu/fpdivsqrt/divconvDP.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/divconvDP.sv
@ -1,19 +1,13 @@
 `timescale 1ps/1ps
-module divconv (q1, qm1, qp1, q0, qm0, qp0, 
-		rega_out, regb_out, regc_out, regd_out,
-		regr_out, d, n, 
-		sel_muxa, sel_muxb, sel_muxr, 
-		reset, clk,
-		load_rega, load_regb, load_regc, load_regd,
-		load_regr, load_regs, load_regp,
-		P, op_type, exp_odd);
+module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
+		regr_out, d, n, sel_muxa, sel_muxb, sel_muxr, reset, clk, load_rega, load_regb, 
+		load_regc, load_regd, load_regr, load_regs, P, op_type, exp_odd);

   input logic [52:0]   d, n;
   input logic [2:0] 	sel_muxa, sel_muxb;
   input logic 	        sel_muxr;   
   input logic 	        load_rega, load_regb, load_regc, load_regd;
   input logic 		load_regr, load_regs;
-   input logic 		load_regp;   
   input logic 		P;
   input logic 		op_type;
   input logic 		exp_odd;   
@ -78,86 +72,47 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0,
   mux2 #(64) mx8 ({64'h0000_0000_0000_0200}, {64'h0000_0040_0000_0000}, P, q_const);
   mux2 #(64) mx9 ({64'h0000_0000_0000_0A00}, {64'h0000_0140_0000_0000}, P, qp_const);
   mux2 #(64) mxA ({64'hFFFF_FFFF_FFFF_F9FF}, {64'hFFFF_FF3F_FFFF_FFFF}, P, qm_const);
-
-   logic [127:0] 	Sum_pipe;
-   logic [127:0] 	Carry_pipe;
-   logic 		muxr_pipe;   
-   logic 		rega_pipe;
-   logic 		regb_pipe;
-   logic 		regc_pipe;
-   logic 		regd_pipe;
-   logic 		regs_pipe;
-   logic 		regr_pipe;
-   logic 		P_pipe;
-   logic 		op_type_pipe;
-   logic [63:0] 	q_const_pipe;
-   logic [63:0] 	qm_const_pipe;
-   logic [63:0] 	qp_const_pipe;   
   
-   // Pipeline Stage 2 of iteration for Goldschmidt's algorithm
-   flopenr #(128) regp1 (clk, reset, load_regp, Sum2, Sum_pipe);
-   flopenr #(128) regp2 (clk, reset, load_regp, Carry2, Carry_pipe);
-   flopenr #(1) regp3 (clk, reset, load_regp, muxr_out, muxr_pipe);
-
-   flopenr #(1) regp4 (clk, reset, load_regp, load_rega, rega_pipe);
-   flopenr #(1) regp5 (clk, reset, load_regp, load_regb, regb_pipe);
-   flopenr #(1) regp6 (clk, reset, load_regp, load_regc, regc_pipe);
-   flopenr #(1) regp7 (clk, reset, load_regp, load_regd, regd_pipe);
-   flopenr #(1) regp8 (clk, reset, load_regp, load_regs, regs_pipe);
-   flopenr #(1) regp9 (clk, reset, load_regp, load_regr, regr_pipe);
-   flopenr #(1) regpA (clk, reset, load_regp, P, P_pipe);
-   flopenr #(1) regpB (clk, reset, load_regp, op_type, op_type_pipe);
-   flopenr #(64) regpC (clk, reset, load_regp, q_const, q_const_pipe);
-   flopenr #(64) regpD (clk, reset, load_regp, qp_const, qp_const_pipe);
-   flopenr #(64) regpE (clk, reset, load_regp, qm_const, qm_const_pipe);
-
   // CPA (from CSA)/Remainder addition/subtraction
-   adder_ip #(128) cpa1 (Sum_pipe, Carry_pipe, muxr_pipe, mul_out, cout1);   
-   // ldf128 cpa1 (cout1, mul_out, Sum_pipe, Carry_pipe, muxr_pipe);
-   // One's complement instead of two's complement (for hw efficiency)
-   assign three = {~mul_out[126] , mul_out[126], ~mul_out[125:63]};   
-   mux2 #(64) mxTC (~mul_out[126:63], three[64:1],  op_type_pipe, twocmp_out);
+   adder #(128) cpa1 (Sum2, Carry2, muxr_out, mul_out, cout1);   
   
   // Assuming [1,2) - q1
-   adder_ip #(64) cpa2 (regb_out, q_const_pipe, 1'b0, q_out1, cout2);
-   adder_ip #(64) cpa3 (regb_out, qp_const_pipe, 1'b0, qp_out1, cout3);
-   adder_ip #(64) cpa4 (regb_out, qm_const_pipe, 1'b1, qm_out1, cout4);
-   adder_ip #(64) cpa5 ({regb_out[62:0], vss}, q_const_pipe, 1'b0, q_out0, cout5);
-   adder_ip #(64) cpa6 ({regb_out[62:0], vss}, qp_const_pipe, 1'b0, qp_out0, cout6);
-   adder_ip #(64) cpa7 ({regb_out[62:0], vss}, qm_const_pipe, 1'b1, qm_out0, cout7);      
-  
-   //ldf64 cpa2 (cout2, q_out1, regb_out, q_const_pipe, 1'b0);
-   //ldf64 cpa3 (cout3, qp_out1, regb_out, qp_const_pipe, 1'b0);
-   //ldf64 cpa4 (cout4, qm_out1, regb_out, qm_const_pipe, 1'b1);   
-   // Assuming [0.5,1) - q0
-   //ldf64 cpa5 (cout5, q_out0, {regb_out[62:0], vss}, q_const_pipe, 1'b0);
-   //ldf64 cpa6 (cout6, qp_out0, {regb_out[62:0], vss}, qp_const_pipe, 1'b0);
-   //ldf64 cpa7 (cout7, qm_out0, {regb_out[62:0], vss}, qm_const_pipe, 1'b1);
+   adder #(64) cpa2 (regb_out, q_const, 1'b0, q_out1, cout2);
+   adder #(64) cpa3 (regb_out, qp_const, 1'b0, qp_out1, cout3);
+   adder #(64) cpa4 (regb_out, qm_const, 1'b1, qm_out1, cout4);
+   // Assuming [0.5,1) - q0   
+   adder #(64) cpa5 ({regb_out[62:0], vss}, q_const, 1'b0, q_out0, cout5);
+   adder #(64) cpa6 ({regb_out[62:0], vss}, qp_const, 1'b0, qp_out0, cout6);
+   adder #(64) cpa7 ({regb_out[62:0], vss}, qm_const, 1'b1, qm_out0, cout7);    
+
+   // One's complement instead of two's complement (for hw efficiency)
+   assign three = {~mul_out[126], mul_out[126], ~mul_out[125:63]};   
+   mux2 #(64) mxTC (~mul_out[126:63], three[64:1],  op_type, twocmp_out);

   // regs
-   flopenr #(64) regc (clk, reset, regc_pipe, twocmp_out, regc_out);
-   flopenr #(64) regb (clk, reset, regb_pipe, mul_out[126:63], regb_out);
-   flopenr #(64) rega (clk, reset, rega_pipe, mul_out[126:63], rega_out);
-   flopenr #(64) regd (clk, reset, regd_pipe, mul_out[126:63], regd_out);
-
-   // remainder
-   flopenr #(128) regr (clk, reset, regr_pipe, mul_out, regr_out);
+   flopenr #(64) regc (clk, reset, load_regc, twocmp_out, regc_out);
+   flopenr #(64) regb (clk, reset, load_regb, mul_out[126:63], regb_out);
+   flopenr #(64) rega (clk, reset, load_rega, mul_out[126:63], rega_out);
+   flopenr #(64) regd (clk, reset, load_regd, mul_out[126:63], regd_out);
+   flopenr #(128) regr (clk, reset, load_regr, mul_out, regr_out);
   // Assuming [1,2)
-   flopenr #(64) rege (clk, reset, regs_pipe, {q_out1[63:39], (q_out1[38:10] & {29{~P_pipe}}), 10'h0}, q1);   
-   flopenr #(64) regf (clk, reset, regs_pipe, {qm_out1[63:39], (qm_out1[38:10] & {29{~P_pipe}}), 10'h0}, qm1);
-   flopenr #(64) regg (clk, reset, regs_pipe, {qp_out1[63:39], (qp_out1[38:10] & {29{~P_pipe}}), 10'h0}, qp1);
+   flopenr #(64) rege (clk, reset, load_regs, {q_out1[63:39], (q_out1[38:10] & {29{~P}}), 10'h0}, q1);   
+   flopenr #(64) regf (clk, reset, load_regs, {qm_out1[63:39], (qm_out1[38:10] & {29{~P}}), 10'h0}, qm1);
+   flopenr #(64) regg (clk, reset, load_regs, {qp_out1[63:39], (qp_out1[38:10] & {29{~P}}), 10'h0}, qp1);
   // Assuming [0.5,1)
-   flopenr #(64) regh (clk, reset, regs_pipe, {q_out0[63:39], (q_out0[38:10] & {29{~P_pipe}}), 10'h0}, q0);
-   flopenr #(64) regj (clk, reset, regs_pipe, {qm_out0[63:39], (qm_out0[38:10] & {29{~P_pipe}}), 10'h0}, qm0);
-   flopenr #(64) regk (clk, reset, regs_pipe, {qp_out0[63:39], (qp_out0[38:10] & {29{~P_pipe}}), 10'h0}, qp0);
+   flopenr #(64) regh (clk, reset, load_regs, {q_out0[63:39], (q_out0[38:10] & {29{~P}}), 10'h0}, q0);
+   flopenr #(64) regj (clk, reset, load_regs, {qm_out0[63:39], (qm_out0[38:10] & {29{~P}}), 10'h0}, qm0);
+   flopenr #(64) regk (clk, reset, load_regs, {qp_out0[63:39], (qp_out0[38:10] & {29{~P}}), 10'h0}, qp0);
   
 endmodule // divconv

 module adder #(parameter WIDTH=8)
   (input  logic [WIDTH-1:0] a, b,
-    output logic [WIDTH-1:0] y);
+    input logic 	     cin,
+    output logic [WIDTH-1:0] y,
+    output logic 	     cout);
   
-   assign y = a + b;
+   assign {cout, y} = a + b + cin;
   
 endmodule // adder

--- a/wally-pipelined/src/fpu/fpdivsqrt/exception_div.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/exception_div.sv
@ -1,7 +1,6 @@
 // Exception logic for the floating point adder. Note: We may 
 // actually want to move to where the result is computed.
-
-module exception (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
+module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);

   input logic [63:0] A;		// 1st input operand (op1)
   input logic [63:0] B;		// 2nd input operand (op2)
@ -31,7 +30,6 @@ module exception (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
   logic 	      BSNaN;	 	// '1' if B is a signalling not-a-number
   logic 	      ZQNaN;	 	// '1' if result Z is a quiet NaN
   logic 	      ZInf;	 	// '1' if result Z is an infinity
-   logic 	      square_root;      // '1' if square root operation
   logic 	      Zero;             // '1' if result is zero   
   
   parameter [51:0]  fifty_two_zeros = 52'h0; // Use parameter?
@ -93,4 +91,3 @@ module exception (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
   assign Ztype[2] = BZero&~op_type;   

 endmodule // exception_div
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/f32_div_rd.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f32_div_rd.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f32_div_rd.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f32_div_rd.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -52,5 +53,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9269690000 ns
+run 299690000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f32_div_rne.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f32_div_rne.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f32_div_rne.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f32_div_rne.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -52,5 +53,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9269690000 ns
+run 299690000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f32_div_ru.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f32_div_ru.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f32_div_ru.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f32_div_ru.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -52,5 +53,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9269690000 ns
+run 299690000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f32_div_rz.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f32_div_rz.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f32_div_rz.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f32_div_rz.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -52,5 +53,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9269690000 ns
+run 299690000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f32_sqrt_rd.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f32_sqrt_rd.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f32_sqrt_rd.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f32_sqrt_rd.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -51,5 +52,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9234244000 ns
+run 294244000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f32_sqrt_rne.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f32_sqrt_rne.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f32_sqrt_rne.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f32_sqrt_rne.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -51,5 +52,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9234244000 ns
+run 294244000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f32_sqrt_ru.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f32_sqrt_ru.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f32_sqrt_ru.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f32_sqrt_ru.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -51,5 +52,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9234244000 ns
+run 294244000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f32_sqrt_rz.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f32_sqrt_rz.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f32_sqrt_rz.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f32_sqrt_rz.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -51,5 +52,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9234244000 ns
+run 294244000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f64_div_rd.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f64_div_rd.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f64_div_rd.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f64_div_rd.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -41,8 +42,8 @@ vsim -voptargs=+acc work.tb
 -- Set Wave Output Items 
 TreeUpdate [SetDefaultTree]
 WaveRestoreZoom {0 ps} {75 ns}
-configure wave -namecolwidth 350
-configure wave -valuecolwidth 200
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
 configure wave -justifyvalue left
 configure wave -signalnamewidth 0
 configure wave -snapdistance 10
@ -52,5 +53,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9338600000 ns
+run 368600000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f64_div_rne.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f64_div_rne.do
@ -27,7 +27,7 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f64_div_rne.sv
+vlog mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f64_div_rne.sv


 # start and run simulation
@ -53,5 +53,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9398600000 ns
+run 368600000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f64_div_ru.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f64_div_ru.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f64_div_ru.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f64_div_ru.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -52,5 +53,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9338600000 ns
+run 368600000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f64_div_rz.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f64_div_rz.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f64_div_rz.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f64_div_rz.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -41,8 +42,8 @@ vsim -voptargs=+acc work.tb
 -- Set Wave Output Items 
 TreeUpdate [SetDefaultTree]
 WaveRestoreZoom {0 ps} {75 ns}
-configure wave -namecolwidth 350
-configure wave -valuecolwidth 250
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
 configure wave -justifyvalue left
 configure wave -signalnamewidth 0
 configure wave -snapdistance 10
@ -52,5 +53,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 9398600000 ns
+run 368600000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f64_sqrt_rd.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f64_sqrt_rd.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f64_sqrt_rd.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f64_sqrt_rd.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -51,5 +52,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 94364000 ns
+run 4364000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f64_sqrt_rne.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f64_sqrt_rne.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f64_sqrt_rne.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f64_sqrt_rne.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -51,5 +52,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 94364000 ns
+run 4364000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f64_sqrt_ru.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f64_sqrt_ru.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f64_sqrt_ru.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f64_sqrt_ru.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -51,5 +52,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 94364000 ns
+run 4364000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/f64_sqrt_rz.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/f64_sqrt_rz.do
@ -27,7 +27,8 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv mult_R4_64_64_cs.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv tb_f64_sqrt_rz.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv tb_f64_sqrt_rz.sv
+

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -51,5 +52,5 @@ configure wave -childrowmargin 2

 -- Run the Simulation 
 --   39,052 vectors, 390,565ns
-run 94364000 ns
+run 4364000
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/fpdiv.do
+++ b/wally-pipelined/src/fpu/fpdivsqrt/fpdiv.do
@ -27,7 +27,7 @@ if [file exists work] {
 vlib work

 # compile source files
-vlog adder_ip.sv bk15.v mult_R4_64_64_cs.v ldf128.v ldf64.v sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm.v divconvDP.sv convert_inputs.sv exception.sv rounder.sv fpdiv.sv test_fpdiv.sv
+vlog mult_R4_64_64_cs.v  sbtm_a1.sv sbtm_a0.sv sbtm.sv sbtm_a4.sv sbtm_a5.sv sbtm3.sv fsm_div.v divconvDP.sv convert_inputs_div.sv exception_div.sv rounder_div.sv fpdiv.sv test_fpdiv.sv

 # start and run simulation
 vsim -voptargs=+acc work.tb
@ -74,29 +74,14 @@ add wave -noupdate -divider -height 32 "Exceptions"
 add wave -hex -r /tb/dut/exc1/*
 add wave -noupdate -divider -height 32 "Rounder"
 add wave -hex -r /tb/dut/round1/*
-add wave -noupdate -divider -height 32 "Pipe State"
-add wave -hex -r /tb/dut/goldy/Sum_pipe;
-add wave -hex -r /tb/dut/goldy/Carry_pipe;
-add wave -hex -r /tb/dut/goldy/muxr_pipe;   
-add wave -hex -r /tb/dut/goldy/rega_pipe;
-add wave -hex -r /tb/dut/goldy/regb_pipe;
-add wave -hex -r /tb/dut/goldy/regc_pipe;
-add wave -hex -r /tb/dut/goldy/regd_pipe;
-add wave -hex -r /tb/dut/goldy/regs_pipe;
-add wave -hex -r /tb/dut/goldy/regr_pipe;
-add wave -hex -r /tb/dut/goldy/P_pipe;
-add wave -hex -r /tb/dut/goldy/op_type_pipe;
-add wave -hex -r /tb/dut/goldy/q_const_pipe;
-add wave -hex -r /tb/dut/goldy/qm_const_pipe;
-add wave -hex -r /tb/dut/goldy/qp_const_pipe;   
 add wave -noupdate -divider -height 32 "Goldschmidt"
 add wave -hex -r /tb/dut/goldy/*

 -- Set Wave Output Items 
 TreeUpdate [SetDefaultTree]
 WaveRestoreZoom {0 ps} {75 ns}
-configure wave -namecolwidth 350
-configure wave -valuecolwidth 250
+configure wave -namecolwidth 150
+configure wave -valuecolwidth 100
 configure wave -justifyvalue left
 configure wave -signalnamewidth 0
 configure wave -snapdistance 10
@ -105,5 +90,5 @@ configure wave -rowmargin 4
 configure wave -childrowmargin 2

 -- Run the Simulation 
-run 20ns
+run 14ns
 quit
--- a/wally-pipelined/src/fpu/fpdivsqrt/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/fpdiv.sv
@ -1,5 +1,5 @@
 //
-// File name : fpdivP
+// File name : fpdiv
 // Title     : Floating-Point Divider/Square-Root
 // project   : FPU
 // Library   : fpdiv
@ -26,94 +26,89 @@
 module fpdiv (done, AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn,
 	      start, reset, clk);

-   input logic [63:0] op1;		// 1st input operand (A)
-   input logic [63:0] op2;		// 2nd input operand (B)
-   input logic [1:0] 	rm;		// Rounding mode - specify values 
-   input logic 		op_type;	// Function opcode
-   input logic 		P;   		// Result Precision (0 for double, 1 for single)
-   input logic 		OvEn;		// Overflow trap enabled
-   input logic 		UnEn;   	// Underflow trap enabled
-   
-   input logic 		start;
-   input logic 		reset;
-   input logic 		clk;   
-   
-   output logic [63:0] 	AS_Result;	// Result of operation
-   output logic [4:0] 	Flags;   	// IEEE exception flags 
-   output logic 	Denorm;   	// Denorm on input or output
-   output logic 	done;
+   input [63:0] op1;		// 1st input operand (A)
+   input [63:0] op2;		// 2nd input operand (B)
+   input [1:0] 	rm;		// Rounding mode - specify values 
+   input 	op_type;	// Function opcode
+   input 	P;   		// Result Precision (0 for double, 1 for single)
+   input 	OvEn;		// Overflow trap enabled
+   input 	UnEn;   	// Underflow trap enabled
+   input 	start;
+   input 	reset;
+   input 	clk;   

-   supply1 		vdd;
-   supply0 		vss;   
-   
-   logic [63:0] 	Float1; 
-   logic [63:0] 	Float2;
-   logic [63:0] 	IntValue;
-   
-   logic [12:0] 	exp1, exp2, expF;
-   logic [12:0] 	exp_diff, bias;
-   logic [13:0] 	exp_sqrt;
-   logic [12:0] 	exp_s;
-   logic [12:0] 	exp_c;
-   
-   logic [10:0] 	exponent, exp_pre;
-   logic [63:0] 	Result;   
-   logic [52:0] 	mantissaA;
-   logic [52:0] 	mantissaB; 
-   logic [63:0] 	sum, sum_tc, sum_corr, sum_norm;
-   
-   logic [5:0] 		align_shift;
-   logic [5:0] 		norm_shift;
-   logic [2:0] 		sel_inv;
-   logic 		op1_Norm, op2_Norm;
-   logic 		opA_Norm, opB_Norm;
-   logic 		Invalid;
-   logic 		DenormIn, DenormIO;
-   logic [4:0] 		FlagsIn;   	
-   logic 		exp_gt63;
-   logic 		Sticky_out;
-   logic 		signResult, sign_corr;
-   logic 		corr_sign;
-   logic 		zeroB;         
-   logic 		convert;
-   logic 		swap;
-   logic 		sub;
-   
-   logic [63:0] 	q1, qm1, qp1, q0, qm0, qp0;
-   logic [63:0] 	rega_out, regb_out, regc_out, regd_out;
-   logic [127:0] 	regr_out;
-   logic [2:0] 		sel_muxa, sel_muxb;
-   logic 		sel_muxr;   
-   logic 		load_rega, load_regb, load_regc, load_regd, load_regr;
-   logic 		load_regp;   
+   output [63:0] AS_Result;	// Result of operation
+   output [4:0]  Flags;   	// IEEE exception flags 
+   output 	 Denorm;   	// Denorm on input or output
+   output 	 done;

-   logic 		donev, sel_muxrv, sel_muxsv;
-   logic [1:0] 		sel_muxav, sel_muxbv;   
-   logic 		load_regav, load_regbv, load_regcv;
-   logic 		load_regrv, load_regsv;
+   supply1 	  vdd;
+   supply0 	  vss;   
+
+   wire [63:0] 	 Float1; 
+   wire [63:0] 	 Float2;
+   wire [63:0] 	 IntValue;
+   
+   wire [12:0] 	 exp1, exp2, expF;
+   wire [12:0] 	 exp_diff, bias;
+   wire [13:0] 	 exp_sqrt;
+   wire [12:0] 	 exp_s;
+   wire [12:0] 	 exp_c;
+   
+   wire [10:0] 	 exponent, exp_pre;
+   wire [63:0] 	 Result;   
+   wire [52:0] 	 mantissaA;
+   wire [52:0] 	 mantissaB; 
+   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
+   
+   wire [5:0] 	 align_shift;
+   wire [5:0] 	 norm_shift;
+   wire [2:0] 	 sel_inv;
+   wire		 op1_Norm, op2_Norm;
+   wire		 opA_Norm, opB_Norm;
+   wire		 Invalid;
+   wire 	 DenormIn, DenormIO;
+   wire [4:0] 	 FlagsIn;   	
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire 	 signResult, sign_corr;
+   wire          corr_sign;
+   wire 	 zeroB;         
+   wire 	 convert;
+   wire          swap;
+   wire          sub;
+   
+   wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
+   wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
+   wire [127:0]  regr_out;
+   wire [2:0] 	 sel_muxa, sel_muxb;
+   wire 	 sel_muxr;   
+   wire 	 load_rega, load_regb, load_regc, load_regd, load_regr;
+
+   wire 	 donev, sel_muxrv, sel_muxsv;
+   wire [1:0] 	 sel_muxav, sel_muxbv;   
+   wire 	 load_regav, load_regbv, load_regcv;
+   wire 	 load_regrv, load_regsv;
   
   // Convert the input operands to their appropriate forms based on 
   // the original operands, the op_type, and their precision P. 
   // Single precision inputs are converted to double precision 
   // and the sign of the first operand is set appropriately based on
-   // if the operation is absolute value or negation. 
-   convert_inputs conv1 (Float1, Float2, op1, op2, op_type, P);
+   // if the operation is absolute value or negation.   
+   convert_inputs_div conv1 (Float1, Float2, op1, op2, op_type, P);

   // Test for exceptions and return the "Invalid Operation" and
   // "Denormalized" Input Flags. The "sel_inv" is used in
   // the third pipeline stage to select the result. Also, op1_Norm
   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
-   // sub is one if the effective operation is subtaction. 
-   exception exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
-		   Float1, Float2, op_type);
+   // sub is one if the effective operation is subtraction.   
+   exception_div exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
+		       Float1, Float2, op_type);

   // Determine Sign/Mantissa
   assign signResult = ((Float1[63]^Float2[63])&~op_type) | Float1[63]&op_type;
   assign mantissaA = {vdd, Float1[51:0]};
   assign mantissaB = {vdd, Float2[51:0]};
-   // Early-ending detection
-   assign early_detection = |mantissaB[31:0];
-   
   // Perform Exponent Subtraction - expA - expB + Bias   
   assign exp1 = {2'b0, Float1[62:52]};
   assign exp2 = {2'b0, Float2[62:52]};
@ -121,37 +116,29 @@ module fpdiv (done, AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, Un
   assign bias = {3'h0, 10'h3FF};
   // Divide exponent
   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c);
-   //exp_add explogic1 (exp_cout1, {open, exp_diff}, 
-   //		      {vss, exp_s}, {vss, exp_c}, 1'b1);
-   adder_ip #(14) explogic1 ({vss, exp_s}, {vss, exp_c}, 1'b1, {open, exp_diff}, exp_cout1);
+   adder #(14) explogic1 ({vss, exp_s}, {vss, exp_c}, 1'b1, {open, exp_diff}, exp_cout1);
   
   // Sqrt exponent (check if exponent is odd)
   assign exp_odd = Float1[52] ? vss : vdd;
-   //exp_add explogic2 (exp_cout2, exp_sqrt, 
-   //		      {vss, exp1}, {4'h0, 10'h3ff}, exp_odd);
-   adder_ip #(14) explogic2 ({vss, exp1}, {4'h0, 10'h3ff}, exp_odd, exp_sqrt, exp_cout2);
-   
+   adder #(14) explogic2 ({vss, exp1}, {4'h0, 10'h3ff}, exp_odd, exp_sqrt, exp_cout2);
   // Choose correct exponent
   assign expF = op_type ? exp_sqrt[13:1] : exp_diff;   

-   // Main Goldschmidt/Division Routine
+   // Main Goldschmidt/Division Routine   
   divconv goldy (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
-		  regr_out, mantissaB, mantissaA, 
-		  sel_muxa, sel_muxb, sel_muxr, reset, clk,
-		  load_rega, load_regb, load_regc, load_regd,
-		  load_regr, load_regs, load_regp,
-		  P, op_type, exp_odd);
+		  regr_out, mantissaB, mantissaA, sel_muxa, sel_muxb, sel_muxr, 
+		  reset, clk,  load_rega, load_regb, load_regc, load_regd,
+		  load_regr, load_regs, P, op_type, exp_odd);

-   // FSM : control divider
-   fsm_fpdivsqrt control (done, load_rega, load_regb, load_regc, load_regd, 
-			  load_regr, load_regs, load_regp,
-			  sel_muxa, sel_muxb, sel_muxr, 
-			  clk, reset, start, error, op_type, P);
+   // FSM : control divider   
+   fsm_div control (done, load_rega, load_regb, load_regc, load_regd, 
+		    load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
+		    clk, reset, start, error, op_type);
   
   // Round the mantissa to a 52-bit value, with the leading one
   // removed. The rounding units also handles special cases and 
-   // set the exception flags.
-   rounder round1 (Result, DenormIO, FlagsIn, 
+   // set the exception flags.   
+   rounder_div round1 (Result, DenormIO, FlagsIn, 
 		   rm, P, OvEn, UnEn, expF, 
   		   sel_inv, Invalid, DenormIn, signResult, 
 		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
@ -161,4 +148,4 @@ module fpdiv (done, AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, Un
   flopenr #(1) regb (clk, reset, done, DenormIO, Denorm);   
   flopenr #(5) regc (clk, reset, done, FlagsIn, Flags);   
   
-endmodule // fpdivP
+endmodule // fpdiv
--- a/wally-pipelined/src/fpu/fpdivsqrt/fpdiv3.in
+++ b/wally-pipelined/src/fpu/fpdivsqrt/fpdiv3.in
@ -1,78 +0,0 @@
-.i 6
-.o 2
-.ilb SignR rm[1] rm[0] G zero_rem sign_rem
-.ob M1 M0
-
-000000 00
-000001 00
-000010 00
-000011 00
-000100 10
-000101 00
-000110 --
-000111 --
-
-001000 00
-001001 01
-001010 00
-001011 00
-001100 00
-001101 00
-001110 --
-001111 --
-
-010000 10
-010001 00
-010010 00
-010011 00
-010100 10
-010101 10
-010110 --
-010111 --
-
-011000 00
-011001 01
-011010 00
-011011 00
-011100 00
-011101 00
-011110 --
-011111 --
-
-100000 00
-100001 00
-100010 00
-100011 00
-100100 10
-100101 00
-100110 --
-100111 --
-
-101000 00
-101001 01
-101010 00
-101011 00
-101100 00
-101101 00
-101110 --
-101111 --
-
-110000 00
-110001 01
-110010 00
-110011 00
-110100 00
-110101 00
-110110 --
-110111 --
-
-111000 10
-111001 00
-111010 00
-111011 00
-111100 10
-111101 10
-111110 --
-111111 --
-
-.e
--- a/wally-pipelined/src/fpu/fpdivsqrt/fpdiv4.in
+++ b/wally-pipelined/src/fpu/fpdivsqrt/fpdiv4.in
@ -1,98 +0,0 @@
-.i 7
-.o 2
-.ilb SignR rm[2] rm[1] rm[0] G zero_rem sign_rem
-.ob M1 M0
-
-0000000 00
-0000001 00
-0000010 00
-0000011 00
-0000100 10
-0000101 00
-0000110 --
-0000111 --
-
-0001000 00
-0001001 01
-0001010 00
-0001011 00
-0001100 00
-0001101 00
-0001110 --
-0001111 --
-
-0010000 10
-0010001 00
-0010010 00
-0010011 00
-0010100 10
-0010101 10
-0010110 --
-0010111 --
-
-0011000 00
-0011001 01
-0011010 00
-0011011 00
-0011100 00
-0011101 00
-0011110 --
-0011111 --
-
-01--000 10
-01--001 00
-01--010 00
-01--011 00
-01--100 10
-01--101 10
-01--110 --
-01--111 --
-
-1000000 00
-1000001 00
-1000010 00
-1000011 00
-1000100 10
-1000101 00
-1000110 --
-1000111 --
-
-1001000 00
-1001001 01
-1001010 00
-1001011 00
-1001100 00
-1001101 00
-1001110 --
-1001111 --
-
-1010000 00
-1010001 01
-1010010 00
-1010011 00
-1010100 00
-1010101 00
-1010110 --
-1010111 --
-
-1011000 10
-1011001 00
-1011010 00
-1011011 00
-1011100 10
-1011101 10
-1011110 --
-1011111 --
-
-11--000 10
-11--001 00
-11--010 00
-11--011 00
-11--100 10
-11--101 10
-11--110 --
-11--111 --
-
-
-
-.e
--- a/wally-pipelined/src/fpu/fpdivsqrt/fsm.v
+++ b/wally-pipelined/src/fpu/fpdivsqrt/fsm.v
--- a/wally-pipelined/src/fpu/fpdivsqrt/fsm_div.v
+++ b/wally-pipelined/src/fpu/fpdivsqrt/fsm_div.v
@ -0,0 +1,263 @@
+// fsm_div: control FSM for the Goldschmidt divide / square-root unit.
+//
+// Idles in S0 until start is high, then advances one state per clock:
+//   op_type == 0 : S1 .. S10
+//   op_type == 1 : S13 .. S26 (square-root path)
+// Every state drives the register load enables and the mux selects;
+// done is asserted only in the final state of each path (S10 / S26),
+// after which the machine returns to S0.  Reset is synchronous,
+// active high.
+module fsm_div (done, load_rega, load_regb, load_regc,
+                load_regd, load_regr, load_regs,
+                sel_muxa, sel_muxb, sel_muxr,
+                clk, reset, start, error, op_type);
+
+   input        clk;
+   input        reset;     // synchronous, active high
+   input        start;     // sampled in S0 only
+   input        error;     // not used by this FSM; kept for port compatibility
+   input        op_type;   // sampled in S0: 0 -> S1 path, 1 -> S13 path
+
+   output reg       done;       // End of cycles
+   output reg       load_rega;  // enable for regA
+   output reg       load_regb;  // enable for regB
+   output reg       load_regc;  // enable for regC
+   output reg       load_regd;  // enable for regD
+   output reg       load_regr;  // enable for rem
+   output reg       load_regs;  // enable for q,qm,qp
+   output reg [2:0] sel_muxa;   // Select muxA
+   output reg [2:0] sel_muxb;   // Select muxB
+   output reg       sel_muxr;   // Select rem mux
+
+   reg [4:0]    CURRENT_STATE;
+   reg [4:0]    NEXT_STATE;
+
+   // State encodings.  S7 and S27..S30 are never entered by the
+   // transitions below; they are kept so the parameter list is unchanged.
+   parameter [4:0]
+     S0=5'd0, S1=5'd1, S2=5'd2,
+     S3=5'd3, S4=5'd4, S5=5'd5,
+     S6=5'd6, S7=5'd7, S8=5'd8,
+     S9=5'd9, S10=5'd10,
+     S13=5'd13, S14=5'd14, S15=5'd15,
+     S16=5'd16, S17=5'd17, S18=5'd18,
+     S19=5'd19, S20=5'd20, S21=5'd21,
+     S22=5'd22, S23=5'd23, S24=5'd24,
+     S25=5'd25, S26=5'd26, S27=5'd27,
+     S28=5'd28, S29=5'd29, S30=5'd30;
+
+   // State register
+   always @(posedge clk)
+     begin
+        if (reset==1'b1)
+          CURRENT_STATE <= S0;
+        else
+          CURRENT_STATE <= NEXT_STATE;
+     end
+
+   // Next-state and output logic (combinational, blocking assignments).
+   always @(*)
+     begin
+        // Idle defaults.  Each state overrides only what differs, so every
+        // signal is assigned on every path and no latch can be inferred,
+        // even when start/op_type match none of the S0 branches.
+        done       = 1'b0;
+        load_rega  = 1'b0;
+        load_regb  = 1'b0;
+        load_regc  = 1'b0;
+        load_regd  = 1'b0;
+        load_regr  = 1'b0;
+        load_regs  = 1'b0;
+        sel_muxa   = 3'b000;
+        sel_muxb   = 3'b000;
+        sel_muxr   = 1'b0;
+        NEXT_STATE = S0;
+
+        case (CURRENT_STATE)
+          S0:  // iteration 0
+            begin
+               if (start==1'b1 && op_type==1'b0)
+                 begin
+                    load_regb  = 1'b1;
+                    sel_muxa   = 3'b001;
+                    sel_muxb   = 3'b001;
+                    NEXT_STATE = S1;
+                 end
+               else if (start==1'b1 && op_type==1'b1)
+                 begin
+                    load_regb  = 1'b1;
+                    sel_muxa   = 3'b010;
+                    NEXT_STATE = S13;
+                 end
+            end
+          S1:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b010;
+               NEXT_STATE = S2;
+            end
+          S2:  // iteration 1
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S3;
+            end
+          S3:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S4;
+            end
+          S4:  // iteration 2
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S5;
+            end
+          S5:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S6;
+            end
+          S6:  // iteration 3
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               // NOTE(review): goes straight to S8, so S7 is unreachable.
+               // Kept as in the original -- confirm this is intended.
+               NEXT_STATE = S8;
+            end
+          S7:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S8;
+            end
+          S8:  // q,qm,qp
+            begin
+               load_regs  = 1'b1;
+               NEXT_STATE = S9;
+            end
+          S9:  // rem
+            begin
+               load_regr  = 1'b1;
+               sel_muxr   = 1'b1;
+               NEXT_STATE = S10;
+            end
+          S10:  // done
+            begin
+               done       = 1'b1;
+               NEXT_STATE = S0;
+            end
+          S13:  // start of sqrt path
+            begin
+               load_regd  = 1'b1;
+               sel_muxa   = 3'b010;
+               sel_muxb   = 3'b001;
+               NEXT_STATE = S14;
+            end
+          S14:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b001;
+               sel_muxb   = 3'b100;
+               NEXT_STATE = S15;
+            end
+          S15:  // iteration 1
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S16;
+            end
+          S16:
+            begin
+               load_regd  = 1'b1;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S17;
+            end
+          S17:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S18;
+            end
+          S18:  // iteration 2
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S19;
+            end
+          S19:
+            begin
+               load_regd  = 1'b1;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S20;
+            end
+          S20:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S21;
+            end
+          S21:  // iteration 3
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S22;
+            end
+          S22:
+            begin
+               load_regd  = 1'b1;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S23;
+            end
+          S23:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S24;
+            end
+          S24:  // q,qm,qp
+            begin
+               load_regs  = 1'b1;
+               NEXT_STATE = S25;
+            end
+          S25:  // rem
+            begin
+               load_regr  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b110;
+               sel_muxr   = 1'b1;
+               NEXT_STATE = S26;
+            end
+          S26:  // done
+            begin
+               done       = 1'b1;
+               NEXT_STATE = S0;
+            end
+          default:  // unused encodings: stay idle and recover to S0
+            begin
+               NEXT_STATE = S0;
+            end
+        endcase // case (CURRENT_STATE)
+     end // always @ (*)
+
+endmodule // fsm_div
--- a/wally-pipelined/src/fpu/fpdivsqrt/fsm_div.v~
+++ b/wally-pipelined/src/fpu/fpdivsqrt/fsm_div.v~
@ -0,0 +1,266 @@
+// fsm: control FSM for the Goldschmidt divide / square-root unit.
+//
+// Idles in S0 until start is high, then advances one state per clock:
+//   op_type == 0 : S1 .. S10
+//   op_type == 1 : S13 .. S26 (square-root path)
+// Every state drives the register load enables and the mux selects;
+// done is asserted only in the final state of each path (S10 / S26),
+// after which the machine returns to S0.  Reset is synchronous,
+// active high.
+//
+// NOTE(review): this lives in fsm_div.v~, which looks like an editor
+// backup file -- confirm it is meant to be under version control.
+module fsm (done, load_rega, load_regb, load_regc,
+            load_regd, load_regr, load_regs,
+            sel_muxa, sel_muxb, sel_muxr,
+            clk, reset, start, error, op_type);
+
+   input        clk;
+   input        reset;     // synchronous, active high
+   input        start;     // sampled in S0 only
+   input        error;     // not used by this FSM; kept for port compatibility
+   input        op_type;   // sampled in S0: 0 -> S1 path, 1 -> S13 path
+
+   output reg       done;       // End of cycles
+   output reg       load_rega;  // enable for regA
+   output reg       load_regb;  // enable for regB
+   output reg       load_regc;  // enable for regC
+   output reg       load_regd;  // enable for regD
+   output reg       load_regr;  // enable for rem
+   output reg       load_regs;  // enable for q,qm,qp
+   output reg [2:0] sel_muxa;   // Select muxA
+   output reg [2:0] sel_muxb;   // Select muxB
+   output reg       sel_muxr;   // Select rem mux
+
+   reg [4:0]    CURRENT_STATE;
+   reg [4:0]    NEXT_STATE;
+
+   // State encodings.  S7 and S27..S30 are never entered by the
+   // transitions below; they are kept so the parameter list is unchanged.
+   parameter [4:0]
+     S0=5'd0, S1=5'd1, S2=5'd2,
+     S3=5'd3, S4=5'd4, S5=5'd5,
+     S6=5'd6, S7=5'd7, S8=5'd8,
+     S9=5'd9, S10=5'd10,
+     S13=5'd13, S14=5'd14, S15=5'd15,
+     S16=5'd16, S17=5'd17, S18=5'd18,
+     S19=5'd19, S20=5'd20, S21=5'd21,
+     S22=5'd22, S23=5'd23, S24=5'd24,
+     S25=5'd25, S26=5'd26, S27=5'd27,
+     S28=5'd28, S29=5'd29, S30=5'd30;
+
+   // State register
+   always @(posedge clk)
+     begin
+        if (reset==1'b1)
+          CURRENT_STATE <= S0;
+        else
+          CURRENT_STATE <= NEXT_STATE;
+     end
+
+   // Next-state and output logic (combinational, blocking assignments).
+   always @(*)
+     begin
+        // Idle defaults.  Each state overrides only what differs, so every
+        // signal is assigned on every path and no latch can be inferred,
+        // even when start/op_type match none of the S0 branches.
+        done       = 1'b0;
+        load_rega  = 1'b0;
+        load_regb  = 1'b0;
+        load_regc  = 1'b0;
+        load_regd  = 1'b0;
+        load_regr  = 1'b0;
+        load_regs  = 1'b0;
+        sel_muxa   = 3'b000;
+        sel_muxb   = 3'b000;
+        sel_muxr   = 1'b0;
+        NEXT_STATE = S0;
+
+        case (CURRENT_STATE)
+          S0:  // iteration 0
+            begin
+               if (start==1'b1 && op_type==1'b0)
+                 begin
+                    load_regb  = 1'b1;
+                    sel_muxa   = 3'b001;
+                    sel_muxb   = 3'b001;
+                    NEXT_STATE = S1;
+                 end
+               else if (start==1'b1 && op_type==1'b1)
+                 begin
+                    load_regb  = 1'b1;
+                    sel_muxa   = 3'b010;
+                    NEXT_STATE = S13;
+                 end
+            end
+          S1:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b010;
+               NEXT_STATE = S2;
+            end
+          S2:  // iteration 1
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S3;
+            end
+          S3:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S4;
+            end
+          S4:  // iteration 2
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S5;
+            end
+          S5:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S6;
+            end
+          S6:  // iteration 3
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               // NOTE(review): goes straight to S8, so S7 is unreachable.
+               // Kept as in the original -- confirm this is intended.
+               NEXT_STATE = S8;
+            end
+          S7:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S8;
+            end
+          S8:  // q,qm,qp
+            begin
+               load_regs  = 1'b1;
+               NEXT_STATE = S9;
+            end
+          S9:  // rem
+            begin
+               load_regr  = 1'b1;
+               sel_muxr   = 1'b1;
+               NEXT_STATE = S10;
+            end
+          S10:  // done
+            begin
+               done       = 1'b1;
+               NEXT_STATE = S0;
+            end
+          S13:  // start of sqrt path
+            begin
+               load_regd  = 1'b1;
+               sel_muxa   = 3'b010;
+               sel_muxb   = 3'b001;
+               NEXT_STATE = S14;
+            end
+          S14:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b001;
+               sel_muxb   = 3'b100;
+               NEXT_STATE = S15;
+            end
+          S15:  // iteration 1
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S16;
+            end
+          S16:
+            begin
+               load_regd  = 1'b1;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S17;
+            end
+          S17:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S18;
+            end
+          S18:  // iteration 2
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S19;
+            end
+          S19:
+            begin
+               load_regd  = 1'b1;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S20;
+            end
+          S20:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S21;
+            end
+          S21:  // iteration 3
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S22;
+            end
+          S22:
+            begin
+               load_regd  = 1'b1;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S23;
+            end
+          S23:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S24;
+            end
+          S24:  // q,qm,qp
+            begin
+               load_regs  = 1'b1;
+               NEXT_STATE = S25;
+            end
+          S25:  // rem
+            begin
+               load_regr  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b110;
+               sel_muxr   = 1'b1;
+               NEXT_STATE = S26;
+            end
+          S26:  // done
+            begin
+               done       = 1'b1;
+               NEXT_STATE = S0;
+            end
+          default:  // unused encodings: stay idle and recover to S0
+            begin
+               NEXT_STATE = S0;
+            end
+        endcase // case (CURRENT_STATE)
+     end // always @ (*)
+
+endmodule // fsm
--- a/wally-pipelined/src/fpu/fpdivsqrt/mult_R4_64_64_cs.v
+++ b/wally-pipelined/src/fpu/fpdivsqrt/mult_R4_64_64_cs.v
@ -16,8 +16,7 @@ module mult64 (x, y, P);
   multiplier p1 (y, x, Sum, Carry);
   //assign Pt = Sum + Carry;
   //assign P = Pt[127:0];
-   // ldf128 cpa (cout, P, Sum, Carry, 1'b0);
-   adder_ip #(128) cpa (Sum, Carry, 1'b0, P, cout);   
+   ldf128 cpa (cout, P, Sum, Carry, 1'b0);   

 endmodule // mult64  

--- a/wally-pipelined/src/fpu/fpdivsqrt/rounder_div.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/rounder_div.sv
@ -12,19 +12,19 @@
 //      11  		round-toward-minus infinity
 //

-module rounder (Result, DenormIO, Flags, rm, P, OvEn, 
-		UnEn, exp_diff, sel_inv, Invalid, DenormIn, 
-		SignR, q1, qm1, qp1, q0, qm0, qp0, regr_out);
+module rounder_div (Result, DenormIO, Flags, rm, P, OvEn, 
+		    UnEn, exp_diff, sel_inv, Invalid, DenormIn, 
+		    SignR, q1, qm1, qp1, q0, qm0, qp0, regr_out);

-   input logic [1:0]   rm;
-   input logic         P;
-   input logic         OvEn;
-   input logic         UnEn;
-   input logic [12:0]  exp_diff;
-   input logic [2:0]   sel_inv;
-   input logic 	       Invalid;
-   input logic 	       DenormIn;
-   input logic 	       SignR;
+   input  [1:0]   rm;
+   input          P;
+   input          OvEn;
+   input          UnEn;
+   input [12:0]   exp_diff;
+   input [2:0] 	  sel_inv;
+   input	  Invalid;
+   input	  DenormIn;
+   input 	  SignR;
   
   input logic [63:0]  q1;
   input logic [63:0]  qm1;
@ -37,7 +37,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   output logic [63:0] Result;
   output logic        DenormIO;
   output logic [4:0]  Flags;
-
+   
   supply1 	       vdd;
   supply0 	       vss;
   
@ -146,7 +146,7 @@ module rounder (Result, DenormIO, Flags, rm, P, OvEn,
   // Determine sign
   assign Rzero = UnderFlow | (~sel_inv[2]&sel_inv[1]&sel_inv[0]);
   assign Rsign = SignR;   
-   
+      
   // The exponent of the final result is zero if the final result is 
   // zero or a denorm, all ones if the final result is NaN or Infinite
   // or overflow occurred and the magnitude of the number is 
--- a/wally-pipelined/src/fpu/fpdivsqrt/sbtm.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/sbtm.sv
@ -26,8 +26,7 @@ module sbtm (input logic [11:0] a, output logic [10:0] ia_out);
   assign op2 = x2[3] ? {1'b1, {8{1'b1}}, ~y1, 1'b1} :
 		{1'b0, 8'b0, y1, 1'b1};
   // CPA
-   //bk15 cp1 (cout, p, op1, op2, 1'b0);
-   adder_ip #(15) cp1 (op1, op2, 1'b0, p, cout);   
+   adder #(15) cp1 (op1, op2, 1'b0, p, cout);  
   //assign ia_out = {p[14:4], {53{1'b0}}};
   assign ia_out = p[14:4];

--- a/wally-pipelined/src/fpu/fpdivsqrt/sbtm3.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/sbtm3.sv
@ -28,8 +28,7 @@ module sbtm2 (input logic [11:0] a, output logic [10:0] y);
 		{8'b0, y1, 1'b1};
   
   // CPA
-   //bk15 cp1 (cout, p, op1, op2, 1'b0);
-   adder_ip #(15) cp1 (op1, op2, 1'b0, p, cout);      
+   adder #(15) cp1 (op1, op2, 1'b0, p, cout); 
   assign y = p[14:4];

 endmodule // sbtm2
--- a/wally-pipelined/src/fpu/fpdivsqrt/sbtm_a2.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/sbtm_a2.sv
@ -1,140 +0,0 @@
-module sbtm_a2 (input  logic [6:0] a,
-		output logic [12:0] y);
-   always_comb
-     case(a)
-       7'b0000000: y = 13'b1111111110001;
-       7'b0000001: y = 13'b1111111010001;
-       7'b0000010: y = 13'b1111110110010;
-       7'b0000011: y = 13'b1111110010011;
-       7'b0000100: y = 13'b1111101110101;
-       7'b0000101: y = 13'b1111101010110;
-       7'b0000110: y = 13'b1111100111001;
-       7'b0000111: y = 13'b1111100011011;
-       7'b0001000: y = 13'b1111011111110;
-       7'b0001001: y = 13'b1111011100001;
-       7'b0001010: y = 13'b1111011000100;
-       7'b0001011: y = 13'b1111010101000;
-       7'b0001100: y = 13'b1111010001100;
-       7'b0001101: y = 13'b1111001110000;
-       7'b0001110: y = 13'b1111001010101;
-       7'b0001111: y = 13'b1111000111010;
-       7'b0010000: y = 13'b1111000011111;
-       7'b0010001: y = 13'b1111000000100;
-       7'b0010010: y = 13'b1110111101010;
-       7'b0010011: y = 13'b1110111010000;
-       7'b0010100: y = 13'b1110110110110;
-       7'b0010101: y = 13'b1110110011101;
-       7'b0010110: y = 13'b1110110000100;
-       7'b0010111: y = 13'b1110101101011;
-       7'b0011000: y = 13'b1110101010010;
-       7'b0011001: y = 13'b1110100111001;
-       7'b0011010: y = 13'b1110100100001;
-       7'b0011011: y = 13'b1110100001001;
-       7'b0011100: y = 13'b1110011110001;
-       7'b0011101: y = 13'b1110011011010;
-       7'b0011110: y = 13'b1110011000010;
-       7'b0011111: y = 13'b1110010101011;
-       7'b0100000: y = 13'b1110010010100;
-       7'b0100001: y = 13'b1110001111110;
-       7'b0100010: y = 13'b1110001100111;
-       7'b0100011: y = 13'b1110001010001;
-       7'b0100100: y = 13'b1110000111011;
-       7'b0100101: y = 13'b1110000100101;
-       7'b0100110: y = 13'b1110000001111;
-       7'b0100111: y = 13'b1101111111010;
-       7'b0101000: y = 13'b1101111100101;
-       7'b0101001: y = 13'b1101111010000;
-       7'b0101010: y = 13'b1101110111011;
-       7'b0101011: y = 13'b1101110100110;
-       7'b0101100: y = 13'b1101110010001;
-       7'b0101101: y = 13'b1101101111101;
-       7'b0101110: y = 13'b1101101101001;
-       7'b0101111: y = 13'b1101101010101;
-       7'b0110000: y = 13'b1101101000001;
-       7'b0110001: y = 13'b1101100101101;
-       7'b0110010: y = 13'b1101100011010;
-       7'b0110011: y = 13'b1101100000110;
-       7'b0110100: y = 13'b1101011110011;
-       7'b0110101: y = 13'b1101011100000;
-       7'b0110110: y = 13'b1101011001101;
-       7'b0110111: y = 13'b1101010111010;
-       7'b0111000: y = 13'b1101010101000;
-       7'b0111001: y = 13'b1101010010101;
-       7'b0111010: y = 13'b1101010000011;
-       7'b0111011: y = 13'b1101001110001;
-       7'b0111100: y = 13'b1101001011111;
-       7'b0111101: y = 13'b1101001001101;
-       7'b0111110: y = 13'b1101000111100;
-       7'b0111111: y = 13'b1101000101010;
-       7'b1000000: y = 13'b1101000011001;
-       7'b1000001: y = 13'b1101000000111;
-       7'b1000010: y = 13'b1100111110110;
-       7'b1000011: y = 13'b1100111100101;
-       7'b1000100: y = 13'b1100111010100;
-       7'b1000101: y = 13'b1100111000011;
-       7'b1000110: y = 13'b1100110110011;
-       7'b1000111: y = 13'b1100110100010;
-       7'b1001000: y = 13'b1100110010010;
-       7'b1001001: y = 13'b1100110000010;
-       7'b1001010: y = 13'b1100101110010;
-       7'b1001011: y = 13'b1100101100001;
-       7'b1001100: y = 13'b1100101010010;
-       7'b1001101: y = 13'b1100101000010;
-       7'b1001110: y = 13'b1100100110010;
-       7'b1001111: y = 13'b1100100100011;
-       7'b1010000: y = 13'b1100100010011;
-       7'b1010001: y = 13'b1100100000100;
-       7'b1010010: y = 13'b1100011110101;
-       7'b1010011: y = 13'b1100011100101;
-       7'b1010100: y = 13'b1100011010110;
-       7'b1010101: y = 13'b1100011000111;
-       7'b1010110: y = 13'b1100010111001;
-       7'b1010111: y = 13'b1100010101010;
-       7'b1011000: y = 13'b1100010011011;
-       7'b1011001: y = 13'b1100010001101;
-       7'b1011010: y = 13'b1100001111110;
-       7'b1011011: y = 13'b1100001110000;
-       7'b1011100: y = 13'b1100001100010;
-       7'b1011101: y = 13'b1100001010100;
-       7'b1011110: y = 13'b1100001000110;
-       7'b1011111: y = 13'b1100000111000;
-       7'b1100000: y = 13'b1100000101010;
-       7'b1100001: y = 13'b1100000011100;
-       7'b1100010: y = 13'b1100000001111;
-       7'b1100011: y = 13'b1100000000001;
-       7'b1100100: y = 13'b1011111110100;
-       7'b1100101: y = 13'b1011111100110;
-       7'b1100110: y = 13'b1011111011001;
-       7'b1100111: y = 13'b1011111001100;
-       7'b1101000: y = 13'b1011110111111;
-       7'b1101001: y = 13'b1011110110010;
-       7'b1101010: y = 13'b1011110100101;
-       7'b1101011: y = 13'b1011110011000;
-       7'b1101100: y = 13'b1011110001011;
-       7'b1101101: y = 13'b1011101111110;
-       7'b1101110: y = 13'b1011101110010;
-       7'b1101111: y = 13'b1011101100101;
-       7'b1110000: y = 13'b1011101011001;
-       7'b1110001: y = 13'b1011101001100;
-       7'b1110010: y = 13'b1011101000000;
-       7'b1110011: y = 13'b1011100110100;
-       7'b1110100: y = 13'b1011100101000;
-       7'b1110101: y = 13'b1011100011100;
-       7'b1110110: y = 13'b1011100010000;
-       7'b1110111: y = 13'b1011100000100;
-       7'b1111000: y = 13'b1011011111000;
-       7'b1111001: y = 13'b1011011101100;
-       7'b1111010: y = 13'b1011011100000;
-       7'b1111011: y = 13'b1011011010101;
-       7'b1111100: y = 13'b1011011001001;
-       7'b1111101: y = 13'b1011010111101;
-       7'b1111110: y = 13'b1011010110010;
-       7'b1111111: y = 13'b1011010100111;	    
-       default: y = 13'bxxxxxxxxxxxxx;
-     endcase // case (a)
-    
-endmodule // sbtm_a0
-
-    
-    
-    
--- a/wally-pipelined/src/fpu/fpdivsqrt/sim.csh
+++ b/wally-pipelined/src/fpu/fpdivsqrt/sim.csh
@ -1,6 +1,6 @@
 #!/bin/sh
 ./runme_f64div.csh
 ./runme_f32div.csh
-./runme_f64sqrt_csh
+./runme_f64sqrt.csh
 ./runme_f32sqrt.csh
 echo "Simulation Ended, Go Pokes!..."
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_div_rd.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_div_rd.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [31:0] op1;		
@ -59,25 +59,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (11)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 31605) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (31605)
+	  if (~reset)
+	    begin
+	       #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (10)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;	
     end // always @ (posedge clk)
   
 endmodule // tb
-
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_div_rne.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_div_rne.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [31:0] op1;		
@ -59,25 +59,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (11)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 31743) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (31743)	// restores the f32 RNE count this testbench used before the rewrite; 39509 is the f64 RNE testbench's count
+	  if (~reset)
+	    begin
+	       #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];	// unpack the next stimulus/expected entry
+	       #50 start = 1'b1;	// raise start 50 time units after the clock edge
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;
+	       repeat (10)	// wait 10 further clock edges before logging the result
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;	// stop the simulation once the repeat loop has completed
     end // always @ (posedge clk)
   
 endmodule // tb
-
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_div_ru.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_div_ru.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [31:0] op1;		
@ -59,25 +59,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (11)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 31614) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (31614)
+	  if (~reset)
+	    begin
+	       #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (10)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;			
     end // always @ (posedge clk)
   
 endmodule // tb
-
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_div_rz.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_div_rz.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [31:0] op1;		
@ -59,25 +59,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (11)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 31792) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (31792)
+	  if (~reset)
+	    begin
+	       #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (10)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;						
     end // always @ (posedge clk)
   
 endmodule // tb
-
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_sqrt_rd.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_sqrt_rd.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [31:0] op1;		
@ -58,6 +58,7 @@ module tb ();

   always @(posedge clk)
     begin
+	repeat (19538)
 	if (~reset)
 	  begin
 	     #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
@ -66,15 +67,13 @@ module tb ();
 	       @(posedge clk);
 	     // deassert start after 2 cycles
 	     start = 1'b0;	
-	     repeat (16)
+	     repeat (15)
 	       @(posedge clk);
 	     $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
 	     vectornum = vectornum + 1;
 	  end // if (~reset)
-	if (vectornum == 19538) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	$display("%d vectors processed", vectornum);
+	$finish;					
     end // always @ (posedge clk)
   
 endmodule // tb
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_sqrt_rne.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_sqrt_rne.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [31:0] op1;		
@ -58,23 +58,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (16)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 19538) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (19538)
+	  if (~reset)
+	    begin
+	       #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (15)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;		
     end // always @ (posedge clk)
   
 endmodule // tb
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_sqrt_ru.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_sqrt_ru.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [31:0] op1;		
@ -58,23 +58,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (16)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 19538) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (19538)
+	  if (~reset)
+	    begin
+	       #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (15)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;				
     end // always @ (posedge clk)
   
 endmodule // tb
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_sqrt_rz.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f32_sqrt_rz.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [31:0] op1;		
@ -58,23 +58,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (16)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 19538) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (19538)
+	  if (~reset)
+	    begin
+	       #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (15)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result[63:32]==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;			
     end // always @ (posedge clk)
   
 endmodule // tb
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_div_rd.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_div_rd.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [63:0] op1;		
@ -59,25 +59,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (13)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 39050) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (39050)
+	  if (~reset)
+	    begin
+	       #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (10)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;			
     end // always @ (posedge clk)
   
 endmodule // tb
-
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_div_rne.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_div_rne.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [63:0] op1;		
@ -24,8 +24,7 @@ module tb ();
   logic [7:0] 	 flags_expected;

   integer 	handle3;
-   integer 	handle4;   
-   integer 	desc3;   
+    integer 	desc3;
   
   // instantiate device under test
   fpdiv dut (done, AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn,
@ -50,7 +49,7 @@ module tb ();

   initial
     begin
-	desc3 = handle3;	
+	desc3 = handle3;
 	#0  op_type = 1'b0;
 	#0  P = 1'b0;
 	#0  rm = 2'b00;
@ -60,23 +59,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (13)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 39509) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (39509)
+	  if (~reset)
+	    begin
+	       #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (10)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;	
     end // always @ (posedge clk)
   
 endmodule // tb
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_div_ru.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_div_ru.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [63:0] op1;		
@ -59,25 +59,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (13)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 39020) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end
+	repeat (39020)
+	  if (~reset)
+	    begin
+	       #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (10)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;				
     end // always @ (posedge clk)
   
 endmodule // tb
-
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_div_rz.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_div_rz.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [63:0] op1;		
@ -59,23 +59,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (13)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 39515) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end		
+	repeat (39515)
+	  if (~reset)
+	    begin
+	       #0; {op1, op2, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (10)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%h_%b_%b | %h_%b", op1, op2, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;		
     end // always @ (posedge clk)
   
 endmodule // tb
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_sqrt_rd.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_sqrt_rd.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [63:0] op1;		
@ -58,23 +58,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (20)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 363) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (363)
+	  if (~reset)
+	    begin
+	       #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (15)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;	
     end // always @ (posedge clk)
   
 endmodule // tb
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_sqrt_rne.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_sqrt_rne.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [63:0] op1;		
@ -58,25 +58,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (20)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 363) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (363)
+	  if (~reset)
+	    begin
+	       #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (15)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;		
     end // always @ (posedge clk)
   
 endmodule // tb
-
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_sqrt_ru.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_sqrt_ru.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [63:0] op1;		
@ -58,23 +58,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (20)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 363) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (363)
+	  if (~reset)
+	    begin
+	       #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (15)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;			
     end // always @ (posedge clk)
   
 endmodule // tb
--- a/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_sqrt_rz.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/tb_f64_sqrt_rz.sv
@ -1,4 +1,4 @@
-`timescale 1ns/1ps
+`timescale 1ps/1ps
 module tb ();

   logic [63:0] op1;		
@ -58,25 +58,22 @@ module tb ();

   always @(posedge clk)
     begin
-	if (~reset)
-	  begin
-	     #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
-	     #50 start = 1'b1;
-	     repeat (2)
-	       @(posedge clk);
-	     // deassert start after 2 cycles
-	     start = 1'b0;	
-	     repeat (20)
-	       @(posedge clk);
-	     $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
-	     vectornum = vectornum + 1;
-	  end // if (~reset)
-	if (vectornum == 363) begin
-	   $display("%d vectors processed", vectornum);
-	   $finish;
-	end	
+	repeat (363)
+	  if (~reset)
+	    begin
+	       #0; {op1, yexpected, flags_expected} = testvectors[vectornum];
+	       #50 start = 1'b1;
+	       repeat (2)
+		 @(posedge clk);
+	       // deassert start after 2 cycles
+	       start = 1'b0;	
+	       repeat (15)
+		 @(posedge clk);
+	       $fdisplay(desc3, "%h_%h_%b_%b | %h_%b", op1, AS_Result, Flags, Denorm, yexpected, (AS_Result==yexpected));
+	       vectornum = vectornum + 1;
+	    end // if (~reset)
+	$display("%d vectors processed", vectornum);
+	$finish;		
     end // always @ (posedge clk)
   
 endmodule // tb
-
-
--- a/wally-pipelined/src/fpu/fpdivsqrt/test_fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdivsqrt/test_fpdiv.sv
@ -21,8 +21,7 @@ module tb;
   integer 	 handle3;
   integer 	 desc3;   

-   fpdivP dut (done, AS_Result, Flags, Denorm, op1, op2, 
-	       rm, op_type, P, OvEn, UnEn,
+   fpdiv dut (done, AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn,
 	      start, reset, clk);   
   
   initial 
@ -54,23 +53,16 @@ module tb;
 	// 10 round-toward-plus infinity
 	// 11 round-toward-minus infinity	
 	#0  rm = 2'b00;
-	#0  op_type = 1'b1;
+	#0  op_type = 1'b0;
 	
 	#0  op1 = 64'h3ffc_0000_0000_0000; // 1.75
 	#0  op2 = 64'h3ffe_0000_0000_0000; // 1.875

-	// P=1 divide
-	//#0  op1 = 64'h8683_F7FF_0000_0000;
-	//#0  op2 = 64'hC07F_3FFF_0000_0000;
+	#0  op1 = 64'h4020_5fff_ffff_ffff;	
+	#0  op2 = 64'hbcaf_ffff_ffff_ffff;

-	//#0  op1 = 64'h4F95_1295_0000_0000;
-	//#0  op2 = 64'h4F95_1295_0000_0000;	
-
-	//#0  op1 = 64'h4020_5fff_ffff_ffff;	
-	//#0  op2 = 64'hbcaf_ffff_ffff_ffff;
-
-	//#0  op1 = 64'h3fed_c505_fada_95fd; // 0.930300703
-	//#0  op2 = 64'h3ffe_0000_0000_0000; // 12.9303733
+	#0  op1 = 64'h0010_0000_0000_0001;
+	#0  op2 = 64'hc8cf_ffff_ffff_c001;	
 	
 	//#0  op1 = 64'h3ffe_e219_652b_d3c3; // 1.9302
 	//#0  op2 = 64'h3ff7_346d_c5d6_3886; // 1.4503
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -108,17 +108,9 @@ module fpu (
      logic [63:0] 	FPUResultW;                                           
      logic [4:0] 	FPUFlagsW;
      
-      
-
-
-
-
-
-

      //DECODE STAGE
      
-      
      // top-level controller for FPU
      fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
                  .FRM_REGW, .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
@ -129,14 +121,6 @@ module fpu (
            InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
            FPUResultW,
            FRD1D, FRD2D, FRD3D);	
-      
-
-
-
-
-
-
-

      //*****************
      // D/E pipe registers
@ -152,18 +136,6 @@ module fpu (
                           {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, RdE,          FOpCtrlE, FWriteIntE});


-
-
-
-
-
-
-
-
-
-
-
-
      //EXECUTION STAGE
      
      // Hazard unit for FPU
@ -198,12 +170,10 @@ module fpu (
                  .en(~HoldInputs), .clear(FDivSqrtDoneE),
                  .reset(reset),  .clk(clk));

-      fdivsqrt fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
+      fpdiv fdivsqrt (.DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
                        .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
                        .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
      
-
-
      // first of two-stage instance of floating-point add/cvt unit
      faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM,
                        .SrcXE, .SrcYE, .FOpCtrlE, .FAddResM, .FAddFlgM);
@ -224,15 +194,6 @@ module fpu (
      // mux2  #(`XLEN)  FWriteDataMux({{`XLEN-32{1'b0}}, SrcYE[63:32]}, SrcYE[63:64-`XLEN], FmtE, FWriteDataE);
      assign FWriteDataE = SrcYE[`XLEN-1:0];

-
-
-
-
-
-
-
-
-
      //*****************
      // E/M pipe registers
      //*****************
@ -255,36 +216,18 @@ module fpu (
                           {FRegWriteM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, RdM, FOpCtrlM, FWriteIntM});

      flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
-      
-
-
-
-
-
-

      //BEGIN MEMORY STAGE
-      
      mux4  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, CvtResM, FResSelM, FResM);
      mux4  #(5)  FFlgMux(5'b0, {4'b0, SgnNVM}, {4'b0, CmpNVM}, CvtFlgM, FResSelM, FFlgM);

      // mux2  #(`XLEN)  SrcXAlignedMux({{`XLEN-32{1'b0}}, SrcXM[63:32]}, SrcXM[63:64-`XLEN], FmtM, SrcXMAligned);
      mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], SrcXM[`XLEN-1:0], ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);
-
      
      // Align SrcA to MSB when single precision
      mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAM[31:0]}, {{64-`XLEN{1'b1}}, SrcAM}, FmtM, AlignedSrcAM);
-         
-         
      mux5  #(5)  FPUFlgMux(5'b0, FMAFlgM, FAddFlgM, FDivSqrtFlgM, FFlgM, FResultSelW, SetFflagsM);

-
-
-
-
-
-
-            
      //*****************
      // M/W pipe registers
      //*****************
@ -302,16 +245,10 @@ module fpu (
                           {FRegWriteM, FResultSelM, RdM, FmtM, FWriteIntM},
                           {FRegWriteW, FResultSelW, RdW, FmtW, FWriteIntW});
      
-      
-
-
-
-
   //#########################################
   // BEGIN WRITEBACK STAGE
   //#########################################

-
      mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
      mux5  #(64)  FPUResultMux(ReadResW, FMAResW, FAddResW, FDivResultW, FResW, FResultSelW, FPUResultW);
      
@ -330,4 +267,3 @@ module fpu (
  endgenerate 
  
 endmodule // fpu
-
--- a/wally-pipelined/src/fpu/fsm_div.v
+++ b/wally-pipelined/src/fpu/fsm_div.v
@ -0,0 +1,459 @@
+// fsm_div: control FSM that sequences the register-load enables and the
+// operand-mux selects of the iterative divide / square-root datapath.
+//
+// Inputs:
+//   clk, reset : clock and synchronous, active-high reset (forces state S0)
+//   start      : sampled in S0; 1 starts a new operation, 0 keeps the FSM idle
+//   op_type    : sampled in S0 together with start;
+//                  0 -> sequence S1..S10
+//                  1 -> sequence S13..S26 (the "sqrt path")
+//   error      : not used inside this module
+//
+// Outputs (all purely combinational functions of CURRENT_STATE, plus
+// start/op_type while in S0):
+//   done              : 1 only in S10 and S26 (last state of each sequence)
+//   load_rega..load_regd : enables for regA, regB, regC, regD
+//   load_regr         : enable for the remainder register
+//   load_regs         : enable for the q, qm, qp registers
+//   sel_muxa/sel_muxb : 3-bit selects for muxA / muxB
+//   sel_muxr          : select for the remainder mux
+module fsm_div (done, load_rega, load_regb, load_regc,
+                load_regd, load_regr, load_regs,
+                sel_muxa, sel_muxb, sel_muxr,
+                clk, reset, start, error, op_type);
+
+   input        clk;
+   input        reset;
+   input        start;
+   input        error;
+   input        op_type;
+
+   output reg       done;       // End of cycles
+   output reg       load_rega;  // enable for regA
+   output reg       load_regb;  // enable for regB
+   output reg       load_regc;  // enable for regC
+   output reg       load_regd;  // enable for regD
+   output reg       load_regr;  // enable for rem
+   output reg       load_regs;  // enable for q,qm,qp
+   output reg [2:0] sel_muxa;   // Select muxA
+   output reg [2:0] sel_muxb;   // Select muxB
+   output reg       sel_muxr;   // Select rem mux
+
+   reg [4:0]    CURRENT_STATE;
+   reg [4:0]    NEXT_STATE;
+
+   // State encoding. Values 11 and 12 are intentionally absent from the
+   // list; any unlisted encoding is handled by the case default (-> S0).
+   parameter [4:0]
+     S0=5'd0, S1=5'd1, S2=5'd2,
+     S3=5'd3, S4=5'd4, S5=5'd5,
+     S6=5'd6, S7=5'd7, S8=5'd8,
+     S9=5'd9, S10=5'd10,
+     S13=5'd13, S14=5'd14, S15=5'd15,
+     S16=5'd16, S17=5'd17, S18=5'd18,
+     S19=5'd19, S20=5'd20, S21=5'd21,
+     S22=5'd22, S23=5'd23, S24=5'd24,
+     S25=5'd25, S26=5'd26, S27=5'd27,
+     S28=5'd28, S29=5'd29, S30=5'd30;
+
+   // State register with synchronous reset.
+   always @(posedge clk)
+     begin
+        if (reset == 1'b1)
+          CURRENT_STATE <= S0;
+        else
+          CURRENT_STATE <= NEXT_STATE;
+     end
+
+   // Next-state and output logic.
+   //
+   // Every output and NEXT_STATE receives a default (idle) value first, so
+   // each state below only lists what differs from idle and no path through
+   // the block can leave a signal unassigned (no inferred latches).
+   // Only blocking assignments are used: this is combinational logic, and
+   // mixing "<=" with "=" in one always block is a simulation hazard.
+   always @(*)
+     begin
+        done       = 1'b0;
+        load_rega  = 1'b0;
+        load_regb  = 1'b0;
+        load_regc  = 1'b0;
+        load_regd  = 1'b0;
+        load_regr  = 1'b0;
+        load_regs  = 1'b0;
+        sel_muxa   = 3'b000;
+        sel_muxb   = 3'b000;
+        sel_muxr   = 1'b0;
+        NEXT_STATE = S0;
+
+        case (CURRENT_STATE)
+          S0:  // iteration 0: idle until start, then branch on op_type
+            begin
+               if (start == 1'b1 && op_type == 1'b0)
+                 begin
+                    load_regb  = 1'b1;
+                    sel_muxa   = 3'b001;
+                    sel_muxb   = 3'b001;
+                    NEXT_STATE = S1;
+                 end
+               else if (start == 1'b1 && op_type == 1'b1)
+                 begin
+                    load_regb  = 1'b1;
+                    sel_muxa   = 3'b010;
+                    sel_muxb   = 3'b000;
+                    NEXT_STATE = S13;
+                 end
+               // otherwise (start == 0): stay idle in S0 with defaults
+            end
+
+          //------------------------------------------------------------
+          // op_type == 0 sequence: S1 .. S10
+          //------------------------------------------------------------
+          S1:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b010;
+               sel_muxb   = 3'b000;
+               NEXT_STATE = S2;
+            end
+          S2:  // iteration 1
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S3;
+            end
+          S3:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b000;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S4;
+            end
+          S4:  // iteration 2
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S5;
+            end
+          S5:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b000;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S6;
+            end
+          S6:  // iteration 3
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               // NOTE(review): S6 goes straight to S8, so S7 is never
+               // entered from any state in this module. Behavior is kept
+               // as-is; confirm that skipping S7 is intentional.
+               NEXT_STATE = S8;
+            end
+          S7:  // unreachable from the transitions in this module (see S6)
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b000;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S8;
+            end
+          S8:  // q,qm,qp
+            begin
+               load_regs  = 1'b1;
+               NEXT_STATE = S9;
+            end
+          S9:  // rem
+            begin
+               load_regr  = 1'b1;
+               sel_muxr   = 1'b1;
+               NEXT_STATE = S10;
+            end
+          S10: // done
+            begin
+               done       = 1'b1;
+               NEXT_STATE = S0;
+            end
+
+          //------------------------------------------------------------
+          // op_type == 1 sequence: S13 .. S26 (sqrt path)
+          //------------------------------------------------------------
+          S13: // start of sqrt path
+            begin
+               load_regd  = 1'b1;
+               sel_muxa   = 3'b010;
+               sel_muxb   = 3'b001;
+               NEXT_STATE = S14;
+            end
+          S14:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b001;
+               sel_muxb   = 3'b100;
+               NEXT_STATE = S15;
+            end
+          S15: // iteration 1
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S16;
+            end
+          S16:
+            begin
+               load_regd  = 1'b1;
+               sel_muxa   = 3'b000;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S17;
+            end
+          S17:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S18;
+            end
+          S18: // iteration 2
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S19;
+            end
+          S19:
+            begin
+               load_regd  = 1'b1;
+               sel_muxa   = 3'b000;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S20;
+            end
+          S20:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S21;
+            end
+          S21: // iteration 3
+            begin
+               load_regb  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S22;
+            end
+          S22:
+            begin
+               load_regd  = 1'b1;
+               sel_muxa   = 3'b000;
+               sel_muxb   = 3'b011;
+               NEXT_STATE = S23;
+            end
+          S23:
+            begin
+               load_rega  = 1'b1;
+               load_regc  = 1'b1;
+               sel_muxa   = 3'b100;
+               sel_muxb   = 3'b010;
+               NEXT_STATE = S24;
+            end
+          S24: // q,qm,qp
+            begin
+               load_regs  = 1'b1;
+               NEXT_STATE = S25;
+            end
+          S25: // rem
+            begin
+               load_regr  = 1'b1;
+               sel_muxa   = 3'b011;
+               sel_muxb   = 3'b110;
+               sel_muxr   = 1'b1;
+               NEXT_STATE = S26;
+            end
+          S26: // done
+            begin
+               done       = 1'b1;
+               NEXT_STATE = S0;
+            end
+
+          default: // unlisted / unused encodings: idle outputs, return to S0
+            begin
+               NEXT_STATE = S0;
+            end
+        endcase // case (CURRENT_STATE)
+     end // always @ (*)
+
+endmodule // fsm_div
--- a/wally-pipelined/src/fpu/mult_R4_64_64_cs.v
+++ b/wally-pipelined/src/fpu/mult_R4_64_64_cs.v
--- a/wally-pipelined/src/fpu/rounder_div.sv
+++ b/wally-pipelined/src/fpu/rounder_div.sv
@ -13,10 +13,10 @@
 //

 module rounder_div (Result, DenormIO, Flags, rm, P, OvEn, 
-		UnEn, exp_diff, sel_inv, Invalid, DenormIn, 
-		SignR, q1, qm1, qp1, q0, qm0, qp0, regr_out);
+		    UnEn, exp_diff, sel_inv, Invalid, DenormIn, 
+		    SignR, q1, qm1, qp1, q0, qm0, qp0, regr_out);

-   input  [2:0]   rm;
+   input  [1:0]   rm;
   input          P;
   input          OvEn;
   input          UnEn;
@ -26,48 +26,47 @@ module rounder_div (Result, DenormIO, Flags, rm, P, OvEn,
   input	  DenormIn;
   input 	  SignR;
   
-   input [63:0]   q1;
-   input [63:0]   qm1;
-   input [63:0]   qp1;
-   input [63:0]   q0;
-   input [63:0]   qm0;
-   input [63:0]   qp0;   
-   input [127:0]  regr_out;
+   input logic [63:0]  q1;
+   input logic [63:0]  qm1;
+   input logic [63:0]  qp1;
+   input logic [63:0]  q0;
+   input logic [63:0]  qm0;
+   input logic [63:0]  qp0;   
+   input logic [127:0] regr_out;
   
-   output [63:0]  Result;
-   output 	  DenormIO;
-   output [4:0]   Flags;
-
-   supply1 	  vdd;
-   supply0 	  vss;
+   output logic [63:0] Result;
+   output logic        DenormIO;
+   output logic [4:0]  Flags;
   
-   wire 	  Rsign;
-   wire [10:0] 	  Rexp;
-   wire [12:0] 	  Texp;
-   wire [51:0] 	  Rmant;
-   wire [63:0] 	  Tmant;
-   wire [51:0] 	  Smant;   
-   wire 	  Rzero;
-   wire 	  Gdp, Gsp, G;
-   wire 	  UnFlow_SP, UnFlow_DP, UnderFlow; 
-   wire 	  OvFlow_SP, OvFlow_DP, OverFlow;		
-   wire 	  Inexact;
-   wire 	  Round_zero;
-   wire 	  Infinite;
-   wire 	  VeryLarge;
-   wire 	  Largest;
-   wire 	  Div0;      
-   wire 	  Adj_exp;
-   wire 	  Valid;
-   wire 	  NaN;
-   wire 	  Texp_l7z;
-   wire 	  Texp_l7o;
-   wire 	  OvCon;
-   wire [1:0] 	  mux_mant;
-   wire 	  sign_rem;
-   wire [63:0] 	  q, qm, qp;
-   wire 	  exp_ovf, exp_ovfSP, exp_ovfDP;
-   logic zero_rem;   
+   supply1 	       vdd;
+   supply0 	       vss;
+   
+   logic 	       Rsign;
+   logic [10:0]        Rexp;
+   logic [12:0]        Texp;
+   logic [51:0]        Rmant;
+   logic [63:0]        Tmant;
+   logic [51:0]        Smant;   
+   logic 	       Rzero;
+   logic 	       Gdp, Gsp, G;
+   logic 	       UnFlow_SP, UnFlow_DP, UnderFlow; 
+   logic 	       OvFlow_SP, OvFlow_DP, OverFlow;		
+   logic 	       Inexact;
+   logic 	       Round_zero;
+   logic 	       Infinite;
+   logic 	       VeryLarge;
+   logic 	       Largest;
+   logic 	       Div0;      
+   logic 	       Adj_exp;
+   logic 	       Valid;
+   logic 	       NaN;
+   logic 	       Texp_l7z;
+   logic 	       Texp_l7o;
+   logic 	       OvCon;
+   logic [1:0] 	       mux_mant;
+   logic 	       sign_rem;
+   logic [63:0]        q, qm, qp;
+   logic 	       exp_ovf, exp_ovfSP, exp_ovfDP;   

   // Remainder = 0?
   assign zero_rem = ~(|regr_out);
@ -98,7 +97,7 @@ module rounder_div (Result, DenormIO, Flags, rm, P, OvEn,
   //   1.) we choose any qm0, qp0, q0 (since we shift mant)
   //   2.) we choose qp and we overflow (for RU)
   assign exp_ovf = |{qp[62:40], (qp[39:11] & {29{~P}})};
-   assign Texp = exp_diff - {{12{vss}}, ~q1[63]} + {{12{vss}}, mux_mant[1]&qp1[63]&~exp_ovf}; // KEP used to be 13{vss}
+   assign Texp = exp_diff - {{13{vss}}, ~q1[63]} + {{13{vss}}, mux_mant[1]&qp1[63]&~exp_ovf};
   
   // Overflow only occurs for double precision, if Texp[10] to Texp[0] are 
   // all ones. To encourage sharing with single precision overflow detection,
--- a/wally-pipelined/src/fpu/sbtm.sv
+++ b/wally-pipelined/src/fpu/sbtm.sv
@ -11,8 +11,7 @@ module sbtm (input logic [11:0] a, output logic [10:0] ia_out);
   // input to CPA
   logic [14:0] op1;
   logic [14:0] op2;
-   logic [14:0] p; 
-   logic cout;  
+   logic [14:0] p;   

   assign x0 = a[10:7];
   assign x1 = a[6:4];
@ -27,8 +26,8 @@ module sbtm (input logic [11:0] a, output logic [10:0] ia_out);
   assign op2 = x2[3] ? {1'b1, {8{1'b1}}, ~y1, 1'b1} :
 		{1'b0, 8'b0, y1, 1'b1};
   // CPA
-   bk15 cp1 (cout, p, op1, op2, 1'b0);
+   adder #(15) cp1 (op1, op2, 1'b0, p, cout);  
   //assign ia_out = {p[14:4], {53{1'b0}}};
   assign ia_out = p[14:4];

-endmodule // sbtm
+endmodule // sbtm
--- a/wally-pipelined/src/fpu/sbtm3.sv
+++ b/wally-pipelined/src/fpu/sbtm3.sv
@ -0,0 +1,39 @@
+// sbtm2: two-table lookup followed by a 15-bit add.
+//
+//   a : 12-bit input, split into x0 = a[11:7], x1 = a[6:4], x2 = a[3:0]
+//   y : 11-bit output, the upper 11 bits (p[14:4]) of the 15-bit sum
+//
+// The first table (sbtm_a2) is indexed by {x0, x1}; the second (sbtm_a3) by
+// {x0, x2 conditionally ones-complemented}. When x2[3] is set the second
+// table's output is also ones-complemented and extended with 1s, otherwise
+// it is extended with 0s ("1s cmp per sbtm/stam").
+module sbtm2 (input logic [11:0] a, output logic [10:0] y);
+
+   // bit partitions
+   logic [4:0]  x0;        // a[11:7]
+   logic [2:0]  x1;        // a[6:4]
+   logic [3:0]  x2;        // a[3:0]
+   logic [2:0]  x2_1cmp;   // x2[2:0], inverted when x2[3] is 1
+   // mem outputs
+   logic [13:0] y0;        // output of sbtm_a2
+   logic [5:0]  y1;        // output of sbtm_a3
+   // input to CPA
+   logic [14:0] op1;
+   logic [14:0] op2;
+   logic [14:0] p;
+   // Last port of the adder instance (presumably its carry-out; confirm
+   // against the adder definition). Unused here, but declared explicitly so
+   // the connection is not an implicit net (which breaks under
+   // `default_nettype none and trips lint).
+   logic        cout;
+
+   assign x0 = a[11:7];
+   assign x1 = a[6:4];
+   assign x2 = a[3:0];
+
+   // First table, placed one bit to the left of the second operand's LSB.
+   sbtm_a2 mem1 ({x0, x1}, y0);
+   assign op1 = {y0, 1'b0};
+
+   // 1s cmp per sbtm/stam: fold the x2 index using x2[3] as the selector
+   assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];
+   sbtm_a3 mem2 ({x0, x2_1cmp}, y1);
+   // 1s cmp per sbtm/stam: 8 + 6 + 1 = 15 bits in both arms
+   assign op2 = x2[3] ? {{8{1'b1}}, ~y1, 1'b1} :
+                {8'b0, y1, 1'b1};
+
+   // CPA: p = op1 + op2 with the third port tied to 0
+   // (port order presumably a, b, cin, sum, cout -- confirm against adder)
+   adder #(15) cp1 (op1, op2, 1'b0, p, cout);
+   assign y = p[14:4];
+
+endmodule // sbtm2
+
+
+   
+
+   
--- a/wally-pipelined/src/fpu/sbtm_a0.sv
+++ b/wally-pipelined/src/fpu/sbtm_a0.sv
@ -133,4 +133,8 @@ module sbtm_a0 (input  logic [6:0] a,
       default: y = 13'bxxxxxxxxxxxxx;
     endcase // case (a)
    
-endmodule // sbtm_a0
+endmodule // sbtm_a0
+
+    
+    
+    
--- a/wally-pipelined/src/fpu/sbtm_a1.sv
+++ b/wally-pipelined/src/fpu/sbtm_a1.sv
@ -133,4 +133,8 @@ module sbtm_a1 (input  logic [6:0] a,
       default: y = 5'bxxxxx;
     endcase // case (a)
    
-endmodule // sbtm_a0
+endmodule // sbtm_a0
+
+    
+    
+    
--- a/wally-pipelined/src/fpu/sbtm_a4.sv
+++ b/wally-pipelined/src/fpu/sbtm_a4.sv
@ -0,0 +1,204 @@
+module sbtm_a2 (input  logic [7:0] a,  // 8-bit table index (sbtm2 drives it with {x0, x1})
+		output logic [13:0] y);  // 14-bit table entry selected by a
+   always_comb  // purely combinational constant lookup
+     case(a)  // 192 entries: only 8'b01000000 .. 8'b11111111 are tabulated. NOTE(review): values look like a reciprocal-square-root seed table -- confirm
+       8'b01000000: y = 14'b10110100010111;
+       8'b01000001: y = 14'b10110010111111;
+       8'b01000010: y = 14'b10110001101000;
+       8'b01000011: y = 14'b10110000010011;
+       8'b01000100: y = 14'b10101111000001;
+       8'b01000101: y = 14'b10101101110000;
+       8'b01000110: y = 14'b10101100100001;
+       8'b01000111: y = 14'b10101011010011;
+       8'b01001000: y = 14'b10101010000111;
+       8'b01001001: y = 14'b10101000111101;
+       8'b01001010: y = 14'b10100111110100;
+       8'b01001011: y = 14'b10100110101101;
+       8'b01001100: y = 14'b10100101100111;
+       8'b01001101: y = 14'b10100100100010;
+       8'b01001110: y = 14'b10100011011111;
+       8'b01001111: y = 14'b10100010011101;
+       8'b01010000: y = 14'b10100001011100;
+       8'b01010001: y = 14'b10100000011100;
+       8'b01010010: y = 14'b10011111011110;
+       8'b01010011: y = 14'b10011110100001;
+       8'b01010100: y = 14'b10011101100100;
+       8'b01010101: y = 14'b10011100101001;
+       8'b01010110: y = 14'b10011011101111;
+       8'b01010111: y = 14'b10011010110110;
+       8'b01011000: y = 14'b10011001111110;
+       8'b01011001: y = 14'b10011001000110;
+       8'b01011010: y = 14'b10011000010000;
+       8'b01011011: y = 14'b10010111011011;
+       8'b01011100: y = 14'b10010110100110;
+       8'b01011101: y = 14'b10010101110011;
+       8'b01011110: y = 14'b10010101000000;
+       8'b01011111: y = 14'b10010100001110;
+       8'b01100000: y = 14'b10010011011100;
+       8'b01100001: y = 14'b10010010101100;
+       8'b01100010: y = 14'b10010001111100;
+       8'b01100011: y = 14'b10010001001101;
+       8'b01100100: y = 14'b10010000011111;
+       8'b01100101: y = 14'b10001111110001;
+       8'b01100110: y = 14'b10001111000100;
+       8'b01100111: y = 14'b10001110011000;
+       8'b01101000: y = 14'b10001101101100;
+       8'b01101001: y = 14'b10001101000001;
+       8'b01101010: y = 14'b10001100010110;
+       8'b01101011: y = 14'b10001011101100;
+       8'b01101100: y = 14'b10001011000011;
+       8'b01101101: y = 14'b10001010011010;
+       8'b01101110: y = 14'b10001001110010;
+       8'b01101111: y = 14'b10001001001010;
+       8'b01110000: y = 14'b10001000100011;
+       8'b01110001: y = 14'b10000111111101;
+       8'b01110010: y = 14'b10000111010111;
+       8'b01110011: y = 14'b10000110110001;
+       8'b01110100: y = 14'b10000110001100;
+       8'b01110101: y = 14'b10000101100111;
+       8'b01110110: y = 14'b10000101000011;
+       8'b01110111: y = 14'b10000100011111;
+       8'b01111000: y = 14'b10000011111100;
+       8'b01111001: y = 14'b10000011011001;
+       8'b01111010: y = 14'b10000010110111;
+       8'b01111011: y = 14'b10000010010101;
+       8'b01111100: y = 14'b10000001110011;
+       8'b01111101: y = 14'b10000001010010;
+       8'b01111110: y = 14'b10000000110001;
+       8'b01111111: y = 14'b10000000010001;       
+       8'b10000000: y = 14'b01111111110001;
+       8'b10000001: y = 14'b01111111010001;
+       8'b10000010: y = 14'b01111110110010;
+       8'b10000011: y = 14'b01111110010011;
+       8'b10000100: y = 14'b01111101110101;
+       8'b10000101: y = 14'b01111101010110;
+       8'b10000110: y = 14'b01111100111001;
+       8'b10000111: y = 14'b01111100011011;
+       8'b10001000: y = 14'b01111011111110;
+       8'b10001001: y = 14'b01111011100001;
+       8'b10001010: y = 14'b01111011000100;
+       8'b10001011: y = 14'b01111010101000;
+       8'b10001100: y = 14'b01111010001100;
+       8'b10001101: y = 14'b01111001110000;
+       8'b10001110: y = 14'b01111001010101;
+       8'b10001111: y = 14'b01111000111010;
+       8'b10010000: y = 14'b01111000011111;
+       8'b10010001: y = 14'b01111000000100;
+       8'b10010010: y = 14'b01110111101010;
+       8'b10010011: y = 14'b01110111010000;
+       8'b10010100: y = 14'b01110110110110;
+       8'b10010101: y = 14'b01110110011101;
+       8'b10010110: y = 14'b01110110000100;
+       8'b10010111: y = 14'b01110101101011;
+       8'b10011000: y = 14'b01110101010010;
+       8'b10011001: y = 14'b01110100111001;
+       8'b10011010: y = 14'b01110100100001;
+       8'b10011011: y = 14'b01110100001001;
+       8'b10011100: y = 14'b01110011110001;
+       8'b10011101: y = 14'b01110011011010;
+       8'b10011110: y = 14'b01110011000010;
+       8'b10011111: y = 14'b01110010101011;
+       8'b10100000: y = 14'b01110010010100;
+       8'b10100001: y = 14'b01110001111110;
+       8'b10100010: y = 14'b01110001100111;
+       8'b10100011: y = 14'b01110001010001;
+       8'b10100100: y = 14'b01110000111011;
+       8'b10100101: y = 14'b01110000100101;
+       8'b10100110: y = 14'b01110000001111;
+       8'b10100111: y = 14'b01101111111010;
+       8'b10101000: y = 14'b01101111100101;
+       8'b10101001: y = 14'b01101111010000;
+       8'b10101010: y = 14'b01101110111011;
+       8'b10101011: y = 14'b01101110100110;
+       8'b10101100: y = 14'b01101110010001;
+       8'b10101101: y = 14'b01101101111101;
+       8'b10101110: y = 14'b01101101101001;
+       8'b10101111: y = 14'b01101101010101;
+       8'b10110000: y = 14'b01101101000001;
+       8'b10110001: y = 14'b01101100101101;
+       8'b10110010: y = 14'b01101100011010;
+       8'b10110011: y = 14'b01101100000110;
+       8'b10110100: y = 14'b01101011110011;
+       8'b10110101: y = 14'b01101011100000;
+       8'b10110110: y = 14'b01101011001101;
+       8'b10110111: y = 14'b01101010111010;
+       8'b10111000: y = 14'b01101010101000;
+       8'b10111001: y = 14'b01101010010101;
+       8'b10111010: y = 14'b01101010000011;
+       8'b10111011: y = 14'b01101001110001;
+       8'b10111100: y = 14'b01101001011111;
+       8'b10111101: y = 14'b01101001001101;
+       8'b10111110: y = 14'b01101000111100;
+       8'b10111111: y = 14'b01101000101010;
+       8'b11000000: y = 14'b01101000011001;
+       8'b11000001: y = 14'b01101000000111;
+       8'b11000010: y = 14'b01100111110110;
+       8'b11000011: y = 14'b01100111100101;
+       8'b11000100: y = 14'b01100111010100;
+       8'b11000101: y = 14'b01100111000011;
+       8'b11000110: y = 14'b01100110110011;
+       8'b11000111: y = 14'b01100110100010;
+       8'b11001000: y = 14'b01100110010010;
+       8'b11001001: y = 14'b01100110000010;
+       8'b11001010: y = 14'b01100101110010;
+       8'b11001011: y = 14'b01100101100001;
+       8'b11001100: y = 14'b01100101010010;
+       8'b11001101: y = 14'b01100101000010;
+       8'b11001110: y = 14'b01100100110010;
+       8'b11001111: y = 14'b01100100100011;
+       8'b11010000: y = 14'b01100100010011;
+       8'b11010001: y = 14'b01100100000100;
+       8'b11010010: y = 14'b01100011110101;
+       8'b11010011: y = 14'b01100011100101;
+       8'b11010100: y = 14'b01100011010110;
+       8'b11010101: y = 14'b01100011000111;
+       8'b11010110: y = 14'b01100010111001;
+       8'b11010111: y = 14'b01100010101010;
+       8'b11011000: y = 14'b01100010011011;
+       8'b11011001: y = 14'b01100010001101;
+       8'b11011010: y = 14'b01100001111110;
+       8'b11011011: y = 14'b01100001110000;
+       8'b11011100: y = 14'b01100001100010;
+       8'b11011101: y = 14'b01100001010100;
+       8'b11011110: y = 14'b01100001000110;
+       8'b11011111: y = 14'b01100000111000;
+       8'b11100000: y = 14'b01100000101010;
+       8'b11100001: y = 14'b01100000011100;
+       8'b11100010: y = 14'b01100000001111;
+       8'b11100011: y = 14'b01100000000001;
+       8'b11100100: y = 14'b01011111110100;
+       8'b11100101: y = 14'b01011111100110;
+       8'b11100110: y = 14'b01011111011001;
+       8'b11100111: y = 14'b01011111001100;
+       8'b11101000: y = 14'b01011110111111;
+       8'b11101001: y = 14'b01011110110010;
+       8'b11101010: y = 14'b01011110100101;
+       8'b11101011: y = 14'b01011110011000;
+       8'b11101100: y = 14'b01011110001011;
+       8'b11101101: y = 14'b01011101111110;
+       8'b11101110: y = 14'b01011101110010;
+       8'b11101111: y = 14'b01011101100101;
+       8'b11110000: y = 14'b01011101011001;
+       8'b11110001: y = 14'b01011101001100;
+       8'b11110010: y = 14'b01011101000000;
+       8'b11110011: y = 14'b01011100110100;
+       8'b11110100: y = 14'b01011100101000;
+       8'b11110101: y = 14'b01011100011100;
+       8'b11110110: y = 14'b01011100010000;
+       8'b11110111: y = 14'b01011100000100;
+       8'b11111000: y = 14'b01011011111000;
+       8'b11111001: y = 14'b01011011101100;
+       8'b11111010: y = 14'b01011011100000;
+       8'b11111011: y = 14'b01011011010101;
+       8'b11111100: y = 14'b01011011001001;
+       8'b11111101: y = 14'b01011010111101;
+       8'b11111110: y = 14'b01011010110010;
+       8'b11111111: y = 14'b01011010100111;
+       default: y = 14'bxxxxxxxxxxxxxx;  // untabulated index (a[7:6] == 2'b00, or any X/Z bit in a) yields all-X
+     endcase // case (a)
+    
+endmodule // sbtm_a2
+
+    
+    
+    
--- a/wally-pipelined/src/fpu/sbtm_a5.sv
+++ b/wally-pipelined/src/fpu/sbtm_a5.sv
@ -0,0 +1,200 @@
+module sbtm_a3 (input  logic [7:0] a,   // table index; entries exist only for 8'b01000000..8'b11111111
+		output logic [5:0] y);   // 6-bit table value selected by a
+   always_comb   // purely combinational lookup: y is a function of a only
+     case(a)   // one constant per listed index (192 entries, in ascending order of a)
+       8'b01000000: y = 6'b100110;
+       8'b01000001: y = 6'b100001;
+       8'b01000010: y = 6'b011100;
+       8'b01000011: y = 6'b010111;
+       8'b01000100: y = 6'b010010;
+       8'b01000101: y = 6'b001100;
+       8'b01000110: y = 6'b000111;
+       8'b01000111: y = 6'b000010;
+       8'b01001000: y = 6'b100000;
+       8'b01001001: y = 6'b011100;
+       8'b01001010: y = 6'b011000;
+       8'b01001011: y = 6'b010011;
+       8'b01001100: y = 6'b001111;
+       8'b01001101: y = 6'b001010;
+       8'b01001110: y = 6'b000110;
+       8'b01001111: y = 6'b000010;
+       8'b01010000: y = 6'b011100;
+       8'b01010001: y = 6'b011000;
+       8'b01010010: y = 6'b010100;
+       8'b01010011: y = 6'b010000;
+       8'b01010100: y = 6'b001101;
+       8'b01010101: y = 6'b001001;
+       8'b01010110: y = 6'b000101;
+       8'b01010111: y = 6'b000001;
+       8'b01011000: y = 6'b011000;
+       8'b01011001: y = 6'b010101;
+       8'b01011010: y = 6'b010010;
+       8'b01011011: y = 6'b001110;
+       8'b01011100: y = 6'b001011;
+       8'b01011101: y = 6'b001000;
+       8'b01011110: y = 6'b000100;
+       8'b01011111: y = 6'b000001;
+       8'b01100000: y = 6'b010101;
+       8'b01100001: y = 6'b010010;
+       8'b01100010: y = 6'b001111;
+       8'b01100011: y = 6'b001101;
+       8'b01100100: y = 6'b001010;
+       8'b01100101: y = 6'b000111;
+       8'b01100110: y = 6'b000100;
+       8'b01100111: y = 6'b000001;
+       8'b01101000: y = 6'b010011;
+       8'b01101001: y = 6'b010000;
+       8'b01101010: y = 6'b001110;
+       8'b01101011: y = 6'b001011;
+       8'b01101100: y = 6'b001001;
+       8'b01101101: y = 6'b000110;
+       8'b01101110: y = 6'b000011;
+       8'b01101111: y = 6'b000001;
+       8'b01110000: y = 6'b010001;
+       8'b01110001: y = 6'b001111;
+       8'b01110010: y = 6'b001100;
+       8'b01110011: y = 6'b001010;
+       8'b01110100: y = 6'b001000;
+       8'b01110101: y = 6'b000101;
+       8'b01110110: y = 6'b000011;
+       8'b01110111: y = 6'b000001;
+       8'b01111000: y = 6'b001111;
+       8'b01111001: y = 6'b001101;
+       8'b01111010: y = 6'b001011;
+       8'b01111011: y = 6'b001001;
+       8'b01111100: y = 6'b000111;
+       8'b01111101: y = 6'b000101;
+       8'b01111110: y = 6'b000011;
+       8'b01111111: y = 6'b000001;       
+       8'b10000000: y = 6'b001110;
+       8'b10000001: y = 6'b001100;
+       8'b10000010: y = 6'b001010;
+       8'b10000011: y = 6'b001000;
+       8'b10000100: y = 6'b000110;
+       8'b10000101: y = 6'b000100;
+       8'b10000110: y = 6'b000010;
+       8'b10000111: y = 6'b000000;
+       8'b10001000: y = 6'b001101;
+       8'b10001001: y = 6'b001011;
+       8'b10001010: y = 6'b001001;
+       8'b10001011: y = 6'b000111;
+       8'b10001100: y = 6'b000110;
+       8'b10001101: y = 6'b000100;
+       8'b10001110: y = 6'b000010;
+       8'b10001111: y = 6'b000000;
+       8'b10010000: y = 6'b001100;
+       8'b10010001: y = 6'b001010;
+       8'b10010010: y = 6'b001000;
+       8'b10010011: y = 6'b000111;
+       8'b10010100: y = 6'b000101;
+       8'b10010101: y = 6'b000100;
+       8'b10010110: y = 6'b000010;
+       8'b10010111: y = 6'b000000;
+       8'b10011000: y = 6'b001011;
+       8'b10011001: y = 6'b001001;
+       8'b10011010: y = 6'b001000;
+       8'b10011011: y = 6'b000110;
+       8'b10011100: y = 6'b000101;
+       8'b10011101: y = 6'b000011;
+       8'b10011110: y = 6'b000010;
+       8'b10011111: y = 6'b000000;
+       8'b10100000: y = 6'b001010;
+       8'b10100001: y = 6'b001000;
+       8'b10100010: y = 6'b000111;
+       8'b10100011: y = 6'b000110;
+       8'b10100100: y = 6'b000100;
+       8'b10100101: y = 6'b000011;
+       8'b10100110: y = 6'b000010;
+       8'b10100111: y = 6'b000000;
+       8'b10101000: y = 6'b001001;
+       8'b10101001: y = 6'b001000;
+       8'b10101010: y = 6'b000111;
+       8'b10101011: y = 6'b000101;
+       8'b10101100: y = 6'b000100;
+       8'b10101101: y = 6'b000011;
+       8'b10101110: y = 6'b000001;
+       8'b10101111: y = 6'b000000;
+       8'b10110000: y = 6'b001000;
+       8'b10110001: y = 6'b000111;
+       8'b10110010: y = 6'b000110;
+       8'b10110011: y = 6'b000101;
+       8'b10110100: y = 6'b000100;
+       8'b10110101: y = 6'b000010;
+       8'b10110110: y = 6'b000001;
+       8'b10110111: y = 6'b000000;
+       8'b10111000: y = 6'b001000;
+       8'b10111001: y = 6'b000111;
+       8'b10111010: y = 6'b000110;
+       8'b10111011: y = 6'b000101;
+       8'b10111100: y = 6'b000011;
+       8'b10111101: y = 6'b000010;
+       8'b10111110: y = 6'b000001;
+       8'b10111111: y = 6'b000000;
+       8'b11000000: y = 6'b000111;
+       8'b11000001: y = 6'b000110;
+       8'b11000010: y = 6'b000101;
+       8'b11000011: y = 6'b000100;
+       8'b11000100: y = 6'b000011;
+       8'b11000101: y = 6'b000010;
+       8'b11000110: y = 6'b000001;
+       8'b11000111: y = 6'b000000;
+       8'b11001000: y = 6'b000111;
+       8'b11001001: y = 6'b000110;
+       8'b11001010: y = 6'b000101;
+       8'b11001011: y = 6'b000100;
+       8'b11001100: y = 6'b000011;
+       8'b11001101: y = 6'b000010;
+       8'b11001110: y = 6'b000001;
+       8'b11001111: y = 6'b000000;
+       8'b11010000: y = 6'b000111;
+       8'b11010001: y = 6'b000110;
+       8'b11010010: y = 6'b000101;
+       8'b11010011: y = 6'b000100;
+       8'b11010100: y = 6'b000011;
+       8'b11010101: y = 6'b000010;
+       8'b11010110: y = 6'b000001;
+       8'b11010111: y = 6'b000000;
+       8'b11011000: y = 6'b000110;
+       8'b11011001: y = 6'b000101;
+       8'b11011010: y = 6'b000100;
+       8'b11011011: y = 6'b000011;
+       8'b11011100: y = 6'b000011;
+       8'b11011101: y = 6'b000010;
+       8'b11011110: y = 6'b000001;
+       8'b11011111: y = 6'b000000;
+       8'b11100000: y = 6'b000110;
+       8'b11100001: y = 6'b000101;
+       8'b11100010: y = 6'b000100;
+       8'b11100011: y = 6'b000011;
+       8'b11100100: y = 6'b000010;
+       8'b11100101: y = 6'b000010;
+       8'b11100110: y = 6'b000001;
+       8'b11100111: y = 6'b000000;
+       8'b11101000: y = 6'b000101;
+       8'b11101001: y = 6'b000101;
+       8'b11101010: y = 6'b000100;
+       8'b11101011: y = 6'b000011;
+       8'b11101100: y = 6'b000010;
+       8'b11101101: y = 6'b000001;
+       8'b11101110: y = 6'b000001;
+       8'b11101111: y = 6'b000000;
+       8'b11110000: y = 6'b000101;
+       8'b11110001: y = 6'b000100;
+       8'b11110010: y = 6'b000100;
+       8'b11110011: y = 6'b000011;
+       8'b11110100: y = 6'b000010;
+       8'b11110101: y = 6'b000001;
+       8'b11110110: y = 6'b000001;
+       8'b11110111: y = 6'b000000;
+       8'b11111000: y = 6'b000101;
+       8'b11111001: y = 6'b000100;
+       8'b11111010: y = 6'b000011;
+       8'b11111011: y = 6'b000011;
+       8'b11111100: y = 6'b000010;
+       8'b11111101: y = 6'b000001;
+       8'b11111110: y = 6'b000001;
+       8'b11111111: y = 6'b000000;
+       default: y = 6'bxxxxxx;   // a below 8'b01000000 has no entry above, so y is driven to X
+     endcase // case (a)
+    // NOTE(review): the diff header names this file sbtm_a5.sv but the module is sbtm_a3 - confirm which name is intended
+endmodule // sbtm_a3