Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

2021-07-26 11:55:00 -05:00 · 2021-07-26 11:55:00 -05:00 · ef55b30e99
commit ef55b30e99
parent 60177b92a6 30ac22edff
52 changed files with 1637 additions and 27033 deletions
--- a/wally-pipelined/config/rv64icfd/wally-config.vh
+++ b/wally-pipelined/config/rv64icfd/wally-config.vh
@ -26,6 +26,7 @@

 // include shared configuration
 `include "wally-shared.vh"
+// `include "../../../config/shared/wally-shared.vh"

 `define BUILDROOT 0
 `define BUSYBEAR 0
--- a/wally-pipelined/fpu-testfloat/FMA/tbgen/tb.sv
+++ b/wally-pipelined/fpu-testfloat/FMA/tbgen/tb.sv
@ -0,0 +1,215 @@
+
+`include "../../../config/rv64icfd/wally-config.vh"
+// Self-checking testbench for the FMA stages fma1 (UUT1) and fma2 (UUT2).
+// Test vectors are loaded from "testFloatNoSpace"; each entry packs X, Y, Z and
+// the expected answer followed by 8 flag bits.  When FmtE selects double
+// precision the four operands are `FLEN bits each; otherwise they are 32 bits
+// each and are extended with 32 leading ones before being applied.
+// A new vector is applied 1 time unit after every rising clock edge and the DUT
+// outputs are compared on the falling edge.  Simulation stops ($stop) on the
+// first mismatch or when the vectors run out.
+module testbench3();
+
+ // width of one packed test vector: four `FLEN-bit fields plus 8 flag bits
+ localparam VECW = `FLEN*4+8;
+
+ logic [31:0] errors=0;
+ logic [31:0] vectornum=0;
+ logic [VECW-1:0] testvectors[6133248:0];
+
+ logic  [`FLEN-1:0]  ans;        // expected result from the test vector
+ logic  [7:0]        flags;      // expected flags (only [4:0] are compared)
+ logic  [2:0]        FrmE;       // rounding mode
+ logic               FmtE;       // precision select: 1 = double, 0 = single
+ logic  [`FLEN-1:0]  FMAResM;    // DUT result
+ logic  [4:0]        FMAFlgM;    // DUT flags
+ integer fp;                     // unused; kept for optional result logging
+ logic  [2:0]        FOpCtrlE;
+
+ // signals passed from fma1 to fma2
+ logic  [2*`NF+1:0]  ProdManE;
+ logic  [3*`NF+5:0]  AlignedAddendE;
+ logic  [`NE+1:0]    ProdExpE;
+ logic               AddendStickyE;
+ logic               KillProdE;
+
+ logic wnan;          // DUT result is a NaN
+ logic ansnan, clk;   // expected answer is a NaN; testbench clock
+
+ assign FOpCtrlE = 3'b0;
+
+ // rounding mode encodings:
+ // nearest even - 000
+ // towards zero - 001
+ // down - 010
+ // up - 011
+ // nearest max mag - 100
+ assign FrmE = 3'b000;
+ assign FmtE = 1'b0;
+
+    // unpacked operands
+    logic  [`FLEN-1:0] X, Y, Z;
+    logic        XSgnE, YSgnE, ZSgnE;
+    logic [`NE-1:0] XExpE, YExpE, ZExpE;
+    logic [`NF-1:0] XFracE, YFracE, ZFracE;
+    logic        XAssumed1E, YAssumed1E, ZAssumed1E;
+    logic XNormE;
+    logic XNaNE, YNaNE, ZNaNE;
+    logic XSNaNE, YSNaNE, ZSNaNE;
+    logic XDenormE, YDenormE, ZDenormE;
+    logic XZeroE, YZeroE, ZZeroE;
+    logic [`NE-1:0] BiasE;
+    logic XInfE, YInfE, ZInfE;
+    logic XExpMaxE;
+    //***rename to make significand = 1.frac m = significand
+    logic           XFracZero, YFracZero, ZFracZero; // input fraction zero
+    logic           XExpZero, YExpZero, ZExpZero; // input exponent zero
+    logic [`FLEN-1:0]    Addend; // value to add (Z or zero)
+    logic           YExpMaxE, ZExpMaxE;  // input exponent all 1s
+
+    // Z is only used in the FMA, and is replaced by zero for a multiply operation
+    assign Addend = FOpCtrlE[2] ? (`FLEN)'(0) : Z;
+
+    // sign bits
+    assign XSgnE = FmtE ? X[`FLEN-1] : X[31];
+    assign YSgnE = FmtE ? Y[`FLEN-1] : Y[31];
+    assign ZSgnE = FmtE ? Addend[`FLEN-1] : Addend[31];
+
+    // exponents; single-precision exponents are zero-extended, not re-biased
+    assign XExpE = FmtE ? X[62:52] : {3'b0, X[30:23]};
+    assign YExpE = FmtE ? Y[62:52] : {3'b0, Y[30:23]};
+    assign ZExpE = FmtE ? Addend[62:52] : {3'b0, Addend[30:23]};
+
+    // fractions; single-precision fractions are left-aligned and zero-padded
+    assign XFracE = FmtE ? X[`NF-1:0] : {X[22:0], 29'b0};
+    assign YFracE = FmtE ? Y[`NF-1:0] : {Y[22:0], 29'b0};
+    assign ZFracE = FmtE ? Addend[`NF-1:0] : {Addend[22:0], 29'b0};
+
+    // implicit leading one is present whenever the exponent is non-zero
+    // NOTE(review): the Z flags below read Z rather than Addend, unlike
+    // ZSgnE/ZExpE/ZFracE; identical while FOpCtrlE[2] is 0 - confirm intent.
+    assign XAssumed1E = FmtE ? |X[62:52] : |X[30:23];
+    assign YAssumed1E = FmtE ? |Y[62:52] : |Y[30:23];
+    assign ZAssumed1E = FmtE ? |Z[62:52] : |Z[30:23];
+
+    assign XExpZero = ~XAssumed1E;
+    assign YExpZero = ~YAssumed1E;
+    assign ZExpZero = ~ZAssumed1E;
+
+    assign XFracZero = ~|XFracE;
+    assign YFracZero = ~|YFracE;
+    assign ZFracZero = ~|ZFracE;
+
+    assign XExpMaxE = FmtE ? &X[62:52] : &X[30:23];
+    assign YExpMaxE = FmtE ? &Y[62:52] : &Y[30:23];
+    assign ZExpMaxE = FmtE ? &Z[62:52] : &Z[30:23];
+
+    assign XNormE = ~(XExpMaxE|XExpZero);
+
+    // operand classification
+    assign XNaNE = XExpMaxE & ~XFracZero;
+    assign YNaNE = YExpMaxE & ~YFracZero;
+    assign ZNaNE = ZExpMaxE & ~ZFracZero;
+
+    assign XSNaNE = XNaNE&~XFracE[`NF-1];
+    assign YSNaNE = YNaNE&~YFracE[`NF-1];
+    assign ZSNaNE = ZNaNE&~ZFracE[`NF-1];
+
+    assign XDenormE = XExpZero & ~XFracZero;
+    assign YDenormE = YExpZero & ~YFracZero;
+    assign ZDenormE = ZExpZero & ~ZFracZero;
+
+    assign XInfE = XExpMaxE & XFracZero;
+    assign YInfE = YExpMaxE & YFracZero;
+    assign ZInfE = ZExpMaxE & ZFracZero;
+
+    assign XZeroE = XExpZero & XFracZero;
+    assign YZeroE = YExpZero & YFracZero;
+    assign ZZeroE = ZExpZero & ZFracZero;
+
+    // sized to `NE so the single-precision bias is not silently truncated
+    assign BiasE = FmtE ? {1'b0, {`NE-1{1'b1}}} : (`NE)'(127);
+
+ // NaN detection on the DUT result and on the expected answer
+ assign wnan = FmtE ? &FMAResM[`FLEN-2:`NF] && |FMAResM[`NF-1:0] : &FMAResM[30:23] && |FMAResM[22:0];
+ assign ansnan = FmtE ? &ans[`FLEN-2:`NF] && |ans[`NF-1:0] : &ans[30:23] && |ans[22:0];
+
+ // instantiate device under test
+ fma1 UUT1(.XManE({XAssumed1E,XFracE}), .YManE({YAssumed1E,YFracE}), .ZManE({ZAssumed1E,ZFracE}), .*);
+ fma2 UUT2(.XSgnM(XSgnE), .YSgnM(YSgnE), .ZSgnM(ZSgnE), .XExpM(XExpE), .YExpM(YExpE), .ZExpM(ZExpE), .XManM({XAssumed1E,XFracE}), .YManM({YAssumed1E,YFracE}), .ZManM({ZAssumed1E,ZFracE}), .XNaNM(XNaNE), .YNaNM(YNaNE), .ZNaNM(ZNaNE), .XZeroM(XZeroE), .YZeroM(YZeroE), .ZZeroM(ZZeroE), .XInfM(XInfE), .YInfM(YInfE), .ZInfM(ZInfE), .XSNaNM(XSNaNE), .YSNaNM(YSNaNE), .ZSNaNM(ZSNaNE),
+               .FOpCtrlM(FOpCtrlE[2:0]), .KillProdM(KillProdE), .AddendStickyM(AddendStickyE), .ProdExpM(ProdExpE), .AlignedAddendM(AlignedAddendE), .ProdManM(ProdManE),
+               .FmtM(FmtE), .FrmM(FrmE), .FMAFlgM, .FMAResM);
+
+
+ // generate clock
+ always
+ begin
+ clk = 1; #5; clk = 0; #5;
+ end
+
+ // at start of test, load vectors
+ initial
+ begin
+    $readmemh("testFloatNoSpace", testvectors);
+ end
+
+ // apply test vectors shortly after the rising edge of clk
+ always @(posedge clk)
+ begin
+  #1;
+  if (FmtE==1'b1) {X, Y, Z, ans, flags} = testvectors[vectornum];
+  else begin
+    // single precision: 32-bit fields, extended with 32 leading ones
+    X = {{32{1'b1}}, testvectors[vectornum][135:104]};
+    Y = {{32{1'b1}}, testvectors[vectornum][103:72]};
+    Z = {{32{1'b1}}, testvectors[vectornum][71:40]};
+    ans = {{32{1'b1}}, testvectors[vectornum][39:8]};
+    flags = testvectors[vectornum][7:0];
+  end
+ end
+
+ // check results on falling edge of clk
+ // A vector fails when the flags differ, when a non-NaN result differs from the
+ // expected answer, or when both are NaNs but the result matches neither a
+ // quieted input NaN payload nor the expected answer (sign bit ignored).
+ // NOTE(review): a NaN result with a non-NaN expected answer is only caught
+ // through the flag comparison - confirm that is intended.
+  always @(negedge clk) begin
+
+    // double precision
+    if((FmtE==1'b1) & (FMAFlgM != flags[4:0] || (!wnan && (FMAResM != ans)) || (wnan && ansnan && ~((XNaNE && (FMAResM[`FLEN-2:0] == {XExpE,1'b1,X[`NF-2:0]})) || (YNaNE && (FMAResM[`FLEN-2:0] == {YExpE,1'b1,Y[`NF-2:0]}))  || (ZNaNE && (FMAResM[`FLEN-2:0] == {ZExpE,1'b1,Z[`NF-2:0]})) || (FMAResM[`FLEN-2:0] == ans[`FLEN-2:0]))))) begin
+        $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+        if(FMAResM == 64'h8000000000000000) $display( "FMAResM=-zero ");
+        if(XDenormE) $display( "xdenorm ");
+        if(YDenormE) $display( "ydenorm ");
+        if(ZDenormE) $display( "zdenorm ");
+        if(FMAFlgM[4] != 0) $display( "invld ");
+        if(FMAFlgM[2] != 0) $display( "ovrflw ");
+        if(FMAFlgM[1] != 0) $display( "unflw ");
+        // the sign is bit `FLEN-1; bit `FLEN does not exist and would read as X
+        if(FMAResM[`FLEN-1] && FMAResM[`FLEN-2:`NF] == {`NE{1'b1}} && FMAResM[`NF-1:0] == 0) $display( "FMAResM=-inf ");
+        if(~FMAResM[`FLEN-1] && FMAResM[`FLEN-2:`NF] == {`NE{1'b1}} && FMAResM[`NF-1:0] == 0) $display( "FMAResM=+inf ");
+        if(FMAResM[`FLEN-2:`NF] == {`NE{1'b1}} && FMAResM[`NF-1:0] != 0 && ~FMAResM[`NF-1]) $display( "FMAResM=sigNaN ");
+        if(FMAResM[`FLEN-2:`NF] == {`NE{1'b1}} && FMAResM[`NF-1:0] != 0 && FMAResM[`NF-1]) $display( "FMAResM=qutNaN ");
+        if(ans[`FLEN-1] && ans[`FLEN-2:`NF] == {`NE{1'b1}} && ans[`NF-1:0] == 0) $display( "ans=-inf ");
+        if(~ans[`FLEN-1] && ans[`FLEN-2:`NF] == {`NE{1'b1}} && ans[`NF-1:0] == 0) $display( "ans=+inf ");
+        if(ans[`FLEN-2:`NF] == {`NE{1'b1}} && ans[`NF-1:0] != 0 && ~ans[`NF-1]) $display( "ans=sigNaN ");
+        if(ans[`FLEN-2:`NF] == {`NE{1'b1}} && ans[`NF-1:0] != 0 && ans[`NF-1]) $display( "ans=qutNaN ");
+        errors = errors + 1;
+
+        $stop;
+    end
+
+    // single precision
+    if((FmtE==1'b0)&(FMAFlgM != flags[4:0] || (!wnan && (FMAResM != ans)) || (wnan && ansnan && ~(((XNaNE && (FMAResM[30:0] == {X[30:23],1'b1,X[21:0]})) || (YNaNE && (FMAResM[30:0] == {Y[30:23],1'b1,Y[21:0]}))  || (ZNaNE && (FMAResM[30:0] == {Z[30:23],1'b1,Z[21:0]})) || (FMAResM[30:0] == ans[30:0]))) ))) begin
+        $display( "%h %h %h %h %h %h %h  Wrong ",X,Y, Z, FMAResM, ans, FMAFlgM, flags);
+        // the single-precision value lives in the low 32 bits, so classify on
+        // those bits rather than on double-precision bit patterns
+        if(FMAResM[31:0] == 32'h80000000) $display( "FMAResM=-zero ");
+        if(~(|X[30:23]) && |X[22:0]) $display( "xdenorm ");
+        if(~(|Y[30:23]) && |Y[22:0]) $display( "ydenorm ");
+        if(~(|Z[30:23]) && |Z[22:0]) $display( "zdenorm ");
+        if(FMAFlgM[4] != 0) $display( "invld ");
+        if(FMAFlgM[2] != 0) $display( "ovrflw ");
+        if(FMAFlgM[1] != 0) $display( "unflw ");
+        if(FMAResM[31:0] == 32'hFF800000) $display( "FMAResM=-inf ");
+        if(FMAResM[31:0] == 32'h7F800000) $display( "FMAResM=+inf ");
+        if(&FMAResM[30:23] && |FMAResM[22:0] && ~FMAResM[22]) $display( "FMAResM=sigNaN ");
+        if(&FMAResM[30:23] && |FMAResM[22:0] && FMAResM[22] ) $display( "FMAResM=qutNaN ");
+        if(ans[31:0] == 32'hFF800000) $display( "ans=-inf ");
+        if(ans[31:0] == 32'h7F800000) $display( "ans=+inf ");
+        if(&ans[30:23] && |ans[22:0] && ~ans[22] ) $display( "ans=sigNaN ");
+        if(&ans[30:23] && |ans[22:0] && ans[22]) $display( "ans=qutNaN ");
+        errors = errors + 1;
+        //if (errors == 10)
+        $stop;
+    end
+
+    vectornum = vectornum + 1;
+    // An entry that $readmemh did not load is all X.  Compare against an all-X
+    // value of the full vector width; a narrower sized literal is zero-extended
+    // and would never match.
+    if (testvectors[vectornum] === {VECW{1'bx}}) begin
+      $display("%d tests completed with %d errors", vectornum, errors);
+      $stop;
+    end
+  end
+endmodule
--- a/wally-pipelined/fpu-testfloat/FMA/tbgen/test_gen.sh
+++ b/wally-pipelined/fpu-testfloat/FMA/tbgen/test_gen.sh
@ -0,0 +1,3 @@
+testfloat_gen f32_mulAdd -tininessafter -n 6133248 -rnear_even  -seed 113355 -level 1 > testFloat
+tr -d ' ' < testFloat > testFloatNoSpace
+
--- a/wally-pipelined/src/fpu/adder.sv
+++ b/wally-pipelined/src/fpu/adder.sv
--- a/wally-pipelined/src/fpu/bk15.sv
+++ b/wally-pipelined/src/fpu/bk15.sv
@ -1,117 +0,0 @@
-// Kogge-Stone Prefix Adder
-module bk15 (cout, sum, a, b, cin);
-   
-   input [14:0] a, b;
-   input 	cin;
-   
-   output [14:0] sum;
-   output 	 cout;
-
-   wire [15:0] 	 p,g;
-   wire [15:1] 	 h,c;
-
-   // pre-computation
-   assign p={a|b,1'b1};
-   assign g={a&b, cin};
-
-   // prefix tree
-   kogge_stone prefix_tree(h, c, p[14:0], g[14:0]);
-
-   // post-computation
-   assign h[15]=g[15]|c[15];
-   assign sum=p[15:1]^h|g[15:1]&c;
-   assign cout=p[15]&h[15];
-
-endmodule // bk15
-
-module kogge_stone (h, c, p, g);
-   
-   input [14:0] p;
-   input [14:0] g;
-   
-   output [15:1] h;
-   output [15:1] c;
-   logic H_1_0,H_2_1,I_2_1,H_3_2,I_3_2,H_4_3,I_4_3,H_5_4,I_5_4,H_6_5,I_6_5,H_7_6,I_7_6,H_8_7,I_8_7,H_9_8,I_9_8,H_10_9
-      ,I_10_9,H_11_10,I_11_10,H_12_11,I_12_11,H_13_12,I_13_12,H_14_13,I_14_13,H_2_0,H_3_0,H_4_1,I_4_1,H_5_2,I_5_2,H_6_3
-      ,I_6_3,H_7_4,I_7_4,H_8_5,I_8_5,H_9_6,I_9_6,H_10_7,I_10_7,H_11_8,I_11_8,H_12_9,I_12_9,H_13_10,I_13_10,H_14_11,I_14_11
-      ,H_4_0,H_5_0,H_6_0,H_7_0,H_8_1,I_8_1,H_9_2,I_9_2,H_10_3,I_10_3,H_11_4,I_11_4,H_12_5,I_12_5,H_13_6,I_13_6,H_14_7
-      ,I_14_7,H_8_0,H_9_0,H_10_0,H_11_0,H_12_0,H_13_0,H_14_0;
-
-   // parallel-prefix, Kogge-Stone
-
-   // Stage 1: Generates G/P pairs that span 1 bits
-   rgry g_1_0 (H_1_0, {g[1],g[0]});
-   rblk b_2_1 (H_2_1, I_2_1, {g[2],g[1]}, {p[1],p[0]});
-   rblk b_3_2 (H_3_2, I_3_2, {g[3],g[2]}, {p[2],p[1]});
-   rblk b_4_3 (H_4_3, I_4_3, {g[4],g[3]}, {p[3],p[2]});
-   rblk b_5_4 (H_5_4, I_5_4, {g[5],g[4]}, {p[4],p[3]});
-   rblk b_6_5 (H_6_5, I_6_5, {g[6],g[5]}, {p[5],p[4]});
-   rblk b_7_6 (H_7_6, I_7_6, {g[7],g[6]}, {p[6],p[5]});
-   rblk b_8_7 (H_8_7, I_8_7, {g[8],g[7]}, {p[7],p[6]});
-
-   rblk b_9_8 (H_9_8, I_9_8, {g[9],g[8]}, {p[8],p[7]});
-   rblk b_10_9 (H_10_9, I_10_9, {g[10],g[9]}, {p[9],p[8]});
-   rblk b_11_10 (H_11_10, I_11_10, {g[11],g[10]}, {p[10],p[9]});
-   rblk b_12_11 (H_12_11, I_12_11, {g[12],g[11]}, {p[11],p[10]});
-   rblk b_13_12 (H_13_12, I_13_12, {g[13],g[12]}, {p[12],p[11]});
-   rblk b_14_13 (H_14_13, I_14_13, {g[14],g[13]}, {p[13],p[12]});
-
-   // Stage 2: Generates G/P pairs that span 2 bits
-   grey g_2_0 (H_2_0, {H_2_1,g[0]}, I_2_1);
-   grey g_3_0 (H_3_0, {H_3_2,H_1_0}, I_3_2);
-   black b_4_1 (H_4_1, I_4_1, {H_4_3,H_2_1}, {I_4_3,I_2_1});
-   black b_5_2 (H_5_2, I_5_2, {H_5_4,H_3_2}, {I_5_4,I_3_2});
-   black b_6_3 (H_6_3, I_6_3, {H_6_5,H_4_3}, {I_6_5,I_4_3});
-   black b_7_4 (H_7_4, I_7_4, {H_7_6,H_5_4}, {I_7_6,I_5_4});
-   black b_8_5 (H_8_5, I_8_5, {H_8_7,H_6_5}, {I_8_7,I_6_5});
-   black b_9_6 (H_9_6, I_9_6, {H_9_8,H_7_6}, {I_9_8,I_7_6});
-
-   black b_10_7 (H_10_7, I_10_7, {H_10_9,H_8_7}, {I_10_9,I_8_7});
-   black b_11_8 (H_11_8, I_11_8, {H_11_10,H_9_8}, {I_11_10,I_9_8});
-   black b_12_9 (H_12_9, I_12_9, {H_12_11,H_10_9}, {I_12_11,I_10_9});
-   black b_13_10 (H_13_10, I_13_10, {H_13_12,H_11_10}, {I_13_12,I_11_10});
-   black b_14_11 (H_14_11, I_14_11, {H_14_13,H_12_11}, {I_14_13,I_12_11});
-
-   // Stage 3: Generates G/P pairs that span 4 bits
-   grey g_4_0 (H_4_0, {H_4_1,g[0]}, I_4_1);
-   grey g_5_0 (H_5_0, {H_5_2,H_1_0}, I_5_2);
-   grey g_6_0 (H_6_0, {H_6_3,H_2_0}, I_6_3);
-   grey g_7_0 (H_7_0, {H_7_4,H_3_0}, I_7_4);
-   black b_8_1 (H_8_1, I_8_1, {H_8_5,H_4_1}, {I_8_5,I_4_1});
-   black b_9_2 (H_9_2, I_9_2, {H_9_6,H_5_2}, {I_9_6,I_5_2});
-   black b_10_3 (H_10_3, I_10_3, {H_10_7,H_6_3}, {I_10_7,I_6_3});
-   black b_11_4 (H_11_4, I_11_4, {H_11_8,H_7_4}, {I_11_8,I_7_4});
-
-   black b_12_5 (H_12_5, I_12_5, {H_12_9,H_8_5}, {I_12_9,I_8_5});
-   black b_13_6 (H_13_6, I_13_6, {H_13_10,H_9_6}, {I_13_10,I_9_6});
-   black b_14_7 (H_14_7, I_14_7, {H_14_11,H_10_7}, {I_14_11,I_10_7});
-
-   // Stage 4: Generates G/P pairs that span 8 bits
-   grey g_8_0 (H_8_0, {H_8_1,g[0]}, I_8_1);
-   grey g_9_0 (H_9_0, {H_9_2,H_1_0}, I_9_2);
-   grey g_10_0 (H_10_0, {H_10_3,H_2_0}, I_10_3);
-   grey g_11_0 (H_11_0, {H_11_4,H_3_0}, I_11_4);
-   grey g_12_0 (H_12_0, {H_12_5,H_4_0}, I_12_5);
-   grey g_13_0 (H_13_0, {H_13_6,H_5_0}, I_13_6);
-   grey g_14_0 (H_14_0, {H_14_7,H_6_0}, I_14_7);
-
-   // Final Stage: Apply c_k+1=p_k&H_k_0
-   assign c[1]=g[0];
-
-   assign h[1]=H_1_0;		assign c[2]=p[1]&H_1_0;
-   assign h[2]=H_2_0;		assign c[3]=p[2]&H_2_0;
-   assign h[3]=H_3_0;		assign c[4]=p[3]&H_3_0;
-   assign h[4]=H_4_0;		assign c[5]=p[4]&H_4_0;
-   assign h[5]=H_5_0;		assign c[6]=p[5]&H_5_0;
-   assign h[6]=H_6_0;		assign c[7]=p[6]&H_6_0;
-   assign h[7]=H_7_0;		assign c[8]=p[7]&H_7_0;
-   assign h[8]=H_8_0;		assign c[9]=p[8]&H_8_0;
-
-   assign h[9]=H_9_0;		assign c[10]=p[9]&H_9_0;
-   assign h[10]=H_10_0;		assign c[11]=p[10]&H_10_0;
-   assign h[11]=H_11_0;		assign c[12]=p[11]&H_11_0;
-   assign h[12]=H_12_0;		assign c[13]=p[12]&H_12_0;
-   assign h[13]=H_13_0;		assign c[14]=p[13]&H_13_0;
-   assign h[14]=H_14_0;		assign c[15]=p[14]&H_14_0;
-
-endmodule // kogge_stone
--- a/wally-pipelined/src/fpu/black_gray_cells.sv
+++ b/wally-pipelined/src/fpu/black_gray_cells.sv
@ -1,43 +0,0 @@
-
-// Black cell
-module black(gout, pout, gin, pin);
-
-   input [1:0] gin, pin;
-   output      gout, pout;
-
-   assign pout=pin[1]&pin[0];
-   assign gout=gin[1]|(pin[1]&gin[0]);
-
-endmodule // black
-
-// Grey cell
-module grey(gout, gin, pin);
-
-   input[1:0] gin;
-   input      pin;
-   output     gout;
-
-   assign gout=gin[1]|(pin&gin[0]);
-
-endmodule // grey
-
-// reduced Black cell
-module rblk(hout, iout, gin, pin);
-
-   input [1:0] gin, pin;
-   output      hout, iout;
-
-   assign iout=pin[1]&pin[0];
-   assign hout=gin[1]|gin[0];
-
-endmodule // rblk
-
-// reduced Grey cell
-module rgry(hout, gin);
-
-   input[1:0] gin;
-   output     hout;
-
-   assign hout=gin[1]|gin[0];
-
-endmodule // rgry
--- a/wally-pipelined/src/fpu/cla12.sv
+++ b/wally-pipelined/src/fpu/cla12.sv
--- a/wally-pipelined/src/fpu/cla52.sv
+++ b/wally-pipelined/src/fpu/cla52.sv
--- a/wally-pipelined/src/fpu/cla64.sv
+++ b/wally-pipelined/src/fpu/cla64.sv
@ -207,7 +207,7 @@ module cla64 (S, X, Y, Sub);
   assign Bbar = B ^ {64{Sub}};
   
 endmodule // cla64
-
+ 
 // This module performs 64-bit subtraction. It is used to get the two's complement
 // of main addition or subtraction in the floating point adder. 

--- a/wally-pipelined/src/fpu/convert_inputs.sv
+++ b/wally-pipelined/src/fpu/convert_inputs.sv
@ -5,19 +5,19 @@
 // and modifies the sign of op1. The converted operands are Float1
 // and Float2.

-module convert_inputs(Float1, Float2, op1, op2, op_type, P);
-   
-   input [63:0]  op1;            // 1st input operand (A)
-   input [63:0]  op2;            // 2nd input operand (B)
-   input [3:0] 	 op_type;        // Function opcode
-   input 	 P;              // Result Precision (0 for double, 1 for single)
+module convert_inputs(
+   input [63:0]  op1,      // 1st input operand (A)
+   input [63:0]  op2,      // 2nd input operand (B)
+   input [3:0]   op_type,  // Function opcode
+   input 	     P,        // Result Precision (0 for double, 1 for single)

-   output [63:0] Float1;	// Converted 1st input operand
-   output [63:0] Float2;	// Converted 2nd input operand   
-   
-   wire 	 conv_SP;        // Convert from SP to DP
-   wire 	 negate;         // Operation is negation
-   wire 	 abs_val;        // Operation is absolute value
+   output [63:0] Float1,	// Converted 1st input operand
+   output [63:0] Float2	   // Converted 2nd input operand   
+);
+
+   wire 	 conv_SP;   // Convert from SP to DP
+   wire 	 negate;    // Operation is negation
+   wire 	 abs_val;   // Operation is absolute value
   wire 	 Zexp1;		// One if the exponent of op1 is zero
   wire 	 Zexp2;		// One if the exponent of op2 is zero
   wire 	 Oexp1;		// One if the exponent of op1 is all ones
@ -33,14 +33,6 @@ module convert_inputs(Float1, Float2, op1, op2, op_type, P);
   assign Zexp2 = ~(|op2[30:23]);
   assign Oexp1 =  (&op1[30:23]);
   assign Oexp2 =  (&op2[30:23]);
-   // assign Zexp1 = ~(op1[62] | op1[61] | op1[60] | op1[59] | 
-	// 	    op1[58] | op1[57] | op1[56] | op1[55]);
-   // assign Zexp2 = ~(op2[62] | op2[61] | op2[60] | op2[59] | 
-	// 	    op2[58] | op2[57] | op2[56] | op2[55]);
-   // assign Oexp1 =  (op1[62] & op1[61] & op1[60] & op1[59] & 
-	// 	    op1[58] & op1[57] & op1[56] & op1[55]);
-   // assign Oexp2 =  (op2[62] & op2[61] & op2[60] & op2[59] & 
-	// 	    op2[58] & op2[57] & op2[56] &op2[55]);

   // Conditionally convert op1. Lower 29 bits are zero for single precision.
   assign Float1[62:29] = conv_SP ? {op1[30], {3{(~op1[30]&~Zexp1)|Oexp1}}, op1[29:0]}
@ -57,7 +49,7 @@ module convert_inputs(Float1, Float2, op1, op2, op_type, P);
   // is negation (op_type = 101) or absolute value (op_type = 100)

   assign negate  = op_type[2] & ~op_type[1] & op_type[0];
-   assign abs_val = op_type[2] & ~op_type[1] & ~op_type[0];
+   assign abs_val = op_type[2] & ~op_type[1] & ~op_type[0]; //*** remove abs_val
   assign Float1[63]  = conv_SP ? (op1[31] ^ negate) & ~abs_val : (op1[63] ^ negate) & ~abs_val;
   assign Float2[63]  = conv_SP ? op2[31] : op2[63];

--- a/wally-pipelined/src/fpu/convert_inputs_div.sv
+++ b/wally-pipelined/src/fpu/convert_inputs_div.sv
@ -3,21 +3,22 @@
 // it conditionally converts single precision values to double 
 // precision values and modifies the sign of op1. 
 // The converted operands are Float1 and Float2.
-module convert_inputs_div (Float1, Float2b, op1, op2, op_type, P);
+module convert_inputs_div (
   
-   input logic [63:0]  op1;           // 1st input operand (A)
-   input logic [63:0]  op2;           // 2nd input operand (B)
-   input logic 	       P;             // Result Precision (0 for double, 1 for single)
-   input logic 	       op_type;       // Operation   
+   input logic [63:0]  op1,           // 1st input operand (A)
+   input logic [63:0]  op2,           // 2nd input operand (B)
+   input logic 	     P,             // Result Precision (0 for double, 1 for single)
+   input logic 	     op_type,       // Operation   

-   output logic [63:0] Float1;	      // Converted 1st input operand
-   output logic [63:0] Float2b;	      // Converted 2nd input operand   
+   output logic [63:0] Float1,	      // Converted 1st input operand
+   output logic [63:0] Float2b	      // Converted 2nd input operand   
+);

   logic [63:0]        Float2;   
-   logic 	       Zexp1;	      // One if the exponent of op1 is zero
-   logic 	       Zexp2;	      // One if the exponent of op2 is zero
-   logic 	       Oexp1;	      // One if the exponent of op1 is all ones
-   logic 	       Oexp2;	      // One if the exponent of op2 is all ones
+   logic 	           Zexp1;	      // One if the exponent of op1 is zero
+   logic 	           Zexp2;	      // One if the exponent of op2 is zero
+   logic 	           Oexp1;	      // One if the exponent of op1 is all ones
+   logic 	           Oexp2;	      // One if the exponent of op2 is all ones

   // Test if the input exponent is zero, because if it is then the
   // exponent of the converted number should be zero. 
--- a/wally-pipelined/src/fpu/divconv.sv
+++ b/wally-pipelined/src/fpu/divconv.sv
@ -1,25 +1,21 @@
-module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
-		regr_out, d, n, sel_muxa, sel_muxb, sel_muxr, reset, clk, load_rega, load_regb, 
-		load_regc, load_regd, load_regr, load_regs, P, op_type, exp_odd);
+module divconv (

-   input logic [52:0]   d, n;
-   input logic [2:0] 	sel_muxa, sel_muxb;
-   input logic 	        sel_muxr;   
-   input logic 	        load_rega, load_regb, load_regc, load_regd;
-   input logic 		load_regr, load_regs;
-   input logic 		P;
-   input logic 		op_type;
-   input logic 		exp_odd;   
-   input logic 	        reset;
-   input logic 	        clk;   
+   input logic [52:0]   d, n,
+   input logic [2:0] 	sel_muxa, sel_muxb,
+   input logic 	      sel_muxr,   
+   input logic 	      load_rega, load_regb, load_regc, load_regd,
+   input logic 		   load_regr, load_regs,
+   input logic 		   P,
+   input logic 		   op_type,
+   input logic 		   exp_odd,   
+   input logic 	      reset,
+   input logic 	      clk,   
   
-   output logic [63:0] 	q1, qp1, qm1;
-   output logic [63:0] 	q0, qp0, qm0;   
-   output logic [63:0] 	rega_out, regb_out, regc_out, regd_out;
-   output logic [127:0] regr_out;
-   
-   supply1 		vdd;
-   supply0 		vss;   
+   output logic [63:0] 	q1, qp1, qm1,
+   output logic [63:0] 	q0, qp0, qm0,   
+   output logic [63:0] 	rega_out, regb_out, regc_out, regd_out,
+   output logic [127:0] regr_out
+);

   logic [63:0] 	muxa_out, muxb_out;
   logic [10:0] 	ia_div, ia_sqrt;
@ -36,12 +32,12 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_o
   logic [63:0] 	q_const, qp_const, qm_const;
   logic [63:0] 	d2, n2;   
   logic [11:0] 	d3;   
-   logic muxr_out;
-   logic cout1, cout2, cout3, cout4, cout5, cout6, cout7;
+   logic          muxr_out;
+   logic          cout1, cout2, cout3, cout4, cout5, cout6, cout7;

   // Check if exponent is odd for sqrt
   // If exp_odd=1 and sqrt, then M/2 and use ia_addr=0 as IA
-   assign d2 = (exp_odd&op_type) ? {vss,d,10'h0} : {d,11'h0};
+   assign d2 = (exp_odd&op_type) ? {1'b0,d,10'h0} : {d,11'h0};
   assign n2 = op_type ? d2 : {n,11'h0};
   
   // IA div/sqrt
@ -62,10 +58,7 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_o
   mux2 #(64) mx4 (q0, q1, q1[63], mcand_q);
   mux2 #(64) mx5 (muxb_out, mcand_q, sel_muxr&op_type, mplier);   
   mux2 #(64) mx6 (muxa_out, mcand_q, sel_muxr, mcand);
-   // TDM multiplier (carry/save)
-   multiplier mult1 (mcand, mplier, Sum, Carry);
   // Q*D - N (reversed but changed in rounder.v to account for sign reversal)
-   csa #(128) csa1 (Sum, Carry, constant, Sum2, Carry2);
   // Add ulp for subtraction in remainder
   mux2 #(1) mx7 (1'b0, 1'b1, sel_muxr, muxr_out);

@ -74,24 +67,17 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_o
   mux2 #(64) mx9 ({64'h0000_0000_0000_0A00}, {64'h0000_0140_0000_0000}, P, qp_const);
   mux2 #(64) mxA ({64'hFFFF_FFFF_FFFF_F9FF}, {64'hFFFF_FF3F_FFFF_FFFF}, P, qm_const);
   
-   // CPA (from CSA)/Remainder addition/subtraction
-   // adder #(128) cpa1 (Sum2, Carry2, muxr_out, mul_out, cout1); 
-   assign {cout1, mul_out} = Sum2 + Carry2 + muxr_out;  
+   // CPA (from CSA)/Remainder addition/subtraction 
+   assign {cout1, mul_out} = (mcand*mplier) + constant + muxr_out;  
   
   // Assuming [1,2) - q1
-   // adder #(64) cpa2 (regb_out, q_const, 1'b0, q_out1, cout2);
   assign {cout2, q_out1} = regb_out + q_const;  
-   // adder #(64) cpa3 (regb_out, qp_const, 1'b0, qp_out1, cout3);
   assign {cout3, qp_out1} = regb_out + qp_const;  
-   // adder #(64) cpa4 (regb_out, qm_const, 1'b1, qm_out1, cout4);
   assign {cout4, qm_out1} = regb_out + qm_const + 1'b1;  
   // Assuming [0.5,1) - q0   
-   // adder #(64) cpa5 ({regb_out[62:0], vss}, q_const, 1'b0, q_out0, cout5);
-   assign {cout5, q_out0} = {regb_out[62:0], vss} + q_const;  
-   // adder #(64) cpa6 ({regb_out[62:0], vss}, qp_const, 1'b0, qp_out0, cout6);
-   assign {cout6, qp_out0} = {regb_out[62:0], vss} + qp_const;  
-   // adder #(64) cpa7 ({regb_out[62:0], vss}, qm_const, 1'b1, qm_out0, cout7);  
-   assign {cout7, qm_out0} = {regb_out[62:0], vss} + qm_const + 1'b1;    
+   assign {cout5, q_out0} = {regb_out[62:0], 1'b0} + q_const;  
+   assign {cout6, qp_out0} = {regb_out[62:0], 1'b0} + qp_const;  
+   assign {cout7, qm_out0} = {regb_out[62:0], 1'b0} + qm_const + 1'b1;    

   // One's complement instead of two's complement (for hw efficiency)
   assign three = {~mul_out[126], mul_out[126], ~mul_out[125:63]};   
@ -114,151 +100,3 @@ module divconv (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_o
   
 endmodule // divconv

-// module adder #(parameter WIDTH=8)
-//    (input  logic [WIDTH-1:0] a, b,
-//     input logic 	     cin,
-//     output logic [WIDTH-1:0] y,
-//     output logic 	     cout);
-   
-//    assign {cout, y} = a + b + cin;
-   
-// endmodule // adder
-
-// module flopenr #(parameter WIDTH = 8)
-//    (input  logic             clk, reset, en,
-//     input  logic [WIDTH-1:0] d, 
-//     output logic [WIDTH-1:0] q);
-
-//    always_ff @(posedge clk, posedge reset)
-//      if (reset)   q <= #10 0;
-//      else if (en) q <= #10 d;
-   
-// endmodule // flopenr
-
-// module flopr #(parameter WIDTH = 8)
-//    (input  logic             clk, reset,
-//     input  logic [WIDTH-1:0] d, 
-//     output logic [WIDTH-1:0] q);
-
-//    always_ff @(posedge clk, posedge reset)
-//      if (reset) q <= #10 0;
-//      else       q <= #10 d;
-   
-// endmodule // flopr
-
-// module flopenrc #(parameter WIDTH = 8)
-//    (input  logic             clk, reset, en, clear,
-//     input  logic [WIDTH-1:0] d, 
-//     output logic [WIDTH-1:0] q);
-
-//    always_ff @(posedge clk, posedge reset)
-//      if (reset)    q <= #10 0;
-//      else if (en) 
-//        if (clear) q <= #10 0;
-//        else       q <= #10 d;
-   
-// endmodule // flopenrc
-
-// module floprc #(parameter WIDTH = 8)
-//    (input  logic             clk, reset, clear,
-//     input  logic [WIDTH-1:0] d, 
-//     output logic [WIDTH-1:0] q);
-
-//    always_ff @(posedge clk, posedge reset)
-//      if (reset) q <= #10 0;
-//      else       
-//        if (clear) q <= #10 0;
-//        else       q <= #10 d;
-   
-// endmodule // floprc
-
-// module mux2 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, 
-//     input  logic             s, 
-//     output logic [WIDTH-1:0] y);
-
-//    assign y = s ? d1 : d0;
-   
-// endmodule // mux2
-
-// module mux3 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, d2,
-//     input  logic [1:0]       s, 
-//     output logic [WIDTH-1:0] y);
-
-//    assign y = s[1] ? d2 : (s[0] ? d1 : d0);
-   
-// endmodule // mux3
-
-// module mux4 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, d2, d3,
-//     input  logic [1:0]       s, 
-//     output logic [WIDTH-1:0] y);
-
-//    assign y = s[1] ? (s[0] ? d3 : d2) : (s[0] ? d1 : d0);
-
-// endmodule // mux4
-
-// module mux5 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4,
-//     input  logic [2:0]       s,
-//     output logic [WIDTH-1:0] y);
-   
-//    always_comb
-//      casez (s)
-//        3'b000 : y = d0;       
-//        3'b001 : y = d1;
-//        3'b010 : y = d2;
-//        3'b011 : y = d3;
-//        3'b1?? : y = d4;
-//      endcase // casez (s)
-
-// endmodule // mux5
-
-// module mux6 #(parameter WIDTH = 8)
-//    (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4, d5,
-//     input  logic [2:0]       s,
-//     output logic [WIDTH-1:0] y);
-   
-//    always_comb
-//      casez (s)
-//        3'b000 : y = d0;       
-//        3'b001 : y = d1;
-//        3'b010 : y = d2;
-//        3'b011 : y = d3;
-//        3'b10? : y = d4;
-//        3'b11? : y = d5;       
-//      endcase // casez (s)
-
-// endmodule // mux6
-
-module eqcmp #(parameter WIDTH = 8)
-   (input  logic [WIDTH-1:0] a, b,
-    output logic             y);
-
-   assign y = (a == b);
-   
-endmodule // eqcmp
-
-// module fa (input logic a, b, c, output logic sum, carry);
-
-//    assign sum = a^b^c;
-//    assign carry = a&b|a&c|b&c;   
-
-// endmodule // fa
-
-// module csa #(parameter WIDTH=8) 
-//    (input logic [WIDTH-1:0] a, b, c,
-//     output logic [WIDTH-1:0] sum, carry);
-
-//    logic [WIDTH:0] 	     carry_temp;   
-//    genvar 		     i;
-//    generate
-//       for (i=0;i<WIDTH;i=i+1)
-// 	begin : genbit
-// 	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
-// 	end
-//    endgenerate
-//    assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};     
-   
-// endmodule // csa
--- a/wally-pipelined/src/fpu/exception.sv
+++ b/wally-pipelined/src/fpu/exception.sv
@ -115,6 +115,6 @@ module exception (Ztype, Invalid, Denorm, ANorm, BNorm, Sub, A, B, op_type);

   // Determine if the effective operation is subtraction
   assign Sub = ~(op_type[3] & ~op_type[0]) & ( (op_type[3] & op_type[0]) | (add_sub & (A[63]^B[63]^op_type[0])) );
-
+ 
 endmodule // exception

--- a/wally-pipelined/src/fpu/exception_div.sv
+++ b/wally-pipelined/src/fpu/exception_div.sv
@ -1,16 +1,13 @@
 // Exception logic for the floating point adder. Note: We may 
 // actually want to move to where the result is computed.
-module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
+module exception_div (

-   input logic [63:0] A;		// 1st input operand (op1)
-   input logic [63:0] B;		// 2nd input operand (op2)
-   input logic 	      op_type;          // Determine operation   
-   
-   output logic [2:0] Ztype;		// Indicates type of result (Z)
-   output logic       Invalid;	        // Invalid operation exception
-   output logic       Denorm;		// Denormalized input
-   output logic       ANorm;            // A is not zero or Denorm
-   output logic       BNorm;            // B is not zero or Denorm
+   input logic [63:0] A,		// 1st input operand (op1)
+   input logic [63:0] B,		// 2nd input operand (op2)
+   input logic 	    op_type,   // Determine operation   
+   output logic [2:0] Ztype,		// Indicates type of result (Z)
+   output logic       Invalid	// Invalid operation exception
+);
   
   logic 	      AzeroM;	 	// '1' if the mantissa of A is zero
   logic 	      BzeroM;		// '1' if the mantissa of B is zero
@ -18,8 +15,6 @@ module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
   logic 	      BzeroE;		// '1' if the exponent of B is zero
   logic 	      AonesE;	 	// '1' if the exponent of A is all ones
   logic 	      BonesE;		// '1' if the exponent of B is all ones
-   logic 	      ADenorm; 	        // '1' if A is a denomalized number
-   logic 	      BDenorm; 	        // '1' if B is a denomalized number
   logic 	      AInf;	 	// '1' if A is infinite
   logic 	      BInf;	 	// '1' if B is infinite
   logic 	      AZero;	 	// '1' if A is 0
@ -32,11 +27,10 @@ module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
   logic 	      ZInf;	 	// '1' if result Z is an infinity
   logic 	      Zero;             // '1' if result is zero   
   
-   parameter [51:0]  fifty_two_zeros = 52'h0; // Use parameter?

   // Determine if mantissas are all zeros
-   assign AzeroM = (A[51:0] == fifty_two_zeros);
-   assign BzeroM = (B[51:0] == fifty_two_zeros);
+   assign AzeroM = (A[51:0] == 52'h0);
+   assign BzeroM = (B[51:0] == 52'h0);

   // Determine if exponents are all ones or all zeros 
   assign AonesE = A[62]&A[61]&A[60]&A[59]&A[58]&A[57]&A[56]&A[55]&A[54]&A[53]&A[52];
@ -45,8 +39,6 @@ module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
   assign BzeroE = ~(B[62]|B[61]|B[60]|B[59]|B[58]|B[57]|B[56]|B[55]|B[54]|B[53]|B[52]);

   // Determine special cases. Note: Zero is not really a special case. 
-   assign ADenorm = AzeroE & ~AzeroM;
-   assign BDenorm = BzeroE & ~BzeroM;
   assign AInf = AonesE & AzeroM;
   assign BInf = BonesE & BzeroM;
   assign ANaN = AonesE & ~AzeroM;
@ -56,17 +48,11 @@ module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
   // An operand is zero only when both its exponent field and its mantissa
   // field are all zeros (a zero exponent with a nonzero mantissa is a denormal).
   assign AZero = AzeroE & AzeroM;
   assign BZero = BzeroE & BzeroM;

-   // A and B are normalized if their exponents are not zero. 
-   assign ANorm = ~AzeroE;
-   assign BNorm = ~BzeroE;
-
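   // An "Invalid Operation" exception occurs if (A or B is a signalling NaN),
   // or, when op_type is 0, if (A and B are both infinite) or (A and B are both zero),
   // or, when op_type is 1, if the sign bit of A is set.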
   assign Invalid = ASNaN | BSNaN | (((AInf & BInf) | (AZero & BZero))&~op_type) | 
 		    (A[63] & op_type);

-   // The Denorm flag is set if A is denormlized or if B is normalized 
-   assign Denorm = ADenorm | BDenorm;

   // The result is a quiet NaN if (an "Invalid Operation" exception occurs) 
   // or (A is a NaN) or (B is a NaN).
--- a/wally-pipelined/src/fpu/faddcvt.sv
+++ b/wally-pipelined/src/fpu/faddcvt.sv
@ -29,14 +29,14 @@
 module faddcvt(
   input logic          clk,
   input logic          reset,
-   input logic          FlushM,
-   input logic          StallM,
+   input logic          FlushM,     // flush the memory stage
+   input logic          StallM,     // stall the memory stage
   input logic  [63:0]  FSrcXE,		// 1st input operand (A)
   input logic  [63:0]  FSrcYE,		// 2nd input operand (B)
   input logic  [3:0]   FOpCtrlE, FOpCtrlM,	// Function opcode
-   input logic          FmtE, FmtM,   		// Result Precision (0 for double, 1 for single)
-   input logic  [2:0] 	FrmM,		// Rounding mode - specify values 
-   output logic [63:0]  FAddResM,	// Result of operation
+   input logic          FmtE, FmtM,   	// Result Precision (1 for double, 0 for single)
+   input logic  [2:0] 	FrmM,		      // Rounding mode - specify values 
+   output logic [63:0]  FAddResM,	   // Result of operation
   output logic [4:0]   FAddFlgM);   	// IEEE exception flags 
   
   logic [63:0] 	AddSumE, AddSumM;
@ -51,7 +51,6 @@ module faddcvt(
   logic          AddInvalidE, AddInvalidM;
   logic 		   AddDenormInE, AddDenormInM;
   logic          AddSwapE, AddSwapM;
-   logic          AddNormOvflowE, AddNormOvflowM; //***this isn't used in addcvt2
   logic          AddSignAE, AddSignAM;
   logic 		   AddConvertE, AddConvertM;
   logic [63:0] 	AddFloat1E, AddFloat2E, AddFloat1M, AddFloat2M;
@ -62,8 +61,9 @@ module faddcvt(
   fpuaddcvt1 fpadd1 (.FSrcXE, .FSrcYE, .FOpCtrlE, .FmtE, .AddFloat1E, .AddFloat2E, .AddExponentE, 
                     .AddExpPostSumE, .AddExp1DenormE, .AddExp2DenormE, .AddSumE, .AddSumTcE, .AddSelInvE, 
                     .AddCorrSignE, .AddSignAE, .AddOp1NormE, .AddOp2NormE, .AddOpANormE, .AddOpBNormE, .AddInvalidE, 
-                     .AddDenormInE, .AddConvertE, .AddSwapE, .AddNormOvflowE);
+                     .AddDenormInE, .AddConvertE, .AddSwapE);

+   // E/M pipeline registers
   flopenrc #(64) EMRegAdd1(clk, reset, FlushM, ~StallM, AddSumE, AddSumM); 
   flopenrc #(64) EMRegAdd2(clk, reset, FlushM, ~StallM, AddSumTcE, AddSumTcM); 
   flopenrc #(11) EMRegAdd3(clk, reset, FlushM, ~StallM, AddExpPostSumE, AddExpPostSumM); 
@ -72,9 +72,9 @@ module faddcvt(
   flopenrc #(12) EMRegAdd6(clk, reset, FlushM, ~StallM, AddExp1DenormE, AddExp1DenormM); 
   flopenrc #(12) EMRegAdd7(clk, reset, FlushM, ~StallM, AddExp2DenormE, AddExp2DenormM); 
   flopenrc #(11) EMRegAdd8(clk, reset, FlushM, ~StallM, AddExponentE, AddExponentM);
-   flopenrc #(15) EMRegAdd9(clk, reset, FlushM, ~StallM, 
-                           {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE},
-                           {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM}); 
+   flopenrc #(14) EMRegAdd9(clk, reset, FlushM, ~StallM, 
+                           {AddSelInvE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddSignAE},
+                           {AddSelInvM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM}); 

                     
   fpuaddcvt2 fpadd2 (.FrmM, .FOpCtrlM, .FmtM, .AddSumM, .AddSumTcM, .AddFloat1M, .AddFloat2M, 
@ -83,53 +83,52 @@ module faddcvt(
                     .AddSignAM, .AddCorrSignM, .AddConvertM, .AddSwapM, .FAddResM, .FAddFlgM);
 endmodule

-module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, FSrcXE, FSrcYE, FOpCtrlE, FmtE);
+module fpuaddcvt1 (
+   input logic [63:0]   FSrcXE,		// 1st input operand (A)
+   input logic [63:0]   FSrcYE,		// 2nd input operand (B)
+   input logic [3:0]	   FOpCtrlE,	// Function opcode
+   input logic 	      FmtE,   		// Result Precision (1 for double, 0 for single)

-   input logic [63:0] FSrcXE;		// 1st input operand (A)
-   input logic [63:0] FSrcYE;		// 2nd input operand (B)
-   input logic [3:0]	FOpCtrlE;	// Function opcode
-   input logic 	FmtE;   		// Result Precision (1 for double, 0 for single)
+   output logic [63:0] 	AddFloat1E, 
+   output logic [63:0] 	AddFloat2E,
+   output logic [10:0] 	AddExponentE,
+   output logic [10:0]	AddExpPostSumE,
+   output logic [11:0]  AddExp1DenormE, AddExp2DenormE,//KEP used to be [10:0]
+   output logic [63:0]  AddSumE, AddSumTcE,
+   output logic [3:0]   AddSelInvE,
+   output logic         AddCorrSignE,
+   output logic 	      AddSignAE,
+   output logic	      AddOp1NormE, AddOp2NormE,
+   output logic	      AddOpANormE, AddOpBNormE,
+   output logic	      AddInvalidE,
+   output logic 	      AddDenormInE,
+   output logic 	      AddConvertE,
+   output logic         AddSwapE
+   );
+
+   wire [5:0]	 ZP_mantissaA;
+   wire [5:0]	 ZP_mantissaB;
+   wire		    ZV_mantissaA;
+   wire		    ZV_mantissaB;

   wire          P;
   assign P = ~FmtE;

-   wire [63:0] 	 IntValue;
-   wire [11:0] 	 exp1, exp2;
-   wire [11:0] 	 exp_diff1, exp_diff2;
-   wire [11:0] 	 exp_shift;
-   wire [51:0] 	 mantissaA;
-   wire [56:0] 	 mantissaA1;
-   wire [63:0] 	 mantissaA3;
-   wire [51:0] 	 mantissaB; 
-   wire [56:0] 	 mantissaB1, mantissaB2;
-   wire [63:0] 	 mantissaB3;
-   wire 	 exp_gt63;
-   wire 	 Sticky_out;
-   wire          sub;
-   wire 	 zeroB;
-   wire [5:0]	 align_shift; 
-
-   output logic [63:0] 	 AddFloat1E; 
-   output logic [63:0] 	 AddFloat2E;
-   output logic [10:0] 	 AddExponentE;
-   output logic [10:0]	 AddExpPostSumE;
-   output logic [11:0]	 AddExp1DenormE, AddExp2DenormE;//KEP used to be [10:0]
-   output logic [63:0] AddSumE, AddSumTcE;
-   output logic [3:0]  AddSelInvE;
-   output logic        AddCorrSignE;
-   output logic 	 AddSignAE;
-   output logic	 AddOp1NormE, AddOp2NormE;
-   output logic	 AddOpANormE, AddOpBNormE;
-   output logic	 AddInvalidE;
-   output logic 	 AddDenormInE;
-//   output logic 	 exp_valid;
-   output logic 	 AddConvertE;
-   output logic        AddSwapE;
-   output logic 	 AddNormOvflowE;
-   wire [5:0]	 ZP_mantissaA;
-   wire [5:0]	 ZP_mantissaB;
-   wire		 ZV_mantissaA;
-   wire		 ZV_mantissaB;
+   wire [63:0] IntValue;
+   wire [11:0] exp1, exp2;
+   wire [11:0] exp_diff1, exp_diff2;
+   wire [11:0] exp_shift;
+   wire [51:0] mantissaA;
+   wire [56:0] mantissaA1;
+   wire [63:0] mantissaA3;
+   wire [51:0] mantissaB; 
+   wire [56:0] mantissaB1, mantissaB2;
+   wire [63:0] mantissaB3;
+   wire 	      exp_gt63;
+   wire 	      Sticky_out;
+   wire        sub;
+   wire 	      zeroB;
+   wire [5:0]	align_shift;

   // Convert the input operands to their appropriate forms based on 
   // the original operands, FOpCtrlE, and their precision P. 
@ -137,7 +136,7 @@ module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE,
   // and the sign of the first operand is set appropriately based on
   // whether the operation is absolute value or negation. 

-   convert_inputs conv1 (AddFloat1E, AddFloat2E, FSrcXE, FSrcYE, FOpCtrlE, P);
+   convert_inputs conv1 (.Float1(AddFloat1E), .Float2(AddFloat2E), .op1(FSrcXE), .op2(FSrcYE), .op_type(FOpCtrlE), .P);

   // Test for exceptions and return the "Invalid Operation" and
   // "Denormalized" Input Flags. The "AddSelInvE" is used in
@ -247,7 +246,7 @@ module fpuaddcvt1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE,
 
   // Finds normal underflow result to determine whether to round final exponent down
   //***KEP used to be (AddSumE == 16'h0) I am unsure what it's supposed to be
-   assign AddNormOvflowE = (AddDenormInE & (AddSumE == 64'h0) & (AddOpANormE | AddOpBNormE) & ~FOpCtrlE[0]) ? 1'b1 : (AddSumE[63] ? AddSumTcE[52] : AddSumE[52]);
+   // assign AddNormOvflowE = (AddDenormInE & (AddSumE == 64'h0) & (AddOpANormE | AddOpBNormE) & ~FOpCtrlE[0]) ? 1'b1 : (AddSumE[63] ? AddSumTcE[52] : AddSumE[52]);

 endmodule // fpadd

@ -281,32 +280,28 @@ endmodule // fpadd
 //


-module fpuaddcvt2 (FAddResM, FAddFlgM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, FrmM, FOpCtrlM, FmtM);
+module fpuaddcvt2 (
+   input [2:0] 	FrmM,		// Rounding mode - specify values 
+   input [3:0]	FOpCtrlM,	// Function opcode
+   input 	FmtM,   		// Result Precision (1 for double, 0 for single)
+   input [63:0] AddSumM, AddSumTcM,
+   input [63:0] 	 AddFloat1M, 
+   input [63:0] 	 AddFloat2M,
+   input [11:0]	 AddExp1DenormM, AddExp2DenormM,
+   input [10:0] 	 AddExponentM, AddExpPostSumM,
+   input [3:0] 	 AddSelInvM,
+   input		 AddOp1NormM, AddOp2NormM,
+   input		 AddOpANormM, AddOpBNormM,
+   input		 AddInvalidM,
+   input 	 AddDenormInM, 
+   input 	 AddSignAM, 
+   input         AddCorrSignM,
+   input 	 AddConvertM,
+   input          AddSwapM,

-   input [2:0] 	FrmM;		// Rounding mode - specify values 
-   input [3:0]	FOpCtrlM;	// Function opcode
-   input 	FmtM;   		// Result Precision (0 for double, 1 for single)
-   // input 	AddOvEnM;		// Overflow trap enabled
-   // input 	AddUnEnM;   	// Underflow trap enabled
-   input [63:0] AddSumM, AddSumTcM;
-   input [63:0] 	 AddFloat1M; 
-   input [63:0] 	 AddFloat2M;
-   input [11:0]	 AddExp1DenormM, AddExp2DenormM;
-   input [10:0] 	 AddExponentM, AddExpPostSumM; //exp_pre;
-   //input		 exp_valid;
-   input [3:0] 	 AddSelInvM;
-   input		 AddOp1NormM, AddOp2NormM;
-   input		 AddOpANormM, AddOpBNormM;
-   input		 AddInvalidM;
-   input 	 AddDenormInM; 
-   input 	 AddSignAM; 
-   input         AddCorrSignM;
-   input 	 AddConvertM;
-   input          AddSwapM;
-   // input 	 AddNormOvflowM;
-
-   output [63:0] FAddResM;	// Result of operation
-   output [4:0]  FAddFlgM;   	// IEEE exception flags 
+   output [63:0] FAddResM,	// Result of operation
+   output [4:0]  FAddFlgM   	// IEEE exception flags 
+);
   wire 	 AddDenormM;   	// AddDenormM on input or output   

   wire          P;
@ -322,7 +317,6 @@ module fpuaddcvt2 (FAddResM, FAddFlgM, AddSumM, AddSumTcM, AddSelInvM, AddExpPos
   wire 	 Sticky_out;
   wire 	 sign_corr;
   wire 	 zeroB;         
-   wire [10:0]	 AddExpPostSumM;
   wire 	 mantissa_comp;
   wire 	 mantissa_comp_sum;
   wire 	 mantissa_comp_sum_tc;
--- a/wally-pipelined/src/fpu/fclassify.sv
+++ b/wally-pipelined/src/fpu/fclassify.sv
@ -2,19 +2,21 @@
 `include "wally-config.vh"

 module fclassify (
-    input  logic XSgnE,
-    input logic XNaNE, 
-    input logic XSNaNE,
-    input logic XNormE,
-    input logic XDenormE,
-    input logic XZeroE,
-    input logic XInfE,
-    output logic [63:0] ClassResE
+    input logic         XSgnE,  // sign bit
+    input logic         XNaNE,  // is NaN
+    input logic         XSNaNE, // is signaling NaN
+    input logic         XNormE, // is normal
+    input logic         XDenormE, // is denormal
+    input logic         XZeroE, // is zero
+    input logic         XInfE,  // is infinity
+    output logic [63:0] ClassResE // classify result
    );

    logic PInf, PZero, PNorm, PDenorm;
    logic NInf, NZero, NNorm, NDenorm;

+   
+    // determine the sub categories
    assign PInf = ~XSgnE&XInfE;
    assign NInf = XSgnE&XInfE;
    assign PNorm = ~XSgnE&XNormE;
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@ -1,20 +1,21 @@

 module fctrl (
-  input  logic [6:0] Funct7D,
-  input  logic [6:0] OpD,
-  input  logic [4:0] Rs2D,
-  input  logic [2:0] Funct3D,
-  input  logic [2:0] FRM_REGW,
-  output logic       IllegalFPUInstrD,
-  output logic       FRegWriteD,
-  output logic       FDivStartD,
-  output logic [2:0] FResultSelD,
-  output logic [3:0] FOpCtrlD,
-  output logic [1:0] FResSelD,
-  output logic [1:0] FIntResSelD,
-  output logic       FmtD,
-  output logic [2:0] FrmD,
-  output logic       FWriteIntD);
+  input  logic [6:0] Funct7D,   // bits 31:25 of instruction - may contain precision
+  input  logic [6:0] OpD,       // bits 6:0 of instruction
+  input  logic [4:0] Rs2D,      // bits 24:20 of instruction
+  input  logic [2:0] Funct3D,   // bits 14:12 of instruction - may contain rounding mode
+  input  logic [2:0] FRM_REGW,  // rounding mode from CSR
+  output logic       IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+  output logic       FRegWriteD,  // FP register write enable
+  output logic       FDivStartD,  // Start division or square root
+  output logic [2:0] FResultSelD, // select result to be written to fp register
+  output logic [3:0] FOpCtrlD,    // chooses which operation to do - specifics shown at bottom of module and in each unit
+  output logic [1:0] FResSelD,    // select one of the results done in the memory stage
+  output logic [1:0] FIntResSelD, // select the result that will be written to the integer register
+  output logic       FmtD,        // precision - single-0 double-1
+  output logic [2:0] FrmD,        // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+  output logic       FWriteIntD   // is the result written to the integer register
+  );

  `define FCTRLW 15
  logic [`FCTRLW-1:0] ControlsD;
@ -100,16 +101,43 @@ module fctrl (
                  endcase
      default:      ControlsD = `FCTRLW'b0_0_000_0000_00_00_0_1; // non-implemented instruction
    endcase
+
  // unswizzle control bits
  assign {FRegWriteD, FWriteIntD, FResultSelD, FOpCtrlD, FResSelD, FIntResSelD, FDivStartD, IllegalFPUInstrD} = ControlsD;
  
-  // if dynamic rounding, choose FRM_REGW
+  // rounding modes:
+  //    000 - round to nearest, ties to even
+  //    001 - round towards 0 - round to min magnitude
+  //    010 - round down - round towards negative infinity
+  //    011 - round up - round towards positive infinity
+  //    100 - round to nearest, ties to max magnitude - round to nearest, ties away from zero
+  //    111 - dynamic - choose FRM_REGW as rounding mode
  assign FrmD = &Funct3D ? FRM_REGW : Funct3D;

  // Precision
-  //  0-single
-  //  1-double
+  //    0-single
+  //    1-double
  assign FmtD = FResultSelD == 3'b000 ? Funct3D[0] : OpD[6:1] == 6'b010000 ? ~Funct7D[0] : Funct7D[0];
+
+  // FResultSel:
+  //    000 - ReadRes - load
+  //    001 - FMARes  - FMA and multiply
+  //    010 - FAddRes - add and fp to fp
+  //    011 - FDivRes - divide and squareroot
+  //    100 - FRes    - anything that is written to the fp register and is ready in the memory stage
+  //        FResSel:
+  //            00 - SrcA   - move to fp register 
+  //            01 - SgnRes - sign injection
+  //            10 - CmpRes - min/max
+  //            11 - CvtRes - convert to fp
+  
+  // FIntResSel:
+  //    00 - CmpRes   - less than, equal, or less than or equal 
+  //    01 - FSrcX    - move to int register
+  //    10 - ClassRes - classify
+  //    11 - CvtRes   - convert to signed/unsigned int
+
+  // OpCtrl values: 
  // div/sqrt
      //  fdiv  = ???0
      //  fsqrt = ???1
@ -120,7 +148,7 @@ module fctrl (
      //  feq  = ?010
      //  flt  = ?001
      //  fle  = ?011
-      //		   {?,    is min or max, is eq or le, is lt or le}
+      //  {?,  is min or max,   is eq or le,   is lt or le}

  //fma/mult	
      //  fmadd  = ?000
@ -128,7 +156,7 @@ module fctrl (
      //  fnmsub = ?010	-(a*b)+c
      //  fnmadd = ?011 -(a*b)-c
      //  fmul   = ?100
-      //		  {?, is mul, is negitive, is sub}
+      //	{?, is mul, negate product, negate addend}

  // sgn inj
      //  fsgnj  = ??00
@ -138,37 +166,28 @@ module fctrl (
  // add/sub/cnvt
      //  fadd      = 0000
      //  fsub      = 0001
-  // cnvt
+      //  fcvt.s.d  = 0111
+      //  fcvt.d.s  = 0111
+      //  Fmt controls the output for fp -> fp
+      
+  // convert
      //  fcvt.w.s  = 0010
      //  fcvt.wu.s = 0110
      //  fcvt.s.w  = 0001
      //  fcvt.s.wu = 0101
-      //  fcvt.s.d  = 0000
      //  fcvt.l.s  = 1010
      //  fcvt.lu.s = 1110
      //  fcvt.s.l  = 1001
      //  fcvt.s.lu = 1101
-      //  fcvt.w.d  = 0010
+      //  fcvt.w.d  = 0010 
      //  fcvt.wu.d = 0110
      //  fcvt.d.w  = 0001
      //  fcvt.d.wu = 0101
-      //  fcvt.d.s  = 0000
      //  fcvt.l.d  = 1010
      //  fcvt.lu.d = 1110
      //  fcvt.d.l  = 1001
      //  fcvt.d.lu = 1101
-      //  {long, unsigned, to int, from int} Fmt controls the output for fp -> fp
-
-      //  fmv.w.x = ???0
-      //  fmv.w.d = ???1
-
-      //  flw       = ?000
-      //  fld       = ?001 
-      //  fsw       = ?010
-      //  fsd       = ?011
-      //  fmv.x.w  = ?100
-      //  fmv.x.d  = ?101
-      //		   {?, is mv, is store, is double or fmv}
+      //  {long, unsigned, to int, from int}
    

 endmodule
--- a/wally-pipelined/src/fpu/fcvt.sv
+++ b/wally-pipelined/src/fpu/fcvt.sv
@ -1,36 +1,37 @@

 `include "wally-config.vh"
+// `include "../../config/rv64icfd/wally-config.vh"
 module fcvt (
-	input logic        XSgnE,
-    input logic [10:0] XExpE,
-    input logic [52:0] XManE,
-    input logic XZeroE,
-    input logic XNaNE,
-    input logic XInfE,
-    input logic XDenormE,
-    input logic [10:0] BiasE,
-    input logic [`XLEN-1:0] SrcAE,  // integer input
-    input logic [3:0] FOpCtrlE,     // chooses which instruction is done (full list below)
-    input logic [2:0] FrmE,         // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
-    input logic FmtE,               // precision 1 = double 0 = single
-    output logic [63:0] CvtResE,    // convert final result
-    output logic [4:0] CvtFlgE);     // convert flags {invalid, divide by zero, overflow, underflow, inexact}
+	input logic             XSgnE,      // X's sign
+    input logic [10:0]      XExpE,      // X's exponent
+    input logic [52:0]      XManE,     // X's mantissa (53 bits - presumably the fraction plus its leading bit; confirm against the unpacker)
+    input logic             XZeroE,     // is X zero
+    input logic             XNaNE,      // is X NaN 
+    input logic             XInfE,      // is X infinity
+    input logic             XDenormE,   // is X denormalized
+    input logic [10:0]      BiasE,      // bias - depends on precision (max exponent/2)
+    input logic [`XLEN-1:0] SrcAE,      // integer input
+    input logic [3:0]       FOpCtrlE,   // chooses which instruction is done (full list below)
+    input logic [2:0]       FrmE,       // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic             FmtE,       // precision 1 = double 0 = single
+    output logic [63:0]     CvtResE,    // convert final result
+    output logic [4:0]      CvtFlgE);   // convert flags {invalid, divide by zero, overflow, underflow, inexact}

-    logic               ResSgn; // FP result's sign
-    logic [10:0]        ResExp,TmpExp; // FP result's exponent
-    logic [51:0]        ResFrac;    // FP result's fraction
-    logic [5:0]         LZResP;     // lz output
-    logic [7:0]         Bits;       // how many bits are in the integer result
-    logic [7:0]         SubBits;    // subtract these bits from the exponent (FP result)
-    logic [64+51:0]  ShiftedManTmp; // Shifted mantissa
-    logic [64+51:0]  ShiftVal;       // value being shifted (to int - XMan, to FP - |integer input|)
-    logic [64+1:0]   ShiftedMan;     // shifted mantissa truncated
+    logic               ResSgn;         // FP result's sign
+    logic [10:0]        ResExp,TmpExp;  // FP result's exponent
+    logic [51:0]        ResFrac;        // FP result's fraction
+    logic [5:0]         LZResP;         // lz output
+    logic [7:0]         Bits;           // how many bits are in the integer result
+    logic [7:0]         SubBits;        // subtract these bits from the exponent (FP result)
+    logic [64+51:0]     ShiftedManTmp;  // Shifted mantissa
+    logic [64+51:0]     ShiftVal;       // value being shifted (to int - XMan, to FP - |integer input|)
+    logic [64+1:0]      ShiftedMan;     // shifted mantissa truncated
    logic [64:0]	    RoundedTmp;     // full size rounded result - in case of overflow
    logic [63:0]	    Rounded;        // rounded result
    logic [12:0]        ExpVal;         // unbiased X exponent
    logic [12:0]        ShiftCnt;       // how much is the mantissa shifted
-	logic [64-1:0]   IntIn;          // trimed integer input
-    logic [64-1:0]   PosInt;         // absolute value of the integer input
+	logic [64-1:0]      IntIn;          // trimmed integer input
+    logic [64-1:0]      PosInt;         // absolute value of the integer input
    logic [63:0]        CvtIntRes;      // integer result from the fp -> int instructions
    logic [63:0]        CvtFPRes;       // floating point result from the int -> fp instructions
    logic               Of, Uf;         // did the integer result underflow or overflow
@ -61,11 +62,9 @@ module fcvt (
      //  {long, unsigned, to int, from int}
   
    // calculate signals based off the input and output's size
-    // assign Bias = FmtE ? 12'h3ff : 12'h7f;
-    assign Res64 = ((FOpCtrlE==4'b1010 || FOpCtrlE==4'b1110) | (FmtE&(FOpCtrlE==4'b0001 | FOpCtrlE==4'b0101 | FOpCtrlE==4'b0000 | FOpCtrlE==4'b1001 | FOpCtrlE==4'b1101)));
-    assign In64 = ((FOpCtrlE==4'b1001 || FOpCtrlE==4'b1101) | (FmtE&(FOpCtrlE==4'b0010 | FOpCtrlE==4'b0110 | FOpCtrlE==4'b1010 | FOpCtrlE==4'b1110) | (FOpCtrlE==4'b1101 & ~FmtE)));
-    //assign SubBits = In64 ? 8'd64 : 8'd32;
-    assign SubBits = 8'd64;
+    assign Res64 = (FOpCtrlE[1]&FOpCtrlE[3]) | (FmtE&FOpCtrlE[0]);
+    assign In64 =  (FOpCtrlE[0]&FOpCtrlE[3]) | (FmtE&FOpCtrlE[1]);
+    assign SubBits = In64 ? 8'd64 : 8'd32;
    assign Bits = Res64 ? 8'd64 : 8'd32;

    // calculate the unbiased exponent
@ -80,15 +79,6 @@ module fcvt (
    // determine the integer's sign
    assign ResSgn = ~FOpCtrlE[2] ? IntIn[64-1] : 1'b0;
    
-    // generate
-    //     if(`XLEN == 64) 
-    //         lz64 lz(LZResP, LZResV, PosInt);
-    //     else if(`XLEN == 32) begin
-    //         assign LZResP[5] = 1'b0;
-    //         lz32 lz(LZResP[4:0], LZResV, PosInt);
-    //     end 
-    // endgenerate
-
 	// Leading one detector
 	logic [8:0]	i;
 	always_comb begin
@ -98,7 +88,7 @@ module fcvt (
 	end

    // if no one was found set to zero otherwise calculate the exponent
-    assign TmpExp = i==`XLEN ? 0 : BiasE + SubBits - LZResP;
+    assign TmpExp = i==`XLEN ? 0 : FmtE ? 1023 + SubBits - LZResP : 127 + SubBits - LZResP;



--- a/wally-pipelined/src/fpu/fdivsqrt.sv
+++ b/wally-pipelined/src/fpu/fdivsqrt.sv
@ -1,256 +0,0 @@
-// //
-// // File name : fpdiv
-// // Title     : Floating-Point Divider/Square-Root
-// // project   : FPU
-// // Library   : fpdiv
-// // Author(s) : James E. Stine, Jr.
-// // Purpose   : definition of main unit to floating-point div/sqrt
-// // notes :   
-// //
-// // Copyright Oklahoma State University
-// //
-// // Basic Operations
-// //
-// // Step 1: Load operands, set flags, and convert SP to DP
-// // Step 2: Check for special inputs ( +/- Infinity,  NaN)
-// // Step 3: Exponent Logic
-// // Step 4: Divide/Sqrt using Goldschmidt
-// // Step 5: Normalize the result.//
-// //   Shift left until normalized.  Normalized when the value to the 
-// //   left of the binrary point is 1.
-// // Step 6: Round the result.// 
-// // Step 7: Put quotient/remainder onto output.
-// //
-
-// // `timescale 1ps/1ps
-// module fdivsqrt (FDivSqrtDoneE, FDivResultM, FDivSqrtFlgM, DivInput1E, DivInput2E, FrmE, DivOpType, FmtE, DivOvEn, DivUnEn,
-// 	      FDivStartE, reset, clk, FDivBusyE, HoldInputs);
-
-//    input [63:0] DivInput1E;		// 1st input operand (A)
-//    input [63:0] DivInput2E;		// 2nd input operand (B)
-//    input [2:0] 	FrmE;		// Rounding mode - specify values 
-//    input 	DivOpType;	// Function opcode
-//    input 	FmtE;   		// Result Precision (0 for double, 1 for single) //***will need to swap this
-//    input 	DivOvEn;		// Overflow trap enabled
-//    input 	DivUnEn;   	// Underflow trap enabled
-
-//    input 	FDivStartE;
-//    input 	reset;
-//    input 	clk;   
-
-//    output [63:0] FDivResultM;	// Result of operation
-//    output [4:0]  FDivSqrtFlgM;   	// IEEE exception flags 
-//    output 	 FDivSqrtDoneE;
-//    output    FDivBusyE, HoldInputs;
-
-//    supply1 	  vdd;
-//    supply0 	  vss;   
-
-//    wire [63:0] 	 Float1; 
-//    wire [63:0] 	 Float2;
-//    wire [63:0] 	 IntValue;
-   
-//    wire 	 DivDenormM;   	// DivDenormM on input or output
-//    wire [12:0] 	 exp1, exp2, expF;
-//    wire [12:0] 	 exp_diff, bias;
-//    wire [13:0] 	 exp_sqrt;
-//    wire [12:0] 	 exp_s;
-//    wire [12:0] 	 exp_c;
-   
-//    wire [10:0] 	 exponent, exp_pre;
-//    wire [63:0] 	 Result;   
-//    wire [52:0] 	 mantissaA;
-//    wire [52:0] 	 mantissaB; 
-//    wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
-   
-//    wire [5:0] 	 align_shift;
-//    wire [5:0] 	 norm_shift;
-//    wire [2:0] 	 sel_inv;
-//    wire		 op1_Norm, op2_Norm;
-//    wire		 opA_Norm, opB_Norm;
-//    wire		 Invalid;
-//    wire 	 DenormIn, DenormIO;
-//    wire [4:0] 	 FlagsIn;   	
-//    wire 	 exp_gt63;
-//    wire 	 Sticky_out;
-//    wire 	 signResult, sign_corr;
-//    wire          corr_sign;
-//    wire 	 zeroB;         
-//    wire 	 convert;
-//    wire          swap;
-//    wire          sub;
-   
-//    wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
-//    wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
-//    wire [127:0]  regr_out;
-//    wire [2:0] 	 sel_muxa, sel_muxb;
-//    wire 	 sel_muxr;   
-//    wire 	 load_rega, load_regb, load_regc, load_regd, load_regr, load_regs;
-
-//    wire 	 donev, sel_muxrv, sel_muxsv;
-//    wire [1:0] 	 sel_muxav, sel_muxbv;   
-//    wire 	 load_regav, load_regbv, load_regcv;
-//    wire 	 load_regrv, load_regsv;
-   
-//    logic exp_cout1, exp_cout2, exp_odd, open;
-
-
-//    // Convert the input operands to their appropriate forms based on 
-//    // the orignal operands, the DivOpType , and their precision FmtE. 
-//    // Single precision inputs are converted to double precision 
-//    // and the sign of the first operand is set appropratiately based on
-//    // if the operation is absolute value or negation. 
-//    convert_inputs_div divconv1 (Float1, Float2, DivInput1E, DivInput2E, DivOpType, FmtE);
-
-//    // Test for exceptions and return the "Invalid Operation" and
-//    // "Denormalized" Input FDivSqrtFlgM. The "sel_inv" is used in
-//    // the third pipeline stage to select the result. Also, op1_Norm
-//    // and op2_Norm are one if DivInput1E and DivInput2E are not zero or denormalized.
-//    // sub is one if the effective operation is subtaction. 
-//    exception_div divexc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
-// 		   Float1, Float2, DivOpType);
-
-//    // Determine Sign/Mantissa
-//    assign signResult = ((Float1[63]^Float2[63])&~DivOpType) | Float1[63]&DivOpType;
-//    assign mantissaA = {vdd, Float1[51:0]};
-//    assign mantissaB = {vdd, Float2[51:0]};
-//    // Perform Exponent Subtraction - expA - expB + Bias   
-//    assign exp1 = {2'b0, Float1[62:52]};
-//    assign exp2 = {2'b0, Float2[62:52]};
-//    // bias : DP = 2^{11-1}-1 = 1023
-//    assign bias = {3'h0, 10'h3FF};
-//    // Divide exponent
-//    csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c); //***adder
-//    exp_add explogic1 (exp_cout1, {open, exp_diff}, //***adder?
-// 		      {vss, exp_s}, {vss, exp_c}, 1'b1);
-//    // Sqrt exponent (check if exponent is odd)
-//    assign exp_odd = Float1[52] ? vss : vdd;
-//    exp_add explogic2 (exp_cout2, exp_sqrt, //***adder?
-// 		      {vss, exp1}, {4'h0, 10'h3ff}, exp_odd);
-//    // Choose correct exponent
-//    assign expF = DivOpType ? exp_sqrt[13:1] : exp_diff;   
-
-//    // Main Goldschmidt/Division Routine
-//    divconv goldy (q1, qm1, qp1, q0, qm0, qp0, 
-// 		  rega_out, regb_out, regc_out, regd_out,
-// 		  regr_out, mantissaB, mantissaA, 
-// 		  sel_muxa, sel_muxb, sel_muxr, 
-// 		  reset, clk,
-// 		  load_rega, load_regb, load_regc, load_regd,
-// 		  load_regr, load_regs, FmtE, DivOpType, exp_odd);
-
-//    // FSM : control divider
-//    fsm control (FDivSqrtDoneE, load_rega, load_regb, load_regc, load_regd, 
-// 		load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
-// 		clk, reset, FDivStartE, DivOpType, FDivBusyE, HoldInputs);
-   
-//    // Round the mantissa to a 52-bit value, with the leading one
-//    // removed. The rounding units also handles special cases and 
-//    // set the exception flags.
-//    //***add max magnitude and swap negitive and positive infinity
-//    rounder_div divround1 (Result, DenormIO, FlagsIn, 
-// 		   FrmE, FmtE, DivOvEn, DivUnEn, expF, 
-//    		   sel_inv, Invalid, DenormIn, signResult, 
-// 		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
-
-//    // Store the final result and the exception flags in registers.
-//    flopenr #(64) rega (clk, reset, FDivSqrtDoneE, Result, FDivResultM);
-//    flopenr #(1) regb (clk, reset, FDivSqrtDoneE, DenormIO, DivDenormM);   
-//    flopenr #(5) regc (clk, reset, FDivSqrtDoneE, FlagsIn, FDivSqrtFlgM);   
-   
-// endmodule // fpadd
-
-// //
-// // Brent-Kung Prefix Adder 
-// //   (yes, it is 14 bits as my generator is broken for 13 bits :( 
-// //    assume, synthesizer will delete stuff not needed )
-// //
-// module exp_add (cout, sum, a, b, cin);
-   
-//    input [13:0] a, b;
-//    input 	cin;
-   
-//    output [13:0] sum;
-//    output 	 cout;
-
-//    wire [14:0] 	 p,g;
-//    wire [13:0] 	 c;
-
-//    // pre-computation
-//    assign p={a^b,1'b0};
-//    assign g={a&b, cin};
-
-//    // prefix tree
-//    brent_kung prefix_tree(c, p[13:0], g[13:0]);
-
-//    // post-computation
-//    assign sum=p[14:1]^c;
-//    assign cout=g[14]|(p[14]&c[13]);
-
-// endmodule // exp_add
-
-// module brent_kung (c, p, g);
-   
-//    input [13:0] p;
-//    input [13:0] g;
-//    output [14:1] c;
-
-//    logic G_1_0, G_3_2,G_5_4,G_7_6,G_9_8,G_11_10,G_13_12,G_3_0,G_7_4,G_11_8;
-//    logic P_3_2,P_5_4,P_7_6,P_9_8,P_11_10,P_13_12,P_7_4,P_11_8;
-//    logic G_7_0,G_11_0,G_5_0,G_9_0,G_13_0,G_2_0,G_4_0,G_6_0,G_8_0,G_10_0,G_12_0;
-//    // parallel-prefix, Brent-Kung
-
-//    // Stage 1: Generates G/FmtE pairs that span 1 bits
-//    grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-//    black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-//    black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-//    black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-//    black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-//    black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-//    black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-
-//    // Stage 2: Generates G/FmtE pairs that span 2 bits
-//    grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-//    black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-//    black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-
-//    // Stage 3: Generates G/FmtE pairs that span 4 bits
-//    grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-
-//    // Stage 4: Generates G/FmtE pairs that span 8 bits
-
-//    // Stage 5: Generates G/FmtE pairs that span 4 bits
-//    grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-
-//    // Stage 6: Generates G/FmtE pairs that span 2 bits
-//    grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-//    grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-//    grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
-
-//    // Last grey cell stage 
-//    grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-//    grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-//    grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
-//    grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-//    grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
-//    grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
-
-//    // Final Stage: Apply c_k+1=G_k_0
-//    assign c[1]=g[0];
-//    assign c[2]=G_1_0;
-//    assign c[3]=G_2_0;
-//    assign c[4]=G_3_0;
-//    assign c[5]=G_4_0;
-//    assign c[6]=G_5_0;
-//    assign c[7]=G_6_0;
-//    assign c[8]=G_7_0;
-//    assign c[9]=G_8_0;
-
-//    assign c[10]=G_9_0;
-//    assign c[11]=G_10_0;
-//    assign c[12]=G_11_0;
-//    assign c[13]=G_12_0;
-//    assign c[14]=G_13_0;
-
-// endmodule // brent_kung
-
--- a/wally-pipelined/src/fpu/fhazard.sv
+++ b/wally-pipelined/src/fpu/fhazard.sv
@ -26,41 +26,47 @@
 `include "wally-config.vh"

 module fhazard(
-    input logic [4:0] Adr1E, Adr2E, Adr3E,
-    input logic FRegWriteM, FRegWriteW, 
-	  input logic [4:0] RdM, RdW,
-    input logic [2:0] FResultSelM,
-    output logic FStallD,
-    output logic [1:0] FForwardXE, FForwardYE, FForwardZE
+    input logic [4:0]   Adr1E, Adr2E, Adr3E,    // read data addresses (compared against RdM and RdW)
+    input logic         FRegWriteM, FRegWriteW, // is the fp register being written to (memory / writeback stage)
+	  input logic [4:0]   RdM, RdW,               // the address being written to (memory / writeback stage)
+    input logic [2:0]   FResultSelM,            // the result being selected (3'b100 = FResM, which can be forwarded)
+    output logic        FStallD,                // stall the decode stage
+    output logic [1:0]  FForwardXE, FForwardYE, FForwardZE // select a forwarded value: 00 = register read data, 01 = FPUResult64W, 10 = FResM
 );


  always_comb begin
-    // set ReadData as default
+    // set defaults
    FForwardXE = 2'b00; // choose FRD1E
    FForwardYE = 2'b00; // choose FRD2E
    FForwardZE = 2'b00; // choose FRD3E
    FStallD = 0;

-      if ((Adr1E == RdM) & FRegWriteM)
-      // if the result will be FResM
-        if(FResultSelM == 3'b100) FForwardXE = 2'b10; // choose FResM
-        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr1E == RdW) & FRegWriteW) FForwardXE = 2'b01; // choose FPUResult64W
-    
+    // if the needed value is in the memory stage - input 1
+    if ((Adr1E == RdM) & FRegWriteM) 
+      // if the result will be FResM (can be taken from the memory stage)
+      if(FResultSelM == 3'b100) FForwardXE = 2'b10; // choose FResM
+      else FStallD = 1;                             // otherwise stall
+    // if the needed value is in the writeback stage
+    else if ((Adr1E == RdW) & FRegWriteW) FForwardXE = 2'b01; // choose FPUResult64W
+  

-      if ((Adr2E == RdM) & FRegWriteM)
-      // if the result will be FResM
-        if(FResultSelM == 3'b100) FForwardYE = 2'b10; // choose FResM
-        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr2E == RdW) & FRegWriteW) FForwardYE = 2'b01; // choose FPUResult64W
+    // if the needed value is in the memory stage - input 2
+    if ((Adr2E == RdM) & FRegWriteM)
+      // if the result will be FResM (can be taken from the memory stage)
+      if(FResultSelM == 3'b100) FForwardYE = 2'b10; // choose FResM
+      else FStallD = 1;                             // otherwise stall
+    // if the needed value is in the writeback stage
+    else if ((Adr2E == RdW) & FRegWriteW) FForwardYE = 2'b01; // choose FPUResult64W

- 
-      if ((Adr3E == RdM) & FRegWriteM)
-      // if the result will be FResM
-        if(FResultSelM == 3'b100) FForwardZE = 2'b10; // choose FResM
-        else FStallD = 1;   // if the result won't be ready stall
-      else if ((Adr3E == RdW) & FRegWriteW) FForwardZE = 2'b01; // choose FPUResult64W
+
+    // if the needed value is in the memory stage - input 3
+    if ((Adr3E == RdM) & FRegWriteM)
+      // if the result will be FResM (can be taken from the memory stage)
+      if(FResultSelM == 3'b100) FForwardZE = 2'b10; // choose FResM
+      else FStallD = 1;                             // otherwise stall
+    // if the needed value is in the writeback stage
+    else if ((Adr3E == RdW) & FRegWriteW) FForwardZE = 2'b01; // choose FPUResult64W

  end 

--- a/wally-pipelined/src/fpu/fma.sv
+++ b/wally-pipelined/src/fpu/fma.sv
@ -26,41 +26,50 @@
 // `include "../../../config/rv64icfd/wally-config.vh"

 module fma(
-    input logic             clk,
-    input logic             reset,
-    input logic             FlushM,
-    input logic             StallM,
-    input logic             FmtE, FmtM,       // precision 1 = double 0 = single
-    input logic  [2:0]      FOpCtrlM, FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
-    input logic  [2:0]      FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
-    input logic        XSgnE, YSgnE, ZSgnE,
-    input logic [`NE-1:0] XExpE, YExpE, ZExpE,
-    input logic [`NF:0] XManE, YManE, ZManE,
-    input logic        XSgnM, YSgnM, ZSgnM,
-    input logic [`NE-1:0] XExpM, YExpM, ZExpM, // ***needed
-    input logic [`NF:0] XManM, YManM, ZManM,
-    input logic XDenormE, YDenormE, ZDenormE,
-    input logic XZeroE, YZeroE, ZZeroE,
-    input logic XNaNM, YNaNM, ZNaNM,
-    input logic XSNaNM, YSNaNM, ZSNaNM,
-    input logic XZeroM, YZeroM, ZZeroM,
-    input logic XInfM, YInfM, ZInfM,
-    input logic [10:0] BiasE,
-	output logic [`FLEN-1:0]		FMAResM,
-	output logic [4:0]		FMAFlgM);
+    input logic                 clk,
+    input logic                 reset,
+    input logic                 FlushM,     // flush the memory stage
+    input logic                 StallM,     // stall the memory stage
+    input logic                 FmtE, FmtM, // precision 1 = double 0 = single
+    input logic  [2:0]          FOpCtrlM, FOpCtrlE, // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
+    input logic  [2:0]          FrmM,       // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic                 XSgnE, YSgnE, ZSgnE,    // input signs - execute stage
+    input logic [`NE-1:0]       XExpE, YExpE, ZExpE,    // input exponents - execute stage
+    input logic [`NF:0]         XManE, YManE, ZManE,    // input mantissas - execute stage
+    input logic                 XSgnM, YSgnM, ZSgnM,    // input signs - memory stage
+    input logic [`NE-1:0]       XExpM, YExpM, ZExpM,    // input exponents - memory stage
+    input logic [`NF:0]         XManM, YManM, ZManM,    // input mantissas - memory stage
+    input logic                 XDenormE, YDenormE, ZDenormE, // is denormalized - execute stage
+    input logic                 XZeroE, YZeroE, ZZeroE,     // is zero - execute stage
+    input logic                 XNaNM, YNaNM, ZNaNM,        // is NaN - memory stage
+    input logic                 XSNaNM, YSNaNM, ZSNaNM,     // is signaling NaN - memory stage
+    input logic                 XZeroM, YZeroM, ZZeroM,     // is zero - memory stage
+    input logic                 XInfM, YInfM, ZInfM,        // is infinity - memory stage
+    input logic [10:0]          BiasE,      // bias - depends on precision (max exponent/2)
+	output logic [`FLEN-1:0]    FMAResM,    // FMA result
+	output logic [4:0]		    FMAFlgM);   // FMA flags
 	
+  //fma/mult	
+      //  fmadd  = ?000
+      //  fmsub  = ?001
+      //  fnmsub = ?010	-(a*b)+c
+      //  fnmadd = ?011 -(a*b)-c
+      //  fmul   = ?100
+      //	{?, is mul, negate product, negate addend}

+    // signals transferred between pipeline stages
    logic [2*`NF+1:0]	ProdManE, ProdManM; 
    logic [3*`NF+5:0]	AlignedAddendE, AlignedAddendM;                       
-    logic [`NE+1:0]	ProdExpE, ProdExpM;
-    logic 			AddendStickyE, AddendStickyM;
-    logic 			KillProdE, KillProdM;
+    logic [`NE+1:0]	    ProdExpE, ProdExpM;
+    logic 			    AddendStickyE, AddendStickyM;
+    logic 			    KillProdE, KillProdM;
    
    fma1 fma1 (.XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
                .BiasE, .XDenormE, .YDenormE, .ZDenormE,  .XZeroE, .YZeroE, .ZZeroE,
                .FOpCtrlE, .FmtE, .ProdManE, .AlignedAddendE,
                .ProdExpE, .AddendStickyE, .KillProdE); 
                
+    // E/M pipeline registers
    flopenrc #(106) EMRegFma1(clk, reset, FlushM, ~StallM, ProdManE, ProdManM); 
    flopenrc #(162) EMRegFma2(clk, reset, FlushM, ~StallM, AlignedAddendE, AlignedAddendM); 
    flopenrc #(13) EMRegFma3(clk, reset, FlushM, ~StallM, ProdExpE, ProdExpM);  
@ -82,8 +91,8 @@ module fma1(
    // input logic        XSgnE, YSgnE, ZSgnE,
    input logic [`NE-1:0] XExpE, YExpE, ZExpE,      // biased exponents in B(NE.0) format
    input logic [`NF:0] XManE, YManE, ZManE,   // fractions in U(0.NF) format]
-    input logic        XDenormE, YDenormE, ZDenormE,
-    input logic XZeroE, YZeroE, ZZeroE,
+    input logic        XDenormE, YDenormE, ZDenormE, // is the input denormal
+    input logic XZeroE, YZeroE, ZZeroE, // is the input zero
    input logic [`NE-1:0] BiasE,
    input logic     [2:0]       FOpCtrlE,   // 000 = fmadd (X*Y)+Z,  001 = fmsub (X*Y)-Z,  010 = fnmsub -(X*Y)+Z,  011 = fnmadd -(X*Y)-Z,  100 = fmul (X*Y)
    input logic                 FmtE,       // precision 1 = double 0 = single
@ -94,8 +103,8 @@ module fma1(
    output logic                KillProdE      // set the product to zero before addition if the product is too small to matter
    );

-    logic [`NE+1:0]    AlignCnt;           // how far to shift the addend to align with the product in Q(NE+2.0) format *** is this enough bits?
-    logic [4*`NF+5:0]   ZManShifted;                // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
+    logic [`NE+1:0]     AlignCnt;           // how far to shift the addend to align with the product in Q(NE+2.0) format
+    logic [4*`NF+5:0]   ZManShifted;        // output of the alignment shifter including sticky bits U(NF+5.3NF+1)
    logic [4*`NF+5:0]   ZManPreShifted;     // input to the alignment shifter U(NF+5.3NF+1)

    ///////////////////////////////////////////////////////////////////////////////
@ -200,32 +209,33 @@ module fma2(
    output logic    [4:0]       FMAFlgM);     // FMA flags {invalid, divide by zero, overflow, underflow, inexact}
   

-    logic [`NF-1:0]    ResultFrac; // Result fraction
-    logic [`NE-1:0]    ResultExp;  // Result exponent
-    logic           ResultSgn;  // Result sign
-    logic           PSgn;       // product sign
+
+    logic [`NF-1:0]     ResultFrac; // Result fraction
+    logic [`NE-1:0]     ResultExp;  // Result exponent
+    logic               ResultSgn;  // Result sign
+    logic               PSgn;       // product sign
    logic [2*`NF+1:0]   ProdMan2;   // product being added
    logic [3*`NF+6:0]   AlignedAddend2; // possibly inverted aligned Z
    logic [3*`NF+5:0]   Sum;        // positive sum
    logic [3*`NF+6:0]   PreSum;     // possibly negitive sum
-    logic [`NE+1:0]    SumExp;     // exponent of the normalized sum
-    logic [`NE+1:0]    SumExpTmp;  // exponent of the normalized sum not taking into account denormal or zero results
-    logic [`NE+1:0]    SumExpTmpMinus1;    // SumExpTmp-1
-    logic [`NE+1:0]    FullResultExp;      // ResultExp with bits to determine sign and overflow
-    logic [`NF+2:0]    NormSum;    // normalized sum
+    logic [`NE+1:0]     SumExp;     // exponent of the normalized sum
+    logic [`NE+1:0]     SumExpTmp;  // exponent of the normalized sum not taking into account denormal or zero results
+    logic [`NE+1:0]     SumExpTmpMinus1;    // SumExpTmp-1
+    logic [`NE+1:0]     FullResultExp;      // ResultExp with bits to determine sign and overflow
+    logic [`NF+2:0]     NormSum;    // normalized sum
    logic [3*`NF+5:0]   SumShifted; // sum shifted for normalization
-    logic [8:0]     NormCnt;    // output of the leading zero detector //***change this later
-    logic           NormSumSticky; // sticky bit calulated from the normalized sum
-    logic           SumZero;    // is the sum zero
-    logic           NegSum;     // is the sum negitive
-    logic           InvZ;       // invert Z if there is a subtraction (-product + Z or product - Z)
-    logic           ResultDenorm;   // is the result denormalized
-    logic           Sticky;     // Sticky bit
-    logic           Plus1, Minus1, CalcPlus1, CalcMinus1;   // do you add or subtract one for rounding
-    logic           UfPlus1, UfCalcPlus1;  // do you add one (for determining underflow flag)
-    logic           Invalid,Underflow,Overflow,Inexact; // flags
-    logic [8:0]     DenormShift;    // right shift if the result is denormalized //***change this later
-    logic           SubBySmallNum;  // was there supposed to be a subtraction by a small number
+    logic [8:0]         NormCnt;    // output of the leading zero detector //***change this later
+    logic               NormSumSticky; // sticky bit calculated from the normalized sum
+    logic               SumZero;    // is the sum zero
+    logic               NegSum;     // is the sum negative
+    logic               InvZ;       // invert Z if there is a subtraction (-product + Z or product - Z)
+    logic               ResultDenorm;   // is the result denormalized
+    logic               Sticky;     // Sticky bit
+    logic               Plus1, Minus1, CalcPlus1, CalcMinus1;   // do you add or subtract one for rounding
+    logic               UfPlus1, UfCalcPlus1;  // do you add one (for determining underflow flag)
+    logic               Invalid,Underflow,Overflow,Inexact; // flags
+    logic [8:0]         DenormShift;    // right shift if the result is denormalized //***change this later
+    logic               SubBySmallNum;  // was there supposed to be a subtraction by a small number
    logic [`FLEN-1:0]    Addend;     // value to add (Z or zero)
    logic           ZeroSgn;        // the result's sign if the sum is zero
    logic           ResultSgnTmp;   // the result's sign assuming the result is not zero
@ -306,11 +316,12 @@ module fma2(
    assign SumZero = ~(|Sum);

    // determine the length of the fraction based on precision
-    //assign FracLen = FmtM ? `NF : 13'd23;
-    assign FracLen = `NF;
+    assign FracLen = FmtM ? `NF : 13'd23;
+    //assign FracLen = `NF;

    // Determine if the result is denormal
    assign SumExpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCnt} - (`NF+4));
+
    assign ResultDenorm = $signed(SumExpTmp)<=0 & ($signed(SumExpTmp)>=$signed(-FracLen)) & ~SumZero;

    // Determine the shift needed for denormal results
@ -458,16 +469,18 @@ module fma2(
    //   1) any input is a signaling NaN
    //   2) Inf - Inf (unless x or y is NaN)
    //   3) 0 * Inf
-    assign MaxExp = FmtM ? {`NE{1'b1}} : 13'd255;
+
+    assign MaxExp = FmtM ? {`NE{1'b1}} : {8{1'b1}};
    assign SigNaN = XSNaNM | YSNaNM | ZSNaNM;
    assign Invalid = SigNaN | ((XInfM || YInfM) & ZInfM & (PSgn ^ ZSgnEffM) & ~XNaNM & ~YNaNM) | (XZeroM & YInfM) | (YZeroM & XInfM);  
   
    // Set Overflow flag if the number is too big to be represented
    //      - Don't set the overflow flag if an overflowed result isn't outputed
-    assign Overflow = FullResultExp >= MaxExp & ~FullResultExp[`NE+1]&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+    assign Overflow = FullResultExp >= {MaxExp} & ~FullResultExp[`NE+1]&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);

    // Set Underflow flag if the number is too small to be represented in normal numbers
    //      - Don't set the underflow flag if the result is exact
+
    assign Underflow = (SumExp[`NE+1] | ((SumExp == 0) & (Round|Guard|Sticky|UfGuard)))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
    assign UnderflowFlag = (FullResultExp[`NE+1] | ((FullResultExp == 0) | ((FullResultExp == 1) & (SumExp == 0) & ~(UfPlus1&UfLSBNormSum)))&(Round|Guard|Sticky))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
    // Set Inexact flag if the result is diffrent from what would be outputed given infinite precision
@ -504,8 +517,8 @@ module fma2(
                        YNaNM ? YNaNResult :
                        ZNaNM ? ZNaNResult :
                        Invalid ? InvalidResult : // has to be before inf
-                        XInfM ? FmtM ? {PSgn, XExpM, XManM[`NF-1:0]} : {{32{1'b1}}, PSgn, XExpM[7:0], XManM[51:29]} : 
-                        YInfM ? FmtM ? {PSgn, YExpM, YManM[`NF-1:0]} : {{32{1'b1}}, PSgn, YExpM[7:0], YManM[51:29]} :
+                        XInfM ? FmtM ? {PSgn, XExpM, XManM[`NF-1:0]} : {{32{1'b1}}, PSgn,  XExpM[7:0], XManM[51:29]} : 
+                        YInfM ? FmtM ? {PSgn, YExpM, YManM[`NF-1:0]} : {{32{1'b1}}, PSgn,  YExpM[7:0], YManM[51:29]} :
                        ZInfM ? FmtM ? {ZSgnEffM, ZExpM, ZManM[`NF-1:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], ZManM[51:29]} :
                        Overflow ? OverflowResult :
                        KillProdM ? KillProdResult : // has to be after Underflow      
--- a/wally-pipelined/src/fpu/fpadd_denorm.sv
+++ b/wally-pipelined/src/fpu/fpadd_denorm.sv
@ -1,286 +0,0 @@
-///////////////////////////////////////////
-//
-// Written: James.Stine@okstate.edu 1 February 2021
-// Modified: 
-//
-// Purpose: FP Add/Sub instructions
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-//
-// Basic and Denormalized Operations
-//
-// Step 1: Load operands, set flags, and convert SP to DP
-// Step 2: Check for special inputs ( +/- Infinity,  NaN)
-// Step 3: Compare exponents.  Swap the operands of exp1 < exp2
-//         or of (exp1 = exp2 AND mnt1 < mnt2)
-// Step 4: Shift the mantissa corresponding to the smaller exponent, 
-//          and extend precision by three bits to the right.
-// Step 5: Add or subtract the mantissas.
-// Step 6: Normalize the result.//
-//   Shift left until normalized.  Normalized when the value to the 
-//   left of the binrary point is 1.
-// Step 7: Round the result.// 
-// Step 8: Put sum onto output.
-//
-
-module fpadd (AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);
-
-   input [63:0] op1;		// 1st input operand (A)
-   input [63:0] op2;		// 2nd input operand (B)
-   input [2:0] 	rm;		// Rounding mode - specify values 
-   input [3:0]	op_type;	// Function opcode
-   input 	P;   		// Result Precision (0 for double, 1 for single)
-   input 	OvEn;		// Overflow trap enabled
-   input 	UnEn;   	// Underflow trap enabled
-
-   output [63:0] AS_Result;	// Result of operation
-   output [4:0]  Flags;   	// IEEE exception flags 
-   output 	 Denorm;   	// Denorm on input or output   
-
-   wire [63:0] 	 Float1; 
-   wire [63:0] 	 Float2;
-   wire [63:0] 	 IntValue;
-   wire [11:0] 	 exp1, exp2;
-   wire [11:0] 	 exp_diff1, exp_diff2;
-   wire [10:0] 	 exponent, exp_pre;
-   wire [11:0] 	 exp_shift;
-   wire [63:0] 	 Result;   
-   wire [51:0] 	 mantissaA;
-   wire [56:0] 	 mantissaA1;
-   wire [63:0] 	 mantissaA3;
-   wire [51:0] 	 mantissaB; 
-   wire [56:0] 	 mantissaB1, mantissaB2;
-   wire [63:0] 	 mantissaB3;
-   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm, sum_norm_w_bypass;
-   wire [5:0] 	 align_shift;
-   wire [5:0] 	 norm_shift, norm_shift_denorm;
-   wire [3:0] 	 sel_inv;
-   wire		 op1_Norm, op2_Norm;
-   wire		 opA_Norm, opB_Norm;
-   wire		 Invalid;
-   wire 	 DenormIn, DenormIO;
-   wire [4:0] 	 FlagsIn;   	
-   wire 	 exp_valid;
-   wire 	 exp_gt63;
-   wire 	 Sticky_out;
-   wire 	 signA, sign_corr;
-   wire          corr_sign;
-   wire 	 zeroB;         
-   wire 	 convert;
-   wire          swap;
-   wire          sub;
-   wire [10:0]	 exponent_postsum;
-   wire 	 mantissa_comp;
-   wire 	 mantissa_comp_sum;
-   wire 	 mantissa_comp_sum_tc;
-   wire 	 Float1_sum_comp;
-   wire 	 Float2_sum_comp;
-   wire 	 Float1_sum_tc_comp;
-   wire 	 Float2_sum_tc_comp;
-   wire [5:0]	 ZP_mantissaA;
-   wire [5:0] 	 ZP_mantissaB;
-   wire 	 ZV_mantissaA;
-   wire 	 ZV_mantissaB;
-   wire 	 normal_underflow;
-   wire 	 normal_overflow;
-
-   // Convert the input operands to their appropriate forms based on 
-   // the orignal operands, the op_type , and their precision P. 
-   // Single precision inputs are converted to double precision 
-   // and the sign of the first operand is set appropratiately based on
-   // if the operation is absolute value or negation. 
-
-   convert_inputs conv1 (Float1, Float2, op1, op2, op_type, P);
-
-   // Test for exceptions and return the "Invalid Operation" and
-   // "Denormalized" Input Flags. The "sel_inv" is used in
-   // the third pipeline stage to select the result. Also, op1_Norm
-   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
-   // sub is one if the effective operation is subtaction. 
-
-   exception exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, sub, 
-		   Float1, Float2, op_type);
-
-   // Perform Exponent Subtraction (used for alignment). For performance
-   // both exponent subtractions are performed in parallel. This was 
-   // changed to a behavior level to allow the tools to  try to optimize
-   // the two parallel additions. The input values are zero-extended to 12 
-   // bits prior to performing the addition. 
-
-   assign exp1 = {1'b0, Float1[62:52]};
-   assign exp2 = {1'b0, Float2[62:52]};
-   assign exp_diff1 = exp1 - exp2;
-   assign exp_diff2 = DenormIn ? ({Float2[63], exp2[10:0]} - {Float1[63], exp1[10:0]}): exp2 - exp1;
-
-   // The second operand (B) should be set to zero, if op_type does not
-   // specify addition or subtraction
-   assign zeroB = op_type[2] | op_type[1];
-
-   // Swapped operands if zeroB is not one and exp1 < exp2. 
-   // Swapping causes exp2 to be used for the result exponent. 
-   // Only the exponent of the larger operand is used to determine
-   // the final result. 
-   assign swap = exp_diff1[11] & ~zeroB;
-   assign exponent = swap ? exp2[10:0] : exp1[10:0];
-   assign exponent_postsum = swap ? exp2[10:0] : exp1[10:0];
-   assign mantissaA = swap ? Float2[51:0] : Float1[51:0];
-   assign mantissaB = swap ? Float1[51:0] : Float2[51:0];
-   assign signA     = swap ? Float2[63] : Float1[63];   
-
-   // Leading-Zero Detector. Determine the size of the shift needed for
-   // normalization. If sum_corrected is all zeros, the exp_valid is 
-   // zero; otherwise, it is one. 
-   // modified to 52 bits to detect leading zeroes on denormalized mantissas
-   lz52 lz_norm_1 (ZP_mantissaA, ZV_mantissaA, mantissaA);
-   lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);
-
-   // Denormalized exponents created by subtracting the leading zeroes from the original exponents
-   assign exp1_denorm = swap ? (exp1 - ZP_mantissaB) : (exp1 - ZP_mantissaA);
-   assign exp2_denorm = swap ? (exp2 - ZP_mantissaA) : (exp2 - ZP_mantissaB);
-
-   // Finds normal underflow result to determine whether to round final exponent down
-   // Comparison between each float and the resulting sum of the primary cla adder/subtractor and cla subtractor
-   assign Float1_sum_comp = (Float1[51:0] > sum[51:0]) ? 1'b0 : 1'b1;
-   assign Float2_sum_comp = (Float2[51:0] > sum[51:0]) ? 1'b0 : 1'b1;
-   assign Float1_sum_tc_comp = (Float1[51:0] > sum_tc[51:0]) ? 1'b0 : 1'b1;
-   assign Float2_sum_tc_comp = (Float2[51:0] > sum_tc[51:0]) ? 1'b0 : 1'b1;
-
-   // Determines the correct Float value to compare based on swap result
-   assign mantissa_comp_sum = swap ? Float2_sum_comp : Float1_sum_comp;
-   assign mantissa_comp_sum_tc = swap ? Float2_sum_tc_comp : Float1_sum_tc_comp;
-
-   // Determines the correct comparison result based on operation and sign of resulting sum
-   assign mantissa_comp = (op_type[0] ^ sum[63]) ? mantissa_comp_sum_tc : mantissa_comp_sum;
-
-   // If the signs are different and both operands aren't denormalized
-   // the normal underflow bit is needed and therefore updated.
-   assign normal_underflow = ((Float1[63] ~^ Float2[63]) & (opA_Norm | opB_Norm)) ? mantissa_comp : 1'b0;
-
-   // Determine the alignment shift and limit it to 63. If any bit from 
-   // exp_shift[6] to exp_shift[11] is one, then shift is set to all ones. 
-   assign exp_shift = swap ? exp_diff2 : exp_diff1;
-   assign exp_gt63 = exp_shift[11] | exp_shift[10] | exp_shift[9] 
-     | exp_shift[8] | exp_shift[7] | exp_shift[6];
-   assign align_shift = exp_shift | {6{exp_gt63}};
-
-   // Unpack the 52-bit mantissas to 57-bit numbers of the form.
-   //    001.M[51]M[50] ... M[1]M[0]00
-   // Unless the number has an exponent of zero, in which case it
-   // is unpacked as
-   //    000.00 ... 00
-   // This effectively flushes denormalized values to zero. 
-   // The three bits of to the left of the binary point prevent overflow
-   // and loss of sign information. The two bits to the right of the 
-   // original mantissa form the "guard" and "round" bits that are used
-   // to round the result. 
-   assign opA_Norm = swap ? op2_Norm : op1_Norm;
-   assign opB_Norm = swap ? op1_Norm : op2_Norm;
-   assign mantissaA1 = {2'h0, opA_Norm, mantissaA[51:0]&{52{opA_Norm}}, 2'h0};
-   assign mantissaB1 = {2'h0, opB_Norm, mantissaB[51:0]&{52{opB_Norm}}, 2'h0};
-
-   // Perform mantissa alignment using a 57-bit barrel shifter 
-   // If any of the bits shifted out are one, Sticky_out is set. 
-   // The size of the barrel shifter could be reduced by two bits
-   // by not adding the leading two zeros until after the shift. 
-   barrel_shifter_r57 bs1 (mantissaB2, Sticky_out, mantissaB1, align_shift);
-
-   // Place either the sign-extened 32-bit value or the original 64-bit value 
-   // into IntValue (to be used for integer to floating point conversion)
-   assign IntValue [31:0] = op1[31:0];
-   assign IntValue [63:32] = op_type[0] ? {32{op1[31]}} : op1[63:32];
-
-   // If doing an integer to floating point conversion, mantissaA3 is set to 
-   // IntVal and the prenomalized exponent is set to 1084. Otherwise, 
-   // mantissaA3 is simply extended to 64-bits by setting the 7 LSBs to zero, 
-   // and the exponent value is left unchanged. 
-   // Under denormalized cases, the exponent before the rounder is set to 1
-   // if the normal shift value is 11.
-   assign convert       = ~op_type[2] & op_type[1];
-   assign mantissaA3    = (op_type[3]) ? (op_type[0] ? Float1 : ~Float1) : (DenormIn ? ({12'h0, mantissaA}) : (convert ? IntValue : {mantissaA1, 7'h0}));
-   assign exp_pre       = DenormIn ? 
-			  ((norm_shift == 6'b001011) ? 11'b00000000001 : (swap ? exp2_denorm : exp1_denorm))
-			  : (convert ? 11'b10000111100 : exponent);
-
-   // Put zero in for mantissaB3, if zeroB is one. Otherwise, B is extended to 
-   // 64-bits by setting the 7 LSBs to the Sticky_out bit followed by six  
-   // zeros. 
-   assign mantissaB3[63:7] = (op_type[3]) ? (57'h0) : (DenormIn ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
-   assign mantissaB3[6]    = (op_type[3]) ? (1'b0) : (DenormIn ? mantissaB[6] : Sticky_out & ~zeroB);
-   assign mantissaB3[5:0]  = (op_type[3]) ? (6'h01) : (DenormIn ? mantissaB[5:0] : 6'h0);
-
-   // The sign of the result needs to be corrected if the true
-   // operation is subtraction and the input operands were swapped. 
-   assign corr_sign = ~op_type[2]&~op_type[1]&op_type[0]&swap;
-   
-   // 64-bit Mantissa Adder/Subtractor
-   cla64 add1 (sum, mantissaA3, mantissaB3, sub); //***adder
-
-   // 64-bit Mantissa Subtractor - to get the two's complement of the 
-   // result when the sign from the adder/subtractor is negative. 
-   cla_sub64 sub1 (sum_tc, mantissaB3, mantissaA3); //***adder
-
-   // Determine the correct sign of the result
-   assign sign_corr = ((corr_sign ^ signA) & ~convert) ^ sum[63];   
-   
-   // If the sum is negative, use its two complement instead. 
-   // This value has to be 64-bits to correctly handle the 
-   // case 10...00
-   assign sum_corr = (DenormIn & (opA_Norm | opB_Norm) & ( ( (Float1[63] ~^ Float2[63]) & op_type[0] ) | ((Float1[63] ^ Float2[63]) & ~op_type[0]) ))
-			 ? (sum[63] ? sum : sum_tc) : ( (op_type[3]) ? sum : (sum[63] ? sum_tc : sum));
-
-   // Finds normal underflow result to determine whether to round final exponent down
-   assign normal_overflow = (DenormIn & (sum == 16'h0) & (opA_Norm | opB_Norm) & ~op_type[0]) ? 1'b1 : (sum[63] ? sum_tc[52] : sum[52]);
-
-   // Leading-Zero Detector. Determine the size of the shift needed for
-   // normalization. If sum_corrected is all zeros, the exp_valid is 
-   // zero; otherwise, it is one. 
-   lz64 lzd1 (norm_shift, exp_valid, sum_corr);
-
-   assign norm_shift_denorm = (DenormIn & ( (~opA_Norm & ~opB_Norm) | normal_underflow)) ? (6'h00) : (norm_shift);
-
-   // Barell shifter used for normalization. It takes as inputs the 
-   // the corrected sum and the amount by which the sum should 
-   // be right shifted. It outputs the normalized sum. 
-   barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift_denorm);
-  
-   assign sum_norm_w_bypass = (op_type[3]) ? (op_type[0] ? ~sum_corr : sum_corr) : (sum_norm);
-
-   // Round the mantissa to a 52-bit value, with the leading one
-   // removed. If the result is a single precision number, the actual 
-   // mantissa is in the upper 23 bits and the lower 29 bits are zero. 
-   // At this point, normalization has already been performed, so we know 
-   // exactly where the rounding point is. The rounding units also
-   // handles special cases and set the exception flags.
-
-   // Changed DenormIO -> Denorm and FlagsIn -> Flags in order to
-   // help in processor reservation station detection of load/stores. In
-   // other words, the processor would like to know ahead of time that
-   // if the result is an exception then don't load or store.
-   rounder round1 (Result, DenormIO, FlagsIn, rm, P, OvEn, UnEn, exp_valid, 
-		   sel_inv, Invalid, DenormIn, convert, sign_corr, exp_pre, norm_shift, sum_norm_w_bypass,
-		   exponent_postsum, op1_Norm, op2_Norm, Float1[63:52], Float2[63:52],
-		   normal_overflow, normal_underflow, swap, op_type, sum);
-
-   // Store the final result and the exception flags in registers.
-   assign AS_Result = Result;
-   assign {Denorm, Flags} = {DenormIO, FlagsIn};
-   
-endmodule // fpadd
-
-
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@ -24,133 +24,117 @@

 // `timescale 1ps/1ps
 module fpdiv (   // FP divide/square-root unit: op_type selects fdiv (0) or fsqrt (1)
-   input logic [63:0] op1,		// 1st input operand (A)
-   input logic [63:0] op2,		// 2nd input operand (B)
-   input logic [1:0] 	rm,		// Rounding mode - specify values 
-   input logic 	op_type,	// Function opcode
-   input logic 	P,   		// Result Precision (0 for double, 1 for single)
-   input logic 	OvEn,		// Overflow trap enabled
-   input logic 	UnEn,   	// Underflow trap enabled
-   input logic 	start,
-   input logic 	reset,
-   input logic 	clk,
-   output logic done,
-   output logic   FDivBusyE,
-   output logic   HoldInputs,
-   output logic [63:0] AS_Result,	// Result of operation
-   output logic [4:0]  Flags);   	// IEEE exception flags 
-   logic 	 Denorm;   	// Denorm on input or output
-   // output 	 done;
+   input logic 	      clk,
+   input logic 	      reset,
+   input logic 	      start,     // start request, passed to the control FSM
+   input logic [63:0]   op1,		// 1st input operand (A)
+   input logic [63:0]   op2,		// 2nd input operand (B)
+   input logic [1:0]    rm,		// Rounding mode - specify values 
+   input logic 	      op_type,	// Function opcode (0 = fdiv, 1 = fsqrt)
+   input logic 	      P,   		// Result Precision (0 for double, 1 for single)
+   input logic 	      OvEn,		// Overflow trap enabled
+   input logic 	      UnEn,   	// Underflow trap enabled
+   output logic         done,      // driven by the control FSM; also passed to the two output flopenr registers
+   output logic         FDivBusyE, // driven by the control FSM's divBusy output
+   output logic [63:0]  AS_Result,	// Result of operation
+   output logic [4:0]   Flags);   	// IEEE exception flags 

-   supply1 	  vdd;
-   supply0 	  vss;   

-   wire [63:0] 	 Float1; 
-   wire [63:0] 	 Float2;
-   wire [63:0] 	 IntValue;
+   logic [63:0]   Float1; 
+   logic [63:0] 	Float2;
   
-   wire [12:0] 	 exp1, exp2, expF;
-   wire [12:0] 	 exp_diff, bias;
-   wire [13:0] 	 exp_sqrt;
-   wire [12:0] 	 exp_s;
-   wire [12:0] 	 exp_c;
+   logic [12:0] 	exp1, exp2, expF;
+   logic [12:0] 	exp_diff, bias;
+   logic [13:0] 	exp_sqrt;
+   logic [12:0] 	exp_s;
+   logic [12:0] 	exp_c;
   
-   wire [10:0] 	 exponent, exp_pre;
-   wire [63:0] 	 Result;   
-   wire [52:0] 	 mantissaA;
-   wire [52:0] 	 mantissaB; 
-   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
+   logic [10:0] 	exponent;
+   logic [63:0] 	Result;   
+   logic [52:0] 	mantissaA;
+   logic [52:0] 	mantissaB; 
   
-   wire [5:0] 	 align_shift;
-   wire [5:0] 	 norm_shift;
-   wire [2:0] 	 sel_inv;
-   wire		 op1_Norm, op2_Norm;
-   wire		 opA_Norm, opB_Norm;
-   wire		 Invalid;
-   wire 	 DenormIn, DenormIO;
-   wire [4:0] 	 FlagsIn;   	
-   wire 	 exp_gt63;
-   wire 	 Sticky_out;
-   wire 	 signResult, sign_corr;
-   wire          corr_sign;
-   wire 	 zeroB;         
-   wire 	 convert;
-   wire          swap;
-   wire          sub;
+   logic [2:0] 	sel_inv;
+   logic		      Invalid;
+   logic [4:0] 	FlagsIn;   	
+   logic 	      signResult;      
+   logic 	      convert;
+   logic          sub;
   
-   wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
-   wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
-   wire [127:0]  regr_out;
-   wire [2:0] 	 sel_muxa, sel_muxb;
-   wire 	 sel_muxr;   
-   wire 	 load_rega, load_regb, load_regc, load_regd, load_regr;
+   logic [63:0] 	q1, qm1, qp1, q0, qm0, qp0;
+   logic [63:0] 	rega_out, regb_out, regc_out, regd_out;
+   logic [127:0]  regr_out;
+   logic [2:0] 	sel_muxa, sel_muxb;
+   logic 	      sel_muxr;   
+   logic 	      load_rega, load_regb, load_regc, load_regd, load_regr;

-   wire 	 donev, sel_muxrv, sel_muxsv;
-   wire [1:0] 	 sel_muxav, sel_muxbv;   
-   wire 	 load_regav, load_regbv, load_regcv;
-   wire 	 load_regrv, load_regs;
-   logic exp_cout1, exp_cout2;
-   logic exp_odd, open;
-   // logic reset;
+   logic 	      load_regs;
+   logic          exp_cout1, exp_cout2;
+   logic          exp_odd, open;
   
+   // div/sqrt
+         //  fdiv  = 0
+         //  fsqrt = 1
+
   // Convert the input operands to their appropriate forms based on 
   // the original operands, the op_type , and their precision P. 
   // Single precision inputs are converted to double precision 
   // and the sign of the first operand is set appropriately based on
   // if the operation is absolute value or negation.   
-   convert_inputs_div conv1 (Float1, Float2, op1, op2, op_type, P);
+   convert_inputs_div conv1 (.op1, .op2, .op_type, .P, 
+                           // outputs:
+                           .Float1, .Float2b(Float2));

   // Test for exceptions and return the "Invalid Operation" and
   // "Denormalized" Input Flags. The "sel_inv" is used in
   // the third pipeline stage to select the result. Also, op1_Norm
   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
   // sub is one if the effective operation is subtraction. NOTE(review): the instance below wires only Ztype and Invalid; the Denorm/op*_Norm/sub description here looks stale - confirm.
-   exception_div exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
-		       Float1, Float2, op_type);
+   exception_div exc1 (.A(Float1), .B(Float2), .op_type,
+                     // output:
+                     .Ztype(sel_inv), .Invalid);

   // Determine Sign/Mantissa
   assign signResult = (Float1[63]^Float2[63]);
-   assign mantissaA = {vdd, Float1[51:0]};
-   assign mantissaB = {vdd, Float2[51:0]};
+   assign mantissaA = {1'b1, Float1[51:0]};   // prepend a constant leading one to the 52-bit fraction
+   assign mantissaB = {1'b1, Float2[51:0]};   // prepend a constant leading one to the 52-bit fraction
   // Perform Exponent Subtraction - expA - expB + Bias   
   assign exp1 = {2'b0, Float1[62:52]};
   assign exp2 = {2'b0, Float2[62:52]};
-   // bias : DP = 2^{11-1}-1 = 1023
   assign bias = {3'h0, 10'h3FF};   // double-precision bias: 2^(11-1)-1 = 1023
   // Divide exponent
-   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c);
-   // adder #(14) explogic1 ({vss, exp_s}, {vss, exp_c}, 1'b1, {open, exp_diff}, exp_cout1);
-   assign {exp_cout1, open, exp_diff} = {vss, exp_s} + {vss, exp_c} + 1'b1;
+   assign {exp_cout1, open, exp_diff} = exp1 - exp2 + bias;   // NOTE(review): operands are 13 bits, target is 15 bits; confirm the exp_cout1/open bits still mean what the old csa+adder form produced
   
   // Sqrt exponent (check if exponent is odd)
-   assign exp_odd = Float1[52] ? vss : vdd;
-   // adder #(14) explogic2 ({vss, exp1}, {4'h0, 10'h3ff}, exp_odd, exp_sqrt, exp_cout2);
-   assign {exp_cout2, exp_sqrt} = {vss, exp1} + {4'h0, 10'h3ff} + exp_odd;
+   assign exp_odd = Float1[52] ? 1'b0 : 1'b1;   // 1 when the LSB of the stored (biased) exponent is 0
+   assign {exp_cout2, exp_sqrt} = {1'b0, exp1} + {4'h0, 10'h3ff} + exp_odd;   // exp1 + 1023 + exp_odd
   // Choose correct exponent
   assign expF = op_type ? exp_sqrt[13:1] : exp_diff;   // fsqrt: exp_sqrt with its LSB dropped (halved); fdiv: exp_diff

   // Main Goldschmidt/Division Routine   
-   divconv goldy (q1, qm1, qp1, q0, qm0, qp0, rega_out, regb_out, regc_out, regd_out,
-		  regr_out, mantissaB, mantissaA, sel_muxa, sel_muxb, sel_muxr, 
-		  reset, clk,  load_rega, load_regb, load_regc, load_regd,
-		  load_regr, load_regs, P, op_type, exp_odd);
+   divconv goldy (.q1, .qm1, .qp1, .q0, .qm0, .qp0, .rega_out, .regb_out, .regc_out, .regd_out,
+		  .regr_out, .d(mantissaB), .n(mantissaA), .sel_muxa, .sel_muxb, .sel_muxr, 
+		  .reset, .clk,  .load_rega, .load_regb, .load_regc, .load_regd,
+		  .load_regr, .load_regs, .P, .op_type, .exp_odd);

   // FSM : control divider   
-   fsm control (done, load_rega, load_regb, load_regc, load_regd, 
-		    load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
-		    clk, reset, start, op_type, FDivBusyE, HoldInputs);
+   fsm control (.clk, .reset, .start, .op_type,
+               // outputs:
+               .done, .load_rega, .load_regb, .load_regc, .load_regd, 
+		         .load_regr, .load_regs, .sel_muxa, .sel_muxb, .sel_muxr, 
+		         .divBusy(FDivBusyE));
  
   // Round the mantissa to a 52-bit value, with the leading one
   // removed. The rounding units also handles special cases and 
   // set the exception flags.   
-   rounder_div round1 (Result, DenormIO, FlagsIn, 
-		   rm, P, OvEn, UnEn, expF, 
-   		   sel_inv, Invalid, DenormIn, signResult, 
-		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
+   rounder_div round1 (.rm, .P, .OvEn, .UnEn, .exp_diff(expF), 
+   		            .sel_inv, .Invalid, .SignR(signResult), 
+		               .q1, .qm1, .qp1, .q0, .qm0, .qp0, .regr_out, 
+                     // outputs:
+                     .Result, .Flags(FlagsIn));

   // Store the final result and the exception flags in registers.
-   flopenr #(64) rega (clk, reset, done, Result, AS_Result);
-   flopenr #(1) regb (clk, reset, done, DenormIO, Denorm);   
+   flopenr #(64) rega (clk, reset, done, Result, AS_Result);  
   flopenr #(5) regc (clk, reset, done, FlagsIn, Flags);   
   
 endmodule // fpdiv
--- a/wally-pipelined/src/fpu/fpu.sv
+++ b/wally-pipelined/src/fpu/fpu.sv
@ -25,142 +25,173 @@
 `include "wally-config.vh"

 module fpu (
-  input logic 		   clk,
-  input logic 		   reset,
-  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
-  input logic [31:0] 	   InstrD,
-  input logic [`XLEN-1:0]  ReadDataW, // Read data from memory
-  input logic [`XLEN-1:0]  SrcAE, // Integer input being processed
-  input logic [`XLEN-1:0]  SrcAM, // Integer input being written into fpreg
-  input logic 		   StallE, StallM, StallW,
-  input logic 		   FlushE, FlushM, FlushW,
-  input logic [4:0] 	   RdE, RdM, RdW, 
-  output logic 		   FRegWriteM,
-  output logic 		   FStallD, // Stall the decode stage
-  output logic 		   FWriteIntE, FWriteIntM, FWriteIntW, // Write integer register enable
-  output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
-  output logic [`XLEN-1:0] FIntResM, 
-  output logic 		   FDivBusyE, // Is the divison/sqrt unit busy
-  output logic 		   IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
-  output logic [4:0] 	   SetFflagsM);      // FPU result
-// *** change FMA to do 16 - 32 - 64 - 128 FEXPBITS 
-// *** folder at same level of src for tests fpu tests
-// qa.b
-// u1.52 - u sunsigned, q signed
-  generate
-     if (`F_SUPPORTED | `D_SUPPORTED) begin 
-      // control logic signal instantiation
-	logic 		   FRegWriteD, FRegWriteE, FRegWriteW;                 // FP register write enable
-	logic [2:0] 	   FrmD, FrmE, FrmM;                                   // FP rounding mode
-	logic 		   FmtD, FmtE, FmtM, FmtW;                             // FP precision 0-single 1-double
-	logic 		   FDivStartD, FDivStartE;                             // Start division
-	logic 		   FWriteIntD;                                         // Write to integer register
-	logic [1:0] 	   FForwardXE, FForwardYE, FForwardZE;                 // Input3 forwarding mux control signal
-	logic [2:0] 	   FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select FP result
-	logic [3:0] 	   FOpCtrlD, FOpCtrlE, FOpCtrlM;                       // Select which opperation to do in each component
-	logic [1:0] 	   FResSelD, FResSelE, FResSelM;  
-	logic [1:0] 	   FIntResSelD, FIntResSelE, FIntResSelM;                                   
-	logic [4:0] 	   Adr1E, Adr2E, Adr3E;
+  input logic 		          clk,
+  input logic 		          reset,
+  input logic  [2:0] 	      FRM_REGW, // Rounding mode from CSR
+  input logic  [31:0]       InstrD,   // instruction from IFU
+  input logic  [`XLEN-1:0]  ReadDataW,// Read data from memory
+  input logic  [`XLEN-1:0]  SrcAE,    // Integer input being processed (from IEU)
+  input logic  [`XLEN-1:0]  SrcAM,    // Integer input being written into fpreg (from IEU)
+  input logic 		          StallE, StallM, StallW, // stall signals from HZU
+  input logic 		          FlushE, FlushM, FlushW, // flush signals from HZU
+  input logic  [4:0] 	      RdE, RdM, RdW,  // which FP register to write to (from IEU)
+  output logic 		          FRegWriteM,     // FP register write enable
+  output logic 		          FStallD,        // Stall the decode stage
+  output logic 		          FWriteIntE, FWriteIntM, FWriteIntW, // integer register write enable
+  output logic [`XLEN-1:0]  FWriteDataE,  // Data to be written to memory
+  output logic [`XLEN-1:0]  FIntResM,     // data to be written to integer register
+  output logic 		          FDivBusyE,    // Is the divide/sqrt unit busy (stall execute stage)
+  output logic 		          IllegalFPUInstrD, // Is the instruction an illegal fpu instruction
+  output logic [4:0] 	      SetFflagsM        // FMA flags (to privileged unit)
+  );
+
+  //*** make everything FLEN at some point
+  //*** add the 128 bit support to the if statement when needed
+  //*** make new tests for fp using testfloat that include flag checking and all rounding modes
+  //*** what is the format for 16-bit - finding conflicting info online can't find anything specified in spec
+  //*** only fma/mul and fp <-> int convert flags have been tested. test the others.
+
+  // FPU specifics:
+  //    - uses NaN-boxing format
+  //        - if there are any unused bits the most significant bits are filled with 1s
+  //                single stored in a double: | 32 1s | single precision value |
+  //    - sets the underflow after rounding
+  
+  generate if (`F_SUPPORTED | `D_SUPPORTED) begin 
+
+  // control signals
+	logic 		  FRegWriteD, FRegWriteE, FRegWriteW; // FP register write enable
+	logic [2:0] FrmD, FrmE, FrmM;                   // FP rounding mode
+	logic 		  FmtD, FmtE, FmtM, FmtW;             // FP precision 0-single 1-double
+	logic 		  FDivStartD, FDivStartE;             // Start division or squareroot
+	logic 		  FWriteIntD;                         // Write to integer register
+	logic [1:0] FForwardXE, FForwardYE, FForwardZE; // forwarding mux control signals
+	logic [2:0] FResultSelD, FResultSelE, FResultSelM, FResultSelW; // Select the result written to FP register
+	logic [3:0] FOpCtrlD, FOpCtrlE, FOpCtrlM;           // Select which opperation to do in each component
+	logic [1:0] FResSelD, FResSelE, FResSelM;           // Select one of the results that finish in the memory stage
+	logic [1:0] FIntResSelD, FIntResSelE, FIntResSelM;  // Select the result written to the integer resister
+	logic [4:0] Adr1E, Adr2E, Adr3E;                    // adresses of each input
 	
 	// regfile signals
-	logic [63:0] 	   FRD1D, FRD2D, FRD3D;                                // Read Data from FP register - decode stage
-	logic [63:0] 	   FRD1E, FRD2E, FRD3E;                                // Read Data from FP register - execute stage
-	logic [`XLEN-1:0]  FSrcXMAligned;
-	logic [63:0] 	   FSrcXE, FSrcXM;                                     // Input 1 to the various units (after forwarding)
-	logic [63:0] 	   FSrcYE;                                             // Input 2 to the various units (after forwarding)
-	logic [63:0] 	   FPreSrcZE, FSrcZE;                                             // Input 3 to the various units (after forwarding)
+	logic [63:0] 	    FRD1D, FRD2D, FRD3D;  // Read Data from FP register - decode stage
+	logic [63:0] 	    FRD1E, FRD2E, FRD3E;  // Read Data from FP register - execute stage
+	logic [63:0] 	    FSrcXE, FSrcXM;       // Input 1 to the various units (after forwarding)
+	logic [63:0] 	    FSrcYE;               // Input 2 to the various units (after forwarding)
+	logic [63:0] 	    FPreSrcZE, FSrcZE;     // Input 3 to the various units (after forwarding)
 	
 	// unpacking signals
-	logic 		   XSgnE, YSgnE, ZSgnE;
-	logic [10:0] 	   XExpE, YExpE, ZExpE;
-	logic [52:0] 	   XManE, YManE, ZManE;
-	logic 		   XNaNE, YNaNE, ZNaNE;
-	logic 		   XSNaNE, YSNaNE, ZSNaNE;
-	logic 		   XDenormE, YDenormE, ZDenormE;
-	logic 		   XZeroE, YZeroE, ZZeroE;
-	logic [10:0] 	   BiasE;
-	logic 		   XInfE, YInfE, ZInfE;
-	logic 		   XExpMaxE;
-	logic 		   XNormE;
+	logic 		   XSgnE, YSgnE, ZSgnE;     // input's sign - execute stage
+	logic 		   XSgnM, YSgnM, ZSgnM;     // input's sign - memory stage
+	logic [10:0] XExpE, YExpE, ZExpE;     // input's exponent - execute stage
+	logic [10:0] XExpM, YExpM, ZExpM;     // input's exponent - memory stage
+	logic [52:0] XManE, YManE, ZManE;  // input's fraction - execute stage
+	logic [52:0] XManM, YManM, ZManM;  // input's fraction - memory stage
+	logic [10:0] BiasE;                   // bias based on precision (single=7f double=3ff - max expoent/2)
+	logic 		   XNaNE, YNaNE, ZNaNE;           // is the input a NaN - execute stage
+	logic 		   XNaNM, YNaNM, ZNaNM;           // is the input a NaN - memory stage
+	logic 		   XSNaNE, YSNaNE, ZSNaNE;        // is the input a signaling NaN - execute stage
+	logic 		   XSNaNM, YSNaNM, ZSNaNM;        // is the input a signaling NaN - memory stage
+	logic 		   XDenormE, YDenormE, ZDenormE;  // is the input denormalized
+	logic 		   XZeroE, YZeroE, ZZeroE;        // is the input zero - execute stage
+	logic 		   XZeroM, YZeroM, ZZeroM;        // is the input zero - memory stage
+	logic 		   XInfE, YInfE, ZInfE;           // is the input infinity - execute stage
+	logic 		   XInfM, YInfM, ZInfM;           // is the input infinity - memory stage
+	logic 		   XExpMaxE;                      // is the exponent all ones (max value)
+	logic 		   XNormE;                        // is X normal
 	
-	logic 		   XSgnM, YSgnM, ZSgnM;
-	logic [10:0] 	   XExpM, YExpM, ZExpM;
-	logic [52:0] 	   XManM, YManM, ZManM;
-	logic 		   XNaNM, YNaNM, ZNaNM;
-	logic 		   XSNaNM, YSNaNM, ZSNaNM;
-	logic 		   XZeroM, YZeroM, ZZeroM;
-	logic 		   XInfM, YInfM, ZInfM;
 	
-	// div/sqrt signals
-	logic [63:0] 	   FDivResultM, FDivResultW;
-	logic [4:0] 	   FDivSqrtFlgM, FDivSqrtFlgW;
-	logic 		   FDivSqrtDoneE;
-	logic [63:0] 	   DivInput1E, DivInput2E;
-	logic 		   HoldInputs;                                              // keep forwarded inputs arround durring division
+	// result and flag signals
+	logic [63:0]  FDivResM, FDivResW; // divide/squareroot result
+	logic [4:0] 	FDivFlgM, FDivFlgW; // divide/squareroot flags
+  
+	logic [63:0]  FMAResM, FMAResW;   // FMA/multiply result
+	logic [4:0] 	FMAFlgM, FMAFlgW;   // FMA/multiply flags
 	
-	//fpu signals
-	logic [63:0] 	   FMAResM, FMAResW;
-	logic [4:0] 	   FMAFlgM, FMAFlgW;
+	logic [63:0] 	ReadResW;           // read result (load instruction)
+
+	logic [63:0] 	FAddResM, FAddResW; // add/FP -> FP convert result
+	logic [4:0] 	FAddFlgM, FAddFlgW; // add/FP -> FP convert flags
+
+	logic [63:0] 	CvtResE, CvtResM;   // FP <-> int convert result
+	logic [4:0] 	CvtFlgE, CvtFlgM;   // FP <-> int convert flags //*** trim this
 	
-	logic [63:0] 	   ReadResW;
-	
-	// add/cvt signals
-	logic [63:0] 	   FAddResM, FAddResW;
-	logic [4:0] 	   FAddFlgM, FAddFlgW;  
-	logic [63:0] 	   CvtResE, CvtResM;
-	logic [4:0] 	   CvtFlgE, CvtFlgM;  
-	
-	// cmp signals 
-	logic 		   CmpNVE, CmpNVM, CmpNVW;
-	logic [63:0] 	   CmpResE, CmpResM, CmpResW;
-	
-	// fsgn signals
-	logic [63:0] 	   SgnResE, SgnResM;
-	logic 		   SgnNVE, SgnNVM, SgnNVW;
-	logic [63:0] 	   FResM, FResW;
-	logic [4:0] 	   FFlgM, FFlgW;
-	
-	// instantiation of W stage regfile signals
-	logic [63:0] 	   AlignedSrcAM;
-	
-	// classify signals
-	logic [63:0] 	   ClassResE, ClassResM;
-	
-	// 64-bit FPU result   
-	logic [63:0] 	   FPUResultW;                                           
-	logic [4:0] 	   FPUFlagsW;
+	logic [63:0] 	ClassResE, ClassResM; // classify result
+
+	logic [63:0] 	CmpResE, CmpResM; // compare result
+	logic 		    CmpNVE, CmpNVM;   // compare invalid flag (Not Valid)
 	
+	logic [63:0] 	SgnResE, SgnResM; // sign injection result
+	logic 		    SgnNVE, SgnNVM;   // sign injection invalid flag (Not Valid)
+
+	logic [63:0] 	FResM, FResW;     // selected result that is ready in the memory stage
+	logic [4:0] 	FFlgM;            // selected flag that is ready in the memory stage
+
+	logic [63:0] 	   FPUResultW;    // final FP result being written to the FP register
+		
+	// other signals
+	logic 		    FDivSqrtDoneE;          // is divide done
+	logic [63:0] 	DivInput1E, DivInput2E; // inputs to divide/squareroot unit
+	logic 		    FDivClk;                // clock for divide/squareroot unit
+	logic [63:0] 	AlignedSrcAM;           // align SrcA to the floating point format
+
+
+
+
+
+  ////////////////////////////////////////////////////////////////////////////////////////
 	//DECODE STAGE
+	////////////////////////////////////////////////////////////////////////////////////////
+
+
+
+	// calculate FP control signals
+	fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), .FRM_REGW,
+              // outputs:
+              .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
+              .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
 	
-	// top-level controller for FPU
-	fctrl fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
-                     .FRM_REGW, .IllegalFPUInstrD, .FRegWriteD, .FDivStartD, .FResultSelD, .FOpCtrlD, .FResSelD, 
-                     .FIntResSelD, .FmtD, .FrmD, .FWriteIntD);
+	// FP register file
+  //    - can read 3 registers and write 1 register every cycle
+	fregfile fregfile (.clk, .reset, .we4(FRegWriteW),
+			   .a1(InstrD[19:15]), .a2(InstrD[24:20]), .a3(InstrD[31:27]), .a4(RdW), 
+         .wd4(FPUResultW),
+         // outputs:
+			   .rd1(FRD1D), .rd2(FRD2D), .rd3(FRD3D));	
 	
-	// regfile instantiation
-	fregfile fregfile (clk, reset, FRegWriteW,
-			   InstrD[19:15], InstrD[24:20], InstrD[31:27], RdW,
-			   FPUResultW,
-			   FRD1D, FRD2D, FRD3D);	
-	
-	//*****************
-	// D/E pipe registers
-	//*****************
+
+
+
+
+	////////////////////////////////////////////////////////////////////////////////////////
+	// D/E pipeline registers
+	////////////////////////////////////////////////////////////////////////////////////////
+
 	flopenrc #(64) DEReg1(clk, reset, FlushE, ~StallE, FRD1D, FRD1E);
 	flopenrc #(64) DEReg2(clk, reset, FlushE, ~StallE, FRD2D, FRD2E);
 	flopenrc #(64) DEReg3(clk, reset, FlushE, ~StallE, FRD3D, FRD3E);
-	flopenrc #(1) DECtrlRegE1(clk, reset, FlushE, ~StallE, FDivStartD, FDivStartE);
-	flopenrc #(15) DECtrlRegE2(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
-                                   {Adr1E,         Adr2E,         Adr3E});
-	flopenrc #(17) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-				  {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD},
-				  {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE});
+	flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {InstrD[19:15], InstrD[24:20], InstrD[31:27]}, 
+                                                       {Adr1E,         Adr2E,         Adr3E});
+	flopenrc #(18) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+				  {FRegWriteD, FResultSelD, FResSelD, FIntResSelD, FrmD, FmtD, FOpCtrlD, FWriteIntD, FDivStartD},
+				  {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE, FDivStartE});
 	
+
+
+
+
+
+  
+	////////////////////////////////////////////////////////////////////////////////////////
 	//EXECUTION STAGE
+	////////////////////////////////////////////////////////////////////////////////////////
+
+
+	// Hazard unit for FPU  
+  //    - determines if any forwarding or stalls are needed
+	fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, 
+                  // outputs:
+                  .FStallD, .FForwardXE, .FForwardYE, .FForwardZE);
 	
-	// Hazard unit for FPU
-	fhazard fhazard(.Adr1E, .Adr2E, .Adr3E, .FRegWriteM, .FRegWriteW, .RdM, .RdW, .FResultSelM, .FStallD, 
-                        .FForwardXE, .FForwardYE, .FForwardZE);

 	// forwarding muxs
 	mux3  #(64)  fxemux(FRD1E, FPUResultW, FResM, FForwardXE, FSrcXE);
@ -168,128 +199,190 @@ module fpu (
 	mux3  #(64)  fzemux(FRD3E, FPUResultW, FResM, FForwardZE, FPreSrcZE);
 	mux2  #(64)  fzmulmux(FPreSrcZE, 64'b0, FOpCtrlE[2], FSrcZE); // Force Z to be 0 for multiply instructions
 	
-	unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), 
-			    .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, .XSgnE, .YSgnE, 
-			    .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
-			    .XNaNE, .YNaNE, .ZNaNE, 
-			    .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
-			    .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
-      // first of two-stage instance of floating-point fused multiply-add unit
+   
+  // unpacking unit
+  //    - splits FP inputs into their various parts
+  //    - does some classifications (SNaN, NaN, Denorm, Norm, Zero, Infinity)
+	unpacking unpacking(.X(FSrcXE), .Y(FSrcYE), .Z(FSrcZE), .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, 
+                      // outputs:
+                      .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+                      .XNaNE, .YNaNE, .ZNaNE, .XSNaNE, .YSNaNE, .ZSNaNE, .XDenormE, .YDenormE, .ZDenormE, 
+                      .XZeroE, .YZeroE, .ZZeroE, .BiasE, .XInfE, .YInfE, .ZInfE, .XExpMaxE, .XNormE);
+
+  // FMA
+  //    - two stage FMA
+  //        - execute stage - multiplication and addend shifting
+  //        - memory stage  - addition and rounding
+  //    - handles FMA and multiply instructions
+  //    - contains some E/M pipeline registers
+  // *** currently handles FLEN and 32 bits(dont know if 32 works with 128 - easy to fix) - change to handle only the supported formats
 	fma fma (.clk, .reset, .FlushM, .StallM, 
-		 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .
-		 ZManE, .XDenormE, .YDenormE, 
-		 .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
-		 .XSgnM, .YSgnM, .ZSgnM, .XExpM, .YExpM, .ZExpM, .XManM, 
-		 .YManM, .ZManM, .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
-		 //  .FSrcXE, .FSrcYE, .FSrcZE, .FSrcXM, .FSrcYM, .FSrcZM, 
+		 .XSgnE, .YSgnE, .ZSgnE, .XExpE, .YExpE, .ZExpE, .XManE, .YManE, .ZManE, 
+     .XDenormE, .YDenormE, .ZDenormE, .XZeroE, .YZeroE, .ZZeroE, .BiasE, 
+		 .XSgnM, .YSgnM, .ZSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
+     .XNaNM, .YNaNM, .ZNaNM, .XZeroM, .YZeroM, .ZZeroM, 
+     .XInfM, .YInfM, .ZInfM, .XSNaNM, .YSNaNM, .ZSNaNM,
 		 .FOpCtrlE(FOpCtrlE[2:0]), .FOpCtrlM(FOpCtrlM[2:0]), 
-		 .FmtE, .FmtM, .FrmM, .FMAFlgM, .FMAResM);
-	
-	// first and only instance of floating-point divider
-	logic 		   fpdivClk;
+		 .FmtE, .FmtM, .FrmM, 
+     // outputs:
+     .FMAFlgM, .FMAResM);
 	
+	// clock gater
+  //    - creates a clock that only runs during divide/sqrt instructions
+  //    - using the separate clock gives the divide/sqrt unit some time to get set up
+  // *** the module says not to use in synthesis
 	clockgater fpdivclkg(.E(FDivStartE),
 			     .SE(1'b0),
 			     .CLK(clk),
-			     .ECLK(fpdivClk));
+			     .ECLK(FDivClk));
 	
-	// capture the inputs for div/sqrt	 
+	// capture the inputs for divide/sqrt
+  //    - if not captured, any forwarded inputs will change during computation
+  //        - this problem is caused by stalling the execute stage
+  //    - the other units don't have this problem, only div/sqrt stalls the execute stage
 	flopenrc #(64) reg_input1 (.d(FSrcXE), .q(DivInput1E),
 				   .en(1'b1), .clear(FDivSqrtDoneE),
-				   .reset(reset),  .clk(HoldInputs));
+				   .reset(reset),  .clk(FDivBusyE));
 	flopenrc #(64) reg_input2 (.d(FSrcYE), .q(DivInput2E),
 				   .en(1'b1), .clear(FDivSqrtDoneE),
-				   .reset(reset),  .clk(HoldInputs));
-	//*** add round to nearest ties to max magnitude
-	fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .done(FDivSqrtDoneE), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
-			.P(~FmtE), .FDivBusyE, .HoldInputs, 
-			.OvEn(1'b1), .UnEn(1'b1),
-			.start(FDivStartE), .reset, .clk(fpdivClk), .AS_Result(FDivResultM), .Flags(FDivSqrtFlgM));
-	
-        // .DivOpType(FOpCtrlE[0]), .clk(fpdivClk), .FmtE(~FmtE), .DivInput1E, .DivInput2E, 
-        //                 .FrmE, .DivOvEn(1'b1), .DivUnEn(1'b1), .FDivStartE, .FDivResultM, .FDivSqrtFlgM, 
-        //                 .FDivSqrtDoneE, .FDivBusyE, .HoldInputs, .reset);
-	// assign FDivBusyE = 0;
-	
-	// first of two-stage instance of floating-point add/cvt unit
-	faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM,
-                         .FSrcXE, .FSrcYE, .FOpCtrlE, .FAddResM, .FAddFlgM);
-	
-	// first and only instance of floating-point comparator
-	fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), .FSrcXE, 
-		   .FSrcYE, .FOpCtrlE(FOpCtrlE[2:0]), .FmtE, 
-		   .Invalid(CmpNVE), .CmpResE, .XNaNE, .YNaNE, .XZeroE, .YZeroE);
-	
-	// first and only instance of floating-point sign converter
-	fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .SgnResE, .SgnNVE, .XExpMaxE);
-
-	// first and only instance of floating-point classify unit
-	fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, .XSNaNE, .ClassResE);
-	
-	fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE, .CvtResE, .CvtFlgE);
+				   .reset(reset),  .clk(FDivBusyE));
 	
 	// output for store instructions
+  //*** change to use the unpacking unit if possible
+	fpdiv fdivsqrt (.op1(DivInput1E), .op2(DivInput2E), .rm(FrmE[1:0]), .op_type(FOpCtrlE[0]), 
+			             .reset, .clk(FDivClk), .start(FDivStartE), .P(~FmtE), .OvEn(1'b1), .UnEn(1'b1),
+                   // outputs:
+			             .FDivBusyE, .done(FDivSqrtDoneE), .AS_Result(FDivResM), .Flags(FDivFlgM));
+	
+	
+	// add/FP <-> FP convert
+  //    - computation is done in two stages
+  //    - contains some E/M pipeline registers
+  //*** remove unneeded logic
+  //*** change to use the unpacking unit if possible
+	faddcvt faddcvt (.clk, .reset, .FlushM, .StallM, .FrmM, .FOpCtrlM, .FmtE, .FmtM, .FSrcXE, .FSrcYE, .FOpCtrlE, 
+                  // outputs:
+                  .FAddResM, .FAddFlgM);
+	
+	// compare unit
+  //    - computation is done in one stage
+  //    - writes to FP file during min/max instructions
+  //    - other comparisons write a 1 or 0 to the integer register
+	fcmp fcmp (.op1({XSgnE,XExpE,XManE[`NF-1:0]}), .op2({YSgnE,YExpE,YManE[`NF-1:0]}), 
+            .FSrcXE, .FSrcYE, .FOpCtrlE(FOpCtrlE[2:0]), 
+            .FmtE, .XNaNE, .YNaNE, .XZeroE, .YZeroE, 
+            // outputs:
+		        .Invalid(CmpNVE), .CmpResE);
+	
+	// sign injection unit
+  //    - computation is done in one stage
+	fsgn fsgn (.SgnOpCodeE(FOpCtrlE[1:0]), .XSgnE, .YSgnE, .FSrcXE, .FmtE, .XExpMaxE,
+            // outputs:
+            .SgnNVE, .SgnResE);
+	
+	// classify
+  //    - computation is done in one stage
+  //    - most of the work is done in the unpacking unit
+  //    - result is written to the integer register
+	fclassify fclassify (.XSgnE, .XDenormE, .XZeroE, .XNaNE, .XInfE, .XNormE, 
+                      // outputs:
+                      .XSNaNE, .ClassResE);
+	
+	fcvt fcvt (.XSgnE, .XExpE, .XManE, .XZeroE, .XNaNE, .XInfE, .XDenormE, .BiasE, .SrcAE, .FOpCtrlE, .FmtE, .FrmE,
+            // outputs: 
+            .CvtResE, .CvtFlgE);
+	
+	// data to be stored in memory - to IEU
+  //    - FP uses NaN-boxing format
+  //        - if there are any unused bits the most significant bits are filled with 1s
 	assign FWriteDataE = FSrcYE[`XLEN-1:0];
 	
-	//*****************
+
+
+
+
+  //***will synth remove registers of values that are always zero?
+	////////////////////////////////////////////////////////////////////////////////////////
 	// E/M pipe registers
-	//*****************
+	////////////////////////////////////////////////////////////////////////////////////////
+
 	flopenrc #(64) EMFpReg1(clk, reset, FlushM, ~StallM, FSrcXE, FSrcXM);
-	// flopenrc #(64) EMFpReg2(clk, reset, FlushM, ~StallM, FSrcYE, FSrcYM);
-	// flopenrc #(64) EMFpReg3(clk, reset, FlushM, ~StallM, FSrcZE, FSrcZM);
-	flopenrc #(65) EMFpReg4(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
-	flopenrc #(65) EMFpReg5(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
-	flopenrc #(65) EMFpReg6(clk, reset, FlushM, ~StallM, {ZSgnE,ZExpE,ZManE}, {ZSgnM,ZExpM,ZManM});
-	flopenrc #(12) EMFpReg7(clk, reset, FlushM, ~StallM, 
+	flopenrc #(65) EMFpReg2(clk, reset, FlushM, ~StallM, {XSgnE,XExpE,XManE}, {XSgnM,XExpM,XManM});
+	flopenrc #(65) EMFpReg3(clk, reset, FlushM, ~StallM, {YSgnE,YExpE,YManE}, {YSgnM,YExpM,YManM});
+	flopenrc #(65) EMFpReg4(clk, reset, FlushM, ~StallM, {ZSgnE,ZExpE,ZManE}, {ZSgnM,ZExpM,ZManM});
+	flopenrc #(12) EMFpReg5(clk, reset, FlushM, ~StallM, 
 				{XZeroE, YZeroE, ZZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
 				{XZeroM, YZeroM, ZZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});
 	
-	flopenrc #(1)  EMRegCmp1(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
-	flopenrc #(64) EMRegCmp2(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
+	flopenrc #(64) EMRegCmpRes(clk, reset, FlushM, ~StallM, CmpResE, CmpResM); 
+	flopenrc #(1)  EMRegCmpFlg(clk, reset, FlushM, ~StallM, CmpNVE, CmpNVM); 
 	
-	flopenrc #(64) EMRegSgn1(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
-	flopenrc #(1) EMRegSgn2(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
+	flopenrc #(64) EMRegSgnRes(clk, reset, FlushM, ~StallM, SgnResE, SgnResM);
+	flopenrc #(1) EMRegSgnFlg(clk, reset, FlushM, ~StallM, SgnNVE, SgnNVM);
 	
-	flopenrc #(64) EMRegCvt1(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
-	flopenrc #(5) EMRegCvt2(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
+	flopenrc #(64) EMRegCvtRes(clk, reset, FlushM, ~StallM, CvtResE, CvtResM);
+	flopenrc #(5) EMRegCvtFlg(clk, reset, FlushM, ~StallM, CvtFlgE, CvtFlgM);
+  
+	flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
 	
 	flopenrc #(17) EMCtrlReg(clk, reset, FlushM, ~StallM,
 				 {FRegWriteE, FResultSelE, FResSelE, FIntResSelE, FrmE, FmtE, FOpCtrlE, FWriteIntE},
 				 {FRegWriteM, FResultSelM, FResSelM, FIntResSelM, FrmM, FmtM, FOpCtrlM, FWriteIntM});
 	
-	flopenrc #(64) EMRegClass(clk, reset, FlushM, ~StallM, ClassResE, ClassResM);
 	
+
+
+
+
+	////////////////////////////////////////////////////////////////////////////////////////
 	//BEGIN MEMORY STAGE
-	mux4  #(64)  FResMux(AlignedSrcAM, SgnResM, CmpResM, CvtResM, FResSelM, FResM);
-	mux4  #(5)  FFlgMux(5'b0, {4'b0, SgnNVM}, {4'b0, CmpNVM}, CvtFlgM, FResSelM, FFlgM);
-	
-	// mux2  #(`XLEN)  FSrcXAlignedMux({{`XLEN-32{1'b0}}, FSrcXM[63:32]}, FSrcXM[63:64-`XLEN], FmtM, FSrcXMAligned);
-	mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], FSrcXM[`XLEN-1:0], ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);
-	
+	////////////////////////////////////////////////////////////////////////////////////////
+
 	// Align SrcA to MSB when single precicion
 	mux2  #(64)  SrcAMux({{32{1'b1}}, SrcAM[31:0]}, {{64-`XLEN{1'b1}}, SrcAM}, FmtM, AlignedSrcAM);
-	mux5  #(5)  FPUFlgMux(5'b0, FMAFlgM, FAddFlgM, FDivSqrtFlgM, FFlgM, FResultSelW, SetFflagsM);
+
+  // select a result that may be written to the FP register
+	mux4  #(64) FResMux(AlignedSrcAM, SgnResM, CmpResM, CvtResM, FResSelM, FResM);
+	mux4  #(5)  FFlgMux(5'b0, {4'b0, SgnNVM}, {4'b0, CmpNVM}, CvtFlgM, FResSelM, FFlgM);
 	
-	//*****************
+  // select the result that may be written to the integer register - to IEU
+	mux4  #(`XLEN)  IntResMux(CmpResM[`XLEN-1:0], FSrcXM[`XLEN-1:0], ClassResM[`XLEN-1:0], CvtResM[`XLEN-1:0], FIntResSelM, FIntResM);
+	
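+  // FPU flag selection - to privileged (NOTE(review): M-stage flags are selected with the W-stage FResultSelW rather than FResultSelM - confirm this is intended)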
+	mux5  #(5)  FPUFlgMux(5'b0, FMAFlgM, FAddFlgM, FDivFlgM, FFlgM, FResultSelW, SetFflagsM);
+	
+
+
+
+  
+	////////////////////////////////////////////////////////////////////////////////////////
 	// M/W pipe registers
-	//*****************
-	flopenrc #(64) MWRegFma1(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
-	flopenrc #(64) MWRegDiv1(clk, reset, FlushW, ~StallW, FDivResultM, FDivResultW); 
-	flopenrc #(64) MWRegAdd1(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
-	flopenrc #(64) MWRegCmp3(clk, reset, FlushW, ~StallW, CmpResM, CmpResW);
-	flopenrc #(64) MWRegClass2(clk, reset, FlushW, ~StallW, FResM, FResW);
-	flopenrc #(6) MWCtrlReg(clk, reset, FlushW, ~StallW,
+	////////////////////////////////////////////////////////////////////////////////////////
+	flopenrc #(64) MWRegFma(clk, reset, FlushW, ~StallW, FMAResM, FMAResW); 
+	flopenrc #(64) MWRegDiv(clk, reset, FlushW, ~StallW, FDivResM, FDivResW); 
+	flopenrc #(64) MWRegAdd(clk, reset, FlushW, ~StallW, FAddResM, FAddResW); 
+	flopenrc #(64) MWRegClass(clk, reset, FlushW, ~StallW, FResM, FResW);
+	flopenrc #(6)  MWCtrlReg(clk, reset, FlushW, ~StallW,
 				{FRegWriteM, FResultSelM, FmtM, FWriteIntM},
 				{FRegWriteW, FResultSelW, FmtW, FWriteIntW});
 	
-	//#########################################
+
+
+
+	////////////////////////////////////////////////////////////////////////////////////////
 	// BEGIN WRITEBACK STAGE
-	//#########################################
+	////////////////////////////////////////////////////////////////////////////////////////
+
+  // put ReadData into NaN-boxing format
+  //    - if there are any unused bits, the most significant bits are filled with 1s
+  //    - for load instruction
 	mux2  #(64)  ReadResMux({{32{1'b1}}, ReadDataW[31:0]}, {{64-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
-	mux5  #(64)  FPUResultMux(ReadResW, FMAResW, FAddResW, FDivResultW, FResW, FResultSelW, FPUResultW);
+
+  // select the result to be written to the FP register
+	mux5  #(64)  FPUResultMux(ReadResW, FMAResW, FAddResW, FDivResW, FResW, FResultSelW, FPUResultW);
 	
 	
-     end else begin // no F_SUPPORTED; tie outputs low
+  end else begin // no F_SUPPORTED or D_SUPPORTED; tie outputs low
 	assign FStallD = 0;
 	assign FWriteIntE = 0; 
 	assign FWriteIntM = 0;
@ -299,7 +392,7 @@ module fpu (
 	assign FDivBusyE = 0;
 	assign IllegalFPUInstrD = 1;
 	assign SetFflagsM = 0;
-     end
+  end
  endgenerate 
   
 endmodule // fpu
--- a/wally-pipelined/src/fpu/fregfile.sv
+++ b/wally-pipelined/src/fpu/fregfile.sv
@ -26,10 +26,10 @@
 `include "wally-config.vh"

 module fregfile (
-  input  logic             clk, reset,
-  input  logic             we4, 
-  input  logic [ 4:0]      a1, a2, a3, a4, 
-  input  logic [63:0] wd4,    //KEP `XLEN-1 changed to 63 (lint warning) *** figure out if double can be suported when XLEN = 32
+  input  logic        clk, reset,
+  input  logic        we4, 
+  input  logic [ 4:0] a1, a2, a3, a4, 
+  input  logic [63:0] wd4,
  output logic [63:0] rd1, rd2, rd3);

  logic [63:0] rf[31:0];
--- a/wally-pipelined/src/fpu/fsgn.sv
+++ b/wally-pipelined/src/fpu/fsgn.sv
@ -1,15 +1,15 @@
 //performs the fsgnj/fsgnjn/fsgnjx RISCV instructions

 module fsgn (  
-	input logic        XSgnE, YSgnE,
-	input logic [63:0] FSrcXE,
-	input logic XExpMaxE,
-	input logic FmtE,
-	input  logic [1:0]   SgnOpCodeE,
-	output logic [63:0]  SgnResE,
-	output logic   SgnNVE);
+	input logic        	XSgnE, YSgnE,	// X and Y sign bits
+	input logic [63:0] 	FSrcXE,			// X
+	input logic			XExpMaxE,		// max possible exponent (all ones)
+	input logic 		FmtE,			// precision 1 = double 0 = single
+	input  logic [1:0]  SgnOpCodeE,		// operation control
+	output logic [63:0] SgnResE,		// result
+	output logic   		SgnNVE			// invalid flag
+	);

-	logic AonesExp;
 	logic ResSgn;

 	//op code designation:
@ -19,7 +19,12 @@ module fsgn (
 	//10 - fsgnjx - XOR sign values of FSrcXE & FSrcYE
 	//
 	
+	// calculate the result's sign
 	assign ResSgn = SgnOpCodeE[1] ? (XSgnE ^ YSgnE) : (YSgnE ^ SgnOpCodeE[0]);
+	
+	// format final result based on precision
+	//    - uses NaN-boxing format
+	//        - if there are any unused bits, the most significant bits are filled with 1s
 	assign SgnResE = FmtE ? {ResSgn, FSrcXE[62:0]} : {FSrcXE[63:32], ResSgn, FSrcXE[30:0]};

 	//If the exponent is all ones, then the value is either Inf or NaN,
--- a/wally-pipelined/src/fpu/fsm.sv
+++ b/wally-pipelined/src/fpu/fsm.sv
@ -1,37 +1,22 @@
-module fsm (done, load_rega, load_regb, load_regc, 
-	    load_regd, load_regr, load_regs,
-	    sel_muxa, sel_muxb, sel_muxr, 
-	    clk, reset, start, op_type, divBusy, holdInputs);
+module fsm (

-   input 	clk;
-   input 	reset;
-   input 	start;
-   //    input 	error;
-   input  	op_type;
-   //***can use divbusy insted of holdinputs
-   output       done;      
-   output       load_rega;
-   output       load_regb;
-   output       load_regc;
-   output 	load_regd;   
-   output 	load_regr;
-   output 	load_regs;
-   
-   output [2:0] sel_muxa;
-   output [2:0] sel_muxb;
-   output 	sel_muxr;
-   output logic	divBusy,holdInputs;
+   input logic 			clk,
+   input logic 			reset,
+   input logic 			start,
+   input logic  		op_type,
+   output logic 		done,      // End of cycles
+   output logic 		load_rega, // enable for regA
+   output logic 		load_regb, // enable for regB
+   output logic 		load_regc, // enable for regC
+   output logic 		load_regd, // enable for regD
+   output logic 		load_regr, // enable for rem
+   output logic 		load_regs, // enable for q,qm,qp 
+   output logic [2:0] 	sel_muxa,  // Select muxA
+   output logic [2:0] 	sel_muxb,  // Select muxB
+   output logic 		sel_muxr,  // Select rem mux
+   output logic			divBusy	   // calculation is happening
+   );

-   reg 		done;      // End of cycles
-   reg 		load_rega; // enable for regA
-   reg 		load_regb; // enable for regB
-   reg 		load_regc; // enable for regC
-   reg 		load_regd; // enable for regD
-   reg 		load_regr; // enable for rem
-   reg 		load_regs; // enable for q,qm,qp   
-   reg [2:0] 	sel_muxa;  // Select muxA
-   reg [2:0] 	sel_muxb;  // Select muxB
-   reg 		sel_muxr;  // Select rem mux

   reg [4:0] 	CURRENT_STATE;
   reg [4:0] 	NEXT_STATE;   
@ -65,7 +50,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 		 begin
 		    done = 1'b0;
 		    divBusy = 1'b0;	
-		    holdInputs = 1'b0;	
 		    load_rega = 1'b0;
 		    load_regb = 1'b0;
 		    load_regc = 1'b0;
@ -81,7 +65,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 		 begin
 		    done = 1'b0;
 		    divBusy = 1'b1;	
-		    holdInputs = 1'b1;
 		    load_rega = 1'b0;
 		    load_regb = 1'b1;
 		    load_regc = 1'b0;
@ -97,7 +80,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 		 begin
 		    done = 1'b0;
 		    divBusy = 1'b1;
-		    holdInputs = 1'b1;
 		    load_rega = 1'b0;
 		    load_regb = 1'b1;
 		    load_regc = 1'b0;
@ -114,7 +96,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -130,7 +111,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -146,7 +126,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -162,7 +141,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -178,7 +156,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -194,7 +171,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -210,7 +186,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -226,7 +201,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -242,7 +216,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -258,7 +231,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b1;
 	       divBusy = 1'b0;
-	       holdInputs = 1'b0;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -274,7 +246,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -290,7 +261,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -306,7 +276,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -322,7 +291,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -338,7 +306,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -354,7 +321,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -370,7 +336,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -386,7 +351,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -402,7 +366,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b1;
 	       load_regc = 1'b0;
@ -418,7 +381,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -434,7 +396,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b1;
 	       load_regb = 1'b0;
 	       load_regc = 1'b1;
@ -450,7 +411,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -466,7 +426,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b1;
-	       holdInputs = 1'b1;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -482,7 +441,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b1;
 	       divBusy = 1'b0;
-	       holdInputs = 1'b0;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
@ -498,7 +456,6 @@ module fsm (done, load_rega, load_regb, load_regc,
 	    begin
 	       done = 1'b0;
 	       divBusy = 1'b0;
-	       holdInputs = 1'b0;
 	       load_rega = 1'b0;
 	       load_regb = 1'b0;
 	       load_regc = 1'b0;
--- a/wally-pipelined/src/fpu/fsm_div.v
+++ b/wally-pipelined/src/fpu/fsm_div.v
@ -1,461 +0,0 @@
-
-// `timescale 1ps/1ps
-// module fsm_div (done, load_rega, load_regb, load_regc, 
-// 		load_regd, load_regr, load_regs,
-// 		sel_muxa, sel_muxb, sel_muxr, 
-// 		clk, reset, start, error, op_type);
-
-//    input 	clk;
-//    input 	reset;
-//    input 	start;
-//    input 	error;
-//    input  	op_type;
-   
-//    output       done;      
-//    output       load_rega;
-//    output       load_regb;
-//    output       load_regc;
-//    output 	load_regd;   
-//    output 	load_regr;
-//    output 	load_regs;
-   
-//    output [2:0] sel_muxa;
-//    output [2:0] sel_muxb;
-//    output 	sel_muxr;
-
-//    reg 		done;      // End of cycles
-//    reg 		load_rega; // enable for regA
-//    reg 		load_regb; // enable for regB
-//    reg 		load_regc; // enable for regC
-//    reg 		load_regd; // enable for regD
-//    reg 		load_regr; // enable for rem
-//    reg 		load_regs; // enable for q,qm,qp   
-//    reg [2:0] 	sel_muxa;  // Select muxA
-//    reg [2:0] 	sel_muxb;  // Select muxB
-//    reg 		sel_muxr;  // Select rem mux
-
-//    reg [4:0] 	CURRENT_STATE;
-//    reg [4:0] 	NEXT_STATE;   
-
-//    parameter [4:0] 
-//      S0=5'd0, S1=5'd1, S2=5'd2,
-//      S3=5'd3, S4=5'd4, S5=5'd5,
-//      S6=5'd6, S7=5'd7, S8=5'd8,
-//      S9=5'd9, S10=5'd10,
-//      S13=5'd13, S14=5'd14, S15=5'd15,     
-//      S16=5'd16, S17=5'd17, S18=5'd18,
-//      S19=5'd19, S20=5'd20, S21=5'd21,
-//      S22=5'd22, S23=5'd23, S24=5'd24,
-//      S25=5'd25, S26=5'd26, S27=5'd27,
-//      S28=5'd28, S29=5'd29, S30=5'd30;
-   
-//    always @(posedge clk)
-//      begin
-// 	if(reset==1'b1)
-// 	  CURRENT_STATE<=S0;
-// 	else
-// 	  CURRENT_STATE<=NEXT_STATE;
-//      end
-
-//    always @(*)
-//      begin
-//  	case(CURRENT_STATE)
-// 	  S0:  // iteration 0
-// 	    begin
-// 	       if (start==1'b0)
-// 		 begin
-// 		    done = 1'b0;
-// 		    load_rega = 1'b0;
-// 		    load_regb = 1'b0;
-// 		    load_regc = 1'b0;
-// 		    load_regd = 1'b0;
-// 		    load_regr = 1'b0;
-// 		    load_regs = 1'b0;
-// 		    sel_muxa = 3'b000;
-// 		    sel_muxb = 3'b000;
-// 		    sel_muxr = 1'b0;
-// 		    NEXT_STATE <= S0;
-// 		 end 
-// 	       else if (start==1'b1 && op_type==1'b0) 
-// 		 begin
-// 		    done = 1'b0;
-// 		    load_rega = 1'b0;
-// 		    load_regb = 1'b1;
-// 		    load_regc = 1'b0;
-// 		    load_regd = 1'b0;		    
-// 		    load_regr = 1'b0;
-// 		    load_regs = 1'b0;		    		    
-// 		    sel_muxa = 3'b001;
-// 		    sel_muxb = 3'b001;		    
-// 		    sel_muxr = 1'b0;
-// 		    NEXT_STATE <= S1;
-// 		 end // if (start==1'b1 && op_type==1'b0)
-// 	       else if (start==1'b1 && op_type==1'b1) 
-// 		 begin
-// 		    done = 1'b0;
-// 		    load_rega = 1'b0;
-// 		    load_regb = 1'b1;
-// 		    load_regc = 1'b0;
-// 		    load_regd = 1'b0;		    
-// 		    load_regr = 1'b0;
-// 		    load_regs = 1'b0;		    		    
-// 		    sel_muxa = 3'b010;
-// 		    sel_muxb = 3'b000;		    
-// 		    sel_muxr = 1'b0;
-// 		    NEXT_STATE <= S13;
-// 		 end 	       
-// 	    end // case: S0
-// 	  S1:
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b1;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b1;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b010;
-// 	       sel_muxb = 3'b000;		    
-// 	       sel_muxr = 1'b0;	
-// 	       NEXT_STATE <= S2;
-// 	    end	  
-// 	  S2: // iteration 1
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b1;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b011;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S3;
-// 	    end
-// 	  S3:
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b1;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b1;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b010;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S4;
-// 	    end
-// 	  S4: // iteration 2
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b1;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b011;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S5;
-// 	    end
-// 	  S5:
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b1;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b1;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b010;
-// 	       sel_muxr = 1'b0;  // add
-// 	       NEXT_STATE <= S6;
-// 	    end
-// 	  S6: // iteration 3
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b1;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;
-// 	       sel_muxa = 3'b011;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S8;
-// 	    end
-// 	  S7:
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b1;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b1;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b010;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S8;
-// 	    end // case: S7
-// 	  S8: // q,qm,qp
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b1;		    	       
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b000;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S9;
-// 	    end 
-// 	  S9:  // rem
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b1;
-// 	       load_regs = 1'b0;  
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b000;
-// 	       sel_muxr = 1'b1;
-// 	       NEXT_STATE <= S10;
-// 	    end 	  
-// 	  S10:  // done
-// 	    begin
-// 	       done = 1'b1;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b000;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S0;
-// 	    end 
-// 	  S13:  // start of sqrt path
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b1;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;
-// 	       sel_muxa = 3'b010;
-// 	       sel_muxb = 3'b001;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S14;
-// 	    end
-// 	  S14:  
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b1;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b1;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b001;
-// 	       sel_muxb = 3'b100;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S15;
-// 	    end 
-// 	  S15:  // iteration 1
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b1;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b011;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S16;
-// 	    end
-// 	  S16:  
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b1;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S17;
-// 	    end
-// 	  S17:  
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b1;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b1;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b100;
-// 	       sel_muxb = 3'b010;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S18;
-// 	    end
-// 	  S18:  // iteration 2
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b1;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b011;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S19;
-// 	    end
-// 	  S19:  
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b1;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S20;
-// 	    end
-// 	  S20:  
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b1;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b1;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b100;
-// 	       sel_muxb = 3'b010;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S21;
-// 	    end
-// 	  S21:  // iteration 3
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b1;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b011;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S22;
-// 	    end
-// 	  S22:  
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b1;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b011;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S23;
-// 	    end
-// 	  S23:  
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b1;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b1;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b100;
-// 	       sel_muxb = 3'b010;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S24;
-// 	    end 
-// 	  S24: // q,qm,qp
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b1;		    	       
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b000;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S25;
-// 	    end 	  
-// 	  S25:  // rem
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b1;
-// 	       load_regs = 1'b0;  
-// 	       sel_muxa = 3'b011;
-// 	       sel_muxb = 3'b110;
-// 	       sel_muxr = 1'b1;
-// 	       NEXT_STATE <= S26;
-// 	    end 	  
-// 	  S26:  // done
-// 	    begin
-// 	       done = 1'b1;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b000;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S0;
-// 	    end 
-// 	  default: 
-// 	    begin
-// 	       done = 1'b0;
-// 	       load_rega = 1'b0;
-// 	       load_regb = 1'b0;
-// 	       load_regc = 1'b0;
-// 	       load_regd = 1'b0;	       
-// 	       load_regr = 1'b0;
-// 	       load_regs = 1'b0;		    	       
-// 	       sel_muxa = 3'b000;
-// 	       sel_muxb = 3'b000;
-// 	       sel_muxr = 1'b0;
-// 	       NEXT_STATE <= S0;
-// 	    end
-// 	endcase // case(CURRENT_STATE)	
-//      end // always @ (CURRENT_STATE or X)   
-
-// endmodule // fsm
--- a/wally-pipelined/src/fpu/ldf128.sv
+++ b/wally-pipelined/src/fpu/ldf128.sv
@ -1,593 +0,0 @@
-// Ladner-Fischer Prefix Adder
-
-// ldf128: 128-bit adder (a + b + cin) built around a Ladner-Fischer prefix tree.
-//   cout : carry out of the most significant bit
-//   sum  : 128-bit sum
-//   a, b : 128-bit operands
-//   cin  : carry into bit 0
-module ldf128 (
-   output         cout,
-   output [127:0] sum,
-   input  [127:0] a, b,
-   input          cin
-);
-
-   // Index 0 of both vectors is the carry-in slot, so operand bit i lives at
-   // index i+1: the carry-in is modelled as a generate with propagate 0.
-   wire [128:0]   gen, prop;
-   // carry[i] is driven by prefix-tree output c[i+1], the carry into operand bit i.
-   wire [127:0]   carry;
-
-   // Per-bit generate / propagate
-   assign gen  = {a & b, cin};
-   assign prop = {a ^ b, 1'b0};
-
-   // Carry network over the carry-in slot and operand bits 0..126
-   ladner_fischer128 prefix_tree (.c(carry), .p(prop[127:0]), .g(gen[127:0]));
-
-   // The top operand bit is resolved here rather than inside the tree
-   assign cout = (prop[128] & carry[127]) | gen[128];
-   assign sum  = carry ^ prop[128:1];
-
-endmodule
-
-// ladner_fischer128: carry network of a 128-bit parallel-prefix (Ladner-Fischer) adder.
-//   p : per-bit propagate inputs (p[0] is never referenced inside this module)
-//   g : per-bit generate inputs (the ldf128 wrapper places its carry-in at g[0])
-//   c : carries, c[k+1] = G_k_0, the group generate over bits k..0
-// Naming: G_i_j / P_i_j are the group generate / propagate over bits i..j.
-// The `grey` and `black` cells are defined elsewhere. From the connections below,
-// grey appears to take (G out, {G hi, G lo}, P hi) and black
-// (G out, P out, {G hi, G lo}, {P hi, P lo}) -- TODO confirm against the cell definitions.
-module ladner_fischer128 (c, p, g);
-   
-   input [127:0] p;
-   input [127:0] g;
-   
-   output [128:1] c;
-   
-   
-  logic G_1_0, G_3_2, P_3_2, G_5_4, P_5_4, G_7_6, P_7_6, G_9_8, P_9_8, G_11_10, P_11_10, G_13_12
-      , P_13_12, G_15_14, P_15_14, G_17_16, P_17_16, G_19_18, P_19_18, G_21_20, P_21_20, G_23_22
-      , P_23_22, G_25_24, P_25_24, G_27_26, P_27_26, G_29_28, P_29_28, G_31_30, P_31_30, G_33_32
-      , P_33_32, G_35_34, P_35_34, G_37_36, P_37_36, G_39_38, P_39_38, G_41_40, P_41_40, G_43_42
-      , P_43_42, G_45_44, P_45_44, G_47_46, P_47_46, G_49_48, P_49_48, G_51_50, P_51_50, G_53_52
-      , P_53_52, G_55_54, P_55_54, G_57_56, P_57_56, G_59_58, P_59_58, G_61_60, P_61_60, G_63_62
-      , P_63_62, G_65_64, P_65_64, G_67_66, P_67_66, G_69_68, P_69_68, G_71_70, P_71_70, G_73_72
-      , P_73_72, G_75_74, P_75_74, G_77_76, P_77_76, G_79_78, P_79_78, G_81_80, P_81_80, G_83_82
-      , P_83_82, G_85_84, P_85_84, G_87_86, P_87_86, G_89_88, P_89_88, G_91_90, P_91_90, G_93_92
-      , P_93_92, G_95_94, P_95_94, G_97_96, P_97_96, G_99_98, P_99_98, G_101_100, P_101_100, G_103_102
-      , P_103_102, G_105_104, P_105_104, G_107_106, P_107_106, G_109_108, P_109_108, G_111_110, P_111_110
-      , G_113_112, P_113_112, G_115_114, P_115_114, G_117_116, P_117_116, G_119_118, P_119_118, G_121_120
-      , P_121_120, G_123_122, P_123_122, G_125_124, P_125_124, G_127_126, P_127_126, G_3_0, G_7_4, P_7_4
-      , G_11_8, P_11_8, G_15_12, P_15_12, G_19_16, P_19_16, G_23_20, P_23_20, G_27_24, P_27_24, G_31_28
-      , P_31_28, G_35_32, P_35_32, G_39_36, P_39_36, G_43_40, P_43_40, G_47_44, P_47_44, G_51_48, P_51_48
-      , G_55_52, P_55_52, G_59_56, P_59_56, G_63_60, P_63_60, G_67_64, P_67_64, G_71_68, P_71_68, G_75_72
-      , P_75_72, G_79_76, P_79_76, G_83_80, P_83_80, G_87_84, P_87_84, G_91_88, P_91_88, G_95_92, P_95_92
-      , G_99_96, P_99_96, G_103_100, P_103_100, G_107_104, P_107_104, G_111_108, P_111_108, G_115_112
-      , P_115_112, G_119_116, P_119_116, G_123_120, P_123_120, G_127_124, P_127_124, G_5_0, G_7_0, G_13_8
-      , P_13_8, G_15_8, P_15_8, G_21_16, P_21_16, G_23_16, P_23_16, G_29_24, P_29_24, G_31_24, P_31_24
-      , G_37_32, P_37_32, G_39_32, P_39_32, G_45_40, P_45_40, G_47_40, P_47_40, G_53_48, P_53_48, G_55_48
-      , P_55_48, G_61_56, P_61_56, G_63_56, P_63_56, G_69_64, P_69_64, G_71_64, P_71_64, G_77_72, P_77_72
-      , G_79_72, P_79_72, G_85_80, P_85_80, G_87_80, P_87_80, G_93_88, P_93_88, G_95_88, P_95_88, G_101_96
-      , P_101_96, G_103_96, P_103_96, G_109_104, P_109_104, G_111_104, P_111_104, G_117_112, P_117_112
-      , G_119_112, P_119_112, G_125_120, P_125_120, G_127_120, P_127_120, G_9_0, G_11_0, G_13_0, G_15_0, G_25_16
-      , P_25_16, G_27_16, P_27_16, G_29_16, P_29_16, G_31_16, P_31_16, G_41_32, P_41_32, G_43_32, P_43_32, G_45_32
-      , P_45_32, G_47_32, P_47_32, G_57_48, P_57_48, G_59_48, P_59_48, G_61_48, P_61_48, G_63_48, P_63_48, G_73_64
-      , P_73_64, G_75_64, P_75_64, G_77_64, P_77_64, G_79_64, P_79_64, G_89_80, P_89_80, G_91_80, P_91_80
-      , G_93_80, P_93_80, G_95_80, P_95_80, G_105_96, P_105_96, G_107_96, P_107_96, G_109_96, P_109_96
-      , G_111_96, P_111_96, G_121_112, P_121_112, G_123_112, P_123_112, G_125_112, P_125_112, G_127_112
-      , P_127_112, G_17_0, G_19_0, G_21_0, G_23_0, G_25_0, G_27_0, G_29_0, G_31_0, G_49_32, P_49_32, G_51_32
-      , P_51_32, G_53_32, P_53_32, G_55_32, P_55_32, G_57_32, P_57_32, G_59_32, P_59_32, G_61_32, P_61_32
-      , G_63_32, P_63_32, G_81_64, P_81_64, G_83_64, P_83_64, G_85_64, P_85_64, G_87_64, P_87_64, G_89_64, P_89_64
-      , G_91_64, P_91_64, G_93_64, P_93_64, G_95_64, P_95_64, G_113_96, P_113_96, G_115_96, P_115_96
-      , G_117_96, P_117_96, G_119_96, P_119_96, G_121_96, P_121_96, G_123_96, P_123_96, G_125_96, P_125_96
-      , G_127_96, P_127_96, G_33_0, G_35_0, G_37_0, G_39_0, G_41_0, G_43_0, G_45_0, G_47_0, G_49_0, G_51_0
-      , G_53_0, G_55_0, G_57_0, G_59_0, G_61_0, G_63_0, G_97_64, P_97_64, G_99_64, P_99_64, G_101_64, P_101_64
-      , G_103_64, P_103_64, G_105_64, P_105_64, G_107_64, P_107_64, G_109_64, P_109_64, G_111_64, P_111_64
-      , G_113_64, P_113_64, G_115_64, P_115_64, G_117_64, P_117_64, G_119_64, P_119_64, G_121_64, P_121_64
-      , G_123_64, P_123_64, G_125_64, P_125_64, G_127_64, P_127_64, G_65_0, G_67_0, G_69_0, G_71_0, G_73_0
-      , G_75_0, G_77_0, G_79_0, G_81_0, G_83_0, G_85_0, G_87_0, G_89_0, G_91_0, G_93_0, G_95_0, G_97_0
-      , G_99_0, G_101_0, G_103_0, G_105_0, G_107_0, G_109_0, G_111_0, G_113_0, G_115_0, G_117_0, G_119_0
-      , G_121_0, G_123_0, G_125_0, G_127_0, G_2_0, G_4_0, G_6_0, G_8_0, G_10_0, G_12_0, G_14_0, G_16_0
-      , G_18_0, G_20_0, G_22_0, G_24_0, G_26_0, G_28_0, G_30_0, G_32_0, G_34_0, G_36_0, G_38_0, G_40_0
-      , G_42_0, G_44_0, G_46_0, G_48_0, G_50_0, G_52_0, G_54_0, G_56_0, G_58_0, G_60_0, G_62_0, G_64_0
-      , G_66_0, G_68_0, G_70_0, G_72_0, G_74_0, G_76_0, G_78_0, G_80_0, G_82_0, G_84_0, G_86_0, G_88_0
-      , G_90_0, G_92_0, G_94_0, G_96_0, G_98_0, G_100_0, G_102_0, G_104_0, G_106_0, G_108_0, G_110_0, G_112_0
-      , G_114_0, G_116_0, G_118_0, G_120_0, G_122_0, G_124_0, G_126_0;
-
-   // parallel-prefix, Ladner-Fischer
-
-   // Stage 1: pair adjacent bits into 2-bit groups (G_1_0, G_3_2, ...).
-   // The group anchored at bit 0 needs no propagate output, hence the grey cell.
-   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-   black b_15_14 (G_15_14, P_15_14, {g[15],g[14]}, {p[15],p[14]});
-
-   black b_17_16 (G_17_16, P_17_16, {g[17],g[16]}, {p[17],p[16]});
-   black b_19_18 (G_19_18, P_19_18, {g[19],g[18]}, {p[19],p[18]});
-   black b_21_20 (G_21_20, P_21_20, {g[21],g[20]}, {p[21],p[20]});
-   black b_23_22 (G_23_22, P_23_22, {g[23],g[22]}, {p[23],p[22]});
-   black b_25_24 (G_25_24, P_25_24, {g[25],g[24]}, {p[25],p[24]});
-   black b_27_26 (G_27_26, P_27_26, {g[27],g[26]}, {p[27],p[26]});
-   black b_29_28 (G_29_28, P_29_28, {g[29],g[28]}, {p[29],p[28]});
-   black b_31_30 (G_31_30, P_31_30, {g[31],g[30]}, {p[31],p[30]});
-
-   black b_33_32 (G_33_32, P_33_32, {g[33],g[32]}, {p[33],p[32]});
-   black b_35_34 (G_35_34, P_35_34, {g[35],g[34]}, {p[35],p[34]});
-   black b_37_36 (G_37_36, P_37_36, {g[37],g[36]}, {p[37],p[36]});
-   black b_39_38 (G_39_38, P_39_38, {g[39],g[38]}, {p[39],p[38]});
-   black b_41_40 (G_41_40, P_41_40, {g[41],g[40]}, {p[41],p[40]});
-   black b_43_42 (G_43_42, P_43_42, {g[43],g[42]}, {p[43],p[42]});
-   black b_45_44 (G_45_44, P_45_44, {g[45],g[44]}, {p[45],p[44]});
-   black b_47_46 (G_47_46, P_47_46, {g[47],g[46]}, {p[47],p[46]});
-
-   black b_49_48 (G_49_48, P_49_48, {g[49],g[48]}, {p[49],p[48]});
-   black b_51_50 (G_51_50, P_51_50, {g[51],g[50]}, {p[51],p[50]});
-   black b_53_52 (G_53_52, P_53_52, {g[53],g[52]}, {p[53],p[52]});
-   black b_55_54 (G_55_54, P_55_54, {g[55],g[54]}, {p[55],p[54]});
-   black b_57_56 (G_57_56, P_57_56, {g[57],g[56]}, {p[57],p[56]});
-   black b_59_58 (G_59_58, P_59_58, {g[59],g[58]}, {p[59],p[58]});
-   black b_61_60 (G_61_60, P_61_60, {g[61],g[60]}, {p[61],p[60]});
-   black b_63_62 (G_63_62, P_63_62, {g[63],g[62]}, {p[63],p[62]});
-
-   black b_65_64 (G_65_64, P_65_64, {g[65],g[64]}, {p[65],p[64]});
-   black b_67_66 (G_67_66, P_67_66, {g[67],g[66]}, {p[67],p[66]});
-   black b_69_68 (G_69_68, P_69_68, {g[69],g[68]}, {p[69],p[68]});
-   black b_71_70 (G_71_70, P_71_70, {g[71],g[70]}, {p[71],p[70]});
-   black b_73_72 (G_73_72, P_73_72, {g[73],g[72]}, {p[73],p[72]});
-   black b_75_74 (G_75_74, P_75_74, {g[75],g[74]}, {p[75],p[74]});
-   black b_77_76 (G_77_76, P_77_76, {g[77],g[76]}, {p[77],p[76]});
-   black b_79_78 (G_79_78, P_79_78, {g[79],g[78]}, {p[79],p[78]});
-
-   black b_81_80 (G_81_80, P_81_80, {g[81],g[80]}, {p[81],p[80]});
-   black b_83_82 (G_83_82, P_83_82, {g[83],g[82]}, {p[83],p[82]});
-   black b_85_84 (G_85_84, P_85_84, {g[85],g[84]}, {p[85],p[84]});
-   black b_87_86 (G_87_86, P_87_86, {g[87],g[86]}, {p[87],p[86]});
-   black b_89_88 (G_89_88, P_89_88, {g[89],g[88]}, {p[89],p[88]});
-   black b_91_90 (G_91_90, P_91_90, {g[91],g[90]}, {p[91],p[90]});
-   black b_93_92 (G_93_92, P_93_92, {g[93],g[92]}, {p[93],p[92]});
-   black b_95_94 (G_95_94, P_95_94, {g[95],g[94]}, {p[95],p[94]});
-
-   black b_97_96 (G_97_96, P_97_96, {g[97],g[96]}, {p[97],p[96]});
-   black b_99_98 (G_99_98, P_99_98, {g[99],g[98]}, {p[99],p[98]});
-   black b_101_100 (G_101_100, P_101_100, {g[101],g[100]}, {p[101],p[100]});
-   black b_103_102 (G_103_102, P_103_102, {g[103],g[102]}, {p[103],p[102]});
-   black b_105_104 (G_105_104, P_105_104, {g[105],g[104]}, {p[105],p[104]});
-   black b_107_106 (G_107_106, P_107_106, {g[107],g[106]}, {p[107],p[106]});
-   black b_109_108 (G_109_108, P_109_108, {g[109],g[108]}, {p[109],p[108]});
-   black b_111_110 (G_111_110, P_111_110, {g[111],g[110]}, {p[111],p[110]});
-
-   black b_113_112 (G_113_112, P_113_112, {g[113],g[112]}, {p[113],p[112]});
-   black b_115_114 (G_115_114, P_115_114, {g[115],g[114]}, {p[115],p[114]});
-   black b_117_116 (G_117_116, P_117_116, {g[117],g[116]}, {p[117],p[116]});
-   black b_119_118 (G_119_118, P_119_118, {g[119],g[118]}, {p[119],p[118]});
-   black b_121_120 (G_121_120, P_121_120, {g[121],g[120]}, {p[121],p[120]});
-   black b_123_122 (G_123_122, P_123_122, {g[123],g[122]}, {p[123],p[122]});
-   black b_125_124 (G_125_124, P_125_124, {g[125],g[124]}, {p[125],p[124]});
-   black b_127_126 (G_127_126, P_127_126, {g[127],g[126]}, {p[127],p[126]});
-
-
-   // Stage 2: merge neighbouring 2-bit groups into 4-bit groups (G_3_0, G_7_4, ...)
-   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-   black b_15_12 (G_15_12, P_15_12, {G_15_14,G_13_12}, {P_15_14,P_13_12});
-   black b_19_16 (G_19_16, P_19_16, {G_19_18,G_17_16}, {P_19_18,P_17_16});
-   black b_23_20 (G_23_20, P_23_20, {G_23_22,G_21_20}, {P_23_22,P_21_20});
-   black b_27_24 (G_27_24, P_27_24, {G_27_26,G_25_24}, {P_27_26,P_25_24});
-   black b_31_28 (G_31_28, P_31_28, {G_31_30,G_29_28}, {P_31_30,P_29_28});
-
-   black b_35_32 (G_35_32, P_35_32, {G_35_34,G_33_32}, {P_35_34,P_33_32});
-   black b_39_36 (G_39_36, P_39_36, {G_39_38,G_37_36}, {P_39_38,P_37_36});
-   black b_43_40 (G_43_40, P_43_40, {G_43_42,G_41_40}, {P_43_42,P_41_40});
-   black b_47_44 (G_47_44, P_47_44, {G_47_46,G_45_44}, {P_47_46,P_45_44});
-   black b_51_48 (G_51_48, P_51_48, {G_51_50,G_49_48}, {P_51_50,P_49_48});
-   black b_55_52 (G_55_52, P_55_52, {G_55_54,G_53_52}, {P_55_54,P_53_52});
-   black b_59_56 (G_59_56, P_59_56, {G_59_58,G_57_56}, {P_59_58,P_57_56});
-   black b_63_60 (G_63_60, P_63_60, {G_63_62,G_61_60}, {P_63_62,P_61_60});
-
-   black b_67_64 (G_67_64, P_67_64, {G_67_66,G_65_64}, {P_67_66,P_65_64});
-   black b_71_68 (G_71_68, P_71_68, {G_71_70,G_69_68}, {P_71_70,P_69_68});
-   black b_75_72 (G_75_72, P_75_72, {G_75_74,G_73_72}, {P_75_74,P_73_72});
-   black b_79_76 (G_79_76, P_79_76, {G_79_78,G_77_76}, {P_79_78,P_77_76});
-   black b_83_80 (G_83_80, P_83_80, {G_83_82,G_81_80}, {P_83_82,P_81_80});
-   black b_87_84 (G_87_84, P_87_84, {G_87_86,G_85_84}, {P_87_86,P_85_84});
-   black b_91_88 (G_91_88, P_91_88, {G_91_90,G_89_88}, {P_91_90,P_89_88});
-   black b_95_92 (G_95_92, P_95_92, {G_95_94,G_93_92}, {P_95_94,P_93_92});
-
-   black b_99_96 (G_99_96, P_99_96, {G_99_98,G_97_96}, {P_99_98,P_97_96});
-   black b_103_100 (G_103_100, P_103_100, {G_103_102,G_101_100}, {P_103_102,P_101_100});
-   black b_107_104 (G_107_104, P_107_104, {G_107_106,G_105_104}, {P_107_106,P_105_104});
-   black b_111_108 (G_111_108, P_111_108, {G_111_110,G_109_108}, {P_111_110,P_109_108});
-   black b_115_112 (G_115_112, P_115_112, {G_115_114,G_113_112}, {P_115_114,P_113_112});
-   black b_119_116 (G_119_116, P_119_116, {G_119_118,G_117_116}, {P_119_118,P_117_116});
-   black b_123_120 (G_123_120, P_123_120, {G_123_122,G_121_120}, {P_123_122,P_121_120});
-   black b_127_124 (G_127_124, P_127_124, {G_127_126,G_125_124}, {P_127_126,P_125_124});
-
-   // Stage 3: within each 8-bit block, extend the upper-half groups down to the
-   // block base using the lower 4-bit group (G_5_0, G_7_0, G_13_8, G_15_8, ...)
-   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-   black b_13_8 (G_13_8, P_13_8, {G_13_12,G_11_8}, {P_13_12,P_11_8});
-   black b_15_8 (G_15_8, P_15_8, {G_15_12,G_11_8}, {P_15_12,P_11_8});
-   black b_21_16 (G_21_16, P_21_16, {G_21_20,G_19_16}, {P_21_20,P_19_16});
-   black b_23_16 (G_23_16, P_23_16, {G_23_20,G_19_16}, {P_23_20,P_19_16});
-   black b_29_24 (G_29_24, P_29_24, {G_29_28,G_27_24}, {P_29_28,P_27_24});
-   black b_31_24 (G_31_24, P_31_24, {G_31_28,G_27_24}, {P_31_28,P_27_24});
-
-   black b_37_32 (G_37_32, P_37_32, {G_37_36,G_35_32}, {P_37_36,P_35_32});
-   black b_39_32 (G_39_32, P_39_32, {G_39_36,G_35_32}, {P_39_36,P_35_32});
-   black b_45_40 (G_45_40, P_45_40, {G_45_44,G_43_40}, {P_45_44,P_43_40});
-   black b_47_40 (G_47_40, P_47_40, {G_47_44,G_43_40}, {P_47_44,P_43_40});
-   black b_53_48 (G_53_48, P_53_48, {G_53_52,G_51_48}, {P_53_52,P_51_48});
-   black b_55_48 (G_55_48, P_55_48, {G_55_52,G_51_48}, {P_55_52,P_51_48});
-   black b_61_56 (G_61_56, P_61_56, {G_61_60,G_59_56}, {P_61_60,P_59_56});
-   black b_63_56 (G_63_56, P_63_56, {G_63_60,G_59_56}, {P_63_60,P_59_56});
-
-   black b_69_64 (G_69_64, P_69_64, {G_69_68,G_67_64}, {P_69_68,P_67_64});
-   black b_71_64 (G_71_64, P_71_64, {G_71_68,G_67_64}, {P_71_68,P_67_64});
-   black b_77_72 (G_77_72, P_77_72, {G_77_76,G_75_72}, {P_77_76,P_75_72});
-   black b_79_72 (G_79_72, P_79_72, {G_79_76,G_75_72}, {P_79_76,P_75_72});
-   black b_85_80 (G_85_80, P_85_80, {G_85_84,G_83_80}, {P_85_84,P_83_80});
-   black b_87_80 (G_87_80, P_87_80, {G_87_84,G_83_80}, {P_87_84,P_83_80});
-   black b_93_88 (G_93_88, P_93_88, {G_93_92,G_91_88}, {P_93_92,P_91_88});
-   black b_95_88 (G_95_88, P_95_88, {G_95_92,G_91_88}, {P_95_92,P_91_88});
-
-   black b_101_96 (G_101_96, P_101_96, {G_101_100,G_99_96}, {P_101_100,P_99_96});
-   black b_103_96 (G_103_96, P_103_96, {G_103_100,G_99_96}, {P_103_100,P_99_96});
-   black b_109_104 (G_109_104, P_109_104, {G_109_108,G_107_104}, {P_109_108,P_107_104});
-   black b_111_104 (G_111_104, P_111_104, {G_111_108,G_107_104}, {P_111_108,P_107_104});
-   black b_117_112 (G_117_112, P_117_112, {G_117_116,G_115_112}, {P_117_116,P_115_112});
-   black b_119_112 (G_119_112, P_119_112, {G_119_116,G_115_112}, {P_119_116,P_115_112});
-   black b_125_120 (G_125_120, P_125_120, {G_125_124,G_123_120}, {P_125_124,P_123_120});
-   black b_127_120 (G_127_120, P_127_120, {G_127_124,G_123_120}, {P_127_124,P_123_120});
-
-   // Stage 4: within each 16-bit block, extend the upper-half groups down to the
-   // block base using the lower 8-bit group (G_9_0 .. G_15_0, G_25_16 .. G_31_16, ...)
-   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-   grey g_13_0 (G_13_0, {G_13_8,G_7_0}, P_13_8);
-   grey g_15_0 (G_15_0, {G_15_8,G_7_0}, P_15_8);
-   black b_25_16 (G_25_16, P_25_16, {G_25_24,G_23_16}, {P_25_24,P_23_16});
-   black b_27_16 (G_27_16, P_27_16, {G_27_24,G_23_16}, {P_27_24,P_23_16});
-   black b_29_16 (G_29_16, P_29_16, {G_29_24,G_23_16}, {P_29_24,P_23_16});
-   black b_31_16 (G_31_16, P_31_16, {G_31_24,G_23_16}, {P_31_24,P_23_16});
-
-   black b_41_32 (G_41_32, P_41_32, {G_41_40,G_39_32}, {P_41_40,P_39_32});
-   black b_43_32 (G_43_32, P_43_32, {G_43_40,G_39_32}, {P_43_40,P_39_32});
-   black b_45_32 (G_45_32, P_45_32, {G_45_40,G_39_32}, {P_45_40,P_39_32});
-   black b_47_32 (G_47_32, P_47_32, {G_47_40,G_39_32}, {P_47_40,P_39_32});
-   black b_57_48 (G_57_48, P_57_48, {G_57_56,G_55_48}, {P_57_56,P_55_48});
-   black b_59_48 (G_59_48, P_59_48, {G_59_56,G_55_48}, {P_59_56,P_55_48});
-   black b_61_48 (G_61_48, P_61_48, {G_61_56,G_55_48}, {P_61_56,P_55_48});
-   black b_63_48 (G_63_48, P_63_48, {G_63_56,G_55_48}, {P_63_56,P_55_48});
-
-   black b_73_64 (G_73_64, P_73_64, {G_73_72,G_71_64}, {P_73_72,P_71_64});
-   black b_75_64 (G_75_64, P_75_64, {G_75_72,G_71_64}, {P_75_72,P_71_64});
-   black b_77_64 (G_77_64, P_77_64, {G_77_72,G_71_64}, {P_77_72,P_71_64});
-   black b_79_64 (G_79_64, P_79_64, {G_79_72,G_71_64}, {P_79_72,P_71_64});
-   black b_89_80 (G_89_80, P_89_80, {G_89_88,G_87_80}, {P_89_88,P_87_80});
-   black b_91_80 (G_91_80, P_91_80, {G_91_88,G_87_80}, {P_91_88,P_87_80});
-   black b_93_80 (G_93_80, P_93_80, {G_93_88,G_87_80}, {P_93_88,P_87_80});
-   black b_95_80 (G_95_80, P_95_80, {G_95_88,G_87_80}, {P_95_88,P_87_80});
-
-   black b_105_96 (G_105_96, P_105_96, {G_105_104,G_103_96}, {P_105_104,P_103_96});
-   black b_107_96 (G_107_96, P_107_96, {G_107_104,G_103_96}, {P_107_104,P_103_96});
-   black b_109_96 (G_109_96, P_109_96, {G_109_104,G_103_96}, {P_109_104,P_103_96});
-   black b_111_96 (G_111_96, P_111_96, {G_111_104,G_103_96}, {P_111_104,P_103_96});
-   black b_121_112 (G_121_112, P_121_112, {G_121_120,G_119_112}, {P_121_120,P_119_112});
-   black b_123_112 (G_123_112, P_123_112, {G_123_120,G_119_112}, {P_123_120,P_119_112});
-   black b_125_112 (G_125_112, P_125_112, {G_125_120,G_119_112}, {P_125_120,P_119_112});
-   black b_127_112 (G_127_112, P_127_112, {G_127_120,G_119_112}, {P_127_120,P_119_112});
-
-   // Stage 5: within each 32-bit block, extend the upper-half groups down to the
-   // block base using the lower 16-bit group (G_17_0 .. G_31_0, G_49_32 .. G_63_32, ...)
-   grey g_17_0 (G_17_0, {G_17_16,G_15_0}, P_17_16);
-   grey g_19_0 (G_19_0, {G_19_16,G_15_0}, P_19_16);
-   grey g_21_0 (G_21_0, {G_21_16,G_15_0}, P_21_16);
-   grey g_23_0 (G_23_0, {G_23_16,G_15_0}, P_23_16);
-   grey g_25_0 (G_25_0, {G_25_16,G_15_0}, P_25_16);
-   grey g_27_0 (G_27_0, {G_27_16,G_15_0}, P_27_16);
-   grey g_29_0 (G_29_0, {G_29_16,G_15_0}, P_29_16);
-   grey g_31_0 (G_31_0, {G_31_16,G_15_0}, P_31_16);
-
-   black b_49_32 (G_49_32, P_49_32, {G_49_48,G_47_32}, {P_49_48,P_47_32});
-   black b_51_32 (G_51_32, P_51_32, {G_51_48,G_47_32}, {P_51_48,P_47_32});
-   black b_53_32 (G_53_32, P_53_32, {G_53_48,G_47_32}, {P_53_48,P_47_32});
-   black b_55_32 (G_55_32, P_55_32, {G_55_48,G_47_32}, {P_55_48,P_47_32});
-   black b_57_32 (G_57_32, P_57_32, {G_57_48,G_47_32}, {P_57_48,P_47_32});
-   black b_59_32 (G_59_32, P_59_32, {G_59_48,G_47_32}, {P_59_48,P_47_32});
-   black b_61_32 (G_61_32, P_61_32, {G_61_48,G_47_32}, {P_61_48,P_47_32});
-   black b_63_32 (G_63_32, P_63_32, {G_63_48,G_47_32}, {P_63_48,P_47_32});
-
-   black b_81_64 (G_81_64, P_81_64, {G_81_80,G_79_64}, {P_81_80,P_79_64});
-   black b_83_64 (G_83_64, P_83_64, {G_83_80,G_79_64}, {P_83_80,P_79_64});
-   black b_85_64 (G_85_64, P_85_64, {G_85_80,G_79_64}, {P_85_80,P_79_64});
-   black b_87_64 (G_87_64, P_87_64, {G_87_80,G_79_64}, {P_87_80,P_79_64});
-   black b_89_64 (G_89_64, P_89_64, {G_89_80,G_79_64}, {P_89_80,P_79_64});
-   black b_91_64 (G_91_64, P_91_64, {G_91_80,G_79_64}, {P_91_80,P_79_64});
-   black b_93_64 (G_93_64, P_93_64, {G_93_80,G_79_64}, {P_93_80,P_79_64});
-   black b_95_64 (G_95_64, P_95_64, {G_95_80,G_79_64}, {P_95_80,P_79_64});
-
-   black b_113_96 (G_113_96, P_113_96, {G_113_112,G_111_96}, {P_113_112,P_111_96});
-   black b_115_96 (G_115_96, P_115_96, {G_115_112,G_111_96}, {P_115_112,P_111_96});
-   black b_117_96 (G_117_96, P_117_96, {G_117_112,G_111_96}, {P_117_112,P_111_96});
-   black b_119_96 (G_119_96, P_119_96, {G_119_112,G_111_96}, {P_119_112,P_111_96});
-   black b_121_96 (G_121_96, P_121_96, {G_121_112,G_111_96}, {P_121_112,P_111_96});
-   black b_123_96 (G_123_96, P_123_96, {G_123_112,G_111_96}, {P_123_112,P_111_96});
-   black b_125_96 (G_125_96, P_125_96, {G_125_112,G_111_96}, {P_125_112,P_111_96});
-   black b_127_96 (G_127_96, P_127_96, {G_127_112,G_111_96}, {P_127_112,P_111_96});
-
-   // Stage 6: within each 64-bit block, extend the upper-half groups down to the
-   // block base using the lower 32-bit group (G_33_0 .. G_63_0, G_97_64 .. G_127_64)
-   grey g_33_0 (G_33_0, {G_33_32,G_31_0}, P_33_32);
-   grey g_35_0 (G_35_0, {G_35_32,G_31_0}, P_35_32);
-   grey g_37_0 (G_37_0, {G_37_32,G_31_0}, P_37_32);
-   grey g_39_0 (G_39_0, {G_39_32,G_31_0}, P_39_32);
-   grey g_41_0 (G_41_0, {G_41_32,G_31_0}, P_41_32);
-   grey g_43_0 (G_43_0, {G_43_32,G_31_0}, P_43_32);
-   grey g_45_0 (G_45_0, {G_45_32,G_31_0}, P_45_32);
-   grey g_47_0 (G_47_0, {G_47_32,G_31_0}, P_47_32);
-
-   grey g_49_0 (G_49_0, {G_49_32,G_31_0}, P_49_32);
-   grey g_51_0 (G_51_0, {G_51_32,G_31_0}, P_51_32);
-   grey g_53_0 (G_53_0, {G_53_32,G_31_0}, P_53_32);
-   grey g_55_0 (G_55_0, {G_55_32,G_31_0}, P_55_32);
-   grey g_57_0 (G_57_0, {G_57_32,G_31_0}, P_57_32);
-   grey g_59_0 (G_59_0, {G_59_32,G_31_0}, P_59_32);
-   grey g_61_0 (G_61_0, {G_61_32,G_31_0}, P_61_32);
-   grey g_63_0 (G_63_0, {G_63_32,G_31_0}, P_63_32);
-
-   black b_97_64 (G_97_64, P_97_64, {G_97_96,G_95_64}, {P_97_96,P_95_64});
-   black b_99_64 (G_99_64, P_99_64, {G_99_96,G_95_64}, {P_99_96,P_95_64});
-   black b_101_64 (G_101_64, P_101_64, {G_101_96,G_95_64}, {P_101_96,P_95_64});
-   black b_103_64 (G_103_64, P_103_64, {G_103_96,G_95_64}, {P_103_96,P_95_64});
-   black b_105_64 (G_105_64, P_105_64, {G_105_96,G_95_64}, {P_105_96,P_95_64});
-   black b_107_64 (G_107_64, P_107_64, {G_107_96,G_95_64}, {P_107_96,P_95_64});
-   black b_109_64 (G_109_64, P_109_64, {G_109_96,G_95_64}, {P_109_96,P_95_64});
-   black b_111_64 (G_111_64, P_111_64, {G_111_96,G_95_64}, {P_111_96,P_95_64});
-
-   black b_113_64 (G_113_64, P_113_64, {G_113_96,G_95_64}, {P_113_96,P_95_64});
-   black b_115_64 (G_115_64, P_115_64, {G_115_96,G_95_64}, {P_115_96,P_95_64});
-   black b_117_64 (G_117_64, P_117_64, {G_117_96,G_95_64}, {P_117_96,P_95_64});
-   black b_119_64 (G_119_64, P_119_64, {G_119_96,G_95_64}, {P_119_96,P_95_64});
-   black b_121_64 (G_121_64, P_121_64, {G_121_96,G_95_64}, {P_121_96,P_95_64});
-   black b_123_64 (G_123_64, P_123_64, {G_123_96,G_95_64}, {P_123_96,P_95_64});
-   black b_125_64 (G_125_64, P_125_64, {G_125_96,G_95_64}, {P_125_96,P_95_64});
-   black b_127_64 (G_127_64, P_127_64, {G_127_96,G_95_64}, {P_127_96,P_95_64});
-
-   // Stage 7: extend the upper 64-bit half down to bit 0 using G_63_0
-   // (G_65_0 .. G_127_0); only generates are needed, so every cell is grey
-   grey g_65_0 (G_65_0, {G_65_64,G_63_0}, P_65_64);
-   grey g_67_0 (G_67_0, {G_67_64,G_63_0}, P_67_64);
-   grey g_69_0 (G_69_0, {G_69_64,G_63_0}, P_69_64);
-   grey g_71_0 (G_71_0, {G_71_64,G_63_0}, P_71_64);
-   grey g_73_0 (G_73_0, {G_73_64,G_63_0}, P_73_64);
-   grey g_75_0 (G_75_0, {G_75_64,G_63_0}, P_75_64);
-   grey g_77_0 (G_77_0, {G_77_64,G_63_0}, P_77_64);
-   grey g_79_0 (G_79_0, {G_79_64,G_63_0}, P_79_64);
-
-   grey g_81_0 (G_81_0, {G_81_64,G_63_0}, P_81_64);
-   grey g_83_0 (G_83_0, {G_83_64,G_63_0}, P_83_64);
-   grey g_85_0 (G_85_0, {G_85_64,G_63_0}, P_85_64);
-   grey g_87_0 (G_87_0, {G_87_64,G_63_0}, P_87_64);
-   grey g_89_0 (G_89_0, {G_89_64,G_63_0}, P_89_64);
-   grey g_91_0 (G_91_0, {G_91_64,G_63_0}, P_91_64);
-   grey g_93_0 (G_93_0, {G_93_64,G_63_0}, P_93_64);
-   grey g_95_0 (G_95_0, {G_95_64,G_63_0}, P_95_64);
-
-   grey g_97_0 (G_97_0, {G_97_64,G_63_0}, P_97_64);
-   grey g_99_0 (G_99_0, {G_99_64,G_63_0}, P_99_64);
-   grey g_101_0 (G_101_0, {G_101_64,G_63_0}, P_101_64);
-   grey g_103_0 (G_103_0, {G_103_64,G_63_0}, P_103_64);
-   grey g_105_0 (G_105_0, {G_105_64,G_63_0}, P_105_64);
-   grey g_107_0 (G_107_0, {G_107_64,G_63_0}, P_107_64);
-   grey g_109_0 (G_109_0, {G_109_64,G_63_0}, P_109_64);
-   grey g_111_0 (G_111_0, {G_111_64,G_63_0}, P_111_64);
-
-   grey g_113_0 (G_113_0, {G_113_64,G_63_0}, P_113_64);
-   grey g_115_0 (G_115_0, {G_115_64,G_63_0}, P_115_64);
-   grey g_117_0 (G_117_0, {G_117_64,G_63_0}, P_117_64);
-   grey g_119_0 (G_119_0, {G_119_64,G_63_0}, P_119_64);
-   grey g_121_0 (G_121_0, {G_121_64,G_63_0}, P_121_64);
-   grey g_123_0 (G_123_0, {G_123_64,G_63_0}, P_123_64);
-   grey g_125_0 (G_125_0, {G_125_64,G_63_0}, P_125_64);
-   grey g_127_0 (G_127_0, {G_127_64,G_63_0}, P_127_64);
-
-   // Extra grey cell stage: the stages above only produce odd-indexed prefixes
-   // G_(2k+1)_0; each even-indexed prefix G_2k_0 is formed here from bit 2k and
-   // the odd prefix directly below it
-   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
-   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
-   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
-   grey g_14_0 (G_14_0, {g[14],G_13_0}, p[14]);
-   grey g_16_0 (G_16_0, {g[16],G_15_0}, p[16]);
-   grey g_18_0 (G_18_0, {g[18],G_17_0}, p[18]);
-   grey g_20_0 (G_20_0, {g[20],G_19_0}, p[20]);
-   grey g_22_0 (G_22_0, {g[22],G_21_0}, p[22]);
-   grey g_24_0 (G_24_0, {g[24],G_23_0}, p[24]);
-   grey g_26_0 (G_26_0, {g[26],G_25_0}, p[26]);
-   grey g_28_0 (G_28_0, {g[28],G_27_0}, p[28]);
-   grey g_30_0 (G_30_0, {g[30],G_29_0}, p[30]);
-   grey g_32_0 (G_32_0, {g[32],G_31_0}, p[32]);
-   grey g_34_0 (G_34_0, {g[34],G_33_0}, p[34]);
-   grey g_36_0 (G_36_0, {g[36],G_35_0}, p[36]);
-   grey g_38_0 (G_38_0, {g[38],G_37_0}, p[38]);
-   grey g_40_0 (G_40_0, {g[40],G_39_0}, p[40]);
-   grey g_42_0 (G_42_0, {g[42],G_41_0}, p[42]);
-   grey g_44_0 (G_44_0, {g[44],G_43_0}, p[44]);
-   grey g_46_0 (G_46_0, {g[46],G_45_0}, p[46]);
-   grey g_48_0 (G_48_0, {g[48],G_47_0}, p[48]);
-   grey g_50_0 (G_50_0, {g[50],G_49_0}, p[50]);
-   grey g_52_0 (G_52_0, {g[52],G_51_0}, p[52]);
-   grey g_54_0 (G_54_0, {g[54],G_53_0}, p[54]);
-   grey g_56_0 (G_56_0, {g[56],G_55_0}, p[56]);
-   grey g_58_0 (G_58_0, {g[58],G_57_0}, p[58]);
-   grey g_60_0 (G_60_0, {g[60],G_59_0}, p[60]);
-   grey g_62_0 (G_62_0, {g[62],G_61_0}, p[62]);
-   grey g_64_0 (G_64_0, {g[64],G_63_0}, p[64]);
-   grey g_66_0 (G_66_0, {g[66],G_65_0}, p[66]);
-   grey g_68_0 (G_68_0, {g[68],G_67_0}, p[68]);
-   grey g_70_0 (G_70_0, {g[70],G_69_0}, p[70]);
-   grey g_72_0 (G_72_0, {g[72],G_71_0}, p[72]);
-   grey g_74_0 (G_74_0, {g[74],G_73_0}, p[74]);
-   grey g_76_0 (G_76_0, {g[76],G_75_0}, p[76]);
-   grey g_78_0 (G_78_0, {g[78],G_77_0}, p[78]);
-   grey g_80_0 (G_80_0, {g[80],G_79_0}, p[80]);
-   grey g_82_0 (G_82_0, {g[82],G_81_0}, p[82]);
-   grey g_84_0 (G_84_0, {g[84],G_83_0}, p[84]);
-   grey g_86_0 (G_86_0, {g[86],G_85_0}, p[86]);
-   grey g_88_0 (G_88_0, {g[88],G_87_0}, p[88]);
-   grey g_90_0 (G_90_0, {g[90],G_89_0}, p[90]);
-   grey g_92_0 (G_92_0, {g[92],G_91_0}, p[92]);
-   grey g_94_0 (G_94_0, {g[94],G_93_0}, p[94]);
-   grey g_96_0 (G_96_0, {g[96],G_95_0}, p[96]);
-   grey g_98_0 (G_98_0, {g[98],G_97_0}, p[98]);
-   grey g_100_0 (G_100_0, {g[100],G_99_0}, p[100]);
-   grey g_102_0 (G_102_0, {g[102],G_101_0}, p[102]);
-   grey g_104_0 (G_104_0, {g[104],G_103_0}, p[104]);
-   grey g_106_0 (G_106_0, {g[106],G_105_0}, p[106]);
-   grey g_108_0 (G_108_0, {g[108],G_107_0}, p[108]);
-   grey g_110_0 (G_110_0, {g[110],G_109_0}, p[110]);
-   grey g_112_0 (G_112_0, {g[112],G_111_0}, p[112]);
-   grey g_114_0 (G_114_0, {g[114],G_113_0}, p[114]);
-   grey g_116_0 (G_116_0, {g[116],G_115_0}, p[116]);
-   grey g_118_0 (G_118_0, {g[118],G_117_0}, p[118]);
-   grey g_120_0 (G_120_0, {g[120],G_119_0}, p[120]);
-   grey g_122_0 (G_122_0, {g[122],G_121_0}, p[122]);
-   grey g_124_0 (G_124_0, {g[124],G_123_0}, p[124]);
-   grey g_126_0 (G_126_0, {g[126],G_125_0}, p[126]);
-
-   // Final Stage: Apply c_k+1=G_k_0 (c[1] is simply g[0], no cell required)
-   assign c[1]=g[0];
-   assign c[2]=G_1_0;
-   assign c[3]=G_2_0;
-   assign c[4]=G_3_0;
-   assign c[5]=G_4_0;
-   assign c[6]=G_5_0;
-   assign c[7]=G_6_0;
-   assign c[8]=G_7_0;
-   assign c[9]=G_8_0;
-
-   assign c[10]=G_9_0;
-   assign c[11]=G_10_0;
-   assign c[12]=G_11_0;
-   assign c[13]=G_12_0;
-   assign c[14]=G_13_0;
-   assign c[15]=G_14_0;
-   assign c[16]=G_15_0;
-   assign c[17]=G_16_0;
-
-   assign c[18]=G_17_0;
-   assign c[19]=G_18_0;
-   assign c[20]=G_19_0;
-   assign c[21]=G_20_0;
-   assign c[22]=G_21_0;
-   assign c[23]=G_22_0;
-   assign c[24]=G_23_0;
-   assign c[25]=G_24_0;
-
-   assign c[26]=G_25_0;
-   assign c[27]=G_26_0;
-   assign c[28]=G_27_0;
-   assign c[29]=G_28_0;
-   assign c[30]=G_29_0;
-   assign c[31]=G_30_0;
-   assign c[32]=G_31_0;
-   assign c[33]=G_32_0;
-
-   assign c[34]=G_33_0;
-   assign c[35]=G_34_0;
-   assign c[36]=G_35_0;
-   assign c[37]=G_36_0;
-   assign c[38]=G_37_0;
-   assign c[39]=G_38_0;
-   assign c[40]=G_39_0;
-   assign c[41]=G_40_0;
-
-   assign c[42]=G_41_0;
-   assign c[43]=G_42_0;
-   assign c[44]=G_43_0;
-   assign c[45]=G_44_0;
-   assign c[46]=G_45_0;
-   assign c[47]=G_46_0;
-   assign c[48]=G_47_0;
-   assign c[49]=G_48_0;
-
-   assign c[50]=G_49_0;
-   assign c[51]=G_50_0;
-   assign c[52]=G_51_0;
-   assign c[53]=G_52_0;
-   assign c[54]=G_53_0;
-   assign c[55]=G_54_0;
-   assign c[56]=G_55_0;
-   assign c[57]=G_56_0;
-
-   assign c[58]=G_57_0;
-   assign c[59]=G_58_0;
-   assign c[60]=G_59_0;
-   assign c[61]=G_60_0;
-   assign c[62]=G_61_0;
-   assign c[63]=G_62_0;
-   assign c[64]=G_63_0;
-   assign c[65]=G_64_0;
-
-   assign c[66]=G_65_0;
-   assign c[67]=G_66_0;
-   assign c[68]=G_67_0;
-   assign c[69]=G_68_0;
-   assign c[70]=G_69_0;
-   assign c[71]=G_70_0;
-   assign c[72]=G_71_0;
-   assign c[73]=G_72_0;
-
-   assign c[74]=G_73_0;
-   assign c[75]=G_74_0;
-   assign c[76]=G_75_0;
-   assign c[77]=G_76_0;
-   assign c[78]=G_77_0;
-   assign c[79]=G_78_0;
-   assign c[80]=G_79_0;
-   assign c[81]=G_80_0;
-
-   assign c[82]=G_81_0;
-   assign c[83]=G_82_0;
-   assign c[84]=G_83_0;
-   assign c[85]=G_84_0;
-   assign c[86]=G_85_0;
-   assign c[87]=G_86_0;
-   assign c[88]=G_87_0;
-   assign c[89]=G_88_0;
-
-   assign c[90]=G_89_0;
-   assign c[91]=G_90_0;
-   assign c[92]=G_91_0;
-   assign c[93]=G_92_0;
-   assign c[94]=G_93_0;
-   assign c[95]=G_94_0;
-   assign c[96]=G_95_0;
-   assign c[97]=G_96_0;
-
-   assign c[98]=G_97_0;
-   assign c[99]=G_98_0;
-   assign c[100]=G_99_0;
-   assign c[101]=G_100_0;
-   assign c[102]=G_101_0;
-   assign c[103]=G_102_0;
-   assign c[104]=G_103_0;
-   assign c[105]=G_104_0;
-
-   assign c[106]=G_105_0;
-   assign c[107]=G_106_0;
-   assign c[108]=G_107_0;
-   assign c[109]=G_108_0;
-   assign c[110]=G_109_0;
-   assign c[111]=G_110_0;
-   assign c[112]=G_111_0;
-   assign c[113]=G_112_0;
-
-   assign c[114]=G_113_0;
-   assign c[115]=G_114_0;
-   assign c[116]=G_115_0;
-   assign c[117]=G_116_0;
-   assign c[118]=G_117_0;
-   assign c[119]=G_118_0;
-   assign c[120]=G_119_0;
-   assign c[121]=G_120_0;
-
-   assign c[122]=G_121_0;
-   assign c[123]=G_122_0;
-   assign c[124]=G_123_0;
-   assign c[125]=G_124_0;
-   assign c[126]=G_125_0;
-   assign c[127]=G_126_0;
-   assign c[128]=G_127_0;
-
-endmodule // ladner_fischer128
-
--- a/wally-pipelined/src/fpu/ldf64.sv
+++ b/wally-pipelined/src/fpu/ldf64.sv
@ -1,289 +0,0 @@
-// Ladner-Fischer Prefix Adder: 64-bit adder (a, b, cin -> sum, cout) built around a prefix carry tree
-
-module ldf64 (cout, sum, a, b, cin);
-   input [63:0] a, b;   // addends
-   input 	cin;   // carry-in; placed in g[0] below
-   output [63:0] sum;
-   output 	 cout;
-
-   wire [64:0] 	 p,g;   // bit 0 is the carry-in slot; bits 64:1 hold the per-bit terms of a/b
-   wire [63:0] 	 c;   // carries from the tree; connected to the tree's [64:1] output port
-
-   // pre-computation: per-bit propagate (a^b) and generate (a&b), shifted up one position so cin sits at g[0]
-   assign p={a^b,1'b0};
-   assign g={a&b, cin};
-
-   // prefix tree: only the low 64 p/g bits are passed in; bit 64 is folded in by cout below
-   ladner_fischer64 prefix_tree(c, p[63:0], g[63:0]);
-
-   // post-computation: each sum bit is propagate XOR carry; cout combines the top g/p bit with c[63]
-   assign sum=p[64:1]^c;
-   assign cout=g[64]|(p[64]&c[63]);
-
-endmodule // ldf64
-
-module ladner_fischer64 (c, p, g);
-   
-   input [63:0] p;   // per-bit propagate inputs
-   input [63:0] g;   // per-bit generate inputs; c[1] is taken directly from g[0]
-   
-   output [64:1] c;   // c[k+1] = G_k_0 (see final stage)
-
-   logic G_1_0,G_3_2,P_3_2,G_5_4,P_5_4,G_7_6,P_7_6,G_9_8,P_9_8,G_11_10,P_11_10,G_13_12,P_13_12,G_15_14,P_15_14
-      ,G_17_16,P_17_16,G_19_18,P_19_18,G_21_20,P_21_20,G_23_22,P_23_22,G_25_24,P_25_24,G_27_26,P_27_26,G_29_28,P_29_28
-      ,G_31_30,P_31_30,G_33_32,P_33_32,G_35_34,P_35_34,G_37_36,P_37_36,G_39_38,P_39_38,G_41_40,P_41_40,G_43_42,P_43_42
-      ,G_45_44,P_45_44,G_47_46,P_47_46,G_49_48,P_49_48,G_51_50,P_51_50,G_53_52,P_53_52,G_55_54,P_55_54,G_57_56,P_57_56
-      ,G_59_58,P_59_58,G_61_60,P_61_60,G_63_62,P_63_62,G_3_0,G_7_4,P_7_4,G_11_8,P_11_8,G_15_12,P_15_12,G_19_16,P_19_16
-      ,G_23_20,P_23_20,G_27_24,P_27_24,G_31_28,P_31_28,G_35_32,P_35_32,G_39_36,P_39_36,G_43_40,P_43_40,G_47_44,P_47_44
-      ,G_51_48,P_51_48,G_55_52,P_55_52,G_59_56,P_59_56,G_63_60,P_63_60,G_5_0,G_7_0,G_13_8,P_13_8,G_15_8,P_15_8,G_21_16
-      ,P_21_16,G_23_16,P_23_16,G_29_24,P_29_24,G_31_24,P_31_24,G_37_32,P_37_32,G_39_32,P_39_32,G_45_40,P_45_40,G_47_40
-      ,P_47_40,G_53_48,P_53_48,G_55_48,P_55_48,G_61_56,P_61_56,G_63_56,P_63_56,G_9_0,G_11_0,G_13_0,G_15_0,G_25_16
-      ,P_25_16,G_27_16,P_27_16,G_29_16,P_29_16,G_31_16,P_31_16,G_41_32,P_41_32,G_43_32,P_43_32,G_45_32,P_45_32,G_47_32
-      ,P_47_32,G_57_48,P_57_48,G_59_48,P_59_48,G_61_48,P_61_48,G_63_48,P_63_48,G_17_0,G_19_0,G_21_0,G_23_0,G_25_0,G_27_0
-      ,G_29_0,G_31_0,G_49_32,P_49_32,G_51_32,P_51_32,G_53_32,P_53_32,G_55_32,P_55_32,G_57_32,P_57_32,G_59_32,P_59_32
-      ,G_61_32,P_61_32,G_63_32,P_63_32,G_33_0,G_35_0,G_37_0,G_39_0,G_41_0,G_43_0,G_45_0,G_47_0,G_49_0,G_51_0,G_53_0
-      ,G_55_0,G_57_0,G_59_0,G_61_0,G_63_0,G_2_0,G_4_0,G_6_0,G_8_0,G_10_0,G_12_0,G_14_0,G_16_0,G_18_0,G_20_0,G_22_0
-      ,G_24_0,G_26_0,G_28_0,G_30_0,G_32_0,G_34_0,G_36_0,G_38_0,G_40_0,G_42_0,G_44_0,G_46_0,G_48_0,G_50_0,G_52_0
-      ,G_54_0,G_56_0,G_58_0,G_60_0,G_62_0;
-   // parallel-prefix, Ladner-Fischer. By naming, G_i_j / P_i_j are the group generate / propagate over bits i..j. grey/black cells are defined elsewhere; presumably grey = black without the P output (TODO confirm)
-
-   // Stage 1: combines adjacent single-bit g/p inputs into 2-bit groups (the group reaching bit 0 uses a grey cell)
-   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
-   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
-   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
-   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
-   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
-   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
-   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
-   black b_15_14 (G_15_14, P_15_14, {g[15],g[14]}, {p[15],p[14]});
-
-   black b_17_16 (G_17_16, P_17_16, {g[17],g[16]}, {p[17],p[16]});
-   black b_19_18 (G_19_18, P_19_18, {g[19],g[18]}, {p[19],p[18]});
-   black b_21_20 (G_21_20, P_21_20, {g[21],g[20]}, {p[21],p[20]});
-   black b_23_22 (G_23_22, P_23_22, {g[23],g[22]}, {p[23],p[22]});
-   black b_25_24 (G_25_24, P_25_24, {g[25],g[24]}, {p[25],p[24]});
-   black b_27_26 (G_27_26, P_27_26, {g[27],g[26]}, {p[27],p[26]});
-   black b_29_28 (G_29_28, P_29_28, {g[29],g[28]}, {p[29],p[28]});
-   black b_31_30 (G_31_30, P_31_30, {g[31],g[30]}, {p[31],p[30]});
-
-   black b_33_32 (G_33_32, P_33_32, {g[33],g[32]}, {p[33],p[32]});
-   black b_35_34 (G_35_34, P_35_34, {g[35],g[34]}, {p[35],p[34]});
-   black b_37_36 (G_37_36, P_37_36, {g[37],g[36]}, {p[37],p[36]});
-   black b_39_38 (G_39_38, P_39_38, {g[39],g[38]}, {p[39],p[38]});
-   black b_41_40 (G_41_40, P_41_40, {g[41],g[40]}, {p[41],p[40]});
-   black b_43_42 (G_43_42, P_43_42, {g[43],g[42]}, {p[43],p[42]});
-   black b_45_44 (G_45_44, P_45_44, {g[45],g[44]}, {p[45],p[44]});
-   black b_47_46 (G_47_46, P_47_46, {g[47],g[46]}, {p[47],p[46]});
-
-   black b_49_48 (G_49_48, P_49_48, {g[49],g[48]}, {p[49],p[48]});
-   black b_51_50 (G_51_50, P_51_50, {g[51],g[50]}, {p[51],p[50]});
-   black b_53_52 (G_53_52, P_53_52, {g[53],g[52]}, {p[53],p[52]});
-   black b_55_54 (G_55_54, P_55_54, {g[55],g[54]}, {p[55],p[54]});
-   black b_57_56 (G_57_56, P_57_56, {g[57],g[56]}, {p[57],p[56]});
-   black b_59_58 (G_59_58, P_59_58, {g[59],g[58]}, {p[59],p[58]});
-   black b_61_60 (G_61_60, P_61_60, {g[61],g[60]}, {p[61],p[60]});
-   black b_63_62 (G_63_62, P_63_62, {g[63],g[62]}, {p[63],p[62]});
-
-   // Stage 2: combines neighbouring 2-bit groups into 4-bit groups
-   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
-   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
-   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
-   black b_15_12 (G_15_12, P_15_12, {G_15_14,G_13_12}, {P_15_14,P_13_12});
-   black b_19_16 (G_19_16, P_19_16, {G_19_18,G_17_16}, {P_19_18,P_17_16});
-   black b_23_20 (G_23_20, P_23_20, {G_23_22,G_21_20}, {P_23_22,P_21_20});
-   black b_27_24 (G_27_24, P_27_24, {G_27_26,G_25_24}, {P_27_26,P_25_24});
-   black b_31_28 (G_31_28, P_31_28, {G_31_30,G_29_28}, {P_31_30,P_29_28});
-
-   black b_35_32 (G_35_32, P_35_32, {G_35_34,G_33_32}, {P_35_34,P_33_32});
-   black b_39_36 (G_39_36, P_39_36, {G_39_38,G_37_36}, {P_39_38,P_37_36});
-   black b_43_40 (G_43_40, P_43_40, {G_43_42,G_41_40}, {P_43_42,P_41_40});
-   black b_47_44 (G_47_44, P_47_44, {G_47_46,G_45_44}, {P_47_46,P_45_44});
-   black b_51_48 (G_51_48, P_51_48, {G_51_50,G_49_48}, {P_51_50,P_49_48});
-   black b_55_52 (G_55_52, P_55_52, {G_55_54,G_53_52}, {P_55_54,P_53_52});
-   black b_59_56 (G_59_56, P_59_56, {G_59_58,G_57_56}, {P_59_58,P_57_56});
-   black b_63_60 (G_63_60, P_63_60, {G_63_62,G_61_60}, {P_63_62,P_61_60});
-
-   // Stage 3: the lower operand of every cell is a 4-bit group (two cells share each one)
-   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
-   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
-   black b_13_8 (G_13_8, P_13_8, {G_13_12,G_11_8}, {P_13_12,P_11_8});
-   black b_15_8 (G_15_8, P_15_8, {G_15_12,G_11_8}, {P_15_12,P_11_8});
-   black b_21_16 (G_21_16, P_21_16, {G_21_20,G_19_16}, {P_21_20,P_19_16});
-   black b_23_16 (G_23_16, P_23_16, {G_23_20,G_19_16}, {P_23_20,P_19_16});
-   black b_29_24 (G_29_24, P_29_24, {G_29_28,G_27_24}, {P_29_28,P_27_24});
-   black b_31_24 (G_31_24, P_31_24, {G_31_28,G_27_24}, {P_31_28,P_27_24});
-
-   black b_37_32 (G_37_32, P_37_32, {G_37_36,G_35_32}, {P_37_36,P_35_32});
-   black b_39_32 (G_39_32, P_39_32, {G_39_36,G_35_32}, {P_39_36,P_35_32});
-   black b_45_40 (G_45_40, P_45_40, {G_45_44,G_43_40}, {P_45_44,P_43_40});
-   black b_47_40 (G_47_40, P_47_40, {G_47_44,G_43_40}, {P_47_44,P_43_40});
-   black b_53_48 (G_53_48, P_53_48, {G_53_52,G_51_48}, {P_53_52,P_51_48});
-   black b_55_48 (G_55_48, P_55_48, {G_55_52,G_51_48}, {P_55_52,P_51_48});
-   black b_61_56 (G_61_56, P_61_56, {G_61_60,G_59_56}, {P_61_60,P_59_56});
-   black b_63_56 (G_63_56, P_63_56, {G_63_60,G_59_56}, {P_63_60,P_59_56});
-
-   // Stage 4: the lower operand of every cell is an 8-bit group (four cells share each one)
-   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
-   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
-   grey g_13_0 (G_13_0, {G_13_8,G_7_0}, P_13_8);
-   grey g_15_0 (G_15_0, {G_15_8,G_7_0}, P_15_8);
-   black b_25_16 (G_25_16, P_25_16, {G_25_24,G_23_16}, {P_25_24,P_23_16});
-   black b_27_16 (G_27_16, P_27_16, {G_27_24,G_23_16}, {P_27_24,P_23_16});
-   black b_29_16 (G_29_16, P_29_16, {G_29_24,G_23_16}, {P_29_24,P_23_16});
-   black b_31_16 (G_31_16, P_31_16, {G_31_24,G_23_16}, {P_31_24,P_23_16});
-
-   black b_41_32 (G_41_32, P_41_32, {G_41_40,G_39_32}, {P_41_40,P_39_32});
-   black b_43_32 (G_43_32, P_43_32, {G_43_40,G_39_32}, {P_43_40,P_39_32});
-   black b_45_32 (G_45_32, P_45_32, {G_45_40,G_39_32}, {P_45_40,P_39_32});
-   black b_47_32 (G_47_32, P_47_32, {G_47_40,G_39_32}, {P_47_40,P_39_32});
-   black b_57_48 (G_57_48, P_57_48, {G_57_56,G_55_48}, {P_57_56,P_55_48});
-   black b_59_48 (G_59_48, P_59_48, {G_59_56,G_55_48}, {P_59_56,P_55_48});
-   black b_61_48 (G_61_48, P_61_48, {G_61_56,G_55_48}, {P_61_56,P_55_48});
-   black b_63_48 (G_63_48, P_63_48, {G_63_56,G_55_48}, {P_63_56,P_55_48});
-
-   // Stage 5: the lower operand of every cell is a 16-bit group (eight cells share each one)
-   grey g_17_0 (G_17_0, {G_17_16,G_15_0}, P_17_16);
-   grey g_19_0 (G_19_0, {G_19_16,G_15_0}, P_19_16);
-   grey g_21_0 (G_21_0, {G_21_16,G_15_0}, P_21_16);
-   grey g_23_0 (G_23_0, {G_23_16,G_15_0}, P_23_16);
-   grey g_25_0 (G_25_0, {G_25_16,G_15_0}, P_25_16);
-   grey g_27_0 (G_27_0, {G_27_16,G_15_0}, P_27_16);
-   grey g_29_0 (G_29_0, {G_29_16,G_15_0}, P_29_16);
-   grey g_31_0 (G_31_0, {G_31_16,G_15_0}, P_31_16);
-
-   black b_49_32 (G_49_32, P_49_32, {G_49_48,G_47_32}, {P_49_48,P_47_32});
-   black b_51_32 (G_51_32, P_51_32, {G_51_48,G_47_32}, {P_51_48,P_47_32});
-   black b_53_32 (G_53_32, P_53_32, {G_53_48,G_47_32}, {P_53_48,P_47_32});
-   black b_55_32 (G_55_32, P_55_32, {G_55_48,G_47_32}, {P_55_48,P_47_32});
-   black b_57_32 (G_57_32, P_57_32, {G_57_48,G_47_32}, {P_57_48,P_47_32});
-   black b_59_32 (G_59_32, P_59_32, {G_59_48,G_47_32}, {P_59_48,P_47_32});
-   black b_61_32 (G_61_32, P_61_32, {G_61_48,G_47_32}, {P_61_48,P_47_32});
-   black b_63_32 (G_63_32, P_63_32, {G_63_48,G_47_32}, {P_63_48,P_47_32});
-
-   // Stage 6: every cell takes G_31_0 (the 32-bit group) as its lower operand, producing the odd prefixes G_33_0..G_63_0
-   grey g_33_0 (G_33_0, {G_33_32,G_31_0}, P_33_32);
-   grey g_35_0 (G_35_0, {G_35_32,G_31_0}, P_35_32);
-   grey g_37_0 (G_37_0, {G_37_32,G_31_0}, P_37_32);
-   grey g_39_0 (G_39_0, {G_39_32,G_31_0}, P_39_32);
-   grey g_41_0 (G_41_0, {G_41_32,G_31_0}, P_41_32);
-   grey g_43_0 (G_43_0, {G_43_32,G_31_0}, P_43_32);
-   grey g_45_0 (G_45_0, {G_45_32,G_31_0}, P_45_32);
-   grey g_47_0 (G_47_0, {G_47_32,G_31_0}, P_47_32);
-
-   grey g_49_0 (G_49_0, {G_49_32,G_31_0}, P_49_32);
-   grey g_51_0 (G_51_0, {G_51_32,G_31_0}, P_51_32);
-   grey g_53_0 (G_53_0, {G_53_32,G_31_0}, P_53_32);
-   grey g_55_0 (G_55_0, {G_55_32,G_31_0}, P_55_32);
-   grey g_57_0 (G_57_0, {G_57_32,G_31_0}, P_57_32);
-   grey g_59_0 (G_59_0, {G_59_32,G_31_0}, P_59_32);
-   grey g_61_0 (G_61_0, {G_61_32,G_31_0}, P_61_32);
-   grey g_63_0 (G_63_0, {G_63_32,G_31_0}, P_63_32);
-
-   // Extra grey cell stage: fills in the even prefixes G_2k_0 from the raw g/p of bit 2k and the odd prefix just below it
-   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
-   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
-   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
-   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
-   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
-   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
-   grey g_14_0 (G_14_0, {g[14],G_13_0}, p[14]);
-   grey g_16_0 (G_16_0, {g[16],G_15_0}, p[16]);
-   grey g_18_0 (G_18_0, {g[18],G_17_0}, p[18]);
-   grey g_20_0 (G_20_0, {g[20],G_19_0}, p[20]);
-   grey g_22_0 (G_22_0, {g[22],G_21_0}, p[22]);
-   grey g_24_0 (G_24_0, {g[24],G_23_0}, p[24]);
-   grey g_26_0 (G_26_0, {g[26],G_25_0}, p[26]);
-   grey g_28_0 (G_28_0, {g[28],G_27_0}, p[28]);
-   grey g_30_0 (G_30_0, {g[30],G_29_0}, p[30]);
-   grey g_32_0 (G_32_0, {g[32],G_31_0}, p[32]);
-   grey g_34_0 (G_34_0, {g[34],G_33_0}, p[34]);
-   grey g_36_0 (G_36_0, {g[36],G_35_0}, p[36]);
-   grey g_38_0 (G_38_0, {g[38],G_37_0}, p[38]);
-   grey g_40_0 (G_40_0, {g[40],G_39_0}, p[40]);
-   grey g_42_0 (G_42_0, {g[42],G_41_0}, p[42]);
-   grey g_44_0 (G_44_0, {g[44],G_43_0}, p[44]);
-   grey g_46_0 (G_46_0, {g[46],G_45_0}, p[46]);
-   grey g_48_0 (G_48_0, {g[48],G_47_0}, p[48]);
-   grey g_50_0 (G_50_0, {g[50],G_49_0}, p[50]);
-   grey g_52_0 (G_52_0, {g[52],G_51_0}, p[52]);
-   grey g_54_0 (G_54_0, {g[54],G_53_0}, p[54]);
-   grey g_56_0 (G_56_0, {g[56],G_55_0}, p[56]);
-   grey g_58_0 (G_58_0, {g[58],G_57_0}, p[58]);
-   grey g_60_0 (G_60_0, {g[60],G_59_0}, p[60]);
-   grey g_62_0 (G_62_0, {g[62],G_61_0}, p[62]);
-
-   // Final Stage: Apply c[k+1]=G_k_0 for k=1..63; c[1] comes straight from g[0]
-   assign c[1]=g[0];
-   assign c[2]=G_1_0;
-   assign c[3]=G_2_0;
-   assign c[4]=G_3_0;
-   assign c[5]=G_4_0;
-   assign c[6]=G_5_0;
-   assign c[7]=G_6_0;
-   assign c[8]=G_7_0;
-   assign c[9]=G_8_0;
-
-   assign c[10]=G_9_0;
-   assign c[11]=G_10_0;
-   assign c[12]=G_11_0;
-   assign c[13]=G_12_0;
-   assign c[14]=G_13_0;
-   assign c[15]=G_14_0;
-   assign c[16]=G_15_0;
-   assign c[17]=G_16_0;
-
-   assign c[18]=G_17_0;
-   assign c[19]=G_18_0;
-   assign c[20]=G_19_0;
-   assign c[21]=G_20_0;
-   assign c[22]=G_21_0;
-   assign c[23]=G_22_0;
-   assign c[24]=G_23_0;
-   assign c[25]=G_24_0;
-
-   assign c[26]=G_25_0;
-   assign c[27]=G_26_0;
-   assign c[28]=G_27_0;
-   assign c[29]=G_28_0;
-   assign c[30]=G_29_0;
-   assign c[31]=G_30_0;
-   assign c[32]=G_31_0;
-   assign c[33]=G_32_0;
-
-   assign c[34]=G_33_0;
-   assign c[35]=G_34_0;
-   assign c[36]=G_35_0;
-   assign c[37]=G_36_0;
-   assign c[38]=G_37_0;
-   assign c[39]=G_38_0;
-   assign c[40]=G_39_0;
-   assign c[41]=G_40_0;
-
-   assign c[42]=G_41_0;
-   assign c[43]=G_42_0;
-   assign c[44]=G_43_0;
-   assign c[45]=G_44_0;
-   assign c[46]=G_45_0;
-   assign c[47]=G_46_0;
-   assign c[48]=G_47_0;
-   assign c[49]=G_48_0;
-
-   assign c[50]=G_49_0;
-   assign c[51]=G_50_0;
-   assign c[52]=G_51_0;
-   assign c[53]=G_52_0;
-   assign c[54]=G_53_0;
-   assign c[55]=G_54_0;
-   assign c[56]=G_55_0;
-   assign c[57]=G_56_0;
-
-   assign c[58]=G_57_0;
-   assign c[59]=G_58_0;
-   assign c[60]=G_59_0;
-   assign c[61]=G_60_0;
-   assign c[62]=G_61_0;
-   assign c[63]=G_62_0;
-   assign c[64]=G_63_0;
-
-endmodule // ladner_fischer64
-
--- a/wally-pipelined/src/fpu/lzd_denorm.sv
+++ b/wally-pipelined/src/fpu/lzd_denorm.sv
@ -2,7 +2,7 @@

 //    input B0;
 //    input B1;
-
+ 
 //    output P;
 //    output V;

--- a/wally-pipelined/src/fpu/mult_R4_64_64_cs.sv
+++ b/wally-pipelined/src/fpu/mult_R4_64_64_cs.sv
--- a/wally-pipelined/src/fpu/mult_R4_64_64_cs.v
+++ b/wally-pipelined/src/fpu/mult_R4_64_64_cs.v
--- a/wally-pipelined/src/fpu/rounder_denorm.sv
+++ b/wally-pipelined/src/fpu/rounder_denorm.sv
@ -16,7 +16,7 @@
 //    xxxxxL,Rxxxxxxx
 // where , denotes the rounding boundary. S is the logical OR of all the
 // bits to the right of R. 
-
+ 
 module rounder (Result, DenormIO, Flags, rm, P, OvEn, 
 		UnEn, exp_valid, sel_inv, Invalid, DenormIn, convert, Asign, Aexp, 
 		norm_shift, A, exponent_postsum, A_Norm, B_Norm, exp_A_unmodified, exp_B_unmodified,
--- a/wally-pipelined/src/fpu/rounder_div.sv
+++ b/wally-pipelined/src/fpu/rounder_div.sv
@ -5,48 +5,41 @@
 // It produces a rounded 52-bit result, Z, the exponent of the rounded 
 // result, Z_exp, and a flag that indicates if the result was rounded,
 // Inexact. The rounding mode has the following values.
-//	rm		Modee
+//	    rm		Mode
 //      00 		round-to-nearest-even
-//	01 		round-toward-zero
+//	    01 		round-toward-zero
 //      10 		round-toward-plus infinity
-//      11  		round-toward-minus infinity
+//      11  	round-toward-minus infinity
 //

-module rounder_div (Result, DenormIO, Flags, rm, P, OvEn, 
-		    UnEn, exp_diff, sel_inv, Invalid, DenormIn, 
-		    SignR, q1, qm1, qp1, q0, qm0, qp0, regr_out);
-
-   input  [1:0]   rm;
-   input          P;
-   input          OvEn;
-   input          UnEn;
-   input [12:0]   exp_diff;
-   input [2:0] 	  sel_inv;
-   input	  Invalid;
-   input	  DenormIn;
-   input 	  SignR;
+module rounder_div (
+    input logic [1:0]   rm,
+    input logic         P,
+    input logic         OvEn,
+    input logic         UnEn,
+    input logic [12:0]  exp_diff,
+    input logic [2:0]   sel_inv,
+    input logic         Invalid,
+    input logic 	    SignR,
   
-   input logic [63:0]  q1;
-   input logic [63:0]  qm1;
-   input logic [63:0]  qp1;
-   input logic [63:0]  q0;
-   input logic [63:0]  qm0;
-   input logic [63:0]  qp0;   
-   input logic [127:0] regr_out;
-   
-   output logic [63:0] Result;
-   output logic        DenormIO;
-   output logic [4:0]  Flags;
-   
-   supply1 	       vdd;
-   supply0 	       vss;
+    input logic [63:0]  q1,
+    input logic [63:0]  qm1,
+    input logic [63:0]  qp1,
+    input logic [63:0]  q0,
+    input logic [63:0]  qm0,
+    input logic [63:0]  qp0,   
+    input logic [127:0] regr_out,
   
+    output logic [63:0] Result,
+    output logic [4:0]  Flags
+    );
+      
   logic 	       Rsign;
-   logic [10:0]        Rexp;
-   logic [12:0]        Texp;
-   logic [51:0]        Rmant;
-   logic [63:0]        Tmant;
-   logic [51:0]        Smant;   
+   logic [10:0]    Rexp;
+   logic [12:0]    Texp;
+   logic [51:0]    Rmant;
+   logic [63:0]    Tmant;
+   logic [51:0]    Smant;   
   logic 	       Rzero;
   logic 	       Gdp, Gsp, G;
   logic 	       UnFlow_SP, UnFlow_DP, UnderFlow; 
@ -64,10 +57,10 @@ module rounder_div (Result, DenormIO, Flags, rm, P, OvEn,
   logic 	       Texp_l7o;
   logic 	       OvCon;
   logic           zero_rem;
-   logic [1:0] 	       mux_mant;
+   logic [1:0] 	   mux_mant;
   logic 	       sign_rem;
-   logic [63:0]        q, qm, qp;
-   logic 	       exp_ovf, exp_ovfSP, exp_ovfDP;   
+   logic [63:0]    q, qm, qp;
+   logic 	       exp_ovf;   

   // Remainder = 0?
   assign zero_rem = ~(|regr_out);
@ -98,7 +91,7 @@ module rounder_div (Result, DenormIO, Flags, rm, P, OvEn,
   //   1.) we choose any qm0, qp0, q0 (since we shift mant)
   //   2.) we choose qp and we overflow (for RU)
   assign exp_ovf = |{qp[62:40], (qp[39:11] & {29{~P}})};
-   assign Texp = exp_diff - {{13{vss}}, ~q1[63]} + {{13{vss}}, mux_mant[1]&qp1[63]&~exp_ovf};
+   assign Texp = exp_diff - {{13{1'b0}}, ~q1[63]} + {{13{1'b0}}, mux_mant[1]&qp1[63]&~exp_ovf}; // NOTE(review): each {{13{1'b0}}, x} operand is 14 bits while exp_diff and Texp are [12:0], so the result is truncated on assignment; {12{1'b0}} would match the width - confirm intent
   
   // Overflow only occurs for double precision, if Texp[10] to Texp[0] are 
   // all ones. To encourage sharing with single precision overflow detection,
@ -130,9 +123,6 @@ module rounder_div (Result, DenormIO, Flags, rm, P, OvEn,
   assign OverFlow  = (P & OvFlow_SP | OvFlow_DP) & Valid;
   assign Div0 = sel_inv[2]&sel_inv[1]&~sel_inv[0];

-   // The DenormIO is set if underflow has occurred or if their was a
-   // denormalized input. 
-   assign DenormIO = DenormIn | UnderFlow;

   // The final result is Inexact if any rounding occurred ((i.e., R or S 
   // is one), or (if the result overflows ) or (if the result underflows and the 
--- a/wally-pipelined/src/fpu/sbtm_a0.sv
+++ b/wally-pipelined/src/fpu/sbtm_a0.sv
@ -1,5 +1,5 @@
 module sbtm_a0 (input  logic [6:0] a,
-		output logic [12:0] y);
+		            output logic [12:0] y);
   always_comb
     case(a)
       7'b0000000: y = 13'b1111111100010;
--- a/wally-pipelined/src/fpu/sbtm_a1.sv
+++ b/wally-pipelined/src/fpu/sbtm_a1.sv
@ -1,5 +1,5 @@
 module sbtm_a1 (input  logic [6:0] a,
-		output logic [4:0] y);
+		            output logic [4:0] y);
   always_comb
     case(a)
       7'b0000000: y = 5'b11100;
--- a/wally-pipelined/src/fpu/sbtm_a2.sv
+++ b/wally-pipelined/src/fpu/sbtm_a2.sv
@ -1,5 +1,5 @@
 module sbtm_a2 (input  logic [7:0] a,
-		output logic [13:0] y);
+		            output logic [13:0] y);
   always_comb
     case(a)
       8'b01000000: y = 14'b10110100010111;
--- a/wally-pipelined/src/fpu/sbtm_a3.sv
+++ b/wally-pipelined/src/fpu/sbtm_a3.sv
@ -1,5 +1,5 @@
 module sbtm_a3 (input  logic [7:0] a,
-		output logic [5:0] y);
+		            output logic [5:0] y);
   always_comb
     case(a)
       8'b01000000: y = 6'b100110;
--- a/wally-pipelined/src/fpu/sbtm_div.sv
+++ b/wally-pipelined/src/fpu/sbtm_div.sv
@ -7,12 +7,12 @@ module sbtm_div (input logic [11:0] a, output logic [10:0] ia_out);
   logic [2:0] x2_1cmp;   
   // mem outputs
   logic [12:0] y0;
-   logic [4:0] 	y1;
+   logic [4:0]  y1;
   // input to CPA
   logic [14:0] op1;
   logic [14:0] op2;
   logic [14:0] p;  
-   logic cout; 
+   logic        cout; 

   assign x0 = a[10:7];
   assign x1 = a[6:4];
@ -26,10 +26,8 @@ module sbtm_div (input logic [11:0] a, output logic [10:0] ia_out);
   // 1s cmp per sbtm/stam
   assign op2 = x2[3] ? {1'b1, {8{1'b1}}, ~y1, 1'b1} :
 		{1'b0, 8'b0, y1, 1'b1};
-   // CPA
-//    adder #(15) cp1 (op1, op2, 1'b0, p, cout);  
+   // CPA 
   assign {cout, p} = op1 + op2;
-   //assign ia_out = {p[14:4], {53{1'b0}}};
   assign ia_out = p[14:4];

 endmodule // sbtm
--- a/wally-pipelined/src/fpu/sbtm_sqrt.sv
+++ b/wally-pipelined/src/fpu/sbtm_sqrt.sv
@ -7,12 +7,12 @@ module sbtm_sqrt (input logic [11:0] a, output logic [10:0] y);
   logic [2:0] x2_1cmp;   
   // mem outputs
   logic [13:0] y0;
-   logic [5:0] 	y1;
+   logic [5:0]  y1;
   // input to CPA
   logic [14:0] op1;
   logic [14:0] op2;
   logic [14:0] p; 
-   logic cout;  
+   logic        cout;  

   assign x0 = a[11:7];
   assign x1 = a[6:4];
@ -29,7 +29,6 @@ module sbtm_sqrt (input logic [11:0] a, output logic [10:0] y);
 		{8'b0, y1, 1'b1};
   
   // CPA
-   //adder #(15) cp1 (op1, op2, 1'b0, p, cout);
   assign {cout, p} = op1 + op2; 
   assign y = p[14:4];

--- a/wally-pipelined/src/fpu/shifter_denorm.sv
+++ b/wally-pipelined/src/fpu/shifter_denorm.sv
@ -28,7 +28,7 @@ module mux21x64 (Z, A, B, Sel);
   assign Z = Sel ? B : A;
   
 endmodule // mux21x64
-
+ 
 // The implementation of the barrel shifter was modified to use 
 // fewer gates. It is now implemented using six 64-bit 2-to-1 muxes. The 
 // barrel shifter takes a 64-bit input A and shifts it left by up to 
--- a/wally-pipelined/src/fpu/unpacking.sv
+++ b/wally-pipelined/src/fpu/unpacking.sv
@ -1,4 +1,4 @@
-module unpacking (
+module unpacking ( 
    input logic  [63:0] X, Y, Z,
    input logic         FmtE,
    input logic  [2:0]  FOpCtrlE,
@ -25,9 +25,9 @@ module unpacking (
    assign YSgnE = FmtE ? Y[63] : Y[31];
    assign ZSgnE = FmtE ? Z[63] : Z[31];

-    assign XExpE = FmtE ? X[62:52] : {X[30], {3{~X[30] & XExpNonzero | XExpMaxE}}, X[29:23]}; 
-    assign YExpE = FmtE ? Y[62:52] : {Y[30], {3{~Y[30] & YExpNonzero | YExpMaxE}}, Y[29:23]}; 
-    assign ZExpE = FmtE ? Z[62:52] : {Z[30], {3{~Z[30] & ZExpNonzero | ZExpMaxE}}, Z[29:23]}; 
+    assign XExpE = FmtE ? X[62:52] : {3'b0, X[30:23]};//{X[30], {3{~X[30]&~XExpZero|XExpMaxE}}, X[29:23]}; 
+    assign YExpE = FmtE ? Y[62:52] : {3'b0, Y[30:23]};//{Y[30], {3{~Y[30]&~YExpZero|YExpMaxE}}, Y[29:23]}; 
+    assign ZExpE = FmtE ? Z[62:52] : {3'b0, Z[30:23]};//{Z[30], {3{~Z[30]&~ZExpZero|ZExpMaxE}}, Z[29:23]}; 
 /*    assign XExpE = FmtE ? X[62:52] : {3'b0, X[30:23]}; // *** maybe convert to full number of bits here?
    assign YExpE = FmtE ? Y[62:52] : {3'b0, Y[30:23]};
    assign ZExpE = FmtE ? Z[62:52] : {3'b0, Z[30:23]};*/
@ -78,7 +78,7 @@ module unpacking (
    assign YZeroE = YExpZero & YFracZero;
    assign ZZeroE = ZExpZero & ZFracZero;

-    //assign BiasE = FmtE ? 13'h3ff : 13'h7f; // *** is it better to convert to full precision exponents so bias isn't needed?
-    assign BiasE = 13'h3ff; // always use 1023 because exponents are unpacked to double precision
+    assign BiasE = FmtE ? 13'h3ff : 13'h7f; // *** is it better to convert to full precision exponents so bias isn't needed?
+    // assign BiasE = 13'h3ff; // always use 1023 because exponents are unpacked to double precision

 endmodule
--- a/wally-pipelined/src/mmu/hptw.sv
+++ b/wally-pipelined/src/mmu/hptw.sv
@ -50,6 +50,12 @@ module hptw
   output logic 	       WalkerInstrPageFaultF, WalkerLoadPageFaultM,WalkerStorePageFaultM // faults
 );

+      typedef enum  {L0_ADR, L0_RD, 
+				     L1_ADR, L1_RD, 
+				     L2_ADR, L2_RD, 
+				     L3_ADR, L3_RD, 
+				     LEAF, IDLE, FAULT} statetype; // *** placed outside generate statement to remove synthesis errors
+
  generate
    if (`MEM_VIRTMEM) begin
      logic			    DTLBWalk; // register TLBs translation miss requests
@ -66,12 +72,6 @@ module hptw
      logic [`SVMODE_BITS-1:0]	    SvMode;
      logic [`XLEN-1:0] 	    TranslationVAdr;
      
-
-      typedef enum  {LEVEL0_SET_ADR, LEVEL0_READ, LEVEL0,
-				     LEVEL1_SET_ADR, LEVEL1_READ, LEVEL1,
-				     LEVEL2_SET_ADR, LEVEL2_READ, LEVEL2,
-				     LEVEL3_SET_ADR, LEVEL3_READ, LEVEL3,
-				     LEAF, IDLE, FAULT} statetype;
      statetype WalkerState, NextWalkerState, InitialWalkerState;

 	  // Extract bits from CSRs and inputs
@ -99,7 +99,7 @@ module hptw
 	  
 	  // Enable and select signals based on states
      assign StartWalk = (WalkerState == IDLE) & TLBMiss;
-	  assign HPTWRead = (WalkerState == LEVEL3_READ) | (WalkerState == LEVEL2_READ) | (WalkerState == LEVEL1_READ) | (WalkerState == LEVEL0_READ);
+	  assign HPTWRead = (WalkerState == L3_RD) | (WalkerState == L2_RD) | (WalkerState == L1_RD) | (WalkerState == L0_RD);
 	  assign SelPTW = (WalkerState != IDLE) & (WalkerState != FAULT) & (WalkerState != LEAF);
 	  assign DTLBWriteM = (WalkerState == LEAF) & DTLBWalk;
 	  assign ITLBWriteF = (WalkerState == LEAF) & ~DTLBWalk;
@ -113,10 +113,10 @@ module hptw
 	  flopr #(2) PageTypeReg(clk, reset, NextPageType, PageType);
 	  always_comb 
 		case (WalkerState)
-			LEVEL3:  NextPageType = 2'b11; // terapage
-			LEVEL2:  NextPageType = 2'b10; // gigapage
-			LEVEL1:  NextPageType = 2'b01; // megapage
-			LEVEL0:  NextPageType = 2'b00; // kilopage
+			L3_RD:  NextPageType = 2'b11; // terapage
+			L2_RD:  NextPageType = 2'b10; // gigapage
+			L1_RD:  NextPageType = 2'b01; // megapage
+			L0_RD:  NextPageType = 2'b00; // kilopage
 			default: NextPageType = PageType;
 		endcase

@ -124,36 +124,36 @@ module hptw
 	  if (`XLEN==32) begin // RV32
 		logic [9:0] VPN;
 		logic [`PPN_BITS-1:0] PPN;
-		assign VPN = ((WalkerState == LEVEL1_SET_ADR) | (WalkerState == LEVEL1_READ)) ? TranslationVAdr[31:22] : TranslationVAdr[21:12]; // select VPN field based on HPTW state
-		assign PPN = ((WalkerState == LEVEL1_SET_ADR) | (WalkerState == LEVEL1_READ)) ? BasePageTablePPN : CurrentPPN; 
+		assign VPN = ((WalkerState == L1_ADR) | (WalkerState == L1_RD)) ? TranslationVAdr[31:22] : TranslationVAdr[21:12]; // select VPN field based on HPTW state
+		assign PPN = ((WalkerState == L1_ADR) | (WalkerState == L1_RD)) ? BasePageTablePPN : CurrentPPN; 
 		assign TranslationPAdr = {PPN, VPN, 2'b00}; 
 	  end else begin // RV64
 		logic [8:0] VPN;
 		logic [`PPN_BITS-1:0] PPN;
 		always_comb
 			case (WalkerState) // select VPN field based on HPTW state
-				LEVEL3_SET_ADR, LEVEL3_READ:  			VPN = TranslationVAdr[47:39];
-				LEVEL3, LEVEL2_SET_ADR, LEVEL2_READ:    VPN = TranslationVAdr[38:30];
-				LEVEL2, LEVEL1_SET_ADR, LEVEL1_READ: 	VPN = TranslationVAdr[29:21];
+				L3_ADR, L3_RD:  			VPN = TranslationVAdr[47:39];
+				L2_ADR, L2_RD:    VPN = TranslationVAdr[38:30];
+				L1_ADR, L1_RD: 	VPN = TranslationVAdr[29:21];
 				default:		 						VPN = TranslationVAdr[20:12];
 			endcase
-		assign PPN = ((WalkerState == LEVEL3_SET_ADR) | (WalkerState == LEVEL3_READ) | 
-		              (SvMode != `SV48 & ((WalkerState == LEVEL2_SET_ADR) | (WalkerState == LEVEL2_READ)))) ? BasePageTablePPN : CurrentPPN;
+		assign PPN = ((WalkerState == L3_ADR) | (WalkerState == L3_RD) | 
+		              (SvMode != `SV48 & ((WalkerState == L2_ADR) | (WalkerState == L2_RD)))) ? BasePageTablePPN : CurrentPPN;
 		assign TranslationPAdr = {PPN, VPN, 3'b000}; 
 	  end

 	  // Initial state and misalignment for RV32/64
 	  if (`XLEN == 32) begin
-		assign InitialWalkerState = LEVEL1_SET_ADR;
+		assign InitialWalkerState = L1_ADR;
 		assign MegapageMisaligned = |(CurrentPPN[9:0]); // must have zero PPN0
-		assign Misaligned = ((WalkerState == LEVEL1) & MegapageMisaligned);
+		assign Misaligned = ((WalkerState == L0_ADR) & MegapageMisaligned);
 	  end else begin
 		logic  GigapageMisaligned, TerapageMisaligned;
-		assign InitialWalkerState = (SvMode == `SV48) ? LEVEL3_SET_ADR : LEVEL2_SET_ADR;
+		assign InitialWalkerState = (SvMode == `SV48) ? L3_ADR : L2_ADR;
 		assign TerapageMisaligned = |(CurrentPPN[26:0]); // must have zero PPN2, PPN1, PPN0
 		assign GigapageMisaligned = |(CurrentPPN[17:0]); // must have zero PPN1 and PPN0
 		assign MegapageMisaligned = |(CurrentPPN[8:0]); // must have zero PPN0		  
-		assign Misaligned = ((WalkerState == LEVEL3) & TerapageMisaligned) | ((WalkerState == LEVEL2) & GigapageMisaligned) | ((WalkerState == LEVEL1) & MegapageMisaligned);
+		assign Misaligned = ((WalkerState == L2_ADR) & TerapageMisaligned) | ((WalkerState == L1_ADR) & GigapageMisaligned) | ((WalkerState == L0_ADR) & MegapageMisaligned);
 	  end

    // Page Table Walker FSM
@ -166,29 +166,37 @@ module hptw
 	  case (WalkerState)
 	    IDLE: if (TLBMiss)	 		NextWalkerState = InitialWalkerState;
 		      else 					NextWalkerState = IDLE;
-	    LEVEL3_SET_ADR: 			NextWalkerState = LEVEL3_READ;
-	    LEVEL3_READ: if (HPTWStall) NextWalkerState = LEVEL3_READ;
-	                else 			NextWalkerState = LEVEL3;
-	    LEVEL3: if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF;
-		  		else if (ValidNonLeafPTE) NextWalkerState = LEVEL2_SET_ADR;
+	    L3_ADR: 			NextWalkerState = L3_RD; // first access in SV48
+	    L3_RD: if (HPTWStall) NextWalkerState = L3_RD;
+	                else 			NextWalkerState = L2_ADR;
+//	    LEVEL3: if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF;
+//		  		else if (ValidNonLeafPTE) NextWalkerState = L2_ADR;
+//		 		else 				NextWalkerState = FAULT;
+	    L2_ADR: if (InitialWalkerState == L2_ADR) NextWalkerState = L2_RD; // first access in SV39
+				else if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF; // could shortcut this by a cycle for all Lx_ADR superpages
+		  		else if (ValidNonLeafPTE) NextWalkerState = L2_RD;
+		 		else 				NextWalkerState = FAULT;			
+	    L2_RD: if (HPTWStall) NextWalkerState = L2_RD;
+	      			else 			NextWalkerState = L1_ADR;
+//	    LEVEL2: if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF;
+//				else if (ValidNonLeafPTE) NextWalkerState = L1_ADR;
+//				else 				NextWalkerState = FAULT;
+	    L1_ADR: if (InitialWalkerState == L1_ADR) NextWalkerState = L1_RD; // first access in SV32
+				else if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF; // could shortcut this by a cycle for all Lx_ADR superpages
+		  		else if (ValidNonLeafPTE) NextWalkerState = L1_RD;
+		 		else 				NextWalkerState = FAULT;	
+	    L1_RD: if (HPTWStall) NextWalkerState = L1_RD;
+	      			else 			NextWalkerState = L0_ADR;
+//	    LEVEL1: if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF;
+//	      		else if (ValidNonLeafPTE) NextWalkerState = L0_ADR;
+//				else 				NextWalkerState = FAULT;
+	    L0_ADR: if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF; // could shortcut this by a cycle for all Lx_ADR superpages
+		  		else if (ValidNonLeafPTE) NextWalkerState = L0_RD;
 		 		else 				NextWalkerState = FAULT;
-	    LEVEL2_SET_ADR: 			NextWalkerState = LEVEL2_READ;
-	    LEVEL2_READ: if (HPTWStall) NextWalkerState = LEVEL2_READ;
-	      			else 			NextWalkerState = LEVEL2;
-	    LEVEL2: if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF;
-				else if (ValidNonLeafPTE) NextWalkerState = LEVEL1_SET_ADR;
-				else 				NextWalkerState = FAULT;
-	    LEVEL1_SET_ADR: 			NextWalkerState = LEVEL1_READ;
-	    LEVEL1_READ: if (HPTWStall) NextWalkerState = LEVEL1_READ;
-	      			else 			NextWalkerState = LEVEL1;
-	    LEVEL1: if (ValidLeafPTE && ~Misaligned) NextWalkerState = LEAF;
-	      		else if (ValidNonLeafPTE) NextWalkerState = LEVEL0_SET_ADR;
-				else 				NextWalkerState = FAULT;
-	    LEVEL0_SET_ADR: 			NextWalkerState = LEVEL0_READ;
-	    LEVEL0_READ: if (HPTWStall) NextWalkerState = LEVEL0_READ;
-	      			else 			NextWalkerState = LEVEL0;
-	    LEVEL0: if (ValidLeafPTE) 	NextWalkerState = LEAF;
-				else 				NextWalkerState = FAULT;
+	    L0_RD: if (HPTWStall) NextWalkerState = L0_RD;
+	      			else 			NextWalkerState = LEAF; // NOTE(review): goes to LEAF without a ValidLeafPTE check, whereas the removed LEVEL0 state went to FAULT on an invalid PTE - confirm the level-0 PTE is validated elsewhere
+//	    LEVEL0: if (ValidLeafPTE) 	NextWalkerState = LEAF;
+//				else 				NextWalkerState = FAULT;
 	    LEAF: 						NextWalkerState = IDLE;
 	    FAULT: if (ITLBMissF & AnyCPUReqM & ~MemAfterIWalkDone) NextWalkerState = FAULT;
 	                        else NextWalkerState = IDLE;
--- a/wally-pipelined/src/mmu/pmpadrdec.sv
+++ b/wally-pipelined/src/mmu/pmpadrdec.sv
@ -67,9 +67,7 @@ module pmpadrdec (
  assign TORMatch = PAgePMPAdrIn && PAltPMPAdr;

  // Naturally aligned regions
-
-  // verilator lint_off UNOPTFLAT
-  logic [`PA_BITS-1:0] Mask;
+  logic [`PA_BITS-1:0] NAMask;
  //genvar i;
  
  // create a mask of which bits to ignore
@ -80,23 +78,14 @@ module pmpadrdec (
  //     assign Mask[i] = Mask[i-1] & PMPAdr[i-3]; // NAPOT mask: 1's indicate bits to ignore
  //   end
  // endgenerate
-  prioritycircuit #(.ENTRIES(`PA_BITS-2), .FINAL_OP("NONE")) maskgen(.a(~PMPAdr[`PA_BITS-3:0]), .FirstPin(AdrMode==NAPOT), .y(Mask[`PA_BITS-1:2]));
-  assign Mask[1:0] = 2'b11;

-  // *** possible experiments:
-  /* PA < PMP addr could be in its own module, 
-        preeserving hierarchy so we can know if this is the culprit on the critical path
-        Should take logarthmic time, so more like 6 levels than 40 should be expected
+  assign NAMask[1:0] = {2'b11};

-    update mask generation
-        Should be concurrent with the subtraction/comparison
-        if one is the critical path, the other shouldn't be which makes us think the mask generation is the culprit.
+  prioritythemometer #(`PA_BITS-2) namaskgen(
+    .a({PMPAdr[`PA_BITS-4:0], (AdrMode == NAPOT)}),
+    .y(NAMask[`PA_BITS-1:2]));

-    Hopefully just use the priority circuit here
-    */
-  // verilator lint_on UNOPTFLAT
-
-  assign NAMatch = &((PhysicalAddress ~^ CurrentAdrFull) | Mask);
+  assign NAMatch = &((PhysicalAddress ~^ CurrentAdrFull) | NAMask);

  assign Match = (AdrMode == TOR) ? TORMatch : 
                 (AdrMode == NA4 || AdrMode == NAPOT) ? NAMatch :
--- a/wally-pipelined/src/mmu/pmpchecker.sv
+++ b/wally-pipelined/src/mmu/pmpchecker.sv
@ -69,7 +69,7 @@ module pmpchecker (
    .PAgePMPAdrOut(PAgePMPAdr),
    .FirstMatch, .Match, .Active, .L, .X, .W, .R);

-  prioritycircuit #(.ENTRIES(`PMP_ENTRIES), .FINAL_OP("AND")) pmppriority(.a(Match), .FirstPin(1'b1), .y(FirstMatch)); // Take the ripple gates/signals out of the pmpadrdec and into another unit.
+  priorityonehot #(`PMP_ENTRIES) pmppriority(.a(Match), .y(FirstMatch)); // Take the ripple gates/signals out of the pmpadrdec and into another unit.

  // Only enforce PMP checking for S and U modes when at least one PMP is active or in Machine mode when L bit is set in selected region
  assign EnforcePMP = (PrivilegeModeW == `M_MODE) ? |L : |Active; 
--- a/wally-pipelined/src/mmu/prioritycircuit.sv
+++ b/wally-pipelined/src/mmu/prioritycircuit.sv
@ -1,5 +1,5 @@
 ///////////////////////////////////////////
-// prioritycircuit.sv
+// priorityonehot.sv
 //
 // Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021
 // Modified: Teo Ene 15 Apr 2021:
@ -30,31 +30,22 @@

 `include "wally-config.vh"

-module prioritycircuit #(parameter ENTRIES = 8,
-                         parameter FINAL_OP = "AND") (
+module priorityonehot #(parameter ENTRIES = 8) (
  input  logic  [ENTRIES-1:0] a,
-  input  logic                FirstPin,
  output logic  [ENTRIES-1:0] y
 );
-  // verilator lint_off UNOPTFLAT
+
  logic [ENTRIES-1:0] nolower;

  // generate thermometer code mask
  genvar i;
  generate
-    assign nolower[0] = FirstPin;
+    assign nolower[0] = 1'b1;
    for (i=1; i<ENTRIES; i++) begin:therm
      assign nolower[i] = nolower[i-1] & ~a[i-1];
    end
  endgenerate
-  // verilator lint_on UNOPTFLAT
+
+  assign y = a & nolower;
  
-  generate
-    if (FINAL_OP=="AND") begin
-      assign y = a & nolower;
-    end else if (FINAL_OP=="NONE") begin
-      assign y = nolower;
-    end // *** So far these are the only two operations I need to do at the end, but feel free to add more as needed.
-  endgenerate
-  // assign y = a & nolower;
 endmodule
--- a/wally-pipelined/src/mmu/prioritythermometer.sv
+++ b/wally-pipelined/src/mmu/prioritythermometer.sv
@ -0,0 +1,50 @@
+///////////////////////////////////////////
+// prioritythermometer.sv
+//
+// Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021
+// Modified: Teo Ene 15 Apr 2021:
+//              Temporarily removed parameterized priority encoder for non-parameterized one
+//              To get synthesis working quickly
+//           Kmacsaigoren@hmc.edu 28 May 2021:
+//              Added working version of parameterized priority encoder. 
+//           David_Harris@Hmc.edu switched to one-hot output
+//
+// Purpose: Priority circuit producing a thermometer-code mask: y[i] is 1 only while all of a[i:0] are 1
+//
+// A component of the Wally configurable RISC-V project.
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module prioritythemometer #(parameter N = 8) (   // N: bit width of both a and y; module name spelling (sic) is what pmpadrdec.sv instantiates
+  input  logic  [N-1:0] a,                        // bits to scan, starting at bit 0
+  output logic  [N-1:0] y                         // thermometer code: y[i] = &a[i:0]
+);
+  // y is 1 from bit 0 up to (but not including) the lowest-indexed 0 in a, and 0 from that bit upward.
+  // generate thermometer code mask
+  genvar i;
+  generate
+    assign y[0] = a[0];             // base case: no lower bits to qualify bit 0
+    for (i=1; i<N; i++) begin
+      assign y[i] = y[i-1] & a[i];  // ripple AND: stays 1 only while every lower bit of a is also 1
+    end
+  endgenerate
+
+endmodule
+
+
+
--- a/wally-pipelined/src/mmu/tlblru.sv
+++ b/wally-pipelined/src/mmu/tlblru.sv
@ -39,7 +39,7 @@ module tlblru #(parameter TLB_ENTRIES = 8) (
  logic                AllUsed;  // High if the next access causes all RU bits to be 1

  // Find the first line not recently used
-  prioritycircuit #(.ENTRIES(TLB_ENTRIES), .FINAL_OP("AND")) nru(.a(~RUBits), .FirstPin(1'b1), .y(WriteLines));
+  priorityonehot #(TLB_ENTRIES) nru(.a(~RUBits), .y(WriteLines));

  // Track recently used lines, updating on a CAM Hit or TLB write
  assign WriteEnables = WriteLines & {(TLB_ENTRIES){TLBWrite}};
--- a/wally-pipelined/testbench/imperas-boottim.txt
+++ b/wally-pipelined/testbench/imperas-boottim.txt
@ -0,0 +1,512 @@
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
+0000000000000000
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -46,11 +46,15 @@ module testbench();

  string tests32mmu[] = '{
    "rv32mmu/WALLY-MMU-SV32", "3000"
+    //"rv32mmu/WALLY-PMA", "3000",
+    //"rv32mmu/WALLY-PMA", "3000"
    };

  string tests64mmu[] = '{
    "rv64mmu/WALLY-MMU-SV48", "3000",
    "rv64mmu/WALLY-MMU-SV39", "3000"
+    //"rv64mmu/WALLY-PMA", "3000",
+    //"rv64mmu/WALLY-PMA", "3000"
  };

  
@ -558,7 +562,7 @@ string tests32f[] = '{
    end
  end

-  string signame, memfilename;
+  string signame, memfilename, romfilename;

  logic [31:0] GPIOPinsIn, GPIOPinsOut, GPIOPinsEn;
  logic UARTSin, UARTSout;
@ -604,7 +608,9 @@ string tests32f[] = '{
      end
      // read test vectors into memory
      memfilename = {"../../imperas-riscv-tests/work/", tests[test], ".elf.memfile"};
+      romfilename = {"../../imperas-riscv-tests/imperas-boottim.txt"};
      $readmemh(memfilename, dut.uncore.dtim.RAM);
+      $readmemh(romfilename, dut.uncore.bootdtim.bootdtim.RAM);
      ProgramAddrMapFile = {"../../imperas-riscv-tests/work/", tests[test], ".elf.objdump.addr"};
      ProgramLabelMapFile = {"../../imperas-riscv-tests/work/", tests[test], ".elf.objdump.lab"};
      $display("Read memfile %s", memfilename);
@ -886,6 +892,7 @@ module instrNameDecTB(
                       else if (imm == 2) name = "URET";
                       else if (imm == 258) name = "SRET";
                       else if (imm == 770) name = "MRET";
+                       else if (funct7 == 9) name = "SFENCE.VMA";
                       else              name = "ILLEGAL";
      10'b1110011_001: name = "CSRRW";
      10'b1110011_010: name = "CSRRS";
--- a/wally-pipelined/testbench/testbench-linux.sv
+++ b/wally-pipelined/testbench/testbench-linux.sv
@ -27,7 +27,7 @@

 module testbench();
  
-  parameter waveOnICount = `BUSYBEAR*140000 + `BUILDROOT*3160000; // # of instructions at which to turn on waves in graphical sim
+  parameter waveOnICount = `BUSYBEAR*140000 + `BUILDROOT*3080000; // # of instructions at which to turn on waves in graphical sim
  parameter stopICount   = `BUSYBEAR*143898 + `BUILDROOT*0000000; // # instructions at which to halt sim completely (set to 0 to let it run as far as it can)  

  ///////////////////////////////////////////////////////////////////////////////
@ -184,9 +184,12 @@ module testbench();
        scan_file_rf = $fscanf(data_file_rf, "%d\n", regNumExpected);
        scan_file_rf = $fscanf(data_file_rf, "%x\n", regExpected);
        force dut.hart.ieu.dp.regf.wd3 = regExpected;
-      // Hack to compensate for QEMU's incorrect MSTATUS
+      // Hack to compensate for QEMU's incorrect MSTATUS (Wally correctly identifies MXL, SXL to be 2 whereas QEMU sets them to an invalid value of 0)
      end else if (PCtextW.substr(0,3) == "csrr" && PCtextW.substr(10,16) == "mstatus") begin
        force dut.hart.ieu.dp.regf.wd3 = dut.hart.ieu.dp.WriteDataW & ~64'ha00000000;
+            // Hack to compensate for QEMU's incorrect SSTATUS (Wally correctly identifies UXL to be 2 whereas QEMU sets it to an invalid value of 0)
+      end else if (PCtextW.substr(0,3) == "csrr" && ((PCtextW.substr(10,16) == "sstatus") || (PCtextW.substr(11,17) == "sstatus"))) begin
+        force dut.hart.ieu.dp.regf.wd3 = dut.hart.ieu.dp.WriteDataW & ~64'h200000000;
      end else release dut.hart.ieu.dp.regf.wd3;
      // Hack to compensate for QEMU's correct but different MTVAL (according to spec, storing the faulting instr is an optional feature)
      if (PCtextW.substr(0,3) == "csrr" && PCtextW.substr(10,14) == "mtval") begin
@ -265,7 +268,7 @@ module testbench();

          // Check PCD, InstrD
          if (~PCDwrong && ~(dut.hart.ifu.PCD === PCDexpected)) begin
-            $display("%0t ps, instr %0d: PC does not equal PC expected: %x, %x", $time, instrs, dut.hart.ifu.PCD, PCDexpected);
+            $display("%0t ps, instr %0d: PCD does not equal PCD expected: %x, %x", $time, instrs, dut.hart.ifu.PCD, PCDexpected);
            `ERROR
          end
          InstrMask = InstrDExpected[1:0] == 2'b11 ? 32'hFFFFFFFF : 32'h0000FFFF;