Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally into main

2025-02-11 06:05:49 +00:00 · 2021-12-30 00:53:44 +00:00 · 2021-12-30 00:53:44 +00:00 · 75c0c8ebea
commit 75c0c8ebea
parent 866a5efc43 1d4ff095cf
7 changed files with 228 additions and 185 deletions
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@ -1 +1 @@
-Subproject commit 307c77b26e070ae85ffea665ad9b642b40e33c86
+Subproject commit be67c99bd461742aa1c100bcc0732657faae2230
--- a/wally-pipelined/regression/wave.do
+++ b/wally-pipelined/regression/wave.do
@ -205,7 +205,7 @@ add wave -noupdate -group AHB /testbench/dut/hart/ebu/HMASTLOCK
 add wave -noupdate -group AHB /testbench/dut/hart/ebu/HADDRD
 add wave -noupdate -group AHB /testbench/dut/hart/ebu/HSIZED
 add wave -noupdate -group AHB /testbench/dut/hart/ebu/HWRITED
-add wave -noupdate -expand -group lsu -color Gold /testbench/dut/hart/lsu/MEM_VIRTMEM/InterlockCurrState
+add wave -noupdate -expand -group lsu -color Gold /testbench/dut/hart/lsu/MEM_VIRTMEM/interlockfsm/InterlockCurrState
 add wave -noupdate -expand -group lsu /testbench/dut/hart/lsu/SelHPTW
 add wave -noupdate -expand -group lsu /testbench/dut/hart/lsu/InterlockStall
 add wave -noupdate -expand -group lsu /testbench/dut/hart/lsu/LSUStall
@ -472,8 +472,11 @@ add wave -noupdate -group {pc selection} /testbench/dut/hart/ifu/PrivilegedNextP
 add wave -noupdate -group {pc selection} /testbench/dut/hart/ifu/PrivilegedChangePCM
 add wave -noupdate /testbench/dut/hart/priv/priv/csr/MEPC_REGW
 add wave -noupdate /testbench/dut/hart/lsu/LocalLsuBusAdr
+add wave -noupdate /testbench/dut/hart/lsu/busfsm/BusNextState
+add wave -noupdate /testbench/dut/hart/lsu/busfsm/DCacheFetchLine
+add wave -noupdate /testbench/dut/hart/lsu/busfsm/DCacheWriteLine
 TreeUpdate [SetDefaultTree]
-WaveRestoreCursors {{Cursor 7} {36865 ns} 1} {{Cursor 5} {49445 ns} 1} {{Cursor 3} {35522 ns} 0} {{Cursor 4} {49574 ns} 1}
+WaveRestoreCursors {{Cursor 7} {36865 ns} 1} {{Cursor 5} {49445 ns} 1} {{Cursor 3} {9745 ns} 0} {{Cursor 4} {49574 ns} 1}
 quietly wave cursor active 3
 configure wave -namecolwidth 250
 configure wave -valuecolwidth 314
@ -489,4 +492,4 @@ configure wave -griddelta 40
 configure wave -timeline 0
 configure wave -timelineunits ns
 update
-WaveRestoreZoom {35088 ns} {35954 ns}
+WaveRestoreZoom {9530 ns} {9952 ns}
--- a/wally-pipelined/src/fpu/fcvt.sv
+++ b/wally-pipelined/src/fpu/fcvt.sv
@ -1,6 +1,7 @@

 `include "wally-config.vh"
 // `include "../../config/rv64icfd/wally-config.vh"
+//  `define XLEN 64
 module fcvt (
 	input logic             XSgnE,      // X's sign
    input logic [10:0]      XExpE,      // X's exponent
@ -59,7 +60,7 @@ module fcvt (
      //  fcvt.lu.d = 111
      //  fcvt.d.l  = 100
      //  fcvt.d.lu = 110
-      //  {long, unsigned, to int, from int}
+      //  {long, unsigned, to int}
   
    // calculate signals based off the input and output's size
    assign Res64 = (FOpCtrlE[0]&FOpCtrlE[2]) | (FmtE&~FOpCtrlE[0]);
@ -158,19 +159,24 @@ module fcvt (

    // select the integer result
    assign CvtIntRes = Of ? FOpCtrlE[1] ? {64{1'b1}} : SgnRes ? {33'b0, {31{1'b1}}}: {1'b0, {63{1'b1}}} : 
-                    Uf ? FOpCtrlE[1] ? 64'b0 : SgnRes ? {32'b0, 1'b1, 31'b0} : {1'b1, 63'b0} :
+                    Uf ? FOpCtrlE[1] ? {63'b0, Plus1&~XSgnE} : SgnRes ? {32'b0, 1'b1, 31'b0} : {1'b1, 63'b0} :
 		            Rounded[64-1:0];

    // select the floating point result            
    assign CvtFPRes = FmtE ? {ResSgn, ResExp, ResFrac} : {{32{1'b1}}, ResSgn, ResExp[7:0], ResFrac[51:29]};

    // select the result
-    assign CvtResE = ~FOpCtrlE[0] ? CvtFPRes : CvtIntRes;
+    assign CvtResE = FOpCtrlE[0] ? CvtIntRes : CvtFPRes;

    // calculate the flags
-    //      - to int only sets the invalid flag
-    //      - from int only sets the inexact flag
-    assign CvtFlgE = {(Of | Uf)&FOpCtrlE[0], 3'b0, (Guard|Round|Sticky)&~FOpCtrlE[0]};
+    //      - only set the invalid flag for out-of-range values if it isn't already indicated by the inexact flag
+    //      - don't set the inexact flag if converting a really large number (closest __ bit integer value is the max value)
+    //      - don't set the inexact flag if converting a negative or tiny number to unsigned (closest integer value is 0 or 1)
+    logic Invalid, Inexact;
+    assign Invalid = (Of | Uf)&FOpCtrlE[0];
+    assign Inexact = (Guard|Round|Sticky)&~((&FOpCtrlE[1:0]&Uf&~(Plus1&~XSgnE))|(FOpCtrlE[0]&Of));
+    assign CvtFlgE = {Invalid&~Inexact, 3'b0, Inexact};
+    // assign CvtFlgE = {(Of | Uf)&FOpCtrlE[0], 3'b0, (Guard|Round|Sticky)&~FOpCtrlE[0]};



--- a/wally-pipelined/src/fpu/fma.sv
+++ b/wally-pipelined/src/fpu/fma.sv
@ -28,7 +28,6 @@
 // `define NE   11//(`Q_SUPPORTED ? 15 : `D_SUPPORTED ? 11 : 8)
 // `define NF   52//(`Q_SUPPORTED ? 112 : `D_SUPPORTED ? 52 : 23)
 // `define XLEN 64
-`define NANPAYLOAD 1
 module fma(
    input logic                 clk,
    input logic                 reset,
@ -117,7 +116,6 @@ module fma1(
    logic [3*`NF+5:0]   AlignedAddendE;     // Z aligned for addition in U(NF+5.2NF+1)
    logic [3*`NF+6:0]   AlignedAddendInv;   // aligned addend possibly inverted
    logic [2*`NF+1:0]   ProdManKilled;      // the product's mantissa possibly killed
-    logic [3*`NF+4:0]   NegProdManKilled;   // a negated ProdManKilled
    logic [3*`NF+6:0]   PreSum, NegPreSum;  // positive and negitve versions of the sum
    logic [`NE-1:0]     XExpVal, YExpVal;   // exponent value after taking into accound denormals
    ///////////////////////////////////////////////////////////////////////////////
@ -321,7 +319,6 @@ module add(
    output logic [3*`NF+6:0]    PreSum, NegPreSum// possibly negitive sum
 );

-    logic [3*`NF+4:0] NegProdManKilled;  // a negated ProdManKilled
    ///////////////////////////////////////////////////////////////////////////////
    // Addition
    ///////////////////////////////////////////////////////////////////////////////
@ -335,15 +332,13 @@ module add(
    assign AlignedAddendInv = InvZE ? {1'b1, ~AlignedAddendE} : {1'b0, AlignedAddendE};
    // Kill the product if the product is too small to effect the addition (determined in fma1.sv)
    assign ProdManKilled = ProdManE&{2*`NF+2{~KillProdE}};
-    // Negate ProdMan for LZA and the negitive sum calculation
-    assign NegProdManKilled = {{`NF+3{~(XZeroE|YZeroE|KillProdE)}}, ~ProdManKilled&{2*`NF+2{~(XZeroE|YZeroE|KillProdE)}}};



    // Do the addition
    //      - calculate a positive and negitive sum in parallel
    assign PreSum = AlignedAddendInv + {55'b0, ProdManKilled, 2'b0} + {{3*`NF+6{1'b0}}, InvZE};
-    assign NegPreSum = AlignedAddendE + {NegProdManKilled, 2'b0} + {{(3*`NF+3){1'b0}},~(XZeroE|YZeroE|KillProdE),2'b0};
+    assign NegPreSum = XZeroE|YZeroE|KillProdE ? {1'b0, AlignedAddendE} : {1'b0, AlignedAddendE} + {{`NF+3{1'b1}}, ~ProdManKilled, 2'b0} + {(3*`NF+7)'(4)};
     
    // Is the sum negitive
    assign NegSumE = PreSum[3*`NF+6];
@ -360,6 +355,8 @@ module loa( //https://ieeexplore.ieee.org/abstract/document/930098
    logic [3*`NF+6:0] T;
    logic [3*`NF+6:0] G;
    logic [3*`NF+6:0] Z;
+    logic [3*`NF+6:0] f;
+
    assign T[3*`NF+6:2*`NF+4] = A[3*`NF+6:2*`NF+4];
    assign G[3*`NF+6:2*`NF+4] = 0;
    assign Z[3*`NF+6:2*`NF+4] = ~A[3*`NF+6:2*`NF+4];
@ -375,7 +372,6 @@ module loa( //https://ieeexplore.ieee.org/abstract/document/930098
    //      - note: the paper linked above uses the numbering system where 0 is the most significant bit
    //f[n] = ~T[n]&T[n-1]           note: n is the MSB
    //f[i] = (T[i+1]&(G[i]&~Z[i-1] | Z[i]&~G[i-1])) | (~T[i+1]&(Z[i]&~Z[i-1] | G[i]&~G[i-1]))
-    logic [3*`NF+6:0] f;
    assign f[3*`NF+6] = ~T[3*`NF+6]&T[3*`NF+5];
    assign f[3*`NF+5:0] = (T[3*`NF+6:1]&(G[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | Z[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1})) | (~T[3*`NF+6:1]&(Z[3*`NF+5:0]&{~Z[3*`NF+4:0], 1'b0} | G[3*`NF+5:0]&{~G[3*`NF+4:0], 1'b1}));

@ -440,11 +436,12 @@ module fma2(
    logic               SumZero;        // is the sum zero
    logic               ResultDenorm;   // is the result denormalized
    logic               Sticky, UfSticky;           // Sticky bit
-    logic               Plus1, Minus1, CalcPlus1;   // do you add or subtract one for rounding
+    logic               CalcPlus1;                  // do you add or subtract one for rounding
    logic               UfPlus1;                    // do you add one (for determining underflow flag)
    logic               Invalid,Underflow,Overflow; // flags
    logic               Guard, Round;   // bits needed to determine rounding
    logic               UfLSBNormSum;   // bits needed to determine rounding for underflow flag
+    logic [`FLEN:0]     RoundAdd;       // how much to add to the result
   
    

@ -471,7 +468,7 @@ module fma2(
    // round to nearest max magnitude

    fmaround fmaround(.FmtM, .FrmM, .Sticky, .UfSticky, .NormSum, .AddendStickyM, .NormSumSticky, .ZZeroM, .InvZM, .ResultSgnTmp, .SumExp,
-        .CalcPlus1, .Plus1, .UfPlus1, .Minus1, .FullResultExp, .ResultFrac, .ResultExp, .Round, .Guard, .UfLSBNormSum);
+        .CalcPlus1, .UfPlus1, .FullResultExp, .ResultFrac, .ResultExp, .Round, .Guard, .RoundAdd, .UfLSBNormSum);



@ -503,8 +500,8 @@ module fma2(
    ///////////////////////////////////////////////////////////////////////////////

    resultselect resultselect(.XSgnM, .YSgnM, .XExpM, .YExpM, .ZExpM, .XManM, .YManM, .ZManM, 
-        .FrmM, .FmtM, .AddendStickyM, .KillProdM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, 
-        .ZSgnEffM, .PSgnM, .ResultSgn, .Minus1, .Plus1, .CalcPlus1, .Invalid, .Overflow, .Underflow, 
+        .FrmM, .FmtM, .AddendStickyM, .KillProdM, .XInfM, .YInfM, .ZInfM, .XNaNM, .YNaNM, .ZNaNM, .RoundAdd,
+        .ZSgnEffM, .PSgnM, .ResultSgn, .CalcPlus1, .Invalid, .Overflow, .Underflow, 
        .ResultDenorm, .ResultExp, .ResultFrac, .FMAResM);

 // *** use NF where needed
@ -539,61 +536,6 @@ module resultsign(

 endmodule

-module resultselect(
-    input logic                 XSgnM, YSgnM,        // input signs
-    input logic     [`NE-1:0]   XExpM, YExpM, ZExpM, // input exponents
-    input logic     [`NF:0]     XManM, YManM, ZManM, // input mantissas
-    input logic     [2:0]       FrmM,       // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
-    input logic                 FmtM,       // precision 1 = double 0 = single
-    input logic                 AddendStickyM,  // sticky bit that is calculated during alignment
-    input logic                 KillProdM,      // set the product to zero before addition if the product is too small to matter
-    input logic                 XInfM, YInfM, ZInfM,    // inputs are infinity
-    input logic                 XNaNM, YNaNM, ZNaNM,    // inputs are NaN
-    input logic                 ZSgnEffM,   // the modified Z sign - depends on instruction
-    input logic                 PSgnM,      // the product's sign
-    input logic                 ResultSgn,  // the result's sign
-    input logic                 Minus1, Plus1, CalcPlus1, // rounding bits
-    input logic                 Invalid, Overflow, Underflow,  // flags
-    input logic                 ResultDenorm,       // is the result denormalized
-    input logic     [`NE-1:0]   ResultExp,          // Result exponent
-    input logic     [`NF-1:0]   ResultFrac,         // Result fraction
-    output logic    [`FLEN-1:0] FMAResM     // FMA final result
-);
-    logic [`FLEN-1:0]   XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results
-
-    generate if(`NANPAYLOAD) begin
-        assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, XManM[`NF-2:0]} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, XManM[50:29]};
-        assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, YManM[`NF-2:0]} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, YManM[50:29]};
-        assign ZNaNResult = FmtM ? {ZSgnEffM, ZExpM, 1'b1, ZManM[`NF-2:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], 1'b1, ZManM[50:29]};
-    end else begin
-        assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, 51'b0} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, 22'b0};
-        assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, 51'b0} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, 22'b0};
-        assign ZNaNResult = FmtM ? {ZSgnEffM, ZExpM, 1'b1, 51'b0} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], 1'b1, 22'b0};
-    end
-    endgenerate
-    
-    
-    assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, {`NE-1{1'b1}}, 1'b0, {`NF{1'b1}}} :
-                                                                                                                          {ResultSgn, {`NE{1'b1}}, {`NF{1'b0}}} :
-                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {{32{1'b1}}, ResultSgn, 8'hfe, {23{1'b1}}} :
-                                                                                                                          {{32{1'b1}}, ResultSgn, 8'hff, 23'b0};
-    assign InvalidResult = FmtM ? {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}} : {{32{1'b1}}, ResultSgn, 8'hff, 1'b1, 22'b0};
-    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZManM[`NF-1:0]} - {62'b0, (Minus1&AddendStickyM)} + {62'b0, (Plus1&AddendStickyM)}} : {{32{1'b1}}, ResultSgn, {ZExpM[`NE-1],ZExpM[6:0], ZManM[51:29]} - {30'b0, (Minus1&AddendStickyM)} + {30'b0, (Plus1&AddendStickyM)}};
-    assign UnderflowResult = FmtM ? {ResultSgn, {`FLEN-1{1'b0}}} + {63'b0,(CalcPlus1&(AddendStickyM|FrmM[1]))} : {{32{1'b1}}, {ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}};
-    assign FMAResM = XNaNM ? XNaNResult :
-                        YNaNM ? YNaNResult :
-                        ZNaNM ? ZNaNResult :
-                        Invalid ? InvalidResult :
-                        XInfM ? FmtM ? {PSgnM, XExpM, XManM[`NF-1:0]} : {{32{1'b1}}, PSgnM,  XExpM[7:0], XManM[51:29]} : 
-                        YInfM ? FmtM ? {PSgnM, YExpM, YManM[`NF-1:0]} : {{32{1'b1}}, PSgnM,  YExpM[7:0], YManM[51:29]} :
-                        ZInfM ? FmtM ? {ZSgnEffM, ZExpM, ZManM[`NF-1:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], ZManM[51:29]} :
-                        KillProdM ? KillProdResult :  
-			            Overflow ? OverflowResult :
-                        Underflow & ~ResultDenorm & (ResultExp!=1) ? UnderflowResult :  
-                        FmtM ? {ResultSgn, ResultExp, ResultFrac} :
-                               {{32{1'b1}}, ResultSgn, ResultExp[7:0], ResultFrac[51:29]};
-
-endmodule

 module normalize(
    input logic  [3*`NF+5:0]    SumM,       // the positive sum
@ -624,19 +566,6 @@ module normalize(
    // Normalization
    ///////////////////////////////////////////////////////////////////////////////

-
-    // logic [8:0] supposedNormCnt;
-    // logic [8:0] i;
-    // always_comb begin
-    //         i = 0;
-    //         while (~SumM[3*`NF+5-i] && $unsigned(i) <= $unsigned(3*`NF+5)) i = i+1;  // search for leading one
-    //         supposedNormCnt = i;    // compute shift count
-    // end
-
-    // always_comb begin
-    //     assert (NormCntM == supposedNormCnt | NormCntM == supposedNormCnt+1 | NormCntM == supposedNormCnt+2) else $fatal ("normcnt not expected");
-    // end
-
    // Determine if the sum is zero
    assign SumZero = ~(|SumM);

@ -644,18 +573,15 @@ module normalize(
    assign FracLen = FmtM ? `NF+1 : 13'd24;

    // calculate the sum's exponent
-    assign SumExpTmpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCntM} + 1 - (`NF+4)); // ****try moving this into previous stage
-    assign SumExpTmp = FmtM ? SumExpTmpTmp : (SumExpTmpTmp-1023+127)&{`NE+2{|SumExpTmpTmp}}; // ***move this ^ the subtraction by a constant isn't simplified
+    assign SumExpTmpTmp = KillProdM ? {2'b0, ZExpM} : ProdExpM + -({4'b0, NormCntM} + 1 - (`NF+4));
+    assign SumExpTmp = FmtM ? SumExpTmpTmp : (SumExpTmpTmp-1023+127)&{`NE+2{|SumExpTmpTmp}};
    
    logic SumDLTEZ, SumDGEFL, SumSLTEZ, SumSGEFL;
    assign SumDLTEZ = SumExpTmpTmp[`NE+1] | ~|SumExpTmpTmp;
-    assign SumDGEFL = ($signed(SumExpTmpTmp)>=$signed(-(13'd`NF+13'd1)));
+    assign SumDGEFL = ($signed(SumExpTmpTmp)>=$signed(-(13'd`NF+13'd2)));
    assign SumSLTEZ = $signed(SumExpTmpTmp) <= $signed(13'd1023-13'd127);
-    assign SumSGEFL = ($signed(SumExpTmpTmp)>=$signed(-13'd24+13'd1023-13'd127)) | ~|SumExpTmpTmp;
-    assign PreResultDenorm2 = (FmtM ? SumDLTEZ : SumSLTEZ) & (FmtM ? SumDGEFL : SumSGEFL) & ~SumZero; //***make sure math good
-    // always_comb begin
-    //     assert (PreResultDenorm == PreResultDenorm2) else $fatal ("PreResultDenorms not equal");
-    // end
+    assign SumSGEFL = ($signed(SumExpTmpTmp)>=$signed(-13'd25+13'd1023-13'd127)) | ~|SumExpTmpTmp;
+    assign PreResultDenorm2 = (FmtM ? SumDLTEZ : SumSLTEZ) & (FmtM ? SumDGEFL : SumSGEFL) & ~SumZero;

    // 010. when should be 001.
    //      - shift left one
@ -667,9 +593,9 @@ module normalize(

    // Determine the shift needed for denormal results
    //  - if not denorm add 1 to shift out the leading 1
-    assign DenormShift = PreResultDenorm2 ? SumExpTmp[8:0] : 1; //*** change this when changing the size of DenormShift also change to an and opperation
+    assign DenormShift = PreResultDenorm2 ? SumExpTmp[8:0] : 1;
    // Normalize the sum
-    assign SumShifted = {3'b0, SumM} << NormCntM+DenormShift; //*** fix mux's with constants in them //***NormCnt can be simplified
+    assign SumShifted = {3'b0, SumM} << NormCntM+DenormShift;
    // LZA correction
    assign LZAPlus1 = SumShifted[3*`NF+7];
    assign LZAPlus2 = SumShifted[3*`NF+8];
@ -699,18 +625,18 @@ module fmaround(
    input logic             InvZM,          // invert Z
    input logic  [`NE+1:0]  SumExp,         // exponent of the normalized sum
    input logic             ResultSgnTmp,      // the result's sign
-    output logic            CalcPlus1, Plus1, UfPlus1, Minus1,  // do you add or subtract on from the result
+    output logic            CalcPlus1, UfPlus1,  // do you add or subtract on from the result
    output logic [`NE+1:0]  FullResultExp,      // ResultExp with bits to determine sign and overflow
    output logic [`NF-1:0]  ResultFrac,         // Result fraction
    output logic [`NE-1:0]  ResultExp,          // Result exponent
    output logic            Sticky,             // sticky bit
+    output logic [`FLEN:0]  RoundAdd,           // how much to add to the result
    output logic            Round, Guard, UfLSBNormSum // bits needed to calculate rounding
 );
    logic           LSBNormSum;         // bit used for rounding - least significant bit of the normalized sum
    logic           SubBySmallNum, UfSubBySmallNum;  // was there supposed to be a subtraction by a small number
-    logic           UfGuard;            // gaurd bit used to caluculate underflow
-    logic           UfCalcPlus1, CalcMinus1;    // do you add or subtract on from the result
-    logic [`FLEN:0] RoundAdd;           // how much to add to the result
+    logic           UfGuard;            // guard bit used to caluculate underflow
+    logic           UfCalcPlus1, CalcMinus1, Plus1, Minus1; // do you add or subtract on from the result
    logic [`NF-1:0] NormSumTruncated;   // the normalized sum trimed to fit the mantissa
    logic           UfRound;

@ -857,4 +783,62 @@ module fmaflags(
    //      - Don't set the underflow flag if the result was rounded up to a normal number
    assign FMAFlgM = {Invalid, 1'b0, Overflow, UnderflowFlag, Inexact};

+endmodule
+
+
+module resultselect( // picks the final FMA result from the special-case candidates and the rounded result
+    input logic                 XSgnM, YSgnM,        // input signs
+    input logic     [`NE-1:0]   XExpM, YExpM, ZExpM, // input exponents
+    input logic     [`NF:0]     XManM, YManM, ZManM, // input mantissas
+    input logic     [2:0]       FrmM,       // rounding mode 000 = round to nearest, ties to even   001 = round towards zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
+    input logic                 FmtM,       // precision 1 = double 0 = single
+    input logic                 AddendStickyM,  // sticky bit that is calculated during alignment
+    input logic                 KillProdM,      // set the product to zero before addition if the product is too small to matter
+    input logic                 XInfM, YInfM, ZInfM,    // inputs are infinity
+    input logic                 XNaNM, YNaNM, ZNaNM,    // inputs are NaN
+    input logic                 ZSgnEffM,   // the modified Z sign - depends on instruction
+    input logic                 PSgnM,      // the product's sign
+    input logic                 ResultSgn,  // the result's sign
+    input logic                 CalcPlus1,  // rounding bit; only used here to form UnderflowResult
+    input logic     [`FLEN:0]   RoundAdd,   // how much to add to the result; only used here to form KillProdResult
+    input logic                 Invalid, Overflow, Underflow,  // flags
+    input logic                 ResultDenorm,       // is the result denormalized
+    input logic     [`NE-1:0]   ResultExp,          // Result exponent
+    input logic     [`NF-1:0]   ResultFrac,         // Result fraction
+    output logic    [`FLEN-1:0] FMAResM     // FMA final result
+);
+    logic [`FLEN-1:0]   XNaNResult, YNaNResult, ZNaNResult, InvalidResult, OverflowResult, KillProdResult, UnderflowResult; // possible results
+
+    generate if(`IEEE754) begin // IEEE754 set: keep the input NaN's sign and lower fraction bits, force the fraction MSB to 1
+        assign XNaNResult = FmtM ? {XSgnM, XExpM, 1'b1, XManM[`NF-2:0]} : {{32{1'b1}}, XSgnM, XExpM[7:0], 1'b1, XManM[50:29]};
+        assign YNaNResult = FmtM ? {YSgnM, YExpM, 1'b1, YManM[`NF-2:0]} : {{32{1'b1}}, YSgnM, YExpM[7:0], 1'b1, YManM[50:29]};
+        assign ZNaNResult = FmtM ? {ZSgnEffM, ZExpM, 1'b1, ZManM[`NF-2:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], 1'b1, ZManM[50:29]};
+    end else begin // otherwise: sign forced to 0 and the fraction is only its MSB (input payload dropped)
+        assign XNaNResult = FmtM ? {1'b0, XExpM, 1'b1, 51'b0} : {{32{1'b1}}, 1'b0, XExpM[7:0], 1'b1, 22'b0};
+        assign YNaNResult = FmtM ? {1'b0, YExpM, 1'b1, 51'b0} : {{32{1'b1}}, 1'b0, YExpM[7:0], 1'b1, 22'b0};
+        assign ZNaNResult = FmtM ? {1'b0, ZExpM, 1'b1, 51'b0} : {{32{1'b1}}, 1'b0, ZExpM[7:0], 1'b1, 22'b0};
+    end
+    endgenerate
+    
+    
+    assign OverflowResult =  FmtM ? ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {ResultSgn, {`NE-1{1'b1}}, 1'b0, {`NF{1'b1}}} :
+                                                                                                                          {ResultSgn, {`NE{1'b1}}, {`NF{1'b0}}} :
+                                    ((FrmM[1:0]==2'b01) | (FrmM[1:0]==2'b10&~ResultSgn) | (FrmM[1:0]==2'b11&ResultSgn)) ? {{32{1'b1}}, ResultSgn, 8'hfe, {23{1'b1}}} :
+                                                                                                                          {{32{1'b1}}, ResultSgn, 8'hff, 23'b0}; // largest finite value when the mode rounds toward zero for this sign, otherwise infinity
+    assign InvalidResult = FmtM ? {ResultSgn, {`NE{1'b1}}, 1'b1, {`NF-1{1'b0}}} : {{32{1'b1}}, ResultSgn, 8'hff, 1'b1, 22'b0}; // NaN encoding: all-ones exponent, only the fraction MSB set
+    assign KillProdResult = FmtM ? {ResultSgn, {ZExpM, ZManM[`NF-1:0]} + (RoundAdd[`FLEN-2:0]&{`FLEN-1{AddendStickyM}})} : {{32{1'b1}}, ResultSgn, {ZExpM[`NE-1],ZExpM[6:0], ZManM[51:29]} + (RoundAdd[59:29]&{31{AddendStickyM}})}; // Z's exponent/fraction, with RoundAdd applied only when AddendStickyM is set
+    assign UnderflowResult = FmtM ? {ResultSgn, {`FLEN-1{1'b0}}} + {63'b0,(CalcPlus1&(AddendStickyM|FrmM[1]))} : {{32{1'b1}}, {ResultSgn, 31'b0} + {31'b0, (CalcPlus1&(AddendStickyM|FrmM[1]))}}; // signed zero, plus one LSB when CalcPlus1&(AddendStickyM|FrmM[1])
+    assign FMAResM = XNaNM ? XNaNResult : // priority-ordered select: the first true condition wins, NaN inputs first (X, then Y, then Z)
+                        YNaNM ? YNaNResult :
+                        ZNaNM ? ZNaNResult :
+                        Invalid ? InvalidResult : // then the invalid-operation result, ahead of the infinity cases
+                        XInfM ? FmtM ? {PSgnM, XExpM, XManM[`NF-1:0]} : {{32{1'b1}}, PSgnM,  XExpM[7:0], XManM[51:29]} : 
+                        YInfM ? FmtM ? {PSgnM, YExpM, YManM[`NF-1:0]} : {{32{1'b1}}, PSgnM,  YExpM[7:0], YManM[51:29]} :
+                        ZInfM ? FmtM ? {ZSgnEffM, ZExpM, ZManM[`NF-1:0]} : {{32{1'b1}}, ZSgnEffM, ZExpM[7:0], ZManM[51:29]} :
+                        KillProdM ? KillProdResult :  
+			            Overflow ? OverflowResult :
+                        Underflow & ~ResultDenorm & (ResultExp!=1) ? UnderflowResult :  
+                        FmtM ? {ResultSgn, ResultExp, ResultFrac} : // default: the rounded result in double precision
+                               {{32{1'b1}}, ResultSgn, ResultExp[7:0], ResultFrac[51:29]}; // default in single precision, with the upper 32 bits set to 1
+
 endmodule
--- a/wally-pipelined/src/lsu/busfsm.sv
+++ b/wally-pipelined/src/lsu/busfsm.sv
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // busfsm.sv
 //
-// Written: Ross Thompson ross1728@gmail.com
+// Written: Ross Thompson ross1728@gmail.com December 29, 2021
 // Modified: 
 //
 // Purpose: Load/Store Unit's interface to BUS
@ -92,6 +92,7 @@ module busfsm #(parameter integer   WordCountThreshold,
 		                         else if(LsuRWM[1] & ~CacheableM) BusNextState = STATE_BUS_UNCACHED_READ;
 		                         else if(DCacheFetchLine)            BusNextState = STATE_BUS_FETCH;
 		                         else if(DCacheWriteLine)            BusNextState = STATE_BUS_WRITE;
+                                 else                             BusNextState = STATE_BUS_READY;
      STATE_BUS_UNCACHED_WRITE:  if(LsuBusAck)                   BusNextState = STATE_BUS_UNCACHED_WRITE_DONE;
 		                         else                            BusNextState = STATE_BUS_UNCACHED_WRITE;
      STATE_BUS_UNCACHED_READ:   if(LsuBusAck)                   BusNextState = STATE_BUS_UNCACHED_READ_DONE;
@ -106,6 +107,7 @@ module busfsm #(parameter integer   WordCountThreshold,
 	                             else                            BusNextState = STATE_BUS_FETCH;
      STATE_BUS_WRITE:           if(WordCountFlag & LsuBusAck)   BusNextState = STATE_BUS_READY;
 	                             else                            BusNextState = STATE_BUS_WRITE;
+	  default:                                                   BusNextState = STATE_BUS_READY;
 	endcase
  end

--- a/wally-pipelined/src/lsu/interlockfsm.sv
+++ b/wally-pipelined/src/lsu/interlockfsm.sv
@ -0,0 +1,113 @@
+///////////////////////////////////////////
+// interlockfsm.sv
+//
+// Written: Ross Thompson ross1728@gmail.com December 29, 2021
+// Modified: 
+//
+// Purpose: Allows the HPTW to take control of the dcache to walk the page table and then replay the memory operation if
+//          there was one.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module interlockfsm // FSM that stalls the CPU on ITLB/DTLB misses, selects the HPTW during the miss states, then replays the CPU request
+  (input logic clk,
+   input logic 	reset, // synchronous; forces the FSM to STATE_T0_READY
+   input logic 	AnyCPUReqM, // qualifies which miss state is entered from STATE_T0_READY
+   input logic 	ITLBMissF, // ITLB miss indication; can start T4/T5/T7 from STATE_T0_READY
+   input logic 	ITLBWriteF, // ends the ITLB miss states (T4 -> READY, T5 -> REPLAY)
+   input logic 	DTLBMissM, // DTLB miss indication; can start T3/T7 from STATE_T0_READY when AnyCPUReqM is set
+   input logic 	DTLBWriteM, // ends the DTLB phase (T3 -> REPLAY, T7 -> T5)
+   input logic 	ExceptionM, // contributes to IgnoreRequest in READY and REPLAY
+   input logic 	PendingInterruptM, // contributes to IgnoreRequest in READY and REPLAY
+   input logic 	DCacheStall, // holds the FSM in STATE_T0_REPLAY while asserted
+
+   output logic InterlockStall, // high in READY on any TLB miss and in all four miss states
+   output logic SelReplayCPURequest, // high when the next state is STATE_T0_REPLAY
+   output logic SelHPTW, // high in all four miss states
+   output logic IgnoreRequest); // high in READY on miss/exception/interrupt, and in REPLAY on exception/interrupt
+
+
+  	  typedef enum 				   {STATE_T0_READY, // idle: no miss being handled
+									STATE_T0_REPLAY, // entered after a walk that had a CPU request; held while DCacheStall
+									STATE_T3_DTLB_MISS, // DTLB miss only
+									STATE_T4_ITLB_MISS, // ITLB miss with no CPU memory request
+									STATE_T5_ITLB_MISS, // ITLB miss with a CPU memory request (goes to REPLAY afterwards)
+									STATE_T7_DITLB_MISS} statetype; // both TLBs missed; waits for DTLBWriteM, then continues in T5
+
+	  statetype InterlockCurrState, InterlockNextState;
+
+
+	  always_ff @(posedge clk) // state register with synchronous reset
+		if (reset)    InterlockCurrState <= #1 STATE_T0_READY;
+		else InterlockCurrState <= #1 InterlockNextState;
+
+	  always_comb begin // next-state logic
+		case(InterlockCurrState)
+		  STATE_T0_READY:        if(~ITLBMissF & DTLBMissM & AnyCPUReqM)          InterlockNextState = STATE_T3_DTLB_MISS; // DTLB miss only
+	      else if(ITLBMissF & ~DTLBMissM & ~AnyCPUReqM)    InterlockNextState = STATE_T4_ITLB_MISS; // ITLB miss, nothing to replay
+          else if(ITLBMissF & ~DTLBMissM & AnyCPUReqM)     InterlockNextState = STATE_T5_ITLB_MISS; // ITLB miss, CPU request must be replayed
+		  else if(ITLBMissF & DTLBMissM & AnyCPUReqM)      InterlockNextState = STATE_T7_DITLB_MISS; // both TLBs missed
+		  else                                             InterlockNextState = STATE_T0_READY; // NOTE(review): also covers DTLBMissM without AnyCPUReqM, where InterlockStall is still asserted - confirm intended
+		  STATE_T0_REPLAY:       if(DCacheStall)                                  InterlockNextState = STATE_T0_REPLAY; // wait until the dcache stops stalling
+	      else                                             InterlockNextState = STATE_T0_READY;
+		  STATE_T3_DTLB_MISS:    if(DTLBWriteM)                                   InterlockNextState = STATE_T0_REPLAY; // DTLB written: replay the CPU request
+		  else                                             InterlockNextState = STATE_T3_DTLB_MISS;
+		  STATE_T4_ITLB_MISS:    if(ITLBWriteF)                                   InterlockNextState = STATE_T0_READY; // ITLB written: no replay state needed
+	      else                                             InterlockNextState = STATE_T4_ITLB_MISS;
+		  STATE_T5_ITLB_MISS:    if(ITLBWriteF)                                   InterlockNextState = STATE_T0_REPLAY; // ITLB written: replay the CPU request
+		  else                                             InterlockNextState = STATE_T5_ITLB_MISS;
+		  STATE_T7_DITLB_MISS:   if(DTLBWriteM)                                   InterlockNextState = STATE_T5_ITLB_MISS; // DTLB written first, ITLB miss handled next in T5
+		  else                                             InterlockNextState = STATE_T7_DITLB_MISS;
+		  default: InterlockNextState = STATE_T0_READY;
+		endcase
+	  end // always_comb
+	  
+	  // signal to CPU it needs to wait on HPTW.
+	  /* -----\/----- EXCLUDED -----\/-----
+	   // this code has a problem with imperas64mmu as it reads in an invalid uninitialized instruction.  InterlockStall becomes x and it propagates
+	   // everywhere.  The case statement below implements the same logic but any x on the inputs will resolve to 0.
+	   // Note this will cause a problem for post synthesis gate simulation.
+	   assign InterlockStall = (InterlockCurrState == STATE_T0_READY & (DTLBMissM | ITLBMissF)) | 
+	   (InterlockCurrState == STATE_T3_DTLB_MISS) | (InterlockCurrState == STATE_T4_ITLB_MISS) |
+	   (InterlockCurrState == STATE_T5_ITLB_MISS) | (InterlockCurrState == STATE_T7_DITLB_MISS);
+
+	   -----/\----- EXCLUDED -----/\----- */
+
+	  always_comb begin // stall output, written as a case so the default assignment of 0 applies when no branch fires
+		InterlockStall = 1'b0;
+		case(InterlockCurrState) 
+		  STATE_T0_READY: if(DTLBMissM | ITLBMissF) InterlockStall = 1'b1; // combinational: stalls in the same cycle the miss is seen
+		  STATE_T3_DTLB_MISS: InterlockStall = 1'b1;
+		  STATE_T4_ITLB_MISS: InterlockStall = 1'b1;
+		  STATE_T5_ITLB_MISS: InterlockStall = 1'b1;
+		  STATE_T7_DITLB_MISS: InterlockStall = 1'b1;
+		  default: InterlockStall = 1'b0; // includes STATE_T0_REPLAY
+		endcase
+	  end
+  
+  
+	  assign SelReplayCPURequest = (InterlockNextState == STATE_T0_REPLAY); // uses the NEXT state, so it is also high on the cycle REPLAY is about to be entered
+	  assign SelHPTW = (InterlockCurrState == STATE_T3_DTLB_MISS) | (InterlockCurrState == STATE_T4_ITLB_MISS) |
+					   (InterlockCurrState == STATE_T5_ITLB_MISS) | (InterlockCurrState == STATE_T7_DITLB_MISS);
+	  assign IgnoreRequest = (InterlockCurrState == STATE_T0_READY & (ITLBMissF | DTLBMissM | ExceptionM | PendingInterruptM)) |
+							 ((InterlockCurrState == STATE_T0_REPLAY)
+							  & (ExceptionM | PendingInterruptM));
+
+endmodule
--- a/wally-pipelined/src/lsu/lsu.sv
+++ b/wally-pipelined/src/lsu/lsu.sv
@ -128,78 +128,13 @@ module lsu
 	  logic [2:0] 				   HPTWSize;
 	  logic 					   SelReplayCPURequest;

-	  typedef enum 				   {STATE_T0_READY,
-									STATE_T0_REPLAY,
-									STATE_T3_DTLB_MISS,
-									STATE_T4_ITLB_MISS,
-									STATE_T5_ITLB_MISS,
-									STATE_T7_DITLB_MISS} statetype;
+	  assign AnyCPUReqM = (|MemRWM) | (|AtomicM);

-	  statetype InterlockCurrState, InterlockNextState;
-
-	  assign AnyCPUReqM = (|MemRWM)  | (|AtomicM);
-
-	  always_ff @(posedge clk)
-		if (reset)    InterlockCurrState <= #1 STATE_T0_READY;
-		else InterlockCurrState <= #1 InterlockNextState;
-
-	  always_comb begin
-		case(InterlockCurrState)
-		  STATE_T0_READY:        if(~ITLBMissF & DTLBMissM & AnyCPUReqM)          InterlockNextState = STATE_T3_DTLB_MISS;
-	      else if(ITLBMissF & ~DTLBMissM & ~AnyCPUReqM)    InterlockNextState = STATE_T4_ITLB_MISS;
-          else if(ITLBMissF & ~DTLBMissM & AnyCPUReqM)     InterlockNextState = STATE_T5_ITLB_MISS;
-		  else if(ITLBMissF & DTLBMissM & AnyCPUReqM)      InterlockNextState = STATE_T7_DITLB_MISS;
-		  else                                             InterlockNextState = STATE_T0_READY;
-		  STATE_T0_REPLAY:       if(DCacheStall)                                  InterlockNextState = STATE_T0_REPLAY;
-	      else                                             InterlockNextState = STATE_T0_READY;
-		  STATE_T3_DTLB_MISS:    if(DTLBWriteM)                                   InterlockNextState = STATE_T0_REPLAY;
-		  else                                             InterlockNextState = STATE_T3_DTLB_MISS;
-		  STATE_T4_ITLB_MISS:    if(ITLBWriteF)                                   InterlockNextState = STATE_T0_READY;
-	      else                                             InterlockNextState = STATE_T4_ITLB_MISS;
-		  STATE_T5_ITLB_MISS:    if(ITLBWriteF)                                   InterlockNextState = STATE_T0_REPLAY;
-		  else                                             InterlockNextState = STATE_T5_ITLB_MISS;
-		  STATE_T7_DITLB_MISS:   if(DTLBWriteM)                                   InterlockNextState = STATE_T5_ITLB_MISS;
-		  else                                             InterlockNextState = STATE_T7_DITLB_MISS;
-		  default: InterlockNextState = STATE_T0_READY;
-		endcase
-	  end // always_comb
+	  interlockfsm interlockfsm (.clk, .reset, .AnyCPUReqM, .ITLBMissF, .ITLBWriteF,
+		 .DTLBMissM, .DTLBWriteM, .ExceptionM, .PendingInterruptM, .DCacheStall,
+		 .InterlockStall, .SelReplayCPURequest, .SelHPTW,
+		 .IgnoreRequest);
 	  
-	  // signal to CPU it needs to wait on HPTW.
-	  /* -----\/----- EXCLUDED -----\/-----
-	   // this code has a problem with imperas64mmu as it reads in an invalid uninitalized instruction.  InterlockStall becomes x and it propagates
-	   // everywhere.  The case statement below implements the same logic but any x on the inputs will resolve to 0.
-	   assign InterlockStall = (InterlockCurrState == STATE_T0_READY & (DTLBMissM | ITLBMissF)) | 
-	   (InterlockCurrState == STATE_T3_DTLB_MISS) | (InterlockCurrState == STATE_T4_ITLB_MISS) |
-	   (InterlockCurrState == STATE_T5_ITLB_MISS) | (InterlockCurrState == STATE_T7_DITLB_MISS);
-
-	   -----/\----- EXCLUDED -----/\----- */
-
-	  always_comb begin
-		InterlockStall = 1'b0;
-		case(InterlockCurrState) 
-		  STATE_T0_READY: if(DTLBMissM | ITLBMissF) InterlockStall = 1'b1;
-		  STATE_T3_DTLB_MISS: InterlockStall = 1'b1;
-		  STATE_T4_ITLB_MISS: InterlockStall = 1'b1;
-		  STATE_T5_ITLB_MISS: InterlockStall = 1'b1;
-		  STATE_T7_DITLB_MISS: InterlockStall = 1'b1;
-		  default: InterlockStall = 1'b0;
-		endcase
-	  end
-  
-  
-	  // When replaying CPU memory request after PTW select the IEUAdrM for correct address.
-	  assign SelReplayCPURequest = (InterlockNextState == STATE_T0_REPLAY);
-	  assign SelHPTW = (InterlockCurrState == STATE_T3_DTLB_MISS) | (InterlockCurrState == STATE_T4_ITLB_MISS) |
-					   (InterlockCurrState == STATE_T5_ITLB_MISS) | (InterlockCurrState == STATE_T7_DITLB_MISS);
-	  assign IgnoreRequest = (InterlockCurrState == STATE_T0_READY & (ITLBMissF | DTLBMissM | ExceptionM | PendingInterruptM)) |
-							 ((InterlockCurrState == STATE_T0_REPLAY)
-							  & (ExceptionM | PendingInterruptM));
-	  
-	  
-
-
-	  // *** add generate to conditionally create hptw, lsuArb, and mmu
-	  // based on `MEM_VIRTMEM
 	  hptw hptw(.clk, .reset, .SATP_REGW, .PCF, .IEUAdrM,
 				.ITLBMissF(ITLBMissF & ~PendingInterruptM),
 				.DTLBMissM(DTLBMissM & ~PendingInterruptM),
@ -216,25 +151,18 @@ module lsu
 	  mux2 #(12) adremux(IEUAdrE[11:0], HPTWAdr[11:0], SelHPTW, LsuAdrE);
 	  mux2 #(`PA_BITS) lsupadrmux(IEUAdrExtM[`PA_BITS-1:0], HPTWAdr, SelHPTW, PreLsuPAdrM);

+	  // always block interrupts when using the hardware page table walker.
 	  assign CPUBusy = StallW & ~SelHPTW;
 	  
-	  // always block interrupts when using the hardware page table walker.
-
-	  // this is for the d cache SRAM.
-	  // turns out because we cannot pipeline hptw requests we don't need this register
+	  // It is not possible to pipeline hptw as the following load will depend on the previous load's
+	  // data. Therefore we don't need a pipeline register
 	  //flop #(`PA_BITS) HPTWAdrMReg(clk, HPTWAdr, HPTWAdrM);   // delay HPTWAdrM by a cycle
-	  
-	  //assign PreLsuRWM = SelHPTW ? {HPTWRead, 1'b0} : MemRWM;
-	  //assign LsuAdrE = SelHPTW ? HPTWAdr[11:0] : IEUAdrE[11:0];  
-	  //assign LsuAtomicM = SelHPTW ? 2'b00 : AtomicM;
-	  //assign PreLsuPAdrM = SelHPTW ? HPTWAdr : IEUAdrExtM[`PA_BITS-1:0]; 
-

 	  // Specify which type of page fault is occurring
-	  // *** `MEM_VIRTMEM
 	  assign DTLBLoadPageFaultM = DTLBPageFaultM & PreLsuRWM[1];
 	  assign DTLBStorePageFaultM = DTLBPageFaultM & PreLsuRWM[0];

+	  // When replaying CPU memory request after PTW select the IEUAdrM for correct address.
 	  assign DCacheAdrE = SelReplayCPURequest ? IEUAdrM[11:0] : LsuAdrE;

 	end // if (`MEM_VIRTMEM)
@ -263,6 +191,13 @@ module lsu
  endgenerate

  // **** look into this confusing signal.
+  // This signal is confusing.  CommittedM tells the CPU's trap unit the current instruction
+  // in the memory stage is a memory operation and that memory operation is either completed
+  // or is partially executed.  This signal is only low for the first cycle of a memory
+  // operation.
+  // **** I think there is also a bug here.  Data cache misses and TLB misses both
+  // set this bit in the first cycle.  It is not strictly wrong, but it may be better
+  // to flush the memory operation at that time.
  assign CommittedM = SelHPTW | DCacheCommittedM | BusCommittedM;

  generate