Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally

2022-05-27 20:59:27 +00:00 · 2022-05-27 20:59:27 +00:00 · d8815a7f12
commit d8815a7f12
parent 7447f9fbf4 95b506c5e0
8 changed files with 113 additions and 165 deletions
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@ -1 +1 @@
-Subproject commit 307c77b26e070ae85ffea665ad9b642b40e33c86
+Subproject commit ad04e119a5d846a1c11159786ad3382cf5ad3649
--- a/benchmarks/embench/Makefile
+++ b/benchmarks/embench/Makefile
@ -7,9 +7,14 @@ allClean: clean all

 build:
 	../../addins/embench-iot/build_all.py --builddir=bd_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S" --cflags="-nostartfiles" 
-	../../addins/embench-iot/build_all.py --builddir=bd_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib" --cflags="-nostdlib" --dummy-libs="libgcc libm libc crt0"
+	find ../../addins/embench-iot/bd_speed/ -type f ! -name "*.*" | while read f; do cp "$$f" "$$f.elf"; done
+	../../addins/embench-iot/build_all.py --builddir=bd_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles" --cflags="-msave-restore" --dummy-libs="libgcc libm libc crt0"

-sim: size speed
+sim: modelSimBuild size speed
+
+modelSimBuild: objdump
+	find ../../addins/embench-iot/bd_speed/ -type f -name "*.elf" | while read f; do riscv64-unknown-elf-elf2hex --bit-width 32 --input "$$f" --output "$$f.memfile"; done
+	find ../../addins/embench-iot/bd_speed/ -type f -name "*.elf.objdump" | while read f; do extractFunctionRadix.sh $$f; done

 size:
 	../../addins/embench-iot/benchmark_size.py --builddir=bd_size
@ -18,10 +23,7 @@ speed:
 	../../addins/embench-iot/benchmark_speed.py --builddir=bd_speed --target-module run_wally --cpu-mhz=50

 objdump:
-	riscv64-unknown-elf-objdump -S ../../addins/embench-iot/bd_speed/src/aha-mont64/aha-mont64 > ../../addins/embench-iot/bd_speed/src/aha-mont64/aha-mont64.objdump
-	riscv64-unknown-elf-objdump -S ../../addins/embench-iot/bd_speed/src/cubic/cubic > ../../addins/embench-iot/bd_speed/src/cubic/cubic.objdump
-	riscv64-unknown-elf-objdump -S ../../addins/embench-iot/bd_speed/src/md5sum/md5sum > ../../addins/embench-iot/bd_speed/src/md5sum/md5sum.objdump
-	riscv64-unknown-elf-objdump -S ../../addins/embench-iot/bd_speed/src/statemate/statemate > ../../addins/embench-iot/bd_speed/src/statemate/statemate.objdump
+	find ../../addins/embench-iot/bd_speed/ -type f -name "*.elf" | while read f; do riscv64-unknown-elf-objdump -S "$$f" > "$$f.objdump"; done

 clean: 
 	rm -rf ../../addins/embench-iot/bd_speed/
--- a/pipelined/regression/fp.do
+++ b/pipelined/regression/fp.do
@ -32,7 +32,7 @@ vlib work
 # start and run simulation
 # remove +acc flag for faster sim during regressions if there is no need to access internal signals
 # $num = the added words after the call
-vlog +incdir+../config/$1 +incdir+../config/shared ../testbench/testbench-fp.sv ../src/fpu/*.sv -suppress 2583,7063,8607,2697 
+vlog +incdir+../config/$1 +incdir+../config/shared ../testbench/testbench-fp.sv ../src/fpu/*.sv ../src/generic/*.sv -suppress 2583,7063,8607,2697 

 vsim -voptargs=+acc work.testbenchfp -G TEST=$2

--- a/pipelined/src/fpu/fcvt.sv
+++ b/pipelined/src/fpu/fcvt.sv
@ -17,9 +17,9 @@ module fcvt (
    input logic             XSNaNE,         // is the input a signaling NaN
    input logic [2:0]       FrmE,           // rounding mode 000 = rount to nearest, ties to even   001 = round twords zero  010 = round down  011 = round up  100 = round to nearest, ties to max magnitude
    input logic [`FPSIZES/3:0] FmtE,        // the input's precision (11=quad 01=double 00=single 10=half)
-    output logic [`FLEN-1:0] CvtResE,       // the fp to fp conversion's result
-    output logic [`XLEN-1:0] CvtIntResE,    // the fp to fp conversion's result
-    output logic [4:0]      CvtFlgE         // the fp to fp conversion's flags
+    output logic [`FLEN-1:0] CvtResE,       // the fp conversion result
+    output logic [`XLEN-1:0] CvtIntResE,    // the int conversion result
+    output logic [4:0]      CvtFlgE         // the conversion's flags
    );

    // OpCtrls:
@ -39,9 +39,10 @@ module fcvt (

    logic [`FPSIZES/3:0]    OutFmt;     // format of the output
    logic [`XLEN-1:0]       PosInt;     // the positive integer input
+    logic [`XLEN-1:0]       TrimInt;    // integer trimmed to the correct size
    logic [`LGLEN-1:0]      LzcIn;      // input to the Leading Zero Counter (priority encoder)
    logic [`NE:0]           CalcExp;    // the calculated expoent
-	logic [$clog2(`LGLEN):0] ShiftAmt;  // how much to shift by
+	logic [$clog2(`LGLEN)-1:0] ShiftAmt;  // how much to shift by
    logic [`LGLEN+`NF:0]    ShiftIn;    // number to be shifted
    logic                   ResDenormUf;// does the result underflow or is denormalized
    logic                   ResUf;      // does the result underflow
@ -71,6 +72,7 @@ module fcvt (
    logic                   Int64;      // is the integer 64 bits?
    logic                   IntToFp;       // is the opperation an int->fp conversion?
    logic                   ToInt;      // is the opperation an fp->int conversion?
+    logic [$clog2(`LGLEN)-1:0] ZeroCnt; // output from the LZC


    // seperate OpCtrl for code readability
@ -91,18 +93,11 @@ module fcvt (
    ///////////////////////////////////////////////////////////////////////////
    // negation
    ///////////////////////////////////////////////////////////////////////////
-    // negate the input if the input is a negitive singed integer
-    //      - remove leading ones if the input is a unsigned 32-bit integer
-    //
-    //              Negitive input
-    //                      64-bit input : negate the input
-    //                      32-bit input : trim to 32-bits and negate the input
-    //              Positive input
-    //                      64-bit input : do nothing
-    //                      32-bit input : trim to 32-bits
+    // 1) negate the input if the input is a negitive singed integer
+    // 2) trim the input to the proper size (kill the 32 most significant zeroes if needed)

-    assign PosInt = ResSgn ? Int64 ? -ForwardedSrcAE : {{`XLEN-32{1'b0}}, -ForwardedSrcAE[31:0]} : 
-                             Int64 ? ForwardedSrcAE : {{`XLEN-32{1'b0}}, ForwardedSrcAE[31:0]};
+    assign PosInt = ResSgn ? -ForwardedSrcAE : ForwardedSrcAE;
+    assign TrimInt = {{`XLEN-32{Int64}}, {32{1'b1}}} & PosInt;

    ///////////////////////////////////////////////////////////////////////////
    // lzc 
@ -111,16 +106,10 @@ module fcvt (
    // choose the input to the leading zero counter i.e. priority encoder
    //             int -> fp : | positive integer | 00000... (if needed) | 
    //             fp  -> fp : | fraction         | 00000... (if needed) | 
-    assign LzcIn = IntToFp ? {PosInt, {`LGLEN-`XLEN{1'b0}}} :      // I->F
-                             {XManE[`NF-1:0], {`LGLEN-`NF{1'b0}}}; // F->F
+    assign LzcIn = IntToFp ? {TrimInt, {`LGLEN-`XLEN{1'b0}}} :
+                             {XManE[`NF-1:0], {`LGLEN-`NF{1'b0}}};
    
-    // lglen is the largest possible value of ZeroCnt (NF or XLEN) hence normcnt must be log2(lglen) bits
-	logic [$clog2(`LGLEN):0]	i, ZeroCnt;
-	always_comb begin
-			i = 0;
-			while (~LzcIn[`LGLEN-1-i] & i <= `LGLEN-1) i = i+1;  // search for leading one 
-			ZeroCnt = i;
-	end
+    lzc #(`LGLEN) lzc (.num(LzcIn), .ZeroCnt);


    ///////////////////////////////////////////////////////////////////////////
@ -154,9 +143,9 @@ module fcvt (
    //              - only shift fp -> fp if the intital value is denormalized
    //                  - this is a problem because the input to the lzc was the fraction rather than the mantissa
    //                  - rather have a few and-gates than an extra bit in the priority encoder??? *** is this true?
-    assign ShiftAmt = ToInt ? CalcExp[$clog2(`LGLEN):0]&{$clog2(`LGLEN)+1{~CalcExp[`NE]}} :
-                    ResDenormUf&~IntToFp ? ($clog2(`LGLEN)+1)'(`NF-1)+CalcExp[$clog2(`LGLEN):0] : 
-                              (ZeroCnt+1)&{$clog2(`LGLEN)+1{XOrigDenormE|IntToFp}};
+    assign ShiftAmt = ToInt ? CalcExp[$clog2(`LGLEN)-1:0]&{$clog2(`LGLEN){~CalcExp[`NE]}} :
+                    ResDenormUf&~IntToFp ? ($clog2(`LGLEN))'(`NF-1)+CalcExp[$clog2(`LGLEN)-1:0] : 
+                              (ZeroCnt+1)&{$clog2(`LGLEN){XOrigDenormE|IntToFp}};
    
    // shift
    //      fp -> int: |  `XLEN  zeros |     Mantissa      | 0's if nessisary | << CalcExp
@ -272,7 +261,7 @@ module fcvt (
    //                  - shift left to normilize (-1-ZeroCnt)
    //                  - newBias to make the biased exponent
    //          
-    assign CalcExp = {1'b0, OldExp} - (`NE+1)'(`BIAS) + {2'b0, NewBias} - {{`NE{1'b0}}, XOrigDenormE|IntToFp} - {{`NE-$clog2(`LGLEN){1'b0}}, (ZeroCnt&{$clog2(`LGLEN)+1{XOrigDenormE|IntToFp}})};
+    assign CalcExp = {1'b0, OldExp} - (`NE+1)'(`BIAS) + {2'b0, NewBias} - {{`NE{1'b0}}, XOrigDenormE|IntToFp} - {{`NE-$clog2(`LGLEN)+1{1'b0}}, (ZeroCnt&{$clog2(`LGLEN){XOrigDenormE|IntToFp}})};
    // find if the result is dnormal or underflows
    //      - if Calculated expoenent is 0 or negitive (and the input/result is not exactaly 0)
    //      - can't underflow an integer to Fp conversion
@ -568,7 +557,7 @@ module fcvt (
    //      - do so if the result underflows, is zero (the exp doesnt calculate correctly). or the integer input is 0
    //      - dont set to zero if fp input is zero but not using the fp input
    //      - dont set to zero if int input is zero but not using the int input
-    assign KillRes = (ResUf|(XZeroE&~IntToFp)|(~|PosInt&IntToFp));
+    assign KillRes = (ResUf|(XZeroE&~IntToFp)|(~|TrimInt&IntToFp));

    if (`FPSIZES == 1) begin        
        // IEEE sends a payload while Riscv says to send a canonical quiet NaN
@ -755,7 +744,7 @@ module fcvt (
                        NaNRes = {{`Q_LEN-`H_LEN{1'b1}}, 1'b0, {`H_NE+1{1'b1}}, {`H_NF-1{1'b0}}};
                    end
                    // determine the infinity result
-                    //      - if the input was infinity or rounding mode RZ, RU, RD (and not rounding the value) then output the maximum normalized floating point number with the correct sign
+                    //      - if the input overflows in rounding mode RZ, RU, RD (and not rounding the value) then output the maximum normalized floating point number with the correct sign
                    //      - otherwise: output infinity with the correct sign
                    //      - kill the infinity singal if the input isn't fp
                    InfRes = (~XInfE|IntToFp)&((FrmE[1:0]==2'b01) | (FrmE[1:0]==2'b10&~ResSgn) | (FrmE[1:0]==2'b11&ResSgn)) ? {{`Q_LEN-`H_LEN{1'b1}}, ResSgn, {`H_NE-1{1'b1}}, 1'b0, {`H_NF{1'b1}}} : {{`Q_LEN-`H_LEN{1'b1}}, ResSgn, {`H_NE{1'b1}}, (`H_NF)'(0)};
--- a/pipelined/src/fpu/fma.sv
+++ b/pipelined/src/fpu/fma.sv
@ -409,22 +409,10 @@ module loa( //https://ieeexplore.ieee.org/abstract/document/930098



-    lzc lzc(.f, .NormCntE);
+    lzc #(3*`NF+7) lzc (.num(f), .ZeroCnt(NormCntE));
  
 endmodule

-module lzc(
-    input logic  [3*`NF+6:0]            f,
-    output logic [$clog2(3*`NF+7)-1:0]    NormCntE    // normalization shift
-);
-    
-    logic [$clog2(3*`NF+7)-1:0] i;
-    always_comb begin
-        i = 0;
-        while (~f[3*`NF+6-i] & $unsigned(i) <= $unsigned($clog2(3*`NF+7)'(3)*($clog2(3*`NF+7))'(`NF)+($clog2(3*`NF+7))'(6))) i = i+1;  // search for leading one
-        NormCntE = i;
-    end
-endmodule



@ -465,7 +453,7 @@ module fma2(
    logic               ResultSgn, ResultSgnTmp;  // Result sign
    logic [`NE+1:0]     SumExp;     // exponent of the normalized sum
    logic [`NE+1:0]     FullResultExp;  // ResultExp with bits to determine sign and overflow
-    logic [`NF+2:0]     NormSum;        // normalized sum
+    logic [`NF+1:0]     NormSum;        // normalized sum
    logic               NormSumSticky;  // sticky bit calulated from the normalized sum
    logic               SumZero;        // is the sum zero
    logic               ResultDenorm;   // is the result denormalized
@ -582,7 +570,7 @@ module normalize(
    input logic                         KillProdM,  // is the product set to zero
    input logic 			            ZOrigDenormM,
    input logic                         AddendStickyM,  // the sticky bit caclulated from the aligned addend
-    output logic [`NF+2:0]              NormSum,        // normalized sum
+    output logic [`NF+1:0]              NormSum,        // normalized sum
    output logic                        SumZero,        // is the sum zero
    output logic                        NormSumSticky, UfSticky,    // sticky bits
    output logic [`NE+1:0]              SumExp,         // exponent of the normalized sum
@ -599,7 +587,7 @@ module normalize(
    ///////////////////////////////////////////////////////////////////////////////
    // Normalization
    ///////////////////////////////////////////////////////////////////////////////
-    //*** insert bias-bias simplification in fcvt.sv/phone pictures/ whiteboard... if still there
+    //*** insert bias-bias simplification in fcvt.sv/phone pictures
    // Determine if the sum is zero
    assign SumZero = ~(|SumM);

@ -707,27 +695,27 @@ module normalize(
    assign LZAPlus2 = SumShifted[3*`NF+8];
 	// the only possible mantissa for a plus two is all zeroes - a one has to propigate all the way through a sum. so we can leave the bottom statement alone
    assign CorrSumShifted =  LZAPlus1 ? SumShifted[3*`NF+6:1] : SumShifted[3*`NF+5:0];
-    assign NormSum = CorrSumShifted[3*`NF+5:2*`NF+3];
+    assign NormSum = CorrSumShifted[3*`NF+5:2*`NF+4];

    // Calculate the sticky bit
    if (`FPSIZES == 1) begin
-        assign NormSumSticky = |CorrSumShifted[2*`NF+2:0];
+        assign NormSumSticky = |CorrSumShifted[2*`NF+3:0];

    end else if (`FPSIZES == 2) begin
        // 3*NF+5 - NF1 - 3
-        assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | 
-        (|CorrSumShifted[3*`NF+2-`NF1:2*`NF+3]&~FmtM);
+        assign NormSumSticky = (|CorrSumShifted[2*`NF+3:0]) | 
+        (|CorrSumShifted[3*`NF+3-`NF1:2*`NF+4]&~FmtM);

    end else if (`FPSIZES == 3) begin
-        assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | 
-        (|CorrSumShifted[3*`NF+2-`NF1:2*`NF+3]&((FmtM==`FMT1)|(FmtM==`FMT2))) | 
-        (|CorrSumShifted[3*`NF+2-`NF2:3*`NF+3-`NF1]&(FmtM==`FMT2));
+        assign NormSumSticky = (|CorrSumShifted[2*`NF+3:0]) | 
+        (|CorrSumShifted[3*`NF+3-`NF1:2*`NF+4]&((FmtM==`FMT1)|(FmtM==`FMT2))) | 
+        (|CorrSumShifted[3*`NF+3-`NF2:3*`NF+4-`NF1]&(FmtM==`FMT2));

    end else if (`FPSIZES == 4) begin        
-        assign NormSumSticky = (|CorrSumShifted[2*`NF+2:0]) | 
-        (|CorrSumShifted[3*`NF+2-`D_NF:2*`NF+3]&((FmtM==1)|(FmtM==0)|(FmtM==2))) | 
-        (|CorrSumShifted[3*`NF+2-`S_NF:3*`NF+3-`D_NF]&((FmtM==0)|(FmtM==2))) |
-        (|CorrSumShifted[3*`NF+2-`H_NF:3*`NF+3-`S_NF]&(FmtM==2));
+        assign NormSumSticky = (|CorrSumShifted[2*`NF+3:0]) | 
+        (|CorrSumShifted[3*`NF+3-`D_NF:2*`NF+4]&((FmtM==1)|(FmtM==0)|(FmtM==2))) | 
+        (|CorrSumShifted[3*`NF+3-`S_NF:3*`NF+4-`D_NF]&((FmtM==0)|(FmtM==2))) |
+        (|CorrSumShifted[3*`NF+3-`H_NF:3*`NF+4-`S_NF]&(FmtM==2));

    end

@ -745,7 +733,7 @@ module fmaround(
    input logic  [`FPSIZES/3:0] FmtM,       // precision 1 = double 0 = single
    input logic  [2:0]          FrmM,       // rounding mode
    input logic                 UfSticky,   // sticky bit for underlow calculation
-    input logic  [`NF+2:0]      NormSum,    // normalized sum
+    input logic  [`NF+1:0]      NormSum,    // normalized sum
    input logic                 AddendStickyM,  // addend's sticky bit
    input logic                 NormSumSticky,  // normalized sum's sticky bit
    input logic                 ZZeroM,         // is Z zero
@ -799,83 +787,53 @@ module fmaround(

    if (`FPSIZES == 1) begin
        // determine guard, round, and least significant bit of the result
-        assign Guard = NormSum[2];
        assign Round = NormSum[1];
-        assign LSBNormSum = NormSum[3];
+        assign LSBNormSum = NormSum[2];

        // used to determine underflow flag
-        assign UfGuard = NormSum[1];
        assign UfRound = NormSum[0];
-        assign UfLSBNormSum = NormSum[2];
-
-        // determine sticky
-        assign Sticky = UfSticky | NormSum[0];

    end else if (`FPSIZES == 2) begin
        //         \/-------------NF---------------,
-        //      |      NF1       | 3 |             |
+        //      |      NF1       | 2 |             |
        //          '-------NF1------^

        // determine guard, round, and least significant bit of the result
-        assign Guard = FmtM ? NormSum[2] : NormSum[`NF-`NF1+2];
        assign Round = FmtM ? NormSum[1] : NormSum[`NF-`NF1+1];
-        assign LSBNormSum = FmtM ? NormSum[3] : NormSum[`NF-`NF1+3];
+        assign LSBNormSum = FmtM ? NormSum[2] : NormSum[`NF-`NF1+2];

        // used to determine underflow flag
-        assign UfGuard = FmtM ? NormSum[1] : NormSum[`NF-`NF1+1];
        assign UfRound = FmtM ? NormSum[0] : NormSum[`NF-`NF1];
-        assign UfLSBNormSum = FmtM ? NormSum[2] : NormSum[`NF-`NF1+2];

-        // determine sticky
-        assign Sticky = UfSticky | (FmtM ? NormSum[0] : NormSum[`NF-`NF1]);

    end else if (`FPSIZES == 3) begin
        always_comb begin
            case (FmtM)
                `FMT: begin
                    // determine guard, round, and least significant bit of the result
-                    Guard = NormSum[2];
                    Round = NormSum[1];
-                    LSBNormSum = NormSum[3];
+                    LSBNormSum = NormSum[2];
                    // used to determine underflow flag
-                    UfGuard = NormSum[1];
                    UfRound = NormSum[0];
-                    UfLSBNormSum = NormSum[2];
-                    // determine sticky
-                    Sticky = UfSticky | NormSum[0];
                end
                `FMT1: begin
                    // determine guard, round, and least significant bit of the result
-                    Guard = NormSum[`NF-`NF1+2];
                    Round = NormSum[`NF-`NF1+1];
-                    LSBNormSum = NormSum[`NF-`NF1+3];
+                    LSBNormSum = NormSum[`NF-`NF1+2];
                    // used to determine underflow flag
-                    UfGuard = NormSum[`NF-`NF1+1];
                    UfRound = NormSum[`NF-`NF1];
-                    UfLSBNormSum = NormSum[`NF-`NF1+2];
-                    // determine sticky
-                    Sticky = UfSticky | NormSum[`NF-`NF1];
                end
                `FMT2: begin
                    // determine guard, round, and least significant bit of the result
-                    Guard = NormSum[`NF-`NF2+2];
                    Round = NormSum[`NF-`NF2+1];
-                    LSBNormSum = NormSum[`NF-`NF2+3];
+                    LSBNormSum = NormSum[`NF-`NF2+2];
                    // used to determine underflow flag
-                    UfGuard = NormSum[`NF-`NF2+1];
                    UfRound = NormSum[`NF-`NF2];
-                    UfLSBNormSum = NormSum[`NF-`NF2+2];
-                    // determine sticky
-                    Sticky = UfSticky | NormSum[`NF-`NF2];
                end
                default: begin
-                    Guard = 1'bx;
                    Round = 1'bx;
                    LSBNormSum = 1'bx;
-                    UfGuard = 1'bx;
                    UfRound = 1'bx;
-                    UfLSBNormSum = 1'bx;
-                    Sticky = 1'bx;
                end
            endcase
        end
@ -885,56 +843,40 @@ module fmaround(
            case (FmtM)
                2'h3: begin
                    // determine guard, round, and least significant bit of the result
-                    Guard = NormSum[2];
                    Round = NormSum[1];
-                    LSBNormSum = NormSum[3];
+                    LSBNormSum = NormSum[2];
                    // used to determine underflow flag
-                    UfGuard = NormSum[1];
                    UfRound = NormSum[0];
-                    UfLSBNormSum = NormSum[2];
-                    // determine sticky
-                    Sticky = UfSticky | NormSum[0];
                end
                2'h1: begin
                    // determine guard, round, and least significant bit of the result
-                    Guard = NormSum[`NF-`D_NF+2];
                    Round = NormSum[`NF-`D_NF+1];
-                    LSBNormSum = NormSum[`NF-`D_NF+3];
+                    LSBNormSum = NormSum[`NF-`D_NF+2];
                    // used to determine underflow flag
-                    UfGuard = NormSum[`NF-`D_NF+1];
                    UfRound = NormSum[`NF-`D_NF];
-                    UfLSBNormSum = NormSum[`NF-`D_NF+2];
-                    // determine sticky
-                    Sticky = UfSticky | NormSum[`NF-`D_NF];
                end
                2'h0: begin
                    // determine guard, round, and least significant bit of the result
-                    Guard = NormSum[`NF-`S_NF+2];
                    Round = NormSum[`NF-`S_NF+1];
-                    LSBNormSum = NormSum[`NF-`S_NF+3];
+                    LSBNormSum = NormSum[`NF-`S_NF+2];
                    // used to determine underflow flag
-                    UfGuard = NormSum[`NF-`S_NF+1];
                    UfRound = NormSum[`NF-`S_NF];
-                    UfLSBNormSum = NormSum[`NF-`S_NF+2];
-                    // determine sticky
-                    Sticky = UfSticky | NormSum[`NF-`S_NF];
                end
                2'h2: begin
                    // determine guard, round, and least significant bit of the result
-                    Guard = NormSum[`NF-`H_NF+2];
                    Round = NormSum[`NF-`H_NF+1];
-                    LSBNormSum = NormSum[`NF-`H_NF+3];
+                    LSBNormSum = NormSum[`NF-`H_NF+2];
                    // used to determine underflow flag
-                    UfGuard = NormSum[`NF-`H_NF+1];
                    UfRound = NormSum[`NF-`H_NF];
-                    UfLSBNormSum = NormSum[`NF-`H_NF+2];
-                    // determine sticky
-                    Sticky = UfSticky | NormSum[`NF-`H_NF];
                end
            endcase
        end

    end
+    // used to determine underflow flag
+    assign UfLSBNormSum = Round;
+    // determine sticky
+    assign Sticky = UfSticky | UfRound;


    // Deterimine if a small number was supposed to be subtrated
@ -944,28 +886,28 @@ module fmaround(
    always_comb begin
        // Determine if you add 1
        case (FrmM)
-            3'b000: CalcPlus1 = Guard & (Round | ((Sticky)&~(~Round&SubBySmallNum)) | (~Round&~(Sticky)&LSBNormSum&~SubBySmallNum));//round to nearest even
+            3'b000: CalcPlus1 = Round & ((Sticky| LSBNormSum)&~SubBySmallNum);//round to nearest even
            3'b001: CalcPlus1 = 0;//round to zero
-            3'b010: CalcPlus1 = ResultSgnTmp & ~(SubBySmallNum & ~Guard & ~Round);//round down
-            3'b011: CalcPlus1 = ~ResultSgnTmp & ~(SubBySmallNum & ~Guard & ~Round);//round up
-            3'b100: CalcPlus1 = (Guard & (Round | ((Sticky)&~(~Round&SubBySmallNum)) | (~Round&~(Sticky)&~SubBySmallNum)));//round to nearest max magnitude
+            3'b010: CalcPlus1 = ResultSgnTmp & ~(SubBySmallNum & ~Round);//round down
+            3'b011: CalcPlus1 = ~ResultSgnTmp & ~(SubBySmallNum & ~Round);//round up
+            3'b100: CalcPlus1 = Round & ~SubBySmallNum;//round to nearest max magnitude
            default: CalcPlus1 = 1'bx;
        endcase
        // Determine if you add 1 (for underflow flag)
        case (FrmM)
-            3'b000: UfCalcPlus1 = UfGuard & (UfRound | (UfSticky&UfRound|~UfSubBySmallNum) | (~Sticky&UfLSBNormSum&~UfSubBySmallNum));//round to nearest even
+            3'b000: UfCalcPlus1 = UfRound & ((UfSticky| UfLSBNormSum)&~UfSubBySmallNum);//round to nearest even
            3'b001: UfCalcPlus1 = 0;//round to zero
-            3'b010: UfCalcPlus1 = ResultSgnTmp & ~(UfSubBySmallNum & ~UfGuard & ~UfRound);//round down
-            3'b011: UfCalcPlus1 = ~ResultSgnTmp & ~(UfSubBySmallNum & ~UfGuard & ~UfRound);//round up
-            3'b100: UfCalcPlus1 = (UfGuard & (UfRound | (UfSticky&~(~UfRound&UfSubBySmallNum)) | (~Sticky&~UfSubBySmallNum)));//round to nearest max magnitude
+            3'b010: UfCalcPlus1 = ResultSgnTmp & ~(UfSubBySmallNum & ~UfRound);//round down
+            3'b011: UfCalcPlus1 = ~ResultSgnTmp & ~(UfSubBySmallNum & ~UfRound);//round up
+            3'b100: UfCalcPlus1 = UfRound & ~UfSubBySmallNum;//round to nearest max magnitude
            default: UfCalcPlus1 = 1'bx;
        endcase
        // Determine if you subtract 1
        case (FrmM)
            3'b000: CalcMinus1 = 0;//round to nearest even
-            3'b001: CalcMinus1 = SubBySmallNum & ~Guard & ~Round;//round to zero
-            3'b010: CalcMinus1 = ~ResultSgnTmp & ~Guard & ~Round & SubBySmallNum;//round down
-            3'b011: CalcMinus1 = ResultSgnTmp & ~Guard & ~Round & SubBySmallNum;//round up
+            3'b001: CalcMinus1 = SubBySmallNum & ~Round;//round to zero
+            3'b010: CalcMinus1 = ~ResultSgnTmp & ~Round & SubBySmallNum;//round down
+            3'b011: CalcMinus1 = ResultSgnTmp & ~Round & SubBySmallNum;//round up
            3'b100: CalcMinus1 = 0;//round to nearest max magnitude
            default: CalcMinus1 = 1'bx;
        endcase
@ -973,9 +915,9 @@ module fmaround(
    end

    // If an answer is exact don't round
-    assign Plus1 = CalcPlus1 & (Sticky | Guard | Round);
-    assign UfPlus1 = UfCalcPlus1 & (Sticky | UfGuard);//UfRound is part of sticky
-    assign Minus1 = CalcMinus1 & (Sticky | Guard | Round);
+    assign Plus1 = CalcPlus1 & (Sticky | Round);
+    assign UfPlus1 = UfCalcPlus1 & (Sticky | UfRound);//UfRound is part of sticky
+    assign Minus1 = CalcMinus1 & (Sticky | Round);

    // Compute rounded result
    if (`FPSIZES == 1) begin
@ -1011,7 +953,7 @@ module fmaround(

    end

-    assign NormSumTruncated = NormSum[`NF+2:3];
+    assign NormSumTruncated = NormSum[`NF+1:2];
    assign {FullResultExp, ResultFrac} = {SumExp, NormSumTruncated} + RoundAdd;
    assign ResultExp = FullResultExp[`NE-1:0];

@ -1083,12 +1025,12 @@ module fmaflags(
    // Set Underflow flag if the number is too small to be represented in normal numbers
    //      - Don't set the underflow flag if the result is exact

-    assign Underflow = (SumExp[`NE+1] | ((SumExp == 0) & (Round|Guard|Sticky)))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+    assign Underflow = (SumExp[`NE+1] | ((SumExp == 0) & (Round|Sticky)))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
    //                      exp is negitive         result is denorm        exp was denorm but rounded to norm and if given an unbounded exponent it would stay denormal
-    assign UnderflowFlag = (FullResultExp[`NE+1] | ((FullResultExp == 0) | ((FullResultExp == 1) & (SumExp == 0) & ~(UfPlus1&UfLSBNormSum)))&(Round|Guard|Sticky))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+    assign UnderflowFlag = (FullResultExp[`NE+1] | ((FullResultExp == 0) | ((FullResultExp == 1) & (SumExp == 0) & ~(UfPlus1&UfLSBNormSum)))&(Round|Sticky))&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
    // Set Inexact flag if the result is diffrent from what would be outputed given infinite precision
    //      - Don't set the underflow flag if an underflowed result isn't outputed
-    assign Inexact = (Sticky|Overflow|Guard|Round|Underflow)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);
+    assign Inexact = (Sticky|Overflow|Round|Underflow)&~(XNaNM|YNaNM|ZNaNM|XInfM|YInfM|ZInfM);

    // Combine flags
    //      - FMA can't set the Divide by zero flag
--- a/pipelined/src/fpu/unpack.sv
+++ b/pipelined/src/fpu/unpack.sv
@ -96,9 +96,9 @@ module unpack (

        // extract the exponent, converting the smaller exponent into the larger precision if nessisary
        //      - if the original precision had a denormal number convert the exponent value 1
-        assign XExpE = FmtE ? X[`FLEN-2:`NF] : XOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {XLen1[`LEN1-2], {`NE-`NE1{~XLen1[`LEN1-2]&~XExpZero|XExpMaxE}}, XLen1[`LEN1-3:`NF1]}; 
-        assign YExpE = FmtE ? Y[`FLEN-2:`NF] : YOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {YLen1[`LEN1-2], {`NE-`NE1{~YLen1[`LEN1-2]&~YExpZero|YExpMaxE}}, YLen1[`LEN1-3:`NF1]}; 
-        assign ZExpE = FmtE ? Z[`FLEN-2:`NF] : ZOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {ZLen1[`LEN1-2], {`NE-`NE1{~ZLen1[`LEN1-2]&~ZExpZero|ZExpMaxE}}, ZLen1[`LEN1-3:`NF1]}; 
+        assign XExpE = FmtE ? X[`FLEN-2:`NF] : XOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {XLen1[`LEN1-2], {`NE-`NE1{~XLen1[`LEN1-2]}}, XLen1[`LEN1-3:`NF1]}; 
+        assign YExpE = FmtE ? Y[`FLEN-2:`NF] : YOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {YLen1[`LEN1-2], {`NE-`NE1{~YLen1[`LEN1-2]}}, YLen1[`LEN1-3:`NF1]}; 
+        assign ZExpE = FmtE ? Z[`FLEN-2:`NF] : ZOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {ZLen1[`LEN1-2], {`NE-`NE1{~ZLen1[`LEN1-2]}}, ZLen1[`LEN1-3:`NF1]}; 

        // is the input (in it's original format) denormalized
        assign XOrigDenormE = FmtE ? 0 : ~|XLen1[`LEN1-2:`NF1] & ~XFracZero; 
@ -257,9 +257,9 @@ module unpack (
                    // also need to take into account possible zero/denorm/inf/NaN values

                    // convert the larger precision's exponent to use the largest precision's bias
-                    XExpE = XOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {XLen1[`LEN1-2], {`NE-`NE1{~XLen1[`LEN1-2]&~XExpZero|XExpMaxE}}, XLen1[`LEN1-3:`NF1]}; 
-                    YExpE = YOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {YLen1[`LEN1-2], {`NE-`NE1{~YLen1[`LEN1-2]&~YExpZero|YExpMaxE}}, YLen1[`LEN1-3:`NF1]}; 
-                    ZExpE = ZOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {ZLen1[`LEN1-2], {`NE-`NE1{~ZLen1[`LEN1-2]&~ZExpZero|ZExpMaxE}}, ZLen1[`LEN1-3:`NF1]}; 
+                    XExpE = XOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {XLen1[`LEN1-2], {`NE-`NE1{~XLen1[`LEN1-2]}}, XLen1[`LEN1-3:`NF1]}; 
+                    YExpE = YOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {YLen1[`LEN1-2], {`NE-`NE1{~YLen1[`LEN1-2]}}, YLen1[`LEN1-3:`NF1]}; 
+                    ZExpE = ZOrigDenormE ? {1'b0, {`NE-`NE1{1'b1}}, (`NE1-1)'(1)} : {ZLen1[`LEN1-2], {`NE-`NE1{~ZLen1[`LEN1-2]}}, ZLen1[`LEN1-3:`NF1]}; 

                    // extract the fraction and add the nessesary trailing zeros
                    XFracE = {XLen1[`NF1-1:0], (`NF-`NF1)'(0)};
@ -282,9 +282,9 @@ module unpack (
                    // also need to take into account possible zero/denorm/inf/NaN values
                    
                    // convert the smallest precision's exponent to use the largest precision's bias
-                    XExpE = XOrigDenormE ? {1'b0, {`NE-`NE2{1'b1}}, (`NE2-1)'(1)} : {XLen2[`LEN2-2], {`NE-`NE2{~XLen2[`LEN2-2]&~XExpZero|XExpMaxE}}, XLen2[`LEN2-3:`NF2]}; 
-                    YExpE = YOrigDenormE ? {1'b0, {`NE-`NE2{1'b1}}, (`NE2-1)'(1)} : {YLen2[`LEN2-2], {`NE-`NE2{~YLen2[`LEN2-2]&~YExpZero|YExpMaxE}}, YLen2[`LEN2-3:`NF2]}; 
-                    ZExpE = ZOrigDenormE ? {1'b0, {`NE-`NE2{1'b1}}, (`NE2-1)'(1)} : {ZLen2[`LEN2-2], {`NE-`NE2{~ZLen2[`LEN2-2]&~ZExpZero|ZExpMaxE}}, ZLen2[`LEN2-3:`NF2]}; 
+                    XExpE = XOrigDenormE ? {1'b0, {`NE-`NE2{1'b1}}, (`NE2-1)'(1)} : {XLen2[`LEN2-2], {`NE-`NE2{~XLen2[`LEN2-2]}}, XLen2[`LEN2-3:`NF2]}; 
+                    YExpE = YOrigDenormE ? {1'b0, {`NE-`NE2{1'b1}}, (`NE2-1)'(1)} : {YLen2[`LEN2-2], {`NE-`NE2{~YLen2[`LEN2-2]}}, YLen2[`LEN2-3:`NF2]}; 
+                    ZExpE = ZOrigDenormE ? {1'b0, {`NE-`NE2{1'b1}}, (`NE2-1)'(1)} : {ZLen2[`LEN2-2], {`NE-`NE2{~ZLen2[`LEN2-2]}}, ZLen2[`LEN2-3:`NF2]}; 

                    // extract the fraction and add the nessesary trailing zeros
                    XFracE = {XLen2[`NF2-1:0], (`NF-`NF2)'(0)};
@ -447,9 +447,9 @@ module unpack (
                    
                    // convert the double precsion exponent into quad precsion

-                    XExpE = XOrigDenormE ? {1'b0, {`Q_NE-`D_NE{1'b1}}, (`D_NE-1)'(1)} : {XLen1[`D_LEN-2], {`Q_NE-`D_NE{~XLen1[`D_LEN-2]&~XExpZero|XExpMaxE}}, XLen1[`D_LEN-3:`D_NF]}; 
-                    YExpE = YOrigDenormE ? {1'b0, {`Q_NE-`D_NE{1'b1}}, (`D_NE-1)'(1)} : {YLen1[`D_LEN-2], {`Q_NE-`D_NE{~YLen1[`D_LEN-2]&~YExpZero|YExpMaxE}}, YLen1[`D_LEN-3:`D_NF]}; 
-                    ZExpE = ZOrigDenormE ? {1'b0, {`Q_NE-`D_NE{1'b1}}, (`D_NE-1)'(1)} : {ZLen1[`D_LEN-2], {`Q_NE-`D_NE{~ZLen1[`D_LEN-2]&~ZExpZero|ZExpMaxE}}, ZLen1[`D_LEN-3:`D_NF]}; 
+                    XExpE = XOrigDenormE ? {1'b0, {`Q_NE-`D_NE{1'b1}}, (`D_NE-1)'(1)} : {XLen1[`D_LEN-2], {`Q_NE-`D_NE{~XLen1[`D_LEN-2]}}, XLen1[`D_LEN-3:`D_NF]}; 
+                    YExpE = YOrigDenormE ? {1'b0, {`Q_NE-`D_NE{1'b1}}, (`D_NE-1)'(1)} : {YLen1[`D_LEN-2], {`Q_NE-`D_NE{~YLen1[`D_LEN-2]}}, YLen1[`D_LEN-3:`D_NF]}; 
+                    ZExpE = ZOrigDenormE ? {1'b0, {`Q_NE-`D_NE{1'b1}}, (`D_NE-1)'(1)} : {ZLen1[`D_LEN-2], {`Q_NE-`D_NE{~ZLen1[`D_LEN-2]}}, ZLen1[`D_LEN-3:`D_NF]}; 

                    // extract the fraction and add the nessesary trailing zeros
                    XFracE = {XLen1[`D_NF-1:0], (`Q_NF-`D_NF)'(0)};
@ -471,9 +471,9 @@ module unpack (
                    // also need to take into account possible zero/denorm/inf/NaN values
                    
                    // convert the single precsion exponent into quad precsion
-                    XExpE = XOrigDenormE ? {1'b0, {`Q_NE-`S_NE{1'b1}}, (`S_NE-1)'(1)} : {XLen2[`S_LEN-2], {`Q_NE-`S_NE{~XLen2[`S_LEN-2]&~XExpZero|XExpMaxE}}, XLen2[`S_LEN-3:`S_NF]}; 
-                    YExpE = YOrigDenormE ? {1'b0, {`Q_NE-`S_NE{1'b1}}, (`S_NE-1)'(1)} : {YLen2[`S_LEN-2], {`Q_NE-`S_NE{~YLen2[`S_LEN-2]&~YExpZero|YExpMaxE}}, YLen2[`S_LEN-3:`S_NF]}; 
-                    ZExpE = ZOrigDenormE ? {1'b0, {`Q_NE-`S_NE{1'b1}}, (`S_NE-1)'(1)} : {ZLen2[`S_LEN-2], {`Q_NE-`S_NE{~ZLen2[`S_LEN-2]&~ZExpZero|ZExpMaxE}}, ZLen2[`S_LEN-3:`S_NF]}; 
+                    XExpE = XOrigDenormE ? {1'b0, {`Q_NE-`S_NE{1'b1}}, (`S_NE-1)'(1)} : {XLen2[`S_LEN-2], {`Q_NE-`S_NE{~XLen2[`S_LEN-2]}}, XLen2[`S_LEN-3:`S_NF]}; 
+                    YExpE = YOrigDenormE ? {1'b0, {`Q_NE-`S_NE{1'b1}}, (`S_NE-1)'(1)} : {YLen2[`S_LEN-2], {`Q_NE-`S_NE{~YLen2[`S_LEN-2]}}, YLen2[`S_LEN-3:`S_NF]}; 
+                    ZExpE = ZOrigDenormE ? {1'b0, {`Q_NE-`S_NE{1'b1}}, (`S_NE-1)'(1)} : {ZLen2[`S_LEN-2], {`Q_NE-`S_NE{~ZLen2[`S_LEN-2]}}, ZLen2[`S_LEN-3:`S_NF]}; 

                    // extract the fraction and add the nessesary trailing zeros
                    XFracE = {XLen2[`S_NF-1:0], (`Q_NF-`S_NF)'(0)};
@ -495,9 +495,9 @@ module unpack (
                    // also need to take into account possible zero/denorm/inf/NaN values

                    // convert the half precsion exponent into quad precsion
-                    XExpE = XOrigDenormE ? {1'b0, {`Q_NE-`H_NE{1'b1}}, (`H_NE-1)'(1)} : {XLen3[`H_LEN-2], {`Q_NE-`H_NE{~XLen3[`H_LEN-2]&~XExpZero|XExpMaxE}}, XLen3[`H_LEN-3:`H_NF]}; 
-                    YExpE = YOrigDenormE ? {1'b0, {`Q_NE-`H_NE{1'b1}}, (`H_NE-1)'(1)} : {YLen3[`H_LEN-2], {`Q_NE-`H_NE{~YLen3[`H_LEN-2]&~YExpZero|YExpMaxE}}, YLen3[`H_LEN-3:`H_NF]}; 
-                    ZExpE = ZOrigDenormE ? {1'b0, {`Q_NE-`H_NE{1'b1}}, (`H_NE-1)'(1)} : {ZLen3[`H_LEN-2], {`Q_NE-`H_NE{~ZLen3[`H_LEN-2]&~ZExpZero|ZExpMaxE}}, ZLen3[`H_LEN-3:`H_NF]}; 
+                    XExpE = XOrigDenormE ? {1'b0, {`Q_NE-`H_NE{1'b1}}, (`H_NE-1)'(1)} : {XLen3[`H_LEN-2], {`Q_NE-`H_NE{~XLen3[`H_LEN-2]}}, XLen3[`H_LEN-3:`H_NF]}; 
+                    YExpE = YOrigDenormE ? {1'b0, {`Q_NE-`H_NE{1'b1}}, (`H_NE-1)'(1)} : {YLen3[`H_LEN-2], {`Q_NE-`H_NE{~YLen3[`H_LEN-2]}}, YLen3[`H_LEN-3:`H_NF]}; 
+                    ZExpE = ZOrigDenormE ? {1'b0, {`Q_NE-`H_NE{1'b1}}, (`H_NE-1)'(1)} : {ZLen3[`H_LEN-2], {`Q_NE-`H_NE{~ZLen3[`H_LEN-2]}}, ZLen3[`H_LEN-3:`H_NF]}; 

                    // extract the fraction and add the nessesary trailing zeros
                    XFracE = {XLen3[`H_NF-1:0], (`Q_NF-`H_NF)'(0)};
--- a/pipelined/src/generic/lzc.sv
+++ b/pipelined/src/generic/lzc.sv
@ -0,0 +1,15 @@
+//leading zero counter i.e. priority encoder
+module lzc #(parameter WIDTH=1) (
+    input logic  [WIDTH-1:0]            num,
+    output logic [$clog2(WIDTH)-1:0]  ZeroCnt
+);
+/* verilator lint_off CMPCONST */
+    
+    logic [$clog2(WIDTH)-1:0] i;
+    always_comb begin
+        i = 0;
+        while (~num[WIDTH-1-(32)'(i)] & $unsigned(i) <= $unsigned(($clog2(WIDTH))'(WIDTH-1))) i = i+1;  // search for leading one
+        ZeroCnt = i;
+    end
+/* verilator lint_on CMPCONST */
+endmodule
--- a/pipelined/testbench/testbench-fp.sv
+++ b/pipelined/testbench/testbench-fp.sv
@ -1174,13 +1174,13 @@ end
  ///////////////////////////////////////////////////////////////////////////////////////////////

    // check if the non-fma test is correct
-    if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&(UnitVal !== `CVTINTUNIT)) begin
+    if(~((Res === Ans | NaNGood | NaNGood === 1'bx) & (ResFlg === AnsFlg | AnsFlg === 5'bx))&(UnitVal !== `CVTINTUNIT)&(UnitVal !== `CMPUNIT)) begin
      errors += 1;
      $display("There is an error in %s", Tests[TestNum]);
      $display("inputs: %h %h %h\nSrcA: %h\n Res: %h %h\n Ans: %h %h", X, Y, Z, SrcA, Res, ResFlg, Ans, AnsFlg);
      $stop;
    end
-
+    
    // TestFloat sets the result to all 1's when there is an invalid result, however in 
    // http://www.jhauser.us/arithmetic/TestFloat-3/doc/TestFloat-general.html it says
    // for an unsigned integer result 0 is also okay
@ -1470,7 +1470,7 @@ module readvectors (
            Ans = TestVector[8];
          end
          2'b10:	begin	  // half
-            X = {{`FLEN-`H_LEN{1'b1}}, TestVector[12+3*(`H_LEN)-1:12+(`H_LEN)]};
+            X = {{`FLEN-`H_LEN{1'b1}}, TestVector[12+2*(`H_LEN)-1:12+(`H_LEN)]};
            Y = {{`FLEN-`H_LEN{1'b1}}, TestVector[12+(`H_LEN)-1:12]};
            Ans = TestVector[8];
          end