Merge branch 'main' into busybear

2025-02-11 06:05:49 +00:00 · 2021-03-05 20:27:19 +00:00 · 2021-03-05 20:27:19 +00:00 · f0a103687e
commit f0a103687e
parent 612f7a9ee4 6981907521
37 changed files with 466823 additions and 3511 deletions
--- a/wally-pipelined/bin/extractFunctionRadix.sh
+++ b/wally-pipelined/bin/extractFunctionRadix.sh
@ -0,0 +1,47 @@
+#!/bin/bash
+#
+# Build ModelSim "radix define" files mapping function addresses to labels.
+# Usage: extractFunctionRadix.sh <objdump listing>...
+# For each listing F this writes F.addr (one address per line) and F.do (the
+# radix definition), and appends F's addresses, each prefixed with the 4-digit
+# hex index of F on the command line, to FunctionRadix.addr.
+
+allProgramRadixFile="FunctionRadix"
+index=0
+
+for objDumpFile in "$@"
+do
+    # index = position of this listing on the command line, as 4 hex digits
+    indexHex=$(printf "%04x" "$index")
+    index=$((index + 1))
+
+    # Label lines look like "<address> <label>:".  Addresses are hexadecimal,
+    # so a-f must be accepted: 16 digits (64 bit) or 8 digits (32 bit).
+    listOfAddr=$(grep -Ei '^([0-9a-f]{16}|[0-9a-f]{8}) <[0-9a-zA-Z_]+>' "$objDumpFile")
+    if [ -z "$listOfAddr" ]; then
+        echo "$0: no function labels found in $objDumpFile; skipping" >&2
+        continue
+    fi
+
+    # parse out the addresses
+    addresses=$(echo "$listOfAddr" | awk '{print $1}')
+    echo "$addresses" > "$objDumpFile.addr"
+
+    # one radix entry per label:  16#<address># "<label>" -color "SpringGreen",
+    entries=$(echo "$listOfAddr" | awk '{
+        label = $2
+        gsub(/[<>:]/, "", label)
+        printf "    16#%s# \"%s\" -color \"SpringGreen\",\n", $1, label
+    }')
+
+    {
+        echo "radix define Functions {"
+        # drop the comma that follows the last entry
+        echo "${entries%,}"
+        echo "    -default hex -color green"
+        echo "}"
+    } > "$objDumpFile.do"
+
+    # now create the all in one version: put the index at the beginning of each line
+    echo "$addresses" | sed "s/^/$indexHex/" >> "$allProgramRadixFile.addr"
+done
--- a/wally-pipelined/regression/BTBPredictor.txt
+++ b/wally-pipelined/regression/BTBPredictor.txt
--- a/wally-pipelined/regression/twoBitPredictor.txt
+++ b/wally-pipelined/regression/twoBitPredictor.txt
--- a/wally-pipelined/regression/wally-pipelined-batch.do
+++ b/wally-pipelined/regression/wally-pipelined-batch.do
@ -6,13 +6,12 @@
 # Go Cowboys!!!!!!
 #
 # Takes 1:10 to run RV64IC tests using gui
-# 11 seconds to run batch mode

-# Use this wally-pipelined.do file to run this example.
+# Use this wally-pipelined-batch.do file to run this example.
 # Either bring up ModelSim and type the following at the "ModelSim>" prompt:
-#     do wally-pipelined.do ../config/rv64ic
+#     do wally-pipelined-batch.do
 # or, to run from a shell, type the following at the shell prompt:
-#     vsim -c -do wally-pipelined.do ../config/rv64ic
+#     vsim -do wally-pipelined-batch.do -c
 # (omit the "-c" to see the GUI while running from the shell)

 onbreak {resume}
@ -27,12 +26,22 @@ vlib work
 # suppress spurious warnngs about 
 # "Extra checking for conflicts with always_comb done at vopt time"
 # because vsim will run vopt
-vlog +incdir+$1 ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583

+# default to config/rv64ic, but allow this to be overridden at the command line.  For example:
+# do wally-pipelined-batch.do ../config/rv32ic
+switch $argc {
+    0 {vlog +incdir+../config/rv64ic ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+    1 {vlog +incdir+$1 ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+}
 # start and run simulation
 # remove +acc flag for faster sim during regressions if there is no need to access internal signals
-vopt work.testbench -o workopt 
+vopt +acc work.testbench -o workopt 
 vsim workopt

+# load the branch predictors with known data. The value of the data is not important for function, but
+# is important for preventing pessimistic x propagation.
+mem load -infile twoBitPredictor.txt -format bin testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
+mem load -infile BTBPredictor.txt -format bin testbench/dut/hart/ifu/bpred/TargetPredictor/memory/memory
+
 run -all
 quit
--- a/wally-pipelined/regression/wally-pipelined-ross.do
+++ b/wally-pipelined/regression/wally-pipelined-ross.do
@ -0,0 +1,52 @@
+# wally-pipelined-ross.do 
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Takes 1:10 to run RV64IC tests using gui
+
+# Use this wally-pipelined-ross.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do wally-pipelined-ross.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do wally-pipelined-ross.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# create library
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+# compile source files
+# suppress spurious warnings about 
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# default to config/rv64ic, but allow this to be overridden at the command line.  For example:
+# do wally-pipelined-ross.do ../config/rv32ic
+switch $argc {
+    0 {vlog +incdir+../config/rv64ic ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+    1 {vlog +incdir+$1 ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+}
+# start and run simulation
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+vopt +acc work.testbench -o workopt 
+vsim workopt
+
+# load the branch predictors with known data. The value of the data is not important for function, but
+# is important for preventing pessimistic x propagation.
+mem load -infile twoBitPredictor.txt -format bin testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
+mem load -infile BTBPredictor.txt -format bin testbench/dut/hart/ifu/bpred/TargetPredictor/memory/memory
+
+do wave.do
+add log -r /*
+
+# Run the Simulation 
+#run 1000
+run -all
+#quit
--- a/wally-pipelined/regression/wally-pipelined.do
+++ b/wally-pipelined/regression/wally-pipelined.do
@ -38,6 +38,11 @@ switch $argc {
 vopt +acc work.testbench -o workopt 
 vsim workopt

+# load the branch predictors with known data. The value of the data is not important for function, but
+# is important for preventing pessimistic x propagation.
+mem load -infile twoBitPredictor.txt -format bin testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
+mem load -infile BTBPredictor.txt -format bin testbench/dut/hart/ifu/bpred/TargetPredictor/memory/memory
+
 view wave

 -- display input and output signals as hexidecimal values
--- a/wally-pipelined/regression/wave-all.do
+++ b/wally-pipelined/regression/wave-all.do
--- a/wally-pipelined/regression/wave.do
+++ b/wally-pipelined/regression/wave.do
@ -0,0 +1,134 @@
+onerror {resume}
+quietly WaveActivateNextPane {} 0
+add wave -noupdate /testbench/clk
+add wave -noupdate /testbench/reset
+add wave -noupdate -radix ascii /testbench/memfilename
+add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/PCE
+add wave -noupdate -expand -group {Execution Stage} /testbench/InstrEName
+add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/InstrE
+add wave -noupdate -divider <NULL>
+add wave -noupdate /testbench/dut/hart/ebu/IReadF
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/BPPredWrongE
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/CSRWritePendingDEM
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/RetM
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/TrapM
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/LoadStallD
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/InstrStall
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/DataStall
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/hzu/FlushF
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushD
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushE
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushM
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushW
+add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallF
+add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallD
+add wave -noupdate -group Bpred -expand -group direction -divider Update
+add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/UpdatePC
+add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/UpdateEN
+add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/UpdatePCIndex
+add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/UpdatePrediction
+add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
+add wave -noupdate -group InstrClass /testbench/dut/hart/ifu/bpred/InstrClassF
+add wave -noupdate -group InstrClass /testbench/dut/hart/ifu/bpred/InstrClassD
+add wave -noupdate -group InstrClass /testbench/dut/hart/ifu/bpred/InstrClassE
+add wave -noupdate -group {instruction pipeline} /testbench/dut/hart/ifu/InstrF
+add wave -noupdate -group {instruction pipeline} /testbench/dut/hart/ifu/InstrD
+add wave -noupdate -group {instruction pipeline} /testbench/dut/hart/ifu/InstrE
+add wave -noupdate -group {instruction pipeline} /testbench/dut/hart/ifu/InstrM
+add wave -noupdate /testbench/dut/hart/ifu/bpred/BPPredWrongE
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNextF
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCF
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCPlus2or4F
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/BPPredPCF
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNext0F
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNext1F
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/SelBPPredF
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/BPPredWrongE
+add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PrivilegedChangePCM
+add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/ValidBits
+add wave -noupdate /testbench/dut/hart/ifu/bpred/BPPredF
+add wave -noupdate /testbench/dut/hart/ifu/bpred/BTBValidF
+add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/LookUpPCIndexQ
+add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/UpdatePCIndexQ
+add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/LookUpPC
+add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/TargetWrongE
+add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/FallThroughWrongE
+add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/PredictionDirWrongE
+add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/PredictionPCWrongE
+add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/BPPredWrongE
+add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/InstrClassE
+add wave -noupdate -group BTB -divider Update
+add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/UpdateEN
+add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/UpdatePC
+add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/UpdateTarget
+add wave -noupdate -group BTB -divider Lookup
+add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/TargetPC
+add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/Valid
+add wave -noupdate /testbench/dut/hart/ifu/bpred/BTBPredPCF
+add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/TargetPC
+add wave -noupdate /testbench/dut/hart/ifu/bpred/CorrectPCE
+add wave -noupdate /testbench/dut/hart/ifu/bpred/FlushF
+add wave -noupdate /testbench/dut/hart/FlushF
+add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rf
+add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a1
+add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a2
+add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a3
+add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rd1
+add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rd2
+add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/we3
+add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/wd3
+add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ALUResultW
+add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ReadDataW
+add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/PCLinkW
+add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/CSRReadValW
+add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultSrcW
+add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultW
+add wave -noupdate /testbench/dut/hart/ieu/c/RegWriteE
+add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ifu/InstrD
+add wave -noupdate -group {Decode Stage} /testbench/InstrDName
+add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/c/RegWriteD
+add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/RdD
+add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/Rs1D
+add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/Rs2D
+add wave -noupdate /testbench/InstrFName
+add wave -noupdate -expand -group dcache /testbench/dut/hart/MemAdrM
+add wave -noupdate -expand -group dcache /testbench/dut/hart/MemPAdrM
+add wave -noupdate -expand -group dcache /testbench/dut/hart/WriteDataM
+add wave -noupdate -expand -group dcache /testbench/dut/hart/ReadDataM
+add wave -noupdate -expand -group dcache /testbench/dut/hart/dmem/MemRWM
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1D
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs2D
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1E
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs2E
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RdE
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RdM
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RdW
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/MemReadE
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RegWriteM
+add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RegWriteW
+add wave -noupdate -group Forward -color Thistle /testbench/dut/hart/ieu/fw/ForwardAE
+add wave -noupdate -group Forward -color Thistle /testbench/dut/hart/ieu/fw/ForwardBE
+add wave -noupdate -group Forward -color Thistle /testbench/dut/hart/ieu/fw/LoadStallD
+add wave -noupdate -expand -group {alu execution stage} /testbench/dut/hart/ieu/dp/WriteDataE
+add wave -noupdate -expand -group {alu execution stage} /testbench/dut/hart/ieu/dp/ALUResultE
+add wave -noupdate -expand -group {alu execution stage} /testbench/dut/hart/ieu/dp/SrcAE
+add wave -noupdate -expand -group {alu execution stage} /testbench/dut/hart/ieu/dp/SrcBE
+add wave -noupdate /testbench/dut/hart/ieu/dp/ALUResultM
+TreeUpdate [SetDefaultTree]
+WaveRestoreCursors {{Cursor 2} {231033 ns} 0} {{Cursor 3} {1276117 ns} 0}
+quietly wave cursor active 2
+configure wave -namecolwidth 250
+configure wave -valuecolwidth 518
+configure wave -justifyvalue left
+configure wave -signalnamewidth 1
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+configure wave -gridoffset 0
+configure wave -gridperiod 1
+configure wave -griddelta 40
+configure wave -timeline 0
+configure wave -timelineunits ns
+update
+WaveRestoreZoom {1276094 ns} {1276208 ns}
--- a/wally-pipelined/src/fpu/FMA/add.v
+++ b/wally-pipelined/src/fpu/FMA/add.v
@ -35,14 +35,14 @@ module add(r[105:0], s[105:0], t[157:0], sum[157:0],
 	wire		[157:0] 	sum0;			// sum of compound adder +0 mode
 	wire		[157:0] 	sum1;			// sum of compound adder +1 mode

-	// Invert addend if necessary 
+	// Invert addend if z's sign is different from the product's sign

 	assign t2 = invz ? -t : t;
 	
 	// Zero out product if Z >> product or product really should be zero

-	assign r2 = ~proddenorm & killprod ? 106'b0 : r;
-	assign s2 = ~proddenorm & killprod ? 106'b0 : s;
+	assign r2 = killprod ? 106'b0 : r;
+	assign s2 = killprod ? 106'b0 : s;

 	// Compound adder
 	// Consists of 3:2 CSA followed by long compound CPA
--- a/wally-pipelined/src/fpu/FMA/align.v
+++ b/wally-pipelined/src/fpu/FMA/align.v
@ -15,17 +15,17 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
             killprod,  bypsel[1], bypplus1, byppostnorm);
 /////////////////////////////////////////////////////////////////////////////

-	input 		[51:0]		z;				// Fraction of addend z;
+	input 		[51:0]		z;		// Fraction of addend z;
 	input 		[12:0]		ae;		// sign of exponent of addend z;
-	input 		[11:0]		aligncnt;		// amount to shift
-	input					xzero;			// Input X = 0
-	input                  	yzero;          // Input Y = 0 
-	input                  	zzero;          // Input Z = 0
-	input                  	zdenorm;        // Input Z = denorm
-	input			proddenorm;
+	input 		[11:0]		aligncnt;	// amount to shift
+	input				xzero;		// Input X = 0
+	input                  		yzero;          // Input Y = 0 
+	input                  		zzero;          // Input Z = 0
+	input                  		zdenorm;        // Input Z is denormalized
+	input				proddenorm;	// product is denormalized
 	input     	[1:1] 		bypsel;         // Select bypass to X or Z
-	input					bypplus1;		// Add one to bypassed result
-	input                  	byppostnorm;    // Postnormalize bypassed result 
+	input				bypplus1;	// Add one to bypassed result
+	input                  		byppostnorm;    // Postnormalize bypassed result 
 	output    	[157:0]    	t;              // aligned addend (54 bits left of bpt)
 	output          		bs;           	// sticky bit of addend
 	output          		ps;           	// sticky bit of product
@ -34,13 +34,13 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 	// Internal nodes
 
 	reg       	[157:0]   	t;				// aligned addend from shifter
-	reg             		killprod;		// Z >> product 
+	reg             		killprod;			// Z >> product 
 	reg             		bs;				// sticky bit of addend
 	reg             		ps;				// sticky bit of product
 	reg       	[7:0]		i;				// temp storage for finding sticky bit
 	wire		[52:0]		z1;				// Z plus 1
 	wire		[51:0]		z2;				// Z selected after handling rounds
-	wire		[11:0]		align104;		// alignment count + 104
+	wire		[11:0]		align104;			// alignment count + 104

 	// Increment fraction of Z by  one if necessary for prerounded bypass
 	// This incrementor delay is masked by the alignment count computation
@ -56,7 +56,7 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 	// addend on right shifts.  Handle special cases of shifting
 	// by too much.

-	always @(z2 or aligncnt or align104 or zzero or xzero or yzero or zdenorm)
+	always @(z2 or aligncnt or align104 or zzero or xzero or yzero or zdenorm or proddenorm)
 		begin

 		// Default to clearing sticky bits 
@ -66,7 +66,7 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 		// And to using product as primary operand in adder I exponent gen 
 		killprod = 0;

-		if(zzero) begin 
+		if(zzero) begin // if z = 0
 			t = 158'b0;
 			if (xzero || yzero) killprod = 1;
 		end else if ((aligncnt > 53 && ~aligncnt[11]) || xzero || yzero) begin
@ -75,8 +75,8 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 			t = {53'b0, ~zzero, z2, 52'b0}; 
 			killprod = 1;
 			ps = ~xzero && ~yzero; 
-		end else if ((ae[12] && align104[11])) begin //***fix the if statement
-			// KEP if the multiplier's exponent overflows
+		end else if ((ae[12] && align104[11]) && ~proddenorm) begin //***fix the if statement
+							// KEP if the multiplier's exponent overflows
 			t = {53'b0, ~zzero, z2, 52'b0}; 
 			killprod = 1;
 			ps = ~xzero && ~yzero; 
@ -85,7 +85,7 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 			t = 0;
 		end else if (~aligncnt[11])  begin 	// Left shift by reasonable amount
 			t = {53'b0, ~zzero, z2, 52'b0} << aligncnt;
-		end else begin                 // Otherwise right shift 
+		end else begin                 		// Otherwise right shift 
 			t = {53'b0, ~zzero, z2, 52'b0} >> -aligncnt;

 		// use some behavioral code to find sticky bit.  This is really
--- a/wally-pipelined/src/fpu/FMA/array.sv
+++ b/wally-pipelined/src/fpu/FMA/array.sv
@ -30,85 +30,85 @@ module array(x, y, xdenorm, ydenorm, r, s, bypsel, bypplus1);

    assign xnorm = xdenorm ? {x[50:0], 1'b0} : x; // normalization of denormalized numbers
 	assign ynorm = ydenorm ? {y[50:0], 1'b0} : y;
-     assign yExt = {2'b01,ynorm,1'b0}; // y extended and added assumed 1
-     assign xExt = {2'b01,xnorm}; // x with added assumed 1
+     //assign yExt = {2'b01,ynorm,1'b0}; // y extended and added assumed 1
+     //assign xExt = {2'b01,xnorm}; // x with added assumed 1


     //booth encoding

-     generate
-        for(i=0; i<27; i=i+1) begin
-            booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
-        end
-     endgenerate
+    //  generate
+    //     for(i=0; i<27; i=i+1) begin
+    //         booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
+    //     end
+    //  endgenerate

-    assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
-    assign acc[1] = {50'b01,~e[1],pp[1],add1[0]}; 
-    assign acc[2] = {48'b01,~e[2],pp[2],add1[1], 2'b0};
-    assign acc[3] = {46'b01,~e[3],pp[3],add1[2], 4'b0};
-    assign acc[4] = {44'b01,~e[4],pp[4],add1[3], 6'b0};
-    assign acc[5] = {42'b01,~e[5],pp[5],add1[4], 8'b0};
-    assign acc[6] = {40'b01,~e[6],pp[6],add1[5], 10'b0};
-    assign acc[7] = {38'b01,~e[7],pp[7],add1[6], 12'b0};
-    assign acc[8] = {36'b01,~e[8],pp[8],add1[7], 14'b0};
-    assign acc[9] = {34'b01,~e[9],pp[9],add1[8], 16'b0};
-    assign acc[10] = {32'b01,~e[10],pp[10],add1[9], 18'b0};
-    assign acc[11] = {30'b01,~e[11],pp[11],add1[10], 20'b0};
-    assign acc[12] = {28'b01,~e[12],pp[12],add1[11], 22'b0};
-    assign acc[13] = {26'b01,~e[13],pp[13],add1[12], 24'b0};
-    assign acc[14] = {24'b01,~e[14],pp[14],add1[13], 26'b0};
-    assign acc[15] = {22'b01,~e[15],pp[15],add1[14], 28'b0};
-    assign acc[16] = {20'b01,~e[16],pp[16],add1[15], 30'b0};
-    assign acc[17] = {18'b01,~e[17],pp[17],add1[16], 32'b0};
-    assign acc[18] = {16'b01,~e[18],pp[18],add1[17], 34'b0};
-    assign acc[19] = {14'b01,~e[19],pp[19],add1[18], 36'b0};
-    assign acc[20] = {12'b01,~e[20],pp[20],add1[19], 38'b0};
-    assign acc[21] = {10'b01,~e[21],pp[21],add1[20], 40'b0};
-    assign acc[22] = {8'b01,~e[22],pp[22],add1[21], 42'b0};
-    assign acc[23] = {6'b01,~e[23],pp[23],add1[22], 44'b0};
-    assign acc[24] = {4'b01,~e[24],pp[24],add1[23], 46'b0};
-    assign acc[25] = {~e[25],pp[25],add1[24], 48'b0};
-    assign acc[26] = {pp[26],add1[25], 50'b0};
+    // assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
+    // assign acc[1] = {50'b01,~e[1],pp[1],add1[0]}; 
+    // assign acc[2] = {48'b01,~e[2],pp[2],add1[1], 2'b0};
+    // assign acc[3] = {46'b01,~e[3],pp[3],add1[2], 4'b0};
+    // assign acc[4] = {44'b01,~e[4],pp[4],add1[3], 6'b0};
+    // assign acc[5] = {42'b01,~e[5],pp[5],add1[4], 8'b0};
+    // assign acc[6] = {40'b01,~e[6],pp[6],add1[5], 10'b0};
+    // assign acc[7] = {38'b01,~e[7],pp[7],add1[6], 12'b0};
+    // assign acc[8] = {36'b01,~e[8],pp[8],add1[7], 14'b0};
+    // assign acc[9] = {34'b01,~e[9],pp[9],add1[8], 16'b0};
+    // assign acc[10] = {32'b01,~e[10],pp[10],add1[9], 18'b0};
+    // assign acc[11] = {30'b01,~e[11],pp[11],add1[10], 20'b0};
+    // assign acc[12] = {28'b01,~e[12],pp[12],add1[11], 22'b0};
+    // assign acc[13] = {26'b01,~e[13],pp[13],add1[12], 24'b0};
+    // assign acc[14] = {24'b01,~e[14],pp[14],add1[13], 26'b0};
+    // assign acc[15] = {22'b01,~e[15],pp[15],add1[14], 28'b0};
+    // assign acc[16] = {20'b01,~e[16],pp[16],add1[15], 30'b0};
+    // assign acc[17] = {18'b01,~e[17],pp[17],add1[16], 32'b0};
+    // assign acc[18] = {16'b01,~e[18],pp[18],add1[17], 34'b0};
+    // assign acc[19] = {14'b01,~e[19],pp[19],add1[18], 36'b0};
+    // assign acc[20] = {12'b01,~e[20],pp[20],add1[19], 38'b0};
+    // assign acc[21] = {10'b01,~e[21],pp[21],add1[20], 40'b0};
+    // assign acc[22] = {8'b01,~e[22],pp[22],add1[21], 42'b0};
+    // assign acc[23] = {6'b01,~e[23],pp[23],add1[22], 44'b0};
+    // assign acc[24] = {4'b01,~e[24],pp[24],add1[23], 46'b0};
+    // assign acc[25] = {~e[25],pp[25],add1[24], 48'b0};
+    // assign acc[26] = {pp[26],add1[25], 50'b0};

-    //*** resize adders
-     generate
-        for(i=0; i<9; i=i+1) begin
-            add3comp2 #(.BITS(106)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
-                                           .carry(carryTmp[i][105:0]), .sum(lv1add[i*2+1]));
-            assign lv1add[i*2] = {carryTmp[i][104:0], 1'b0};
-        end
-     endgenerate
+    // //*** resize adders
+    //  generate
+    //     for(i=0; i<9; i=i+1) begin
+    //         add3comp2 #(.BITS(106)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
+    //                                        .carry(carryTmp[i][105:0]), .sum(lv1add[i*2+1]));
+    //         assign lv1add[i*2] = {carryTmp[i][104:0], 1'b0};
+    //     end
+    //  endgenerate

-     generate
-        for(i=0; i<6; i=i+1) begin
-            add3comp2 #(.BITS(106)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
-                                           .carry(carryTmp[i+9][105:0]), .sum(lv2add[i*2+1]));
-            assign lv2add[i*2] = {carryTmp[i+9][104:0], 1'b0};
-        end
-     endgenerate
+    //  generate
+    //     for(i=0; i<6; i=i+1) begin
+    //         add3comp2 #(.BITS(106)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
+    //                                        .carry(carryTmp[i+9][105:0]), .sum(lv2add[i*2+1]));
+    //         assign lv2add[i*2] = {carryTmp[i+9][104:0], 1'b0};
+    //     end
+    //  endgenerate

-    generate
-        for(i=0; i<4; i=i+1) begin
-            add3comp2 #(.BITS(106)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
-                                            .carry(carryTmp[i+15][105:0]), .sum(lv3add[i*2+1]));
-            assign lv3add[i*2] = {carryTmp[i+15][104:0], 1'b0};
-        end
-    endgenerate
+    // generate
+    //     for(i=0; i<4; i=i+1) begin
+    //         add3comp2 #(.BITS(106)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
+    //                                         .carry(carryTmp[i+15][105:0]), .sum(lv3add[i*2+1]));
+    //         assign lv3add[i*2] = {carryTmp[i+15][104:0], 1'b0};
+    //     end
+    // endgenerate


-    generate
-        for(i=0; i<2; i=i+1) begin
-            add4comp2 #(.BITS(106)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
-                                            .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
-            assign lv4add[i*2] = {carryTmp[i+19][104:0], 1'b0};
-        end
-    endgenerate
+    // generate
+    //     for(i=0; i<2; i=i+1) begin
+    //         add4comp2 #(.BITS(106)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
+    //                                         .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
+    //         assign lv4add[i*2] = {carryTmp[i+19][104:0], 1'b0};
+    //     end
+    // endgenerate

-    add4comp2 #(.BITS(106)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
-                                    .carry(carryTmp[21]), .sum(s));
-    assign r = {carryTmp[21][104:0], 1'b0};
+    // add4comp2 #(.BITS(106)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
+    //                                 .carry(carryTmp[21]), .sum(s));
+    // assign r = {carryTmp[21][104:0], 1'b0};

-	// assign r = 106'b0;
-	// assign s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm};
+	assign r = 106'b0;
+	assign s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm};

 endmodule
--- a/wally-pipelined/src/fpu/FMA/expgen.v
+++ b/wally-pipelined/src/fpu/FMA/expgen.v
@ -19,7 +19,7 @@ module expgen(x[62:52], y[62:52], z[62:52],
 			   earlyres[62:52], earlyressel, bypsel[1], byppostnorm, 
 			   killprod,  sumzero, postnormalize, normcnt, infinity, 
 			   invalid, overflow, underflow, inf, 
-			   nan, xnan, ynan, znan, zdenorm, specialsel, 
+			   nan, xnan, ynan, znan, zdenorm, proddenorm, specialsel, 
 			   aligncnt, w[62:52], wbypass[62:52],
 			   prodof, sumof, sumuf, denorm0, ae[12:0]);
 /////////////////////////////////////////////////////////////////////////////
@ -28,36 +28,37 @@ module expgen(x[62:52], y[62:52], z[62:52],
 	input     	[62:52]  	y;         		// Exponent of multiplicand y
 	input     	[62:52]  	z;           	// Exponent of addend z
 	input     	[62:52]	 	earlyres;  		// Result from other FPU block
-	input     				earlyressel;    // Select result from other block
+	input     			earlyressel;    // Select result from other block
 	input     	[1:1] 		bypsel;         // Bypass X or Z
-	input     				byppostnorm;    // Postnormalize bypassed result
-	input     				killprod;    	// Z >> product
-	input     				sumzero;     	// sum exactly equals zero 
-	input     				postnormalize;  // postnormalize rounded result
+	input     			byppostnorm;    // Postnormalize bypassed result
+	input     			killprod;    	// Z >> product
+	input     			sumzero;     	// sum exactly equals zero 
+	input     			postnormalize;  // postnormalize rounded result
 	input     	[8:0]  		normcnt;     	// normalization shift count 
-	input     				infinity;    	// generate infinity on overflow 
-	input     				invalid;     	// Result invalid
-	input     				overflow;    	// Result overflowed
-	input     				underflow;   	// Result underflowed 
-	input     				inf;			// Some input is infinity
-	input     				nan;			// Some input is NaN
-	input     				xnan;			// X is NaN
-	input     				ynan;			// Y is NaN
-	input     				znan;			// Z is NaN 
-	input     				zdenorm;		// Z is denorm
-	input     				specialsel;  	// Select special result
+	input     			infinity;    	// generate infinity on overflow 
+	input     			invalid;     	// Result invalid
+	input     			overflow;    	// Result overflowed
+	input     			underflow;   	// Result underflowed 
+	input     			inf;			// Some input is infinity
+	input     			nan;			// Some input is NaN
+	input     			xnan;			// X is NaN
+	input     			ynan;			// Y is NaN
+	input     			znan;			// Z is NaN 
+	input     			zdenorm;		// Z is denorm
+	input     			proddenorm;		// product is denorm
+	input     			specialsel;  	// Select special result
 	output		[11:0]   	aligncnt;       // shift count for alignment shifter
-	output		[62:52]     w;           	// Exponent of result
-	output		[62:52]     wbypass;     	// Prerounded exponent for bypass 
-	output					prodof;         // X*Y exponent out of bounds 
-	output					sumof;          // X*Y+Z exponent out of bounds 
-	output					sumuf;         // X*Y+Z exponent underflows 
-	output					denorm0;     	// exponent = 0 for denorm 
+	output		[62:52]    	w;           	// Exponent of result
+	output		[62:52]     	wbypass;     	// Prerounded exponent for bypass 
+	output				prodof;         // X*Y exponent out of bounds 
+	output				sumof;          // X*Y+Z exponent out of bounds 
+	output				sumuf;         // X*Y+Z exponent underflows 
+	output				denorm0;     	// exponent = 0 for denorm 
 	output		[12:0]		ae;				//exponent of multiply

 	//   Internal nodes

-	wire 	[12:0]			aetmp;				// Exponent of Multiply
+
 	wire 	[12:0]			aligncnt0;		// Shift count for alignment
 	wire 	[12:0]			aligncnt1;		// Shift count for alignment
 	wire 	[12:0]			be;				// Exponent of multiply
@ -72,9 +73,11 @@ module expgen(x[62:52], y[62:52], z[62:52],
 	// Note that the exponent does not have to be incremented on a postrounding
 	//   normalization of X because the mantissa was already increased.   Report
 	//   if exponent is out of bounds 
-	assign ae = x + y  - 1023; 

-	assign prodof = (ae > 2046 && ~ae[12] && ~killprod);
+
+	assign ae = x + y  - 1023;
+
+	assign prodof = (ae > 2046 && ~ae[12]);

 	// Compute alignment shift count
 	// Adjust for postrounding normalization of Z.
@ -82,8 +85,10 @@ module expgen(x[62:52], y[62:52], z[62:52],
 	// check if a round overflows is shorter than the actual round and
 	// is masked by the bypass mux and two 10 bit adder delays.

-	assign aligncnt0 = z - ae[10:0] + 13'b0;
-	assign aligncnt1 = z - ae[10:0] + 13'b1;
+	assign aligncnt0 = z - ae + 13'b0;// KEP use all of ae
+	assign aligncnt1 = z - ae + 13'b1;	
+	//assign aligncnt0 = z - ae[10:0] + 13'b0;//original
+	//assign aligncnt1 = z - ae[10:0] + 13'b1;
 	assign aligncnt = bypsel[1] && byppostnorm ? aligncnt1 : aligncnt0;

 	// Select exponent (usually from product except in case of huge addend)
@ -118,13 +123,17 @@ module expgen(x[62:52], y[62:52], z[62:52],
 	// rounding mode.  NaNs are propagated or generated.

 	assign specialres = earlyressel ? earlyres :
-					invalid ? nanres :
+					invalid | nan ? nanres : // KEP added nan
 					overflow ? infinityres : 
 					inf ? 11'b11111111111 :
 					underflow ? 11'b0 : 11'bx;

 	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;

+	// IEEE 754-2008 section 6.2.3 states:
+	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
+	// identical to the payload of one of the input NaNs if representable in the destination
+	// format. This standard does not specify which of the input NaNs will provide the payload."
 	assign nanres = xnan ? x : (ynan ? y : (znan? z : 11'b11111111111));

 	// A mux selects the early result from other FPU blocks or the 
--- a/wally-pipelined/src/fpu/FMA/flag.v
+++ b/wally-pipelined/src/fpu/FMA/flag.v
@ -13,31 +13,31 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 			 inf, nan, invalid, overflow, underflow, inexact);
 /////////////////////////////////////////////////////////////////////////////

-	input                  	xnan;        	// X is NaN 
-	input                  	ynan;        	// Y is NaN 
-	input                 	znan;       	// Z is NaN 
-	input                  	xinf;        	// X is Inf
-	input                 	yinf;       	// Y is Inf 
-	input                  	zinf;        	// Z is Inf
-	input                  	prodof;         // X*Y overflows exponent
-	input                  	sumof;          // X*Y + z underflows exponent
-	input                  	sumuf;          // X*Y + z underflows exponent
-	input					psign; 			// Sign of product
-	input					zsign; 			// Sign of z
-	input					xzero;			// x = 0
-	input					yzero;			// y = 0
-	input     	[1:0]  		v;				// R and S bits of result
-	output					inf;			// Some	source is Inf
-	output					nan;			// Some	source is NaN
-	output					invalid;		// Result is invalid	
-	output					overflow;		// Result overflowed	
-	output					underflow;		// Result underflowed	
-	output					inexact;		// Result is not an exact	number
+	input                  		xnan;        	// X is NaN 
+	input                  		ynan;        	// Y is NaN 
+	input                 		znan;       	// Z is NaN 
+	input                  		xinf;        	// X is Inf
+	input                 		yinf;       	// Y is Inf 
+	input                  		zinf;        	// Z is Inf
+	input                  		prodof;         // X*Y overflows exponent
+	input                  		sumof;          // X*Y + z overflows exponent
+	input                  		sumuf;          // X*Y + z underflows exponent
+	input				psign; 		// Sign of product
+	input				zsign; 		// Sign of z
+	input				xzero;		// x = 0
+	input				yzero;		// y = 0
+	input     	[1:0]  		v;		// R and S bits of result
+	output				inf;		// Some	source is Inf
+	output				nan;		// Some	source is NaN
+	output				invalid;	// Result is invalid	
+	output				overflow;	// Result overflowed	
+	output				underflow;	// Result underflowed	
+	output				inexact;	// Result is not an exact number
 
 	//   Internal nodes

-	wire					prodinf;		// X*Y larger than max possible
-	wire					suminf;			// X*Y+Z larger than max possible
+	wire				prodinf;	// X*Y larger than max possible
+	wire				suminf;		// X*Y+Z larger than max possible

 	// If any input is NaN, propagate the NaN 

@ -46,12 +46,14 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	// Same with infinity (inf - inf and O * inf don't propagate inf
 	//  but it's ok becaue illegal op takes higher precidence)

-	assign inf= xinf || yinf || zinf;
+	assign inf= xinf || yinf || zinf || suminf;//KEP added suminf 
+	//assign inf= xinf || yinf || zinf;//original

 	// Generate infinity checks

 	assign prodinf = prodof && ~xnan && ~ynan;
-	assign suminf = sumof && ~xnan && ~ynan && ~znan;
+	//KEP added if the product is infinity then sum is infinity
+	assign suminf = prodinf | sumof && ~xnan && ~ynan && ~znan;

 	// Set invalid flag for following cases:
 	//   1) Inf - Inf
@ -59,8 +61,7 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	//   3) Output = NaN (this is not part of the IEEE spec,  only 486 proj)

 	assign invalid = (xinf || yinf || prodinf) && zinf && (psign ^ zsign) ||
-					   xzero && yinf || yzero && xinf ||
-					   nan;
+					   xzero && yinf || yzero && xinf;// KEP remove case 3) above

 	// Set the overflow flag for the following cases:
 	//   1) Rounded multiply result would be out of bounds
--- a/wally-pipelined/src/fpu/FMA/fmac.v
+++ b/wally-pipelined/src/fpu/FMA/fmac.v
@ -103,7 +103,7 @@ module fmac(xrf, y, zrf, rn, rz, rp, rm,
 						   earlyres[62:52], earlyressel, bypsel[1], byppostnorm,
 						   killprod, sumzero, postnorrnalize, normcnt, 
 						   infinity, invalid, overflow, underflow, 
-						   inf, nan, xnan, ynan, znan, zdenorm, specialsel,
+						   inf, nan, xnan, ynan, znan, zdenorm, proddenorm, specialsel,
 						   aligncnt, w[62:52], wbypass[62:52],
 						   prodof, sumof, sumuf, denorm0, ae);
 // Instantiate special case detection across datapath & exponent path 
@ -120,7 +120,7 @@ assign wbypass[63] = w[63];
 // Instantiate control logic
 
 sign				sign(x[63], y[63], z[63], negsum0, negsum1, bs, ps, 
-					     killprod, rm, sumzero, nan, invalid, xinf, yinf, inf, 
+					     killprod, rm, overflow, sumzero, nan, invalid, xinf, yinf, zinf, inf, 
 						 w[63], invz, negsum, selsum1, psign); 
 flag				flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 						 psign, z[63], xzero, yzero, v[1:0],
--- a/wally-pipelined/src/fpu/FMA/normalize.v
+++ b/wally-pipelined/src/fpu/FMA/normalize.v
@ -18,12 +18,12 @@ module normalize(sum[157:0], normcnt, sumzero, bs, ps, denorm0, zdenorm, v[53:0]
 /////////////////////////////////////////////////////////////////////////////
 	input     	[157:0]  	sum;            // sum
 	input		[8:0] 		normcnt;     	// normalization shift count
-	input					sumzero;		// sum is zero
-	input					bs;				// sticky bit for addend
-	input					ps;				// sticky bit for product
-	input					denorm0;		// exponent = -1023
-	input                  	zdenorm;        // Input Z is denormalized
-	output		[53:0]		v;				// normalized sum, R, S bits
+	input				sumzero;	// sum is zero
+	input				bs;		// sticky bit for addend
+	input				ps;		// sticky bit for product
+	input				denorm0;	// exponent = -1023
+	input                  		zdenorm;        // Input Z is denormalized
+	output		[53:0]		v;		// normalized sum, R, S bits

 	// Internal nodes

--- a/wally-pipelined/src/fpu/FMA/round.v
+++ b/wally-pipelined/src/fpu/FMA/round.v
@ -19,37 +19,37 @@ module round(v[53:0], earlyres[51:0], earlyressel, rz, rn, rp, rm, wsign,
 			  w[51:0], postnormalize, infinity, specialsel);
 /////////////////////////////////////////////////////////////////////////////

-	input		[53:0]		v;				// normalized sum, R, S bits
-	input		[51:0]		earlyres;		// result from other FPU blocks
-	input 					earlyressel; 	// use result from other FPU blocks
-	input					rz;				// Round toward zero
-	input					rn;				// Round toward	nearest
-	input					rp;				// Round toward	plus infinity
-	input					rm;				// Round toward	minus infinity
-	input					wsign;			// Sign of result
-	input 					invalid;		// Trap on infinity, NaN, denorm
-	input					overflow;		// Result overflowed
-	input					underflow;		// Result underflowed
-	input					inf;			// Some input is infinity
-	input					nan;			// Some input is NaN
-	input					xnan;			// X is NaN
-	input					ynan;			// Y is NaN
-	input					znan;			// Z is NaN
-	input		[51:0]		x;				// Input X
-	input		[51:0]		y;				// Input Y
-	input		[51:0]		z;				// Input Z
-	output		[51:0]		w; 				// rounded result of FMAC
-	output					postnormalize; 	// Right shift 1 for post-rounding norm
-	output					infinity;    	// Generate infinity on overflow
-	output					specialsel;  	// Select special result
+	input		[53:0]		v;		// normalized sum, R, S bits
+	input		[51:0]		earlyres;	// result from other FPU blocks
+	input 				earlyressel; 	// use result from other FPU blocks
+	input				rz;		// Round toward zero
+	input				rn;		// Round toward	nearest
+	input				rp;		// Round toward	plus infinity
+	input				rm;		// Round toward	minus infinity
+	input				wsign;		// Sign of result
+	input 				invalid;	// Trap on infinity, NaN, denorm
+	input				overflow;	// Result overflowed
+	input				underflow;	// Result underflowed
+	input				inf;		// Some input is infinity
+	input				nan;		// Some input is NaN
+	input				xnan;		// X is NaN
+	input				ynan;		// Y is NaN
+	input				znan;		// Z is NaN
+	input		[51:0]		x;		// Input X
+	input		[51:0]		y;		// Input Y
+	input		[51:0]		z;		// Input Z
+	output		[51:0]		w; 		// rounded result of FMAC
+	output				postnormalize; 	// Right shift 1 for post-rounding norm
+	output				infinity;    	// Generate infinity on overflow
+	output				specialsel;  	// Select special result

 	// Internal nodes

-	wire					plus1;			// Round by adding one 
-	wire		[52:0]		v1;				// Result + 1 (for rounding)
-	wire		[51:0]		specialres;		// Result of exceptional case 
+	wire				plus1;		// Round by adding one 
+	wire		[52:0]		v1;		// Result + 1 (for rounding)
+	wire		[51:0]		specialres;	// Result of exceptional case 
 	wire		[51:0]		infinityres;	// Infinity or largest real number
-	wire		[51:0]		nanres;			// Propagated or generated NaN 
+	wire		[51:0]		nanres;		// Propagated or generated NaN 

 	// Compute if round should occur.  This equation is derived from
 	// the rounding tables.
@ -77,7 +77,7 @@ module round(v[53:0], earlyres[51:0], earlyressel, rz, rn, rp, rm, wsign,
 	assign specialsel = earlyressel || overflow || underflow || invalid ||
 							nan || inf;
 	assign specialres = earlyressel ? earlyres : 
-						 invalid ? nanres : 
+						 invalid | nan ? nanres : //KEP added nan
 						 overflow ? infinityres : 
 						 inf ? 52'b0 :
 						underflow ? 52'b0 : 52'bx;  // default to undefined 
@ -93,6 +93,11 @@ module round(v[53:0], earlyres[51:0], earlyressel, rz, rn, rp, rm, wsign,
 	// NaN inputs are already quiet, we don't have to force them quiet.

 	// assign nanres = xnan ? x: (ynan ? y : (znan ? z : {1'b1, 51'b0})); // original
+
+	// IEEE 754-2008 section 6.2.3 states:
+	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
+	// identical to the payload of one of the input NaNs if representable in the destination
+	// format. This standard does not specify which of the input NaNs will provide the payload."
 	assign nanres = xnan ? {1'b1, x[50:0]}: (ynan ? {1'b1, y[50:0]} : (znan ? {1'b1, z[50:0]} : {1'b1, 51'b0}));// KEP 210112 add the 1 to make NaNs quiet

 	// Select result with 4:1 mux
--- a/wally-pipelined/src/fpu/FMA/sign.v
+++ b/wally-pipelined/src/fpu/FMA/sign.v
@ -10,8 +10,8 @@
 /////////////////////////////////////////////////////////////////////////////

 /////////////////////////////////////////////////////////////////////////////
-module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
-			 sumzero, nan, invalid, xinf, yinf, inf, wsign, invz, negsum, selsum1, psign);
+module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm, overflow,
+			 sumzero, nan, invalid, xinf, yinf, zinf, inf, wsign, invz, negsum, selsum1, psign);
 ////////////////////////////////////////////////////////////////////////////I
 
 	input					xsign;			// Sign of X 
@ -23,11 +23,13 @@ module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
 	input					ps;				// sticky bit from product
 	input					killprod;		// Product forced to zero
 	input					rm;				// Round toward minus infinity
+	input					overflow;				// Result overflowed
 	input					sumzero;		// Sum = O
 	input					nan;			// Some input is NaN
 	input					invalid;		// Result invalid
 	input					xinf;			// X = Inf
 	input					yinf;			// Y = Inf
+	input					zinf;			// Z = Inf
 	input					inf;			// Some input = Inf
 	output					wsign;			// Sign of W 
 	output					invz;			// Invert addend into adder
@ -47,13 +49,13 @@ module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
 	assign psign = xsign ^ ysign;

 	// Invert addend if sign of Z is different from sign of product assign invz = zsign ^ psign;
-	assign invz = zsign ^ psign;
+	assign invz = (zsign ^ psign);
 	// Select +l mode for adder and compute if result must be negated
 	// This is done according to cases based on the sticky bit.

 	always @(invz or negsum0 or negsum1 or bs or ps)
 		begin
-			if (~invz) begin               // both inputs have same sign
+			if (~invz) begin               // both inputs have same sign //KEP if overflow 
 				negsum = 0;
 				selsum1 = 0;
 			end else if (bs) begin        // sticky bit set on addend
@ -85,9 +87,8 @@ module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
 	//			 sum/difference shall be -0.  However, x+x = x-(-X) retains the same sign as x even when x is zero."
 
 	assign zerosign = (~invz && killprod) ? zsign : rm;
-	assign infsign = psign; //KEP 210112 keep the correct sign when result is infinity
-	// assign infsign = xinf ? (yinf ? psign : xsign) : yinf ? ysign : zsign;//original
-	assign wsign =invalid? 0 : (inf ? infsign:
-								(sumzero ? zerosign : psign ^ negsum));
+	assign infsign = zinf ? zsign : psign; //KEP 210112 keep the correct sign when result is infinity
+	//assign infsign = xinf ? (yinf ? psign : xsign) : yinf ? ysign : zsign;//original
+	assign wsign = invalid ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum));

 endmodule
--- a/wally-pipelined/src/fpu/FMA/special.v
+++ b/wally-pipelined/src/fpu/FMA/special.v
@ -14,23 +14,23 @@ module special(x[63:0], y[63:0], z[63:0], ae, xzero, yzero, zzero,
 				xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, xinf, yinf, zinf);
 /////////////////////////////////////////////////////////////////////////////

-	input   		[63:0]     	x;             // Input x
+	input   	[63:0]     	x;              // Input x
 	input     	[63:0]     	y;           	// Input Y
 	input      	[63:0]    	z;            	// Input z 
-	input		[12:0]			ae;			// exponent of product
-	output						xzero;			// Input x = 0
-	output						yzero;			// Input y = 0
-	output						zzero;			// Input z = 0
-	output						xnan;			// x is NaN
-	output						ynan;			// y is NaN
-	output						znan;			// z is NaN
-	output						xdenorm;		// x is denormalized
-	output						ydenorm;		// y is denormalized
-	output						zdenorm;		// z is denormalized
-	output						proddenorm;		// product is denormalized
-	output						xinf;			// x is infinity
-	output						yinf;			// y is infinity
-	output						zinf;			// z is infinity
+	input		[12:0]		ae;		// exponent of product
+	output				xzero;		// Input x = 0
+	output				yzero;		// Input y = 0
+	output				zzero;		// Input z = 0
+	output				xnan;		// x is NaN
+	output				ynan;		// y is NaN
+	output				znan;		// z is NaN
+	output				xdenorm;	// x is denormalized
+	output				ydenorm;	// y is denormalized
+	output				zdenorm;	// z is denormalized
+	output				proddenorm;	// product is denormalized
+	output				xinf;		// x is infinity
+	output				yinf;		// y is infinity
+	output				zinf;		// z is infinity

 	// In the actual circuit design, the gates looking at bits
 	// 51:0 and at bits 62:52 should be shared among the various detectors.
--- a/wally-pipelined/src/fpu/FMA/tbgen/results.dat
+++ b/wally-pipelined/src/fpu/FMA/tbgen/results.dat
@ -1 +1,130 @@
-0020000803ffffff bfcb4181a9468e24 000fffffffffffff 7fe2f9c2bca0f33c 00092f9c2bca0f33  Wrong zdenorm 18
+0000000000000001 7fdffffeffffffbf 4000000000080004 4007ffffc007fff5 4000000000080005  Wrong xdenorm 85959
+0000000000000001 c3ded4d0b02cd6aa 000c158ac12ac439 83eed4d0b02cd6ae 80bed1cb4d7c8bf9  Wrong xdenorm zdenorm 91485
+c15000000010001f 434ffffffffffffe 47f55792228596a0 c7e550dbbaf4d2c2 47f557922285969f  Wrong 97625
+0000000000000001 7fe0000000000001 4340000000000001 4340000000000002 4340000000000001  Wrong xdenorm 99467
+0000000000000001 bfdffffffffffffe 801fffffffffbfc0 8021ffffffffdfe0 801fffffffffbfc0  Wrong xdenorm 117273
+0000000000000001 ffe0000000000000 40d4000040000000 40d3ffc040000000 40d4000040000000  Wrong xdenorm 133851
+000fffffffffffff 3fcffc007fffffff 800fffffffffffff 800800ffe0000000 800c007fefffffff  Wrong xdenorm zdenorm 147973
+000fffffffffffff 3feffffffffffffe 000ffffffffffffe 001ffffffffffffd 001ffffffffffffc  Wrong xdenorm zdenorm 154727
+000ffffffffffffe 41dffffffffff900 0000000000000001 02000000000ffc7e 01fffffffffff8fc  Wrong xdenorm zdenorm 230863
+0010000000000000 bf4fdffffff7fffe 800ffffffffffffe 801003fbfffffeff 801003fbfffffefe  Wrong zdenorm 308227
+0010000000000000 be6fffffbffffff7 8000000000000000 8000000000000000 800000000fffffe0  Wrong w=-zero unflw 313753
+0010000000000001 bcafffffffffffff 801fffffffffffff 8000000000000000 8020000000000000  Wrong w=-zero unflw 392345
+0010000000000001 bfe0000000000001 800ffffffffffffe 8018000000000000 8017ffffffffffff  Wrong zdenorm 397871
+802000003ffffbff c3cfffffffffffd7 0000000000000001 040000003ffffbeb 040000003ffffbea  Wrong zdenorm 448219
+dc10000000001eff 0000000000000001 802d63f274ada691 9c20000000001f01 98f0000000001eff  Wrong ydenorm 489971
+001ffffffffffffe 3fddfbffffffffff 000ffffffffffffe 001efdfffffffffe 001efdfffffffffd  Wrong zdenorm 551371
+3ca0000000000000 0000000000000001 000e8d6ac606e59d 000e8d6ac606e59e 000e8d6ac606e59d  Wrong ydenorm zdenorm 559353
+3ca0000000000000 434ffffffffffffe c019cab46f8c90a7 c011cab46f8c90a7 c011cab46f8c90a8  Wrong 586983
+3ca0000000000001 bfe000000fffdfff 3fee60af9e2e4b00 bfa9f5061d1b5008 3fee60af9e2e4aff  Wrong 649611
+3ca0000000000001 7fe0000000000000 ffefffffffffffff 7ca8000000000000 ffeffffffffffffe  Wrong 657593
+44f0000000000dff 000000007fbffffe 801ffffffffffffe 05000000ff800dfb 03bfefffff801bf0  Wrong ydenorm 680311
+3ca0000000000001 bfffffffffffffff 3ff0007ffffffc00 bfefff0000000802 3ff0007ffffffbff  Wrong 680925
+3cafffffffffffff 3caffffffffffffe bcaffffffffffffe 397fffffffffffff bcaffffffffffffc  Wrong 707327
+3cafffffffffffff c01ffffffffffffe c02cbe486a2b0809 c02cbe486a2b0809 c02cbe486a2b080a  Wrong 758289
+000667c5d67e1d85 3fdfeffffdffffff 001fffffffffffff 002398247cab1886 002199247ccb1886  Wrong xdenorm 763201
+000007fffffffffe 3f6ffffffe01fffe 000ffffffffffffe 00100807ffff7fff 00100007ffffff7e  Wrong xdenorm zdenorm 768727
+3caffffffffffffe 4060000001000006 4070001fffff7ffe 4070001fffff7ffe 4070001fffff7fff  Wrong 771183
+3caffffffffffffe 3fdfffffffffffff 3fe0000000000000 3fe0000000000000 3fe0000000000001  Wrong 779165
+bfd7ffffffbfffff 4000000000000000 3fffffffffffffff 3ff40000001fffff 3ff4000000200000  Wrong 787147
+3caffffffffffffe c00ffffffffffffe c01000000020007f c01000000020007f c010000000200080  Wrong 824601
+3caffffffffffffe c00fffffffffff08 4010000000000001 4010000000000001 4010000000000000  Wrong 827671
+800000000000007e ffd26a0f710537a9 c01b3b74de550046 c018ee32f034592d c01b3b74de550022  Wrong xdenorm 861441
+47f9aa99d39dd7d8 0000000000000001 8000000000000000 0809aa99d39dd7db 04d9aa99d39dd7d8  Wrong ydenorm 908719
+bfef000004000000 c000000000000000 c34ff80000000006 42afffffffffebe0 c34ff80000000005  Wrong 1031519
+bfe0010000007fff 3ff00003ffffff7f 4340000000000000 4340000000000000 433fffffffffffff  Wrong 1039501
+3fdffffffffffffe 000ffffffffffffe 8000000000000001 fcdfffffffffffff 0007fffffffffffe  Wrong ydenorm zdenorm 1049939
+802000007fffffbf 400ffffffffffffe 8000000000000001 804100007fffffbe 804000007fffffbe  Wrong zdenorm 1068973
+3fdffffffffffffe bfffffffffffffff 3fefffffffffffff 3caffffffffffffe 3cafffffffffffff  Wrong 1099673
+c7fffffffb7fffff 37efffffffdffbff c34000003ffffffc c34000003ffffffc c34000003ffffffd  Wrong 1104585
+3fe0000000000001 3ca0000000000000 bcaffffffffffffe bca7fffffffffffd bca7fffffffffffe  Wrong 1193615
+bfe00000000effff 800fffffffffffff 8010000000000001 8000000000000000 8007fffffff88002  Wrong w=-zero ydenorm unflw 1223701
+3feffffffffffffe 3ff0000000000000 bfefffffffffffff bc9ffffffffffffc bca0000000000000  Wrong 1342817
+3feffffffffffffe 801fffffffffffff 800007fffffbfffe 802401fffffefffe 802003fffffdfffe  Wrong zdenorm 1366149
+bfd0002000007fff 0000000000000001 0010000000000001 0000000000000000 0010000000000001  Wrong ydenorm unflw 1466845
+0003476357ebf517 7fe000004000003f 8000000000000000 3ff68ec70a130546 3fda3b1b284c141d  Wrong xdenorm 1503685
+4000002003fffffe bc4fffffbffffffe bfbfffffffffe3ff bfbfffffffffe3ff bfbfffffffffe400  Wrong 1635081
+3ca00ffdffffffff 3fdffffffffffffe bfe0000000000001 bfe0000000000001 bfe0000000000000  Wrong 1687885
+801fffffbf000000 4012b6da70c3decc 0000000000000006 8041b6da4ac07317 8042b6da4ac07316  Wrong zdenorm 1753583
+b7ff7ffffffff000 7fe0000000000000 78b01fffefffffff 78b01f03efffffff 78b01f03f0000000  Wrong 1843841
+400fffffffffffff 8000000000000001 801fffffffffffff 8030000000000000 8020000000000001  Wrong ydenorm 1851209
+00003fefffffffff ffeffffffffffffe 8000000000000000 c0007fdffffffffd bfaff7ffffffff7e  Wrong xdenorm 1881295
+800000000003fffe 578284b14dfcc6e4 8000000000000000 979284b14e060938 958284a80ba41fe6  Wrong xdenorm 1989973
+bfdeffffff000000 002ffffffffc3ffe 0000002003fffffe 8016ffeffcfc5dff 801effdffafc5e00  Wrong zdenorm 2018831
+401fffffffffffff 3fdffffffffffffe c340000000400002 433fffffff800000 c340000000400000  Wrong 2106633
+401fffffffffffff 4010000000000001 c050ffffffff0000 c041fffffffdffff c041fffffffe0000  Wrong 2117685
+4340000000000000 3fd0000000000000 3fd0000000000001 4320000000000000 4320000000000001  Wrong 2243555
+bcb58ba32df145e0 3fbe0000003fffff 3fe0000000000000 3fe0000000000000 3fdfffffffffffff  Wrong 2365741
+bfed82e3c6c037db 3ff0000000000000 4340000000000000 4340000000000000 433fffffffffffff  Wrong 2389687
+3fdfffffffff7000 bcaffffffffffffe 3fe0000000000001 3fe0000000000001 3fe0000000000000  Wrong 2417317
+8000000002000000 ff100001efffffff 8d261bb2da873976 3f200001f400007b 3d800001efffffff  Wrong xdenorm 2422229
+bb0fb893e0decb72 c1cffff7ffbfffff c03ffc0000003fff c03ffc0000003fff c03ffc0000003ffe  Wrong 2546871
+800000000e000000 3fffffffffffffff 034ffff80ffffffe 034ffff80ffffffc 034ffff80ffffffe  Wrong xdenorm 2600903
+7fe0000000000001 4000000000000000 ffefffffffffffff 7ff0000000000000 7cb8000000000000  Wrong w=+inf 2602745
+7fe0000000000001 8000000000000001 c010000000000001 c014000000000002 c010000000000002  Wrong ydenorm 2619323
+7fe0000000000001 bcafffffffffffff 7fefffffdffffffc fe70000002800000 7fefffffdffffffb  Wrong 2626077
+7fefffffffffffff 0000000000000001 37ffffff7ffffeff 4000000000000001 3ccfffffffffffff  Wrong ydenorm 2653707
+3feffffffffbfffd 3ca0000000000001 bfe0000000000001 3fe0000000000000 bfe0000000000000  Wrong 2660461
+000ffffffff00006 bfe0000000000001 0000000000000001 7dfffff400000002 8007fffffff80002  Wrong xdenorm zdenorm 2770981
+fd61dd32fb8e3b2c 0000000000000001 801ffffffffffffe bd71dd32fb8e3b2e ba41dd32fb8e3b2c  Wrong ydenorm 3003073
+000fff000000000f 3ff00800001fffff 8010000000000000 0000000000000000 000006ff801ffe0e  Wrong xdenorm unflw 3117277
+8000000000000001 400effffff000000 0010000000000000 8000000000000000 000ffffffffffffc  Wrong w=-zero xdenorm unflw 3143065
+8000000000000001 40211275ffe5ee3c 0000000000000001 802e24ebffcbdc7c 8000000000000008  Wrong xdenorm zdenorm 3148591
+8000000000000001 c1c01ffffffffefe 03100007fe000000 0310000900000000 03100007fe000000  Wrong xdenorm 3152889
+8000000000000001 3fe0000000000001 800fffffffffffff 8014000000000000 8010000000000000  Wrong xdenorm zdenorm 3155345
+8000000000000001 7fef848cc01517b4 c340000000000001 c340000000000002 c340000000000001  Wrong xdenorm 3170695
+8000000000000001 7feffffffffffffe 410ffffffc007ffe 410fffeffc007ffe 410ffffffc007ffe  Wrong xdenorm 3173151
+8000000000000001 bffffffffffffffe 002e000000100000 0033000000080000 002e000000100001  Wrong xdenorm 3195255
+8000000000000001 ffe0000000000000 3feffffffffffffe 4000000000000000 3ff0000000000001  Wrong xdenorm 3205079
+8000000000000001 ffe0000000000001 c1ffbfffdffffffe c1ffbfffdfeffffe c1ffbfffdffffffe  Wrong xdenorm 3206307
+800fffffffffffff 3ff0000000000000 001ffffffffffffe 0000000000000000 000fffffffffffff  Wrong xdenorm unflw 3227183
+3e7ffffffefc0000 bfffffffffffffff 41c0ea1ad0c683e5 c1be2bca5e72f83a 41c0ea1ad0c683e3  Wrong 3264023
+3fffa9456a66b8c6 3caffffffffffffe c00ffffffffffffe c00ffffffffffffe c00ffffffffffffd  Wrong 3290425
+800ffffffffffffe 3fe0000000000001 0010000400000010 0000000000000000 0008000400000011  Wrong xdenorm unflw 3294723
+800ffffffffffffe 3fd0000000007ffe 000fffffffffffff 0007ffffffffc001 000bffffffffe000  Wrong xdenorm zdenorm 3308845
+800ffffffffffffe c010000000000000 800dfede47fbc1e2 002880486e010f84 00290090dc021f0b  Wrong xdenorm zdenorm 3338931
+bfdffc90d6e1fc1f 3ca1ffffffffeffe bfe66ad464a87aac bfe66ad464a87aac bfe66ad464a87aad  Wrong 3367175
+8010000000000000 bfe0000000000000 000fffffffffffff 0018000000000000 0017ffffffffffff  Wrong zdenorm 3398489
+7fe800000000003e 8004de935d68d1e8 801fffffffffffff c0034ddd0c1d3b0e bfed37743074ebbb  Wrong ydenorm 3437785
+8010000000000001 bfefffffffffffff 801ffffffffffffe 8000000000000000 800ffffffffffffe  Wrong w=-zero unflw 3470327
+801fffffffffffff bfdffffffffffffe 0000000000021fff 0018000000010ffe 0010000000021ffe  Wrong zdenorm 3537867
+0005e0458a43fbdb 7fdfffbfffffffff 0000000000000000 3ffbc0539371cea5 3fe780e726e39d4b  Wrong xdenorm 3691981
+bca0000000000001 3cafffffffffffff 3cafffffffffffff b970000000000000 3caffffffffffffe  Wrong 3707945
+bca0000000000001 3fefffffffffffff bff0400000000400 bff0400000000400 bff0400000000401  Wrong 3714699
+bca0000000000001 c34ffffffffffffe c000000000000000 0000000000000000 b980000000000000  Wrong 3763205
+bcafffffffffffff 3fc200001fffffff 3fdffff00000ffff 3fdffff00000ffff 3fdffff00000fffe  Wrong 3788379
+bcafffffffffffff 800ffffffffffffe 8000000000000000 0000000000000000 0000000000000001  Wrong ydenorm unflw 3807413
+bcaffffffffffffe 3fdffffffffffffe 3ff0000000000000 3ff0000000000000 3fefffffffffffff  Wrong 3851621
+bcaffffffffffffe 001ffffffffc0000 8000000000000001 8000000000000005 8000000000000003  Wrong zdenorm 3878023
+7fec5fed92358a74 400000001bffffff ffefc0003ffffffe 7ff0000000000000 7fe8ffdb47bad466  Wrong w=+inf 3889689
+bfdfffffffffffff 000fffffffffffff 0000000000000000 8000000000000000 8007ffffffffffff  Wrong w=-zero ydenorm unflw 4050557
+bfdfffffffffffff 8000000000000001 8010000000800400 8000000000000000 8010000000800400  Wrong w=-zero ydenorm unflw 4084941
+bfdfffffffffffff bff0000000000000 bfe0000000000001 bca7ffffffffffff bca8000000000000  Wrong 4100291
+bff400003ffffffe bfeffffffffffffe 434fffffffffffff 434fffffffffffff 4350000000000000  Wrong 4169059
+43f00002003ffffe 8000000000000001 0010000000000000 8400000200400000 80cffe04007ffffc  Wrong ydenorm 4224319
+bfe0000000000000 801fffffffffffff 00000007fff80000 00180003fffc0000 00100007fff80000  Wrong zdenorm 4228617
+bfe0000000000000 c000000000000001 c00ffffffffffffe c007fffffffffffd c007fffffffffffe  Wrong 4243967
+bfcfdffffeffffff 8000000000000001 000fffffffffe080 0011fdffffeff040 000fffffffffe080  Wrong ydenorm zdenorm 4573685
+bfffffffffffffff 0000000000000001 8010000000000001 8020000000000001 8010000000000003  Wrong ydenorm 4608683
+bfffffffffffffff 3cafffffffffffff 3ff00000040001ff bfeffffff7fffc06 3ff00000040001fd  Wrong 4615437
+d2b6d8b0e4fde949 0000000000000001 0011fffffffffeff 92c6d8b0e4fde94c 8f96d8b0e4fde949  Wrong ydenorm 4678679
+3fd07dfffffffffe 8010000000000001 0000000000000001 7fef040000000006 80041f7fffffffff  Wrong zdenorm 4716133
+bffffffffffffffe bfffffffffffffff c00ffffffffffffe bcbffffffffffffc bcbffffffffffffe  Wrong 4730255
+c000000000000001 00000000004fffff 801ffffffffffffe 80280000004fffff 80200000004ffffe  Wrong ydenorm 4839547
+c00982d68cfe066b 000ffffffffffffe 8000000000000001 802d82d68cfe0668 802982d68cfe0668  Wrong ydenorm zdenorm 4959277
+346ffffffffffeef 480ffffeffffffe0 3fdfffffffffffff 3fdfffffffffffff 3fe0000000000000  Wrong 4962961
+c01fffffffffffff 0007ffffffff0000 8000000000000000 803ffffffffdffff 802ffffffffbffff  Wrong ydenorm 5176633
+c01fffffffffffff bfc08000000fffff 434ffffffffffffe 434ffffffffffffe 434fffffffffffff  Wrong 5193211
+c3a000000fffff7f 80000000000005fe 001000003ffffbff 03b0000010000b7b 0127f80817f81f3f  Wrong ydenorm 5450477
+0012000000000001 4000000000000001 000909a97b1f06a1 0028426a5ec7c1aa 002684d4bd8f8353  Wrong zdenorm 5535209
+ffe0000000000000 8000000000000001 c03000000000003f c02e00000000007e c03000000000003f  Wrong ydenorm 5621169
+ffe0000000000001 3ca0c6fe6997e5e2 7fefffffffffffff fca8637f34cbf2f2 7feffffffffffffe  Wrong 5673973
+ffe0000000000001 800fffffffffffff c340000000000001 4340000000000000 c340000000000000  Wrong ydenorm 5691779
+43f4595959dece4b 8000000000000001 801fffffffffffff 8404595959dece4e 80d45b5959dece4b  Wrong ydenorm 5760547
+ffefffffffffffff 3ca14e19e3a06f13 7fe00000000ff7fe ffdfffffffe01006 7fe00000000ff7fd  Wrong 5783265
+ffeffffffffffffe 8000000000000001 000fffffffffffff 4000000000000001 3ccffffffffffffe  Wrong ydenorm zdenorm 5829929
+ffeffffffffffffe 800001fffbffffff bac0000ffeffffff 400003fff7fffffd 3f5fffbfffffeffe  Wrong ydenorm 5832999
+bca0000004000080 bff0000000000009 bff0000000000001 3fefffffffffffff bff0000000000000  Wrong 5841595
+3fffffffff9ffffe 800000000f7ffffe 800ffdffbffffffe 801ffefffecffffa 800ffdffdefffffa  Wrong ydenorm zdenorm 5887031
+41ccc32b421f1ac0 8000000000000001 802ffffc0000001e 81dcc32b461f1a44 802ffffc1cc32b60  Wrong ydenorm 5899925
+41ffffffffffff87 8000000000000001 0000000000000000 820fffffffffff8b 8000000200000000  Wrong ydenorm 6039335
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.c
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.c
@ -14,24 +14,31 @@ void main() {
 	fp = fopen("testFloat","r");
 	fq = fopen("tb.v","a");
 	system("cp tbhead.v tb.v");
-	int k=0;
-	for(k=0; k<91 && !feof(fp); k++) {
+	long k=0L;
+	for(; !feof(fp); k++) {
 		//3FDBFFFFFFFFFF7F DE608000000001FF 43CFED83C17EDBD0 DE4CE000000002F9 01
 		// b68ffff8000000ff_3f9080000007ffff_b6307ffbe0080080_00001
-        char ch;
-		int i,j;
-		char *ln;
+                char ch;
+		int i,j,n;
 		char xrf[17];
 		char y[17];
 		char zrf[17];
 		char ans[81];
 		char flags[3];
 		int rn,rz,rm,rp;
-		{
-  //my_string = (char *) malloc (nbytes + 1);
-  //bytes_read = getline (&my_string, &nbytes, stdin);
-			if(getline(&ln,&nbytes,fp) < 0) break;
-			//fprintf(stderr,"%s\n", ln);
+		long stop = 6039335;
+		int debug = 0;
+		//my_string = (char *) malloc (nbytes + 1);
+		//bytes_read = getline (&my_string, &nbytes, stdin);
+	
+		for(n=0; n < 613; n++) {//613 for 10000
+			if(getline(&ln,&nbytes,fp) < 0 || feof(fp)) break;
+			if(k == stop && debug == 1) break;
+			k++;
+		}
+		//fprintf(stderr,"%s\n", ln);
+
+		if(!feof(fp)) {

 			strncpy(xrf,   ln,     16); xrf[16]=0;
 			strncpy(y,    &ln[17], 16); y[16]=0;
@ -46,71 +53,80 @@ void main() {
 			fprintf(fq,"    zrf = 64'h%s;\n",zrf);
 			fprintf(fq,"    ans = 64'h%s;\n", ans);
 			// fprintf(fq,"    flags = 5'h%s;\n", flags);
-		}
+	

-		{
-			//rn=1; rz=0; rm=0; rp=0;
-			fprintf(fq,"    rn = %d;\n",1);
-			fprintf(fq,"    rz = %d;\n", 0);
-			fprintf(fq,"    rm = %d;\n", 0);
-			fprintf(fq,"    rp = %d;\n", 0);
-		}
-		{
-			fprintf(fq,"    earlyres = 64'b0;\n");
-			fprintf(fq,"    earlyressel = 0;\n");
-		}		
-		{
+			{
+				//rn=1; rz=0; rm=0; rp=0;
+				fprintf(fq,"    rn = %d;\n",1);
+				fprintf(fq,"    rz = %d;\n", 0);
+				fprintf(fq,"    rm = %d;\n", 0);
+				fprintf(fq,"    rp = %d;\n", 0);
+			}
+			{
+				fprintf(fq,"    earlyres = 64'b0;\n");
+				fprintf(fq,"    earlyressel = 0;\n");
+			}		
+			{

-			fprintf(fq,"    bypsel= 2'b0;\n"); //, bysel);
-			fprintf(fq,"    bypplus1 = 0;\n"); //, byp1);
-			fprintf(fq,"    byppostnorm = 0;\n"); //, bypnorm);
-		}
-		fprintf(fq,"#10\n");
-	// IEEE 754-2008 section 6.3 states "When ether an input or result is NaN, this standard does not interpret the sign of a NaN."
-		//fprintf(fq,"	$fwrite(fp, \"%%h %%h %%h %%h \",xrf,y,w, ans);\n");	
-		fprintf(fq,"    // IEEE 754-2008 section 6.3 states: \"When ether an input or result is NaN, this\n");
-		fprintf(fq,"    //                                     standard does not interpret the sign of a NaN.\"\n");
- 		fprintf(fq,"	nan = (w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000)  ||\n");
- 		fprintf(fq,"	      (w >  64'hFFF8000000000000 && w <  64'hFFF8000000000000 ) ||\n");
- 		fprintf(fq,"	      (w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) ||\n");
- 		fprintf(fq,"	      (w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff );\n");
-		// fprintf(fq,"    if(!(~(|xrf[62:52]) && |xrf[51:0] || ~(|y[62:52]) && |y[51:0])) begin\n"); 
-																						// not looknig at negative zero results right now
-		//fprintf(fq,"	  if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) && !(w == 64'h8000000000000000 && ans == 64'b0)) begin\n"); 
-		fprintf(fq,"	if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) ) begin\n"); 
-		fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",xrf,y, zrf, w, ans);\n");
- 		fprintf(fq,"		if(w == 64'h8000000000000000) $fwrite(fp, \"w=-zero \");\n");
- 		fprintf(fq,"		if(~(|xrf[62:52]) && |xrf[51:0]) $fwrite(fp, \"xdenorm \");\n");
- 		fprintf(fq,"		if(~(|y[62:52]) && |y[51:0]) $fwrite(fp, \"ydenorm \");\n");
- 		fprintf(fq,"		if(~(|zrf[62:52]) && |zrf[51:0]) $fwrite(fp, \"zdenorm \");\n");
-  		fprintf(fq,"		if(invalid != 0) $fwrite(fp, \"invld \");\n");
- 		fprintf(fq,"		if(overflow != 0) $fwrite(fp, \"ovrflw \");\n");
- 		fprintf(fq,"		if(underflow != 0) $fwrite(fp, \"unflw \");\n");
- 		fprintf(fq,"		if(w == 64'hFFF0000000000000) $fwrite(fp, \"w=-inf \");\n");
- 		fprintf(fq,"		if(w == 64'h7FF0000000000000) $fwrite(fp, \"w=+inf \");\n");
- 		fprintf(fq,"		if(w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
- 		fprintf(fq,"		if(w >  64'hFFF8000000000000 && w <  64'hFFF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
- 		fprintf(fq,"		if(w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
- 		fprintf(fq,"		if(w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+				fprintf(fq,"    bypsel= 2'b0;\n"); //, bysel);
+				fprintf(fq,"    bypplus1 = 0;\n"); //, byp1);
+				fprintf(fq,"    byppostnorm = 0;\n"); //, bypnorm);
+			}
+			fprintf(fq,"#10\n");
+			// IEEE 754-2008 section 6.3 states "When ether an input or result is NaN, this standard does not interpret the sign of a NaN."
+			//fprintf(fq,"	$fwrite(fp, \"%%h %%h %%h %%h \",xrf,y,w, ans);\n");	
+			fprintf(fq,"    // IEEE 754-2008 section 6.3 states: \"When ether an input or result is NaN, this\n");
+			fprintf(fq,"    //                                     standard does not interpret the sign of a NaN.\"\n");
+			fprintf(fq,"	wnan = &w[62:52] && |w[51:0]; \n");
+			fprintf(fq,"	xnan = &xrf[62:52] && |xrf[51:0]; \n");
+			fprintf(fq,"	ynan = &y[62:52] && |y[51:0]; \n");
+			fprintf(fq,"	znan = &zrf[62:52] && |zrf[51:0]; \n");
+			fprintf(fq,"	ansnan = &ans[62:52] && |ans[51:0]; \n");
+			fprintf(fq,"	xnorm = ~(|xrf[62:52]) && |xrf[51:0] ? {xrf[50:0], 1'b0} : xrf; \n");
+			fprintf(fq,"	ynorm = ~(|y[62:52]) && |y[51:0] ? {y[50:0], 1'b0} : y;\n");
+			fprintf(fq,"	s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm}; \n");
+			// fprintf(fq,"    if(!(~(|xrf[62:52]) && |xrf[51:0] || ~(|y[62:52]) && |y[51:0])) begin\n"); 
+																							// not looknig at negative zero results right now
+			//fprintf(fq,"	  if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) && !(w == 64'h8000000000000000 && ans == 64'b0)) begin\n"); 
+			// fprintf(fq,"	if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) ) begin\n"); 
+			fprintf(fq,"	if((!wnan && (w != ans)) || (wnan && ansnan && ~(((xnan && (w[62:0] == {xrf[62:52],1'b1,xrf[50:0]})) || (ynan && (w[62:0] == {y[62:52],1'b1,y[50:0]}))  || (znan && (w[62:0] == {zrf[62:52],1'b1,zrf[50:0]})) || (w[62:0] == ans[62:0])) ))) begin\n"); 
+			fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",xrf,y, zrf, w, ans);\n");
+			//fprintf(fq,"		$fwrite(fp, \"%%h \",s);\n");
+			fprintf(fq,"		if(w == 64'h8000000000000000) $fwrite(fp, \"w=-zero \");\n");
+			fprintf(fq,"		if(~(|xrf[62:52]) && |xrf[51:0]) $fwrite(fp, \"xdenorm \");\n");
+			fprintf(fq,"		if(~(|y[62:52]) && |y[51:0]) $fwrite(fp, \"ydenorm \");\n");
+			fprintf(fq,"		if(~(|zrf[62:52]) && |zrf[51:0]) $fwrite(fp, \"zdenorm \");\n");
+			fprintf(fq,"		if(invalid != 0) $fwrite(fp, \"invld \");\n");
+			fprintf(fq,"		if(overflow != 0) $fwrite(fp, \"ovrflw \");\n");
+			fprintf(fq,"		if(underflow != 0) $fwrite(fp, \"unflw \");\n");
+			fprintf(fq,"		if(w == 64'hFFF0000000000000) $fwrite(fp, \"w=-inf \");\n");
+			fprintf(fq,"		if(w == 64'h7FF0000000000000) $fwrite(fp, \"w=+inf \");\n");
+			fprintf(fq,"		if(w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
+			fprintf(fq,"		if(w >  64'hFFF8000000000000 && w <  64'hFFF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
+			fprintf(fq,"		if(w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+			fprintf(fq,"		if(w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");

- 		fprintf(fq,"		if(ans == 64'hFFF0000000000000) $fwrite(fp, \"ans=-inf \");\n");
- 		fprintf(fq,"		if(ans == 64'h7FF0000000000000) $fwrite(fp, \"ans=+inf \");\n");
-		fprintf(fq,"		if(ans >  64'h7FF0000000000000 && ans <  64'h7FF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
- 		fprintf(fq,"		if(ans >  64'hFFF8000000000000 && ans <  64'hFFF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
- 		fprintf(fq,"		if(ans >= 64'h7FF8000000000000 && ans <= 64'h7FFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
- 		fprintf(fq,"		if(ans >= 64'hFFF8000000000000 && ans <= 64'hFFFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
-		fprintf(fq,"    	$fwrite(fp,\"%d\\n\");\n",cnt);
-		if(cnt == 358)fprintf(fq,"    	$stop;\n");
-		// fprintf(fq,"    end\n");
-		fprintf(fq,"    end\n");
-		cnt++;
+			fprintf(fq,"		if(ans == 64'hFFF0000000000000) $fwrite(fp, \"ans=-inf \");\n");
+			fprintf(fq,"		if(ans == 64'h7FF0000000000000) $fwrite(fp, \"ans=+inf \");\n");
+			fprintf(fq,"		if(ans >  64'h7FF0000000000000 && ans <  64'h7FF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
+			fprintf(fq,"		if(ans >  64'hFFF8000000000000 && ans <  64'hFFF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
+			fprintf(fq,"		if(ans >= 64'h7FF8000000000000 && ans <= 64'h7FFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
+			fprintf(fq,"		if(ans >= 64'hFFF8000000000000 && ans <= 64'hFFFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
+			fprintf(fq,"    	$fwrite(fp,\"%ld\\n\");\n",k);
+			//fprintf(fq,"    	$stop;\n");
+			// fprintf(fq,"    end\n");
+			fprintf(fq,"    end\n");
+			cnt++;

-		//if(cnt > 100) break;
-		fflush(fq);
-	}
+			//if(cnt > 100) break;
+			fflush(fq);
+		} // if(!feof(fp))
+		if(k == stop && debug == 1) break;
+	} // for(k)

 	fprintf(fq, "\t$stop;\n\tend\nendmodule");
 	fclose(fq);
 	fclose(fp);
+	fprintf(stdout,"cnt = %d\n",cnt);
 }

--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
@ -23,7 +23,14 @@ module tb;
 wire 					inexact;

 integer fp;
-reg nan;
+reg wnan;
+reg xnan;
+reg ynan;
+reg znan;
+reg ansnan;
+reg		[105:0]		s;				//	partial product 2	
+reg		[51:0] 		xnorm;
+reg 		[51:0] 		ynorm;

 localparam period = 20;  
 fmac UUT(.xrf(xrf), .y(y), .zrf(zrf), .rn(rn), .rz(rz), .rp(rp), .rm(rm),
@ -33,4 +40,4 @@ fmac UUT(.xrf(xrf), .y(y), .zrf(zrf), .rn(rn), .rz(rz), .rp(rp), .rm(rm),

 initial 
    begin
-    fp = $fopen("/home/kparry/code/FMAC/tbgen/results.dat","w");
+    fp = $fopen("/home/kparry/riscv-wally/wally-pipelined/src/fpu/FMA/tbgen/results.dat","w");
--- a/wally-pipelined/src/hazard/hazard.sv
+++ b/wally-pipelined/src/hazard/hazard.sv
@ -27,12 +27,12 @@

 module hazard(
  // Detect hazards
-  input  logic       PCSrcE, CSRWritePendingDEM, RetM, TrapM,
+  input  logic       BPPredWrongE, CSRWritePendingDEM, RetM, TrapM,
  input  logic       LoadStallD, MulDivStallD, CSRRdStallD,
  input  logic       InstrStall, DataStall,
  // Stall & flush outputs
  output logic       StallF, StallD, StallE, StallM, StallW,
-  output logic       FlushD, FlushE, FlushM, FlushW
+  output logic       FlushF, FlushD, FlushE, FlushM, FlushW
 );

  logic BranchFlushDE;
@ -51,7 +51,7 @@ module hazard(
  // A stage must stall if the next stage is stalled
  // If any stages are stalled, the first stage that isn't stalled must flush.

-  assign BranchFlushDE = PCSrcE | RetM | TrapM;
+  assign BranchFlushDE = BPPredWrongE | RetM | TrapM;

  assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE);  
  assign StallDCause = (LoadStallD | MulDivStallD | CSRRdStallD) & ~(BranchFlushDE);    // stall in decode if instruction is a load/mul/csr dependent on previous
@ -62,6 +62,7 @@ module hazard(

  // Each stage stalls if the next stage is stalled or there is a cause to stall this stage.
  assign StallF = StallD | StallFCause;
+
  assign StallD = StallE | StallDCause;
  assign StallE = StallM | StallECause;
  assign StallM = StallW | StallMCause;
@ -73,6 +74,7 @@ module hazard(
  assign FirstUnstalledW = (~StallW & StallM);;
  
  // Each stage flushes if the previous stage is the last one stalled (for cause) or the system has reason to flush
+  assign FlushF = BPPredWrongE;
  assign FlushD = FirstUnstalledD || BranchFlushDE;  //  PCSrcE |InstrStall | CSRWritePendingDEM | RetM | TrapM;
  assign FlushE = FirstUnstalledE || BranchFlushDE; //LoadStallD | PCSrcE | RetM | TrapM;
  assign FlushM = FirstUnstalledM || RetM || TrapM;
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@ -43,6 +43,7 @@ module controller(
  output logic       MemReadE, CSRReadE, // for Hazard Unit
  output logic [2:0] Funct3E,
  output logic       MulDivE, W64E,
+  output logic       JumpE,		  
  // Memory stage control signals
  input  logic       StallM, FlushM,
  output logic [1:0] MemRWM,
@ -68,7 +69,7 @@ module controller(
  logic 	    RegWriteD, RegWriteE;
  logic [2:0] ResultSrcD, ResultSrcE, ResultSrcM;
  logic [1:0] MemRWD, MemRWE;
-  logic		    JumpD, JumpE;
+  logic		    JumpD;
  logic		    BranchD, BranchE;
  logic	[1:0] ALUOpD;
  logic [4:0] ALUControlD;
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@ -36,7 +36,9 @@ module datapath (
  input  logic [4:0]       ALUControlE,
  input  logic             ALUSrcAE, ALUSrcBE,
  input  logic             TargetSrcE, 
+  input  logic             JumpE,
  input  logic [`XLEN-1:0] PCE,
+  input  logic [`XLEN-1:0] PCLinkE,
  output logic [2:0]       FlagsE,
  output logic [`XLEN-1:0] PCTargetE,
  output logic [`XLEN-1:0] SrcAE, SrcBE,
@ -64,7 +66,9 @@ module datapath (
  // Execute stage signals
  logic [`XLEN-1:0] RD1E, RD2E;
  logic [`XLEN-1:0] ExtImmE;
-  logic [`XLEN-1:0] PreSrcAE;
+
+  logic [`XLEN-1:0] PreSrcAE, SrcAE2, SrcBE2;
+
  logic [`XLEN-1:0] ALUResultE;
  logic [`XLEN-1:0] WriteDataE;
  logic [`XLEN-1:0] TargetBaseE;
@ -93,8 +97,10 @@ module datapath (
  mux3  #(`XLEN)  faemux(RD1E, ResultW, ALUResultM, ForwardAE, PreSrcAE);
  mux3  #(`XLEN)  fbemux(RD2E, ResultW, ALUResultM, ForwardBE, WriteDataE);
  mux2  #(`XLEN)  srcamux(PreSrcAE, PCE, ALUSrcAE, SrcAE);
+  mux2  #(`XLEN)  srcamux2(SrcAE, PCLinkE, JumpE, SrcAE2);  
  mux2  #(`XLEN)  srcbmux(WriteDataE, ExtImmE, ALUSrcBE, SrcBE);
-  alu   #(`XLEN)  alu(SrcAE, SrcBE, ALUControlE, ALUResultE, FlagsE);
+  mux2  #(`XLEN)  srcbmux2(SrcBE, {`XLEN{1'b0}}, JumpE, SrcBE2); // *** May be able to remove this mux.
+  alu   #(`XLEN)  alu(SrcAE2, SrcBE2, ALUControlE, ALUResultE, FlagsE);
  mux2  #(`XLEN)  targetsrcmux(PCE, SrcAE, TargetSrcE, TargetBaseE);
  assign  PCTargetE = ExtImmE + TargetBaseE;

@ -109,6 +115,9 @@ module datapath (
  flopenrc #(`XLEN) ALUResultWReg(clk, reset, FlushW, ~StallW, ALUResultM, ALUResultW);
  flopenrc #(5)    RdWEg(clk, reset, FlushW, ~StallW, RdM, RdW);

+  // *** something is not right here.  Before the merge I found an issue with the jal instruction not writing
+  // the link address through the alu.
+  // not sure what changed.
  // handle Store Conditional result if atomic extension supported
  generate 
    if (`A_SUPPORTED)
@ -118,4 +127,11 @@ module datapath (
  endgenerate

  mux6  #(`XLEN) resultmux(ALUResultW, ReadDataW, PCLinkW, CSRReadValW, MulDivResultW, SCResultW, ResultSrcW, ResultW);	
+/* -----\/----- EXCLUDED -----\/-----
+  // This mux4:1 no longer needs to include PCLinkW.  This is set correctly in the execution stage.
+  // *** need to look at how the decoder is coded to fix.
+  mux4  #(`XLEN) resultmux(ALUResultW, ReadDataW, PCLinkW, CSRReadValW, ResultSrcW, ResultW);	
+>>>>>>> bp
+ -----/\----- EXCLUDED -----/\----- */
+ 
 endmodule
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@ -33,6 +33,7 @@ module ieu (
  output logic             IllegalBaseInstrFaultD,
  // Execute Stage interface
  input  logic [`XLEN-1:0] PCE, 
+  input  logic [`XLEN-1:0] PCLinkE,
  output logic [`XLEN-1:0] PCTargetE,
  output logic             MulDivE, W64E,
  output logic [2:0]       Funct3E,
@ -72,6 +73,7 @@ module ieu (
  logic [1:0]       ForwardAE, ForwardBE;
  logic             RegWriteM, RegWriteW;
  logic             MemReadE, CSRReadE;
+  logic             JumpE;
           
  controller c(.*);
  datapath   dp(.*);             
--- a/wally-pipelined/src/ifu/BTBPredictor.sv
+++ b/wally-pipelined/src/ifu/BTBPredictor.sv
@ -0,0 +1,97 @@
+///////////////////////////////////////////
+// BTBPredictor.sv
+//
+// Written: Ross Thomposn
+// Email: ross1728@gmail.com
+// Created: February 15, 2021
+// Modified: 
+//
+// Purpose: BTB model.  Outputs type of instruction (currently 1 hot encoded. Probably want 
+// to encode to reduce storage), valid, target PC.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// Branch target buffer (BTB).
+//
+// Lookup:  LookUpPC is hashed to a Depth-bit index.  The index is registered
+//          (both here and inside the SRAM model), so TargetPC, InstrClass and
+//          Valid describe the LookUpPC presented on the previous clock edge.
+// Update:  when UpdateEN is high, {UpdateInstrClass, UpdateTarget} is stored
+//          at the hashed UpdatePC index and that entry's valid bit is set.
+//
+// Parameters:
+//   Depth - number of index bits; the table holds 2**Depth entries.
+module BTBPredictor
+  #(parameter int Depth = 10
+    )
+  (input  logic clk,
+   input logic 		    reset,
+   input logic [`XLEN-1:0]  LookUpPC,
+   output logic [`XLEN-1:0] TargetPC,
+   output logic [3:0] 	    InstrClass,
+   output logic 	    Valid,
+   // update
+   input logic 		    UpdateEN,
+   input logic [`XLEN-1:0]  UpdatePC,
+   input logic [`XLEN-1:0]  UpdateTarget,
+   input logic [3:0] 	    UpdateInstrClass
+   );
+
+  localparam TotalDepth = 2 ** Depth;
+  logic [TotalDepth-1:0]    ValidBits;
+  logic [Depth-1:0] 	    LookUpPCIndex, UpdatePCIndex, LookUpPCIndexQ, UpdatePCIndexQ;
+  logic 		    UpdateENQ;
+
+  // hashing function for indexing the PC
+  // We have Depth bits to index, but XLEN bits as the input.
+  // bit 0 is always 0, bit 1 is 0 if using 4 byte instructions, but is not always 0 if
+  // using compressed instructions.  XOR bit 1 with the MSB of index.
+  assign UpdatePCIndex = {UpdatePC[Depth+1] ^ UpdatePC[1], UpdatePC[Depth:2]};
+  assign LookUpPCIndex = {LookUpPC[Depth+1] ^ LookUpPC[1], LookUpPC[Depth:2]};  
+  
+
+  flopenr #(Depth) UpdatePCIndexReg(.clk(clk),
+				    .reset(reset),
+				    .en(1'b1),
+				    .d(UpdatePCIndex),
+				    .q(UpdatePCIndexQ));
+
+  // The valid bit is written at the *registered* update index, so the enable
+  // must be delayed by the same cycle.  Gating with the raw UpdateEN would set
+  // the valid bit of whatever index was presented one cycle earlier.
+  flopenr #(1) UpdateENReg(.clk(clk),
+			   .reset(reset),
+			   .en(1'b1),
+			   .d(UpdateEN),
+			   .q(UpdateENQ));
+  
+  // The valid bit must be resetable.
+  always_ff @ (posedge clk) begin
+    if (reset) begin
+      ValidBits <= #1 {TotalDepth{1'b0}};
+    end else if (UpdateENQ) begin
+      ValidBits[UpdatePCIndexQ] <= #1 1'b1;
+    end
+  end
+
+  // Registered lookup index so Valid lines up with the SRAM read data, which
+  // is also addressed by a registered copy of LookUpPCIndex.
+  flopenr #(Depth) LookupPCIndexReg(.clk(clk),
+				    .reset(reset),
+				    .en(1'b1),
+				    .d(LookUpPCIndex),
+				    .q(LookUpPCIndexQ));
+
+  assign Valid = ValidBits[LookUpPCIndexQ];
+
+  // the BTB contains the target address.
+  // Another optimization may be using a PC relative address.
+  // *** need to add forwarding.
+
+  // Each entry is {InstrClass[3:0], TargetPC[XLEN-1:0]} = XLEN+4 bits.
+  // The bit write enable must cover the full entry width; a narrower vector
+  // is zero-extended at the port and would leave the InstrClass bits
+  // permanently write-disabled.
+  SRAM2P1R1W #(Depth, `XLEN+4) memory(.clk(clk),
+				      .reset(reset),
+				      .RA1(LookUpPCIndex),
+				      .RD1({{InstrClass, TargetPC}}),
+				      .REN1(1'b1),
+				      .WA1(UpdatePCIndex),
+				      .WD1({UpdateInstrClass, UpdateTarget}),
+				      .WEN1(UpdateEN),
+				      .BitWEN1({(`XLEN+4){1'b1}}));
+
+
+endmodule
--- a/wally-pipelined/src/ifu/RAsPredictor.sv
+++ b/wally-pipelined/src/ifu/RAsPredictor.sv
@ -0,0 +1,80 @@
+///////////////////////////////////////////
+// RASPredictor.sv
+//
+// Written: Ross Thomposn
+// Email: ross1728@gmail.com
+// Created: February 15, 2021
+// Modified: 
+//
+// Purpose: Return address stack (RAS) predictor with parameterized stack size.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// Return address stack.
+//
+// popPC always shows the entry at the current stack pointer.  A push stores
+// pushPC one slot above the pointer and advances it; a pop moves the pointer
+// back by one.  The pointer is Depth = $clog2(StackSize) bits wide, so it
+// wraps around and the stack behaves as a circular buffer on overflow or
+// underflow.
+//
+// Parameters:
+//   StackSize - number of entries.  NOTE(review): the wrap-around assumes a
+//               power of two >= 2; other values are not handled here.
+//
+// Ports:
+//   pop    - move the pointer down one entry (takes priority over push/incr
+//            when choosing the next pointer value).
+//   popPC  - entry at the current pointer.
+//   push   - write pushPC at pointer+1 and advance the pointer.
+//   incr   - advance the pointer without writing.
+//   pushPC - address stored on a push.
+module RASPredictor
+  #(parameter int StackSize = 16
+    )
+  (input logic clk,
+   input logic 		    reset,
+   input logic 		    pop,
+   output logic [`XLEN-1:0] popPC,
+   input logic 		    push,
+   input logic 		    incr,
+   input logic [`XLEN-1:0]  pushPC
+   );
+
+  logic 		    CounterEn;
+  localparam Depth = $clog2(StackSize);
+
+  // Pointers are index-width (Depth bits), not StackSize bits: a wider
+  // pointer never wraps and ends up indexing outside of memory.
+  logic [Depth-1:0] 	    PtrD, PtrQ, PtrP1, PtrM1;
+  logic [StackSize-1:0] [`XLEN-1:0] memory;
+  integer 			    index;
+  
+  assign CounterEn = pop | push | incr;
+
+  assign PtrD = pop ? PtrM1 : PtrP1;
+
+  assign PtrM1 = PtrQ - 1'b1;
+  assign PtrP1 = PtrQ + 1'b1;
+  // may have to handle a push and an incr at the same time.
+  // *** what happens if jal is executing and there is a return being flushed in Decode?
+
+  flopenr #(Depth) PTR(.clk(clk),
+		       .reset(reset),
+		       .en(CounterEn),
+		       .d(PtrD),
+		       .q(PtrQ));
+
+  // RAS must be reset. 
+  always_ff @ (posedge clk, posedge reset) begin
+    if(reset) begin
+      for(index=0; index<StackSize; index++)
+	memory[index] <= {`XLEN{1'b0}};
+    end else if(push) begin
+      // written at pointer+1, which becomes the new top once PtrQ advances
+      memory[PtrP1] <= #1 pushPC;
+    end
+  end
+
+  assign popPC = memory[PtrQ];
+  
+  
+endmodule
+
+
+
--- a/wally-pipelined/src/ifu/SramModel.sv
+++ b/wally-pipelined/src/ifu/SramModel.sv
@ -0,0 +1,111 @@
+///////////////////////////////////////////
+// SRAM2P1R1W
+//
+// Written: Ross Thomposn
+// Email: ross1728@gmail.com
+// Created: February 14, 2021
+// Modified: 
+//
+// Purpose: Behavioral model of two port SRAM.  While this is synthesizable it will produce a flip flop based memory which
+//          behaves with the timing of an SRAM typical of GF 14nm, 32nm, and 45nm.
+//          
+// 
+// to preload this memory we can use the following command
+// in modelsim's do file.
+// mem load -infile <relative path to the text file > -format <bin|hex> <hierarchy to the memory.>
+// example
+// mem load -infile twoBitPredictor.txt -format bin testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
+//
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// Two-port memory model: one read port, one write port.
+//
+// All address, data and write-enable inputs are registered on the rising
+// clock edge before they reach the array, so:
+//   - RD1 shows memory[RA1] for the RA1 captured on the previous edge
+//     (captured only while REN1 is high);
+//   - a write requested with WEN1 high is committed to the array on the
+//     following edge, bit by bit under control of BitWEN1.
+//
+// Parameters:
+//   Depth - number of address bits (2**Depth words).
+//   Width - word width in bits.
+//
+// NOTE: BitWEN1 is not registered; it is sampled on the edge where the write
+// is committed, i.e. one cycle after WEN1/WA1/WD1 were presented.
+module SRAM2P1R1W
+  #(parameter int Depth = 10,
+    parameter int Width = 2
+    )
+
+  (input logic clk,
+   // *** have to remove reset eventually
+   input logic 		    reset,
+  
+   // port 1 is read only
+   input logic [Depth-1:0]  RA1,
+   output logic [Width-1:0] RD1,
+   input logic 		    REN1,
+  
+   // port 2 is write only
+   input logic [Depth-1:0]  WA1,
+   input logic [Width-1:0]  WD1,
+   input logic 		    WEN1,
+   input logic [Width-1:0]  BitWEN1
+   );
+  
+
+  logic [Depth-1:0] 	    RA1Q, WA1Q;
+  logic 		    WEN1Q;
+  logic [Width-1:0] 	    WD1Q;
+
+  logic [Width-1:0] 	    memory [2**Depth-1:0];
+
+  
+  // SRAMs address busses are always registered first.
+
+  flopenr #(Depth) RA1Reg(.clk(clk),
+			  .reset(reset),
+			  .en(REN1),
+			  .d(RA1),
+			  .q(RA1Q));
+  
+
+  // The write address and write data are captured under the write enable.
+  // Using the read enable here would let a write issued while REN1 is low
+  // commit a stale address/data pair.
+  flopenr #(Depth) WA1Reg(.clk(clk),
+			  .reset(reset),
+			  .en(WEN1),
+			  .d(WA1),
+			  .q(WA1Q));
+
+  flopenr #(1) WEN1Reg(.clk(clk),
+		       .reset(reset),
+		       .en(1'b1),
+		       .d(WEN1),
+		       .q(WEN1Q));
+  
+  flopenr #(Width) WD1Reg(.clk(clk),
+			  .reset(reset),
+			  .en(WEN1),
+			  .d(WD1),
+			  .q(WD1Q));
+  // read port
+  assign RD1 = memory[RA1Q];
+
+  genvar 		    index;
+  
+  // write port
+  // one always_ff per bit so each bit has its own write enable
+  generate
+    for (index = 0; index < Width; index = index + 1) begin    
+      always_ff @ (posedge clk) begin
+	if (WEN1Q & BitWEN1[index]) begin
+	  memory[WA1Q][index] <= WD1Q[index];
+	end
+      end
+    end
+  endgenerate
+
+endmodule  
+
+
--- a/wally-pipelined/src/ifu/bpred.sv
+++ b/wally-pipelined/src/ifu/bpred.sv
@ -0,0 +1,169 @@
+///////////////////////////////////////////
+// bpred.sv
+//
+// Written: Ross Thomposn
+// Email: ross1728@gmail.com
+// Created: February 12, 2021
+// Modified: 
+//
+// Purpose: Branch prediction unit
+//          Produces a branch prediction based on branch history.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// Branch prediction unit.
+//
+// Fetch side: looks up PCNextF in a direction predictor and a BTB, and pops a
+// return address stack; produces a predicted PC (BPPredPCF) and a select
+// (SelBPPredF) telling the fetch stage whether to use it.
+// Execute side: compares the resolved branch outcome against the address that
+// actually followed the branch (PCD) and raises BPPredWrongE on a mismatch;
+// also drives the update ports of the predictors.
+//
+// NOTE(review): StallE and FlushE are declared as inputs but are not
+// referenced anywhere in this module body.
+module bpred 
+  (input logic clk, reset,
+   input logic 		    StallF, StallD, StallE, FlushF, FlushD, FlushE,
+   // Fetch stage
+   // the prediction
+   input logic [`XLEN-1:0]  PCNextF, // *** forgot to include this one on the I/O list
+   output logic [`XLEN-1:0] BPPredPCF,
+   output logic 	    SelBPPredF,
+   // Update Predictor
+   input logic [`XLEN-1:0]  PCE, // The address of the currently executing instruction
+   // 1 hot encoding
+   // return, jump register, jump, branch
+   // (presumably listed MSB first: [3]=return, [2]=jump register, [1]=jump, [0]=branch -- TODO confirm)
+   // *** after reviewing the compressed instruction set I am leaning towards having the btb predict the instruction class.
+   // *** the specifics of how this is encode is subject to change.
+   input logic 		    PCSrcE, // AKA Branch Taken
+   // Signals required to check the branch prediction accuracy.
+   input logic [`XLEN-1:0]  PCTargetE, // The branch destination if the branch is taken.
+   input logic [`XLEN-1:0]  PCD, // The address the branch predictor took.
+   input logic [`XLEN-1:0]  PCLinkE, // The address following the branch instruction. (AKA Fall through address)
+   input logic [3:0] 	    InstrClassE,
+   // Report branch prediction status
+   output logic 	    BPPredWrongE
+   );
+
+  logic 		    BTBValidF;
+  logic [1:0] 		    BPPredF, BPPredD, BPPredE, UpdateBPPredE;
+
+  logic [3:0] 		    BPInstrClassF, BPInstrClassD, BPInstrClassE;
+  logic [`XLEN-1:0] 	    BTBPredPCF, RASPCF;
+  logic 		    TargetWrongE;
+  logic 		    FallThroughWrongE;
+  logic 		    PredictionDirWrongE;
+  logic 		    PredictionPCWrongE;
+  logic [`XLEN-1:0] 	    CorrectPCE; // only referenced by the commented-out forwarding logic below
+
+
+  // Part 1 branch direction prediction
+  // Looked up with PCNextF; updated at PCE with the new counter state whenever
+  // InstrClassE[0] is set.
+
+  twoBitPredictor DirPredictor(.clk(clk),
+			       .reset(reset),
+			       .LookUpPC(PCNextF),
+			       .Prediction(BPPredF),
+			       // update
+			       .UpdatePC(PCE),
+			       .UpdateEN(InstrClassE[0]),
+			       .UpdatePrediction(UpdateBPPredE));
+
+  // this predictor will have two pieces of data,
+  // 1) A direction (1 = Taken, 0 = Not Taken)
+  // 2) Any information which is necessary for the predictor to build its next state.
+  // For a 2 bit table this is the prediction count.
+
+  // Use the predicted PC when the BTB-reported class is:
+  //   [0] and the direction counter's MSB is set and the BTB entry is valid, or
+  //   [3] (not qualified by BTBValidF), or
+  //   [2] or [1] and the BTB entry is valid.
+  assign SelBPPredF = ((BPInstrClassF[0] & BPPredF[1] & BTBValidF) | 
+		       BPInstrClassF[3] |
+		       (BPInstrClassF[2] & BTBValidF) | 
+		       BPInstrClassF[1] & BTBValidF) ;
+
+
+  // Part 2 Branch target address prediction
+  // *** For now the BTB will house the direct and indirect targets
+  // The BTB is updated with PCTargetE for classes [2], [1] and [0]; class [3]
+  // does not trigger a BTB update.
+
+  BTBPredictor TargetPredictor(.clk(clk),
+			       .reset(reset),
+			       .LookUpPC(PCNextF),
+			       .TargetPC(BTBPredPCF),
+			       .InstrClass(BPInstrClassF),
+			       .Valid(BTBValidF),
+			       // update
+			       .UpdateEN(InstrClassE[2] | InstrClassE[1] | InstrClassE[0]),
+			       .UpdatePC(PCE),
+			       .UpdateTarget(PCTargetE),
+			       .UpdateInstrClass(InstrClassE));
+
+  // need to forward when updating to the same address as reading.
+  //assign CorrectPCE = PCSrcE ? PCTargetE : PCLinkE;
+  //assign TargetPC = (PCE == PCNextF) ? CorrectPCE : BTBPredPCF;
+
+  // Part 3 RAS
+  // *** need to add the logic to restore RAS on flushes.  We will use incr for this.
+  // NOTE(review): both pop and push are driven by class bit [3] (pop from the
+  // fetch-stage predicted class, push from the execute-stage class) -- confirm
+  // that pushing PCLinkE on this class is intended.
+  RASPredictor RASPredictor(.clk(clk),
+			    .reset(reset),
+			    .pop(BPInstrClassF[3]),
+			    .popPC(RASPCF),
+			    .push(InstrClassE[3]),
+			    .incr(1'b0),
+			    .pushPC(PCLinkE));
+
+  // Class [3] takes its predicted PC from the RAS; everything else uses the BTB target.
+  assign BPPredPCF = BPInstrClassF[3] ? RASPCF : BTBPredPCF;
+  
+  
+
+  // The prediction and its results need to be passed through the pipeline
+  // *** for other predictors this will be different.
+  
+  // F -> D: held while StallF, cleared on FlushF
+  flopenrc #(2) BPPredRegD(.clk(clk),
+			   .reset(reset),
+			   .en(~StallF),
+			   .clear(FlushF),
+			   .d(BPPredF),
+			   .q(BPPredD));
+
+  // D -> E: held while StallD, cleared on FlushD
+  flopenrc #(2) BPPredRegE(.clk(clk),
+			   .reset(reset),
+			   .en(~StallD),
+			   .clear(FlushD),
+			   .d(BPPredD),
+			   .q(BPPredE));
+
+  // pipeline the class
+  // NOTE(review): BPInstrClassE is produced here but not read anywhere else in
+  // this module; the checks below use the InstrClassE input instead.
+  flopenrc #(4) InstrClassRegD(.clk(clk),
+			       .reset(reset),
+			       .en(~StallF),
+			       .clear(FlushF),
+			       .d(BPInstrClassF),
+			       .q(BPInstrClassD));
+
+  flopenrc #(4) InstrClassRegE(.clk(clk),
+			       .reset(reset),
+			       .en(~StallD),
+			       .clear(FlushD),
+			       .d(BPInstrClassD),
+			       .q(BPInstrClassE));
+
+  
+
+  // Check the prediction against the outcome resolved in execute.
+  // PCD is the address of the instruction that followed the branch into the
+  // pipeline, so it is compared against the taken target or the fall-through
+  // address depending on PCSrcE.  A misprediction is only reported when
+  // InstrClassE marks the executing instruction as some control-flow class.
+  assign TargetWrongE = PCTargetE != PCD;
+  assign FallThroughWrongE = PCLinkE != PCD;
+  assign PredictionDirWrongE = (BPPredE[1] ^ PCSrcE) & InstrClassE[0];
+  assign PredictionPCWrongE = PCSrcE ? TargetWrongE : FallThroughWrongE;
+  assign BPPredWrongE = (PredictionPCWrongE | PredictionDirWrongE) & (|InstrClassE);
+
+  // Update predictors
+  // New direction-counter state computed from the state that was read at
+  // fetch (BPPredE) and the resolved direction (PCSrcE).
+
+  satCounter2 BPDirUpdate(.BrDir(PCSrcE),
+			  .OldState(BPPredE),
+			  .NewState(UpdateBPPredE));
+
+endmodule
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@ -29,7 +29,7 @@
 module ifu (
  input  logic             clk, reset,
  input  logic             StallF, StallD, StallE, StallM, StallW,
-  input  logic             FlushD, FlushE, FlushM, FlushW,
+  input  logic             FlushF, FlushD, FlushE, FlushM, FlushW,
  // Fetch
  input  logic [`XLEN-1:0] InstrInF,
  output logic [`XLEN-1:0] PCF, 
@ -37,13 +37,15 @@ module ifu (
  output logic             InstrReadF,
  // Decode  
  // Execute
-  input  logic             PCSrcE, 
-  input  logic [`XLEN-1:0] PCTargetE,
-  output logic [`XLEN-1:0] PCE, 
+  output logic [`XLEN-1:0] PCLinkE,
+  input logic 		   PCSrcE, 
+  input logic [`XLEN-1:0]  PCTargetE,
+  output logic [`XLEN-1:0] PCE,
+  output logic 		   BPPredWrongE, 
  // Mem
-  input  logic             RetM, TrapM, 
-  input  logic [`XLEN-1:0] PrivilegedNextPCM, 
-  output logic [31:0]      InstrD, InstrM,
+  input logic 		   RetM, TrapM, 
+  input logic [`XLEN-1:0]  PrivilegedNextPCM, 
+  output logic [31:0] 	   InstrD, InstrM,
  output logic [`XLEN-1:0] PCM, 
  // Writeback
  output logic [`XLEN-1:0] PCLinkW,
@ -60,13 +62,14 @@ module ifu (
  output logic             ITLBMissF, ITLBHitF,
  // bogus
  input  logic [15:0] rd2
+
 );

  logic [`XLEN-1:0] UnalignedPCNextF, PCNextF;
  logic misaligned, BranchMisalignedFaultE, BranchMisalignedFaultM, TrapMisalignedFaultM;
  logic PrivilegedChangePCM;
  logic IllegalCompInstrD;
-  logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCD, PCW, PCLinkD, PCLinkE, PCLinkM;
+  logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCD, PCW, PCLinkD, PCLinkM;
  logic        CompressedF;
  logic [31:0]     InstrF, InstrRawD, InstrE, InstrW;
  logic [31:0]     nop = 32'h00000013; // instruction for NOP
@ -81,6 +84,12 @@ module ifu (
  tlb #(3) itlb(clk, reset, SATP, PCF, PageTableEntryF, ITLBWriteF, ITLBFlushF,
    InstrPAdrF, ITLBMissF, ITLBHitF);

+  // branch predictor signals
+  logic 	   SelBPPredF;
+  logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F;
+  logic [3:0] 	    InstrClassD, InstrClassE;
+  
+
  // *** put memory interface on here, InstrF becomes output
  //assign InstrPAdrF = PCF; // *** no MMU
  //assign InstrReadF = ~StallD; // *** & ICacheMissF; add later
@ -89,10 +98,48 @@ module ifu (
  assign PrivilegedChangePCM = RetM | TrapM;


-  mux3    #(`XLEN) pcmux(PCPlus2or4F, PCTargetE, PrivilegedNextPCM, {PrivilegedChangePCM, PCSrcE}, UnalignedPCNextF);
+  //mux3    #(`XLEN) pcmux(PCPlus2or4F, PCCorrectE, PrivilegedNextPCM, {PrivilegedChangePCM, BPPredWrongE}, UnalignedPCNextF);
+  mux2 #(`XLEN) pcmux0(.d0(PCPlus2or4F),
+		       .d1(BPPredPCF),
+		       .s(SelBPPredF),
+		       .y(PCNext0F));
+
+  mux2 #(`XLEN) pcmux1(.d0(PCNext0F),
+		       .d1(PCCorrectE),
+		       .s(BPPredWrongE),
+		       .y(PCNext1F));
+
+  mux2 #(`XLEN) pcmux2(.d0(PCNext1F),
+		       .d1(PrivilegedNextPCM),
+		       .s(PrivilegedChangePCM),
+		       .y(UnalignedPCNextF));
+  
  assign  PCNextF = {UnalignedPCNextF[`XLEN-1:1], 1'b0}; // hart-SPEC p. 21 about 16-bit alignment
  flopenl #(`XLEN) pcreg(clk, reset, ~StallF, PCNextF, `RESET_VECTOR, PCF);

+  // branch and jump predictor
+  // I am making the port connection explicit for now as I want to see them and they will be changing.
+  bpred bpred(.clk(clk),
+	      .reset(reset),
+	      .StallF(StallF),
+	      .StallD(StallD),
+	      .StallE(1'b0),   // *** may need this eventually
+	      .FlushF(FlushF),
+	      .FlushD(FlushD),
+	      .FlushE(FlushE),
+	      .PCNextF(PCNextF),
+	      .BPPredPCF(BPPredPCF),
+	      .SelBPPredF(SelBPPredF),
+	      .PCE(PCE),
+	      .PCSrcE(PCSrcE),
+	      .PCTargetE(PCTargetE),
+	      .PCD(PCD),
+	      .PCLinkE(PCLinkE),
+	      .InstrClassE(InstrClassE),
+	      .BPPredWrongE(BPPredWrongE));
+  // The true correct target is PCTargetE if PCSrcE is 1 else it is the fall through PCLinkE.
+  assign PCCorrectE =  PCSrcE ? PCTargetE : PCLinkE;
+
  // pcadder
  // add 2 or 4 to the PC, based on whether the instruction is 16 bits or 32
  assign CompressedF = (InstrF[1:0] != 2'b11); // is it a 16-bit compressed instruction?
@ -124,6 +171,14 @@ module ifu (
  assign IllegalIEUInstrFaultD = IllegalBaseInstrFaultD | IllegalCompInstrD; // illegal if bad 32 or 16-bit instr
  // *** combine these with others in better way, including M, F

+
+  // the branch predictor needs a compact decoding of the instruction class.
+  // *** consider adding in the alternate return address x5 for returns.
+  assign InstrClassD[3] = InstrD[6:0] == 7'h67 && InstrD[19:15] == 5'h01; // return
+  assign InstrClassD[2] = InstrD[6:0] == 7'h67 && InstrD[19:15] != 5'h01; // jump register, but not return
+  assign InstrClassD[1] = InstrD[6:0] == 7'h6F; // jump
+  assign InstrClassD[0] = InstrD[6:0] == 7'h63; // branch
+
  // Misaligned PC logic

  generate
@ -147,6 +202,13 @@ module ifu (
  flopenr #(`XLEN) PCMReg(clk, reset, ~StallM, PCE, PCM);
  flopenr #(`XLEN) PCWReg(clk, reset, ~StallW, PCM, PCW); // *** probably not needed; delete later

+  flopenrc #(4) InstrClassRegE(.clk(clk),
+			       .reset(reset),
+			       .en(~StallD),
+			       .clear(FlushD),
+			       .d(InstrClassD),
+			       .q(InstrClassE));
+
  // seems like there should be a lower-cost way of doing this PC+2 or PC+4 for JAL.  
  // either have ALU compute PC+2/4 and feed into ALUResult input of ResultMux or
  // have dedicated adder in Mem stage based on PCM + 2 or 4
--- a/wally-pipelined/src/ifu/satCounter2.sv
+++ b/wally-pipelined/src/ifu/satCounter2.sv
@ -0,0 +1,57 @@
+///////////////////////////////////////////
+// satCounter2.sv
+//
+// Written: Ross Thompson
+// Email: ross1728@gmail.com
+// Created: February 13, 2021
+// Modified: 
+//
+// Purpose: 2 bit saturating counter
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module satCounter2
+  (input logic BrDir,
+   input logic [1:0] OldState,
+   output logic [1:0] NewState
+   );
+
+  // Combinational next-state function of a 2-bit saturating counter.
+  //   BrDir = 1 : step OldState up by one, holding at 2'b11.
+  //   BrDir = 0 : step OldState down by one, holding at 2'b00,
+  //               except that 2'b01 drops straight to 2'b00 (same value either way).
+  // The direction is selected first, then the step is looked up per state.
+  always_comb begin
+    if (BrDir) begin
+      // count up, saturate at the top
+      case (OldState)
+        2'b00: NewState = 2'b01;
+        2'b01: NewState = 2'b10;
+        2'b10: NewState = 2'b11;
+        2'b11: NewState = 2'b11;
+      endcase
+    end else begin
+      // count down, saturate at the bottom
+      case (OldState)
+        2'b00: NewState = 2'b00;
+        2'b01: NewState = 2'b00;
+        2'b10: NewState = 2'b01;
+        2'b11: NewState = 2'b10;
+      endcase
+    end
+  end
+
+endmodule
--- a/wally-pipelined/src/ifu/twoBitPredictor.sv
+++ b/wally-pipelined/src/ifu/twoBitPredictor.sv
@ -0,0 +1,84 @@
+///////////////////////////////////////////
+// twoBitPredictor.sv
+//
+// Written: Ross Thompson
+// Email: ross1728@gmail.com
+// Created: February 14, 2021
+// Modified: 
+//
+// Purpose: 2 bit saturating counter predictor with parameterized table depth.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// twoBitPredictor
+//
+// Table of 2-bit prediction counters indexed by a hash of the PC.
+//
+// Parameters:
+//   Depth            - number of index bits used to address the table.
+// Ports:
+//   LookUpPC         - PC whose prediction is being read.
+//   Prediction       - 2-bit counter value returned for the lookup.
+//   UpdatePC         - PC of the entry to be rewritten.
+//   UpdateEN         - write enable for the update port.
+//   UpdatePrediction - new 2-bit counter value written when UpdateEN is set.
+module twoBitPredictor
+  #(parameter int Depth = 10
+    )
+  (input logic clk,
+   input logic 		   reset,
+   input logic [`XLEN-1:0] LookUpPC,
+   output logic [1:0] 	   Prediction,
+   // update
+   input logic [`XLEN-1:0] UpdatePC,
+   input logic 		   UpdateEN,
+   input logic [1:0] 	   UpdatePrediction
+   );
+
+  logic [Depth-1:0] 	   LookUpPCIndex, UpdatePCIndex;
+  logic [1:0] 		   PredictionMemory;
+  logic 		   DoForwarding, DoForwardingF;
+  logic [1:0] 		   UpdatePredictionF;
+  
+
+  // hashing function for indexing the PC
+  // We have Depth bits to index, but XLEN bits as the input.
+  // bit 0 is always 0, bit 1 is 0 if using 4 byte instructions, but is not always 0 if
+  // using compressed instructions.  XOR bit 1 with the MSB of index.
+  // Resulting width: 1 (XOR bit) + (Depth-1) bits from PC[Depth:2] = Depth bits.
+  assign UpdatePCIndex = {UpdatePC[Depth+1] ^ UpdatePC[1], UpdatePC[Depth:2]};
+  assign LookUpPCIndex = {LookUpPC[Depth+1] ^ LookUpPC[1], LookUpPC[Depth:2]};  
+
+
+  // Two-port table: port RA1/RD1 serves the lookup, port WA1/WD1 serves the update.
+  // Both bits of the counter are always written together (BitWEN1 = 2'b11).
+  SRAM2P1R1W #(Depth, 2) memory(.clk(clk),
+				.reset(reset),
+				.RA1(LookUpPCIndex),
+				.RD1(PredictionMemory),
+				.REN1(1'b1),
+				.WA1(UpdatePCIndex),
+				.WD1(UpdatePrediction),
+				.WEN1(UpdateEN),
+				.BitWEN1(2'b11));
+
+  // need to forward when updating to the same address as reading.
+  // Forward only when a write is actually taking place: if UpdateEN is low the
+  // memory is not modified, so UpdatePrediction must not override the stored value
+  // just because the two indices happen to match.
+  assign DoForwarding = UpdateEN & (UpdatePCIndex == LookUpPCIndex);
+
+  // register the update value and the forwarding signal into the Fetch stage
+  // NOTE(review): the one-cycle delay on this path presumes SRAM2P1R1W presents RD1
+  // one cycle after RA1 -- confirm against the SRAM model.
+  flopr #(1) DoForwardingReg(.clk(clk),
+			     .reset(reset),
+			     .d(DoForwarding),
+			     .q(DoForwardingF));
+  
+  flopr #(2) UpdatePredictionReg(.clk(clk),
+				 .reset(reset),
+				 .d(UpdatePrediction),
+				 .q(UpdatePredictionF));
+
+  // Select the freshly written value over the memory read when forwarding applies.
+  assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory;
+  
+endmodule
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -55,7 +55,7 @@ module wallypipelinedhart (

 //  logic [1:0]  ForwardAE, ForwardBE;
  logic        StallF, StallD, StallE, StallM, StallW;
-  logic        FlushD, FlushE, FlushM, FlushW;
+  logic        FlushF, FlushD, FlushE, FlushM, FlushW;
  logic        RetM, TrapM;

  // new signals that must connect through DP
@ -66,7 +66,7 @@ module wallypipelinedhart (
  logic [2:0] Funct3E;
 //  logic [31:0] InstrF;
  logic [31:0] InstrD, InstrM;
-  logic [`XLEN-1:0] PCE, PCM, PCLinkW;
+  logic [`XLEN-1:0] PCE, PCM, PCLinkE, PCLinkW;
  logic [`XLEN-1:0] PCTargetE;
  logic [`XLEN-1:0] CSRReadValW, MulDivResultW;
  logic [`XLEN-1:0] PrivilegedNextPCM;
@ -101,13 +101,14 @@ module wallypipelinedhart (
  logic             InstrReadF;
  logic             DataStall, InstrStall;
  logic             InstrAckD, MemAckW;
+  logic 	    BPPredWrongE;
+  
           
  ifu ifu(.InstrInF(InstrRData), .*); // instruction fetch unit: PC, branch prediction, instruction cache

-  ieu ieu(.*); // inteber execution unit: integer register file, datapath and controller
+  ieu ieu(.*); // integer execution unit: integer register file, datapath and controller
  dmem dmem(.*); // data cache unit

-
  ahblite ebu( 
    //.InstrReadF(1'b0),
    //.InstrRData(InstrF), // hook up InstrF later
--- a/wally-pipelined/testbench/function_radix.sv
+++ b/wally-pipelined/testbench/function_radix.sv
@ -0,0 +1,108 @@
+///////////////////////////////////////////
+// function_radix.sv
+//
+// Written: Ross Thompson
+// email: ross1728@gmail.com
+// Created: November 9, 2019
+//
+// Purpose: Finds the current function or global assembly label based on PCE.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// function_radix
+//
+// Testbench helper that maps the current pc onto the closest preceding entry of an
+// address table loaded from PRELOAD_FILE.  The table is expected to be sorted in
+// ascending order, since it is searched with a binary search.
+//
+// Parameters:
+//   PRELOAD_FILE - hex text file with one address per line.
+module function_radix();
+
+   parameter PRELOAD_FILE = "funct_addr.txt";
+
+   integer memory_bank [];
+   integer index;
+
+   logic [`XLEN-1:0] pc;
+   
+   // Mirror the DUT's pc onto the local pc signal.
+   initial begin
+     $init_signal_spy("/riscv_mram_tb/dut/pc", "/riscv_mram_tb/function_radix/pc");
+   end
+
+   // bin_search_min
+   //
+   // Binary search over array[0 .. length-1] (ascending).
+   //   pc     - value to look up.
+   //   length - number of valid entries in array.
+   //   array  - sorted table of addresses.
+   //   minval - array entry equal to pc if present, otherwise the largest entry
+   //            that is less than pc.  'x when no such entry exists (pc is below
+   //            every entry, or the table is empty).
+   task automatic bin_search_min;
+      input integer pc;
+      input integer length;
+      ref integer   array [];
+      output integer minval;
+
+      integer 	     left, right;
+      integer 	     mid;
+
+      begin
+	 left = 0;
+	 // last valid index; starting at length would probe one past the end
+	 right = length - 1;
+	 while (left <= right) begin
+	    mid = left + ((right - left) / 2);
+	    if (array[mid] == pc) begin
+	       minval = array[mid];
+	       return;
+	    end
+	    if (array[mid] < pc) begin
+	       left = mid + 1;
+	    end else begin
+	       right = mid - 1;
+	    end
+	 end // while (left <= right)
+	 // pc was not found.  The loop exits with right == left - 1, where every
+	 // entry at index <= right is less than pc and every entry at index >= left
+	 // is greater than pc.  array[right] is therefore the closest entry below pc.
+	 if (right >= 0) begin
+	    minval = array[right];
+	 end else begin
+	    // nothing at or below pc; report unknown instead of indexing array[-1]
+	    minval = 'x;
+	 end
+      end
+   endtask
+
+   
+   // preload
+   initial $readmemh(PRELOAD_FILE, memory_bank);
+
+   // we need to count the number of lines in the file so we can set line_count.
+   integer fp;
+   integer line_count = 0;
+   logic [31:0] line;
+   initial begin
+      fp = $fopen(PRELOAD_FILE, "r");
+      // read line by line to count lines
+      if (fp) begin
+	 while (! $feof(fp)) begin
+	    // count only lines that actually parse as a hex value; stop at the
+	    // first one that does not so a malformed line cannot stall the loop.
+	    if ($fscanf(fp, "%h\n", line) == 1) begin
+	       line_count = line_count + 1;
+	    end else begin
+	       break;
+	    end
+	 end
+	 $fclose(fp);
+      end else begin
+	 $display("Cannot open file %s for reading.", PRELOAD_FILE);
+	 $stop;
+      end
+   end
+
+   // Re-run the search whenever pc changes.
+   // NOTE(review): pc is `XLEN bits wide but the task takes an integer, so the
+   // value is narrowed on the call when XLEN is larger than 32 -- confirm this is intended.
+   always @(pc) begin
+      bin_search_min(pc, line_count, memory_bank, index);
+      
+   end
+
+endmodule // function_radix
+
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -368,7 +368,7 @@ string tests32i[] = {
      memfilename = {"../../imperas-riscv-tests/work/", tests[test], ".elf.memfile"};
      $readmemh(memfilename, dut.imem.RAM);
      $readmemh(memfilename, dut.uncore.dtim.RAM);
-      reset = 1; # 22; reset = 0;
+      reset = 1; # 42; reset = 0;
    end

  // generate clock to sequence tests
@ -444,7 +444,11 @@ string tests32i[] = {
          reset = 1; # 17; reset = 0;
        end
      end
-    end
+    end // always @ (negedge clk)
+
+  // track the current function or label
+  //function_radix function_radix();
+  
 endmodule

 /* verilator lint_on STMTDLY */