Merge branch 'main' of github.com:davidharrishmc/riscv-wally into main

2025-02-11 06:05:49 +00:00 · 2021-03-05 13:35:44 -05:00 · 2021-03-05 13:35:44 -05:00 · 97e9baa316
commit 97e9baa316
parent 85dcbee86b a982ad7a9a
37 changed files with 466939 additions and 3608 deletions
--- a/wally-pipelined/bin/extractFunctionRadix.sh
+++ b/wally-pipelined/bin/extractFunctionRadix.sh
@ -0,0 +1,47 @@
 #!/bin/bash
 # extractFunctionRadix.sh
 #
 # Usage: extractFunctionRadix.sh <objdump listing>...
 #
 # For every objdump listing given on the command line this script writes
 #   <listing>.addr   the address of each named label, one per line
 #   <listing>.do     a "radix define Functions { ... }" block that maps each
 #                    address to its label name
 # and appends the same addresses to FunctionRadix.addr, each one prefixed
 # with the 4-hex-digit position (0000, 0001, ...) of the listing on the
 # command line.  FunctionRadix.addr is appended to, never truncated.
 allProgramRadixFile="FunctionRadix"

 # Print the "<address> <label>:" lines of the listing named by $1.
 # Addresses are 16 (64-bit) or 8 (32-bit) hexadecimal digits; the character
 # class must include a-f, otherwise most labels are silently dropped.
 list_function_labels() {
   grep -E '^([0-9a-fA-F]{16}|[0-9a-fA-F]{8}) <[0-9a-zA-Z_]+>' -- "$1"
 }

 # Generate the .addr and .do files for one listing and append its addresses
 # to the combined address file.
 #   $1 - path of the objdump listing
 #   $2 - index of the listing (decimal), used as prefix in the combined file
 # Returns non-zero (after a warning on stderr) when the listing cannot be
 # read or contains no named labels; no output files are written in that case.
 process_objdump() {
   local objDumpFile=$1
   local index=$2
   local listOfAddr addresses indexHex

   if [[ ! -r "$objDumpFile" ]]; then
     printf '%s: cannot read %s\n' "${0##*/}" "$objDumpFile" >&2
     return 1
   fi

   listOfAddr=$(list_function_labels "$objDumpFile")
   if [[ -z "$listOfAddr" ]]; then
     printf '%s: no function labels found in %s\n' "${0##*/}" "$objDumpFile" >&2
     return 1
   fi

   # per-program address list
   addresses=$(awk '{ print $1 }' <<<"$listOfAddr")
   printf '%s\n' "$addresses" > "$objDumpFile.addr"

   # per-program radix definition; every entry except the last one ends with
   # a comma, so each line is held back until the next one is known.
   {
     printf '%s\n' 'radix define Functions {'
     awk '
       NR > 1 { print entry "," }
       {
         label = $2
         gsub(/[<>:]/, "", label)
         entry = sprintf("    16#%s# \"%s\" -color \"SpringGreen\"", $1, label)
       }
       END { if (NR > 0) print entry }
     ' <<<"$listOfAddr"
     printf '%s\n' '    -default hex -color green' '}'
   } > "$objDumpFile.do"

   # all-in-one version: put the index at the beginning of every line
   indexHex=$(printf '%04x' "$index")
   awk -v prefix="$indexHex" '{ print prefix $0 }' <<<"$addresses" \
     >> "$allProgramRadixFile.addr"
 }

 # Process every listing named on the command line.  The index always tracks
 # the argument position, even when a listing is skipped with a warning.
 main() {
   local index=0
   local objDumpFile
   for objDumpFile in "$@"; do
     process_objdump "$objDumpFile" "$index"
     index=$((index + 1))
   done
   return 0
 }

 main "$@"
--- a/wally-pipelined/regression/BTBPredictor.txt
+++ b/wally-pipelined/regression/BTBPredictor.txt
--- a/wally-pipelined/regression/twoBitPredictor.txt
+++ b/wally-pipelined/regression/twoBitPredictor.txt
--- a/wally-pipelined/regression/wally-pipelined-ross.do
+++ b/wally-pipelined/regression/wally-pipelined-ross.do
@ -0,0 +1,52 @@
 # wally-pipelined-ross.do
 #
 # Modification by Oklahoma State University & Harvey Mudd College
 # Use with Testbench
 # James Stine, 2008; David Harris 2021
 # Go Cowboys!!!!!!
 #
 # Compiles the design and the imperas testbench, preloads the branch
 # predictor memories, loads the wave layout from wave.do and runs the
 # simulation to completion.
 #
 # Takes 1:10 to run RV64IC tests using gui
 # Use this wally-pipelined-ross.do file to run this example.
 # Either bring up ModelSim and type the following at the "ModelSim>" prompt:
 #     do wally-pipelined-ross.do
 # or, to run from a shell, type the following at the shell prompt:
 #     vsim -do wally-pipelined-ross.do -c
 # (omit the "-c" to see the GUI while running from the shell)
 # keep running the script when a breakpoint is hit
 onbreak {resume}
 # create library (delete any existing "work" library first)
 if [file exists work] {
    vdel -all
 }
 vlib work
 # compile source files
 # suppress spurious warnings about
 # "Extra checking for conflicts with always_comb done at vopt time"
 # because vsim will run vopt
 # default to config/rv64ic, but allow this to be overridden at the command line.  For example:
 # do wally-pipelined-ross.do ../config/rv32ic
 switch $argc {
    0 {vlog +incdir+../config/rv64ic ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
    1 {vlog +incdir+$1 ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
 }
 # start and run simulation
 # remove +acc flag for faster sim during regressions if there is no need to access internal signals
 vopt +acc work.testbench -o workopt 
 vsim workopt
 # load the branch predictors with known data. The value of the data is not important for function, but
 # is important for preventing pessimistic x propagation.
 mem load -infile twoBitPredictor.txt -format bin testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
 mem load -infile BTBPredictor.txt -format bin testbench/dut/hart/ifu/bpred/TargetPredictor/memory/memory
 # set up the wave window from the separate wave.do script
 do wave.do
 # log signals recursively starting from the top of the design
 add log -r /*
 # NOTE(review): the next line starts with "--" rather than "#"; confirm the
 # simulator accepts it as a comment and does not report an unknown command.
 -- Run the Simulation 
 #run 1000
 run -all
 #quit
--- a/wally-pipelined/regression/wally-pipelined.do
+++ b/wally-pipelined/regression/wally-pipelined.do
@ -38,6 +38,11 @@ switch $argc {
 vopt +acc work.testbench -o workopt 
 vsim workopt
 # load the branch predictors with known data. The value of the data is not important for function, but
 # is important for preventing pessimistic x propagation.
 mem load -infile twoBitPredictor.txt -format bin testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
 mem load -infile BTBPredictor.txt -format bin testbench/dut/hart/ifu/bpred/TargetPredictor/memory/memory
 view wave
 -- display input and output signals as hexadecimal values
--- a/wally-pipelined/regression/wave-all.do
+++ b/wally-pipelined/regression/wave-all.do
--- a/wally-pipelined/regression/wave.do
+++ b/wally-pipelined/regression/wave.do
@ -0,0 +1,134 @@
 # wave.do
 # Wave window layout for the wally-pipelined testbench: adds the signals
 # listed below (grouped and colored), then restores cursor positions,
 # wave window settings and the zoom range.
 # Keep going if a command fails.
 onerror {resume}
 quietly WaveActivateNextPane {} 0
 # Top-level testbench signals
 add wave -noupdate /testbench/clk
 add wave -noupdate /testbench/reset
 add wave -noupdate -radix ascii /testbench/memfilename
 # Execution stage PC and instruction
 add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/PCE
 add wave -noupdate -expand -group {Execution Stage} /testbench/InstrEName
 add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/InstrE
 add wave -noupdate -divider <NULL>
 add wave -noupdate /testbench/dut/hart/ebu/IReadF
 # HDU group: hazard inputs, flush signals (yellow) and stall signals (orange)
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/BPPredWrongE
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/CSRWritePendingDEM
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/RetM
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/TrapM
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/LoadStallD
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/InstrStall
 add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/DataStall
 add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/hzu/FlushF
 add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushD
 add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushE
 add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushM
 add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushW
 add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallF
 add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallD
 # Bpred group: update port and memory of DirPredictor
 add wave -noupdate -group Bpred -expand -group direction -divider Update
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/UpdatePC
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/UpdateEN
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/UpdatePCIndex
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/UpdatePrediction
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
 # InstrClass signals of bpred for the F, D and E suffixes
 add wave -noupdate -group InstrClass /testbench/dut/hart/ifu/bpred/InstrClassF
 add wave -noupdate -group InstrClass /testbench/dut/hart/ifu/bpred/InstrClassD
 add wave -noupdate -group InstrClass /testbench/dut/hart/ifu/bpred/InstrClassE
 # Instruction words InstrF through InstrM in the ifu
 add wave -noupdate -group {instruction pipeline} /testbench/dut/hart/ifu/InstrF
 add wave -noupdate -group {instruction pipeline} /testbench/dut/hart/ifu/InstrD
 add wave -noupdate -group {instruction pipeline} /testbench/dut/hart/ifu/InstrE
 add wave -noupdate -group {instruction pipeline} /testbench/dut/hart/ifu/InstrM
 add wave -noupdate /testbench/dut/hart/ifu/bpred/BPPredWrongE
 # ifu signals grouped under "PCNext Generation"
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNextF
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCF
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCPlus2or4F
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/BPPredPCF
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNext0F
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PCNext1F
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/SelBPPredF
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/BPPredWrongE
 add wave -noupdate -group {PCNext Generation} /testbench/dut/hart/ifu/PrivilegedChangePCM
 # Ungrouped bpred / TargetPredictor lookup signals
 add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/ValidBits
 add wave -noupdate /testbench/dut/hart/ifu/bpred/BPPredF
 add wave -noupdate /testbench/dut/hart/ifu/bpred/BTBValidF
 add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/LookUpPCIndexQ
 add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/UpdatePCIndexQ
 add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/LookUpPC
 # bpred signals grouped under "bp wrong"
 add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/TargetWrongE
 add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/FallThroughWrongE
 add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/PredictionDirWrongE
 add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/PredictionPCWrongE
 add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/BPPredWrongE
 add wave -noupdate -group {bp wrong} /testbench/dut/hart/ifu/bpred/InstrClassE
 # BTB group: TargetPredictor signals under the Update and Lookup dividers
 add wave -noupdate -group BTB -divider Update
 add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/UpdateEN
 add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/UpdatePC
 add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/UpdateTarget
 add wave -noupdate -group BTB -divider Lookup
 add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/TargetPC
 add wave -noupdate -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/Valid
 add wave -noupdate /testbench/dut/hart/ifu/bpred/BTBPredPCF
 add wave -noupdate /testbench/dut/hart/ifu/bpred/TargetPredictor/TargetPC
 add wave -noupdate /testbench/dut/hart/ifu/bpred/CorrectPCE
 add wave -noupdate /testbench/dut/hart/ifu/bpred/FlushF
 add wave -noupdate /testbench/dut/hart/FlushF
 # RegFile group: regf signals plus the "write regfile mux" signals of ieu/dp
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rf
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a1
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a2
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/a3
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rd1
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/rd2
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/we3
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/wd3
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ALUResultW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ReadDataW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/PCLinkW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/CSRReadValW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultSrcW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultW
 add wave -noupdate /testbench/dut/hart/ieu/c/RegWriteE
 # Signals grouped under "Decode Stage"
 add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ifu/InstrD
 add wave -noupdate -group {Decode Stage} /testbench/InstrDName
 add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/c/RegWriteD
 add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/RdD
 add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/Rs1D
 add wave -noupdate -group {Decode Stage} /testbench/dut/hart/ieu/dp/Rs2D
 add wave -noupdate /testbench/InstrFName
 # Signals grouped under "dcache"
 add wave -noupdate -expand -group dcache /testbench/dut/hart/MemAdrM
 add wave -noupdate -expand -group dcache /testbench/dut/hart/MemPAdrM
 add wave -noupdate -expand -group dcache /testbench/dut/hart/WriteDataM
 add wave -noupdate -expand -group dcache /testbench/dut/hart/ReadDataM
 add wave -noupdate -expand -group dcache /testbench/dut/hart/dmem/MemRWM
 # Forward group: ieu/fw signals; the last three are colored Thistle
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1D
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs2D
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs1E
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/Rs2E
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RdE
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RdM
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RdW
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/MemReadE
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RegWriteM
 add wave -noupdate -group Forward /testbench/dut/hart/ieu/fw/RegWriteW
 add wave -noupdate -group Forward -color Thistle /testbench/dut/hart/ieu/fw/ForwardAE
 add wave -noupdate -group Forward -color Thistle /testbench/dut/hart/ieu/fw/ForwardBE
 add wave -noupdate -group Forward -color Thistle /testbench/dut/hart/ieu/fw/LoadStallD
 # ieu/dp signals grouped under "alu execution stage"
 add wave -noupdate -expand -group {alu execution stage} /testbench/dut/hart/ieu/dp/WriteDataE
 add wave -noupdate -expand -group {alu execution stage} /testbench/dut/hart/ieu/dp/ALUResultE
 add wave -noupdate -expand -group {alu execution stage} /testbench/dut/hart/ieu/dp/SrcAE
 add wave -noupdate -expand -group {alu execution stage} /testbench/dut/hart/ieu/dp/SrcBE
 add wave -noupdate /testbench/dut/hart/ieu/dp/ALUResultM
 # Restore cursors, wave window settings and zoom range
 TreeUpdate [SetDefaultTree]
 WaveRestoreCursors {{Cursor 2} {231033 ns} 0} {{Cursor 3} {1276117 ns} 0}
 quietly wave cursor active 2
 configure wave -namecolwidth 250
 configure wave -valuecolwidth 518
 configure wave -justifyvalue left
 configure wave -signalnamewidth 1
 configure wave -snapdistance 10
 configure wave -datasetprefix 0
 configure wave -rowmargin 4
 configure wave -childrowmargin 2
 configure wave -gridoffset 0
 configure wave -gridperiod 1
 configure wave -griddelta 40
 configure wave -timeline 0
 configure wave -timelineunits ns
 update
 WaveRestoreZoom {1276094 ns} {1276208 ns}
--- a/wally-pipelined/src/fpu/FMA/add.v
+++ b/wally-pipelined/src/fpu/FMA/add.v
@ -35,14 +35,14 @@ module add(r[105:0], s[105:0], t[157:0], sum[157:0],
 	wire		[157:0] 	sum0;			// sum of compound adder +0 mode
 	wire		[157:0] 	sum1;			// sum of compound adder +1 mode
-	// Invert addend if necessary 
+	// Invert addend if z's sign is different from the product's sign
 	assign t2 = invz ? -t : t;
 	// Zero out product if Z >> product or product really should be zero
-	assign r2 = ~proddenorm & killprod ? 106'b0 : r;
+	assign r2 = killprod ? 106'b0 : r;
-	assign s2 = ~proddenorm & killprod ? 106'b0 : s;
+	assign s2 = killprod ? 106'b0 : s;
 	// Compound adder
 	// Consists of 3:2 CSA followed by long compound CPA
--- a/wally-pipelined/src/fpu/FMA/align.v
+++ b/wally-pipelined/src/fpu/FMA/align.v
@ -15,17 +15,17 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
             killprod,  bypsel[1], bypplus1, byppostnorm);
 /////////////////////////////////////////////////////////////////////////////
-	input 		[51:0]		z;				// Fraction of addend z;
+	input 		[51:0]		z;		// Fraction of addend z;
 	input 		[12:0]		ae;		// sign of exponent of addend z;
-	input 		[11:0]		aligncnt;		// amount to shift
+	input 		[11:0]		aligncnt;	// amount to shift
-	input					xzero;			// Input X = 0
+	input				xzero;		// Input X = 0
-	input                  	yzero;          // Input Y = 0 
+	input                  		yzero;          // Input Y = 0 
-	input                  	zzero;          // Input Z = 0
+	input                  		zzero;          // Input Z = 0
-	input                  	zdenorm;        // Input Z = denorm
+	input                  		zdenorm;        // Input Z is denormalized
-	input			proddenorm;
+	input				proddenorm;	// product is denormalized
 	input     	[1:1] 		bypsel;         // Select bypass to X or Z
-	input					bypplus1;		// Add one to bypassed result
+	input				bypplus1;	// Add one to bypassed result
-	input                  	byppostnorm;    // Postnormalize bypassed result 
+	input                  		byppostnorm;    // Postnormalize bypassed result 
 	output    	[157:0]    	t;              // aligned addend (54 bits left of bpt)
 	output          		bs;           	// sticky bit of addend
 	output          		ps;           	// sticky bit of product
@ -34,13 +34,13 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 	// Internal nodes
 	reg       	[157:0]   	t;				// aligned addend from shifter
-	reg             		killprod;		// Z >> product 
+	reg             		killprod;			// Z >> product 
 	reg             		bs;				// sticky bit of addend
 	reg             		ps;				// sticky bit of product
 	reg       	[7:0]		i;				// temp storage for finding sticky bit
 	wire		[52:0]		z1;				// Z plus 1
 	wire		[51:0]		z2;				// Z selected after handling rounds
-	wire		[11:0]		align104;		// alignment count + 104
+	wire		[11:0]		align104;			// alignment count + 104
 	// Increment fraction of Z by  one if necessary for prerounded bypass
 	// This incrementor delay is masked by the alignment count computation
@ -56,7 +56,7 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 	// addend on right shifts.  Handle special cases of shifting
 	// by too much.
-	always @(z2 or aligncnt or align104 or zzero or xzero or yzero or zdenorm)
+	always @(z2 or aligncnt or align104 or zzero or xzero or yzero or zdenorm or proddenorm)
 		begin
 		// Default to clearing sticky bits 
@ -66,7 +66,7 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 		// And to using product as primary operand in adder I exponent gen 
 		killprod = 0;
-		if(zzero) begin 
+		if(zzero) begin // if z = 0
 			t = 158'b0;
 			if (xzero || yzero) killprod = 1;
 		end else if ((aligncnt > 53 && ~aligncnt[11]) || xzero || yzero) begin
@ -75,8 +75,8 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 			t = {53'b0, ~zzero, z2, 52'b0}; 
 			killprod = 1;
 			ps = ~xzero && ~yzero; 
-		end else if ((ae[12] && align104[11])) begin //***fix the if statement
+		end else if ((ae[12] && align104[11]) && ~proddenorm) begin //***fix the if statement
-			// KEP if the multiplier's exponent overflows
+							// KEP if the multiplier's exponent overflows
 			t = {53'b0, ~zzero, z2, 52'b0}; 
 			killprod = 1;
 			ps = ~xzero && ~yzero; 
@ -85,7 +85,7 @@ module align(z[51:0], ae[12:0], aligncnt, xzero, yzero, zzero, zdenorm, proddeno
 			t = 0;
 		end else if (~aligncnt[11])  begin 	// Left shift by reasonable amount
 			t = {53'b0, ~zzero, z2, 52'b0} << aligncnt;
-		end else begin                 // Otherwise right shift 
+		end else begin                 		// Otherwise right shift 
 			t = {53'b0, ~zzero, z2, 52'b0} >> -aligncnt;
 		// use some behavioral code to find sticky bit.  This is really
--- a/wally-pipelined/src/fpu/FMA/array.sv
+++ b/wally-pipelined/src/fpu/FMA/array.sv
@ -30,85 +30,85 @@ module array(x, y, xdenorm, ydenorm, r, s, bypsel, bypplus1);
    assign xnorm = xdenorm ? {x[50:0], 1'b0} : x; // normalization of denormalized numbers
 	assign ynorm = ydenorm ? {y[50:0], 1'b0} : y;
-     assign yExt = {2'b01,ynorm,1'b0}; // y extended and added assumed 1
+     //assign yExt = {2'b01,ynorm,1'b0}; // y extended and added assumed 1
-     assign xExt = {2'b01,xnorm}; // x with added assumed 1
+     //assign xExt = {2'b01,xnorm}; // x with added assumed 1
     //booth encoding
-     generate
+    //  generate
-        for(i=0; i<27; i=i+1) begin
+    //     for(i=0; i<27; i=i+1) begin
-            booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
+    //         booth booth(.xExt(xExt), .choose(yExt[(i*2)+2:i*2]), .add1(add1[i]), .e(e[i]), .pp(pp[i]));
-        end
+    //     end
-     endgenerate
+    //  endgenerate
-    assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
+    // assign acc[0] = {49'b0,~e[0],e[0],e[0],pp[0]}; 
-    assign acc[1] = {50'b01,~e[1],pp[1],add1[0]}; 
+    // assign acc[1] = {50'b01,~e[1],pp[1],add1[0]}; 
-    assign acc[2] = {48'b01,~e[2],pp[2],add1[1], 2'b0};
+    // assign acc[2] = {48'b01,~e[2],pp[2],add1[1], 2'b0};
-    assign acc[3] = {46'b01,~e[3],pp[3],add1[2], 4'b0};
+    // assign acc[3] = {46'b01,~e[3],pp[3],add1[2], 4'b0};
-    assign acc[4] = {44'b01,~e[4],pp[4],add1[3], 6'b0};
+    // assign acc[4] = {44'b01,~e[4],pp[4],add1[3], 6'b0};
-    assign acc[5] = {42'b01,~e[5],pp[5],add1[4], 8'b0};
+    // assign acc[5] = {42'b01,~e[5],pp[5],add1[4], 8'b0};
-    assign acc[6] = {40'b01,~e[6],pp[6],add1[5], 10'b0};
+    // assign acc[6] = {40'b01,~e[6],pp[6],add1[5], 10'b0};
-    assign acc[7] = {38'b01,~e[7],pp[7],add1[6], 12'b0};
+    // assign acc[7] = {38'b01,~e[7],pp[7],add1[6], 12'b0};
-    assign acc[8] = {36'b01,~e[8],pp[8],add1[7], 14'b0};
+    // assign acc[8] = {36'b01,~e[8],pp[8],add1[7], 14'b0};
-    assign acc[9] = {34'b01,~e[9],pp[9],add1[8], 16'b0};
+    // assign acc[9] = {34'b01,~e[9],pp[9],add1[8], 16'b0};
-    assign acc[10] = {32'b01,~e[10],pp[10],add1[9], 18'b0};
+    // assign acc[10] = {32'b01,~e[10],pp[10],add1[9], 18'b0};
-    assign acc[11] = {30'b01,~e[11],pp[11],add1[10], 20'b0};
+    // assign acc[11] = {30'b01,~e[11],pp[11],add1[10], 20'b0};
-    assign acc[12] = {28'b01,~e[12],pp[12],add1[11], 22'b0};
+    // assign acc[12] = {28'b01,~e[12],pp[12],add1[11], 22'b0};
-    assign acc[13] = {26'b01,~e[13],pp[13],add1[12], 24'b0};
+    // assign acc[13] = {26'b01,~e[13],pp[13],add1[12], 24'b0};
-    assign acc[14] = {24'b01,~e[14],pp[14],add1[13], 26'b0};
+    // assign acc[14] = {24'b01,~e[14],pp[14],add1[13], 26'b0};
-    assign acc[15] = {22'b01,~e[15],pp[15],add1[14], 28'b0};
+    // assign acc[15] = {22'b01,~e[15],pp[15],add1[14], 28'b0};
-    assign acc[16] = {20'b01,~e[16],pp[16],add1[15], 30'b0};
+    // assign acc[16] = {20'b01,~e[16],pp[16],add1[15], 30'b0};
-    assign acc[17] = {18'b01,~e[17],pp[17],add1[16], 32'b0};
+    // assign acc[17] = {18'b01,~e[17],pp[17],add1[16], 32'b0};
-    assign acc[18] = {16'b01,~e[18],pp[18],add1[17], 34'b0};
+    // assign acc[18] = {16'b01,~e[18],pp[18],add1[17], 34'b0};
-    assign acc[19] = {14'b01,~e[19],pp[19],add1[18], 36'b0};
+    // assign acc[19] = {14'b01,~e[19],pp[19],add1[18], 36'b0};
-    assign acc[20] = {12'b01,~e[20],pp[20],add1[19], 38'b0};
+    // assign acc[20] = {12'b01,~e[20],pp[20],add1[19], 38'b0};
-    assign acc[21] = {10'b01,~e[21],pp[21],add1[20], 40'b0};
+    // assign acc[21] = {10'b01,~e[21],pp[21],add1[20], 40'b0};
-    assign acc[22] = {8'b01,~e[22],pp[22],add1[21], 42'b0};
+    // assign acc[22] = {8'b01,~e[22],pp[22],add1[21], 42'b0};
-    assign acc[23] = {6'b01,~e[23],pp[23],add1[22], 44'b0};
+    // assign acc[23] = {6'b01,~e[23],pp[23],add1[22], 44'b0};
-    assign acc[24] = {4'b01,~e[24],pp[24],add1[23], 46'b0};
+    // assign acc[24] = {4'b01,~e[24],pp[24],add1[23], 46'b0};
-    assign acc[25] = {~e[25],pp[25],add1[24], 48'b0};
+    // assign acc[25] = {~e[25],pp[25],add1[24], 48'b0};
-    assign acc[26] = {pp[26],add1[25], 50'b0};
+    // assign acc[26] = {pp[26],add1[25], 50'b0};
-    //*** resize adders
+    // //*** resize adders
-     generate
+    //  generate
-        for(i=0; i<9; i=i+1) begin
+    //     for(i=0; i<9; i=i+1) begin
-            add3comp2 #(.BITS(106)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
+    //         add3comp2 #(.BITS(106)) add1(.a(acc[i*3]), .b(acc[i*3+1]), .c(acc[i*3+2]), 
-                                           .carry(carryTmp[i][105:0]), .sum(lv1add[i*2+1]));
+    //                                        .carry(carryTmp[i][105:0]), .sum(lv1add[i*2+1]));
-            assign lv1add[i*2] = {carryTmp[i][104:0], 1'b0};
+    //         assign lv1add[i*2] = {carryTmp[i][104:0], 1'b0};
-        end
+    //     end
-     endgenerate
+    //  endgenerate
-     generate
+    //  generate
-        for(i=0; i<6; i=i+1) begin
+    //     for(i=0; i<6; i=i+1) begin
-            add3comp2 #(.BITS(106)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
+    //         add3comp2 #(.BITS(106)) add2(.a(lv1add[i*3]), .b(lv1add[i*3+1]), .c(lv1add[i*3+2]), 
-                                           .carry(carryTmp[i+9][105:0]), .sum(lv2add[i*2+1]));
+    //                                        .carry(carryTmp[i+9][105:0]), .sum(lv2add[i*2+1]));
-            assign lv2add[i*2] = {carryTmp[i+9][104:0], 1'b0};
+    //         assign lv2add[i*2] = {carryTmp[i+9][104:0], 1'b0};
-        end
+    //     end
-     endgenerate
+    //  endgenerate
-    generate
+    // generate
-        for(i=0; i<4; i=i+1) begin
+    //     for(i=0; i<4; i=i+1) begin
-            add3comp2 #(.BITS(106)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
+    //         add3comp2 #(.BITS(106)) add3(.a(lv2add[i*3]), .b(lv2add[i*3+1]), .c(lv2add[i*3+2]), 
-                                            .carry(carryTmp[i+15][105:0]), .sum(lv3add[i*2+1]));
+    //                                         .carry(carryTmp[i+15][105:0]), .sum(lv3add[i*2+1]));
-            assign lv3add[i*2] = {carryTmp[i+15][104:0], 1'b0};
+    //         assign lv3add[i*2] = {carryTmp[i+15][104:0], 1'b0};
-        end
+    //     end
-    endgenerate
+    // endgenerate
-    generate
+    // generate
-        for(i=0; i<2; i=i+1) begin
+    //     for(i=0; i<2; i=i+1) begin
-            add4comp2 #(.BITS(106)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
+    //         add4comp2 #(.BITS(106)) add4(.a(lv3add[i*4]), .b(lv3add[i*4+1]), .c(lv3add[i*4+2]), .d(lv3add[i*4+3]),
-                                            .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
+    //                                         .carry(carryTmp[i+19]), .sum(lv4add[i*2+1]));
-            assign lv4add[i*2] = {carryTmp[i+19][104:0], 1'b0};
+    //         assign lv4add[i*2] = {carryTmp[i+19][104:0], 1'b0};
-        end
+    //     end
-    endgenerate
+    // endgenerate
-    add4comp2 #(.BITS(106)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
+    // add4comp2 #(.BITS(106)) add5(.a(lv4add[0]), .b(lv4add[1]), .c(lv4add[2]), .d(lv4add[3]) ,
-                                    .carry(carryTmp[21]), .sum(s));
+    //                                 .carry(carryTmp[21]), .sum(s));
-    assign r = {carryTmp[21][104:0], 1'b0};
+    // assign r = {carryTmp[21][104:0], 1'b0};
-	// assign r = 106'b0;
+	assign r = 106'b0;
-	// assign s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm};
+	assign s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm};
 endmodule
--- a/wally-pipelined/src/fpu/FMA/expgen.v
+++ b/wally-pipelined/src/fpu/FMA/expgen.v
@ -19,7 +19,7 @@ module expgen(x[62:52], y[62:52], z[62:52],
 			   earlyres[62:52], earlyressel, bypsel[1], byppostnorm, 
 			   killprod,  sumzero, postnormalize, normcnt, infinity, 
 			   invalid, overflow, underflow, inf, 
-			   nan, xnan, ynan, znan, zdenorm, specialsel, 
+			   nan, xnan, ynan, znan, zdenorm, proddenorm, specialsel, 
 			   aligncnt, w[62:52], wbypass[62:52],
 			   prodof, sumof, sumuf, denorm0, ae[12:0]);
 /////////////////////////////////////////////////////////////////////////////
@ -28,36 +28,37 @@ module expgen(x[62:52], y[62:52], z[62:52],
 	input     	[62:52]  	y;         		// Exponent of multiplicand y
 	input     	[62:52]  	z;           	// Exponent of addend z
 	input     	[62:52]	 	earlyres;  		// Result from other FPU block
-	input     				earlyressel;    // Select result from other block
+	input     			earlyressel;    // Select result from other block
 	input     	[1:1] 		bypsel;         // Bypass X or Z
-	input     				byppostnorm;    // Postnormalize bypassed result
+	input     			byppostnorm;    // Postnormalize bypassed result
-	input     				killprod;    	// Z >> product
+	input     			killprod;    	// Z >> product
-	input     				sumzero;     	// sum exactly equals zero 
+	input     			sumzero;     	// sum exactly equals zero 
-	input     				postnormalize;  // postnormalize rounded result
+	input     			postnormalize;  // postnormalize rounded result
 	input     	[8:0]  		normcnt;     	// normalization shift count 
-	input     				infinity;    	// generate infinity on overflow 
+	input     			infinity;    	// generate infinity on overflow 
-	input     				invalid;     	// Result invalid
+	input     			invalid;     	// Result invalid
-	input     				overflow;    	// Result overflowed
+	input     			overflow;    	// Result overflowed
-	input     				underflow;   	// Result underflowed 
+	input     			underflow;   	// Result underflowed 
-	input     				inf;			// Some input is infinity
+	input     			inf;			// Some input is infinity
-	input     				nan;			// Some input is NaN
+	input     			nan;			// Some input is NaN
-	input     				xnan;			// X is NaN
+	input     			xnan;			// X is NaN
-	input     				ynan;			// Y is NaN
+	input     			ynan;			// Y is NaN
-	input     				znan;			// Z is NaN 
+	input     			znan;			// Z is NaN 
-	input     				zdenorm;		// Z is denorm
+	input     			zdenorm;		// Z is denorm
-	input     				specialsel;  	// Select special result
+	input     			proddenorm;		// product is denorm
 	input     			specialsel;  	// Select special result
 	output		[11:0]   	aligncnt;       // shift count for alignment shifter
-	output		[62:52]     w;           	// Exponent of result
+	output		[62:52]    	w;           	// Exponent of result
-	output		[62:52]     wbypass;     	// Prerounded exponent for bypass 
+	output		[62:52]     	wbypass;     	// Prerounded exponent for bypass 
-	output					prodof;         // X*Y exponent out of bounds 
+	output				prodof;         // X*Y exponent out of bounds 
-	output					sumof;          // X*Y+Z exponent out of bounds 
+	output				sumof;          // X*Y+Z exponent out of bounds 
-	output					sumuf;         // X*Y+Z exponent underflows 
+	output				sumuf;         // X*Y+Z exponent underflows 
-	output					denorm0;     	// exponent = 0 for denorm 
+	output				denorm0;     	// exponent = 0 for denorm 
 	output		[12:0]		ae;				//exponent of multiply
 	//   Internal nodes
-	wire 	[12:0]			aetmp;				// Exponent of Multiply
+
 	wire 	[12:0]			aligncnt0;		// Shift count for alignment
 	wire 	[12:0]			aligncnt1;		// Shift count for alignment
 	wire 	[12:0]			be;				// Exponent of multiply
@ -72,9 +73,11 @@ module expgen(x[62:52], y[62:52], z[62:52],
 	// Note that the exponent does not have to be incremented on a postrounding
 	//   normalization of X because the mantissa was already increased.   Report
 	//   if exponent is out of bounds 
 	assign ae = x + y  - 1023; 
-	assign prodof = (ae > 2046 && ~ae[12] && ~killprod);
+
 	assign ae = x + y  - 1023;
 	assign prodof = (ae > 2046 && ~ae[12]);
 	// Compute alignment shift count
 	// Adjust for postrounding normalization of Z.
@ -82,8 +85,10 @@ module expgen(x[62:52], y[62:52], z[62:52],
 	// check if a round overflows is shorter than the actual round and
 	// is masked by the bypass mux and two 10 bit adder delays.
-	assign aligncnt0 = z - ae[10:0] + 13'b0;
+	assign aligncnt0 = z - ae + 13'b0;// KEP use all of ae
-	assign aligncnt1 = z - ae[10:0] + 13'b1;
+	assign aligncnt1 = z - ae + 13'b1;	
 	//assign aligncnt0 = z - ae[10:0] + 13'b0;//original
 	//assign aligncnt1 = z - ae[10:0] + 13'b1;
 	assign aligncnt = bypsel[1] && byppostnorm ? aligncnt1 : aligncnt0;
 	// Select exponent (usually from product except in case of huge addend)
@ -118,13 +123,17 @@ module expgen(x[62:52], y[62:52], z[62:52],
 	// rounding mode.  NaNs are propagated or generated.
 	assign specialres = earlyressel ? earlyres :
-					invalid ? nanres :
+					invalid | nan ? nanres : // KEP added nan
 					overflow ? infinityres : 
 					inf ? 11'b11111111111 :
 					underflow ? 11'b0 : 11'bx;
 	assign infinityres = infinity ? 11'b11111111111 : 11'b11111111110;
 	// IEEE 754-2008 section 6.2.3 states:
 	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
 	// identical to the payload of one of the input NaNs if representable in the destination
 	// format. This standard does not specify which of the input NaNs will provide the payload."
 	assign nanres = xnan ? x : (ynan ? y : (znan? z : 11'b11111111111));
 	// A mux selects the early result from other FPU blocks or the 
--- a/wally-pipelined/src/fpu/FMA/flag.v
+++ b/wally-pipelined/src/fpu/FMA/flag.v
@ -13,31 +13,31 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 			 inf, nan, invalid, overflow, underflow, inexact);
 /////////////////////////////////////////////////////////////////////////////
-	input                  	xnan;        	// X is NaN 
+	input                  		xnan;        	// X is NaN 
-	input                  	ynan;        	// Y is NaN 
+	input                  		ynan;        	// Y is NaN 
-	input                 	znan;       	// Z is NaN 
+	input                 		znan;       	// Z is NaN 
-	input                  	xinf;        	// X is Inf
+	input                  		xinf;        	// X is Inf
-	input                 	yinf;       	// Y is Inf 
+	input                 		yinf;       	// Y is Inf 
-	input                  	zinf;        	// Z is Inf
+	input                  		zinf;        	// Z is Inf
-	input                  	prodof;         // X*Y overflows exponent
+	input                  		prodof;         // X*Y overflows exponent
-	input                  	sumof;          // X*Y + z underflows exponent
+	input                  		sumof;          // X*Y + z overflows exponent
-	input                  	sumuf;          // X*Y + z underflows exponent
+	input                  		sumuf;          // X*Y + z underflows exponent
-	input					psign; 			// Sign of product
+	input				psign; 		// Sign of product
-	input					zsign; 			// Sign of z
+	input				zsign; 		// Sign of z
-	input					xzero;			// x = 0
+	input				xzero;		// x = 0
-	input					yzero;			// y = 0
+	input				yzero;		// y = 0
-	input     	[1:0]  		v;				// R and S bits of result
+	input     	[1:0]  		v;		// R and S bits of result
-	output					inf;			// Some	source is Inf
+	output				inf;		// Some	source is Inf
-	output					nan;			// Some	source is NaN
+	output				nan;		// Some	source is NaN
-	output					invalid;		// Result is invalid	
+	output				invalid;	// Result is invalid	
-	output					overflow;		// Result overflowed	
+	output				overflow;	// Result overflowed	
-	output					underflow;		// Result underflowed	
+	output				underflow;	// Result underflowed	
-	output					inexact;		// Result is not an exact	number
+	output				inexact;	// Result is not an exact number
 	//   Internal nodes
-	wire					prodinf;		// X*Y larger than max possible
+	wire				prodinf;	// X*Y larger than max possible
-	wire					suminf;			// X*Y+Z larger than max possible
+	wire				suminf;		// X*Y+Z larger than max possible
 	// If any input is NaN, propagate the NaN 
@ -46,12 +46,14 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	// Same with infinity (inf - inf and 0 * inf don't propagate inf
 	//  but it's ok because illegal op takes higher precedence)
-	assign inf= xinf || yinf || zinf;
+	assign inf= xinf || yinf || zinf || suminf;//KEP added suminf 
 	//assign inf= xinf || yinf || zinf;//original
 	// Generate infinity checks
 	assign prodinf = prodof && ~xnan && ~ynan;
-	assign suminf = sumof && ~xnan && ~ynan && ~znan;
+	//KEP added if the product is infinity then sum is infinity
 	assign suminf = prodinf | sumof && ~xnan && ~ynan && ~znan;
 	// Set invalid flag for following cases:
 	//   1) Inf - Inf
@ -59,8 +61,7 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	//   3) Output = NaN (this is not part of the IEEE spec,  only 486 proj)
 	assign invalid = (xinf || yinf || prodinf) && zinf && (psign ^ zsign) ||
-					   xzero && yinf || yzero && xinf ||
+					   xzero && yinf || yzero && xinf;// KEP remove case 3) above
 					   nan;
 	// Set the overflow flag for the following cases:
 	//   1) Rounded multiply result would be out of bounds
--- a/wally-pipelined/src/fpu/FMA/fmac.v
+++ b/wally-pipelined/src/fpu/FMA/fmac.v
@ -103,7 +103,7 @@ module fmac(xrf, y, zrf, rn, rz, rp, rm,
 						   earlyres[62:52], earlyressel, bypsel[1], byppostnorm,
 						   killprod, sumzero, postnorrnalize, normcnt, 
 						   infinity, invalid, overflow, underflow, 
-						   inf, nan, xnan, ynan, znan, zdenorm, specialsel,
+						   inf, nan, xnan, ynan, znan, zdenorm, proddenorm, specialsel,
 						   aligncnt, w[62:52], wbypass[62:52],
 						   prodof, sumof, sumuf, denorm0, ae);
 // Instantiate special case detection across datapath & exponent path 
@ -120,7 +120,7 @@ assign wbypass[63] = w[63];
 // Instantiate control logic
 sign				sign(x[63], y[63], z[63], negsum0, negsum1, bs, ps, 
-					     killprod, rm, sumzero, nan, invalid, xinf, yinf, inf, 
+					     killprod, rm, overflow, sumzero, nan, invalid, xinf, yinf, zinf, inf, 
 						 w[63], invz, negsum, selsum1, psign); 
 flag				flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 						 psign, z[63], xzero, yzero, v[1:0],
--- a/wally-pipelined/src/fpu/FMA/normalize.v
+++ b/wally-pipelined/src/fpu/FMA/normalize.v
@ -18,12 +18,12 @@ module normalize(sum[157:0], normcnt, sumzero, bs, ps, denorm0, zdenorm, v[53:0]
 /////////////////////////////////////////////////////////////////////////////
 	input     	[157:0]  	sum;            // sum
 	input		[8:0] 		normcnt;     	// normalization shift count
-	input					sumzero;		// sum is zero
+	input				sumzero;	// sum is zero
-	input					bs;				// sticky bit for addend
+	input				bs;		// sticky bit for addend
-	input					ps;				// sticky bit for product
+	input				ps;		// sticky bit for product
-	input					denorm0;		// exponent = -1023
+	input				denorm0;	// exponent = -1023
-	input                  	zdenorm;        // Input Z is denormalized
+	input                  		zdenorm;        // Input Z is denormalized
-	output		[53:0]		v;				// normalized sum, R, S bits
+	output		[53:0]		v;		// normalized sum, R, S bits
 	// Internal nodes
--- a/wally-pipelined/src/fpu/FMA/round.v
+++ b/wally-pipelined/src/fpu/FMA/round.v
@ -19,37 +19,37 @@ module round(v[53:0], earlyres[51:0], earlyressel, rz, rn, rp, rm, wsign,
 			  w[51:0], postnormalize, infinity, specialsel);
 /////////////////////////////////////////////////////////////////////////////
-	input		[53:0]		v;				// normalized sum, R, S bits
+	input		[53:0]		v;		// normalized sum, R, S bits
-	input		[51:0]		earlyres;		// result from other FPU blocks
+	input		[51:0]		earlyres;	// result from other FPU blocks
-	input 					earlyressel; 	// use result from other FPU blocks
+	input 				earlyressel; 	// use result from other FPU blocks
-	input					rz;				// Round toward zero
+	input				rz;		// Round toward zero
-	input					rn;				// Round toward	nearest
+	input				rn;		// Round toward	nearest
-	input					rp;				// Round toward	plus infinity
+	input				rp;		// Round toward	plus infinity
-	input					rm;				// Round toward	minus infinity
+	input				rm;		// Round toward	minus infinity
-	input					wsign;			// Sign of result
+	input				wsign;		// Sign of result
-	input 					invalid;		// Trap on infinity, NaN, denorm
+	input 				invalid;	// Trap on infinity, NaN, denorm
-	input					overflow;		// Result overflowed
+	input				overflow;	// Result overflowed
-	input					underflow;		// Result underflowed
+	input				underflow;	// Result underflowed
-	input					inf;			// Some input is infinity
+	input				inf;		// Some input is infinity
-	input					nan;			// Some input is NaN
+	input				nan;		// Some input is NaN
-	input					xnan;			// X is NaN
+	input				xnan;		// X is NaN
-	input					ynan;			// Y is NaN
+	input				ynan;		// Y is NaN
-	input					znan;			// Z is NaN
+	input				znan;		// Z is NaN
-	input		[51:0]		x;				// Input X
+	input		[51:0]		x;		// Input X
-	input		[51:0]		y;				// Input Y
+	input		[51:0]		y;		// Input Y
-	input		[51:0]		z;				// Input Z
+	input		[51:0]		z;		// Input Z
-	output		[51:0]		w; 				// rounded result of FMAC
+	output		[51:0]		w; 		// rounded result of FMAC
-	output					postnormalize; 	// Right shift 1 for post-rounding norm
+	output				postnormalize; 	// Right shift 1 for post-rounding norm
-	output					infinity;    	// Generate infinity on overflow
+	output				infinity;    	// Generate infinity on overflow
-	output					specialsel;  	// Select special result
+	output				specialsel;  	// Select special result
 	// Internal nodes
-	wire					plus1;			// Round by adding one 
+	wire				plus1;		// Round by adding one 
-	wire		[52:0]		v1;				// Result + 1 (for rounding)
+	wire		[52:0]		v1;		// Result + 1 (for rounding)
-	wire		[51:0]		specialres;		// Result of exceptional case 
+	wire		[51:0]		specialres;	// Result of exceptional case 
 	wire		[51:0]		infinityres;	// Infinity or largest real number
-	wire		[51:0]		nanres;			// Propagated or generated NaN 
+	wire		[51:0]		nanres;		// Propagated or generated NaN 
 	// Compute if round should occur.  This equation is derived from
 	// the rounding tables.
@ -77,7 +77,7 @@ module round(v[53:0], earlyres[51:0], earlyressel, rz, rn, rp, rm, wsign,
 	assign specialsel = earlyressel || overflow || underflow || invalid ||
 							nan || inf;
 	assign specialres = earlyressel ? earlyres : 
-						 invalid ? nanres : 
+						 invalid | nan ? nanres : //KEP added nan
 						 overflow ? infinityres : 
 						 inf ? 52'b0 :
 						underflow ? 52'b0 : 52'bx;  // default to undefined 
@ -93,6 +93,11 @@ module round(v[53:0], earlyres[51:0], earlyressel, rz, rn, rp, rm, wsign,
 	// NaN inputs are already quiet, we don't have to force them quiet.
 	// assign nanres = xnan ? x: (ynan ? y : (znan ? z : {1'b1, 51'b0})); // original
 	// IEEE 754-2008 section 6.2.3 states:
 	// "If two or more inputs are NaN, then the payload of the resulting NaN should be 
 	// identical to the payload of one of the input NaNs if representable in the destination
 	// format. This standard does not specify which of the input NaNs will provide the payload."
 	assign nanres = xnan ? {1'b1, x[50:0]}: (ynan ? {1'b1, y[50:0]} : (znan ? {1'b1, z[50:0]} : {1'b1, 51'b0}));// KEP 210112 add the 1 to make NaNs quiet
 	// Select result with 4:1 mux
--- a/wally-pipelined/src/fpu/FMA/sign.v
+++ b/wally-pipelined/src/fpu/FMA/sign.v
@ -10,8 +10,8 @@
 /////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////
-module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
+module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm, overflow,
-			 sumzero, nan, invalid, xinf, yinf, inf, wsign, invz, negsum, selsum1, psign);
+			 sumzero, nan, invalid, xinf, yinf, zinf, inf, wsign, invz, negsum, selsum1, psign);
 ////////////////////////////////////////////////////////////////////////////I
 	input					xsign;			// Sign of X 
@ -23,11 +23,13 @@ module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
 	input					ps;				// sticky bit from product
 	input					killprod;		// Product forced to zero
 	input					rm;				// Round toward minus infinity
 	input					overflow;				// Result overflowed
 	input					sumzero;		// Sum = 0
 	input					nan;			// Some input is NaN
 	input					invalid;		// Result invalid
 	input					xinf;			// X = Inf
 	input					yinf;			// Y = Inf
 	input					zinf;			// Z = Inf
 	input					inf;			// Some input = Inf
 	output					wsign;			// Sign of W 
 	output					invz;			// Invert addend into adder
@ -47,13 +49,13 @@ module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
 	assign psign = xsign ^ ysign;
 	// Invert addend if sign of Z is different from sign of product
-	assign invz = zsign ^ psign;
+	assign invz = (zsign ^ psign);
 	// Select +1 mode for adder and compute if result must be negated
 	// This is done according to cases based on the sticky bit.
 	always @(invz or negsum0 or negsum1 or bs or ps)
 		begin
-			if (~invz) begin               // both inputs have same sign
+			if (~invz) begin               // both inputs have same sign //KEP if overflow 
 				negsum = 0;
 				selsum1 = 0;
 			end else if (bs) begin        // sticky bit set on addend
@ -85,9 +87,8 @@ module sign(xsign, ysign, zsign, negsum0, negsum1, bs, ps, killprod, rm,
 	//			 sum/difference shall be -0.  However, x+x = x-(-X) retains the same sign as x even when x is zero."
 	assign zerosign = (~invz && killprod) ? zsign : rm;
-	assign infsign = psign; //KEP 210112 keep the correct sign when result is infinity
+	assign infsign = zinf ? zsign : psign; //KEP 210112 keep the correct sign when result is infinity
-	// assign infsign = xinf ? (yinf ? psign : xsign) : yinf ? ysign : zsign;//original
+	//assign infsign = xinf ? (yinf ? psign : xsign) : yinf ? ysign : zsign;//original
-	assign wsign =invalid? 0 : (inf ? infsign:
+	assign wsign = invalid ? 0 : (inf ? infsign :(sumzero ? zerosign : psign ^ negsum));
 								(sumzero ? zerosign : psign ^ negsum));
 endmodule
--- a/wally-pipelined/src/fpu/FMA/special.v
+++ b/wally-pipelined/src/fpu/FMA/special.v
@ -14,23 +14,23 @@ module special(x[63:0], y[63:0], z[63:0], ae, xzero, yzero, zzero,
 				xnan, ynan, znan, xdenorm, ydenorm, zdenorm, proddenorm, xinf, yinf, zinf);
 /////////////////////////////////////////////////////////////////////////////
-	input   		[63:0]     	x;             // Input x
+	input   	[63:0]     	x;              // Input x
 	input     	[63:0]     	y;           	// Input Y
 	input      	[63:0]    	z;            	// Input z 
-	input		[12:0]			ae;			// exponent of product
+	input		[12:0]		ae;		// exponent of product
-	output						xzero;			// Input x = 0
+	output				xzero;		// Input x = 0
-	output						yzero;			// Input y = 0
+	output				yzero;		// Input y = 0
-	output						zzero;			// Input z = 0
+	output				zzero;		// Input z = 0
-	output						xnan;			// x is NaN
+	output				xnan;		// x is NaN
-	output						ynan;			// y is NaN
+	output				ynan;		// y is NaN
-	output						znan;			// z is NaN
+	output				znan;		// z is NaN
-	output						xdenorm;		// x is denormalized
+	output				xdenorm;	// x is denormalized
-	output						ydenorm;		// y is denormalized
+	output				ydenorm;	// y is denormalized
-	output						zdenorm;		// z is denormalized
+	output				zdenorm;	// z is denormalized
-	output						proddenorm;		// product is denormalized
+	output				proddenorm;	// product is denormalized
-	output						xinf;			// x is infinity
+	output				xinf;		// x is infinity
-	output						yinf;			// y is infinity
+	output				yinf;		// y is infinity
-	output						zinf;			// z is infinity
+	output				zinf;		// z is infinity
 	// In the actual circuit design, the gates looking at bits
 	// 51:0 and at bits 62:52 should be shared among the various detectors.
--- a/wally-pipelined/src/fpu/FMA/tbgen/results.dat
+++ b/wally-pipelined/src/fpu/FMA/tbgen/results.dat
@ -1 +1,130 @@
-0020000803ffffff bfcb4181a9468e24 000fffffffffffff 7fe2f9c2bca0f33c 00092f9c2bca0f33  Wrong zdenorm 18
+0000000000000001 7fdffffeffffffbf 4000000000080004 4007ffffc007fff5 4000000000080005  Wrong xdenorm 85959
 0000000000000001 c3ded4d0b02cd6aa 000c158ac12ac439 83eed4d0b02cd6ae 80bed1cb4d7c8bf9  Wrong xdenorm zdenorm 91485
 c15000000010001f 434ffffffffffffe 47f55792228596a0 c7e550dbbaf4d2c2 47f557922285969f  Wrong 97625
 0000000000000001 7fe0000000000001 4340000000000001 4340000000000002 4340000000000001  Wrong xdenorm 99467
 0000000000000001 bfdffffffffffffe 801fffffffffbfc0 8021ffffffffdfe0 801fffffffffbfc0  Wrong xdenorm 117273
 0000000000000001 ffe0000000000000 40d4000040000000 40d3ffc040000000 40d4000040000000  Wrong xdenorm 133851
 000fffffffffffff 3fcffc007fffffff 800fffffffffffff 800800ffe0000000 800c007fefffffff  Wrong xdenorm zdenorm 147973
 000fffffffffffff 3feffffffffffffe 000ffffffffffffe 001ffffffffffffd 001ffffffffffffc  Wrong xdenorm zdenorm 154727
 000ffffffffffffe 41dffffffffff900 0000000000000001 02000000000ffc7e 01fffffffffff8fc  Wrong xdenorm zdenorm 230863
 0010000000000000 bf4fdffffff7fffe 800ffffffffffffe 801003fbfffffeff 801003fbfffffefe  Wrong zdenorm 308227
 0010000000000000 be6fffffbffffff7 8000000000000000 8000000000000000 800000000fffffe0  Wrong w=-zero unflw 313753
 0010000000000001 bcafffffffffffff 801fffffffffffff 8000000000000000 8020000000000000  Wrong w=-zero unflw 392345
 0010000000000001 bfe0000000000001 800ffffffffffffe 8018000000000000 8017ffffffffffff  Wrong zdenorm 397871
 802000003ffffbff c3cfffffffffffd7 0000000000000001 040000003ffffbeb 040000003ffffbea  Wrong zdenorm 448219
 dc10000000001eff 0000000000000001 802d63f274ada691 9c20000000001f01 98f0000000001eff  Wrong ydenorm 489971
 001ffffffffffffe 3fddfbffffffffff 000ffffffffffffe 001efdfffffffffe 001efdfffffffffd  Wrong zdenorm 551371
 3ca0000000000000 0000000000000001 000e8d6ac606e59d 000e8d6ac606e59e 000e8d6ac606e59d  Wrong ydenorm zdenorm 559353
 3ca0000000000000 434ffffffffffffe c019cab46f8c90a7 c011cab46f8c90a7 c011cab46f8c90a8  Wrong 586983
 3ca0000000000001 bfe000000fffdfff 3fee60af9e2e4b00 bfa9f5061d1b5008 3fee60af9e2e4aff  Wrong 649611
 3ca0000000000001 7fe0000000000000 ffefffffffffffff 7ca8000000000000 ffeffffffffffffe  Wrong 657593
 44f0000000000dff 000000007fbffffe 801ffffffffffffe 05000000ff800dfb 03bfefffff801bf0  Wrong ydenorm 680311
 3ca0000000000001 bfffffffffffffff 3ff0007ffffffc00 bfefff0000000802 3ff0007ffffffbff  Wrong 680925
 3cafffffffffffff 3caffffffffffffe bcaffffffffffffe 397fffffffffffff bcaffffffffffffc  Wrong 707327
 3cafffffffffffff c01ffffffffffffe c02cbe486a2b0809 c02cbe486a2b0809 c02cbe486a2b080a  Wrong 758289
 000667c5d67e1d85 3fdfeffffdffffff 001fffffffffffff 002398247cab1886 002199247ccb1886  Wrong xdenorm 763201
 000007fffffffffe 3f6ffffffe01fffe 000ffffffffffffe 00100807ffff7fff 00100007ffffff7e  Wrong xdenorm zdenorm 768727
 3caffffffffffffe 4060000001000006 4070001fffff7ffe 4070001fffff7ffe 4070001fffff7fff  Wrong 771183
 3caffffffffffffe 3fdfffffffffffff 3fe0000000000000 3fe0000000000000 3fe0000000000001  Wrong 779165
 bfd7ffffffbfffff 4000000000000000 3fffffffffffffff 3ff40000001fffff 3ff4000000200000  Wrong 787147
 3caffffffffffffe c00ffffffffffffe c01000000020007f c01000000020007f c010000000200080  Wrong 824601
 3caffffffffffffe c00fffffffffff08 4010000000000001 4010000000000001 4010000000000000  Wrong 827671
 800000000000007e ffd26a0f710537a9 c01b3b74de550046 c018ee32f034592d c01b3b74de550022  Wrong xdenorm 861441
 47f9aa99d39dd7d8 0000000000000001 8000000000000000 0809aa99d39dd7db 04d9aa99d39dd7d8  Wrong ydenorm 908719
 bfef000004000000 c000000000000000 c34ff80000000006 42afffffffffebe0 c34ff80000000005  Wrong 1031519
 bfe0010000007fff 3ff00003ffffff7f 4340000000000000 4340000000000000 433fffffffffffff  Wrong 1039501
 3fdffffffffffffe 000ffffffffffffe 8000000000000001 fcdfffffffffffff 0007fffffffffffe  Wrong ydenorm zdenorm 1049939
 802000007fffffbf 400ffffffffffffe 8000000000000001 804100007fffffbe 804000007fffffbe  Wrong zdenorm 1068973
 3fdffffffffffffe bfffffffffffffff 3fefffffffffffff 3caffffffffffffe 3cafffffffffffff  Wrong 1099673
 c7fffffffb7fffff 37efffffffdffbff c34000003ffffffc c34000003ffffffc c34000003ffffffd  Wrong 1104585
 3fe0000000000001 3ca0000000000000 bcaffffffffffffe bca7fffffffffffd bca7fffffffffffe  Wrong 1193615
 bfe00000000effff 800fffffffffffff 8010000000000001 8000000000000000 8007fffffff88002  Wrong w=-zero ydenorm unflw 1223701
 3feffffffffffffe 3ff0000000000000 bfefffffffffffff bc9ffffffffffffc bca0000000000000  Wrong 1342817
 3feffffffffffffe 801fffffffffffff 800007fffffbfffe 802401fffffefffe 802003fffffdfffe  Wrong zdenorm 1366149
 bfd0002000007fff 0000000000000001 0010000000000001 0000000000000000 0010000000000001  Wrong ydenorm unflw 1466845
 0003476357ebf517 7fe000004000003f 8000000000000000 3ff68ec70a130546 3fda3b1b284c141d  Wrong xdenorm 1503685
 4000002003fffffe bc4fffffbffffffe bfbfffffffffe3ff bfbfffffffffe3ff bfbfffffffffe400  Wrong 1635081
 3ca00ffdffffffff 3fdffffffffffffe bfe0000000000001 bfe0000000000001 bfe0000000000000  Wrong 1687885
 801fffffbf000000 4012b6da70c3decc 0000000000000006 8041b6da4ac07317 8042b6da4ac07316  Wrong zdenorm 1753583
 b7ff7ffffffff000 7fe0000000000000 78b01fffefffffff 78b01f03efffffff 78b01f03f0000000  Wrong 1843841
 400fffffffffffff 8000000000000001 801fffffffffffff 8030000000000000 8020000000000001  Wrong ydenorm 1851209
 00003fefffffffff ffeffffffffffffe 8000000000000000 c0007fdffffffffd bfaff7ffffffff7e  Wrong xdenorm 1881295
 800000000003fffe 578284b14dfcc6e4 8000000000000000 979284b14e060938 958284a80ba41fe6  Wrong xdenorm 1989973
 bfdeffffff000000 002ffffffffc3ffe 0000002003fffffe 8016ffeffcfc5dff 801effdffafc5e00  Wrong zdenorm 2018831
 401fffffffffffff 3fdffffffffffffe c340000000400002 433fffffff800000 c340000000400000  Wrong 2106633
 401fffffffffffff 4010000000000001 c050ffffffff0000 c041fffffffdffff c041fffffffe0000  Wrong 2117685
 4340000000000000 3fd0000000000000 3fd0000000000001 4320000000000000 4320000000000001  Wrong 2243555
 bcb58ba32df145e0 3fbe0000003fffff 3fe0000000000000 3fe0000000000000 3fdfffffffffffff  Wrong 2365741
 bfed82e3c6c037db 3ff0000000000000 4340000000000000 4340000000000000 433fffffffffffff  Wrong 2389687
 3fdfffffffff7000 bcaffffffffffffe 3fe0000000000001 3fe0000000000001 3fe0000000000000  Wrong 2417317
 8000000002000000 ff100001efffffff 8d261bb2da873976 3f200001f400007b 3d800001efffffff  Wrong xdenorm 2422229
 bb0fb893e0decb72 c1cffff7ffbfffff c03ffc0000003fff c03ffc0000003fff c03ffc0000003ffe  Wrong 2546871
 800000000e000000 3fffffffffffffff 034ffff80ffffffe 034ffff80ffffffc 034ffff80ffffffe  Wrong xdenorm 2600903
 7fe0000000000001 4000000000000000 ffefffffffffffff 7ff0000000000000 7cb8000000000000  Wrong w=+inf 2602745
 7fe0000000000001 8000000000000001 c010000000000001 c014000000000002 c010000000000002  Wrong ydenorm 2619323
 7fe0000000000001 bcafffffffffffff 7fefffffdffffffc fe70000002800000 7fefffffdffffffb  Wrong 2626077
 7fefffffffffffff 0000000000000001 37ffffff7ffffeff 4000000000000001 3ccfffffffffffff  Wrong ydenorm 2653707
 3feffffffffbfffd 3ca0000000000001 bfe0000000000001 3fe0000000000000 bfe0000000000000  Wrong 2660461
 000ffffffff00006 bfe0000000000001 0000000000000001 7dfffff400000002 8007fffffff80002  Wrong xdenorm zdenorm 2770981
 fd61dd32fb8e3b2c 0000000000000001 801ffffffffffffe bd71dd32fb8e3b2e ba41dd32fb8e3b2c  Wrong ydenorm 3003073
 000fff000000000f 3ff00800001fffff 8010000000000000 0000000000000000 000006ff801ffe0e  Wrong xdenorm unflw 3117277
 8000000000000001 400effffff000000 0010000000000000 8000000000000000 000ffffffffffffc  Wrong w=-zero xdenorm unflw 3143065
 8000000000000001 40211275ffe5ee3c 0000000000000001 802e24ebffcbdc7c 8000000000000008  Wrong xdenorm zdenorm 3148591
 8000000000000001 c1c01ffffffffefe 03100007fe000000 0310000900000000 03100007fe000000  Wrong xdenorm 3152889
 8000000000000001 3fe0000000000001 800fffffffffffff 8014000000000000 8010000000000000  Wrong xdenorm zdenorm 3155345
 8000000000000001 7fef848cc01517b4 c340000000000001 c340000000000002 c340000000000001  Wrong xdenorm 3170695
 8000000000000001 7feffffffffffffe 410ffffffc007ffe 410fffeffc007ffe 410ffffffc007ffe  Wrong xdenorm 3173151
 8000000000000001 bffffffffffffffe 002e000000100000 0033000000080000 002e000000100001  Wrong xdenorm 3195255
 8000000000000001 ffe0000000000000 3feffffffffffffe 4000000000000000 3ff0000000000001  Wrong xdenorm 3205079
 8000000000000001 ffe0000000000001 c1ffbfffdffffffe c1ffbfffdfeffffe c1ffbfffdffffffe  Wrong xdenorm 3206307
 800fffffffffffff 3ff0000000000000 001ffffffffffffe 0000000000000000 000fffffffffffff  Wrong xdenorm unflw 3227183
 3e7ffffffefc0000 bfffffffffffffff 41c0ea1ad0c683e5 c1be2bca5e72f83a 41c0ea1ad0c683e3  Wrong 3264023
 3fffa9456a66b8c6 3caffffffffffffe c00ffffffffffffe c00ffffffffffffe c00ffffffffffffd  Wrong 3290425
 800ffffffffffffe 3fe0000000000001 0010000400000010 0000000000000000 0008000400000011  Wrong xdenorm unflw 3294723
 800ffffffffffffe 3fd0000000007ffe 000fffffffffffff 0007ffffffffc001 000bffffffffe000  Wrong xdenorm zdenorm 3308845
 800ffffffffffffe c010000000000000 800dfede47fbc1e2 002880486e010f84 00290090dc021f0b  Wrong xdenorm zdenorm 3338931
 bfdffc90d6e1fc1f 3ca1ffffffffeffe bfe66ad464a87aac bfe66ad464a87aac bfe66ad464a87aad  Wrong 3367175
 8010000000000000 bfe0000000000000 000fffffffffffff 0018000000000000 0017ffffffffffff  Wrong zdenorm 3398489
 7fe800000000003e 8004de935d68d1e8 801fffffffffffff c0034ddd0c1d3b0e bfed37743074ebbb  Wrong ydenorm 3437785
 8010000000000001 bfefffffffffffff 801ffffffffffffe 8000000000000000 800ffffffffffffe  Wrong w=-zero unflw 3470327
 801fffffffffffff bfdffffffffffffe 0000000000021fff 0018000000010ffe 0010000000021ffe  Wrong zdenorm 3537867
 0005e0458a43fbdb 7fdfffbfffffffff 0000000000000000 3ffbc0539371cea5 3fe780e726e39d4b  Wrong xdenorm 3691981
 bca0000000000001 3cafffffffffffff 3cafffffffffffff b970000000000000 3caffffffffffffe  Wrong 3707945
 bca0000000000001 3fefffffffffffff bff0400000000400 bff0400000000400 bff0400000000401  Wrong 3714699
 bca0000000000001 c34ffffffffffffe c000000000000000 0000000000000000 b980000000000000  Wrong 3763205
 bcafffffffffffff 3fc200001fffffff 3fdffff00000ffff 3fdffff00000ffff 3fdffff00000fffe  Wrong 3788379
 bcafffffffffffff 800ffffffffffffe 8000000000000000 0000000000000000 0000000000000001  Wrong ydenorm unflw 3807413
 bcaffffffffffffe 3fdffffffffffffe 3ff0000000000000 3ff0000000000000 3fefffffffffffff  Wrong 3851621
 bcaffffffffffffe 001ffffffffc0000 8000000000000001 8000000000000005 8000000000000003  Wrong zdenorm 3878023
 7fec5fed92358a74 400000001bffffff ffefc0003ffffffe 7ff0000000000000 7fe8ffdb47bad466  Wrong w=+inf 3889689
 bfdfffffffffffff 000fffffffffffff 0000000000000000 8000000000000000 8007ffffffffffff  Wrong w=-zero ydenorm unflw 4050557
 bfdfffffffffffff 8000000000000001 8010000000800400 8000000000000000 8010000000800400  Wrong w=-zero ydenorm unflw 4084941
 bfdfffffffffffff bff0000000000000 bfe0000000000001 bca7ffffffffffff bca8000000000000  Wrong 4100291
 bff400003ffffffe bfeffffffffffffe 434fffffffffffff 434fffffffffffff 4350000000000000  Wrong 4169059
 43f00002003ffffe 8000000000000001 0010000000000000 8400000200400000 80cffe04007ffffc  Wrong ydenorm 4224319
 bfe0000000000000 801fffffffffffff 00000007fff80000 00180003fffc0000 00100007fff80000  Wrong zdenorm 4228617
 bfe0000000000000 c000000000000001 c00ffffffffffffe c007fffffffffffd c007fffffffffffe  Wrong 4243967
 bfcfdffffeffffff 8000000000000001 000fffffffffe080 0011fdffffeff040 000fffffffffe080  Wrong ydenorm zdenorm 4573685
 bfffffffffffffff 0000000000000001 8010000000000001 8020000000000001 8010000000000003  Wrong ydenorm 4608683
 bfffffffffffffff 3cafffffffffffff 3ff00000040001ff bfeffffff7fffc06 3ff00000040001fd  Wrong 4615437
 d2b6d8b0e4fde949 0000000000000001 0011fffffffffeff 92c6d8b0e4fde94c 8f96d8b0e4fde949  Wrong ydenorm 4678679
 3fd07dfffffffffe 8010000000000001 0000000000000001 7fef040000000006 80041f7fffffffff  Wrong zdenorm 4716133
 bffffffffffffffe bfffffffffffffff c00ffffffffffffe bcbffffffffffffc bcbffffffffffffe  Wrong 4730255
 c000000000000001 00000000004fffff 801ffffffffffffe 80280000004fffff 80200000004ffffe  Wrong ydenorm 4839547
 c00982d68cfe066b 000ffffffffffffe 8000000000000001 802d82d68cfe0668 802982d68cfe0668  Wrong ydenorm zdenorm 4959277
 346ffffffffffeef 480ffffeffffffe0 3fdfffffffffffff 3fdfffffffffffff 3fe0000000000000  Wrong 4962961
 c01fffffffffffff 0007ffffffff0000 8000000000000000 803ffffffffdffff 802ffffffffbffff  Wrong ydenorm 5176633
 c01fffffffffffff bfc08000000fffff 434ffffffffffffe 434ffffffffffffe 434fffffffffffff  Wrong 5193211
 c3a000000fffff7f 80000000000005fe 001000003ffffbff 03b0000010000b7b 0127f80817f81f3f  Wrong ydenorm 5450477
 0012000000000001 4000000000000001 000909a97b1f06a1 0028426a5ec7c1aa 002684d4bd8f8353  Wrong zdenorm 5535209
 ffe0000000000000 8000000000000001 c03000000000003f c02e00000000007e c03000000000003f  Wrong ydenorm 5621169
 ffe0000000000001 3ca0c6fe6997e5e2 7fefffffffffffff fca8637f34cbf2f2 7feffffffffffffe  Wrong 5673973
 ffe0000000000001 800fffffffffffff c340000000000001 4340000000000000 c340000000000000  Wrong ydenorm 5691779
 43f4595959dece4b 8000000000000001 801fffffffffffff 8404595959dece4e 80d45b5959dece4b  Wrong ydenorm 5760547
 ffefffffffffffff 3ca14e19e3a06f13 7fe00000000ff7fe ffdfffffffe01006 7fe00000000ff7fd  Wrong 5783265
 ffeffffffffffffe 8000000000000001 000fffffffffffff 4000000000000001 3ccffffffffffffe  Wrong ydenorm zdenorm 5829929
 ffeffffffffffffe 800001fffbffffff bac0000ffeffffff 400003fff7fffffd 3f5fffbfffffeffe  Wrong ydenorm 5832999
 bca0000004000080 bff0000000000009 bff0000000000001 3fefffffffffffff bff0000000000000  Wrong 5841595
 3fffffffff9ffffe 800000000f7ffffe 800ffdffbffffffe 801ffefffecffffa 800ffdffdefffffa  Wrong ydenorm zdenorm 5887031
 41ccc32b421f1ac0 8000000000000001 802ffffc0000001e 81dcc32b461f1a44 802ffffc1cc32b60  Wrong ydenorm 5899925
 41ffffffffffff87 8000000000000001 0000000000000000 820fffffffffff8b 8000000200000000  Wrong ydenorm 6039335
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.c
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.c
@ -14,24 +14,31 @@ void main() {
 	fp = fopen("testFloat","r");
 	fq = fopen("tb.v","a");
 	system("cp tbhead.v tb.v");
-	int k=0;
+	long k=0L;
-	for(k=0; k<91 && !feof(fp); k++) {
+	for(; !feof(fp); k++) {
 		//3FDBFFFFFFFFFF7F DE608000000001FF 43CFED83C17EDBD0 DE4CE000000002F9 01
 		// b68ffff8000000ff_3f9080000007ffff_b6307ffbe0080080_00001
-        char ch;
+                char ch;
-		int i,j;
+		int i,j,n;
 		char *ln;
 		char xrf[17];
 		char y[17];
 		char zrf[17];
 		char ans[81];
 		char flags[3];
 		int rn,rz,rm,rp;
-		{
+		long stop = 6039335;
-  //my_string = (char *) malloc (nbytes + 1);
+		int debug = 0;
-  //bytes_read = getline (&my_string, &nbytes, stdin);
+		//my_string = (char *) malloc (nbytes + 1);
-			if(getline(&ln,&nbytes,fp) < 0) break;
+		//bytes_read = getline (&my_string, &nbytes, stdin);
-			//fprintf(stderr,"%s\n", ln);
+	
 		for(n=0; n < 613; n++) {//613 for 10000
 			if(getline(&ln,&nbytes,fp) < 0 || feof(fp)) break;
 			if(k == stop && debug == 1) break;
 			k++;
 		}
 		//fprintf(stderr,"%s\n", ln);
 		if(!feof(fp)) {
 			strncpy(xrf,   ln,     16); xrf[16]=0;
 			strncpy(y,    &ln[17], 16); y[16]=0;
@ -46,71 +53,80 @@ void main() {
 			fprintf(fq,"    zrf = 64'h%s;\n",zrf);
 			fprintf(fq,"    ans = 64'h%s;\n", ans);
 			// fprintf(fq,"    flags = 5'h%s;\n", flags);
-		}
+	
-		{
+			{
-			//rn=1; rz=0; rm=0; rp=0;
+				//rn=1; rz=0; rm=0; rp=0;
-			fprintf(fq,"    rn = %d;\n",1);
+				fprintf(fq,"    rn = %d;\n",1);
-			fprintf(fq,"    rz = %d;\n", 0);
+				fprintf(fq,"    rz = %d;\n", 0);
-			fprintf(fq,"    rm = %d;\n", 0);
+				fprintf(fq,"    rm = %d;\n", 0);
-			fprintf(fq,"    rp = %d;\n", 0);
+				fprintf(fq,"    rp = %d;\n", 0);
-		}
+			}
-		{
+			{
-			fprintf(fq,"    earlyres = 64'b0;\n");
+				fprintf(fq,"    earlyres = 64'b0;\n");
-			fprintf(fq,"    earlyressel = 0;\n");
+				fprintf(fq,"    earlyressel = 0;\n");
-		}		
+			}		
-		{
+			{
-			fprintf(fq,"    bypsel= 2'b0;\n"); //, bysel);
+				fprintf(fq,"    bypsel= 2'b0;\n"); //, bysel);
-			fprintf(fq,"    bypplus1 = 0;\n"); //, byp1);
+				fprintf(fq,"    bypplus1 = 0;\n"); //, byp1);
-			fprintf(fq,"    byppostnorm = 0;\n"); //, bypnorm);
+				fprintf(fq,"    byppostnorm = 0;\n"); //, bypnorm);
-		}
+			}
-		fprintf(fq,"#10\n");
+			fprintf(fq,"#10\n");
-	// IEEE 754-2008 section 6.3 states "When ether an input or result is NaN, this standard does not interpret the sign of a NaN."
+			// IEEE 754-2008 section 6.3 states "When ether an input or result is NaN, this standard does not interpret the sign of a NaN."
-		//fprintf(fq,"	$fwrite(fp, \"%%h %%h %%h %%h \",xrf,y,w, ans);\n");	
+			//fprintf(fq,"	$fwrite(fp, \"%%h %%h %%h %%h \",xrf,y,w, ans);\n");	
-		fprintf(fq,"    // IEEE 754-2008 section 6.3 states: \"When ether an input or result is NaN, this\n");
+			fprintf(fq,"    // IEEE 754-2008 section 6.3 states: \"When ether an input or result is NaN, this\n");
-		fprintf(fq,"    //                                     standard does not interpret the sign of a NaN.\"\n");
+			fprintf(fq,"    //                                     standard does not interpret the sign of a NaN.\"\n");
- 		fprintf(fq,"	nan = (w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000)  ||\n");
+			fprintf(fq,"	wnan = &w[62:52] && |w[51:0]; \n");
- 		fprintf(fq,"	      (w >  64'hFFF8000000000000 && w <  64'hFFF8000000000000 ) ||\n");
+			fprintf(fq,"	xnan = &xrf[62:52] && |xrf[51:0]; \n");
- 		fprintf(fq,"	      (w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) ||\n");
+			fprintf(fq,"	ynan = &y[62:52] && |y[51:0]; \n");
- 		fprintf(fq,"	      (w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff );\n");
+			fprintf(fq,"	znan = &zrf[62:52] && |zrf[51:0]; \n");
-		// fprintf(fq,"    if(!(~(|xrf[62:52]) && |xrf[51:0] || ~(|y[62:52]) && |y[51:0])) begin\n"); 
+			fprintf(fq,"	ansnan = &ans[62:52] && |ans[51:0]; \n");
-																						// not looknig at negative zero results right now
+			fprintf(fq,"	xnorm = ~(|xrf[62:52]) && |xrf[51:0] ? {xrf[50:0], 1'b0} : xrf; \n");
-		//fprintf(fq,"	  if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) && !(w == 64'h8000000000000000 && ans == 64'b0)) begin\n"); 
+			fprintf(fq,"	ynorm = ~(|y[62:52]) && |y[51:0] ? {y[50:0], 1'b0} : y;\n");
-		fprintf(fq,"	if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) ) begin\n"); 
+			fprintf(fq,"	s = ({54'b1,xnorm} + (bypsel  && bypplus1))  *  {54'b1,ynorm}; \n");
-		fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",xrf,y, zrf, w, ans);\n");
+			// fprintf(fq,"    if(!(~(|xrf[62:52]) && |xrf[51:0] || ~(|y[62:52]) && |y[51:0])) begin\n"); 
- 		fprintf(fq,"		if(w == 64'h8000000000000000) $fwrite(fp, \"w=-zero \");\n");
+																							// not looknig at negative zero results right now
- 		fprintf(fq,"		if(~(|xrf[62:52]) && |xrf[51:0]) $fwrite(fp, \"xdenorm \");\n");
+			//fprintf(fq,"	  if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) && !(w == 64'h8000000000000000 && ans == 64'b0)) begin\n"); 
- 		fprintf(fq,"		if(~(|y[62:52]) && |y[51:0]) $fwrite(fp, \"ydenorm \");\n");
+			// fprintf(fq,"	if( (nan && (w[62:0] != ans[62:0])) || (!nan && (w != ans)) ) begin\n"); 
- 		fprintf(fq,"		if(~(|zrf[62:52]) && |zrf[51:0]) $fwrite(fp, \"zdenorm \");\n");
+			fprintf(fq,"	if((!wnan && (w != ans)) || (wnan && ansnan && ~(((xnan && (w[62:0] == {xrf[62:52],1'b1,xrf[50:0]})) || (ynan && (w[62:0] == {y[62:52],1'b1,y[50:0]}))  || (znan && (w[62:0] == {zrf[62:52],1'b1,zrf[50:0]})) || (w[62:0] == ans[62:0])) ))) begin\n"); 
-  		fprintf(fq,"		if(invalid != 0) $fwrite(fp, \"invld \");\n");
+			fprintf(fq,"		$fwrite(fp, \"%%h %%h %%h %%h %%h  Wrong \",xrf,y, zrf, w, ans);\n");
- 		fprintf(fq,"		if(overflow != 0) $fwrite(fp, \"ovrflw \");\n");
+			//fprintf(fq,"		$fwrite(fp, \"%%h \",s);\n");
- 		fprintf(fq,"		if(underflow != 0) $fwrite(fp, \"unflw \");\n");
+			fprintf(fq,"		if(w == 64'h8000000000000000) $fwrite(fp, \"w=-zero \");\n");
- 		fprintf(fq,"		if(w == 64'hFFF0000000000000) $fwrite(fp, \"w=-inf \");\n");
+			fprintf(fq,"		if(~(|xrf[62:52]) && |xrf[51:0]) $fwrite(fp, \"xdenorm \");\n");
- 		fprintf(fq,"		if(w == 64'h7FF0000000000000) $fwrite(fp, \"w=+inf \");\n");
+			fprintf(fq,"		if(~(|y[62:52]) && |y[51:0]) $fwrite(fp, \"ydenorm \");\n");
- 		fprintf(fq,"		if(w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
+			fprintf(fq,"		if(~(|zrf[62:52]) && |zrf[51:0]) $fwrite(fp, \"zdenorm \");\n");
- 		fprintf(fq,"		if(w >  64'hFFF8000000000000 && w <  64'hFFF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
+			fprintf(fq,"		if(invalid != 0) $fwrite(fp, \"invld \");\n");
- 		fprintf(fq,"		if(w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+			fprintf(fq,"		if(overflow != 0) $fwrite(fp, \"ovrflw \");\n");
- 		fprintf(fq,"		if(w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
+			fprintf(fq,"		if(underflow != 0) $fwrite(fp, \"unflw \");\n");
 			fprintf(fq,"		if(w == 64'hFFF0000000000000) $fwrite(fp, \"w=-inf \");\n");
 			fprintf(fq,"		if(w == 64'h7FF0000000000000) $fwrite(fp, \"w=+inf \");\n");
 			fprintf(fq,"		if(w >  64'h7FF0000000000000 && w <  64'h7FF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
 			fprintf(fq,"		if(w >  64'hFFF8000000000000 && w <  64'hFFF8000000000000 ) $fwrite(fp, \"w=sigNaN \");\n");
 			fprintf(fq,"		if(w >= 64'h7FF8000000000000 && w <= 64'h7FFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
 			fprintf(fq,"		if(w >= 64'hFFF8000000000000 && w <= 64'hFFFfffffffffffff ) $fwrite(fp, \"w=qutNaN \");\n");
- 		fprintf(fq,"		if(ans == 64'hFFF0000000000000) $fwrite(fp, \"ans=-inf \");\n");
+			fprintf(fq,"		if(ans == 64'hFFF0000000000000) $fwrite(fp, \"ans=-inf \");\n");
- 		fprintf(fq,"		if(ans == 64'h7FF0000000000000) $fwrite(fp, \"ans=+inf \");\n");
+			fprintf(fq,"		if(ans == 64'h7FF0000000000000) $fwrite(fp, \"ans=+inf \");\n");
-		fprintf(fq,"		if(ans >  64'h7FF0000000000000 && ans <  64'h7FF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
+			fprintf(fq,"		if(ans >  64'h7FF0000000000000 && ans <  64'h7FF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
- 		fprintf(fq,"		if(ans >  64'hFFF8000000000000 && ans <  64'hFFF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
+			fprintf(fq,"		if(ans >  64'hFFF8000000000000 && ans <  64'hFFF8000000000000 ) $fwrite(fp, \"ans=sigNaN \");\n");
- 		fprintf(fq,"		if(ans >= 64'h7FF8000000000000 && ans <= 64'h7FFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
+			fprintf(fq,"		if(ans >= 64'h7FF8000000000000 && ans <= 64'h7FFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
- 		fprintf(fq,"		if(ans >= 64'hFFF8000000000000 && ans <= 64'hFFFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
+			fprintf(fq,"		if(ans >= 64'hFFF8000000000000 && ans <= 64'hFFFfffffffffffff ) $fwrite(fp, \"ans=qutNaN \");\n");
-		fprintf(fq,"    	$fwrite(fp,\"%d\\n\");\n",cnt);
+			fprintf(fq,"    	$fwrite(fp,\"%ld\\n\");\n",k);
-		if(cnt == 358)fprintf(fq,"    	$stop;\n");
+			//fprintf(fq,"    	$stop;\n");
-		// fprintf(fq,"    end\n");
+			// fprintf(fq,"    end\n");
-		fprintf(fq,"    end\n");
+			fprintf(fq,"    end\n");
-		cnt++;
+			cnt++;
-		//if(cnt > 100) break;
+			//if(cnt > 100) break;
-		fflush(fq);
+			fflush(fq);
-	}
+		} // if(!feof(fp))
 		if(k == stop && debug == 1) break;
 	} // for(k)
 	fprintf(fq, "\t$stop;\n\tend\nendmodule");
 	fclose(fq);
 	fclose(fp);
 	fprintf(stdout,"cnt = %d\n",cnt);
 }
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tbhead.v
@ -23,7 +23,14 @@ module tb;
 wire 					inexact;
 integer fp;
-reg nan;
+reg wnan;
 reg xnan;
 reg ynan;
 reg znan;
 reg ansnan;
 reg		[105:0]		s;				//	partial product 2	
 reg		[51:0] 		xnorm;
 reg 		[51:0] 		ynorm;
 localparam period = 20;  
 fmac UUT(.xrf(xrf), .y(y), .zrf(zrf), .rn(rn), .rz(rz), .rp(rp), .rm(rm),
@ -33,4 +40,4 @@ fmac UUT(.xrf(xrf), .y(y), .zrf(zrf), .rn(rn), .rz(rz), .rp(rp), .rm(rm),
 initial 
    begin
-    fp = $fopen("/home/kparry/code/FMAC/tbgen/results.dat","w");
+    fp = $fopen("/home/kparry/riscv-wally/wally-pipelined/src/fpu/FMA/tbgen/results.dat","w");
--- a/wally-pipelined/src/hazard/hazard.sv
+++ b/wally-pipelined/src/hazard/hazard.sv
@ -27,12 +27,12 @@
 module hazard(
  // Detect hazards
-  input  logic       PCSrcE, CSRWritePendingDEM, RetM, TrapM,
+  input  logic       BPPredWrongE, CSRWritePendingDEM, RetM, TrapM,
  input  logic       LoadStallD, MulDivStallD, CSRRdStallD,
  input  logic       InstrStall, DataStall,
  // Stall & flush outputs
  output logic       StallF, StallD, StallE, StallM, StallW,
-  output logic       FlushD, FlushE, FlushM, FlushW
+  output logic       FlushF, FlushD, FlushE, FlushM, FlushW
 );
  logic BranchFlushDE;
@ -51,7 +51,7 @@ module hazard(
  // A stage must stall if the next stage is stalled
  // If any stages are stalled, the first stage that isn't stalled must flush.
-  assign BranchFlushDE = PCSrcE | RetM | TrapM;
+  assign BranchFlushDE = BPPredWrongE | RetM | TrapM;
  assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE);  
  assign StallDCause = (LoadStallD | MulDivStallD | CSRRdStallD) & ~(BranchFlushDE);    // stall in decode if instruction is a load/mul/csr dependent on previous
@ -62,6 +62,7 @@ module hazard(
  // Each stage stalls if the next stage is stalled or there is a cause to stall this stage.
  assign StallF = StallD | StallFCause;
  assign StallD = StallE | StallDCause;
  assign StallE = StallM | StallECause;
  assign StallM = StallW | StallMCause;
@ -73,6 +74,7 @@ module hazard(
  assign FirstUnstalledW = (~StallW & StallM);;
  // Each stage flushes if the previous stage is the last one stalled (for cause) or the system has reason to flush
  assign FlushF = BPPredWrongE;
  assign FlushD = FirstUnstalledD || BranchFlushDE;  //  PCSrcE |InstrStall | CSRWritePendingDEM | RetM | TrapM;
  assign FlushE = FirstUnstalledE || BranchFlushDE; //LoadStallD | PCSrcE | RetM | TrapM;
  assign FlushM = FirstUnstalledM || RetM || TrapM;
--- a/wally-pipelined/src/ieu/controller.sv
+++ b/wally-pipelined/src/ieu/controller.sv
@ -43,6 +43,7 @@ module controller(
  output logic       MemReadE, CSRReadE, // for Hazard Unit
  output logic [2:0] Funct3E,
  output logic       MulDivE, W64E,
  output logic       JumpE,		  
  // Memory stage control signals
  input  logic       StallM, FlushM,
  output logic [1:0] MemRWM,
@ -68,7 +69,7 @@ module controller(
  logic 	    RegWriteD, RegWriteE;
  logic [2:0] ResultSrcD, ResultSrcE, ResultSrcM;
  logic [1:0] MemRWD, MemRWE;
-  logic		    JumpD, JumpE;
+  logic		    JumpD;
  logic		    BranchD, BranchE;
  logic	[1:0] ALUOpD;
  logic [4:0] ALUControlD;
--- a/wally-pipelined/src/ieu/datapath.sv
+++ b/wally-pipelined/src/ieu/datapath.sv
@ -36,7 +36,9 @@ module datapath (
  input  logic [4:0]       ALUControlE,
  input  logic             ALUSrcAE, ALUSrcBE,
  input  logic             TargetSrcE, 
  input  logic             JumpE,
  input  logic [`XLEN-1:0] PCE,
  input  logic [`XLEN-1:0] PCLinkE,
  output logic [2:0]       FlagsE,
  output logic [`XLEN-1:0] PCTargetE,
  output logic [`XLEN-1:0] SrcAE, SrcBE,
@ -64,7 +66,9 @@ module datapath (
  // Execute stage signals
  logic [`XLEN-1:0] RD1E, RD2E;
  logic [`XLEN-1:0] ExtImmE;
-  logic [`XLEN-1:0] PreSrcAE;
+
  logic [`XLEN-1:0] PreSrcAE, SrcAE2, SrcBE2;
  logic [`XLEN-1:0] ALUResultE;
  logic [`XLEN-1:0] WriteDataE;
  logic [`XLEN-1:0] TargetBaseE;
@ -93,8 +97,10 @@ module datapath (
  mux3  #(`XLEN)  faemux(RD1E, ResultW, ALUResultM, ForwardAE, PreSrcAE);
  mux3  #(`XLEN)  fbemux(RD2E, ResultW, ALUResultM, ForwardBE, WriteDataE);
  mux2  #(`XLEN)  srcamux(PreSrcAE, PCE, ALUSrcAE, SrcAE);
  mux2  #(`XLEN)  srcamux2(SrcAE, PCLinkE, JumpE, SrcAE2);  
  mux2  #(`XLEN)  srcbmux(WriteDataE, ExtImmE, ALUSrcBE, SrcBE);
-  alu   #(`XLEN)  alu(SrcAE, SrcBE, ALUControlE, ALUResultE, FlagsE);
+  mux2  #(`XLEN)  srcbmux2(SrcBE, {`XLEN{1'b0}}, JumpE, SrcBE2); // *** May be able to remove this mux.
  alu   #(`XLEN)  alu(SrcAE2, SrcBE2, ALUControlE, ALUResultE, FlagsE);
  mux2  #(`XLEN)  targetsrcmux(PCE, SrcAE, TargetSrcE, TargetBaseE);
  assign  PCTargetE = ExtImmE + TargetBaseE;
@ -109,6 +115,9 @@ module datapath (
  flopenrc #(`XLEN) ALUResultWReg(clk, reset, FlushW, ~StallW, ALUResultM, ALUResultW);
  flopenrc #(5)    RdWEg(clk, reset, FlushW, ~StallW, RdM, RdW);
  // *** something is not right here.  Before the merge I found an issue with the jal instruction not writing
  // the link address through the alu.
  // not sure what changed.
  // handle Store Conditional result if atomic extension supported
  generate 
    if (`A_SUPPORTED)
@ -118,4 +127,11 @@ module datapath (
  endgenerate
  mux6  #(`XLEN) resultmux(ALUResultW, ReadDataW, PCLinkW, CSRReadValW, MulDivResultW, SCResultW, ResultSrcW, ResultW);	
 /* -----\/----- EXCLUDED -----\/-----
  // This mux4:1 no longer needs to include PCLinkW.  This is set correctly in the execution stage.
  // *** need to look at how the decoder is coded to fix.
  mux4  #(`XLEN) resultmux(ALUResultW, ReadDataW, PCLinkW, CSRReadValW, ResultSrcW, ResultW);	
 >>>>>>> bp
 -----/\----- EXCLUDED -----/\----- */
 endmodule
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@ -33,6 +33,7 @@ module ieu (
  output logic             IllegalBaseInstrFaultD,
  // Execute Stage interface
  input  logic [`XLEN-1:0] PCE, 
  input  logic [`XLEN-1:0] PCLinkE,
  output logic [`XLEN-1:0] PCTargetE,
  output logic             MulDivE, W64E,
  output logic [2:0]       Funct3E,
@ -72,6 +73,7 @@ module ieu (
  logic [1:0]       ForwardAE, ForwardBE;
  logic             RegWriteM, RegWriteW;
  logic             MemReadE, CSRReadE;
  logic             JumpE;
  controller c(.*);
  datapath   dp(.*);             
--- a/wally-pipelined/src/ifu/BTBPredictor.sv
+++ b/wally-pipelined/src/ifu/BTBPredictor.sv
@ -0,0 +1,97 @@
 ///////////////////////////////////////////
 // BTBPredictor.sv
 //
 // Written: Ross Thompson
 // Email: ross1728@gmail.com
 // Created: February 15, 2021
 // Modified: 
 //
 // Purpose: BTB model.  Outputs type of instruction (currently 1 hot encoded. Probably want 
 // to encode to reduce storage), valid, target PC.
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 `include "wally-config.vh"
 // Branch target buffer (BTB): a 2**Depth entry table indexed by a hash of the PC.
 // Each entry stores {InstrClass, TargetPC} in an SRAM2P1R1W instance; a separate
 // resettable bit vector records which entries have been written since reset.
 //
 // Parameters:
 //   Depth            - number of index bits; the table has 2**Depth entries.
 // Lookup side:
 //   LookUpPC         - PC to look up.  The index is registered before the table is
 //                      read, so TargetPC, InstrClass and Valid describe the LookUpPC
 //                      sampled at the previous clock edge.
 //   TargetPC         - stored target for the indexed entry.
 //   InstrClass       - stored instruction class for the indexed entry.
 //   Valid            - 1 when the indexed entry has been written since reset.
 // Update side:
 //   UpdateEN         - request to store {UpdateInstrClass, UpdateTarget} at the
 //                      entry selected by UpdatePC.
 module BTBPredictor
   #(parameter int Depth = 10
     )
   (input  logic clk,
    input  logic             reset,
    input  logic [`XLEN-1:0] LookUpPC,
    output logic [`XLEN-1:0] TargetPC,
    output logic [3:0]       InstrClass,
    output logic             Valid,
    // update
    input  logic             UpdateEN,
    input  logic [`XLEN-1:0] UpdatePC,
    input  logic [`XLEN-1:0] UpdateTarget,
    input  logic [3:0]       UpdateInstrClass
    );
   localparam TotalDepth = 2 ** Depth;
   logic [TotalDepth-1:0]    ValidBits;
   logic [Depth-1:0]         LookUpPCIndex, UpdatePCIndex, LookUpPCIndexQ, UpdatePCIndexQ;
   logic                     UpdateENQ;
   // hashing function for indexing the PC
   // We have Depth bits to index, but XLEN bits as the input.
   // bit 0 is always 0, bit 1 is 0 if using 4 byte instructions, but is not always 0 if
   // using compressed instructions.  XOR bit 1 with the MSB of index.
   assign UpdatePCIndex = {UpdatePC[Depth+1] ^ UpdatePC[1], UpdatePC[Depth:2]};
   assign LookUpPCIndex = {LookUpPC[Depth+1] ^ LookUpPC[1], LookUpPC[Depth:2]};
   // The SRAM registers its write address, data and enable before committing the
   // write.  Register the update index AND the update enable here as well so the
   // valid bit is set for the same entry, on the same edge, as the data write.
   // (Pairing the registered index with the unregistered enable would mark the
   // entry of the previous cycle's UpdatePC as valid.)
   flopenr #(Depth) UpdatePCIndexReg(.clk(clk),
                                     .reset(reset),
                                     .en(1'b1),
                                     .d(UpdatePCIndex),
                                     .q(UpdatePCIndexQ));
   flopenr #(1) UpdateENReg(.clk(clk),
                            .reset(reset),
                            .en(1'b1),
                            .d(UpdateEN),
                            .q(UpdateENQ));
   // The valid bit must be resetable.
   always_ff @ (posedge clk) begin
     if (reset) begin
       ValidBits <= #1 {TotalDepth{1'b0}};
     end else if (UpdateENQ) begin
       ValidBits[UpdatePCIndexQ] <= #1 1'b1;
     end
   end
   // The lookup index is registered to line the valid bit up with the SRAM read data.
   flopenr #(Depth) LookupPCIndexReg(.clk(clk),
                                     .reset(reset),
                                     .en(1'b1),
                                     .d(LookUpPCIndex),
                                     .q(LookUpPCIndexQ));
   assign Valid = ValidBits[LookUpPCIndexQ];
   // the BTB contains the target address.
   // Another optimization may be using a PC relative address.
   // *** need to add forwarding.
   // The write mask must cover the full entry width (XLEN target bits plus 4 class
   // bits); a narrower mask is zero extended and the class bits are never written.
   SRAM2P1R1W #(Depth, `XLEN+4) memory(.clk(clk),
                                       .reset(reset),
                                       .RA1(LookUpPCIndex),
                                       .RD1({InstrClass, TargetPC}),
                                       .REN1(1'b1),
                                       .WA1(UpdatePCIndex),
                                       .WD1({UpdateInstrClass, UpdateTarget}),
                                       .WEN1(UpdateEN),
                                       .BitWEN1({(`XLEN+4){1'b1}}));
 endmodule
--- a/wally-pipelined/src/ifu/RAsPredictor.sv
+++ b/wally-pipelined/src/ifu/RAsPredictor.sv
@ -0,0 +1,80 @@
 ///////////////////////////////////////////
 // RASPredictor.sv
 //
 // Written: Ross Thompson
 // Email: ross1728@gmail.com
 // Created: February 15, 2021
 // Modified: 
 //
 // Purpose: Return address stack (RAS) predictor with parameterized stack size.
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 `include "wally-config.vh"
 // Return address stack: a StackSize entry circular stack of XLEN-bit addresses.
 //
 // Parameters:
 //   StackSize - number of entries.  The pointer is $clog2(StackSize) bits wide, so
 //               StackSize is assumed to be a power of two (>= 2) for the pointer to
 //               wrap onto valid entries only.
 // Ports:
 //   pop       - move the pointer down one entry.
 //   popPC     - entry currently addressed by the pointer (combinational read).
 //   push      - write pushPC into the entry above the pointer and move the pointer up.
 //   incr      - move the pointer up one entry without writing.
 //   pushPC    - address stored on a push.
 // When pop is asserted together with push or incr, the pointer moves down (pop wins).
 module RASPredictor
   #(parameter int StackSize = 16
     )
   (input logic clk,
    input logic              reset,
    input logic              pop,
    output logic [`XLEN-1:0] popPC,
    input logic              push,
    input logic              incr,
    input logic [`XLEN-1:0]  pushPC
    );
   logic                     CounterEn;
   localparam Depth = $clog2(StackSize);
   // The pointers need only Depth bits.  A StackSize-bit pointer would not wrap at
   // the stack size and would index memory out of range.
   logic [Depth-1:0]         PtrD, PtrQ, PtrP1, PtrM1;
   logic [StackSize-1:0] [`XLEN-1:0] memory;
   integer                   index;
   assign CounterEn = pop | push | incr;
   assign PtrD = pop ? PtrM1 : PtrP1;
   assign PtrM1 = PtrQ - 1'b1;
   assign PtrP1 = PtrQ + 1'b1;
   // may have to handle a push and an incr at the same time.
   // *** what happens if jal is executing and there is a return being flushed in Decode?
   flopenr #(Depth) PTR(.clk(clk),
                        .reset(reset),
                        .en(CounterEn),
                        .d(PtrD),
                        .q(PtrQ));
   // RAS must be reset. 
   always_ff @ (posedge clk, posedge reset) begin
     if(reset) begin
       for(index=0; index<StackSize; index++)
         memory[index] <= {`XLEN{1'b0}};
     end else if(push) begin
       // the new entry goes one above the current top of stack
       memory[PtrP1] <= #1 pushPC;
     end
   end
   assign popPC = memory[PtrQ];
 endmodule
--- a/wally-pipelined/src/ifu/SramModel.sv
+++ b/wally-pipelined/src/ifu/SramModel.sv
@ -0,0 +1,111 @@
 ///////////////////////////////////////////
 // SRAM2P1R1W
 //
 // Written: Ross Thompson
 // Email: ross1728@gmail.com
 // Created: February 14, 2021
 // Modified: 
 //
 // Purpose: Behavioral model of two port SRAM.  While this is synthesizable it will produce a flip flop based memory
 //          which behaves with the timing of an SRAM typical of GF 14nm, 32nm, and 45nm.
 //          
 // 
 // to preload this memory we can use the following command
 // in modelsim's do file.
 // mem load -infile <relative path to the text file > -format <bin|hex> <hierarchy to the memory.>
 // example
 // mem load -infile twoBitPredictor.txt -format bin testbench/dut/hart/ifu/bpred/DirPredictor/memory/memory
 //
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 `include "wally-config.vh"
 // Behavioral model of a two port (one read, one write) SRAM with 2**Depth words of
 // Width bits.
 //
 // Read port:  RA1 is registered when REN1 is high; RD1 is the word at the registered
 //             address (combinational read of the array).
 // Write port: WA1, WD1 and WEN1 are registered, and the write is committed to the
 //             array on the following clock edge.  BitWEN1 is a per-bit write mask;
 //             it is NOT registered, so it is sampled on the edge the write commits.
 //             NOTE(review): confirm callers hold BitWEN1 stable across both edges.
 module SRAM2P1R1W
   #(parameter int Depth = 10,
     parameter int Width = 2
     )
   (input logic clk,
    // *** have to remove reset eventually
    input logic              reset,
    // port 1 is read only
    input logic [Depth-1:0]  RA1,
    output logic [Width-1:0] RD1,
    input logic              REN1,
    // port 2 is write only
    input logic [Depth-1:0]  WA1,
    input logic [Width-1:0]  WD1,
    input logic              WEN1,
    input logic [Width-1:0]  BitWEN1
    );
   logic [Depth-1:0]         RA1Q, WA1Q;
   logic                     WEN1Q;
   logic [Width-1:0]         WD1Q;
   logic [Width-1:0]         memory [2**Depth-1:0];
   // SRAMs address busses are always registered first.
   flopenr #(Depth) RA1Reg(.clk(clk),
                           .reset(reset),
                           .en(REN1),
                           .d(RA1),
                           .q(RA1Q));
   // The write address and data are captured whenever a write is requested.  They
   // must not be gated by the read enable: WEN1 is registered unconditionally, so
   // a write issued while REN1 is low would otherwise commit stale address/data.
   flopenr #(Depth) WA1Reg(.clk(clk),
                           .reset(reset),
                           .en(WEN1),
                           .d(WA1),
                           .q(WA1Q));
   flopenr #(1) WEN1Reg(.clk(clk),
                        .reset(reset),
                        .en(1'b1),
                        .d(WEN1),
                        .q(WEN1Q));
   flopenr #(Width) WD1Reg(.clk(clk),
                           .reset(reset),
                           .en(WEN1),
                           .d(WD1),
                           .q(WD1Q));
   // read port
   assign RD1 = memory[RA1Q];
   genvar                    index;
   // write port: one always_ff per bit so each bit honors its own mask bit
   generate
     for (index = 0; index < Width; index = index + 1) begin    
       always_ff @ (posedge clk) begin
         if (WEN1Q & BitWEN1[index]) begin
           memory[WA1Q][index] <= WD1Q[index];
         end
       end
     end
   endgenerate
 endmodule  
--- a/wally-pipelined/src/ifu/bpred.sv
+++ b/wally-pipelined/src/ifu/bpred.sv
@ -0,0 +1,169 @@
 ///////////////////////////////////////////
 // bpred.sv
 //
 // Written: Ross Thompson
 // Email: ross1728@gmail.com
 // Created: February 12, 2021
 // Modified: 
 //
 // Purpose: Branch prediction unit
 //          Produces a branch prediction based on branch history.
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 `include "wally-config.vh"
 // Branch prediction unit.
 // Combines three predictors looked up with PCNextF:
 //   DirPredictor    - two bit direction predictor
 //   TargetPredictor - BTB supplying a target PC, an instruction class and a valid bit
 //   RASPredictor    - return address stack
 // The prediction is pipelined to the execute stage, checked against the resolved
 // branch (PCSrcE, PCTargetE, PCLinkE, PCD), and the predictors are updated from
 // the execute stage signals.
 // Instruction classes are 1 hot: return, jump register, jump, branch.
 // NOTE(review): presumably listed MSB first ([3] = return ... [0] = branch); confirm.
 module bpred 
   (input  logic             clk, reset,
    input  logic             StallF, StallD, StallE, FlushF, FlushD, FlushE,
    // Fetch stage: the prediction
    input  logic [`XLEN-1:0] PCNextF,      // *** forgot to include this one on the I/O list
    output logic [`XLEN-1:0] BPPredPCF,    // predicted next PC
    output logic             SelBPPredF,   // 1 selects BPPredPCF
    // Update Predictor
    input  logic [`XLEN-1:0] PCE,          // The address of the currently executing instruction
    // *** after reviewing the compressed instruction set I am leaning towards having the btb predict the instruction class.
    // *** the specifics of how this is encode is subject to change.
    input  logic             PCSrcE,       // AKA Branch Taken
    // Signals required to check the branch prediction accuracy.
    input  logic [`XLEN-1:0] PCTargetE,    // The branch destination if the branch is taken.
    input  logic [`XLEN-1:0] PCD,          // The address the branch predictor took.
    input  logic [`XLEN-1:0] PCLinkE,      // The address following the branch instruction. (AKA Fall through address)
    input  logic [3:0]       InstrClassE,  // 1 hot instruction class resolved in execute
    // Report branch prediction status
    output logic             BPPredWrongE
    );
   // predictor outputs in fetch
   logic                     BTBValidF;
   logic [`XLEN-1:0]         BTBPredPCF, RASPCF;
   // direction prediction and predicted class carried down the pipeline
   logic [1:0]               BPPredF, BPPredD, BPPredE, UpdateBPPredE;
   logic [3:0]               BPInstrClassF, BPInstrClassD, BPInstrClassE;
   // prediction check results
   logic                     TargetWrongE;
   logic                     FallThroughWrongE;
   logic                     PredictionDirWrongE;
   logic                     PredictionPCWrongE;
   logic [`XLEN-1:0]         CorrectPCE;
   // ---------------------------------------------------------------------------
   // Part 1: branch direction prediction.
   // The predictor supplies a direction (bit 1: 1 = Taken, 0 = Not Taken) plus the
   // state needed to build its next state; for a 2 bit table that is the count.
   // ---------------------------------------------------------------------------
   twoBitPredictor DirPredictor(.clk, .reset,
                                .LookUpPC(PCNextF),
                                .Prediction(BPPredF),
                                // update
                                .UpdatePC(PCE),
                                .UpdateEN(InstrClassE[0]),
                                .UpdatePrediction(UpdateBPPredE));
   // Use the predicted PC for class [3] unconditionally, for classes [2] and [1]
   // when the BTB entry is valid, and for class [0] when the BTB entry is valid
   // and the direction predictor says taken.
   assign SelBPPredF = BPInstrClassF[3] |
                       (BTBValidF & BPInstrClassF[2]) |
                       (BTBValidF & BPInstrClassF[1]) |
                       (BTBValidF & BPInstrClassF[0] & BPPredF[1]);
   // ---------------------------------------------------------------------------
   // Part 2: branch target address prediction.
   // *** For now the BTB will house the direct and indirect targets
   // ---------------------------------------------------------------------------
   BTBPredictor TargetPredictor(.clk, .reset,
                                .LookUpPC(PCNextF),
                                .TargetPC(BTBPredPCF),
                                .InstrClass(BPInstrClassF),
                                .Valid(BTBValidF),
                                // update
                                .UpdateEN(|InstrClassE[2:0]),
                                .UpdatePC(PCE),
                                .UpdateTarget(PCTargetE),
                                .UpdateInstrClass(InstrClassE));
   // *** need to forward when updating to the same address as reading, e.g.
   //   CorrectPCE = PCSrcE ? PCTargetE : PCLinkE;
   //   TargetPC   = (PCE == PCNextF) ? CorrectPCE : BTBPredPCF;
   // ---------------------------------------------------------------------------
   // Part 3: RAS
   // *** need to add the logic to restore RAS on flushes.  We will use incr for this.
   // ---------------------------------------------------------------------------
   RASPredictor RASPredictor(.clk, .reset,
                             .pop(BPInstrClassF[3]),
                             .popPC(RASPCF),
                             .push(InstrClassE[3]),
                             .incr(1'b0),
                             .pushPC(PCLinkE));
   // class [3] takes its target from the RAS, everything else from the BTB
   assign BPPredPCF = BPInstrClassF[3] ? RASPCF : BTBPredPCF;
   // ---------------------------------------------------------------------------
   // Carry the direction prediction and the predicted class to execute.
   // *** for other predictors this will be different.
   // ---------------------------------------------------------------------------
   flopenrc #(2) BPPredRegD(.clk, .reset, .en(~StallF), .clear(FlushF),
                            .d(BPPredF), .q(BPPredD));
   flopenrc #(2) BPPredRegE(.clk, .reset, .en(~StallD), .clear(FlushD),
                            .d(BPPredD), .q(BPPredE));
   flopenrc #(4) InstrClassRegD(.clk, .reset, .en(~StallF), .clear(FlushF),
                                .d(BPInstrClassF), .q(BPInstrClassD));
   flopenrc #(4) InstrClassRegE(.clk, .reset, .en(~StallD), .clear(FlushD),
                                .d(BPInstrClassD), .q(BPInstrClassE));
   // ---------------------------------------------------------------------------
   // Check the prediction in execute.
   // PCD is compared with the taken target when the branch is taken and with the
   // fall through address otherwise; class [0] additionally checks the direction.
   // A misprediction is only reported when some instruction class bit is set.
   // ---------------------------------------------------------------------------
   assign FallThroughWrongE   = PCLinkE != PCD;
   assign TargetWrongE        = PCTargetE != PCD;
   assign PredictionPCWrongE  = PCSrcE ? TargetWrongE : FallThroughWrongE;
   assign PredictionDirWrongE = InstrClassE[0] & (PCSrcE ^ BPPredE[1]);
   assign BPPredWrongE        = (|InstrClassE) & (PredictionDirWrongE | PredictionPCWrongE);
   // Next state for the direction predictor entry of the executing branch
   satCounter2 BPDirUpdate(.BrDir(PCSrcE),
                           .OldState(BPPredE),
                           .NewState(UpdateBPPredE));
 endmodule
--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@ -29,7 +29,7 @@
 module ifu (
  input  logic             clk, reset,
  input  logic             StallF, StallD, StallE, StallM, StallW,
-  input  logic             FlushD, FlushE, FlushM, FlushW,
+  input  logic             FlushF, FlushD, FlushE, FlushM, FlushW,
  // Fetch
  input  logic [`XLEN-1:0] InstrInF,
  output logic [`XLEN-1:0] PCF, 
@ -37,13 +37,15 @@ module ifu (
  output logic             InstrReadF,
  // Decode  
  // Execute
-  input  logic             PCSrcE, 
+  output logic [`XLEN-1:0] PCLinkE,
-  input  logic [`XLEN-1:0] PCTargetE,
+  input logic 		   PCSrcE, 
-  output logic [`XLEN-1:0] PCE, 
+  input logic [`XLEN-1:0]  PCTargetE,
  output logic [`XLEN-1:0] PCE,
  output logic 		   BPPredWrongE, 
  // Mem
-  input  logic             RetM, TrapM, 
+  input logic 		   RetM, TrapM, 
-  input  logic [`XLEN-1:0] PrivilegedNextPCM, 
+  input logic [`XLEN-1:0]  PrivilegedNextPCM, 
-  output logic [31:0]      InstrD, InstrM,
+  output logic [31:0] 	   InstrD, InstrM,
  output logic [`XLEN-1:0] PCM, 
  // Writeback
  output logic [`XLEN-1:0] PCLinkW,
@ -59,13 +61,14 @@ module ifu (
  output logic             ITLBMissF, ITLBHitF,
  // bogus
  input  logic [15:0] rd2
 );
  logic [`XLEN-1:0] UnalignedPCNextF, PCNextF;
  logic misaligned, BranchMisalignedFaultE, BranchMisalignedFaultM, TrapMisalignedFaultM;
  logic PrivilegedChangePCM;
  logic IllegalCompInstrD;
-  logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCD, PCW, PCLinkD, PCLinkE, PCLinkM;
+  logic [`XLEN-1:0] PCPlusUpperF, PCPlus2or4F, PCD, PCW, PCLinkD, PCLinkM;
  logic        CompressedF;
  logic [31:0]     InstrF, InstrRawD, InstrE, InstrW;
  logic [31:0]     nop = 32'h00000013; // instruction for NOP
@ -77,6 +80,12 @@ module ifu (
  tlb #(3) itlb(clk, reset, SATP_REGW, PCF, PageTableEntryF, ITLBWriteF, ITLBFlushF,
    InstrPAdrF, ITLBMissF, ITLBHitF);
  // branch predictor signals
  logic 	   SelBPPredF;
  logic [`XLEN-1:0] BPPredPCF, PCCorrectE, PCNext0F, PCNext1F;
  logic [3:0] 	    InstrClassD, InstrClassE;
  // *** put memory interface on here, InstrF becomes output
  //assign InstrPAdrF = PCF; // *** no MMU
  //assign InstrReadF = ~StallD; // *** & ICacheMissF; add later
@ -85,10 +94,48 @@ module ifu (
  assign PrivilegedChangePCM = RetM | TrapM;
-  mux3    #(`XLEN) pcmux(PCPlus2or4F, PCTargetE, PrivilegedNextPCM, {PrivilegedChangePCM, PCSrcE}, UnalignedPCNextF);
+  //mux3    #(`XLEN) pcmux(PCPlus2or4F, PCCorrectE, PrivilegedNextPCM, {PrivilegedChangePCM, BPPredWrongE}, UnalignedPCNextF);
  mux2 #(`XLEN) pcmux0(.d0(PCPlus2or4F),
 		       .d1(BPPredPCF),
 		       .s(SelBPPredF),
 		       .y(PCNext0F));
  mux2 #(`XLEN) pcmux1(.d0(PCNext0F),
 		       .d1(PCCorrectE),
 		       .s(BPPredWrongE),
 		       .y(PCNext1F));
  mux2 #(`XLEN) pcmux2(.d0(PCNext1F),
 		       .d1(PrivilegedNextPCM),
 		       .s(PrivilegedChangePCM),
 		       .y(UnalignedPCNextF));
  assign  PCNextF = {UnalignedPCNextF[`XLEN-1:1], 1'b0}; // hart-SPEC p. 21 about 16-bit alignment
  flopenl #(`XLEN) pcreg(clk, reset, ~StallF, PCNextF, `RESET_VECTOR, PCF);
  // branch and jump predictor
  // I am making the port connection explicit for now as I want to see them and they will be changing.
  bpred bpred(.clk(clk),
 	      .reset(reset),
 	      .StallF(StallF),
 	      .StallD(StallD),
 	      .StallE(1'b0),   // *** may need this eventually
 	      .FlushF(FlushF),
 	      .FlushD(FlushD),
 	      .FlushE(FlushE),
 	      .PCNextF(PCNextF),
 	      .BPPredPCF(BPPredPCF),
 	      .SelBPPredF(SelBPPredF),
 	      .PCE(PCE),
 	      .PCSrcE(PCSrcE),
 	      .PCTargetE(PCTargetE),
 	      .PCD(PCD),
 	      .PCLinkE(PCLinkE),
 	      .InstrClassE(InstrClassE),
 	      .BPPredWrongE(BPPredWrongE));
  // The true correct target is PCTargetE if PCSrcE is 1 else it is the fall through PCLinkE.
  assign PCCorrectE =  PCSrcE ? PCTargetE : PCLinkE;
  // pcadder
  // add 2 or 4 to the PC, based on whether the instruction is 16 bits or 32
  assign CompressedF = (InstrF[1:0] != 2'b11); // is it a 16-bit compressed instruction?
@ -120,6 +167,14 @@ module ifu (
  assign IllegalIEUInstrFaultD = IllegalBaseInstrFaultD | IllegalCompInstrD; // illegal if bad 32 or 16-bit instr
  // *** combine these with others in better way, including M, F
  // the branch predictor needs a compact decoding of the instruction class.
  // *** consider adding in the alternate return address x5 for returns.
  assign InstrClassD[3] = InstrD[6:0] == 7'h67 && InstrD[19:15] == 5'h01; // return
  assign InstrClassD[2] = InstrD[6:0] == 7'h67 && InstrD[19:15] != 5'h01; // jump register, but not return
  assign InstrClassD[1] = InstrD[6:0] == 7'h6F; // jump
  assign InstrClassD[0] = InstrD[6:0] == 7'h63; // branch
  // Misaligned PC logic
  generate
@ -143,6 +198,13 @@ module ifu (
  flopenr #(`XLEN) PCMReg(clk, reset, ~StallM, PCE, PCM);
  flopenr #(`XLEN) PCWReg(clk, reset, ~StallW, PCM, PCW); // *** probably not needed; delete later
  // Pipeline register carrying the 4-bit instruction class from Decode (InstrClassD)
  // to Execute (InstrClassE).
  // NOTE(review): the enable and clear come from the Decode-stage controls (~StallD, FlushD),
  // whereas PCMReg/PCWReg above are enabled by the stall of the stage they drive
  // (~StallM, ~StallW) -- confirm that StallE/FlushE are not what is wanted here.
  flopenrc #(4) InstrClassRegE(.clk(clk),
 			       .reset(reset),
 			       .en(~StallD),
 			       .clear(FlushD),
 			       .d(InstrClassD),
 			       .q(InstrClassE));
  // seems like there should be a lower-cost way of doing this PC+2 or PC+4 for JAL.  
  // either have ALU compute PC+2/4 and feed into ALUResult input of ResultMux or
  // have dedicated adder in Mem stage based on PCM + 2 or 4
--- a/wally-pipelined/src/ifu/satCounter2.sv
+++ b/wally-pipelined/src/ifu/satCounter2.sv
@ -0,0 +1,57 @@
 ///////////////////////////////////////////
 // satCounter2.sv
 //
 // Written: Ross Thompson
 // Email: ross1728@gmail.com
 // Created: February 13, 2021
 // Modified: 
 //
 // Purpose: 2 bit saturating counter
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 `include "wally-config.vh"
 // Next-state logic of a 2-bit saturating up/down counter (purely combinational).
 //   BrDir    : 1 steps the state up by one, 0 steps it down by one
 //   OldState : current counter value
 //   NewState : OldState +/- 1, held at 2'b11 when stepping up and at 2'b00 when stepping down
 module satCounter2
   (input logic        BrDir,
    input logic [1:0]  OldState,
    output logic [1:0] NewState
    );
   always_comb begin
     // Only the four fully defined encodings are listed, so an OldState that
     // contains x/z matches no item and NewState is not assigned.
     case (OldState)
       2'b00, 2'b01, 2'b10, 2'b11: begin
         if (BrDir) NewState = (OldState == 2'b11) ? 2'b11 : OldState + 2'b01; // count up, saturate at 3
         else       NewState = (OldState == 2'b00) ? 2'b00 : OldState - 2'b01; // count down, saturate at 0
       end
     endcase
   end
 endmodule
--- a/wally-pipelined/src/ifu/twoBitPredictor.sv
+++ b/wally-pipelined/src/ifu/twoBitPredictor.sv
@ -0,0 +1,84 @@
 ///////////////////////////////////////////
 // twoBitPredictor.sv
 //
 // Written: Ross Thompson
 // Email: ross1728@gmail.com
 // Created: February 14, 2021
 // Modified: 
 //
 // Purpose: 2 bit saturating counter predictor with parameterized table depth.
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 `include "wally-config.vh"
 // Direction predictor: a memory of 2-bit prediction counters indexed by a hash of the PC.
 //   LookUpPC         : PC whose prediction is requested
 //   Prediction       : 2-bit counter value returned for the looked-up PC
 //   UpdatePC         : PC whose table entry is being rewritten
 //   UpdateEN         : write enable for the update port
 //   UpdatePrediction : new 2-bit counter value to store for UpdatePC
 module twoBitPredictor
   #(parameter int Depth = 10 // width in bits of the table index
     )
   (input logic clk,
    input logic             reset,
    input logic [`XLEN-1:0] LookUpPC,
    output logic [1:0]      Prediction,
    // update
    input logic [`XLEN-1:0] UpdatePC,
    input logic             UpdateEN,
    input logic [1:0]       UpdatePrediction
    );
   logic [Depth-1:0]        LookUpPCIndex, UpdatePCIndex;
   logic [1:0]              PredictionMemory;
   logic                    DoForwarding, DoForwardingF;
   logic [1:0]              UpdatePredictionF;
   // hashing function for indexing the PC
   // We have Depth bits to index, but XLEN bits as the input.
   // bit 0 is always 0, bit 1 is 0 if using 4 byte instructions, but is not always 0 if
   // using compressed instructions.  XOR bit 1 with the MSB of index.
   assign UpdatePCIndex = {UpdatePC[Depth+1] ^ UpdatePC[1], UpdatePC[Depth:2]};
   assign LookUpPCIndex = {LookUpPC[Depth+1] ^ LookUpPC[1], LookUpPC[Depth:2]};
   // one read port (lookup) and one write port (update); both bits of an entry are written together
   SRAM2P1R1W #(Depth, 2) memory(.clk(clk),
                                 .reset(reset),
                                 .RA1(LookUpPCIndex),
                                 .RD1(PredictionMemory),
                                 .REN1(1'b1),
                                 .WA1(UpdatePCIndex),
                                 .WD1(UpdatePrediction),
                                 .WEN1(UpdateEN),
                                 .BitWEN1(2'b11));
   // Need to forward when updating the same entry that is being read.
   // Forward only when a write is actually taking place: with UpdateEN low nothing is
   // stored, so UpdatePrediction must not replace the value read from the memory.
   assign DoForwarding = UpdateEN & (UpdatePCIndex == LookUpPCIndex);
   // Register the update value and the forwarding decision so they line up with the read data.
   // NOTE(review): assumes SRAM2P1R1W returns RD1 one clock after RA1 is presented -- confirm.
   flopr #(1) DoForwardingReg(.clk(clk),
                              .reset(reset),
                              .d(DoForwarding),
                              .q(DoForwardingF));
   flopr #(2) UpdatePredictionReg(.clk(clk),
                                  .reset(reset),
                                  .d(UpdatePrediction),
                                  .q(UpdatePredictionF));
   // the forwarded value overrides the memory output when the previous cycle wrote this entry
   assign Prediction = DoForwardingF ? UpdatePredictionF : PredictionMemory;
 endmodule
--- a/wally-pipelined/src/wally/wallypipelinedhart.sv
+++ b/wally-pipelined/src/wally/wallypipelinedhart.sv
@ -55,7 +55,7 @@ module wallypipelinedhart (
 //  logic [1:0]  ForwardAE, ForwardBE;
  logic        StallF, StallD, StallE, StallM, StallW;
-  logic        FlushD, FlushE, FlushM, FlushW;
+  logic        FlushF, FlushD, FlushE, FlushM, FlushW;
  logic        RetM, TrapM;
  // new signals that must connect through DP
@ -66,7 +66,7 @@ module wallypipelinedhart (
  logic [2:0] Funct3E;
 //  logic [31:0] InstrF;
  logic [31:0] InstrD, InstrM;
-  logic [`XLEN-1:0] PCE, PCM, PCLinkW;
+  logic [`XLEN-1:0] PCE, PCM, PCLinkE, PCLinkW;
  logic [`XLEN-1:0] PCTargetE;
  logic [`XLEN-1:0] CSRReadValW, MulDivResultW;
  logic [`XLEN-1:0] PrivilegedNextPCM;
@ -105,13 +105,14 @@ module wallypipelinedhart (
  logic             InstrReadF;
  logic             DataStall, InstrStall;
  logic             InstrAckD, MemAckW;
  logic 	    BPPredWrongE;
  ifu ifu(.InstrInF(InstrRData), .*); // instruction fetch unit: PC, branch prediction, instruction cache
-  ieu ieu(.*); // inteber execution unit: integer register file, datapath and controller
+  ieu ieu(.*); // integer execution unit: integer register file, datapath and controller
  dmem dmem(.*); // data cache unit
  ahblite ebu( 
    //.InstrReadF(1'b0),
    //.InstrRData(InstrF), // hook up InstrF later
--- a/wally-pipelined/testbench/function_radix.sv
+++ b/wally-pipelined/testbench/function_radix.sv
@ -0,0 +1,108 @@
 ///////////////////////////////////////////
 // function_radix.sv
 //
 // Written: Ross Thompson
 // email: ross1728@gmail.com
 // Created: November 9, 2019
 //
 // Purpose: Finds the current function or global assembly label based on PCE.
 // 
 // A component of the Wally configurable RISC-V project.
 // 
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
 // files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
 // modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
 // is furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
 // OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
 // BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
 // OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 ///////////////////////////////////////////
 `include "wally-config.vh"
 // Testbench helper: whenever pc changes, looks up the largest address listed in
 // PRELOAD_FILE that is less than or equal to pc and stores that address in index.
 // PRELOAD_FILE is read as one hexadecimal address per line.
 // NOTE(review): the search assumes the addresses are in ascending order -- confirm
 // against the script that generates the file.
 module function_radix();
    parameter PRELOAD_FILE = "funct_addr.txt";
    integer memory_bank [];   // addresses loaded from PRELOAD_FILE
    integer index;            // result of the most recent lookup (an address, not a position)
    logic [`XLEN-1:0] pc;
    // mirror the DUT's pc onto the local pc signal
    initial begin
       $init_signal_spy("/riscv_mram_tb/dut/pc", "/riscv_mram_tb/function_radix/pc");
    end
    // Binary search for the largest entry of array[0 .. length-1] that is <= pc.
    //   pc     : value to look up
    //   length : number of valid entries in array
    //   array  : addresses, searched as a sorted list
    //   minval : the matching entry, or 'x when the table is empty or every entry is above pc
    task automatic bin_search_min;
       input integer pc;
       input integer length;
       ref integer   array [];
       output integer minval;
       integer        left, right;
       integer        mid;
       begin
          left = 0;
          // the last valid position is length-1; never search past the end of the array itself
          right = ((length < array.size()) ? length : array.size()) - 1;
          while (left <= right) begin
             mid = left + ((right - left) / 2);
             if (array[mid] == pc) begin
                minval = array[mid];
                return;
             end
             if (array[mid] < pc) begin
                left = mid + 1;
             end else begin
                right = mid - 1;
             end
          end // while (left <= right)
          // pc was not found.  The loop ends with right == left - 1, every entry up to
          // right below pc and every entry from left onward above it, so array[right]
          // is the answer.  right == -1 means no entry is <= pc.
          if (right >= 0) begin
             minval = array[right];
          end else begin
             minval = 'x;
          end
       end
    endtask
    // preload
    integer fp;
    integer line_count = 0;
    logic [31:0] line;
    initial begin
       // count the entries first so the dynamic array can be sized before it is loaded
       fp = $fopen(PRELOAD_FILE, "r");
       if (fp) begin
          while (! $feof(fp)) begin
             // stop on a line that does not parse instead of spinning on it forever
             if ($fscanf(fp, "%h\n", line) != 1) break;
             line_count = line_count + 1;
          end
          $fclose(fp);
          // size the array explicitly; $readmemh is not relied on to grow it
          memory_bank = new[line_count];
          $readmemh(PRELOAD_FILE, memory_bank);
       end else begin
          $display("Cannot open file %s for reading.", PRELOAD_FILE);
          $stop;
       end
    end
    always @(pc) begin
       bin_search_min(pc, line_count, memory_bank, index);
    end
 endmodule // function_radix
--- a/wally-pipelined/testbench/testbench-imperas.sv
+++ b/wally-pipelined/testbench/testbench-imperas.sv
@ -162,6 +162,7 @@ string tests64iNOc[] = {
                     "rv64i/WALLY-SRAI", "3000",
                     "rv64i/WALLY-LOAD", "11bf0",
                     "rv64i/WALLY-JAL", "4000",
                     "rv64i/WALLY-JALR", "3000",
                     "rv64i/WALLY-STORE", "3000",
                     "rv64i/WALLY-ADDIW", "3000",
                     "rv64i/WALLY-SLLIW", "3000",
@ -288,6 +289,7 @@ string tests32i[] = {
                     "rv32i/WALLY-SUB", "3000",
                     "rv32i/WALLY-STORE", "2000",
                     "rv32i/WALLY-JAL", "3000",
                     "rv32i/WALLY-JALR", "2000",
                     "rv32i/WALLY-BEQ" ,"4000",
                     "rv32i/WALLY-BNE", "4000 ",
                      "rv32i/WALLY-BLTU", "4000 ",
@ -366,7 +368,7 @@ string tests32i[] = {
      memfilename = {"../../imperas-riscv-tests/work/", tests[test], ".elf.memfile"};
      $readmemh(memfilename, dut.imem.RAM);
      $readmemh(memfilename, dut.uncore.dtim.RAM);
-      reset = 1; # 22; reset = 0;
+      reset = 1; # 42; reset = 0;
    end
  // generate clock to sequence tests
@ -442,7 +444,11 @@ string tests32i[] = {
          reset = 1; # 17; reset = 0;
        end
      end
-    end
+    end // always @ (negedge clk)
  // track the current function or label
  //function_radix function_radix();
 endmodule
 /* verilator lint_on STMTDLY */
--- a/wally-pipelined/testgen/testgen-JAL-JALR.py
+++ b/wally-pipelined/testgen/testgen-JAL-JALR.py
@ -19,9 +19,8 @@ from random import getrandbits
 from copy import deepcopy
 ##################################
-# functions
+# helper functions
 ##################################
 def InitTestGroup():
  global TestGroup,TestGroupSizes,AllRegs,UnusedRegs,StoreAdrReg
  TestGroup += 1
@ -44,19 +43,21 @@ def registerSelect():
  if len(UnusedRegs)==0: 
    InitTestGroup()
  rd = choice(UnusedRegs)
  rs = choice(UnusedRegs)
  UnusedRegs.remove(rd)
  OtherRegs = deepcopy(UnusedRegs)
  if 0 in OtherRegs: 
    OtherRegs.remove(0)
  if len(OtherRegs) == 0:
    OtherRegs = deepcopy(AllRegs)
    OtherRegs.remove(0)
  rs = choice(OtherRegs)
  OtherRegs = deepcopy(AllRegs)
  OtherRegs.remove(StoreAdrReg)
  OtherRegs.remove(rd)
-  try:
+  if 0 in OtherRegs: 
    OtherRegs.remove(0)
-  except:
+  if rs in OtherRegs: 
    pass
  try:
    OtherRegs.remove(rs)
  except:
    pass
  DataReg = choice(OtherRegs)
  OtherRegs.remove(DataReg)
  OtherRd = choice(OtherRegs)
@ -65,52 +66,74 @@ def registerSelect():
 def addInst(line):
  global CurrAdr
  f.write(line)
-  if ("li x" in line):
+  if ("li x" in line) and ("slli x" not in line):
    CurrAdr += 8 if (xlen == 32) else 20
  elif ("la x" in line):
    CurrAdr += 8
  else:
    CurrAdr += 4
-def writeForwardsJumpVector(spacers):
+def expectValue(expectReg, expectVal, sigOffset):
-  global TestNum
+  global TestGroupSizes
-  rd, rs, DataReg, OtherRd = registerSelect()
+  TestGroupSizes[TestGroup-1] += 1
-  if (xlen==64):
+  addInst("    "+storecmd+" x"+str(expectReg)+", "+str(wordsize*sigOffset)+"(x"+str(StoreAdrReg)+")\n")
-    expected = int("fedbca9876540000",16)
+  f.write("    RVTEST_IO_ASSERT_GPR_EQ(x"+str(StoreAdrReg+1)+", x"+str(expectReg)+", "+formatstr.format(expectVal)+")\n")
-    unexpected = int("ffff0000ffff0000",16)
+  if (xlen == 32):
    r.write(formatrefstr.format(expectVal)+"\n")
  else:
-    expected = int("fedbca98",16)
+    r.write(formatrefstr.format(expectVal % 2**32)+"\n" + formatrefstr.format(expectVal >> 32)+"\n")
    unexpected = int("ff00ff00",16)
 def addJalr(rs,rd,dist):
   # Emit a five-instruction sequence (lui, addi, slli, addi, JALR) that builds a jump
   # address in x<rs> and then jumps through it, writing the link into x<rd>.
   #   rs   - number of the register used to build the address
   #   rd   - number of the JALR destination register
   #   dist - byte distance added to CurrAdr + 20 to form the jump target
   # The 20 matches the five 4-byte instructions emitted below.
   target = CurrAdr + 20 + dist
   # NOTE(review): the upper bits come from CurrAdr, not from target, so target11_0 is
   # target's offset from CurrAdr's 4 KiB page and is not limited to 0..0xfff -- confirm
   # this is intended.
   target31_12 = CurrAdr >> 12 # 20 bits for lui
   target11_0 = target - (target31_12 << 12) # 12 remaining bits
   target31_16 = target31_12 >> 4 # lui sign extends, so shift in a leading 0
   target15_12 = target31_12 - (target31_16 << 4) # the nibble we just lost
   # offset is a random displacement that the final addi adds into rs and the JALR
   # immediate takes back out (it is emitted negated), so the jump still lands on target.
   if target11_0 > 0:
     offset = randint(-(1<<11)-1,(1<<11)-2-target11_0)
   else:
     offset = randint(-(1<<11)-1-target11_0,(1<<11)-2)
   # lui + addi place target31_16 and target15_12 four bits too low; slli by 4 moves
   # them up into bits 31:12 of rs.
   addInst("    lui x"+str(rs)+", 0x"+imm20formatstr.format(target31_16)+"\n")
   addInst("    addi x"+str(rs)+", x"+str(rs)+", SEXT_IMM(0x0"+imm12formatstr.format(target15_12 << 8)+")\n")
   addInst("    slli x"+str(rs)+", x"+str(rs)+", SEXT_IMM(4)\n") 
   # randint(0,1) randomly adds 1 to the low bits; presumably this exercises JALR
   # clearing bit 0 of the computed target -- confirm.
   addInst("    addi x"+str(rs)+", x"+str(rs)+", SEXT_IMM(0x"+imm12formatstr.format(0xfff&(offset+target11_0+randint(0,1)))+")\n")
   addInst("    JALR x"+str(rd)+", x"+str(rs)+", SEXT_IMM(0x"+imm12formatstr.format(0xfff&(-offset))+")\n")
 ##################################
 # test functions
 ##################################
 def writeForwardsJumpVector(spacers,instr):
  global TestNum
  TestNum += 1
  rd, rs, DataReg, OtherRd = registerSelect()
  # Header
  f.write("\n")
-  f.write("    # Testcase "+str(TestNum)+"  address cmp result rd:x"+str(rd)+"("+formatstr.format(CurrAdr+44)+")  data result rd:x"+str(DataReg)+"("+formatstr.format(expected)+")\n")
+  f.write("    # Testcase "+str(TestNum)+"\n")
  # Test Code
  addInst("    li x"+str(DataReg)+", "+formatstr.format(expected)+"\n")
-  addInst("    JAL x"+str(rd)+", 1f\n")
+  if (instr=="JAL"):
    addInst("    JAL x"+str(rd)+", 1f\n")
  elif (instr=="JALR"):
    dist = spacers*(8 if (xlen == 32) else 20) # Compute distance from linked adr to target adr
    addJalr(rs,rd,dist);
  else:
    exit("invalid instruction") 
  LinkAdr = CurrAdr if (rd!=0) else 0 # rd's expected value
  for i in range(spacers):
    addInst("    li x"+str(DataReg)+", "+formatstr.format(unexpected)+"\n")
  f.write("1:\n")
-  addInst("    "+storecmd+" x"+str(rd)+", "+str(wordsize*(2*TestNum+0))+"(x"+str(StoreAdrReg)+")\n")
+  # Store values to be verified
-  f.write("    RVTEST_IO_ASSERT_GPR_EQ(x"+str(StoreAdrReg+1)+", x"+str(rd)+", "+formatstr.format(LinkAdr)+")\n")
+  expectValue(rd, LinkAdr, 2*TestNum+0)
-  addInst("    "+storecmd+" x"+str(DataReg)+", "+str(wordsize*(2*TestNum+1))+"(x"+str(StoreAdrReg)+")\n")
+  expectValue(DataReg, expected, 2*TestNum+1)
  f.write("    RVTEST_IO_ASSERT_GPR_EQ(x"+str(StoreAdrReg+1)+", x"+str(DataReg)+", "+formatstr.format(expected)+")\n")
  writeExpectedToRef(LinkAdr)
  writeExpectedToRef(expected) 
  TestNum = TestNum+1
-def writeBackwardsJumpVector(spacers):
+def writeBackwardsJumpVector(spacers,instr):
  global TestNum
-  rd, rs, DataReg,OtherRd = registerSelect()
+  TestNum += 1
-  if (xlen==64):
+  rd, rs, DataReg, OtherRd = registerSelect()
-    expected = int("fedbca9876540000",16)
+  # Header
    unexpected = int("ffff0000ffff0000",16)
  else:
    expected = int("fedbca98",16)
    unexpected = int("ff00ff00",16)
  f.write("\n")
-  f.write("    # Testcase "+str(TestNum)+"  address cmp result rd:x"+str(rd)+"("+formatstr.format(CurrAdr+20+8*spacers)+")  data result rd:x"+str(DataReg)+"("+formatstr.format(expected)+")\n")
+  f.write("    # Testcase "+str(TestNum)+"\n")
  # Test Code
  addInst("    JAL x"+str(OtherRd)+", 2f\n")
  f.write("1:\n")
  addInst("    li x"+str(DataReg)+", "+formatstr.format(expected)+"\n")
@ -118,29 +141,27 @@ def writeBackwardsJumpVector(spacers):
  f.write("2:\n")
  for i in range(spacers):
    addInst("    li x"+str(DataReg)+", "+formatstr.format(unexpected)+"\n")
-  addInst("    JAL x"+str(rd)+", 1b\n")
+  if (instr=="JAL"):
    addInst("    JAL x"+str(rd)+", 1b\n")
  elif (instr=="JALR"):
    dist = -20 - 4 - (1+spacers)*(8 if (xlen == 32) else 20) # Compute distance from linked adr to target adr
    addJalr(rs,rd,dist);
  else:
    exit("invalid instruction") 
  LinkAdr = CurrAdr if (rd!=0) else 0 # rd's expected value
  f.write("3:\n")
-  addInst("    "+storecmd+" x"+str(rd)+", "+str(wordsize*(2*TestNum+0))+"(x"+str(StoreAdrReg)+")\n")
+  # Store values to be verified
-  f.write("    RVTEST_IO_ASSERT_GPR_EQ(x"+str(StoreAdrReg+1)+", x"+str(rd)+", "+formatstr.format(LinkAdr)+")\n")
+  expectValue(rd, LinkAdr, 2*TestNum+0)
-  addInst("    "+storecmd+" x"+str(DataReg)+", "+str(wordsize*(2*TestNum+1))+"(x"+str(StoreAdrReg)+")\n")
+  expectValue(DataReg, expected, 2*TestNum+1)
  f.write("    RVTEST_IO_ASSERT_GPR_EQ(x"+str(StoreAdrReg+1)+", x"+str(DataReg)+", "+formatstr.format(expected)+")\n")
  writeExpectedToRef(LinkAdr)
  writeExpectedToRef(expected) 
  TestNum = TestNum+1
 def writeChainVector(repetitions,spacers):
  global TestNum
  TestNum += 1
  rd, rs, DataReg,OtherRd = registerSelect()
-  if (xlen==64):
+  # Header
    expected = int("fedbca9876540000",16)
    unexpected = int("ffff0000ffff0000",16)
  else:
    expected = int("fedbca98",16)
    unexpected = int("ff00ff00",16)
  f.write("\n")
-  f.write("    # Testcase "+str(TestNum)+"  address cmp result rd:x"+str(rd)+"(ugh; if you really wanted to, you could figure it out)  data result rd:x"+str(DataReg)+"("+formatstr.format(expected)+")\n")
+  f.write("    # Testcase "+str(TestNum)+"\n")
  # Test Code
  addInst("    li x"+str(DataReg)+", "+formatstr.format(expected)+"\n")
  for i in range(repetitions):
    addInst("    JAL x"+str(OtherRd)+", "+str(3*i+2)+"f\n")
@ -159,57 +180,56 @@ def writeChainVector(repetitions,spacers):
      for j in range(i):
        addInst("    li x"+str(DataReg)+", "+formatstr.format(unexpected)+"\n")
    f.write(str(3*i+3)+":\n")
-  addInst("    "+storecmd+" x"+str(rd)+", "+str(wordsize*(2*TestNum+0))+"(x"+str(StoreAdrReg)+")\n")
+  # Store values to be verified
-  f.write("    RVTEST_IO_ASSERT_GPR_EQ(x"+str(StoreAdrReg+1)+", x"+str(rd)+", "+formatstr.format(LinkAdr)+")\n")
+  expectValue(rd, LinkAdr, 2*TestNum+0)
-  addInst("    "+storecmd+" x"+str(DataReg)+", "+str(wordsize*(2*TestNum+1))+"(x"+str(StoreAdrReg)+")\n")
+  expectValue(DataReg, expected, 2*TestNum+1)
  f.write("    RVTEST_IO_ASSERT_GPR_EQ(x"+str(StoreAdrReg+1)+", x"+str(DataReg)+", "+formatstr.format(expected)+")\n")
  writeExpectedToRef(LinkAdr)
  writeExpectedToRef(expected) 
  TestNum = TestNum+1
 def writeExpectedToRef(expected):
  global TestGroupSizes
  TestGroupSizes[TestGroup-1] += 1
  if (xlen == 32):
    r.write(formatrefstr.format(expected)+"\n")
  else:
    r.write(formatrefstr.format(expected % 2**32)+"\n" + formatrefstr.format(expected >> 32)+"\n")
 ##################################
 # main body
 ##################################
 # change these to suit your tests
-tests = ["JAL"]
+test = 0
 tests = ["JAL","JALR"]
 author = "Ben Bracker (bbracker@hmc.edu)"
 xlens = [32,64]
-numtests = 100;
+numtests = 100
 # setup
 seed(0) # make tests reproducible
 # generate files for each test
-for xlen in xlens:
+for test in tests:
-  CurrAdr = int("80000108",16)
+  for xlen in xlens:
-  TestNum = 0
+    print(test+" "+str(xlen))
-  TestGroup = 1
+    CurrAdr = int("80000108",16)
-  TestGroupSizes = [0]
+    TestNum = -1
-  AllRegs = list(range(0,32))
+    TestGroup = 1
-  UnusedRegs = deepcopy(AllRegs) 
+    TestGroupSizes = [0]
-  StoreAdrReg = 6 # matches what's in header script 
+    AllRegs = list(range(0,32))
-  UnusedRegs.remove(6)
+    UnusedRegs = deepcopy(AllRegs) 
    StoreAdrReg = 6 # matches what's in header script 
    UnusedRegs.remove(6)
    if (xlen==64):
      expected = int("fedbca9876540000",16)
      unexpected = int("ffff0000ffff0000",16)
    else:
      expected = int("fedbca98",16)
      unexpected = int("ff00ff00",16)
    formatstrlen = str(int(xlen/4))
    formatstr = "0x{:0" + formatstrlen + "x}" # format as xlen-bit hexadecimal number
    formatrefstr = "{:08x}" # format as xlen-bit hexadecimal number with no leading 0x
    imm20formatstr = "{:05x}"
    imm12formatstr = "{:03x}"
    if (xlen == 32):
      storecmd = "sw"
      wordsize = 4
    else:
      storecmd = "sd"
      wordsize = 8
  formatstrlen = str(int(xlen/4))
  formatstr = "0x{:0" + formatstrlen + "x}" # format as xlen-bit hexadecimal number
  formatrefstr = "{:08x}" # format as xlen-bit hexadecimal number with no leading 0x
  if (xlen == 32):
    storecmd = "sw"
    wordsize = 4
  else:
    storecmd = "sd"
    wordsize = 8
  for test in tests:
    imperaspath = "../../imperas-riscv-tests/riscv-test-suite/rv" + str(xlen) + "i/"
    basename = "WALLY-" + test 
    fname = imperaspath + "src/" + basename + ".S"
@ -221,7 +241,7 @@ for xlen in xlens:
    f.write("///////////////////////////////////////////\n")
    f.write("// "+fname+ "\n")
    f.write("//\n")
-    f.write("// This file can be used to test the RISC-V JAL instruction.\n")
+    f.write("// This file can be used to test the RISC-V JAL(R) instruction.\n")
    f.write("// But be warned that altering the test environment may break this test!\n")
    f.write("// In order to work, this test expects that the first instruction (la)\n")
    f.write("// be allocated at 0x80000100.\n")
@ -235,14 +255,24 @@ for xlen in xlens:
      f.write(line)
    # print directed test vectors
-    for i in range(0,31):
+    if test == "JAL":
-      writeForwardsJumpVector(randint(0,4))
+      for i in range(0,31):
-    for i in range(0,31):
+        writeForwardsJumpVector(randint(0,4),"JAL")
-      writeBackwardsJumpVector(randint(0,4))
+      for i in range(0,31):
-    writeForwardsJumpVector(100)
+        writeBackwardsJumpVector(randint(0,4),"JAL")
-    writeBackwardsJumpVector(100)
+      writeForwardsJumpVector(100,"JAL")
-    writeChainVector(6,True)
+      writeBackwardsJumpVector(100,"JAL")
-    writeChainVector(16,False)
+      writeChainVector(6,True)
      writeChainVector(16,False)
    elif test == "JALR":
      for i in range(0,31):
        writeForwardsJumpVector(randint(0,4),"JALR")
      for i in range(0,31):
        writeBackwardsJumpVector(randint(0,4),"JALR")
      # can't make these latter two too long else 12 bit immediate overflows
      # (would need to lui or slli rs to achieve longer ranges)
      writeForwardsJumpVector(15,"JALR")
      writeBackwardsJumpVector(15,"JALR")
    # print footer
    h = open("testgen_footer.S", "r")
@ -254,7 +284,3 @@ for xlen in xlens:
    f.write("\nRV_COMPLIANCE_DATA_END\n")
    f.close()
    r.close()