Merge from branch 'main'

2025-02-11 06:05:49 +00:00 · 2021-04-08 17:19:34 -04:00 · 2021-04-08 17:19:34 -04:00 · d99b8f772e
commit d99b8f772e
parent 5afb255251 1ee8feffe5
140 changed files with 314145 additions and 106050 deletions
--- a/wally-pipelined/config/busybear/wally-config.vh
+++ b/wally-pipelined/config/busybear/wally-config.vh
@ -93,6 +93,10 @@
 // Hardware configuration
 `define UART_PRESCALE 1

+// Interrupt configuration
+`define PLIC_NUM_SRC 53
+`define PLIC_UART_ID 4
+
 /* verilator lint_off STMTDLY */
 /* verilator lint_off WIDTH */

--- a/wally-pipelined/config/busybear/wally-constants.vh
+++ b/wally-pipelined/config/busybear/wally-constants.vh
@ -26,6 +26,8 @@
 ///////////////////////////////////////////

 // Virtual Memory Constants (sv39)
+`define VPN_SEGMENT_BITS 9
 `define VPN_BITS 27
 `define PPN_BITS 44
+`define PPN_HIGH_SEGMENT_BITS 26
 `define PA_BITS  56
--- a/wally-pipelined/config/coremark/wally-config.vh
+++ b/wally-pipelined/config/coremark/wally-config.vh
@ -90,6 +90,10 @@
 // Hardware configuration
 `define UART_PRESCALE 1

+// Interrupt configuration
+`define PLIC_NUM_SRC 53
+`define PLIC_UART_ID 4
+
 // Can add PLIC Config here
 // Num interrupt sources

--- a/wally-pipelined/config/coremark/wally-constants.vh
+++ b/wally-pipelined/config/coremark/wally-constants.vh
@ -26,6 +26,8 @@
 ///////////////////////////////////////////

 // Virtual Memory Constants (sv39)
+`define VPN_SEGMENT_BITS 9
 `define VPN_BITS 27
 `define PPN_BITS 44
+`define PPN_HIGH_SEGMENT_BITS 26
 `define PA_BITS  56
--- a/wally-pipelined/config/coremark_bare/wally-config.vh
+++ b/wally-pipelined/config/coremark_bare/wally-config.vh
@ -28,7 +28,7 @@
 `define XLEN 64

 //`define MISA (32'h00000104)
-`define MISA (32'h00000104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12)
+`define MISA (32'h00001104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12 | 1 << 0)
 `define A_SUPPORTED ((`MISA >> 0) % 2 == 1)
 `define C_SUPPORTED ((`MISA >> 2) % 2 == 1)
 `define D_SUPPORTED ((`MISA >> 3) % 2 == 1)
@ -90,6 +90,10 @@
 // Hardware configuration
 `define UART_PRESCALE 1

+// Interrupt configuration
+`define PLIC_NUM_SRC 53
+`define PLIC_UART_ID 4
+
 /* verilator lint_off STMTDLY */
 /* verilator lint_off WIDTH */
 /* verilator lint_off ASSIGNDLY */
--- a/wally-pipelined/config/coremark_bare/wally-constants.vh
+++ b/wally-pipelined/config/coremark_bare/wally-constants.vh
@ -26,6 +26,8 @@
 ///////////////////////////////////////////

 // Virtual Memory Constants (sv39)
+`define VPN_SEGMENT_BITS 9
 `define VPN_BITS 27
 `define PPN_BITS 44
+`define PPN_HIGH_SEGMENT_BITS 26
 `define PA_BITS  56
--- a/wally-pipelined/config/rv32ic/wally-config.vh
+++ b/wally-pipelined/config/rv32ic/wally-config.vh
@ -27,7 +27,7 @@
 // RV32 or RV64: XLEN = 32 or 64
 `define XLEN 32

-`define MISA (32'h00000104 | 1 << 12)
+`define MISA (32'h00000104 | 1 << 20 | 1 << 18 | 1 << 12)
 `define A_SUPPORTED ((`MISA >> 0) % 2 == 1)
 `define C_SUPPORTED ((`MISA >> 2) % 2 == 1)
 `define D_SUPPORTED ((`MISA >> 3) % 2 == 1)
@ -53,7 +53,7 @@
 `define MEM_DCACHE 0
 `define MEM_DTIM 1
 `define MEM_ICACHE 0
-`define MEM_VIRTMEM 0
+`define MEM_VIRTMEM 1

 // Address space
 `define RESET_VECTOR 32'h80000000
@ -89,6 +89,10 @@
 // Hardware configuration
 `define UART_PRESCALE 1

+// Interrupt configuration
+`define PLIC_NUM_SRC 53
+`define PLIC_UART_ID 4
+
 /* verilator lint_off STMTDLY */
 /* verilator lint_off WIDTH */

--- a/wally-pipelined/config/rv32ic/wally-constants.vh
+++ b/wally-pipelined/config/rv32ic/wally-constants.vh
@ -26,6 +26,8 @@
 ///////////////////////////////////////////

 // Virtual Memory Constants (sv32)
+`define VPN_SEGMENT_BITS 10
 `define VPN_BITS 20
 `define PPN_BITS 22
+`define PPN_HIGH_SEGMENT_BITS 12
 `define PA_BITS  34
--- a/wally-pipelined/config/rv64ic/wally-config.vh
+++ b/wally-pipelined/config/rv64ic/wally-config.vh
@ -54,7 +54,7 @@
 `define MEM_DCACHE 0
 `define MEM_DTIM 1
 `define MEM_ICACHE 0
-`define MEM_VIRTMEM 0
+`define MEM_VIRTMEM 1

 // Address space
 `define RESET_VECTOR 64'h0000000080000000
@ -90,6 +90,10 @@
 // Hardware configuration
 `define UART_PRESCALE 1

+// Interrupt configuration
+`define PLIC_NUM_SRC 4
+`define PLIC_UART_ID 4
+
 /* verilator lint_off STMTDLY */
 /* verilator lint_off WIDTH */
 /* verilator lint_off ASSIGNDLY */
--- a/wally-pipelined/config/rv64ic/wally-constants.vh
+++ b/wally-pipelined/config/rv64ic/wally-constants.vh
@ -26,6 +26,8 @@
 ///////////////////////////////////////////

 // Virtual Memory Constants (sv39)
+`define VPN_SEGMENT_BITS 9
 `define VPN_BITS 27
 `define PPN_BITS 44
+`define PPN_HIGH_SEGMENT_BITS 26
 `define PA_BITS  56
--- a/wally-pipelined/config/rv64icfd/wally-config.vh
+++ b/wally-pipelined/config/rv64icfd/wally-config.vh
@ -54,7 +54,7 @@
 `define MEM_DCACHE 0
 `define MEM_DTIM 1
 `define MEM_ICACHE 0
-`define MEM_VIRTMEM 0
+`define MEM_VIRTMEM 1

 // Address space
 `define RESET_VECTOR 64'h0000000080000000
--- a/wally-pipelined/config/rv64icfd/wally-constants.vh
+++ b/wally-pipelined/config/rv64icfd/wally-constants.vh
@ -26,6 +26,8 @@
 ///////////////////////////////////////////

 // Virtual Memory Constants (sv39)
+`define VPN_SEGMENT_BITS 9
 `define VPN_BITS 27
 `define PPN_BITS 44
+`define PPN_HIGH_SEGMENT_BITS 26
 `define PA_BITS  56
--- a/wally-pipelined/config/rv64imc/BTBPredictor.txt
+++ b/wally-pipelined/config/rv64imc/BTBPredictor.txt
--- a/wally-pipelined/config/rv64imc/twoBitPredictor.txt
+++ b/wally-pipelined/config/rv64imc/twoBitPredictor.txt
--- a/wally-pipelined/config/rv64imc/wally-config.vh
+++ b/wally-pipelined/config/rv64imc/wally-config.vh
@ -0,0 +1,104 @@
+//////////////////////////////////////////
+// wally-config.vh
+//
+// Written: David_Harris@hmc.edu 4 January 2021
+// Modified: 
+//
+// Purpose: Specify which features are configured
+//          Macros to determine which modes are supported based on MISA
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// RV32 or RV64: XLEN = 32 or 64
+`define XLEN 64
+
+//`define MISA (32'h00000105)
+`define MISA (32'h00001104 | 1<<5 | 1<<18 | 1 << 20 | 1 << 12 | 1 << 0)
+`define A_SUPPORTED ((`MISA >> 0) % 2 == 1)
+`define C_SUPPORTED ((`MISA >> 2) % 2 == 1)
+`define D_SUPPORTED ((`MISA >> 3) % 2 == 1)
+`define F_SUPPORTED ((`MISA >> 5) % 2 == 1)
+`define M_SUPPORTED ((`MISA >> 12) % 2 == 1)
+`define S_SUPPORTED ((`MISA >> 18) % 2 == 1)
+`define U_SUPPORTED ((`MISA >> 20) % 2 == 1)
+`define ZCSR_SUPPORTED 1
+`define COUNTERS 31
+`define ZCOUNTERS_SUPPORTED 1
+// N-mode user-level interrupts are deprecated per Andrew Waterman 1/13/21
+//`define N_SUPPORTED ((MISA >> 13) % 2 == 1)
+`define N_SUPPORTED 0
+
+`define M_MODE (2'b11)
+`define S_MODE (2'b01)
+`define U_MODE (2'b00)
+
+// Microarchitectural Features
+`define UARCH_PIPELINED 1
+`define UARCH_SUPERSCALR 0
+`define UARCH_SINGLECYCLE 0
+`define MEM_DCACHE 0
+`define MEM_DTIM 1
+`define MEM_ICACHE 0
+`define MEM_VIRTMEM 0
+
+// Address space
+`define RESET_VECTOR 64'h0000000080000000
+
+// Bus Interface width
+`define AHBW 64
+
+// Peripheral Physical Addresses
+// Peripheral memory space extends from BASE to BASE+RANGE
+// Range should be a thermometer code with 0's in the upper bits and 1s in the lower bits
+
+`define BOOTTIMBASE   32'h00000000
+`define BOOTTIMRANGE  32'h00003FFF
+`define TIMBASE    32'h80000000
+`define TIMRANGE   32'h0007FFFF
+`define CLINTBASE  32'h02000000
+`define CLINTRANGE 32'h0000FFFF
+`define GPIOBASE   32'h10012000
+`define GPIORANGE  32'h000000FF
+`define UARTBASE   32'h10000000
+`define UARTRANGE  32'h00000007
+`define PLICBASE   32'h0C000000
+`define PLICRANGE  32'h03FFFFFF
+
+// Test modes
+
+// Tie GPIO outputs back to inputs
+`define GPIO_LOOPBACK_TEST 0
+
+// Busybear special CSR config to match OVPSim
+`define OVPSIM_CSR_CONFIG 0
+
+// Hardware configuration
+`define UART_PRESCALE 1
+
+// Interrupt configuration
+`define PLIC_NUM_SRC 53
+`define PLIC_UART_ID 4
+
+/* verilator lint_off STMTDLY */
+/* verilator lint_off WIDTH */
+/* verilator lint_off ASSIGNDLY */
+/* verilator lint_off PINCONNECTEMPTY */
+
+`define TWO_BIT_PRELOAD "../config/rv64ic/twoBitPredictor.txt"
+`define BTB_PRELOAD "../config/rv64ic/BTBPredictor.txt"
+`define BPTYPE "BPGSHARE" // BPGLOBAL or BPTWOBIT or BPGSHARE
--- a/wally-pipelined/config/rv64imc/wally-constants.vh
+++ b/wally-pipelined/config/rv64imc/wally-constants.vh
@ -0,0 +1,33 @@
+//////////////////////////////////////////
+// wally-constants.vh
+//
+// Written: tfleming@hmc.edu 4 March 2021
+// Modified:
+//
+// Purpose: Specify certain constants defined in the RISC-V 64-bit architecture.
+//          These macros should not be changed, except in the event of an
+//          update to the architecture or particularly special circumstances.
+//
+// A component of the Wally configurable RISC-V project.
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// Virtual Memory Constants (sv39)
+`define VPN_SEGMENT_BITS 9
+`define VPN_BITS 27
+`define PPN_BITS 44
+`define PPN_HIGH_SEGMENT_BITS 26
+`define PA_BITS  56
--- a/wally-pipelined/lint-wally
+++ b/wally-pipelined/lint-wally
@ -1,7 +1,10 @@
 # check for warnings in Verilog code
 # The verilator lint tool is faster and better than Modelsim so it is best to run this first.

+echo "rv64ic linting..."
 verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv64ic src/*/*.sv 
+echo "rv32ic linting..."
+verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv32ic src/*/*.sv 
 #verilator --lint-only --top-module wallypipelinedsoc -Iconfig/rv64ic src/*/*.sv src/*/div/*.sv

 # --lint-only just runs lint rather than trying to compile and simulate
--- a/wally-pipelined/regression/run_sim.sh
+++ b/wally-pipelined/regression/run_sim.sh
@ -0,0 +1,3 @@
+#!/bin/sh
+# Launch vsim and run the DO file or DO command string passed as the first argument.
+vsim -do "$1"
--- a/wally-pipelined/regression/sim-peripherals
+++ b/wally-pipelined/regression/sim-peripherals
@ -0,0 +1 @@
+vsim -do wally-peripherals.do
--- a/wally-pipelined/regression/sim-wally-batch-muldiv
+++ b/wally-pipelined/regression/sim-wally-batch-muldiv
@ -0,0 +1,3 @@
+vsim -c <<!
+do wally-pipelined-batch.do ../config/rv64imc rv64imc
+!
--- a/wally-pipelined/regression/sim-wally-batch-rv32ic
+++ b/wally-pipelined/regression/sim-wally-batch-rv32ic
@ -0,0 +1,3 @@
+vsim -c <<!
+do wally-pipelined-batch.do ../config/rv32ic rv32ic
+!
--- a/wally-pipelined/regression/sim-wally-rv32ic
+++ b/wally-pipelined/regression/sim-wally-rv32ic
@ -1,3 +1 @@
-vsim -c <<!
-do wally-pipelined-batch.do ../config/rv32ic rv32ic
-!
+vsim -do "do wally-pipelined.do ../config/rv32ic"
--- a/wally-pipelined/regression/vish_stacktrace.vstf
+++ b/wally-pipelined/regression/vish_stacktrace.vstf
@ -0,0 +1,3 @@
+# transcript error: error writing "stdout": broken pipe
+    while executing
+"puts -nonewline stdout $s"
--- a/wally-pipelined/regression/wally-busybear-batch.do
+++ b/wally-pipelined/regression/wally-busybear-batch.do
@ -26,7 +26,7 @@ vlib work-busybear
 # suppress spurious warnngs about 
 # "Extra checking for conflicts with always_comb done at vopt time"
 # because vsim will run vopt
-vlog +incdir+../config/busybear ../testbench/*.sv ../src/*/*.sv -suppress 2583
+vlog +incdir+../config/busybear ../testbench/testbench-busybear.sv ../src/*/*.sv -suppress 2583


 # start and run simulation
--- a/wally-pipelined/regression/wally-busybear.do
+++ b/wally-pipelined/regression/wally-busybear.do
@ -26,7 +26,7 @@ vlib work-busybear
 # suppress spurious warnngs about 
 # "Extra checking for conflicts with always_comb done at vopt time"
 # because vsim will run vopt
-vlog +incdir+../config/busybear ../testbench/*.sv ../src/*/*.sv -suppress 2583
+vlog +incdir+../config/busybear ../testbench/testbench-busybear.sv ../src/*/*.sv -suppress 2583


 # start and run simulation
--- a/wally-pipelined/regression/wally-pipelined-batch-muldiv.do
+++ b/wally-pipelined/regression/wally-pipelined-batch-muldiv.do
@ -0,0 +1,43 @@
+# wally-pipelined-batch-muldiv.do
+#
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
+# James Stine, 2008; David Harris 2021
+# Go Cowboys!!!!!!
+#
+# Takes 1:10 to run RV64IC tests using gui
+
+# Use this wally-pipelined-batch-muldiv.do file to run this example.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do wally-pipelined-batch-muldiv.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do wally-pipelined-batch-muldiv.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# create library
+if [file exists work_$2] {
+    vdel -lib work_$2 -all
+}
+vlib work_$2
+
+# compile source files
+# suppress spurious warnings about
+# "Extra checking for conflicts with always_comb done at vopt time"
+# because vsim will run vopt
+
+# default to config/rv64imc, but allow this to be overridden at the command line.  For example:
+# do wally-pipelined-batch-muldiv.do ../config/rv64imc rv64imc
+switch $argc {
+    0 {vlog +incdir+../config/rv64imc ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+    1 {vlog +incdir+$1 ../testbench/testbench-imperas.sv  ../src/*/*.sv -suppress 2583}
+    2 {vlog -work work_$2 +incdir+$1 ../testbench/testbench-imperas.sv  ../src/*/*.sv -suppress 2583}
+}
+# start and run simulation (NOTE(review): work_$2 is used for every $argc, but only the 2-argument vlog case passes -work work_$2 -- confirm 0/1-argument use)
+# remove +acc flag for faster sim during regressions if there is no need to access internal signals
+vopt work_$2.testbench -work work_$2 -o workopt_$2
+vsim -lib work_$2 workopt_$2
+
+run -all
+quit
--- a/wally-pipelined/regression/wally-pipelined-muldiv.do
+++ b/wally-pipelined/regression/wally-pipelined-muldiv.do
@ -1,10 +1,11 @@
-# wally-peripherals.do 
+# wally-pipelined-muldiv.do
 #
-# Created by Ben Bracker (bbracker@hmc.edu) on 11 Feb. 2021
-#
-# Based on wally-pipelined.do by 
+# Modification by Oklahoma State University & Harvey Mudd College
+# Use with Testbench 
 # James Stine, 2008; David Harris 2021
 # Go Cowboys!!!!!!
+#
+# Takes 1:10 to run RV64IC tests using gui

 # Use this wally-pipelined.do file to run this example.
 # Either bring up ModelSim and type the following at the "ModelSim>" prompt:
@ -28,10 +29,9 @@ vlib work

 # default to config/rv64ic, but allow this to be overridden at the command line.  For example:
 # do wally-pipelined.do ../config/rv32ic
-# That said, I don't think there are any peripherals that use anything but rv64i just yet.
 switch $argc {
-    0 {vlog +incdir+../config/rv64ic ../testbench/testbench-peripherals.sv ../src/*/*.sv -suppress 2583}
-    1 {vlog +incdir+$1 ../testbench/testbench-peripherals.sv ../src/*/*.sv -suppress 2583}
+    0 {vlog +incdir+../config/rv64imc ../testbench/testbench-imperas.sv ../src/*/*.sv -suppress 2583}
+    1 {vlog +incdir+$1 ../testbench/testbench-imperas.sv ../testbench/function_radix.sv ../src/*/*.sv -suppress 2583}
 }
 # start and run simulation
 # remove +acc flag for faster sim during regressions if there is no need to access internal signals
@ -40,4 +40,24 @@ vsim workopt


 view wave
-do wally-peripherals-signals.do
+
+-- display input and output signals as hexidecimal values
+do ./wave-dos/ahb-muldiv.do
+
+-- Set Wave Output Items 
+TreeUpdate [SetDefaultTree]
+WaveRestoreZoom {0 ps} {100 ps}
+configure wave -namecolwidth 250
+configure wave -valuecolwidth 140
+configure wave -justifyvalue left
+configure wave -signalnamewidth 0
+configure wave -snapdistance 10
+configure wave -datasetprefix 0
+configure wave -rowmargin 4
+configure wave -childrowmargin 2
+set DefaultRadix hexadecimal
+
+-- Run the Simulation 
+#run 4100
+run -all
+#quit
--- a/wally-pipelined/regression/wally-pipelined.do
+++ b/wally-pipelined/regression/wally-pipelined.do
@ -38,9 +38,7 @@ switch $argc {
 vopt +acc work.testbench -o workopt 
 vsim workopt

-
 view wave
-
 -- display input and output signals as hexidecimal values
 do ./wave-dos/cache-waves.do

@ -48,7 +46,7 @@ do ./wave-dos/cache-waves.do
 TreeUpdate [SetDefaultTree]
 WaveRestoreZoom {0 ps} {100 ps}
 configure wave -namecolwidth 250
-configure wave -valuecolwidth 140
+configure wave -valuecolwidth 120
 configure wave -justifyvalue left
 configure wave -signalnamewidth 0
 configure wave -snapdistance 10
@ -58,6 +56,8 @@ configure wave -childrowmargin 2
 set DefaultRadix hexadecimal

 -- Run the Simulation 
-#run 4100
+#run 5000 
 run -all
 #quit
+noview ../testbench/testbench-imperas.sv
+view wave
--- a/wally-pipelined/regression/wave-dos/ahb-muldiv.do
+++ b/wally-pipelined/regression/wave-dos/ahb-muldiv.do
@ -0,0 +1,96 @@
+add wave /testbench/clk
+add wave /testbench/reset
+add wave -divider
+
+#add wave /testbench/dut/hart/ebu/IReadF
+add wave -noupdate -divider -height 32 "Stalls"
+add wave /testbench/dut/hart/DataStall
+add wave /testbench/dut/hart/InstrStall
+add wave /testbench/dut/hart/StallF
+add wave /testbench/dut/hart/StallD
+add wave /testbench/dut/hart/StallE
+add wave /testbench/dut/hart/StallM
+add wave /testbench/dut/hart/StallW
+add wave /testbench/dut/hart/FlushD
+add wave /testbench/dut/hart/FlushE
+add wave /testbench/dut/hart/FlushM
+add wave /testbench/dut/hart/FlushW
+
+add wave -noupdate -divider -height 32 "MulDiv"
+add wave -hex /testbench/dut/hart/mdu/*
+
+add wave -noupdate -divider -height 32 "Integer Divider"
+add wave -hex /testbench/dut/hart/mdu/genblk1/div/fsm1/CURRENT_STATE
+add wave -hex /testbench/dut/hart/mdu/genblk1/div/fsm1/NEXT_STATE
+add wave -hex /testbench/dut/hart/mdu/genblk1/div/*
+
+
+
+add wave -noupdate -divider -height 32 "RF"
+add wave -hex /testbench/dut/hart/ieu/dp/regf/*
+add wave -hex /testbench/dut/hart/ieu/dp/regf/rf
+
+
+add wave -divider
+add wave -hex /testbench/dut/hart/ifu/PCF
+add wave -hex /testbench/dut/hart/ifu/PCD
+add wave -hex /testbench/dut/hart/ifu/InstrD
+
+add wave /testbench/InstrDName
+add wave -hex /testbench/dut/hart/ifu/ic/InstrRawD
+add wave -hex /testbench/dut/hart/ifu/ic/AlignedInstrD
+add wave -divider
+add wave -hex /testbench/dut/hart/ifu/ic/InstrPAdrF
+add wave /testbench/dut/hart/ifu/ic/DelayF
+add wave /testbench/dut/hart/ifu/ic/DelaySideF
+add wave /testbench/dut/hart/ifu/ic/DelayD
+add wave -hex /testbench/dut/hart/ifu/ic/MisalignedHalfInstrD
+add wave -divider
+
+add wave -hex /testbench/dut/hart/ifu/PCE
+add wave -hex /testbench/dut/hart/ifu/InstrE
+add wave /testbench/InstrEName
+add wave -hex /testbench/dut/hart/ieu/dp/SrcAE
+add wave -hex /testbench/dut/hart/ieu/dp/SrcBE
+add wave -hex /testbench/dut/hart/ieu/dp/ALUResultE
+#add wave /testbench/dut/hart/ieu/dp/PCSrcE
+add wave -divider
+
+add wave -hex /testbench/dut/hart/ifu/PCM
+add wave -hex /testbench/dut/hart/ifu/InstrM
+add wave /testbench/InstrMName
+add wave /testbench/dut/uncore/dtim/memwrite
+add wave -hex /testbench/dut/uncore/HADDR
+add wave -hex /testbench/dut/uncore/HWDATA
+add wave -divider
+
+add wave -hex /testbench/dut/hart/ebu/MemReadM
+add wave -hex /testbench/dut/hart/ebu/InstrReadF
+add wave -hex /testbench/dut/hart/ebu/BusState
+add wave -hex /testbench/dut/hart/ebu/NextBusState
+add wave -hex /testbench/dut/hart/ebu/HADDR
+add wave -hex /testbench/dut/hart/ebu/HREADY
+add wave -hex /testbench/dut/hart/ebu/HTRANS
+add wave -hex /testbench/dut/hart/ebu/HRDATA
+add wave -hex /testbench/dut/hart/ebu/HWRITE
+add wave -hex /testbench/dut/hart/ebu/HWDATA
+add wave -hex /testbench/dut/hart/ebu/CaptureDataM
+add wave -hex /testbench/dut/hart/ebu/InstrStall
+add wave -divider
+
+add wave -hex /testbench/dut/uncore/dtim/*
+add wave -divider
+
+add wave -hex /testbench/dut/hart/ifu/PCW
+add wave -hex /testbench/dut/hart/ifu/InstrW
+add wave /testbench/InstrWName
+add wave /testbench/dut/hart/ieu/dp/RegWriteW
+add wave -hex /testbench/dut/hart/ebu/ReadDataW
+add wave -hex /testbench/dut/hart/ieu/dp/ResultW
+add wave -hex /testbench/dut/hart/ieu/dp/RdW
+add wave -divider
+
+add wave -hex /testbench/dut/uncore/dtim/*
+add wave -divider
+
+add wave -hex -r /testbench/*
--- a/wally-pipelined/regression/wave-dos/peripheral-waves.do
+++ b/wally-pipelined/regression/wave-dos/peripheral-waves.do
@ -10,25 +10,26 @@ restart -f
 delete wave /*
 view wave

-- display input and output signals as hexidecimal values
-# Diplays All Signals recursively
+# general stuff
 add wave /testbench/clk
 add wave /testbench/reset
 add wave -divider
+
 add wave /testbench/dut/hart/DataStall
 add wave /testbench/dut/hart/InstrStall
 add wave /testbench/dut/hart/StallF
 add wave /testbench/dut/hart/StallD
+add wave /testbench/dut/hart/StallE
+add wave /testbench/dut/hart/StallM
+add wave /testbench/dut/hart/StallW
 add wave /testbench/dut/hart/FlushD
 add wave /testbench/dut/hart/FlushE
 add wave /testbench/dut/hart/FlushM
 add wave /testbench/dut/hart/FlushW
-
 add wave -divider
+
 add wave -hex /testbench/dut/hart/ifu/PCF
-add wave -hex /testbench/dut/hart/ifu/InstrF
-add wave /testbench/InstrFName
-#add wave -hex /testbench/dut/hart/ifu/PCD
+add wave -hex /testbench/dut/hart/ifu/PCD
 add wave -hex /testbench/dut/hart/ifu/InstrD
 add wave /testbench/InstrDName
 add wave -divider
@ -38,6 +39,7 @@ add wave /testbench/InstrEName
 add wave -hex /testbench/dut/hart/ieu/dp/SrcAE
 add wave -hex /testbench/dut/hart/ieu/dp/SrcBE
 add wave -hex /testbench/dut/hart/ieu/dp/ALUResultE
+#add wave /testbench/dut/hart/ieu/dp/PCSrcE
 add wave -divider
 add wave -hex /testbench/dut/hart/ifu/PCM
 add wave -hex /testbench/dut/hart/ifu/InstrM
@ -46,35 +48,26 @@ add wave /testbench/dut/uncore/dtim/memwrite
 add wave -hex /testbench/dut/uncore/HADDR
 add wave -hex /testbench/dut/uncore/HWDATA
 add wave -divider
-add wave -hex /testbench/dut/hart/ifu/PCW
+add wave -hex /testbench/PCW
+add wave -hex /testbench/InstrW
 add wave /testbench/InstrWName
 add wave /testbench/dut/hart/ieu/dp/RegWriteW
 add wave -hex /testbench/dut/hart/ieu/dp/ResultW
 add wave -hex /testbench/dut/hart/ieu/dp/RdW
 add wave -divider
+add wave -divider
+
+# peripherals
 add wave -hex /testbench/dut/hart/ebu/*
 add wave -divider
 add wave -hex /testbench/dut/uncore/uart/u/*
 add wave -divider
-#add ww
+add wave -hex /testbench/dut/uncore/plic/*
+add wave -hex /testbench/dut/uncore/plic/intPriority
+add wave -hex /testbench/dut/uncore/plic/pendingArray
+add wave -divider
+add wave -divider
+
+# everything else
 add wave -hex -r /testbench/*

-- Set Wave Output Items 
-TreeUpdate [SetDefaultTree]
-WaveRestoreZoom {0 ps} {100 ps}
-configure wave -namecolwidth 250
-configure wave -valuecolwidth 120
-configure wave -justifyvalue left
-configure wave -signalnamewidth 0
-configure wave -snapdistance 10
-configure wave -datasetprefix 0
-configure wave -rowmargin 4
-configure wave -childrowmargin 2
-set DefaultRadix hexadecimal
-
-- Run the Simulation 
-#run 5000 
-run -all
-#quit
-noview ../testbench/testbench-peripherals.sv
-view wave
--- a/wally-pipelined/regression/wave.do
+++ b/wally-pipelined/regression/wave.do
@ -8,33 +8,42 @@ add wave -noupdate -expand -group {Execution Stage} /testbench/functionRadix/fun
 add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/PCE
 add wave -noupdate -expand -group {Execution Stage} /testbench/InstrEName
 add wave -noupdate -expand -group {Execution Stage} /testbench/dut/hart/ifu/InstrE
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrMisalignedFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrAccessFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/IllegalInstrFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/BreakpointFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/LoadMisalignedFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/StoreMisalignedFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/LoadAccessFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/StoreAccessFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/EcallFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InstrPageFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/LoadPageFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/StorePageFaultM
-add wave -noupdate -group HDU -expand -group traps /testbench/dut/hart/priv/trap/InterruptM
-add wave -noupdate -group HDU -expand -group hazards /testbench/dut/hart/hzu/BPPredWrongE
-add wave -noupdate -group HDU -expand -group hazards /testbench/dut/hart/hzu/CSRWritePendingDEM
-add wave -noupdate -group HDU -expand -group hazards /testbench/dut/hart/hzu/RetM
-add wave -noupdate -group HDU -expand -group hazards /testbench/dut/hart/hzu/TrapM
-add wave -noupdate -group HDU -expand -group hazards /testbench/dut/hart/hzu/LoadStallD
-add wave -noupdate -group HDU -expand -group hazards /testbench/dut/hart/hzu/InstrStall
-add wave -noupdate -group HDU -expand -group hazards /testbench/dut/hart/hzu/DataStall
-add wave -noupdate -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/hzu/FlushF
-add wave -noupdate -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushD
-add wave -noupdate -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushE
-add wave -noupdate -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushM
-add wave -noupdate -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushW
-add wave -noupdate -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallF
-add wave -noupdate -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallD
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrMisalignedFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrAccessFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/IllegalInstrFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/BreakpointFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadMisalignedFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StoreMisalignedFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadAccessFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StoreAccessFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/EcallFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InstrPageFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/LoadPageFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/StorePageFaultM
+add wave -noupdate -expand -group HDU -group traps /testbench/dut/hart/priv/trap/InterruptM
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/BPPredWrongE
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/CSRWritePendingDEM
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/RetM
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/TrapM
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/LoadStallD
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/InstrStall
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/hzu/DataStall
+add wave -noupdate -expand -group HDU -group hazards /testbench/dut/hart/MulDivStallD
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/hzu/FlushF
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushD
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushE
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushM
+add wave -noupdate -expand -group HDU -expand -group Flush -color Yellow /testbench/dut/hart/FlushW
+add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallF
+add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallD
+add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallE
+add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallM
+add wave -noupdate -expand -group HDU -expand -group Stall -color Orange /testbench/dut/hart/StallW
+add wave -noupdate /testbench/dut/hart/hzu/StallFCause_Q
+add wave -noupdate /testbench/dut/hart/hzu/StallDCause_Q
+add wave -noupdate /testbench/dut/hart/hzu/StallECause_Q
+add wave -noupdate /testbench/dut/hart/hzu/StallMCause_Q
+add wave -noupdate /testbench/dut/hart/hzu/StallWCause_Q
 add wave -noupdate -group Bpred -expand -group direction -divider Update
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/Predictor/DirPredictor/UpdatePC
 add wave -noupdate -group Bpred -expand -group direction /testbench/dut/hart/ifu/bpred/Predictor/DirPredictor/UpdateEN
@ -53,7 +62,6 @@ add wave -noupdate -group Bpred -group BTB -divider Lookup
 add wave -noupdate -group Bpred -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/TargetPC
 add wave -noupdate -group Bpred -group BTB /testbench/dut/hart/ifu/bpred/TargetPredictor/Valid
 add wave -noupdate -group Bpred /testbench/dut/hart/ifu/bpred/BPPredWrongE
-add wave -noupdate -expand -group {instruction pipeline} /testbench/dut/hart/ifu/InstrF
 add wave -noupdate -expand -group {instruction pipeline} /testbench/dut/hart/ifu/InstrD
 add wave -noupdate -expand -group {instruction pipeline} /testbench/dut/hart/ifu/InstrE
 add wave -noupdate -expand -group {instruction pipeline} /testbench/dut/hart/ifu/InstrM
@ -82,7 +90,6 @@ add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/we3
 add wave -noupdate -expand -group RegFile /testbench/dut/hart/ieu/dp/regf/wd3
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ALUResultW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ReadDataW
-add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/PCLinkW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/CSRReadValW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultSrcW
 add wave -noupdate -expand -group RegFile -group {write regfile mux} /testbench/dut/hart/ieu/dp/ResultW
@ -141,8 +148,32 @@ add wave -noupdate -group {function radix debug} /testbench/functionRadix/functi
 add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/FunctionAddr
 add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/ProgramAddrIndex
 add wave -noupdate -group {function radix debug} /testbench/functionRadix/function_radix/FunctionName
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/InstrD
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/SrcAE
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/SrcBE
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/Funct3E
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/MulDivE
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/W64E
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/StallM
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/StallW
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/FlushM
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/FlushW
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/MulDivResultW
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/genblk1/div/start
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/DivDoneE
+add wave -noupdate -expand -group muldiv /testbench/dut/hart/mdu/DivBusyE
+add wave -noupdate /testbench/dut/hart/mdu/genblk1/gclk
+add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/fsm1/CURRENT_STATE
+add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/N
+add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/D
+add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/Q
+add wave -noupdate -expand -group divider /testbench/dut/hart/mdu/genblk1/div/rem0
+add wave -noupdate /testbench/dut/hart/MulDivResultW
+add wave -noupdate /testbench/dut/hart/mdu/genblk1/PrelimResultE
+add wave -noupdate /testbench/dut/hart/mdu/Funct3E
+add wave -noupdate /testbench/dut/hart/mdu/genblk1/QuotE
 TreeUpdate [SetDefaultTree]
-WaveRestoreCursors {{Cursor 2} {3758805 ns} 0}
+WaveRestoreCursors {{Cursor 2} {128433 ns} 0}
 quietly wave cursor active 1
 configure wave -namecolwidth 250
 configure wave -valuecolwidth 229
@ -158,4 +189,4 @@ configure wave -griddelta 40
 configure wave -timeline 0
 configure wave -timelineunits ns
 update
-WaveRestoreZoom {1644110 ns} {15262484 ns}
+WaveRestoreZoom {128007 ns} {128663 ns}
--- a/wally-pipelined/src/cache/line.sv
+++ b/wally-pipelined/src/cache/line.sv
@ -1,68 +0,0 @@
-///////////////////////////////////////////
-// line.sv
-//
-// Written: jaallen@g.hmc.edu 2021-03-23
-// Modified: 
-//
-// Purpose: An implementation of a single cache line
-// 
-// A component of the Wally configurable RISC-V project.
-// 
-// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
-// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
-// is furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
-// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
-// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-///////////////////////////////////////////
-
-`include "wally-config.vh"
-
-// A read-only cache line ("write"ing to this line is loading new data, not writing to memory
-module rocacheline #(parameter LINESIZE = 256, parameter TAGSIZE = 32, parameter WORDSIZE = `XLEN) (
-    // Pipeline stuff
-    input  logic clk,
-    input  logic reset,
-    // If flush is high, invalidate this word
-    input  logic flush,
-    // Select which word within the line
-    input  logic [$clog2(LINESIZE/8)-1:0]   WordSelect,
-    // Write new data to the line
-    input  logic                            WriteEnable,
-    input  logic [LINESIZE-1:0]             WriteData,
-    input  logic [TAGSIZE-1:0]              WriteTag,
-    // Output the word, as well as the tag and if it is valid
-    output logic [WORDSIZE-1:0]             DataWord,
-    output logic [TAGSIZE-1:0]              DataTag,
-    output logic                            DataValid
-);
-
-    localparam integer OFFSETSIZE = $clog2(LINESIZE/8);
-    localparam integer NUMWORDS = LINESIZE/WORDSIZE;
-
-    logic [NUMWORDS-1:0][WORDSIZE-1:0]  DataLinesIn, DataLinesOut;
-
-    flopenr #(1)        ValidBitFlop(clk, reset, WriteEnable | flush, ~flush, DataValid);
-    flopenr #(TAGSIZE)  TagFlop(clk, reset, WriteEnable, WriteTag, DataTag);
-
-    genvar i;
-    generate
-        for (i=0; i < NUMWORDS; i++) begin
-            assign DataLinesIn[i] = WriteData[WORDSIZE*(i+1)-1:WORDSIZE*i];
-            flopenr #(WORDSIZE) LineFlop(clk, reset, WriteEnable, DataLinesIn[i], DataLinesOut[i]);
-        end
-    endgenerate
-
-
-    always_comb begin
-        assign DataWord = DataLinesOut[WordSelect[OFFSETSIZE-1:$clog2(WORDSIZE/8)]];
-    end
-
-endmodule
--- a/wally-pipelined/src/dmem/dmem.sv
+++ b/wally-pipelined/src/dmem/dmem.sv
@ -52,19 +52,23 @@ module dmem (
  // TLB management
  input logic  [1:0]       PrivilegeModeW,
  input logic  [`XLEN-1:0] PageTableEntryM,
+  input logic  [1:0]       PageTypeM,
  input logic  [`XLEN-1:0] SATP_REGW,
-  input logic              DTLBWriteM, // DTLBFlushM,
+  input logic              DTLBWriteM, DTLBFlushM,
  output logic             DTLBMissM, DTLBHitM
 );

+  logic             MemAccessM;  // Whether memory needs to be accessed
  logic             SquashSCM;
+  // *** needs to be sent to trap unit
+  logic             DTLBPageFaultM;

-  // *** temporary hack until walker is hooked up -- Thomas F
-  // logic  [`XLEN-1:0] PageTableEntryM = '0;
-  logic DTLBFlushM = '0;
-  // logic DTLBWriteM = '0;
-  tlb #(3) dtlb(clk, reset, SATP_REGW, PrivilegeModeW, MemAdrM, PageTableEntryM, DTLBWriteM,
-    DTLBFlushM, MemPAdrM, DTLBMissM, DTLBHitM);
+  tlb #(3) dtlb(.TLBAccess(MemAccessM), .VirtualAddress(MemAdrM),
+                .PageTableEntryWrite(PageTableEntryM), .PageTypeWrite(PageTypeM),
+                .TLBWrite(DTLBWriteM), .TLBFlush(DTLBFlushM),
+                .PhysicalAddress(MemPAdrM), .TLBMiss(DTLBMissM),
+                .TLBHit(DTLBHitM), .TLBPageFault(DTLBPageFaultM),
+                .*);

 	// Determine if an Unaligned access is taking place
 	always_comb
@ -78,11 +82,12 @@ module dmem (
  // Squash unaligned data accesses and failed store conditionals
  // *** this is also the place to squash if the cache is hit
  assign MemReadM = MemRWM[1] & ~DataMisalignedM;
-  assign MemWriteM = MemRWM[0] & ~DataMisalignedM && ~SquashSCM; 
+  assign MemWriteM = MemRWM[0] & ~DataMisalignedM && ~SquashSCM;
+  assign MemAccessM = |MemRWM;

  // Determine if address is valid
  assign LoadMisalignedFaultM = DataMisalignedM & MemRWM[1];
-  assign LoadAccessFaultM = DataAccessFaultM & MemRWM[0];
+  assign LoadAccessFaultM = DataAccessFaultM & MemRWM[1];
  assign StoreMisalignedFaultM = DataMisalignedM & MemRWM[0];
  assign StoreAccessFaultM = DataAccessFaultM & MemRWM[0];

@ -97,7 +102,7 @@ module dmem (
      assign scM = MemRWM[0] && AtomicM[0]; 
      assign WriteAdrMatchM = MemRWM[0] && (MemPAdrM[`XLEN-1:2] == ReservationPAdrW) && ReservationValidW;
      assign SquashSCM = scM && ~WriteAdrMatchM;
-      always_comb begin // ReservationValidM (next valiue of valid reservation)
+      always_comb begin // ReservationValidM (next value of valid reservation)
        if (lrM) ReservationValidM = 1;  // set valid on load reserve
        else if (scM || WriteAdrMatchM) ReservationValidM = 0; // clear valid on store to same address or any sc
        else ReservationValidM = ReservationValidW; // otherwise don't change valid
--- a/wally-pipelined/src/ebu/ahblite.sv
+++ b/wally-pipelined/src/ebu/ahblite.sv
@ -50,6 +50,7 @@ module ahblite (
  // Signals from MMU
  input  logic [`XLEN-1:0] MMUPAdr,
  input  logic             MMUTranslate, MMUTranslationComplete,
+  input  logic             TrapM,
  output logic [`XLEN-1:0] MMUReadPTE,
  output logic             MMUReady,
  // Return from bus
@ -105,16 +106,16 @@ module ahblite (
            else if (InstrReadF)   NextBusState = INSTRREAD;
            else                   NextBusState = IDLE;
      MMUTRANSLATE: if (~HREADY)   NextBusState = MMUTRANSLATE;
-            else                   NextBusState = MMUIDLE;
+            else                   NextBusState = IDLE;
      // *** Could the MMUIDLE state just be the normal idle state?
      // Do we trust MMUTranslate to be high exactly when we need translation?
-      MMUIDLE: if (~MMUTranslationComplete)
-                                   NextBusState = MMUTRANSLATE;
-            else if (AtomicM[1])   NextBusState = ATOMICREAD;
-            else if (MemReadM)     NextBusState = MEMREAD;  // Memory has priority over instructions
-            else if (MemWriteM)    NextBusState = MEMWRITE;
-            else if (InstrReadF)   NextBusState = INSTRREAD;
-            else                   NextBusState = IDLE;
+      // MMUIDLE: if (MMUTranslate)
+      //                              NextBusState = MMUTRANSLATE;
+      //       else if (AtomicM[1])   NextBusState = ATOMICREAD;
+      //       else if (MemReadM)     NextBusState = MEMREAD;  // Memory has priority over instructions
+      //       else if (MemWriteM)    NextBusState = MEMWRITE;
+      //       else if (InstrReadF)   NextBusState = INSTRREAD;
+      //       else                   NextBusState = IDLE;
      ATOMICREAD: if (~HREADY)     NextBusState = ATOMICREAD;
            else                   NextBusState = ATOMICWRITE;
      ATOMICWRITE: if (~HREADY)    NextBusState = ATOMICWRITE;
@ -134,8 +135,11 @@ module ahblite (
    endcase

  // stall signals
-  assign #2 DataStall = (NextBusState == MEMREAD) || (NextBusState == MEMWRITE) || 
-                        (NextBusState == ATOMICREAD) || (NextBusState == ATOMICWRITE);
+  // Note that we need to extend both stalls when MMUTRANSLATE goes to idle,
+  // since translation might not be complete.
+  assign #2 DataStall = ((NextBusState == MEMREAD) || (NextBusState == MEMWRITE) || 
+                    (NextBusState == ATOMICREAD) || (NextBusState == ATOMICWRITE) ||
+                    (NextBusState == MMUTRANSLATE) || (MMUTranslate && ~MMUTranslationComplete)); // && ~TrapM

  //  bus outputs
  assign #1 GrantData = (NextBusState == MEMREAD) || (NextBusState == MEMWRITE) || 
@ -154,7 +158,7 @@ module ahblite (
  assign HTRANS = (NextBusState != IDLE) ? 2'b10 : 2'b00; // NONSEQ if reading or writing, IDLE otherwise
  assign HMASTLOCK = 0; // no locking supported
  assign HWRITE = (NextBusState == MEMWRITE) || (NextBusState == ATOMICWRITE);
-  // delay write data by one cycle for 
+  // delay write data by one cycle per the AHB spec
  flop #(`XLEN) wdreg(HCLK, WriteData, HWDATA); // delay HWDATA by 1 cycle per spec; *** assumes AHBW = XLEN
  // delay signals for subword writes
  flop #(3)   adrreg(HCLK, HADDR[2:0], HADDRD);
@ -164,7 +168,7 @@ module ahblite (
    // Route signals to Instruction and Data Caches
  // *** assumes AHBW = XLEN

-  assign #1 MMUReady = (NextBusState == MMUIDLE);
+  assign MMUReady = (BusState == MMUTRANSLATE && NextBusState == IDLE);

  assign InstrRData = HRDATA;
  assign InstrAckF = (BusState == INSTRREAD) && (NextBusState != INSTRREAD) || (BusState == INSTRREADC) && (NextBusState != INSTRREADC);
--- a/wally-pipelined/src/ebu/pagetablewalker.sv
+++ b/wally-pipelined/src/ebu/pagetablewalker.sv
@ -27,46 +27,76 @@
 `include "wally-config.vh"
 `include "wally-constants.vh"

-module pagetablewalker (
-  input  logic             clk, reset,
+/* ***
+   TO-DO:
+    - Faults have a timing issue and currently do not work.
+    - Leaf state brings HADDR down to zeros (maybe fixed?)
+    - Complete rv64ic case
+    - Implement better accessed/dirty behavior
+    - Implement read/write/execute checking (either here or in TLB)
+*/

+module pagetablewalker (
+  // Control signals
+  input  logic             HCLK, HRESETn,
  input  logic [`XLEN-1:0] SATP_REGW,

-  input  logic             MemWriteM,
-  input  logic             ITLBMissF, DTLBMissM,
+  // Signals from TLBs (addresses to translate)
  input  logic [`XLEN-1:0] PCF, MemAdrM,
+  input  logic             ITLBMissF, DTLBMissM,
+  input  logic [1:0]       MemRWM,

+  // Outputs to the TLBs (PTEs to write)
  output logic [`XLEN-1:0] PageTableEntryF, PageTableEntryM,
+  output logic [1:0]       PageTypeF, PageTypeM,
  output logic             ITLBWriteF, DTLBWriteM,
-  // *** handshake to tlbs probably not needed, since stalls take effect
-  output logic             MMUTranslationComplete,

-  // Signals from and to ahblite
+  // Signals from ahblite (PTEs from memory)
  input  logic [`XLEN-1:0] MMUReadPTE,
  input  logic             MMUReady,

+  // Signals to ahblite (memory addresses to access)
  output logic [`XLEN-1:0] MMUPAdr,
  output logic             MMUTranslate,
+  output logic             MMUTranslationComplete,

  // Faults
  output logic             InstrPageFaultM, LoadPageFaultM, StorePageFaultM
 );

-  logic                 SvMode;
+  // Internal signals
+  logic                 SvMode, TLBMiss;
  logic [`PPN_BITS-1:0] BasePageTablePPN;
-  logic [`XLEN-1:0]     DirectInstrPTE, DirectMemPTE, TranslationVAdr;
+  logic [`XLEN-1:0]     TranslationVAdr;
+  logic [`XLEN-1:0]     SavedPTE, CurrentPTE;
+  logic [`PA_BITS-1:0]  TranslationPAdr;
+  logic [`PPN_BITS-1:0] CurrentPPN;
+  logic                 MemStore;

-  logic [9:0] DirectPTEFlags = {2'b0, 8'b00001111};
+  // PTE Control Bits
+  logic Dirty, Accessed, Global, User,
+        Executable, Writable, Readable, Valid;
+  // PTE descriptions
+  logic ValidPTE, AccessAlert, MegapageMisaligned, BadMegapage, LeafPTE;

-  // rv32 temp case
-  logic [`VPN_BITS-1:0] PCPageNumber;
-  logic [`VPN_BITS-1:0] MemAdrPageNumber;
+  // Outputs of walker
+  logic [`XLEN-1:0] PageTableEntry;
+  logic [1:0] PageType;
+
+  // Signals for direct, fake translations. Not part of the final Wally version.
+  logic [`XLEN-1:0]     DirectInstrPTE, DirectMemPTE;
+  logic [9:0]           DirectPTEFlags = {2'b0, 8'b00001111};
+
+  logic [`VPN_BITS-1:0] PCPageNumber, MemAdrPageNumber;

  assign BasePageTablePPN = SATP_REGW[`PPN_BITS-1:0];

+  assign MemStore = MemRWM[0];
+
  assign PCPageNumber = PCF[`VPN_BITS+11:12];
  assign MemAdrPageNumber = MemAdrM[`VPN_BITS+11:12];

+  // Create fake page table entries for direct virtual to physical translation
  generate
    if (`XLEN == 32) begin
      assign DirectInstrPTE = {PCPageNumber, DirectPTEFlags};
@ -77,36 +107,45 @@ module pagetablewalker (
    end
  endgenerate

-  //flopenr #(`XLEN) instrpte(clk, reset, ITLBMissF, DirectInstrPTE, PageTableEntryF);
-  //flopenr #(`XLEN)  datapte(clk, reset, DTLBMissM, DirectMemPTE, PageTableEntryM);
+  // Direct translation flops
+  //flopenr #(`XLEN) instrpte(HCLK, ~HRESETn, ITLBMissF, DirectInstrPTE, PageTableEntryF);
+  //flopenr #(`XLEN)  datapte(HCLK, ~HRESETn, DTLBMissM, DirectMemPTE, PageTableEntryM);

-  //flopr #(1) iwritesignal(clk, reset, ITLBMissF, ITLBWriteF);
-  //flopr #(1) dwritesignal(clk, reset, DTLBMissM, DTLBWriteM);
+  //flopr #(1) iwritesignal(HCLK, ~HRESETn, ITLBMissF, ITLBWriteF);
+  //flopr #(1) dwritesignal(HCLK, ~HRESETn, DTLBMissM, DTLBWriteM);

  // Prefer data address translations over instruction address translations
  assign TranslationVAdr = (DTLBMissM) ? MemAdrM : PCF;
  assign MMUTranslate = DTLBMissM || ITLBMissF;

+  // unswizzle PTE bits
+  assign {Dirty, Accessed, Global, User,
+          Executable, Writable, Readable, Valid} = CurrentPTE[7:0];
+
+  // Assign PTE descriptors common across all XLEN values
+  assign LeafPTE = Executable | Writable | Readable;
+  assign ValidPTE = Valid && ~(Writable && ~Readable);
+  assign AccessAlert = ~Accessed || (MemStore && ~Dirty);
+
+  // Assign specific outputs to general outputs
+  assign PageTableEntryF = PageTableEntry;
+  assign PageTableEntryM = PageTableEntry;
+  assign PageTypeF = PageType;
+  assign PageTypeM = PageType;
+
  generate
    if (`XLEN == 32) begin
+      logic [9:0] VPN1, VPN0;
+
      assign SvMode = SATP_REGW[31];

-      logic [9:0] VPN1 = TranslationVAdr[31:22];
-      logic [9:0] VPN0 = TranslationVAdr[21:12]; // *** could optimize by not passing offset?
-
-      logic [33:0] TranslationPAdr;
-      logic [21:0] CurrentPPN;
-
-      logic Dirty, Accessed, Global, User,
-            Executable, Writable, Readable, Valid;
-      logic ValidPTE, AccessAlert, MegapageMisaligned, BadMegapage, LeafPTE;
-
-      typedef enum {IDLE, LEVEL1, LEVEL0, LEAF, FAULT} statetype;
-      statetype WalkerState, NextWalkerState;
+      typedef enum {IDLE, LEVEL1, LEVEL0, LEAF, FAULT} walker_statetype;
+      walker_statetype WalkerState, NextWalkerState;

      // *** Do we need a synchronizer here for walker to talk to ahblite?
-      flopenl #(.TYPE(statetype)) mmureg(clk, reset, 1'b1, NextWalkerState, IDLE, WalkerState);
+      flopenl #(.TYPE(walker_statetype)) mmureg(HCLK, ~HRESETn, 1'b1, NextWalkerState, IDLE, WalkerState);

+      // State transition logic
      always_comb begin
        case (WalkerState)
          IDLE:   if      (MMUTranslate)           NextWalkerState = LEVEL1;
@ -129,62 +168,57 @@ module pagetablewalker (
        endcase
      end

-      // unswizzle PTE bits
-      assign {Dirty, Accessed, Global, User,
-              Executable, Writable, Readable, Valid} = MMUReadPTE[7:0];

      // A megapage is a Level 1 leaf page. This page must have zero PPN[0].
      assign MegapageMisaligned = |(CurrentPPN[9:0]);
-      assign LeafPTE = Executable | Writable | Readable;
-      assign ValidPTE = Valid && ~(Writable && ~Readable);
-      assign AccessAlert = ~Accessed || (MemWriteM && ~Dirty);
      assign BadMegapage = MegapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme

-      // *** Should translate this flop block into our flop module notation
-      always_ff @(posedge clk, negedge reset)
-        if (reset) begin
-          TranslationPAdr <= '0;
-          PageTableEntryF <= '0;
-          MMUTranslationComplete <= '0;
-          DTLBWriteM <= '0;
-          ITLBWriteF <= '0;
-          InstrPageFaultM <= '0;
-          LoadPageFaultM <= '0;
-          StorePageFaultM <= '0;
-        end else begin
-          // default values
-          TranslationPAdr <= '0;
-          PageTableEntryF <= '0;
-          MMUTranslationComplete <= '0;
-          DTLBWriteM <= '0;
-          ITLBWriteF <= '0;
-          InstrPageFaultM <= '0;
-          LoadPageFaultM <= '0;
-          StorePageFaultM <= '0;
-          case (NextWalkerState)
-            LEVEL1: begin
-              TranslationPAdr <= {BasePageTablePPN, VPN1, 2'b00};
-            end
-            LEVEL0: begin
-              TranslationPAdr <= {CurrentPPN, VPN0, 2'b00};
-            end
-            LEAF: begin
-              PageTableEntryF <= MMUReadPTE;
-              PageTableEntryM <= MMUReadPTE;
-              MMUTranslationComplete <= '1;
-              DTLBWriteM <= DTLBMissM;
-              ITLBWriteF <= ~DTLBMissM;  // Prefer data over instructions
-            end
-            FAULT: begin
-              InstrPageFaultM <= ~DTLBMissM;
-              LoadPageFaultM <= DTLBMissM && ~MemWriteM;
-              StorePageFaultM <= DTLBMissM && MemWriteM;
-            end
-          endcase
-        end
+      assign VPN1 = TranslationVAdr[31:22];
+      assign VPN0 = TranslationVAdr[21:12]; // *** could optimize by not passing offset?

-      // Interpret inputs from ahblite
-      assign CurrentPPN = MMUReadPTE[31:10];
+      // Assign combinational outputs
+      always_comb begin  // walker outputs, decoded from NextWalkerState
+        // Default values; the case arm selected below overrides what it needs
+        TranslationPAdr = '0;
+        PageTableEntry = '0;
+        PageType = '0;
+        MMUTranslationComplete = '0;
+        DTLBWriteM = '0;
+        ITLBWriteF = '0;
+        InstrPageFaultM = '0;
+        LoadPageFaultM = '0;
+        StorePageFaultM = '0;
+
+        case (NextWalkerState)
+          LEVEL1: begin  // first lookup is indexed from the SATP base PPN
+            TranslationPAdr = {BasePageTablePPN, VPN1, 2'b00};
+          end
+          LEVEL0: begin  // next lookup is indexed from the PPN of the PTE just read
+            TranslationPAdr = {CurrentPPN, VPN0, 2'b00};
+          end
+          LEAF: begin
+            // Keep physical address alive to prevent HADDR dropping to 0
+            TranslationPAdr = {CurrentPPN, VPN0, 2'b00};
+            PageTableEntry = CurrentPTE;
+            PageType = (WalkerState == LEVEL1) ? 2'b01 : 2'b00;  // 01: leaf found at level 1 (megapage)
+            MMUTranslationComplete = '1;
+            DTLBWriteM = DTLBMissM;
+            ITLBWriteF = ~DTLBMissM;  // Prefer data over instructions
+          end
+          FAULT: begin  // report the fault against whichever access was being translated
+            TranslationPAdr = {CurrentPPN, VPN0, 2'b00};
+            MMUTranslationComplete = '1;
+            InstrPageFaultM = ~DTLBMissM;
+            LoadPageFaultM = DTLBMissM && ~MemStore;
+            StorePageFaultM = DTLBMissM && MemStore;
+          end
+        endcase
+      end
+
+      // Capture page table entry from ahblite
+      flopenr #(32) ptereg(HCLK, ~HRESETn, MMUReady, MMUReadPTE, SavedPTE);
+      mux2 #(32) ptemux(SavedPTE, MMUReadPTE, MMUReady, CurrentPTE);
+      assign CurrentPPN = CurrentPTE[`PPN_BITS+9:10];

      // Assign outputs to ahblite
      // *** Currently truncate address to 32 bits. This must be changed if
@ -194,27 +228,19 @@ module pagetablewalker (
    end else begin
      assign SvMode = SATP_REGW[63];

-      logic [8:0] VPN2 = TranslationVAdr[38:30];
-      logic [8:0] VPN1 = TranslationVAdr[29:21];
-      logic [8:0] VPN0 = TranslationVAdr[20:12]; // *** could optimize by not passing offset?
+      logic [8:0] VPN2, VPN1, VPN0;

-      logic [55:0] TranslationPAdr;
-      logic [43:0] CurrentPPN;
+      logic GigapageMisaligned, BadGigapage;

-      logic Dirty, Accessed, Global, User,
-            Executable, Writable, Readable, Valid;
-      logic ValidPTE, AccessAlert, GigapageMisaligned, MegapageMisaligned,
-            BadGigapage, BadMegapage, LeafPTE;
-
-      typedef enum {IDLE, LEVEL2, LEVEL1, LEVEL0, LEAF, FAULT} statetype;
-      statetype WalkerState, NextWalkerState;
+      typedef enum {IDLE, LEVEL2, LEVEL1, LEVEL0, LEAF, FAULT} walker_statetype;
+      walker_statetype WalkerState, NextWalkerState;

      // *** Do we need a synchronizer here for walker to talk to ahblite?
-      flopenl #(.TYPE(statetype)) mmureg(clk, reset, 1'b1, NextWalkerState, IDLE, WalkerState);
+      flopenl #(.TYPE(walker_statetype)) mmureg(HCLK, ~HRESETn, 1'b1, NextWalkerState, IDLE, WalkerState);

      always_comb begin
        case (WalkerState)
-          IDLE:   if      (MMUTranslate)           NextWalkerState = LEVEL1;
+          IDLE:   if      (MMUTranslate)           NextWalkerState = LEVEL2;
                  else                             NextWalkerState = IDLE;
          LEVEL2: if      (~MMUReady)              NextWalkerState = LEVEL2;
                  else if (ValidPTE && ~LeafPTE)   NextWalkerState = LEVEL1;
@ -237,67 +263,66 @@ module pagetablewalker (
        endcase
      end

-      // unswizzle PTE bits
-      assign {Dirty, Accessed, Global, User,
-              Executable, Writable, Readable, Valid} = MMUReadPTE[7:0];
-
-      // A megapage is a Level 1 leaf page. This page must have zero PPN[0].
+      // A gigapage is a Level 2 leaf page. This page must have zero PPN[1] and
+      // zero PPN[0]
      assign GigapageMisaligned = |(CurrentPPN[17:0]);
+      // A megapage is a Level 1 leaf page. This page must have zero PPN[0].
      assign MegapageMisaligned = |(CurrentPPN[8:0]);
-      assign LeafPTE = Executable | Writable | Readable;
-      assign ValidPTE = Valid && ~(Writable && ~Readable);
-      assign AccessAlert = ~Accessed || (MemWriteM && ~Dirty);
+
      assign BadGigapage = GigapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme
      assign BadMegapage = MegapageMisaligned || AccessAlert;  // *** Implement better access/dirty scheme

-      // *** Should translate this flop block into our flop module notation
-      always_ff @(posedge clk, negedge reset)
-        if (reset) begin
-          TranslationPAdr <= '0;
-          PageTableEntryF <= '0;
-          MMUTranslationComplete <= '0;
-          DTLBWriteM <= '0;
-          ITLBWriteF <= '0;
-          InstrPageFaultM <= '0;
-          LoadPageFaultM <= '0;
-          StorePageFaultM <= '0;
-        end else begin
-          // default values
-          TranslationPAdr <= '0;
-          PageTableEntryF <= '0;
-          MMUTranslationComplete <= '0;
-          DTLBWriteM <= '0;
-          ITLBWriteF <= '0;
-          InstrPageFaultM <= '0;
-          LoadPageFaultM <= '0;
-          StorePageFaultM <= '0;
-          case (NextWalkerState)
-            LEVEL2: begin
-              TranslationPAdr <= {BasePageTablePPN, VPN2, 3'b00};
-            end
-            LEVEL1: begin
-              TranslationPAdr <= {CurrentPPN, VPN1, 3'b00};
-            end
-            LEVEL0: begin
-              TranslationPAdr <= {CurrentPPN, VPN0, 3'b00};
-            end
-            LEAF: begin
-              PageTableEntryF <= MMUReadPTE;
-              PageTableEntryM <= MMUReadPTE;
-              MMUTranslationComplete <= '1;
-              DTLBWriteM <= DTLBMissM;
-              ITLBWriteF <= ~DTLBMissM;  // Prefer data over instructions
-            end
-            FAULT: begin
-              InstrPageFaultM <= ~DTLBMissM;
-              LoadPageFaultM <= DTLBMissM && ~MemWriteM;
-              StorePageFaultM <= DTLBMissM && MemWriteM;
-            end
-          endcase
-        end
+      assign VPN2 = TranslationVAdr[38:30];
+      assign VPN1 = TranslationVAdr[29:21];
+      assign VPN0 = TranslationVAdr[20:12]; // *** could optimize by not passing offset?

-      // Interpret inputs from ahblite
-      assign CurrentPPN = MMUReadPTE[53:10];
+      // Assign combinational outputs
+      always_comb begin  // walker outputs, decoded from NextWalkerState
+        // Default values; the case arm selected below overrides what it needs
+        TranslationPAdr = '0;
+        PageTableEntry = '0;
+        PageType = '0;
+        MMUTranslationComplete = '0;
+        DTLBWriteM = '0;
+        ITLBWriteF = '0;
+        InstrPageFaultM = '0;
+        LoadPageFaultM = '0;
+        StorePageFaultM = '0;
+
+        case (NextWalkerState)
+          LEVEL2: begin  // first lookup is indexed from the SATP base PPN
+            TranslationPAdr = {BasePageTablePPN, VPN2, 3'b000};
+          end
+          LEVEL1: begin  // later lookups are indexed from the PPN of the PTE just read
+            TranslationPAdr = {CurrentPPN, VPN1, 3'b000};
+          end
+          LEVEL0: begin
+            TranslationPAdr = {CurrentPPN, VPN0, 3'b000};
+          end
+          LEAF: begin
+            // Keep physical address alive to prevent HADDR dropping to 0
+            TranslationPAdr = {CurrentPPN, VPN0, 3'b000};
+            PageTableEntry = CurrentPTE;
+            PageType = (WalkerState == LEVEL2) ? 2'b11 :  // 11: leaf at level 2 (gigapage)
+                                ((WalkerState == LEVEL1) ? 2'b01 : 2'b00);  // 01: leaf at level 1 (megapage)
+            MMUTranslationComplete = '1;
+            DTLBWriteM = DTLBMissM;
+            ITLBWriteF = ~DTLBMissM;  // Prefer data over instructions
+          end
+          FAULT: begin  // report the fault against whichever access was being translated
+            TranslationPAdr = {CurrentPPN, VPN0, 3'b000};
+            MMUTranslationComplete = '1;
+            InstrPageFaultM = ~DTLBMissM;
+            LoadPageFaultM = DTLBMissM && ~MemStore;
+            StorePageFaultM = DTLBMissM && MemStore;
+          end
+        endcase
+      end
+
+      // Capture page table entry from ahblite
+      flopenr #(`XLEN) ptereg(HCLK, ~HRESETn, MMUReady, MMUReadPTE, SavedPTE);
+      mux2 #(`XLEN) ptemux(SavedPTE, MMUReadPTE, MMUReady, CurrentPTE);
+      assign CurrentPPN = CurrentPTE[`PPN_BITS+9:10];

      // Assign outputs to ahblite
      // *** Currently truncate address to 32 bits. This must be changed if
--- a/wally-pipelined/src/fpu/FMA/align.sv
+++ b/wally-pipelined/src/fpu/FMA/align.sv
@ -64,35 +64,35 @@ module align(zman, ae, aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t, bs
 		ps = 0;

 		// And to using product as primary operand in adder I exponent gen 
-		killprod = 0;
+		killprod = xzero | yzero;
 		// d = aligncnt
 		// p = 53
-		if ($signed(aligncnt) <= $signed(-105)) begin //d<=-2p+1
+		if ($signed(aligncnt) <= $signed(-103)) begin //d<=-2p+1
 			//product ancored case with saturated shift
 			sumshift = 163;	// 3p+4	
 			sumshiftzero = 0;
 			shift = {~zdenorm,zman,163'b0} >> sumshift;
-			t = {shift[215:52]};
+			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 0;
-		end else if($signed(aligncnt) <= $signed(0))  begin // -2p+1<d<=2
+		end else if($signed(aligncnt) <= $signed(1))  begin // -2p+1<d<=2
 			// set d<=2 to d<=0
 			// product ancored or cancellation
 			// warning: set to 55 rather then 56. was there a typo in the book?
-			sumshift = 55-aligncnt; // p + 3 - d  
+			sumshift = 57-aligncnt; // p + 3 - d  
 			sumshiftzero = 0;
 			shift = {~zdenorm,zman,163'b0} >> sumshift;
-			t = {shift[215:52]};
+			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 0;
-		end else if ($signed(aligncnt)<=$signed(52))  begin // 2 < d <= p+2
+		end else if ($signed(aligncnt)<=$signed(55))  begin // 2 < d <= p+2
 			// another typo in book? above was 55 changed to 52
 			// addend ancored case
 			// used to be 56 \/ somthing doesn't seem right too many typos
-			sumshift = 55-aligncnt;
+			sumshift = 57-aligncnt;
 			sumshiftzero = 0;
 			shift = {~zdenorm,zman, 163'b0} >> sumshift;
-			t = {shift[215:52]};
+			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			//zexpsel = 1;
 		end else begin                 	// d >= p+3
@ -100,7 +100,7 @@ module align(zman, ae, aligncnt, xzero, yzero, zzero, zdenorm, proddenorm, t, bs
 			sumshift = 0;	
 			sumshiftzero = 1;		
 			shift = {~zdenorm,zman, 163'b0} >> sumshift;
-			t = {shift[215:52]};
+			t = zzero ? 0 : {shift[215:52]};
 			bs = |(shift[51:0]);
 			killprod = 1;
 			//ps = 1;
--- a/wally-pipelined/src/fpu/FMA/expgen.sv
+++ b/wally-pipelined/src/fpu/FMA/expgen.sv
@ -84,8 +84,10 @@ module expgen(xexp, yexp, zexp,
 	// This should not increase the critical path because the time to
 	// check if a round overflows is shorter than the actual round and
 	// is masked by the bypass mux and two 10 bit adder delays.
-
-	assign aligncnt = zexp -ae - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
+	assign aligncnt0 = - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
+	assign aligncnt1 = - 1 + {12'b0,~xdenorm} + {12'b0,~ydenorm} - {12'b0,~zdenorm};
+	assign aligncnt = zexp -ae - 1 + {12'b0,~xdenorm} + {12'b0,~ydenorm} - {12'b0,~zdenorm};
+	//assign aligncnt = zexp -ae - 1 + ~xdenorm + ~ydenorm - ~zdenorm;
 	//assign aligncnt = zexp - ae;// KEP use all of ae

 	// Select exponent (usually from product except in case of huge addend)
@ -107,7 +109,7 @@ module expgen(xexp, yexp, zexp,
 	// check for exponent out of bounds after add 
 	
 	assign de = resultdenorm | sumzero ? 0 : de0;
-	assign sumof = de[12];
+	assign sumof = ~de[12] && de > 2046;
 	assign sumuf = de == 0  && ~sumzero && ~resultdenorm;

 	// bypass occurs before rounding or taking early results 
--- a/wally-pipelined/src/fpu/FMA/flag.sv
+++ b/wally-pipelined/src/fpu/FMA/flag.sv
@ -9,7 +9,7 @@

 /////////////////////////////////////////////////////////////////////////////
 module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
-			 psign,  zsign, xzero, yzero, vbits,
+			 psign,  zsign, xzero, yzero, zzero, vbits, killprod,
 			 inf, nan, invalid, overflow, underflow, inexact);
 /////////////////////////////////////////////////////////////////////////////

@ -26,6 +26,8 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	input				zsign; 		// Sign of z
 	input				xzero;		// x = 0
 	input				yzero;		// y = 0
+	input				zzero;		// z = 0
+	input				killprod;
 	input     	[1:0]  		vbits;		// R and S bits of result
 	output				inf;		// Some	source is Inf
 	output				nan;		// Some	source is NaN
@ -73,8 +75,7 @@ module flag(xnan, ynan, znan, xinf, yinf, zinf, prodof, sumof, sumuf,
 	//   1) Any input is denormalized
 	//   2)  Output would be denormalized or smaller

-	assign underflow = (sumuf && ~inf && ~prodinf && ~nan);
-
+	assign underflow = (sumuf && ~inf && ~prodinf && ~nan) || (killprod & zzero & ~(yzero | xzero));

 	// Set the inexact flag for the following cases:
 	//   1) Multiplication inexact
--- a/wally-pipelined/src/fpu/FMA/normalize.sv
+++ b/wally-pipelined/src/fpu/FMA/normalize.sv
@ -47,7 +47,7 @@ module normalize(sum, zexp, invz, normcnt, ae, aligncnt, sumshift, sumshiftzero,
 	logic		[9:0]		sumshifttmp;
 	logic       	[163:0]  	sumshiftedtmp;     // shifted sum
 	logic 				sticky;
-logic tmp,tmp1,tmp2,tmp3;
+logic tmp,tmp1,tmp2,tmp3,tmp4, tmp5;

 	// When the sum is zero,  normalization does not apply and only the
 	// sticky bit must be computed.  Otherwise,  the sum is right-shifted
@ -68,16 +68,16 @@ logic tmp,tmp1,tmp2,tmp3;
 		// p = 53
 		// ea + eb = ae
 			// set d<=2 to d<=0
-			if ($signed(aligncnt)<=$signed(0))  begin //d<=2 
+			if ($signed(aligncnt)<=$signed(1))  begin //d<=2 
 				// product anchored or cancellation
 				if ($signed(ae-normcnt+2) >= $signed(-1022)) begin //ea+eb-l+2 >= emin
 					//normal result
-					sumshifted = sum << (55+normcnt); // p+2+l
+					de0 = xzero|yzero ? zexp : ae-normcnt+2+xdenorm+ydenorm;
+					resultdenorm = |sum & ~|de0;
+					sumshifted = resultdenorm ? sum << sumshift : sum << (55+normcnt); // p+2+l
 					v = sumshifted[162:109];
 					sticky = (|sumshifted[108:0]) | bs;
-					resultdenorm = 0;
 					//de0 = ae-normcnt+2-1023;
-					de0 = xzero|yzero ? zexp : ae-normcnt+2+xdenorm+ydenorm;
 				end else begin
 					sumshifted = sum << (1080+ae);
 					v = sumshifted[162:109];
@ -87,38 +87,50 @@ logic tmp,tmp1,tmp2,tmp3;
 				end

 			end else begin                 // extract normalized bits
-				sumshifttmp = sumshift - 2;
+				sumshifttmp = {1'b0,sumshift} - 2;
 				sumshifted = sumshifttmp[9] ? sum : sum << sumshifttmp;
-				tmp1 = (sumshifted[163] & ~zdenorm & ~sumshifttmp[9]);
-				tmp2 = (zdenorm | sumshifttmp[9] || sumshifted[162]);
+				tmp1 = (sumshifted[163] & ~sumshifttmp[9]);
+				tmp2 = (sumshifttmp[9] || sumshifted[162]);
 				tmp3 = sumshifted[161];
+				tmp4 = sumshifted[160];
+				tmp5 = sumshifted[159];
 				// for some reason use exp = zexp + {0,1,2}
 				// the book says exp = zexp + {-1,0,1}
 				if(sumshiftzero) begin
 					v = sum[162:109];
 					sticky = sum[108:0] | bs;
 					de0 = zexp;
-				end else if(sumshifted[163] & ~zdenorm & ~sumshifttmp[9])begin
+				end else if(sumshifted[163] & ~sumshifttmp[9])begin
 					v = sumshifted[162:109];
 					sticky = (|sumshifted[108:0]) | bs;
 					de0 = zexp +2;
-				end else if (zdenorm | sumshifttmp[9] || sumshifted[162]) begin
+				end else if ((sumshifttmp[9] & sumshift[0]) || sumshifted[162]) begin
 					v = sumshifted[161:108];
 					sticky = (|sumshifted[107:0]) | bs;
 					de0 = zexp+1;
-				end else if (sumshifted[161]) begin
+				end else if (sumshifted[161] || (sumshifttmp[9] & sumshift[1])) begin
 					v = sumshifted[160:107];
 					sticky = (|sumshifted[106:0]) | bs;
 					//de0 = zexp-1;
 					de0 = zexp;
-				end else begin
+				end else if(sumshifted[160]) begin
 					v = sumshifted[159:106];
 					sticky = (|sumshifted[105:0]) | bs;
 					//de0 = zexp-1;
 					de0 = zexp-1;
+				end else if(sumshifted[159]) begin
+					v = sumshifted[158:105];
+					sticky = (|sumshifted[104:0]) | bs;
+					//de0 = zexp-1;
+					de0 = zexp-2;
+				end else begin					
+					v = sumshifted[160:107];
+					sticky = (|sumshifted[106:0]) | bs;
+					//de0 = zexp-1;
+					de0 = zexp;
 				end

-				resultdenorm = 0;
+				resultdenorm = ~(|de0);
 		end 
 	end

--- a/wally-pipelined/src/fpu/FMA/tbgen/results.dat
+++ b/wally-pipelined/src/fpu/FMA/tbgen/results.dat
@ -1,11 +1,16 @@
-8020007ffdffffff 9beffff7fff7fffe 000ffffffff7fffe 0000000000000000 000ffffffff7fffe  Wrong zdenorm unflw 475303
-3cafffffffffffff 3fd0000000000000 3cafffffffffffff 3c8ffffffffffffb 3cb3ffffffffffff  Wrong 706913
-bfbfffff007fffff 000fffffffffffff 000bffffffc00000 0015000007dc0000 000a00000fb80000  Wrong ydenorm zdenorm 1675647
-00114508bde544e1 3caffffffffffffe 800010000003fffe 801008000001fffe 800010000003fffd  Wrong zdenorm 2310057
-800ffffffdffffff bfcffe00003ffffe 800ffff01ffffffe 80160018103bfbff 800c00302077f7ff  Wrong xdenorm zdenorm 2475205
-bcafffffffffffff 3fd0000000000001 bcafffffffffffff bc8ffffffffffffd bcb4000000000000  Wrong 3776249
-bfc0000000800008 43d0001000000002 c3cffffbffff8000 c3a00000007e008a c3d20000000fc011  Wrong 3804445
-bfefffffffffffff 3fefffffffffffff bff0000000000001 b950000000000000 c000000000000000  Wrong 4338155
-37ea3353806450ba bffffffffffffffe b803fffffffff7ff b7c19a9c032205b3 b8108cd4e019102e  Wrong 5143755
-8010000000803fff 3ff0000000000001 000fffe07fffffff fff0000000000000 8000001f80804001  Wrong zdenorm w=-inf 5246469
-b7fffff80000001f 001ffffffffffffe 800fffffffff07ff 8000000000000000 800fffffffff07ff  Wrong w=-zero zdenorm unflw 5723787
+0010000000000000 bf4fdffffff7fffe 800ffffffffffffe 800003fbfffffefe 801003fbfffffefe  Wrong zdenorm 308227
+0010000000000000 be6fffffbffffff7 8000000000000000 800000001fffffc0 800000000fffffe0  Wrong 313753
+001ffffffffffffe 3fddfbffffffffff 000ffffffffffffe 000efdfffffffffd 001efdfffffffffd  Wrong zdenorm 551371
+3befe000ffffffff 800ffffffffffffe 0000000000000000 0000000000000000 8000000000000000  Wrong ydenorm unflw 665575
+000007fffffffffe 3f6ffffffe01fffe 000ffffffffffffe 00000007ffffff7e 00100007ffffff7e  Wrong xdenorm zdenorm 768727
+3fdffffffffffffe 000ffffffffffffe 8000000000000001 7feffffffffffff6 0007fffffffffffe  Wrong ydenorm zdenorm 1049939
+7fe0000000000001 4000000000000000 ffefffffffffffff 7ff0000000000000 7cb8000000000000  Wrong w=+inf 2602745
+000fff000000000f 3ff00800001fffff 8010000000000000 7f7bfe007ff8381e 000006ff801ffe0e  Wrong xdenorm 3117277
+8000000000000001 40211275ffe5ee3c 0000000000000001 fcfe24ebffcbdc78 8000000000000008  Wrong xdenorm zdenorm 3148591
+801fffffffffffff bfdffffffffffffe 0000000000021fff 0000000000021ffe 0010000000021ffe  Wrong zdenorm 3537867
+801ffffffffffffe 0010000000000001 0000000000000000 0000000000000000 8000000000000000  Wrong unflw 3564269
+bca0000000000001 000fffffc000001e 8000000000000000 8000000000000001 8000000000000000  Wrong ydenorm 3717769
+bcafffffffffffff 800ffffffffffffe 8000000000000000 0000000000000002 0000000000000001  Wrong ydenorm 3807413
+7fec5fed92358a74 400000001bffffff ffefc0003ffffffe 7ff0000000000000 7fe8ffdb47bad466  Wrong w=+inf 3889689
+bfdfffffffffffff 3fdf1f3616aa73e1 3fd0000000000001 3fd07064f4aac611 3f7c193d2ab1843f  Wrong 4099063
+3fd07dfffffffffe 8010000000000001 0000000000000001 ffe07dfffffffffb 80041f7fffffffff  Wrong zdenorm 4716133
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.c
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.c
@ -26,13 +26,13 @@ void main() {
 		char ans[81];
 		char flags[3];
 		int rn,rz,rm,rp;
-		long stop = 5723787;
+		long stop = 4099063;
 		int debug = 1;
 		//my_string = (char *) malloc (nbytes + 1);
 		//bytes_read = getline (&my_string, &nbytes, stdin);
 	

-		for(n=0; n < 2013; n++) {//613 for 10000
+		for(n=0; n < 613; n++) {//613 for 10000
 			if(getline(&ln,&nbytes,fp) < 0 || feof(fp)) break;
 			if(k == stop && debug == 1) break;
 			k++;
--- a/wally-pipelined/src/fpu/FMA/tbgen/tb.v
+++ b/wally-pipelined/src/fpu/FMA/tbgen/tb.v
--- a/wally-pipelined/src/fpu/adder.sv
+++ b/wally-pipelined/src/fpu/adder.sv
@ -0,0 +1,758 @@
+// The following modules make up the basic building blocks that
+// are used by the cla64, cla_sub64, and cla52.
+
+// INVBLOCK: single-bit inverter.
+//   GIN  - input bit
+//   GOUT - logical complement of GIN
+module INVBLOCK ( GIN, GOUT );
+   
+   input  GIN;
+   output GOUT;
+   
+   assign GOUT =  ~ GIN;
+   
+endmodule // INVBLOCK
+
+
+// XXOR1: single-bit sum cell.
+//   A, B - operand bits
+//   GIN  - incoming carry-like term
+//   SUM  - XNOR(A, B) XORed with GIN, which equals A ^ B ^ ~GIN
+// NOTE(review): the extra inversion suggests GIN is expected in active-low
+// form so that SUM is the true sum bit; confirm against the instantiating adder.
+module XXOR1 ( A, B, GIN, SUM );
+   
+   input  A;
+   input  B;
+   input  GIN;
+   output SUM;
+   
+   assign SUM = ( ~ (A ^ B)) ^ GIN;
+   
+endmodule // XXOR1
+
+
+// BLOCK0: per-bit first-level cell.
+//   A, B - operand bits
+//   POUT - NOR(A, B),  i.e. the complement of (A | B)
+//   GOUT - NAND(A, B), i.e. the complement of (A & B)
+// NOTE(review): the port names suggest these are complemented (active-low)
+// propagate and generate terms; confirm against the instantiating adder.
+module BLOCK0 ( A, B, POUT, GOUT );
+   
+   input  A;
+   input  B;
+   output POUT;
+   output GOUT;
+   
+   assign POUT =  ~ (A | B);
+   assign GOUT =  ~ (A & B);
+   
+endmodule // BLOCK0
+
+
+// BLOCK1: two-input combine cell with inverting outputs.
+//   PIN1, PIN2 - the two P inputs
+//   GIN1, GIN2 - the two G inputs
+//   POUT       - NOR(PIN1, PIN2)
+//   GOUT       - complement of (GIN2 & (PIN2 | GIN1))
+// By De Morgan, if every input is read as a complemented signal (x = ~x'),
+// then POUT = PIN1' & PIN2' and GOUT = GIN2' | (PIN2' & GIN1'), i.e. the
+// outputs are in true form.
+// NOTE(review): presumably this cell is meant for stages fed with
+// active-low terms; confirm polarity against the stage wiring.
+module BLOCK1 ( PIN1, PIN2, GIN1, GIN2, POUT, GOUT );
+   
+   input  PIN1;
+   input  PIN2;
+   input  GIN1;
+   input  GIN2;
+   output POUT;
+   output GOUT;
+   
+   assign POUT =  ~ (PIN1 | PIN2);
+   assign GOUT =  ~ (GIN2 & (PIN2 | GIN1));
+   
+endmodule // BLOCK1
+
+
+// BLOCK2: two-input combine cell with inverting outputs; the logical dual
+// of BLOCK1 (AND/OR roles swapped).
+//   PIN1, PIN2 - the two P inputs
+//   GIN1, GIN2 - the two G inputs
+//   POUT       - NAND(PIN1, PIN2)
+//   GOUT       - complement of (GIN2 | (PIN2 & GIN1))
+// NOTE(review): presumably this cell is meant for stages fed with
+// true-form terms, producing complemented outputs; confirm polarity
+// against the stage wiring.
+module BLOCK2 ( PIN1, PIN2, GIN1, GIN2, POUT, GOUT );
+   
+   input  PIN1;
+   input  PIN2;
+   input  GIN1;
+   input  GIN2;
+   output POUT;
+   output GOUT;
+   
+   assign POUT =  ~ (PIN1 & PIN2);
+   assign GOUT =  ~ (GIN2 | (PIN2 & GIN1));
+   
+endmodule // BLOCK2
+
+
+// BLOCK1A: G-only variant of BLOCK1 (same GOUT equation, no P output).
+//   PIN2       - P input
+//   GIN1, GIN2 - the two G inputs
+//   GOUT       - complement of (GIN2 & (PIN2 | GIN1))
+module BLOCK1A ( PIN2, GIN1, GIN2, GOUT );
+   
+   input  PIN2;
+   input  GIN1;
+   input  GIN2;
+   output GOUT;
+   
+   assign GOUT =  ~ (GIN2 & (PIN2 | GIN1));
+   
+endmodule // BLOCK1A
+
+
+// BLOCK2A: G-only variant of BLOCK2 (same GOUT equation, no P output).
+//   PIN2       - P input
+//   GIN1, GIN2 - the two G inputs
+//   GOUT       - complement of (GIN2 | (PIN2 & GIN1))
+module BLOCK2A ( PIN2, GIN1, GIN2, GOUT );
+   
+   input  PIN2;
+   input  GIN1;
+   input  GIN2;
+   output GOUT;
+   
+   assign GOUT =  ~ (GIN2 | (PIN2 & GIN1));
+   
+endmodule
+
+// PRESTAGE_64: first stage for a 64-bit operand pair.
+//   A, B - 64-bit operands, declared with ascending ranges [0:63]
+//   CIN  - carry-in bit
+//   POUT - POUT[i] = NOR(A[i], B[i]) for i = 0..63 (one BLOCK0 per bit)
+//   GOUT - GOUT[0] = ~CIN; GOUT[i+1] = NAND(A[i], B[i]) for i = 0..63
+// Note the one-position offset: bit i's G term lands in GOUT[i+1] so that
+// index 0 can carry the inverted carry-in.
+module PRESTAGE_64 ( A, B, CIN, POUT, GOUT );
+   
+   input  [0:63] A;
+   input [0:63]  B;
+   input 	 CIN;
+   
+   output [0:63] POUT;
+   output [0:64] GOUT;
+   
+   // Per-bit cells: bit i drives POUT[i] and GOUT[i+1].
+   BLOCK0 U10 (A[0] , B[0] , POUT[0] , GOUT[1] );
+   BLOCK0 U11 (A[1] , B[1] , POUT[1] , GOUT[2] );
+   BLOCK0 U12 (A[2] , B[2] , POUT[2] , GOUT[3] );
+   BLOCK0 U13 (A[3] , B[3] , POUT[3] , GOUT[4] );
+   BLOCK0 U14 (A[4] , B[4] , POUT[4] , GOUT[5] );
+   BLOCK0 U15 (A[5] , B[5] , POUT[5] , GOUT[6] );
+   BLOCK0 U16 (A[6] , B[6] , POUT[6] , GOUT[7] );
+   BLOCK0 U17 (A[7] , B[7] , POUT[7] , GOUT[8] );
+   BLOCK0 U18 (A[8] , B[8] , POUT[8] , GOUT[9] );
+   BLOCK0 U19 (A[9] , B[9] , POUT[9] , GOUT[10] );
+   BLOCK0 U110 (A[10] , B[10] , POUT[10] , GOUT[11] );
+   BLOCK0 U111 (A[11] , B[11] , POUT[11] , GOUT[12] );
+   BLOCK0 U112 (A[12] , B[12] , POUT[12] , GOUT[13] );
+   BLOCK0 U113 (A[13] , B[13] , POUT[13] , GOUT[14] );
+   BLOCK0 U114 (A[14] , B[14] , POUT[14] , GOUT[15] );
+   BLOCK0 U115 (A[15] , B[15] , POUT[15] , GOUT[16] );
+   BLOCK0 U116 (A[16] , B[16] , POUT[16] , GOUT[17] );
+   BLOCK0 U117 (A[17] , B[17] , POUT[17] , GOUT[18] );
+   BLOCK0 U118 (A[18] , B[18] , POUT[18] , GOUT[19] );
+   BLOCK0 U119 (A[19] , B[19] , POUT[19] , GOUT[20] );
+   BLOCK0 U120 (A[20] , B[20] , POUT[20] , GOUT[21] );
+   BLOCK0 U121 (A[21] , B[21] , POUT[21] , GOUT[22] );
+   BLOCK0 U122 (A[22] , B[22] , POUT[22] , GOUT[23] );
+   BLOCK0 U123 (A[23] , B[23] , POUT[23] , GOUT[24] );
+   BLOCK0 U124 (A[24] , B[24] , POUT[24] , GOUT[25] );
+   BLOCK0 U125 (A[25] , B[25] , POUT[25] , GOUT[26] );
+   BLOCK0 U126 (A[26] , B[26] , POUT[26] , GOUT[27] );
+   BLOCK0 U127 (A[27] , B[27] , POUT[27] , GOUT[28] );
+   BLOCK0 U128 (A[28] , B[28] , POUT[28] , GOUT[29] );
+   BLOCK0 U129 (A[29] , B[29] , POUT[29] , GOUT[30] );
+   BLOCK0 U130 (A[30] , B[30] , POUT[30] , GOUT[31] );
+   BLOCK0 U131 (A[31] , B[31] , POUT[31] , GOUT[32] );
+   BLOCK0 U132 (A[32] , B[32] , POUT[32] , GOUT[33] );
+   BLOCK0 U133 (A[33] , B[33] , POUT[33] , GOUT[34] );
+   BLOCK0 U134 (A[34] , B[34] , POUT[34] , GOUT[35] );
+   BLOCK0 U135 (A[35] , B[35] , POUT[35] , GOUT[36] );
+   BLOCK0 U136 (A[36] , B[36] , POUT[36] , GOUT[37] );
+   BLOCK0 U137 (A[37] , B[37] , POUT[37] , GOUT[38] );
+   BLOCK0 U138 (A[38] , B[38] , POUT[38] , GOUT[39] );
+   BLOCK0 U139 (A[39] , B[39] , POUT[39] , GOUT[40] );
+   BLOCK0 U140 (A[40] , B[40] , POUT[40] , GOUT[41] );
+   BLOCK0 U141 (A[41] , B[41] , POUT[41] , GOUT[42] );
+   BLOCK0 U142 (A[42] , B[42] , POUT[42] , GOUT[43] );
+   BLOCK0 U143 (A[43] , B[43] , POUT[43] , GOUT[44] );
+   BLOCK0 U144 (A[44] , B[44] , POUT[44] , GOUT[45] );
+   BLOCK0 U145 (A[45] , B[45] , POUT[45] , GOUT[46] );
+   BLOCK0 U146 (A[46] , B[46] , POUT[46] , GOUT[47] );
+   BLOCK0 U147 (A[47] , B[47] , POUT[47] , GOUT[48] );
+   BLOCK0 U148 (A[48] , B[48] , POUT[48] , GOUT[49] );
+   BLOCK0 U149 (A[49] , B[49] , POUT[49] , GOUT[50] );
+   BLOCK0 U150 (A[50] , B[50] , POUT[50] , GOUT[51] );
+   BLOCK0 U151 (A[51] , B[51] , POUT[51] , GOUT[52] );
+   BLOCK0 U152 (A[52] , B[52] , POUT[52] , GOUT[53] );
+   BLOCK0 U153 (A[53] , B[53] , POUT[53] , GOUT[54] );
+   BLOCK0 U154 (A[54] , B[54] , POUT[54] , GOUT[55] );
+   BLOCK0 U155 (A[55] , B[55] , POUT[55] , GOUT[56] );
+   BLOCK0 U156 (A[56] , B[56] , POUT[56] , GOUT[57] );
+   BLOCK0 U157 (A[57] , B[57] , POUT[57] , GOUT[58] );
+   BLOCK0 U158 (A[58] , B[58] , POUT[58] , GOUT[59] );
+   BLOCK0 U159 (A[59] , B[59] , POUT[59] , GOUT[60] );
+   BLOCK0 U160 (A[60] , B[60] , POUT[60] , GOUT[61] );
+   BLOCK0 U161 (A[61] , B[61] , POUT[61] , GOUT[62] );
+   BLOCK0 U162 (A[62] , B[62] , POUT[62] , GOUT[63] );
+   BLOCK0 U163 (A[63] , B[63] , POUT[63] , GOUT[64] );
+   // Carry-in enters the G vector at index 0, inverted.
+   INVBLOCK U2 (CIN , GOUT[0] );
+   
+endmodule // PRESTAGE_64
+
+
+// DBLC_0_64: combine stage that pairs each position with its neighbour
+// one index lower (distance 1), using the BLOCK1 cell family.
+//   PIN  - 64 P inputs  [0:63]
+//   GIN  - 65 G inputs  [0:64]
+//   POUT - 63 P outputs [0:62]
+//   GOUT - 65 G outputs [0:64]
+// Wiring pattern:
+//   GOUT[0]            = ~GIN[0]                                  (INVBLOCK)
+//   GOUT[1]            = BLOCK1A(PIN[0], GIN[0], GIN[1])
+//   GOUT[i], POUT[i-2] = BLOCK1(PIN[i-2], PIN[i-1], GIN[i-1], GIN[i])
+//                        for i = 2..64
+// Every output is produced through an inverting cell.
+// NOTE(review): this looks like one level of a parallel-prefix carry
+// network; confirm against the top-level adder that instantiates it.
+module DBLC_0_64 ( PIN, GIN, POUT, GOUT );
+   
+   input  [0:63] PIN;
+   input [0:64]  GIN;
+   
+   output [0:62] POUT;
+   output [0:64] GOUT;
+   
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   BLOCK1A U21 (PIN[0] , GIN[0] , GIN[1] , GOUT[1] );
+   BLOCK1 U32 (PIN[0] , PIN[1] , GIN[1] , GIN[2] , POUT[0] , GOUT[2] );
+   BLOCK1 U33 (PIN[1] , PIN[2] , GIN[2] , GIN[3] , POUT[1] , GOUT[3] );
+   BLOCK1 U34 (PIN[2] , PIN[3] , GIN[3] , GIN[4] , POUT[2] , GOUT[4] );
+   BLOCK1 U35 (PIN[3] , PIN[4] , GIN[4] , GIN[5] , POUT[3] , GOUT[5] );
+   BLOCK1 U36 (PIN[4] , PIN[5] , GIN[5] , GIN[6] , POUT[4] , GOUT[6] );
+   BLOCK1 U37 (PIN[5] , PIN[6] , GIN[6] , GIN[7] , POUT[5] , GOUT[7] );
+   BLOCK1 U38 (PIN[6] , PIN[7] , GIN[7] , GIN[8] , POUT[6] , GOUT[8] );
+   BLOCK1 U39 (PIN[7] , PIN[8] , GIN[8] , GIN[9] , POUT[7] , GOUT[9] );
+   BLOCK1 U310 (PIN[8] , PIN[9] , GIN[9] , GIN[10] , POUT[8] , GOUT[10] );
+   BLOCK1 U311 (PIN[9] , PIN[10] , GIN[10] , GIN[11] , POUT[9] , GOUT[11] );
+   BLOCK1 U312 (PIN[10] , PIN[11] , GIN[11] , GIN[12] , POUT[10] , GOUT[12] );
+   BLOCK1 U313 (PIN[11] , PIN[12] , GIN[12] , GIN[13] , POUT[11] , GOUT[13] );
+   BLOCK1 U314 (PIN[12] , PIN[13] , GIN[13] , GIN[14] , POUT[12] , GOUT[14] );
+   BLOCK1 U315 (PIN[13] , PIN[14] , GIN[14] , GIN[15] , POUT[13] , GOUT[15] );
+   BLOCK1 U316 (PIN[14] , PIN[15] , GIN[15] , GIN[16] , POUT[14] , GOUT[16] );
+   BLOCK1 U317 (PIN[15] , PIN[16] , GIN[16] , GIN[17] , POUT[15] , GOUT[17] );
+   BLOCK1 U318 (PIN[16] , PIN[17] , GIN[17] , GIN[18] , POUT[16] , GOUT[18] );
+   BLOCK1 U319 (PIN[17] , PIN[18] , GIN[18] , GIN[19] , POUT[17] , GOUT[19] );
+   BLOCK1 U320 (PIN[18] , PIN[19] , GIN[19] , GIN[20] , POUT[18] , GOUT[20] );
+   BLOCK1 U321 (PIN[19] , PIN[20] , GIN[20] , GIN[21] , POUT[19] , GOUT[21] );
+   BLOCK1 U322 (PIN[20] , PIN[21] , GIN[21] , GIN[22] , POUT[20] , GOUT[22] );
+   BLOCK1 U323 (PIN[21] , PIN[22] , GIN[22] , GIN[23] , POUT[21] , GOUT[23] );
+   BLOCK1 U324 (PIN[22] , PIN[23] , GIN[23] , GIN[24] , POUT[22] , GOUT[24] );
+   BLOCK1 U325 (PIN[23] , PIN[24] , GIN[24] , GIN[25] , POUT[23] , GOUT[25] );
+   BLOCK1 U326 (PIN[24] , PIN[25] , GIN[25] , GIN[26] , POUT[24] , GOUT[26] );
+   BLOCK1 U327 (PIN[25] , PIN[26] , GIN[26] , GIN[27] , POUT[25] , GOUT[27] );
+   BLOCK1 U328 (PIN[26] , PIN[27] , GIN[27] , GIN[28] , POUT[26] , GOUT[28] );
+   BLOCK1 U329 (PIN[27] , PIN[28] , GIN[28] , GIN[29] , POUT[27] , GOUT[29] );
+   BLOCK1 U330 (PIN[28] , PIN[29] , GIN[29] , GIN[30] , POUT[28] , GOUT[30] );
+   BLOCK1 U331 (PIN[29] , PIN[30] , GIN[30] , GIN[31] , POUT[29] , GOUT[31] );
+   BLOCK1 U332 (PIN[30] , PIN[31] , GIN[31] , GIN[32] , POUT[30] , GOUT[32] );
+   BLOCK1 U333 (PIN[31] , PIN[32] , GIN[32] , GIN[33] , POUT[31] , GOUT[33] );
+   BLOCK1 U334 (PIN[32] , PIN[33] , GIN[33] , GIN[34] , POUT[32] , GOUT[34] );
+   BLOCK1 U335 (PIN[33] , PIN[34] , GIN[34] , GIN[35] , POUT[33] , GOUT[35] );
+   BLOCK1 U336 (PIN[34] , PIN[35] , GIN[35] , GIN[36] , POUT[34] , GOUT[36] );
+   BLOCK1 U337 (PIN[35] , PIN[36] , GIN[36] , GIN[37] , POUT[35] , GOUT[37] );
+   BLOCK1 U338 (PIN[36] , PIN[37] , GIN[37] , GIN[38] , POUT[36] , GOUT[38] );
+   BLOCK1 U339 (PIN[37] , PIN[38] , GIN[38] , GIN[39] , POUT[37] , GOUT[39] );
+   BLOCK1 U340 (PIN[38] , PIN[39] , GIN[39] , GIN[40] , POUT[38] , GOUT[40] );
+   BLOCK1 U341 (PIN[39] , PIN[40] , GIN[40] , GIN[41] , POUT[39] , GOUT[41] );
+   BLOCK1 U342 (PIN[40] , PIN[41] , GIN[41] , GIN[42] , POUT[40] , GOUT[42] );
+   BLOCK1 U343 (PIN[41] , PIN[42] , GIN[42] , GIN[43] , POUT[41] , GOUT[43] );
+   BLOCK1 U344 (PIN[42] , PIN[43] , GIN[43] , GIN[44] , POUT[42] , GOUT[44] );
+   BLOCK1 U345 (PIN[43] , PIN[44] , GIN[44] , GIN[45] , POUT[43] , GOUT[45] );
+   BLOCK1 U346 (PIN[44] , PIN[45] , GIN[45] , GIN[46] , POUT[44] , GOUT[46] );
+   BLOCK1 U347 (PIN[45] , PIN[46] , GIN[46] , GIN[47] , POUT[45] , GOUT[47] );
+   BLOCK1 U348 (PIN[46] , PIN[47] , GIN[47] , GIN[48] , POUT[46] , GOUT[48] );
+   BLOCK1 U349 (PIN[47] , PIN[48] , GIN[48] , GIN[49] , POUT[47] , GOUT[49] );
+   BLOCK1 U350 (PIN[48] , PIN[49] , GIN[49] , GIN[50] , POUT[48] , GOUT[50] );
+   BLOCK1 U351 (PIN[49] , PIN[50] , GIN[50] , GIN[51] , POUT[49] , GOUT[51] );
+   BLOCK1 U352 (PIN[50] , PIN[51] , GIN[51] , GIN[52] , POUT[50] , GOUT[52] );
+   BLOCK1 U353 (PIN[51] , PIN[52] , GIN[52] , GIN[53] , POUT[51] , GOUT[53] );
+   BLOCK1 U354 (PIN[52] , PIN[53] , GIN[53] , GIN[54] , POUT[52] , GOUT[54] );
+   BLOCK1 U355 (PIN[53] , PIN[54] , GIN[54] , GIN[55] , POUT[53] , GOUT[55] );
+   BLOCK1 U356 (PIN[54] , PIN[55] , GIN[55] , GIN[56] , POUT[54] , GOUT[56] );
+   BLOCK1 U357 (PIN[55] , PIN[56] , GIN[56] , GIN[57] , POUT[55] , GOUT[57] );
+   BLOCK1 U358 (PIN[56] , PIN[57] , GIN[57] , GIN[58] , POUT[56] , GOUT[58] );
+   BLOCK1 U359 (PIN[57] , PIN[58] , GIN[58] , GIN[59] , POUT[57] , GOUT[59] );
+   BLOCK1 U360 (PIN[58] , PIN[59] , GIN[59] , GIN[60] , POUT[58] , GOUT[60] );
+   BLOCK1 U361 (PIN[59] , PIN[60] , GIN[60] , GIN[61] , POUT[59] , GOUT[61] );
+   BLOCK1 U362 (PIN[60] , PIN[61] , GIN[61] , GIN[62] , POUT[60] , GOUT[62] );
+   BLOCK1 U363 (PIN[61] , PIN[62] , GIN[62] , GIN[63] , POUT[61] , GOUT[63] );
+   BLOCK1 U364 (PIN[62] , PIN[63] , GIN[63] , GIN[64] , POUT[62] , GOUT[64] );
+   
+endmodule // DBLC_0_64
+
+
+// DBLC_1_64: combine stage that pairs each position with the one two
+// indices lower (distance 2), using the BLOCK2 cell family.
+//   PIN  - 63 P inputs  [0:62]
+//   GIN  - 65 G inputs  [0:64]
+//   POUT - 61 P outputs [0:60]
+//   GOUT - 65 G outputs [0:64]
+// Wiring pattern:
+//   GOUT[i]            = ~GIN[i]                        for i = 0..1 (INVBLOCK)
+//   GOUT[i]            = BLOCK2A(PIN[i-2], GIN[i-2], GIN[i])  for i = 2..3
+//   GOUT[i], POUT[i-4] = BLOCK2(PIN[i-4], PIN[i-2], GIN[i-2], GIN[i])
+//                        for i = 4..64
+// Every output is produced through an inverting cell.
+// NOTE(review): this looks like one level of a parallel-prefix carry
+// network; confirm against the top-level adder that instantiates it.
+module DBLC_1_64 ( PIN, GIN, POUT, GOUT );
+   
+   input  [0:62] PIN;
+   input [0:64]  GIN;
+   
+   output [0:60] POUT;
+   output [0:64] GOUT;
+   
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   INVBLOCK U11 (GIN[1] , GOUT[1] );
+   BLOCK2A U22 (PIN[0] , GIN[0] , GIN[2] , GOUT[2] );
+   BLOCK2A U23 (PIN[1] , GIN[1] , GIN[3] , GOUT[3] );
+   BLOCK2 U34 (PIN[0] , PIN[2] , GIN[2] , GIN[4] , POUT[0] , GOUT[4] );
+   BLOCK2 U35 (PIN[1] , PIN[3] , GIN[3] , GIN[5] , POUT[1] , GOUT[5] );
+   BLOCK2 U36 (PIN[2] , PIN[4] , GIN[4] , GIN[6] , POUT[2] , GOUT[6] );
+   BLOCK2 U37 (PIN[3] , PIN[5] , GIN[5] , GIN[7] , POUT[3] , GOUT[7] );
+   BLOCK2 U38 (PIN[4] , PIN[6] , GIN[6] , GIN[8] , POUT[4] , GOUT[8] );
+   BLOCK2 U39 (PIN[5] , PIN[7] , GIN[7] , GIN[9] , POUT[5] , GOUT[9] );
+   BLOCK2 U310 (PIN[6] , PIN[8] , GIN[8] , GIN[10] , POUT[6] , GOUT[10] );
+   BLOCK2 U311 (PIN[7] , PIN[9] , GIN[9] , GIN[11] , POUT[7] , GOUT[11] );
+   BLOCK2 U312 (PIN[8] , PIN[10] , GIN[10] , GIN[12] , POUT[8] , GOUT[12] );
+   BLOCK2 U313 (PIN[9] , PIN[11] , GIN[11] , GIN[13] , POUT[9] , GOUT[13] );
+   BLOCK2 U314 (PIN[10] , PIN[12] , GIN[12] , GIN[14] , POUT[10] , GOUT[14] );
+   BLOCK2 U315 (PIN[11] , PIN[13] , GIN[13] , GIN[15] , POUT[11] , GOUT[15] );
+   BLOCK2 U316 (PIN[12] , PIN[14] , GIN[14] , GIN[16] , POUT[12] , GOUT[16] );
+   BLOCK2 U317 (PIN[13] , PIN[15] , GIN[15] , GIN[17] , POUT[13] , GOUT[17] );
+   BLOCK2 U318 (PIN[14] , PIN[16] , GIN[16] , GIN[18] , POUT[14] , GOUT[18] );
+   BLOCK2 U319 (PIN[15] , PIN[17] , GIN[17] , GIN[19] , POUT[15] , GOUT[19] );
+   BLOCK2 U320 (PIN[16] , PIN[18] , GIN[18] , GIN[20] , POUT[16] , GOUT[20] );
+   BLOCK2 U321 (PIN[17] , PIN[19] , GIN[19] , GIN[21] , POUT[17] , GOUT[21] );
+   BLOCK2 U322 (PIN[18] , PIN[20] , GIN[20] , GIN[22] , POUT[18] , GOUT[22] );
+   BLOCK2 U323 (PIN[19] , PIN[21] , GIN[21] , GIN[23] , POUT[19] , GOUT[23] );
+   BLOCK2 U324 (PIN[20] , PIN[22] , GIN[22] , GIN[24] , POUT[20] , GOUT[24] );
+   BLOCK2 U325 (PIN[21] , PIN[23] , GIN[23] , GIN[25] , POUT[21] , GOUT[25] );
+   BLOCK2 U326 (PIN[22] , PIN[24] , GIN[24] , GIN[26] , POUT[22] , GOUT[26] );
+   BLOCK2 U327 (PIN[23] , PIN[25] , GIN[25] , GIN[27] , POUT[23] , GOUT[27] );
+   BLOCK2 U328 (PIN[24] , PIN[26] , GIN[26] , GIN[28] , POUT[24] , GOUT[28] );
+   BLOCK2 U329 (PIN[25] , PIN[27] , GIN[27] , GIN[29] , POUT[25] , GOUT[29] );
+   BLOCK2 U330 (PIN[26] , PIN[28] , GIN[28] , GIN[30] , POUT[26] , GOUT[30] );
+   BLOCK2 U331 (PIN[27] , PIN[29] , GIN[29] , GIN[31] , POUT[27] , GOUT[31] );
+   BLOCK2 U332 (PIN[28] , PIN[30] , GIN[30] , GIN[32] , POUT[28] , GOUT[32] );
+   BLOCK2 U333 (PIN[29] , PIN[31] , GIN[31] , GIN[33] , POUT[29] , GOUT[33] );
+   BLOCK2 U334 (PIN[30] , PIN[32] , GIN[32] , GIN[34] , POUT[30] , GOUT[34] );
+   BLOCK2 U335 (PIN[31] , PIN[33] , GIN[33] , GIN[35] , POUT[31] , GOUT[35] );
+   BLOCK2 U336 (PIN[32] , PIN[34] , GIN[34] , GIN[36] , POUT[32] , GOUT[36] );
+   BLOCK2 U337 (PIN[33] , PIN[35] , GIN[35] , GIN[37] , POUT[33] , GOUT[37] );
+   BLOCK2 U338 (PIN[34] , PIN[36] , GIN[36] , GIN[38] , POUT[34] , GOUT[38] );
+   BLOCK2 U339 (PIN[35] , PIN[37] , GIN[37] , GIN[39] , POUT[35] , GOUT[39] );
+   BLOCK2 U340 (PIN[36] , PIN[38] , GIN[38] , GIN[40] , POUT[36] , GOUT[40] );
+   BLOCK2 U341 (PIN[37] , PIN[39] , GIN[39] , GIN[41] , POUT[37] , GOUT[41] );
+   BLOCK2 U342 (PIN[38] , PIN[40] , GIN[40] , GIN[42] , POUT[38] , GOUT[42] );
+   BLOCK2 U343 (PIN[39] , PIN[41] , GIN[41] , GIN[43] , POUT[39] , GOUT[43] );
+   BLOCK2 U344 (PIN[40] , PIN[42] , GIN[42] , GIN[44] , POUT[40] , GOUT[44] );
+   BLOCK2 U345 (PIN[41] , PIN[43] , GIN[43] , GIN[45] , POUT[41] , GOUT[45] );
+   BLOCK2 U346 (PIN[42] , PIN[44] , GIN[44] , GIN[46] , POUT[42] , GOUT[46] );
+   BLOCK2 U347 (PIN[43] , PIN[45] , GIN[45] , GIN[47] , POUT[43] , GOUT[47] );
+   BLOCK2 U348 (PIN[44] , PIN[46] , GIN[46] , GIN[48] , POUT[44] , GOUT[48] );
+   BLOCK2 U349 (PIN[45] , PIN[47] , GIN[47] , GIN[49] , POUT[45] , GOUT[49] );
+   BLOCK2 U350 (PIN[46] , PIN[48] , GIN[48] , GIN[50] , POUT[46] , GOUT[50] );
+   BLOCK2 U351 (PIN[47] , PIN[49] , GIN[49] , GIN[51] , POUT[47] , GOUT[51] );
+   BLOCK2 U352 (PIN[48] , PIN[50] , GIN[50] , GIN[52] , POUT[48] , GOUT[52] );
+   BLOCK2 U353 (PIN[49] , PIN[51] , GIN[51] , GIN[53] , POUT[49] , GOUT[53] );
+   BLOCK2 U354 (PIN[50] , PIN[52] , GIN[52] , GIN[54] , POUT[50] , GOUT[54] );
+   BLOCK2 U355 (PIN[51] , PIN[53] , GIN[53] , GIN[55] , POUT[51] , GOUT[55] );
+   BLOCK2 U356 (PIN[52] , PIN[54] , GIN[54] , GIN[56] , POUT[52] , GOUT[56] );
+   BLOCK2 U357 (PIN[53] , PIN[55] , GIN[55] , GIN[57] , POUT[53] , GOUT[57] );
+   BLOCK2 U358 (PIN[54] , PIN[56] , GIN[56] , GIN[58] , POUT[54] , GOUT[58] );
+   BLOCK2 U359 (PIN[55] , PIN[57] , GIN[57] , GIN[59] , POUT[55] , GOUT[59] );
+   BLOCK2 U360 (PIN[56] , PIN[58] , GIN[58] , GIN[60] , POUT[56] , GOUT[60] );
+   BLOCK2 U361 (PIN[57] , PIN[59] , GIN[59] , GIN[61] , POUT[57] , GOUT[61] );
+   BLOCK2 U362 (PIN[58] , PIN[60] , GIN[60] , GIN[62] , POUT[58] , GOUT[62] );
+   BLOCK2 U363 (PIN[59] , PIN[61] , GIN[61] , GIN[63] , POUT[59] , GOUT[63] );
+   BLOCK2 U364 (PIN[60] , PIN[62] , GIN[62] , GIN[64] , POUT[60] , GOUT[64] );
+   
+endmodule // DBLC_1_64
+
+
+// DBLC_2_64: combine stage that pairs each position with the one four
+// indices lower (distance 4), using the BLOCK1 cell family.
+//   PIN  - 61 P inputs  [0:60]
+//   GIN  - 65 G inputs  [0:64]
+//   POUT - 57 P outputs [0:56]
+//   GOUT - 65 G outputs [0:64]
+// Wiring pattern:
+//   GOUT[i]            = ~GIN[i]                        for i = 0..3 (INVBLOCK)
+//   GOUT[i]            = BLOCK1A(PIN[i-4], GIN[i-4], GIN[i])  for i = 4..7
+//   GOUT[i], POUT[i-8] = BLOCK1(PIN[i-8], PIN[i-4], GIN[i-4], GIN[i])
+//                        for i = 8..64
+// Every output is produced through an inverting cell.
+// NOTE(review): this looks like one level of a parallel-prefix carry
+// network; confirm against the top-level adder that instantiates it.
+module DBLC_2_64 ( PIN, GIN, POUT, GOUT );
+   
+   input  [0:60] PIN;
+   input [0:64]  GIN;
+   
+   output [0:56] POUT;
+   output [0:64] GOUT;
+   
+   INVBLOCK U10 (GIN[0] , GOUT[0] );
+   INVBLOCK U11 (GIN[1] , GOUT[1] );
+   INVBLOCK U12 (GIN[2] , GOUT[2] );
+   INVBLOCK U13 (GIN[3] , GOUT[3] );
+   BLOCK1A U24 (PIN[0] , GIN[0] , GIN[4] , GOUT[4] );
+   BLOCK1A U25 (PIN[1] , GIN[1] , GIN[5] , GOUT[5] );
+   BLOCK1A U26 (PIN[2] , GIN[2] , GIN[6] , GOUT[6] );
+   BLOCK1A U27 (PIN[3] , GIN[3] , GIN[7] , GOUT[7] );
+   BLOCK1 U38 (PIN[0] , PIN[4] , GIN[4] , GIN[8] , POUT[0] , GOUT[8] );
+   BLOCK1 U39 (PIN[1] , PIN[5] , GIN[5] , GIN[9] , POUT[1] , GOUT[9] );
+   BLOCK1 U310 (PIN[2] , PIN[6] , GIN[6] , GIN[10] , POUT[2] , GOUT[10] );
+   BLOCK1 U311 (PIN[3] , PIN[7] , GIN[7] , GIN[11] , POUT[3] , GOUT[11] );
+   BLOCK1 U312 (PIN[4] , PIN[8] , GIN[8] , GIN[12] , POUT[4] , GOUT[12] );
+   BLOCK1 U313 (PIN[5] , PIN[9] , GIN[9] , GIN[13] , POUT[5] , GOUT[13] );
+   BLOCK1 U314 (PIN[6] , PIN[10] , GIN[10] , GIN[14] , POUT[6] , GOUT[14] );
+   BLOCK1 U315 (PIN[7] , PIN[11] , GIN[11] , GIN[15] , POUT[7] , GOUT[15] );
+   BLOCK1 U316 (PIN[8] , PIN[12] , GIN[12] , GIN[16] , POUT[8] , GOUT[16] );
+   BLOCK1 U317 (PIN[9] , PIN[13] , GIN[13] , GIN[17] , POUT[9] , GOUT[17] );
+   BLOCK1 U318 (PIN[10] , PIN[14] , GIN[14] , GIN[18] , POUT[10] , GOUT[18] );
+   BLOCK1 U319 (PIN[11] , PIN[15] , GIN[15] , GIN[19] , POUT[11] , GOUT[19] );
+   BLOCK1 U320 (PIN[12] , PIN[16] , GIN[16] , GIN[20] , POUT[12] , GOUT[20] );
+   BLOCK1 U321 (PIN[13] , PIN[17] , GIN[17] , GIN[21] , POUT[13] , GOUT[21] );
+   BLOCK1 U322 (PIN[14] , PIN[18] , GIN[18] , GIN[22] , POUT[14] , GOUT[22] );
+   BLOCK1 U323 (PIN[15] , PIN[19] , GIN[19] , GIN[23] , POUT[15] , GOUT[23] );
+   BLOCK1 U324 (PIN[16] , PIN[20] , GIN[20] , GIN[24] , POUT[16] , GOUT[24] );
+   BLOCK1 U325 (PIN[17] , PIN[21] , GIN[21] , GIN[25] , POUT[17] , GOUT[25] );
+   BLOCK1 U326 (PIN[18] , PIN[22] , GIN[22] , GIN[26] , POUT[18] , GOUT[26] );
+   BLOCK1 U327 (PIN[19] , PIN[23] , GIN[23] , GIN[27] , POUT[19] , GOUT[27] );
+   BLOCK1 U328 (PIN[20] , PIN[24] , GIN[24] , GIN[28] , POUT[20] , GOUT[28] );
+   BLOCK1 U329 (PIN[21] , PIN[25] , GIN[25] , GIN[29] , POUT[21] , GOUT[29] );
+   BLOCK1 U330 (PIN[22] , PIN[26] , GIN[26] , GIN[30] , POUT[22] , GOUT[30] );
+   BLOCK1 U331 (PIN[23] , PIN[27] , GIN[27] , GIN[31] , POUT[23] , GOUT[31] );
+   BLOCK1 U332 (PIN[24] , PIN[28] , GIN[28] , GIN[32] , POUT[24] , GOUT[32] );
+   BLOCK1 U333 (PIN[25] , PIN[29] , GIN[29] , GIN[33] , POUT[25] , GOUT[33] );
+   BLOCK1 U334 (PIN[26] , PIN[30] , GIN[30] , GIN[34] , POUT[26] , GOUT[34] );
+   BLOCK1 U335 (PIN[27] , PIN[31] , GIN[31] , GIN[35] , POUT[27] , GOUT[35] );
+   BLOCK1 U336 (PIN[28] , PIN[32] , GIN[32] , GIN[36] , POUT[28] , GOUT[36] );
+   BLOCK1 U337 (PIN[29] , PIN[33] , GIN[33] , GIN[37] , POUT[29] , GOUT[37] );
+   BLOCK1 U338 (PIN[30] , PIN[34] , GIN[34] , GIN[38] , POUT[30] , GOUT[38] );
+   BLOCK1 U339 (PIN[31] , PIN[35] , GIN[35] , GIN[39] , POUT[31] , GOUT[39] );
+   BLOCK1 U340 (PIN[32] , PIN[36] , GIN[36] , GIN[40] , POUT[32] , GOUT[40] );
+   BLOCK1 U341 (PIN[33] , PIN[37] , GIN[37] , GIN[41] , POUT[33] , GOUT[41] );
+   BLOCK1 U342 (PIN[34] , PIN[38] , GIN[38] , GIN[42] , POUT[34] , GOUT[42] );
+   BLOCK1 U343 (PIN[35] , PIN[39] , GIN[39] , GIN[43] , POUT[35] , GOUT[43] );
+   BLOCK1 U344 (PIN[36] , PIN[40] , GIN[40] , GIN[44] , POUT[36] , GOUT[44] );
+   BLOCK1 U345 (PIN[37] , PIN[41] , GIN[41] , GIN[45] , POUT[37] , GOUT[45] );
+   BLOCK1 U346 (PIN[38] , PIN[42] , GIN[42] , GIN[46] , POUT[38] , GOUT[46] );
+   BLOCK1 U347 (PIN[39] , PIN[43] , GIN[43] , GIN[47] , POUT[39] , GOUT[47] );
+   BLOCK1 U348 (PIN[40] , PIN[44] , GIN[44] , GIN[48] , POUT[40] , GOUT[48] );
+   BLOCK1 U349 (PIN[41] , PIN[45] , GIN[45] , GIN[49] , POUT[41] , GOUT[49] );
+   BLOCK1 U350 (PIN[42] , PIN[46] , GIN[46] , GIN[50] , POUT[42] , GOUT[50] );
+   BLOCK1 U351 (PIN[43] , PIN[47] , GIN[47] , GIN[51] , POUT[43] , GOUT[51] );
+   BLOCK1 U352 (PIN[44] , PIN[48] , GIN[48] , GIN[52] , POUT[44] , GOUT[52] );
+   BLOCK1 U353 (PIN[45] , PIN[49] , GIN[49] , GIN[53] , POUT[45] , GOUT[53] );
+   BLOCK1 U354 (PIN[46] , PIN[50] , GIN[50] , GIN[54] , POUT[46] , GOUT[54] );
+   BLOCK1 U355 (PIN[47] , PIN[51] , GIN[51] , GIN[55] , POUT[47] , GOUT[55] );
+   BLOCK1 U356 (PIN[48] , PIN[52] , GIN[52] , GIN[56] , POUT[48] , GOUT[56] );
+   BLOCK1 U357 (PIN[49] , PIN[53] , GIN[53] , GIN[57] , POUT[49] , GOUT[57] );
+   BLOCK1 U358 (PIN[50] , PIN[54] , GIN[54] , GIN[58] , POUT[50] , GOUT[58] );
+   BLOCK1 U359 (PIN[51] , PIN[55] , GIN[55] , GIN[59] , POUT[51] , GOUT[59] );
+   BLOCK1 U360 (PIN[52] , PIN[56] , GIN[56] , GIN[60] , POUT[52] , GOUT[60] );
+   BLOCK1 U361 (PIN[53] , PIN[57] , GIN[57] , GIN[61] , POUT[53] , GOUT[61] );
+   BLOCK1 U362 (PIN[54] , PIN[58] , GIN[58] , GIN[62] , POUT[54] , GOUT[62] );
+   BLOCK1 U363 (PIN[55] , PIN[59] , GIN[59] , GIN[63] , POUT[55] , GOUT[63] );
+   BLOCK1 U364 (PIN[56] , PIN[60] , GIN[60] , GIN[64] , POUT[56] , GOUT[64] );
+   
+endmodule // DBLC_2_64
+
+
+// DBLC_3_64: one level of the 64-bit DBLC prefix tree, combining signals
+// that are 8 bit positions apart.
+//
+//   PIN  [0:56]  propagate inputs from the previous level
+//   GIN  [0:64]  generate inputs from the previous level
+//   POUT [0:48]  propagate outputs to the next level
+//   GOUT [0:64]  generate outputs to the next level
+//
+// The cells (INVBLOCK, BLOCK2A, BLOCK2) are defined elsewhere in this file;
+// only their wiring is described here.
+module DBLC_3_64 ( PIN, GIN, POUT, GOUT );
+
+   input  [0:56] PIN;
+   input  [0:64] GIN;
+
+   output [0:48] POUT;
+   output [0:64] GOUT;
+
+   genvar k;
+
+   generate
+      // Positions 0..7: GIN[k] reaches GOUT[k] through a single INVBLOCK.
+      for (k = 0; k <= 7; k = k + 1) begin : low_bits
+         INVBLOCK U1 (GIN[k] , GOUT[k] );
+      end
+
+      // Positions 8..15: generate-only cell fed by PIN[k], GIN[k] and
+      // GIN[k+8]; no propagate output is produced for these positions.
+      for (k = 0; k <= 7; k = k + 1) begin : gen_only
+         BLOCK2A U2 (PIN[k] , GIN[k] , GIN[k+8] , GOUT[k+8] );
+      end
+
+      // Positions 16..64: full cell producing both POUT[k] and GOUT[k+16]
+      // from the propagate pair (k, k+8) and the generate pair (k+8, k+16).
+      for (k = 0; k <= 48; k = k + 1) begin : gen_prop
+         BLOCK2 U3 (PIN[k] , PIN[k+8] , GIN[k+8] , GIN[k+16] , POUT[k] , GOUT[k+16] );
+      end
+   endgenerate
+
+endmodule // DBLC_3_64
+
+
+// DBLC_4_64: one level of the 64-bit DBLC prefix tree, combining signals
+// that are 16 bit positions apart.
+//
+//   PIN  [0:48]  propagate inputs from the previous level
+//   GIN  [0:64]  generate inputs from the previous level
+//   POUT [0:32]  propagate outputs to the next level
+//   GOUT [0:64]  generate outputs to the next level
+//
+// The cells (INVBLOCK, BLOCK1A, BLOCK1) are defined elsewhere in this file;
+// only their wiring is described here.
+module DBLC_4_64 ( PIN, GIN, POUT, GOUT );
+
+   input  [0:48] PIN;
+   input  [0:64] GIN;
+
+   output [0:32] POUT;
+   output [0:64] GOUT;
+
+   genvar k;
+
+   generate
+      // Positions 0..15: GIN[k] reaches GOUT[k] through a single INVBLOCK.
+      for (k = 0; k <= 15; k = k + 1) begin : low_bits
+         INVBLOCK U1 (GIN[k] , GOUT[k] );
+      end
+
+      // Positions 16..31: generate-only cell fed by PIN[k], GIN[k] and
+      // GIN[k+16]; no propagate output is produced for these positions.
+      for (k = 0; k <= 15; k = k + 1) begin : gen_only
+         BLOCK1A U2 (PIN[k] , GIN[k] , GIN[k+16] , GOUT[k+16] );
+      end
+
+      // Positions 32..64: full cell producing both POUT[k] and GOUT[k+32]
+      // from the propagate pair (k, k+16) and the generate pair (k+16, k+32).
+      for (k = 0; k <= 32; k = k + 1) begin : gen_prop
+         BLOCK1 U3 (PIN[k] , PIN[k+16] , GIN[k+16] , GIN[k+32] , POUT[k] , GOUT[k+32] );
+      end
+   endgenerate
+
+endmodule // DBLC_4_64
+
+
+// DBLC_5_64: last level of the 64-bit DBLC prefix tree, combining signals
+// that are 32 bit positions apart.
+//
+//   PIN  [0:32]  propagate inputs from the previous level
+//   GIN  [0:64]  generate inputs from the previous level
+//   POUT [0:0]   single remaining propagate output
+//   GOUT [0:64]  generate outputs of the tree
+//
+// The cells (INVBLOCK, BLOCK2A, BLOCK2) are defined elsewhere in this file;
+// only their wiring is described here.
+module DBLC_5_64 ( PIN, GIN, POUT, GOUT );
+
+   input  [0:32] PIN;
+   input  [0:64] GIN;
+
+   output [0:0]  POUT;
+   output [0:64] GOUT;
+
+   genvar k;
+
+   generate
+      // Positions 0..31: GIN[k] reaches GOUT[k] through a single INVBLOCK.
+      for (k = 0; k <= 31; k = k + 1) begin : low_bits
+         INVBLOCK U1 (GIN[k] , GOUT[k] );
+      end
+
+      // Positions 32..63: generate-only cell fed by PIN[k], GIN[k] and
+      // GIN[k+32].
+      for (k = 0; k <= 31; k = k + 1) begin : gen_only
+         BLOCK2A U2 (PIN[k] , GIN[k] , GIN[k+32] , GOUT[k+32] );
+      end
+   endgenerate
+
+   // Position 64 is the only one that still yields a propagate output.
+   BLOCK2 U3 (PIN[0] , PIN[32] , GIN[32] , GIN[64] , POUT[0] , GOUT[64] );
+
+endmodule // DBLC_5_64
+
+
+// XORSTAGE_64: final stage of the 64-bit adder.
+//
+//   A, B   [0:63]  the two operands
+//   PBIT           propagate bit delivered by the prefix tree
+//   CARRY  [0:64]  carry vector delivered by the prefix tree
+//   SUM    [0:63]  result bits
+//   COUT           carry out
+//
+// XXOR1 and BLOCK1A are defined elsewhere in this file.
+module XORSTAGE_64 ( A, B, PBIT, CARRY, SUM, COUT );
+
+   input  [0:63] A;
+   input  [0:63] B;
+   input         PBIT;
+   input  [0:64] CARRY;
+
+   output [0:63] SUM;
+   output        COUT;
+
+   genvar k;
+
+   // One XXOR1 cell per bit: SUM[k] is formed from A[k], B[k] and CARRY[k].
+   generate
+      for (k = 0; k <= 63; k = k + 1) begin : sum_bit
+         XXOR1 U2 (A[k] , B[k] , CARRY[k] , SUM[k] );
+      end
+   endgenerate
+
+   // Carry out is formed from PBIT together with CARRY[0] and CARRY[64].
+   BLOCK1A U1 (PBIT , CARRY[0] , CARRY[64] , COUT );
+
+endmodule // XORSTAGE_64
+
+
+// DBLCTREE_64: six-level prefix tree of the 64-bit adder.
+//
+// Chains DBLC_0_64 .. DBLC_5_64. Each level consumes the propagate/generate
+// vectors of the previous one; the propagate vector shrinks from 64 bits at
+// the input to a single bit at the output, while the generate vector stays
+// 65 bits wide throughout.
+//
+//   PIN  [0:63]  propagate inputs
+//   GIN  [0:64]  generate inputs
+//   GOUT [0:64]  generate outputs of the last level
+//   POUT [0:0]   propagate output of the last level
+module DBLCTREE_64 ( PIN, GIN, GOUT, POUT );
+
+   input  [0:63] PIN;
+   input  [0:64] GIN;
+
+   output [0:64] GOUT;
+   output [0:0]  POUT;
+
+   // Propagate vectors between consecutive levels
+   wire [0:62] prop_l0;
+   wire [0:60] prop_l1;
+   wire [0:56] prop_l2;
+   wire [0:48] prop_l3;
+   wire [0:32] prop_l4;
+
+   // Generate vectors between consecutive levels
+   wire [0:64] gen_l0;
+   wire [0:64] gen_l1;
+   wire [0:64] gen_l2;
+   wire [0:64] gen_l3;
+   wire [0:64] gen_l4;
+
+   DBLC_0_64 U_0 (
+      .PIN  (PIN),
+      .GIN  (GIN),
+      .POUT (prop_l0),
+      .GOUT (gen_l0)
+   );
+
+   DBLC_1_64 U_1 (
+      .PIN  (prop_l0),
+      .GIN  (gen_l0),
+      .POUT (prop_l1),
+      .GOUT (gen_l1)
+   );
+
+   DBLC_2_64 U_2 (
+      .PIN  (prop_l1),
+      .GIN  (gen_l1),
+      .POUT (prop_l2),
+      .GOUT (gen_l2)
+   );
+
+   DBLC_3_64 U_3 (
+      .PIN  (prop_l2),
+      .GIN  (gen_l2),
+      .POUT (prop_l3),
+      .GOUT (gen_l3)
+   );
+
+   DBLC_4_64 U_4 (
+      .PIN  (prop_l3),
+      .GIN  (gen_l3),
+      .POUT (prop_l4),
+      .GOUT (gen_l4)
+   );
+
+   DBLC_5_64 U_5 (
+      .PIN  (prop_l4),
+      .GIN  (gen_l4),
+      .POUT (POUT),
+      .GOUT (GOUT)
+   );
+
+endmodule // DBLCTREE_64
+
+
+// DBLCADDER_64_64: top level of the 64-bit DBLC adder.
+//
+// Three stages are chained: PRESTAGE_64 turns the operands and carry-in
+// into propagate/generate vectors, DBLCTREE_64 reduces them to a carry
+// vector plus one propagate bit, and XORSTAGE_64 forms SUM and COUT.
+//
+//   OPA, OPB [0:63]  operands
+//   CIN              carry in
+//   SUM      [0:63]  result
+//   COUT             carry out
+module DBLCADDER_64_64 ( OPA, OPB, CIN, SUM, COUT );
+
+   input  [0:63] OPA;
+   input  [0:63] OPB;
+   input         CIN;
+
+   output [0:63] SUM;
+   output        COUT;
+
+   wire [0:63] pre_prop;   // propagate vector out of the pre-stage
+   wire [0:64] pre_gen;    // generate vector out of the pre-stage
+   wire [0:64] tree_carry; // generate outputs of the prefix tree
+   wire [0:0]  tree_prop;  // propagate output of the prefix tree
+
+   // PRESTAGE_64 is declared outside this excerpt, so its ports stay
+   // connected by position.
+   PRESTAGE_64 U1 (OPA , OPB , CIN , pre_prop , pre_gen );
+
+   DBLCTREE_64 U2 (
+      .PIN  (pre_prop),
+      .GIN  (pre_gen),
+      .GOUT (tree_carry),
+      .POUT (tree_prop)
+   );
+
+   XORSTAGE_64 U3 (
+      .A     (OPA),
+      .B     (OPB),
+      .PBIT  (tree_prop[0]),
+      .CARRY (tree_carry),
+      .SUM   (SUM),
+      .COUT  (COUT)
+   );
+
+endmodule // DBLCADDER_64_64
--- a/wally-pipelined/src/fpu/bk128.sv
+++ b/wally-pipelined/src/fpu/bk128.sv
@ -0,0 +1,599 @@
+// Brent-Kung Carry-save Prefix Adder
+
+// bk128: 128-bit adder built around the brent_kung_cs128 prefix tree.
+//
+//   a, b  [127:0]  operands
+//   cin            carry in
+//   sum   [127:0]  a + b + cin (low 128 bits)
+//   cout           carry out of bit 127
+//
+// The p/g vectors are one bit wider than the operands: index 0 carries the
+// carry-in (p[0] = 0, g[0] = cin) and index i+1 belongs to operand bit i
+// (p[i+1] = a[i]^b[i], g[i+1] = a[i]&b[i]).
+module bk128 (cout, sum, a, b, cin);
+   
+   input [127:0] a, b;
+   input 	 cin;
+   
+   output [127:0] sum;
+   output 	  cout;
+
+   wire [128:0]   p,g;
+   wire [127:0]   c;
+
+   // pre-computation
+   assign p={a^b,1'b0};
+   assign g={a&b, cin};
+
+   // prefix tree
+   // The tree declares its output as [128:1]; it is connected by position,
+   // so c[i] here is the tree's c[i+1].
+   // NOTE(review): presumably c[i] is the carry into operand bit i -- the
+   // tree's output assignments lie outside this excerpt; confirm there.
+   brent_kung_cs128 prefix_tree(c, p[127:0], g[127:0]);
+
+   // post-computation
+   // sum[i] = (a[i]^b[i]) ^ c[i]
+   assign sum=p[128:1]^c;
+   // carry out: top bit generates, or propagates the carry held in c[127]
+   assign cout=g[128]|(p[128]&c[127]);
+
+endmodule
+
+module brent_kung_cs128 (c, p, g);
+   
+   input [127:0] p;
+   input [127:0] g;
+   output [128:1] c;
+
+
+   // parallel-prefix, Brent-Kung
+
+   // Stage 1: Generates G/P pairs that span 1 bits
+   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
+   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
+   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
+   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
+   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
+   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
+   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
+   black b_15_14 (G_15_14, P_15_14, {g[15],g[14]}, {p[15],p[14]});
+
+   black b_17_16 (G_17_16, P_17_16, {g[17],g[16]}, {p[17],p[16]});
+   black b_19_18 (G_19_18, P_19_18, {g[19],g[18]}, {p[19],p[18]});
+   black b_21_20 (G_21_20, P_21_20, {g[21],g[20]}, {p[21],p[20]});
+   black b_23_22 (G_23_22, P_23_22, {g[23],g[22]}, {p[23],p[22]});
+   black b_25_24 (G_25_24, P_25_24, {g[25],g[24]}, {p[25],p[24]});
+   black b_27_26 (G_27_26, P_27_26, {g[27],g[26]}, {p[27],p[26]});
+   black b_29_28 (G_29_28, P_29_28, {g[29],g[28]}, {p[29],p[28]});
+   black b_31_30 (G_31_30, P_31_30, {g[31],g[30]}, {p[31],p[30]});
+
+   black b_33_32 (G_33_32, P_33_32, {g[33],g[32]}, {p[33],p[32]});
+   black b_35_34 (G_35_34, P_35_34, {g[35],g[34]}, {p[35],p[34]});
+   black b_37_36 (G_37_36, P_37_36, {g[37],g[36]}, {p[37],p[36]});
+   black b_39_38 (G_39_38, P_39_38, {g[39],g[38]}, {p[39],p[38]});
+   black b_41_40 (G_41_40, P_41_40, {g[41],g[40]}, {p[41],p[40]});
+   black b_43_42 (G_43_42, P_43_42, {g[43],g[42]}, {p[43],p[42]});
+   black b_45_44 (G_45_44, P_45_44, {g[45],g[44]}, {p[45],p[44]});
+   black b_47_46 (G_47_46, P_47_46, {g[47],g[46]}, {p[47],p[46]});
+
+   black b_49_48 (G_49_48, P_49_48, {g[49],g[48]}, {p[49],p[48]});
+   black b_51_50 (G_51_50, P_51_50, {g[51],g[50]}, {p[51],p[50]});
+   black b_53_52 (G_53_52, P_53_52, {g[53],g[52]}, {p[53],p[52]});
+   black b_55_54 (G_55_54, P_55_54, {g[55],g[54]}, {p[55],p[54]});
+   black b_57_56 (G_57_56, P_57_56, {g[57],g[56]}, {p[57],p[56]});
+   black b_59_58 (G_59_58, P_59_58, {g[59],g[58]}, {p[59],p[58]});
+   black b_61_60 (G_61_60, P_61_60, {g[61],g[60]}, {p[61],p[60]});
+   black b_63_62 (G_63_62, P_63_62, {g[63],g[62]}, {p[63],p[62]});
+
+   black b_65_64 (G_65_64, P_65_64, {g[65],g[64]}, {p[65],p[64]});
+   black b_67_66 (G_67_66, P_67_66, {g[67],g[66]}, {p[67],p[66]});
+   black b_69_68 (G_69_68, P_69_68, {g[69],g[68]}, {p[69],p[68]});
+   black b_71_70 (G_71_70, P_71_70, {g[71],g[70]}, {p[71],p[70]});
+   black b_73_72 (G_73_72, P_73_72, {g[73],g[72]}, {p[73],p[72]});
+   black b_75_74 (G_75_74, P_75_74, {g[75],g[74]}, {p[75],p[74]});
+   black b_77_76 (G_77_76, P_77_76, {g[77],g[76]}, {p[77],p[76]});
+   black b_79_78 (G_79_78, P_79_78, {g[79],g[78]}, {p[79],p[78]});
+
+   black b_81_80 (G_81_80, P_81_80, {g[81],g[80]}, {p[81],p[80]});
+   black b_83_82 (G_83_82, P_83_82, {g[83],g[82]}, {p[83],p[82]});
+   black b_85_84 (G_85_84, P_85_84, {g[85],g[84]}, {p[85],p[84]});
+   black b_87_86 (G_87_86, P_87_86, {g[87],g[86]}, {p[87],p[86]});
+   black b_89_88 (G_89_88, P_89_88, {g[89],g[88]}, {p[89],p[88]});
+   black b_91_90 (G_91_90, P_91_90, {g[91],g[90]}, {p[91],p[90]});
+   black b_93_92 (G_93_92, P_93_92, {g[93],g[92]}, {p[93],p[92]});
+   black b_95_94 (G_95_94, P_95_94, {g[95],g[94]}, {p[95],p[94]});
+
+   black b_97_96 (G_97_96, P_97_96, {g[97],g[96]}, {p[97],p[96]});
+   black b_99_98 (G_99_98, P_99_98, {g[99],g[98]}, {p[99],p[98]});
+   black b_101_100 (G_101_100, P_101_100, {g[101],g[100]}, {p[101],p[100]});
+   black b_103_102 (G_103_102, P_103_102, {g[103],g[102]}, {p[103],p[102]});
+   black b_105_104 (G_105_104, P_105_104, {g[105],g[104]}, {p[105],p[104]});
+   black b_107_106 (G_107_106, P_107_106, {g[107],g[106]}, {p[107],p[106]});
+   black b_109_108 (G_109_108, P_109_108, {g[109],g[108]}, {p[109],p[108]});
+   black b_111_110 (G_111_110, P_111_110, {g[111],g[110]}, {p[111],p[110]});
+
+   black b_113_112 (G_113_112, P_113_112, {g[113],g[112]}, {p[113],p[112]});
+   black b_115_114 (G_115_114, P_115_114, {g[115],g[114]}, {p[115],p[114]});
+   black b_117_116 (G_117_116, P_117_116, {g[117],g[116]}, {p[117],p[116]});
+   black b_119_118 (G_119_118, P_119_118, {g[119],g[118]}, {p[119],p[118]});
+   black b_121_120 (G_121_120, P_121_120, {g[121],g[120]}, {p[121],p[120]});
+   black b_123_122 (G_123_122, P_123_122, {g[123],g[122]}, {p[123],p[122]});
+   black b_125_124 (G_125_124, P_125_124, {g[125],g[124]}, {p[125],p[124]});
+   black b_127_126 (G_127_126, P_127_126, {g[127],g[126]}, {p[127],p[126]});
+
+
+   // Stage 2: Generates G/P pairs that span 2 bits
+   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
+   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
+   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
+   black b_15_12 (G_15_12, P_15_12, {G_15_14,G_13_12}, {P_15_14,P_13_12});
+   black b_19_16 (G_19_16, P_19_16, {G_19_18,G_17_16}, {P_19_18,P_17_16});
+   black b_23_20 (G_23_20, P_23_20, {G_23_22,G_21_20}, {P_23_22,P_21_20});
+   black b_27_24 (G_27_24, P_27_24, {G_27_26,G_25_24}, {P_27_26,P_25_24});
+   black b_31_28 (G_31_28, P_31_28, {G_31_30,G_29_28}, {P_31_30,P_29_28});
+
+   black b_35_32 (G_35_32, P_35_32, {G_35_34,G_33_32}, {P_35_34,P_33_32});
+   black b_39_36 (G_39_36, P_39_36, {G_39_38,G_37_36}, {P_39_38,P_37_36});
+   black b_43_40 (G_43_40, P_43_40, {G_43_42,G_41_40}, {P_43_42,P_41_40});
+   black b_47_44 (G_47_44, P_47_44, {G_47_46,G_45_44}, {P_47_46,P_45_44});
+   black b_51_48 (G_51_48, P_51_48, {G_51_50,G_49_48}, {P_51_50,P_49_48});
+   black b_55_52 (G_55_52, P_55_52, {G_55_54,G_53_52}, {P_55_54,P_53_52});
+   black b_59_56 (G_59_56, P_59_56, {G_59_58,G_57_56}, {P_59_58,P_57_56});
+   black b_63_60 (G_63_60, P_63_60, {G_63_62,G_61_60}, {P_63_62,P_61_60});
+
+   black b_67_64 (G_67_64, P_67_64, {G_67_66,G_65_64}, {P_67_66,P_65_64});
+   black b_71_68 (G_71_68, P_71_68, {G_71_70,G_69_68}, {P_71_70,P_69_68});
+   black b_75_72 (G_75_72, P_75_72, {G_75_74,G_73_72}, {P_75_74,P_73_72});
+   black b_79_76 (G_79_76, P_79_76, {G_79_78,G_77_76}, {P_79_78,P_77_76});
+   black b_83_80 (G_83_80, P_83_80, {G_83_82,G_81_80}, {P_83_82,P_81_80});
+   black b_87_84 (G_87_84, P_87_84, {G_87_86,G_85_84}, {P_87_86,P_85_84});
+   black b_91_88 (G_91_88, P_91_88, {G_91_90,G_89_88}, {P_91_90,P_89_88});
+   black b_95_92 (G_95_92, P_95_92, {G_95_94,G_93_92}, {P_95_94,P_93_92});
+
+   black b_99_96 (G_99_96, P_99_96, {G_99_98,G_97_96}, {P_99_98,P_97_96});
+   black b_103_100 (G_103_100, P_103_100, {G_103_102,G_101_100}, {P_103_102,P_101_100});
+   black b_107_104 (G_107_104, P_107_104, {G_107_106,G_105_104}, {P_107_106,P_105_104});
+   black b_111_108 (G_111_108, P_111_108, {G_111_110,G_109_108}, {P_111_110,P_109_108});
+   black b_115_112 (G_115_112, P_115_112, {G_115_114,G_113_112}, {P_115_114,P_113_112});
+   black b_119_116 (G_119_116, P_119_116, {G_119_118,G_117_116}, {P_119_118,P_117_116});
+   black b_123_120 (G_123_120, P_123_120, {G_123_122,G_121_120}, {P_123_122,P_121_120});
+   black b_127_124 (G_127_124, P_127_124, {G_127_126,G_125_124}, {P_127_126,P_125_124});
+
+
+   // Stage 3: Generates G/P pairs that span 4 bits
+   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
+   black b_15_8 (G_15_8, P_15_8, {G_15_12,G_11_8}, {P_15_12,P_11_8});
+   black b_23_16 (G_23_16, P_23_16, {G_23_20,G_19_16}, {P_23_20,P_19_16});
+   black b_31_24 (G_31_24, P_31_24, {G_31_28,G_27_24}, {P_31_28,P_27_24});
+   black b_39_32 (G_39_32, P_39_32, {G_39_36,G_35_32}, {P_39_36,P_35_32});
+   black b_47_40 (G_47_40, P_47_40, {G_47_44,G_43_40}, {P_47_44,P_43_40});
+   black b_55_48 (G_55_48, P_55_48, {G_55_52,G_51_48}, {P_55_52,P_51_48});
+   black b_63_56 (G_63_56, P_63_56, {G_63_60,G_59_56}, {P_63_60,P_59_56});
+
+   black b_71_64 (G_71_64, P_71_64, {G_71_68,G_67_64}, {P_71_68,P_67_64});
+   black b_79_72 (G_79_72, P_79_72, {G_79_76,G_75_72}, {P_79_76,P_75_72});
+   black b_87_80 (G_87_80, P_87_80, {G_87_84,G_83_80}, {P_87_84,P_83_80});
+   black b_95_88 (G_95_88, P_95_88, {G_95_92,G_91_88}, {P_95_92,P_91_88});
+   black b_103_96 (G_103_96, P_103_96, {G_103_100,G_99_96}, {P_103_100,P_99_96});
+   black b_111_104 (G_111_104, P_111_104, {G_111_108,G_107_104}, {P_111_108,P_107_104});
+   black b_119_112 (G_119_112, P_119_112, {G_119_116,G_115_112}, {P_119_116,P_115_112});
+   black b_127_120 (G_127_120, P_127_120, {G_127_124,G_123_120}, {P_127_124,P_123_120});
+
+
+   // Stage 4: Generates G/P pairs that span 8 bits
+   grey g_15_0 (G_15_0, {G_15_8,G_7_0}, P_15_8);
+   black b_31_16 (G_31_16, P_31_16, {G_31_24,G_23_16}, {P_31_24,P_23_16});
+   black b_47_32 (G_47_32, P_47_32, {G_47_40,G_39_32}, {P_47_40,P_39_32});
+   black b_63_48 (G_63_48, P_63_48, {G_63_56,G_55_48}, {P_63_56,P_55_48});
+   black b_79_64 (G_79_64, P_79_64, {G_79_72,G_71_64}, {P_79_72,P_71_64});
+   black b_95_80 (G_95_80, P_95_80, {G_95_88,G_87_80}, {P_95_88,P_87_80});
+   black b_111_96 (G_111_96, P_111_96, {G_111_104,G_103_96}, {P_111_104,P_103_96});
+   black b_127_112 (G_127_112, P_127_112, {G_127_120,G_119_112}, {P_127_120,P_119_112});
+
+
+   // Stage 5: Generates G/P pairs that span 16 bits
+   grey g_31_0 (G_31_0, {G_31_16,G_15_0}, P_31_16);
+   black b_63_32 (G_63_32, P_63_32, {G_63_48,G_47_32}, {P_63_48,P_47_32});
+   black b_95_64 (G_95_64, P_95_64, {G_95_80,G_79_64}, {P_95_80,P_79_64});
+   black b_127_96 (G_127_96, P_127_96, {G_127_112,G_111_96}, {P_127_112,P_111_96});
+
+   // Stage 6: Generates G/P pairs that span 32 bits
+   grey g_63_0 (G_63_0, {G_63_32,G_31_0}, P_63_32);
+   black b_127_64 (G_127_64, P_127_64, {G_127_96,G_95_64}, {P_127_96,P_95_64});
+
+   // Stage 7: Generates G/P pairs that span 64 bits
+   grey g_127_0 (G_127_0, {G_127_64,G_63_0}, P_127_64);
+
+   // Stage 8: Generates G/P pairs that span 32 bits
+   grey g_95_0 (G_95_0, {G_95_64,G_63_0}, P_95_64);
+
+   // Stage 9: Generates G/P pairs that span 16 bits
+   grey g_47_0 (G_47_0, {G_47_32,G_31_0}, P_47_32);
+   grey g_79_0 (G_79_0, {G_79_64,G_63_0}, P_79_64);
+   grey g_111_0 (G_111_0, {G_111_96,G_95_0}, P_111_96);
+
+   // Stage 10: Generates G/P pairs that span 8 bits
+   grey g_23_0 (G_23_0, {G_23_16,G_15_0}, P_23_16);
+   grey g_39_0 (G_39_0, {G_39_32,G_31_0}, P_39_32);
+   grey g_55_0 (G_55_0, {G_55_48,G_47_0}, P_55_48);
+   grey g_71_0 (G_71_0, {G_71_64,G_63_0}, P_71_64);
+   grey g_87_0 (G_87_0, {G_87_80,G_79_0}, P_87_80);
+   grey g_103_0 (G_103_0, {G_103_96,G_95_0}, P_103_96);
+   grey g_119_0 (G_119_0, {G_119_112,G_111_0}, P_119_112);
+
+   // Stage 11: Generates G/P pairs that span 4 bits
+   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
+   grey g_19_0 (G_19_0, {G_19_16,G_15_0}, P_19_16);
+   grey g_27_0 (G_27_0, {G_27_24,G_23_0}, P_27_24);
+   grey g_35_0 (G_35_0, {G_35_32,G_31_0}, P_35_32);
+   grey g_43_0 (G_43_0, {G_43_40,G_39_0}, P_43_40);
+   grey g_51_0 (G_51_0, {G_51_48,G_47_0}, P_51_48);
+   grey g_59_0 (G_59_0, {G_59_56,G_55_0}, P_59_56);
+   grey g_67_0 (G_67_0, {G_67_64,G_63_0}, P_67_64);
+   grey g_75_0 (G_75_0, {G_75_72,G_71_0}, P_75_72);
+   grey g_83_0 (G_83_0, {G_83_80,G_79_0}, P_83_80);
+   grey g_91_0 (G_91_0, {G_91_88,G_87_0}, P_91_88);
+   grey g_99_0 (G_99_0, {G_99_96,G_95_0}, P_99_96);
+   grey g_107_0 (G_107_0, {G_107_104,G_103_0}, P_107_104);
+   grey g_115_0 (G_115_0, {G_115_112,G_111_0}, P_115_112);
+   grey g_123_0 (G_123_0, {G_123_120,G_119_0}, P_123_120);
+
+   // Stage 12: Generates G/P pairs that span 2 bits
+   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
+   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
+   grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
+   grey g_17_0 (G_17_0, {G_17_16,G_15_0}, P_17_16);
+   grey g_21_0 (G_21_0, {G_21_20,G_19_0}, P_21_20);
+   grey g_25_0 (G_25_0, {G_25_24,G_23_0}, P_25_24);
+   grey g_29_0 (G_29_0, {G_29_28,G_27_0}, P_29_28);
+   grey g_33_0 (G_33_0, {G_33_32,G_31_0}, P_33_32);
+   grey g_37_0 (G_37_0, {G_37_36,G_35_0}, P_37_36);
+   grey g_41_0 (G_41_0, {G_41_40,G_39_0}, P_41_40);
+   grey g_45_0 (G_45_0, {G_45_44,G_43_0}, P_45_44);
+   grey g_49_0 (G_49_0, {G_49_48,G_47_0}, P_49_48);
+   grey g_53_0 (G_53_0, {G_53_52,G_51_0}, P_53_52);
+   grey g_57_0 (G_57_0, {G_57_56,G_55_0}, P_57_56);
+   grey g_61_0 (G_61_0, {G_61_60,G_59_0}, P_61_60);
+   grey g_65_0 (G_65_0, {G_65_64,G_63_0}, P_65_64);
+   grey g_69_0 (G_69_0, {G_69_68,G_67_0}, P_69_68);
+   grey g_73_0 (G_73_0, {G_73_72,G_71_0}, P_73_72);
+   grey g_77_0 (G_77_0, {G_77_76,G_75_0}, P_77_76);
+   grey g_81_0 (G_81_0, {G_81_80,G_79_0}, P_81_80);
+   grey g_85_0 (G_85_0, {G_85_84,G_83_0}, P_85_84);
+   grey g_89_0 (G_89_0, {G_89_88,G_87_0}, P_89_88);
+   grey g_93_0 (G_93_0, {G_93_92,G_91_0}, P_93_92);
+   grey g_97_0 (G_97_0, {G_97_96,G_95_0}, P_97_96);
+   grey g_101_0 (G_101_0, {G_101_100,G_99_0}, P_101_100);
+   grey g_105_0 (G_105_0, {G_105_104,G_103_0}, P_105_104);
+   grey g_109_0 (G_109_0, {G_109_108,G_107_0}, P_109_108);
+   grey g_113_0 (G_113_0, {G_113_112,G_111_0}, P_113_112);
+   grey g_117_0 (G_117_0, {G_117_116,G_115_0}, P_117_116);
+   grey g_121_0 (G_121_0, {G_121_120,G_119_0}, P_121_120);
+   grey g_125_0 (G_125_0, {G_125_124,G_123_0}, P_125_124);
+
+   // Last grey cell stage 
+   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
+   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
+   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
+   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
+   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
+   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
+   grey g_14_0 (G_14_0, {g[14],G_13_0}, p[14]);
+   grey g_16_0 (G_16_0, {g[16],G_15_0}, p[16]);
+   grey g_18_0 (G_18_0, {g[18],G_17_0}, p[18]);
+   grey g_20_0 (G_20_0, {g[20],G_19_0}, p[20]);
+   grey g_22_0 (G_22_0, {g[22],G_21_0}, p[22]);
+   grey g_24_0 (G_24_0, {g[24],G_23_0}, p[24]);
+   grey g_26_0 (G_26_0, {g[26],G_25_0}, p[26]);
+   grey g_28_0 (G_28_0, {g[28],G_27_0}, p[28]);
+   grey g_30_0 (G_30_0, {g[30],G_29_0}, p[30]);
+   grey g_32_0 (G_32_0, {g[32],G_31_0}, p[32]);
+   grey g_34_0 (G_34_0, {g[34],G_33_0}, p[34]);
+   grey g_36_0 (G_36_0, {g[36],G_35_0}, p[36]);
+   grey g_38_0 (G_38_0, {g[38],G_37_0}, p[38]);
+   grey g_40_0 (G_40_0, {g[40],G_39_0}, p[40]);
+   grey g_42_0 (G_42_0, {g[42],G_41_0}, p[42]);
+   grey g_44_0 (G_44_0, {g[44],G_43_0}, p[44]);
+   grey g_46_0 (G_46_0, {g[46],G_45_0}, p[46]);
+   grey g_48_0 (G_48_0, {g[48],G_47_0}, p[48]);
+   grey g_50_0 (G_50_0, {g[50],G_49_0}, p[50]);
+   grey g_52_0 (G_52_0, {g[52],G_51_0}, p[52]);
+   grey g_54_0 (G_54_0, {g[54],G_53_0}, p[54]);
+   grey g_56_0 (G_56_0, {g[56],G_55_0}, p[56]);
+   grey g_58_0 (G_58_0, {g[58],G_57_0}, p[58]);
+   grey g_60_0 (G_60_0, {g[60],G_59_0}, p[60]);
+   grey g_62_0 (G_62_0, {g[62],G_61_0}, p[62]);
+   grey g_64_0 (G_64_0, {g[64],G_63_0}, p[64]);
+   grey g_66_0 (G_66_0, {g[66],G_65_0}, p[66]);
+   grey g_68_0 (G_68_0, {g[68],G_67_0}, p[68]);
+   grey g_70_0 (G_70_0, {g[70],G_69_0}, p[70]);
+   grey g_72_0 (G_72_0, {g[72],G_71_0}, p[72]);
+   grey g_74_0 (G_74_0, {g[74],G_73_0}, p[74]);
+   grey g_76_0 (G_76_0, {g[76],G_75_0}, p[76]);
+   grey g_78_0 (G_78_0, {g[78],G_77_0}, p[78]);
+   grey g_80_0 (G_80_0, {g[80],G_79_0}, p[80]);
+   grey g_82_0 (G_82_0, {g[82],G_81_0}, p[82]);
+   grey g_84_0 (G_84_0, {g[84],G_83_0}, p[84]);
+   grey g_86_0 (G_86_0, {g[86],G_85_0}, p[86]);
+   grey g_88_0 (G_88_0, {g[88],G_87_0}, p[88]);
+   grey g_90_0 (G_90_0, {g[90],G_89_0}, p[90]);
+   grey g_92_0 (G_92_0, {g[92],G_91_0}, p[92]);
+   grey g_94_0 (G_94_0, {g[94],G_93_0}, p[94]);
+   grey g_96_0 (G_96_0, {g[96],G_95_0}, p[96]);
+   grey g_98_0 (G_98_0, {g[98],G_97_0}, p[98]);
+   grey g_100_0 (G_100_0, {g[100],G_99_0}, p[100]);
+   grey g_102_0 (G_102_0, {g[102],G_101_0}, p[102]);
+   grey g_104_0 (G_104_0, {g[104],G_103_0}, p[104]);
+   grey g_106_0 (G_106_0, {g[106],G_105_0}, p[106]);
+   grey g_108_0 (G_108_0, {g[108],G_107_0}, p[108]);
+   grey g_110_0 (G_110_0, {g[110],G_109_0}, p[110]);
+   grey g_112_0 (G_112_0, {g[112],G_111_0}, p[112]);
+   grey g_114_0 (G_114_0, {g[114],G_113_0}, p[114]);
+   grey g_116_0 (G_116_0, {g[116],G_115_0}, p[116]);
+   grey g_118_0 (G_118_0, {g[118],G_117_0}, p[118]);
+   grey g_120_0 (G_120_0, {g[120],G_119_0}, p[120]);
+   grey g_122_0 (G_122_0, {g[122],G_121_0}, p[122]);
+   grey g_124_0 (G_124_0, {g[124],G_123_0}, p[124]);
+   grey g_126_0 (G_126_0, {g[126],G_125_0}, p[126]);
+
+   // Final Stage: Apply c_k+1=G_k_0
+   assign c[1]=g[0];
+   assign c[2]=G_1_0;
+   assign c[3]=G_2_0;
+   assign c[4]=G_3_0;
+   assign c[5]=G_4_0;
+   assign c[6]=G_5_0;
+   assign c[7]=G_6_0;
+   assign c[8]=G_7_0;
+   assign c[9]=G_8_0;
+
+   assign c[10]=G_9_0;
+   assign c[11]=G_10_0;
+   assign c[12]=G_11_0;
+   assign c[13]=G_12_0;
+   assign c[14]=G_13_0;
+   assign c[15]=G_14_0;
+   assign c[16]=G_15_0;
+   assign c[17]=G_16_0;
+
+   assign c[18]=G_17_0;
+   assign c[19]=G_18_0;
+   assign c[20]=G_19_0;
+   assign c[21]=G_20_0;
+   assign c[22]=G_21_0;
+   assign c[23]=G_22_0;
+   assign c[24]=G_23_0;
+   assign c[25]=G_24_0;
+
+   assign c[26]=G_25_0;
+   assign c[27]=G_26_0;
+   assign c[28]=G_27_0;
+   assign c[29]=G_28_0;
+   assign c[30]=G_29_0;
+   assign c[31]=G_30_0;
+   assign c[32]=G_31_0;
+   assign c[33]=G_32_0;
+
+   assign c[34]=G_33_0;
+   assign c[35]=G_34_0;
+   assign c[36]=G_35_0;
+   assign c[37]=G_36_0;
+   assign c[38]=G_37_0;
+   assign c[39]=G_38_0;
+   assign c[40]=G_39_0;
+   assign c[41]=G_40_0;
+
+   assign c[42]=G_41_0;
+   assign c[43]=G_42_0;
+   assign c[44]=G_43_0;
+   assign c[45]=G_44_0;
+   assign c[46]=G_45_0;
+   assign c[47]=G_46_0;
+   assign c[48]=G_47_0;
+   assign c[49]=G_48_0;
+
+   assign c[50]=G_49_0;
+   assign c[51]=G_50_0;
+   assign c[52]=G_51_0;
+   assign c[53]=G_52_0;
+   assign c[54]=G_53_0;
+   assign c[55]=G_54_0;
+   assign c[56]=G_55_0;
+   assign c[57]=G_56_0;
+
+   assign c[58]=G_57_0;
+   assign c[59]=G_58_0;
+   assign c[60]=G_59_0;
+   assign c[61]=G_60_0;
+   assign c[62]=G_61_0;
+   assign c[63]=G_62_0;
+   assign c[64]=G_63_0;
+   assign c[65]=G_64_0;
+
+   assign c[66]=G_65_0;
+   assign c[67]=G_66_0;
+   assign c[68]=G_67_0;
+   assign c[69]=G_68_0;
+   assign c[70]=G_69_0;
+   assign c[71]=G_70_0;
+   assign c[72]=G_71_0;
+   assign c[73]=G_72_0;
+
+   assign c[74]=G_73_0;
+   assign c[75]=G_74_0;
+   assign c[76]=G_75_0;
+   assign c[77]=G_76_0;
+   assign c[78]=G_77_0;
+   assign c[79]=G_78_0;
+   assign c[80]=G_79_0;
+   assign c[81]=G_80_0;
+
+   assign c[82]=G_81_0;
+   assign c[83]=G_82_0;
+   assign c[84]=G_83_0;
+   assign c[85]=G_84_0;
+   assign c[86]=G_85_0;
+   assign c[87]=G_86_0;
+   assign c[88]=G_87_0;
+   assign c[89]=G_88_0;
+
+   assign c[90]=G_89_0;
+   assign c[91]=G_90_0;
+   assign c[92]=G_91_0;
+   assign c[93]=G_92_0;
+   assign c[94]=G_93_0;
+   assign c[95]=G_94_0;
+   assign c[96]=G_95_0;
+   assign c[97]=G_96_0;
+
+   assign c[98]=G_97_0;
+   assign c[99]=G_98_0;
+   assign c[100]=G_99_0;
+   assign c[101]=G_100_0;
+   assign c[102]=G_101_0;
+   assign c[103]=G_102_0;
+   assign c[104]=G_103_0;
+   assign c[105]=G_104_0;
+
+   assign c[106]=G_105_0;
+   assign c[107]=G_106_0;
+   assign c[108]=G_107_0;
+   assign c[109]=G_108_0;
+   assign c[110]=G_109_0;
+   assign c[111]=G_110_0;
+   assign c[112]=G_111_0;
+   assign c[113]=G_112_0;
+
+   assign c[114]=G_113_0;
+   assign c[115]=G_114_0;
+   assign c[116]=G_115_0;
+   assign c[117]=G_116_0;
+   assign c[118]=G_117_0;
+   assign c[119]=G_118_0;
+   assign c[120]=G_119_0;
+   assign c[121]=G_120_0;
+
+   assign c[122]=G_121_0;
+   assign c[123]=G_122_0;
+   assign c[124]=G_123_0;
+   assign c[125]=G_124_0;
+   assign c[126]=G_125_0;
+   assign c[127]=G_126_0;
+   assign c[128]=G_127_0;
+
+endmodule // brent_kung_cs
+
+
--- a/wally-pipelined/src/fpu/bk13.sv
+++ b/wally-pipelined/src/fpu/bk13.sv
@ -0,0 +1,97 @@
+// Brent-Kung Carry-save Prefix Adder
+
+// 13-bit prefix adder: sum/cout are formed from per-bit propagate/generate
+// signals and the carries returned by the brent_kung_cs13 tree.
+// Slot 0 of p/g holds the carry-in, so slot i+1 belongs to operand bit i.
+module bk13 (cout, sum, a, b, cin);
+	 input [12:0] a, b;   // operands
+	 input cin;           // carry-in
+	 output [12:0] sum;   // a ^ b ^ carry, per bit
+	 output cout;         // carry out of operand bit 12
+
+	 wire [13:0] p,g;
+	 wire [12:0] c;       // connected to the tree's c[13:1]
+
+// pre-computation
+	 assign p={a^b,1'b0};   // slot 0 never propagates
+	 assign g={a&b, cin};   // slot 0 "generates" the carry-in
+	 // The former t[13:1] wires were never read (and t[0] was undriven),
+	 // so that dead pre-computation has been dropped.
+
+// prefix tree
+	 brent_kung_cs13 prefix_tree(c, p[12:0], g[12:0]);
+
+// post-computation
+	 assign sum=p[13:1]^c;
+	 // c[12] is the carry into the top operand bit; combine it with that
+	 // bit's own generate/propagate to get the carry out.
+	 assign cout=g[13]|(p[13]&c[12]);
+
+endmodule
+
+// Brent-Kung carry tree used by bk13.
+//   p, g : propagate/generate slots 12..0 (slot 0 is the carry-in slot)
+//   c    : c[k+1] = group generate over slots k..0, c[1] = g[0]
+// Cells: grey(gout, gin, pin) and black(gout, pout, gin, pin) from
+// black_gray_cells.sv.
+module brent_kung_cs13 (c, p, g);
+	
+	// Only slots 12..0 contribute to c[13:1]; the ports are sized to match
+	// the 13-bit slices the adder passes in.
+	input [12:0] p;
+	input [12:0] g;
+	output [13:1] c;
+
+
+	// parallel-prefix, Brent-Kung
+
+	// Stage 1: combine adjacent slots into 2-slot groups
+	grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
+	black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
+	black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
+	black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
+	black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
+	black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
+	// (no b_13_12 cell: its outputs were never consumed and it was the
+	//  only reader of slot 13)
+
+	// Stage 2: combine 2-slot groups into 4-slot groups
+	grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
+	black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
+	black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
+
+	// Stage 3: combine 4-slot groups into the 8-slot group 7..0
+	grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
+
+	// Stage 4: no cells at this width
+
+	// Stage 5: extend the prefix by a 4-slot group
+	grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
+
+	// Stage 6: extend prefixes by a 2-slot group
+	grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
+	grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
+
+	// Last grey cell stage: fill in the even slots from the odd prefixes
+	grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
+	grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
+	grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
+	grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
+	grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
+	grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
+
+	// Final Stage: Apply c_k+1=G_k_0
+	assign c[1]=g[0];
+	assign c[2]=G_1_0;
+	assign c[3]=G_2_0;
+	assign c[4]=G_3_0;
+	assign c[5]=G_4_0;
+	assign c[6]=G_5_0;
+	assign c[7]=G_6_0;
+	assign c[8]=G_7_0;
+	assign c[9]=G_8_0;
+
+	assign c[10]=G_9_0;
+	assign c[11]=G_10_0;
+	assign c[12]=G_11_0;
+	assign c[13]=G_12_0;
+
+endmodule
--- a/wally-pipelined/src/fpu/bk14.sv
+++ b/wally-pipelined/src/fpu/bk14.sv
@ -0,0 +1,86 @@
+// Brent-Kung Prefix Adder
+
+// 14-bit prefix adder: carries come from the brent_kung14 tree, sum and
+// carry-out are formed here. Slot 0 of p/g holds the carry-in, so slot
+// i+1 belongs to operand bit i.
+module bk14 (cout, sum, a, b, cin);
+	 input  [13:0] a, b;
+	 input         cin;
+	 output [13:0] sum;
+	 output        cout;
+
+	 wire [14:0] p, g;
+	 wire [13:0] c;   // connected to the tree's c[14:1]
+
+	 // slot 0: the carry-in is treated as a generate that never propagates
+	 assign p[0] = 1'b0;
+	 assign g[0] = cin;
+
+	 // slots 14..1: per-bit propagate (XOR) and generate (AND)
+	 assign p[14:1] = a ^ b;
+	 assign g[14:1] = a & b;
+
+	 // carry tree over slots 13..0
+	 brent_kung14 prefix_tree (.c(c), .p(p[13:0]), .g(g[13:0]));
+
+	 // sum bit = propagate XOR incoming carry
+	 assign sum  = c ^ p[14:1];
+	 // carry out = top bit generates, or propagates the carry reaching it
+	 assign cout = (c[13] & p[14]) | g[14];
+
+endmodule
+
+// Brent-Kung carry tree used by bk14.
+//   p, g : propagate/generate slots 13..0 (slot 0 is the carry-in slot)
+//   c    : c[k+1] = group generate over slots k..0, c[1] = g[0]
+// Cells: grey(gout, gin, pin) and black(gout, pout, gin, pin) from
+// black_gray_cells.sv.
+module brent_kung14 (c, p, g);
+
+	input  [13:0] p;
+	input  [13:0] g;
+	output [14:1] c;
+
+	// group generate / propagate nets, named <G|P>_<hi>_<lo>
+	wire G_3_2, G_5_4, G_7_6, G_9_8, G_11_10, G_13_12;
+	wire P_3_2, P_5_4, P_7_6, P_9_8, P_11_10, P_13_12;
+	wire G_7_4, G_11_8;
+	wire P_7_4, P_11_8;
+	wire G_1_0, G_2_0, G_3_0, G_4_0, G_5_0, G_6_0, G_7_0;
+	wire G_8_0, G_9_0, G_10_0, G_11_0, G_12_0, G_13_0;
+
+	// Up-sweep, level 1: pair adjacent slots
+	grey  b_1_0   (.gout(G_1_0),   .gin({g[1],g[0]}),   .pin(p[1]));
+	black b_3_2   (.gout(G_3_2),   .pout(P_3_2),   .gin({g[3],g[2]}),   .pin({p[3],p[2]}));
+	black b_5_4   (.gout(G_5_4),   .pout(P_5_4),   .gin({g[5],g[4]}),   .pin({p[5],p[4]}));
+	black b_7_6   (.gout(G_7_6),   .pout(P_7_6),   .gin({g[7],g[6]}),   .pin({p[7],p[6]}));
+	black b_9_8   (.gout(G_9_8),   .pout(P_9_8),   .gin({g[9],g[8]}),   .pin({p[9],p[8]}));
+	black b_11_10 (.gout(G_11_10), .pout(P_11_10), .gin({g[11],g[10]}), .pin({p[11],p[10]}));
+	black b_13_12 (.gout(G_13_12), .pout(P_13_12), .gin({g[13],g[12]}), .pin({p[13],p[12]}));
+
+	// Up-sweep, level 2: pair the 2-slot groups
+	grey  g_3_0  (.gout(G_3_0),  .gin({G_3_2,G_1_0}),    .pin(P_3_2));
+	black b_7_4  (.gout(G_7_4),  .pout(P_7_4),  .gin({G_7_6,G_5_4}),    .pin({P_7_6,P_5_4}));
+	black b_11_8 (.gout(G_11_8), .pout(P_11_8), .gin({G_11_10,G_9_8}),  .pin({P_11_10,P_9_8}));
+
+	// Up-sweep, level 3: prefix over slots 7..0
+	grey  g_7_0  (.gout(G_7_0),  .gin({G_7_4,G_3_0}),    .pin(P_7_4));
+
+	// Down-sweep: extend known prefixes by a 4-slot group ...
+	grey  g_11_0 (.gout(G_11_0), .gin({G_11_8,G_7_0}),   .pin(P_11_8));
+
+	// ... then by a 2-slot group ...
+	grey  g_5_0  (.gout(G_5_0),  .gin({G_5_4,G_3_0}),    .pin(P_5_4));
+	grey  g_9_0  (.gout(G_9_0),  .gin({G_9_8,G_7_0}),    .pin(P_9_8));
+	grey  g_13_0 (.gout(G_13_0), .gin({G_13_12,G_11_0}), .pin(P_13_12));
+
+	// ... and finally by a single slot to reach the even positions
+	grey  g_2_0  (.gout(G_2_0),  .gin({g[2],G_1_0}),     .pin(p[2]));
+	grey  g_4_0  (.gout(G_4_0),  .gin({g[4],G_3_0}),     .pin(p[4]));
+	grey  g_6_0  (.gout(G_6_0),  .gin({g[6],G_5_0}),     .pin(p[6]));
+	grey  g_8_0  (.gout(G_8_0),  .gin({g[8],G_7_0}),     .pin(p[8]));
+	grey  g_10_0 (.gout(G_10_0), .gin({g[10],G_9_0}),    .pin(p[10]));
+	grey  g_12_0 (.gout(G_12_0), .gin({g[12],G_11_0}),   .pin(p[12]));
+
+	// Carry vector: c[k+1] = G_k_0 for k = 13..1, and c[1] = g[0]
+	assign c = {G_13_0, G_12_0, G_11_0, G_10_0, G_9_0, G_8_0, G_7_0,
+	            G_6_0,  G_5_0,  G_4_0,  G_3_0,  G_2_0, G_1_0, g[0]};
+
+endmodule
--- a/wally-pipelined/src/fpu/bk15.sv
+++ b/wally-pipelined/src/fpu/bk15.sv
@ -0,0 +1,112 @@
+// Kogge-Stone Prefix Adder
+// 15-bit adder built around the kogge_stone tree in this file.
+// Slot 0 of p/g holds the carry-in, so slot i+1 belongs to operand bit i.
+// Here p is the OR of the operands and g the AND; the tree returns both
+// an h vector and a carry vector c, and the sum is formed from all four.
+module bk15 (cout, sum, a, b, cin);
+
+   input  [14:0] a, b;
+   input         cin;
+
+   output [14:0] sum;
+   output        cout;
+
+   wire [15:0]   p, g;
+   wire [15:1]   h, c;
+   wire [15:1]   h_tree;   // raw tree output; the tree assigns only bits 14..1
+
+   // slot 0: propagate forced high, generate carries cin
+   assign p[0] = 1'b1;
+   assign g[0] = cin;
+
+   // slots 15..1: per-bit OR / AND of the operands
+   assign p[15:1] = a | b;
+   assign g[15:1] = a & b;
+
+   // prefix tree over slots 14..0
+   kogge_stone prefix_tree (.h(h_tree), .c(c), .p(p[14:0]), .g(g[14:0]));
+
+   // the top h bit is formed here from the top generate and top carry;
+   // the remaining bits come straight from the tree
+   assign h = {g[15] | c[15], h_tree[14:1]};
+
+   // & binds tighter than ^, which binds tighter than |; spelled out:
+   assign sum  = (p[15:1] ^ h) | (g[15:1] & c);
+   assign cout = h[15] & p[15];
+
+endmodule // bk15
+
+// Kogge-Stone style prefix tree used by bk15.
+//   p, g : per-slot inputs 14..0 (slot 0 is the carry-in slot in bk15)
+//   h    : h[k] = H_k_0 for k = 1..14; h[15] is NOT assigned in this module
+//   c    : c[1] = g[0], c[k+1] = p[k] & H_k_0 for k = 1..14
+// Net naming: H_<hi>_<lo> / I_<hi>_<lo> are the two outputs of the cell that
+// covers slots hi..lo. None of these nets are declared; they rely on
+// implicit net declaration.
+// Cells (black_gray_cells.sv): rgry/rblk OR their two gin bits into hout
+// (rblk also ANDs its two pin bits into iout); grey/black compute
+// gout = gin[1] | (pin & gin[0]) (black also ANDs its pin bits into pout).
+module kogge_stone (h, c, p, g);
+   
+   input [14:0] p;
+   input [14:0] g;
+   
+   output [15:1] h;
+   output [15:1] c;
+
+   // parallel-prefix, Kogge-Stone
+
+   // Stage 1: combine each slot with its lower neighbour using the reduced
+   // cells. Note the p inputs are taken one slot lower than the g inputs.
+   rgry g_1_0 (H_1_0, {g[1],g[0]});
+   rblk b_2_1 (H_2_1, I_2_1, {g[2],g[1]}, {p[1],p[0]});
+   rblk b_3_2 (H_3_2, I_3_2, {g[3],g[2]}, {p[2],p[1]});
+   rblk b_4_3 (H_4_3, I_4_3, {g[4],g[3]}, {p[3],p[2]});
+   rblk b_5_4 (H_5_4, I_5_4, {g[5],g[4]}, {p[4],p[3]});
+   rblk b_6_5 (H_6_5, I_6_5, {g[6],g[5]}, {p[5],p[4]});
+   rblk b_7_6 (H_7_6, I_7_6, {g[7],g[6]}, {p[6],p[5]});
+   rblk b_8_7 (H_8_7, I_8_7, {g[8],g[7]}, {p[7],p[6]});
+
+   rblk b_9_8 (H_9_8, I_9_8, {g[9],g[8]}, {p[8],p[7]});
+   rblk b_10_9 (H_10_9, I_10_9, {g[10],g[9]}, {p[9],p[8]});
+   rblk b_11_10 (H_11_10, I_11_10, {g[11],g[10]}, {p[10],p[9]});
+   rblk b_12_11 (H_12_11, I_12_11, {g[12],g[11]}, {p[11],p[10]});
+   rblk b_13_12 (H_13_12, I_13_12, {g[13],g[12]}, {p[12],p[11]});
+   rblk b_14_13 (H_14_13, I_14_13, {g[14],g[13]}, {p[13],p[12]});
+
+   // Stage 2: combine each stage-1 group with the group two slots below.
+   // Groups that already reach slot 0 use grey cells (no I output needed).
+   grey g_2_0 (H_2_0, {H_2_1,g[0]}, I_2_1);
+   grey g_3_0 (H_3_0, {H_3_2,H_1_0}, I_3_2);
+   black b_4_1 (H_4_1, I_4_1, {H_4_3,H_2_1}, {I_4_3,I_2_1});
+   black b_5_2 (H_5_2, I_5_2, {H_5_4,H_3_2}, {I_5_4,I_3_2});
+   black b_6_3 (H_6_3, I_6_3, {H_6_5,H_4_3}, {I_6_5,I_4_3});
+   black b_7_4 (H_7_4, I_7_4, {H_7_6,H_5_4}, {I_7_6,I_5_4});
+   black b_8_5 (H_8_5, I_8_5, {H_8_7,H_6_5}, {I_8_7,I_6_5});
+   black b_9_6 (H_9_6, I_9_6, {H_9_8,H_7_6}, {I_9_8,I_7_6});
+
+   black b_10_7 (H_10_7, I_10_7, {H_10_9,H_8_7}, {I_10_9,I_8_7});
+   black b_11_8 (H_11_8, I_11_8, {H_11_10,H_9_8}, {I_11_10,I_9_8});
+   black b_12_9 (H_12_9, I_12_9, {H_12_11,H_10_9}, {I_12_11,I_10_9});
+   black b_13_10 (H_13_10, I_13_10, {H_13_12,H_11_10}, {I_13_12,I_11_10});
+   black b_14_11 (H_14_11, I_14_11, {H_14_13,H_12_11}, {I_14_13,I_12_11});
+
+   // Stage 3: combine each stage-2 group with the group four slots below
+   grey g_4_0 (H_4_0, {H_4_1,g[0]}, I_4_1);
+   grey g_5_0 (H_5_0, {H_5_2,H_1_0}, I_5_2);
+   grey g_6_0 (H_6_0, {H_6_3,H_2_0}, I_6_3);
+   grey g_7_0 (H_7_0, {H_7_4,H_3_0}, I_7_4);
+   black b_8_1 (H_8_1, I_8_1, {H_8_5,H_4_1}, {I_8_5,I_4_1});
+   black b_9_2 (H_9_2, I_9_2, {H_9_6,H_5_2}, {I_9_6,I_5_2});
+   black b_10_3 (H_10_3, I_10_3, {H_10_7,H_6_3}, {I_10_7,I_6_3});
+   black b_11_4 (H_11_4, I_11_4, {H_11_8,H_7_4}, {I_11_8,I_7_4});
+
+   black b_12_5 (H_12_5, I_12_5, {H_12_9,H_8_5}, {I_12_9,I_8_5});
+   black b_13_6 (H_13_6, I_13_6, {H_13_10,H_9_6}, {I_13_10,I_9_6});
+   black b_14_7 (H_14_7, I_14_7, {H_14_11,H_10_7}, {I_14_11,I_10_7});
+
+   // Stage 4: combine each stage-3 group with the prefix eight slots below;
+   // every remaining group now reaches slot 0, so only grey cells are used
+   grey g_8_0 (H_8_0, {H_8_1,g[0]}, I_8_1);
+   grey g_9_0 (H_9_0, {H_9_2,H_1_0}, I_9_2);
+   grey g_10_0 (H_10_0, {H_10_3,H_2_0}, I_10_3);
+   grey g_11_0 (H_11_0, {H_11_4,H_3_0}, I_11_4);
+   grey g_12_0 (H_12_0, {H_12_5,H_4_0}, I_12_5);
+   grey g_13_0 (H_13_0, {H_13_6,H_5_0}, I_13_6);
+   grey g_14_0 (H_14_0, {H_14_7,H_6_0}, I_14_7);
+
+   // Final Stage: Apply c_k+1=p_k&H_k_0
+   // c[1] has no prefix below it, so it is just the slot-0 generate.
+   assign c[1]=g[0];
+
+   // Each line exports one prefix as h[k] and derives the next carry from it.
+   assign h[1]=H_1_0;		assign c[2]=p[1]&H_1_0;
+   assign h[2]=H_2_0;		assign c[3]=p[2]&H_2_0;
+   assign h[3]=H_3_0;		assign c[4]=p[3]&H_3_0;
+   assign h[4]=H_4_0;		assign c[5]=p[4]&H_4_0;
+   assign h[5]=H_5_0;		assign c[6]=p[5]&H_5_0;
+   assign h[6]=H_6_0;		assign c[7]=p[6]&H_6_0;
+   assign h[7]=H_7_0;		assign c[8]=p[7]&H_7_0;
+   assign h[8]=H_8_0;		assign c[9]=p[8]&H_8_0;
+
+   assign h[9]=H_9_0;		assign c[10]=p[9]&H_9_0;
+   assign h[10]=H_10_0;		assign c[11]=p[10]&H_10_0;
+   assign h[11]=H_11_0;		assign c[12]=p[11]&H_11_0;
+   assign h[12]=H_12_0;		assign c[13]=p[12]&H_12_0;
+   assign h[13]=H_13_0;		assign c[14]=p[13]&H_13_0;
+   assign h[14]=H_14_0;		assign c[15]=p[14]&H_14_0;
+   // NOTE(review): h[15] is left undriven here; bk15 assigns it from the
+   // instantiating side.
+
+endmodule // kogge_stone
--- a/wally-pipelined/src/fpu/black_gray_cells.sv
+++ b/wally-pipelined/src/fpu/black_gray_cells.sv
@ -0,0 +1,43 @@
+
+// Black cell: merges two adjacent generate/propagate groups.
+//   gin[1], pin[1] : upper group     gin[0], pin[0] : lower group
+//   gout : upper group generates, or propagates the lower group's generate
+//   pout : both groups propagate
+module black(output wire       gout,
+             output wire       pout,
+             input  wire [1:0] gin,
+             input  wire [1:0] pin);
+
+   assign gout = (gin[0] & pin[1]) | gin[1];
+   assign pout = &pin;
+
+endmodule // black
+
+// Grey cell: like the black cell but produces only the group generate.
+//   gin[1] : upper group generate    gin[0] : lower group generate
+//   pin    : upper group propagate
+module grey(output wire       gout,
+            input  wire [1:0] gin,
+            input  wire       pin);
+
+   assign gout = (gin[0] & pin) | gin[1];
+
+endmodule // grey
+
+// Reduced black cell: the generate path is a plain OR of the two gin bits
+// (no propagate gating); the second output is the AND of the two pin bits.
+module rblk(output wire       hout,
+            output wire       iout,
+            input  wire [1:0] gin,
+            input  wire [1:0] pin);
+
+   assign hout = |gin;
+   assign iout = &pin;
+
+endmodule // rblk
+
+// Reduced grey cell: output is the OR of the two gin bits.
+module rgry(output wire       hout,
+            input  wire [1:0] gin);
+
+   assign hout = |gin;
+
+endmodule // rgry
--- a/wally-pipelined/src/fpu/cla12.sv
+++ b/wally-pipelined/src/fpu/cla12.sv
@ -0,0 +1,331 @@
+// This module implements a 12-bit carry lookahead adder. It is used
+// for rounding in the floating point adder. 
+
+module cla12 (S, CO, X, Y);
+
+   // 12-bit add built on the 64-bit DBLCADDER_64_64 block (defined in
+   // another file). X and Y occupy positions 0..11 of the 64-bit operands,
+   // every other position is tied low, and the carry-in is tied low.
+   // S is taken from result positions 0..11 and CO from position 12.
+   // NOTE(review): presumes DBLCADDER_64_64's ports are
+   // (opA, opB, carry-in, sum, carry-out) with index 0 as the LSB - confirm.
+
+   input  [11:0] X;
+   input  [11:0] Y;
+
+   output [11:0] S;
+   output        CO;
+
+   wire [0:63]   A, B, Q;       // note the ascending [0:63] range
+   wire          LOGIC0 = 1'b0;
+   wire          CIN    = 1'b0;
+   wire          CO_64;         // 64-bit carry out, unused
+
+   // Per-bit wiring keeps index k aligned between the [11:0] ports and
+   // the [0:63] internal buses.
+   genvar        k;
+   generate
+      for (k = 0; k < 64; k = k + 1) begin : operand_bits
+         if (k < 12) begin : from_port
+            assign A[k] = X[k];
+            assign B[k] = Y[k];
+         end
+         else begin : zero_fill
+            assign A[k] = LOGIC0;
+            assign B[k] = LOGIC0;
+         end
+      end
+   endgenerate
+
+   DBLCADDER_64_64 U1 (A , B , CIN, Q , CO_64);
+
+   generate
+      for (k = 0; k < 12; k = k + 1) begin : sum_bits
+         assign S[k] = Q[k];
+      end
+   endgenerate
+
+   // With positions 12..63 of both operands at zero, result position 12
+   // is where a carry out of position 11 would land.
+   assign CO = Q[12];
+
+endmodule // cla12
+
+// This module implements a 12-bit carry lookahead subtractor. It is used
+// for rounding in the floating point adder. 
+
+module cla_sub12 (S, X, Y);
+
+   // 12-bit subtract built on the 64-bit DBLCADDER_64_64 block (defined in
+   // another file): the adder is fed A, the bitwise complement of B, and a
+   // carry-in of 1. X and Y occupy positions 0..11 of A and B; the other
+   // positions of A and B are tied low (so they are high in Bbar).
+   // S is taken from result positions 0..11.
+   // NOTE(review): presumes DBLCADDER_64_64's ports are
+   // (opA, opB, carry-in, sum, carry-out) with index 0 as the LSB - confirm.
+
+   input  [11:0] X;
+   input  [11:0] Y;
+
+   output [11:0] S;
+
+   wire [0:63]   A, B, Q, Bbar;   // note the ascending [0:63] range
+   wire          CO;              // 64-bit carry out, unused
+   wire          CO_12;           // result position 12, unused; declared
+                                  // explicitly instead of relying on an
+                                  // implicit net
+   wire          LOGIC0;
+   wire          VDD;
+
+   assign Bbar   = ~B;
+   assign LOGIC0 = 0;
+   assign VDD    = 1;
+
+   DBLCADDER_64_64 U1 (A , Bbar , VDD, Q , CO);
+
+   // Per-bit wiring keeps index k aligned between the [11:0] ports and
+   // the [0:63] internal buses.
+   genvar        k;
+   generate
+      for (k = 0; k < 64; k = k + 1) begin : operand_bits
+         if (k < 12) begin : from_port
+            assign A[k] = X[k];
+            assign B[k] = Y[k];
+         end
+         else begin : zero_fill
+            assign A[k] = LOGIC0;
+            assign B[k] = LOGIC0;
+         end
+      end
+   endgenerate
+
+   generate
+      for (k = 0; k < 12; k = k + 1) begin : diff_bits
+         assign S[k] = Q[k];
+      end
+   endgenerate
+
+   assign CO_12 = Q[12];
+
+endmodule // cla_sub12
--- a/wally-pipelined/src/fpu/cla52.sv
+++ b/wally-pipelined/src/fpu/cla52.sv
@ -0,0 +1,408 @@
+// This module implements a 52-bit carry lookahead adder. It is used
+// for rounding in the floating point adder. 
+
+module cla52 (S, CO, X, Y);
+
+   // 52-bit add built on the 64-bit DBLCADDER_64_64 block (defined in
+   // another file). X and Y occupy positions 0..51 of the 64-bit operands,
+   // positions 52..63 are tied low, and the carry-in is tied low.
+   // S is taken from result positions 0..51 and CO from position 52.
+   // NOTE(review): presumes DBLCADDER_64_64's ports are
+   // (opA, opB, carry-in, sum, carry-out) with index 0 as the LSB - confirm.
+
+   input  [51:0] X;
+   input  [51:0] Y;
+
+   output [51:0] S;
+   output        CO;
+
+   wire [0:63]   A, B, Q;       // note the ascending [0:63] range
+   wire          LOGIC0 = 1'b0;
+   wire          CIN    = 1'b0;
+   wire          CO_64;         // 64-bit carry out, unused
+
+   // Per-bit wiring keeps index k aligned between the [51:0] ports and
+   // the [0:63] internal buses.
+   genvar        k;
+   generate
+      for (k = 0; k < 64; k = k + 1) begin : operand_bits
+         if (k < 52) begin : from_port
+            assign A[k] = X[k];
+            assign B[k] = Y[k];
+         end
+         else begin : zero_fill
+            assign A[k] = LOGIC0;
+            assign B[k] = LOGIC0;
+         end
+      end
+   endgenerate
+
+   DBLCADDER_64_64 U1 (A , B , CIN, Q , CO_64);
+
+   generate
+      for (k = 0; k < 52; k = k + 1) begin : sum_bits
+         assign S[k] = Q[k];
+      end
+   endgenerate
+
+   // With positions 52..63 of both operands at zero, result position 52
+   // is where a carry out of position 51 would land.
+   assign CO = Q[52];
+
+endmodule //cla52
+
+// This module implements a 52-bit carry lookahead subtractor. It is used
+// for rounding in the floating point adder.
+//
+// X and Y occupy positions 0..51 of a 64-bit DBLCADDER_64_64 instance; the
+// upper 12 positions of both operands are zero. Y is inverted bitwise
+// (all 64 positions) before it reaches the adder, and S returns adder
+// result bits 0..51.
+//
+// NOTE(review): the adder's third port is tied to 0 here. If that port is
+// the carry-in of a plain adder, the result is X + ~Y (i.e. X - Y - 1)
+// rather than X - Y; cla_sub64 ties the same port to 1. Behaviour is left
+// unchanged -- confirm the intent against the rounding logic that uses it.
+
+module cla_sub52 (S, X, Y);
+
+   input [51:0] X;
+   input [51:0] Y;
+
+   output [51:0] S;
+
+   // The adder buses are declared [0:63]. Connections below are made
+   // index-for-index (A[i] <- X[i]); a whole-vector "assign A = X" would
+   // instead map X[0] onto A[63], so keep the per-index form.
+   wire [0:63] 	 A,B,Q,Bbar;
+   wire 	 CIN;
+   wire 	 CO_64;   // adder carry out (unused); declared explicitly so it
+                          // is no longer an implicit net
+
+   assign CIN  = 1'b0;
+   assign Bbar = ~B;
+
+   // Operand positions above bit 51 are tied low
+   assign A[52:63] = 12'h000;
+   assign B[52:63] = 12'h000;
+
+   DBLCADDER_64_64 U1 (A , Bbar , CIN, Q , CO_64);
+
+   genvar 	 i;
+   generate
+      for (i = 0; i < 52; i = i + 1) begin : bits
+	 assign A[i] = X[i];
+	 assign B[i] = Y[i];
+	 assign S[i] = Q[i];
+      end
+   endgenerate
+
+endmodule //cla_sub52
--- a/wally-pipelined/src/fpu/cla64.sv
+++ b/wally-pipelined/src/fpu/cla64.sv
@ -0,0 +1,420 @@
+// This module implements a 64-bit carry lookahead adder/subtractor.
+// It is used to perform the primary addition in the floating point
+// adder. Sub is XORed onto every bit of Y and is also passed to the
+// adder's third port (presumably its carry-in -- confirm against
+// DBLCADDER_64_64), so Sub selects between addition and subtraction.
+
+module cla64 (S, X, Y, Sub);
+
+   input [63:0]  X;
+   input [63:0]  Y;
+   input 	 Sub;
+   output [63:0] S;
+
+   wire 	 CO;                 // adder carry out, not used
+   wire [0:63] 	 A,B,Q, Bbar;        // adder-side buses, indexed [0:63]
+
+   // Conditionally invert the second operand
+   assign Bbar = B ^ {64{Sub}};
+
+   DBLCADDER_64_64 U1 (A , Bbar , Sub , Q , CO );
+
+   // Connect operands and result index-for-index (bit i <-> bit i).
+   // The [0:63] buses run opposite to the [63:0] ports, so a plain
+   // vector assignment would reverse the bit order.
+   genvar 	 k;
+   generate
+      for (k = 0; k < 64; k = k + 1) begin : bits
+	 assign A[k] = X[k];
+	 assign B[k] = Y[k];
+	 assign S[k] = Q[k];
+      end
+   endgenerate
+
+endmodule // cla64
+
+// This module performs 64-bit subtraction. It is used to get the two's complement
+// of main addition or subtraction in the floating point adder.
+// Y is inverted bitwise and the adder's third port is tied high
+// (presumably the carry-in, giving X + ~Y + 1 -- confirm against
+// DBLCADDER_64_64).
+
+module cla_sub64 (S, X, Y);
+
+   input [63:0]  X;
+   input [63:0]  Y;
+
+   output [63:0] S;
+
+   wire 	 CO;                 // adder carry out, not used
+   wire 	 VDD = 1'b1;         // constant one for the adder's third port
+   wire [0:63] 	 A,B,Q, Bbar;        // adder-side buses, indexed [0:63]
+
+   assign Bbar = ~B;
+
+   DBLCADDER_64_64 U1 (A , Bbar , VDD, Q , CO );
+
+   // Connect operands and result index-for-index (bit i <-> bit i).
+   // The [0:63] buses run opposite to the [63:0] ports, so a plain
+   // vector assignment would reverse the bit order.
+   genvar 	 k;
+   generate
+      for (k = 0; k < 64; k = k + 1) begin : bits
+	 assign A[k] = X[k];
+	 assign B[k] = Y[k];
+	 assign S[k] = Q[k];
+      end
+   endgenerate
+
+endmodule // cla_sub64
--- a/wally-pipelined/src/fpu/convert_inputs.sv
+++ b/wally-pipelined/src/fpu/convert_inputs.sv
@ -0,0 +1,61 @@
+// Operand conditioning for the floating point adder.
+// Inputs are two operands (op1, op2), the operation type (op_type) and
+// the result precision (P). Depending on op_type and P the operands are
+// widened from single to double precision format, and the sign of op1 is
+// adjusted for negate / absolute value. Results are Float1 and Float2.
+
+module convert_inputs(Float1, Float2, op1, op2, op_type, P);
+
+   input [63:0]  op1;            // 1st input operand (A)
+   input [63:0]  op2;            // 2nd input operand (B)
+   input [3:0] 	 op_type;        // Function opcode (bit 3 is not used here)
+   input 	 P;              // Result Precision (0 for double, 1 for single)
+
+   output [63:0] Float1;	// Converted 1st input operand
+   output [63:0] Float2;	// Converted 2nd input operand
+
+   wire 	 conv_SP;        // Convert from SP to DP
+   wire 	 negate;         // Operation is negation
+   wire 	 abs_val;        // Operation is absolute value
+   wire 	 Zexp1, Zexp2;   // Bits 62:55 of op1 / op2 are all zero
+   wire 	 Oexp1, Oexp2;   // Bits 62:55 of op1 / op2 are all ones
+   wire 	 fill1, fill2;   // Value replicated into the 3 inserted exponent bits
+
+   // Convert when exactly one of these holds: op_type[2:1] is 11, or P is 1.
+   assign conv_SP = (op_type[2]&op_type[1]) ^ P;
+
+   // Exponent field tests on the single precision layout (bits 62:55).
+   assign Zexp1 = ~|op1[62:55];
+   assign Zexp2 = ~|op2[62:55];
+   assign Oexp1 =  &op1[62:55];
+   assign Oexp2 =  &op2[62:55];
+
+   // The inserted exponent bits are 1 when the top exponent bit is 0 and the
+   // exponent is non-zero, or when the exponent is all ones; otherwise 0.
+   assign fill1 = (~op1[62] & ~Zexp1) | Oexp1;
+   assign fill2 = (~op2[62] & ~Zexp2) | Oexp2;
+
+   // Converted form: top exponent bit, 3 fill bits, op[61:32], then 29 zeros.
+   // Unconverted form: the operand bits pass straight through.
+   assign Float1[62:0] = conv_SP ? {op1[62], {3{fill1}}, op1[61:32], 29'h0}
+			 : op1[62:0];
+   assign Float2[62:0] = conv_SP ? {op2[62], {3{fill2}}, op2[61:32], 29'h0}
+			 : op2[62:0];
+
+   // Sign handling: op_type[2:0] = 101 flips the sign of op1,
+   // op_type[2:0] = 100 forces it to zero. op2 keeps its sign.
+   assign negate  = op_type[2] & ~op_type[1] & op_type[0];
+   assign abs_val = op_type[2] & ~op_type[1] & ~op_type[0];
+   assign Float1[63]  = (op1[63] ^ negate) & ~abs_val;
+   assign Float2[63]  = op2[63];
+
+endmodule // convert_inputs
+
--- a/wally-pipelined/src/fpu/convert_inputs_div.sv
+++ b/wally-pipelined/src/fpu/convert_inputs_div.sv
@ -0,0 +1,51 @@
+// Operand conditioning for divide / square root.
+// Inputs are two operands (op1, op2), the result precision (P) and the
+// operation select (op_type). When P is 1 both operands are widened from
+// single to double precision format; signs are passed through unchanged.
+// The converted operands are Float1 and Float2b; when op_type is 1,
+// Float2b is a copy of Float1.
+
+module convert_inputs_div (Float1, Float2b, op1, op2, op_type, P);
+
+   input [63:0]  op1;           // 1st input operand (A)
+   input [63:0]  op2;           // 2nd input operand (B)
+   input 	 P;             // Result Precision (0 for double, 1 for single)
+   input 	 op_type;       // Operation (1 selects the sqrt path below)
+
+   output [63:0] Float1;	// Converted 1st input operand
+   output [63:0] Float2b;	// Converted 2nd input operand
+
+   wire [63:0] 	 Float2;        // Converted op2 before the op_type select
+   wire 	 Zexp1, Zexp2;  // Bits 62:55 of op1 / op2 are all zero
+   wire 	 Oexp1, Oexp2;  // Bits 62:55 of op1 / op2 are all ones
+   wire 	 fill1, fill2;  // Value replicated into the 3 inserted exponent bits
+
+   // Exponent field tests on the single precision layout (bits 62:55).
+   assign Zexp1 = ~|op1[62:55];
+   assign Zexp2 = ~|op2[62:55];
+   assign Oexp1 =  &op1[62:55];
+   assign Oexp2 =  &op2[62:55];
+
+   // The inserted exponent bits are 1 when the top exponent bit is 0 and the
+   // exponent is non-zero, or when the exponent is all ones; otherwise 0.
+   assign fill1 = (~op1[62] & ~Zexp1) | Oexp1;
+   assign fill2 = (~op2[62] & ~Zexp2) | Oexp2;
+
+   // Converted form: top exponent bit, 3 fill bits, op[61:32], then 29 zeros.
+   // Unconverted form: the operand bits pass straight through.
+   assign Float1[62:0] = P ? {op1[62], {3{fill1}}, op1[61:32], 29'h0}
+			 : op1[62:0];
+   assign Float2[62:0] = P ? {op2[62], {3{fill2}}, op2[61:32], 29'h0}
+			 : op2[62:0];
+
+   // Signs are copied from the inputs
+   assign Float1[63]  = op1[63];
+   assign Float2[63]  = op2[63];
+
+   // For sqrt, assign Float2b same as Float1 for simplicity
+   assign Float2b = op_type ? Float1 : Float2;
+
+endmodule // convert_inputs_div
--- a/wally-pipelined/src/fpu/csa.sv
+++ b/wally-pipelined/src/fpu/csa.sv
@ -0,0 +1,70 @@
+// Half adder: S is the sum bit (A xor B), C is the carry bit (A and B).
+module ha (C, S, A, B) ;
+
+   output C, S;
+   input  A, B;
+
+   xor sum_gate   (S, A, B);
+   and carry_gate (C, A, B);
+
+endmodule // ha
+
+// module fa (input logic a, b, c, output logic sum, carry);
+   
+//    assign sum = a^b^c;
+//    assign carry = a&b|a&c|b&c;   
+   
+// endmodule // fa
+
+// module csa #(parameter WIDTH=8) (a, b,c, sum, carry, cout);
+
+//    input logic [WIDTH-1:0] a, b, c;
+   
+//    output logic [WIDTH-1:0] sum, carry;
+//    output logic 	    cout;   
+
+//    logic [WIDTH:0] 	    carry_temp;   
+//    genvar 		    i;
+//    generate
+//       for (i=0;i<WIDTH;i=i+1)
+// 	begin : genbit
+// 	   fa fa_inst (a[i], b[i], c[i], sum[i], carry_temp[i+1]);
+// 	end
+//    endgenerate
+//    assign carry = {1'b0, carry_temp[WIDTH-1:1], 1'b0};
+//    assign cout = carry_temp[WIDTH];   
+   
+// endmodule // csa
+
+// Array of n independent full adders: bit i of S and C is produced by one
+// `fa` instance from bit i of A, B and Ci. No carry is passed between bit
+// positions inside this module.
+// NOTE(review): `fa` is instantiated with ports S, C, A, B, Ci; the
+// commented-out fa earlier in this file uses different port names, so the
+// live definition is expected to come from another file -- confirm.
+module FA_array (S, C, A, B, Ci) ;
+   parameter n = 32;               // number of bit positions
+   input  [n-1:0] A;
+   input  [n-1:0] B;
+   input  [n-1:0] Ci;
+   output [n-1:0] S;
+   output [n-1:0] C;
+
+   genvar 	  i;
+   generate
+      for (i = 0; i < n; i = i + 1) begin : index
+	 fa FA1(.S(S[i]), .C(C[i]), .A(A[i]), .B(B[i]), .Ci(Ci[i]));
+      end
+   endgenerate
+
+endmodule // FA_array
+
+// Array of n independent half adders: bit i of S and C comes from one `ha`
+// instance fed with bit i of A and B.
+module HA_array #(parameter n = 32)
+   (output [n-1:0] S, C,
+    input  [n-1:0] A, B);
+
+   genvar 	  i;
+   for (i = 0; i < n; i = i + 1) begin : index
+      ha ha1 (.A(A[i]), .B(B[i]), .S(S[i]), .C(C[i]));
+   end
+
+endmodule // HA_array
--- a/wally-pipelined/src/fpu/divconv.sv
+++ b/wally-pipelined/src/fpu/divconv.sv
@ -0,0 +1,233 @@
+// `timescale 1ps/1ps
+// Datapath for the divide / square-root unit.
+//   * sbtm / sbtm2 supply an initial approximation (IA) from the top bits of
+//     d (divide) or d2 (sqrt); op_type = 1 selects the sqrt variants.
+//   * mx1 / mx2 choose the multiplier operands from the inputs, the IA and
+//     the feedback registers under sel_muxa / sel_muxb; sel_muxr switches the
+//     datapath to the remainder computation.
+//   * multiplier + csa + ldf128 form the product (plus constant) mul_out,
+//     which is captured in rega..regd / regr under the load_* enables.
+//   * q1/qp1/qm1 and q0/qp0/qm0 are regb_out (for q0*: regb_out shifted left
+//     by one) plus constants selected by P, captured under load_regs; when
+//     P = 1, bits 38:10 of these registers are forced to zero.
+// Sequencing (sel_*, load_*) is driven from outside this module.
+module divconv (q1, qm1, qp1, q0, qm0, qp0, 
+		rega_out, regb_out, regc_out, regd_out,
+		regr_out, d, n, 
+		sel_muxa, sel_muxb, sel_muxr, 
+		reset, clk,
+		load_rega, load_regb, load_regc, load_regd,
+		load_regr, load_regs, P, op_type, exp_odd);
+
+   input logic [52:0]   d, n;
+   input logic [2:0] 	sel_muxa, sel_muxb;
+   input logic 	        sel_muxr;   
+   input logic 	        load_rega, load_regb, load_regc, load_regd;
+   input logic 		load_regr, load_regs;
+   input logic 		P;
+   input logic 		op_type;
+   input logic 		exp_odd;   
+   input logic 	        reset;
+   input logic 	        clk;   
+   
+   output logic [63:0] 	q1, qp1, qm1;
+   output logic [63:0] 	q0, qp0, qm0;   
+   output logic [63:0] 	rega_out, regb_out, regc_out, regd_out;
+   output logic [127:0] regr_out;
+   
+   supply1 		vdd;
+   supply0 		vss;   
+
+   logic [63:0] 	muxa_out, muxb_out;
+   logic [10:0] 	ia_div, ia_sqrt;
+   logic [63:0] 	ia_out;
+   logic [127:0] 	mul_out;
+   logic [63:0] 	q_out1, qm_out1, qp_out1;
+   logic [63:0] 	q_out0, qm_out0, qp_out0;
+   logic [63:0] 	mcand, mplier, mcand_q;   
+   logic [63:0] 	twocmp_out;
+   logic [64:0] 	three;   
+   logic [127:0] 	Carry, Carry2;
+   logic [127:0] 	Sum, Sum2;
+   logic [127:0] 	constant, constant2;
+   logic [63:0] 	q_const, qp_const, qm_const;
+   logic [63:0] 	d2, n2;   
+
+   // Previously implicit 1-bit nets; declared explicitly so the module also
+   // elaborates under `default_nettype none. The cout* carry outs are unused.
+   wire 		muxr_out;
+   wire 		cout1, cout2, cout3, cout4, cout5, cout6, cout7;
+
+   // Check if exponent is odd for sqrt
+   // If exp_odd=1 and sqrt, then M/2 and use ia_addr=0 as IA
+   assign d2 = (exp_odd&op_type) ? {vss,d,10'h0} : {d,11'h0};
+   assign n2 = op_type ? d2 : {n,11'h0};
+   
+   // IA div/sqrt
+   sbtm ia1 (d[52:41], ia_div);
+   sbtm2 ia2 (d2[63:52], ia_sqrt);
+   assign ia_out = op_type ? {ia_sqrt, {53{1'b0}}} : {ia_div, {53{1'b0}}};
+   
+   // Choose IA or iteration
+   mux6 #(64) mx1 (d2, ia_out, rega_out, regc_out, regd_out, regb_out, sel_muxb, muxb_out);
+   mux5 #(64) mx2 (regc_out, n2, ia_out, regb_out, regd_out, sel_muxa, muxa_out);
+
+   // Deal with remainder if [0.5, 1) instead of [1, 2)
+   mux2 #(128) mx3a ({~n, {75{1'b1}}}, {{1'b1}, ~n, {74{1'b1}}}, q1[63], constant2);
+   // Select Mcand, Remainder/Q''  
+   mux2 #(128) mx3 (128'h0, constant2, sel_muxr, constant);
+   // Select mcand - remainder should always choose q1 [1,2) because
+   //   adjustment of N in the form XX.FFFFFFF
+   mux2 #(64) mx4 (q0, q1, q1[63], mcand_q);
+   mux2 #(64) mx5 (muxb_out, mcand_q, sel_muxr&op_type, mplier);   
+   mux2 #(64) mx6 (muxa_out, mcand_q, sel_muxr, mcand);
+   // TDM multiplier (carry/save)
+   multiplier mult1 (mcand, mplier, Sum, Carry);
+   // Q*D - N (reversed but changed in rounder.v to account for sign reversal)
+   csa #(128) csa1 (Sum, Carry, constant, Sum2, Carry2);
+   // Add ulp for subtraction in remainder
+   mux2 #(1) mx7 (1'b0, 1'b1, sel_muxr, muxr_out);
+
+   // Constant for Q''
+   mux2 #(64) mx8 ({64'h0000_0000_0000_0200}, {64'h0000_0040_0000_0000}, P, q_const);
+   mux2 #(64) mx9 ({64'h0000_0000_0000_0A00}, {64'h0000_0140_0000_0000}, P, qp_const);
+   mux2 #(64) mxA ({64'hFFFF_FFFF_FFFF_F9FF}, {64'hFFFF_FF3F_FFFF_FFFF}, P, qm_const);
+   
+   // CPA (from CSA)/Remainder addition/subtraction
+   ldf128 cpa1 (cout1, mul_out, Sum2, Carry2, muxr_out);
+   // Assuming [1,2) - q1
+   ldf64 cpa2 (cout2, q_out1, regb_out, q_const, 1'b0);
+   ldf64 cpa3 (cout3, qp_out1, regb_out, qp_const, 1'b0);
+   ldf64 cpa4 (cout4, qm_out1, regb_out, qm_const, 1'b1);   
+   // Assuming [0.5,1) - q0
+   ldf64 cpa5 (cout5, q_out0, {regb_out[62:0], vss}, q_const, 1'b0);
+   ldf64 cpa6 (cout6, qp_out0, {regb_out[62:0], vss}, qp_const, 1'b0);
+   ldf64 cpa7 (cout7, qm_out0, {regb_out[62:0], vss}, qm_const, 1'b1);
+   // One's complement instead of two's complement (for hw efficiency)
+   assign three = {~mul_out[126], mul_out[126], ~mul_out[125:63]};   
+   mux2 #(64) mxTC (~mul_out[126:63], three[64:1],  op_type, twocmp_out);
+
+   // regs
+   flopenr #(64) regc (clk, reset, load_regc, twocmp_out, regc_out);
+   flopenr #(64) regb (clk, reset, load_regb, mul_out[126:63], regb_out);
+   flopenr #(64) rega (clk, reset, load_rega, mul_out[126:63], rega_out);
+   flopenr #(64) regd (clk, reset, load_regd, mul_out[126:63], regd_out);
+   flopenr #(128) regr (clk, reset, load_regr, mul_out, regr_out);
+   // Assuming [1,2)
+   flopenr #(64) rege (clk, reset, load_regs, {q_out1[63:39], (q_out1[38:10] & {29{~P}}), 10'h0}, q1);   
+   flopenr #(64) regf (clk, reset, load_regs, {qm_out1[63:39], (qm_out1[38:10] & {29{~P}}), 10'h0}, qm1);
+   flopenr #(64) regg (clk, reset, load_regs, {qp_out1[63:39], (qp_out1[38:10] & {29{~P}}), 10'h0}, qp1);
+   // Assuming [0.5,1)
+   flopenr #(64) regh (clk, reset, load_regs, {q_out0[63:39], (q_out0[38:10] & {29{~P}}), 10'h0}, q0);
+   flopenr #(64) regj (clk, reset, load_regs, {qm_out0[63:39], (qm_out0[38:10] & {29{~P}}), 10'h0}, qm0);
+   flopenr #(64) regk (clk, reset, load_regs, {qp_out0[63:39], (qp_out0[38:10] & {29{~P}}), 10'h0}, qp0);
+   
+endmodule // divconv
+
+// module adder #(parameter WIDTH=8)
+//    (input  logic [WIDTH-1:0] a, b,
+//     output logic [WIDTH-1:0] y);
+   
+//    assign y = a + b;
+   
+// endmodule // adder
+
+// module flopenr #(parameter WIDTH = 8)
+//    (input  logic             clk, reset, en,
+//     input  logic [WIDTH-1:0] d, 
+//     output logic [WIDTH-1:0] q);
+
+//    always_ff @(posedge clk, posedge reset)
+//      if (reset)   q <= #10 0;
+//      else if (en) q <= #10 d;
+   
+// endmodule // flopenr
+
+// module flopr #(parameter WIDTH = 8)
+//    (input  logic             clk, reset,
+//     input  logic [WIDTH-1:0] d, 
+//     output logic [WIDTH-1:0] q);
+
+//    always_ff @(posedge clk, posedge reset)
+//      if (reset) q <= #10 0;
+//      else       q <= #10 d;
+   
+// endmodule // flopr
+
+// module flopenrc #(parameter WIDTH = 8)
+//    (input  logic             clk, reset, en, clear,
+//     input  logic [WIDTH-1:0] d, 
+//     output logic [WIDTH-1:0] q);
+
+//    always_ff @(posedge clk, posedge reset)
+//      if (reset)    q <= #10 0;
+//      else if (en) 
+//        if (clear) q <= #10 0;
+//        else       q <= #10 d;
+   
+// endmodule // flopenrc
+
+// module floprc #(parameter WIDTH = 8)
+//    (input  logic             clk, reset, clear,
+//     input  logic [WIDTH-1:0] d, 
+//     output logic [WIDTH-1:0] q);
+
+//    always_ff @(posedge clk, posedge reset)
+//      if (reset) q <= #10 0;
+//      else       
+//        if (clear) q <= #10 0;
+//        else       q <= #10 d;
+   
+// endmodule // floprc
+
+// module mux2 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, 
+//     input  logic             s, 
+//     output logic [WIDTH-1:0] y);
+
+//    assign y = s ? d1 : d0;
+   
+// endmodule // mux2
+
+// module mux3 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, d2,
+//     input  logic [1:0]       s, 
+//     output logic [WIDTH-1:0] y);
+
+//    assign y = s[1] ? d2 : (s[0] ? d1 : d0);
+   
+// endmodule // mux3
+
+// module mux4 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, d2, d3,
+//     input  logic [1:0]       s, 
+//     output logic [WIDTH-1:0] y);
+
+//    assign y = s[1] ? (s[0] ? d3 : d2) : (s[0] ? d1 : d0);
+
+// endmodule // mux4
+
+// module mux5 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4,
+//     input  logic [2:0]       s,
+//     output logic [WIDTH-1:0] y);
+   
+//    always_comb
+//      casez (s)
+//        3'b000 : y = d0;       
+//        3'b001 : y = d1;
+//        3'b010 : y = d2;
+//        3'b011 : y = d3;
+//        3'b1?? : y = d4;
+//      endcase // casez (s)
+
+// endmodule // mux5
+
+// module mux6 #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] d0, d1, d2, d3, d4, d5,
+//     input  logic [2:0]       s,
+//     output logic [WIDTH-1:0] y);
+   
+//    always_comb
+//      casez (s)
+//        3'b000 : y = d0;       
+//        3'b001 : y = d1;
+//        3'b010 : y = d2;
+//        3'b011 : y = d3;
+//        3'b10? : y = d4;
+//        3'b11? : y = d5;       
+//      endcase // casez (s)
+
+// endmodule // mux6
+
+// module eqcmp #(parameter WIDTH = 8)
+//    (input  logic [WIDTH-1:0] a, b,
+//     output logic             y);
+
+//    assign y = (a == b);
+   
+// endmodule // eqcmp
--- a/wally-pipelined/src/fpu/exception.sv
+++ b/wally-pipelined/src/fpu/exception.sv
@ -0,0 +1,120 @@
+// Exception logic for the floating point adder: classifies operands A and B and
+// flags special results. Note: We may want to move this to where the result is computed.
+
+module exception (Ztype, Invalid, Denorm, ANorm, BNorm, Sub, A, B, op_type);
+
+   input [63:0] A;		// 1st input operand (op1)
+   input [63:0] B;		// 2nd input operand (op2)
+   input [3:0] 	op_type;   	// Function opcode
+   output [3:0] Ztype;		// Indicates type of result (Z)
+   output 	Invalid;	// Invalid operation exception
+   output 	Denorm;		// Denormalized input
+   output       ANorm;          // A has a nonzero exponent (not zero or Denorm)
+   output       BNorm;          // B has a nonzero exponent (not zero or Denorm)
+   output       Sub;		// The effective operation is subtraction
+   wire		AzeroM;	 	// '1' if the mantissa of A is zero
+   wire		BzeroM;		// '1' if the mantissa of B is zero
+   wire		AzeroE;	 	// '1' if the exponent of A is zero
+   wire		BzeroE;		// '1' if the exponent of B is zero
+   wire		AonesE;	 	// '1' if the exponent of A is all ones
+   wire		BonesE;		// '1' if the exponent of B is all ones
+   wire		ADenorm; 	// '1' if A is a denormalized number
+   wire		BDenorm; 	// '1' if B is a denormalized number
+   wire		AInf;	 	// '1' if A is infinite
+   wire		BInf;	 	// '1' if B is infinite
+   wire		AZero;	 	// '1' if A is 0
+   wire		BZero;	 	// '1' if B is 0
+   wire		ANaN;	 	// '1' if A is a not-a-number
+   wire		BNaN; 		// '1' if B is a not-a-number
+   wire		ASNaN;	 	// '1' if A is a signalling not-a-number
+   wire		BSNaN;	 	// '1' if B is a signalling not-a-number
+   wire		ZQNaN;	 	// '1' if result Z is a quiet NaN
+   wire		ZPInf;	 	// '1' if result Z positive infinity
+   wire		ZNInf;	 	// '1' if result Z negative infinity
+   wire         add_sub;	// '1' if operation is add or subtract
+   wire 	converts;       // See if there are any converts   
+   
+   parameter [51:0]  fifty_two_zeros = 52'h0000000000000; // Use parameter?
+
+
+   // Is this instruction a convert (any op with op_type[1] or op_type[2] set)
+   assign converts      = ~(~op_type[1] & ~op_type[2]);
+   
+   // Determine if mantissas are all zeros
+   assign AzeroM = (A[51:0] == fifty_two_zeros);
+   assign BzeroM = (B[51:0] == fifty_two_zeros);
+
+   // Determine if exponents are all ones or all zeros 
+   assign AonesE = A[62]&A[61]&A[60]&A[59]&A[58]&A[57]&A[56]&A[55]&A[54]&A[53]&A[52];
+   assign BonesE = B[62]&B[61]&B[60]&B[59]&B[58]&B[57]&B[56]&B[55]&B[54]&B[53]&B[52];
+   assign AzeroE = ~(A[62]|A[61]|A[60]|A[59]|A[58]|A[57]|A[56]|A[55]|A[54]|A[53]|A[52]);
+   assign BzeroE = ~(B[62]|B[61]|B[60]|B[59]|B[58]|B[57]|B[56]|B[55]|B[54]|B[53]|B[52]);
+
+   // Determine special cases. Note: Zero is not really a special case. 
+   assign ADenorm = AzeroE & ~AzeroM;
+   assign BDenorm = BzeroE & ~BzeroM;
+   assign AInf = AonesE & AzeroM;
+   assign BInf = BonesE & BzeroM;
+   assign ANaN = AonesE & ~AzeroM;
+   assign BNaN = BonesE & ~BzeroM;
+   assign ASNaN = ANaN & ~A[51];
+   assign BSNaN = BNaN & ~B[51];
+   assign AZero = AzeroE & AzeroM;
+   assign BZero = BzeroE & BzeroM;   // zero needs a zero exponent AND a zero mantissa
+
+   // A and B are normalized if their exponents are not zero. 
+   assign ANorm = ~AzeroE;
+   assign BNorm = ~BzeroE;
+
+   // An "Invalid Operation" exception occurs if (A or B is a signalling NaN)
+   // or (A and B are both Infinite and the "effective operation" is 
+   // subtraction). It is suppressed for converts.
+   assign add_sub = ~op_type[2] & ~op_type[1];
+   assign Invalid = (ASNaN | BSNaN | 
+		     (add_sub & AInf & BInf & (A[63]^B[63]^op_type[0]))) & ~converts;
+
+   // The Denorm flag is set if (A is denormalized and op_type[2] is set or op_type[1]
+   // is clear) or (if B is denormalized and the operation is addition or subtraction). 
+   assign Denorm = ADenorm&(op_type[2]|~op_type[1]) | BDenorm & add_sub;
+
+   // The result is a quiet NaN if (an "Invalid Operation" exception occurs) 
+   // or (A is a NaN) or (B is a NaN and the operation uses B).
+   assign ZQNaN = Invalid | ANaN | (BNaN & add_sub);
+
+   // The result is +Inf if ((A is +Inf) or (B is -Inf and the operation is
+   // subtraction) or (B is +Inf and the operation is addition)) and (the result is
+   // not a quiet NaN). NOTE(review): the A term tests A[63]=1 (negative) - confirm.
+   assign ZPInf = (AInf&A[63] | add_sub&BInf&(~B[63]^op_type[0]))&~ZQNaN;
+
+   // The result is -Inf if ((A is -Inf) or (B is +Inf and the operation is
+   // subtraction) or (B is -Inf and the operation is addition)) and the result is
+   // not a quiet NaN. NOTE(review): the A term tests A[63]=0 (positive) - confirm.
+   assign ZNInf = (AInf&~A[63] | add_sub&BInf&(B[63]^op_type[0]))&~ZQNaN;
+
+   // Set the type of the result as follows:
+   // (needs optimization - got lazy or was late)
+   // Ztype	Result 
+   //  0000	Normal
+   //  0001	Quiet NaN
+   //  0010     Negative Infinity
+   //  0011     Positive Infinity
+   //  0100     +Bzero and +Azero (and vice-versa)
+   //  0101     +Bzero and -Azero (and vice-versa)
+   //  1000     Convert SP to DP (and vice-versa)
+
+   assign Ztype[0] = ((ZQNaN | ZPInf) & ~(~op_type[2] & op_type[1])) | 
+		     ((AZero & BZero & (A[63]^B[63]^op_type[0])) 
+		      & ~converts);
+   assign Ztype[1] = ((ZNInf | ZPInf) & ~(~op_type[2] & op_type[1])) | 
+		     (((AZero & BZero & A[63] & B[63] & ~op_type[0]) |
+		       (AZero & BZero & A[63] & ~B[63] & op_type[0])) 
+		      & ~converts);
+   assign Ztype[2] = ((AZero & BZero & ~op_type[1] & ~op_type[2]) 
+		      & ~converts);
+   assign Ztype[3] = (op_type[1] & op_type[2] & ~op_type[0]);
+
+   // Determine if the effective operation is subtraction
+   assign Sub = ~(op_type[3] & ~op_type[0]) & ( (op_type[3] & op_type[0]) | (add_sub & (A[63]^B[63]^op_type[0])) );
+
+endmodule // exception
+
--- a/wally-pipelined/src/fpu/exception_div.sv
+++ b/wally-pipelined/src/fpu/exception_div.sv
@ -0,0 +1,96 @@
+// Exception logic for the floating point divide/square-root path: classifies operands
+// A and B and flags special results. op_type=1 is presumably square root - confirm.
+
+module exception_div (Ztype, Invalid, Denorm, ANorm, BNorm, A, B, op_type);
+
+   input [63:0] A;		// 1st input operand (op1)
+   input [63:0] B;		// 2nd input operand (op2)
+   input 	op_type;        // Determine operation   
+   
+   output [2:0] Ztype;		// Indicates type of result (Z)
+   output 	Invalid;	// Invalid operation exception
+   output 	Denorm;		// Denormalized input
+   output       ANorm;          // A has a nonzero exponent (not zero or Denorm)
+   output       BNorm;          // B has a nonzero exponent (not zero or Denorm)
+   
+   wire		AzeroM;	 	// '1' if the mantissa of A is zero
+   wire		BzeroM;		// '1' if the mantissa of B is zero
+   wire		AzeroE;	 	// '1' if the exponent of A is zero
+   wire		BzeroE;		// '1' if the exponent of B is zero
+   wire		AonesE;	 	// '1' if the exponent of A is all ones
+   wire		BonesE;		// '1' if the exponent of B is all ones
+   wire		ADenorm; 	// '1' if A is a denormalized number
+   wire		BDenorm; 	// '1' if B is a denormalized number
+   wire		AInf;	 	// '1' if A is infinite
+   wire		BInf;	 	// '1' if B is infinite
+   wire		AZero;	 	// '1' if A is 0
+   wire		BZero;	 	// '1' if B is 0
+   wire		ANaN;	 	// '1' if A is a not-a-number
+   wire		BNaN; 		// '1' if B is a not-a-number
+   wire		ASNaN;	 	// '1' if A is a signalling not-a-number
+   wire		BSNaN;	 	// '1' if B is a signalling not-a-number
+   wire		ZQNaN;	 	// '1' if result Z is a quiet NaN
+   wire		ZInf;	 	// '1' if result Z is an infinity
+   wire 	square_root;    // '1' if square root operation (declared but not driven here)
+   wire 	Zero;           // '1' if result is zero   
+   
+   parameter [51:0]  fifty_two_zeros = 52'h0; // Use parameter?
+
+   // Determine if mantissas are all zeros
+   assign AzeroM = (A[51:0] == fifty_two_zeros);
+   assign BzeroM = (B[51:0] == fifty_two_zeros);
+
+   // Determine if exponents are all ones or all zeros 
+   assign AonesE = A[62]&A[61]&A[60]&A[59]&A[58]&A[57]&A[56]&A[55]&A[54]&A[53]&A[52];
+   assign BonesE = B[62]&B[61]&B[60]&B[59]&B[58]&B[57]&B[56]&B[55]&B[54]&B[53]&B[52];
+   assign AzeroE = ~(A[62]|A[61]|A[60]|A[59]|A[58]|A[57]|A[56]|A[55]|A[54]|A[53]|A[52]);
+   assign BzeroE = ~(B[62]|B[61]|B[60]|B[59]|B[58]|B[57]|B[56]|B[55]|B[54]|B[53]|B[52]);
+
+   // Determine special cases. Note: Zero is not really a special case. 
+   assign ADenorm = AzeroE & ~AzeroM;
+   assign BDenorm = BzeroE & ~BzeroM;
+   assign AInf = AonesE & AzeroM;
+   assign BInf = BonesE & BzeroM;
+   assign ANaN = AonesE & ~AzeroM;
+   assign BNaN = BonesE & ~BzeroM;
+   assign ASNaN = ANaN & ~A[51];     // signalling NaN: quiet bit (mantissa MSB) is clear
+   assign BSNaN = BNaN & ~B[51];     // must look at operand B, not A
+   assign AZero = AzeroE & AzeroM;
+   assign BZero = BzeroE & BzeroM;   // zero needs a zero exponent AND a zero mantissa
+
+   // A and B are normalized if their exponents are not zero. 
+   assign ANorm = ~AzeroE;
+   assign BNorm = ~BzeroE;
+
+   // An "Invalid Operation" exception occurs if (A or B is a signalling NaN) or, when
+   // op_type is 0, (Inf/Inf or 0/0) or, when op_type is 1, (the sign bit of A is set).
+   assign Invalid = ASNaN | BSNaN | (((AInf & BInf) | (AZero & BZero))&~op_type) | 
+		    (A[63] & op_type);
+
+   // The Denorm flag is set if A is denormalized or if B is denormalized 
+   assign Denorm = ADenorm | BDenorm;
+
+   // The result is a quiet NaN if (an "Invalid Operation" exception occurs) 
+   // or (A is a NaN) or (B is a NaN).
+   assign ZQNaN = Invalid | ANaN | BNaN;
+
+   //  The result is zero
+   assign Zero = (AZero | BInf)&~op_type | AZero&op_type;   
+
+   // The result is Inf if ((A is Inf) or (B is 0 and op_type is 0)) and (the
+   // result is not a quiet NaN).  
+   assign ZInf = (AInf | BZero)&~ZQNaN&~op_type | AInf&op_type&~ZQNaN;   
+
+   // Set the type of the result as follows:
+   // Ztype	Result 
+   //  000     Normal
+   //  001     Quiet NaN
+   //  010     Infinity
+   //  011     Zero
+   //  110     DivZero
+   assign Ztype[0] = ZQNaN | Zero;
+   assign Ztype[1] = ZInf | Zero;
+   assign Ztype[2] = BZero&~op_type;   
+
+endmodule // exception_div
+
--- a/wally-pipelined/src/fpu/fctrl.sv
+++ b/wally-pipelined/src/fpu/fctrl.sv
@ -0,0 +1,148 @@
+// fctrl: combinationally decodes OpD/Funct7D (plus Rs2D/FrmW bits) into FPU control signals.
+module fctrl (
+  input  logic [6:0] Funct7D,       // funct7 field: bit 0 gives FmtD, the rest selects the operation class
+  input  logic [6:0] OpD,           // major opcode
+  input  logic [4:0] Rs2D,          // rs2 field: bit 0 feeds OpCtrlD[3]
+  input  logic [4:0] Rs1D,          // rs1 field (not read by this module)
+  input  logic [2:0] FrmW,          // bits select the compare/sign sub-operation in OpCtrlD[1:0]
+  output logic       WriteEnD,      // set for every FP opcode recognized below except FP load
+  output logic       DivSqrtStartD, // set when the operation class is div/sqrt
+  //output logic [2:0] regSelD,
+  output logic [2:0] WriteSelD,     // result select, encoding listed further down
+  output logic [3:0] OpCtrlD,       // per-unit operation control
+  output logic       FmtD,          // precision bit, copied from Funct7D[0]
+  output logic       WriteIntD);    // set for converts whose funct7 is 110000x
+
+
+
+  //precision is taken directly from instruction
+  assign FmtD = Funct7D[0];
+
+  //all subsequent logic is based on the table present
+  //in Section 5 of Wally Architecture Specification
+  
+  //write is enabled for all fp instruction op codes
+  //sans fp load
+  logic isFP, isFPLD;
+  always_comb begin
+	//case statement is easier to modify
+	//in case of errors
+	case(OpD)
+		//fp instructions sans load
+		7'b1010011 : begin isFP = 1'b1; isFPLD = 1'b0; end
+		7'b1000011 : begin isFP = 1'b1; isFPLD = 1'b0; end
+		7'b1000111 : begin isFP = 1'b1; isFPLD = 1'b0; end
+		7'b1001011 : begin isFP = 1'b1; isFPLD = 1'b0; end
+		7'b1001111 : begin isFP = 1'b1; isFPLD = 1'b0; end
+		7'b0100111 : begin isFP = 1'b1; isFPLD = 1'b0; end
+		//fp load (LOAD-FP opcode; was a duplicate of 1010011 and never matched)
+		7'b0000111 : begin isFP = 1'b1; isFPLD = 1'b1; end
+		default : begin isFP = 1'b0; isFPLD = 1'b0; end
+	endcase
+  end
+  
+  assign WriteEnD = isFP & ~isFPLD; 
+  
+  //useful intermediary signals
+  //
+  //(mult only not supported in current datapath)
+  //set third FMA operand to zero in this case
+  //(or equivalent)
+  logic isAddSub, isFMA, isMult, isDivSqrt, isCvt, isCmp, isFPSTR;
+
+  always_comb begin
+	{isAddSub, isFMA, isMult, isDivSqrt, isCvt, isCmp, isFPSTR} = 7'b0; //default: no class selected, so no latches are inferred
+	if(OpD == 7'b1010011) begin //checks all but FMA/store/load
+  		casez(Funct7D) //casez: the ? bits below are don't-cares (plain case never matches them)
+			//compare	
+			7'b10100?? : begin isAddSub = 1'b0; isFMA = 1'b0; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b0; isCmp = 1'b1; isFPSTR = 1'b0; end
+			//div/sqrt
+			7'b0?011?? : begin isAddSub = 1'b0; isFMA = 1'b0; isMult = 1'b0; isDivSqrt = 1'b1; isCvt = 1'b0; isCmp = 1'b0; isFPSTR = 1'b0; end
+			//add/sub
+			7'b0000??? : begin isAddSub = 1'b1; isFMA = 1'b0; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b0; isCmp = 1'b0; isFPSTR = 1'b0; end
+			//mult
+			7'b00010?? : begin isAddSub = 1'b0; isFMA = 1'b0; isMult = 1'b1; isDivSqrt = 1'b0; isCvt = 1'b0; isCmp = 1'b0; isFPSTR = 1'b0; end
+			//convert (not precision)
+			7'b110?0?? : begin isAddSub = 1'b0; isFMA = 1'b0; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b1; isCmp = 1'b0; isFPSTR = 1'b0; end
+			//convert (precision)
+			7'b010000? : begin isAddSub = 1'b0; isFMA = 1'b0; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b1; isCmp = 1'b0; isFPSTR = 1'b0; end
+		endcase
+	end
+	//FMA/store/load
+	else begin
+  		case(OpD)
+			//4 FMA instructions
+			7'b1000011 : begin isAddSub = 1'b0; isFMA = 1'b1; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b0; isCmp = 1'b0; isFPSTR = 1'b0; end
+			7'b1000111 : begin isAddSub = 1'b0; isFMA = 1'b1; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b0; isCmp = 1'b0; isFPSTR = 1'b0; end
+			7'b1001011 : begin isAddSub = 1'b0; isFMA = 1'b1; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b0; isCmp = 1'b0; isFPSTR = 1'b0; end
+			7'b1001111 : begin isAddSub = 1'b0; isFMA = 1'b1; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b0; isCmp = 1'b0; isFPSTR = 1'b0; end
+			//store (load already found)
+			7'b0100111 : begin isAddSub = 1'b0; isFMA = 1'b0; isMult = 1'b0; isDivSqrt = 1'b0; isCvt = 1'b0; isCmp = 1'b0; isFPSTR = 1'b1; end
+		endcase
+	end
+  end
+
+  //register is chosen based on operation performed
+  //---- 
+  //write selection is chosen in the same way as 
+  //register selection
+  //
+
+  // reg/write sel logic and assignment
+  // 
+  // 3'b000 = add/sub/cvt
+  // 3'b001 = sign
+  // 3'b010 = fma
+  // 3'b011 = cmp
+  // 3'b100 = div/sqrt
+  //
+  //reg select
+  
+  //this value is used enough to be shorthand
+  logic isSign;
+  assign isSign = ~Funct7D[6] & ~Funct7D[5] & Funct7D[4] & ~Funct7D[3] & ~Funct7D[2];
+
+  //write select
+  assign WriteSelD[2] = isDivSqrt & ~isFMA;
+  assign WriteSelD[1] = isFMA | isCmp;
+  //AND of Funct7 for sign
+  assign WriteSelD[0] = isCmp | isSign;
+
+  //if op is div/sqrt - start div/sqrt
+  assign DivSqrtStartD = isDivSqrt & ~isFMA;
+
+  //operation control for each fp operation
+  //has to be expanded over standard to account for
+  //integrated fpadd/cvt
+  //
+  //will integrate FMA opcodes into design later
+  //
+  //conversion instructions will
+  //also need to be added later as I find the opcode
+  //version I used for this repo
+
+  //let's do separate SOP for each type of operation
+//  assign OpCtrlD[3] = 1'b0;
+//
+//
+
+  //add/cvt chooses unsigned conversion here
+  assign OpCtrlD[3] = (isAddSub & Rs2D[0]) | (isFMA & 1'b0) | (isDivSqrt & 1'b0) | (isCmp & 1'b0) | (isSign & 1'b0);
+  //add/cvt chooses FP/int or int/FP conversion 
+  assign OpCtrlD[2] = (isAddSub & (Funct7D[6] & Funct7D[5] & ~Funct7D[4])) | (isFMA & 1'b0) | (isDivSqrt & 1'b0) | (isCmp & 1'b0) | (isSign & 1'b0);
+  //compare chooses equals
+  //sign chooses sgnjx
+  //add/cvt can chooses between abs/neg functions, but they aren't used in the
+  //wally-spec
+  assign OpCtrlD[1] = (isAddSub & 1'b0) | (isFMA & 1'b0) | (isDivSqrt & 1'b0) | (isCmp & FrmW[2]) | (isSign & FrmW[1]);
+  //divide chooses between div/sqrt
+  //compare chooses between LT and LE
+  //sign chooses between sgnj and sgnjn
+  //add/cvt chooses between add/sub or single-precision conversion
+  assign OpCtrlD[0] = (isAddSub & (Funct7D[2] | Funct7D[0])) | (isFMA & 1'b0) | (isDivSqrt & Funct7D[5]) | (isCmp & FrmW[1]) | (isSign & FrmW[0]);
+  
+  //write to integer source if conv to int occurs
+  //AND of Funct7 for int results 
+  assign WriteIntD = isCvt & (Funct7D[6] & Funct7D[5] & ~Funct7D[4] & ~Funct7D[3] & ~Funct7D[2] & ~Funct7D[1]);
+
+endmodule
--- a/wally-pipelined/src/fpu/fpadd_denorm.sv
+++ b/wally-pipelined/src/fpu/fpadd_denorm.sv
@ -0,0 +1,274 @@
+//
+// File name : fpadd
+// Title     : Floating-Point Adder/Subtractor
+// project   : FPU
+// Library   : fpadd
+// Author(s) : James E. Stine, Jr., Brett Mathis
+// Purpose   : definition of main unit to floating-point add/sub
+// notes :   
+//
+// Copyright Oklahoma State University
+// Copyright AFRL
+//
+// Basic and Denormalized Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
+//         or if (exp1 = exp2 AND mnt1 < mnt2)
+// Step 4: Shift the mantissa corresponding to the smaller exponent, 
+//          and extend precision by three bits to the right.
+// Step 5: Add or subtract the mantissas.
+// Step 6: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 7: Round the result.
+// Step 8: Put sum onto output.
+//
+
+
+module fpadd (AS_Result, Flags, Denorm, op1, op2, rm, op_type, P, OvEn, UnEn);
+
+   input [63:0] op1;		// 1st input operand (A)
+   input [63:0] op2;		// 2nd input operand (B)
+   input [2:0] 	rm;		// Rounding mode - specify values 
+   input [3:0]	op_type;	// Function opcode
+   input 	P;   		// Result Precision (0 for double, 1 for single)
+   input 	OvEn;		// Overflow trap enabled
+   input 	UnEn;   	// Underflow trap enabled
+
+   output [63:0] AS_Result;	// Result of operation
+   output [4:0]  Flags;   	// IEEE exception flags 
+   output 	 Denorm;   	// Denorm on input or output   
+
+   wire [63:0] 	 Float1; 
+   wire [63:0] 	 Float2;
+   wire [63:0] 	 IntValue;
+   wire [11:0] 	 exp1, exp2;
+   wire [11:0] 	 exp_diff1, exp_diff2;
+   wire [10:0] 	 exponent, exp_pre, exp1_denorm, exp2_denorm; // exp*_denorm were undeclared (implicit 1-bit nets)
+   wire [11:0] 	 exp_shift;
+   wire [63:0] 	 Result;   
+   wire [51:0] 	 mantissaA;
+   wire [56:0] 	 mantissaA1;
+   wire [63:0] 	 mantissaA3;
+   wire [51:0] 	 mantissaB; 
+   wire [56:0] 	 mantissaB1, mantissaB2;
+   wire [63:0] 	 mantissaB3;
+   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm, sum_norm_w_bypass;
+   wire [5:0] 	 align_shift;
+   wire [5:0] 	 norm_shift, norm_shift_denorm;
+   wire [3:0] 	 sel_inv;
+   wire		 op1_Norm, op2_Norm;
+   wire		 opA_Norm, opB_Norm;
+   wire		 Invalid;
+   wire 	 DenormIn, DenormIO;
+   wire [4:0] 	 FlagsIn;   	
+   wire 	 exp_valid;
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire 	 signA, sign_corr;
+   wire          corr_sign;
+   wire 	 zeroB;         
+   wire 	 convert;
+   wire          swap;
+   wire          sub;
+   wire [10:0]	 exponent_postsum;
+   wire 	 mantissa_comp;
+   wire 	 mantissa_comp_sum;
+   wire 	 mantissa_comp_sum_tc;
+   wire 	 Float1_sum_comp;
+   wire 	 Float2_sum_comp;
+   wire 	 Float1_sum_tc_comp;
+   wire 	 Float2_sum_tc_comp;
+   wire [5:0]	 ZP_mantissaA;
+   wire [5:0] 	 ZP_mantissaB;
+   wire 	 ZV_mantissaA;
+   wire 	 ZV_mantissaB;
+   wire 	 normal_underflow;
+   wire 	 normal_overflow;
+
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the op_type , and their precision P. 
+   // Single precision inputs are converted to double precision 
+   // and the sign of the first operand is set appropriately based on
+   // if the operation is absolute value or negation. 
+
+   convert_inputs conv1 (Float1, Float2, op1, op2, op_type, P);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" Input Flags. The "sel_inv" is used in
+   // the third pipeline stage to select the result. Also, op1_Norm
+   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
+   // sub is one if the effective operation is subtraction. 
+
+   exception exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, sub, 
+		   Float1, Float2, op_type);
+
+   // Perform Exponent Subtraction (used for alignment). For performance
+   // both exponent subtractions are performed in parallel. This was 
+   // changed to a behavior level to allow the tools to  try to optimize
+   // the two parallel additions. The input values are zero-extended to 12 
+   // bits prior to performing the addition. 
+
+   assign exp1 = {1'b0, Float1[62:52]};
+   assign exp2 = {1'b0, Float2[62:52]};
+   assign exp_diff1 = exp1 - exp2;
+   assign exp_diff2 = DenormIn ? ({Float2[63], exp2[10:0]} - {Float1[63], exp1[10:0]}): exp2 - exp1;
+
+   // The second operand (B) should be set to zero, if op_type does not
+   // specify addition or subtraction
+   assign zeroB = op_type[2] | op_type[1];
+
+   // Swapped operands if zeroB is not one and exp1 < exp2. 
+   // Swapping causes exp2 to be used for the result exponent. 
+   // Only the exponent of the larger operand is used to determine
+   // the final result. 
+   assign swap = exp_diff1[11] & ~zeroB;
+   assign exponent = swap ? exp2[10:0] : exp1[10:0];
+   assign exponent_postsum = swap ? exp2[10:0] : exp1[10:0];
+   assign mantissaA = swap ? Float2[51:0] : Float1[51:0];
+   assign mantissaB = swap ? Float1[51:0] : Float2[51:0];
+   assign signA     = swap ? Float2[63] : Float1[63];   
+
+   // Leading-Zero Detector. Determine the size of the shift needed for
+   // normalization. If sum_corrected is all zeros, the exp_valid is 
+   // zero; otherwise, it is one. 
+   // modified to 52 bits to detect leading zeroes on denormalized mantissas
+   lz52 lz_norm_1 (ZP_mantissaA, ZV_mantissaA, mantissaA);
+   lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);
+
+   // Denormalized exponents created by subtracting the leading zeroes from the original exponents
+   assign exp1_denorm = swap ? (exp1 - ZP_mantissaB) : (exp1 - ZP_mantissaA);
+   assign exp2_denorm = swap ? (exp2 - ZP_mantissaA) : (exp2 - ZP_mantissaB);
+
+   // Finds normal underflow result to determine whether to round final exponent down
+   // Comparison between each float and the resulting sum of the primary cla adder/subtractor and cla subtractor
+   assign Float1_sum_comp = (Float1[51:0] > sum[51:0]) ? 1'b0 : 1'b1;
+   assign Float2_sum_comp = (Float2[51:0] > sum[51:0]) ? 1'b0 : 1'b1;
+   assign Float1_sum_tc_comp = (Float1[51:0] > sum_tc[51:0]) ? 1'b0 : 1'b1;
+   assign Float2_sum_tc_comp = (Float2[51:0] > sum_tc[51:0]) ? 1'b0 : 1'b1;
+
+   // Determines the correct Float value to compare based on swap result
+   assign mantissa_comp_sum = swap ? Float2_sum_comp : Float1_sum_comp;
+   assign mantissa_comp_sum_tc = swap ? Float2_sum_tc_comp : Float1_sum_tc_comp;
+
+   // Determines the correct comparison result based on operation and sign of resulting sum
+   assign mantissa_comp = (op_type[0] ^ sum[63]) ? mantissa_comp_sum_tc : mantissa_comp_sum;
+
+   // If the sign bits match (XNOR) and at least one operand has a nonzero exponent,
+   // the normal underflow bit is needed and therefore updated.
+   assign normal_underflow = ((Float1[63] ~^ Float2[63]) & (opA_Norm | opB_Norm)) ? mantissa_comp : 1'b0;
+
+   // Determine the alignment shift and limit it to 63. If any bit from 
+   // exp_shift[6] to exp_shift[11] is one, then shift is set to all ones. 
+   assign exp_shift = swap ? exp_diff2 : exp_diff1;
+   assign exp_gt63 = exp_shift[11] | exp_shift[10] | exp_shift[9] 
+     | exp_shift[8] | exp_shift[7] | exp_shift[6];
+   assign align_shift = exp_shift | {6{exp_gt63}};
+
+   // Unpack the 52-bit mantissas to 57-bit numbers of the form.
+   //    001.M[51]M[50] ... M[1]M[0]00
+   // Unless the number has an exponent of zero, in which case it
+   // is unpacked as
+   //    000.00 ... 00
+   // This effectively flushes denormalized values to zero. 
+   // The three bits of to the left of the binary point prevent overflow
+   // and loss of sign information. The two bits to the right of the 
+   // original mantissa form the "guard" and "round" bits that are used
+   // to round the result. 
+   assign opA_Norm = swap ? op2_Norm : op1_Norm;
+   assign opB_Norm = swap ? op1_Norm : op2_Norm;
+   assign mantissaA1 = {2'h0, opA_Norm, mantissaA[51:0]&{52{opA_Norm}}, 2'h0};
+   assign mantissaB1 = {2'h0, opB_Norm, mantissaB[51:0]&{52{opB_Norm}}, 2'h0};
+
+   // Perform mantissa alignment using a 57-bit barrel shifter 
+   // If any of the bits shifted out are one, Sticky_out is set. 
+   // The size of the barrel shifter could be reduced by two bits
+   // by not adding the leading two zeros until after the shift. 
+   barrel_shifter_r57 bs1 (mantissaB2, Sticky_out, mantissaB1, align_shift);
+
+   // Place either the sign-extended 32-bit value or the original 64-bit value 
+   // into IntValue (to be used for integer to floating point conversion)
+   assign IntValue [31:0] = op1[31:0];
+   assign IntValue [63:32] = op_type[0] ? {32{op1[31]}} : op1[63:32];
+
+   // If doing an integer to floating point conversion, mantissaA3 is set to 
+   // IntVal and the prenormalized exponent is set to 1084. Otherwise, 
+   // mantissaA3 is simply extended to 64-bits by setting the 7 LSBs to zero, 
+   // and the exponent value is left unchanged. 
+   // Under denormalized cases, the exponent before the rounder is set to 1
+   // if the normal shift value is 11.
+   assign convert       = ~op_type[2] & op_type[1];
+   assign mantissaA3    = (op_type[3]) ? (op_type[0] ? Float1 : ~Float1) : (DenormIn ? ({12'h0, mantissaA}) : (convert ? IntValue : {mantissaA1, 7'h0}));
+   assign exp_pre       = DenormIn ? 
+			  ((norm_shift == 6'b001011) ? 11'b00000000001 : (swap ? exp2_denorm : exp1_denorm))
+			  : (convert ? 11'b10000111100 : exponent);
+
+   // Put zero in for mantissaB3, if zeroB is one. Otherwise, B is extended to 
+   // 64-bits by setting the 7 LSBs to the Sticky_out bit followed by six  
+   // zeros. 
+   assign mantissaB3[63:7] = (op_type[3]) ? (57'h0) : (DenormIn ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
+   assign mantissaB3[6]    = (op_type[3]) ? (1'b0) : (DenormIn ? mantissaB[6] : Sticky_out & ~zeroB);
+   assign mantissaB3[5:0]  = (op_type[3]) ? (6'h01) : (DenormIn ? mantissaB[5:0] : 6'h0);
+
+   // The sign of the result needs to be corrected if the true
+   // operation is subtraction and the input operands were swapped. 
+   assign corr_sign = ~op_type[2]&~op_type[1]&op_type[0]&swap;
+   
+   // 64-bit Mantissa Adder/Subtractor
+   cla64 add1 (sum, mantissaA3, mantissaB3, sub);
+
+   // 64-bit Mantissa Subtractor - to get the two's complement of the 
+   // result when the sign from the adder/subtractor is negative. 
+   cla_sub64 sub1 (sum_tc, mantissaB3, mantissaA3);
+
+   // Determine the correct sign of the result
+   assign sign_corr = ((corr_sign ^ signA) & ~convert) ^ sum[63];   
+   
+   // If the sum is negative, use its two complement instead. 
+   // This value has to be 64-bits to correctly handle the 
+   // case 10...00
+   assign sum_corr = (DenormIn & (opA_Norm | opB_Norm) & ( ( (Float1[63] ~^ Float2[63]) & op_type[0] ) | ((Float1[63] ^ Float2[63]) & ~op_type[0]) ))
+			 ? (sum[63] ? sum : sum_tc) : ( (op_type[3]) ? sum : (sum[63] ? sum_tc : sum));
+
+   // Finds normal overflow result (taken from bit 52 of the sign-corrected sum)
+   assign normal_overflow = (DenormIn & (sum == 16'h0) & (opA_Norm | opB_Norm) & ~op_type[0]) ? 1'b1 : (sum[63] ? sum_tc[52] : sum[52]);
+
+   // Leading-Zero Detector. Determine the size of the shift needed for
+   // normalization. If sum_corrected is all zeros, the exp_valid is 
+   // zero; otherwise, it is one. 
+   lz64 lzd1 (norm_shift, exp_valid, sum_corr);
+
+   assign norm_shift_denorm = (DenormIn & ( (~opA_Norm & ~opB_Norm) | normal_underflow)) ? (6'h00) : (norm_shift);
+
+   // Barrel shifter used for normalization. It takes as inputs the 
+   // the corrected sum and the amount by which the sum should 
+   // be shifted. It outputs the normalized sum. 
+   barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift_denorm);
+  
+   assign sum_norm_w_bypass = (op_type[3]) ? (op_type[0] ? ~sum_corr : sum_corr) : (sum_norm);
+
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. If the result is a single precision number, the actual 
+   // mantissa is in the upper 23 bits and the lower 29 bits are zero. 
+   // At this point, normalization has already been performed, so we know 
+   // exactly where the rounding point is. The rounding units also
+   // handles special cases and set the exception flags.
+
+   // Changed DenormIO -> Denorm and FlagsIn -> Flags in order to
+   // help in processor reservation station detection of load/stores. In
+   // other words, the processor would like to know ahead of time that
+   // if the result is an exception then don't load or store.
+   rounder round1 (Result, DenormIO, FlagsIn, rm, P, OvEn, UnEn, exp_valid, 
+		   sel_inv, Invalid, DenormIn, convert, sign_corr, exp_pre, norm_shift, sum_norm_w_bypass,
+		   exponent_postsum, op1_Norm, op2_Norm, Float1[63:52], Float2[63:52],
+		   normal_overflow, normal_underflow, swap, op_type, sum);
+
+   // Store the final result and the exception flags in registers.
+   assign AS_Result = Result;
+   assign {Denorm, Flags} = {DenormIO, FlagsIn};
+   
+endmodule // fpadd
+
+
--- a/wally-pipelined/src/fpu/fpdiv.sv
+++ b/wally-pipelined/src/fpu/fpdiv.sv
@ -0,0 +1,249 @@
+//
+// File name : fpdiv
+// Title     : Floating-Point Divider/Square-Root
+// project   : FPU
+// Library   : fpdiv
+// Author(s) : James E. Stine, Jr.
+// Purpose   : definition of main unit to floating-point div/sqrt
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Basic Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Exponent Logic
+// Step 4: Divide/Sqrt using Goldschmidt
+// Step 5: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the
+//   left of the binary point is 1.
+// Step 6: Round the result.
+// Step 7: Put quotient/remainder onto output.
+//
+
+// `timescale 1ps/1ps
+// fpdiv: top level of the floating-point divide / square-root unit.
+//
+// DivOpType selects which exponent and sign are used for the result:
+//   0 -> divide exponent (expA - expB + bias), sign = sign(A) ^ sign(B)
+//   1 -> square-root exponent ((expA + bias [+1]) / 2), sign = sign(A)
+// The mantissa iteration is performed by divconv under control of fsm,
+// rounder_div builds the final result and flags, and the three output
+// registers at the bottom are enabled by DivSqrtDone.
+module fpdiv (DivSqrtDone, DivResultM, DivFlagsM, DivDenormM, DivOp1, DivOp2, DivFrm, DivOpType, DivP, DivOvEn, DivUnEn,
+	      DivStart, reset, clk);
+
+   input [63:0] DivOp1;		// 1st input operand (A)
+   input [63:0] DivOp2;		// 2nd input operand (B)
+   input [2:0] 	DivFrm;		// Rounding mode - specify values 
+   input 	DivOpType;	// Function opcode (selects sqrt exponent/sign when 1)
+   input 	DivP;   		// Result Precision (0 for double, 1 for single)
+   input 	DivOvEn;		// Overflow trap enabled
+   input 	DivUnEn;   	// Underflow trap enabled
+
+   input 	DivStart;	// passed to the control fsm
+   input 	reset;
+   input 	clk;   
+
+   output [63:0] DivResultM;	// Result of operation
+   output [4:0]  DivFlagsM;   	// IEEE exception flags 
+   output 	 DivDenormM;   	// DivDenormM on input or output
+   output 	 DivSqrtDone;	// driven by fsm; also enables the output registers
+
+   // Constant 1 / constant 0 nets used for padding and the hidden mantissa bit.
+   supply1 	  vdd;
+   supply0 	  vss;   
+
+   wire [63:0] 	 Float1; 
+   wire [63:0] 	 Float2;
+   wire [63:0] 	 IntValue;
+   
+   wire [12:0] 	 exp1, exp2, expF;
+   wire [12:0] 	 exp_diff, bias;
+   wire [13:0] 	 exp_sqrt;
+   wire [12:0] 	 exp_s;
+   wire [12:0] 	 exp_c;
+   
+   wire [10:0] 	 exponent, exp_pre;
+   wire [63:0] 	 Result;   
+   wire [52:0] 	 mantissaA;
+   wire [52:0] 	 mantissaB; 
+   wire [63:0] 	 sum, sum_tc, sum_corr, sum_norm;
+   
+   wire [5:0] 	 align_shift;
+   wire [5:0] 	 norm_shift;
+   wire [2:0] 	 sel_inv;
+   wire		 op1_Norm, op2_Norm;
+   wire		 opA_Norm, opB_Norm;
+   wire		 Invalid;
+   wire 	 DenormIn, DenormIO;
+   wire [4:0] 	 FlagsIn;   	
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire 	 signResult, sign_corr;
+   wire          corr_sign;
+   wire 	 zeroB;         
+   wire 	 convert;
+   wire          swap;
+   wire          sub;
+   
+   wire [63:0] 	 q1, qm1, qp1, q0, qm0, qp0;
+   wire [63:0] 	 rega_out, regb_out, regc_out, regd_out;
+   wire [127:0]  regr_out;
+   wire [2:0] 	 sel_muxa, sel_muxb;
+   wire 	 sel_muxr;   
+   wire 	 load_rega, load_regb, load_regc, load_regd, load_regr;
+
+   wire 	 donev, sel_muxrv, sel_muxsv;
+   wire [1:0] 	 sel_muxav, sel_muxbv;   
+   wire 	 load_regav, load_regbv, load_regcv;
+   wire 	 load_regrv, load_regsv;
+
+   // Nets that used to be created implicitly by their first use. They are
+   // declared explicitly so the module also elaborates with
+   // default_nettype none and so a misspelled name is reported as an error.
+   wire 	 exp_cout1, exp_cout2;	// exponent adder carry-outs (not used further)
+   wire 	 open;			// discarded top bit of the 14-bit divide-exponent sum
+   wire 	 exp_odd;		// 1 when the LSB of A's biased exponent is 0
+   wire 	 load_regs;		// fsm -> divconv register load
+   wire 	 error;			// connected to the fsm only
+   
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the DivOpType, and their precision DivP. 
+   // Single precision inputs are converted to double precision.
+   convert_inputs_div divconv1 (Float1, Float2, DivOp1, DivOp2, DivOpType, DivP);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" input flags. "sel_inv" is passed to the rounder to
+   // select the result. Also, op1_Norm and op2_Norm are one if DivOp1 and
+   // DivOp2 are not zero or denormalized.
+   exception_div divexc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, 
+		   Float1, Float2, DivOpType);
+
+   // Determine Sign/Mantissa
+   // Divide: XOR of the operand signs. Sqrt: sign of operand A.
+   assign signResult = ((Float1[63]^Float2[63])&~DivOpType) | Float1[63]&DivOpType;
+   // Prepend the hidden one to each 52-bit fraction.
+   assign mantissaA = {vdd, Float1[51:0]};
+   assign mantissaB = {vdd, Float2[51:0]};
+   // Perform Exponent Subtraction - expA - expB + Bias   
+   assign exp1 = {2'b0, Float1[62:52]};
+   assign exp2 = {2'b0, Float2[62:52]};
+   // bias : DP = 2^{11-1}-1 = 1023
+   assign bias = {3'h0, 10'h3FF};
+   // Divide exponent: exp1 + ~exp2 + bias is reduced by the carry-save
+   // adder, then summed with carry-in 1 (completing the two's complement
+   // of exp2). The 14th sum bit lands on "open" and is dropped.
+   csa #(13) csa1 (exp1, ~exp2, bias, exp_s, exp_c);
+   exp_add explogic1 (exp_cout1, {open, exp_diff}, 
+		      {vss, exp_s}, {vss, exp_c}, 1'b1);
+   // Sqrt exponent (check if exponent is odd): exp1 + 1023 + exp_odd,
+   // halved below by taking bits [13:1].
+   assign exp_odd = Float1[52] ? vss : vdd;
+   exp_add explogic2 (exp_cout2, exp_sqrt, 
+		      {vss, exp1}, {4'h0, 10'h3ff}, exp_odd);
+   // Choose correct exponent
+   assign expF = DivOpType ? exp_sqrt[13:1] : exp_diff;   
+
+   // Main Goldschmidt/Division Routine
+   // Note the operand order: mantissaB is passed before mantissaA.
+   divconv goldy (q1, qm1, qp1, q0, qm0, qp0, 
+		  rega_out, regb_out, regc_out, regd_out,
+		  regr_out, mantissaB, mantissaA, 
+		  sel_muxa, sel_muxb, sel_muxr, 
+		  reset, clk,
+		  load_rega, load_regb, load_regc, load_regd,
+		  load_regr, load_regs, DivP, DivOpType, exp_odd);
+
+   // FSM : control divider
+   fsm control (DivSqrtDone, load_rega, load_regb, load_regc, load_regd, 
+		load_regr, load_regs, sel_muxa, sel_muxb, sel_muxr, 
+		clk, reset, DivStart, error, DivOpType);
+   
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. The rounding unit also handles special cases and 
+   // sets the exception flags.
+   //***add max magnitude and swap negative and positive infinity
+   rounder_div divround1 (Result, DenormIO, FlagsIn, 
+		   DivFrm, DivP, DivOvEn, DivUnEn, expF, 
+   		   sel_inv, Invalid, DenormIn, signResult, 
+		   q1, qm1, qp1, q0, qm0, qp0, regr_out);
+
+   // Store the final result and the exception flags in registers.
+   // DivSqrtDone is wired to the enable position of each flopenr.
+   flopenr #(64) rega (clk, reset, DivSqrtDone, Result, DivResultM);
+   flopenr #(1) regb (clk, reset, DivSqrtDone, DenormIO, DivDenormM);   
+   flopenr #(5) regc (clk, reset, DivSqrtDone, FlagsIn, DivFlagsM);   
+   
+endmodule // fpdiv
+
+//
+// Brent-Kung Prefix Adder 
+//   (yes, it is 14 bits as my generator is broken for 13 bits :( 
+//    assume, synthesizer will delete stuff not needed )
+//
+// exp_add: 14-bit adder with carry-in built around the brent_kung prefix
+// tree. Ports: cout/sum are the results of adding a, b and cin.
+module exp_add (cout, sum, a, b, cin);
+
+   output        cout;
+   output [13:0] sum;
+   input  [13:0] a, b;
+   input         cin;
+
+   // Pre-computation. Bit 0 of each vector is the carry-in slot: it never
+   // propagates and it generates exactly when cin is set. Bits 14:1 hold
+   // the per-bit propagate (a^b) and generate (a&b) terms.
+   wire [14:0] prop = {a ^ b, 1'b0};
+   wire [14:0] gen  = {a & b, cin};
+
+   // Prefix tree. carry[k] receives brent_kung's c[k+1], which lines up
+   // with prop[k+1] in the sum below.
+   wire [13:0] carry;
+   brent_kung prefix_tree (.c(carry), .p(prop[13:0]), .g(gen[13:0]));
+
+   // Post-computation.
+   assign sum  = prop[14:1] ^ carry;
+   assign cout = gen[14] | (prop[14] & carry[13]);
+
+endmodule // exp_add
+
+// brent_kung: 14-bit Brent-Kung parallel-prefix carry network.
+//   p, g : per-bit propagate / generate inputs, bits 13:0
+//   c    : carries, indexed 14:1, where c[k+1] is the group generate G_k_0
+// The intermediate G_*/P_* signals are not declared and are therefore
+// implicit 1-bit nets. The grey and black cells are defined outside this
+// module; presumably grey produces only a group generate and black
+// produces a generate/propagate pair - confirm against their definitions.
+module brent_kung (c, p, g);
+   
+   input [13:0] p;
+   input [13:0] g;
+   output [14:1] c;
+
+   // parallel-prefix, Brent-Kung
+
+   // Stage 1: Generates G/P pairs that span 1 bits
+   // (b_1_0 is a grey cell despite its b_ instance prefix)
+   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
+   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
+   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
+   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
+   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
+   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
+   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
+
+   // Stage 2: Generates G/P pairs that span 2 bits
+   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
+   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
+   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
+
+   // Stage 3: Generates G/P pairs that span 4 bits
+   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
+
+   // Stage 4: Generates G/P pairs that span 8 bits
+   // (no cells are instantiated in this stage)
+
+   // Stage 5: Generates G/P pairs that span 4 bits
+   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
+
+   // Stage 6: Generates G/P pairs that span 2 bits
+   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
+   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
+   grey g_13_0 (G_13_0, {G_13_12,G_11_0}, P_13_12);
+
+   // Last grey cell stage: fills in the even-indexed group generates
+   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
+   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
+   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
+   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
+   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
+   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
+
+   // Final Stage: Apply c_k+1=G_k_0
+   assign c[1]=g[0];
+   assign c[2]=G_1_0;
+   assign c[3]=G_2_0;
+   assign c[4]=G_3_0;
+   assign c[5]=G_4_0;
+   assign c[6]=G_5_0;
+   assign c[7]=G_6_0;
+   assign c[8]=G_7_0;
+   assign c[9]=G_8_0;
+
+   assign c[10]=G_9_0;
+   assign c[11]=G_10_0;
+   assign c[12]=G_11_0;
+   assign c[13]=G_12_0;
+   assign c[14]=G_13_0;
+
+endmodule // brent_kung
+
--- a/wally-pipelined/src/fpu/dev/fputop.sv
+++ b/wally-pipelined/src/fpu/dev/fputop.sv
@ -1,13 +1,16 @@
-`include "../../../config/rv64icfd/wally-config.vh"

-module fputop (
-  input  logic [2:0]       FrmD,
+`include "wally-config.vh"
+
+module fpu (
+  //input  logic [2:0]       FrmD,
+  input  logic [2:0]       FRM_REGW,    // Rounding mode from CSR
  input  logic             reset,
-  input  logic             clear,
+  //input  logic             clear,     // *** what is this used for?
  input  logic             clk,
  input  logic [31:0]      InstrD,
-  input  logic [`XLEN-1:0] SrcAE,
-  input  logic [`XLEN-1:0] SrcAW,
+  input  logic [`XLEN-1:0] SrcAE,       // Integer input being processed
+  input  logic [`XLEN-1:0] SrcAM,       // Integer input being written into fpreg
+  output logic [4:0]       SetFflagsM,
  output logic [31:0]      FSROutW,
  output logic             DivSqrtDoneE,
  output logic             FInvalInstrD,
@ -62,14 +65,14 @@ module fputop (
  logic                    IllegalFPUInstrFaultD;
  logic                    FRegWriteD;
  logic [2:0]              FResultSelD;
-  //logic [2:0]              FrmD;
+  logic [2:0]              FrmD;
  logic                    PD;
  logic                    DivSqrtStartD;
  logic [3:0]              OpCtrlD;
  logic                    WriteIntD;
  
  //top-level controller for FPU
-  fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Rs1D(InstrD[19:15]), .FrmW(InstrD[14:12]), .WriteEnD(FRegWriteD), .DivSqrtStartD(DivSqrtStartD), .WriteSelD(FResultSelD), .OpCtrlD(OpCtrlD), .FmtD(PD), .WriteIntD(WriteIntD));
+  fctrl ctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Rs1D(InstrD[19:15]), .FrmW(InstrD[14:12]), .WriteEnD(FRegWriteD), .WriteSelD(FResultSelD), .FmtD(PD), .*);

  //instantiation of D stage regfile signals (includes some W stage signals
  //for easy reference)
@ -147,19 +150,19 @@ module fputop (
  //*****************
  //fpregfile D/E pipe registers
  //*****************
-  flopenrc #(64) (clk, reset, PipeClearDE, PipeEnableDE, ReadData1D, ReadData1E);
-  flopenrc #(64) (clk, reset, PipeClearDE, PipeEnableDE, ReadData2D, ReadData2E);
-  flopenrc #(64) (clk, reset, PipeClearDE, PipeEnableDE, ReadData3D, ReadData3E);
+  flopenrc #(64) DEReg1(clk, reset, PipeClearDE, PipeEnableDE, ReadData1D, ReadData1E);
+  flopenrc #(64) DEReg2(clk, reset, PipeClearDE, PipeEnableDE, ReadData2D, ReadData2E);
+  flopenrc #(64) DEReg3(clk, reset, PipeClearDE, PipeEnableDE, ReadData3D, ReadData3E);

  //*****************
  //other  D/E pipe registers
  //*****************
-  flopenrc #(1) (clk, reset, PipeClearDE, PipeEnableDE, FRegWriteD, FRegWriteE);
-  flopenrc #(3) (clk, reset, PipeClearDE, PipeEnableDE, FResultsSelD, FResultsSelE);
-  flopenrc #(3) (clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
-  flopenrc #(1) (clk, reset, PipeClearDE, PipeEnableDE, PD, PE);
-  flopenrc #(4) (clk, reset, PipeClearDE, PipeEnableDE, OpCtrlD, OpCtrlE);
-  flopenrc #(1) (clk, reset, PipeClearDE, PipeEnableDE, DivSqrtStartD, DivSqrtStartE);
+  flopenrc #(1) DEReg4(clk, reset, PipeClearDE, PipeEnableDE, FRegWriteD, FRegWriteE);
+  flopenrc #(3) DEReg5(clk, reset, PipeClearDE, PipeEnableDE, FResultSelD, FResultSelE);
+  flopenrc #(3) DEReg6(clk, reset, PipeClearDE, PipeEnableDE, FrmD, FrmE);
+  flopenrc #(1) DEReg7(clk, reset, PipeClearDE, PipeEnableDE, PD, PE);
+  flopenrc #(4) DEReg8(clk, reset, PipeClearDE, PipeEnableDE, OpCtrlD, OpCtrlE);
+  flopenrc #(1) DEReg9(clk, reset, PipeClearDE, PipeEnableDE, DivSqrtStartD, DivSqrtStartE);

  //
  //END D/E PIPE
@ -172,16 +175,16 @@ module fputop (
  //fma1 ();

  //first and only instance of floating-point divider
-  fpdivsqrt (DivSqrtDone, DivResultM, DivFlagsM, DivDenormM, DivOp1, DivOp2, DivFrm, DivOpType, DivP, DivOvEn, DivUnEn, DivStart, reset, clk);
+  fpdiv fpdivsqrt (.*);

  //first of two-stage instance of floating-point add/cvt unit
-  fpaddcvt1 fpadd1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, AddOp1E, AddOp2E, AddRmE, AddOpTypeE, AddPE, AddOvEnE, AddUnEnE);
+  fpuaddcvt1 fpadd1 (AddSumE, AddSumTcE, AddSelInvE, AddExpPostSumE, AddCorrSignE, AddOp1NormE, AddOp2NormE, AddOpANormE, AddOpBNormE, AddInvalidE, AddDenormInE, AddConvertE, AddSwapE, AddNormOvflowE, AddSignAE, AddFloat1E, AddFloat2E, AddExp1DenormE, AddExp2DenormE, AddExponentE, AddOp1E, AddOp2E, AddRmE, AddOpTypeE, AddPE, AddOvEnE, AddUnEnE);

  //first of two-stage instance of floating-point comparator
-  fpcmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, CmpOp1E, CmpOp2E, CmpSelE);
+  fpucmp1 fpcmp1 (WE, XE, ANaNE, BNaNE, AzeroE, BzeroE, CmpOp1E, CmpOp2E, CmpSelE);

  //first and only instance of floating-point sign converter
-  fpusgn fpsgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SgnOp1, SgnOp2);
+  fpusgn fpsgn (.*);

  //interface between XLEN size datapath and double-precision sized
  //floating-point results
@ -192,14 +195,14 @@ module fputop (
  //truncate to 64 bits
  //(causes warning during compilation - case never reached) 
  if(`XLEN > 64) begin
-        DivOp1 <= ReadData1E[`XLEN:`XLEN-64];
-	DivOp2 <= ReadData2E[`XLEN:`XLEN-64];
-        AddOp1E <= ReadData1E[`XLEN:`XLEN-64];
-	AddOp2E <= ReadData2E[`XLEN:`XLEN-64];
-        CmpOp1E <= ReadData1E[`XLEN:`XLEN-64];
-	CmpOp2E <= ReadData2E[`XLEN:`XLEN-64];
-        SgnOp1E <= ReadData1E[`XLEN:`XLEN-64];
-	SgnOp2E <= ReadData2E[`XLEN:`XLEN-64];
+        DivOp1 <= ReadData1E[`XLEN-1:`XLEN-64];
+	DivOp2 <= ReadData2E[`XLEN-1:`XLEN-64];
+        AddOp1E <= ReadData1E[`XLEN-1:`XLEN-64];
+	AddOp2E <= ReadData2E[`XLEN-1:`XLEN-64];
+        CmpOp1E <= ReadData1E[`XLEN-1:`XLEN-64];
+	CmpOp2E <= ReadData2E[`XLEN-1:`XLEN-64];
+        SgnOp1E <= ReadData1E[`XLEN-1:`XLEN-64];
+	SgnOp2E <= ReadData2E[`XLEN-1:`XLEN-64];
  end
  //zero extend to 64 bits
  else begin
@ -276,63 +279,63 @@ module fputop (
  //*****************
  //fpadd E/M pipe registers
  //*****************
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
-  flopenrc #(4) (clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
-  flopenrc #(11) (clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignM); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
-  flopenrc #(11) (clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
-  flopenrc #(11) (clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
-  flopenrc #(11) (clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddOp1E, AddOp1M); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, AddOp2E, AddOp2M); 
-  flopenrc #(3) (clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
-  flopenrc #(4) (clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 
+  flopenrc #(64) EMRegAdd1(clk, reset, PipeClearEM, PipeEnableEM, AddSumE, AddSumM); 
+  flopenrc #(64) EMRegAdd2(clk, reset, PipeClearEM, PipeEnableEM, AddSumTcE, AddSumTcM); 
+  flopenrc #(4)  EMRegAdd3(clk, reset, PipeClearEM, PipeEnableEM, AddSelInvE, AddSelInvM); 
+  flopenrc #(11) EMRegAdd4(clk, reset, PipeClearEM, PipeEnableEM, AddExpPostSumE, AddExpPostSumM); 
+  flopenrc #(1) EMRegAdd5(clk, reset, PipeClearEM, PipeEnableEM, AddCorrSignE, AddCorrSignM); 
+  flopenrc #(1) EMRegAdd6(clk, reset, PipeClearEM, PipeEnableEM, AddOp1NormE, AddOp1NormM); 
+  flopenrc #(1) EMRegAdd7(clk, reset, PipeClearEM, PipeEnableEM, AddOp2NormE, AddOp2NormM); 
+  flopenrc #(1) EMRegAdd8(clk, reset, PipeClearEM, PipeEnableEM, AddOpANormE, AddOpANormM); 
+  flopenrc #(1) EMRegAdd9(clk, reset, PipeClearEM, PipeEnableEM, AddOpBNormE, AddOpBNormM); 
+  flopenrc #(1) EMRegAdd10(clk, reset, PipeClearEM, PipeEnableEM, AddInvalidE, AddInvalidM); 
+  flopenrc #(1) EMRegAdd11(clk, reset, PipeClearEM, PipeEnableEM, AddDenormInE, AddDenormInM); 
+  flopenrc #(1) EMRegAdd12(clk, reset, PipeClearEM, PipeEnableEM, AddConvertE, AddConvertM); 
+  flopenrc #(1) EMRegAdd13(clk, reset, PipeClearEM, PipeEnableEM, AddSwapE, AddSwapM); 
+  flopenrc #(1) EMRegAdd14(clk, reset, PipeClearEM, PipeEnableEM, AddNormOvflowE, AddNormOvflowM); 
+  flopenrc #(1) EMRegAdd15(clk, reset, PipeClearEM, PipeEnableEM, AddSignAE, AddSignAM); // output named AddSignAM so the fpuaddcvt2 '.*' connection binds to this register
+  flopenrc #(64) EMRegAdd16(clk, reset, PipeClearEM, PipeEnableEM, AddFloat1E, AddFloat1M); 
+  flopenrc #(64) EMRegAdd17(clk, reset, PipeClearEM, PipeEnableEM, AddFloat2E, AddFloat2M); 
+  flopenrc #(11) EMRegAdd18(clk, reset, PipeClearEM, PipeEnableEM, AddExp1DenormE, AddExp1DenormM); 
+  flopenrc #(11) EMRegAdd19(clk, reset, PipeClearEM, PipeEnableEM, AddExp2DenormE, AddExp2DenormM); 
+  flopenrc #(11) EMRegAdd20(clk, reset, PipeClearEM, PipeEnableEM, AddExponentE, AddExponentM); 
+  flopenrc #(64) EMRegAdd21(clk, reset, PipeClearEM, PipeEnableEM, AddOp1E, AddOp1M); 
+  flopenrc #(64) EMRegAdd22(clk, reset, PipeClearEM, PipeEnableEM, AddOp2E, AddOp2M); 
+  flopenrc #(3) EMRegAdd23(clk, reset, PipeClearEM, PipeEnableEM, AddRmE, AddRmM); 
+  flopenrc #(4) EMRegAdd24(clk, reset, PipeClearEM, PipeEnableEM, AddOpTypeE, AddOpTypeM); 
+  flopenrc #(1) EMRegAdd25(clk, reset, PipeClearEM, PipeEnableEM, AddPE, AddPM); 
+  flopenrc #(1) EMRegAdd26(clk, reset, PipeClearEM, PipeEnableEM, AddOvEnE, AddOvEnM); 
+  flopenrc #(1) EMRegAdd27(clk, reset, PipeClearEM, PipeEnableEM, AddUnEnE, AddUnEnM); 

  //*****************
  //fpcmp E/M pipe registers
  //*****************
-  flopenrc #(8) (clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
-  flopenrc #(8) (clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, CmpOp1E, CmpOp1M); 
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, CmpOp2E, CmpOp2M); 
-  flopenrc #(2) (clk, reset, PipeClearEM, PipeEnableEM, CmpSelE, CmpSelM);
+  flopenrc #(8) EMRegCmp1(clk, reset, PipeClearEM, PipeEnableEM, WE, WM); 
+  flopenrc #(8) EMRegCmp2(clk, reset, PipeClearEM, PipeEnableEM, XE, XM); 
+  flopenrc #(1) EMRegcmp3(clk, reset, PipeClearEM, PipeEnableEM, ANaNE, ANaNM); 
+  flopenrc #(1) EMRegCmp4(clk, reset, PipeClearEM, PipeEnableEM, BNaNE, BNaNM); 
+  flopenrc #(1) EMRegCmp5(clk, reset, PipeClearEM, PipeEnableEM, AzeroE, AzeroM); 
+  flopenrc #(1) EMRegCmp6(clk, reset, PipeClearEM, PipeEnableEM, BzeroE, BzeroM); 
+  flopenrc #(64) EMRegCmp7(clk, reset, PipeClearEM, PipeEnableEM, CmpOp1E, CmpOp1M); 
+  flopenrc #(64) EMRegCmp8(clk, reset, PipeClearEM, PipeEnableEM, CmpOp2E, CmpOp2M); 
+  flopenrc #(2) EMRegCmp9(clk, reset, PipeClearEM, PipeEnableEM, CmpSelE, CmpSelM);

  //put this in for the event we want to delay fsgn - will otherwise bypass
  //*****************
  //fpsgn E/M pipe registers
  //***************** 
-  flopenrc #(2) (clk, reset, PipeClearEM, PipeEnableEM, SgnOpCodeE, SgnOpCodeM);
-  flopenrc #(64) (clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
-  flopenrc #(5) (clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);
+  flopenrc #(2) EMRegSgn1(clk, reset, PipeClearEM, PipeEnableEM, SgnOpCodeE, SgnOpCodeM);
+  flopenrc #(64) EMRegSgn2(clk, reset, PipeClearEM, PipeEnableEM, SgnResultE, SgnResultM);
+  flopenrc #(5) EMRegSgn3(clk, reset, PipeClearEM, PipeEnableEM, SgnFlagsE, SgnFlagsM);

  //*****************
  //other E/M pipe registers
  //*****************
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, FRegWriteE, FRegWriteM);
-  flopenrc #(3) (clk, reset, PipeClearEM, PipeEnableEM, FResultsSelE, FResultsSelM);
-  flopenrc #(3) (clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
-  flopenrc #(1) (clk, reset, PipeClearEM, PipeEnableEM, PE, PM);
-  flopenrc #(4) (clk, reset, PipeClearEM, PipeEnableEM, OpCtrlE, OpCtrlM);
+  flopenrc #(1) EMReg1(clk, reset, PipeClearEM, PipeEnableEM, FRegWriteE, FRegWriteM);
+  flopenrc #(3) EMReg2(clk, reset, PipeClearEM, PipeEnableEM, FResultSelE, FResultSelM);
+  flopenrc #(3) EMReg3(clk, reset, PipeClearEM, PipeEnableEM, FrmE, FrmM);
+  flopenrc #(1) EMReg4(clk, reset, PipeClearEM, PipeEnableEM, PE, PM);
+  flopenrc #(4) EMReg5(clk, reset, PipeClearEM, PipeEnableEM, OpCtrlE, OpCtrlM);

  //
  //END E/M PIPE
@ -345,10 +348,10 @@ module fputop (
  //fma2 ();

  //second instance of two-stage floating-point add/cvt unit
-  fpaddcvt2 fpadd2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, AddOp1M, AddOp2M, AddRmM, AddOpTypeM, AddPM, AddOvEnM, AddUnEnM);
+  fpuaddcvt2 fpadd2 (.*);

  //second instance of two-stage floating-point comparator
-  fpcmp2 fpcmp2 (CmpInvalidM, CmpFCCM, ANaNM, BNaNM, AzeroM, BzeroM, WM, XM, CmpSelM, CmpOp1M, CmpOp2M);
+  fpucmp2 fpcmp2 (CmpInvalidM, CmpFCCM, ANaNM, BNaNM, AzeroM, BzeroM, WM, XM, CmpSelM, CmpOp1M, CmpOp2M);

  //
  //END MEMORY STAGE
@ -371,8 +374,13 @@ module fputop (
  logic [63:0]             DivResultW;
  logic [4:0]              DivFlagsW;

+  //instantiation of W stage fsgn signals
+  logic [63:0]            SgnResultW;
+  logic [4:0]             SgnFlagsW;
+
  //instantiation of W stage regfile signals
  logic [`XLEN-1:0]        ReadData1W, ReadData2W, ReadData3W;
+  logic [`XLEN-1:0]        SrcAW;

  //instantiation of W stage add/cvt signals
  logic [63:0]             AddResultW;
@ -390,35 +398,35 @@ module fputop (
  //*****************
  //fpdiv M/W pipe registers
  //*****************
-  flopenrc #(64) (clk, reset, PipeClearMW, PipeEnableMW, DivResultM, DivResultW); 
-  flopenrc #(5) (clk, reset, PipeClearMW, PipeEnableMW, DivFlagsM, DivFlagsW);
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 
+  flopenrc #(64) MWRegDiv1(clk, reset, PipeClearMW, PipeEnableMW, DivResultM, DivResultW); 
+  flopenrc #(5) MWRegDiv2(clk, reset, PipeClearMW, PipeEnableMW, DivFlagsM, DivFlagsW);
+  flopenrc #(1) MWRegDiv3(clk, reset, PipeClearMW, PipeEnableMW, DivDenormM, DivDenormW); 

  //*****************
  //fpadd M/W pipe registers
  //*****************
-  flopenrc #(64) (clk, reset, PipeClearMW, PipeEnableMW, AddResultM, AddResultW); 
-  flopenrc #(5) (clk, reset, PipeClearMW, PipeEnableMW, AddFlagsM, AddFlagsW); 
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, AddDenormM, AddDenormW); 
+  flopenrc #(64) MWRegAdd1(clk, reset, PipeClearMW, PipeEnableMW, AddResultM, AddResultW); 
+  flopenrc #(5) MWRegAdd2(clk, reset, PipeClearMW, PipeEnableMW, AddFlagsM, AddFlagsW); 
+  flopenrc #(1) MWRegAdd3(clk, reset, PipeClearMW, PipeEnableMW, AddDenormM, AddDenormW); 

  //*****************
  //fpcmp M/W pipe registers
  //*****************
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
-  flopenrc #(2) (clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 
+  flopenrc #(1) MWRegCmp1(clk, reset, PipeClearMW, PipeEnableMW, CmpInvalidM, CmpInvalidW); 
+  flopenrc #(2) MWRegCmp2(clk, reset, PipeClearMW, PipeEnableMW, CmpFCCM, CmpFCCW); 

  //*****************
  //fpsgn M/W pipe registers
  //***************** 
-  flopenrc #(64) (clk, reset, PipeClearMW, PipeEnableMw, SgnResultM, SgnResultW);
-  flopenrc #(5) (clk, reset, PipeClearMw, PipeEnableMw, SgnFlagsM, SgnFlagsW);
+  flopenrc #(64) MWRegSgn1(clk, reset, PipeClearMW, PipeEnableMW, SgnResultM, SgnResultW);
+  flopenrc #(5) MWRegSgn2(clk, reset, PipeClearMW, PipeEnableMW, SgnFlagsM, SgnFlagsW);

  //*****************
  //other M/W pipe registers
  //*****************
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, FRegWriteM, FRegWriteW);
-  flopenrc #(3) (clk, reset, PipeClearMW, PipeEnableMW, FResultsSelM, FResultsSelW);
-  flopenrc #(1) (clk, reset, PipeClearMW, PipeEnableMW, PM, PW);
+  flopenrc #(1) MWReg1(clk, reset, PipeClearMW, PipeEnableMW, FRegWriteM, FRegWriteW);
+  flopenrc #(3) MWReg2(clk, reset, PipeClearMW, PipeEnableMW, FResultSelM, FResultSelW);
+  flopenrc #(1) MWReg3(clk, reset, PipeClearMW, PipeEnableMW, PM, PW);

  ////END M/W PIPE
  //*****************************************
--- a/wally-pipelined/src/fpu/fpuaddcvt1.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt1.sv
@ -0,0 +1,200 @@
+//
+// File name : fpadd
+// Title     : Floating-Point Adder/Subtractor
+// project   : FPU
+// Library   : fpadd
+// Author(s) : James E. Stine, Jr., Brett Mathis
+// Purpose   : definition of main unit to floating-point add/sub
+// notes :   
+//
+// Copyright Oklahoma State University
+// Copyright AFRL
+//
+// Basic and Denormalized Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
+//         or if (exp1 = exp2 AND mnt1 < mnt2)
+// Step 4: Shift the mantissa corresponding to the smaller exponent,
+//          and extend precision by three bits to the right.
+// Step 5: Add or subtract the mantissas.
+// Step 6: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the
+//   left of the binary point is 1.
+// Step 7: Round the result.
+// Step 8: Put sum onto output.
+//
+
+
+// fpuaddcvt1: first stage of the two-stage floating-point add/convert
+// unit. It converts and classifies the operands, orders them by exponent,
+// aligns the smaller mantissa and produces the raw 64-bit sum together
+// with its two's complement. Every output is a plain combinational net.
+// The inputs rm, OvEn and UnEn are part of the port list but are not
+// referenced inside this module.
+module fpuaddcvt1 (sum, sum_tc, sel_inv, exponent_postsum, corr_sign, op1_Norm, op2_Norm, opA_Norm, opB_Norm, Invalid, DenormIn, convert, swap, normal_overflow, signA, Float1, Float2, exp1_denorm, exp2_denorm, exponent, op1, op2, rm, op_type, Pin, OvEn, UnEn);
+
+   input [63:0] op1;		// 1st input operand (A)
+   input [63:0] op2;		// 2nd input operand (B)
+   input [2:0] 	rm;		// Rounding mode - specify values 
+   input [3:0]	op_type;	// Function opcode
+   input 	Pin;   		// Result Precision (0 for double, 1 for single)
+   input 	OvEn;		// Overflow trap enabled
+   input 	UnEn;   	// Underflow trap enabled
+
+   // Effective precision: forced to 1 whenever op_type[2] is set.
+   wire          P;
+   assign P = Pin | op_type[2];
+
+   wire [63:0] 	 IntValue;
+   wire [11:0] 	 exp1, exp2;
+   wire [11:0] 	 exp_diff1, exp_diff2;
+   wire [11:0] 	 exp_shift;
+   wire [51:0] 	 mantissaA;
+   wire [56:0] 	 mantissaA1;
+   wire [63:0] 	 mantissaA3;
+   wire [51:0] 	 mantissaB; 
+   wire [56:0] 	 mantissaB1, mantissaB2;
+   wire [63:0] 	 mantissaB3;
+   wire 	 exp_gt63;
+   wire 	 Sticky_out;
+   wire          sub;
+   wire 	 zeroB;
+   wire [5:0]	 align_shift; 
+
+   output [63:0] 	 Float1; 
+   output [63:0] 	 Float2;
+   output [10:0] 	 exponent;
+   output [10:0]	 exponent_postsum;
+   output [10:0]	 exp1_denorm, exp2_denorm;
+   output [63:0] sum, sum_tc;
+   output [3:0]  sel_inv;
+   output        corr_sign;
+   output 	 signA;
+   output	 op1_Norm, op2_Norm;
+   output	 opA_Norm, opB_Norm;
+   output	 Invalid;
+   output 	 DenormIn;
+//   output 	 exp_valid;
+   output 	 convert;
+   output        swap;
+   output 	 normal_overflow;
+   wire [5:0]	 ZP_mantissaA;
+   wire [5:0]	 ZP_mantissaB;
+   wire		 ZV_mantissaA;
+   wire		 ZV_mantissaB;
+
+   // Convert the input operands to their appropriate forms based on 
+   // the original operands, the op_type, and their precision P. 
+   // Single precision inputs are converted to double precision 
+   // and the sign of the first operand is set appropriately based on
+   // if the operation is absolute value or negation. 
+
+   convert_inputs conv1 (Float1, Float2, op1, op2, op_type, P);
+
+   // Test for exceptions and return the "Invalid Operation" and
+   // "Denormalized" Input Flags. The "sel_inv" is used in
+   // the third pipeline stage to select the result. Also, op1_Norm
+   // and op2_Norm are one if op1 and op2 are not zero or denormalized.
+   // sub is one if the effective operation is subtraction. 
+
+   exception exc1 (sel_inv, Invalid, DenormIn, op1_Norm, op2_Norm, sub, 
+		   Float1, Float2, op_type);
+
+   // Perform Exponent Subtraction (used for alignment). For performance
+   // both exponent subtractions are performed in parallel. This was 
+   // changed to a behavior level to allow the tools to try to optimize
+   // the two parallel additions. The input values are zero-extended to 12 
+   // bits prior to performing the addition. 
+   // When DenormIn is set, exp_diff2 is computed with each operand's sign
+   // bit in place of the zero extension.
+
+   assign exp1 = {1'b0, Float1[62:52]};
+   assign exp2 = {1'b0, Float2[62:52]};
+   assign exp_diff1 = exp1 - exp2;
+   assign exp_diff2 = DenormIn ? ({Float2[63], exp2[10:0]} - {Float1[63], exp1[10:0]}): exp2 - exp1;
+
+   // The second operand (B) should be set to zero, if op_type does not
+   // specify addition or subtraction
+   assign zeroB = op_type[2] | op_type[1];
+
+   // Swapped operands if zeroB is not one and exp1 < exp2. 
+   // Swapping causes exp2 to be used for the result exponent. 
+   // Only the exponent of the larger operand is used to determine
+   // the final result. 
+   assign swap = exp_diff1[11] & ~zeroB;
+   assign exponent = swap ? exp2[10:0] : exp1[10:0];
+   assign exponent_postsum = swap ? exp2[10:0] : exp1[10:0];
+   assign mantissaA = swap ? Float2[51:0] : Float1[51:0];
+   assign mantissaB = swap ? Float1[51:0] : Float2[51:0];
+   assign signA     = swap ? Float2[63] : Float1[63];   
+
+   // Leading-zero detectors on the two (possibly swapped) 52-bit
+   // mantissas, used to detect leading zeroes on denormalized mantissas.
+   // Only the ZP_* outputs are used below; ZV_* are left unconnected
+   // downstream.
+   lz52 lz_norm_1 (ZP_mantissaA, ZV_mantissaA, mantissaA);
+   lz52 lz_norm_2 (ZP_mantissaB, ZV_mantissaB, mantissaB);
+
+   // Denormalized exponents created by subtracting the leading zeroes from the original exponents
+   // (the 12-bit differences are truncated to the 11-bit outputs)
+   assign exp1_denorm = swap ? (exp1 - ZP_mantissaB) : (exp1 - ZP_mantissaA);
+   assign exp2_denorm = swap ? (exp2 - ZP_mantissaA) : (exp2 - ZP_mantissaB);
+
+   // Determine the alignment shift and limit it to 63. If any bit from 
+   // exp_shift[6] to exp_shift[11] is one, then shift is set to all ones. 
+   assign exp_shift = swap ? exp_diff2 : exp_diff1;
+   assign exp_gt63 = exp_shift[11] | exp_shift[10] | exp_shift[9] 
+     | exp_shift[8] | exp_shift[7] | exp_shift[6];
+   // Only the low six bits of exp_shift take part; the slice makes the
+   // former implicit 12-to-6-bit truncation explicit.
+   assign align_shift = exp_shift[5:0] | {6{exp_gt63}};
+
+   // Unpack the 52-bit mantissas to 57-bit numbers of the form.
+   //    001.M[51]M[50] ... M[1]M[0]00
+   // Unless the number has an exponent of zero, in which case it
+   // is unpacked as
+   //    000.00 ... 00
+   // This effectively flushes denormalized values to zero. 
+   // The three bits to the left of the binary point prevent overflow
+   // and loss of sign information. The two bits to the right of the 
+   // original mantissa form the "guard" and "round" bits that are used
+   // to round the result. 
+   assign opA_Norm = swap ? op2_Norm : op1_Norm;
+   assign opB_Norm = swap ? op1_Norm : op2_Norm;
+   assign mantissaA1 = {2'h0, opA_Norm, mantissaA[51:0]&{52{opA_Norm}}, 2'h0};
+   assign mantissaB1 = {2'h0, opB_Norm, mantissaB[51:0]&{52{opB_Norm}}, 2'h0};
+
+   // Perform mantissa alignment using a 57-bit barrel shifter 
+   // If any of the bits shifted out are one, Sticky_out is set. 
+   // The size of the barrel shifter could be reduced by two bits
+   // by not adding the leading two zeros until after the shift. 
+   barrel_shifter_r57 bs1 (mantissaB2, Sticky_out, mantissaB1, align_shift);
+
+   // Place either the sign-extended 32-bit value or the original 64-bit value 
+   // into IntValue (to be used for integer to floating point conversion)
+   assign IntValue [31:0] = op1[31:0];
+   assign IntValue [63:32] = op_type[0] ? {32{op1[31]}} : op1[63:32];
+
+   // Select the first adder operand, in priority order:
+   //   op_type[3] : Float1 (op_type[0] = 1) or ~Float1 (op_type[0] = 0)
+   //   DenormIn   : the raw 52-bit mantissaA, zero-extended
+   //   convert    : IntValue (integer to floating point conversion)
+   //   otherwise  : mantissaA1 extended to 64 bits with seven zero LSBs
+   assign convert       = ~op_type[2] & op_type[1];
+   assign mantissaA3    = (op_type[3]) ? (op_type[0] ? Float1 : ~Float1) : (DenormIn ? ({12'h0, mantissaA}) : (convert ? IntValue : {mantissaA1, 7'h0}));
+
+   // Select the second adder operand:
+   //   op_type[3] : the constant 64'h1
+   //   DenormIn   : the raw 52-bit mantissaB, zero-extended
+   //   otherwise  : zero if zeroB is one; else the aligned mantissaB2
+   //                followed by the Sticky_out bit and six zeros
+   assign mantissaB3[63:7] = (op_type[3]) ? (57'h0) : (DenormIn ? {12'h0, mantissaB[51:7]} : mantissaB2 & {57{~zeroB}});
+   assign mantissaB3[6]    = (op_type[3]) ? (1'b0) : (DenormIn ? mantissaB[6] : Sticky_out & ~zeroB);
+   assign mantissaB3[5:0]  = (op_type[3]) ? (6'h01) : (DenormIn ? mantissaB[5:0] : 6'h0);
+
+   // The sign of the result needs to be corrected if the true
+   // operation is subtraction and the input operands were swapped. 
+   assign corr_sign = ~op_type[2]&~op_type[1]&op_type[0]&swap;
+
+   // 64-bit Mantissa Adder/Subtractor
+   cla64 add1 (sum, mantissaA3, mantissaB3, sub);
+
+   // 64-bit Mantissa Subtractor - to get the two's complement of the 
+   // result when the sign from the adder/subtractor is negative. 
+   cla_sub64 sub1 (sum_tc, mantissaB3, mantissaA3);
+ 
+   // normal_overflow is forced to one when a denormalized input is
+   // present, the sum is zero, at least one operand is normal and
+   // op_type[0] is clear. Otherwise it is bit 52 of the magnitude:
+   // sum_tc[52] when sum is negative (sum[63] set), else sum[52].
+   // The zero compare now uses a literal of the same width as sum.
+   assign normal_overflow = (DenormIn & (sum == 64'h0) & (opA_Norm | opB_Norm) & ~op_type[0]) ? 1'b1 : (sum[63] ? sum_tc[52] : sum[52]);
+
+endmodule // fpuaddcvt1
+
+
--- a/wally-pipelined/src/fpu/fpuaddcvt2.sv
+++ b/wally-pipelined/src/fpu/fpuaddcvt2.sv
@ -0,0 +1,156 @@
+//
+// File name : fpadd
+// Title     : Floating-Point Adder/Subtractor
+// project   : FPU
+// Library   : fpadd
+// Author(s) : James E. Stine, Jr., Brett Mathis
+// Purpose   : definition of main unit to floating-point add/sub
+// notes :   
+//
+// Copyright Oklahoma State University
+// Copyright AFRL
+//
+// Basic and Denormalized Operations
+//
+// Step 1: Load operands, set flags, and convert SP to DP
+// Step 2: Check for special inputs ( +/- Infinity,  NaN)
+// Step 3: Compare exponents.  Swap the operands if exp1 < exp2
+//         or if (exp1 = exp2 AND mnt1 < mnt2)
+// Step 4: Shift the mantissa corresponding to the smaller exponent, 
+//          and extend precision by three bits to the right.
+// Step 5: Add or subtract the mantissas.
+// Step 6: Normalize the result.
+//   Shift left until normalized.  Normalized when the value to the 
+//   left of the binary point is 1.
+// Step 7: Round the result.
+// Step 8: Put the sum onto the output.
+//
+
+
+// fpuaddcvt2: back half of the floating-point add/subtract/convert datapath.
+// It consumes the mantissa adder outputs (AddSumM, AddSumTcM) together with
+// operand classification signals and performs, in order: pre-round exponent
+// selection, result sign correction, selection of the positive sum,
+// leading-zero detection, normalization shift, and rounding.
+// NOTE(review): the producers of the input signals are not visible in this
+// module; presumably they come from the first add/convert stage - confirm.
+module fpuaddcvt2 (AddResultM, AddFlagsM, AddDenormM, AddSumM, AddSumTcM, AddSelInvM, AddExpPostSumM, AddCorrSignM, AddOp1NormM, AddOp2NormM, AddOpANormM, AddOpBNormM, AddInvalidM, AddDenormInM, AddConvertM, AddSwapM, AddNormOvflowM, AddSignAM, AddFloat1M, AddFloat2M, AddExp1DenormM, AddExp2DenormM, AddExponentM, AddOp1M, AddOp2M, AddRmM, AddOpTypeM, AddPM, AddOvEnM, AddUnEnM);
+
+   input [63:0]  AddOp1M;          // 1st input operand (A)
+   input [63:0]  AddOp2M;          // 2nd input operand (B)
+   input [2:0]   AddRmM;           // Rounding mode - specify values
+   input [3:0]   AddOpTypeM;       // Function opcode
+   input         AddPM;            // Result Precision (0 for double, 1 for single)
+   input         AddOvEnM;         // Overflow trap enabled
+   input         AddUnEnM;         // Underflow trap enabled
+   input [63:0]  AddSumM, AddSumTcM; // Adder/subtractor result and its two's-complement counterpart
+   input [63:0]  AddFloat1M;
+   input [63:0]  AddFloat2M;
+   input [10:0]  AddExp1DenormM, AddExp2DenormM;
+   input [10:0]  AddExponentM, AddExpPostSumM;
+   input [3:0]   AddSelInvM;
+   input         AddOp1NormM, AddOp2NormM;
+   input         AddOpANormM, AddOpBNormM;
+   input         AddInvalidM;
+   input         AddDenormInM;
+   input         AddSignAM;
+   input         AddCorrSignM;
+   input         AddConvertM;
+   input         AddSwapM;
+   input         AddNormOvflowM;
+
+   output [63:0] AddResultM;       // Result of operation
+   output [4:0]  AddFlagsM;        // IEEE exception flags
+   output        AddDenormM;       // Denorm on input or output
+
+   // Effective precision select handed to the rounder.
+   wire          P;
+   assign P = AddPM | AddOpTypeM[2];
+
+   wire [10:0]   exp_pre;
+   wire [63:0]   Result;
+   wire [63:0]   sum_norm, sum_norm_w_bypass;
+   wire [5:0]    norm_shift, norm_shift_denorm;
+   wire          DenormIO;
+   wire [4:0]    FlagsIn;
+   wire          sign_corr;
+   // Driven by the leading-zero detector. Declared explicitly so the design
+   // does not rely on an implicit 1-bit net.
+   wire          exp_valid;
+   wire          mantissa_comp;
+   wire          mantissa_comp_sum;
+   wire          mantissa_comp_sum_tc;
+   wire          Float1_sum_comp;
+   wire          Float2_sum_comp;
+   wire          Float1_sum_tc_comp;
+   wire          Float2_sum_tc_comp;
+   wire          normal_underflow;
+   wire [63:0]   sum_corr;
+
+   // Exponent value before rounding.
+   //  - Denormalized input: 1 when the normalization shift is exactly 11,
+   //    otherwise the denorm exponent of the operand selected by AddSwapM.
+   //  - Conversion: the constant 1084 (11'b10000111100).
+   //  - Otherwise: AddExponentM unchanged.
+   assign exp_pre       = AddDenormInM ?
+                          ((norm_shift == 6'b001011) ? 11'b00000000001 : (AddSwapM ? AddExp2DenormM : AddExp1DenormM))
+                          : (AddConvertM ? 11'b10000111100 : AddExponentM);
+
+   // Compare the 52 fraction bits of each Float operand against the low 52
+   // bits of the sum and of the two's-complement sum. Each flag is 0 when
+   // the operand fraction is strictly greater and 1 otherwise.
+   assign Float1_sum_comp = (AddFloat1M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
+   assign Float2_sum_comp = (AddFloat2M[51:0] > AddSumM[51:0]) ? 1'b0 : 1'b1;
+   assign Float1_sum_tc_comp = (AddFloat1M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
+   assign Float2_sum_tc_comp = (AddFloat2M[51:0] > AddSumTcM[51:0]) ? 1'b0 : 1'b1;
+
+   // Pick which operand's comparison to use based on AddSwapM.
+   assign mantissa_comp_sum = AddSwapM ? Float2_sum_comp : Float1_sum_comp;
+   assign mantissa_comp_sum_tc = AddSwapM ? Float2_sum_tc_comp : Float1_sum_tc_comp;
+
+   // Pick the sum or two's-complement comparison from the operation bit and
+   // the sign bit of the sum.
+   assign mantissa_comp = (AddOpTypeM[0] ^ AddSumM[63]) ? mantissa_comp_sum_tc : mantissa_comp_sum;
+
+   // The normal-underflow bit is only taken from mantissa_comp when the two
+   // Float sign bits are EQUAL (XNOR) and at least one of opA/opB is
+   // normalized; otherwise it is forced to 0.
+   // NOTE(review): the original comment said "signs are different", which
+   // contradicts the XNOR in the code - confirm which one is intended.
+   assign normal_underflow = ((AddFloat1M[63] ~^ AddFloat2M[63]) & (AddOpANormM | AddOpBNormM)) ? mantissa_comp : 1'b0;
+
+   // Determine the correct sign of the result.
+   assign sign_corr = ((AddCorrSignM ^ AddSignAM) & ~AddConvertM) ^ AddSumM[63];
+
+   // If the sum is negative, use its two's complement instead.
+   // This value has to be 64 bits to correctly handle the case 10...00.
+   // For the denormalized effective-subtract case the selection is inverted,
+   // and when AddOpTypeM[3] is set the raw sum is passed through untouched.
+   assign sum_corr = (AddDenormInM & (AddOpANormM | AddOpBNormM) & ( ( (AddFloat1M[63] ~^ AddFloat2M[63]) & AddOpTypeM[0] ) | ((AddFloat1M[63] ^ AddFloat2M[63]) & ~AddOpTypeM[0]) ))
+                     ? (AddSumM[63] ? AddSumM : AddSumTcM) : ( (AddOpTypeM[3]) ? AddSumM : (AddSumM[63] ? AddSumTcM : AddSumM));
+
+   // Normal overflow: bit 52 of the positive sum, or forced to 1 for a
+   // denormalized input whose sum is zero.
+   // NOTE(review): AddNormOvflowM is declared as an input yet is driven
+   // here, so the net has a second driver inside the module. Left unchanged
+   // because the external driver is not visible from this file - confirm
+   // and remove one of the two.
+   assign AddNormOvflowM = (AddDenormInM & (AddSumM == 64'h0) & (AddOpANormM | AddOpBNormM) & ~AddOpTypeM[0]) ? 1'b1 : (AddSumM[63] ? AddSumTcM[52] : AddSumM[52]);
+
+   // Leading-zero detector. Determines the size of the shift needed for
+   // normalization. Per the original notes, exp_valid is zero when sum_corr
+   // is all zeros and one otherwise.
+   lz64 lzd1 (norm_shift, exp_valid, sum_corr);
+
+   // Suppress the normalization shift for denormalized inputs when both
+   // opA and opB are not normalized, or when normal_underflow is set.
+   assign norm_shift_denorm = (AddDenormInM & ( (~AddOpANormM & ~AddOpBNormM) | normal_underflow)) ? (6'h00) : (norm_shift);
+
+   // Barrel shifter used for normalization: shifts sum_corr by
+   // norm_shift_denorm and outputs the normalized sum.
+   // NOTE(review): the module name suggests a LEFT shift, while the original
+   // comment said "right shifted" - confirm against barrel_shifter_l64.
+   barrel_shifter_l64 bs2 (sum_norm, sum_corr, norm_shift_denorm);
+
+   // When AddOpTypeM[3] is set, bypass the normalizer and forward sum_corr,
+   // inverted if AddOpTypeM[0] is also set.
+   assign sum_norm_w_bypass = (AddOpTypeM[3]) ? (AddOpTypeM[0] ? ~sum_corr : sum_corr) : (sum_norm);
+
+   // Round the mantissa to a 52-bit value, with the leading one
+   // removed. If the result is a single precision number, the actual
+   // mantissa is in the upper 23 bits and the lower 29 bits are zero.
+   // At this point, normalization has already been performed, so we know
+   // exactly where the rounding point is. The rounding unit also
+   // handles special cases and sets the exception flags.
+   rounder round1 (Result, DenormIO, FlagsIn, AddRmM, P, AddOvEnM, AddUnEnM, exp_valid,
+                   AddSelInvM, AddInvalidM, AddDenormInM, AddConvertM, sign_corr, exp_pre, norm_shift, sum_norm_w_bypass,
+                   AddExpPostSumM, AddOp1NormM, AddOp2NormM, AddFloat1M[63:52], AddFloat2M[63:52],
+                   AddNormOvflowM, normal_underflow, AddSwapM, AddOpTypeM, AddSumM);
+
+   // Drive the module outputs from the rounder results.
+   assign AddResultM = Result;
+   assign {AddDenormM, AddFlagsM} = {DenormIO, FlagsIn};
+
+endmodule // fpuaddcvt2
+
+
--- a/wally-pipelined/src/fpu/fpucmp1.sv
+++ b/wally-pipelined/src/fpu/fpucmp1.sv
@ -0,0 +1,235 @@
+//
+// File name : fpcomp.v
+// Title     : Floating-Point Comparator
+// project   : FPU
+// Library   : fpcomp
+// Author(s) : James E. Stine
+// Purpose   : definition of main unit to floating-point comparator
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Floating Point Comparator (Algorithm)
+//
+// 1.) Performs sign-extension if the inputs are 32-bit integers.
+// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
+// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
+//     and correct for sign bits
+//
+// This module takes 64-bits inputs op1 and op2, VSS, and VDD
+// signals, and a 2-bit signal Sel that indicates the type of 
+// operands being compared as indicated below.
+//	Sel	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	(unused)
+//
+// The comparator produces a 2-bit signal FCC, which
+// indicates the result of the comparison:
+//
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+//
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN per 754
+
+// fpucmp1: first half of the split floating-point comparator.
+// Produces the 8-bit partial comparison vectors w and x plus the
+// NaN / zero classification of both operands.
+module fpucmp1 (
+   output logic [7:0]  w, x,          // partial comparison results from the compare tree
+   output logic        ANaN, BNaN,    // operand is NaN
+   output logic        Azero, Bzero,  // operand is +0 or -0
+   input  logic [63:0] op1,
+   input  logic [63:0] op2,
+   input  logic [1:0]  Sel            // operand format select
+);
+
+   // Operands as presented to the magnitude comparator: the sign bit is
+   // inverted and the remaining 63 bits are passed through unchanged.
+   logic [63:0] cmpA;
+   logic [63:0] cmpB;
+
+   assign cmpA = {~op1[63], op1[62:0]};
+   assign cmpB = {~op2[63], op2[62:0]};
+
+   // First levels of the magnitude comparison tree.
+   magcompare64b_1 magcomp2 (.w(w), .x(x), .A(cmpA), .B(cmpB));
+
+   // Special-case classification (NaN and zero) of the raw operands.
+   exception_cmp_1 exc1 (.ANaN(ANaN), .BNaN(BNaN),
+                         .Azero(Azero), .Bzero(Bzero),
+                         .A(op1), .B(op2), .Sel(Sel));
+
+endmodule // fpucmp1
+
+// module magcompare2b (LT, GT, A, B);
+
+//    input logic [1:0] A;
+//    input logic [1:0] B;
+   
+//    output logic     LT;
+//    output logic     GT;
+
+//    // Determine if A < B  using a minimized sum-of-products expression
+//    assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+//    // Determine if A > B  using a minimized sum-of-products expression
+//    assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
+
+// endmodule // magcompare2b
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
+// this version actually incorporates don't cares into the equation to
+// simplify the optimization
+
+// magcompare2c: simplified 2-bit comparison cell.
+// The equations exploit don't-care input combinations, so this is not a
+// general-purpose 2-bit comparator (e.g. LT is asserted whenever B[1] is 1).
+//   LT = B[1] | (~A[1] & B[0])
+//   GT = A[1] | (~B[1] & A[0])
+module magcompare2c (
+   output logic       LT,
+   output logic       GT,
+   input  logic [1:0] A,
+   input  logic [1:0] B
+);
+
+   assign LT = B[1] | (B[0] & ~A[1]);
+   assign GT = A[1] | (A[0] & ~B[1]);
+
+endmodule // magcompare2c
+
+// This module compares two 64-bit values A and B. LT is '1' if A < B 
+// and EQ is '1' if A = B. LT and EQ are both '0' if A > B.
+// This structure was modified so
+// that it only does a strict magnitude comparison, and only
+// returns flags for less than (LT) and equal to (EQ). It uses a tree 
+// of 63 2-bit magnitude comparators, followed by one OR gate.
+// Note: this module contains only the first three levels of that tree and
+// outputs the 8-bit partial results w and x.
+//
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+// magcompare64b_1: first three levels of the 64-bit comparison tree.
+//   level 1: 32 magcompare2b cells, one per 2-bit slice of A and B -> s, t
+//   level 2: 16 magcompare2c cells merging adjacent pairs of (t, s) -> u, v
+//   level 3:  8 magcompare2c cells merging adjacent pairs of (v, u) -> w, x
+// Note that the operand order handed to the cells alternates between
+// levels (t/s, then v/u), exactly as in the original hand-unrolled netlist.
+module magcompare64b_1 (
+   output logic [7:0]  w,
+   output logic [7:0]  x,
+   input  logic [63:0] A,
+   input  logic [63:0] B
+);
+
+   logic [31:0] s, t;   // level-1 outputs
+   logic [15:0] u, v;   // level-2 outputs
+
+   genvar k;
+   generate
+      // Level 1: compare each 2-bit slice of the operands.
+      for (k = 0; k < 32; k = k + 1) begin : level1
+         magcompare2b mag (s[k], t[k], A[2*k +: 2], B[2*k +: 2]);
+      end
+
+      // Level 2: merge neighbouring level-1 results.
+      for (k = 0; k < 16; k = k + 1) begin : level2
+         magcompare2c mag (u[k], v[k], t[2*k +: 2], s[2*k +: 2]);
+      end
+
+      // Level 3: merge neighbouring level-2 results into the outputs.
+      for (k = 0; k < 8; k = k + 1) begin : level3
+         magcompare2c mag (w[k], x[k], v[2*k +: 2], u[2*k +: 2]);
+      end
+   endgenerate
+
+endmodule // magcompare64b_1
+
+// This module takes 64-bits inputs A and B, two magnitude comparison
+// flags LT_mag and EQ_mag, and a 2-bit signal Sel that indicates the type of 
+// operands being compared as indicated below.
+//	Sel	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	bfloat precision numbers
+//
+// The comparator produces a 2-bit signal fcc, which
+// indicates the result of the comparison as follows:
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+// It also produces a invalid operation flag, which is one
+// if either of the input operands is a signaling NaN.
+
+// exception_cmp_1: classifies the two comparator operands.
+//   ANaN / BNaN   - operand is a NaN in the format selected by Sel
+//   Azero / Bzero - the 63 low bits of the operand are all zero (+0 or -0)
+// Sel encoding: 00 double, 01 single, 10 half. With Sel = 11 none of the
+// format selects is active, so ANaN and BNaN are 0.
+// Field positions used below (operand is left-aligned in the 64-bit word):
+//   double: exponent [62:52], fraction [51:0]
+//   single: exponent [62:55], fraction [54:32]
+//   half  : exponent [62:58], fraction [57:48]
+module exception_cmp_1 (ANaN, BNaN, Azero, Bzero, A, B, Sel);
+
+   input logic [63:0] A;
+   input logic [63:0] B;
+   input logic [1:0]  Sel;
+
+   output logic       ANaN;
+   output logic       BNaN;
+   output logic       Azero;
+   output logic       Bzero;
+
+   logic              dp, sp, hp;
+
+   assign dp = !Sel[1]&!Sel[0];
+   assign sp = !Sel[1]&Sel[0];
+   assign hp = Sel[1]&!Sel[0];
+
+   // A value is a NaN when its exponent is all ones and its fraction is
+   // non-zero. The whole fraction is examined: checking only the two most
+   // significant fraction bits misses NaNs whose payload lives in the lower
+   // bits (e.g. a signaling NaN with fraction = 1).
+   assign ANaN = (dp & (&A[62:52]) & (|A[51:0]))  |
+                 (sp & (&A[62:55]) & (|A[54:32])) |
+                 (hp & (&A[62:58]) & (|A[57:48]));
+
+   assign BNaN = (dp & (&B[62:52]) & (|B[51:0]))  |
+                 (sp & (&B[62:55]) & (|B[54:32])) |
+                 (hp & (&B[62:58]) & (|B[57:48]));
+
+   // Test if A (B) is +0 or -0 when viewed as a floating point number,
+   // i.e. the 63 least significant bits are zero.
+   assign Azero = (A[62:0] == 63'h0);
+   assign Bzero = (B[62:0] == 63'h0);
+
+endmodule // exception_cmp_1
--- a/wally-pipelined/src/fpu/fpucmp2.sv
+++ b/wally-pipelined/src/fpu/fpucmp2.sv
@ -0,0 +1,226 @@
+//
+// File name : fpcomp.v
+// Title     : Floating-Point Comparator
+// project   : FPU
+// Library   : fpcomp
+// Author(s) : James E. Stine
+// Purpose   : definition of main unit to floating-point comparator
+// notes :   
+//
+// Copyright Oklahoma State University
+//
+// Floating Point Comparator (Algorithm)
+//
+// 1.) Performs sign-extension if the inputs are 32-bit integers.
+// 2.) Perform a magnitude comparison on the lower 63 bits of the inputs
+// 3.) Check for special cases (+0=-0, unordered, and infinite values) 
+//     and correct for sign bits
+//
+// This module takes 64-bits inputs op1 and op2, VSS, and VDD
+// signals, and a 2-bit signal Sel that indicates the type of 
+// operands being compared as indicated below.
+//	Sel	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	(unused)
+//
+// The comparator produces a 2-bit signal FCC, which
+// indicates the result of the comparison:
+//
+//     fcc 	description
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+//
+// It also produces an invalid operation flag, which is one
+// if either of the input operands is a signaling NaN per 754
+
+// fpucmp2: second half of the split floating-point comparator.
+// Finishes the magnitude comparison from the partial vectors w and x and
+// combines it with the operand classification to produce the condition
+// codes and the Invalid flag.
+module fpucmp2 (
+   output logic        Invalid,       // Invalid Operation
+   output logic [1:0]  FCC,           // Condition Codes
+   input  logic        ANaN, BNaN,
+   input  logic        Azero, Bzero,
+   input  logic [7:0]  w, x,
+   input  logic [1:0]  Sel,
+   input  logic [63:0] op1,
+   input  logic [63:0] op2
+);
+
+   logic LT;   // less-than result of the comparison tree
+   logic EQ;   // equal result of the comparison tree
+
+   // Remaining levels of the magnitude comparison tree.
+   magcompare64b_2 magcomp2 (.LT(LT), .EQ(EQ), .w(w), .x(x));
+
+   // Final condition codes from the tree result, the sign bits of the
+   // operands, and the NaN / zero special cases.
+   exception_cmp_2 exc2 (
+      .invalid(Invalid),
+      .fcc(FCC),
+      .LT_mag(LT),
+      .EQ_mag(EQ),
+      .ANaN(ANaN),
+      .BNaN(BNaN),
+      .Azero(Azero),
+      .Bzero(Bzero),
+      .Sel(Sel),
+      .A(op1),
+      .B(op2)
+   );
+
+endmodule // fpucmp2
+
+/*module magcompare2b (LT, GT, A, B);
+
+   input logic [1:0] A;
+   input logic [1:0] B;
+   
+   output logic     LT;
+   output logic     GT;
+
+   // Determine if A < B  using a minimized sum-of-products expression
+   assign LT = ~A[1]&B[1] | ~A[1]&~A[0]&B[0] | ~A[0]&B[1]&B[0];
+   // Determine if A > B  using a minimized sum-of-products expression
+   assign GT = A[1]&~B[1] | A[1]&A[0]&~B[0] | A[0]&~B[1]&~B[0];
+
+endmodule*/ // magcompare2b
+
+// 2-bit magnitude comparator
+// This module compares two 2-bit values A and B. LT is '1' if A < B 
+// and GT is '1'if A > B. LT and GT are both '0' if A = B.  However,
+// this version actually incorporates don't cares into the equation to
+// simplify the optimization
+
+// module magcompare2c (LT, GT, A, B);
+
+//    input logic [1:0] A;
+//    input logic [1:0] B;
+   
+//    output logic      LT;
+//    output logic      GT;
+
+//    assign LT = B[1] | (!A[1]&B[0]);
+//    assign GT = A[1] | (!B[1]&A[0]);
+
+// endmodule // magcompare2b
+
+// This module compares two 64-bit values A and B. LT is '1' if A < B 
+// and EQ is '1' if A = B. LT and EQ are both '0' if A > B.
+// This structure was modified so
+// that it only does a strict magnitude comparison, and only
+// returns flags for less than (LT) and equal to (EQ). It uses a tree 
+// of 63 2-bit magnitude comparators, followed by one OR gate.
+//
+// J. E. Stine and M. J. Schulte, "A combined two's complement and
+// floating-point comparator," 2005 IEEE International Symposium on
+// Circuits and Systems, Kobe, 2005, pp. 89-92 Vol. 1. 
+// doi: 10.1109/ISCAS.2005.1464531
+
+// magcompare64b_2: final three levels of the comparison tree.
+// Reduces the 8-bit partial vectors w and x to single LT / GT flags and
+// derives EQ as "neither less-than nor greater-than".
+module magcompare64b_2 (
+   output logic       LT,
+   output logic       EQ,
+   input  logic [7:0] w,
+   input  logic [7:0] x
+);
+
+   logic [3:0] y, z;   // results after merging pairs of (x, w)
+   logic [1:0] a, b;   // results after merging pairs of (z, y)
+   logic       GT;
+
+   genvar k;
+   generate
+      // 8 -> 4
+      for (k = 0; k < 4; k = k + 1) begin : level4
+         magcompare2c mag (.LT(y[k]), .GT(z[k]), .A(x[2*k +: 2]), .B(w[2*k +: 2]));
+      end
+
+      // 4 -> 2
+      for (k = 0; k < 2; k = k + 1) begin : level5
+         magcompare2c mag (.LT(a[k]), .GT(b[k]), .A(z[2*k +: 2]), .B(y[2*k +: 2]));
+      end
+   endgenerate
+
+   // 2 -> 1
+   magcompare2c root (.LT(LT), .GT(GT), .A(b), .B(a));
+
+   assign EQ = ~LT & ~GT;
+
+endmodule // magcompare64b_2
+
+// This module takes 64-bits inputs A and B, two magnitude comparison
+// flags LT_mag and EQ_mag, and a 2-bit signal Sel that indicates the type of 
+// operands being compared as indicated below.
+//	Sel	Description
+//	 00	double precision numbers
+//	 01	single precision numbers
+//	 10	half precision numbers
+//	 11	bfloat precision numbers
+//
+// The comparator produces a 2-bit signal fcc, which
+// indicates the result of the comparison as follows:
+//     fcc 	decscription
+//      00	A = B	
+//      01	A < B	
+//      10	A > B	
+//      11	A and B	are unordered (i.e., A or B is NaN)
+// It also produces a invalid operation flag, which is one
+// if either of the input operands is a signaling NaN.
+
+// exception_cmp_2: turns the raw comparison-tree result into condition codes.
+// Inputs:
+//   A, B            - the original 64-bit operands (sign bit and quiet bit are read)
+//   LT_mag, EQ_mag  - less-than / equal flags from the comparison tree
+//   ANaN, BNaN      - operand is NaN
+//   Azero, Bzero    - operand is +0 or -0
+//   Sel             - format select: 00 double, 01 single, 10 half
+// Outputs:
+//   fcc             - 00 A = B, 01 A < B, 10 A > B, 11 unordered
+//   invalid         - 1 when either operand is a signaling NaN
+module exception_cmp_2 (invalid, fcc, LT_mag, EQ_mag, ANaN, BNaN, Azero, Bzero, Sel, A, B);
+
+   input logic [63:0] A;
+   input logic [63:0] B;
+   input logic        LT_mag;
+   input logic        EQ_mag;
+   input logic [1:0]  Sel;
+
+   output logic       invalid;
+   output logic [1:0] fcc;
+
+   logic              dp;
+   logic              sp;
+   logic              hp;
+   input logic        Azero;
+   input logic        Bzero;
+   input logic        ANaN;
+   input logic        BNaN;
+   logic              ASNaN;
+   logic              BSNaN;
+   logic              UO;
+   logic              GT;
+   logic              LT;
+   logic              EQ;
+
+   assign dp = !Sel[1]&!Sel[0];
+   assign sp = !Sel[1]&Sel[0];
+   assign hp = Sel[1]&!Sel[0];
+
+   // Values are unordered if (A is NaN) OR (B is NaN).
+   assign UO = (ANaN | BNaN);
+
+   // A NaN is signaling when the most significant fraction bit (the quiet
+   // bit) is 0. With the operand left-aligned in the 64-bit word that bit
+   // is [51] for double, [54] for single and [57] for half. Testing the
+   // next-lower bit instead would flag the canonical quiet NaN
+   // (fraction = 100...0) as signaling and raise Invalid spuriously.
+   assign ASNaN = ANaN & (sp&~A[54] | dp&~A[51] | hp&~A[57]);
+   assign BSNaN = BNaN & (sp&~B[54] | dp&~B[51] | hp&~B[57]);
+
+   // If either A or B is a signaling NaN the "Invalid Operation"
+   // exception flag is set to one; otherwise it is zero.
+   assign invalid = (ASNaN | BSNaN);
+
+   // A and B are equal if the tree reports equality, or if both are zero
+   // (+0 equals -0). They are never equal when unordered.
+   assign EQ = (EQ_mag | (Azero&Bzero)) & (~UO);
+
+   // A is less than B if:
+   //   - both sign bits are set and the tree does NOT report less-than, or
+   //   - the tree reports less-than and the sign bits are not both set.
+   // Also, A is not less than B if A and B are equal or unordered.
+   assign LT = ((~LT_mag & A[63] & B[63]) |
+                (LT_mag & ~(A[63] & B[63])))&~EQ&~UO;
+
+   // A is greater than B when LT, EQ, and UO are all false.
+   assign GT = ~(LT | EQ | UO);
+
+   // Set the bits of fcc based on LT, GT, EQ, and UO.
+   assign fcc[0] = LT | UO;
+   assign fcc[1] = GT | UO;
+
+endmodule // exception_cmp_2
--- a/wally-pipelined/src/fpu/freg.sv
+++ b/wally-pipelined/src/fpu/freg.sv
@ -0,0 +1,514 @@
+
+`include "wally-config.vh"
+
+// freg1adr: floating-point register file with one read port and one
+// write port.
+//   rd / write / writeData - when write is 1, register rd takes writeData
+//                            on the next clock edge of the floprc flops
+//   adr1 / readData        - combinational read of register adr1
+//   reset / clear          - forwarded unchanged to every floprc instance
+//   frm                    - currently unused (no alignment by precision)
+module freg1adr (
+  input  logic [2:0]       frm,
+  input  logic             reset,
+  input  logic             clear,
+  input  logic             clk,
+  input  logic [4:0]       rd,
+  input  logic             write,
+  input  logic [4:0]       adr1,
+  input  logic [`XLEN-1:0] writeData,
+  output logic [`XLEN-1:0] readData);
+
+  //reg number should remain static, but it doesn't hurt
+  //to parameterize
+  parameter numRegs = 32;
+
+  // One XLEN-wide entry per register. The register index is the FIRST
+  // packed dimension so that regInput[i] / regOutput[i] select a whole
+  // XLEN-bit register (with the dimensions swapped, [i] would select a
+  // numRegs-bit slice and the [`XLEN-1:0] part-select would be out of range).
+  logic [numRegs-1:0] [`XLEN-1:0] regInput;
+  logic [numRegs-1:0] [`XLEN-1:0] regOutput;
+
+  // Generate the registers themselves: exactly one flop bank per index.
+  // The instance carries a plain name; "freg[i]" would declare an instance
+  // array whose size is i rather than a single instance.
+  genvar i;
+  generate
+    for (i = 0; i < numRegs; i = i + 1) begin:register
+      floprc #(`XLEN) freg (.clk(clk), .reset(reset), .clear(clear), .d(regInput[i]), .q(regOutput[i]));
+    end
+  endgenerate
+
+  // Read port: outputs zeros unless adr1 matches a register index
+  // (this keeps the zero default of the original case statement, including
+  // when adr1 contains X/Z bits).
+  always_comb begin
+    readData = '0;
+    for (int r = 0; r < numRegs; r++)
+      if (adr1 == r) readData = regOutput[r];
+  end
+
+  // Write port: every register holds its current value by default, so
+  // regInput is assigned on all paths and no latch is inferred. Only the
+  // register addressed by rd is replaced, and only while write is asserted.
+  // The flops capture regInput on the clock edge, so this is not an
+  // asynchronous write.
+  always_comb begin
+    regInput = regOutput;
+    if (write) begin
+      for (int r = 0; r < numRegs; r++)
+        if (rd == r) regInput[r] = writeData;
+    end
+  end
+
+endmodule
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//********
+//formatting separation
+//********
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+module freg2adr (
+  input  logic [2:0]       frm,
+  input  logic             reset,
+  input  logic             clear,
+  input  logic             clk,
+  input  logic [4:0]       rd,
+  input  logic             write,
+  input  logic [4:0]       adr1,
+  input  logic [4:0]       adr2,
+  input  logic [`XLEN-1:0] writeData,
+  output logic [`XLEN-1:0] readData1,
+  output logic [`XLEN-1:0] readData2);
+
+  //note - not word aligning based on precision of 
+  //operation (frm)
+
+  //reg number should remain static, but it doesn't hurt
+  //to parameterize
+  parameter numRegs = 32;
+
+  //intermediary signals - useful for debugging
+  //and easy instatiation of generated modules
+  logic [`XLEN-1:0] [numRegs-1:0] regInput;
+  logic [`XLEN-1:0] [numRegs-1:0] regOutput;
+
+  //generate fp registers themselves
+  genvar i;
+  generate
+  	for (i = 0; i < numRegs; i = i + 1) begin:register
+
+  		floprc #(`XLEN) freg[i](.clk(clk), .reset(reset), .clear(clear), .d(regInput[i][`XLEN-1:0]), .q(regOutput[i][`XLEN-1:0])); 
+	end
+
+  endgenerate
+
+  // Read-port address decoders.
+  // Two read ports are provided by this fp register set
+  // (used with fpadd/cvt, fpdiv/sqrt, and fpcmp).
+  // A port outputs zeroes whenever its address matches none of the 32
+  // entries, e.g. when the address carries X/Z bits in simulation.
+  always_comb begin
+    // begin with the "no match" value for both ports ...
+    readData1 = `XLEN'h0;
+    readData2 = `XLEN'h0;
+
+    // ... then let the single matching entry (if any) drive each port
+    for (int r = 0; r < 32; r = r + 1) begin
+      if (adr1 == r) readData1 = regOutput[r];  // address 1 decoder
+      if (adr2 == r) readData2 = regOutput[r];  // address 2 decoder
+    end
+  end
+
+  // Destination register decoder.
+  //
+  // Every register input defaults to that register's current output, so a
+  // register that is not being written simply reloads its own value. When
+  // write is asserted, the entry selected by rd takes writeData instead.
+  // Indexing by rd replaces a hand-written case table whose labels did not
+  // line up with the register numbers (duplicate labels, rd=9 steering
+  // writeData to register 10, and so on).
+  //
+  // Assigning all of regInput on every evaluation also keeps this block
+  // purely combinational (no inferred latches).
+  //
+  // note - regInput is an intermediary signal feeding the d ports of the
+  // register flops; presumably the flops only capture it on the clock edge,
+  // so this is not an asynchronous write.
+  always_comb begin
+    regInput = regOutput;
+    if (write) begin
+      regInput[rd] = writeData;
+    end
+  end
+
+endmodule
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+//********
+//formatting separation
+//********
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// freg3adr: floating-point register set with three read ports and one
+// write port (the three-address variant, used exclusively for fma).
+//
+//   frm                  - rounding-mode input; not used inside this module
+//   reset, clear, clk    - passed straight through to every register flop
+//   rd, write, writeData - write port: when write is high, entry rd takes
+//                          writeData
+//   adr1..adr3           - read addresses
+//   readData1..readData3 - contents of the addressed entries; zero when the
+//                          address matches no entry (e.g. X/Z in simulation)
+module freg3adr (
+  input  logic [2:0]       frm,
+  input  logic             reset,
+  input  logic             clear,
+  input  logic             clk,
+  input  logic [4:0]       rd,
+  input  logic             write,
+  input  logic [4:0]       adr1,
+  input  logic [4:0]       adr2,
+  input  logic [4:0]       adr3,
+  input  logic [`XLEN-1:0] writeData,
+  output logic [`XLEN-1:0] readData1,
+  output logic [`XLEN-1:0] readData2,
+  output logic [`XLEN-1:0] readData3);
+
+  //note - not word aligning based on precision of
+  //operation (frm)
+
+  //reg number should remain static, but it doesn't hurt
+  //to parameterize
+  parameter numRegs = 32;
+
+  //intermediary signals - useful for debugging
+  //and easy instantiation of generated modules
+  //(outer dimension = register index, inner dimension = bit index)
+  logic [numRegs-1:0] [`XLEN-1:0] regInput;
+  logic [numRegs-1:0] [`XLEN-1:0] regOutput;
+
+  //generate fp registers themselves, one floprc per entry
+  genvar i;
+  generate
+    for (i = 0; i < numRegs; i = i + 1) begin:register
+      floprc #(`XLEN) freg(.clk(clk), .reset(reset), .clear(clear),
+                           .d(regInput[i][`XLEN-1:0]),
+                           .q(regOutput[i][`XLEN-1:0]));
+    end
+  endgenerate
+
+  //address decoders
+  //3 are used for this fp register set
+  //used exclusively for fma
+  //defaults to outputting zeroes
+  always_comb begin
+    readData1 = `XLEN'h0;
+    readData2 = `XLEN'h0;
+    readData3 = `XLEN'h0;
+
+    // at most one entry matches each address; a match overrides the zero
+    // default, an address with unknown bits matches nothing
+    for (int r = 0; r < numRegs; r = r + 1) begin
+      if (adr1 == r) readData1 = regOutput[r];  //address 1 decoder
+      if (adr2 == r) readData2 = regOutput[r];  //address 2 decoder
+      if (adr3 == r) readData3 = regOutput[r];  //address 3 decoder
+    end
+  end
+
+  //destination register decoder
+  //
+  //every register input defaults to the register's own output, so entries
+  //that are not being written reload their current value; on a write the
+  //entry selected by rd takes writeData instead. Driving all of regInput on
+  //every evaluation keeps this block purely combinational (no latches).
+  //
+  //note - this is an intermediary signal, so
+  //this is not asynch assignment. FF in floprc
+  //presumably will not update data until clk pulse
+  always_comb begin
+    regInput = regOutput;
+    if (write) begin
+      regInput[rd] = writeData;
+    end
+  end
+
+endmodule
--- a/wally-pipelined/src/fpu/fsgn.sv
+++ b/wally-pipelined/src/fpu/fsgn.sv
@ -0,0 +1,31 @@
+//performs the fsgnj/fsgnjn/fsgnjx RISCV instructions
+
+// fpusgn: sign-injection unit (fsgnj / fsgnjn / fsgnjx) for 64-bit operands.
+//
+//   SgnOpCodeE - selects the operation (see table below)
+//   SgnOp1E    - supplies bits [62:0] of the result unchanged
+//   SgnOp2E    - supplies the sign used to build bit 63 of the result
+//   SgnResultE - SgnOp1E with its sign bit replaced
+//   SgnFlagsE  - exception flags; always zero for these operations
+module fpusgn (SgnOpCodeE, SgnResultE, SgnFlagsE, SgnOp1E, SgnOp2E);
+
+	input  [63:0]  SgnOp1E, SgnOp2E;
+	input  [1:0]   SgnOpCodeE;
+	output [63:0]  SgnResultE;
+	output [4:0]   SgnFlagsE;
+
+	//op code designation:
+	//
+	//00 - fsgnj  - directly copy over sign value of SgnOp2E
+	//01 - fsgnjn - negate sign value of SgnOp2E
+	//10 - fsgnjx - XOR sign values of SgnOp1E & SgnOp2E
+	//
+	//(op code 11 is not listed above; it takes the same path as 10)
+
+	//bit 0 of the op code conditionally inverts the copied sign, bit 1
+	//switches to the XOR form
+	assign SgnResultE[63] = SgnOpCodeE[1] ? (SgnOp1E[63] ^ SgnOp2E[63])
+	                                      : (SgnOp2E[63] ^ SgnOpCodeE[0]);
+
+	//everything below the sign bit passes through from operand 1 untouched
+	assign SgnResultE[62:0] = SgnOp1E[62:0];
+
+	//sign injection is a pure bit manipulation: the RISC-V spec states that
+	//these instructions set no floating-point exception flags (and do not
+	//canonicalize NaNs), so no flag is raised even when SgnOp1E is NaN/Inf
+	assign SgnFlagsE = 5'b00000;
+
+endmodule
--- a/wally-pipelined/src/fpu/fsm.sv
+++ b/wally-pipelined/src/fpu/fsm.sv
@ -0,0 +1,459 @@
+// fsm: control sequencer that steps a datapath through a fixed series of
+// register-load / mux-select settings and raises done when finished.
+//
+// From the idle state S0, start launches one of two sequences chosen by
+// op_type:
+//   op_type = 0 : S1 -> S2 -> S3 -> S4 -> S5 -> S6 -> S8 -> S9 -> S10 -> S0
+//   op_type = 1 : S13 -> S14 -> ... -> S26 -> S0            (the sqrt path)
+// done is high only in S10 and S26. start and op_type are examined only in
+// S0. The error input is not used by this module.
+//
+// Reset is synchronous and returns the machine to S0.
+module fsm (done, load_rega, load_regb, load_regc, 
+	    load_regd, load_regr, load_regs,
+	    sel_muxa, sel_muxb, sel_muxr, 
+	    clk, reset, start, error, op_type);
+
+   input 	clk;
+   input 	reset;
+   input 	start;
+   input 	error;
+   input  	op_type;
+   
+   output       done;      
+   output       load_rega;
+   output       load_regb;
+   output       load_regc;
+   output 	load_regd;   
+   output 	load_regr;
+   output 	load_regs;
+   
+   output [2:0] sel_muxa;
+   output [2:0] sel_muxb;
+   output 	sel_muxr;
+
+   reg 		done;      // End of cycles
+   reg 		load_rega; // enable for regA
+   reg 		load_regb; // enable for regB
+   reg 		load_regc; // enable for regC
+   reg 		load_regd; // enable for regD
+   reg 		load_regr; // enable for rem
+   reg 		load_regs; // enable for q,qm,qp   
+   reg [2:0] 	sel_muxa;  // Select muxA
+   reg [2:0] 	sel_muxb;  // Select muxB
+   reg 		sel_muxr;  // Select rem mux
+
+   reg [4:0] 	CURRENT_STATE;
+   reg [4:0] 	NEXT_STATE;   
+
+   // State encodings. Encodings with no case item below (S27-S30 and the
+   // unnamed values) fall into the default branch and return to S0.
+   parameter [4:0] 
+     S0=5'd0, S1=5'd1, S2=5'd2,
+     S3=5'd3, S4=5'd4, S5=5'd5,
+     S6=5'd6, S7=5'd7, S8=5'd8,
+     S9=5'd9, S10=5'd10,
+     S13=5'd13, S14=5'd14, S15=5'd15,     
+     S16=5'd16, S17=5'd17, S18=5'd18,
+     S19=5'd19, S20=5'd20, S21=5'd21,
+     S22=5'd22, S23=5'd23, S24=5'd24,
+     S25=5'd25, S26=5'd26, S27=5'd27,
+     S28=5'd28, S29=5'd29, S30=5'd30;
+   
+   // State register with synchronous reset.
+   always @(posedge clk)
+     begin
+	if(reset==1'b1)
+	  CURRENT_STATE<=S0;
+	else
+	  CURRENT_STATE<=NEXT_STATE;
+     end
+
+   // Next-state and output logic (purely combinational).
+   //
+   // Every output and NEXT_STATE first receives its idle value; each state
+   // then overrides only what differs. This guarantees an assignment on
+   // every path (no latches, including when start/op_type are unknown in
+   // S0), and only blocking assignments are used, as is appropriate for
+   // combinational logic.
+   always @(*)
+     begin
+	done      = 1'b0;
+	load_rega = 1'b0;
+	load_regb = 1'b0;
+	load_regc = 1'b0;
+	load_regd = 1'b0;
+	load_regr = 1'b0;
+	load_regs = 1'b0;
+	sel_muxa  = 3'b000;
+	sel_muxb  = 3'b000;
+	sel_muxr  = 1'b0;
+	NEXT_STATE = S0;
+
+	case(CURRENT_STATE)
+	  S0:  // iteration 0 (idle until start)
+	    begin
+	       if (start==1'b1 && op_type==1'b0)
+		 begin
+		    load_regb = 1'b1;
+		    sel_muxa = 3'b001;
+		    sel_muxb = 3'b001;
+		    NEXT_STATE = S1;
+		 end
+	       else if (start==1'b1 && op_type==1'b1)
+		 begin
+		    load_regb = 1'b1;
+		    sel_muxa = 3'b010;
+		    NEXT_STATE = S13;
+		 end
+	       // otherwise: stay idle in S0 with all outputs at their defaults
+	    end
+	  S1:
+	    begin
+	       load_rega = 1'b1;
+	       load_regc = 1'b1;
+	       sel_muxa = 3'b010;
+	       NEXT_STATE = S2;
+	    end
+	  S2: // iteration 1
+	    begin
+	       load_regb = 1'b1;
+	       sel_muxa = 3'b011;
+	       sel_muxb = 3'b011;
+	       NEXT_STATE = S3;
+	    end
+	  S3:
+	    begin
+	       load_rega = 1'b1;
+	       load_regc = 1'b1;
+	       sel_muxb = 3'b010;
+	       NEXT_STATE = S4;
+	    end
+	  S4: // iteration 2
+	    begin
+	       load_regb = 1'b1;
+	       sel_muxa = 3'b011;
+	       sel_muxb = 3'b011;
+	       NEXT_STATE = S5;
+	    end
+	  S5:
+	    begin
+	       load_rega = 1'b1;
+	       load_regc = 1'b1;
+	       sel_muxb = 3'b010;
+	       NEXT_STATE = S6;
+	    end
+	  S6: // iteration 3
+	    begin
+	       load_regb = 1'b1;
+	       sel_muxa = 3'b011;
+	       sel_muxb = 3'b011;
+	       // NOTE(review): advances straight to S8; no state in this
+	       // machine selects S7 as its successor, so S7 is unreachable.
+	       // Confirm whether skipping S7 is intentional.
+	       NEXT_STATE = S8;
+	    end
+	  S7:
+	    begin
+	       load_rega = 1'b1;
+	       load_regc = 1'b1;
+	       sel_muxb = 3'b010;
+	       NEXT_STATE = S8;
+	    end
+	  S8: // q,qm,qp
+	    begin
+	       load_regs = 1'b1;
+	       NEXT_STATE = S9;
+	    end
+	  S9:  // rem
+	    begin
+	       load_regr = 1'b1;
+	       sel_muxr = 1'b1;
+	       NEXT_STATE = S10;
+	    end
+	  S10:  // done
+	    begin
+	       done = 1'b1;
+	       NEXT_STATE = S0;
+	    end
+	  S13:  // start of sqrt path
+	    begin
+	       load_regd = 1'b1;
+	       sel_muxa = 3'b010;
+	       sel_muxb = 3'b001;
+	       NEXT_STATE = S14;
+	    end
+	  S14:
+	    begin
+	       load_rega = 1'b1;
+	       load_regc = 1'b1;
+	       sel_muxa = 3'b001;
+	       sel_muxb = 3'b100;
+	       NEXT_STATE = S15;
+	    end
+	  S15:  // iteration 1
+	    begin
+	       load_regb = 1'b1;
+	       sel_muxa = 3'b011;
+	       sel_muxb = 3'b011;
+	       NEXT_STATE = S16;
+	    end
+	  S16:
+	    begin
+	       load_regd = 1'b1;
+	       sel_muxb = 3'b011;
+	       NEXT_STATE = S17;
+	    end
+	  S17:
+	    begin
+	       load_rega = 1'b1;
+	       load_regc = 1'b1;
+	       sel_muxa = 3'b100;
+	       sel_muxb = 3'b010;
+	       NEXT_STATE = S18;
+	    end
+	  S18:  // iteration 2
+	    begin
+	       load_regb = 1'b1;
+	       sel_muxa = 3'b011;
+	       sel_muxb = 3'b011;
+	       NEXT_STATE = S19;
+	    end
+	  S19:
+	    begin
+	       load_regd = 1'b1;
+	       sel_muxb = 3'b011;
+	       NEXT_STATE = S20;
+	    end
+	  S20:
+	    begin
+	       load_rega = 1'b1;
+	       load_regc = 1'b1;
+	       sel_muxa = 3'b100;
+	       sel_muxb = 3'b010;
+	       NEXT_STATE = S21;
+	    end
+	  S21:  // iteration 3
+	    begin
+	       load_regb = 1'b1;
+	       sel_muxa = 3'b011;
+	       sel_muxb = 3'b011;
+	       NEXT_STATE = S22;
+	    end
+	  S22:
+	    begin
+	       load_regd = 1'b1;
+	       sel_muxb = 3'b011;
+	       NEXT_STATE = S23;
+	    end
+	  S23:
+	    begin
+	       load_rega = 1'b1;
+	       load_regc = 1'b1;
+	       sel_muxa = 3'b100;
+	       sel_muxb = 3'b010;
+	       NEXT_STATE = S24;
+	    end
+	  S24: // q,qm,qp
+	    begin
+	       load_regs = 1'b1;
+	       NEXT_STATE = S25;
+	    end
+	  S25:  // rem
+	    begin
+	       load_regr = 1'b1;
+	       sel_muxa = 3'b011;
+	       sel_muxb = 3'b110;
+	       sel_muxr = 1'b1;
+	       NEXT_STATE = S26;
+	    end
+	  S26:  // done
+	    begin
+	       done = 1'b1;
+	       NEXT_STATE = S0;
+	    end
+	  default:
+	    begin
+	       // unused encodings: outputs idle, return to S0 (the defaults)
+	       NEXT_STATE = S0;
+	    end
+	endcase // case(CURRENT_STATE)
+     end // always @ (*)
+
+endmodule // fsm
--- a/wally-pipelined/src/fpu/ldf128.sv
+++ b/wally-pipelined/src/fpu/ldf128.sv
@ -0,0 +1,543 @@
+// Ladner-Fischer Prefix Adder
+
+module ldf128 (cout, sum, a, b, cin);
+   // 128-bit prefix adder front/back end: forms per-bit propagate/generate,
+   // hands them to ladner_fischer128, and uses its outputs as per-bit carries.
+   output         cout;   // carry out of bit 127
+   output [127:0] sum;
+   input  [127:0] a, b;   // addends
+   input          cin;    // carry in
+
+   // Bitwise propagate/generate; carry_in[i] is used as the carry into bit i.
+   wire [127:0] prop = a ^ b;
+   wire [127:0] gen  = a & b;
+   wire [127:0] carry_in;
+
+   // cin enters the tree as generate bit 0 (with propagate 0), so the
+   // operand bits sit one position higher on the tree's p/g inputs.
+   ladner_fischer128 prefix_tree (.c(carry_in),
+                                  .p({prop[126:0], 1'b0}),
+                                  .g({gen[126:0], cin}));
+
+   assign sum  = prop ^ carry_in;
+   assign cout = gen[127] | (prop[127] & carry_in[127]);
+
+endmodule
+
+module ladner_fischer128 (c, p, g);
+   // Carry network instantiated by ldf128, which puts its carry-in in g[0]; p[0] is never read here.
+   input [127:0] p;
+   input [127:0] g;
+   // c[k+1] is driven from G_k_0 (c[1] straight from g[0]); see the final stage.
+   output [128:1] c;
+   // grey (G out, 2-bit g in, 1-bit p in) and black (G, P out, 2-bit g/p in) cells are defined elsewhere; port roles inferred from use.
+   // parallel-prefix, Ladner-Fischer. G_hi_lo / P_hi_lo are named after the bit range hi..lo they are built from.
+   // NOTE(review): every G_*/P_* net is an implicitly declared 1-bit wire; this will not elaborate under `default_nettype none.
+   // Stage 1: combine adjacent bits into 2-bit groups (the lowest pair uses a grey cell, although its instance is named b_1_0)
+   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
+   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
+   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
+   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
+   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
+   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
+   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
+   black b_15_14 (G_15_14, P_15_14, {g[15],g[14]}, {p[15],p[14]});
+
+   black b_17_16 (G_17_16, P_17_16, {g[17],g[16]}, {p[17],p[16]});
+   black b_19_18 (G_19_18, P_19_18, {g[19],g[18]}, {p[19],p[18]});
+   black b_21_20 (G_21_20, P_21_20, {g[21],g[20]}, {p[21],p[20]});
+   black b_23_22 (G_23_22, P_23_22, {g[23],g[22]}, {p[23],p[22]});
+   black b_25_24 (G_25_24, P_25_24, {g[25],g[24]}, {p[25],p[24]});
+   black b_27_26 (G_27_26, P_27_26, {g[27],g[26]}, {p[27],p[26]});
+   black b_29_28 (G_29_28, P_29_28, {g[29],g[28]}, {p[29],p[28]});
+   black b_31_30 (G_31_30, P_31_30, {g[31],g[30]}, {p[31],p[30]});
+
+   black b_33_32 (G_33_32, P_33_32, {g[33],g[32]}, {p[33],p[32]});
+   black b_35_34 (G_35_34, P_35_34, {g[35],g[34]}, {p[35],p[34]});
+   black b_37_36 (G_37_36, P_37_36, {g[37],g[36]}, {p[37],p[36]});
+   black b_39_38 (G_39_38, P_39_38, {g[39],g[38]}, {p[39],p[38]});
+   black b_41_40 (G_41_40, P_41_40, {g[41],g[40]}, {p[41],p[40]});
+   black b_43_42 (G_43_42, P_43_42, {g[43],g[42]}, {p[43],p[42]});
+   black b_45_44 (G_45_44, P_45_44, {g[45],g[44]}, {p[45],p[44]});
+   black b_47_46 (G_47_46, P_47_46, {g[47],g[46]}, {p[47],p[46]});
+
+   black b_49_48 (G_49_48, P_49_48, {g[49],g[48]}, {p[49],p[48]});
+   black b_51_50 (G_51_50, P_51_50, {g[51],g[50]}, {p[51],p[50]});
+   black b_53_52 (G_53_52, P_53_52, {g[53],g[52]}, {p[53],p[52]});
+   black b_55_54 (G_55_54, P_55_54, {g[55],g[54]}, {p[55],p[54]});
+   black b_57_56 (G_57_56, P_57_56, {g[57],g[56]}, {p[57],p[56]});
+   black b_59_58 (G_59_58, P_59_58, {g[59],g[58]}, {p[59],p[58]});
+   black b_61_60 (G_61_60, P_61_60, {g[61],g[60]}, {p[61],p[60]});
+   black b_63_62 (G_63_62, P_63_62, {g[63],g[62]}, {p[63],p[62]});
+
+   black b_65_64 (G_65_64, P_65_64, {g[65],g[64]}, {p[65],p[64]});
+   black b_67_66 (G_67_66, P_67_66, {g[67],g[66]}, {p[67],p[66]});
+   black b_69_68 (G_69_68, P_69_68, {g[69],g[68]}, {p[69],p[68]});
+   black b_71_70 (G_71_70, P_71_70, {g[71],g[70]}, {p[71],p[70]});
+   black b_73_72 (G_73_72, P_73_72, {g[73],g[72]}, {p[73],p[72]});
+   black b_75_74 (G_75_74, P_75_74, {g[75],g[74]}, {p[75],p[74]});
+   black b_77_76 (G_77_76, P_77_76, {g[77],g[76]}, {p[77],p[76]});
+   black b_79_78 (G_79_78, P_79_78, {g[79],g[78]}, {p[79],p[78]});
+
+   black b_81_80 (G_81_80, P_81_80, {g[81],g[80]}, {p[81],p[80]});
+   black b_83_82 (G_83_82, P_83_82, {g[83],g[82]}, {p[83],p[82]});
+   black b_85_84 (G_85_84, P_85_84, {g[85],g[84]}, {p[85],p[84]});
+   black b_87_86 (G_87_86, P_87_86, {g[87],g[86]}, {p[87],p[86]});
+   black b_89_88 (G_89_88, P_89_88, {g[89],g[88]}, {p[89],p[88]});
+   black b_91_90 (G_91_90, P_91_90, {g[91],g[90]}, {p[91],p[90]});
+   black b_93_92 (G_93_92, P_93_92, {g[93],g[92]}, {p[93],p[92]});
+   black b_95_94 (G_95_94, P_95_94, {g[95],g[94]}, {p[95],p[94]});
+
+   black b_97_96 (G_97_96, P_97_96, {g[97],g[96]}, {p[97],p[96]});
+   black b_99_98 (G_99_98, P_99_98, {g[99],g[98]}, {p[99],p[98]});
+   black b_101_100 (G_101_100, P_101_100, {g[101],g[100]}, {p[101],p[100]});
+   black b_103_102 (G_103_102, P_103_102, {g[103],g[102]}, {p[103],p[102]});
+   black b_105_104 (G_105_104, P_105_104, {g[105],g[104]}, {p[105],p[104]});
+   black b_107_106 (G_107_106, P_107_106, {g[107],g[106]}, {p[107],p[106]});
+   black b_109_108 (G_109_108, P_109_108, {g[109],g[108]}, {p[109],p[108]});
+   black b_111_110 (G_111_110, P_111_110, {g[111],g[110]}, {p[111],p[110]});
+
+   black b_113_112 (G_113_112, P_113_112, {g[113],g[112]}, {p[113],p[112]});
+   black b_115_114 (G_115_114, P_115_114, {g[115],g[114]}, {p[115],p[114]});
+   black b_117_116 (G_117_116, P_117_116, {g[117],g[116]}, {p[117],p[116]});
+   black b_119_118 (G_119_118, P_119_118, {g[119],g[118]}, {p[119],p[118]});
+   black b_121_120 (G_121_120, P_121_120, {g[121],g[120]}, {p[121],p[120]});
+   black b_123_122 (G_123_122, P_123_122, {g[123],g[122]}, {p[123],p[122]});
+   black b_125_124 (G_125_124, P_125_124, {g[125],g[124]}, {p[125],p[124]});
+   black b_127_126 (G_127_126, P_127_126, {g[127],g[126]}, {p[127],p[126]});
+
+
+   // Stage 2: merge neighbouring 2-bit groups into 4-bit groups
+   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
+   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
+   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
+   black b_15_12 (G_15_12, P_15_12, {G_15_14,G_13_12}, {P_15_14,P_13_12});
+   black b_19_16 (G_19_16, P_19_16, {G_19_18,G_17_16}, {P_19_18,P_17_16});
+   black b_23_20 (G_23_20, P_23_20, {G_23_22,G_21_20}, {P_23_22,P_21_20});
+   black b_27_24 (G_27_24, P_27_24, {G_27_26,G_25_24}, {P_27_26,P_25_24});
+   black b_31_28 (G_31_28, P_31_28, {G_31_30,G_29_28}, {P_31_30,P_29_28});
+
+   black b_35_32 (G_35_32, P_35_32, {G_35_34,G_33_32}, {P_35_34,P_33_32});
+   black b_39_36 (G_39_36, P_39_36, {G_39_38,G_37_36}, {P_39_38,P_37_36});
+   black b_43_40 (G_43_40, P_43_40, {G_43_42,G_41_40}, {P_43_42,P_41_40});
+   black b_47_44 (G_47_44, P_47_44, {G_47_46,G_45_44}, {P_47_46,P_45_44});
+   black b_51_48 (G_51_48, P_51_48, {G_51_50,G_49_48}, {P_51_50,P_49_48});
+   black b_55_52 (G_55_52, P_55_52, {G_55_54,G_53_52}, {P_55_54,P_53_52});
+   black b_59_56 (G_59_56, P_59_56, {G_59_58,G_57_56}, {P_59_58,P_57_56});
+   black b_63_60 (G_63_60, P_63_60, {G_63_62,G_61_60}, {P_63_62,P_61_60});
+
+   black b_67_64 (G_67_64, P_67_64, {G_67_66,G_65_64}, {P_67_66,P_65_64});
+   black b_71_68 (G_71_68, P_71_68, {G_71_70,G_69_68}, {P_71_70,P_69_68});
+   black b_75_72 (G_75_72, P_75_72, {G_75_74,G_73_72}, {P_75_74,P_73_72});
+   black b_79_76 (G_79_76, P_79_76, {G_79_78,G_77_76}, {P_79_78,P_77_76});
+   black b_83_80 (G_83_80, P_83_80, {G_83_82,G_81_80}, {P_83_82,P_81_80});
+   black b_87_84 (G_87_84, P_87_84, {G_87_86,G_85_84}, {P_87_86,P_85_84});
+   black b_91_88 (G_91_88, P_91_88, {G_91_90,G_89_88}, {P_91_90,P_89_88});
+   black b_95_92 (G_95_92, P_95_92, {G_95_94,G_93_92}, {P_95_94,P_93_92});
+
+   black b_99_96 (G_99_96, P_99_96, {G_99_98,G_97_96}, {P_99_98,P_97_96});
+   black b_103_100 (G_103_100, P_103_100, {G_103_102,G_101_100}, {P_103_102,P_101_100});
+   black b_107_104 (G_107_104, P_107_104, {G_107_106,G_105_104}, {P_107_106,P_105_104});
+   black b_111_108 (G_111_108, P_111_108, {G_111_110,G_109_108}, {P_111_110,P_109_108});
+   black b_115_112 (G_115_112, P_115_112, {G_115_114,G_113_112}, {P_115_114,P_113_112});
+   black b_119_116 (G_119_116, P_119_116, {G_119_118,G_117_116}, {P_119_118,P_117_116});
+   black b_123_120 (G_123_120, P_123_120, {G_123_122,G_121_120}, {P_123_122,P_121_120});
+   black b_127_124 (G_127_124, P_127_124, {G_127_126,G_125_124}, {P_127_126,P_125_124});
+
+   // Stage 3: in each 8-bit block, extend the low 4-bit group over the upper half (results span 6 and 8 bits)
+   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
+   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
+   black b_13_8 (G_13_8, P_13_8, {G_13_12,G_11_8}, {P_13_12,P_11_8});
+   black b_15_8 (G_15_8, P_15_8, {G_15_12,G_11_8}, {P_15_12,P_11_8});
+   black b_21_16 (G_21_16, P_21_16, {G_21_20,G_19_16}, {P_21_20,P_19_16});
+   black b_23_16 (G_23_16, P_23_16, {G_23_20,G_19_16}, {P_23_20,P_19_16});
+   black b_29_24 (G_29_24, P_29_24, {G_29_28,G_27_24}, {P_29_28,P_27_24});
+   black b_31_24 (G_31_24, P_31_24, {G_31_28,G_27_24}, {P_31_28,P_27_24});
+
+   black b_37_32 (G_37_32, P_37_32, {G_37_36,G_35_32}, {P_37_36,P_35_32});
+   black b_39_32 (G_39_32, P_39_32, {G_39_36,G_35_32}, {P_39_36,P_35_32});
+   black b_45_40 (G_45_40, P_45_40, {G_45_44,G_43_40}, {P_45_44,P_43_40});
+   black b_47_40 (G_47_40, P_47_40, {G_47_44,G_43_40}, {P_47_44,P_43_40});
+   black b_53_48 (G_53_48, P_53_48, {G_53_52,G_51_48}, {P_53_52,P_51_48});
+   black b_55_48 (G_55_48, P_55_48, {G_55_52,G_51_48}, {P_55_52,P_51_48});
+   black b_61_56 (G_61_56, P_61_56, {G_61_60,G_59_56}, {P_61_60,P_59_56});
+   black b_63_56 (G_63_56, P_63_56, {G_63_60,G_59_56}, {P_63_60,P_59_56});
+
+   black b_69_64 (G_69_64, P_69_64, {G_69_68,G_67_64}, {P_69_68,P_67_64});
+   black b_71_64 (G_71_64, P_71_64, {G_71_68,G_67_64}, {P_71_68,P_67_64});
+   black b_77_72 (G_77_72, P_77_72, {G_77_76,G_75_72}, {P_77_76,P_75_72});
+   black b_79_72 (G_79_72, P_79_72, {G_79_76,G_75_72}, {P_79_76,P_75_72});
+   black b_85_80 (G_85_80, P_85_80, {G_85_84,G_83_80}, {P_85_84,P_83_80});
+   black b_87_80 (G_87_80, P_87_80, {G_87_84,G_83_80}, {P_87_84,P_83_80});
+   black b_93_88 (G_93_88, P_93_88, {G_93_92,G_91_88}, {P_93_92,P_91_88});
+   black b_95_88 (G_95_88, P_95_88, {G_95_92,G_91_88}, {P_95_92,P_91_88});
+
+   black b_101_96 (G_101_96, P_101_96, {G_101_100,G_99_96}, {P_101_100,P_99_96});
+   black b_103_96 (G_103_96, P_103_96, {G_103_100,G_99_96}, {P_103_100,P_99_96});
+   black b_109_104 (G_109_104, P_109_104, {G_109_108,G_107_104}, {P_109_108,P_107_104});
+   black b_111_104 (G_111_104, P_111_104, {G_111_108,G_107_104}, {P_111_108,P_107_104});
+   black b_117_112 (G_117_112, P_117_112, {G_117_116,G_115_112}, {P_117_116,P_115_112});
+   black b_119_112 (G_119_112, P_119_112, {G_119_116,G_115_112}, {P_119_116,P_115_112});
+   black b_125_120 (G_125_120, P_125_120, {G_125_124,G_123_120}, {P_125_124,P_123_120});
+   black b_127_120 (G_127_120, P_127_120, {G_127_124,G_123_120}, {P_127_124,P_123_120});
+
+   // Stage 4: in each 16-bit block, extend the low 8-bit group over the upper half (results span 10..16 bits)
+   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
+   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
+   grey g_13_0 (G_13_0, {G_13_8,G_7_0}, P_13_8);
+   grey g_15_0 (G_15_0, {G_15_8,G_7_0}, P_15_8);
+   black b_25_16 (G_25_16, P_25_16, {G_25_24,G_23_16}, {P_25_24,P_23_16});
+   black b_27_16 (G_27_16, P_27_16, {G_27_24,G_23_16}, {P_27_24,P_23_16});
+   black b_29_16 (G_29_16, P_29_16, {G_29_24,G_23_16}, {P_29_24,P_23_16});
+   black b_31_16 (G_31_16, P_31_16, {G_31_24,G_23_16}, {P_31_24,P_23_16});
+
+   black b_41_32 (G_41_32, P_41_32, {G_41_40,G_39_32}, {P_41_40,P_39_32});
+   black b_43_32 (G_43_32, P_43_32, {G_43_40,G_39_32}, {P_43_40,P_39_32});
+   black b_45_32 (G_45_32, P_45_32, {G_45_40,G_39_32}, {P_45_40,P_39_32});
+   black b_47_32 (G_47_32, P_47_32, {G_47_40,G_39_32}, {P_47_40,P_39_32});
+   black b_57_48 (G_57_48, P_57_48, {G_57_56,G_55_48}, {P_57_56,P_55_48});
+   black b_59_48 (G_59_48, P_59_48, {G_59_56,G_55_48}, {P_59_56,P_55_48});
+   black b_61_48 (G_61_48, P_61_48, {G_61_56,G_55_48}, {P_61_56,P_55_48});
+   black b_63_48 (G_63_48, P_63_48, {G_63_56,G_55_48}, {P_63_56,P_55_48});
+
+   black b_73_64 (G_73_64, P_73_64, {G_73_72,G_71_64}, {P_73_72,P_71_64});
+   black b_75_64 (G_75_64, P_75_64, {G_75_72,G_71_64}, {P_75_72,P_71_64});
+   black b_77_64 (G_77_64, P_77_64, {G_77_72,G_71_64}, {P_77_72,P_71_64});
+   black b_79_64 (G_79_64, P_79_64, {G_79_72,G_71_64}, {P_79_72,P_71_64});
+   black b_89_80 (G_89_80, P_89_80, {G_89_88,G_87_80}, {P_89_88,P_87_80});
+   black b_91_80 (G_91_80, P_91_80, {G_91_88,G_87_80}, {P_91_88,P_87_80});
+   black b_93_80 (G_93_80, P_93_80, {G_93_88,G_87_80}, {P_93_88,P_87_80});
+   black b_95_80 (G_95_80, P_95_80, {G_95_88,G_87_80}, {P_95_88,P_87_80});
+
+   black b_105_96 (G_105_96, P_105_96, {G_105_104,G_103_96}, {P_105_104,P_103_96});
+   black b_107_96 (G_107_96, P_107_96, {G_107_104,G_103_96}, {P_107_104,P_103_96});
+   black b_109_96 (G_109_96, P_109_96, {G_109_104,G_103_96}, {P_109_104,P_103_96});
+   black b_111_96 (G_111_96, P_111_96, {G_111_104,G_103_96}, {P_111_104,P_103_96});
+   black b_121_112 (G_121_112, P_121_112, {G_121_120,G_119_112}, {P_121_120,P_119_112});
+   black b_123_112 (G_123_112, P_123_112, {G_123_120,G_119_112}, {P_123_120,P_119_112});
+   black b_125_112 (G_125_112, P_125_112, {G_125_120,G_119_112}, {P_125_120,P_119_112});
+   black b_127_112 (G_127_112, P_127_112, {G_127_120,G_119_112}, {P_127_120,P_119_112});
+
+   // Stage 5: in each 32-bit block, extend the low 16-bit group over the upper half (results span 18..32 bits)
+   grey g_17_0 (G_17_0, {G_17_16,G_15_0}, P_17_16);
+   grey g_19_0 (G_19_0, {G_19_16,G_15_0}, P_19_16);
+   grey g_21_0 (G_21_0, {G_21_16,G_15_0}, P_21_16);
+   grey g_23_0 (G_23_0, {G_23_16,G_15_0}, P_23_16);
+   grey g_25_0 (G_25_0, {G_25_16,G_15_0}, P_25_16);
+   grey g_27_0 (G_27_0, {G_27_16,G_15_0}, P_27_16);
+   grey g_29_0 (G_29_0, {G_29_16,G_15_0}, P_29_16);
+   grey g_31_0 (G_31_0, {G_31_16,G_15_0}, P_31_16);
+
+   black b_49_32 (G_49_32, P_49_32, {G_49_48,G_47_32}, {P_49_48,P_47_32});
+   black b_51_32 (G_51_32, P_51_32, {G_51_48,G_47_32}, {P_51_48,P_47_32});
+   black b_53_32 (G_53_32, P_53_32, {G_53_48,G_47_32}, {P_53_48,P_47_32});
+   black b_55_32 (G_55_32, P_55_32, {G_55_48,G_47_32}, {P_55_48,P_47_32});
+   black b_57_32 (G_57_32, P_57_32, {G_57_48,G_47_32}, {P_57_48,P_47_32});
+   black b_59_32 (G_59_32, P_59_32, {G_59_48,G_47_32}, {P_59_48,P_47_32});
+   black b_61_32 (G_61_32, P_61_32, {G_61_48,G_47_32}, {P_61_48,P_47_32});
+   black b_63_32 (G_63_32, P_63_32, {G_63_48,G_47_32}, {P_63_48,P_47_32});
+
+   black b_81_64 (G_81_64, P_81_64, {G_81_80,G_79_64}, {P_81_80,P_79_64});
+   black b_83_64 (G_83_64, P_83_64, {G_83_80,G_79_64}, {P_83_80,P_79_64});
+   black b_85_64 (G_85_64, P_85_64, {G_85_80,G_79_64}, {P_85_80,P_79_64});
+   black b_87_64 (G_87_64, P_87_64, {G_87_80,G_79_64}, {P_87_80,P_79_64});
+   black b_89_64 (G_89_64, P_89_64, {G_89_80,G_79_64}, {P_89_80,P_79_64});
+   black b_91_64 (G_91_64, P_91_64, {G_91_80,G_79_64}, {P_91_80,P_79_64});
+   black b_93_64 (G_93_64, P_93_64, {G_93_80,G_79_64}, {P_93_80,P_79_64});
+   black b_95_64 (G_95_64, P_95_64, {G_95_80,G_79_64}, {P_95_80,P_79_64});
+
+   black b_113_96 (G_113_96, P_113_96, {G_113_112,G_111_96}, {P_113_112,P_111_96});
+   black b_115_96 (G_115_96, P_115_96, {G_115_112,G_111_96}, {P_115_112,P_111_96});
+   black b_117_96 (G_117_96, P_117_96, {G_117_112,G_111_96}, {P_117_112,P_111_96});
+   black b_119_96 (G_119_96, P_119_96, {G_119_112,G_111_96}, {P_119_112,P_111_96});
+   black b_121_96 (G_121_96, P_121_96, {G_121_112,G_111_96}, {P_121_112,P_111_96});
+   black b_123_96 (G_123_96, P_123_96, {G_123_112,G_111_96}, {P_123_112,P_111_96});
+   black b_125_96 (G_125_96, P_125_96, {G_125_112,G_111_96}, {P_125_112,P_111_96});
+   black b_127_96 (G_127_96, P_127_96, {G_127_112,G_111_96}, {P_127_112,P_111_96});
+
+   // Stage 6: in each 64-bit block, extend the low 32-bit group over the upper half (results span 34..64 bits)
+   grey g_33_0 (G_33_0, {G_33_32,G_31_0}, P_33_32);
+   grey g_35_0 (G_35_0, {G_35_32,G_31_0}, P_35_32);
+   grey g_37_0 (G_37_0, {G_37_32,G_31_0}, P_37_32);
+   grey g_39_0 (G_39_0, {G_39_32,G_31_0}, P_39_32);
+   grey g_41_0 (G_41_0, {G_41_32,G_31_0}, P_41_32);
+   grey g_43_0 (G_43_0, {G_43_32,G_31_0}, P_43_32);
+   grey g_45_0 (G_45_0, {G_45_32,G_31_0}, P_45_32);
+   grey g_47_0 (G_47_0, {G_47_32,G_31_0}, P_47_32);
+
+   grey g_49_0 (G_49_0, {G_49_32,G_31_0}, P_49_32);
+   grey g_51_0 (G_51_0, {G_51_32,G_31_0}, P_51_32);
+   grey g_53_0 (G_53_0, {G_53_32,G_31_0}, P_53_32);
+   grey g_55_0 (G_55_0, {G_55_32,G_31_0}, P_55_32);
+   grey g_57_0 (G_57_0, {G_57_32,G_31_0}, P_57_32);
+   grey g_59_0 (G_59_0, {G_59_32,G_31_0}, P_59_32);
+   grey g_61_0 (G_61_0, {G_61_32,G_31_0}, P_61_32);
+   grey g_63_0 (G_63_0, {G_63_32,G_31_0}, P_63_32);
+
+   black b_97_64 (G_97_64, P_97_64, {G_97_96,G_95_64}, {P_97_96,P_95_64});
+   black b_99_64 (G_99_64, P_99_64, {G_99_96,G_95_64}, {P_99_96,P_95_64});
+   black b_101_64 (G_101_64, P_101_64, {G_101_96,G_95_64}, {P_101_96,P_95_64});
+   black b_103_64 (G_103_64, P_103_64, {G_103_96,G_95_64}, {P_103_96,P_95_64});
+   black b_105_64 (G_105_64, P_105_64, {G_105_96,G_95_64}, {P_105_96,P_95_64});
+   black b_107_64 (G_107_64, P_107_64, {G_107_96,G_95_64}, {P_107_96,P_95_64});
+   black b_109_64 (G_109_64, P_109_64, {G_109_96,G_95_64}, {P_109_96,P_95_64});
+   black b_111_64 (G_111_64, P_111_64, {G_111_96,G_95_64}, {P_111_96,P_95_64});
+
+   black b_113_64 (G_113_64, P_113_64, {G_113_96,G_95_64}, {P_113_96,P_95_64});
+   black b_115_64 (G_115_64, P_115_64, {G_115_96,G_95_64}, {P_115_96,P_95_64});
+   black b_117_64 (G_117_64, P_117_64, {G_117_96,G_95_64}, {P_117_96,P_95_64});
+   black b_119_64 (G_119_64, P_119_64, {G_119_96,G_95_64}, {P_119_96,P_95_64});
+   black b_121_64 (G_121_64, P_121_64, {G_121_96,G_95_64}, {P_121_96,P_95_64});
+   black b_123_64 (G_123_64, P_123_64, {G_123_96,G_95_64}, {P_123_96,P_95_64});
+   black b_125_64 (G_125_64, P_125_64, {G_125_96,G_95_64}, {P_125_96,P_95_64});
+   black b_127_64 (G_127_64, P_127_64, {G_127_96,G_95_64}, {P_127_96,P_95_64});
+
+   // Stage 7: extend G_63_0 over bits 127..64, giving G_k_0 for every odd k above 63
+   grey g_65_0 (G_65_0, {G_65_64,G_63_0}, P_65_64);
+   grey g_67_0 (G_67_0, {G_67_64,G_63_0}, P_67_64);
+   grey g_69_0 (G_69_0, {G_69_64,G_63_0}, P_69_64);
+   grey g_71_0 (G_71_0, {G_71_64,G_63_0}, P_71_64);
+   grey g_73_0 (G_73_0, {G_73_64,G_63_0}, P_73_64);
+   grey g_75_0 (G_75_0, {G_75_64,G_63_0}, P_75_64);
+   grey g_77_0 (G_77_0, {G_77_64,G_63_0}, P_77_64);
+   grey g_79_0 (G_79_0, {G_79_64,G_63_0}, P_79_64);
+
+   grey g_81_0 (G_81_0, {G_81_64,G_63_0}, P_81_64);
+   grey g_83_0 (G_83_0, {G_83_64,G_63_0}, P_83_64);
+   grey g_85_0 (G_85_0, {G_85_64,G_63_0}, P_85_64);
+   grey g_87_0 (G_87_0, {G_87_64,G_63_0}, P_87_64);
+   grey g_89_0 (G_89_0, {G_89_64,G_63_0}, P_89_64);
+   grey g_91_0 (G_91_0, {G_91_64,G_63_0}, P_91_64);
+   grey g_93_0 (G_93_0, {G_93_64,G_63_0}, P_93_64);
+   grey g_95_0 (G_95_0, {G_95_64,G_63_0}, P_95_64);
+
+   grey g_97_0 (G_97_0, {G_97_64,G_63_0}, P_97_64);
+   grey g_99_0 (G_99_0, {G_99_64,G_63_0}, P_99_64);
+   grey g_101_0 (G_101_0, {G_101_64,G_63_0}, P_101_64);
+   grey g_103_0 (G_103_0, {G_103_64,G_63_0}, P_103_64);
+   grey g_105_0 (G_105_0, {G_105_64,G_63_0}, P_105_64);
+   grey g_107_0 (G_107_0, {G_107_64,G_63_0}, P_107_64);
+   grey g_109_0 (G_109_0, {G_109_64,G_63_0}, P_109_64);
+   grey g_111_0 (G_111_0, {G_111_64,G_63_0}, P_111_64);
+
+   grey g_113_0 (G_113_0, {G_113_64,G_63_0}, P_113_64);
+   grey g_115_0 (G_115_0, {G_115_64,G_63_0}, P_115_64);
+   grey g_117_0 (G_117_0, {G_117_64,G_63_0}, P_117_64);
+   grey g_119_0 (G_119_0, {G_119_64,G_63_0}, P_119_64);
+   grey g_121_0 (G_121_0, {G_121_64,G_63_0}, P_121_64);
+   grey g_123_0 (G_123_0, {G_123_64,G_63_0}, P_123_64);
+   grey g_125_0 (G_125_0, {G_125_64,G_63_0}, P_125_64);
+   grey g_127_0 (G_127_0, {G_127_64,G_63_0}, P_127_64);
+
+   // Extra grey cell stage: even positions, G_k_0 from g[k], p[k] and the odd-position result G_(k-1)_0
+   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
+   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
+   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
+   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
+   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
+   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
+   grey g_14_0 (G_14_0, {g[14],G_13_0}, p[14]);
+   grey g_16_0 (G_16_0, {g[16],G_15_0}, p[16]);
+   grey g_18_0 (G_18_0, {g[18],G_17_0}, p[18]);
+   grey g_20_0 (G_20_0, {g[20],G_19_0}, p[20]);
+   grey g_22_0 (G_22_0, {g[22],G_21_0}, p[22]);
+   grey g_24_0 (G_24_0, {g[24],G_23_0}, p[24]);
+   grey g_26_0 (G_26_0, {g[26],G_25_0}, p[26]);
+   grey g_28_0 (G_28_0, {g[28],G_27_0}, p[28]);
+   grey g_30_0 (G_30_0, {g[30],G_29_0}, p[30]);
+   grey g_32_0 (G_32_0, {g[32],G_31_0}, p[32]);
+   grey g_34_0 (G_34_0, {g[34],G_33_0}, p[34]);
+   grey g_36_0 (G_36_0, {g[36],G_35_0}, p[36]);
+   grey g_38_0 (G_38_0, {g[38],G_37_0}, p[38]);
+   grey g_40_0 (G_40_0, {g[40],G_39_0}, p[40]);
+   grey g_42_0 (G_42_0, {g[42],G_41_0}, p[42]);
+   grey g_44_0 (G_44_0, {g[44],G_43_0}, p[44]);
+   grey g_46_0 (G_46_0, {g[46],G_45_0}, p[46]);
+   grey g_48_0 (G_48_0, {g[48],G_47_0}, p[48]);
+   grey g_50_0 (G_50_0, {g[50],G_49_0}, p[50]);
+   grey g_52_0 (G_52_0, {g[52],G_51_0}, p[52]);
+   grey g_54_0 (G_54_0, {g[54],G_53_0}, p[54]);
+   grey g_56_0 (G_56_0, {g[56],G_55_0}, p[56]);
+   grey g_58_0 (G_58_0, {g[58],G_57_0}, p[58]);
+   grey g_60_0 (G_60_0, {g[60],G_59_0}, p[60]);
+   grey g_62_0 (G_62_0, {g[62],G_61_0}, p[62]);
+   grey g_64_0 (G_64_0, {g[64],G_63_0}, p[64]);
+   grey g_66_0 (G_66_0, {g[66],G_65_0}, p[66]);
+   grey g_68_0 (G_68_0, {g[68],G_67_0}, p[68]);
+   grey g_70_0 (G_70_0, {g[70],G_69_0}, p[70]);
+   grey g_72_0 (G_72_0, {g[72],G_71_0}, p[72]);
+   grey g_74_0 (G_74_0, {g[74],G_73_0}, p[74]);
+   grey g_76_0 (G_76_0, {g[76],G_75_0}, p[76]);
+   grey g_78_0 (G_78_0, {g[78],G_77_0}, p[78]);
+   grey g_80_0 (G_80_0, {g[80],G_79_0}, p[80]);
+   grey g_82_0 (G_82_0, {g[82],G_81_0}, p[82]);
+   grey g_84_0 (G_84_0, {g[84],G_83_0}, p[84]);
+   grey g_86_0 (G_86_0, {g[86],G_85_0}, p[86]);
+   grey g_88_0 (G_88_0, {g[88],G_87_0}, p[88]);
+   grey g_90_0 (G_90_0, {g[90],G_89_0}, p[90]);
+   grey g_92_0 (G_92_0, {g[92],G_91_0}, p[92]);
+   grey g_94_0 (G_94_0, {g[94],G_93_0}, p[94]);
+   grey g_96_0 (G_96_0, {g[96],G_95_0}, p[96]);
+   grey g_98_0 (G_98_0, {g[98],G_97_0}, p[98]);
+   grey g_100_0 (G_100_0, {g[100],G_99_0}, p[100]);
+   grey g_102_0 (G_102_0, {g[102],G_101_0}, p[102]);
+   grey g_104_0 (G_104_0, {g[104],G_103_0}, p[104]);
+   grey g_106_0 (G_106_0, {g[106],G_105_0}, p[106]);
+   grey g_108_0 (G_108_0, {g[108],G_107_0}, p[108]);
+   grey g_110_0 (G_110_0, {g[110],G_109_0}, p[110]);
+   grey g_112_0 (G_112_0, {g[112],G_111_0}, p[112]);
+   grey g_114_0 (G_114_0, {g[114],G_113_0}, p[114]);
+   grey g_116_0 (G_116_0, {g[116],G_115_0}, p[116]);
+   grey g_118_0 (G_118_0, {g[118],G_117_0}, p[118]);
+   grey g_120_0 (G_120_0, {g[120],G_119_0}, p[120]);
+   grey g_122_0 (G_122_0, {g[122],G_121_0}, p[122]);
+   grey g_124_0 (G_124_0, {g[124],G_123_0}, p[124]);
+   grey g_126_0 (G_126_0, {g[126],G_125_0}, p[126]);
+
+   // Final Stage: c[k+1] = G_k_0 (c[1] comes straight from g[0])
+   assign c[1]=g[0];
+   assign c[2]=G_1_0;
+   assign c[3]=G_2_0;
+   assign c[4]=G_3_0;
+   assign c[5]=G_4_0;
+   assign c[6]=G_5_0;
+   assign c[7]=G_6_0;
+   assign c[8]=G_7_0;
+   assign c[9]=G_8_0;
+
+   assign c[10]=G_9_0;
+   assign c[11]=G_10_0;
+   assign c[12]=G_11_0;
+   assign c[13]=G_12_0;
+   assign c[14]=G_13_0;
+   assign c[15]=G_14_0;
+   assign c[16]=G_15_0;
+   assign c[17]=G_16_0;
+
+   assign c[18]=G_17_0;
+   assign c[19]=G_18_0;
+   assign c[20]=G_19_0;
+   assign c[21]=G_20_0;
+   assign c[22]=G_21_0;
+   assign c[23]=G_22_0;
+   assign c[24]=G_23_0;
+   assign c[25]=G_24_0;
+
+   assign c[26]=G_25_0;
+   assign c[27]=G_26_0;
+   assign c[28]=G_27_0;
+   assign c[29]=G_28_0;
+   assign c[30]=G_29_0;
+   assign c[31]=G_30_0;
+   assign c[32]=G_31_0;
+   assign c[33]=G_32_0;
+
+   assign c[34]=G_33_0;
+   assign c[35]=G_34_0;
+   assign c[36]=G_35_0;
+   assign c[37]=G_36_0;
+   assign c[38]=G_37_0;
+   assign c[39]=G_38_0;
+   assign c[40]=G_39_0;
+   assign c[41]=G_40_0;
+
+   assign c[42]=G_41_0;
+   assign c[43]=G_42_0;
+   assign c[44]=G_43_0;
+   assign c[45]=G_44_0;
+   assign c[46]=G_45_0;
+   assign c[47]=G_46_0;
+   assign c[48]=G_47_0;
+   assign c[49]=G_48_0;
+
+   assign c[50]=G_49_0;
+   assign c[51]=G_50_0;
+   assign c[52]=G_51_0;
+   assign c[53]=G_52_0;
+   assign c[54]=G_53_0;
+   assign c[55]=G_54_0;
+   assign c[56]=G_55_0;
+   assign c[57]=G_56_0;
+
+   assign c[58]=G_57_0;
+   assign c[59]=G_58_0;
+   assign c[60]=G_59_0;
+   assign c[61]=G_60_0;
+   assign c[62]=G_61_0;
+   assign c[63]=G_62_0;
+   assign c[64]=G_63_0;
+   assign c[65]=G_64_0;
+
+   assign c[66]=G_65_0;
+   assign c[67]=G_66_0;
+   assign c[68]=G_67_0;
+   assign c[69]=G_68_0;
+   assign c[70]=G_69_0;
+   assign c[71]=G_70_0;
+   assign c[72]=G_71_0;
+   assign c[73]=G_72_0;
+
+   assign c[74]=G_73_0;
+   assign c[75]=G_74_0;
+   assign c[76]=G_75_0;
+   assign c[77]=G_76_0;
+   assign c[78]=G_77_0;
+   assign c[79]=G_78_0;
+   assign c[80]=G_79_0;
+   assign c[81]=G_80_0;
+
+   assign c[82]=G_81_0;
+   assign c[83]=G_82_0;
+   assign c[84]=G_83_0;
+   assign c[85]=G_84_0;
+   assign c[86]=G_85_0;
+   assign c[87]=G_86_0;
+   assign c[88]=G_87_0;
+   assign c[89]=G_88_0;
+
+   assign c[90]=G_89_0;
+   assign c[91]=G_90_0;
+   assign c[92]=G_91_0;
+   assign c[93]=G_92_0;
+   assign c[94]=G_93_0;
+   assign c[95]=G_94_0;
+   assign c[96]=G_95_0;
+   assign c[97]=G_96_0;
+
+   assign c[98]=G_97_0;
+   assign c[99]=G_98_0;
+   assign c[100]=G_99_0;
+   assign c[101]=G_100_0;
+   assign c[102]=G_101_0;
+   assign c[103]=G_102_0;
+   assign c[104]=G_103_0;
+   assign c[105]=G_104_0;
+
+   assign c[106]=G_105_0;
+   assign c[107]=G_106_0;
+   assign c[108]=G_107_0;
+   assign c[109]=G_108_0;
+   assign c[110]=G_109_0;
+   assign c[111]=G_110_0;
+   assign c[112]=G_111_0;
+   assign c[113]=G_112_0;
+
+   assign c[114]=G_113_0;
+   assign c[115]=G_114_0;
+   assign c[116]=G_115_0;
+   assign c[117]=G_116_0;
+   assign c[118]=G_117_0;
+   assign c[119]=G_118_0;
+   assign c[120]=G_119_0;
+   assign c[121]=G_120_0;
+
+   assign c[122]=G_121_0;
+   assign c[123]=G_122_0;
+   assign c[124]=G_123_0;
+   assign c[125]=G_124_0;
+   assign c[126]=G_125_0;
+   assign c[127]=G_126_0;
+   assign c[128]=G_127_0;
+
+endmodule // ladner_fischer128
+
--- a/wally-pipelined/src/fpu/ldf64.sv
+++ b/wally-pipelined/src/fpu/ldf64.sv
@ -0,0 +1,273 @@
+// Ladner-Fischer Prefix Adder
+
+// 64-bit adder wrapper around the ladner_fischer64 carry network:
+// forms per-bit propagate/generate terms, lets the prefix tree produce the
+// carries, then combines them into sum and cout.
+module ldf64 (
+   output        cout,
+   output [63:0] sum,
+   input  [63:0] a, b,
+   input         cin
+);
+
+   // Index 0 of both vectors stands for the carry-in; operand bit i
+   // lives at index i+1.
+   wire [64:0] prop_vec = {a ^ b, 1'b0};
+   wire [64:0] gen_vec  = {a & b, cin};
+
+   // carry_in[i] is the carry entering operand bit i (the tree's c[i+1]).
+   wire [63:0] carry_in;
+
+   // Carry network: only the low 64 entries of each vector are needed.
+   ladner_fischer64 prefix_tree (carry_in, prop_vec[63:0], gen_vec[63:0]);
+
+   // Sum bit = propagate XOR incoming carry; the carry-out is resolved
+   // from the top operand bit and the carry entering it.
+   assign sum  = carry_in ^ prop_vec[64:1];
+   assign cout = (carry_in[63] & prop_vec[64]) | gen_vec[64];
+
+endmodule
+
+// Carry network for ldf64: turns per-bit propagate (p) and generate (g)
+// terms into the carry vector c, where c[k+1] = G_k_0 (the group generate
+// covering indices k..0).
+//   p, g : 64-entry propagate/generate vectors; p[0] is never referenced
+//          inside this module.
+//   c    : carries, indexed 64 down to 1.
+// The grey and black cells are defined outside this section. As used here,
+// a grey cell has one output (G) and a black cell has two (G and P);
+// presumably they implement the usual prefix operators - confirm against
+// their definitions. All G_*/P_* nets are implicitly declared; each name
+// gives the index range the term covers.
+module ladner_fischer64 (c, p, g);
+   
+   input [63:0] p;
+   input [63:0] g;
+   
+   output [64:1] c;
+
+   // parallel-prefix, Ladner-Fischer
+
+   // Stage 1: combine each adjacent index pair (2i+1, 2i) into a pair-wide term
+   grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
+   black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
+   black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
+   black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
+   black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
+   black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
+   black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
+   black b_15_14 (G_15_14, P_15_14, {g[15],g[14]}, {p[15],p[14]});
+
+   black b_17_16 (G_17_16, P_17_16, {g[17],g[16]}, {p[17],p[16]});
+   black b_19_18 (G_19_18, P_19_18, {g[19],g[18]}, {p[19],p[18]});
+   black b_21_20 (G_21_20, P_21_20, {g[21],g[20]}, {p[21],p[20]});
+   black b_23_22 (G_23_22, P_23_22, {g[23],g[22]}, {p[23],p[22]});
+   black b_25_24 (G_25_24, P_25_24, {g[25],g[24]}, {p[25],p[24]});
+   black b_27_26 (G_27_26, P_27_26, {g[27],g[26]}, {p[27],p[26]});
+   black b_29_28 (G_29_28, P_29_28, {g[29],g[28]}, {p[29],p[28]});
+   black b_31_30 (G_31_30, P_31_30, {g[31],g[30]}, {p[31],p[30]});
+
+   black b_33_32 (G_33_32, P_33_32, {g[33],g[32]}, {p[33],p[32]});
+   black b_35_34 (G_35_34, P_35_34, {g[35],g[34]}, {p[35],p[34]});
+   black b_37_36 (G_37_36, P_37_36, {g[37],g[36]}, {p[37],p[36]});
+   black b_39_38 (G_39_38, P_39_38, {g[39],g[38]}, {p[39],p[38]});
+   black b_41_40 (G_41_40, P_41_40, {g[41],g[40]}, {p[41],p[40]});
+   black b_43_42 (G_43_42, P_43_42, {g[43],g[42]}, {p[43],p[42]});
+   black b_45_44 (G_45_44, P_45_44, {g[45],g[44]}, {p[45],p[44]});
+   black b_47_46 (G_47_46, P_47_46, {g[47],g[46]}, {p[47],p[46]});
+
+   black b_49_48 (G_49_48, P_49_48, {g[49],g[48]}, {p[49],p[48]});
+   black b_51_50 (G_51_50, P_51_50, {g[51],g[50]}, {p[51],p[50]});
+   black b_53_52 (G_53_52, P_53_52, {g[53],g[52]}, {p[53],p[52]});
+   black b_55_54 (G_55_54, P_55_54, {g[55],g[54]}, {p[55],p[54]});
+   black b_57_56 (G_57_56, P_57_56, {g[57],g[56]}, {p[57],p[56]});
+   black b_59_58 (G_59_58, P_59_58, {g[59],g[58]}, {p[59],p[58]});
+   black b_61_60 (G_61_60, P_61_60, {g[61],g[60]}, {p[61],p[60]});
+   black b_63_62 (G_63_62, P_63_62, {g[63],g[62]}, {p[63],p[62]});
+
+   // Stage 2: merge neighbouring Stage-1 pairs into terms covering 4 indices
+   grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
+   black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
+   black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
+   black b_15_12 (G_15_12, P_15_12, {G_15_14,G_13_12}, {P_15_14,P_13_12});
+   black b_19_16 (G_19_16, P_19_16, {G_19_18,G_17_16}, {P_19_18,P_17_16});
+   black b_23_20 (G_23_20, P_23_20, {G_23_22,G_21_20}, {P_23_22,P_21_20});
+   black b_27_24 (G_27_24, P_27_24, {G_27_26,G_25_24}, {P_27_26,P_25_24});
+   black b_31_28 (G_31_28, P_31_28, {G_31_30,G_29_28}, {P_31_30,P_29_28});
+
+   black b_35_32 (G_35_32, P_35_32, {G_35_34,G_33_32}, {P_35_34,P_33_32});
+   black b_39_36 (G_39_36, P_39_36, {G_39_38,G_37_36}, {P_39_38,P_37_36});
+   black b_43_40 (G_43_40, P_43_40, {G_43_42,G_41_40}, {P_43_42,P_41_40});
+   black b_47_44 (G_47_44, P_47_44, {G_47_46,G_45_44}, {P_47_46,P_45_44});
+   black b_51_48 (G_51_48, P_51_48, {G_51_50,G_49_48}, {P_51_50,P_49_48});
+   black b_55_52 (G_55_52, P_55_52, {G_55_54,G_53_52}, {P_55_54,P_53_52});
+   black b_59_56 (G_59_56, P_59_56, {G_59_58,G_57_56}, {P_59_58,P_57_56});
+   black b_63_60 (G_63_60, P_63_60, {G_63_62,G_61_60}, {P_63_62,P_61_60});
+
+   // Stage 3: within each 8-index block, both cells share the block's lower
+   // 4-index term (G_3_0, G_11_8, ...) as their second operand
+   grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
+   grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
+   black b_13_8 (G_13_8, P_13_8, {G_13_12,G_11_8}, {P_13_12,P_11_8});
+   black b_15_8 (G_15_8, P_15_8, {G_15_12,G_11_8}, {P_15_12,P_11_8});
+   black b_21_16 (G_21_16, P_21_16, {G_21_20,G_19_16}, {P_21_20,P_19_16});
+   black b_23_16 (G_23_16, P_23_16, {G_23_20,G_19_16}, {P_23_20,P_19_16});
+   black b_29_24 (G_29_24, P_29_24, {G_29_28,G_27_24}, {P_29_28,P_27_24});
+   black b_31_24 (G_31_24, P_31_24, {G_31_28,G_27_24}, {P_31_28,P_27_24});
+
+   black b_37_32 (G_37_32, P_37_32, {G_37_36,G_35_32}, {P_37_36,P_35_32});
+   black b_39_32 (G_39_32, P_39_32, {G_39_36,G_35_32}, {P_39_36,P_35_32});
+   black b_45_40 (G_45_40, P_45_40, {G_45_44,G_43_40}, {P_45_44,P_43_40});
+   black b_47_40 (G_47_40, P_47_40, {G_47_44,G_43_40}, {P_47_44,P_43_40});
+   black b_53_48 (G_53_48, P_53_48, {G_53_52,G_51_48}, {P_53_52,P_51_48});
+   black b_55_48 (G_55_48, P_55_48, {G_55_52,G_51_48}, {P_55_52,P_51_48});
+   black b_61_56 (G_61_56, P_61_56, {G_61_60,G_59_56}, {P_61_60,P_59_56});
+   black b_63_56 (G_63_56, P_63_56, {G_63_60,G_59_56}, {P_63_60,P_59_56});
+
+   // Stage 4: within each 16-index block, all four cells share the block's
+   // lower 8-index term (G_7_0, G_23_16, G_39_32, G_55_48)
+   grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
+   grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
+   grey g_13_0 (G_13_0, {G_13_8,G_7_0}, P_13_8);
+   grey g_15_0 (G_15_0, {G_15_8,G_7_0}, P_15_8);
+   black b_25_16 (G_25_16, P_25_16, {G_25_24,G_23_16}, {P_25_24,P_23_16});
+   black b_27_16 (G_27_16, P_27_16, {G_27_24,G_23_16}, {P_27_24,P_23_16});
+   black b_29_16 (G_29_16, P_29_16, {G_29_24,G_23_16}, {P_29_24,P_23_16});
+   black b_31_16 (G_31_16, P_31_16, {G_31_24,G_23_16}, {P_31_24,P_23_16});
+
+   black b_41_32 (G_41_32, P_41_32, {G_41_40,G_39_32}, {P_41_40,P_39_32});
+   black b_43_32 (G_43_32, P_43_32, {G_43_40,G_39_32}, {P_43_40,P_39_32});
+   black b_45_32 (G_45_32, P_45_32, {G_45_40,G_39_32}, {P_45_40,P_39_32});
+   black b_47_32 (G_47_32, P_47_32, {G_47_40,G_39_32}, {P_47_40,P_39_32});
+   black b_57_48 (G_57_48, P_57_48, {G_57_56,G_55_48}, {P_57_56,P_55_48});
+   black b_59_48 (G_59_48, P_59_48, {G_59_56,G_55_48}, {P_59_56,P_55_48});
+   black b_61_48 (G_61_48, P_61_48, {G_61_56,G_55_48}, {P_61_56,P_55_48});
+   black b_63_48 (G_63_48, P_63_48, {G_63_56,G_55_48}, {P_63_56,P_55_48});
+
+   // Stage 5: within each 32-index block, all eight cells share the block's
+   // lower 16-index term (G_15_0 or G_47_32)
+   grey g_17_0 (G_17_0, {G_17_16,G_15_0}, P_17_16);
+   grey g_19_0 (G_19_0, {G_19_16,G_15_0}, P_19_16);
+   grey g_21_0 (G_21_0, {G_21_16,G_15_0}, P_21_16);
+   grey g_23_0 (G_23_0, {G_23_16,G_15_0}, P_23_16);
+   grey g_25_0 (G_25_0, {G_25_16,G_15_0}, P_25_16);
+   grey g_27_0 (G_27_0, {G_27_16,G_15_0}, P_27_16);
+   grey g_29_0 (G_29_0, {G_29_16,G_15_0}, P_29_16);
+   grey g_31_0 (G_31_0, {G_31_16,G_15_0}, P_31_16);
+
+   black b_49_32 (G_49_32, P_49_32, {G_49_48,G_47_32}, {P_49_48,P_47_32});
+   black b_51_32 (G_51_32, P_51_32, {G_51_48,G_47_32}, {P_51_48,P_47_32});
+   black b_53_32 (G_53_32, P_53_32, {G_53_48,G_47_32}, {P_53_48,P_47_32});
+   black b_55_32 (G_55_32, P_55_32, {G_55_48,G_47_32}, {P_55_48,P_47_32});
+   black b_57_32 (G_57_32, P_57_32, {G_57_48,G_47_32}, {P_57_48,P_47_32});
+   black b_59_32 (G_59_32, P_59_32, {G_59_48,G_47_32}, {P_59_48,P_47_32});
+   black b_61_32 (G_61_32, P_61_32, {G_61_48,G_47_32}, {P_61_48,P_47_32});
+   black b_63_32 (G_63_32, P_63_32, {G_63_48,G_47_32}, {P_63_48,P_47_32});
+
+   // Stage 6: every upper-half term is combined with G_31_0, giving the
+   // odd-indexed G_k_0 for k = 33..63
+   grey g_33_0 (G_33_0, {G_33_32,G_31_0}, P_33_32);
+   grey g_35_0 (G_35_0, {G_35_32,G_31_0}, P_35_32);
+   grey g_37_0 (G_37_0, {G_37_32,G_31_0}, P_37_32);
+   grey g_39_0 (G_39_0, {G_39_32,G_31_0}, P_39_32);
+   grey g_41_0 (G_41_0, {G_41_32,G_31_0}, P_41_32);
+   grey g_43_0 (G_43_0, {G_43_32,G_31_0}, P_43_32);
+   grey g_45_0 (G_45_0, {G_45_32,G_31_0}, P_45_32);
+   grey g_47_0 (G_47_0, {G_47_32,G_31_0}, P_47_32);
+
+   grey g_49_0 (G_49_0, {G_49_32,G_31_0}, P_49_32);
+   grey g_51_0 (G_51_0, {G_51_32,G_31_0}, P_51_32);
+   grey g_53_0 (G_53_0, {G_53_32,G_31_0}, P_53_32);
+   grey g_55_0 (G_55_0, {G_55_32,G_31_0}, P_55_32);
+   grey g_57_0 (G_57_0, {G_57_32,G_31_0}, P_57_32);
+   grey g_59_0 (G_59_0, {G_59_32,G_31_0}, P_59_32);
+   grey g_61_0 (G_61_0, {G_61_32,G_31_0}, P_61_32);
+   grey g_63_0 (G_63_0, {G_63_32,G_31_0}, P_63_32);
+
+   // Extra grey cell stage: each even-indexed G_k_0 (k = 2..62) is built
+   // from g[k], p[k] and the odd-indexed G_(k-1)_0 produced above
+   grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
+   grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
+   grey g_6_0 (G_6_0, {g[6],G_5_0}, p[6]);
+   grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
+   grey g_10_0 (G_10_0, {g[10],G_9_0}, p[10]);
+   grey g_12_0 (G_12_0, {g[12],G_11_0}, p[12]);
+   grey g_14_0 (G_14_0, {g[14],G_13_0}, p[14]);
+   grey g_16_0 (G_16_0, {g[16],G_15_0}, p[16]);
+   grey g_18_0 (G_18_0, {g[18],G_17_0}, p[18]);
+   grey g_20_0 (G_20_0, {g[20],G_19_0}, p[20]);
+   grey g_22_0 (G_22_0, {g[22],G_21_0}, p[22]);
+   grey g_24_0 (G_24_0, {g[24],G_23_0}, p[24]);
+   grey g_26_0 (G_26_0, {g[26],G_25_0}, p[26]);
+   grey g_28_0 (G_28_0, {g[28],G_27_0}, p[28]);
+   grey g_30_0 (G_30_0, {g[30],G_29_0}, p[30]);
+   grey g_32_0 (G_32_0, {g[32],G_31_0}, p[32]);
+   grey g_34_0 (G_34_0, {g[34],G_33_0}, p[34]);
+   grey g_36_0 (G_36_0, {g[36],G_35_0}, p[36]);
+   grey g_38_0 (G_38_0, {g[38],G_37_0}, p[38]);
+   grey g_40_0 (G_40_0, {g[40],G_39_0}, p[40]);
+   grey g_42_0 (G_42_0, {g[42],G_41_0}, p[42]);
+   grey g_44_0 (G_44_0, {g[44],G_43_0}, p[44]);
+   grey g_46_0 (G_46_0, {g[46],G_45_0}, p[46]);
+   grey g_48_0 (G_48_0, {g[48],G_47_0}, p[48]);
+   grey g_50_0 (G_50_0, {g[50],G_49_0}, p[50]);
+   grey g_52_0 (G_52_0, {g[52],G_51_0}, p[52]);
+   grey g_54_0 (G_54_0, {g[54],G_53_0}, p[54]);
+   grey g_56_0 (G_56_0, {g[56],G_55_0}, p[56]);
+   grey g_58_0 (G_58_0, {g[58],G_57_0}, p[58]);
+   grey g_60_0 (G_60_0, {g[60],G_59_0}, p[60]);
+   grey g_62_0 (G_62_0, {g[62],G_61_0}, p[62]);
+
+   // Final Stage: Apply c_k+1=G_k_0 (c[1] is simply g[0], the index-0 generate)
+   assign c[1]=g[0];
+   assign c[2]=G_1_0;
+   assign c[3]=G_2_0;
+   assign c[4]=G_3_0;
+   assign c[5]=G_4_0;
+   assign c[6]=G_5_0;
+   assign c[7]=G_6_0;
+   assign c[8]=G_7_0;
+   assign c[9]=G_8_0;
+
+   assign c[10]=G_9_0;
+   assign c[11]=G_10_0;
+   assign c[12]=G_11_0;
+   assign c[13]=G_12_0;
+   assign c[14]=G_13_0;
+   assign c[15]=G_14_0;
+   assign c[16]=G_15_0;
+   assign c[17]=G_16_0;
+
+   assign c[18]=G_17_0;
+   assign c[19]=G_18_0;
+   assign c[20]=G_19_0;
+   assign c[21]=G_20_0;
+   assign c[22]=G_21_0;
+   assign c[23]=G_22_0;
+   assign c[24]=G_23_0;
+   assign c[25]=G_24_0;
+
+   assign c[26]=G_25_0;
+   assign c[27]=G_26_0;
+   assign c[28]=G_27_0;
+   assign c[29]=G_28_0;
+   assign c[30]=G_29_0;
+   assign c[31]=G_30_0;
+   assign c[32]=G_31_0;
+   assign c[33]=G_32_0;
+
+   assign c[34]=G_33_0;
+   assign c[35]=G_34_0;
+   assign c[36]=G_35_0;
+   assign c[37]=G_36_0;
+   assign c[38]=G_37_0;
+   assign c[39]=G_38_0;
+   assign c[40]=G_39_0;
+   assign c[41]=G_40_0;
+
+   assign c[42]=G_41_0;
+   assign c[43]=G_42_0;
+   assign c[44]=G_43_0;
+   assign c[45]=G_44_0;
+   assign c[46]=G_45_0;
+   assign c[47]=G_46_0;
+   assign c[48]=G_47_0;
+   assign c[49]=G_48_0;
+
+   assign c[50]=G_49_0;
+   assign c[51]=G_50_0;
+   assign c[52]=G_51_0;
+   assign c[53]=G_52_0;
+   assign c[54]=G_53_0;
+   assign c[55]=G_54_0;
+   assign c[56]=G_55_0;
+   assign c[57]=G_56_0;
+
+   assign c[58]=G_57_0;
+   assign c[59]=G_58_0;
+   assign c[60]=G_59_0;
+   assign c[61]=G_60_0;
+   assign c[62]=G_61_0;
+   assign c[63]=G_62_0;
+   assign c[64]=G_63_0;
+
+endmodule // ladner_fischer
+
--- a/wally-pipelined/src/fpu/ling_bk13.sv
+++ b/wally-pipelined/src/fpu/ling_bk13.sv
@ -0,0 +1,89 @@
+// Brent-Kung Prefix Adder
+
+// 13-bit adder wrapper around the ling_brent_kung prefix tree.
+//   a, b : 13-bit operands      cin  : carry-in
+//   sum  : 13-bit result        cout : carry-out
+// Unlike a conventional prefix adder, p is the bitwise OR of the operands
+// and the tree returns both h (pseudo-carry) and c terms, which the
+// post-computation combines into sum/cout.
+module ling_bk13 (cout, sum, a, b, cin);
+	 input [12:0] a, b;
+	 input cin;
+	 output [12:0] sum;
+	 output cout;
+
+	 // Index 0 of p/g stands for the carry-in; operand bit i sits at index i+1.
+	 wire [13:0] p,g;
+	 wire [13:1] h,c;
+
+// pre-computation
+	 assign p={a|b,1'b1};
+	 assign g={a&b, cin};
+
+// prefix tree
+	 // ling_brent_kung declares its g port as [13:0], so connect all 14 bits.
+	 // Previously only g[12:0] was connected and the port's top bit was
+	 // implicitly zero-extended. The tree's outputs do not depend on g[13]
+	 // (it only feeds cell b_13_12, whose outputs are unused), so results
+	 // are unchanged; the connection is now width-correct.
+	 ling_brent_kung prefix_tree(h, c, p[12:0], g);
+
+// post-computation
+	 // The tree assigns h[12:1] only; the top entry is formed here.
+	 assign h[13]=g[13]|c[13];
+	 // Parentheses spell out the grouping the operator precedence already
+	 // implied (& binds tighter than ^, which binds tighter than |).
+	 assign sum=(p[13:1]^h)|(g[13:1]&c);
+	 assign cout=p[13]&h[13];
+
+endmodule
+
+// Prefix tree used by ling_bk13.
+//   p : 13-entry input (indices 12..0); p[0] is never referenced here.
+//   g : 14-entry input (indices 13..0); g[13] feeds only cell b_13_12.
+//   h : output, h[k] = H_k_0 for k = 1..12. h[13] is NOT assigned in this
+//       module (ling_bk13 drives it itself).
+//   c : output, c[1] = g[0] and c[k+1] = p[k] & H_k_0 for k = 1..12.
+// The rgry, rblk, grey and black cells are defined outside this section;
+// their exact functions should be confirmed against those definitions.
+// All H_*/I_* nets are implicitly declared; each name gives the index
+// range the term covers.
+module ling_brent_kung (h, c, p, g);
+	
+	input [12:0] p;
+	input [13:0] g;
+	output [13:1] h;
+	output [13:1] c;
+
+
+	// parallel-prefix, Brent-Kung
+
+	// Stage 1: combine each adjacent g pair into a pair-wide H/I term.
+	// Note the p operands are taken one index lower than the g operands
+	// (e.g. {g[3],g[2]} is paired with {p[2],p[1]}).
+	rgry g_1_0 (H_1_0, {g[1],g[0]});
+	rblk b_3_2 (H_3_2, I_3_2, {g[3],g[2]}, {p[2],p[1]});
+	rblk b_5_4 (H_5_4, I_5_4, {g[5],g[4]}, {p[4],p[3]});
+	rblk b_7_6 (H_7_6, I_7_6, {g[7],g[6]}, {p[6],p[5]});
+	rblk b_9_8 (H_9_8, I_9_8, {g[9],g[8]}, {p[8],p[7]});
+	rblk b_11_10 (H_11_10, I_11_10, {g[11],g[10]}, {p[10],p[9]});
+	// H_13_12 / I_13_12 are not consumed by any later cell in this module.
+	rblk b_13_12 (H_13_12, I_13_12, {g[13],g[12]}, {p[12],p[11]});
+
+	// Stage 2: merge neighbouring Stage-1 pairs into terms covering 4 indices
+	grey g_3_0 (H_3_0, {H_3_2,H_1_0}, I_3_2);
+	black b_7_4 (H_7_4, I_7_4, {H_7_6,H_5_4}, {I_7_6,I_5_4});
+	black b_11_8 (H_11_8, I_11_8, {H_11_10,H_9_8}, {I_11_10,I_9_8});
+
+	// Stage 3: merge the two lowest 4-index terms into H_7_0
+	grey g_7_0 (H_7_0, {H_7_4,H_3_0}, I_7_4);
+
+	// Stage 4: no cells are needed at this level for a 13-bit tree
+
+	// Stage 5: extend H_7_0 upward by the 4-index term H_11_8
+	grey g_11_0 (H_11_0, {H_11_8,H_7_0}, I_11_8);
+
+	// Stage 6: extend H_3_0 and H_7_0 upward by one pair each
+	grey g_5_0 (H_5_0, {H_5_4,H_3_0}, I_5_4);
+	grey g_9_0 (H_9_0, {H_9_8,H_7_0}, I_9_8);
+
+	// Last grey cell stage: each even-indexed H_k_0 (k = 2..12) is built from
+	// g[k], p[k-1] and the odd-indexed H_(k-1)_0 produced above
+	grey g_2_0 (H_2_0, {g[2],H_1_0}, p[1]);
+	grey g_4_0 (H_4_0, {g[4],H_3_0}, p[3]);
+	grey g_6_0 (H_6_0, {g[6],H_5_0}, p[5]);
+	grey g_8_0 (H_8_0, {g[8],H_7_0}, p[7]);
+	grey g_10_0 (H_10_0, {g[10],H_9_0}, p[9]);
+	grey g_12_0 (H_12_0, {g[12],H_11_0}, p[11]);
+
+	// Final Stage: Apply h_k=H_k_0 and c_k+1=p_k&H_k_0
+	assign c[1]=g[0];
+
+	assign h[1]=H_1_0;		assign c[2]=p[1]&H_1_0;
+	assign h[2]=H_2_0;		assign c[3]=p[2]&H_2_0;
+	assign h[3]=H_3_0;		assign c[4]=p[3]&H_3_0;
+	assign h[4]=H_4_0;		assign c[5]=p[4]&H_4_0;
+	assign h[5]=H_5_0;		assign c[6]=p[5]&H_5_0;
+	assign h[6]=H_6_0;		assign c[7]=p[6]&H_6_0;
+	assign h[7]=H_7_0;		assign c[8]=p[7]&H_7_0;
+	assign h[8]=H_8_0;		assign c[9]=p[8]&H_8_0;
+
+	assign h[9]=H_9_0;		assign c[10]=p[9]&H_9_0;
+	assign h[10]=H_10_0;		assign c[11]=p[10]&H_10_0;
+	assign h[11]=H_11_0;		assign c[12]=p[11]&H_11_0;
+	assign h[12]=H_12_0;		assign c[13]=p[12]&H_12_0;
+
+endmodule
+
+
--- a/wally-pipelined/src/fpu/lzd_denorm.sv
+++ b/wally-pipelined/src/fpu/lzd_denorm.sv
@ -0,0 +1,170 @@
+// module lz2 (P, V, B0, B1);
+
+//    input B0;
+//    input B1;
+
+//    output P;
+//    output V;
+
+//    assign V = B0 | B1;
+//    assign P = B0 & ~B1;
+   
+// endmodule // lz2
+
+// Note: This module is not made out of two lz2's - why not? (MJS)
+
+// module lz4 (ZP, ZV, B0, B1, V0, V1);
+   
+//    input B0;
+//    input B1;
+//    input V0;
+//    input V1;
+
+//    output [1:0] ZP;
+//    output 	ZV;
+
+//    assign ZP[0] = V0 ? B0 : B1;
+//    assign ZP[1] = ~V0;
+//    assign ZV = V0 | V1;
+
+// endmodule // lz4
+
+// // Note: This module is not made out of two lz4's - why not? (MJS)
+
+// module lz8 (ZP, ZV, B);
+   
+//    input [7:0] B;
+
+//    wire        s1p0;
+//    wire        s1v0;
+//    wire        s1p1;
+//    wire        s1v1;
+//    wire        s2p0;
+//    wire        s2v0;
+//    wire        s2p1;
+//    wire        s2v1;
+//    wire [1:0]  ZPa;
+//    wire [1:0]  ZPb;
+//    wire        ZVa;
+//    wire        ZVb;
+   
+//    output [2:0] ZP;
+//    output       ZV;
+   
+//    lz2 l1(s1p0, s1v0, B[2], B[3]);
+//    lz2 l2(s1p1, s1v1, B[0], B[1]);
+//    lz4 l3(ZPa, ZVa, s1p0, s1p1, s1v0, s1v1);
+
+//    lz2 l4(s2p0, s2v0, B[6], B[7]);
+//    lz2 l5(s2p1, s2v1, B[4], B[5]);
+//    lz4 l6(ZPb, ZVb, s2p0, s2p1, s2v0, s2v1);
+
+//    assign ZP[1:0] = ZVb ? ZPb : ZPa;
+//    assign ZP[2]   = ~ZVb;
+//    assign ZV = ZVa | ZVb;
+
+// endmodule // lz8
+
+// module lz16 (ZP, ZV, B);
+
+//    input [15:0] B;
+
+//    wire [2:0] 	ZPa;
+//    wire [2:0] 	ZPb;
+//    wire 	ZVa;
+//    wire 	ZVb;   
+
+//    output [3:0] ZP;
+//    output 	ZV;
+
+//    lz8 l1(ZPa, ZVa, B[7:0]);
+//    lz8 l2(ZPb, ZVb, B[15:8]);
+
+//    assign ZP[2:0] = ZVb ? ZPb : ZPa;
+//    assign ZP[3]   = ~ZVb;
+//    assign ZV = ZVa | ZVb;
+
+// endmodule // lz16
+
+// module lz32 (ZP, ZV, B);
+
+//    input [31:0] B;
+
+//    wire [3:0] 	ZPa;
+//    wire [3:0] 	ZPb;
+//    wire 	ZVa;
+//    wire 	ZVb;
+
+//    output [4:0] ZP;
+//    output 	ZV;
+
+//    lz16 l1(ZPa, ZVa, B[15:0]);
+//    lz16 l2(ZPb, ZVb, B[31:16]);
+
+//    assign ZP[3:0] = ZVb ? ZPb : ZPa;
+//    assign ZP[4]   = ~ZVb;
+//    assign ZV = ZVa | ZVb;
+
+// endmodule // lz32
+
+// // This module returns the number of leading zeros ZP in the 64-bit 
+// // number B. If there are no ones in B, then ZP and ZV are both 0.
+
+// module lz64 (ZP, ZV, B);
+
+//    input [63:0] B;
+
+//    wire [4:0] 	ZPa;
+//    wire [4:0] 	ZPb;
+//    wire 	ZVa;
+//    wire 	ZVb;   
+
+//    output [5:0] ZP;
+//    output 	ZV;
+
+//    lz32 l1(ZPa, ZVa, B[31:0]);
+//    lz32 l2(ZPb, ZVb, B[63:32]);
+
+//    assign ZV = ZVa | ZVb;
+//    assign ZP[4:0] = (ZVb ? ZPb : ZPa) & {5{ZV}};
+//    assign ZP[5]   = ~ZVb & ZV;
+
+// endmodule // lz64
+
+// This module returns the number of leading zeros ZP in the 52-bit 
+// number B, together with ZV, the OR of the three sub-detectors' valid
+// outputs. B is split into B[51:20] (lz32), B[19:4] (lz16) and B[3:0]
+// (two lz2 cells feeding an lz4); the highest segment that reports a one
+// selects the low bits of the count.
+//
+// NOTE(review): the earlier comment here stated that ZP and ZV are both 0
+// when B contains no ones. Unlike the commented-out lz64 above, ZP is not
+// masked with ZV in this module: when ZV_32 and ZV_16 are both 0,
+// ZP[5] = ~ZV_32 and ZP[4] = ~ZV_16 are both 1, so ZP is nonzero.
+// Confirm which behavior callers expect.
+//
+// The live lz2/lz4/lz16/lz32 definitions are not in this file section
+// (the copies above are commented out).
+
+module lz52 (ZP, ZV, B);
+
+   input [51:0] B;
+
+   // Position (ZP_*) and valid (ZV_*) outputs of the three segment detectors
+   wire  [4:0]  ZP_32;
+   wire  [3:0]  ZP_16;
+   wire  [1:0]  ZP_4;
+   wire 	ZV_32;
+   wire 	ZV_16;
+   wire 	ZV_4;
+
+   // Outputs of the two lz2 cells covering B[3:2] (_1) and B[1:0] (_2)
+   wire 	ZP_2_1;
+   wire 	ZP_2_2;
+   wire 	ZV_2_1;
+   wire 	ZV_2_2;
+
+   output [5:0] ZP;
+   output 	ZV;
+
+   lz32 l1 (ZP_32, ZV_32, B[51:20]);
+   lz16 l2 (ZP_16, ZV_16, B[19:4]);
+
+   // NOTE(review): the commented-out lz8 above passes the lower-index bit
+   // first (e.g. lz2 l1(s1p0, s1v0, B[2], B[3])), whereas here the
+   // higher-index bit is passed first. If the live lz2 matches the
+   // commented-out definition (P = B0 & ~B1), these operands look swapped
+   // and the count would be wrong when the leading one is in B[3:0].
+   // TODO confirm against the lz2 definition in use.
+   lz2 l3_1 (ZP_2_1, ZV_2_1, B[3], B[2]);
+   lz2 l3_2 (ZP_2_2, ZV_2_2, B[1], B[0]);
+   lz4 l3_final (ZP_4, ZV_4, ZP_2_1, ZP_2_2, ZV_2_1, ZV_2_2);
+
+   // ZV is set when any segment reports a one.
+   assign ZV = ZV_32 | ZV_16 | ZV_4;
+   // Bit 5 (weight 32): set when the top 32-bit segment reports no one.
+   assign ZP[5] = ~ZV_32;
+   // Bit 4 (weight 16): from lz32 when it is valid; otherwise set only when
+   // the 16-bit segment is also empty (count is then 48 plus the lz4 result).
+   assign ZP[4] = ZV_32 ? ZP_32[4] : ~ZV_16;
+   // Bits 3:2 are zero when the count comes from the 4-bit tail.
+   assign ZP[3:2] = ZV_32 ? ZP_32[3:2] : (ZV_16 ? ZP_16[3:2] : 2'b0);
+   // Bits 1:0 come from the highest segment that reports a one.
+   assign ZP[1:0] = ZV_32 ? ZP_32[1:0] : (ZV_16 ? ZP_16[1:0] : ZP_4);
+
+endmodule // lz52
+
--- a/wally-pipelined/src/fpu/mult_R4_64_64_cs.sv
+++ b/wally-pipelined/src/fpu/mult_R4_64_64_cs.sv
--- a/wally-pipelined/src/fpu/rounder_denorm.sv
+++ b/wally-pipelined/src/fpu/rounder_denorm.sv
@ -0,0 +1,265 @@
+// The rounder takes as inputs a 64-bit value to be rounded, A, the 
+// exponent of the value to be rounded, the sign of the final result, Sign, 
+// the precision of the results, P, and the rounding mode, rm. 
+// It produces a rounded 52-bit result, Z, the exponent of the rounded 
+// result, Z_exp, and a flag that indicates if the result was rounded,
+// Inexact. The rm port is three bits wide; with rm[2] = 0 the low two
+// bits select the rounding mode as follows.
+//	rm[1:0]		Mode
+//      00 		round-to-nearest-even
+//	01 		round-toward-zero
+//      10 		round-toward-plus infinity
+//      11  		round-toward-minus infinity
+// With rm[2] = 1, one is added whenever the round bit R is one.
+// The rounding algorithm determines if '1' should be added to the 
+// truncated significand result, based on three significant bits 
+// (least (L), round (R) and sticky (S)), the rounding mode (rm)
+// and the sign of the final result (Sign). Visually, L and R appear as
+//    xxxxxL,Rxxxxxxx
+// where , denotes the rounding boundary. S is the logical OR of all the
+// bits to the right of R. 
+
+// Rounds the 64-bit value A and assembles the packed result.
+// Outputs:
+//   Result   : 64-bit packed result (see the final assign for the layouts)
+//   DenormIO : DenormIn | UnderFlow
+//   Flags    : {UnderFlow, 1'b0, OverFlow, Invalid, Inexact}
+// Inputs as used in this module:
+//   rm        : rounding mode (3 bits; rm[2] selects the "add one when R" case)
+//   P         : precision select; P = 1 takes the single-precision paths
+//   OvEn/UnEn : overflow / underflow trap enables
+//   sel_inv   : special-case selector; sel_inv[2:0] = 000 is the normal
+//               (Valid) case and 001 is NaN; sel_inv[3] gates separate paths
+//   op_type   : operation selector; op_type[3] = 1 passes A through unchanged
+//   DenormIn and the *_Norm / *_unmodified / normal_* / exponent_postsum /
+//   sum inputs feed the denormal sign and exponent selection.
+// NOTE(review): the swap input is not referenced anywhere in this module.
+// cla52, cla12 and cla_sub12 are defined outside this section.
+module rounder (Result, DenormIO, Flags, rm, P, OvEn, 
+		UnEn, exp_valid, sel_inv, Invalid, DenormIn, convert, Asign, Aexp, 
+		norm_shift, A, exponent_postsum, A_Norm, B_Norm, exp_A_unmodified, exp_B_unmodified,
+		normal_overflow, normal_underflow, swap, op_type, sum);
+
+   input  [2:0]  rm;
+   input         P;
+   input         OvEn;
+   input         UnEn;
+   input         exp_valid;
+   input [3:0] 	 sel_inv;
+   input	 Invalid;
+   input	 DenormIn;
+   input         convert;
+   input         Asign;
+   input [10:0]  Aexp;
+   input [5:0] 	 norm_shift;
+   input [63:0]  A;
+   input [10:0]  exponent_postsum;
+   input 	 A_Norm;
+   input 	 B_Norm;
+   input [11:0]  exp_A_unmodified;
+   input [11:0]  exp_B_unmodified;
+   input 	 normal_overflow;
+   input 	 normal_underflow;
+   input 	 swap;
+   input [3:0]	 op_type;
+   input [63:0]  sum;
+   
+   output [63:0] Result;
+   output 	 DenormIO;
+   output [4:0]  Flags;
+   
+   wire          Rsign;
+   wire 	 Sticky_out;
+   wire [51:0]	 ShiftMant;
+   wire [63:0]   ShiftMant_64;		// NOTE(review): declared but never referenced below
+   wire [10:0] 	 Rexp;
+   wire [10:0]   Rexp_denorm;
+   wire [11:0] 	 Texp;			//Parallelized for denorm exponent
+   wire [11:0]   Texp_addone;		//results
+   wire [11:0]   Texp_subone;
+   wire [51:0] 	 Rmant;
+   wire [51:0] 	 Tmant;
+   wire          Rzero;
+   wire          VSS = 1'b0;
+   wire          VDD = 1'b1;
+   wire [51:0] 	 B;			// Value used to add the "ones"
+   wire [11:0]   B_12_overflow;		// Value used to add one to exponent
+   wire [11:0]   B_12_underflow;	// Value used to subtract one from exponent
+   wire		 S_SP;			// Single precision sticky bit
+   wire		 S_DP;			// Double precision sticky bit
+   wire		 S;			// Actual sticky bit
+   wire		 R;			// Round bit
+   wire		 L;			// Least significant bit
+   wire		 add_one;		// '1' if one should be added
+   wire		 UnFlow_SP, UnFlow_DP, UnderFlow; 
+   wire		 OvFlow_SP, OvFlow_DP, OverFlow;		
+   wire		 Inexact;
+   wire		 Round_zero;
+   wire		 Infinite;
+   wire		 VeryLarge;
+   wire		 Largest;
+   wire		 Adj_exp;
+   wire		 Valid;
+   wire		 NaN;
+   wire		 Cout;
+   wire 	 Cout_overflow;
+   wire		 Texp_l7z;
+   wire		 Texp_l7o;
+   wire		 OvCon;
+
+   // Determine the sticky bits for double and single precision:
+   // S_DP is the OR of A[9:0]; S_SP additionally ORs in A[38:10].
+   assign S_DP= A[9]|A[8]|A[7]|A[6]|A[5]|A[4]|A[3]|A[2]|A[1]|A[0];
+   assign S_SP = S_DP |A[38]|A[37]|A[36]|A[35]|A[34]|A[33]|A[32]|A[31]|A[30]|
+                 A[29]|A[28]|A[27]|A[26]|A[25]|A[24]|A[23]|A[22]|A[21]|A[20]|
+                 A[19]|A[18]|A[17]|A[16]|A[15]|A[14]|A[13]|A[12]|A[11]|A[10];
+
+   // Set the least (L), round (R), and sticky (S) bits based on
+   // the precision: A[40]/A[39] when P = 1, A[11]/A[10] when P = 0.
+   assign {L, R, S} = P ? {A[40],A[39],S_SP} : {A[11],A[10],S_DP};
+
+   // With rm[2] = 0: add one if ((the rounding mode is round-to-nearest)
+   // and (R is one) and (S or L is one)) or ((the rounding mode is towards
+   // plus or minus infinity (rm[1] = 1)) and (the sign and rm[0] are the
+   // same) and (R or S is one)).
+   // With rm[2] = 1: add one whenever R is one.
+
+   assign add_one = ~rm[2] & ((~rm[1]&~rm[0]&R&(L|S)) | (rm[1]&(Asign^~rm[0])&(R|S))) | (rm[2] & R);
+
+   // Add one using a 52-bit adder. The one is added to the LSB B[0] for
+   // double precision or to B[29] for single precision. 
+   // This could be simplified by using a specialized adder.
+   // The current adder is actually 64-bits. The leading one 
+   // for normalized results is not included in the addition.
+   assign B = {{22{VSS}}, add_one&P, {28{VSS}}, add_one&~P};
+   // 12-bit constants whose LSB is normal_overflow / normal_underflow
+   assign B_12_overflow = {8'h0, 3'b0, normal_overflow};
+   assign B_12_underflow = {8'h0, 3'b0, normal_underflow};
+
+   cla52 add1(Tmant, Cout, A[62:11], B);
+
+   // Texp_addone / Texp_subone are computed in parallel from Texp and are
+   // only selected in the Rexp_denorm expression below.
+   cla12 add1_exp(Texp_addone, Cout_overflow, Texp, B_12_overflow);
+
+   cla_sub12 sub1_exp(Texp_subone, Texp, B_12_underflow);
+
+   // Now that rounding is done, we compute the final exponent
+   // and test for special cases. 
+
+   // Compute the value of the exponent by subtracting the shift 
+   // value from the previous exponent and then adding 2 + cout. 
+   // If needed this could be optimized to use a specialized 
+   // adder. For a denormalized input, the zero-extended
+   // exponent_postsum is used instead.
+
+   assign Texp = DenormIn ? ({1'b0, exponent_postsum}) : ({VSS, Aexp} - {{6{VSS}}, norm_shift} +{{10{VSS}}, VDD, Cout});   
+   
+   // Overflow only occurs for double precision, if Texp[10] to Texp[0] are 
+   // all ones. To encourage sharing with single precision overflow detection,
+   // the lower 7 bits are tested separately. 
+   assign Texp_l7o  = Texp[6]&Texp[5]&Texp[4]&Texp[3]&Texp[2]&Texp[1]&Texp[0];
+   assign OvFlow_DP = Texp[10]&Texp[9]&Texp[8]&Texp[7]&Texp_l7o;
+
+   // Overflow occurs for single precision if (Texp[10] is one)  and 
+   // ((Texp[9] or Texp[8] or Texp[7]) is one) or (Texp[6] to Texp[0] 
+   // are all ones). 
+   assign OvFlow_SP = Texp[10]&(Texp[9]|Texp[8]|Texp[7]|Texp_l7o);
+
+   // Underflow occurs for double precision if (Texp[11] is one)  or Texp[10] to 
+   // Texp[0] are all zeros. 
+   assign Texp_l7z  = ~Texp[6]&~Texp[5]&~Texp[4]&~Texp[3]&~Texp[2]&~Texp[1]&~Texp[0];
+   assign UnFlow_DP = Texp[11] | ~Texp[10]&~Texp[9]&~Texp[8]&~Texp[7]&Texp_l7z;
+
+   // Underflow occurs for single precision if (Texp[10] is zero)  and 
+   // ((Texp[9] or Texp[8] or Texp[7]) is zero or Texp[6] to Texp[0] are
+   // all zeros). 
+   assign UnFlow_SP = (~Texp[10]&(~Texp[9]|~Texp[8]|~Texp[7]|Texp_l7z));
+   
+   // Set the overflow and underflow flags. They should not be set if
+   // the input was infinite or NaN or the output of the adder is zero.
+   // sel_inv[2:0] = 000 : Valid
+   // sel_inv[2:0] = 001 : NaN
+   // UnderFlow is also raised when sel_inv[3] is set and Aexp equals
+   // 11'b01110000000. Since & binds tighter than |, the first term groups
+   // as ((P & UnFlow_SP) | UnFlow_DP).
+   assign Valid = (~sel_inv[2]&~sel_inv[1]&~sel_inv[0]);
+   assign NaN   = ~sel_inv[2]&~sel_inv[1]& sel_inv[0];
+   assign UnderFlow = ((P & UnFlow_SP | UnFlow_DP)&Valid&exp_valid) |
+		      (~Aexp[10]&Aexp[9]&Aexp[8]&Aexp[7]&~Aexp[6]
+		       &~Aexp[5]&~Aexp[4]&~Aexp[3]&~Aexp[2]
+		       &~Aexp[1]&~Aexp[0]&sel_inv[3]);
+   assign OverFlow  = (P & OvFlow_SP | OvFlow_DP)&Valid&~UnderFlow&exp_valid;
+
+   // The DenormIO is set if underflow has occurred or if there was a
+   // denormalized input. 
+   assign DenormIO = DenormIn | UnderFlow;
+
+   // The final result is Inexact if any rounding occurred ((i.e., R or S 
+   // is one), or (if the result overflows ) or (if the result underflows and the 
+   // underflow trap is not enabled)) and (value of the result was not previously set 
+   // by an exception case). 
+   assign Inexact = (R|S|OverFlow|(UnderFlow&~UnEn))&Valid;
+
+   // Set the IEEE Exception Flags. From MSB to LSB the vector is
+   // {UnderFlow, 0, OverFlow, Invalid, Inexact}.
+   // NOTE(review): the previous comment listed the order as "Inexact,
+   // Underflow, Overflow, Div_By_0, Invalid", which does not match this
+   // concatenation - confirm the order the consumer expects.
+   assign Flags = {UnderFlow, VSS, OverFlow, Invalid, Inexact};
+
+   // Determine the final result. 
+
+   // The sign of the final result is one if the result is not zero and
+   // the sign of A is one, or if the result is zero and the rounding 
+   // mode is round-to-minus infinity. The final result is zero, if exp_valid
+   // is zero. If underflow occurs, then the result is set to zero.
+   //   
+   // For Zero (goes equally for subtraction although 
+   // signs may alter operands sign):
+   // -0 + -0 = -0 (always)
+   // +0 + +0 = +0 (always)
+   // -0 + +0 = +0 (for RN, RZ, RU) 
+   // -0 + +0 = -0 (for RD) 
+   //
+   // When DenormIn is set, the sign instead comes from the first branch
+   // below: Asign, inverted under the conditions on op_type, sum[63],
+   // A_Norm/B_Norm, the exponent MSBs and normal_underflow.
+   assign Rzero = ~exp_valid | UnderFlow;
+   assign Rsign = DenormIn ?
+		  ( ~(op_type[2] | op_type[1] | op_type[0]) ? 
+		  ( (sum[63] & (A_Norm | B_Norm) & (exp_A_unmodified[11] ^ exp_B_unmodified[11])) ?
+		  ~Asign : Asign) 
+   		  : ( ((A_Norm ^ B_Norm) & (exp_A_unmodified[11] ~^ exp_B_unmodified[11])) ?
+		  (normal_underflow ? ~Asign : Asign) : Asign)
+		  ) 
+		  : ( ((Asign&exp_valid | 
+     	          (sel_inv[2]&~sel_inv[1]&sel_inv[0]&rm[1]&rm[0] |
+	          sel_inv[2]&sel_inv[1]&~sel_inv[0] |		  
+	          ~exp_valid&rm[1]&rm[0]&~sel_inv[2] | 
+	          UnderFlow&rm[1]&rm[0]) & ~convert) & ~sel_inv[3]) |
+		  (Asign & sel_inv[3]) );
+   
+   // The exponent of the final result is zero if the final result is 
+   // zero or a denorm, all ones if the final result is NaN or Infinite
+   // or overflow occurred and the magnitude of the number is 
+   // not rounded toward zero, and all ones with an LSB of zero
+   // if overflow occurred and the magnitude of the number is 
+   // rounded toward zero. If the result is single precision, 
+   // Texp[7] should be inverted. When the Overflow trap is enabled (OvEn = 1)
+   // and overflow occurs and the operation is not conversion, bits 10 and 9 are 
+   // inverted for double precision, and bits 7 and 6 are inverted for single precision. 
+   assign Round_zero = ~rm[1]&rm[0] | ~Asign&rm[0] | Asign&rm[1]&~rm[0];
+   assign VeryLarge = OverFlow & ~OvEn;
+   assign Infinite   = (VeryLarge & ~Round_zero) | (~sel_inv[2] & sel_inv[1]);
+   assign Largest = VeryLarge & Round_zero;
+   assign Adj_exp = OverFlow & OvEn & ~convert;
+   assign Rexp[10:1] = ({10{~Valid}} | 
+			{Texp[10]&~Adj_exp, Texp[9]&~Adj_exp, Texp[8], 
+			 (Texp[7]^P)&~(Adj_exp&P), Texp[6]&~(Adj_exp&P), Texp[5:1]} | 
+		        {10{VeryLarge}})&{10{~Rzero | NaN}};
+   assign Rexp[0]    = ({~Valid} | Texp[0] | Infinite)&(~Rzero | NaN)&~Largest;
+   
+   // The denormalized rounded exponent uses the overflow/underflow values
+   // computed in the fpadd component to round the exponent up or down 
+   // Depending on the operation and the signs of the original operands,
+   // underflow may or may not be needed to round.
+   // NOTE(review): in the non-denormal branch with op_type[3] set, the
+   // 12-bit exp_A_unmodified is assigned to this 11-bit net, so its top
+   // bit is dropped - confirm that truncation is intended.
+   assign Rexp_denorm = DenormIn ? 
+			((~op_type[2] & ~op_type[1] & op_type[0]) ? 
+				( ((A_Norm != B_Norm) & (exp_A_unmodified[11] == exp_B_unmodified[11])) ? 
+					( (normal_overflow == normal_underflow) ? Texp[10:0] : (normal_overflow ? Texp_addone[10:0] : Texp_subone[10:0]) ) 
+					: ( normal_overflow ? Texp_addone[10:0] : Texp[10:0] ) ) 
+				: ( ((A_Norm != B_Norm) & (exp_A_unmodified[11] != exp_B_unmodified[11])) ?	
+					( (normal_overflow == normal_underflow) ? Texp[10:0] : (normal_overflow ? Texp_addone[10:0] : Texp_subone[10:0]) ) 
+					: ( normal_overflow ? Texp_addone[10:0] : Texp[10:0] ) ) 
+				) : 
+			(op_type[3]) ? exp_A_unmodified : Rexp;
+
+   // If the result is zero or infinity, the mantissa is all zeros. 
+   // If the result is NaN, the mantissa is 10...0
+   // If the result is the largest floating point number, the mantissa
+   // is all ones. Otherwise, the mantissa is not changed. 
+   // If operation is denormalized, take the mantissa directly from
+   // its normalized value. 
+   assign Rmant[51] = Largest | NaN | (Tmant[51]&~Infinite&~Rzero);
+   assign Rmant[50:0] = {51{Largest}} | (Tmant[50:0]&{51{~Infinite&Valid&~Rzero}});
+
+   // Mantissa used on the DenormIn result path: the low 52 bits of A, unrounded.
+   assign ShiftMant = A[51:0];
+
+   // For single precision, the 8 least significant bits of the exponent
+   // and 23 most significant bits of the mantissa contain bits used 
+   // for the final result. A double precision result is returned if 
+   // overflow has occurred, the overflow trap is enabled, and a conversion
+   // is being performed. 
+   assign OvCon = OverFlow & OvEn & convert;
+
+   // Result selection, in priority order:
+   //   op_type[3]  : A is passed through unchanged
+   //   DenormIn    : {Rsign, Rexp_denorm, ShiftMant}
+   //   P & ~OvCon  : single-precision layout in the upper 32 bits, zeros below
+   //   otherwise   : {Rsign, Rexp, Rmant}
+   assign Result = (op_type[3]) ? {A[63:0]} : (DenormIn ? {Rsign, Rexp_denorm, ShiftMant} : ((P&~OvCon) ? {Rsign, Rexp[7:0], Rmant[51:29], {32{VSS}}}
+	           : {Rsign, Rexp, Rmant}));
+
+endmodule // rounder
+
--- a/wally-pipelined/src/fpu/rounder_div.sv
+++ b/wally-pipelined/src/fpu/rounder_div.sv
@ -0,0 +1,187 @@
+//
+// The rounder takes as inputs a 64-bit value to be rounded, A, the 
+// exponent of the value to be rounded, the sign of the final result, Sign, 
+// the precision of the results, P, and the two-bit rounding mode, rm. 
+// It produces a rounded 52-bit result, Z, the exponent of the rounded 
+// result, Z_exp, and a flag that indicates if the result was rounded,
+// Inexact. The rounding mode has the following values.
+//	rm		Mode
+//      00 		round-to-nearest-even
+//	01 		round-toward-zero
+//      10 		round-toward-plus infinity
+//      11  		round-toward-minus infinity
+//
+
+// rounder_div: final rounding / packing stage for the divide unit.
+//
+// Inputs
+//   rm        rounding mode; only rm[1:0] are read by this module
+//   P         precision select; P = 1 packs Result in single-precision
+//             layout and selects the single-precision guard bit and
+//             overflow/underflow checks
+//   OvEn      overflow trap enable
+//   UnEn      underflow trap enable
+//   exp_diff  13-bit exponent before the adjustments applied here
+//   sel_inv   special-case selector, decoded below into Valid, NaN,
+//             Div0, zero and infinite results
+//   Invalid   passed straight through to Flags[0]
+//   DenormIn  denormalized-input indicator, ORed into DenormIO
+//   SignR     sign of the result
+//   q1, qm1, qp1   quotient candidates used when q1[63] is 1
+//   q0, qm0, qp0   quotient candidates used when q1[63] is 0
+//   regr_out  128-bit remainder; all zeros means the quotient is exact
+//
+// Outputs
+//   Result    64-bit packed result
+//   DenormIO  DenormIn | UnderFlow
+//   Flags     {Inexact, UnderFlow, OverFlow, Div0, Invalid}
+//
+module rounder_div (Result, DenormIO, Flags, rm, P, OvEn, 
+		UnEn, exp_diff, sel_inv, Invalid, DenormIn, 
+		SignR, q1, qm1, qp1, q0, qm0, qp0, regr_out);
+
+   input  [2:0]   rm;
+   input          P;
+   input          OvEn;
+   input          UnEn;
+   input [12:0]   exp_diff;
+   input [2:0] 	  sel_inv;
+   input	  Invalid;
+   input	  DenormIn;
+   input 	  SignR;
+   
+   input [63:0]   q1;
+   input [63:0]   qm1;
+   input [63:0]   qp1;
+   input [63:0]   q0;
+   input [63:0]   qm0;
+   input [63:0]   qp0;   
+   input [127:0]  regr_out;
+   
+   output [63:0]  Result;
+   output 	  DenormIO;
+   output [4:0]   Flags;
+
+   supply1 	  vdd;
+   supply0 	  vss;
+   
+   wire 	  Rsign;
+   wire [10:0] 	  Rexp;
+   wire [12:0] 	  Texp;
+   wire [51:0] 	  Rmant;
+   wire [63:0] 	  Tmant;
+   wire [51:0] 	  Smant;   
+   wire 	  Rzero;
+   wire 	  Gdp, Gsp, G;
+   wire 	  UnFlow_SP, UnFlow_DP, UnderFlow; 
+   wire 	  OvFlow_SP, OvFlow_DP, OverFlow;		
+   wire 	  Inexact;
+   wire 	  Round_zero;
+   wire 	  Infinite;
+   wire 	  VeryLarge;
+   wire 	  Largest;
+   wire 	  Div0;      
+   wire 	  Adj_exp;
+   wire 	  Valid;
+   wire 	  NaN;
+   wire 	  Texp_l7z;
+   wire 	  Texp_l7o;
+   wire 	  OvCon;
+   wire [1:0] 	  mux_mant;
+   // zero_rem was previously an implicit net; declare it explicitly so the
+   // module also compiles under `default_nettype none.
+   wire 	  zero_rem;
+   wire 	  sign_rem;
+   wire [63:0] 	  q, qm, qp;
+   wire 	  exp_ovf;
+
+   // Remainder = 0?
+   assign zero_rem = ~(|regr_out);
+   // Remainder sign indicator: inverse of the remainder's top bit
+   assign sign_rem = ~regr_out[127];
+   // choose correct Guard bit [1,2) or [0,1)
+   assign Gdp = q1[63] ? q1[10] : q0[10];
+   assign Gsp = q1[63] ? q1[39] : q0[39];
+   assign G = P ? Gsp : Gdp;   
+   // Selection of Rounding (from logic/switching)
+   assign mux_mant[1] = (SignR&rm[1]&rm[0]&G) | (!SignR&rm[1]&!rm[0]&G) | 
+			(!rm[1]&!rm[0]&G&!sign_rem) | 
+			(SignR&rm[1]&rm[0]&!zero_rem&!sign_rem) | 
+			(!SignR&rm[1]&!rm[0]&!zero_rem&!sign_rem);
+   assign mux_mant[0] = (!SignR&rm[0]&!G&!zero_rem&sign_rem) | 
+			(!rm[1]&rm[0]&!G&!zero_rem&sign_rem) | 
+			(SignR&rm[1]&!rm[0]&!G&!zero_rem&sign_rem);
+   
+   // Which Q? The "1" candidates are used when q1[63] is set, otherwise
+   // the "0" candidates.
+   mux2 #(64) mx1 (q0, q1, q1[63], q);
+   mux2 #(64) mx2 (qm0, qm1, q1[63], qm);   
+   mux2 #(64) mx3 (qp0, qp1, q1[63], qp);
+   // Choose Q, Q+1, Q-1
+   // NOTE(review): mux3 is defined elsewhere; presumably mux_mant[1]
+   // selects qp and mux_mant[0] selects qm -- confirm against mux3.
+   mux3 #(64) mx4 (q, qm, qp, mux_mant, Tmant);
+   assign Smant = Tmant[62:11];
+   // Compute the value of the exponent
+   //   exponent is modified if we choose:
+   //   1.) we choose any qm0, qp0, q0 (since we shift mant)
+   //   2.) we choose qp and we overflow (for RU)
+   assign exp_ovf = |{qp[62:40], (qp[39:11] & {29{~P}})};
+   // Both adjustment terms are zero-extended to exactly 13 bits so every
+   // operand matches the width of Texp (the original padded to 14 bits and
+   // relied on truncation; the low 13 bits are identical).
+   assign Texp = exp_diff - {{12{vss}}, ~q1[63]} + {{12{vss}}, mux_mant[1]&qp1[63]&~exp_ovf};
+   
+   // Overflow occurs for double precision if Texp[12:11] is 01 or if
+   // Texp[10] to Texp[0] are all ones. To encourage sharing with single
+   // precision overflow detection, the lower 7 bits are tested separately. 
+   assign Texp_l7o  = Texp[6]&Texp[5]&Texp[4]&Texp[3]&Texp[2]&Texp[1]&Texp[0];
+   assign OvFlow_DP = (~Texp[12]&Texp[11]) | (Texp[10]&Texp[9]&Texp[8]&Texp[7]&Texp_l7o);
+
+   // Overflow occurs for single precision if (Texp[10] is one)  and 
+   // ((Texp[9] or Texp[8] or Texp[7]) is one) or (Texp[6] to Texp[0] 
+   // are all ones. 
+   assign OvFlow_SP = Texp[10]&(Texp[9]|Texp[8]|Texp[7]|Texp_l7o);
+
+   // Underflow occurs for double precision if Texp[12] and Texp[11] are
+   // both one, or if Texp[11] to Texp[0] are all zeros. 
+   assign Texp_l7z  = ~Texp[6]&~Texp[5]&~Texp[4]&~Texp[3]&~Texp[2]&~Texp[1]&~Texp[0];
+   assign UnFlow_DP = (Texp[12]&Texp[11]) | ~Texp[11]&~Texp[10]&~Texp[9]&~Texp[8]&~Texp[7]&Texp_l7z;
+   
+   // Underflow occurs for single precision if (Texp[10] is zero)  and 
+   // ((Texp[9] or Texp[8] or Texp[7]) is zero or Texp[6] to Texp[0] are
+   // all zeros). 
+   assign UnFlow_SP = ~Texp[10]&(~Texp[9]|~Texp[8]|~Texp[7]|Texp_l7z);
+   
+   // Set the overflow and underflow flags. They should not be set if
+   // the input was infinite or NaN or the output of the adder is zero.
+   // sel_inv decoding used in this module:
+   //   000          Valid (no special case)
+   //   x01          NaN
+   //   011          forces a zero result (see Rzero)
+   //   110          Div0
+   //   sel_inv[1]=1 forces Infinite
+   assign Valid = (~sel_inv[2]&~sel_inv[1]&~sel_inv[0]);
+   assign NaN = ~sel_inv[1]& sel_inv[0];
+   assign UnderFlow = (P & UnFlow_SP | UnFlow_DP) & Valid;
+   assign OverFlow  = (P & OvFlow_SP | OvFlow_DP) & Valid;
+   assign Div0 = sel_inv[2]&sel_inv[1]&~sel_inv[0];
+
+   // The DenormIO is set if underflow has occurred or if there was a
+   // denormalized input. 
+   assign DenormIO = DenormIn | UnderFlow;
+
+   // The final result is Inexact if any rounding occurred ((i.e., the guard
+   // bit is one or the remainder is nonzero), or (if the result overflows)
+   // or (if the result underflows and the underflow trap is not enabled))
+   // and (value of the result was not previously set by an exception case). 
+   assign Inexact = (G|~zero_rem|OverFlow|(UnderFlow&~UnEn))&Valid;
+
+   // Set the IEEE Exception Flags: Inexact, Underflow, Overflow, Div_By_0, 
+   // Invalid. 
+   assign Flags = {Inexact, UnderFlow, OverFlow, Div0, Invalid};
+
+   // Determine sign
+   assign Rzero = UnderFlow | (~sel_inv[2]&sel_inv[1]&sel_inv[0]);
+   assign Rsign = SignR;   
+      
+   // The exponent of the final result is zero if the final result is 
+   // zero or a denorm, all ones if the final result is NaN or Infinite
+   // or overflow occurred and the magnitude of the number is 
+   // not rounded toward zero, and all ones with an LSB of zero
+   // if overflow occurred and the magnitude of the number is 
+   // rounded toward zero. If the result is single precision, 
+   // Texp[7] should be inverted. When the Overflow trap is enabled (OvEn = 1)
+   // and overflow occurs and the operation is not conversion, bits 10 and 9 are 
+   // inverted for double precision, and bits 7 and 6 are inverted for single precision. 
+   assign Round_zero = ~rm[1]&rm[0] | ~SignR&rm[0] | SignR&rm[1]&~rm[0];
+   assign VeryLarge = OverFlow & ~OvEn;
+   assign Infinite   = (VeryLarge & ~Round_zero) | sel_inv[1];
+   assign Largest = VeryLarge & Round_zero;
+   assign Adj_exp = OverFlow & OvEn;
+   assign Rexp[10:1] = ({10{~Valid}} | 
+			{Texp[10]&~Adj_exp, Texp[9]&~Adj_exp, Texp[8], 
+			 (Texp[7]^P)&~(Adj_exp&P), Texp[6]&~(Adj_exp&P), Texp[5:1]} | 
+		        {10{VeryLarge}})&{10{~Rzero | NaN}};
+   assign Rexp[0]    = ({~Valid} | Texp[0] | Infinite)&(~Rzero | NaN)&~Largest;
+   
+   // If the result is zero or infinity, the mantissa is all zeros. 
+   // If the result is NaN, the mantissa is 10...0
+   // If the result is the largest floating point number, the mantissa
+   // is all ones. Otherwise, the mantissa is not changed. 
+   assign Rmant[51] = Largest | NaN | (Smant[51]&~Infinite&~Rzero);
+   assign Rmant[50:0] = {51{Largest}} | (Smant[50:0]&{51{~Infinite&Valid&~Rzero}});
+
+   // For single precision, the 8 least significant bits of the exponent
+   // and 23 most significant bits of the mantissa contain bits used 
+   // for the final result. A double precision result is returned if 
+   // overflow has occurred and the overflow trap is enabled. 
+   assign OvCon = OverFlow & OvEn;
+   assign Result = (P&~OvCon) ? {Rsign, Rexp[7:0], Rmant[51:29], {32{vss}}}
+	           : {Rsign, Rexp, Rmant};
+
+endmodule // rounder_div
+
--- a/wally-pipelined/src/fpu/sbtm.sv
+++ b/wally-pipelined/src/fpu/sbtm.sv
@ -0,0 +1,33 @@
+// sbtm: combines two table lookups with a 15-bit carry-propagate add.
+//   a       12-bit input; only a[10:0] is read by this module
+//   ia_out  upper 11 bits (p[14:4]) of the 15-bit sum
+// NOTE(review): presumably a symmetric bipartite table method (SBTM)
+// initial approximation, per the "sbtm/stam" comments -- confirm with the
+// divider that instantiates it.
+module sbtm (input logic [11:0] a, output logic [10:0] ia_out);
+
+   // bit partitions
+   logic [3:0] x0;
+   logic [2:0] x1;
+   logic [3:0] x2;
+   logic [2:0] x2_1cmp;   
+   // mem outputs
+   logic [12:0] y0;
+   logic [4:0] 	y1;
+   // input to CPA
+   logic [14:0] op1;
+   logic [14:0] op2;
+   logic [14:0] p;   
+   // First port of the CPA (presumably its carry-out); not used here, but
+   // declared explicitly instead of relying on an implicit net.
+   logic 	cout;
+
+   assign x0 = a[10:7];
+   assign x1 = a[6:4];
+   assign x2 = a[3:0];   
+
+   // First table is indexed by the upper two partitions.
+   sbtm_a0 mem1 ({x0, x1}, y0);
+   // 1s cmp per sbtm/stam: when x2[3] is set, the low three bits of x2
+   // are inverted before indexing the second table.
+   assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];   
+   sbtm_a1 mem2 ({x0, x2_1cmp}, y1);
+   assign op1 = {1'b0, y0, 1'b0};
+   // 1s cmp per sbtm/stam: when x2[3] is set, the second table output is
+   // inverted and extended with ones; otherwise it is zero-extended.
+   assign op2 = x2[3] ? {1'b1, {8{1'b1}}, ~y1, 1'b1} :
+		{1'b0, 8'b0, y1, 1'b1};
+   // CPA
+   bk15 cp1 (cout, p, op1, op2, 1'b0);
+   assign ia_out = p[14:4];
+
+endmodule // sbtm
--- a/wally-pipelined/src/fpu/sbtm2.sv
+++ b/wally-pipelined/src/fpu/sbtm2.sv
@ -0,0 +1,38 @@
+  
+// sbtm2: combines two table lookups with a 15-bit carry-propagate add.
+//   a  12-bit input, split into x0 = a[11:7], x1 = a[6:4], x2 = a[3:0]
+//   y  upper 11 bits (p[14:4]) of the 15-bit sum
+// The second table (sbtm_a3) only lists indices whose top two bits are not
+// both zero, so y is X in simulation when a[11:10] == 2'b00.
+// NOTE(review): presumably callers guarantee a[11:10] != 0 -- confirm.
+module sbtm2 (input logic [11:0] a, output logic [10:0] y);
+
+   // bit partitions
+   logic [4:0] x0;
+   logic [2:0] x1;
+   logic [3:0] x2;
+   logic [2:0] x2_1cmp;   
+   // mem outputs
+   logic [12:0] y0;
+   logic [5:0] 	y1;
+   // input to CPA
+   logic [14:0] op1;
+   logic [14:0] op2;
+   logic [14:0] p;   
+   // First port of the CPA (presumably its carry-out); not used here, but
+   // declared explicitly instead of relying on an implicit net.
+   logic 	cout;
+
+   assign x0 = a[11:7];
+   assign x1 = a[6:4];
+   assign x2 = a[3:0];   
+
+   // First table is indexed by the low four bits of x0 together with x1.
+   sbtm_a2 mem1 ({x0[3:0], x1}, y0);
+   assign op1 = {1'b0, y0, 1'b0};
+   
+   // 1s cmp per sbtm/stam: when x2[3] is set, the low three bits of x2
+   // are inverted before indexing the second table.
+   assign x2_1cmp = x2[3] ? ~x2[2:0] : x2[2:0];   
+   sbtm_a3 mem2 ({x0, x2_1cmp}, y1);
+   // 1s cmp per sbtm/stam: when x2[3] is set, the second table output is
+   // inverted and extended with ones; otherwise it is zero-extended.
+   assign op2 = x2[3] ? {{8{1'b1}}, ~y1, 1'b1} :
+		{8'b0, y1, 1'b1};
+   
+   // CPA
+   bk15 cp1 (cout, p, op1, op2, 1'b0);
+   assign y = p[14:4];
+
+endmodule // sbtm2
+
+
+   
--- a/wally-pipelined/src/fpu/sbtm_a0.sv
+++ b/wally-pipelined/src/fpu/sbtm_a0.sv
@ -0,0 +1,136 @@
+// sbtm_a0: purely combinational lookup table.
+//   a  7-bit table index
+//   y  13-bit table entry for that index
+// Instantiated by sbtm as its first table, indexed by {a[10:7], a[6:4]} of
+// sbtm's input. All 128 index values are listed, so the default branch is
+// only taken when a contains X/Z bits in simulation.
+module sbtm_a0 (input  logic [6:0] a,
+		output logic [12:0] y);
+   always_comb
+     case(a)
+       7'b0000000: y = 13'b1111111100010;
+       7'b0000001: y = 13'b1111110100011;
+       7'b0000010: y = 13'b1111101100101;
+       7'b0000011: y = 13'b1111100101000;
+       7'b0000100: y = 13'b1111011101100;
+       7'b0000101: y = 13'b1111010110000;
+       7'b0000110: y = 13'b1111001110110;
+       7'b0000111: y = 13'b1111000111100;
+       7'b0001000: y = 13'b1111000000100;
+       7'b0001001: y = 13'b1110111001100;
+       7'b0001010: y = 13'b1110110010101;
+       7'b0001011: y = 13'b1110101011110;
+       7'b0001100: y = 13'b1110100101001;
+       7'b0001101: y = 13'b1110011110100;
+       7'b0001110: y = 13'b1110011000000;
+       7'b0001111: y = 13'b1110010001101;
+       7'b0010000: y = 13'b1110001011010;
+       7'b0010001: y = 13'b1110000101000;
+       7'b0010010: y = 13'b1101111110111;
+       7'b0010011: y = 13'b1101111000110;
+       7'b0010100: y = 13'b1101110010111;
+       7'b0010101: y = 13'b1101101100111;
+       7'b0010110: y = 13'b1101100111001;
+       7'b0010111: y = 13'b1101100001011;
+       7'b0011000: y = 13'b1101011011101;
+       7'b0011001: y = 13'b1101010110001;
+       7'b0011010: y = 13'b1101010000100;
+       7'b0011011: y = 13'b1101001011001;
+       7'b0011100: y = 13'b1101000101110;
+       7'b0011101: y = 13'b1101000000011;
+       7'b0011110: y = 13'b1100111011001;
+       7'b0011111: y = 13'b1100110101111;
+       7'b0100000: y = 13'b1100110000110;
+       7'b0100001: y = 13'b1100101011110;
+       7'b0100010: y = 13'b1100100110110;
+       7'b0100011: y = 13'b1100100001111;
+       7'b0100100: y = 13'b1100011101000;
+       7'b0100101: y = 13'b1100011000001;
+       7'b0100110: y = 13'b1100010011011;
+       7'b0100111: y = 13'b1100001110101;
+       7'b0101000: y = 13'b1100001010000;
+       7'b0101001: y = 13'b1100000101011;
+       7'b0101010: y = 13'b1100000000111;
+       7'b0101011: y = 13'b1011111100011;
+       7'b0101100: y = 13'b1011111000000;
+       7'b0101101: y = 13'b1011110011101;
+       7'b0101110: y = 13'b1011101111010;
+       7'b0101111: y = 13'b1011101011000;
+       7'b0110000: y = 13'b1011100110110;
+       7'b0110001: y = 13'b1011100010101;
+       7'b0110010: y = 13'b1011011110011;
+       7'b0110011: y = 13'b1011011010011;
+       7'b0110100: y = 13'b1011010110010;
+       7'b0110101: y = 13'b1011010010010;
+       7'b0110110: y = 13'b1011001110011;
+       7'b0110111: y = 13'b1011001010011;
+       7'b0111000: y = 13'b1011000110100;
+       7'b0111001: y = 13'b1011000010110;
+       7'b0111010: y = 13'b1010111110111;
+       7'b0111011: y = 13'b1010111011001;
+       7'b0111100: y = 13'b1010110111100;
+       7'b0111101: y = 13'b1010110011110;
+       7'b0111110: y = 13'b1010110000001;
+       7'b0111111: y = 13'b1010101100100;
+       7'b1000000: y = 13'b1010101001000;
+       7'b1000001: y = 13'b1010100101100;
+       7'b1000010: y = 13'b1010100010000;
+       7'b1000011: y = 13'b1010011110100;
+       7'b1000100: y = 13'b1010011011001;
+       7'b1000101: y = 13'b1010010111110;
+       7'b1000110: y = 13'b1010010100011;
+       7'b1000111: y = 13'b1010010001001;
+       7'b1001000: y = 13'b1010001101111;
+       7'b1001001: y = 13'b1010001010101;
+       7'b1001010: y = 13'b1010000111011;
+       7'b1001011: y = 13'b1010000100001;
+       7'b1001100: y = 13'b1010000001000;
+       7'b1001101: y = 13'b1001111101111;
+       7'b1001110: y = 13'b1001111010111;
+       7'b1001111: y = 13'b1001110111110;
+       7'b1010000: y = 13'b1001110100110;
+       7'b1010001: y = 13'b1001110001110;
+       7'b1010010: y = 13'b1001101110110;
+       7'b1010011: y = 13'b1001101011111;
+       7'b1010100: y = 13'b1001101000111;
+       7'b1010101: y = 13'b1001100110000;
+       7'b1010110: y = 13'b1001100011001;
+       7'b1010111: y = 13'b1001100000010;
+       7'b1011000: y = 13'b1001011101100;
+       7'b1011001: y = 13'b1001011010110;
+       7'b1011010: y = 13'b1001011000000;
+       7'b1011011: y = 13'b1001010101010;
+       7'b1011100: y = 13'b1001010010100;
+       7'b1011101: y = 13'b1001001111111;
+       7'b1011110: y = 13'b1001001101001;
+       7'b1011111: y = 13'b1001001010100;
+       7'b1100000: y = 13'b1001000111111;
+       7'b1100001: y = 13'b1001000101011;
+       7'b1100010: y = 13'b1001000010110;
+       7'b1100011: y = 13'b1001000000010;
+       7'b1100100: y = 13'b1000111101110;
+       7'b1100101: y = 13'b1000111011010;
+       7'b1100110: y = 13'b1000111000110;
+       7'b1100111: y = 13'b1000110110010;
+       7'b1101000: y = 13'b1000110011111;
+       7'b1101001: y = 13'b1000110001011;
+       7'b1101010: y = 13'b1000101111000;
+       7'b1101011: y = 13'b1000101100101;
+       7'b1101100: y = 13'b1000101010010;
+       7'b1101101: y = 13'b1000101000000;
+       7'b1101110: y = 13'b1000100101101;
+       7'b1101111: y = 13'b1000100011011;
+       7'b1110000: y = 13'b1000100001001;
+       7'b1110001: y = 13'b1000011110110;
+       7'b1110010: y = 13'b1000011100101;
+       7'b1110011: y = 13'b1000011010011;
+       7'b1110100: y = 13'b1000011000001;
+       7'b1110101: y = 13'b1000010110000;
+       7'b1110110: y = 13'b1000010011110;
+       7'b1110111: y = 13'b1000010001101;
+       7'b1111000: y = 13'b1000001111100;
+       7'b1111001: y = 13'b1000001101011;
+       7'b1111010: y = 13'b1000001011010;
+       7'b1111011: y = 13'b1000001001010;
+       7'b1111100: y = 13'b1000000111001;
+       7'b1111101: y = 13'b1000000101001;
+       7'b1111110: y = 13'b1000000011001;
+       7'b1111111: y = 13'b1000000001001;	    
+       // Only reachable when a has unknown bits; propagate X.
+       default: y = 13'bxxxxxxxxxxxxx;
+     endcase // case (a)
+    
+endmodule // sbtm_a0
--- a/wally-pipelined/src/fpu/sbtm_a1.sv
+++ b/wally-pipelined/src/fpu/sbtm_a1.sv
@ -0,0 +1,136 @@
+// sbtm_a1: purely combinational lookup table.
+//   a  7-bit table index
+//   y  5-bit table entry for that index
+// Instantiated by sbtm as its second table, indexed by {x0, x2_1cmp}
+// (sbtm's a[10:7] and the conditionally inverted a[2:0]). All 128 index
+// values are listed, so the default branch is only taken when a contains
+// X/Z bits in simulation.
+module sbtm_a1 (input  logic [6:0] a,
+		output logic [4:0] y);
+   always_comb
+     case(a)
+       7'b0000000: y = 5'b11100;
+       7'b0000001: y = 5'b11000;
+       7'b0000010: y = 5'b10100;
+       7'b0000011: y = 5'b10000;
+       7'b0000100: y = 5'b01101;
+       7'b0000101: y = 5'b01001;
+       7'b0000110: y = 5'b00101;
+       7'b0000111: y = 5'b00001;
+       7'b0001000: y = 5'b11001;
+       7'b0001001: y = 5'b10101;
+       7'b0001010: y = 5'b10010;
+       7'b0001011: y = 5'b01111;
+       7'b0001100: y = 5'b01011;
+       7'b0001101: y = 5'b01000;
+       7'b0001110: y = 5'b00101;
+       7'b0001111: y = 5'b00001;
+       7'b0010000: y = 5'b10110;
+       7'b0010001: y = 5'b10011;
+       7'b0010010: y = 5'b10000;
+       7'b0010011: y = 5'b01101;
+       7'b0010100: y = 5'b01010;
+       7'b0010101: y = 5'b00111;
+       7'b0010110: y = 5'b00100;
+       7'b0010111: y = 5'b00001;
+       7'b0011000: y = 5'b10100;
+       7'b0011001: y = 5'b10001;
+       7'b0011010: y = 5'b01110;
+       7'b0011011: y = 5'b01100;
+       7'b0011100: y = 5'b01001;
+       7'b0011101: y = 5'b00110;
+       7'b0011110: y = 5'b00100;
+       7'b0011111: y = 5'b00001;
+       7'b0100000: y = 5'b10010;
+       7'b0100001: y = 5'b01111;
+       7'b0100010: y = 5'b01101;
+       7'b0100011: y = 5'b01010;
+       7'b0100100: y = 5'b01000;
+       7'b0100101: y = 5'b00110;
+       7'b0100110: y = 5'b00011;
+       7'b0100111: y = 5'b00001;
+       7'b0101000: y = 5'b10000;
+       7'b0101001: y = 5'b01110;
+       7'b0101010: y = 5'b01100;
+       7'b0101011: y = 5'b01001;
+       7'b0101100: y = 5'b00111;
+       7'b0101101: y = 5'b00101;
+       7'b0101110: y = 5'b00011;
+       7'b0101111: y = 5'b00001;
+       7'b0110000: y = 5'b01111;
+       7'b0110001: y = 5'b01101;
+       7'b0110010: y = 5'b01011;
+       7'b0110011: y = 5'b01001;
+       7'b0110100: y = 5'b00111;
+       7'b0110101: y = 5'b00101;
+       7'b0110110: y = 5'b00011;
+       7'b0110111: y = 5'b00001;
+       7'b0111000: y = 5'b01101;
+       7'b0111001: y = 5'b01100;
+       7'b0111010: y = 5'b01010;
+       7'b0111011: y = 5'b01000;
+       7'b0111100: y = 5'b00110;
+       7'b0111101: y = 5'b00100;
+       7'b0111110: y = 5'b00010;
+       7'b0111111: y = 5'b00000;
+       7'b1000000: y = 5'b01100;
+       7'b1000001: y = 5'b01011;
+       7'b1000010: y = 5'b01001;
+       7'b1000011: y = 5'b00111;
+       7'b1000100: y = 5'b00101;
+       7'b1000101: y = 5'b00100;
+       7'b1000110: y = 5'b00010;
+       7'b1000111: y = 5'b00000;
+       7'b1001000: y = 5'b01011;
+       7'b1001001: y = 5'b01010;
+       7'b1001010: y = 5'b01000;
+       7'b1001011: y = 5'b00111;
+       7'b1001100: y = 5'b00101;
+       7'b1001101: y = 5'b00011;
+       7'b1001110: y = 5'b00010;
+       7'b1001111: y = 5'b00000;
+       7'b1010000: y = 5'b01010;
+       7'b1010001: y = 5'b01001;
+       7'b1010010: y = 5'b01000;
+       7'b1010011: y = 5'b00110;
+       7'b1010100: y = 5'b00101;
+       7'b1010101: y = 5'b00011;
+       7'b1010110: y = 5'b00010;
+       7'b1010111: y = 5'b00000;
+       7'b1011000: y = 5'b01010;
+       7'b1011001: y = 5'b01000;
+       7'b1011010: y = 5'b00111;
+       7'b1011011: y = 5'b00110;
+       7'b1011100: y = 5'b00100;
+       7'b1011101: y = 5'b00011;
+       7'b1011110: y = 5'b00010;
+       7'b1011111: y = 5'b00000;
+       7'b1100000: y = 5'b01001;
+       7'b1100001: y = 5'b01000;
+       7'b1100010: y = 5'b00110;
+       7'b1100011: y = 5'b00101;
+       7'b1100100: y = 5'b00100;
+       7'b1100101: y = 5'b00011;
+       7'b1100110: y = 5'b00001;
+       7'b1100111: y = 5'b00000;
+       7'b1101000: y = 5'b01000;
+       7'b1101001: y = 5'b00111;
+       7'b1101010: y = 5'b00110;
+       7'b1101011: y = 5'b00101;
+       7'b1101100: y = 5'b00100;
+       7'b1101101: y = 5'b00010;
+       7'b1101110: y = 5'b00001;
+       7'b1101111: y = 5'b00000;
+       7'b1110000: y = 5'b01000;
+       7'b1110001: y = 5'b00111;
+       7'b1110010: y = 5'b00110;
+       7'b1110011: y = 5'b00100;
+       7'b1110100: y = 5'b00011;
+       7'b1110101: y = 5'b00010;
+       7'b1110110: y = 5'b00001;
+       7'b1110111: y = 5'b00000;
+       7'b1111000: y = 5'b00111;
+       7'b1111001: y = 5'b00110;
+       7'b1111010: y = 5'b00101;
+       7'b1111011: y = 5'b00100;
+       7'b1111100: y = 5'b00011;
+       7'b1111101: y = 5'b00010;
+       7'b1111110: y = 5'b00001;
+       7'b1111111: y = 5'b00000;	    
+       // Only reachable when a has unknown bits; propagate X.
+       default: y = 5'bxxxxx;
+     endcase // case (a)
+    
+endmodule // sbtm_a1
--- a/wally-pipelined/src/fpu/sbtm_a2.sv
+++ b/wally-pipelined/src/fpu/sbtm_a2.sv
@ -0,0 +1,140 @@
+// sbtm_a2: purely combinational lookup table.
+//   a  7-bit table index
+//   y  13-bit table entry for that index
+// Instantiated by sbtm2 as its first table, indexed by {x0[3:0], x1}
+// (sbtm2's a[10:7] and a[6:4]). All 128 index values are listed, so the
+// default branch is only taken when a contains X/Z bits in simulation.
+module sbtm_a2 (input  logic [6:0] a,
+		output logic [12:0] y);
+   always_comb
+     case(a)
+       7'b0000000: y = 13'b1111111110001;
+       7'b0000001: y = 13'b1111111010001;
+       7'b0000010: y = 13'b1111110110010;
+       7'b0000011: y = 13'b1111110010011;
+       7'b0000100: y = 13'b1111101110101;
+       7'b0000101: y = 13'b1111101010110;
+       7'b0000110: y = 13'b1111100111001;
+       7'b0000111: y = 13'b1111100011011;
+       7'b0001000: y = 13'b1111011111110;
+       7'b0001001: y = 13'b1111011100001;
+       7'b0001010: y = 13'b1111011000100;
+       7'b0001011: y = 13'b1111010101000;
+       7'b0001100: y = 13'b1111010001100;
+       7'b0001101: y = 13'b1111001110000;
+       7'b0001110: y = 13'b1111001010101;
+       7'b0001111: y = 13'b1111000111010;
+       7'b0010000: y = 13'b1111000011111;
+       7'b0010001: y = 13'b1111000000100;
+       7'b0010010: y = 13'b1110111101010;
+       7'b0010011: y = 13'b1110111010000;
+       7'b0010100: y = 13'b1110110110110;
+       7'b0010101: y = 13'b1110110011101;
+       7'b0010110: y = 13'b1110110000100;
+       7'b0010111: y = 13'b1110101101011;
+       7'b0011000: y = 13'b1110101010010;
+       7'b0011001: y = 13'b1110100111001;
+       7'b0011010: y = 13'b1110100100001;
+       7'b0011011: y = 13'b1110100001001;
+       7'b0011100: y = 13'b1110011110001;
+       7'b0011101: y = 13'b1110011011010;
+       7'b0011110: y = 13'b1110011000010;
+       7'b0011111: y = 13'b1110010101011;
+       7'b0100000: y = 13'b1110010010100;
+       7'b0100001: y = 13'b1110001111110;
+       7'b0100010: y = 13'b1110001100111;
+       7'b0100011: y = 13'b1110001010001;
+       7'b0100100: y = 13'b1110000111011;
+       7'b0100101: y = 13'b1110000100101;
+       7'b0100110: y = 13'b1110000001111;
+       7'b0100111: y = 13'b1101111111010;
+       7'b0101000: y = 13'b1101111100101;
+       7'b0101001: y = 13'b1101111010000;
+       7'b0101010: y = 13'b1101110111011;
+       7'b0101011: y = 13'b1101110100110;
+       7'b0101100: y = 13'b1101110010001;
+       7'b0101101: y = 13'b1101101111101;
+       7'b0101110: y = 13'b1101101101001;
+       7'b0101111: y = 13'b1101101010101;
+       7'b0110000: y = 13'b1101101000001;
+       7'b0110001: y = 13'b1101100101101;
+       7'b0110010: y = 13'b1101100011010;
+       7'b0110011: y = 13'b1101100000110;
+       7'b0110100: y = 13'b1101011110011;
+       7'b0110101: y = 13'b1101011100000;
+       7'b0110110: y = 13'b1101011001101;
+       7'b0110111: y = 13'b1101010111010;
+       7'b0111000: y = 13'b1101010101000;
+       7'b0111001: y = 13'b1101010010101;
+       7'b0111010: y = 13'b1101010000011;
+       7'b0111011: y = 13'b1101001110001;
+       7'b0111100: y = 13'b1101001011111;
+       7'b0111101: y = 13'b1101001001101;
+       7'b0111110: y = 13'b1101000111100;
+       7'b0111111: y = 13'b1101000101010;
+       7'b1000000: y = 13'b1101000011001;
+       7'b1000001: y = 13'b1101000000111;
+       7'b1000010: y = 13'b1100111110110;
+       7'b1000011: y = 13'b1100111100101;
+       7'b1000100: y = 13'b1100111010100;
+       7'b1000101: y = 13'b1100111000011;
+       7'b1000110: y = 13'b1100110110011;
+       7'b1000111: y = 13'b1100110100010;
+       7'b1001000: y = 13'b1100110010010;
+       7'b1001001: y = 13'b1100110000010;
+       7'b1001010: y = 13'b1100101110010;
+       7'b1001011: y = 13'b1100101100001;
+       7'b1001100: y = 13'b1100101010010;
+       7'b1001101: y = 13'b1100101000010;
+       7'b1001110: y = 13'b1100100110010;
+       7'b1001111: y = 13'b1100100100011;
+       7'b1010000: y = 13'b1100100010011;
+       7'b1010001: y = 13'b1100100000100;
+       7'b1010010: y = 13'b1100011110101;
+       7'b1010011: y = 13'b1100011100101;
+       7'b1010100: y = 13'b1100011010110;
+       7'b1010101: y = 13'b1100011000111;
+       7'b1010110: y = 13'b1100010111001;
+       7'b1010111: y = 13'b1100010101010;
+       7'b1011000: y = 13'b1100010011011;
+       7'b1011001: y = 13'b1100010001101;
+       7'b1011010: y = 13'b1100001111110;
+       7'b1011011: y = 13'b1100001110000;
+       7'b1011100: y = 13'b1100001100010;
+       7'b1011101: y = 13'b1100001010100;
+       7'b1011110: y = 13'b1100001000110;
+       7'b1011111: y = 13'b1100000111000;
+       7'b1100000: y = 13'b1100000101010;
+       7'b1100001: y = 13'b1100000011100;
+       7'b1100010: y = 13'b1100000001111;
+       7'b1100011: y = 13'b1100000000001;
+       7'b1100100: y = 13'b1011111110100;
+       7'b1100101: y = 13'b1011111100110;
+       7'b1100110: y = 13'b1011111011001;
+       7'b1100111: y = 13'b1011111001100;
+       7'b1101000: y = 13'b1011110111111;
+       7'b1101001: y = 13'b1011110110010;
+       7'b1101010: y = 13'b1011110100101;
+       7'b1101011: y = 13'b1011110011000;
+       7'b1101100: y = 13'b1011110001011;
+       7'b1101101: y = 13'b1011101111110;
+       7'b1101110: y = 13'b1011101110010;
+       7'b1101111: y = 13'b1011101100101;
+       7'b1110000: y = 13'b1011101011001;
+       7'b1110001: y = 13'b1011101001100;
+       7'b1110010: y = 13'b1011101000000;
+       7'b1110011: y = 13'b1011100110100;
+       7'b1110100: y = 13'b1011100101000;
+       7'b1110101: y = 13'b1011100011100;
+       7'b1110110: y = 13'b1011100010000;
+       7'b1110111: y = 13'b1011100000100;
+       7'b1111000: y = 13'b1011011111000;
+       7'b1111001: y = 13'b1011011101100;
+       7'b1111010: y = 13'b1011011100000;
+       7'b1111011: y = 13'b1011011010101;
+       7'b1111100: y = 13'b1011011001001;
+       7'b1111101: y = 13'b1011010111101;
+       7'b1111110: y = 13'b1011010110010;
+       7'b1111111: y = 13'b1011010100111;	    
+       // Only reachable when a has unknown bits; propagate X.
+       default: y = 13'bxxxxxxxxxxxxx;
+     endcase // case (a)
+    
+endmodule // sbtm_a2
+
+    
+    
+    
--- a/wally-pipelined/src/fpu/sbtm_a3.sv
+++ b/wally-pipelined/src/fpu/sbtm_a3.sv
@ -0,0 +1,200 @@
+// sbtm_a3: purely combinational lookup table.
+//   a  8-bit table index
+//   y  6-bit table entry for that index
+// Instantiated by sbtm2 as its second table, indexed by {x0, x2_1cmp}
+// (sbtm2's a[11:7] and the conditionally inverted a[2:0]).
+// Only indices 8'b01000000 through 8'b11111111 are listed; any index whose
+// top two bits are 00 falls through to the default branch and yields X.
+// NOTE(review): presumably the caller never presents such an index --
+// confirm against the users of sbtm2.
+module sbtm_a3 (input  logic [7:0] a,
+		output logic [5:0] y);
+   always_comb
+     case(a)
+       8'b01000000: y = 6'b100110;
+       8'b01000001: y = 6'b100001;
+       8'b01000010: y = 6'b011100;
+       8'b01000011: y = 6'b010111;
+       8'b01000100: y = 6'b010010;
+       8'b01000101: y = 6'b001100;
+       8'b01000110: y = 6'b000111;
+       8'b01000111: y = 6'b000010;
+       8'b01001000: y = 6'b100000;
+       8'b01001001: y = 6'b011100;
+       8'b01001010: y = 6'b011000;
+       8'b01001011: y = 6'b010011;
+       8'b01001100: y = 6'b001111;
+       8'b01001101: y = 6'b001010;
+       8'b01001110: y = 6'b000110;
+       8'b01001111: y = 6'b000010;
+       8'b01010000: y = 6'b011100;
+       8'b01010001: y = 6'b011000;
+       8'b01010010: y = 6'b010100;
+       8'b01010011: y = 6'b010000;
+       8'b01010100: y = 6'b001101;
+       8'b01010101: y = 6'b001001;
+       8'b01010110: y = 6'b000101;
+       8'b01010111: y = 6'b000001;
+       8'b01011000: y = 6'b011000;
+       8'b01011001: y = 6'b010101;
+       8'b01011010: y = 6'b010010;
+       8'b01011011: y = 6'b001110;
+       8'b01011100: y = 6'b001011;
+       8'b01011101: y = 6'b001000;
+       8'b01011110: y = 6'b000100;
+       8'b01011111: y = 6'b000001;
+       8'b01100000: y = 6'b010101;
+       8'b01100001: y = 6'b010010;
+       8'b01100010: y = 6'b001111;
+       8'b01100011: y = 6'b001101;
+       8'b01100100: y = 6'b001010;
+       8'b01100101: y = 6'b000111;
+       8'b01100110: y = 6'b000100;
+       8'b01100111: y = 6'b000001;
+       8'b01101000: y = 6'b010011;
+       8'b01101001: y = 6'b010000;
+       8'b01101010: y = 6'b001110;
+       8'b01101011: y = 6'b001011;
+       8'b01101100: y = 6'b001001;
+       8'b01101101: y = 6'b000110;
+       8'b01101110: y = 6'b000011;
+       8'b01101111: y = 6'b000001;
+       8'b01110000: y = 6'b010001;
+       8'b01110001: y = 6'b001111;
+       8'b01110010: y = 6'b001100;
+       8'b01110011: y = 6'b001010;
+       8'b01110100: y = 6'b001000;
+       8'b01110101: y = 6'b000101;
+       8'b01110110: y = 6'b000011;
+       8'b01110111: y = 6'b000001;
+       8'b01111000: y = 6'b001111;
+       8'b01111001: y = 6'b001101;
+       8'b01111010: y = 6'b001011;
+       8'b01111011: y = 6'b001001;
+       8'b01111100: y = 6'b000111;
+       8'b01111101: y = 6'b000101;
+       8'b01111110: y = 6'b000011;
+       8'b01111111: y = 6'b000001;       
+       8'b10000000: y = 6'b001110;
+       8'b10000001: y = 6'b001100;
+       8'b10000010: y = 6'b001010;
+       8'b10000011: y = 6'b001000;
+       8'b10000100: y = 6'b000110;
+       8'b10000101: y = 6'b000100;
+       8'b10000110: y = 6'b000010;
+       8'b10000111: y = 6'b000000;
+       8'b10001000: y = 6'b001101;
+       8'b10001001: y = 6'b001011;
+       8'b10001010: y = 6'b001001;
+       8'b10001011: y = 6'b000111;
+       8'b10001100: y = 6'b000110;
+       8'b10001101: y = 6'b000100;
+       8'b10001110: y = 6'b000010;
+       8'b10001111: y = 6'b000000;
+       8'b10010000: y = 6'b001100;
+       8'b10010001: y = 6'b001010;
+       8'b10010010: y = 6'b001000;
+       8'b10010011: y = 6'b000111;
+       8'b10010100: y = 6'b000101;
+       8'b10010101: y = 6'b000100;
+       8'b10010110: y = 6'b000010;
+       8'b10010111: y = 6'b000000;
+       8'b10011000: y = 6'b001011;
+       8'b10011001: y = 6'b001001;
+       8'b10011010: y = 6'b001000;
+       8'b10011011: y = 6'b000110;
+       8'b10011100: y = 6'b000101;
+       8'b10011101: y = 6'b000011;
+       8'b10011110: y = 6'b000010;
+       8'b10011111: y = 6'b000000;
+       8'b10100000: y = 6'b001010;
+       8'b10100001: y = 6'b001000;
+       8'b10100010: y = 6'b000111;
+       8'b10100011: y = 6'b000110;
+       8'b10100100: y = 6'b000100;
+       8'b10100101: y = 6'b000011;
+       8'b10100110: y = 6'b000010;
+       8'b10100111: y = 6'b000000;
+       8'b10101000: y = 6'b001001;
+       8'b10101001: y = 6'b001000;
+       8'b10101010: y = 6'b000111;
+       8'b10101011: y = 6'b000101;
+       8'b10101100: y = 6'b000100;
+       8'b10101101: y = 6'b000011;
+       8'b10101110: y = 6'b000001;
+       8'b10101111: y = 6'b000000;
+       8'b10110000: y = 6'b001000;
+       8'b10110001: y = 6'b000111;
+       8'b10110010: y = 6'b000110;
+       8'b10110011: y = 6'b000101;
+       8'b10110100: y = 6'b000100;
+       8'b10110101: y = 6'b000010;
+       8'b10110110: y = 6'b000001;
+       8'b10110111: y = 6'b000000;
+       8'b10111000: y = 6'b001000;
+       8'b10111001: y = 6'b000111;
+       8'b10111010: y = 6'b000110;
+       8'b10111011: y = 6'b000101;
+       8'b10111100: y = 6'b000011;
+       8'b10111101: y = 6'b000010;
+       8'b10111110: y = 6'b000001;
+       8'b10111111: y = 6'b000000;
+       8'b11000000: y = 6'b000111;
+       8'b11000001: y = 6'b000110;
+       8'b11000010: y = 6'b000101;
+       8'b11000011: y = 6'b000100;
+       8'b11000100: y = 6'b000011;
+       8'b11000101: y = 6'b000010;
+       8'b11000110: y = 6'b000001;
+       8'b11000111: y = 6'b000000;
+       8'b11001000: y = 6'b000111;
+       8'b11001001: y = 6'b000110;
+       8'b11001010: y = 6'b000101;
+       8'b11001011: y = 6'b000100;
+       8'b11001100: y = 6'b000011;
+       8'b11001101: y = 6'b000010;
+       8'b11001110: y = 6'b000001;
+       8'b11001111: y = 6'b000000;
+       8'b11010000: y = 6'b000111;
+       8'b11010001: y = 6'b000110;
+       8'b11010010: y = 6'b000101;
+       8'b11010011: y = 6'b000100;
+       8'b11010100: y = 6'b000011;
+       8'b11010101: y = 6'b000010;
+       8'b11010110: y = 6'b000001;
+       8'b11010111: y = 6'b000000;
+       8'b11011000: y = 6'b000110;
+       8'b11011001: y = 6'b000101;
+       8'b11011010: y = 6'b000100;
+       8'b11011011: y = 6'b000011;
+       8'b11011100: y = 6'b000011;
+       8'b11011101: y = 6'b000010;
+       8'b11011110: y = 6'b000001;
+       8'b11011111: y = 6'b000000;
+       8'b11100000: y = 6'b000110;
+       8'b11100001: y = 6'b000101;
+       8'b11100010: y = 6'b000100;
+       8'b11100011: y = 6'b000011;
+       8'b11100100: y = 6'b000010;
+       8'b11100101: y = 6'b000010;
+       8'b11100110: y = 6'b000001;
+       8'b11100111: y = 6'b000000;
+       8'b11101000: y = 6'b000101;
+       8'b11101001: y = 6'b000101;
+       8'b11101010: y = 6'b000100;
+       8'b11101011: y = 6'b000011;
+       8'b11101100: y = 6'b000010;
+       8'b11101101: y = 6'b000001;
+       8'b11101110: y = 6'b000001;
+       8'b11101111: y = 6'b000000;
+       8'b11110000: y = 6'b000101;
+       8'b11110001: y = 6'b000100;
+       8'b11110010: y = 6'b000100;
+       8'b11110011: y = 6'b000011;
+       8'b11110100: y = 6'b000010;
+       8'b11110101: y = 6'b000001;
+       8'b11110110: y = 6'b000001;
+       8'b11110111: y = 6'b000000;
+       8'b11111000: y = 6'b000101;
+       8'b11111001: y = 6'b000100;
+       8'b11111010: y = 6'b000011;
+       8'b11111011: y = 6'b000011;
+       8'b11111100: y = 6'b000010;
+       8'b11111101: y = 6'b000001;
+       8'b11111110: y = 6'b000001;
+       8'b11111111: y = 6'b000000;
+       // Taken for indices 8'b00xxxxxx (not tabulated) and for unknown bits.
+       default: y = 6'bxxxxxx;
+     endcase // case (a)
+    
+endmodule // sbtm_a3
--- a/wally-pipelined/src/fpu/sbtm_a4.sv
+++ b/wally-pipelined/src/fpu/sbtm_a4.sv
@ -0,0 +1,204 @@
+module sbtm_a4 (input  logic [7:0] a,   // table index
+		output logic [13:0] y);  // 14-bit table entry selected by a
+   always_comb  // purely combinational lookup: one 14-bit constant per listed index
+     case(a)  // entries exist only for 8'b01000000 through 8'b11111111
+       8'b01000000: y = 14'b10110100010111;
+       8'b01000001: y = 14'b10110010111111;
+       8'b01000010: y = 14'b10110001101000;
+       8'b01000011: y = 14'b10110000010011;
+       8'b01000100: y = 14'b10101111000001;
+       8'b01000101: y = 14'b10101101110000;
+       8'b01000110: y = 14'b10101100100001;
+       8'b01000111: y = 14'b10101011010011;
+       8'b01001000: y = 14'b10101010000111;
+       8'b01001001: y = 14'b10101000111101;
+       8'b01001010: y = 14'b10100111110100;
+       8'b01001011: y = 14'b10100110101101;
+       8'b01001100: y = 14'b10100101100111;
+       8'b01001101: y = 14'b10100100100010;
+       8'b01001110: y = 14'b10100011011111;
+       8'b01001111: y = 14'b10100010011101;
+       8'b01010000: y = 14'b10100001011100;
+       8'b01010001: y = 14'b10100000011100;
+       8'b01010010: y = 14'b10011111011110;
+       8'b01010011: y = 14'b10011110100001;
+       8'b01010100: y = 14'b10011101100100;
+       8'b01010101: y = 14'b10011100101001;
+       8'b01010110: y = 14'b10011011101111;
+       8'b01010111: y = 14'b10011010110110;
+       8'b01011000: y = 14'b10011001111110;
+       8'b01011001: y = 14'b10011001000110;
+       8'b01011010: y = 14'b10011000010000;
+       8'b01011011: y = 14'b10010111011011;
+       8'b01011100: y = 14'b10010110100110;
+       8'b01011101: y = 14'b10010101110011;
+       8'b01011110: y = 14'b10010101000000;
+       8'b01011111: y = 14'b10010100001110;
+       8'b01100000: y = 14'b10010011011100;
+       8'b01100001: y = 14'b10010010101100;
+       8'b01100010: y = 14'b10010001111100;
+       8'b01100011: y = 14'b10010001001101;
+       8'b01100100: y = 14'b10010000011111;
+       8'b01100101: y = 14'b10001111110001;
+       8'b01100110: y = 14'b10001111000100;
+       8'b01100111: y = 14'b10001110011000;
+       8'b01101000: y = 14'b10001101101100;
+       8'b01101001: y = 14'b10001101000001;
+       8'b01101010: y = 14'b10001100010110;
+       8'b01101011: y = 14'b10001011101100;
+       8'b01101100: y = 14'b10001011000011;
+       8'b01101101: y = 14'b10001010011010;
+       8'b01101110: y = 14'b10001001110010;
+       8'b01101111: y = 14'b10001001001010;
+       8'b01110000: y = 14'b10001000100011;
+       8'b01110001: y = 14'b10000111111101;
+       8'b01110010: y = 14'b10000111010111;
+       8'b01110011: y = 14'b10000110110001;
+       8'b01110100: y = 14'b10000110001100;
+       8'b01110101: y = 14'b10000101100111;
+       8'b01110110: y = 14'b10000101000011;
+       8'b01110111: y = 14'b10000100011111;
+       8'b01111000: y = 14'b10000011111100;
+       8'b01111001: y = 14'b10000011011001;
+       8'b01111010: y = 14'b10000010110111;
+       8'b01111011: y = 14'b10000010010101;
+       8'b01111100: y = 14'b10000001110011;
+       8'b01111101: y = 14'b10000001010010;
+       8'b01111110: y = 14'b10000000110001;
+       8'b01111111: y = 14'b10000000010001;       
+       8'b10000000: y = 14'b01111111110001;
+       8'b10000001: y = 14'b01111111010001;
+       8'b10000010: y = 14'b01111110110010;
+       8'b10000011: y = 14'b01111110010011;
+       8'b10000100: y = 14'b01111101110101;
+       8'b10000101: y = 14'b01111101010110;
+       8'b10000110: y = 14'b01111100111001;
+       8'b10000111: y = 14'b01111100011011;
+       8'b10001000: y = 14'b01111011111110;
+       8'b10001001: y = 14'b01111011100001;
+       8'b10001010: y = 14'b01111011000100;
+       8'b10001011: y = 14'b01111010101000;
+       8'b10001100: y = 14'b01111010001100;
+       8'b10001101: y = 14'b01111001110000;
+       8'b10001110: y = 14'b01111001010101;
+       8'b10001111: y = 14'b01111000111010;
+       8'b10010000: y = 14'b01111000011111;
+       8'b10010001: y = 14'b01111000000100;
+       8'b10010010: y = 14'b01110111101010;
+       8'b10010011: y = 14'b01110111010000;
+       8'b10010100: y = 14'b01110110110110;
+       8'b10010101: y = 14'b01110110011101;
+       8'b10010110: y = 14'b01110110000100;
+       8'b10010111: y = 14'b01110101101011;
+       8'b10011000: y = 14'b01110101010010;
+       8'b10011001: y = 14'b01110100111001;
+       8'b10011010: y = 14'b01110100100001;
+       8'b10011011: y = 14'b01110100001001;
+       8'b10011100: y = 14'b01110011110001;
+       8'b10011101: y = 14'b01110011011010;
+       8'b10011110: y = 14'b01110011000010;
+       8'b10011111: y = 14'b01110010101011;
+       8'b10100000: y = 14'b01110010010100;
+       8'b10100001: y = 14'b01110001111110;
+       8'b10100010: y = 14'b01110001100111;
+       8'b10100011: y = 14'b01110001010001;
+       8'b10100100: y = 14'b01110000111011;
+       8'b10100101: y = 14'b01110000100101;
+       8'b10100110: y = 14'b01110000001111;
+       8'b10100111: y = 14'b01101111111010;
+       8'b10101000: y = 14'b01101111100101;
+       8'b10101001: y = 14'b01101111010000;
+       8'b10101010: y = 14'b01101110111011;
+       8'b10101011: y = 14'b01101110100110;
+       8'b10101100: y = 14'b01101110010001;
+       8'b10101101: y = 14'b01101101111101;
+       8'b10101110: y = 14'b01101101101001;
+       8'b10101111: y = 14'b01101101010101;
+       8'b10110000: y = 14'b01101101000001;
+       8'b10110001: y = 14'b01101100101101;
+       8'b10110010: y = 14'b01101100011010;
+       8'b10110011: y = 14'b01101100000110;
+       8'b10110100: y = 14'b01101011110011;
+       8'b10110101: y = 14'b01101011100000;
+       8'b10110110: y = 14'b01101011001101;
+       8'b10110111: y = 14'b01101010111010;
+       8'b10111000: y = 14'b01101010101000;
+       8'b10111001: y = 14'b01101010010101;
+       8'b10111010: y = 14'b01101010000011;
+       8'b10111011: y = 14'b01101001110001;
+       8'b10111100: y = 14'b01101001011111;
+       8'b10111101: y = 14'b01101001001101;
+       8'b10111110: y = 14'b01101000111100;
+       8'b10111111: y = 14'b01101000101010;
+       8'b11000000: y = 14'b01101000011001;
+       8'b11000001: y = 14'b01101000000111;
+       8'b11000010: y = 14'b01100111110110;
+       8'b11000011: y = 14'b01100111100101;
+       8'b11000100: y = 14'b01100111010100;
+       8'b11000101: y = 14'b01100111000011;
+       8'b11000110: y = 14'b01100110110011;
+       8'b11000111: y = 14'b01100110100010;
+       8'b11001000: y = 14'b01100110010010;
+       8'b11001001: y = 14'b01100110000010;
+       8'b11001010: y = 14'b01100101110010;
+       8'b11001011: y = 14'b01100101100001;
+       8'b11001100: y = 14'b01100101010010;
+       8'b11001101: y = 14'b01100101000010;
+       8'b11001110: y = 14'b01100100110010;
+       8'b11001111: y = 14'b01100100100011;
+       8'b11010000: y = 14'b01100100010011;
+       8'b11010001: y = 14'b01100100000100;
+       8'b11010010: y = 14'b01100011110101;
+       8'b11010011: y = 14'b01100011100101;
+       8'b11010100: y = 14'b01100011010110;
+       8'b11010101: y = 14'b01100011000111;
+       8'b11010110: y = 14'b01100010111001;
+       8'b11010111: y = 14'b01100010101010;
+       8'b11011000: y = 14'b01100010011011;
+       8'b11011001: y = 14'b01100010001101;
+       8'b11011010: y = 14'b01100001111110;
+       8'b11011011: y = 14'b01100001110000;
+       8'b11011100: y = 14'b01100001100010;
+       8'b11011101: y = 14'b01100001010100;
+       8'b11011110: y = 14'b01100001000110;
+       8'b11011111: y = 14'b01100000111000;
+       8'b11100000: y = 14'b01100000101010;
+       8'b11100001: y = 14'b01100000011100;
+       8'b11100010: y = 14'b01100000001111;
+       8'b11100011: y = 14'b01100000000001;
+       8'b11100100: y = 14'b01011111110100;
+       8'b11100101: y = 14'b01011111100110;
+       8'b11100110: y = 14'b01011111011001;
+       8'b11100111: y = 14'b01011111001100;
+       8'b11101000: y = 14'b01011110111111;
+       8'b11101001: y = 14'b01011110110010;
+       8'b11101010: y = 14'b01011110100101;
+       8'b11101011: y = 14'b01011110011000;
+       8'b11101100: y = 14'b01011110001011;
+       8'b11101101: y = 14'b01011101111110;
+       8'b11101110: y = 14'b01011101110010;
+       8'b11101111: y = 14'b01011101100101;
+       8'b11110000: y = 14'b01011101011001;
+       8'b11110001: y = 14'b01011101001100;
+       8'b11110010: y = 14'b01011101000000;
+       8'b11110011: y = 14'b01011100110100;
+       8'b11110100: y = 14'b01011100101000;
+       8'b11110101: y = 14'b01011100011100;
+       8'b11110110: y = 14'b01011100010000;
+       8'b11110111: y = 14'b01011100000100;
+       8'b11111000: y = 14'b01011011111000;
+       8'b11111001: y = 14'b01011011101100;
+       8'b11111010: y = 14'b01011011100000;
+       8'b11111011: y = 14'b01011011010101;
+       8'b11111100: y = 14'b01011011001001;
+       8'b11111101: y = 14'b01011010111101;
+       8'b11111110: y = 14'b01011010110010;
+       8'b11111111: y = 14'b01011010100111;
+       default: y = 14'bxxxxxxxxxxxxxx;  // no entry for indices below 8'b01000000: drive X
+     endcase // case (a)
+    
+endmodule // sbtm_a4
+
+    
+    
+    
--- a/wally-pipelined/src/fpu/shifter_denorm.sv
+++ b/wally-pipelined/src/fpu/shifter_denorm.sv
@ -0,0 +1,162 @@
+
+// MJS - This module implements a 57-bit 2-to-1 multiplexor, which is
+// used in the barrel shifter for significand alignment.
+
+module mux21x57 (
+   output [56:0] Z,    // selected operand
+   input  [56:0] A,    // driven onto Z when Sel is 0
+   input  [56:0] B,    // driven onto Z when Sel is 1
+   input         Sel   // select line
+);
+
+   // Two-way select: B when Sel is high, otherwise A.
+   assign Z = Sel ? B : A;
+
+endmodule // mux21x57
+
+// MJS - This module implements a 64-bit 2-to-1 multiplexor, which is
+// used in the barrel shifter for significand normalization. 
+
+module mux21x64 (
+   output [63:0] Z,    // selected operand
+   input  [63:0] A,    // driven onto Z when Sel is 0
+   input  [63:0] B,    // driven onto Z when Sel is 1
+   input         Sel   // select line
+);
+
+   // Two-way select: B when Sel is high, otherwise A.
+   assign Z = Sel ? B : A;
+
+endmodule // mux21x64
+
+// The implementation of the barrel shifter was modified to use 
+// fewer gates. It is now implemented using six 64-bit 2-to-1 muxes. The 
+// barrel shifter takes a 64-bit input A and shifts it left by up to 
+// 63 bits, as specified by Shift, to produce a 64-bit output Z. 
+// Bits to the right are filled with zeros. 
+// The 63-bit shift is implemented using 6 stages of shifts of 32,
+// 16, 8, 4, 2, and 1 bits. 
+
+module barrel_shifter_l64 (Z, A, Shift);
+
+   // 64-bit left shifter. Z is A moved left by Shift positions (0 to 63);
+   // zeros enter on the right and bits pushed past bit 63 are discarded.
+   output [63:0] Z;
+   input  [63:0] A;
+   input  [5:0]  Shift;
+
+   // Intermediate values after the 32-, 16-, 8-, 4-, and 2-bit stages.
+   wire [63:0] sh32;
+   wire [63:0] sh16;
+   wire [63:0] sh8;
+   wire [63:0] sh4;
+   wire [63:0] sh2;
+
+   // One mux per bit of Shift, largest distance first. Each mux forwards
+   // its A input when the select bit is 0, or the same value moved left
+   // by that stage's fixed distance (low bits zero-filled) when the
+   // select bit is 1.
+   mux21x64  mx01(.Z(sh32), .A(A),    .B({A[31:0],    32'h0}), .Sel(Shift[5]));
+   mux21x64  mx02(.Z(sh16), .A(sh32), .B({sh32[47:0], 16'h0}), .Sel(Shift[4]));
+   mux21x64  mx03(.Z(sh8),  .A(sh16), .B({sh16[55:0], 8'h0}),  .Sel(Shift[3]));
+   mux21x64  mx04(.Z(sh4),  .A(sh8),  .B({sh8[59:0],  4'h0}),  .Sel(Shift[2]));
+   mux21x64  mx05(.Z(sh2),  .A(sh4),  .B({sh4[61:0],  2'b00}), .Sel(Shift[1]));
+   mux21x64  mx06(.Z(Z),    .A(sh2),  .B({sh2[62:0],  1'b0}),  .Sel(Shift[0]));
+
+endmodule // barrel_shifter_l64
+
+// The implementation of the barrel shifter was modified to use 
+// fewer gates. It is now implemented using six 57-bit 2-to-1 muxes. The 
+// barrel shifter takes a 57-bit input A and right shifts it by up to 
+// 63 bits, as specified by Shift, to produce a 57-bit output Z. 
+// It also computes a Sticky bit, which is set to 
+// one if any of the bits that were shifted out was one.
+// Bits shifted into the left are filled with zeros. 
+// The 63-bit shift is implemented using 6 stages of shifts of 32,
+// 16, 8, 4, 2, and 1 bits.
+
+module barrel_shifter_r57 (Z, Sticky, A, Shift);
+
+   // 57-bit right shifter with sticky-bit generation. Z is A moved right
+   // by Shift positions with zeros entering on the left. Sticky is 1 when
+   // at least one bit that was shifted out on the right was 1.
+   output [56:0] Z;
+   output        Sticky;
+   input  [56:0] A;
+   input  [5:0]  Shift;
+
+   // Intermediate values after the 32-, 16-, 8-, 4-, and 2-bit stages.
+   wire [56:0] sh32, sh16, sh8, sh4, sh2;
+
+   // Bits dropped by each stage; all zero when that stage is bypassed.
+   wire [31:0] lost32;
+   wire [15:0] lost16;
+   wire [7:0]  lost8;
+   wire [3:0]  lost4;
+   wire [1:0]  lost2;
+   wire        lost1;
+
+   // Shift operations: one mux per bit of Shift, largest distance first.
+   // Each mux forwards its A input when the select bit is 0, or the same
+   // value moved right by the stage's fixed distance when it is 1.
+   mux21x57  mx01(.Z(sh32), .A(A),    .B({32'h0, A[56:32]}),    .Sel(Shift[5]));
+   mux21x57  mx02(.Z(sh16), .A(sh32), .B({16'h0, sh32[56:16]}), .Sel(Shift[4]));
+   mux21x57  mx03(.Z(sh8),  .A(sh16), .B({8'h0,  sh16[56:8]}),  .Sel(Shift[3]));
+   mux21x57  mx04(.Z(sh4),  .A(sh8),  .B({4'h0,  sh8[56:4]}),   .Sel(Shift[2]));
+   mux21x57  mx05(.Z(sh2),  .A(sh4),  .B({2'b00, sh4[56:2]}),   .Sel(Shift[1]));
+   mux21x57  mx06(.Z(Z),    .A(sh2),  .B({1'b0,  sh2[56:1]}),   .Sel(Shift[0]));
+
+   // Sticky bit calculation. Each stage contributes the low-order bits it
+   // shifts out, masked by its select bit.
+   assign lost32 = {32{Shift[5]}} & A[31:0];
+   assign lost16 = {16{Shift[4]}} & sh32[15:0];
+   assign lost8  = { 8{Shift[3]}} & sh16[7:0];
+   assign lost4  = { 4{Shift[2]}} & sh8[3:0];
+   assign lost2  = { 2{Shift[1]}} & sh4[1:0];
+   assign lost1  =     Shift[0]   & sh2[0];
+   assign Sticky = ({lost1, lost2, lost4, lost8, lost16, lost32} != 63'h0);
+
+endmodule // barrel_shifter_r57
+
+module barrel_shifter_r64 (Z, Sticky, A, Shift);
+
+   // 64-bit right shifter with sticky-bit generation. Z is A moved right
+   // by Shift positions with zeros entering on the left. Sticky is 1 when
+   // at least one bit that was shifted out on the right was 1.
+   output [63:0] Z;
+   output        Sticky;
+   input  [63:0] A;
+   input  [5:0]  Shift;
+
+   // Intermediate values after the 32-, 16-, 8-, 4-, and 2-bit stages.
+   wire [63:0] sh32, sh16, sh8, sh4, sh2;
+
+   // Bits dropped by each stage; all zero when that stage is bypassed.
+   wire [31:0] lost32;
+   wire [15:0] lost16;
+   wire [7:0]  lost8;
+   wire [3:0]  lost4;
+   wire [1:0]  lost2;
+   wire        lost1;
+
+   // Shift operations: one mux per bit of Shift, largest distance first.
+   // Each mux forwards its A input when the select bit is 0, or the same
+   // value moved right by the stage's fixed distance when it is 1.
+   mux21x64  mx01(.Z(sh32), .A(A),    .B({32'h0, A[63:32]}),    .Sel(Shift[5]));
+   mux21x64  mx02(.Z(sh16), .A(sh32), .B({16'h0, sh32[63:16]}), .Sel(Shift[4]));
+   mux21x64  mx03(.Z(sh8),  .A(sh16), .B({8'h0,  sh16[63:8]}),  .Sel(Shift[3]));
+   mux21x64  mx04(.Z(sh4),  .A(sh8),  .B({4'h0,  sh8[63:4]}),   .Sel(Shift[2]));
+   mux21x64  mx05(.Z(sh2),  .A(sh4),  .B({2'b00, sh4[63:2]}),   .Sel(Shift[1]));
+   mux21x64  mx06(.Z(Z),    .A(sh2),  .B({1'b0,  sh2[63:1]}),   .Sel(Shift[0]));
+
+   // Sticky bit calculation. Each stage contributes the low-order bits it
+   // shifts out, masked by its select bit.
+   assign lost32 = {32{Shift[5]}} & A[31:0];
+   assign lost16 = {16{Shift[4]}} & sh32[15:0];
+   assign lost8  = { 8{Shift[3]}} & sh16[7:0];
+   assign lost4  = { 4{Shift[2]}} & sh8[3:0];
+   assign lost2  = { 2{Shift[1]}} & sh4[1:0];
+   assign lost1  =     Shift[0]   & sh2[0];
+   assign Sticky = ({lost1, lost2, lost4, lost8, lost16, lost32} != 63'h0);
+
+endmodule // barrel_shifter_r64
--- a/wally-pipelined/src/fpu/sk14.sv
+++ b/wally-pipelined/src/fpu/sk14.sv
@ -0,0 +1,90 @@
+// Sklansky Prefix Adder
+
+module sk14 (cout, sum, a, b, cin);  // 14-bit adder built on the sklansky prefix tree
+	 input [13:0] a, b;           // addends
+	 input cin;                   // carry in
+	 output [13:0] sum;           // p[14:1] XORed with the tree's carries
+	 output cout;                 // carry out of the top bit
+
+	 wire [14:0] p,g;             // position 0 holds cin; positions 14:1 hold a/b bits 13:0
+	 wire [13:0] c;               // connected to sklansky's c[14:1] port
+
+// pre-computation: position 0 generates cin and never propagates
+	 assign p={a^b,1'b0};
+	 assign g={a&b, cin};
+
+// prefix tree: sklansky declares 15-bit p and g ports, so connect the full vectors
+	 sklansky prefix_tree(c, p, g);
+
+// post-computation: c[13] is the carry into the top bit (sklansky's c[14])
+	 assign sum=p[14:1]^c;
+	 assign cout=g[14]|(p[14]&c[13]);
+
+endmodule
+
+module sklansky (c, p, g);
+
+	input [14:0] p;   // per-position propagate; p[0] and p[14] are not read
+	input [14:0] g;   // per-position generate; g[14] is not read
+	output [14:1] c;  // c[k+1] = G_k_0
+
+	// Group generate (G_i_j) / propagate (P_i_j) signals for positions i..j.
+	wire G_1_0, G_2_0, G_3_0, G_4_0, G_5_0, G_6_0, G_7_0;
+	wire G_8_0, G_9_0, G_10_0, G_11_0, G_12_0, G_13_0;
+	wire G_3_2, G_5_4, G_7_6, G_9_8, G_11_10, G_13_12;
+	wire P_3_2, P_5_4, P_7_6, P_9_8, P_11_10, P_13_12;
+	wire G_6_4, G_7_4, G_10_8, G_11_8, G_12_8, G_13_8;
+	wire P_6_4, P_7_4, P_10_8, P_11_8, P_12_8, P_13_8;
+
+	// parallel-prefix, Sklansky
+	// Stage 1: combines adjacent positions into pairs
+	grey b_1_0 (G_1_0, {g[1],g[0]}, p[1]);
+	black b_3_2 (G_3_2, P_3_2, {g[3],g[2]}, {p[3],p[2]});
+	black b_5_4 (G_5_4, P_5_4, {g[5],g[4]}, {p[5],p[4]});
+	black b_7_6 (G_7_6, P_7_6, {g[7],g[6]}, {p[7],p[6]});
+	black b_9_8 (G_9_8, P_9_8, {g[9],g[8]}, {p[9],p[8]});
+	black b_11_10 (G_11_10, P_11_10, {g[11],g[10]}, {p[11],p[10]});
+	black b_13_12 (G_13_12, P_13_12, {g[13],g[12]}, {p[13],p[12]});
+	// Stage 2: extends each upper pair across the pair below it
+	grey g_2_0 (G_2_0, {g[2],G_1_0}, p[2]);
+	grey g_3_0 (G_3_0, {G_3_2,G_1_0}, P_3_2);
+	black b_6_4 (G_6_4, P_6_4, {g[6],G_5_4}, {p[6],P_5_4});
+	black b_7_4 (G_7_4, P_7_4, {G_7_6,G_5_4}, {P_7_6,P_5_4});
+	black b_10_8 (G_10_8, P_10_8, {g[10],G_9_8}, {p[10],P_9_8});
+	black b_11_8 (G_11_8, P_11_8, {G_11_10,G_9_8}, {P_11_10,P_9_8});
+	// Stage 3: extends each upper group across the 4-position group below it
+	grey g_4_0 (G_4_0, {g[4],G_3_0}, p[4]);
+	grey g_5_0 (G_5_0, {G_5_4,G_3_0}, P_5_4);
+	grey g_6_0 (G_6_0, {G_6_4,G_3_0}, P_6_4);
+	grey g_7_0 (G_7_0, {G_7_4,G_3_0}, P_7_4);
+	black b_12_8 (G_12_8, P_12_8, {g[12],G_11_8}, {p[12],P_11_8});
+	black b_13_8 (G_13_8, P_13_8, {G_13_12,G_11_8}, {P_13_12,P_11_8});
+	// Stage 4: extends positions 8..13 across the 8-position group 7..0
+	grey g_8_0 (G_8_0, {g[8],G_7_0}, p[8]);
+	grey g_9_0 (G_9_0, {G_9_8,G_7_0}, P_9_8);
+	grey g_10_0 (G_10_0, {G_10_8,G_7_0}, P_10_8);
+	grey g_11_0 (G_11_0, {G_11_8,G_7_0}, P_11_8);
+	grey g_12_0 (G_12_0, {G_12_8,G_7_0}, P_12_8);
+	grey g_13_0 (G_13_0, {G_13_8,G_7_0}, P_13_8);
+
+	// No cells are built for positions 14 and 15: the highest carry needed is
+	// c[14] = G_13_0, and the former 15-column cells read G_15_14 / P_15_14,
+	// which nothing in this module drives.
+	// Final Stage: Apply c_k+1=G_k_0
+	assign c[1]=g[0];
+	assign c[2]=G_1_0;
+	assign c[3]=G_2_0;
+	assign c[4]=G_3_0;
+	assign c[5]=G_4_0;
+	assign c[6]=G_5_0;
+	assign c[7]=G_6_0;
+	assign c[8]=G_7_0;
+	assign c[9]=G_8_0;
+	assign c[10]=G_9_0;
+	assign c[11]=G_10_0;
+	assign c[12]=G_11_0;
+	assign c[13]=G_12_0;
+	assign c[14]=G_13_0;
+
+endmodule
+
--- a/wally-pipelined/src/hazard/hazard.sv
+++ b/wally-pipelined/src/hazard/hazard.sv
@ -26,19 +26,23 @@
 `include "wally-config.vh"

 module hazard(
+	      input logic  clk,
+	      input logic  reset,
  // Detect hazards
-  input  logic       BPPredWrongE, CSRWritePendingDEM, RetM, TrapM,
-  input  logic       LoadStallD, MulDivStallD, CSRRdStallD,
-  input  logic       DataStall, ICacheStallF,
+	      input logic  BPPredWrongE, CSRWritePendingDEM, RetM, TrapM,
+	      input logic  LoadStallD, MulDivStallD, CSRRdStallD,
+	      input logic  DataStall, ICacheStallF,
+	      input logic  DivBusyE,
  // Stall & flush outputs
-  output logic       StallF, StallD, StallE, StallM, StallW,
-  output logic       FlushF, FlushD, FlushE, FlushM, FlushW
+	      output logic StallF, StallD, StallE, StallM, StallW,
+	      output logic FlushF, FlushD, FlushE, FlushM, FlushW
 );

  logic BranchFlushDE;
  logic StallFCause, StallDCause, StallECause, StallMCause, StallWCause;
  logic FirstUnstalledD, FirstUnstalledE, FirstUnstalledM, FirstUnstalledW;

+
  // stalls and flushes
  // loads: stall for one cycle if the subsequent instruction depends on the load
  // branches and jumps: flush the next two instructions if the branch is taken in EXE
@ -56,7 +60,7 @@ module hazard(
  assign StallFCause = CSRWritePendingDEM & ~(BranchFlushDE);
  assign StallDCause = (LoadStallD | MulDivStallD | CSRRdStallD) & ~(BranchFlushDE);    // stall in decode if instruction is a load/mul/csr dependent on previous
 //  assign StallDCause = LoadStallD | MulDivStallD | CSRRdStallD;    // stall in decode if instruction is a load/mul/csr dependent on previous
-  assign StallECause = 0;
+  assign StallECause = DivBusyE;
  assign StallMCause = 0; 
  assign StallWCause = DataStall | ICacheStallF;

@ -68,15 +72,17 @@ module hazard(
  assign StallM = StallW | StallMCause;
  assign StallW = StallWCause;

+  //assign FirstUnstalledD = (~StallD & StallF & ~MulDivStallD);
  assign FirstUnstalledD = (~StallD & StallF);
+  //assign FirstUnstalledE = (~StallE & StallD & ~MulDivStallD);
  assign FirstUnstalledE = (~StallE & StallD);
  assign FirstUnstalledM = (~StallM & StallE);
  assign FirstUnstalledW = (~StallW & StallM);;
  
  // Each stage flushes if the previous stage is the last one stalled (for cause) or the system has reason to flush
  assign FlushF = BPPredWrongE;
-  assign FlushD = FirstUnstalledD || BranchFlushDE;  //  PCSrcE |InstrStall | CSRWritePendingDEM | RetM | TrapM;
-  assign FlushE = FirstUnstalledE || BranchFlushDE; //LoadStallD | PCSrcE | RetM | TrapM;
+  assign FlushD = FirstUnstalledD || BranchFlushDE;  // PCSrcE |InstrStall | CSRWritePendingDEM | RetM | TrapM;
+  assign FlushE = FirstUnstalledE || BranchFlushDE;  // LoadStallD | PCSrcE | RetM | TrapM;
  assign FlushM = FirstUnstalledM || RetM || TrapM;
  assign FlushW = FirstUnstalledW | TrapM;
 endmodule
--- a/wally-pipelined/src/ieu/forward.sv
+++ b/wally-pipelined/src/ieu/forward.sv
@ -27,12 +27,13 @@

 module forward(
  // Detect hazards
-  input  logic [4:0] Rs1D, Rs2D, Rs1E, Rs2E, RdE, RdM, RdW,
-  input  logic       MemReadE, MulDivE, CSRReadE,
-  input  logic       RegWriteM, RegWriteW, 
+  input logic [4:0]  Rs1D, Rs2D, Rs1E, Rs2E, RdE, RdM, RdW,
+  input logic 	     MemReadE, MulDivE, CSRReadE,
+  input logic 	     RegWriteM, RegWriteW,
+  input logic 	     DivDoneE, DivBusyE,
  // Forwarding controls
  output logic [1:0] ForwardAE, ForwardBE,
-  output logic       LoadStallD, MulDivStallD, CSRRdStallD
+  output logic 	     LoadStallD, MulDivStallD, CSRRdStallD
 );
  
  always_comb begin
@ -48,8 +49,8 @@ module forward(
  end

  // Stall on dependent operations that finish in Mem Stage and can't bypass in time
-  assign LoadStallD = MemReadE & ((Rs1D == RdE) | (Rs2D == RdE));  
-  assign MulDivStallD = MulDivE & ((Rs1D == RdE) | (Rs2D == RdE)); // *** extend with stalls for divide
-  assign CSRRdStallD = CSRReadE & ((Rs1D == RdE) | (Rs2D == RdE));
+   assign LoadStallD = MemReadE & ((Rs1D == RdE) | (Rs2D == RdE));  
+   assign MulDivStallD = MulDivE & ((Rs1D == RdE) | (Rs2D == RdE)) | MulDivE | DivBusyE; // *** extend with stalls for divide
+   assign CSRRdStallD = CSRReadE & ((Rs1D == RdE) | (Rs2D == RdE));

 endmodule
--- a/wally-pipelined/src/ieu/ieu.sv
+++ b/wally-pipelined/src/ieu/ieu.sv
@ -26,39 +26,40 @@
 `include "wally-config.vh"

 module ieu (
-  input  logic             clk, reset,
+  input logic 		   clk, reset,
  // Decode Stage interface
-  input  logic [31:0]      InstrD,
-  input  logic             IllegalIEUInstrFaultD, 
-  output logic             IllegalBaseInstrFaultD,
+  input logic [31:0] 	   InstrD,
+  input logic 		   IllegalIEUInstrFaultD, 
+  output logic 		   IllegalBaseInstrFaultD,
  // Execute Stage interface
-  input  logic [`XLEN-1:0] PCE, 
-  input  logic [`XLEN-1:0] PCLinkE,
+  input logic [`XLEN-1:0]  PCE, 
+  input logic [`XLEN-1:0]  PCLinkE,
  output logic [`XLEN-1:0] PCTargetE,
-  output logic             MulDivE, W64E,
-  output logic [2:0]       Funct3E,
+  output logic 		   MulDivE, W64E,
+  output logic [2:0] 	   Funct3E,
  output logic [`XLEN-1:0] SrcAE, SrcBE,
  // Memory stage interface
-  input  logic             DataMisalignedM,
-  input  logic             DataAccessFaultM,
-  input  logic             SquashSCW,
-  output logic [1:0]       MemRWM,
-  output logic [1:0]       AtomicM,
+  input logic 		   DataMisalignedM,
+  input logic 		   DataAccessFaultM,
+  input logic 		   SquashSCW,
+  output logic [1:0] 	   MemRWM,
+  output logic [1:0] 	   AtomicM,
  output logic [`XLEN-1:0] MemAdrM, WriteDataM,
  output logic [`XLEN-1:0] SrcAM,
-  output logic [2:0]       Funct3M,
+  output logic [2:0] 	   Funct3M,
  // Writeback stage
-  input  logic [`XLEN-1:0] CSRReadValW, ReadDataW, MulDivResultW,
+  input logic [`XLEN-1:0]  CSRReadValW, ReadDataW, MulDivResultW,
  // input  logic [`XLEN-1:0] PCLinkW,
-  output logic             InstrValidW,
+  output logic 		   InstrValidW,
  // hazards
-  input  logic             StallE, StallM, StallW,
-  input  logic             FlushE, FlushM, FlushW,
-  output logic             LoadStallD, MulDivStallD, CSRRdStallD,
-  output logic             PCSrcE,
-
-  output logic             CSRReadM, CSRWriteM, PrivilegedM,
-  output logic             CSRWritePendingDEM
+  input logic 		   StallE, StallM, StallW,
+  input logic 		   FlushE, FlushM, FlushW,
+  output logic 		   LoadStallD, MulDivStallD, CSRRdStallD,
+  output logic 		   PCSrcE,
+  input logic 		   DivDoneE,
+  input logic 		   DivBusyE,
+  output logic 		   CSRReadM, CSRWriteM, PrivilegedM,
+  output logic 		   CSRWritePendingDEM
 );

  logic [2:0]  ImmSrcD;
@ -78,5 +79,6 @@ module ieu (
  controller c(.*);
  datapath   dp(.*);             
  forward    fw(.*);
+
 endmodule

--- a/wally-pipelined/src/ifu/ifu.sv
+++ b/wally-pipelined/src/ifu/ifu.sv
@ -61,8 +61,9 @@ module ifu (
  // TLB management
  input logic  [1:0]       PrivilegeModeW,
  input logic  [`XLEN-1:0] PageTableEntryF,
+  input logic  [1:0]       PageTypeF,
  input logic  [`XLEN-1:0] SATP_REGW,
-  input logic              ITLBWriteF, // ITLBFlushF,
+  input logic              ITLBWriteF, ITLBFlushF,
  output logic             ITLBMissF, ITLBHitF
 );

@ -75,13 +76,15 @@ module ifu (
  logic [31:0]      InstrRawD, InstrE, InstrW;
  logic [31:0]      nop = 32'h00000013; // instruction for NOP
  logic [`XLEN-1:0] ITLBInstrPAdrF, ICacheInstrPAdrF;
+  // *** send this to the trap unit
+  logic             ITLBPageFaultF;

-  // *** temporary hack until walker is hooked up -- Thomas F
-  // logic  [`XLEN-1:0] PageTableEntryF = '0;
-  logic ITLBFlushF = '0;
-  // logic ITLBWriteF = '0;
-  tlb #(3) itlb(clk, reset, SATP_REGW, PrivilegeModeW, PCF, PageTableEntryF, ITLBWriteF, ITLBFlushF,
-    ITLBInstrPAdrF, ITLBMissF, ITLBHitF);
+  tlb #(3) itlb(.TLBAccess(1'b1), .VirtualAddress(PCF),
+                .PageTableEntryWrite(PageTableEntryF), .PageTypeWrite(PageTypeF),
+                .TLBWrite(ITLBWriteF), .TLBFlush(ITLBFlushF),
+                .PhysicalAddress(ITLBInstrPAdrF), .TLBMiss(ITLBMissF),
+                .TLBHit(ITLBHitF), .TLBPageFault(ITLBPageFaultF),
+                .*);

  // branch predictor signals
  logic 	   SelBPPredF;
--- a/wally-pipelined/src/mmu/cam_line.sv
+++ b/wally-pipelined/src/mmu/cam_line.sv
@ -0,0 +1,79 @@
+///////////////////////////////////////////
+// cam_line.sv
+//
+// Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021
+// Modified:
+//
+// Purpose: CAM line for the translation lookaside buffer (TLB)
+//          Determines whether a virtual address matches the stored key.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+`include "wally-constants.vh"
+
+module cam_line #(parameter KEY_BITS = 20,
+                  parameter HIGH_SEGMENT_BITS = 10) (
+  input                 clk, reset,
+
+  // The requested page number to compare against the key
+  input  [KEY_BITS-1:0] VirtualPageNumber,
+
+  // Signals to write a new entry to this line
+  input                 CAMLineWrite,
+  input  [1:0]          PageTypeWrite,
+
+  // Flush this line (set valid to 0)
+  input                 TLBFlush,
+
+  // This entry is a key for a giga, mega, or kilopage.
+  // PageType == 2'b00 --> kilopage
+  // PageType == 2'b01 --> megapage
+  // PageType == 2'b11 --> gigapage
+  output [1:0]          PageType,  // *** should this be the stored version or the always updated one?
+  output                Match
+);
+
+  // This entry has KEY_BITS for the key plus one valid bit.
+  logic                Valid;
+  logic [KEY_BITS-1:0] Key;
+
+  // When determining a match for a superpage, only a portion of the input
+  // VirtualPageNumber is significant. The mixer below substitutes segments
+  // of Key for the insignificant segments so that they always compare equal.
+  logic [KEY_BITS-1:0] VirtualPageNumberQuery;
+
+  // On a write, update the type of the page referred to by this line.
+  // PageType is therefore the stored (registered) value.
+  flopenr #(2) pagetypeflop(clk, reset, CAMLineWrite, PageTypeWrite, PageType);
+
+  // On a write, set the valid bit high and update the stored key.
+  // On a flush, zero the valid bit and leave the key unchanged.
+  // *** Might we want to update stored key right away to output match on the
+  // write cycle? (using a mux)
+  flopenrc #(1) validbitflop(clk, reset, TLBFlush, CAMLineWrite, 1'b1, Valid);
+  flopenr #(KEY_BITS) keyflop(clk, reset, CAMLineWrite, VirtualPageNumber, Key);
+
+  // Calculate the actual query key based on the input key and the page type.
+  // For example, a megapage in sv39 only cares about VPN2 and VPN1, so VPN0
+  // should automatically match.
+  page_number_mixer #(KEY_BITS, HIGH_SEGMENT_BITS) mixer(VirtualPageNumber, Key, PageType, VirtualPageNumberQuery);
+
+  // Match only when the line is valid AND the query equals the stored key.
+  // Both sides are KEY_BITS+1 wide; the leading 1 lines up with Valid.
+  assign Match = ({1'b1, VirtualPageNumberQuery} == {Valid, Key});
+endmodule
--- a/wally-pipelined/src/mmu/decoder.sv
+++ b/wally-pipelined/src/mmu/decoder.sv
@ -0,0 +1,36 @@
+///////////////////////////////////////////
+// decoder.sv
+//
+// Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021
+// Modified:
+//
+// Purpose: Binary encoding to one-hot decoder
+//
+// A component of the Wally configurable RISC-V project.
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+module decoder #(parameter BINARY_BITS = 3) (
+  input  [BINARY_BITS-1:0]      binary,   // index of the output bit to set
+  output [(2**BINARY_BITS)-1:0] one_hot   // only bit number 'binary' is 1
+);
+
+  // Shift a lone 1, sized to the full output width, up to position 'binary'.
+  // *** Double check whether this synthesizes as expected
+  assign one_hot = {{((2**BINARY_BITS)-1){1'b0}}, 1'b1} << binary;
+endmodule
--- a/wally-pipelined/src/mmu/page_number_mixer.sv
+++ b/wally-pipelined/src/mmu/page_number_mixer.sv
@ -0,0 +1,85 @@
+///////////////////////////////////////////
+// page_number_mixer.sv
+//
+// Written: tfleming@hmc.edu & jtorrey@hmc.edu 6 April 2021
+// Modified:
+//
+// Purpose: Takes two page numbers and replaces segments of the first page
+//          number with segments from the second, based on the page type.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// Builds PageNumberCombined from PageNumber, optionally substituting the low
+// segment(s) with the corresponding segment(s) of MixPageNumber.
+//   BITS              - width of both page numbers
+//   HIGH_SEGMENT_BITS - width of the most significant segment
+// The most significant segment always comes from PageNumber. Each low segment
+// is chosen by one bit of PageType through a mux2 (presumably d0, d1, select,
+// output port order, so a set PageType bit picks the MixPageNumber segment --
+// confirm against mux2).
+module page_number_mixer #(parameter BITS = 20,
+                           parameter HIGH_SEGMENT_BITS = 10) (
+    input  [BITS-1:0] PageNumber,
+    input  [BITS-1:0] MixPageNumber,
+    input  [1:0]      PageType,
+    output [BITS-1:0] PageNumberCombined
+);
+
+  generate
+    // *** Just checking XLEN is not enough to support sv39 AND sv48.
+    if (`XLEN == 32) begin
+      // Two segments: one high segment of HIGH_SEGMENT_BITS and one low
+      // segment holding all remaining bits.
+      localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS);
+
+      logic [LOW_SEGMENT_BITS-1:0] Segment0Combined;
+
+      // Segment 0 is selected between the two page numbers by PageType[0]
+      mux2 #(LOW_SEGMENT_BITS) segment0mux(
+        PageNumber[LOW_SEGMENT_BITS-1:0],
+        MixPageNumber[LOW_SEGMENT_BITS-1:0],
+        PageType[0],
+        Segment0Combined);
+
+      // High segment passes straight through from PageNumber
+      assign PageNumberCombined =
+        {PageNumber[BITS-1:LOW_SEGMENT_BITS], Segment0Combined};
+    end else begin
+      // Three segments: the high segment may be wider than the two equally
+      // sized low segments. For example, an sv39 PTE has 26 bits for PPN2 and
+      // 9 bits for each of the other segments.
+      localparam LOW_SEGMENT_BITS = (BITS - HIGH_SEGMENT_BITS) / 2;
+
+      // Total number of bits covered by the three segments
+      localparam SEGMENTED_BITS = HIGH_SEGMENT_BITS + 2*LOW_SEGMENT_BITS;
+
+      logic [LOW_SEGMENT_BITS-1:0] Segment1Combined, Segment0Combined;
+
+      // Segment 1 is selected by PageType[1], segment 0 by PageType[0]
+      mux2 #(LOW_SEGMENT_BITS) segment1mux(
+        PageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS],
+        MixPageNumber[2*LOW_SEGMENT_BITS-1:LOW_SEGMENT_BITS],
+        PageType[1],
+        Segment1Combined);
+      mux2 #(LOW_SEGMENT_BITS) segment0mux(
+        PageNumber[LOW_SEGMENT_BITS-1:0],
+        MixPageNumber[LOW_SEGMENT_BITS-1:0],
+        PageType[0],
+        Segment0Combined);
+
+      // High segment passes straight through from PageNumber
+      assign PageNumberCombined =
+        {PageNumber[SEGMENTED_BITS-1:2*LOW_SEGMENT_BITS],
+         Segment1Combined,
+         Segment0Combined};
+    end
+  endgenerate
+endmodule
--- a/wally-pipelined/src/mmu/priority_encoder.sv
+++ b/wally-pipelined/src/mmu/priority_encoder.sv
@ -0,0 +1,50 @@
+///////////////////////////////////////////
+// priority_encoder.sv
+//
+// Written: tfleming@hmc.edu & jtorrey@hmc.edu 7 April 2021
+// Based on implementation from https://www.allaboutcircuits.com/ip-cores/communication-controller/priority-encoder/
+// *** Give proper LGPL attribution for above source
+// Modified:
+//
+// Purpose: One-hot encoding to binary encoder
+//
+// A component of the Wally configurable RISC-V project.
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+
+// Parameterized priority encoder.
+//   one_hot - request vector of 2**BINARY_BITS lines (need not be strictly
+//             one-hot)
+//   binary  - index of the lowest-numbered asserted line, or 0 when no line
+//             is asserted
+// The previous generate-based version attached several continuous assignments
+// to the same bit of `binary`, so the drivers fought each other (resolving to
+// X) whenever any line was asserted. A single procedural block is used
+// instead so every output bit has exactly one driver.
+module priority_encoder #(parameter BINARY_BITS = 3) (
+  input  [(2**BINARY_BITS)-1:0] one_hot,
+  output logic [BINARY_BITS-1:0] binary
+);
+
+  localparam ONE_HOT_BITS = 2**BINARY_BITS;
+
+  always_comb begin
+    // Default when nothing is asserted
+    binary = '0;
+    // Scan from the highest line down so the last assignment that takes
+    // effect, and therefore the winner, belongs to the lowest asserted line.
+    for (int i = ONE_HOT_BITS-1; i >= 0; i--) begin
+      if (one_hot[i]) binary = BINARY_BITS'(i);
+    end
+  end
+
+endmodule
--- a/wally-pipelined/src/mmu/tlb.sv
+++ b/wally-pipelined/src/mmu/tlb.sv
@ -50,7 +50,6 @@
 /* *** TODO:
 * - add LRU algorithm (select the write index based on which entry was used
 *   least recently)
- * - refactor modules into multiple files
 */

 // The TLB will have 2**ENTRY_BITS total entries
@ -63,11 +62,15 @@ module tlb #(parameter ENTRY_BITS = 3) (
  // Current privilege level of the processeor
  input  [1:0]       PrivilegeModeW,

+  // High if the TLB is currently being accessed
+  input              TLBAccess,
+
  // Virtual address input
  input  [`XLEN-1:0] VirtualAddress,

  // Controls for writing a new entry to the TLB
  input  [`XLEN-1:0] PageTableEntryWrite,
+  input  [1:0]       PageTypeWrite,
  input              TLBWrite,

  // Invalidate all TLB entries
@ -76,7 +79,10 @@ module tlb #(parameter ENTRY_BITS = 3) (
  // Physical address outputs
  output [`XLEN-1:0] PhysicalAddress,
  output             TLBMiss,
-  output             TLBHit
+  output             TLBHit,
+
+  // Faults
+  output             TLBPageFault
 );

  logic SvMode;
@ -89,10 +95,8 @@ module tlb #(parameter ENTRY_BITS = 3) (
      assign SvMode = SATP_REGW[63]; // currently just a boolean whether translation enabled
    end
  endgenerate
-  // *** Currently fake virtual memory being on for testing purposes
-  // *** DO NOT ENABLE UNLESS TESTING
-  // assign SvMode = 1;

+  // Whether translation should occur
  assign Translate = SvMode & (PrivilegeModeW != `M_MODE);

  // *** If we want to support multiple virtual memory modes (ie sv39 AND sv48),
@ -105,42 +109,52 @@ module tlb #(parameter ENTRY_BITS = 3) (

  // Sections of the virtual and physical addresses
  logic [`VPN_BITS-1:0] VirtualPageNumber;
-  logic [`PPN_BITS-1:0] PhysicalPageNumber;
-  logic [11:0]          PageOffset;
+  logic [`PPN_BITS-1:0] PhysicalPageNumber, PhysicalPageNumberMixed;
  logic [`PA_BITS-1:0]  PhysicalAddressFull;

-  // Pattern and pattern location in the CAM
+  // Sections of the page table entry
+  logic [7:0]           PTEAccessBits;
+  logic [11:0]          PageOffset;
+
+  // Pattern location in the CAM and type of page hit
  logic [ENTRY_BITS-1:0] VPNIndex;
+  logic [1:0]            HitPageType;

-  // RAM access location
-  logic [ENTRY_BITS-1:0] EntryIndex;
-
-  // Page table entry matching the virtual address
-  logic [`XLEN-1:0] PageTableEntry;
+  // Whether the virtual address has a match in the CAM
+  logic                  CAMHit;

  assign VirtualPageNumber = VirtualAddress[`VPN_BITS+11:12];
  assign PageOffset        = VirtualAddress[11:0];

-  // Choose a read or write location to the entry list
-  mux2 #(3) indexmux(VPNIndex, WriteIndex, TLBWrite, EntryIndex);
-
  // Currently use random replacement algorithm
  tlb_rand rdm(.*);

  tlb_ram #(ENTRY_BITS) ram(.*);
-  tlb_cam #(ENTRY_BITS, `VPN_BITS) cam(.*);
+  tlb_cam #(ENTRY_BITS, `VPN_BITS, `VPN_SEGMENT_BITS) cam(.*);

-  always_comb begin
-    assign PhysicalPageNumber = PageTableEntry[`PPN_BITS+9:10];
+  // *** check whether access is allowed, otherwise fault
+  assign TLBPageFault = 0; // *** temporary

-    if (TLBHit) begin
-      assign PhysicalAddressFull = {PhysicalPageNumber, PageOffset};
-    end else begin
-      assign PhysicalAddressFull = '0; // *** Actual behavior; disabled until walker functioning
-      //assign PhysicalAddressFull = {2'b0, VirtualPageNumber, PageOffset} // *** pass through should be removed as soon as walker ready
-    end
-  end
+  // *** Not the cleanest solution.
+  // The highest segment of the physical page number has more bits
+  // than the highest segment of the virtual page number.
+  localparam EXTRA_PHYSICAL_BITS = `PPN_HIGH_SEGMENT_BITS - `VPN_SEGMENT_BITS;

+  // Replace segments of the virtual page number with segments of the physical
+  // page number. For 4 KB pages, the entire virtual page number is replaced.
+  // For superpages, some segments are considered offsets into a larger page.
+  page_number_mixer #(`PPN_BITS, `PPN_HIGH_SEGMENT_BITS)
+    physical_mixer(PhysicalPageNumber,
+      {{EXTRA_PHYSICAL_BITS{1'b0}}, VirtualPageNumber},
+      HitPageType,
+      PhysicalPageNumberMixed);
+
+  // Provide physical address only on TLBHits to cause catastrophic errors if
+  // garbage address is used.
+  assign PhysicalAddressFull = (TLBHit) ?
+    {PhysicalPageNumberMixed, PageOffset} : '0;
+
+  // Output the hit physical address if translation is currently on.
  generate
    if (`XLEN == 32) begin
      mux2 #(`XLEN) addressmux(VirtualAddress, PhysicalAddressFull[31:0], Translate, PhysicalAddress);
@ -149,93 +163,6 @@ module tlb #(parameter ENTRY_BITS = 3) (
    end
  endgenerate

-  assign TLBMiss = ~TLBHit & ~(TLBWrite | TLBFlush) & Translate;
-endmodule
-
-module tlb_ram #(parameter ENTRY_BITS = 3) (
-  input                   clk, reset,
-  input  [ENTRY_BITS-1:0] EntryIndex,
-  input  [`XLEN-1:0]      PageTableEntryWrite,
-  input                   TLBWrite,
-
-  output [`XLEN-1:0]      PageTableEntry
-);
-
-  localparam NENTRIES = 2**ENTRY_BITS;
-
-  logic [`XLEN-1:0] ram [0:NENTRIES-1];
-  always @(posedge clk) begin
-    if (TLBWrite) ram[EntryIndex] <= PageTableEntryWrite;
-  end
-
-  assign PageTableEntry = ram[EntryIndex];
-    
-  initial begin
-    for (int i = 0; i < NENTRIES; i++)
-      ram[i] = `XLEN'b0;
-  end
-
-endmodule
-
-module tlb_cam #(parameter ENTRY_BITS = 3,
-                 parameter KEY_BITS   = 20) (
-  input                    clk, reset,
-  input  [KEY_BITS-1:0]    VirtualPageNumber,
-  input  [ENTRY_BITS-1:0]  WriteIndex,
-  input                    TLBWrite,
-  input                    TLBFlush,
-  output [ENTRY_BITS-1:0]  VPNIndex,
-  output                   TLBHit
-);
-
-  localparam NENTRIES = 2**ENTRY_BITS;
-
-  // Each entry of this memory has KEY_BITS for the key plus one valid bit.
-  logic [KEY_BITS:0] ram [0:NENTRIES-1];
-
-  logic [ENTRY_BITS-1:0] matched_address_comb;
-  logic                  match_found_comb;
-
-  always @(posedge clk) begin
-    if (TLBWrite) ram[WriteIndex] <= {1'b1,VirtualPageNumber};
-    if (TLBFlush) begin
-      for (int i = 0; i < NENTRIES; i++)
-        ram[i][KEY_BITS] = 1'b0;  // Zero out msb (valid bit) of all entries
-    end
-  end
-
-  // *** Check whether this for loop synthesizes correctly
-  always_comb begin
-    match_found_comb = 1'b0;
-    matched_address_comb = '0;
-    for (int i = 0; i < NENTRIES; i++) begin
-      if (ram[i] == {1'b1,VirtualPageNumber} && !match_found_comb) begin
-        matched_address_comb = i;
-        match_found_comb = 1;
-      end else begin
-        matched_address_comb = matched_address_comb;
-        match_found_comb = match_found_comb;
-      end
-    end
-  end
-
-  assign VPNIndex = matched_address_comb;
-  assign TLBHit = match_found_comb & ~(TLBWrite | TLBFlush);
-
-  initial begin
-    for (int i = 0; i < NENTRIES; i++)
-      ram[i] = '0;
-  end
-
-endmodule
-
-module tlb_rand #(parameter ENTRY_BITS = 3) (
-  input                   clk, reset,
-  output [ENTRY_BITS-1:0] WriteIndex
-);
-
-  logic [31:0] data;
-  assign data = $urandom;
-  assign WriteIndex = data[ENTRY_BITS:0];
-  
+  assign TLBHit = CAMHit & TLBAccess;
+  assign TLBMiss = ~TLBHit & ~TLBFlush & Translate & TLBAccess;
 endmodule
--- a/wally-pipelined/src/mmu/tlb_cam.sv
+++ b/wally-pipelined/src/mmu/tlb_cam.sv
@ -0,0 +1,72 @@
+///////////////////////////////////////////
+// tlb_cam.sv
+//
+// Written: jtorrey@hmc.edu 16 February 2021
+// Modified:
+//
+// Purpose: Stores virtual page numbers with cached translations.
+//          Determines whether a given virtual page number is in the TLB.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// Content-addressable memory of the TLB. Instantiates 2**ENTRY_BITS cam_line
+// modules, collects their per-line Match outputs, and reports whether any line
+// matched (CAMHit), the encoded index of a matching line (VPNIndex) and the
+// page type recorded for that line (HitPageType).
+//   ENTRY_BITS        - log2 of the number of CAM lines
+//   KEY_BITS          - width of the virtual page number
+//   HIGH_SEGMENT_BITS - forwarded unchanged to each cam_line
+module tlb_cam #(parameter ENTRY_BITS = 3,
+                 parameter KEY_BITS   = 20,
+                 parameter HIGH_SEGMENT_BITS = 10) (
+  input                    clk, reset,
+  input  [KEY_BITS-1:0]    VirtualPageNumber,
+  input  [1:0]             PageTypeWrite,
+  input  [ENTRY_BITS-1:0]  WriteIndex,
+  input                    TLBWrite,
+  input                    TLBFlush,
+  output [ENTRY_BITS-1:0]  VPNIndex,
+  output [1:0]             HitPageType,
+  output                   CAMHit
+);
+
+  localparam NENTRIES = 2**ENTRY_BITS;
+
+  // One bit per CAM line: write select, per-line page type, per-line match
+  logic [NENTRIES-1:0] CAMLineWrite;
+  logic [1:0] PageTypeList [0:NENTRIES-1];
+  logic [NENTRIES-1:0] Matches;
+
+  // Determine which CAM line should be written, based on a binary index
+  decoder #(ENTRY_BITS) decoder(WriteIndex, CAMLineWrite);
+
+  // Create NENTRIES CAM lines, each of which will independently consider
+  // whether the requested virtual address is a match. Each line stores the
+  // original virtual page number from when the address was written, regardless
+  // of page type. However, matches are determined based on a subset of the
+  // page number segments.
+  generate
+    genvar i;
+    for (i = 0; i < NENTRIES; i++) begin
+      // A line is written only when it is the selected line AND TLBWrite is
+      // high. Ports not listed explicitly are bound by name through `.*`
+      // (presumably clk, reset, VirtualPageNumber, PageTypeWrite and TLBFlush;
+      // confirm against the cam_line port list).
+      cam_line #(KEY_BITS, HIGH_SEGMENT_BITS) cam_line(
+        .CAMLineWrite(CAMLineWrite[i] && TLBWrite),
+        .PageType(PageTypeList[i]),
+        .Match(Matches[i]),
+        .*);
+    end
+  endgenerate
+
+  // In case there are multiple matches in the CAM, select only one
+  priority_encoder #(ENTRY_BITS) match_priority(Matches, VPNIndex);
+
+  // A hit requires at least one matching line and is suppressed while
+  // TLBFlush is high
+  assign CAMHit = |Matches & ~TLBFlush;
+  // Page type of the line selected by VPNIndex
+  assign HitPageType = PageTypeList[VPNIndex];
+
+endmodule
--- a/wally-pipelined/src/mmu/tlb_ram.sv
+++ b/wally-pipelined/src/mmu/tlb_ram.sv
@ -0,0 +1,60 @@
+///////////////////////////////////////////
+// tlb_ram.sv
+//
+// Written: jtorrey@hmc.edu & tfleming@hmc.edu 16 February 2021
+// Modified:
+//
+// Purpose: Stores page table entries of cached address translations.
+//          Outputs the physical page number and access bits of the current
+//          virtual address on a TLB hit.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+`include "wally-config.vh"
+`include "wally-constants.vh"
+
+// *** use actual flop notation instead of initialbegin and alwaysff
+// Storage for the page table entries cached by the TLB.
+//   Write: on a rising clk edge with TLBWrite high, PageTableEntryWrite is
+//          stored at WriteIndex.
+//   Read:  combinational; the entry at VPNIndex is split into its physical
+//          page number and its low eight access bits.
+// reset is declared but not referenced; the contents are cleared by an
+// initial block instead.
+module tlb_ram #(parameter ENTRY_BITS = 3) (
+  input                   clk, reset,
+  input  [ENTRY_BITS-1:0] VPNIndex,  // Index to read from
+  input  [ENTRY_BITS-1:0] WriteIndex,
+  input  [`XLEN-1:0]      PageTableEntryWrite,
+  input                   TLBWrite,
+
+  output [`PPN_BITS-1:0]  PhysicalPageNumber,
+  output [7:0]            PTEAccessBits
+);
+
+  localparam NENTRIES = 2**ENTRY_BITS;
+
+  logic [`XLEN-1:0] ram [0:NENTRIES-1];
+  logic [`XLEN-1:0] PageTableEntry;
+
+  // Start with every entry cleared
+  initial begin
+    for (int i = 0; i < NENTRIES; i++) begin
+      ram[i] = '0;
+    end
+  end
+
+  // Synchronous write port
+  always @(posedge clk) begin
+    if (TLBWrite) begin
+      ram[WriteIndex] <= PageTableEntryWrite;
+    end
+  end
+
+  // Combinational read port
+  assign PageTableEntry = ram[VPNIndex];
+
+  // The physical page number occupies PPN_BITS bits starting at bit 10; the
+  // access bits are the low byte of the entry.
+  assign PhysicalPageNumber = PageTableEntry[10 +: `PPN_BITS];
+  assign PTEAccessBits      = PageTableEntry[0 +: 8];
+
+endmodule
--- a/wally-pipelined/src/mmu/tlb_rand.sv
+++ b/wally-pipelined/src/mmu/tlb_rand.sv
@ -0,0 +1,35 @@
+///////////////////////////////////////////
+// tlb_rand.sv
+//
+// Written: jtorrey@hmc.edu & tfleming@hmc.edu 16 February 2021
+// Modified:
+//
+// Purpose: Outputs a random index for writing to the TLB.
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, 
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software 
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT 
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+// Supplies the index used when writing a new entry into the TLB, taken from
+// the low ENTRY_BITS bits of a $urandom value.
+// clk and reset are declared but not referenced in this module.
+module tlb_rand #(parameter ENTRY_BITS = 3) (
+  input                   clk, reset,
+  output [ENTRY_BITS-1:0] WriteIndex
+);
+
+  // 32-bit value obtained from the $urandom system function.
+  // NOTE(review): $urandom is a simulation system function, so this is
+  // presumably not synthesizable. The continuous assignment also has no signal
+  // operands, so it may be evaluated only once per simulation, which would
+  // make WriteIndex constant -- confirm in the target simulator.
+  logic [31:0] data;
+  assign data = $urandom;
+  // Keep only as many low bits as the index is wide
+  assign WriteIndex = data[ENTRY_BITS-1:0];
+  
+endmodule
--- a/wally-pipelined/src/muldiv/div.sv
+++ b/wally-pipelined/src/muldiv/div.sv
--- a/wally-pipelined/src/muldiv/div/div
+++ b/wally-pipelined/src/muldiv/div/div
--- a/wally-pipelined/src/muldiv/div/div.c
+++ b/wally-pipelined/src/muldiv/div/div.c
@ -0,0 +1,26 @@
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+
+/*
+ * Scratch check for 64-bit unsigned division: prints the dividend N, the
+ * divisor D, the quotient Q and the remainder R in hexadecimal.
+ */
+int main(void) {
+
+  /*
+   * Other operand pairs that have been tried:
+   *   N = 0xc9649f05a8e1a8bb, D = 0x82f6747f707af2c0
+   *   N = 0x10fd3dedadea5195, D = 0xdf7f3844121bcc23
+   */
+  const uint64_t N = 0x4;
+  const uint64_t D = 0xbfffffffffffffff;
+
+  /* Unsigned quotient and remainder of N by D */
+  const uint64_t Q = N / D;
+  const uint64_t R = N % D;
+
+  printf("N = %" PRIx64 "\n", N);
+  printf("D = %" PRIx64 "\n", D);
+  printf("Q = %" PRIx64 "\n", Q);
+  printf("R = %" PRIx64 "\n", R);
+
+  return 0;
+}
--- a/Show More
+++ b/Show More