Merge branch 'main' of https://github.com/davidharrishmc/riscv-wally

2022-06-21 20:31:06 +00:00 · 2022-06-21 20:31:06 +00:00 · 0161683945
commit 0161683945
parent fe31ee92e8 493d3b1ac0
56 changed files with 5602 additions and 873 deletions
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@ -1 +1 @@
-Subproject commit 307c77b26e070ae85ffea665ad9b642b40e33c86
+Subproject commit be67c99bd461742aa1c100bcc0732657faae2230
--- a/benchmarks/embench/Makefile
+++ b/benchmarks/embench/Makefile
@ -4,20 +4,29 @@

 embench_dir = ../../addins/embench-iot

-all: sim size
+all: build sim size

 allClean: clean all

 build: buildspeed buildsize
+buildspeed: build_speedopt_speed build_sizeopt_speed
+buildsize: build_speedopt_size build_sizeopt_size

-# uses the build_all.py python file to build the tests in addins/embench-iot/bd_speed/ optimized for speed
-buildspeed:
-	$(embench_dir)/build_all.py --builddir=bd_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S" --cflags="-O2 -nostartfiles" 
-	find $(embench_dir)/bd_speed/ -type f ! -name "*.*" | while read f; do cp "$$f" "$$f.elf"; done
+# uses the build_all.py python file to build the speed benchmarks into addins/embench-iot/bd_speedopt_speed/ (compiled with -O2) and bd_sizeopt_speed/ (compiled with -Os)
+build_speedopt_speed:
+	$(embench_dir)/build_all.py --builddir=bd_speedopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S" --cflags="-O2 -nostartfiles" 
+	find $(embench_dir)/bd_speedopt_speed/ -type f ! -name "*.*" | while read f; do cp "$$f" "$$f.elf"; done

-# uses the build_all.py python file to build the tests in addins/embench-iot/bd_speed/ optimized for size
-buildsize:
-	$(embench_dir)/build_all.py --builddir=bd_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S" --cflags="-Os -msave-restore" --dummy-libs="libgcc libm libc crt0"
+build_sizeopt_speed:
+	$(embench_dir)/build_all.py --builddir=bd_sizeopt_speed --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/crt0.S" --cflags="-Os -nostartfiles" 
+	find $(embench_dir)/bd_sizeopt_speed/ -type f ! -name "*.*" | while read f; do cp "$$f" "$$f.elf"; done
+
+# uses the build_all.py python file to build the size benchmarks into addins/embench-iot/bd_speedopt_size/ (compiled with -O2) and bd_sizeopt_size/ (compiled with -Os)
+build_speedopt_size:
+	$(embench_dir)/build_all.py --builddir=bd_speedopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S" --cflags="-O2 -msave-restore" --dummy-libs="libgcc libm libc crt0"
+
+build_sizeopt_size:
+	$(embench_dir)/build_all.py --builddir=bd_sizeopt_size --arch riscv32 --chip generic --board rv32wallyverilog --ldflags="-nostdlib -nostartfiles ../../../config/riscv32/boards/rv32wallyverilog/startup/dummy.S" --cflags="-Os -msave-restore" --dummy-libs="libgcc libm libc crt0"

 # builds dependencies, then launches modelsim and finally runs python wrapper script to present results
 sim: modelsim_build_memfile modelsim_run speed
@ -28,35 +37,37 @@ modelsim_run:
 	cd ../../benchmarks/embench/

 # builds the objdump based on the compiled c elf files
-objdump: buildspeed
-	find $(embench_dir)/bd_speed/ -type f -name "*.elf" | while read f; do riscv64-unknown-elf-objdump -S -D "$$f" > "$$f.objdump"; done
+objdump:
+	find $(embench_dir)/bd_*_speed/ -type f -name "*.elf" | while read f; do riscv64-unknown-elf-objdump -S -D "$$f" > "$$f.objdump"; done

 # build memfiles, objdump.lab and objdump.addr files
 modelsim_build_memfile: objdump
-	find $(embench_dir)/bd_speed/ -type f -name "*.elf" | while read f; do riscv64-unknown-elf-elf2hex --bit-width 32 --input "$$f" --output "$$f.memfile"; done
-	find $(embench_dir)/bd_speed/ -type f -name "*.elf.objdump" | while read f; do extractFunctionRadix.sh $$f; done
+	find $(embench_dir)/bd_*_speed/ -type f -name "*.elf" | while read f; do riscv64-unknown-elf-elf2hex --bit-width 32 --input "$$f" --output "$$f.memfile"; done
+	find $(embench_dir)/bd_*_speed/ -type f -name "*.elf.objdump" | while read f; do extractFunctionRadix.sh $$f; done

 # builds the tests for speed, runs them on spike and then launches python script to present results
 # note that the speed python script benchmark_speed.py can get confused if there's both a .output file created from spike and modelsim
 # you'll need to manually remove one of the two .output files, or run make clean
-spike: buildspeed objdump spike_run speed
+spike: buildspeed spike_run speed

 # command to run spike on all of the benchmarks
 spike_run:
-	find $(embench_dir)/bd_speed/ -type f -name "*.elf" | while read f; do spike --isa=rv32imac +signature=$$f.spike.output +signature-granularity=4 $$f; done
+	find $(embench_dir)/bd_*opt_speed/ -type f -name "*.elf" | while read f; do spike --isa=rv32imac +signature=$$f.spike.output +signature-granularity=4 $$f; done

 # python wrapper to present results of embench size benchmark
 size: buildsize
-	$(embench_dir)/benchmark_size.py --builddir=bd_size --json-output > wallySize.json
+	$(embench_dir)/benchmark_size.py --builddir=bd_speedopt_size --json-output > wallySpeedOpt_size.json 
+	$(embench_dir)/benchmark_size.py --builddir=bd_sizeopt_size --json-output > wallySizeOpt_size.json 

 # python wrapper to present results of embench speed benchmark
 speed:
-	$(embench_dir)/benchmark_speed.py --builddir=bd_speed --target-module run_wally --cpu-mhz=1 --json-output > wallySpeed.json
+	$(embench_dir)/benchmark_speed.py --builddir=bd_sizeopt_speed --target-module run_wally --cpu-mhz=1 --json-output > wallySizeOpt_speed.json 
+	$(embench_dir)/benchmark_speed.py --builddir=bd_speedopt_speed --target-module run_wally --cpu-mhz=1 --json-output > wallySpeedOpt_speed.json 

 # deletes all files
 clean: 
-	rm -rf $(embench_dir)/bd_speed/
-	rm -rf $(embench_dir)/bd_size/
+	rm -rf $(embench_dir)/bd_*_speed/
+	rm -rf $(embench_dir)/bd_*_size/

 allclean: clean
 	rm -rf $(embench_dir)/logs/
--- a/benchmarks/graphGen.py
+++ b/benchmarks/graphGen.py
@ -3,9 +3,8 @@ import subprocess
 import sys
 import json
 import plotly.graph_objects as go
+from plotly.subplots import make_subplots

-coremarkData = {}
-embenchData = {}
 debug = True

 def loadCoremark():
@ -21,61 +20,85 @@ def loadCoremark():
    if (debug): print(coremarkData)
    return coremarkData

-def loadEmbench():
+def loadEmbench(embenchPath, embenchData):
    """loads the embench data dictionary"""
-    embenchPath = "embench/wallySpeed.json"
    f = open(embenchPath)
    embenchData = json.load(f)
    if (debug): print(embenchData)
    return embenchData

-def graphEmbench(embenchData):
-    ydata = list(embenchData["speed results"]["detailed speed results"].keys()) + ["speed geometric mean","speed geometric sd","speed geometric range"]
-    xdata = list(embenchData["speed results"]["detailed speed results"].values()) + [embenchData["speed results"]["speed geometric mean"],embenchData["speed results"]["speed geometric sd"],embenchData["speed results"]["speed geometric range"]]
-    fig = go.Figure(go.Bar(
+def graphEmbench(embenchSpeedOpt_SpeedData, embenchSizeOpt_SpeedData, embenchSpeedOpt_SizeData, embenchSizeOpt_SizeData):
+    fig = make_subplots(rows=2, cols=4,
+                        # subplot_titles( "Wally's Embench Cycles and Instret (with -O2)","Wally's Embench Cycles Per Instruction (with -O2)"))
+                        subplot_titles=( "Wally's Embench Cycles and Instret (with -O2)","Wally's Embench Cycles Per Instruction (with -O2)","Wally's Embench Speed Score (with -O2)","Wally's Embench Size Score (with -O2)",
+                                     "Wally's Embench Cycles and Instret (with -Os)","Wally's Embench Cycles Per Instruction (with -Os)","Wally's Embench Speed Score (with -Os)","Wally's Embench Size Score (with -Os)"))
+    
+    ydata = list(embenchSpeedOpt_SpeedData["speed results"]["detailed speed results"].keys()) + ["speed geometric mean","speed geometric sd","speed geometric range"]
+    xdata = list(embenchSpeedOpt_SpeedData["speed results"]["detailed speed results"].values()) + [embenchSpeedOpt_SpeedData["speed results"]["speed geometric mean"],embenchSpeedOpt_SpeedData["speed results"]["speed geometric sd"],embenchSpeedOpt_SpeedData["speed results"]["speed geometric range"]]
+
+    fig.add_trace( go.Bar(
            y=ydata,
            x=xdata,
-            orientation='h'))
+            textposition='outside', text=xdata,
+            orientation='h'),
+            row=1,col=3)

-    fig.show()
+    ydata = list(embenchSizeOpt_SpeedData["speed results"]["detailed speed results"].keys()) + ["speed geometric mean","speed geometric sd","speed geometric range"]
+    xdata = list(embenchSizeOpt_SpeedData["speed results"]["detailed speed results"].values()) + [embenchSizeOpt_SpeedData["speed results"]["speed geometric mean"],embenchSizeOpt_SpeedData["speed results"]["speed geometric sd"],embenchSizeOpt_SpeedData["speed results"]["speed geometric range"]]
+
+    fig.add_trace( go.Bar(
+            y=ydata,
+            x=xdata,
+            textposition='outside', text=xdata,
+            orientation='h'),
+            row=2,col=3)
+
+    
+    ydata = list(embenchSpeedOpt_SizeData["size results"]["detailed size results"].keys()) + ["size geometric mean","size geometric sd","size geometric range"]
+    xdata = list(embenchSpeedOpt_SizeData["size results"]["detailed size results"].values()) + [embenchSpeedOpt_SizeData["size results"]["size geometric mean"],embenchSpeedOpt_SizeData["size results"]["size geometric sd"],embenchSpeedOpt_SizeData["size results"]["size geometric range"]]
+
+    fig.add_trace( go.Bar(
+            y=ydata,
+            x=xdata,
+            textposition='outside', text=xdata,
+            orientation='h'),
+            row=1,col=4)
+
+    ydata = list(embenchSizeOpt_SizeData["size results"]["detailed size results"].keys()) + ["size geometric mean","size geometric sd","size geometric range"]
+    xdata = list(embenchSizeOpt_SizeData["size results"]["detailed size results"].values()) + [embenchSizeOpt_SizeData["size results"]["size geometric mean"],embenchSizeOpt_SizeData["size results"]["size geometric sd"],embenchSizeOpt_SizeData["size results"]["size geometric range"]]
+
+    fig.add_trace( go.Bar(
+            y=ydata,
+            x=xdata,
+            textposition='outside', text=xdata,
+            orientation='h'),
+            row=2,col=4)
+        
+    #         facet_row="Score", facet_col="Optimization Flag",
+    #         category_orders={"Score": ["Cycles & Instr", "CPI", "SpeedScore", "SizeScore"],
+    #                           "Optimization Flag": ["O2", "Os"]}),
+    #         orientation='h')
+    fig.update_layout(height=1500,width=4000, title_text="Wally Embench Scores", showlegend=False)
+
+    fig.write_image("figure.png", engine="kaleido")
+    # fig.show()


 def main():
-    coremarkData = loadCoremark()
-    embenchData = loadEmbench()
-    graphEmbench(embenchData)
+    coremarkData = {}
+    embenchSizeOpt_SpeedData = {}
+    embenchSpeedOpt_SpeedData = {}
+    embenchSizeOpt_SizeData = {}
+    embenchSpeedOpt_SizeData = {}
+    # coremarkData = loadCoremark()
+    embenchSpeedOpt_SpeedData = loadEmbench("embench/wallySpeedOpt_speed.json", embenchSpeedOpt_SpeedData)
+    embenchSizeOpt_SpeedData = loadEmbench("embench/wallySizeOpt_speed.json", embenchSizeOpt_SpeedData)
+    embenchSpeedOpt_SizeData = loadEmbench("embench/wallySpeedOpt_size.json", embenchSpeedOpt_SizeData)
+    embenchSizeOpt_SizeData = loadEmbench("embench/wallySizeOpt_size.json", embenchSizeOpt_SizeData)
+
+    graphEmbench(embenchSpeedOpt_SpeedData, embenchSizeOpt_SpeedData, embenchSpeedOpt_SizeData, embenchSizeOpt_SizeData)

 if __name__ == '__main__':
    sys.exit(main())

-# x = 
-# y = 
-
-# df = px.data.tips()
-# fig = px.bar(df, x="total_bill", y="day", orientation='h')
-# fig.show()
-# import plotly.express as px
-
-
-# result = sp.run(['ls', '-l'], stdout=sp.PIPE)
-# result.stdout
-
-# fig = go.Figure( go.Bar(
-#                 x=[],
-#                 y=[],
-#                 color="species",
-#                 facet_col="species", 
-#                 title="Using update_traces() With Plotly Express Figures"),
-#                 orientation='h')
-
-# fig.show()
-
-#
-# "ls -Art ../addins/embench-iot/logs/*speed* | tail -n 1 " # gets most recent embench speed log
-# "ls -Art ../addins/embench-iot/logs/*size* | tail -n 1 " # gets most recent embench speed log
-
-## get coremark score
-
-# cat coremarkPath | grep "CoreMark 1.0" | cut -d ':' -f 2 | cut -d " " -f 2
-# cat coremarkPath | grep "MTIME" | cut -d ':' -f 2 | cut -d " " -f 2 | tail -1
-# cat coremarkPath | grep "MINSTRET" | cut -d ':' -f 2 | cut -d " " -f 2 | tail -1
+# "ls -Art ../addins/embench-iot/logs/*speed* | tail -n 1 " # gets most recent embench speed log
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@ -95,6 +95,7 @@

 // largest length in IEU/FPU
 `define LGLEN ((`NF<`XLEN) ? `XLEN : `NF)
+`define LLEN ((`FLEN<`XLEN) ? `XLEN : `FLEN)
 `define LOGLGLEN $unsigned($clog2(`LGLEN+1))
 `define NORMSHIFTSZ ((`LGLEN+`NF) > (3*`NF+8) ? (`LGLEN+`NF+1) : (3*`NF+9))
 `define CORRSHIFTSZ ((`LGLEN+`NF) > (3*`NF+8) ? (`LGLEN+`NF+1) : (3*`NF+6))
--- a/pipelined/src/fpu/fctrl.sv
+++ b/pipelined/src/fpu/fctrl.sv
@ -121,11 +121,11 @@ module fctrl (
      assign FmtD = 0;
    else if (`FPSIZES == 2)begin
      logic [1:0] FmtTmp;
-      assign FmtTmp = (FResSelD == 2'b10)&~FWriteIntD ? {~Funct3D[1], ~(Funct3D[1]^Funct3D[0])} : ((Funct7D[6:3] == 4'b0100)&OpD[4]) ? Rs2D[1:0] : Funct7D[1:0];
+      assign FmtTmp = ((Funct7D[6:3] == 4'b0100)&OpD[4]) ? Rs2D[1:0] : Funct7D[1:0];
      assign FmtD = (`FMT == FmtTmp);
    end
    else if (`FPSIZES == 3|`FPSIZES == 4)
-      assign FmtD = (FResSelD == 2'b10)&~FWriteIntD ? {~Funct3D[1], ~(Funct3D[1]^Funct3D[0])} : ((Funct7D[6:3] == 4'b0100)&OpD[4]) ? Rs2D[1:0] : Funct7D[1:0];
+      assign FmtD = ((Funct7D[6:3] == 4'b0100)&OpD[4]) ? Rs2D[1:0] : Funct7D[1:0];

 //  Final Res Sel:
 //        fp      int
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@ -34,13 +34,14 @@ module fpu (
  input logic 		   reset,
  input logic [2:0] 	   FRM_REGW, // Rounding mode from CSR
  input logic [31:0] 	   InstrD, // instruction from IFU
-  input logic [`XLEN-1:0]  ReadDataW,// Read data from memory
+  input logic [`FLEN-1:0]  ReadDataW,// Read data from memory
  input logic [`XLEN-1:0]  ForwardedSrcAE, // Integer input being processed (from IEU)
  input logic 		   StallE, StallM, StallW, // stall signals from HZU
  input logic 		   FlushE, FlushM, FlushW, // flush signals from HZU
  input logic [4:0] 	   RdM, RdW, // which FP register to write to (from IEU)
  input logic [1:0]        STATUS_FS, // Is floating-point enabled?
  output logic 		   FRegWriteM, // FP register write enable
+  output logic 		   FpLoadM, // Fp load instruction?
  output logic 		   FStallD, // Stall the decode stage
  output logic 		   FWriteIntE, // integer register write enables
  output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
@ -348,6 +349,8 @@ module fpu (
   //          |||         |||
   //////////////////////////////////////////////////////////////////////////////////////////

+   assign FpLoadM = FResSelM[1];
+
   postprocess postprocess(.XSgnM, .ZExpM, .XManM, .YManM, .ZManM, .FrmM, .FmtM, .ProdExpM, 
                           .AddendStickyM, .KillProdM, .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, 
                           .ZInfM, .XNaNM, .YNaNM, .ZNaNM, .XSNaNM, .YSNaNM, .ZSNaNM, .SumM, 
@ -378,21 +381,7 @@ module fpu (
   //          |||         |||
   //////////////////////////////////////////////////////////////////////////////////////////

-   // put ReadData into NaN-blocking format
-   //    - if there are any unsused bits the most significant bits are filled with 1s
-   //    - for load instruction
-   generate
-      if(`FPSIZES == 1) assign ReadResW = {{`FLEN-`XLEN{1'b1}}, ReadDataW};
-      else if(`FPSIZES == 2) 
-         mux2 #(`FLEN) SrcAMux ({{`FLEN-`LEN1{1'b1}}, ReadDataW[`LEN1-1:0]}, {{`FLEN-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW);
-      else if(`FPSIZES == 3 | `FPSIZES == 4)
-         mux4 #(`FLEN) SrcAMux ({{`FLEN-`S_LEN{1'b1}}, ReadDataW[`S_LEN-1:0]}, 
-                              {{`FLEN-`D_LEN{1'b1}}, ReadDataW[`D_LEN-1:0]}, 
-                              {{`FLEN-`H_LEN{1'b1}}, ReadDataW[`H_LEN-1:0]}, 
-                              {{`FLEN-`XLEN{1'b1}}, ReadDataW}, FmtW, ReadResW); // NaN boxing zeroes
-   endgenerate
-
   // select the result to be written to the FP register
-   mux2  #(`FLEN)  FPUResultMux (FpResW, ReadResW, FResSelW[1], FPUResultW);
+   mux2  #(`FLEN)  FPUResultMux (FpResW, ReadDataW, FResSelW[1], FPUResultW);

 endmodule // fpu
--- a/pipelined/src/ieu/datapath.sv
+++ b/pipelined/src/ieu/datapath.sv
@ -64,9 +64,9 @@ module datapath (
  input  logic [2:0]       ResultSrcW,
  input logic [`XLEN-1:0]  FCvtIntResW,
  input logic [1:0]        FResSelW,
-  output logic [`XLEN-1:0] ReadDataW,
+  input logic [`XLEN-1:0] ReadDataW,
  // input  logic [`XLEN-1:0] PCLinkW,
-  input  logic [`XLEN-1:0] CSRReadValW, ReadDataM, MDUResultW, 
+  input  logic [`XLEN-1:0] CSRReadValW, MDUResultW, 
  // Hazard Unit signals 
  output logic [4:0]       Rs1D, Rs2D, Rs1E, Rs2E,
  output logic [4:0]       RdE, RdM, RdW 
@ -122,7 +122,6 @@ module datapath (
  // Writeback stage pipeline register and logic
  flopenrc #(`XLEN) IFResultWReg(clk, reset, FlushW, ~StallW, IFResultM, IFResultW);
  flopenrc #(5)     RdWReg(clk, reset, FlushW, ~StallW, RdM, RdW);
-  flopen #(`XLEN)   ReadDataWReg(clk, ~StallW, ReadDataM, ReadDataW);

  // floating point interactions: fcvt, fp stores
  if (`F_SUPPORTED) begin:fpmux
--- a/pipelined/src/ieu/ieu.sv
+++ b/pipelined/src/ieu/ieu.sv
@ -60,11 +60,11 @@ module ieu (
  output logic       InvalidateICacheM, FlushDCacheM,

  // Writeback stage
-  input logic [`XLEN-1:0]  CSRReadValW, ReadDataM, MDUResultW,
+  input logic [`XLEN-1:0]  CSRReadValW, MDUResultW,
  input logic [1:0]        FResSelW,
  input logic [`XLEN-1:0]  FCvtIntResW,
  output logic [4:0]       RdW,
-  output logic [`XLEN-1:0] ReadDataW,
+  input logic [`XLEN-1:0] ReadDataW,
  // input  logic [`XLEN-1:0] PCLinkW,
  output logic 		   InstrValidM, 
  // hazards
@ -110,7 +110,7 @@ module ieu (
    .FWriteDataE, .PCE, .PCLinkE, .FlagsE, .IEUAdrE, .ForwardedSrcAE, .ForwardedSrcBE, 
    .StallM, .FlushM, .FWriteIntM, .FIntResM, .SrcAM, .WriteDataE, .FResSelW,
    .StallW, .FlushW, .RegWriteW, .SquashSCW, .ResultSrcW, .ReadDataW, .FCvtIntResW,
-    .CSRReadValW, .ReadDataM, .MDUResultW, .Rs1D, .Rs2D, .Rs1E, .Rs2E, .RdE, .RdM, .RdW);             
+    .CSRReadValW, .MDUResultW, .Rs1D, .Rs2D, .Rs1E, .Rs2E, .RdE, .RdM, .RdW);             
  
  forward    fw(
    .Rs1D, .Rs2D, .Rs1E, .Rs2E, .RdE, .RdM, .RdW,
--- a/pipelined/src/lsu/bigendianswap.sv
+++ b/pipelined/src/lsu/bigendianswap.sv
@ -30,12 +30,32 @@

 `include "wally-config.vh"

-module bigendianswap (
+module bigendianswap #(parameter LEN=`XLEN) (
  input  logic             BigEndianM,
-  input  logic [`XLEN-1:0] a,
-  output logic [`XLEN-1:0] y); 
+  input  logic [LEN-1:0] a,
+  output logic [LEN-1:0] y); 

-  if(`XLEN == 64) begin
+  if(LEN == 128) begin
+    always_comb 
+        if (BigEndianM) begin // swap endianness
+            y[127:120] = a[7:0];
+            y[119:112] = a[15:8];
+            y[111:104] = a[23:16];
+            y[103:96]  = a[31:24];
+            y[95:88]   = a[39:32];
+            y[87:80]   = a[47:40];
+            y[79:72]   = a[55:48];
+            y[71:64]   = a[63:56];
+            y[63:56]   = a[71:64];
+            y[55:48]   = a[79:72];
+            y[47:40]   = a[87:80];
+            y[39:32]   = a[95:88];
+            y[31:24]   = a[103:96];
+            y[23:16]   = a[111:104];
+            y[15:8]    = a[119:112];
+            y[7:0]     = a[127:120];
+        end else y = a;
+  end else if(LEN == 64) begin
    always_comb 
        if (BigEndianM) begin // swap endianness
            y[63:56] = a[7:0];
--- a/pipelined/src/lsu/lsu.sv
+++ b/pipelined/src/lsu/lsu.sv
@ -51,11 +51,13 @@ module lsu (
   input logic [`XLEN-1:0]  IEUAdrE,
   (* mark_debug = "true" *)output logic [`XLEN-1:0] IEUAdrM,
   input logic [`XLEN-1:0]  WriteDataE, 
-   output logic [`XLEN-1:0] ReadDataM,
+   output logic [`LLEN-1:0] ReadDataW,
   // cpu privilege
   input logic [1:0]        PrivilegeModeW, 
   input logic              BigEndianM,
   input logic              sfencevmaM,
+   // fpu
+   input logic              FpLoadM,
   // faults
   output logic             LoadPageFaultM, StoreAmoPageFaultM,
   output logic             LoadMisalignedFaultM, LoadAccessFaultM,
@ -110,6 +112,7 @@ module lsu (
  logic [`XLEN-1:0]         LSUWriteDataM;
  logic [(`XLEN-1)/8:0]     ByteMaskM;
  logic [`XLEN-1:0]         WriteDataM;
+  logic [`LLEN-1:0]         ReadDataM;
  
  // *** TO DO: Burst mode

@ -128,7 +131,7 @@ module lsu (
      .DTLBMissM, .DTLBWriteM, .InstrDAPageFaultF, .DataDAPageFaultM, 
      .TrapM, .DCacheStallM, .SATP_REGW, .PCF,
      .STATUS_MXR, .STATUS_SUM, .STATUS_MPRV, .STATUS_MPP, .PrivilegeModeW,
-      .ReadDataM, .WriteDataM, .Funct3M, .LSUFunct3M, .Funct7M, .LSUFunct7M,
+      .ReadDataM(ReadDataM[`XLEN-1:0]), .WriteDataM, .Funct3M, .LSUFunct3M, .Funct7M, .LSUFunct7M,
      .IEUAdrExtM, .PTE, .LSUWriteDataM, .PageType, .PreLSURWM, .LSUAtomicM, .IEUAdrE,
      .LSUAdrE, .PreLSUPAdrM, .CPUBusy, .InterlockStall, .SelHPTW,
      .IgnoreRequestTLB, .IgnoreRequestTrapM);
@ -187,8 +190,8 @@ module lsu (
  //  Either Data Cache or Data Tightly Integrated Memory or just bus interface
  /////////////////////////////////////////////////////////////////////////////////////////////
  logic [`XLEN-1:0]    AMOWriteDataM, FinalWriteDataM, LittleEndianWriteDataM;
-  logic [`XLEN-1:0]    ReadDataWordM, LittleEndianReadDataWordM;
-  logic [`XLEN-1:0]    ReadDataWordMuxM;
+  logic [`LLEN-1:0]    ReadDataWordM, LittleEndianReadDataWordM;
+  logic [`LLEN-1:0]    ReadDataWordMuxM;
  logic                IgnoreRequest;
  logic                SelUncachedAdr;
  assign IgnoreRequest = IgnoreRequestTLB | IgnoreRequestTrapM;
@ -197,7 +200,7 @@ module lsu (
    // *** directly instantiate RAM or ROM here.  Instantiate SRAM1P1RW.  
    // Merge SimpleRAM and SRAM1p1rw into one that is good for synthesis and RAM libraries and flops
    dtim dtim(.clk, .reset, .CPUBusy, .LSURWM, .IEUAdrM, .IEUAdrE, .TrapM, .FinalWriteDataM, 
-              .ReadDataWordM, .BusStall, .LSUBusWrite,.LSUBusRead, .BusCommittedM,
+              .ReadDataWordM(ReadDataWordM[`XLEN-1:0]), .BusStall, .LSUBusWrite,.LSUBusRead, .BusCommittedM,
              .DCacheStallM, .DCacheCommittedM, .ByteMaskM, .Cacheable(CacheableM),
              .DCacheMiss, .DCacheAccess);
  end 
@ -222,14 +225,14 @@ module lsu (
      .SelUncachedAdr, .IgnoreRequest, .LSURWM, .CPUBusy, .CacheableM,
      .BusStall, .BusCommittedM);

-    mux2 #(`XLEN) UnCachedDataMux(.d0(LittleEndianReadDataWordM), .d1(DCacheBusWriteData[`XLEN-1:0]),
+    mux2 #(`LLEN) UnCachedDataMux(.d0(LittleEndianReadDataWordM), .d1({{`LLEN-`XLEN{1'b0}}, DCacheBusWriteData[`XLEN-1:0]}),
      .s(SelUncachedAdr), .y(ReadDataWordMuxM));
-    mux2 #(`XLEN) LsuBushwdataMux(.d0(ReadDataWordM), .d1(FinalWriteDataM),
+    mux2 #(`XLEN) LsuBushwdataMux(.d0(ReadDataWordM[`XLEN-1:0]), .d1(FinalWriteDataM),
      .s(SelUncachedAdr), .y(LSUBusHWDATA));
    
    if(CACHE_ENABLED) begin : dcache
      cache #(.LINELEN(`DCACHE_LINELENINBITS), .NUMLINES(`DCACHE_WAYSIZEINBYTES*8/LINELEN),
-              .NUMWAYS(`DCACHE_NUMWAYS), .LOGWPL(LOGWPL), .WORDLEN(`XLEN), .MUXINTERVAL(`XLEN), .DCACHE(1)) dcache(
+              .NUMWAYS(`DCACHE_NUMWAYS), .LOGWPL(LOGWPL), .WORDLEN(`LLEN), .MUXINTERVAL(`XLEN), .DCACHE(1)) dcache(
        .clk, .reset, .CPUBusy, .LSUBusWriteCrit, .RW(LSURWM), .Atomic(LSUAtomicM),
        .FlushCache(FlushDCacheM), .NextAdr(LSUAdrE), .PAdr(LSUPAdrM), 
        .ByteMask(ByteMaskM), .WordCount,
@ -253,7 +256,7 @@ module lsu (
  // Atomic operations
  /////////////////////////////////////////////////////////////////////////////////////////////
  if (`A_SUPPORTED) begin:atomic
-    atomic atomic(.clk, .reset, .StallW, .ReadDataM, .LSUWriteDataM, .LSUPAdrM, 
+    atomic atomic(.clk, .reset, .StallW, .ReadDataM(ReadDataM[`XLEN-1:0]), .LSUWriteDataM, .LSUPAdrM, 
      .LSUFunct7M, .LSUFunct3M, .LSUAtomicM, .PreLSURWM, .IgnoreRequest, 
      .AMOWriteDataM, .SquashSCW, .LSURWM);
  end else begin:lrsc
@ -266,7 +269,13 @@ module lsu (
  subwordwrite subwordwrite(.LSUPAdrM(LSUPAdrM[2:0]),
    .LSUFunct3M, .AMOWriteDataM, .LittleEndianWriteDataM, .ByteMaskM);
  subwordread subwordread(.ReadDataWordMuxM, .LSUPAdrM(LSUPAdrM[2:0]),
-		.Funct3M(LSUFunct3M), .ReadDataM);
+		.FpLoadM, .Funct3M(LSUFunct3M), .ReadDataM);
+
+  /////////////////////////////////////////////////////////////////////////////////////////////
+  // MW Pipeline Register
+  /////////////////////////////////////////////////////////////////////////////////////////////
+
+  flopen #(`LLEN) ReadDataMWReg(clk, ~StallW, ReadDataM, ReadDataW);

  /////////////////////////////////////////////////////////////////////////////////////////////
  // Big Endian Byte Swapper
@ -274,8 +283,8 @@ module lsu (
  //  swap the bytes when read from big-endian memory
  /////////////////////////////////////////////////////////////////////////////////////////////
  if (`BIGENDIAN_SUPPORTED) begin:endian
-    bigendianswap storeswap(.BigEndianM, .a(LittleEndianWriteDataM), .y(FinalWriteDataM));
-    bigendianswap loadswap(.BigEndianM, .a(ReadDataWordM), .y(LittleEndianReadDataWordM));
+    bigendianswap #(`XLEN) storeswap(.BigEndianM, .a(LittleEndianWriteDataM), .y(FinalWriteDataM));
+    bigendianswap #(`LLEN) loadswap(.BigEndianM, .a(ReadDataWordM), .y(LittleEndianReadDataWordM));
  end else begin
    assign FinalWriteDataM = LittleEndianWriteDataM;
    assign LittleEndianReadDataWordM = ReadDataWordM;
--- a/pipelined/src/lsu/subwordread.sv
+++ b/pipelined/src/lsu/subwordread.sv
@ -32,10 +32,11 @@

 module subwordread 
  (
-   input logic [`XLEN-1:0] 	ReadDataWordMuxM,
+   input logic [`LLEN-1:0] 	ReadDataWordMuxM,
   input logic [2:0] 		LSUPAdrM,
   input logic [2:0] 		Funct3M,
-   output logic [`XLEN-1:0] ReadDataM
+   input logic          FpLoadM, 
+   output logic [`LLEN-1:0] ReadDataM
   );

  logic [7:0] 				ByteM; 
@ -74,18 +75,31 @@ module subwordread
        1'b1: WordM = ReadDataWordMuxM[63:32];
      endcase

-    // sign extension
+    logic [63:0] DblWordM;
+    assign DblWordM = ReadDataWordMuxM[63:0];
+
+    // sign extension/ NaN boxing
    always_comb
    case(Funct3M)
-      3'b000:  ReadDataM = {{56{ByteM[7]}}, ByteM};                  // lb
-      3'b001:  ReadDataM = {{48{HalfwordM[15]}}, HalfwordM[15:0]};   // lh 
-      3'b010:  ReadDataM = {{32{WordM[31]}}, WordM[31:0]};           // lw
-      3'b011:  ReadDataM = ReadDataWordMuxM;                         // ld
-      3'b100:  ReadDataM = {56'b0, ByteM[7:0]};                      // lbu
-      3'b101:  ReadDataM = {48'b0, HalfwordM[15:0]};                 // lhu
-      3'b110:  ReadDataM = {32'b0, WordM[31:0]};                     // lwu
+      3'b000:  ReadDataM = {{`LLEN-8{ByteM[7]}}, ByteM};                              // lb
+      3'b001:  if(`ZFH_SUPPORTED) 
+                    ReadDataM = {{`LLEN-16{HalfwordM[15]|FpLoadM}}, HalfwordM[15:0]}; // lh/flh
+               else ReadDataM = {{`LLEN-16{HalfwordM[15]}}, HalfwordM[15:0]};         // lh 
+      3'b010:  if(`F_SUPPORTED) 
+                    ReadDataM = {{`LLEN-32{WordM[31]|FpLoadM}}, WordM[31:0]};         // lw/flw
+               else ReadDataM = {{`LLEN-32{WordM[31]}}, WordM[31:0]};                 // lw
+      3'b011:  if(`D_SUPPORTED) 
+                    ReadDataM = {{`LLEN-64{DblWordM[63]|FpLoadM}}, DblWordM[63:0]};   // ld/fld
+               else ReadDataM = {{`LLEN-64{DblWordM[63]}}, DblWordM[63:0]};           // ld
+      3'b100:    if(`Q_SUPPORTED) 
+                    ReadDataM = FpLoadM ? ReadDataWordMuxM : {{`LLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq
+                 else 
+                    ReadDataM = {{`LLEN-8{1'b0}}, ByteM[7:0]};    // lbu
+      3'b101:  ReadDataM = {{`LLEN-16{1'b0}}, HalfwordM[15:0]};   // lhu
+      3'b110:  ReadDataM = {{`LLEN-32{1'b0}}, WordM[31:0]};       // lwu
      default: ReadDataM = ReadDataWordMuxM; // Shouldn't happen
    endcase
+
  end else begin:swrmux // 32-bit
    // byte mux
    always_comb
@ -105,13 +119,18 @@ module subwordread

    // sign extension
    always_comb
-    case(Funct3M) 
-      3'b000:  ReadDataM = {{24{ByteM[7]}}, ByteM};                  // lb
-      3'b001:  ReadDataM = {{16{HalfwordM[15]}}, HalfwordM[15:0]};   // lh 
-      3'b010:  ReadDataM = ReadDataWordMuxM;                                   // lw
-      3'b100:  ReadDataM = {24'b0, ByteM[7:0]};                      // lbu
-      3'b101:  ReadDataM = {16'b0, HalfwordM[15:0]};                 // lhu
-      default: ReadDataM = ReadDataWordMuxM;
+    case(Funct3M)
+      3'b000:  ReadDataM = {{`LLEN-8{ByteM[7]}}, ByteM};                              // lb
+      3'b001:  if(`ZFH_SUPPORTED) 
+                    ReadDataM = {{`LLEN-16{HalfwordM[15]|FpLoadM}}, HalfwordM[15:0]}; // lh/flh
+               else ReadDataM = {{`LLEN-16{HalfwordM[15]}}, HalfwordM[15:0]};         // lh 
+      3'b010:  if(`F_SUPPORTED) 
+                    ReadDataM = {{`LLEN-32{ReadDataWordMuxM[31]|FpLoadM}}, ReadDataWordMuxM[31:0]};         // lw/flw
+               else ReadDataM = {{`LLEN-32{ReadDataWordMuxM[31]}}, ReadDataWordMuxM[31:0]};                 // lw
+      3'b011:  ReadDataM = ReadDataWordMuxM;                      // fld
+      3'b100:  ReadDataM = {{`LLEN-8{1'b0}}, ByteM[7:0]};         // lbu
+      3'b101:  ReadDataM = {{`LLEN-16{1'b0}}, HalfwordM[15:0]};   // lhu
+      default: ReadDataM = ReadDataWordMuxM; // Shouldn't happen
    endcase
  end
 endmodule
--- a/pipelined/src/wally/wallypipelinedcore.sv
+++ b/pipelined/src/wally/wallypipelinedcore.sv
@ -98,6 +98,7 @@ module wallypipelinedcore (
  logic             IllegalFPUInstrD, IllegalFPUInstrE;
  logic             FRegWriteM;
  logic             FPUStallD;
+  logic             FpLoadM;
  logic [1:0]       FResSelW;
  logic [4:0]             SetFflagsM;

@ -128,8 +129,7 @@ module wallypipelinedcore (
  logic [`XLEN-1:0] IEUAdrE;
  (* mark_debug = "true" *) logic [`XLEN-1:0] WriteDataE;
  (* mark_debug = "true" *) logic [`XLEN-1:0] IEUAdrM;  
-  (* mark_debug = "true" *) logic [`XLEN-1:0] ReadDataM;
-  logic [`XLEN-1:0] ReadDataW;  
+  logic [`LLEN-1:0] ReadDataW;  
  logic             CommittedM;

  // AHB ifu interface
@ -229,8 +229,8 @@ module wallypipelinedcore (
     .RdM, .FIntResM, .InvalidateICacheM, .FlushDCacheM,

     // Writeback stage
-     .CSRReadValW, .ReadDataM, .MDUResultW,
-     .RdW, .ReadDataW,
+     .CSRReadValW, .MDUResultW,
+     .RdW, .ReadDataW(ReadDataW[`XLEN-1:0]),
     .InstrValidM, 
     .FCvtIntResW,
     .FResSelW,
@ -253,9 +253,10 @@ module wallypipelinedcore (
  .AtomicM, .TrapM,
  .CommittedM, .DCacheMiss, .DCacheAccess,
  .SquashSCW,            
+  .FpLoadM,
  //.DataMisalignedM(DataMisalignedM),
  .IEUAdrE, .IEUAdrM, .WriteDataE,
-  .ReadDataM, .FlushDCacheM,
+  .ReadDataW, .FlushDCacheM,
  // connected to ahb (all stay the same)
  .LSUBusAdr, .LSUBusRead, .LSUBusWrite, .LSUBusAck, .LSUBusInit,
  .LSUBusHRDATA, .LSUBusHWDATA, .LSUBusSize, .LSUBurstType, .LSUTransType, .LSUTransComplete,
@ -383,13 +384,14 @@ module wallypipelinedcore (
         .clk, .reset,
         .FRM_REGW, // Rounding mode from CSR
         .InstrD, // instruction from IFU
-         .ReadDataW,// Read data from memory
+         .ReadDataW(ReadDataW[`FLEN-1:0]),// Read data from memory
         .ForwardedSrcAE, // Integer input being processed (from IEU)
         .StallE, .StallM, .StallW, // stall signals from HZU
         .FlushE, .FlushM, .FlushW, // flush signals from HZU
         .RdM, .RdW, // which FP register to write to (from IEU)
         .STATUS_FS, // is floating-point enabled?
         .FRegWriteM, // FP register write enable
+         .FpLoadM,
         .FStallD, // Stall the decode stage
         .FWriteIntE, // integer register write enable
         .FWriteDataE, // Data to be written to memory
--- a/pipelined/srt/Makefile
+++ b/pipelined/srt/Makefile
@ -1,4 +1,4 @@
-all: exptestgen testgen qslc_r4a2
+all: exptestgen testgen qslc_r4a2 qslc_r4a2b

 sqrttestgen: sqrttestgen.c
 	gcc sqrttestgen.c -o sqrttestgen -lm
@ -15,5 +15,9 @@ qslc_r4a2: qslc_r4a2.c
 	gcc qslc_r4a2.c -o qslc_r4a2 -lm
 	./qslc_r4a2 > qslc_r4a2.sv

+# Compile the qslc_r4a2b table generator from its C source, then run it and
+# redirect its stdout into qslc_r4a2b.tv.
+qslc_r4a2b: qslc_r4a2b.c
+	gcc qslc_r4a2b.c -o qslc_r4a2b -lm
+	./qslc_r4a2b > qslc_r4a2b.tv
+
 clean:
 	rm -f testgen exptestgen qslc_r4a2 
--- a/pipelined/srt/lint-srt
+++ b/pipelined/srt/lint-srt
@ -1 +1,2 @@
 verilator --lint-only --top-module srt srt.sv -I../config/rv64gc -I../config/shared ../src/generic/*.sv ../src/generic/flop/*.sv
+verilator --lint-only --top-module srtradix4 srt-radix4.sv qsel4.sv -I../config/rv64gc -I../config/shared ../src/generic/*.sv ../src/generic/flop/*.sv
--- a/pipelined/srt/qsel4.sv
+++ b/pipelined/srt/qsel4.sv
--- a/pipelined/srt/qslc_r4a2b
+++ b/pipelined/srt/qslc_r4a2b
--- a/pipelined/srt/qslc_r4a2b.c
+++ b/pipelined/srt/qslc_r4a2b.c
@ -0,0 +1,190 @@
+/*
+  Program:      qslc_r4a2b.c
+  Description:  Prints out Quotient Selection Table (assumes CPA is utilized to reduce memory)
+  User:         James E. Stine
+
+*/
+
+#include <stdio.h>
+#include <math.h>
+
+/* Field widths, in bits.  Only DIVISOR_SIZE and TOT_SIZE are referenced in
+   this file; CARRY_SIZE and SUM_SIZE are not used here. */
+#define DIVISOR_SIZE 3
+#define CARRY_SIZE 7
+#define SUM_SIZE 7
+#define TOT_SIZE 7
+
+void disp_binary(double, int, int);
+
+/* Index into the quotient selection table: a DIVISOR_SIZE-bit unsigned
+   divisor field and a TOT_SIZE-bit tot field.
+   NOTE(review): tot is a plain-int bit-field; main() compares it against
+   negative thresholds, so it is assumed to behave as signed - confirm for
+   the compiler in use. */
+struct bits {
+  unsigned int divisor : DIVISOR_SIZE;
+  int tot : TOT_SIZE;
+} pla;
+
+/*
+
+   Function:      disp_binary
+   Description:   Prints x to stdout as a string of '0'/'1' characters,
+                  most significant bit first, with no radix point and no
+                  newline.  bits_to_left + bits_to_right characters are
+                  printed.  A negative x is printed in two's complement
+                  form by adding 2^bits_to_left before conversion.  If
+                  |x| is smaller than 2^-bits_to_right, all zeros are
+                  printed.
+   Argument List: double x            The value to be converted
+                  int bits_to_left    Number of bits left of radix point
+                  int bits_to_right   Number of bits right of radix point
+   Return value:  none
+
+*/
+void disp_binary(double x, int bits_to_left, int bits_to_right) {
+  int i;
+  double weight;
+
+  /* Magnitude below the smallest representable bit: print all zeros. */
+  if (fabs(x) < pow(2.0, (double) -bits_to_right)) {
+    for (i = -bits_to_left + 1; i <= bits_to_right; i++)
+      printf("0");
+    return;
+  }
+
+  /* Map a negative value onto its two's complement bit pattern. */
+  if (x < 0.0)
+    x = pow(2.0, (double) bits_to_left) + x;
+
+  /* Peel off one bit per step, from weight 2^(bits_to_left-1) down to
+     weight 2^-bits_to_right. */
+  for (i = -bits_to_left + 1; i <= bits_to_right; i++) {
+    weight = pow(2.0, (double) -i);
+    if (x < weight)
+      printf("0");
+    else {
+      printf("1");
+      x -= weight;
+    }
+  }
+
+}
+
+/*
+  Prints the radix 4 (a=2) quotient selection table to stdout, one entry
+  per line.  Entries are ordered by divisor index (0 .. 2^DIVISOR_SIZE-1)
+  and, within each divisor, by the TOT_SIZE-bit two's complement bit
+  pattern of tot (0 .. 2^(TOT_SIZE-1)-1, then -2^(TOT_SIZE-1) .. -1).
+
+  Each entry is one hex character encoding the selected digit:
+    8 (1000) = +2
+    4 (0100) = +1
+    0 (0000) =  0
+    2 (0010) = -1
+    1 (0001) = -2
+*/
+int main() {
+  /* lower_bound[d][k] is the smallest tot for which digit k is selected
+     with divisor index d; columns are the digits +2, +1, 0, -1.  Any tot
+     below the last column selects -2. */
+  static const int lower_bound[1 << DIVISOR_SIZE][4] = {
+    /*  +2   +1    0   -1 */
+    {   12,   4,  -4, -13 },
+    {   14,   4,  -6, -15 },
+    {   15,   4,  -6, -16 },
+    {   16,   4,  -6, -18 },
+    {   18,   6,  -8, -20 },
+    {   20,   6,  -8, -20 },
+    {   20,   8,  -8, -22 },
+    {   24,   8,  -8, -24 }
+  };
+  /* Output character per digit, in the order +2, +1, 0, -1, -2. */
+  static const char *const digit_code[5] = { "8", "4", "0", "2", "1" };
+  const int num_divisor = 1 << DIVISOR_SIZE;
+  const int num_tot = 1 << TOT_SIZE;
+  int divisor, m, tot, k;
+
+  for (divisor = 0; divisor < num_divisor; divisor++) {
+    for (m = 0; m < num_tot; m++) {
+      /* Interpret the table index m as a TOT_SIZE-bit signed value
+         explicitly instead of relying on bit-field wraparound. */
+      tot = (m < num_tot / 2) ? m : m - num_tot;
+
+      /* First column whose lower bound tot reaches picks the digit. */
+      for (k = 0; k < 4 && tot < lower_bound[divisor][k]; k++)
+        ;
+      printf("%s\n", digit_code[k]);
+    }
+  }
+
+  return 0;
+}
--- a/pipelined/srt/qslc_r4a2b.tv
+++ b/pipelined/srt/qslc_r4a2b.tv
--- a/pipelined/srt/sim-srt4
+++ b/pipelined/srt/sim-srt4
@ -0,0 +1,2 @@
+vsim -do "do srt-radix4.do"
+
--- a/pipelined/srt/sim-srt4-batch
+++ b/pipelined/srt/sim-srt4-batch
@ -0,0 +1 @@
+vsim -c -do "do srt-radix4.do"
--- a/pipelined/srt/srt-radix4.do
+++ b/pipelined/srt/srt-radix4.do
@ -0,0 +1,31 @@
+# srt-radix4.do
+#
+# David_Harris@hmc.edu 19 October 2021
+
+# Use this srt-radix4.do file to run the radix-4 SRT divider testbench.
+# Either bring up ModelSim and type the following at the "ModelSim>" prompt:
+#     do srt-radix4.do
+# or, to run from a shell, type the following at the shell prompt:
+#     vsim -do srt-radix4.do -c
+# (omit the "-c" to see the GUI while running from the shell)
+
+onbreak {resume}
+
+# create library
+if [file exists work] {
+    vdel -all
+}
+vlib work
+
+vlog +incdir+../config/rv64gc +incdir+../config/shared srt-radix4.sv testbench-radix4.sv qsel4.sv ../src/generic/flop/flop*.sv ../src/generic/mux.sv ../src/generic/lzc.sv
+vopt +acc work.testbenchradix4 -o workopt 
+vsim workopt
+
+# add the testbench, divider, quotient-selection and converter signals to the wave window
+add wave /testbenchradix4/*
+add wave /testbenchradix4/srtradix4/*
+add wave /testbenchradix4/srtradix4/qsel4/*
+add wave /testbenchradix4/srtradix4/otfc4/*
+
+# Run the simulation
+run -all
--- a/pipelined/srt/srt-radix4.sv
+++ b/pipelined/srt/srt-radix4.sv
@ -0,0 +1,323 @@
+///////////////////////////////////////////
+// srt-radix4.sv
+//
+// Written: David_Harris@hmc.edu 13 January 2022
+// Modified: 
+//
+// Purpose: Combined Divide and Square Root Floating Point and Integer Unit
+// 
+// A component of the Wally configurable RISC-V project.
+// 
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// MIT LICENSE
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this 
+// software and associated documentation files (the "Software"), to deal in the Software 
+// without restriction, including without limitation the rights to use, copy, modify, merge, 
+// publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons 
+// to whom the Software is furnished to do so, subject to the following conditions:
+//
+//   The above copyright notice and this permission notice shall be included in all copies or 
+//   substantial portions of the Software.
+//
+//   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 
+//   INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR 
+//   PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 
+//   BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+//   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 
+//   OR OTHER DEALINGS IN THE SOFTWARE.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+`include "wally-config.vh"
+
+`define DIVLEN ((`NF<(`XLEN)) ? (`XLEN) : `NF)
+
+// Radix-4 SRT divider datapath: carry-save partial remainder registers (WS/WC),
+// quotient digit selection (qsel4), selection of the divisor multiple to add,
+// and on-the-fly conversion of the quotient digits (otfc4).
+// NOTE(review): Stall, Flush, Rem and Flags are declared but never referenced or
+// driven in this module body, and intExp/intSign from srtpreproc are unused.
+module srtradix4 (
+  input  logic clk,
+  input  logic Start, 
+  input  logic Stall, // *** multiple pipe stages
+  input  logic Flush, // *** multiple pipe stages
+  // Floating Point Inputs
+  // later add exponents, signs, special cases
+  input  logic       XSign, YSign,
+  input  logic [`NE-1:0] XExp, YExp,
+  input  logic [`NF-1:0] XFrac, YFrac,
+  input  logic [`XLEN-1:0] SrcA, SrcB,
+  input  logic [1:0] Fmt, // Floats: 00 = 16 bit, 01 = 32 bit, 10 = 64 bit, 11 = 128 bit
+  input  logic       W64, // 32-bit ints on XLEN=64
+  input  logic       Signed, // Interpret integers as signed 2's complement
+  input  logic       Int, // Choose integer inputs
+  input  logic       Sqrt, // perform square root, not divide
+  output logic       rsign,
+  output logic [`DIVLEN-1:0] Quot, Rem, // *** later handle integers
+  output logic [`NE-1:0] rExp,
+  output logic [3:0] Flags
+);
+
+  // logic           qp, qz, qm; // quotient is +1, 0, or -1
+  logic [3:0]     q;        // selected quotient digit, encoding listed below
+  logic [`NE-1:0] calcExp;
+  logic           calcSign;
+  logic [`DIVLEN-1:0]  X, Dpreproc;
+  logic [`DIVLEN+3:0]  WS, WSA, WSN;
+  logic [`DIVLEN+3:0]  WC, WCA, WCN;
+  logic [`DIVLEN+3:0]  D, DBar, D2, DBar2, Dsel;
+  logic [$clog2(`XLEN+1)-1:0] intExp;
+  logic           intSign;
+ 
+  srtpreproc preproc(SrcA, SrcB, XFrac, YFrac, Fmt, W64, Signed, Int, Sqrt, X, Dpreproc, intExp, intSign);
+
+  // Top Muxes and Registers
+  // When start is asserted, the inputs are loaded into the divider.
+  // Otherwise, the divisor is retained and the partial remainder
+  // is fed back for the next iteration.
+  //  - an assumed leading one is prepended here ({4'b0001, ...}) since all
+  //    numbers are normalized
+  //    *** wait what about zero? is that special case? can the divider handle it?
+  //  - when the start signal is asserted X and 0 are loaded into WS and WC
+  //  - otherwise load WSA/WCA shifted left by two bits (one radix-4 digit)
+  //    into the flipflops
+  //  *** what does N and A stand for?
+  mux2   #(`DIVLEN+4) wsmux({WSA[`DIVLEN+1:0], 2'b0}, {4'b0001, X}, Start, WSN);
+  flop   #(`DIVLEN+4) wsflop(clk, WSN, WS);
+  mux2   #(`DIVLEN+4) wcmux({WCA[`DIVLEN+1:0], 2'b0}, {`DIVLEN+4{1'b0}}, Start, WCN);
+  flop   #(`DIVLEN+4) wcflop(clk, WCN, WC);
+  flopen #(`DIVLEN+4) dflop(clk, Start, {4'b0001, Dpreproc}, D);
+
+  // Quotient Selection logic
+  // Given the divisor and the carry-save partial remainder, select a
+  // quotient digit in {-2, -1, 0, +1, +2}
+  // q encoding:
+  // 1000 = +2
+  // 0100 = +1
+  // 0000 =  0
+  // 0010 = -1
+  // 0001 = -2
+  qsel4 qsel4(.D, .WS, .WC, .q);
+
+  // Store the exponent and sign until division is done
+  flopen #(`NE) expflop(clk, Start, calcExp, rExp);
+  flopen #(1) signflop(clk, Start, calcSign, rsign);
+
+  // Divisor Selection logic
+  // - choose the negative version of what's being selected (Dsel = -q*D)
+  // - DBar and DBar2 are the bitwise complements of D and 2*D; the csa
+  //   carry-in below (|q[3:2]) supplies the +1 that completes the
+  //   two's complement negation for the positive digits
+  assign DBar = ~D;
+  assign DBar2 = {~D[`DIVLEN+2:0], 1'b1};
+  assign D2 = {D[`DIVLEN+2:0], 1'b0};
+
+  always_comb
+    case (q)
+      4'b1000: Dsel = DBar2;                 // +2: add -2D
+      4'b0100: Dsel = DBar;                  // +1: add -D
+      4'b0000: Dsel = {(`DIVLEN+4){1'b0}};   //  0: add nothing
+      4'b0010: Dsel = D;                     // -1: add +D
+      4'b0001: Dsel = D2;                    // -2: add +2D
+      default: Dsel = {`DIVLEN+4{1'bx}};     // any other q value is not a valid digit
+    endcase
+
+  // Partial Product Generation
+  //  WSA, WCA = WS + WC - qD
+  csa    #(`DIVLEN+4) csa(WS, WC, Dsel, |q[3:2], WSA, WCA);
+  
+  // Accumulate the quotient digits into Quot
+  otfc4  #(`DIVLEN) otfc4(clk, Start, q, Quot);
+
+  expcalc expcalc(.XExp, .YExp, .calcExp);
+
+  signcalc signcalc(.XSign, .YSign, .calcSign);
+
+endmodule
+
+////////////////
+// Submodules //
+////////////////
+
+///////////////////
+// Preprocessing //
+///////////////////
+// Selects and aligns the dividend (X) and divisor (D) for the divider.
+//  - Integer path (Int = 1): takes the magnitude of SrcA/SrcB when Signed is
+//    set and the operand's top bit is 1, pads to DIVLEN bits, and shifts left
+//    by the lzc counts (dividend by zeroCntA, divisor by zeroCntB + 1).
+//  - Floating-point path (Int = 0): passes the fractions left-aligned in
+//    DIVLEN bits.
+// NOTE(review): Fmt, W64 and Sqrt are not referenced in this module body.
+module srtpreproc (
+  input  logic [`XLEN-1:0] SrcA, SrcB,
+  input  logic [`NF-1:0] XFrac, YFrac,
+  input  logic [1:0] Fmt, // Floats: 00 = 16 bit, 01 = 32 bit, 10 = 64 bit, 11 = 128 bit
+  input  logic       W64, // 32-bit ints on XLEN=64
+  input  logic       Signed, // Interpret integers as signed 2's complement
+  input  logic       Int, // Choose integer inputs
+  input  logic       Sqrt, // perform square root, not divide
+  output logic [`DIVLEN-1:0] X, D,
+  output logic [$clog2(`XLEN+1)-1:0] intExp, // Quotient integer exponent
+  output logic       intSign // Quotient integer sign
+);
+
+  logic  [$clog2(`XLEN+1)-1:0] zeroCntA, zeroCntB;
+  logic  [`XLEN-1:0] PosA, PosB;
+  logic  [`DIVLEN-1:0] ExtraA, ExtraB, PreprocA, PreprocB, PreprocX, PreprocY;
+
+  // Magnitudes of the integer operands (negate when signed and negative)
+  assign PosA = (Signed & SrcA[`XLEN - 1]) ? -SrcA : SrcA;
+  assign PosB = (Signed & SrcB[`XLEN - 1]) ? -SrcB : SrcB;
+
+  // lzc is defined outside this file - presumably a leading-zero counter
+  lzc #(`XLEN) lzcA (PosA, zeroCntA);
+  lzc #(`XLEN) lzcB (PosB, zeroCntB);
+
+  // Left-align the integer magnitudes in DIVLEN bits
+  assign ExtraA = {PosA, {`DIVLEN-`XLEN{1'b0}}};
+  assign ExtraB = {PosB, {`DIVLEN-`XLEN{1'b0}}};
+
+  // Note the divisor is shifted one position further than the dividend
+  assign PreprocA = ExtraA << zeroCntA;
+  assign PreprocB = ExtraB << (zeroCntB + 1);
+  // Left-align the floating-point fractions in DIVLEN bits
+  assign PreprocX = {XFrac, {`DIVLEN-`NF{1'b0}}};
+  assign PreprocY = {YFrac, {`DIVLEN-`NF{1'b0}}};
+
+  
+  assign X = Int ? PreprocA : PreprocX;
+  assign D = Int ? PreprocB : PreprocY;
+  assign intExp = zeroCntB - zeroCntA + 1;
+  // Integer quotient is negative when signed operands have opposite signs
+  assign intSign = Signed & (SrcA[`XLEN - 1] ^ SrcB[`XLEN - 1]);
+endmodule
+
+/////////////////////////////////
+// Quotient Selection, Radix 2 //
+/////////////////////////////////
+// Radix-2 quotient digit selection from the top four bits of two vectors
+// ps and pc (presumably the sum and carry words of the carry-save partial
+// remainder - confirm against the instantiating module).
+// Exactly one of qp, qz, qm is asserted.
+// NOTE(review): this module is not instantiated by srtradix4 in this file.
+module qsel2 ( // *** eventually just change to 4 bits
+  input  logic [`DIVLEN+3:`DIVLEN] ps, pc, 
+  output logic         qp, qz, qm
+);
+ 
+  logic [`DIVLEN+3:`DIVLEN]  p, g;
+  logic          magnitude, sign, cout;
+
+  // The quotient selection logic is presented for simplicity, not
+  // for efficiency.  You can probably optimize your logic to
+  // select the proper divisor with less delay.
+
+  // Quotient equations from EE371 lecture notes 13-20
+  // Bitwise propagate and generate of the two input vectors
+  assign p = ps ^ pc;
+  assign g = ps & pc;
+
+  // magnitude: low three propagate bits are not all ones
+  // cout:      carry out of the low three bit positions
+  // sign:      top sum bit of ps + pc over these four bits
+  assign #1 magnitude = ~(&p[`DIVLEN+2:`DIVLEN]);
+  assign #1 cout = g[`DIVLEN+2] | (p[`DIVLEN+2] & (g[`DIVLEN+1] | p[`DIVLEN+1] & g[`DIVLEN]));
+  assign #1 sign = p[`DIVLEN+3] ^ cout;
+/*  assign #1 magnitude = ~((ps[54]^pc[54]) & (ps[53]^pc[53]) & 
+			  (ps[52]^pc[52]));
+  assign #1 sign = (ps[55]^pc[55])^
+      (ps[54] & pc[54] | ((ps[54]^pc[54]) &
+			    (ps[53]&pc[53] | ((ps[53]^pc[53]) &
+						(ps[52]&pc[52]))))); */
+
+  // Produce quotient = +1, 0, or -1
+  assign #1 qp = magnitude & ~sign;
+  assign #1 qz = ~magnitude;
+  assign #1 qm = magnitude & sign;
+endmodule
+
+
+///////////////////////////////////
+// On-The-Fly Converter, Radix 4 //
+///////////////////////////////////
+module otfc4 #(parameter N=65) (
+  input  logic         clk,
+  input  logic         Start,
+  input  logic [3:0]   q,
+  output logic [N-1:0] r
+);
+
+  //  The on-the-fly converter transfers the quotient 
+  //  digits to the quotient as they come. Each cycle the
+  //  selected radix-4 digit appends two bits to Q and QM.
+  //
+  //  q uses the encoding 1000 = +2, 0100 = +1, 0000 = 0,
+  //  0010 = -1, 0001 = -2.
+  //
+  //  QM is Q-1. It allows us to write negative digits 
+  //  without using a costly CPA. 
+  logic [N+2:0] Q, QM, QNext, QMNext, QMux, QMMux;
+  //  QR and QMR are the low N+1 bits of Q and QM. Appending
+  //  two new bits below them forms the shifted (N+3)-bit next
+  //  value and discards the two most significant bits of Q and QM. 
+  logic [N:0] QR, QMR;
+  // if starting a new division set Q to 0 and QM to -1
+  mux2 #(N+3) Qmux(QNext, {N+3{1'b0}}, Start, QMux);
+  mux2 #(N+3) QMmux(QMNext, {N+3{1'b1}}, Start, QMMux);
+  flop #(N+3) Qreg(clk, QMux, Q);
+  flop #(N+3) QMreg(clk, QMMux, QM);
+
+  // shift Q (quotient) and QM (quotient-1)
+  // if      q = 2   Q = {Q, 10}   QM = {Q, 01}
+  // else if q = 1   Q = {Q, 01}   QM = {Q, 00}
+  // else if q = 0   Q = {Q, 00}   QM = {QM, 11}
+  // else if q = -1  Q = {QM, 11}  QM = {QM, 10}
+  // else if q = -2  Q = {QM, 10}  QM = {QM, 01}
+  // *** how does the 0 concatenation numbers work?
+
+
+
+  always_comb begin
+    QR  = Q[N:0];
+    QMR = QM[N:0];     // Shift Q and QM
+    if (q[3]) begin // +2
+      QNext  = {QR,  2'b10};
+      QMNext = {QR,  2'b01};
+    end else if (q[2]) begin // +1
+      QNext  = {QR,  2'b01};
+      QMNext = {QR,  2'b00};
+    end else if (q[1]) begin // -1
+      QNext  = {QMR,  2'b11};
+      QMNext = {QMR,  2'b10};
+    end else if (q[0]) begin // -2
+      QNext  = {QMR,  2'b10};
+      QMNext = {QMR,  2'b01};
+    end else begin           // 0
+      QNext  = {QR,  2'b00};
+      QMNext = {QMR, 2'b11};
+    end 
+  end
+  // Pick the N-bit result window according to the top bit of Q
+  assign r = Q[N+2] ? Q[N+1:2] : Q[N:1];
+
+endmodule
+
+
+
+/////////
+// csa //
+/////////
+// N-bit 3:2 carry-save adder with a carry-in inserted at the LSB of the
+// carry word.
+//   in1, in2, in3 : operands
+//   cin           : carry-in, placed in bit 0 of out2
+//   out1          : bitwise sum word
+//   out2          : carry word, already shifted left by one bit
+module csa #(parameter N=69) (
+  input  logic [N-1:0] in1, in2, in3, 
+  input  logic         cin, 
+  output logic [N-1:0] out1, out2
+);
+
+  // This block adds in1, in2, in3, and cin to produce 
+  // a result out1 / out2 in carry-save redundant form.
+  // cin is just added to the least significant bit and
+  // is required to handle adding a negative divisor.
+  // Fortunately, the carry (out2) is shifted left by one
+  // bit, leaving room in the least significant bit to 
+  // insert cin.
+  // The carry out of the most significant bit position is
+  // not kept: only bits N-2:0 feed the carry word.
+
+  assign #1 out1 = in1 ^ in2 ^ in3;
+  assign #1 out2 = {in1[N-2:0] & (in2[N-2:0] | in3[N-2:0]) | 
+		    (in2[N-2:0] & in3[N-2:0]), cin};
+endmodule
+
+
+//////////////
+// expcalc  //
+//////////////
+// Computes the result exponent as XExp - YExp + BIAS, truncated to NE bits.
+module expcalc(
+  input logic  [`NE-1:0] XExp, YExp,
+  output logic [`NE-1:0] calcExp
+);
+
+  // `BIAS is cast to NE bits before being added
+  assign calcExp = XExp - YExp + (`NE)'(`BIAS);
+
+endmodule
+
+//////////////
+// signcalc //
+//////////////
+// Computes the result sign: set when exactly one of the operand signs is set.
+module signcalc(
+  input logic  XSign, YSign,
+  output logic calcSign
+);
+
+  assign calcSign = XSign ^ YSign;
+
+endmodule
--- a/pipelined/srt/srt-waves.do
+++ b/pipelined/srt/srt-waves.do
@ -1 +1,3 @@
-add wave -noupdate /testbench/clk
+add wave -noupdate /testbench/*
+add wave -noupdate /testbench/srt/*
+add wave -noupdate /testbench/srt/otfc2/*
--- a/pipelined/srt/stine/Makefile
+++ b/pipelined/srt/stine/Makefile
@ -1,17 +1,26 @@

-CC     = gcc
-CFLAGS = -lm
-LIBS   = 
-OBJS   = disp.o srt4div.o
+CC      = gcc
+CFLAGS  = -lm
+LIBS    = 
+OBJS4   = disp.o srt4div.o
+OBJS2   = disp.o srt2div.o

-srt4div:  	$(OBJS)
-		$(CC) -g -O3 -o srt4div $(OBJS) $(CFLAGS)
+# all and clean name actions, not files; declare them phony so a stray file
+# with either name cannot make them look up to date.
+.PHONY:		all clean
+all:		srt4div srt2div

 disp.o:		disp.h disp.c
-		$(CC) -g -c -o disp.o disp.c $(CFLAGS)
+		$(CC) -g -c -o disp.o disp.c 

 srt4div.o:	srt4div.c
-		$(CC) -g -c -o srt4div.o srt4div.c $(CFLAGS)
+		$(CC) -g -c -o srt4div.o srt4div.c
+
+srt2div.o:	srt2div.c
+		$(CC) -g -c -o srt2div.o srt2div.c
+
+srt4div:  	$(OBJS4)
+		$(CC) -g -O3 -o srt4div $(OBJS4) $(CFLAGS)
+
+srt2div:  	$(OBJS2)
+		$(CC) -g -O3 -o srt2div $(OBJS2) $(CFLAGS)

 clean:
 	rm -f *.o *~
--- a/pipelined/srt/stine/notes
+++ b/pipelined/srt/stine/notes
@ -0,0 +1,30 @@
+Dividend x = (0.10101111)_2, divisor d = (0.11000101)_2  (i = 16 * (0.1100)_2 = 12)
+
+X = 175 (xAF)
+D = 197 (xC5)
+
+X = 175/256 = 0.68359375
+D = 197/256 = 0.76953125
+
+Note: Add lg(r) extra iterations due to shifting of computed q
+      q_{computed} = q / radix
+
+./srt4div 0.68359375 0.76953125 8 10
+
+r=2
+X = 0.10011111
+D = 0.11000101
+
+X = 159 (9F)
+D = 197 (C5)
+
+X = 159/256 = 0.62109375
+D = 197/256 = 0.76953125
+
+./srt2div 0.62109375 0.76953125 8 9
+
+
+
+
+
+
--- a/pipelined/srt/stine/pd_bad.png
+++ b/pipelined/srt/stine/pd_bad.png
--- a/pipelined/srt/stine/pd_cpa.png
+++ b/pipelined/srt/stine/pd_cpa.png
--- a/pipelined/srt/stine/pd_csa.pdf
+++ b/pipelined/srt/stine/pd_csa.pdf
--- a/pipelined/srt/stine/pd_csa.png
+++ b/pipelined/srt/stine/pd_csa.png
--- a/pipelined/srt/stine/srt2div
+++ b/pipelined/srt/stine/srt2div
--- a/pipelined/srt/stine/srt2div.c
+++ b/pipelined/srt/stine/srt2div.c
@ -0,0 +1,114 @@
+#include "disp.h"
+
+// Quotient digit selection (QSLC) for division by recurrence
+// with r=2 using a CPA - See 5.109 EL
+//
+// Prints the value of prem for debugging, then returns the digit:
+//   +1 when prem >= 0.5
+//    0 when -0.5 <= prem < 0.5
+//   -1 otherwise
+// D is accepted but not used by the selection.
+int qst (double D, double prem) {
+
+  // For Debugging
+  printf("rw --> %lg\n", prem);
+
+  if (prem >= 0.5)
+    return 1;
+  if (prem >= -0.5)
+    return 0;
+  return -1;
+
+}
+
+/*
+ This routine performs a radix-2 SRT division 
+ algorithm.  The user inputs the numerator, the denominator, 
+ the number of iterations, and prec (passed to rne and disp_bin
+ as the precision). It assumes that 0.5 <= D < 1.
+
+ Each iteration prints the shifted residual 2*W[n], the selected
+ multiple q*D, and the new residual W[n+1], followed by the
+ accumulated quotient.  A final correction step is applied when
+ the last residual is negative.
+
+ Exits with status 1 and a usage message if an argument is
+ missing or does not parse as a number.
+*/
+
+int main(int argc, char* argv[]) {
+
+   double P, N, D, Q, RQ, RD, scale;
+   int q;
+   int num_iter, i;
+   int prec;
+   int radix = 2;
+
+   // All four arguments must be present and must parse
+   if (argc < 5 ||
+       sscanf(argv[1],"%lg", &N) != 1 ||
+       sscanf(argv[2],"%lg", &D) != 1 ||
+       sscanf(argv[3],"%d", &num_iter) != 1 ||
+       sscanf(argv[4],"%d", &prec) != 1) {
+      fprintf(stderr,
+              "Usage: %s numerator denominator num_iterations prec\n",
+              argv[0]);
+      exit(1);
+   }
+   // Round to precision
+   N = rne(N, prec);
+   D = rne(D, prec);
+   printf("N = ");
+   disp_bin(N, 3, prec, stdout);
+   printf("\n");
+   printf("D = ");
+   disp_bin(D, 3, prec, stdout);
+   printf("\n");
+
+   Q = 0;
+   // Initial residual is the numerator divided by the radix
+   P = N * pow(2.0, -log2(radix));
+   printf("N = %lg, D = %lg, N/D = %lg, num_iter = %d \n\n", 
+          N, D, N/D, num_iter); 
+   for (scale = 1, i = 0; i < num_iter; i++) {
+     // scale is the weight of the digit selected in this iteration
+     scale = scale * pow(2.0, -log2(radix));
+     q = qst(flr(2*D, 1), 2*P);
+     printf("2*W[n] = ");
+     disp_bin(radix*P, 3, prec, stdout);
+     printf("\n");
+     printf("q*D = ");      
+     disp_bin(q*D, 3, prec, stdout);
+     printf("\n");
+     // Recurrence - computed before W[n+1] is printed so the label
+     // matches the value shown
+     P = radix * P - q * D;
+     printf("W[n+1] = ");            
+     disp_bin(P ,3, prec, stdout);
+     printf("\n");     
+     Q = Q + q*scale;
+     printf("i = %d, q = %d, Q = %1.18lf, W = %1.18lf\n", i, q, Q, P); 
+     printf("i = %d, q = %d", i, q);
+     printf(", Q = ");
+     disp_bin(Q, 3, prec, stdout);
+     printf(", W = ");
+     disp_bin(P, 3, prec, stdout);
+     printf("\n\n");
+   }
+   // A negative final residual means the quotient is one ulp too large
+   if (P < 0) {
+     Q = Q - scale;
+     P = P + D;
+     printf("\nCorrecting Negative Remainder\n");
+     printf("Q = %1.18lf, W = %1.18lf\n", Q, P);
+     printf("Q = ");
+     disp_bin(Q, 3, prec, stdout);
+     printf(", W = ");
+     disp_bin(P, 3, prec, stdout);
+     printf("\n");
+   }
+
+   // Output Results
+   RQ = N/D;
+   // Since q_{computed} = q / radix, multiply by radix
+   RD = Q * radix;
+   printf("true = %1.18lf, computed = %1.18lf, \n", RQ, RD);
+   printf("true = ");
+   disp_bin(RQ, 3, prec, stdout);
+   printf(", computed = ");
+   disp_bin(RD, 3, prec, stdout);
+   printf("\n\n");
+   printf("REM = %1.18lf \n", P);
+   printf("REM = ");
+   disp_bin(P, 3, prec, stdout);
+   printf("\n\n");
+  
+   return 0;
+
+}
--- a/pipelined/srt/stine/srt4_pd.m
+++ b/pipelined/srt/stine/srt4_pd.m
@ -0,0 +1,508 @@
+%
+% PD Region for Np   = 3;  Nd   = 4;
+% w/CPA
+%
+% Clear all variables and screen
+clear
+clf
+% Define the number of bits (input Dividend)
+n = 4;
+%
+% Define Divisor Range
+% Normalized Floating Point [Dmin,Dmax] = [1,2]
+% Normalized Fixed Point    [Dmin, Dmax] =[1/2,1]
+%
+Dminimum = 1.0/2;
+Dmaximum = 2.0/2;
+% Define an ulp
+ulp = 2^(-n);
+% radix = beta
+beta  = 4;
+% rho = redundancy factor -> SHOULD ALWAYS BE >= 1/2
+%
+% SD representations have alpha < beta - 1
+%
+% alpha = ceil(beta/2)  minimally redundant  
+% alpha = beta -1       maximally redundant (rho = 1)
+% alpha = (beta-1)/2    nonredundant
+% alpha > beta - 1      over-redundant
+% 
+rho = 2/3;
+% Calculation of max digit set
+alpha = rho*(beta-1);
+% Da contains digit set
+q = [];
+for i = -alpha:alpha
+  q = [q; i];
+end
+% 4r(i-1)/D values
+hold on
+% figure(1)
+grid off
+for i = 1:length(q)
+  x = -rho+q(i):ulp:rho+q(i);
+  % Plot redundancy (overlap) Positive
+  z = [rho+q(i),rho+q(i)];
+  y = [x(length(x))-q(i),0];
+  % Plot redundancy (overlap) Negative
+  if (i ~= length(q))
+    w = [-rho+q(i+1)-q(i+1),0];
+    u = [-rho+q(i+1),-rho+q(i+1)];
+    % plot(u,w,'b')
+  end
+  % plot(x,x-q(i))
+  % plot(z,y,'r')
+
+end
+% title('Robertson Diagram for Radix-4 SRT Divison')
+
+Np   = 3;
+Nd   = 4;
+Dmin = Dminimum;
+Dmax = Dmaximum;
+ulpd = 2^(-Nd);
+ulpp = 2^(-Np);
+
+%
+% Plot Atkins P-D plot
+% Normalized Floating Point [Dmin,Dmax] = [1,2]
+% Normalized Fixed Point    [Dmin, Dmax] =[1/2,1]
+%
+Dmin = Dminimum;
+Dmax = Dmaximum;
+for i = 1:length(q)
+  D = Dmin:ulp:Dmax;
+  P1 = (rho+q(i))*D;
+  P2 = (-rho+q(i))*D;
+  hold on
+  p1 = plot(D,P1);
+  p1.Color = '#0000ff';
+  p2 = plot(D,P2);
+  p2.Color = '#ff0000';
+  axis([Dmin Dmax -beta*rho*Dmaximum beta*rho*Dmaximum])
+  xticks(D)
+  p1.LineWidth = 2.0;
+  p2.LineWidth = 2.0;
+end
+
+% Let's make x/y axis binary
+j = [];
+for i=1:length(D)
+    j = [j disp_bin(D(i), 1, 4)];
+end
+yk = [];
+yk2 = [];
+for i=-2.5:0.5:2.5;
+    yk = [yk disp_bin(i, 3, 3)];
+    yk2 = [yk2 i];
+end
+xtickangle(90)
+xticklabels(j)
+yticklabels(yk)
+
+% Draw the grid of allowed points on the PD plot: vertical lines at every
+% divisor step (ulpd) and horizontal lines at every residual step (ulpp).
+% Positive Portions
+i = 0:ulpp:rho*beta*Dmaximum;
+for j = Dmin:ulpd:Dmax
+  plot(j*ones(1,length(i)),i,'k')
+end
+
+% ones(1,n) gives a row vector, so each horizontal line is drawn once
+j = Dmin:ulpd:Dmax;
+for i = 0:ulpp:rho*beta*Dmaximum
+  plot(j,i*ones(1,length(j)),'k')
+end
+
+% Negative Portions
+i = 0:-ulpp:rho*-beta*Dmaximum;
+for j = Dmin:ulpd:Dmax
+  plot(j*ones(1,length(i)),i,'k')
+end
+
+j = Dmin:ulpd:Dmax;
+for i = 0:-ulpp:-rho*beta*Dmaximum
+  plot(j,i*ones(1,length(j)),'k')
+end
+
+% Labels and Printing
+xlh = xlabel(['Divisor (d)']);
+%xlh.FontSize = 18;
+xlh.Position(2) = xlh.Position(2) - 0.1;
+ylh = ylabel(['P = 4 \cdot w_i']);
+ylh.Position(1) = ylh.Position(1)-0.02;
+%ylh.FontSize = 18;
+
+% Containment Values (placed manually although not bad)
+m2 = [3/4 7/8 1.0 1.0 5/4 5/4 5/4 3/2 3/2];
+m1 = [1/4 1/4 1/4 1/4 1/2 1/2 1/2 1/2 1/2];
+m0 = [-1/4 -1/4 -1/4 -1/4 -1/2 -1/2 -1/2 -1/2 -1/2];
+m1b = [-3/4 -7/8 -1 -1 -5/4 -5/4 -5/4 -3/2 -3/2];
+x2 = Dmin:ulpd:Dmax;
+s2 = stairs(x2, m2);
+s2.Color = '#8f08d1';
+s2.LineWidth = 3.0;
+%s2.LineStyle = '--';
+s1 = stairs(x2, m1);
+s1.Color = '#8f08d1';
+s1.LineWidth = 3.0;
+s0 = stairs(x2, m0);
+s0.Color = '#8f08d1';
+s0.LineWidth = 3.0;
+s1b = stairs(x2, m1b);
+s1b.Color = '#8f08d1';
+s1b.LineWidth = 3.0;
+
+% Place the quotient digit label in every cell of the PD grid.
+% j holds the x position of each divisor column (cell centers) and
+% i holds the y position of each residual row, from top to bottom.
+j = Dmin+ulpd/2:ulpd:Dmax;
+i = rho*beta*Dmaximum-ulpp*3/4:-ulpp:-rho*beta*Dmaximum;
+
+% Labels in top-to-bottom order, and for each divisor column the number
+% of consecutive rows that carry each label (every row of qruns sums
+% to 42, the number of residual rows).
+qdigits = {'2', '1', '0', '-1', '-2'};
+qruns = [15 4 4 4 15;
+         14 5 4 5 14;
+         13 6 4 6 13;
+         13 6 4 6 13;
+         11 6 8 6 11;
+         11 6 8 6 11;
+         11 6 8 6 11;
+          9 8 8 4 13];
+
+for col = 1:size(qruns,1)
+  row = 1;
+  for d = 1:numel(qdigits)
+    for k = 1:qruns(col,d)
+      text(j(col), i(row), qdigits{d})
+      row = row + 1;
+    end
+  end
+end
+
+print -dpng pd_cpa.png
+
+
+
+
+
--- a/pipelined/srt/stine/srt4_pd2.m
+++ b/pipelined/srt/stine/srt4_pd2.m
@ -0,0 +1,163 @
+%
+% srt4_pd2.m
+% Draws the P-D diagram for radix-4 SRT division (digit set -2..2) on a
+% grid that steps by 2^-Nd along the divisor axis and 2^-Np along the
+% shifted-partial-remainder axis (Nd = Np = 3), overlays hand-placed
+% selection constants and quotient-digit labels, and prints the figure to
+% pd_bad.png.  disp_bin (defined outside this script) builds the binary
+% tick labels.
+%
+% Clear all variables and screen
+clear
+clf
+% Define the number of bits (input Dividend)
+n = 4;
+%
+% Define Divisor Range
+% Normalized Floating Point [Dmin,Dmax] = [1,2]
+% Normalized Fixed Point    [Dmin,Dmax] = [1/2,1]
+%
+Dminimum = 1.0/2;
+Dmaximum = 2.0/2;
+% Define an ulp
+ulp = 2^(-n);
+% radix = beta
+beta  = 4;
+% rho = redundancy factor -> SHOULD ALWAYS BE >= 1/2
+%
+% SD representations have alpha < beta - 1
+%
+% alpha = ceil(beta/2)  minimally redundant
+% alpha = beta -1       maximally redundant (rho = 1)
+% alpha = (beta-1)/2    nonredundant
+% alpha > beta - 1      over-redundant
+%
+rho = 2/3;
+% Calculation of max digit set
+alpha = rho*(beta-1);
+% q holds the digit set -alpha..alpha as a column vector
+q = (-alpha:alpha).';
+
+% Robertson-diagram segments (4r(i-1)/D values).  Every plot call below is
+% commented out, so this loop currently draws nothing; uncomment them to
+% get the Robertson diagram.
+hold on
+% figure(1)
+grid off
+for k = 1:length(q)
+  x = -rho+q(k):ulp:rho+q(k);
+  % Plot redundancy (overlap) Positive
+  z = [rho+q(k),rho+q(k)];
+  y = [x(length(x))-q(k),0];
+  % Plot redundancy (overlap) Negative
+  if (k ~= length(q))
+    w = [-rho+q(k+1)-q(k+1),0];
+    u = [-rho+q(k+1),-rho+q(k+1)];
+    % plot(u,w,'b')
+  end
+  % plot(x,x-q(k))
+  % plot(z,y,'r')
+end
+% title('Robertson Diagram for Radix-4 SRT Division')
+
+% Grid resolution of the P (Np) and D (Nd) axes
+Np   = 3;
+Nd   = 3;
+Dmin = Dminimum;
+Dmax = Dmaximum;
+ulpd = 2^(-Nd);
+ulpp = 2^(-Np);
+% Largest |P| shown on the plot
+Pmax = rho*beta*Dmaximum;
+D    = Dmin:ulpd:Dmax;
+
+%
+% Plot Atkins P-D plot: for each digit q(k) the line (rho+q(k))*D in blue
+% and the line (-rho+q(k))*D in red
+%
+for k = 1:length(q)
+  p1 = plot(D, (rho+q(k))*D, 'b');
+  p2 = plot(D, (-rho+q(k))*D, 'r');
+  p1.LineWidth = 2.0;
+  p2.LineWidth = 2.0;
+end
+% Axis limits and divisor ticks do not depend on the digit, so set them once
+axis([Dmin Dmax -Pmax Pmax])
+xticks(D)
+
+% Let's make both axes binary
+xk = [];
+for k = 1:length(D)
+    xk = [xk disp_bin(D(k), 1, 3)];
+end
+yk2 = -2.5:0.5:2.5;
+yk = [];
+for k = 1:length(yk2)
+    yk = [yk disp_bin(yk2(k), 3, 3)];
+end
+xtickangle(90)
+xticklabels(xk)
+% Pin the tick positions so the labels built from yk2 line up with them
+yticks(yk2)
+yticklabels(yk)
+
+% Let's draw the allowed points on the PD plot:
+% positive half first (s = 1), then the negative half (s = -1)
+for s = [1 -1]
+  pgrid = 0:s*ulpp:s*Pmax;
+  % vertical grid lines, one per divisor value
+  for d = D
+    plot(d*ones(1,length(pgrid)), pgrid, 'k');
+  end
+  % horizontal grid lines; ones(1,N) keeps this a single line per level
+  % (ones(N) would be an N-by-N matrix and draw the line N times)
+  for p = pgrid
+    plot(D, p*ones(1,length(D)), 'k');
+  end
+end
+
+% Labels and Printing
+xlh = xlabel(['Divisor (d)']);
+xlh.Position(2) = xlh.Position(2) - 0.1;
+xlh.FontSize = 18;
+ylh = ylabel(['P = 4 \cdot w_i']);
+ylh.Position(1) = ylh.Position(1)-0.02;
+ylh.FontSize = 18;
+
+% Containment Values (placed manually although not bad)
+m2 = [5/6 1.0 5/4 11/8 11/8];
+m1 = [1/4 1/4 1/2 1/2 1/2];
+m0 = [-1/4 -1/4 -1/2 -1/2 -1/2];
+m1b = [-5/6 -1 -5/4 -11/8 -11/8];
+mk = [m2; m1; m0; m1b];
+for k = 1:size(mk, 1)
+  sk = stairs(D, mk(k,:));
+  sk.Color = '#8f08d1';
+  sk.LineWidth = 3.0;
+end
+
+% Place manually Quotient (ugh)
+% qstart(c,:) gives, for divisor column c, the first row (row 1 is the top
+% of the plot) labelled with each of the digits 2, 1, 0, -1, -2.
+dmid = Dmin+ulpd/2:ulpd:Dmax;
+pmid = Pmax-ulpp*3/4:-ulpp:-Pmax;
+qlabel = {'2', '1', '0', '-1', '-2'};
+qstart = [1 16 20 24 29
+          1 14 20 24 30
+          1 12 18 26 32
+          1 11 18 26 33];
+nrows = 42;
+for c = 1:size(qstart, 1)
+  for r = 1:nrows
+    if (c == 1 && r == 15)
+      % NOTE(review): presumably a cell the coarse grid cannot resolve
+      error1 = text(dmid(c), pmid(r), 'Full Precision', 'FontSize', 16);
+    elseif (c == 1 && r == 28)
+      error2 = text(dmid(c), pmid(r), 'Full Precision', 'FontSize', 16);
+    else
+      text(dmid(c), pmid(r), qlabel{find(qstart(c,:) <= r, 1, 'last')})
+    end
+  end
+end
+
+print -dpng pd_bad.png
--- a/pipelined/srt/stine/srt4_pd3.m
+++ b/pipelined/srt/stine/srt4_pd3.m
@ -0,0 +1,161 @
+%
+% srt4_pd3.m
+% Draws the P-D diagram for radix-4 SRT division (digit set -2..2) on a
+% grid that steps by 2^-Nd along the divisor axis and 2^-Np along the
+% shifted-partial-remainder axis (Nd = Np = 4), overlays hand-placed
+% selection constants and quotient-digit labels, and prints the figure in
+% landscape orientation to pd_csa.png.  disp_bin (defined outside this
+% script) builds the binary tick labels.
+%
+% Clear all variables and screen
+clear
+clf
+% Define the number of bits (input Dividend)
+n = 4;
+%
+% Define Divisor Range
+% Normalized Floating Point [Dmin,Dmax] = [1,2]
+% Normalized Fixed Point    [Dmin,Dmax] = [1/2,1]
+%
+Dminimum = 1.0/2;
+Dmaximum = 2.0/2;
+% Define an ulp
+ulp = 2^(-n);
+% radix = beta
+beta  = 4;
+% rho = redundancy factor -> SHOULD ALWAYS BE >= 1/2
+%
+% SD representations have alpha < beta - 1
+%
+% alpha = ceil(beta/2)  minimally redundant
+% alpha = beta -1       maximally redundant (rho = 1)
+% alpha = (beta-1)/2    nonredundant
+% alpha > beta - 1      over-redundant
+%
+rho = 2/3;
+% Calculation of max digit set
+alpha = rho*(beta-1);
+% q holds the digit set -alpha..alpha as a column vector
+q = (-alpha:alpha).';
+
+% Robertson-diagram segments (4r(i-1)/D values).  Every plot call below is
+% commented out, so this loop currently draws nothing; uncomment them to
+% get the Robertson diagram.
+hold on
+% figure(1)
+grid off
+for k = 1:length(q)
+  x = -rho+q(k):ulp:rho+q(k);
+  % Plot redundancy (overlap) Positive
+  z = [rho+q(k),rho+q(k)];
+  y = [x(length(x))-q(k),0];
+  % Plot redundancy (overlap) Negative
+  if (k ~= length(q))
+    w = [-rho+q(k+1)-q(k+1),0];
+    u = [-rho+q(k+1),-rho+q(k+1)];
+    % plot(u,w,'b')
+  end
+  % plot(x,x-q(k))
+  % plot(z,y,'r')
+end
+% title('Robertson Diagram for Radix-4 SRT Division')
+
+% Grid resolution of the P (Np) and D (Nd) axes
+Np   = 4;
+Nd   = 4;
+Dmin = Dminimum;
+Dmax = Dmaximum;
+ulpd = 2^(-Nd);
+ulpp = 2^(-Np);
+% Largest |P| shown on the plot
+Pmax = rho*beta*Dmaximum;
+D    = Dmin:ulpd:Dmax;
+
+%
+% Plot Atkins P-D plot: for each digit q(k) the line (rho+q(k))*D in blue
+% and the line (-rho+q(k))*D in red
+%
+for k = 1:length(q)
+  p1 = plot(D, (rho+q(k))*D, 'b');
+  p2 = plot(D, (-rho+q(k))*D, 'r');
+  p1.LineWidth = 2.0;
+  p2.LineWidth = 2.0;
+end
+% Axis limits and divisor ticks do not depend on the digit, so set them once
+axis([Dmin Dmax -Pmax Pmax])
+xticks(D)
+
+% Let's make both axes binary
+xk = [];
+for k = 1:length(D)
+    xk = [xk disp_bin(D(k), 1, 4)];
+end
+yk2 = -2.5:0.5:2.5;
+yk = [];
+for k = 1:length(yk2)
+    yk = [yk disp_bin(yk2(k), 3, 4)];
+end
+xtickangle(90)
+xticklabels(xk)
+% Pin the tick positions so the labels built from yk2 line up with them
+yticks(yk2)
+yticklabels(yk)
+
+% Let's draw the allowed points on the PD plot:
+% positive half first (s = 1), then the negative half (s = -1)
+for s = [1 -1]
+  pgrid = 0:s*ulpp:s*Pmax;
+  % vertical grid lines, one per divisor value
+  for d = D
+    plot(d*ones(1,length(pgrid)), pgrid, 'k');
+  end
+  % horizontal grid lines; ones(1,N) keeps this a single line per level
+  % (ones(N) would be an N-by-N matrix and draw the line N times)
+  for p = pgrid
+    plot(D, p*ones(1,length(D)), 'k');
+  end
+end
+
+% Labels and Printing
+xlh = xlabel(['Divisor (d)']);
+xlh.Position(2) = xlh.Position(2) - 0.1;
+%xlh.FontSize = 18;
+ylh = ylabel(['P = 4 \cdot w_i']);
+ylh.Position(1) = ylh.Position(1)-0.02;
+%ylh.FontSize = 18;
+
+% Containment Values (placed manually although not bad)
+m2 = [3/4 7/8 15/16 1.0 9/8 19/16 5/4 6/4 6/4];
+m1 = [1/4 1/4 1/4 1/4 3/8 3/8 1/2 1/2 1/2];
+m0 = [-1/4 -3/8 -3/8 -3/8 -1/2 -1/2 -1/2 -1/2 -1/2];
+m1b = [-13/16 -15/16 -1 -9/8 -5/4 -5/4 -11/8 -6/4 -6/4];
+mk = [m2; m1; m0; m1b];
+for k = 1:size(mk, 1)
+  sk = stairs(D, mk(k,:));
+  sk.Color = '#8f08d1';
+  sk.LineWidth = 3.0;
+end
+
+% Place manually Quotient (ugh)
+% qstart(c,:) gives, for divisor column c, the first row (row 1 is the top
+% of the plot) labelled with each of the digits 2, 1, 0, -1, -2.
+dmid = Dmin+ulpd/2:ulpd:Dmax;
+pmid = Pmax-ulpp:-ulpp:-Pmax;
+qlabel = {'2', '1', '0', '-1', '-2'};
+qstart = [1 31 39 47 56
+          1 29 39 49 58
+          1 28 39 49 59
+          1 27 39 49 61
+          1 25 37 51 63
+          1 24 37 51 63
+          1 23 35 51 65
+          1 19 35 51 67];
+nrows = 84;
+for c = 1:size(qstart, 1)
+  for r = 1:nrows
+    text(dmid(c), pmid(r), qlabel{find(qstart(c,:) <= r, 1, 'last')})
+  end
+end
+
+orient('landscape')
+print -dpng 'pd_csa.png'
--- a/pipelined/srt/stine/srt4div
+++ b/pipelined/srt/stine/srt4div
--- a/pipelined/srt/stine/srt4div.c
+++ b/pipelined/srt/stine/srt4div.c
@ -1,83 +1,45 @@
 #include "disp.h"
 #include <math.h>

+// QSLC is for division by recurrence for
+// r=4 using a CPA - See Table 5.9 EL
 int qslc (double prem, double d) {

  int q;

+  // For Debugging
  printf("d  --> %lg\n", d);
  printf("rw --> %lg\n", prem);
-  if ((d>=0.0)&&(d<1.0)) {
-    if (prem>=1.0)
-       q = 2;
-    else if (prem>=0.25)
-      q = 1;
-    else if (prem>=-0.25)
-      q = 0;
-    else if (prem >= -1)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=1.0)&&(d<2.0)) {
-    if (prem>=2.0)
-       q = 2;
-    else if (prem>=0.66667)
-      q = 1;
-    else if (prem>=-0.6667)
-      q = 0;
-    else if (prem >= -2)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=2.0)&&(d<3.0)) {
-    if (prem>=4.0)
-       q = 2;
-    else if (prem>=1.25)
-      q = 1;
-    else if (prem>=-1.25)
-      q = 0;
-    else if (prem >= -4)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=3.0)&&(d<4.0)) {
-    if (prem>=5.0)
+  
+  if ((d>=8.0)&&(d<9.0)) {
+    if (prem>=6.0)
       q = 2;
    else if (prem>=2.0)
      q = 1;
    else if (prem>=-2.0)
      q = 0;
-    else if (prem >= -5)
+    else if (prem >= -6)
      q = -1;
    else 
      q = -2;
    return q;
  }

-  if ((d>=4.0)&&(d<5.0)) {
-    if (prem>=6.66667)
+  if ((d>=9.0)&&(d<10.0)) {
+    if (prem>=7)
       q = 2;
    else if (prem>=2.0)
      q = 1;
    else if (prem>=-2.0)
      q = 0;
-    else if (prem >= -6.66667)
+    else if (prem >= -7.0)  // negative mirror of the +7 threshold; with +7.0 this test could never pass, so q = -1 was unreachable
      q = -1;
    else 
      q = -2;
    return q;
  }

-  if ((d>=5.0)&&(d<6.0)) {
+  if ((d>=10.0)&&(d<11.0)) {
    if (prem>=8.0)
       q = 2;
    else if (prem>=2.0)
@ -91,7 +53,21 @@ int qslc (double prem, double d) {
    return q;
  }

-  if ((d>=6.0)&&(d<7.0)) {
+  if ((d>=11.0)&&(d<12.0)) {
+    if (prem>=8.0)
+       q = 2;
+    else if (prem>=2.0)
+      q = 1;
+    else if (prem>=-2.0)
+      q = 0;
+    else if (prem >= -8.0)
+      q = -1;
+    else 
+      q = -2;
+    return q;
+  }
+
+  if ((d>=12.0)&&(d<13.0)) {
    if (prem>=10.0)
       q = 2;
    else if (prem>=4.0)
@ -105,21 +81,35 @@ int qslc (double prem, double d) {
    return q;
  }

-  if ((d>=7.0)&&(d<8.0)) {
-    if (prem>=11.0)
+  if ((d>=13.0)&&(d<14.0)) {
+    if (prem>=10.0)
       q = 2;
    else if (prem>=4.0)
      q = 1;
    else if (prem>=-4.0)
      q = 0;
-    else if (prem >= -11.0)
+    else if (prem >= -10.0)
      q = -1;
    else 
      q = -2;
    return q;
  }

-  if ((d>=8.0)&&(d<9.0)) {
+  if ((d>=14.0)&&(d<15.0)) {
+    if (prem>=10.0)
+       q = 2;
+    else if (prem>=4.0)
+      q = 1;
+    else if (prem>=-4.0)
+      q = 0;
+    else if (prem >= -10.0)
+      q = -1;
+    else 
+      q = -2;
+    return q;
+  }
+
+  if ((d>=15.0)&&(d<16.0)) {
    if (prem>=12.0)
       q = 2;
    else if (prem>=4.0)
@ -133,106 +123,9 @@ int qslc (double prem, double d) {
    return q;
  }

-  if ((d>=9.0)&&(d<10.0)) {
-    if (prem>=15.0)
-       q = 2;
-    else if (prem>=4.0)
-      q = 1;
-    else if (prem>=-4.0)
-      q = 0;
-    else if (prem >= -15.0)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=10.0)&&(d<11.0)) {
-    if (prem>=15.0)
-       q = 2;
-    else if (prem>=4.0)
-      q = 1;
-    else if (prem>=-4.0)
-      q = 0;
-    else if (prem >= -15.0)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=11.0)&&(d<12.0)) {
-    if (prem>=16.0)
-       q = 2;
-    else if (prem>=4.0)
-      q = 1;
-    else if (prem>=-4.0)
-      q = 0;
-    else if (prem >= -16.0)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=12.0)&&(d<13.0)) {
-    if (prem>=20.0)
-       q = 2;
-    else if (prem>=8.0)
-      q = 1;
-    else if (prem>=-8.0)
-      q = 0;
-    else if (prem >= -20.0)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=13.0)&&(d<14.0)) {
-    if (prem>=20.0)
-       q = 2;
-    else if (prem>=8.0)
-      q = 1;
-    else if (prem>=-8.0)
-      q = 0;
-    else if (prem >= -20.0)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=14.0)&&(d<15.0)) {
-    if (prem>=20.0)
-       q = 2;
-    else if (prem>=8.0)
-      q = 1;
-    else if (prem>=-8.0)
-      q = 0;
-    else if (prem >= -20.0)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
-  if ((d>=15.0)&&(d<16.0)) {
-    if (prem>=24.0)
-       q = 2;
-    else if (prem>=8.0)
-      q = 1;
-    else if (prem>=-8.0)
-      q = 0;
-    else if (prem >= -24.0)
-      q = -1;
-    else 
-      q = -2;
-    return q;
-  }
-
 }

+
 /*
 This routine performs a radix-4 SRT division 
 algorithm.  The user inputs the numerator, the denominator, 
@ -246,6 +139,8 @@ int main(int argc, char* argv[]) {
   int q;
   int num_iter, i;
   int prec;
+   int radix = 4;
+   
   if (argc < 5) {
      fprintf(stderr,
 	      "Usage: %s numerator denominator num_iterations prec\n", 
@ -267,27 +162,29 @@ int main(int argc, char* argv[]) {
   printf("\n");

   Q = 0;
-   P = N*0.25;
+   P = N * pow(2.0, -log2(radix));
   printf("N = %lg, D = %lg, N/D = %lg, num_iter = %d \n\n", 
 	  N, D, N/D, num_iter); 
   for (scale = 1, i = 0; i < num_iter; i++) {
     // Shift by r
-     scale = scale*0.25;
-     q = qslc(flr((4*P)*16,3), D*16);
-     //q = -q;
+     scale = scale * pow(2.0, -log2(radix));
+     // (4*P)*8 because of footnote in Table 5.9, page 296 EL
+     // i.e., real value = shown value / 8
+     // D*16 since we use 4 bits of D (1 bit known)
+     q = qslc(flr((radix * P) * 8, 3), D*16);
     printf("4*W[n] = ");
-     disp_bin(4*P,3,prec,stdout);
+     disp_bin(radix*P, 3, prec, stdout);
     printf("\n");
     printf("q*D = ");      
-     disp_bin(q*D,3,prec,stdout);
+     disp_bin(q*D, 3, prec, stdout);
     printf("\n");
     printf("W[n+1] = ");            
-     disp_bin(P ,3,prec,stdout);
+     disp_bin(P ,3, prec, stdout);
     printf("\n");
     // Recurrence
-     P = 4*P - q*D;
+     P = radix * P - q * D;
     // OTFC
-     Q = Q + q*scale;
+     Q = Q + q * scale;
     printf("i = %d, q = %d, Q = %1.18lf, W = %1.18lf\n", i, q, Q, P); 
     printf("i = %d, q = %d", i, q);
     printf(", Q = ");
@ -296,8 +193,9 @@ int main(int argc, char* argv[]) {
     disp_bin(P, 3, prec, stdout);
     printf("\n\n");
   }
+   // Is shifted partial remainder negative?
   if (P < 0) {
-     Q = Q - scale;
+     Q = Q - pow(2.0, -prec);
     P = P + D;
     printf("\nCorrecting Negative Remainder\n"); 
     printf("Q = %1.18lf, W = %1.18lf\n", Q, P); 
@ -306,9 +204,12 @@ int main(int argc, char* argv[]) {
     printf(", W = ");
     disp_bin(P, 3, prec, stdout);
     printf("\n");
-   } 
-   RQ = flr(N/D, (double) prec);
-   RD = Q*4;
+   }
+
+   // Output Results
+   RQ = flr(N/D, prec);
+   // Since q_{computed} = q / radix, multiply by radix
+   RD = Q * radix;
   printf("true = %1.18lf, computed = %1.18lf, \n", RQ, RD);
   printf("true = ");
   disp_bin(RQ, 3, prec, stdout);
--- a/pipelined/srt/testbench-radix4.sv
+++ b/pipelined/srt/testbench-radix4.sv
@ -0,0 +1,148 @@
+
+`include "wally-config.vh"
+`define DIVLEN ((`NF<`XLEN) ? `XLEN : `NF)
+
+/////////////
+// counter //
+/////////////
+module counter(input  logic clk, 
+               input  logic req, 
+               output logic done);
+ 
+   logic    [5:0]  count;  // cycles counted since req was last sampled high
+
+  // This block of control logic sequences the divider
+  // through its iterations.  You may modify it if you
+  // build a divider which completes in fewer iterations.
+  // You are not responsible for the (trivial) circuit
+  // design of the block.
+
+  always @(posedge clk)
+    begin
+      if      (count == `DIVLEN/2+1) done <= #1 1;  // raise done when count reaches DIVLEN/2+1
+      else if (done | req) done <= #1 0;	// drop done once it has been high, or when a new request arrives
+      if (req) count <= #1 0;  // every request restarts the count
+      else     count <= #1 count+1;  // otherwise keep counting (6-bit, wraps after 63)
+    end
+endmodule
+
+///////////
+// clock //
+///////////
+module clock(clk);
+  output clk;  // NOTE(review): nothing inside this module drives clk
+ 
+  // Internal clk signal (data type declaration for the clk port above)
+  logic clk;
+ 
+endmodule
+
+//////////
+// testbench //
+//////////
+module testbenchradix4;
+  logic              clk;
+  logic              req;
+  logic              done;
+  logic [63:0]       a, b;
+  logic [51:0]       afrac, bfrac;
+  logic [10:0]       aExp, bExp;
+  logic              asign, bsign;
+  logic [51:0]       r, rOTFC;  // r: top 52 bits of Quot; NOTE(review): rOTFC is never referenced in this module
+  logic [`DIVLEN-1:0]  Quot, QuotOTFC;  // NOTE(review): QuotOTFC is never referenced in this module
+  logic [54:0]       rp, rm;   // positive quotient digits (NOTE(review): rp and rm are never referenced in this module)
+ 
+  // Test parameters
+  parameter MEM_SIZE = 40000;
+  parameter MEM_WIDTH = 64+64+64;  // three 64-bit fields per vector, sliced by mema / memb / memr below
+ 
+  `define memr  63:0
+  `define memb  127:64
+  `define mema  191:128
+
+  // Test registers
+  logic [MEM_WIDTH-1:0] Tests [0:MEM_SIZE];  // Space for input file
+  logic [MEM_WIDTH-1:0] Vec;  // Verilog doesn't allow direct access to a
+                            // bit field of an array 
+  logic [63:0] correctr, nextr, diffn, diffp;  // correctr: expected result being checked; nextr: expected result of the vector loaded last
+  logic [10:0] rExp;
+  logic        rsign;
+  integer testnum, errors;
+
+  // Divider under test; Stall, Flush, W64, Signed, Int and Sqrt are tied low and SrcA/SrcB are zero
+  srtradix4 srtradix4(.clk, .Start(req), 
+                .Stall(1'b0), .Flush(1'b0), 
+                .XExp(aExp), .YExp(bExp), .rExp,
+                .XSign(asign), .YSign(bsign), .rsign,
+                .XFrac(afrac), .YFrac(bfrac), 
+                .SrcA('0), .SrcB('0), .Fmt(2'b00), 
+                .W64(1'b0), .Signed(1'b0), .Int(1'b0), .Sqrt(1'b0), 
+                .Quot, .Rem(), .Flags());
+
+  // Counter: generates done from clk and req
+  counter counter(clk, req, done);
+
+
+    initial  // free-running clock: 17 time units high, 17 low
+    forever
+      begin
+        clk = 1; #17;
+        clk = 0; #17;
+      end
+
+
+  // Read test vectors from disk, unpack vector 0 and issue the first request
+  initial
+    begin
+      testnum = 0; 
+      errors = 0;
+      $readmemh ("testvectors", Tests);
+      Vec = Tests[testnum];
+      a = Vec[`mema];
+      {asign, aExp, afrac} = a;
+      b = Vec[`memb];
+      {bsign, bExp, bfrac} = b;
+      nextr = Vec[`memr];
+      r = Quot[`DIVLEN-1:`DIVLEN - 52];
+      req <= 1;
+    end
+  
+  // Apply directed test vectors read from file: check the result when done is high, load the next vector when req is high.
+
+  always @(posedge clk)
+    begin
+      r = Quot[`DIVLEN-1:`DIVLEN - 52];
+      if (done) begin
+        req <= 1;
+        diffp = correctr[51:0] - r;
+        diffn = r - correctr[51:0];
+        if ((rsign !== correctr[63]) | (rExp !== correctr[62:52]) | ($signed(diffn) > 1) | ($signed(diffp) > 1) | (diffn === 64'bx) | (diffp === 64'bx)) // sign and exponent must match exactly; fraction must be within 1 ulp and the differences must not be all-X
+          begin
+            errors = errors+1;
+            $display("result was %h_%h, should be %h %h %h\n", rExp, r, correctr, diffn, diffp);
+            $display("failed\n");
+            $stop;
+          end
+        if (afrac === 52'hxxxxxxxxxxxxx)  // all-X fraction: presumably the loaded vector lies past the data read from the file, so stop
+          begin
+            $display("%d Tests completed successfully", testnum);
+            $stop;
+          end
+	end
+      if (req) 
+	begin
+	  req <= 0;
+	  correctr = nextr;  // expected result for the operands currently held in a and b
+	  testnum = testnum+1;
+	  Vec = Tests[testnum];
+	  $display("a = %h  b = %h",a,b);  // printed before a and b are overwritten with the next vector
+    a = Vec[`mema];
+    {asign, aExp, afrac} = a;
+    b = Vec[`memb];
+    {bsign, bExp, bfrac} = b;
+    nextr = Vec[`memr];
+	end
+    end
+ 
+endmodule
+ 
--- a/pipelined/srt/testbench.sv
+++ b/pipelined/srt/testbench.sv
@ -7,7 +7,7 @@ module counter(input  logic clk,
               input  logic req, 
               output logic done);
 
-   logic    [5:0]  count;
+   logic    [7:0]  count;

  // This block of control logic sequences the divider
  // through its iterations.  You may modify it if you
@ -17,7 +17,7 @@ module counter(input  logic clk,

  always @(posedge clk)
    begin
-      if      (count == 54) done <= #1 1;
+      if      (count == `DIVLEN+1) done <= #1 1;
      else if (done | req) done <= #1 0;	
      if (req) count <= #1 0;
      else     count <= #1 count+1;
@ -110,12 +110,14 @@ module testbench;

  always @(posedge clk)
    begin
+      r = Quot[`DIVLEN:`DIVLEN - 52];
+      rOTFC = QuotOTFC[`DIVLEN:`DIVLEN - 52];
      if (done) 
 	begin
 	  req <= #5 1;
    diffp = correctr[51:0] - r;
    diffn = r - correctr[51:0];
-	  if ((rsign !== correctr[63]) | (rExp !== correctr[62:52]) | ($signed(diffn) > 1) | ($signed(diffp) > 1)) // check if accurate to 1 ulp
+	  if ((rsign !== correctr[63]) | (rExp !== correctr[62:52]) | ($signed(diffn) > 1) | ($signed(diffp) > 1) | (diffn === 64'bx) | (diffp === 64'bx)) // check if accurate to 1 ulp
 	    begin
 	      errors = errors+1;
 	      $display("result was %h_%h, should be %h %h %h\n", rExp, r, correctr, diffn, diffp);
--- a/pipelined/testbench/testbench-linux.sv
+++ b/pipelined/testbench/testbench-linux.sv
@ -559,11 +559,11 @@ module testbench;
        if ((dut.core.lsu.LSUPAdrM == 'h10000002) | (dut.core.lsu.LSUPAdrM == 'h10000005) | (dut.core.lsu.LSUPAdrM == 'h10000006)) begin \
          if(!NO_SPOOFING) begin \
            $display("%tns, %d instrs: Overwrite UART's Register in memory stage.", $time, AttemptedInstructionCount); \
-            force dut.core.ieu.dp.ReadDataM = ExpectedMemReadDataM; \
+            force dut.core.lsu.ReadDataM = ExpectedMemReadDataM; \
          end \
        end else \
          if(!NO_SPOOFING) \
-            release dut.core.ieu.dp.ReadDataM; \
+            release dut.core.lsu.ReadDataM; \
        if(textM.substr(0,5) == "rdtime") begin \
          //$display("%tns, %d instrs: Overwrite MTIME_CLINT on read of MTIME in memory stage.", $time, InstrCountW-1); \
          if(!NO_SPOOFING) \
--- a/pipelined/testbench/testbench.sv
+++ b/pipelined/testbench/testbench.sv
@ -65,7 +65,7 @@ logic [3:0] dummy;
  logic [`XLEN-1:0] PCW;

  string ProgramAddrMapFile, ProgramLabelMapFile;
-  integer   	ProgramAddrLabelArray [string];
+  integer   	ProgramAddrLabelArray [string] = '{ "begin_signature" : 0, "tohost" : 0 };

  logic 	    DCacheFlushDone, DCacheFlushStart;
    
@ -119,6 +119,7 @@ logic [3:0] dummy;
        "wally32i":                       tests = wally32i; 
        "wally32e":                       tests = wally32e; 
        "wally32priv":                    tests = wally32priv;
+        "wally32periph":                   tests = wally32periph;
        "embench":                        tests = embench;
      endcase
    end
@ -197,8 +198,6 @@ logic [3:0] dummy;
      ProgramLabelMapFile = {pathname, tests[test], ".elf.objdump.lab"};
      // declare memory labels that interest us, the updateProgramAddrLabelArray task will find the addr of each label and fill the array
      // to expand, add more elements to this array and initialize them to zero (also initilaize them to zero at the start of the next test)
-      ProgramAddrLabelArray = '{ "begin_signature" : 0, 
-	            	                 "tohost" : 0 };
      updateProgramAddrLabelArray(ProgramAddrMapFile, ProgramLabelMapFile, ProgramAddrLabelArray);
      $display("Read memfile %s", memfilename);
      reset_ext = 1; # 42; reset_ext = 0;
@ -251,8 +250,10 @@ logic [3:0] dummy;
          for(i=0; i<SIGNATURESIZE; i=i+1) begin
            sig32[i] = 'bx;
          end
+          // riscof tests have a different signature, tests[0] == "1" refers to RISCVARCHTESTs
+          if (tests[0] == "1") signame = {pathname, tests[test], "erence-sail_c_simulator.signature"};
+          else signame = {pathname, tests[test], ".signature.output"};
          // read signature, reformat in 64 bits if necessary
-          signame = {pathname, tests[test], ".signature.output"};
          $readmemh(signame, sig32);
          i = 0;
          while (i < SIGNATURESIZE) begin
@ -324,8 +325,7 @@ logic [3:0] dummy;

            ProgramAddrMapFile = {pathname, tests[test], ".elf.objdump.addr"};
            ProgramLabelMapFile = {pathname, tests[test], ".elf.objdump.lab"};
-            ProgramAddrLabelArray = '{ "begin_signature" : 0, 
-	            	                       "tohost" : 0 };
+            ProgramAddrLabelArray = '{ "begin_signature" : 0, "tohost" : 0 };
            updateProgramAddrLabelArray(ProgramAddrMapFile, ProgramLabelMapFile, ProgramAddrLabelArray);
            $display("Read memfile %s", memfilename);
            reset_ext = 1; # 47; reset_ext = 0;
@ -387,7 +387,7 @@ module riscvassertions;
    assert (`DIV_BITSPERCYCLE == 1 | `DIV_BITSPERCYCLE==2 | `DIV_BITSPERCYCLE==4) else $error("Illegal number of divider bits/cycle: DIV_BITSPERCYCLE must be 1, 2, or 4");
    assert (`F_SUPPORTED | ~`D_SUPPORTED) else $error("Can't support double (D) without supporting float (F)");
    assert (`I_SUPPORTED ^ `E_SUPPORTED) else $error("Exactly one of I and E must be supported");
-    assert (`XLEN == 64 | ~`D_SUPPORTED) else $error("Wally does not yet support D extensions on RV32");
+    // assert (`XLEN == 64 | ~`D_SUPPORTED) else $error("Wally does not yet support D extensions on RV32");
    assert (`FLEN<=`XLEN | `DMEM == `MEM_CACHE) else $error("Wally does not support FLEN > XLEN unleses data cache is supported");
    assert (`DCACHE_WAYSIZEINBYTES <= 4096 | (`DMEM != `MEM_CACHE) | `VIRTMEM_SUPPORTED == 0) else $error("DCACHE_WAYSIZEINBYTES cannot exceed 4 KiB when caches and vitual memory is enabled (to prevent aliasing)");
    assert (`DCACHE_LINELENINBITS >= 128 | (`DMEM != `MEM_CACHE)) else $error("DCACHE_LINELENINBITS must be at least 128 when caches are enabled");
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
--- a/tests/riscof/Makefile
+++ b/tests/riscof/Makefile
@ -1,5 +1,5 @@
 arch_dir = ../../addins/riscv-arch-test
-work_dir = "./riscof_work"
+work_dir = ./riscof_work
 current_dir = $(shell pwd)
 XLEN    ?= 64

@ -9,8 +9,9 @@ build:
 	mkdir -p $(work_dir)
 	mkdir -p work
 	sed 's,{0},$(current_dir),g;s,{1},$(XLEN)$(if $(findstring 64,$(XLEN)),gc,imc),g' config.ini > config$(XLEN).ini
-	riscof run --work-dir=$(work_dir) --config=config$(XLEN).ini --suite=$(arch_dir)/riscv-test-suite/ --env=$(arch_dir)/riscv-test-suite/env --no-browser --no-dut-run
-	mv $(work_dir)/rv$(XLEN)i_m work/
+	riscof run --work-dir=$(work_dir) --config=config$(XLEN).ini --suite=$(arch_dir)/riscv-test-suite/ --env=$(arch_dir)/riscv-test-suite/env --no-browser
+	rm -rf work/rv$(XLEN)i_m
+	mv -f $(work_dir)/rv$(XLEN)i_m work/

 clean:
 	rm -f config64.ini
--- a/tests/riscof/sail_cSim/riscof_sail_cSim.py
+++ b/tests/riscof/sail_cSim/riscof_sail_cSim.py
@ -90,7 +90,7 @@ class sail_cSim(pluginTemplate):
            test_dir = testentry['work_dir']
            test_name = test.rsplit('/',1)[1][:-2]

-            elf = 'ref.elf'
+            elf = 'Ref.elf'

            execute = "@cd "+testentry['work_dir']+";"

@ -98,8 +98,8 @@ class sail_cSim(pluginTemplate):
            compile_cmd = cmd + ' -D' + " -D".join(testentry['macros'])
            execute+=compile_cmd+";"

-            execute += self.objdump_cmd.format(elf, self.xlen, 'ref.elf.objdump')
-            sig_file = os.path.join(test_dir, "ref.signature.output")
+            execute += self.objdump_cmd.format(elf, self.xlen, 'Ref.elf.objdump')
+            sig_file = os.path.join(test_dir, self.name[:-1] + ".signature")

            execute += self.sail_exe[self.xlen] + ' --test-signature={0} {1} > {2}.log 2>&1;'.format(sig_file, elf, test_name)

--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-ADD.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-ADD.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-ADD.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.392776//
+// Created 2022-06-17 22:58:09.906970//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SLT.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SLT.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SLT.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.393471//
+// Created 2022-06-17 22:58:09.909889//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SLTU.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SLTU.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SLTU.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.393741//
+// Created 2022-06-17 22:58:09.911056//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SUB.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SUB.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-SUB.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.393180//
+// Created 2022-06-17 22:58:09.908718//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-XOR.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-XOR.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv32i_m/I/src/WALLY-XOR.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.394013//
+// Created 2022-06-17 22:58:09.913218//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/Makefrag
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/Makefrag
@ -53,8 +53,8 @@ target_tests_nosim = \
    WALLY-status-fp-enabled-01 \
    WALLY-status-sie-01 \
    WALLY-status-tw-01 \
+    WALLY-gpio-01 \

-# unclear why wfi, status-fp-enabled, status-sie, and status-tw fail

 rv32i_tests = $(addsuffix .elf, $(rv32i_sc_tests))

--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-gpio-01.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-gpio-01.reference_output
@ -0,0 +1,7 @@
+00000000
+00000000
+A5A5A5A5
+5A5AFFFF
+00000000
+5A5A0000
+A55A0000
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-gpio-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-gpio-01.S
@ -0,0 +1,99 @@
+///////////////////////////////////////////
+//
+// WALLY-gpio
+//
+// Author: David_Harris@hmc.edu and Nicholas Lucio <nlucio@hmc.edu>
+//
+// Created 2022-06-16
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+#include "WALLY-TEST-LIB-32.h" 
+
+INIT_TESTS
+
+TRAP_HANDLER m
+
+j run_test_loop // begin test loop/table tests instead of executing inline code.
+
+INIT_TEST_TABLE
+
+END_TESTS
+
+TEST_STACK_AND_DATA
+
+.align 2
+test_cases:
+# ---------------------------------------------------------------------------------------------
+# Test Contents
+#
+#   This table holds the actual tests, i.e. what each test does.
+#   Each entry consists of 3 values that will be read in as follows:
+#   
+#   '.4byte [x28 Value], [x29 Value], [x30 value]'
+#                     or
+#   '.4byte [address], [value], [test type]'
+#
+#   The encoding for x30 test type values can be found in the test handler in the framework file
+# 
+# ---------------------------------------------------------------------------------------------
+
+.equ GPIO, 0x10060000
+.equ input_val, (GPIO+0x00)
+.equ input_en, (GPIO+0x04)
+.equ output_en, (GPIO+0x08)
+.equ output_val, (GPIO+0x0C)
+.equ rise_ie, (GPIO+0x18)
+.equ rise_ip, (GPIO+0x1C)
+.equ fall_ie, (GPIO+0x20)
+.equ fall_ip, (GPIO+0x24)
+.equ high_ie, (GPIO+0x28)
+.equ high_ip, (GPIO+0x2C)
+.equ low_ie, (GPIO+0x30)
+.equ low_ip, (GPIO+0x34)
+.equ iof_en, (GPIO+0x38)
+.equ iof_sel, (GPIO+0x3C)
+.equ out_xor, (GPIO+0x40)
+
+# =========== Verify input_val and input_en reset to zero ===========
+
+.4byte input_val, 0x00000000, read32_test  # input_val reset to zero
+.4byte input_en, 0x00000000, read32_test  # input_en reset to zero
+
+# =========== Test output and input pins ===========
+
+.4byte output_en, 0xFFFFFFFF, write32_test      # enable all output pins
+.4byte output_val, 0xA5A5A5A5, write32_test     # write alternating pattern to output pins
+.4byte input_en, 0xFFFFFFFF, write32_test       # enable all input pins
+.4byte input_val, 0xA5A5A5A5, read32_test       # read pattern from output pins
+.4byte output_val, 0x5A5AFFFF, write32_test     # write different pattern to output pins
+.4byte input_val, 0x5A5AFFFF, read32_test       # read different pattern from output pins
+
+# =========== Test input enables ===========
+.4byte input_en, 0x00000000, write32_test       # disable all input pins
+.4byte input_val, 0x00000000, read32_test       # read 0 since input pins are disabled
+.4byte input_en, 0xFFFF0000, write32_test       # enable a few input pins
+.4byte input_val, 0x5A5A0000, read32_test      # read part of pattern set above.
+
+# =========== Test output enables(?) ===========
+
+.4byte output_en, 0xFFFFFFFF, write32_test      # undo changes made to output enable
+
+# =========== Test XOR functionality ===========
+.4byte out_xor, 0xFF00FF00, write32_test        # invert certain pin values
+.4byte input_val, 0xA55A0000, read32_test           # read inverted pins and verify input enable is working
+
+.4byte 0x0, 0x0, terminate_test # terminate tests
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-ADD.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-ADD.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-ADD.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.394307//
+// Created 2022-06-17 22:58:09.914370//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SLT.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SLT.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SLT.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.394785//
+// Created 2022-06-17 22:58:09.916813//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SLTU.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SLTU.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SLTU.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.395005//
+// Created 2022-06-17 22:58:09.917963//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SUB.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SUB.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-SUB.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.394545//
+// Created 2022-06-17 22:58:09.915580//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-XOR.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-XOR.S
@ -1,7 +1,7 @@
 ///////////////////////////////////////////
 // ../wally-riscv-arch-test/riscv-test-suite/rv64i_m/I/src/WALLY-XOR.S
 // David_Harris@hmc.edu & Katherine Parry
-// Created 2022-01-27 08:08:42.395231//
+// Created 2022-06-17 22:58:09.919138//
 // Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation