From 7249295f53e4021f3d155de17f7b56452f00b0d8 Mon Sep 17 00:00:00 2001
From: cturek <cturek@hmc.edu>
Date: Mon, 27 Jun 2022 23:55:21 +0000
Subject: [PATCH 01/10] Updated radix 2 divider to work with integers and
 floats in new structure. Integers still might not work.

---
 addins/riscv-arch-test                  |  2 +-
 pipelined/config/shared/wally-shared.vh |  6 +++---
 pipelined/srt/exptestgen.c              |  2 +-
 pipelined/srt/srt.sv                    | 19 +++++++++----------
 pipelined/srt/testbench.sv              | 12 ++++++------
 5 files changed, 20 insertions(+), 21 deletions(-)

diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
index be67c99bd..307c77b26 160000
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@@ -1 +1 @@
-Subproject commit be67c99bd461742aa1c100bcc0732657faae2230
+Subproject commit 307c77b26e070ae85ffea665ad9b642b40e33c86
diff --git a/pipelined/config/shared/wally-shared.vh b/pipelined/config/shared/wally-shared.vh
index 51c45ef00..5db8af1cf 100644
--- a/pipelined/config/shared/wally-shared.vh
+++ b/pipelined/config/shared/wally-shared.vh
@@ -94,9 +94,9 @@
 `define BIAS2 ((`F_SUPPORTED & (`LEN1 != `S_LEN)) ? `S_BIAS : `H_BIAS)
 
 // largest length in IEU/FPU
-`define CVTLEN ((`NF<`XLEN) ? `XLEN : `NF)
-`define DIVLEN ((`NF < `XLEN) ? `XLEN : `NF)
-`define LLEN ((`FLEN<`XLEN) ? `XLEN : `FLEN)
+`define CVTLEN ((`NF<`XLEN) ? (`XLEN) : (`NF))
+`define DIVLEN ((`NF < `XLEN) ? (`XLEN) : (`NF))
+`define LLEN ((`FLEN<`XLEN) ? (`XLEN) : (`FLEN))
 `define LOGCVTLEN $unsigned($clog2(`CVTLEN+1))
 `define NORMSHIFTSZ ((`DIVLEN+`NF+3) > (3*`NF+8) ? (`DIVLEN+`NF+3) : (3*`NF+9))
 `define CORRSHIFTSZ ((`DIVLEN+`NF+3) > (3*`NF+8) ? (`DIVLEN+`NF+3) : (3*`NF+6))
diff --git a/pipelined/srt/exptestgen.c b/pipelined/srt/exptestgen.c
index bd51126e7..61fe74aa4 100644
--- a/pipelined/srt/exptestgen.c
+++ b/pipelined/srt/exptestgen.c
@@ -46,7 +46,7 @@ void main(void)
   int i, j;
   int bias = 1023;
 
-  if ((fptr = fopen("testvectors","w")) == NULL) {
+  if ((fptr = fopen("testvectors","w")) == NULL) { 
     fprintf(stderr, "Couldn't write testvectors file\n");
     exit(1);
   }
diff --git a/pipelined/srt/srt.sv b/pipelined/srt/srt.sv
index e40f27589..5adeced47 100644
--- a/pipelined/srt/srt.sv
+++ b/pipelined/srt/srt.sv
@@ -2,7 +2,7 @@
 // srt.sv
 //
 // Written: David_Harris@hmc.edu 13 January 2022
-// Modified: 
+// Modified: cturek@hmc.edu June 2022
 //
 // Purpose: Combined Divide and Square Root Floating Point and Integer Unit
 // 
@@ -29,10 +29,8 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 `include "wally-config.vh"
-
-`define DIVLEN ((`NF<(`XLEN+1)) ? (`XLEN + 1) : `NF)
-`define EXTRAFRACBITS ((`NF<(`XLEN+1)) ? (`XLEN - `NF + 1) : 0)
-`define EXTRAINTBITS ((`NF<(`XLEN+1)) ? 0 : (`NF - `XLEN))
+`define EXTRAFRACBITS ((`NF<(`XLEN)) ? (`XLEN - `NF) : 0)
+`define EXTRAINTBITS ((`NF<(`XLEN)) ? 0 : (`NF - `XLEN))
 
 module srt (
   input  logic clk,
@@ -131,11 +129,11 @@ module srtpreproc (
   lzc #(`XLEN) lzcA (PosA, zeroCntA);
   lzc #(`XLEN) lzcB (PosB, zeroCntB);
 
-  assign ExtraA = {1'b0, PosA, {`EXTRAINTBITS{1'b0}}};
-  assign ExtraB = {1'b0, PosB, {`EXTRAINTBITS{1'b0}}};
+  assign ExtraA = {PosA, {`EXTRAINTBITS{1'b0}}};
+  assign ExtraB = {PosB, {`EXTRAINTBITS{1'b0}}};
 
   assign PreprocA = ExtraA << zeroCntA;
-  assign PreprocB = ExtraB << (zeroCntB + 1);
+  assign PreprocB = ExtraB << zeroCntB;
   assign PreprocX = {SrcXFrac, {`EXTRAFRACBITS{1'b0}}};
   assign PreprocY = {SrcYFrac, {`EXTRAFRACBITS{1'b0}}};
 
@@ -228,14 +226,15 @@ module otfc2 #(parameter N=65) (
   //
   //  QM is Q-1. It allows us to write negative bits 
   //  without using a costly CPA. 
-  logic [N+2:0] Q, QM, QNext, QMNext;
+  logic [N+2:0] Q, QM, QNext, QMNext, QMMux;
   //  QR and QMR are the shifted versions of Q and QM.
   //  They are treated as [N-1:r] size signals, and 
   //  discard the r most significant bits of Q and QM. 
   logic [N+1:0] QR, QMR;
 
   flopr #(N+3) Qreg(clk, Start, QNext, Q);
-  flopr #(N+3) QMreg(clk, Start, QMNext, QM);
+  mux2 #(N+3) QMmux(QMNext, {(N+3){1'b1}}, Start, QMMux); // chooses between QMNext and all ones under Start (presumably all ones when Start is high, i.e. QM = Q-1 = -1 -- confirm mux2 port order)
+  flop #(N+3) QMreg(clk, QMMux, QM);                      // sized from parameter N like Qreg, so Q/QM/QMMux ([N+2:0]) always match
 
   always_comb begin
     QR  = Q[N+1:0];
diff --git a/pipelined/srt/testbench.sv b/pipelined/srt/testbench.sv
index 93da74752..9655d7f70 100644
--- a/pipelined/srt/testbench.sv
+++ b/pipelined/srt/testbench.sv
@@ -1,4 +1,4 @@
-`define DIVLEN 65
+`define DIVLEN 64
 
 /////////////
 // counter //
@@ -17,7 +17,7 @@ module counter(input  logic clk,
 
   always @(posedge clk)
     begin
-      if      (count == `DIVLEN+1) done <= #1 1;
+      if      (count == `DIVLEN + 2) done <= #1 1;
       else if (done | req) done <= #1 0;	
       if (req) count <= #1 0;
       else     count <= #1 count+1;
@@ -101,8 +101,8 @@ module testbench;
       b = Vec[`memb];
       {bsign, bExp, bfrac} = b;
       nextr = Vec[`memr];
-      r = Quot[`DIVLEN:`DIVLEN - 52];
-      rOTFC = QuotOTFC[`DIVLEN:`DIVLEN - 52];
+      r = Quot[(`DIVLEN - 1):(`DIVLEN - 52)];
+      rOTFC = QuotOTFC[(`DIVLEN - 1):(`DIVLEN - 52)];
       req <= #5 1;
     end
   
@@ -110,8 +110,8 @@ module testbench;
 
   always @(posedge clk)
     begin
-      r = Quot[`DIVLEN:`DIVLEN - 52];
-      rOTFC = QuotOTFC[`DIVLEN:`DIVLEN - 52];
+      r = Quot[(`DIVLEN - 1):(`DIVLEN - 52)];
+      rOTFC = QuotOTFC[(`DIVLEN - 1):(`DIVLEN - 52)];
       if (done) 
 	begin
 	  req <= #5 1;

From 0417a6a45b660e7415bcef23365e9db59f28add6 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Tue, 28 Jun 2022 00:16:22 +0000
Subject: [PATCH 02/10] very basic early termination passes testfloat 64-bit
 tests

---
 pipelined/src/fpu/divshiftcalc.sv  | 25 +++++++++++--------------
 pipelined/src/fpu/lzacorrection.sv | 14 +++++++++++++-
 pipelined/src/fpu/postprocess.sv   |  7 +++++--
 3 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/pipelined/src/fpu/divshiftcalc.sv b/pipelined/src/fpu/divshiftcalc.sv
index d867efc44..51698590e 100644
--- a/pipelined/src/fpu/divshiftcalc.sv
+++ b/pipelined/src/fpu/divshiftcalc.sv
@@ -7,16 +7,15 @@ module divshiftcalc(
     input logic [$clog2(`DIVLEN/2+3)-1:0] EarlyTermShiftDiv2M,
     output logic [$clog2(`NORMSHIFTSZ)-1:0] DivShiftAmt,
     output logic [`NORMSHIFTSZ-1:0] DivShiftIn,
-    output logic [`NE+1:0] CorrDivExp
+    output logic DivResDenorm,
+    output logic [`NE+1:0] DivDenormShift
 );
-    logic ResDenorm;
-    logic [`NE+1:0] DenormShift;
     logic [`NE+1:0] NormShift;
     logic [`NE+1:0] Nf, NfPlus1;
 
     // is the result denromalized
     // if the exponent is 1 then the result needs to be normalized then the result is denormalizes
-    assign ResDenorm = DivCalcExpM[`NE+1]|(~|DivCalcExpM[`NE+1:1]&~(DivCalcExpM[0]&Quot[`DIVLEN+2]));
+    assign DivResDenorm = DivCalcExpM[`NE+1]|(~|DivCalcExpM[`NE+1:0]);
     // select the proper fraction lengnth
     if (`FPSIZES == 1) begin
         assign Nf = (`NE+2)'(`NF);
@@ -70,24 +69,22 @@ module divshiftcalc(
     // if the result is denormalized
     //  00000000x.xxxxxx...                     Exp = DivCalcExp
     //  .00000000xxxxxxx... >> NF+1             Exp = DivCalcExp+NF+1
-    //  .000xxxxxxxxxxxx... << DivCalcExp+NF+1  Exp = 0
+    //  .00xxxxxxxxxxxxx... << DivCalcExp+NF+1  Exp = +1
     //  .0000xxxxxxxxxxx... >> 1                Exp = 1
     // Left shift amount  = DivCalcExp+NF+1-1
-    assign DenormShift = Nf+DivCalcExpM;
+    assign DivDenormShift = Nf+DivCalcExpM;
     // if the result is normalized
     //  00000000x.xxxxxx...                     Exp = DivCalcExp
     //  .00000000xxxxxxx... >> NF+1             Exp = DivCalcExp+NF+1
-    //  00000000x.xxxxxx... << NF+1             Exp = DivCalcExp
-    //  00000000xx.xxxxx... << 1?               Exp = DivCalcExp-1
-    // Left shift amount  = NF+1 plus 1 if normalization required
-    assign NormShift = NfPlus1 + {(`NE+1)'(0), ~Quot[`DIVLEN+2]};
+    //  00000000.xxxxxxx... << NF               Exp = DivCalcExp+1
+    //  00000000x.xxxxxx... << NF               Exp = DivCalcExp (extra shift done afterwards)
+    //  00000000xx.xxxxx... << 1?               Exp = DivCalcExp-1 (determined after)
+    // initial left shift amount = NF
+    assign NormShift = Nf;
     // if the shift amount is negitive then dont shift (keep sticky bit)
-    assign DivShiftAmt = (ResDenorm ?  DenormShift[$clog2(`NORMSHIFTSZ)-1:0]&{$clog2(`NORMSHIFTSZ){~DenormShift[`NE+1]}} : NormShift[$clog2(`NORMSHIFTSZ)-1:0])+{{$clog2(`NORMSHIFTSZ)-$clog2(`DIVLEN/2+3)-1{1'b0}}, EarlyTermShiftDiv2M, 1'b0};
+    assign DivShiftAmt = (DivResDenorm ?  DivDenormShift[$clog2(`NORMSHIFTSZ)-1:0]&{$clog2(`NORMSHIFTSZ){~DivDenormShift[`NE+1]}} : NormShift[$clog2(`NORMSHIFTSZ)-1:0])+{{$clog2(`NORMSHIFTSZ)-$clog2(`DIVLEN/2+3)-1{1'b0}}, EarlyTermShiftDiv2M&{$clog2(`DIVLEN/2+3){~DivDenormShift[`NE+1]}}, 1'b0};
 
     // *** may be able to reduce shifter size
     assign DivShiftIn = {{`NF{1'b0}}, Quot[`DIVLEN+2:0], {`NORMSHIFTSZ-`DIVLEN-3-`NF{1'b0}}};
-    // the quotent is in the range [.5,2) if there is no early termination
-    // if the quotent < 1 and not denormal then subtract 1 to account for the normalization shift
-    assign CorrDivExp = (ResDenorm&~DenormShift[`NE+1]) ? (`NE+2)'(0) : DivCalcExpM - {(`NE+1)'(0), ~Quot[`DIVLEN+2]};
 
 endmodule
diff --git a/pipelined/src/fpu/lzacorrection.sv b/pipelined/src/fpu/lzacorrection.sv
index f06dd84a9..e5a2d5c34 100644
--- a/pipelined/src/fpu/lzacorrection.sv
+++ b/pipelined/src/fpu/lzacorrection.sv
@@ -3,14 +3,20 @@
 module lzacorrection(
     input logic  [`NORMSHIFTSZ-1:0]     Shifted,         // the shifted sum before LZA correction
     input logic                         FmaOp,
+    input logic                         DivOp,
+    input logic DivResDenorm,
+    input logic  [`NE+1:0] DivCalcExpM,
+    input logic [`NE+1:0] DivDenormShift,
     input logic  [`NE+1:0]              ConvNormSumExp,          // exponent of the normalized sum not taking into account denormal or zero results
     input logic                         PreResultDenorm,    // is the result denormalized - calculated before LZA corection
     input logic                         KillProdM,  // is the product set to zero
     input logic                         SumZero,
     output logic  [`CORRSHIFTSZ-1:0]    CorrShifted,         // the shifted sum before LZA correction
+    output logic [`NE+1:0] CorrDivExp,
     output logic [`NE+1:0]              SumExp         // exponent of the normalized sum
 );
     logic [3*`NF+5:0]           CorrSumShifted;     // the shifted sum after LZA correction
+    logic [`CORRSHIFTSZ:0]           CorrQuotShifted;
     logic                        ResDenorm;    // is the result denormalized
     logic                       LZAPlus1, LZAPlus2; // add one or two to the sum's exponent due to LZA correction
 
@@ -19,11 +25,17 @@ module lzacorrection(
     assign LZAPlus2 = Shifted[`NORMSHIFTSZ-1];
 	// the only possible mantissa for a plus two is all zeroes - a one has to propigate all the way through a sum. so we can leave the bottom statement alone
     assign CorrSumShifted =  LZAPlus1 ? Shifted[`NORMSHIFTSZ-3:1] : Shifted[`NORMSHIFTSZ-4:0];
-    assign CorrShifted = FmaOp ? {CorrSumShifted, {`CORRSHIFTSZ-(3*`NF+6){1'b0}}} : Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ];
+    //                        if the msb is 1 or the exponent was one, but the shifted quotient was < 1 (Denorm)
+    assign CorrQuotShifted =  {LZAPlus2|(DivCalcExpM==1&~LZAPlus2) ? Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ] : {Shifted[`NORMSHIFTSZ-2:`NORMSHIFTSZ-`CORRSHIFTSZ], 1'b0}, 1'b0};
+    // if the result of the divider was calculated to be denormalized, then the result was correctly normalized, so select the top shifted bits
+    assign CorrShifted = FmaOp ? {CorrSumShifted, {`CORRSHIFTSZ-(3*`NF+6){1'b0}}} : DivOp&~DivResDenorm ? CorrQuotShifted[`CORRSHIFTSZ-1:0] : Shifted[`NORMSHIFTSZ-1:`NORMSHIFTSZ-`CORRSHIFTSZ];
     // Determine sum's exponent
     //                          if plus1                     If plus2                                      if said denorm but norm plus 1           if said denorm but norm plus 2
     assign SumExp = (ConvNormSumExp+{{`NE+1{1'b0}}, LZAPlus1&~KillProdM}+{{`NE{1'b0}}, LZAPlus2&~KillProdM, 1'b0}+{{`NE+1{1'b0}}, ~ResDenorm&PreResultDenorm&~KillProdM}+{{`NE+1{1'b0}}, &ConvNormSumExp&Shifted[3*`NF+6]&~KillProdM}) & {`NE+2{~(SumZero|ResDenorm)}};
     // recalculate if the result is denormalized
     assign ResDenorm = PreResultDenorm&~Shifted[`NORMSHIFTSZ-3]&~Shifted[`NORMSHIFTSZ-2];
 
+    // the quotient is in the range [.5,2) if there is no early termination
+    // if the quotient < 1 and not denormal then subtract 1 to account for the normalization shift
+    assign CorrDivExp = ((DivResDenorm)&~DivDenormShift[`NE+1]) ? (`NE+2)'(0) : DivCalcExpM - {(`NE+1)'(0), ~LZAPlus2};
 endmodule
\ No newline at end of file
diff --git a/pipelined/src/fpu/postprocess.sv b/pipelined/src/fpu/postprocess.sv
index 217e3f586..ab06a9406 100644
--- a/pipelined/src/fpu/postprocess.sv
+++ b/pipelined/src/fpu/postprocess.sv
@@ -112,6 +112,8 @@ module postprocess(
     logic UfLSBRes;
     logic Sqrt;
     logic [`FMTBITS-1:0] OutFmt;
+    logic DivResDenorm;
+    logic [`NE+1:0] DivDenormShift;
 
     // signals to help readability
     assign Signed = FOpCtrlM[0];
@@ -144,7 +146,7 @@ module postprocess(
                               .XZeroM, .IntToFp, .OutFmt, .CvtResUf, .CvtShiftIn);
     fmashiftcalc fmashiftcalc(.SumM, .ZExpM, .ProdExpM, .FmaNormCntM, .FmtM, .KillProdM, .ConvNormSumExp,
                           .ZDenormM, .SumZero, .PreResultDenorm, .FmaShiftAmt, .FmaShiftIn);
-    divshiftcalc divshiftcalc(.FmtM, .Quot, .DivCalcExpM, .EarlyTermShiftDiv2M, .CorrDivExp, .DivShiftAmt, .DivShiftIn);
+    divshiftcalc divshiftcalc(.FmtM, .Quot, .DivCalcExpM, .EarlyTermShiftDiv2M, .DivResDenorm, .DivDenormShift, .DivShiftAmt, .DivShiftIn);
 
     always_comb
         case(PostProcSelM)
@@ -169,7 +171,8 @@ module postprocess(
     normshift normshift (.ShiftIn, .ShiftAmt, .Shifted);
 
     lzacorrection lzacorrection(.FmaOp, .KillProdM, .PreResultDenorm, .ConvNormSumExp,
-                                .SumZero, .Shifted, .SumExp, .CorrShifted);
+                                .DivResDenorm, .DivDenormShift, .DivOp, .DivCalcExpM,
+                                .CorrDivExp, .SumZero, .Shifted, .SumExp, .CorrShifted);
 
     ///////////////////////////////////////////////////////////////////////////////
     // Rounding

From 033ec135f8e2695cbd2bcd32e3dcbb16b701062b Mon Sep 17 00:00:00 2001
From: slmnemo <nicholas.lucioforlife@yahoo.com>
Date: Mon, 27 Jun 2022 18:56:35 -0700
Subject: [PATCH 03/10] Added reset read testcodes to GPIO

---
 .../references/WALLY-gpio-01.reference_output  | 13 +++++++++++++
 .../rv32i_m/privilege/src/WALLY-gpio-01.S      | 18 +++++++++++++++---
 2 files changed, 28 insertions(+), 3 deletions(-)

diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-gpio-01.reference_output b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-gpio-01.reference_output
index 3cbf56ae5..3f6dcc8e1 100644
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-gpio-01.reference_output
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-gpio-01.reference_output
@@ -1,5 +1,18 @@
 00000000 # test reset to zero
 00000000
+00000000 # output_en
+00000000 # output_val
+00000000 # rise_ie
+00000000 # rise_ip
+00000000 # fall_ie
+00000000 # fall_ip
+00000000 # high_ie
+00000000 # high_ip
+00000000 # low_ie
+ffffffff # low_ip
+00000000 # iof_en
+00000000 # iof_sel
+00000000 # out_xor
 A5A5A5A5 # test output pins
 5A5AFFFF
 00000000 # test input enables
diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-gpio-01.S b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-gpio-01.S
index be40c0e26..4b2496a77 100644
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-gpio-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-gpio-01.S
@@ -70,9 +70,21 @@ test_cases:
 
 # =========== Verify all registers reset to zero ===========
 
-.4byte input_val, 0x00000000, read32_test  # input_val reset to zero
-.4byte input_en, 0x00000000, read32_test  # input_en reset to zero
-# *** add more
+.4byte input_val, 0x00000000, read32_test   # input_val reset to zero
+.4byte input_en, 0x00000000, read32_test    # input_en reset to zero
+.4byte output_en, 0x00000000, read32_test   # output_en reset to zero
+.4byte output_val, 0x00000000, read32_test  # output_val reset to zero
+.4byte rise_ie, 0x00000000, read32_test     # rise_ie reset to zero
+.4byte rise_ip, 0x00000000, read32_test     # rise_ip reset to zero
+.4byte fall_ie, 0x00000000, read32_test     # fall_ie reset to zero
+.4byte fall_ip, 0x00000000, read32_test     # fall_ip reset to zero (reference output expects 00000000 in this slot)
+.4byte high_ie, 0x00000000, read32_test     # high_ie reset to zero
+.4byte high_ip, 0x00000000, read32_test     # high_ip reset to zero
+.4byte low_ie, 0x00000000, read32_test      # low_ie reset to zero
+.4byte low_ip, 0xffffffff, read32_test      # low_ip reset to ones (input_val is zero); matches ffffffff in the reference output
+.4byte iof_en, 0x00000000, read32_test      # iof_en reset to zero
+.4byte iof_sel, 0x00000000, read32_test     # iof_sel reset to zero
+.4byte out_xor, 0x00000000, read32_test     # out_xor reset to zero
 
 # =========== Test output and input pins ===========
 

From 7a5dba4b30287d2d5845d48c72be0ae83ed60e83 Mon Sep 17 00:00:00 2001
From: slmnemo <nicholas.lucioforlife@yahoo.com>
Date: Mon, 27 Jun 2022 18:59:44 -0700
Subject: [PATCH 04/10] will this work in git

---
 .../rv64i_m/privilege/src/WALLY-TEST-LIB-64.h | 35 +++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h
index c24952b42..fdfc3e6d5 100644
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-TEST-LIB-64.h
@@ -857,6 +857,27 @@ trap_handler_end_\MODE\(): // place to jump to so we can skip the trap handler a
     addi a6, a6, 8 
 .endm
 
+.macro SETUP_PLIC  
+    # Setup PLIC with a series of register writes, one (address, value, write32_test) row each. NOTE(review): rows are .4byte although this is the 64-bit test library -- confirm the test loop reads 4-byte entries
+
+    .equ PLIC_INTPRI_GPIO, 0x0C00000C       # GPIO is interrupt 3
+    .equ PLIC_INTPRI_UART, 0x0C000028       # UART is interrupt 10
+    .equ PLIC_INTPENDING0, 0x0C001000       # intPending0 register
+    .equ PLIC_INTEN00,     0x0C002000       # interrupt enables for context 0 (machine mode) sources 31:1
+    .equ PLIC_INTEN10,     0x0C002080       # interrupt enables for context 1 (supervisor mode) sources 31:1
+    .equ PLIC_THRESH0,     0x0C200000       # Priority threshold for context 0 (machine mode)
+    .equ PLIC_CLAIM0,      0x0C200004       # Claim/Complete register for context 0
+    .equ PLIC_THRESH1,     0x0C201000       # Priority threshold for context 1 (supervisor mode)
+    .equ PLIC_CLAIM1,      0x0C201004       # Claim/Complete register for context 1
+
+    .4byte PLIC_THRESH0, 0, write32_test    # Set PLIC machine mode interrupt threshold to 0 to accept all interrupts
+    .4byte PLIC_THRESH1, 7, write32_test    # Set PLIC supervisor mode interrupt threshold to 7 to accept no interrupts
+    .4byte PLIC_INTPRI_GPIO, 7, write32_test # Set GPIO to high priority
+    .4byte PLIC_INTPRI_UART, 7, write32_test # Set UART to high priority
+    .4byte PLIC_INTEN00, 0xFFFFFFFF, write32_test # Enable all interrupt sources for machine mode
+    .4byte PLIC_INTEN10, 0x00000000, write32_test # Disable all interrupt sources for supervisor mode
+.endm
+
 .macro END_TESTS
     // invokes one final ecall to return to machine mode then terminates this program, so the output is
     //      0x8: termination called from U mode
@@ -984,6 +1005,20 @@ read08_test:
     addi a6, a6, 8
     j test_loop // go to next test case
 
+readmip_test:  // read the MIP CSR into the signature
+    csrr t2, mip
+    sd t2, 0(t1)      // store the full 64-bit CSR value at the signature pointer t1
+    addi t1, t1, 8    // advance the signature pointer by one 8-byte slot (presumably the stride of the other 64-bit read tests -- confirm)
+    addi a6, a6, 8    // step a6 by 8, the same stride read08_test uses just above
+    j test_loop // go to next test case
+
+readsip_test:  // read the SIP CSR into the signature
+    csrr t2, sip
+    sd t2, 0(t1)      // store the full 64-bit CSR value at the signature pointer t1
+    addi t1, t1, 8    // advance the signature pointer by one 8-byte slot (presumably the stride of the other 64-bit read tests -- confirm)
+    addi a6, a6, 8    // step a6 by 8, the same stride read08_test uses just above
+    j test_loop // go to next test case
+
 goto_s_mode:
     // return to address in t3, 
     li a0, 3 // Trap handler behavior (go to supervisor mode)

From bb62ebc84f63e49306749390c4912f63492688dd Mon Sep 17 00:00:00 2001
From: Madeleine Masser-Frye <51804758+mmasserfrye@users.noreply.github.com>
Date: Tue, 28 Jun 2022 02:23:29 +0000
Subject: [PATCH 05/10] make clean rm extra files

---
 synthDC/Makefile        | 6 ++++--
 synthDC/runAllSynths.sh | 1 +
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/synthDC/Makefile b/synthDC/Makefile
index 53faa4522..611dcfef9 100755
--- a/synthDC/Makefile
+++ b/synthDC/Makefile
@@ -5,8 +5,8 @@ NAME := synth
 
 # defaults
 export DESIGN ?= wallypipelinedcore
-export FREQ ?= 4000
-export CONFIG ?= rv64gc
+export FREQ ?= 3402
+export CONFIG ?= rv32e
 # sky130 and sky90 presently supported
 export TECH ?= tsmc28
 # MAXCORES allows parallel compilation, which is faster but less CPU-efficient
@@ -126,6 +126,8 @@ clean:
 	rm -f command.log
 	rm -f filenames*.log
 	rm -f power.saif
+	rm -f Synopsys_stack_trace_*.txt
+	rm -f crte_*.txt
 
 
 
diff --git a/synthDC/runAllSynths.sh b/synthDC/runAllSynths.sh
index 1b81a6cd0..6944552d4 100755
--- a/synthDC/runAllSynths.sh
+++ b/synthDC/runAllSynths.sh
@@ -1,5 +1,6 @@
 #!/usr/bin/bash
 
+make clean
 mv runs runArchive/$(date +"%Y_%m_%d_%I_%M_%p")
 mv newRuns runs
 mkdir newRuns

From 726992540f5c6d7677d4719739804b4eacc3d688 Mon Sep 17 00:00:00 2001
From: Madeleine Masser-Frye <51804758+mmasserfrye@users.noreply.github.com>
Date: Tue, 28 Jun 2022 02:28:13 +0000
Subject: [PATCH 06/10] update wally synth analysis

---
 synthDC/extractSummary.py | 49 ++++++++++++++++++++++-----------------
 synthDC/wallySynth.py     | 12 ++++++----
 2 files changed, 35 insertions(+), 26 deletions(-)

diff --git a/synthDC/extractSummary.py b/synthDC/extractSummary.py
index 4469d4bea..a2f6a9b50 100755
--- a/synthDC/extractSummary.py
+++ b/synthDC/extractSummary.py
@@ -7,6 +7,7 @@ import subprocess
 from matplotlib.cbook import flatten
 import matplotlib.pyplot as plt
 import matplotlib.lines as lines
+from wallySynth import testFreq
 
 
 def synthsintocsv():
@@ -26,7 +27,7 @@ def synthsintocsv():
     writer.writerow(['Width', 'Config', 'Special', 'Tech', 'Target Freq', 'Delay', 'Area'])
 
     for oneSynth in allSynths:
-        descrip = specReg.findall(oneSynth)
+        descrip = specReg.findall(oneSynth) #[30:]
         width = descrip[2][:4]
         config = descrip[2][4:]
         if descrip[3][-2:] == 'nm':
@@ -46,7 +47,7 @@ def synthsintocsv():
                 nums = [float(m) for m in nums]
                 metrics += nums
             except: 
-                print(config + tech + freq + " doesn't have reports")
+                print(width + config + tech + '_' + freq + " doesn't have reports")
         if metrics == []:
             pass
         else:
@@ -56,7 +57,7 @@ def synthsintocsv():
     file.close()
 
 def synthsfromcsv(filename):
-    Synth = namedtuple("Synth", " width config special tech freq delay area")
+    Synth = namedtuple("Synth", "width config special tech freq delay area")
     with open(filename, newline='') as csvfile:
         csvreader = csv.reader(csvfile)
         global allSynths
@@ -110,23 +111,26 @@ def freqPlot(tech, width, config):
     plt.savefig('./plots/wally/freqSweep_' + tech + '_' + width + config + '.png')
     # plt.show()
 
-def areaDelay(width, tech, freq, config=None, special=None):
+def areaDelay(tech, freq, width=None, config=None, special=None):
     delays, areas, labels = ([] for i in range(3))
 
     for oneSynth in allSynths:
-        if (width == oneSynth.width) & (tech == oneSynth.tech) & (freq == oneSynth.freq):
-            if (special != None) & (oneSynth.special == special):
-                delays += [oneSynth.delay]
-                areas += [oneSynth.area]
-                labels += [oneSynth.config]
-            elif (config != None) & (oneSynth.config == config):
-                delays += [oneSynth.delay]
-                areas += [oneSynth.area]
-                labels += [oneSynth.special]
-            else:
-                delays += [oneSynth.delay]
-                areas += [oneSynth.area]
-                labels += [oneSynth.config + '_' + oneSynth.special]
+        if (width==None) or (width == oneSynth.width):
+            if (tech == oneSynth.tech) & (freq == oneSynth.freq):
+                if (special != None) & (oneSynth.special == special):
+                    delays += [oneSynth.delay]
+                    areas += [oneSynth.area]
+                    labels += [oneSynth.width + oneSynth.config]
+                elif (config != None) & (oneSynth.config == config):
+                    delays += [oneSynth.delay]
+                    areas += [oneSynth.area]
+                    labels += [oneSynth.special]
+            # else:
+            #     delays += [oneSynth.delay]
+            #     areas += [oneSynth.area]
+            #     labels += [oneSynth.config + '_' + oneSynth.special]
+    if width == None:
+        width = ''
     
     f, (ax1) = plt.subplots(1, 1)
     plt.scatter(delays, areas)
@@ -154,8 +158,11 @@ def areaDelay(width, tech, freq, config=None, special=None):
 # ending freq in 42 means fpu was turned off manually
 
 if __name__ == '__main__':
-    synthsintocsv()
+    # synthsintocsv()
     synthsfromcsv('Summary.csv')
-    freqPlot('tsmc28', 'rv64', 'gc')
-    areaDelay('rv32', 'tsmc28', 4200, config='gc')
-    areaDelay('rv32', 'tsmc28', 3042, special='')
\ No newline at end of file
+    freqPlot('tsmc28', 'rv32', 'e')
+    freqPlot('sky90', 'rv32', 'e')
+    areaDelay('tsmc28', testFreq[1], width= 'rv64', config='gc')
+    areaDelay('tsmc28', testFreq[1], special='')
+    areaDelay('sky90', testFreq[0], width='rv64', config='gc')
+    areaDelay('sky90', testFreq[0], special='')
\ No newline at end of file
diff --git a/synthDC/wallySynth.py b/synthDC/wallySynth.py
index bf32b6f9b..99d70e813 100755
--- a/synthDC/wallySynth.py
+++ b/synthDC/wallySynth.py
@@ -8,20 +8,22 @@ def runCommand(config, tech, freq):
     command = "make synth DESIGN=wallypipelinedcore CONFIG={} TECH={} DRIVE=FLOP FREQ={} MAXOPT=0 MAXCORES=1".format(config, tech, freq)
     subprocess.Popen(command, shell=True)
 
+testFreq = [3000, 10000]
+
 if __name__ == '__main__':
 
     techs = ['sky90', 'tsmc28']
-    bestAchieved = [750, 3000]
+    sweepCenter = [870, 3000]
     synthsToRun = []
 
-    
     arr = [-8, -6, -4, -2, 0, 2, 4, 6, 8]
     for i in [0, 1]:
         tech = techs[i]
-        f = bestAchieved[i]
-        for freq in [round(f+f*x/100) for x in arr]: # rv32e freq sweep
+        sc = sweepCenter[i]
+        f = testFreq[i]
+        for freq in [round(sc+sc*x/100) for x in arr]: # rv32e freq sweep
             synthsToRun += [['rv32e', tech, freq]]
-        for config in ['rv32gc', 'rv32ic', 'rv64gc', 'rv64i', 'rv64ic']: # configs
+        for config in ['rv32gc', 'rv32ic', 'rv64gc', 'rv64i', 'rv64ic', 'rv32e']: # configs
             synthsToRun += [[config, tech, f]]
         for mod in ['FPUoff', 'noMulDiv', 'noPriv', 'PMP0', 'PMP16']: # rv64gc path variations
             config = 'rv64gc_' + mod

From 228028c8375aa52ebb7659a895ec5264c40e5233 Mon Sep 17 00:00:00 2001
From: slmnemo <nicholas.lucioforlife@yahoo.com>
Date: Mon, 27 Jun 2022 20:09:58 -0700
Subject: [PATCH 07/10] Add CLINT tests from book

---
 pipelined/testbench/tests.vh                  |   5 +-
 .../rv32i_m/privilege/Makefrag                |   1 +
 .../WALLY-clint-01.reference_output           |   9 ++
 .../rv32i_m/privilege/src/WALLY-clint-01.S    | 102 ++++++++++++++++++
 4 files changed, 116 insertions(+), 1 deletion(-)
 create mode 100644 tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-clint-01.reference_output
 create mode 100644 tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S

diff --git a/pipelined/testbench/tests.vh b/pipelined/testbench/tests.vh
index c17cef914..30b00cf48 100644
--- a/pipelined/testbench/tests.vh
+++ b/pipelined/testbench/tests.vh
@@ -1601,6 +1601,9 @@ string wally32i[] = '{
 
  string wally32periph[] = '{
     `WALLYTEST,
-    "rv32i_m/privilege/WALLY-gpio-01"
+    "rv32i_m/privilege/WALLY-gpio-01",
+    "rv32i_m/privilege/WALLY-clint-01"
+    // "rv32i_m/privilege/WALLY-plic-01"
+    // "rv32i_m/privilege/WALLY-uart-01"
  };
 
diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/Makefrag b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/Makefrag
index 5d98f81cc..56b3bc01f 100644
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/Makefrag
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/Makefrag
@@ -54,6 +54,7 @@ target_tests_nosim = \
     WALLY-status-sie-01 \
     WALLY-status-tw-01 \
     WALLY-gpio-01 \
+    WALLY-clint-01 \
 
 
 rv32i_tests = $(addsuffix .elf, $(rv32i_sc_tests))
diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-clint-01.reference_output b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-clint-01.reference_output
new file mode 100644
index 000000000..013ef4604
--- /dev/null
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/references/WALLY-clint-01.reference_output
@@ -0,0 +1,9 @@
+00000000 # msip zero on reset
+00000000 # mip is zero
+00000008 # mip msip bit is set
+00000000 # mip msip bit is reset
+00000000 # mip mtip bit is reset
+FFFFFFFF # mtimecmp is same as written value
+A5A5A5A5 # mtimecmph is same as written value
+00000000 # mip mtip is zero
+00000080 # mip mtip is set
diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S
new file mode 100644
index 000000000..65f078b60
--- /dev/null
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S
@@ -0,0 +1,102 @@
+///////////////////////////////////////////
+//
+// WALLY-clint
+//
+// Author: David_Harris@hmc.edu and Nicholas Lucio <nlucio@hmc.edu>
+//
+// Created 2022-06-16
+//
+// Copyright (C) 2021 Harvey Mudd College & Oklahoma State University
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
+// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
+// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software
+// is furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT
+// OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+///////////////////////////////////////////
+
+#include "WALLY-TEST-LIB-32.h" 
+
+INIT_TESTS
+
+TRAP_HANDLER m
+
+j run_test_loop // begin test loop/table tests instead of executing inline code.
+
+INIT_TEST_TABLE
+
+END_TESTS
+
+TEST_STACK_AND_DATA
+
+.align 2
+test_cases:
+# ---------------------------------------------------------------------------------------------
+# Test Contents
+#
+#   Here is where the actual tests are held, or rather, what the actual tests do.
+#   Each entry consists of 3 values that will be read in as follows:
+#   
+#   '.4byte [x28 Value], [x29 Value], [x30 value]'
+#                     or
+#   '.4byte [address], [value], [test type]'
+#
+#   The encoding for x30 test type values can be found in the test handler in the framework file
+# 
+# ---------------------------------------------------------------------------------------------
+
+# =========== Define CLINT registers ===========
+
+.equ CLINT, 0x02000000
+.equ msip, (CLINT+0x00)
+.equ mtimecmp, (CLINT+0x4000)   # doesn't necessarily reset to zero
+.equ mtimecmph,(CLINT+0x4004)
+.equ mtime, (CLINT+0xBFF8)      # resets to zero but cannot be easily tested
+.equ mtimeh, (CLINT+0xBFFC)
+
+# =========== Verify verifiable registers reset to zero ===========
+
+.4byte msip, 0x00000000, read32_test    # msip reset to zero
+
+# =========== msip tests ===========
+
+.4byte msip, 0xFFFFFFFE, write32_test   # write to invalid bits of msip
+.4byte 0x0, 0x00000000, readmip_test    # msip bit should be zero
+.4byte msip, 0x00000001, write32_test   # set msip to one
+.4byte 0x0, 0x00000008, readmip_test    # msip bit is set  
+.4byte msip, 0x00000000, write32_test   # set msip to zero
+.4byte 0x0, 0x00000000, readmip_test    # msip bit is released
+
+# =========== mtime write tests ===========
+
+.4byte mtime, 0x00000000, write32_test  # test we can write to mtime
+.4byte mtimeh, 0x00000000, write32_test # test we can write to mtimeh
+.4byte 0x0,0x00000000, readmip_test     # mtip bit should be zero
+
+# =========== mtimecmp tests ===========
+
+.4byte mtimecmp, 0xFFFFFFFF, write32_test   # verify mtimecmp is writable
+.4byte mtimecmph, 0xA5A5A5A5, write32_test  # verify mtimecmph is writable
+.4byte mtimecmp, 0xFFFFFFFF, read32_test    # read back value written to mtimecmp
+.4byte mtimecmph, 0xA5A5A5A5, read32_test   # read back value written to mtimecmph
+.4byte mtime, 0xFFFFFFFF, write32_test      # write to mtime
+.4byte 0x0, 0x00000000, readmip_test        # mtip should still be zero
+.4byte mtimeh, 0xA5A5A5A6, write32_test     # cause mtip to go high by making mtime > mtimecmp
+.4byte 0x0, 0x00000080, readmip_test        # mtip should be set
+
+
+# =========== Experimental mtime counting test ===========
+
+# .4byte mtimecmph, 0xFFFFFFFF, write32_test  # make sure mtip isn't set until ready
+# .4byte mtimeh, 0x0FFFFFFF, write32_test     # write near max value to mtimeh
+# .4byte mtime, 0x00000000, write32_test      # write small value to mtime
+# .4byte 0x0, 0x00000000, readmip_test        # mtip should be zero
+# .4byte mtimecmp, 0x00000001, write32_test   # write slightly larger value than mtime to test mtime counting
+# .4byte mtimecmph, 0x0FFFFFFF, write32_test  # write same value as mtimeh to test mtime counting
+# .4byte 0x0, 0x00000080, readmip_test        # mtip should be set since it has been at least two cycles

From f21c3114fdcba8f0d6adb7b7742ed63d84297c8f Mon Sep 17 00:00:00 2001
From: slmnemo <nicholas.lucioforlife@yahoo.com>
Date: Mon, 27 Jun 2022 20:16:29 -0700
Subject: [PATCH 08/10] Added termination line to CLINT test

---
 .../riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S      | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S
index 65f078b60..7cfd83c1a 100644
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-clint-01.S
@@ -90,6 +90,7 @@ test_cases:
 .4byte mtimeh, 0xA5A5A5A6, write32_test     # cause mtip to go high by making mtime > mtimecmp
 .4byte 0x0, 0x00000080, readmip_test        # mtip should be set
 
+.4byte 0x0, 0x0, terminate_test # terminate tests
 
 # =========== Experimental mtime counting test ===========
 

From d13a4c337861c95bba53a39ae5a461354d63c6a9 Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Tue, 28 Jun 2022 18:01:11 +0000
Subject: [PATCH 09/10] removed an adder out of early termination

---
 addins/riscv-arch-test      | 2 +-
 pipelined/srt/srt-radix4.sv | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/addins/riscv-arch-test b/addins/riscv-arch-test
index 307c77b26..be67c99bd 160000
--- a/addins/riscv-arch-test
+++ b/addins/riscv-arch-test
@@ -1 +1 @@
-Subproject commit 307c77b26e070ae85ffea665ad9b642b40e33c86
+Subproject commit be67c99bd461742aa1c100bcc0732657faae2230
diff --git a/pipelined/srt/srt-radix4.sv b/pipelined/srt/srt-radix4.sv
index 179fbf45a..39432c9e3 100644
--- a/pipelined/srt/srt-radix4.sv
+++ b/pipelined/srt/srt-radix4.sv
@@ -143,12 +143,13 @@ module earlytermination(
  
    logic [$clog2(`DIVLEN/2+3)-1:0]  Count;
    logic WZero;
+   logic [`DIVLEN+3:0] W;
 
-   assign WZero = (WS+WC == 0)|XZeroE|YZeroE|XInfE|YInfE|XNaNE|YNaNE; //*** temporary
-   // *** rather than Counting should just be able to check if one of the two msbs of the quotent is 1 then stop???
+  assign WZero = ((WS^WC)=={WS[`DIVLEN+2:0]|WC[`DIVLEN+2:0], 1'b0})|XZeroE|YZeroE|XInfE|YInfE|XNaNE|YNaNE;
   assign DivDone = (DivStickyE | WZero);
   assign DivStickyE = ~|Count;
-  assign DivNegStickyE = $signed(WS+WC) < 0;
+  assign W = WC+WS;
+  assign DivNegStickyE = W[`DIVLEN+3]; //*** is there a better way to do this???
   assign EarlyTermShiftDiv2E = Count;
   // +1 for setup
   // `DIVLEN/2 to get required number of bits

From 8f98f3bfabf03cea00fc10e176a7c6cd119192ef Mon Sep 17 00:00:00 2001
From: Katherine Parry <kparry4@gmail.com>
Date: Tue, 28 Jun 2022 21:33:31 +0000
Subject: [PATCH 10/10] added rv32 double precision stores - untested

---
 pipelined/src/cache/cache.sv              | 13 ++++++++++---
 pipelined/src/cache/cacheway.sv           | 11 +++++++++--
 pipelined/src/fpu/fctrl.sv                |  6 +++---
 pipelined/src/fpu/fpu.sv                  | 23 ++++++++++++++++++-----
 pipelined/src/ieu/datapath.sv             | 10 ++++++++--
 pipelined/src/ifu/ifu.sv                  |  2 +-
 pipelined/src/lsu/lsu.sv                  |  8 +++++---
 pipelined/src/lsu/subwordread.sv          | 14 +++++++-------
 pipelined/src/wally/wallypipelinedcore.sv | 11 ++++++++---
 9 files changed, 69 insertions(+), 29 deletions(-)

diff --git a/pipelined/src/cache/cache.sv b/pipelined/src/cache/cache.sv
index 2374b4938..d380bfc83 100644
--- a/pipelined/src/cache/cache.sv
+++ b/pipelined/src/cache/cache.sv
@@ -43,6 +43,9 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTER
   input logic [`PA_BITS-1:0]  PAdr, // physical address
   input logic [(`XLEN-1)/8:0] ByteMask,
   input logic [`XLEN-1:0]     FinalWriteData,
+  input logic [`FLEN-1:0]     FWriteDataM,
+  input logic                        FLoad2,
+  input logic                 FpLoadStoreM,
   output logic                CacheCommitted,
   output logic                CacheStall,
    // to performance counters to cpu
@@ -120,7 +123,7 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTER
 
   // Array of cache ways, along with victim, hit, dirty, and read merging logic
   cacheway #(NUMLINES, LINELEN, TAGLEN, OFFSETLEN, SETLEN) 
-    CacheWays[NUMWAYS-1:0](.clk, .reset, .RAdr, .PAdr, .CacheWriteData, .ByteMask,
+    CacheWays[NUMWAYS-1:0](.clk, .reset, .RAdr, .PAdr, .CacheWriteData, .ByteMask, .FLoad2,
     .SetValidWay, .ClearValidWay, .SetDirtyWay, .ClearDirtyWay, .SelEvict, .VictimWay,
     .FlushWay, .SelFlush, .ReadDataLineWay, .HitWay, .VictimDirtyWay, .VictimTagWay, 
     .Invalidate(InvalidateCacheM));
@@ -159,8 +162,12 @@ module cache #(parameter LINELEN,  NUMLINES,  NUMWAYS, LOGWPL, WORDLEN, MUXINTER
   /////////////////////////////////////////////////////////////////////////////////////////////
   // Write Path: Write data and address. Muxes between writes from bus and writes from CPU.
   /////////////////////////////////////////////////////////////////////////////////////////////
-  mux2 #(LINELEN) WriteDataMux(.d0({WORDSPERLINE{FinalWriteData}}),
-		.d1(CacheBusWriteData),	.s(SetValid), .y(CacheWriteData));
+  if (`LLEN>`XLEN)
+    mux3 #(LINELEN) WriteDataMux(.d0({WORDSPERLINE{FinalWriteData}}),
+      .d1({WORDSPERLINE/2{FWriteDataM}}),	.d2(CacheBusWriteData),	.s({SetValid,FpLoadStoreM&~SetValid}), .y(CacheWriteData));
+  else
+    mux2 #(LINELEN) WriteDataMux(.d0({WORDSPERLINE{FinalWriteData}}),
+      .d1(CacheBusWriteData),	.s(SetValid), .y(CacheWriteData));
   mux3 #(`PA_BITS) CacheBusAdrMux(.d0({PAdr[`PA_BITS-1:OFFSETLEN], {{OFFSETLEN}{1'b0}}}),
 		.d1({VictimTag, PAdr[SETTOP-1:OFFSETLEN], {{OFFSETLEN}{1'b0}}}),
 		.d2({VictimTag, FlushAdr, {{OFFSETLEN}{1'b0}}}),
diff --git a/pipelined/src/cache/cacheway.sv b/pipelined/src/cache/cacheway.sv
index d9a478612..ac1e26e8f 100644
--- a/pipelined/src/cache/cacheway.sv
+++ b/pipelined/src/cache/cacheway.sv
@@ -38,6 +38,7 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26,
   input logic [$clog2(NUMLINES)-1:0] RAdr,
   input logic [`PA_BITS-1:0]         PAdr,
   input logic [LINELEN-1:0]          CacheWriteData,
+  input logic                        FLoad2,
   input logic                        SetValidWay,
   input logic                        ClearValidWay,
   input logic                        SetDirtyWay,
@@ -74,8 +75,14 @@ module cacheway #(parameter NUMLINES=512, parameter LINELEN = 256, TAGLEN = 26,
   /////////////////////////////////////////////////////////////////////////////////////////////
   // Write Enable demux
   /////////////////////////////////////////////////////////////////////////////////////////////
-  onehotdecoder #(LOGWPL) adrdec(
-    .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecoded));
+  if(`LLEN>`XLEN)begin 
+    logic [2**LOGWPL-1:0] MemPAdrDecodedtmp;
+    onehotdecoder #(LOGWPL) adrdec(
+      .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecodedtmp));
+    assign MemPAdrDecoded = MemPAdrDecodedtmp|{MemPAdrDecodedtmp[2**LOGWPL-2:0]&{2**LOGWPL-1{FLoad2}}, 1'b0};
+  end else
+    onehotdecoder #(LOGWPL) adrdec(
+      .bin(PAdr[LOGWPL+LOGXLENBYTES-1:LOGXLENBYTES]), .decoded(MemPAdrDecoded));
   // If writing the whole line set all write enables to 1, else only set the correct word.
   assign SelectedWriteWordEn = SetValidWay ? '1 : SetDirtyWay ? MemPAdrDecoded : '0; // OR-AND
   assign FinalByteMask = SetValidWay ? '1 : ByteMask; // OR
diff --git a/pipelined/src/fpu/fctrl.sv b/pipelined/src/fpu/fctrl.sv
index 60d260027..f6ed650af 100755
--- a/pipelined/src/fpu/fctrl.sv
+++ b/pipelined/src/fpu/fctrl.sv
@@ -33,8 +33,8 @@ module fctrl (
                     default: ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1; // non-implemented instruction
                   endcase
       7'b0100111: case(Funct3D)
-                    3'b010:  ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_0; // fsw
-                    3'b011:  ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_0; // fsd
+                    3'b010:  ControlsD = `FCTRLW'b0_0_10_xx_0xx_0_0; // fsw
+                    3'b011:  ControlsD = `FCTRLW'b0_0_10_xx_0xx_0_0; // fsd
                     default: ControlsD = `FCTRLW'b0_0_00_xx_0xx_0_1; // non-implemented instruction
                   endcase
       7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0; // fmadd
@@ -121,7 +121,7 @@ module fctrl (
       assign FmtD = 0;
     else if (`FPSIZES == 2)begin
       logic [1:0] FmtTmp;
-      assign FmtTmp = ((Funct7D[6:3] == 4'b0100)&OpD[4]) ? Rs2D[1:0] : Funct7D[1:0];
+      assign FmtTmp = ((Funct7D[6:3] == 4'b0100)&OpD[4]) ? Rs2D[1:0] : (~OpD[6]&(&OpD[2:0])) ? {~Funct3D[1], ~(Funct3D[1]^Funct3D[0])} : Funct7D[1:0];
       assign FmtD = (`FMT == FmtTmp);
     end
     else if (`FPSIZES == 3|`FPSIZES == 4)
diff --git a/pipelined/src/fpu/fpu.sv b/pipelined/src/fpu/fpu.sv
index aba1a8f48..25b39d69b 100755
--- a/pipelined/src/fpu/fpu.sv
+++ b/pipelined/src/fpu/fpu.sv
@@ -41,10 +41,12 @@ module fpu (
   input logic [4:0] 	   RdM, RdW, // which FP register to write to (from IEU)
   input logic [1:0]        STATUS_FS, // Is floating-point enabled?
   output logic 		   FRegWriteM, // FP register write enable
-  output logic 		   FpLoadM, // Fp load instruction?
+  output logic 		   FpLoadStoreM, // Fp load or store instruction?
+  output logic              FLoad2,
   output logic 		   FStallD, // Stall the decode stage
   output logic 		   FWriteIntE, // integer register write enables
   output logic [`XLEN-1:0] FWriteDataE, // Data to be written to memory
+  output logic [`FLEN-1:0] FWriteDataM, // Data to be written to memory
   output logic [`XLEN-1:0] FIntResM, // data to be written to integer register
   output logic [`XLEN-1:0] FCvtIntResW, // data to be written to integer register
   output logic [1:0]       FResSelW,
@@ -292,8 +294,19 @@ module fpu (
    // data to be stored in memory - to IEU
    //    - FP uses NaN-blocking format
    //        - if there are any unsused bits the most significant bits are filled with 1s
-   if (`FLEN>`XLEN) assign FWriteDataE = FSrcYE[`XLEN-1:0]; 
-   else assign FWriteDataE = {{`XLEN-`FLEN{FSrcYE[`FLEN-1]}}, FSrcYE}; 
+   if (`LLEN==`XLEN) begin
+      assign FWriteDataE = FSrcYE[`XLEN-1:0]; 
+   end else begin
+      logic [`FLEN-1:0] FWriteDataE;
+      if(`FMTBITS == 2) assign FLoad2 = FmtM == `FMT;
+      else assign FLoad2 = FmtM;
+
+      if (`FPSIZES==1) assign FWriteDataE = FSrcYE;
+      else if (`FPSIZES==2) assign FWriteDataE = FmtE ? FSrcYE : {2{FSrcYE[`LEN1-1:0]}};
+      else assign FWriteDataE = FmtE == `FMT ? FSrcYE : {2{FSrcYE[`LEN1-1:0]}};
+
+      flopenrc #(`FLEN) EMWriteDataReg (clk, reset, FlushM, ~StallM, FWriteDataE, FWriteDataM);
+   end
 
    // NaN Block SrcA
    generate
@@ -311,7 +324,7 @@ module fpu (
    assign PreNVE = CmpNVE&(FOpCtrlE[2]|FWriteIntE);
 
    // select the result that may be written to the integer register - to IEU
-   if (`FLEN>`XLEN) 
+   if (`FLEN>`XLEN)
       assign IntSrcXE = FSrcXE[`XLEN-1:0];
    else 
       assign IntSrcXE = {{`XLEN-`FLEN{FSrcXE[`FLEN-1:0]}}, FSrcXE};
@@ -356,7 +369,7 @@ module fpu (
    //          |||         |||
    //////////////////////////////////////////////////////////////////////////////////////////
 
-   assign FpLoadM = FResSelM[1];
+   assign FpLoadStoreM = FResSelM[1];
 
    postprocess postprocess(.XSgnM, .YSgnM, .ZExpM, .XManM, .YManM, .ZManM, .FrmM, .FmtM, .ProdExpM, .EarlyTermShiftDiv2M,
                            .AddendStickyM, .KillProdM, .XZeroM, .YZeroM, .ZZeroM, .XInfM, .YInfM, .Quot,
diff --git a/pipelined/src/ieu/datapath.sv b/pipelined/src/ieu/datapath.sv
index b7a6a9644..df711695e 100644
--- a/pipelined/src/ieu/datapath.sv
+++ b/pipelined/src/ieu/datapath.sv
@@ -124,12 +124,18 @@ module datapath (
   flopenrc #(5)     RdWReg(clk, reset, FlushW, ~StallW, RdM, RdW);
 
   // floating point interactions: fcvt, fp stores
-  if (`F_SUPPORTED) begin:fpmux
+  if (`F_SUPPORTED&(`LLEN>`XLEN)) begin:fpmux
+    logic [`XLEN-1:0] IFCvtResultW;
+    mux2  #(`XLEN)  resultmuxM(IEUResultM, FIntResM, FWriteIntM, IFResultM);
+    assign WriteDataE = ForwardedSrcBE;
+    mux2  #(`XLEN)  cvtresultmuxW(IFResultW, FCvtIntResW, ~FResSelW[1]&FResSelW[0], IFCvtResultW);
+    mux5  #(`XLEN)  resultmuxW(IFCvtResultW, ReadDataW, CSRReadValW, MDUResultW, SCResultW, ResultSrcW, ResultW); 
+  end else if (`F_SUPPORTED) begin:fpmux
     logic [`XLEN-1:0] IFCvtResultW;
     mux2  #(`XLEN)  resultmuxM(IEUResultM, FIntResM, FWriteIntM, IFResultM);
     mux2  #(`XLEN)  writedatamux(ForwardedSrcBE, FWriteDataE, ~IllegalFPUInstrE, WriteDataE);
     mux2  #(`XLEN)  cvtresultmuxW(IFResultW, FCvtIntResW, ~FResSelW[1]&FResSelW[0], IFCvtResultW);
-    mux5  #(`XLEN)    resultmuxW(IFCvtResultW, ReadDataW, CSRReadValW, MDUResultW, SCResultW, ResultSrcW, ResultW);	 
+    mux5  #(`XLEN)  resultmuxW(IFCvtResultW, ReadDataW, CSRReadValW, MDUResultW, SCResultW, ResultSrcW, ResultW); 
   end else begin:fpmux
     assign IFResultM = IEUResultM; assign WriteDataE = ForwardedSrcBE;
     mux5  #(`XLEN)    resultmuxW(IFResultW, ReadDataW, CSRReadValW, MDUResultW, SCResultW, ResultSrcW, ResultW);	 
diff --git a/pipelined/src/ifu/ifu.sv b/pipelined/src/ifu/ifu.sv
index 29d07cc2c..02e748f31 100644
--- a/pipelined/src/ifu/ifu.sv
+++ b/pipelined/src/ifu/ifu.sv
@@ -227,7 +227,7 @@ module ifu (
       icache(.clk, .reset, .CPUBusy, .IgnoreRequestTLB(ITLBMissF), .TrapM(TrapM), .IgnoreRequestTrapM('0),
              .CacheBusWriteData(ICacheBusWriteData), .CacheBusAck(ICacheBusAck),
              .CacheBusAdr(ICacheBusAdr), .CacheStall(ICacheStallF), 
-             .CacheFetchLine(ICacheFetchLine),
+             .CacheFetchLine(ICacheFetchLine), .FWriteDataM(), .FpLoadStoreM(), .FLoad2(),
              .CacheWriteLine(), .ReadDataWord(FinalInstrRawF),
              .Cacheable(CacheableF),
              .CacheMiss(ICacheMiss), .CacheAccess(ICacheAccess),
diff --git a/pipelined/src/lsu/lsu.sv b/pipelined/src/lsu/lsu.sv
index 7234a7cac..5c56b1356 100644
--- a/pipelined/src/lsu/lsu.sv
+++ b/pipelined/src/lsu/lsu.sv
@@ -57,7 +57,9 @@ module lsu (
    input logic              BigEndianM,
    input logic              sfencevmaM,
    // fpu
-   input logic              FpLoadM,
+   input logic [`FLEN-1:0]  FWriteDataM,
+   input logic              FLoad2,
+   input logic              FpLoadStoreM,
    // faults
    output logic             LoadPageFaultM, StoreAmoPageFaultM,
    output logic             LoadMisalignedFaultM, LoadAccessFaultM,
@@ -235,7 +237,7 @@ module lsu (
               .NUMWAYS(`DCACHE_NUMWAYS), .LOGWPL(LOGWPL), .WORDLEN(`LLEN), .MUXINTERVAL(`XLEN), .DCACHE(1)) dcache(
         .clk, .reset, .CPUBusy, .LSUBusWriteCrit, .RW(LSURWM), .Atomic(LSUAtomicM),
         .FlushCache(FlushDCacheM), .NextAdr(LSUAdrE), .PAdr(LSUPAdrM), 
-        .ByteMask(ByteMaskM), .WordCount,
+        .ByteMask(ByteMaskM), .WordCount, .FpLoadStoreM, .FWriteDataM, .FLoad2,
         .FinalWriteData(FinalWriteDataM), .Cacheable(CacheableM),
         .CacheStall(DCacheStallM), .CacheMiss(DCacheMiss), .CacheAccess(DCacheAccess),
         .IgnoreRequestTLB, .IgnoreRequestTrapM, .TrapM(1'b0), .CacheCommitted(DCacheCommittedM), 
@@ -269,7 +271,7 @@ module lsu (
   subwordwrite subwordwrite(.LSUPAdrM(LSUPAdrM[2:0]),
     .LSUFunct3M, .AMOWriteDataM, .LittleEndianWriteDataM, .ByteMaskM);
   subwordread subwordread(.ReadDataWordMuxM, .LSUPAdrM(LSUPAdrM[2:0]),
-		.FpLoadM, .Funct3M(LSUFunct3M), .ReadDataM);
+		.FpLoadStoreM, .Funct3M(LSUFunct3M), .ReadDataM);
 
   /////////////////////////////////////////////////////////////////////////////////////////////
   // MW Pipeline Register
diff --git a/pipelined/src/lsu/subwordread.sv b/pipelined/src/lsu/subwordread.sv
index 4a6d99bfc..d38595d49 100644
--- a/pipelined/src/lsu/subwordread.sv
+++ b/pipelined/src/lsu/subwordread.sv
@@ -35,7 +35,7 @@ module subwordread
    input logic [`LLEN-1:0] 	ReadDataWordMuxM,
    input logic [2:0] 		LSUPAdrM,
    input logic [2:0] 		Funct3M,
-   input logic          FpLoadM, 
+   input logic          FpLoadStoreM, 
    output logic [`LLEN-1:0] ReadDataM
    );
 
@@ -83,16 +83,16 @@ module subwordread
     case(Funct3M)
       3'b000:  ReadDataM = {{`LLEN-8{ByteM[7]}}, ByteM};                              // lb
       3'b001:  if(`ZFH_SUPPORTED) 
-                    ReadDataM = {{`LLEN-16{HalfwordM[15]|FpLoadM}}, HalfwordM[15:0]}; // lh/flh
+                    ReadDataM = {{`LLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
                else ReadDataM = {{`LLEN-16{HalfwordM[15]}}, HalfwordM[15:0]};         // lh 
       3'b010:  if(`F_SUPPORTED) 
-                    ReadDataM = {{`LLEN-32{WordM[31]|FpLoadM}}, WordM[31:0]};         // lw/flw
+                    ReadDataM = {{`LLEN-32{WordM[31]|FpLoadStoreM}}, WordM[31:0]};         // lw/flw
                else ReadDataM = {{`LLEN-32{WordM[31]}}, WordM[31:0]};                 // lw
       3'b011:  if(`D_SUPPORTED) 
-                    ReadDataM = {{`LLEN-64{DblWordM[63]|FpLoadM}}, DblWordM[63:0]};   // ld/fld
+                    ReadDataM = {{`LLEN-64{DblWordM[63]|FpLoadStoreM}}, DblWordM[63:0]};   // ld/fld
                else ReadDataM = {{`LLEN-64{DblWordM[63]}}, DblWordM[63:0]};           // ld/fld
       3'b100:    if(`Q_SUPPORTED) 
-                    ReadDataM = FpLoadM ? ReadDataWordMuxM : {{`LLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq
+                    ReadDataM = FpLoadStoreM ? ReadDataWordMuxM : {{`LLEN-8{1'b0}}, ByteM[7:0]}; // lbu/flq
                  else 
                     ReadDataM = {{`LLEN-8{1'b0}}, ByteM[7:0]};    // lbu
       3'b101:  ReadDataM = {{`LLEN-16{1'b0}}, HalfwordM[15:0]};   // lhu
@@ -122,10 +122,10 @@ module subwordread
     case(Funct3M)
       3'b000:  ReadDataM = {{`LLEN-8{ByteM[7]}}, ByteM};                              // lb
       3'b001:  if(`ZFH_SUPPORTED) 
-                    ReadDataM = {{`LLEN-16{HalfwordM[15]|FpLoadM}}, HalfwordM[15:0]}; // lh/flh
+                    ReadDataM = {{`LLEN-16{HalfwordM[15]|FpLoadStoreM}}, HalfwordM[15:0]}; // lh/flh
                else ReadDataM = {{`LLEN-16{HalfwordM[15]}}, HalfwordM[15:0]};         // lh 
       3'b010:  if(`F_SUPPORTED) 
-                    ReadDataM = {{`LLEN-32{ReadDataWordMuxM[31]|FpLoadM}}, ReadDataWordMuxM[31:0]};         // lw/flw
+                    ReadDataM = {{`LLEN-32{ReadDataWordMuxM[31]|FpLoadStoreM}}, ReadDataWordMuxM[31:0]};         // lw/flw
                else ReadDataM = {{`LLEN-32{ReadDataWordMuxM[31]}}, ReadDataWordMuxM[31:0]};                 // lw
       3'b011:  ReadDataM = ReadDataWordMuxM;                      // fld
       3'b100:  ReadDataM = {{`LLEN-8{1'b0}}, ByteM[7:0]};         // lbu
diff --git a/pipelined/src/wally/wallypipelinedcore.sv b/pipelined/src/wally/wallypipelinedcore.sv
index b3f11680b..8ef8ec18b 100644
--- a/pipelined/src/wally/wallypipelinedcore.sv
+++ b/pipelined/src/wally/wallypipelinedcore.sv
@@ -92,13 +92,15 @@ module wallypipelinedcore (
   logic             FStallD;
   logic             FWriteIntE;
   logic [`XLEN-1:0]         FWriteDataE;
+  logic                     FLoad2;
+  logic [`FLEN-1:0]         FWriteDataM;
   logic [`XLEN-1:0]         FIntResM;  
   logic [`XLEN-1:0]         FCvtIntResW;  
   logic             FDivBusyE;
   logic             IllegalFPUInstrD, IllegalFPUInstrE;
   logic             FRegWriteM;
   logic             FPUStallD;
-  logic             FpLoadM;
+  logic             FpLoadStoreM;
   logic [1:0]       FResSelW;
   logic [4:0]             SetFflagsM;
 
@@ -253,7 +255,8 @@ module wallypipelinedcore (
   .AtomicM, .TrapM,
   .CommittedM, .DCacheMiss, .DCacheAccess,
   .SquashSCW,            
-  .FpLoadM,
+  .FpLoadStoreM,
+  .FWriteDataM, .FLoad2,
   //.DataMisalignedM(DataMisalignedM),
   .IEUAdrE, .IEUAdrM, .WriteDataE,
   .ReadDataW, .FlushDCacheM,
@@ -391,10 +394,12 @@ module wallypipelinedcore (
          .RdM, .RdW, // which FP register to write to (from IEU)
          .STATUS_FS, // is floating-point enabled?
          .FRegWriteM, // FP register write enable
-         .FpLoadM,
+         .FpLoadStoreM,
+         .FLoad2,
          .FStallD, // Stall the decode stage
          .FWriteIntE, // integer register write enable
          .FWriteDataE, // Data to be written to memory
+         .FWriteDataM, // Data to be written to memory
          .FIntResM, // data to be written to integer register
          .FCvtIntResW, // fp -> int conversion result to be stored in int register
          .FResSelW,   // fpu result selection